#!/bin/bash
function getMdLinks(){ # Use: Takes a markdown file file.md, extracts all links, finds the unique ones and saves them to file.md.links
    linksFile="$1.links"
    linksFile2="$1.links2"

    echo ""
    echo "Extracting links..."

    # Start from a clean slate so reruns don't accumulate duplicates.
    rm -f "$linksFile"

    # Pull every "](...)" markdown target, then keep only the http(s) URLs.
    # FIX: "$1" is quoted so filenames with spaces work.
    grep -Eoi '\]\((.*)\)' "$1" | grep -Eo '(http|https)://[^)]+' >> "$linksFile"
    ## sed -i 's/www.wikiwand.com\/en/en.wikipedia.org\/wiki/g' $1

    # Deduplicate while preserving first-seen order.
    awk '!seen[$0]++' "$linksFile" > "$linksFile2" && mv "$linksFile2" "$linksFile"

    echo "Done."

    # Rough ETA: ~7.5s per archive call plus a 1-minute pause every 15 links.
    numLinesLinkFile=$(wc -l "$linksFile" | awk '{ print $1 }')
    totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
    echo "Expected to take $totalTimeInMinutes mins."
}
function pushToArchive(){
    # Use: Takes a txt file with one link on each line and pushes all the links to the internet archive. Saves those links to a textfile
    # References:
    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
    # https://github.com/oduwsdl/archivenow
    # For the double underscore, see: https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999
    echo "Pushing to archive.org..."

    input="$1"
    counter=1
    archivedLinksFile="$1.archived"
    errorsFile="$1.errors"

    ## rm -f "$archivedLinksFile"
    # FIX: was "$errorsfile" (wrong case) — the stale errors file was never
    # removed, so the help header below accumulated on every run.
    rm -f "$errorsFile"

    touch "$archivedLinksFile"
    touch "$errorsFile"

    ## How to deal with errors that arise
    echo "If this file contains errors, you can deal with them as follows:" >> "$errorsFile"
    echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errorsFile"
    echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errorsFile"
    echo "" >> "$errorsFile"

    ## Main body
    while IFS= read -r line
    do
        wait

        # archive.org rate-limits submissions: pause after every 15 new links.
        if [ $((counter % 15)) -eq 0 ]; then
            printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
            sleep 1m
        fi
        echo "Url: $line"

        # Skip links archived on a previous pass (match with or without a trailing slash).
        urlAlreadyContained=$( ( grep "$line$" "$archivedLinksFile"; grep "$line/$" "$archivedLinksFile" ) | tail -1 )

        if [ "$urlAlreadyContained" == "" ]; then
            # FIX: "$line" quoted so URLs with shell metacharacters (&, ?, ...) survive.
            archiveURL=$(archivenow --ia "$line")
            if [[ "$archiveURL" == "Error"* ]]; then
                echo "$line" >> "$errorsFile"
                echo "$archiveURL" >> "$errorsFile"
                echo "" >> "$errorsFile"
                echo "There was an error. See $errorsFile for how to deal with it."
            else
                echo "$archiveURL" >> "$archivedLinksFile"
            fi
            counter=$((counter+1))
            # Random 5-19s pause between submissions to be polite to the service.
            numSecondsSleep=$((5 + (RANDOM % 15)))
        else
            archiveURL="$urlAlreadyContained"
            numSecondsSleep=0
        fi

        echo "$archiveURL"
        echo "Sleeping for $numSecondsSleep seconds..."
        sleep "$numSecondsSleep"
        echo ""
    done < "$input"
    echo "Done."
    echo ""
}
function addArchiveLinksToFile(){
    # Use: For every archived link found in file.md.links.archived, append
    # " ([a](archive-url))" after the corresponding "(url)" occurrence in $1.
    # The annotated copy ends up in $1.longnow; $1 itself is left unchanged.
    originalFile="$1"
    originalFileTemp="$originalFile.temp"
    linksFile="$1.links"
    archivedLinksFile="$1.links.archived"
    errorsFile="$1.links.errors"
    longNowFile="$1.longnow"
    echo "Creating longnow file @ $longNowFile..."
    rm -f "$longNowFile"
    touch "$longNowFile"
    # Keep a pristine copy; sed below edits $originalFile in place.
    cp "$originalFile" "$originalFileTemp"
    while IFS= read -r url
    do
        # Archived entry for this url, with or without a trailing slash; last match wins.
        archivedUrl=$( ( grep "$url$" "$archivedLinksFile"; grep "$url/$" "$archivedLinksFile") | tail -1)
        if [ "$archivedUrl" != "" ]; then
            ## echo "Url: $url"
            ## echo "ArchivedUrl: $archivedUrl"
            # FIX: escape ALL sed-special characters, not just "/" — URLs with
            # ".", "*", "[", "$" etc. previously corrupted the substitution.
            urlForSed=$(printf '%s' "$url" | sed -e 's/[][\.*^$/]/\\&/g')
            # In the replacement text only "&", "\" and the "/" delimiter are special.
            archiveUrlForSed=$(printf '%s' "$archivedUrl" | sed -e 's/[&/\]/\\&/g')
            sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$1"
        ##else
            ##echo "There was an error for $url; see the $errorsFile"
        fi
    done < "$linksFile"
    mv "$originalFile" "$longNowFile"
    mv "$originalFileTemp" "$originalFile"
    echo "Done."
}
function longnow(){
    # Use: Entry point. Extracts links from markdown file $1, archives them,
    # and writes an annotated copy to $1.longnow.
    # FIX: detect archivenow with the portable `command -v` instead of parsing
    # `whereis` output (whereis is not everywhere and its format varies).
    if ! command -v archivenow >/dev/null 2>&1
    then
        echo "Required archivenow utility not found in path."
        echo "Install with \$ pip install archivenow"
        echo "(resp. \$ pip3 install archivenow)"
        echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
    else
        getMdLinks "$1"
        pushToArchive "$1.links"
        addArchiveLinksToFile "$1"
        # The errors file always starts with a 4-line help header; more lines
        # than that means at least one link actually failed.
        numLinesErrorFile=$(wc -l "$1.links.errors" | awk '{ print $1 }')
        if [ "$numLinesErrorFile" -gt 4 ] ;then
            echo "It seems that there are errors. To view and deal with them, see the $1.links.errors file"
        fi
    fi
}
# Script entry point: archive the file given as the first CLI argument.
longnow "$1" ## don't copy this line into your .bashrc file