#!/bin/bash
#
# longnow: push every link of a markdown file to the Internet Archive and write
# a copy of the file in which each link is followed by its archived version.
#
# Usage: longnow yourfile.md
# Requires: archivenow (https://github.com/oduwsdl/archivenow), grep, awk.
#
# Files written next to yourfile.md:
#   yourfile.md.links           unique http(s) links found in yourfile.md
#   yourfile.md.links.archived  archive URLs; kept between runs and used as a cache
#   yourfile.md.links.errors    how-to header followed by the links that failed
#   yourfile.md.longnow         copy of yourfile.md with " ([a](archive-url))"
#                               added after every archived link
#
# The functions are meant to be copyable into a .bashrc, so no shell options
# are changed and every variable is local to its function.
# For the double underscore on private helpers, see:
# https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999

# Private helper. Prints the archived URL recorded for a link, if any.
# Arguments: $1 - link to look up
#            $2 - file with one archived URL per line
# Outputs:   the last line of $2 that ends with "$1/" or, failing that, the
#            last line that ends with "$1"; nothing when there is no match.
# The link is compared as a literal string (not as a regular expression) and is
# passed through the environment so that awk does not process escapes in it.
function __longnowFindArchivedUrl(){
  [[ -r "$2" ]] || return 0
  LONGNOW_URL="$1" awk '
    function endsWith(text, suffix,    textLength, suffixLength) {
      textLength = length(text)
      suffixLength = length(suffix)
      return suffixLength <= textLength &&
        substr(text, textLength - suffixLength + 1) == suffix
    }
    BEGIN { url = ENVIRON["LONGNOW_URL"] }
    endsWith($0, url "/") { withSlash = $0; next }
    endsWith($0, url)     { plain = $0 }
    END {
      if (withSlash != "") print withSlash
      else if (plain != "") print plain
    }
  ' "$2"
}

# Private helper. Replaces every occurrence of a literal string in a file.
# Arguments: $1 - file to rewrite
#            $2 - literal text to search for
#            $3 - literal replacement text
# Neither string is interpreted as a pattern, so characters such as "&", "."
# or "/" inside URLs are safe. Replaced text is not scanned again.
function __longnowReplaceLiteral(){
  local file="$1"
  local tempFile="$1.tmp"
  LONGNOW_FROM="$2" LONGNOW_TO="$3" awk '
    BEGIN {
      from = ENVIRON["LONGNOW_FROM"]
      to = ENVIRON["LONGNOW_TO"]
      fromLength = length(from)
    }
    {
      out = ""
      rest = $0
      while (fromLength > 0 && (at = index(rest, from)) > 0) {
        out = out substr(rest, 1, at - 1) to
        rest = substr(rest, at + fromLength)
      }
      print out rest
    }
  ' "$file" > "$tempFile" && mv -- "$tempFile" "$file"
}

# Use: Takes a markdown file file.md, extracts all links, finds the unique ones
# and saves them to file.md.links (one per line, in order of first appearance).
# Arguments: $1 - path of the markdown file
# Outputs:   progress messages and a rough time estimate on stdout
function getMdLinks(){
  local mdFile="$1"
  local linksFile="$mdFile.links"
  local numLinesLinkFile totalTimeInMinutes

  echo ""
  echo "Extracting links..."

  # 1st grep: the "](...)" part of markdown links; 2nd grep: the http(s) URL
  # inside it, up to the first ")"; awk: drop duplicates, keep first-seen order.
  grep -Eoi -e '\]\((.*)\)' -- "$mdFile" \
    | grep -Eo '(http|https)://[^)]+' \
    | awk '!seen[$0]++' > "$linksFile"
  ## sed -i 's/www.wikiwand.com\/en/en.wikipedia.org\/wiki/g' $1

  echo "Done."
  numLinesLinkFile=$(wc -l < "$linksFile")
  # Same value as the truncated (n*7.5 + 60*n/15)/60, i.e. 11.5 s per link,
  # computed in integer arithmetic so that bc is not needed.
  totalTimeInMinutes=$(( 23 * numLinesLinkFile / 120 ))
  echo "Expected to take $totalTimeInMinutes mins."
}

# Use: Takes a txt file with one link on each line and pushes all the links to
# the internet archive. Saves the resulting archive links to <file>.archived and
# the links that failed to <file>.errors.
# Arguments: $1 - path of the links file
# Links that already have an entry in <file>.archived are not submitted again.
# References:
# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
# https://github.com/oduwsdl/archivenow
function pushToArchive(){
  local input="$1"
  local archivedLinksFile="$input.archived"
  local errorsFile="$input.errors"
  local counter=1
  local line urlAlreadyContained archiveURL numSecondsSleep

  echo "Pushing to archive.org..."

  ## The archived links are deliberately kept: they are the cache for later runs.
  touch "$archivedLinksFile"

  ## How to deal with errors that arise
  ## The errors file is rewritten on every run so that it always starts with
  ## exactly this 4-line header; longnow relies on that to detect errors.
  {
    echo "If this file contains errors, you can deal with them as follows:"
    echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again"
    echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md"
    echo ""
  } > "$errorsFile"

  ## Main body
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ -n "$line" ]] || continue

    echo "Url: $line"
    urlAlreadyContained=$(__longnowFindArchivedUrl "$line" "$archivedLinksFile")

    if [[ -z "$urlAlreadyContained" ]]; then
      # Pause only right before a real submission; cached links cost nothing.
      if (( counter % 15 == 0 )); then
        printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
        sleep 60
      fi

      # stdin is redirected so the tool cannot swallow the rest of the link list.
      archiveURL=$(archivenow --ia "$line" < /dev/null)
      if [[ -z "$archiveURL" || "$archiveURL" == "Error"* ]]; then
        printf '%s\n' "$line" >> "$errorsFile"
        printf '%s\n' "$archiveURL" >> "$errorsFile"
        echo "" >> "$errorsFile"
        echo "There was an error. See $errorsFile for how to deal with it."
      else
        printf '%s\n' "$archiveURL" >> "$archivedLinksFile"
      fi
      counter=$((counter + 1))
      numSecondsSleep=$((5 + RANDOM % 15))
    else
      archiveURL="$urlAlreadyContained"
      numSecondsSleep=0
    fi

    printf '%s\n' "$archiveURL"
    echo "Sleeping for $numSecondsSleep seconds..."
    sleep "$numSecondsSleep"
    echo ""
  done < "$input"

  echo "Done."
  echo ""
}

# Use: Takes a markdown file file.md and writes file.md.longnow, a copy in which
# every link that has an entry in file.md.links.archived is followed by
# " ([a](archived-url))". file.md itself is left untouched.
# Arguments: $1 - path of the markdown file
# Reads:     $1.links and $1.links.archived
function addArchiveLinksToFile(){
  local originalFile="$1"
  local linksFile="$originalFile.links"
  local archivedLinksFile="$originalFile.links.archived"
  local longNowFile="$originalFile.longnow"
  local url archivedUrl

  echo "Creating longnow file @ $longNowFile..."

  # All edits happen on the copy, so an interrupted run cannot damage the original.
  cp -- "$originalFile" "$longNowFile" || return 1

  while IFS= read -r url || [[ -n "$url" ]]; do
    [[ -n "$url" ]] || continue
    archivedUrl=$(__longnowFindArchivedUrl "$url" "$archivedLinksFile")
    if [[ -n "$archivedUrl" ]]; then
      # The trailing ")" anchors the match to the end of a markdown link target.
      __longnowReplaceLiteral "$longNowFile" "$url)" "$url) ([a]($archivedUrl))"
    fi
  done < "$linksFile"

  echo "Done."
}

# Use: longnow yourfile.md
# Runs the whole pipeline: extract the links, archive them, write
# yourfile.md.longnow, and point to the errors file if any link failed.
# Arguments: $1 - path of the markdown file
# Returns:   1 if the file is missing or archivenow is not installed
function longnow(){
  local mdFile="$1"
  local numLinesErrorFile

  if [[ -z "$mdFile" || ! -f "$mdFile" ]]; then
    echo "Usage: longnow yourfile.md" >&2
    return 1
  fi

  if ! command -v archivenow > /dev/null 2>&1; then
    {
      echo "Required archivenow utility not found in path."
      echo "Install with \$ pip install archivenow"
      echo "(resp. \$ pip3 install archivenow)"
      echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
    } >&2
    return 1
  fi

  getMdLinks "$mdFile"
  pushToArchive "$mdFile.links"
  addArchiveLinksToFile "$mdFile"

  # The errors file always starts with a 4-line header; anything beyond that
  # is a link that could not be archived.
  numLinesErrorFile=$(wc -l < "$mdFile.links.errors")
  if (( numLinesErrorFile > 4 )); then
    echo "It seems that there are errors. To view and deal with them, see the $mdFile.links.errors file"
  fi
}

longnow "$1" ## don't copy this line into your .bashrc file