#!/usr/bin/env bash
# longnow: push every link in a markdown file to the Internet Archive and
# append the archive snapshot link next to each original link.
# Usage: longnow yourfile.md

# Filenames
input="$1"
# Strip only a trailing ".md". The previous `sed 's/.md//g'` was buggy: the
# unescaped '.' matched any character and /g removed every occurrence, so
# names like "xmdy.md" collapsed to "y".
root="${input%.md}"
links="$root.links.txt"                    # extracted urls, one per line
archivedLinks="$root.links.archived.txt"   # archive.org snapshot urls
errors="$root.errors.txt"                  # failing urls + recovery hints
output="$root.longnow.md"                  # final annotated markdown

## Directories
initialDir="$(pwd)"        # where to return and drop the output at the end
workdir="longnow-$root"    # scratch directory for intermediate files
## Move to work dir
function moveToWorkDir() {
  # Create the scratch directory, copy the input file into it, and cd there.
  # All later steps operate on relative paths inside $workdir.
  mkdir -p "$workdir"
  cp "$input" "$workdir/$input"
  cd "$workdir" || exit 1   # abort rather than clobber files in the wrong dir
}
## Extract markdown links
function extractMarkdownLinks() {
  # Use: takes the markdown file $input, extracts all http(s) links that
  # appear inside markdown link targets "](...)", de-duplicates them while
  # preserving first-seen order, and saves them to $links.
  local links2="$root.links2.txt"   # temp file for the dedup pass

  echo ""
  echo "Extracting links..."

  rm -f "$links"
  touch "$links"   # ensure the file exists even when no links are found
  grep -Eoi '\]\((.*)\)' "$input" | grep -Eo '(http|https)://[^)]+' >> "$links"
  # awk prints each line only the first time it is seen (order-preserving dedup).
  awk '!seen[$0]++' "$links" > "$links2" && mv "$links2" "$links"
  echo "Done extracting links"
}
## Push to Archive
function pushToArchive() {
  # Use: takes a txt file ($links) with one link per line and pushes every
  # link to the Internet Archive. Snapshot urls are appended to
  # $archivedLinks; failures are appended to $errors with recovery hints.
  # References:
  # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
  # https://github.com/oduwsdl/archivenow

  echo ""
  echo "Pushing to archive.org..."
  numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
  # Rough estimate: ~7.5s per link plus the forced 1min pause every 15 links.
  totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
  echo "Expected to take ~$totalTimeInMinutes mins."
  echo ""

  ## Keep $archivedLinks between runs so already-archived links are remembered.
  rm -f "$errors"
  touch "$archivedLinks"
  touch "$errors"

  ## How to deal with errors that arise (4-line header; reportErrors counts on it)
  echo "If this file contains errors, you can deal with them as follows:" >> "$errors"
  echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errors"
  echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors"
  echo "" >> "$errors"

  ## Main body
  counter=1
  while IFS= read -r line
  do
    # archive.org rate-limits submissions; pause every 15 links.
    if [ $((counter % 15)) -eq 0 ]; then
      printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
      sleep 1m
    fi
    echo "Url: $line"
    # A snapshot line ends with the original url, optionally plus a trailing /.
    urlAlreadyContainedInLocalArchivedLinks=$( (grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks") | tail -1)
    if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then
      # Not in the local cache; ask the Wayback availability API.
      urlAlreadyInArchiveOnline="$(curl --silent "http://archive.org/wayback/available?url=$line" | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g')"
      if [ "$urlAlreadyInArchiveOnline" == "" ]; then
        echo "Sending to archive..."
        archiveURL=$(archivenow --ia "$line")
        if [[ "$archiveURL" == Error* ]]; then
          echo "$line" >> "$errors"
          echo "$archiveURL" >> "$errors"
          echo "" >> "$errors"
          echo "There was an error. See $errors for how to deal with it."
          echo ""
        else
          echo "$archiveURL" >> "$archivedLinks"
        fi
        counter=$((counter + 1))
        # Random pause so submissions are not sent in a tight burst.
        numSecondsSleep=$((5 + (RANDOM % 15)))
      else
        echo "Already in archive.org: $urlAlreadyInArchiveOnline"
        echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
        echo ""
        numSecondsSleep=0
      fi
    else
      echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
      archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
      numSecondsSleep=0
    fi
    # NOTE(fix): the sleep used to sit inside the local-archive branch only,
    # so freshly pushed links never got their random pause. It now runs for
    # every url (0 seconds when nothing was submitted).
    echo "Sleeping for $numSecondsSleep seconds..."
    sleep "$numSecondsSleep"
    echo ""
  done < "$links"

  echo "Done pushing links to archive.org"
  echo ""
}
## Add archive links to file
function addArchiveLinksToFile() {
  # Walks $links and, for every url that has a snapshot recorded in
  # $archivedLinks, rewrites $output so each "url)" occurrence is followed
  # by " ([a](snapshot-url))".
  echo "Creating longnow file at $output"

  rm -f "$output"
  cp "$input" "$output"

  while IFS= read -r url
  do
    # A snapshot line ends with the original url (optionally with a trailing /).
    archivedUrl=$( (grep "$url$" "$archivedLinks"; grep "$url/$" "$archivedLinks") | tail -1)
    if [ "$archivedUrl" != "" ]; then
      # Escape '/' so the urls can be embedded in the sed expression.
      # NOTE(review): other regex metacharacters in urls are left unescaped,
      # as in the original — urls containing e.g. '&' may misbehave.
      urlForSed="${url//\//\\/}"
      archiveUrlForSed="${archivedUrl//\//\\/}"
      sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$output"
    fi
  done < "$links"
  echo "Done."
}
## Explain installation
function explainArchiveNowInstallation() {
  # Printed when the required archivenow CLI is not on PATH.
  echo "Required archivenow utility not found in path."
  echo "Install with \$ pip install archivenow"
  echo "(resp. \$ pip3 install archivenow)"
  echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
}
function explainJqInstallation() {
  # Printed when the required jq CLI is not on PATH.
  echo "Required jq utility not found in path."
  echo "Install with your package manager, e.g., \$ sudo apt install jq"
  echo "Or follow instructions on https://stedolan.github.io/jq/download/"
}
## Report errors
function reportErrors() {
  # The error file always starts with a 4-line help header (written by
  # pushToArchive); more than 4 lines means at least one real error.
  [ -f "$errors" ] || return 0   # nothing to report if the file is missing
  numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
  if [ "$numLinesErrorFile" -gt 4 ]; then
    echo "It seems that there are errors. To view and deal with them, see the $errors file"
  fi
}
## Clean up
function cleanup() {
  # Copy the generated file out of $workdir and return to the directory
  # the script was invoked from.
  cp "$output" "../$output"
  cd "$initialDir" || exit 1
}
## Main
function main() {
  # Verify external dependencies, then run the pipeline:
  # extract links -> push to archive.org -> annotate the markdown -> clean up.
  # `command -v` is the portable existence check; the old `whereis` string
  # comparison depended on platform-specific output formatting.
  if ! command -v archivenow > /dev/null 2>&1; then
    explainArchiveNowInstallation
  elif ! command -v jq > /dev/null 2>&1; then
    explainJqInstallation
  else
    moveToWorkDir
    extractMarkdownLinks
    pushToArchive
    addArchiveLinksToFile
    reportErrors
    cleanup
  fi
}

main