From d8515b22df3eab2d9c0940a21b85c2064a2756d7 Mon Sep 17 00:00:00 2001 From: NunoSempere Date: Sat, 5 Mar 2022 10:50:30 -0500 Subject: [PATCH] feat: Use archive.org snapshot if it already exists. --- README.md | 11 ++++++- longnow => longnow.sh | 77 +++++++++++++++++++++++++++---------------- 2 files changed, 58 insertions(+), 30 deletions(-) rename longnow => longnow.sh (66%) diff --git a/README.md b/README.md index 31e359d..c9383f5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ I use it to archive links in [this forecasting newsletter](https://forecasting.s > Note to the future: All links are added automatically to the Internet Archive, using this [tool](https://github.com/NunoSempere/longNowForMd) ([a](https://web.archive.org/web/20220109144543/https://github.com/NunoSempere/longNowForMd)). "(a)" for archived links was inspired by [Milan Griffes](https://www.flightfromperfection.com/) ([a](https://web.archive.org/web/20220109144604/https://www.flightfromperfection.com/)), [Andrew Zuckerman](https://www.andzuck.com/) ([a](https://web.archive.org/web/20211202120912/https://www.andzuck.com/)), and [Alexey Guzey](https://guzey.com/) ([a](https://web.archive.org/web/20220109144733/https://guzey.com/)). - ## How to install Add [this file](https://github.com/NunoSempere/longNowForMd/blob/master/longnow) to your path, for instance by moving it to the `/usr/bin` folder and giving it execute permissions (with `chmod 755 longnow`) @@ -21,6 +20,16 @@ In addition, this utility requires [archivenow](https://github.com/oduwsdl/archi pip install archivenow ## respectively, pip3 ``` +It also requires [jq](https://stedolan.github.io/jq/download/), which can be installed as: + +``` +sudo apt install jq +``` + +if on Debian, or using your distribution's package manager otherwise. + +As of the newest iteration of this program, if archive.org already has a snapshot of the page, that snapshot is taken instead. This results in massive time savings, but could imply that a less up to date copy is used. If this behavior is not desired, it can be easily excised manually, by removing the lines around `if [ "$urlAlreadyInArchiveOnline" == "" ]; then`. + ## How to use ``` diff --git a/longnow b/longnow.sh similarity index 66% rename from longnow rename to longnow.sh index a1a9939..c8b1a25 100755 --- a/longnow +++ b/longnow.sh @@ -13,7 +13,7 @@ workdir="longnow-$root" ## Move to work dir function moveToWorkDir(){ mkdir -p "$workdir" - cp "$input" "$workdir/$input" + cp "$input" "$workdir/$input" cd "$workdir" } @@ -44,7 +44,7 @@ function pushToArchive(){ numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }') totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc) echo "Expected to take ~$totalTimeInMinutes mins." - echo "" + echo "" ## rm -f "$archivedLinks" rm -f "$errors" @@ -67,27 +67,39 @@ function pushToArchive(){ sleep 1m fi echo "Url: $line" - urlAlreadyContained=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 ) - if [ "$urlAlreadyContained" == "" ]; then - archiveURL=$(archivenow --ia $line) - if [[ "$archiveURL" == "Error"* ]]; then - echo "$line" >> "$errors" - echo "$archiveURL" >> "$errors" - echo "" >> "$errors" - echo "There was an error. See $errors for how to deal with it." + urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 ) + + if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then + urlAlreadyInArchiveOnline="$(curl --silent http://archive.org/wayback/available?url=$line | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g' )" + if [ "$urlAlreadyInArchiveOnline" == "" ]; then + echo "Sending to archive..." + archiveURL=$(archivenow --ia $line) + if [[ "$archiveURL" == "Error"* ]]; then + echo "$line" >> "$errors" + echo "$archiveURL" >> "$errors" + echo "" >> "$errors" + echo "There was an error. See $errors for how to deal with it." + echo "" + else + echo "$archiveURL" >> "$archivedLinks" + fi + counter=$((counter+1)) + numSecondsSleep=$((5+ ($RANDOM%15))) else - echo "$archiveURL" >> "$archivedLinks" + echo "Already in archive.org: $urlAlreadyInArchiveOnline" + echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks" + echo "" + numSecondsSleep=0 fi - counter=$((counter+1)) - numSecondsSleep=$((5+ ($RANDOM%15))) - else - archiveURL="$urlAlreadyContained" + elif [ ! -z "$urlAlreadyContainedInLocalArchivedLinks" ]; then + echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks" + archiveURL="$urlAlreadyContainedInLocalArchivedLinks" numSecondsSleep=0 + # echo $archiveURL + echo "Sleeping for $numSecondsSleep seconds..." + sleep $numSecondsSleep + echo "" fi - echo $archiveURL - echo "Sleeping for $numSecondsSleep seconds..." - sleep $numSecondsSleep - echo "" done < "$links" echo "Done pushing links to archive.org" @@ -121,13 +133,18 @@ function addArchiveLinksToFile(){ } ## Explain installation -function explainInstallation(){ +function explainArchiveNowInstallation(){ echo "Required archivenow utility not found in path." echo "Install with \$ pip install archivenow" echo "(resp. \$ pip3 install archivenow)" echo "Or follow instructions on https://github.com/oduwsdl/archivenow" } +function explainJqInstallation(){ + echo "Required jq utility not found in path." + echo "Install with your package manager, e.g., \$ sudo apt install jq" + echo "Or follow instructions on https://stedolan.github.io/jq/download/" +} ## Report errors function reportErrors(){ numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }') @@ -145,16 +162,18 @@ function cleanup(){ ## Main function main(){ doesArchiveNowExist="$(whereis "archivenow")" - if [ "$doesArchiveNowExist" == "archivenow:" ] - then - explainInstallation + doesJqExist="$(whereis "jq")" + if [ "$doesArchiveNowExist" == "archivenow:" ]; then + explainArchiveNowInstallation + elif [ "$doesJqExist" == "jq:" ]; then + explainJqInstallation else - moveToWorkDir - extractMarkdownLinks - pushToArchive - addArchiveLinksToFile - reportErrors - cleanup + moveToWorkDir + extractMarkdownLinks + pushToArchive + addArchiveLinksToFile + reportErrors + cleanup fi } main