feat: Use archive.org snapshot if it already exists.

This commit is contained in:
NunoSempere 2022-03-05 10:50:30 -05:00
parent 26b83deb8e
commit d8515b22df
2 changed files with 58 additions and 30 deletions

View File

@ -4,7 +4,6 @@ I use it to archive links in [this forecasting newsletter](https://forecasting.s
> Note to the future: All links are added automatically to the Internet Archive, using this [tool](https://github.com/NunoSempere/longNowForMd) ([a](https://web.archive.org/web/20220109144543/https://github.com/NunoSempere/longNowForMd)). "(a)" for archived links was inspired by [Milan Griffes](https://www.flightfromperfection.com/) ([a](https://web.archive.org/web/20220109144604/https://www.flightfromperfection.com/)), [Andrew Zuckerman](https://www.andzuck.com/) ([a](https://web.archive.org/web/20211202120912/https://www.andzuck.com/)), and [Alexey Guzey](https://guzey.com/) ([a](https://web.archive.org/web/20220109144733/https://guzey.com/)). > Note to the future: All links are added automatically to the Internet Archive, using this [tool](https://github.com/NunoSempere/longNowForMd) ([a](https://web.archive.org/web/20220109144543/https://github.com/NunoSempere/longNowForMd)). "(a)" for archived links was inspired by [Milan Griffes](https://www.flightfromperfection.com/) ([a](https://web.archive.org/web/20220109144604/https://www.flightfromperfection.com/)), [Andrew Zuckerman](https://www.andzuck.com/) ([a](https://web.archive.org/web/20211202120912/https://www.andzuck.com/)), and [Alexey Guzey](https://guzey.com/) ([a](https://web.archive.org/web/20220109144733/https://guzey.com/)).
## How to install ## How to install
Add [this file](https://github.com/NunoSempere/longNowForMd/blob/master/longnow) to your path, for instance by moving it to the `/usr/bin` folder and giving it execute permissions (with `chmod 755 longnow`) Add [this file](https://github.com/NunoSempere/longNowForMd/blob/master/longnow) to your path, for instance by moving it to the `/usr/bin` folder and giving it execute permissions (with `chmod 755 longnow`)
@ -21,6 +20,16 @@ In addition, this utility requires [archivenow](https://github.com/oduwsdl/archi
pip install archivenow ## respectively, pip3 pip install archivenow ## respectively, pip3
``` ```
It also requires [jq](https://stedolan.github.io/jq/download/), which can be installed as:
```
sudo apt install jq
```
on Debian-based systems, or with your distribution's package manager otherwise.
As of the newest iteration of this program, if archive.org already has a snapshot of the page, that snapshot is used instead of requesting a new one. This saves a great deal of time, but it can mean that a less up-to-date copy is used. If this behavior is not desired, it can be removed manually by deleting the lines around `if [ "$urlAlreadyInArchiveOnline" == "" ]; then`.
## How to use ## How to use
``` ```

View File

@ -13,7 +13,7 @@ workdir="longnow-$root"
## Move to work dir ## Move to work dir
function moveToWorkDir(){ function moveToWorkDir(){
mkdir -p "$workdir" mkdir -p "$workdir"
cp "$input" "$workdir/$input" cp "$input" "$workdir/$input"
cd "$workdir" cd "$workdir"
} }
@ -44,7 +44,7 @@ function pushToArchive(){
numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }') numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc) totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
echo "Expected to take ~$totalTimeInMinutes mins." echo "Expected to take ~$totalTimeInMinutes mins."
echo "" echo ""
## rm -f "$archivedLinks" ## rm -f "$archivedLinks"
rm -f "$errors" rm -f "$errors"
@ -67,27 +67,39 @@ function pushToArchive(){
sleep 1m sleep 1m
fi fi
echo "Url: $line" echo "Url: $line"
urlAlreadyContained=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 ) urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )
if [ "$urlAlreadyContained" == "" ]; then
archiveURL=$(archivenow --ia $line) if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then
if [[ "$archiveURL" == "Error"* ]]; then urlAlreadyInArchiveOnline="$(curl --silent http://archive.org/wayback/available?url=$line | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g' )"
echo "$line" >> "$errors" if [ "$urlAlreadyInArchiveOnline" == "" ]; then
echo "$archiveURL" >> "$errors" echo "Sending to archive..."
echo "" >> "$errors" archiveURL=$(archivenow --ia $line)
echo "There was an error. See $errors for how to deal with it." if [[ "$archiveURL" == "Error"* ]]; then
echo "$line" >> "$errors"
echo "$archiveURL" >> "$errors"
echo "" >> "$errors"
echo "There was an error. See $errors for how to deal with it."
echo ""
else
echo "$archiveURL" >> "$archivedLinks"
fi
counter=$((counter+1))
numSecondsSleep=$((5+ ($RANDOM%15)))
else else
echo "$archiveURL" >> "$archivedLinks" echo "Already in archive.org: $urlAlreadyInArchiveOnline"
echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
echo ""
numSecondsSleep=0
fi fi
counter=$((counter+1)) elif [ ! -z "$urlAlreadyContainedInLocalArchivedLinks" ]; then
numSecondsSleep=$((5+ ($RANDOM%15))) echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
else archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
archiveURL="$urlAlreadyContained"
numSecondsSleep=0 numSecondsSleep=0
# echo $archiveURL
echo "Sleeping for $numSecondsSleep seconds..."
sleep $numSecondsSleep
echo ""
fi fi
echo $archiveURL
echo "Sleeping for $numSecondsSleep seconds..."
sleep $numSecondsSleep
echo ""
done < "$links" done < "$links"
echo "Done pushing links to archive.org" echo "Done pushing links to archive.org"
@ -121,13 +133,18 @@ function addArchiveLinksToFile(){
} }
## Explain installation ## Explain installation
function explainInstallation(){ function explainArchiveNowInstallation(){
echo "Required archivenow utility not found in path." echo "Required archivenow utility not found in path."
echo "Install with \$ pip install archivenow" echo "Install with \$ pip install archivenow"
echo "(resp. \$ pip3 install archivenow)" echo "(resp. \$ pip3 install archivenow)"
echo "Or follow instructions on https://github.com/oduwsdl/archivenow" echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
} }
## Explain how to install jq
## Prints three lines to stdout: a notice that jq was not found in the
## path, a package-manager install hint, and the upstream download page.
function explainJqInstallation(){
  # One printf call emits each argument on its own line.
  printf '%s\n' \
    "Required jq utility not found in path." \
    "Install with your package manager, e.g., \$ sudo apt install jq" \
    "Or follow instructions on https://stedolan.github.io/jq/download/"
}
## Report errors ## Report errors
function reportErrors(){ function reportErrors(){
numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }') numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
@ -145,16 +162,18 @@ function cleanup(){
## Main ## Main
function main(){ function main(){
doesArchiveNowExist="$(whereis "archivenow")" doesArchiveNowExist="$(whereis "archivenow")"
if [ "$doesArchiveNowExist" == "archivenow:" ] doesJqExist="$(whereis "jq")"
then if [ "$doesArchiveNowExist" == "archivenow:" ]; then
explainInstallation explainArchiveNowInstallation
elif [ "$doesJqExist" == "jq:" ]; then
explainJqInstallation
else else
moveToWorkDir moveToWorkDir
extractMarkdownLinks extractMarkdownLinks
pushToArchive pushToArchive
addArchiveLinksToFile addArchiveLinksToFile
reportErrors reportErrors
cleanup cleanup
fi fi
} }
main main