feat: Use archive.org snapshot if it already exists.

This commit is contained in:
NunoSempere 2022-03-05 10:50:30 -05:00
parent 26b83deb8e
commit d8515b22df
2 changed files with 58 additions and 30 deletions

View File

@ -4,7 +4,6 @@ I use it to archive links in [this forecasting newsletter](https://forecasting.s
> Note to the future: All links are added automatically to the Internet Archive, using this [tool](https://github.com/NunoSempere/longNowForMd) ([a](https://web.archive.org/web/20220109144543/https://github.com/NunoSempere/longNowForMd)). "(a)" for archived links was inspired by [Milan Griffes](https://www.flightfromperfection.com/) ([a](https://web.archive.org/web/20220109144604/https://www.flightfromperfection.com/)), [Andrew Zuckerman](https://www.andzuck.com/) ([a](https://web.archive.org/web/20211202120912/https://www.andzuck.com/)), and [Alexey Guzey](https://guzey.com/) ([a](https://web.archive.org/web/20220109144733/https://guzey.com/)).
## How to install
Add [this file](https://github.com/NunoSempere/longNowForMd/blob/master/longnow) to your path, for instance by moving it to the `/usr/bin` folder and giving it execute permissions (with `chmod 755 longnow`).
@ -21,6 +20,16 @@ In addition, this utility requires [archivenow](https://github.com/oduwsdl/archi
pip install archivenow ## respectively, pip3
```
It also requires [jq](https://stedolan.github.io/jq/download/), which can be installed as:
```
sudo apt install jq
```
if on Debian, or using your distribution's package manager otherwise.
As of the newest iteration of this program, if archive.org already has a snapshot of the page, that snapshot is used instead of requesting a new one. This results in massive time savings, but it can mean that a less up-to-date copy is used. If this behavior is not desired, it can easily be removed manually by deleting the lines around `if [ "$urlAlreadyInArchiveOnline" == "" ]; then`.
## How to use
```

View File

@ -13,7 +13,7 @@ workdir="longnow-$root"
## Move to work dir
## Creates "$workdir" if it does not exist, copies "$input" into it, and
## changes the current directory to it.
## Globals read: workdir, input.
## Exits with status 1 (message on stderr) if any step fails, so that the
## following steps never run in the wrong directory.
function moveToWorkDir(){
  mkdir -p -- "$workdir" || {
    echo "Could not create work dir: $workdir" >&2
    exit 1
  }
  # Copy once; the input file itself is left untouched.
  cp -- "$input" "$workdir/$input" || {
    echo "Could not copy $input into $workdir" >&2
    exit 1
  }
  cd -- "$workdir" || {
    echo "Could not enter work dir: $workdir" >&2
    exit 1
  }
}
@ -44,7 +44,7 @@ function pushToArchive(){
numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
echo "Expected to take ~$totalTimeInMinutes mins."
echo ""
echo ""
## rm -f "$archivedLinks"
rm -f "$errors"
@ -67,27 +67,39 @@ function pushToArchive(){
sleep 1m
fi
echo "Url: $line"
urlAlreadyContained=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )
if [ "$urlAlreadyContained" == "" ]; then
archiveURL=$(archivenow --ia $line)
if [[ "$archiveURL" == "Error"* ]]; then
echo "$line" >> "$errors"
echo "$archiveURL" >> "$errors"
echo "" >> "$errors"
echo "There was an error. See $errors for how to deal with it."
urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )
if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then
urlAlreadyInArchiveOnline="$(curl --silent http://archive.org/wayback/available?url=$line | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g' )"
if [ "$urlAlreadyInArchiveOnline" == "" ]; then
echo "Sending to archive..."
archiveURL=$(archivenow --ia $line)
if [[ "$archiveURL" == "Error"* ]]; then
echo "$line" >> "$errors"
echo "$archiveURL" >> "$errors"
echo "" >> "$errors"
echo "There was an error. See $errors for how to deal with it."
echo ""
else
echo "$archiveURL" >> "$archivedLinks"
fi
counter=$((counter+1))
numSecondsSleep=$((5+ ($RANDOM%15)))
else
echo "$archiveURL" >> "$archivedLinks"
echo "Already in archive.org: $urlAlreadyInArchiveOnline"
echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
echo ""
numSecondsSleep=0
fi
counter=$((counter+1))
numSecondsSleep=$((5+ ($RANDOM%15)))
else
archiveURL="$urlAlreadyContained"
elif [ ! -z "$urlAlreadyContainedInLocalArchivedLinks" ]; then
echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
numSecondsSleep=0
# echo $archiveURL
echo "Sleeping for $numSecondsSleep seconds..."
sleep $numSecondsSleep
echo ""
fi
echo $archiveURL
echo "Sleeping for $numSecondsSleep seconds..."
sleep $numSecondsSleep
echo ""
done < "$links"
echo "Done pushing links to archive.org"
@ -121,13 +133,18 @@ function addArchiveLinksToFile(){
}
## Explain installation
## Prints instructions for installing the archivenow utility to stdout.
function explainArchiveNowInstallation(){
  echo "Required archivenow utility not found in path."
  echo "Install with \$ pip install archivenow"
  echo "(resp. \$ pip3 install archivenow)"
  echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
}
## Previous name of explainArchiveNowInstallation, kept so that callers
## using the old name still work.
function explainInstallation(){
  explainArchiveNowInstallation
}
## Prints instructions for installing the jq utility to stdout.
function explainJqInstallation(){
  # Single-quoted so the "$" prompt marker is printed literally.
  printf '%s\n' \
    'Required jq utility not found in path.' \
    'Install with your package manager, e.g., $ sudo apt install jq' \
    'Or follow instructions on https://stedolan.github.io/jq/download/'
}
## Report errors
function reportErrors(){
numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
@ -145,16 +162,18 @@ function cleanup(){
## Main
## Entry point. Checks that the archivenow and jq commands are available;
## if so, runs the steps in order: moveToWorkDir, extractMarkdownLinks,
## pushToArchive, addArchiveLinksToFile, reportErrors, cleanup.
## If a dependency is missing, prints the installation hints for every
## missing tool and returns 1 without running any step.
function main(){
  local missingDependency=0

  # command -v looks the name up the same way the shell will when it is
  # invoked, instead of comparing the text printed by whereis.
  if ! command -v archivenow > /dev/null 2>&1; then
    explainArchiveNowInstallation
    missingDependency=1
  fi
  if ! command -v jq > /dev/null 2>&1; then
    explainJqInstallation
    missingDependency=1
  fi
  if [ "$missingDependency" -ne 0 ]; then
    return 1
  fi

  moveToWorkDir
  extractMarkdownLinks
  pushToArchive
  addArchiveLinksToFile
  reportErrors
  cleanup
}
main