feat: Use archive.org snapshot if it already exists.
This commit is contained in:
parent
26b83deb8e
commit
d8515b22df
11
README.md
11
README.md
|
@ -4,7 +4,6 @@ I use it to archive links in [this forecasting newsletter](https://forecasting.s
|
||||||
|
|
||||||
> Note to the future: All links are added automatically to the Internet Archive, using this [tool](https://github.com/NunoSempere/longNowForMd) ([a](https://web.archive.org/web/20220109144543/https://github.com/NunoSempere/longNowForMd)). "(a)" for archived links was inspired by [Milan Griffes](https://www.flightfromperfection.com/) ([a](https://web.archive.org/web/20220109144604/https://www.flightfromperfection.com/)), [Andrew Zuckerman](https://www.andzuck.com/) ([a](https://web.archive.org/web/20211202120912/https://www.andzuck.com/)), and [Alexey Guzey](https://guzey.com/) ([a](https://web.archive.org/web/20220109144733/https://guzey.com/)).
|
> Note to the future: All links are added automatically to the Internet Archive, using this [tool](https://github.com/NunoSempere/longNowForMd) ([a](https://web.archive.org/web/20220109144543/https://github.com/NunoSempere/longNowForMd)). "(a)" for archived links was inspired by [Milan Griffes](https://www.flightfromperfection.com/) ([a](https://web.archive.org/web/20220109144604/https://www.flightfromperfection.com/)), [Andrew Zuckerman](https://www.andzuck.com/) ([a](https://web.archive.org/web/20211202120912/https://www.andzuck.com/)), and [Alexey Guzey](https://guzey.com/) ([a](https://web.archive.org/web/20220109144733/https://guzey.com/)).
|
||||||
|
|
||||||
|
|
||||||
## How to install
|
## How to install
|
||||||
Add [this file](https://github.com/NunoSempere/longNowForMd/blob/master/longnow) to your path, for instance by moving it to the `/usr/bin` folder and giving it execute permissions (with `chmod 755 longnow`).
|
Add [this file](https://github.com/NunoSempere/longNowForMd/blob/master/longnow) to your path, for instance by moving it to the `/usr/bin` folder and giving it execute permissions (with `chmod 755 longnow`).
|
||||||
|
|
||||||
|
@ -21,6 +20,16 @@ In addition, this utility requires [archivenow](https://github.com/oduwsdl/archi
|
||||||
pip install archivenow ## respectively, pip3
|
pip install archivenow ## respectively, pip3
|
||||||
```
|
```
|
||||||
|
|
||||||
|
It also requires [jq](https://stedolan.github.io/jq/download/), which can be installed as:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo apt install jq
|
||||||
|
```
|
||||||
|
|
||||||
|
if on Debian, or using your distribution's package manager otherwise.
|
||||||
|
|
||||||
|
As of the newest iteration of this program, if archive.org already has a snapshot of the page, that existing snapshot is used instead of requesting a new one. This saves a great deal of time, but it may mean that a less up-to-date copy is used. If this behavior is not desired, it can easily be removed by hand, by deleting the lines around `if [ "$urlAlreadyInArchiveOnline" == "" ]; then`.
|
||||||
|
|
||||||
## How to use
|
## How to use
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -13,7 +13,7 @@ workdir="longnow-$root"
|
||||||
## Move to work dir
|
## Move to work dir
|
||||||
function moveToWorkDir(){
|
function moveToWorkDir(){
|
||||||
mkdir -p "$workdir"
|
mkdir -p "$workdir"
|
||||||
cp "$input" "$workdir/$input"
|
cp "$input" "$workdir/$input"
|
||||||
cd "$workdir"
|
cd "$workdir"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ function pushToArchive(){
|
||||||
numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
|
numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
|
||||||
totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
|
totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
|
||||||
echo "Expected to take ~$totalTimeInMinutes mins."
|
echo "Expected to take ~$totalTimeInMinutes mins."
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
## rm -f "$archivedLinks"
|
## rm -f "$archivedLinks"
|
||||||
rm -f "$errors"
|
rm -f "$errors"
|
||||||
|
@ -67,27 +67,39 @@ function pushToArchive(){
|
||||||
sleep 1m
|
sleep 1m
|
||||||
fi
|
fi
|
||||||
echo "Url: $line"
|
echo "Url: $line"
|
||||||
urlAlreadyContained=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )
|
urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )
|
||||||
if [ "$urlAlreadyContained" == "" ]; then
|
|
||||||
archiveURL=$(archivenow --ia $line)
|
if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then
|
||||||
if [[ "$archiveURL" == "Error"* ]]; then
|
urlAlreadyInArchiveOnline="$(curl --silent http://archive.org/wayback/available?url=$line | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g' )"
|
||||||
echo "$line" >> "$errors"
|
if [ "$urlAlreadyInArchiveOnline" == "" ]; then
|
||||||
echo "$archiveURL" >> "$errors"
|
echo "Sending to archive..."
|
||||||
echo "" >> "$errors"
|
archiveURL=$(archivenow --ia $line)
|
||||||
echo "There was an error. See $errors for how to deal with it."
|
if [[ "$archiveURL" == "Error"* ]]; then
|
||||||
|
echo "$line" >> "$errors"
|
||||||
|
echo "$archiveURL" >> "$errors"
|
||||||
|
echo "" >> "$errors"
|
||||||
|
echo "There was an error. See $errors for how to deal with it."
|
||||||
|
echo ""
|
||||||
|
else
|
||||||
|
echo "$archiveURL" >> "$archivedLinks"
|
||||||
|
fi
|
||||||
|
counter=$((counter+1))
|
||||||
|
numSecondsSleep=$((5+ ($RANDOM%15)))
|
||||||
else
|
else
|
||||||
echo "$archiveURL" >> "$archivedLinks"
|
echo "Already in archive.org: $urlAlreadyInArchiveOnline"
|
||||||
|
echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
|
||||||
|
echo ""
|
||||||
|
numSecondsSleep=0
|
||||||
fi
|
fi
|
||||||
counter=$((counter+1))
|
elif [ ! -z "$urlAlreadyContainedInLocalArchivedLinks" ]; then
|
||||||
numSecondsSleep=$((5+ ($RANDOM%15)))
|
echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
|
||||||
else
|
archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
|
||||||
archiveURL="$urlAlreadyContained"
|
|
||||||
numSecondsSleep=0
|
numSecondsSleep=0
|
||||||
|
# echo $archiveURL
|
||||||
|
echo "Sleeping for $numSecondsSleep seconds..."
|
||||||
|
sleep $numSecondsSleep
|
||||||
|
echo ""
|
||||||
fi
|
fi
|
||||||
echo $archiveURL
|
|
||||||
echo "Sleeping for $numSecondsSleep seconds..."
|
|
||||||
sleep $numSecondsSleep
|
|
||||||
echo ""
|
|
||||||
done < "$links"
|
done < "$links"
|
||||||
|
|
||||||
echo "Done pushing links to archive.org"
|
echo "Done pushing links to archive.org"
|
||||||
|
@ -121,13 +133,18 @@ function addArchiveLinksToFile(){
|
||||||
}
|
}
|
||||||
|
|
||||||
## Explain installation
|
## Explain installation
|
||||||
function explainInstallation(){
|
function explainArchiveNowInstallation(){
|
||||||
echo "Required archivenow utility not found in path."
|
echo "Required archivenow utility not found in path."
|
||||||
echo "Install with \$ pip install archivenow"
|
echo "Install with \$ pip install archivenow"
|
||||||
echo "(resp. \$ pip3 install archivenow)"
|
echo "(resp. \$ pip3 install archivenow)"
|
||||||
echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
|
echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function explainJqInstallation(){
|
||||||
|
echo "Required jq utility not found in path."
|
||||||
|
echo "Install with your package manager, e.g., \$ sudo apt install jq"
|
||||||
|
echo "Or follow instructions on https://stedolan.github.io/jq/download/"
|
||||||
|
}
|
||||||
## Report errors
|
## Report errors
|
||||||
function reportErrors(){
|
function reportErrors(){
|
||||||
numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
|
numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
|
||||||
|
@ -145,16 +162,18 @@ function cleanup(){
|
||||||
## Main
|
## Main
|
||||||
function main(){
|
function main(){
|
||||||
doesArchiveNowExist="$(whereis "archivenow")"
|
doesArchiveNowExist="$(whereis "archivenow")"
|
||||||
if [ "$doesArchiveNowExist" == "archivenow:" ]
|
doesJqExist="$(whereis "jq")"
|
||||||
then
|
if [ "$doesArchiveNowExist" == "archivenow:" ]; then
|
||||||
explainInstallation
|
explainArchiveNowInstallation
|
||||||
|
elif [ "$doesJqExist" == "jq:" ]; then
|
||||||
|
explainJqInstallation
|
||||||
else
|
else
|
||||||
moveToWorkDir
|
moveToWorkDir
|
||||||
extractMarkdownLinks
|
extractMarkdownLinks
|
||||||
pushToArchive
|
pushToArchive
|
||||||
addArchiveLinksToFile
|
addArchiveLinksToFile
|
||||||
reportErrors
|
reportErrors
|
||||||
cleanup
|
cleanup
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
main
|
main
|
Loading…
Reference in New Issue
Block a user