#!/bin/bash
# longNowForMd/longnow.sh
# Bash script, 181 lines, 5.8 KiB
# Filenames
input="$1"
root="$(echo "$input" | sed 's/.md//g' )"
links="$root.links.txt"
archivedLinks="$root.links.archived.txt"
errors="$root.errors.txt"
output="$root.longnow.md"
2022-01-12 01:29:11 +00:00
## Directories
initialDir="$(pwd)"
workdir="longnow-$root"
## Move to work dir
# Creates $workdir, copies $input into it and makes it the current directory.
# Globals: workdir, input (read).
# Exits the script with status 1 if any step fails: the later steps create and
# delete files relative to the current directory, so carrying on from the wrong
# place would touch files outside the work dir.
function moveToWorkDir(){
  mkdir -p -- "$workdir" || {
    echo "Could not create directory $workdir" >&2
    exit 1
  }
  cp -- "$input" "$workdir/$input" || {
    echo "Could not copy $input into $workdir" >&2
    exit 1
  }
  cd -- "$workdir" || {
    echo "Could not change directory to $workdir" >&2
    exit 1
  }
}
## Extract markdown links
# Use: reads the markdown file $input, pulls out every http(s) URL found inside
# a "](...)" span, drops repeated URLs (the first occurrence keeps its place)
# and writes the result, one URL per line, to $links. Any previous $links file
# is replaced.
function extractMarkdownLinks(){
  printf '\n%s\n' "Extracting links..."

  rm -f "$links"
  # Stage 1 isolates the "](...)" spans, stage 2 keeps the URLs inside them,
  # stage 3 removes duplicates while preserving order.
  grep -Eoi '\]\((.*)\)' "$input" \
    | grep -Eo '(http|https)://[^)]+' \
    | awk '!seen[$0]++' > "$links"

  echo "Done extracting links"
}
2022-01-12 01:29:11 +00:00
## Push to Archive
# Use: Takes a txt file with one link on each line ($links) and pushes all the
# links to the internet archive. Saves the archive links to $archivedLinks and
# the links that failed to $errors.
# References:
# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
# https://github.com/oduwsdl/archivenow
# For the double underscore, see: https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999

# Prints the last line of file $2 that ends with the URL $1 or with "$1/"
# (a trailing-slash match is preferred). Prints nothing when there is none.
# The comparison is literal, so ".", "?", "[" etc. in a URL are plain
# characters and not regular-expression operators.
function __findArchivedCopy(){
  [[ -n "$1" && -f "$2" ]] || return 0
  # The URL travels through the environment so awk does not process
  # backslash escapes in it, as it would for a -v assignment.
  needle="$1" awk '
    function endsWith(text, suffix,    start) {
      start = length(text) - length(suffix) + 1
      return start >= 1 && substr(text, start) == suffix
    }
    BEGIN { needle = ENVIRON["needle"] }
    endsWith($0, needle)     { plain = $0 }
    endsWith($0, needle "/") { slashed = $0 }
    END {
      if (slashed != "") print slashed
      else if (plain != "") print plain
    }
  ' "$2"
}

# Asks the Wayback Machine availability API for the closest snapshot of URL $1
# and prints the snapshot URL, or nothing when there is no snapshot or the
# request fails. The URL is sent url-encoded so that "&", "#" or spaces in it
# cannot be mistaken for parts of the API query.
function __lookupWaybackSnapshot(){
  curl --silent --get --data-urlencode "url=$1" "https://archive.org/wayback/available" \
    | jq -r '.archived_snapshots.closest.url // empty'
}

# Globals: links, archivedLinks, errors (read).
# Files:   $archivedLinks is appended to; $errors is recreated on every run.
function pushToArchive(){
  local numLinesLinkFile totalTimeInMinutes counter line
  local localCopy onlineCopy archiveURL numSecondsSleep

  echo ""
  echo "Pushing to archive.org..."
  numLinesLinkFile=$(grep -c '' "$links" 2>/dev/null)
  # Same estimate as (n*7.5 + 60*n/15)/60 truncated to whole minutes, i.e.
  # 11.5 s per link, written as integer arithmetic so that bc is not needed.
  totalTimeInMinutes=$(( 23 * ${numLinesLinkFile:-0} / 120 ))
  echo "Expected to take ~$totalTimeInMinutes mins."
  echo ""

  ## $archivedLinks is deliberately not removed: links archived by an earlier
  ## run are remembered and not sent again.
  rm -f "$errors"
  touch "$archivedLinks"
  touch "$errors"

  ## How to deal with errors that arise
  ## (reportErrors relies on this header being exactly four lines long)
  {
    echo "If this file contains errors, you can deal with them as follows:"
    echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again"
    echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md"
    echo ""
  } >> "$errors"

  ## Main body
  counter=1
  # The list is read on descriptor 3 so that nothing run inside the loop can
  # swallow the remaining links from standard input.
  while IFS= read -r line <&3 || [[ -n "$line" ]]; do
    [[ -n "$line" ]] || continue
    echo "Url: $line"
    numSecondsSleep=0

    localCopy=$(__findArchivedCopy "$line" "$archivedLinks")
    if [[ -n "$localCopy" ]]; then
      echo "Already in local archive: $localCopy"
    else
      onlineCopy=$(__lookupWaybackSnapshot "$line")
      if [[ -n "$onlineCopy" ]]; then
        echo "Already in archive.org: $onlineCopy"
        echo "$onlineCopy" >> "$archivedLinks"
        echo ""
      else
        # Pause only when another submission is really about to happen; the
        # counter moves on right after it, so the pause cannot repeat.
        if (( counter % 15 == 0 )); then
          printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
          sleep 60
        fi
        echo "Sending to archive..."
        archiveURL=$(archivenow --ia "$line")
        # An empty answer is a failure as well: storing it would add a blank
        # line to the list of archived links.
        if [[ -z "$archiveURL" || "$archiveURL" == "Error"* ]]; then
          echo "$line" >> "$errors"
          echo "$archiveURL" >> "$errors"
          echo "" >> "$errors"
          echo "There was an error. See $errors for how to deal with it."
          echo ""
        else
          echo "$archiveURL" >> "$archivedLinks"
        fi
        counter=$((counter+1))
        numSecondsSleep=$(( 5 + RANDOM % 15 ))
      fi
    fi

    # The delay has to run for every branch that set it, so it lives after the
    # branches; links that needed no submission skip it.
    if (( numSecondsSleep > 0 )); then
      echo "Sleeping for $numSecondsSleep seconds..."
      sleep "$numSecondsSleep"
      echo ""
    fi
  done 3< "$links"

  echo "Done pushing links to archive.org"
  echo ""
}
2022-01-12 01:29:11 +00:00
## Add archive links to file

# Prints the last line of $archivedLinks that ends with the URL $1 or with
# "$1/" (a trailing-slash match is preferred); prints nothing if there is none.
# The comparison is literal, so regex metacharacters in a URL cannot produce
# false matches.
function __archivedCopyOf(){
  [[ -n "$1" && -f "$archivedLinks" ]] || return 0
  needle="$1" awk '
    function endsWith(text, suffix,    start) {
      start = length(text) - length(suffix) + 1
      return start >= 1 && substr(text, start) == suffix
    }
    BEGIN { needle = ENVIRON["needle"] }
    endsWith($0, needle)     { plain = $0 }
    endsWith($0, needle "/") { slashed = $0 }
    END {
      if (slashed != "") print slashed
      else if (plain != "") print plain
    }
  ' "$archivedLinks"
}

# In file $3, rewrites every literal "<url>)" as "<url>) ([a](<archived url>))",
# where <url> is $1 and <archived url> is $2. Both strings are handled as plain
# text (index/substr), so "&", ".", "/" and friends need no escaping.
function __appendArchiveLink(){
  local scratch
  scratch=$(mktemp) || return 1
  if from="$1)" to="$1) ([a]($2))" awk '
      BEGIN { from = ENVIRON["from"]; to = ENVIRON["to"]; width = length(from) }
      {
        head = ""
        rest = $0
        while ((at = index(rest, from)) > 0) {
          head = head substr(rest, 1, at - 1) to
          rest = substr(rest, at + width)
        }
        print head rest
      }
    ' "$3" > "$scratch"; then
    # Written back through the existing file so its permissions are kept.
    cat "$scratch" > "$3"
  fi
  rm -f "$scratch"
}

# Use: copies $input to $output and, for every link in $links that has an
# archived copy in $archivedLinks, adds " ([a](archived copy))" right after the
# link in $output. Links without an archived copy are left as they are.
function addArchiveLinksToFile(){
  local url archivedUrl

  echo "Creating longnow file at $output"

  rm -f "$output"
  cp "$input" "$output"

  while IFS= read -r url || [[ -n "$url" ]]; do
    archivedUrl=$(__archivedCopyOf "$url")
    if [[ -n "$archivedUrl" ]]; then
      __appendArchiveLink "$url" "$archivedUrl" "$output"
    fi
  done < "$links"

  echo "Done."
}
2021-06-28 10:27:08 +00:00
2022-01-12 01:29:11 +00:00
## Explain installation
# Prints how to install the archivenow utility; main shows this when the
# utility cannot be found.
function explainArchiveNowInstallation(){
  cat <<'EOF'
Required archivenow utility not found in path.
Install with $ pip install archivenow
(resp. $ pip3 install archivenow)
Or follow instructions on https://github.com/oduwsdl/archivenow
EOF
}
# Prints how to install the jq utility; main shows this when jq cannot be
# found.
function explainJqInstallation(){
  printf '%s\n' \
    'Required jq utility not found in path.' \
    'Install with your package manager, e.g., $ sudo apt install jq' \
    'Or follow instructions on https://stedolan.github.io/jq/download/'
}
2022-01-12 01:29:11 +00:00
## Report errors
# Tells the user to look at $errors when that file is longer than the
# four-line header pushToArchive always writes into it.
function reportErrors(){
  local errorLineCount
  errorLineCount=$(wc -l < "$errors")
  if (( errorLineCount > 4 )); then
    echo "It seems that there are errors. To view and deal with them, see the $errors file"
  fi
}
## Clean up
# Copies $output into the directory the script was started from and returns
# there.
# Globals: output, initialDir (read).
# Returns: non-zero if the starting directory cannot be entered again.
function cleanup(){
  # $initialDir is used instead of "..": the result then lands next to the
  # input file wherever the current directory happens to be.
  cp -- "$output" "$initialDir/$output" \
    || echo "Could not copy $output to $initialDir" >&2
  cd -- "$initialDir" || {
    echo "Could not return to $initialDir" >&2
    return 1
  }
}
## Main
# Checks that the required tools and the input file exist, then runs the whole
# pipeline: work dir -> extract links -> archive them -> write the annotated
# copy -> report errors -> clean up.
# Globals: input (read).
# Returns: 1 when a required tool is missing, when $input is not an existing
#          file, or when the work dir cannot be entered.
function main(){
  local missingDependency=0

  # command -v is the portable way to ask whether a tool is on the PATH; the
  # text printed by whereis differs between systems. Every missing tool is
  # reported, so one run tells the user everything there is to install.
  if ! command -v archivenow > /dev/null 2>&1; then
    explainArchiveNowInstallation
    missingDependency=1
  fi
  if ! command -v jq > /dev/null 2>&1; then
    explainJqInstallation
    missingDependency=1
  fi
  if (( missingDependency )); then
    return 1
  fi

  if [[ -z "$input" || ! -f "$input" ]]; then
    echo "Usage: longnow yourfile.md (the file must exist)" >&2
    return 1
  fi

  # Every later step works relative to the work dir, so stop if it could not
  # be entered.
  moveToWorkDir || return 1
  extractMarkdownLinks
  pushToArchive
  addArchiveLinksToFile
  reportErrors
  cleanup
}
2022-01-12 01:29:11 +00:00
# Entry point: run the pipeline on the file given as the first argument.
main
2021-06-28 10:27:08 +00:00