#!/usr/bin/env bash
# longnow: push every link in a markdown file to the Internet Archive and
# produce a copy of the file with " ([a](archive-url))" after each link.
# Usage: longnow yourfile.md

# Filenames
input="$1"
root="${input%.md}"   # strip only a trailing ".md"; the old `sed 's/.md//g'` also ate e.g. "amd" mid-name
links="$root.links.txt"
archivedLinks="$root.links.archived.txt"
errors="$root.errors.txt"
output="$root.longnow.md"

## Directories
initialDir="$(pwd)"
workdir="longnow-$root"
## Move to work dir
# Creates $workdir, copies the input file into it, and cds there.
# Exits if the cd fails: every later step assumes we are inside $workdir,
# so silently continuing in the wrong directory would clobber files.
function moveToWorkDir(){
	mkdir -p "$workdir"
	cp -- "$input" "$workdir/$input"
	cd "$workdir" || exit 1
}
## Extract markdown links
# Takes the markdown file $input, pulls every http(s) URL that appears
# inside a markdown link target "](...)", de-duplicates them while
# preserving first-seen order, and writes one URL per line to $links.
function extractMarkdownLinks(){
	local links2="$root.links2.txt"

	echo ""
	echo "Extracting links..."

	grep -Eoi '\]\((.*)\)' "$input" | grep -Eo '(http|https)://[^)]+' > "$links"
	# awk keeps the first occurrence of each line; write to a temp file and
	# move it back, since we cannot read and overwrite $links in one pass.
	awk '!seen[$0]++' "$links" > "$links2" && mv "$links2" "$links"
	echo "Done extracting links"
}
## Push to Archive
# Takes $links (one URL per line) and submits each to the Internet Archive
# via archivenow, appending the snapshot URLs to $archivedLinks. URLs that
# already appear in $archivedLinks are skipped, submissions are throttled
# (archive.org accepts ~15/min), and failures are logged to $errors.
# References:
#   https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
#   https://github.com/oduwsdl/archivenow
function pushToArchive(){
	echo ""
	echo "Pushing to archive.org..."
	local numLinesLinkFile
	numLinesLinkFile=$(wc -l < "$links")
	# ~7.5s of sleep per link plus a 60s pause every 15 links:
	# (n*7.5 + 60*n/15)/60 == 23n/120 minutes, truncated — pure shell
	# arithmetic, so no dependency on bc.
	local totalTimeInMinutes=$(( numLinesLinkFile * 23 / 120 ))
	echo "Expected to take ~$totalTimeInMinutes mins."
	echo ""

	# Deliberately NOT removing $archivedLinks: results of previous runs are
	# remembered so only new links get pushed.
	rm -f "$errors"
	touch "$archivedLinks"
	touch "$errors"

	## How to deal with errors that arise
	echo "If this file contains errors, you can deal with them as follows:" >> "$errors"
	echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errors"
	echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors"
	echo "" >> "$errors"

	## Main body
	local counter=1
	local line urlAlreadyContained archiveURL numSecondsSleep
	while IFS= read -r line
	do
		if [ $(( counter % 15 )) -eq 0 ]; then
			printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
			sleep 1m
		fi
		echo "Url: $line"

		# The archived copy may have been recorded with or without a
		# trailing slash; check both and take the most recent match.
		urlAlreadyContained=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )

		if [ "$urlAlreadyContained" == "" ]; then
			archiveURL=$(archivenow --ia "$line")
			if [[ "$archiveURL" == "Error"* ]]; then
				echo "$line" >> "$errors"
				echo "$archiveURL" >> "$errors"
				echo "" >> "$errors"
				echo "There was an error. See $errors for how to deal with it."
			else
				echo "$archiveURL" >> "$archivedLinks"
			fi
			counter=$((counter+1))
			# Random 5-19s pause between submissions to be polite to archive.org.
			numSecondsSleep=$(( 5 + (RANDOM % 15) ))
		else
			archiveURL="$urlAlreadyContained"
			numSecondsSleep=0
		fi
		printf '%s\n' "$archiveURL"
		echo "Sleeping for $numSecondsSleep seconds..."
		sleep "$numSecondsSleep"
		echo ""
	done < "$links"

	echo "Done pushing links to archive.org"
	echo ""
}
## Add archive links to file
# Copies $input to $output, then for every URL in $links that has an entry
# in $archivedLinks, appends " ([a](archive-url))" after each markdown link
# to that URL in $output.
function addArchiveLinksToFile(){
	echo "Creating longnow file at $output"

	rm -f "$output"
	cp -- "$input" "$output"

	local url archivedUrl urlForSed urlForSedRepl archiveUrlForSed
	while IFS= read -r url
	do
		# The archived copy may have been recorded with or without a
		# trailing slash; check both and take the most recent match.
		archivedUrl=$( ( grep "$url$" "$archivedLinks"; grep "$url/$" "$archivedLinks") | tail -1)
		if [ "$archivedUrl" != "" ]; then
			# Escape sed metacharacters: '/' (our delimiter) on both sides,
			# and '&' (whole-match back-reference) on the replacement side —
			# URLs with query strings contain '&' and would otherwise
			# corrupt the output.
			urlForSed="${url//\//\\/}"
			urlForSedRepl="${urlForSed//&/\\&}"
			archiveUrlForSed="${archivedUrl//\//\\/}"
			archiveUrlForSed="${archiveUrlForSed//&/\\&}"
			sed -i "s/$urlForSed)/$urlForSedRepl) ([a]($archiveUrlForSed))/g" "$output"
		fi
	done < "$links"
	echo "Done."
}
## Explain installation
# Printed when the required archivenow CLI is not found on PATH.
function explainInstallation(){
	echo "Required archivenow utility not found in path."
	echo "Install with \$ pip install archivenow"
	echo "(resp. \$ pip3 install archivenow)"
	echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
}
2022-01-12 01:29:11 +00:00
## Report errors
function reportErrors(){
numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
if [ "$numLinesErrorFile" -gt 4 ]; then
echo "It seems that there are errors. To view and deal with them, see the $errors file"
fi
}
## Clean up
# Copies the finished longnow file back to where the script was started
# and returns there. Uses $initialDir rather than "..", which is only
# correct when $workdir happens to be exactly one level deep.
function cleanup(){
	cp -- "$output" "$initialDir/$output"
	cd "$initialDir" || exit 1
}
## Main
# Entry point: verify the archivenow CLI is installed, then run the
# pipeline. `command -v` replaces the old `whereis` output parsing, which
# was non-portable (BSD/macOS whereis prints nothing when a tool is
# missing, so the old "archivenow:" comparison never matched there).
function main(){
	if ! command -v archivenow >/dev/null 2>&1
	then
		explainInstallation
	else
		moveToWorkDir
		extractMarkdownLinks
		pushToArchive
		addArchiveLinksToFile
		reportErrors
		cleanup
	fi
}

main