#!/bin/bash

function getMdLinks(){
    # Use: Takes a markdown file file.md, extracts all links, finds the unique ones and saves them to file.md.links
    echo ""
    echo "Extracting links..."
    grep -Eoi '\]\((.*)\)' "$1" | grep -Eo '(http|https)://[^)]+' >> "$1.links"
    ## sed -i 's/www.wikiwand.com\/en/en.wikipedia.org\/wiki/g' $1
    ## Keep only the first occurrence of each link
    awk '!seen[$0]++' "$1.links" > "$1.links2" && mv "$1.links2" "$1.links"
    echo "Done."
    echo ""
}

function pushToArchive(){
    # Use: Takes a text file with one link per line and pushes every link to the Internet Archive. Saves the archive links to a text file.
    # References:
    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
    # https://github.com/oduwsdl/archivenow
    # For the double underscore, see: https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999
    echo "Pushing to archive.org..."
    input="$1"
    counter=1
    ## rm -f "$1.archived"
    archivedLinksFile="$1.archived"
    errorsFile="$1.errors"
    touch "$archivedLinksFile"
    touch "$errorsFile"

    ## How to deal with errors that arise
    echo "If this file contains errors, you can deal with them as follows:" >> "$errorsFile"
    echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errorsFile"
    echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errorsFile"
    echo "" >> "$errorsFile"

    ## Main body
    while IFS= read -r line
    do
        wait
        if [ $((counter % 15)) -eq 0 ]; then
            printf "Archive.org doesn't accept more than 15 links per minute; sleeping for 1 minute...\n\n"
            sleep 1m
        fi
        echo "Url: $line"
        ## Skip links that were already archived on a previous pass
        urlAlreadyContained=$(grep "$line$" "$archivedLinksFile" | tail -1)
        if [ "$urlAlreadyContained" == "" ]; then
            archiveURL=$(archivenow --ia "$line")
            if [[ "$archiveURL" == "Error"* ]]; then
                echo "$line" >> "$errorsFile"
                echo "$archiveURL" >> "$errorsFile"
                echo "" >> "$errorsFile"
                echo "There was an error. See $errorsFile for how to deal with it."
            else
                echo "$archiveURL" >> "$archivedLinksFile"
            fi
            counter=$((counter + 1))
            numSecondsSleep=$((5 + (RANDOM % 15)))
            sleep $numSecondsSleep
        else
            archiveURL="$urlAlreadyContained"
        fi
        echo "$archiveURL"
        echo ""
    done < "$input"
    echo "Done."
    echo ""
}

function addArchiveLinksToFile(){
    originalFile="$1"
    originalFileTemp="$originalFile.temp"
    linksFile="$1.links"
    archivedLinksFile="$1.links.archived"
    errorsFile="$1.links.errors"
    longNowFile="$1.longnow"

    echo "Creating longnow file @ $longNowFile..."
    rm -f "$longNowFile"
    touch "$longNowFile"
    cp "$originalFile" "$originalFileTemp"

    while IFS= read -r url
    do
        wait
        archivedUrl=$(grep "$url$" "$archivedLinksFile" | tail -1)
        if [ "$archivedUrl" != "" ]; then
            ## echo "Url: $url"
            ## echo "ArchivedUrl: $archivedUrl"
            ## Escape forward slashes so the URLs can be used inside the sed expression
            urlForSed="${url//\//\\/}"
            archiveUrlForSed="${archivedUrl//\//\\/}"
            sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$1"
        ##else
            ## echo "There was an error for $url; see the $errorsFile"
        fi
    done < "$linksFile"
    mv "$originalFile" "$longNowFile"
    mv "$originalFileTemp" "$originalFile"
    echo "Done."
}

function longnow(){
    doesArchiveNowExist=$(whereis "archivenow")
    if [ "$doesArchiveNowExist" == "archivenow:" ]
    then
        echo "Required archivenow utility not found in path."
        echo "Install with \$ pip install archivenow"
        echo "(resp. \$ pip3 install archivenow)"
        echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
    else
        getMdLinks "$1"
        pushToArchive "$1.links"
        addArchiveLinksToFile "$1"
        ## The errors file starts with a 4-line help header, so more than 4 lines means real errors
        numLinesErrorFile=$(wc -l "$1.links.errors" | awk '{ print $1 }')
        if [ "$numLinesErrorFile" -gt 4 ]; then
            echo "It seems that there are errors. To view and deal with them, see the $1.links.errors file"
        fi
    fi
}

longnow "$1" ## don't copy this line into your .bashrc file