feat: v6 work in progress

2022-05-09 23:32:59 -04:00
2 changed files with 671 additions and 61 deletions
--- a/longnow6-wip.sh
+++ b/longnow6-wip.sh
@ -1,9 +1,13 @@
+# To do:
+# - Get API keys from somewhere else
+# - Change installation warnings
+
 # Filenames
 input="$1"
 root="$(echo "$input" | sed 's/.md//g' )"
 links="$root.links.txt"
-archivedLinks="$root.links.archived.txt"
-errors="$root.errors.txt"
+archivedLinks="captures.log" ##"$root.links.archived.txt"
+errors="error-json.log"
 output="$root.longnow.md"

 ## Directories
@ -46,61 +50,7 @@ function pushToArchive(){
  echo "Expected to take ~$totalTimeInMinutes mins."
  echo ""
 	
-  ## rm -f "$archivedLinks"
-  rm -f "$errors"
-  touch "$archivedLinks"
-  touch "$errors"
-  
-  ## How to deal with errors that arise
-  echo "If this file contains errors, you can deal with them as follows:" >> "$errors"
-  echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again"  >> "$errors"
-  echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors"
-  echo "" >> "$errors"
-  
-  ## Main body
-  counter=1
-  while IFS= read -r line
-  do
-    wait
-    if [ $(($counter % 15)) -eq 0 ]; then
-      printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
-      sleep 1m
-    fi
-    echo "Url: $line"
-    urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" )  | tail -1 )
-
-    if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then
-      urlAlreadyInArchiveOnline="$(curl --silent http://archive.org/wayback/available?url=$line |  jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g' )"
-      if [ "$urlAlreadyInArchiveOnline" == "" ]; then
-        echo "Sending to archive..."
-        archiveURL=$(archivenow --ia $line)
-        if [[ "$archiveURL" == "Error"* ]]; then
-          echo "$line" >> "$errors"
-          echo "$archiveURL" >> "$errors"
-          echo "" >> "$errors"
-          echo "There was an error. See $errors for how to deal with it."
-					echo ""
-        else
-            echo "$archiveURL" >> "$archivedLinks"
-        fi
-        counter=$((counter+1))
-        numSecondsSleep=$((5+ ($RANDOM%15)))
-      else
-        echo "Already in archive.org: $urlAlreadyInArchiveOnline"
-        echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
-				echo ""
-        numSecondsSleep=0
-      fi
-    elif [ ! -z "$urlAlreadyContainedInLocalArchivedLinks" ]; then
-      echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
-      archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
-      numSecondsSleep=0
-      # echo $archiveURL
-      echo "Sleeping for $numSecondsSleep seconds..."
-      sleep $numSecondsSleep
-      echo ""
-    fi
-  done < "$links"
+  /home/loki/.bash/src/longnow/spn/wayback-machine-spn-scripts/spn.sh -a [my private key] -f . -p 3 "$links"
  
  echo "Done pushing links to archive.org"
  echo ""
@ -123,7 +73,7 @@ function addArchiveLinksToFile(){
      ## echo "ArchivedUrl: $archivedUrl"
      urlForSed="${url//\//\\/}"
      archiveUrlForSed="${archivedUrl//\//\\/}"
-      sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$output"
+      sed -i "s/$urlForSed)/$urlForSed) ([a](https:\/\/web.archive.org$archiveUrlForSed))/g" "$output"
    ##else
      ##echo "There was an error for $url; see the $errorsFile"
    fi
@ -147,8 +97,7 @@ function explainJqInstallation(){
 }
 ## Report errors
 function reportErrors(){
-  numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
-  if [ "$numLinesErrorFile" -gt 4 ]; then
+  if test -f "$errors"; then
    echo "It seems that there are errors. To view and deal with them, see the $errors file"
  fi
 }
--- a/spn.sh
+++ b/spn.sh
@ -0,0 +1,661 @@
+#!/bin/bash
+
+auth=''
+curl_args=()
+post_data=''
+custom_dir=''
+dir_suffix=''
+no_errors=''
+outlinks=''
+parallel=''
+quiet=''
+resume=''
+ssl_only=''
+include_pattern=''
+exclude_pattern=''
+
+print_usage() {
+	echo "Usage: $(basename "$0") [options] file
+       $(basename "$0") [options] url [url]...
+       $(basename "$0") [options] -r folder
+
+Options:
+ -a auth        S3 API keys, in the form accesskey:secret
+                (get account keys at https://archive.org/account/s3.php)
+
+ -c args        pass additional arguments to curl
+
+ -d data        capture request options, or other arbitrary POST data
+
+ -f folder      use a custom location for the data folder
+                (some files will be overwritten or deleted during the session)
+
+ -i suffix      add a suffix to the name of the data folder
+                (if -f is used, -i is ignored)
+
+ -n             tell Save Page Now not to save errors into the Wayback Machine
+
+ -o pattern     save detected capture outlinks matching regex (ERE) pattern
+
+ -p N           run at most N capture jobs in parallel (off by default)
+
+ -q             discard JSON for completed jobs instead of writing to log file
+
+ -r folder      resume with the remaining URLs of an aborted session
+                (settings are not carried over, except for outlinks options)
+
+ -s             use HTTPS for all captures and change HTTP input URLs to HTTPS
+
+ -x pattern     save detected capture outlinks not matching regex (ERE) pattern
+                (if -o is also used, outlinks are filtered using both regexes)"
+}
+
+while getopts 'a:c:d:f:i:no:p:qr:sx:' flag; do
+	case "${flag}" in
+		a)	auth="$OPTARG" ;;
+		c)	declare -a "curl_args=($OPTARG)" ;;
+		d)	post_data="$OPTARG" ;;
+		f)	custom_dir="$OPTARG" ;;
+		i)	dir_suffix="-$OPTARG" ;;
+		n)	no_errors='true' ;;
+		o)	outlinks='true'; include_pattern="$OPTARG" ;;
+		p)	parallel="$OPTARG" ;;
+		q)	quiet='true' ;;
+		r)	resume="$OPTARG" ;;
+		s)	ssl_only='true' ;;
+		x)	outlinks='true'; exclude_pattern="$OPTARG" ;;
+		*)	print_usage
+			exit 1 ;;
+	esac
+done
+shift "$((OPTIND-1))"
+
+if [[ -n "$resume" ]]; then
+	# There should not be any arguments
+	if [[ -n "$1" ]]; then
+		print_usage
+		exit 1
+	fi
+	# Get list
+	# List will be constructed from the specified folder
+	if [[ ! -d "$resume" ]]; then
+		echo "The folder $resume could not be found"
+		exit 1
+	fi
+	cd "$resume"
+	if ! [[ -f "index.txt" && -f "success.log" ]]; then
+		echo "Could not resume session; required files not found"
+		exit 1
+	fi
+	if [[ -f "outlinks.txt" ]]; then
+		# Index will also include successful redirects, which should be logged in captures.log
+		if [[ -f "captures.log" ]]; then
+			success=$(cat success.log captures.log | sed -Ee 's|^/web/[0-9]+/||g')
+		else
+			success=$(<success.log)
+		fi
+		index=$(cat index.txt outlinks.txt)
+		# Convert links to HTTPS
+		if [[ -n "$ssl_only" ]]; then
+			index=$(echo "$index" | sed -Ee 's|^[[:blank:]]*(https?://)?[[:blank:]]*([^[:blank:]]+)|https://\2|g;s|^https://ftp://|ftp://|g')
+			success=$(echo "$success" | sed -Ee 's|^[[:blank:]]*(https?://)?[[:blank:]]*([^[:blank:]]+)|https://\2|g;s|^https://ftp://|ftp://|g')
+		fi
+
+		# Remove duplicate lines from new index
+		index=$(awk '!seen [$0]++' <<< "$index")
+		# Remove links that are in success.log and captures.log from new index
+		list=$(awk '{if (f==1) { r[$0] } else if (! ($0 in r)) { print $0 } } ' f=1 <(echo "$success") f=2 <(echo "$index"))
+
+		# If -o and -x are not specified, then retain original values
+		if [[ -z "$outlinks" ]]; then
+			outlinks='true'
+			include_pattern=$(<include_pattern.txt)
+			exclude_pattern=$(<exclude_pattern.txt)
+		fi
+	else
+		# Remove links that are in success.log from index.txt
+		list=$(awk '{if (f==1) { r[$0] } else if (! ($0 in r)) { print $0 } } ' f=1 success.log f=2 index.txt)
+	fi
+	if [[ -z "$list" ]]; then
+		echo "Session already complete; not resuming"
+		exit 1
+	fi
+	cd
+else
+	# File or at least one URL must be provided
+	if [[ -z "$1" ]]; then
+		print_usage
+		exit 1
+	fi
+	# Get list
+	# Treat as filename if only one argument and file exists, and as URLs otherwise
+	if [[ -n "$2" || ! -f "$1" ]]; then
+		list=$(for i in "$@"; do echo "$i"; done)
+	else
+		list=$(<"$1")
+	fi
+fi
+
+if [[ -n "$custom_dir" ]]; then
+	f="-$$"
+	dir="$custom_dir"
+	if [[ ! -d "$dir" ]]; then
+		mkdir "$dir" || { echo "The folder $dir could not be created"; exit 1; }
+		echo "Created data folder $dir"
+	else
+		echo "Using the existing data folder $dir"
+	fi
+	cd "$dir"
+
+	for i in max_parallel_jobs$f.txt status_rate$f.txt lock$f.txt daily_limit$f.txt quit$f.txt; do
+		if [[ -f "$i" ]]; then
+			rm "$i"
+		fi
+	done
+else
+	f=''
+	parent="spn-data"
+	month=$(date -u +%Y-%m)
+	now=$(date +%s)
+
+	for i in "$parent" "$parent/$month"; do
+		if [[ ! -d ~/"$i" ]]; then
+			mkdir ~/"$i" || { echo "The folder ~/$i could not be created"; exit 1; }
+		fi
+	done
+
+	# Wait between 0 and 0.07 seconds to try to avoid a collision, in case another session is started at exactly the same time
+	sleep ".0$((RANDOM % 8))"
+
+	# Wait between 0.1 and 0.73 seconds if the folder already exists
+	while [[ -d ~/"$parent/$month/$now$dir_suffix" ]]; do
+		sleep ".$((10 + RANDOM % 64))"
+		now=$(date +%s)
+	done
+	dir="$parent/$month/$now$dir_suffix"
+
+	# Try to create the folder
+	mkdir ~/"$dir" || { echo "The folder ~/$dir could not be created"; exit 1; }
+	echo "Created data folder ~/$dir"
+	cd ~/"$dir"
+fi
+
+# Convert links to HTTPS
+if [[ -n "$ssl_only" ]]; then
+	list=$(echo "$list" | sed -Ee 's|^[[:blank:]]*(https?://)?[[:blank:]]*([^[:blank:]]+)|https://\2|g;s|^https://ftp://|ftp://|g')
+fi
+
+# Set POST options
+# The web form sets capture_all=on by default; this replicates the default behavior
+if [[ -z "$no_errors" ]]; then
+	if [[ -n "$post_data" ]]; then
+		post_data="${post_data}&capture_all=on"
+	else
+		post_data="capture_all=on"
+	fi
+fi
+
+# Create data files
+# max_parallel_jobs.txt and status_rate.txt are created later
+touch failed.txt
+# Add successful capture URLs from previous session, if any, to the index and the list of captures
+# This is to prevent redundant captures in the current session and in future ones
+if [[ -n "$success" ]]; then
+	success=$(echo "$success" | awk '!seen [$0]++')
+	echo "$success" >> index.txt
+	echo "$success" >> success.log
+fi
+echo "$list" | awk '!seen [$0]++' >> index.txt
+if [[ -n "$outlinks" ]]; then
+	touch outlinks.txt
+	# Create both files even if one of them would be empty
+	echo "$include_pattern" > include_pattern.txt
+	echo "$exclude_pattern" > exclude_pattern.txt
+fi
+
+# Submit a URL to Save Page Now and check the result
+function capture(){
+	local tries="0"
+	local request
+	local job_id
+	local message
+	while ((tries < 3)); do
+		# Submit
+		local lock_wait=0
+		local start_time=`date +%s`
+		while :; do
+			if (( $(date +%s) - start_time > 300 )); then
+				break 2
+			fi
+			if [[ -n "$auth" ]]; then
+				request=$(curl "${curl_args[@]}" -s -m 60 -X POST --data-urlencode "url=${1}" -d "${post_data}" -H "Accept: application/json" -H "Authorization: LOW ${auth}" "https://web.archive.org/save/")
+				job_id=$(echo "$request" | grep -Eo '"job_id":"([^"\\]|\\["\\])*"' | head -1 | sed -Ee 's/"job_id":"(.*)"/\1/g')
+				if [[ -n "$job_id" ]]; then
+					break
+				fi
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $1"
+				message=$(echo "$request" | grep -Eo '"message":"([^"\\]|\\["\\])*"' | sed -Ee 's/"message":"(.*)"/\1/g')
+			else
+				request=$(curl "${curl_args[@]}" -s -m 60 -X POST --data-urlencode "url=${1}" -d "${post_data}" "https://web.archive.org/save/")
+				job_id=$(echo "$request" | grep -E 'spn\.watchJob\(' | sed -Ee 's/^.*spn\.watchJob\("([^"]*).*$/\1/g' | head -1)
+				if [[ -n "$job_id" ]]; then
+					break
+				fi
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $1"
+				message=$(echo "$request" | grep -E -A 1 "<h2>" | tail -1 | sed -Ee 's|</?p>||g')
+			fi
+			if [[ -z "$message" ]]; then
+				if [[ "$request" =~ "429 Too Many Requests" ]] || [[ "$request" == "" ]]; then
+					echo "$request"
+					if [[ ! -f lock$f.txt ]]; then
+						touch lock$f.txt
+						sleep 20
+						rm lock$f.txt
+					else
+						break 2
+					fi
+				elif [[ "$request" =~ "400 Bad Request" ]]; then
+					echo "$request"
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1"
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') $1" >> invalid.log
+					echo "$request" >> invalid.log
+					return 1
+				else
+					sleep 5
+				fi
+			else
+				echo "$message"
+				if ! [[ "$message" =~ "You have already reached the limit of active sessions" || "$message" =~ "Cannot start capture" || "$message" =~ "The server encountered an internal error and was unable to complete your request" || "$message" =~ "Crawling this host is paused" ]]; then
+					if [[ "$message" =~ "You cannot make more than "[1-9][0-9,]*" captures per day" ]]; then
+						touch daily_limit$f.txt
+						break 2
+					else
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1"
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') $1" >> invalid.log
+						echo "$message" >> invalid.log
+						return 1
+					fi
+				fi
+				if [[ ! -f lock$f.txt ]]; then
+					touch lock$f.txt
+					while [[ -f lock$f.txt ]]; do
+						# Retry the request until either the job is submitted or a different error is received
+						sleep 2
+						if [[ -n "$auth" ]]; then
+							request=$(curl "${curl_args[@]}" -s -m 60 -X POST --data-urlencode "url=${1}" -d "${post_data}" -H "Accept: application/json" -H "Authorization: LOW ${auth}" "https://web.archive.org/save/")
+							job_id=$(echo "$request" | grep -Eo '"job_id":"([^"\\]|\\["\\])*"' | head -1 | sed -Ee 's/"job_id":"(.*)"/\1/g')
+							if [[ -n "$job_id" ]]; then
+								rm lock$f.txt
+								break 2
+							fi
+							echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $1"
+							message=$(echo "$request" | grep -Eo '"message":"([^"\\]|\\["\\])*"' | sed -Ee 's/"message":"(.*)"/\1/g')
+						else
+							request=$(curl "${curl_args[@]}" -s -m 60 -X POST --data-urlencode "url=${1}" -d "${post_data}" "https://web.archive.org/save/")
+							job_id=$(echo "$request" | grep -E 'spn\.watchJob\(' | sed -Ee 's/^.*spn\.watchJob\("([^"]*).*$/\1/g' | head -1)
+							if [[ -n "$job_id" ]]; then
+								rm lock$f.txt
+								break 2
+							fi
+							echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $1"
+							message=$(echo "$request" | grep -E -A 1 "<h2>" | tail -1 | sed -Ee 's|</?p>||g')
+						fi
+						if [[ -z "$message" ]]; then
+							if [[ "$request" =~ "429 Too Many Requests" ]] || [[ "$request" == "" ]]; then
+								echo "$request"
+								sleep 20
+							else
+								sleep 5
+								rm lock$f.txt
+								break
+							fi
+						else
+							echo "$message"
+							if [[ "$message" =~ "You have already reached the limit of active sessions" || "$message" =~ "Cannot start capture" || "$message" =~ "The server encountered an internal error and was unable to complete your request" || "$message" =~ "Crawling this host is paused" ]]; then
+								:
+							elif [[ "$message" =~ "You cannot make more than "[1-9][0-9,]*" captures per day" ]]; then
+								rm lock$f.txt
+								touch daily_limit$f.txt
+								break 3
+							else
+								rm lock$f.txt
+								echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1"
+								echo "$(date -u '+%Y-%m-%d %H:%M:%S') $1" >> invalid.log
+								echo "$message" >> invalid.log
+								return 1
+							fi
+						fi
+					done
+				else
+					# If another process has already created lock.txt, wait for the other process to remove it
+					while [[ -f lock$f.txt ]]; do
+						sleep 5
+						((lock_wait+=5))
+						if ((lock_wait > 120)); then
+							break 3
+						fi
+					done
+				fi
+			fi
+		done
+		echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job submitted] $1"
+
+		# Wait
+		delay=$(echo "$request" | grep -Eo 'Your capture will begin in [1-9][0-9,]*s' | sed -Ee 's/[^0-9]*//g')
+		if [[ -z "$delay" ]]; then
+			delay="0"
+		fi
+		local start_time=`date +%s`
+		local status
+		local status_ext
+		while :; do
+			sleep "$(<status_rate$f.txt)"
+			request=$(curl "${curl_args[@]}" -s -m 60 "https://web.archive.org/save/status/$job_id")
+			status=$(echo "$request" | grep -Eo '"status":"([^"\\]|\\["\\])*"' | head -1)
+			if [[ -z "$status" ]]; then
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Status request failed] $1"
+				if [[ "$request" =~ "429 Too Many Requests" ]] || [[ "$request" == "" ]]; then
+					echo "$request"
+					sleep 20
+				fi
+				sleep "$(<status_rate$f.txt)"
+				request=$(curl "${curl_args[@]}" -s -m 60 "https://web.archive.org/save/status/$job_id")
+				status=$(echo "$request" | grep -Eo '"status":"([^"\\]|\\["\\])*"' | head -1)
+				if [[ -z "$status" ]]; then
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Status request failed] $1"
+					if [[ "$request" =~ "429 Too Many Requests" ]] || [[ "$request" == "" ]]; then
+						echo "$request"
+						sleep 20
+						status='"status":"pending"'
+						# Fake status response to allow while loop to continue
+					else
+						echo "$request" >> unknown-json.log
+						break 2
+					fi
+				fi
+			fi
+			if [[ -z "$status" ]]; then
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Unknown error] $1"
+				echo "$request" >> unknown-json.log
+				break 2
+			fi
+			if [[ "$status" == '"status":"success"' ]]; then
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job completed] $1"
+				echo "$1" >> success.log
+				timestamp=$(echo "$request" | grep -Eo '"timestamp":"[0-9]*"' | sed -Ee 's/^"timestamp":"(.*)"/\1/g')
+				url=$(echo "$request" | grep -Eo '"original_url":"([^"\\]|\\["\\])*"' | sed -Ee 's/^"original_url":"(.*)"/\1/g;s/\\(["\\])/\1/g')
+				echo "/web/$timestamp/$url" >> captures.log
+				if [[ -z "$quiet" ]]; then
+					echo "$request" >> success-json.log
+				fi
+				if [[ -n "$outlinks" ]]; then
+					if [[ "$url" != "$1" ]]; then
+						# Prevent the URL from being submitted twice
+						echo "$url" >> index.txt
+					fi
+					# grep matches array of strings (most special characters are converted server-side, but not square brackets)
+					# sed transforms the array into just the URLs separated by line breaks
+					echo "$request" | grep -Eo '"outlinks":\["([^"\\]|\\["\\])*"(,"([^"\\]|\\["\\])*")*\]' | sed -Ee 's/"outlinks":\["(.*)"\]/\1/g;s/(([^"\\]|\\["\\])*)","/\1\
+/g;s/\\(["\\])/\1/g' | { [[ -n "$(<exclude_pattern.txt)" ]] && { [[ -n "$(<include_pattern.txt)" ]] && grep -E "$(<include_pattern.txt)" | grep -Ev "$(<exclude_pattern.txt)" || grep -Ev "$(<exclude_pattern.txt)"; } || grep -E "$(<include_pattern.txt)"; } >> outlinks.txt
+				fi
+				return 0
+			elif [[ "$status" == '"status":"pending"' ]]; then
+				if (( $(date +%s) - start_time > 600 + delay )); then
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job timed out] $1"
+					break 2
+				fi
+			elif [[ "$status" == '"status":"error"' ]]; then
+				echo "$request" >> error-json.log
+				status_ext=$(echo "$request" | grep -Eo '"status_ext":"([^"\\]|\\["\\])*"' | head -1 | sed -Ee 's/"status_ext":"(.*)"/\1/g')
+				if [[ -z "$status_ext" ]]; then
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Unknown error] $1"
+					break 2
+				fi
+				if [[ "$status_ext" == 'error:filesize-limit' ]]; then
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [File size limit of 2 GB exceeded] $1"
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1"
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$status_ext] $1" >> failed.log
+					return 1
+				elif [[ "$status_ext" == 'error:proxy-error' ]]; then
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [SPN proxy error] $1"
+				else
+					message=$(echo "$request" | grep -Eo '"message":"([^"\\]|\\["\\])*"' | sed -Ee 's/"message":"(.*)"/\1/g')
+					if [[ -z "$message" ]]; then
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Unknown error: $status_ext] $1"
+						break 2
+					fi
+					if [[ "$message" == "Live page is not available: chrome-error://chromewebdata/" ]]; then
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [SPN internal error] $1"
+					elif [[ "$message" =~ ' (HTTP status='(40[89]|429|50[023478])').'$ ]] || [[ "$message" =~ "The server didn't respond in time" ]]; then
+						# HTTP status 408, 409, 429, 500, 502, 503, 504, 507 or 508, or didn't respond in time
+						# URL may become available later
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$message] $1"
+						break 2
+					elif [[ "$message" =~ ' (HTTP status='[45][0-9]*').'$ ]]; then
+						# HTTP error; assume the URL cannot be archived
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$message] $1"
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1"
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$status_ext] $1" >> failed.log
+						return 1
+					else
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$message] $1"
+						break 2
+					fi
+				fi
+				break
+			else
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Unknown error] $1"
+				break 2
+			fi
+		done
+		((tries++))
+	done
+	echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1"
+	echo "$1" >> failed.txt
+	return 1
+}
+
+function get_list(){
+	local failed_file=failed-$(date +%s).txt
+	mv failed.txt $failed_file
+	touch failed.txt
+	local failed_list=$(<$failed_file)
+
+	if [[ -n "$outlinks" ]]; then
+		local outlinks_file=outlinks-$(date +%s).txt
+		mv outlinks.txt $outlinks_file
+		touch outlinks.txt
+		# Remove duplicate lines; reading into string prevents awk from emptying the file
+		awk '!seen [$0]++' <<< "$(<$outlinks_file)" > $outlinks_file
+		# Convert links to HTTPS
+		if [[ -n "$ssl_only" ]]; then
+			sed -Ee 's|^[[:blank:]]*(https?://)?[[:blank:]]*([^[:blank:]]+)|https://\2|g;s|^https://ftp://|ftp://|g' <<< "$(<$outlinks_file)" > $outlinks_file
+		fi
+		# Remove lines that are already in index.txt
+		local outlinks_list=$(awk '{if (f==1) { r[$0] } else if (! ($0 in r)) { print $0 } } ' f=1 index.txt f=2 $outlinks_file)
+
+		if [[ -n "$outlinks_list" ]]; then
+			echo "$outlinks_list" >> index.txt
+
+			if [[ -n "$failed_list" ]]; then
+				echo "$failed_list
+$outlinks_list"
+			else
+				echo "$outlinks_list"
+			fi
+		fi
+		if [[ -z "$(<$outlinks_file)" ]]; then
+			rm $outlinks_file
+		fi
+	else
+		echo "$failed_list"
+	fi
+	if [[ -z "$failed_list" ]]; then
+		rm $failed_file
+	fi
+}
+
+# Track the number of loops in which no URLs from the list are archived
+repeats=0
+
+# Parallel loop
+if [[ -n "$parallel" ]]; then
+	if ((parallel > 60)); then
+		parallel=60
+		echo "Setting maximum parallel jobs to 60"
+	elif ((parallel < 2)); then
+		parallel=2
+		echo "Setting maximum parallel jobs to 2"
+	fi
+	echo "$parallel" > max_parallel_jobs$f.txt
+	# Overall request rate stays at around 60 per minute
+	echo "$parallel" > status_rate$f.txt
+	while [[ ! -f quit$f.txt ]]; do
+		(
+		hour=`date -u +%H`
+		while IFS='' read -r line || [[ -n "$line" ]]; do
+			capture "$line" & ((children > 2)) && sleep 2.5
+			children_wait=0
+			children=`jobs -p | wc -l`
+			while ! (( children < $(<max_parallel_jobs$f.txt) )); do
+				sleep 1
+				((children_wait++))
+				if ((children_wait < 600)); then
+					children=`jobs -p | wc -l`
+				else
+					# Wait is longer than 600 seconds; something might be wrong
+					# Increase limit and ignore the problem for now
+					children=0
+					echo $(( $(<max_parallel_jobs$f.txt) + 1 )) > max_parallel_jobs$f.txt
+				fi
+			done
+			lock_wait=0
+			while [[ -f lock$f.txt ]]; do
+				sleep 2
+				((lock_wait+=2))
+				if ((lock_wait > 300)); then
+					rm lock$f.txt
+				fi
+			done
+			if [[ -f daily_limit$f.txt ]]; then
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') Pausing for $(( (3600 - $(date +%s) % 3600) / 60 )) minutes"
+				sleep $(( 3600 - $(date +%s) % 3600 ))
+				rm daily_limit$f.txt
+			fi
+			((counter++))
+			# Check failures and outlinks approximately every hour
+			if ! ((counter % 50)) && ! [[ `date -u +%H` == "$hour" || -f quit$f.txt ]]; then
+				hour=`date -u +%H`
+				new_list=$(get_list)
+				if [[ -n "$new_list" ]]; then
+					while IFS='' read -r line2 || [[ -n "$line2" ]]; do
+						capture "$line2" & ((children > 2)) && sleep 2.5
+						children_wait=0
+						children=`jobs -p | wc -l`
+						while ! ((children < $(<max_parallel_jobs$f.txt) )); do
+							sleep 1
+							((children_wait++))
+							if ((children_wait < 600)); then
+								children=`jobs -p | wc -l`
+							else
+								# Wait is longer than 600 seconds; something might be wrong
+								# Increase limit and ignore the problem for now
+								children=0
+								echo $(( $(<max_parallel_jobs$f.txt) + 1 )) > max_parallel_jobs$f.txt
+							fi
+						done
+						lock_wait=0
+						while [[ -f lock$f.txt ]]; do
+							sleep 2
+							((lock_wait+=2))
+							if ((lock_wait > 300)); then
+								rm lock$f.txt
+							fi
+						done
+						if [[ -f daily_limit$f.txt ]]; then
+							echo "$(date -u '+%Y-%m-%d %H:%M:%S') Pausing for $(( (3600 - $(date +%s) % 3600) / 60 )) minutes"
+							sleep $(( 3600 - $(date +%s) % 3600 ))
+							rm daily_limit$f.txt
+						fi
+					done <<< "$new_list"
+					unset new_list
+				fi
+			fi
+		done <<< "$list"
+
+		for job in `jobs -p`; do wait $job; done
+		)
+
+		new_list=$(get_list)
+		if [[ "$new_list" == "$list" ]]; then
+			((repeats++))
+			if ((repeats > 1)); then
+				if ((repeats > 3)); then
+					break
+				else
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') Pausing for 30 minutes"
+					sleep 1800
+				fi
+			fi
+		fi
+		list="$new_list"
+		unset new_list
+		if [[ -z "$list" && -z "$(<failed.txt)" ]]; then
+			# No more URLs
+			touch quit$f.txt
+			rm failed.txt
+		fi
+	done
+fi
+
+if [[ ! -f quit$f.txt ]]; then
+	echo "2" > status_rate$f.txt
+fi
+
+# Linear loop
+while [[ ! -f quit$f.txt ]]; do
+	hour=`date -u +%H`
+	while IFS='' read -r line || [[ -n "$line" ]]; do
+		capture "$line"
+		((counter++))
+		# Check failures and outlinks approximately every hour
+		if ! ((counter % 50)) && ! [[ `date -u +%H` == "$hour" || -f quit$f.txt ]]; then
+			hour=`date -u +%H`
+			new_list=$(get_list)
+			if [[ -n "$new_list" ]]; then
+				while IFS='' read -r line2 || [[ -n "$line2" ]]; do
+					capture "$line2"
+				done <<< "$new_list"
+			fi
+			unset new_list
+		fi
+	done <<< "$list"
+	new_list=$(get_list)
+	if [[ "$new_list" == "$list" ]]; then
+		((repeats++))
+		if ((repeats > 1)); then
+			if ((repeats > 4)); then
+				# Give up
+				touch quit$f.txt
+			else
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') Pausing for 30 minutes"
+				sleep 1800
+			fi
+		fi
+	fi
+	list="$new_list"
+	unset new_list
+	if [[ -z "$list" && -z "$(<failed.txt)" ]]; then
+		# No more URLs
+		touch quit$f.txt
+		rm failed.txt
+	fi
+done
+
+if [[ -n "$custom_dir" ]]; then
+	for i in max_parallel_jobs$f.txt status_rate$f.txt lock$f.txt daily_limit$f.txt quit$f.txt; do
+		if [[ -f "$i" ]]; then
+			rm "$i"
+		fi
+	done
+fi