diff --git a/longnow.sh b/longnow6-wip.sh
similarity index 55%
rename from longnow.sh
rename to longnow6-wip.sh
index c8b1a25..94d9e9b 100755
--- a/longnow.sh
+++ b/longnow6-wip.sh
@@ -1,9 +1,13 @@
+# To do:
+# - Get API keys from somewhere else
+# - Change installation warnings
+
 # Filenames
 input="$1"
 root="$(echo "$input" | sed 's/.md//g' )"
 links="$root.links.txt"
-archivedLinks="$root.links.archived.txt"
-errors="$root.errors.txt"
+archivedLinks="captures.log" ##"$root.links.archived.txt"
+errors="error-json.log"
 output="$root.longnow.md"
 
 ## Directories
@@ -45,62 +49,8 @@ function pushToArchive(){
 	totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
 	echo "Expected to take ~$totalTimeInMinutes mins."
 	echo ""
-
-	## rm -f "$archivedLinks"
-	rm -f "$errors"
-	touch "$archivedLinks"
-	touch "$errors"
-
-	## How to deal with errors that arise
-	echo "If this file contains errors, you can deal with them as follows:" >> "$errors"
-	echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errors"
-	echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors"
-	echo "" >> "$errors"
-
-	## Main body
-	counter=1
-	while IFS= read -r line
-	do
-		wait
-		if [ $(($counter % 15)) -eq 0 ]; then
-			printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
-			sleep 1m
-		fi
-		echo "Url: $line"
-		urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )
-
-		if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then
-			urlAlreadyInArchiveOnline="$(curl --silent http://archive.org/wayback/available?url=$line | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g' )"
-			if [ "$urlAlreadyInArchiveOnline" == "" ]; then
-				echo "Sending to archive..."
-				archiveURL=$(archivenow --ia $line)
-				if [[ "$archiveURL" == "Error"* ]]; then
-					echo "$line" >> "$errors"
-					echo "$archiveURL" >> "$errors"
-					echo "" >> "$errors"
-					echo "There was an error. See $errors for how to deal with it."
-					echo ""
-				else
-					echo "$archiveURL" >> "$archivedLinks"
-				fi
-				counter=$((counter+1))
-				numSecondsSleep=$((5+ ($RANDOM%15)))
-			else
-				echo "Already in archive.org: $urlAlreadyInArchiveOnline"
-				echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
-				echo ""
-				numSecondsSleep=0
-			fi
-		elif [ ! -z "$urlAlreadyContainedInLocalArchivedLinks" ]; then
-			echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
-			archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
-			numSecondsSleep=0
-			# echo $archiveURL
-			echo "Sleeping for $numSecondsSleep seconds..."
-			sleep $numSecondsSleep
-			echo ""
-		fi
-	done < "$links"
+
+	/home/loki/.bash/src/longnow/spn/wayback-machine-spn-scripts/spn.sh -a [my private key] -f . -p 3 "$links"
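+	# spn.sh flags: -a passes the S3 API keys (accesskey:secret, from
+	# https://archive.org/account/s3.php), -f . keeps spn.sh's data files
+	# (captures.log, error-json.log, ...) in the current directory, which is
+	# where $archivedLinks and $errors now point, and -p 3 runs three
+	# capture jobs in parallel.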
-p 3 "$links" echo "Done pushing links to archive.org" echo "" @@ -123,7 +73,7 @@ function addArchiveLinksToFile(){ ## echo "ArchivedUrl: $archivedUrl" urlForSed="${url//\//\\/}" archiveUrlForSed="${archivedUrl//\//\\/}" - sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$output" + sed -i "s/$urlForSed)/$urlForSed) ([a](https:\/\/web.archive.org$archiveUrlForSed))/g" "$output" ##else ##echo "There was an error for $url; see the $errorsFile" fi @@ -147,8 +97,7 @@ function explainJqInstallation(){ } ## Report errors function reportErrors(){ - numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }') - if [ "$numLinesErrorFile" -gt 4 ]; then + if test -f "$errors"; then echo "It seems that there are errors. To view and deal with them, see the $errors file" fi } diff --git a/spn.sh b/spn.sh new file mode 100755 index 0000000..0716574 --- /dev/null +++ b/spn.sh @@ -0,0 +1,661 @@ +#!/bin/bash + +auth='' +curl_args=() +post_data='' +custom_dir='' +dir_suffix='' +no_errors='' +outlinks='' +parallel='' +quiet='' +resume='' +ssl_only='' +include_pattern='' +exclude_pattern='' + +print_usage() { + echo "Usage: $(basename "$0") [options] file + $(basename "$0") [options] url [url]... + $(basename "$0") [options] -r folder + +Options: + -a auth S3 API keys, in the form accesskey:secret + (get account keys at https://archive.org/account/s3.php) + + -c args pass additional arguments to curl + + -d data capture request options, or other arbitrary POST data + + -f folder use a custom location for the data folder + (some files will be overwritten or deleted during the session) + + -i suffix add a suffix to the name of the data folder + (if -f is used, -i is ignored) + + -n tell Save Page Now not to save errors into the Wayback Machine + + -o pattern save detected capture outlinks matching regex (ERE) pattern + + -p N run at most N capture jobs in parallel (off by default) + + -q discard JSON for completed jobs instead of writing to log file + + -r folder resume with the remaining URLs of an aborted session + (settings are not carried over, except for outlinks options) + + -s use HTTPS for all captures and change HTTP input URLs to HTTPS + + -x pattern save detected capture outlinks not matching regex (ERE) pattern + (if -o is also used, outlinks are filtered using both regexes)" +} + +while getopts 'a:c:d:f:i:no:p:qr:sx:' flag; do + case "${flag}" in + a) auth="$OPTARG" ;; + c) declare -a "curl_args=($OPTARG)" ;; + d) post_data="$OPTARG" ;; + f) custom_dir="$OPTARG" ;; + i) dir_suffix="-$OPTARG" ;; + n) no_errors='true' ;; + o) outlinks='true'; include_pattern="$OPTARG" ;; + p) parallel="$OPTARG" ;; + q) quiet='true' ;; + r) resume="$OPTARG" ;; + s) ssl_only='true' ;; + x) outlinks='true'; exclude_pattern="$OPTARG" ;; + *) print_usage + exit 1 ;; + esac +done +shift "$((OPTIND-1))" + +if [[ -n "$resume" ]]; then + # There should not be any arguments + if [[ -n "$1" ]]; then + print_usage + exit 1 + fi + # Get list + # List will be constructed from the specified folder + if [[ ! -d "$resume" ]]; then + echo "The folder $resume could not be found" + exit 1 + fi + cd "$resume" + if ! 
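+	# Note: per the usage text, a resumed session only carries over the
+	# outlinks patterns; all other capture settings must be passed again.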
[[ -f "index.txt" && -f "success.log" ]]; then + echo "Could not resume session; required files not found" + exit 1 + fi + if [[ -f "outlinks.txt" ]]; then + # Index will also include successful redirects, which should be logged in captures.log + if [[ -f "captures.log" ]]; then + success=$(cat success.log captures.log | sed -Ee 's|^/web/[0-9]+/||g') + else + success=$(> index.txt + echo "$success" >> success.log +fi +echo "$list" | awk '!seen [$0]++' >> index.txt +if [[ -n "$outlinks" ]]; then + touch outlinks.txt + # Create both files even if one of them would be empty + echo "$include_pattern" > include_pattern.txt + echo "$exclude_pattern" > exclude_pattern.txt +fi + +# Submit a URL to Save Page Now and check the result +function capture(){ + local tries="0" + local request + local job_id + local message + while ((tries < 3)); do + # Submit + local lock_wait=0 + local start_time=`date +%s` + while :; do + if (( $(date +%s) - start_time > 300 )); then + break 2 + fi + if [[ -n "$auth" ]]; then + request=$(curl "${curl_args[@]}" -s -m 60 -X POST --data-urlencode "url=${1}" -d "${post_data}" -H "Accept: application/json" -H "Authorization: LOW ${auth}" "https://web.archive.org/save/") + job_id=$(echo "$request" | grep -Eo '"job_id":"([^"\\]|\\["\\])*"' | head -1 | sed -Ee 's/"job_id":"(.*)"/\1/g') + if [[ -n "$job_id" ]]; then + break + fi + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $1" + message=$(echo "$request" | grep -Eo '"message":"([^"\\]|\\["\\])*"' | sed -Ee 's/"message":"(.*)"/\1/g') + else + request=$(curl "${curl_args[@]}" -s -m 60 -X POST --data-urlencode "url=${1}" -d "${post_data}" "https://web.archive.org/save/") + job_id=$(echo "$request" | grep -E 'spn\.watchJob\(' | sed -Ee 's/^.*spn\.watchJob\("([^"]*).*$/\1/g' | head -1) + if [[ -n "$job_id" ]]; then + break + fi + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $1" + message=$(echo "$request" | grep -E -A 1 "
+			fi
+			if [[ -z "$message" ]]; then
+				if [[ "$request" =~ "429 Too Many Requests" ]] || [[ "$request" == "" ]]; then
+					echo "$request"
+					if [[ ! -f lock$f.txt ]]; then
+						touch lock$f.txt
+						sleep 20
+						rm lock$f.txt
+					else
+						break 2
+					fi
+				elif [[ "$request" =~ "400 Bad Request" ]]; then
+					echo "$request"
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1"
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') $1" >> invalid.log
+					echo "$request" >> invalid.log
+					return 1
+				else
+					sleep 5
+				fi
+			else
+				echo "$message"
+				if ! [[ "$message" =~ "You have already reached the limit of active sessions" || "$message" =~ "Cannot start capture" || "$message" =~ "The server encountered an internal error and was unable to complete your request" || "$message" =~ "Crawling this host is paused" ]]; then
+					if [[ "$message" =~ "You cannot make more than "[1-9][0-9,]*" captures per day" ]]; then
+						touch daily_limit$f.txt
+						break 2
+					else
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1"
+						echo "$(date -u '+%Y-%m-%d %H:%M:%S') $1" >> invalid.log
+						echo "$message" >> invalid.log
+						return 1
+					fi
+				fi
+				if [[ ! -f lock$f.txt ]]; then
+					touch lock$f.txt
+					while [[ -f lock$f.txt ]]; do
+						# Retry the request until either the job is submitted or a different error is received
+						sleep 2
+						if [[ -n "$auth" ]]; then
+							request=$(curl "${curl_args[@]}" -s -m 60 -X POST --data-urlencode "url=${1}" -d "${post_data}" -H "Accept: application/json" -H "Authorization: LOW ${auth}" "https://web.archive.org/save/")
+							job_id=$(echo "$request" | grep -Eo '"job_id":"([^"\\]|\\["\\])*"' | head -1 | sed -Ee 's/"job_id":"(.*)"/\1/g')
+							if [[ -n "$job_id" ]]; then
+								rm lock$f.txt
+								break 2
+							fi
+							echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $1"
+							message=$(echo "$request" | grep -Eo '"message":"([^"\\]|\\["\\])*"' | sed -Ee 's/"message":"(.*)"/\1/g')
+						else
+							request=$(curl "${curl_args[@]}" -s -m 60 -X POST --data-urlencode "url=${1}" -d "${post_data}" "https://web.archive.org/save/")
+							job_id=$(echo "$request" | grep -E 'spn\.watchJob\(' | sed -Ee 's/^.*spn\.watchJob\("([^"]*).*$/\1/g' | head -1)
+							if [[ -n "$job_id" ]]; then
+								rm lock$f.txt
+								break 2
+							fi
+							echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Request failed] $1"
+							message=$(echo "$request" | grep -E -A 1 "<h2>" | tail -1 | sed -Ee 's|</?p>||g')
+						fi
" | tail -1 | sed -Ee 's|||g') + fi + if [[ -z "$message" ]]; then + if [[ "$request" =~ "429 Too Many Requests" ]] || [[ "$request" == "" ]]; then + echo "$request" + sleep 20 + else + sleep 5 + rm lock$f.txt + break + fi + else + echo "$message" + if [[ "$message" =~ "You have already reached the limit of active sessions" || "$message" =~ "Cannot start capture" || "$message" =~ "The server encountered an internal error and was unable to complete your request" || "$message" =~ "Crawling this host is paused" ]]; then + : + elif [[ "$message" =~ "You cannot make more than "[1-9][0-9,]*" captures per day" ]]; then + rm lock$f.txt + touch daily_limit$f.txt + break 3 + else + rm lock$f.txt + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1" + echo "$(date -u '+%Y-%m-%d %H:%M:%S') $1" >> invalid.log + echo "$message" >> invalid.log + return 1 + fi + fi + done + else + # If another process has already created lock.txt, wait for the other process to remove it + while [[ -f lock$f.txt ]]; do + sleep 5 + ((lock_wait+=5)) + if ((lock_wait > 120)); then + break 3 + fi + done + fi + fi + done + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job submitted] $1" + + # Wait + delay=$(echo "$request" | grep -Eo 'Your capture will begin in [1-9][0-9,]*s' | sed -Ee 's/[^0-9]*//g') + if [[ -z "$delay" ]]; then + delay="0" + fi + local start_time=`date +%s` + local status + local status_ext + while :; do + sleep "$(> unknown-json.log + break 2 + fi + fi + fi + if [[ -z "$status" ]]; then + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Unknown error] $1" + echo "$request" >> unknown-json.log + break 2 + fi + if [[ "$status" == '"status":"success"' ]]; then + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job completed] $1" + echo "$1" >> success.log + timestamp=$(echo "$request" | grep -Eo '"timestamp":"[0-9]*"' | sed -Ee 's/^"timestamp":"(.*)"/\1/g') + url=$(echo "$request" | grep -Eo '"original_url":"([^"\\]|\\["\\])*"' | sed -Ee 's/^"original_url":"(.*)"/\1/g;s/\\(["\\])/\1/g') + echo "/web/$timestamp/$url" >> captures.log + if [[ -z "$quiet" ]]; then + echo "$request" >> success-json.log + fi + if [[ -n "$outlinks" ]]; then + if [[ "$url" != "$1" ]]; then + # Prevent the URL from being submitted twice + echo "$url" >> index.txt + fi + # grep matches array of strings (most special characters are converted server-side, but not square brackets) + # sed transforms the array into just the URLs separated by line breaks + echo "$request" | grep -Eo '"outlinks":\["([^"\\]|\\["\\])*"(,"([^"\\]|\\["\\])*")*\]' | sed -Ee 's/"outlinks":\["(.*)"\]/\1/g;s/(([^"\\]|\\["\\])*)","/\1\ +/g;s/\\(["\\])/\1/g' | { [[ -n "$(> outlinks.txt + fi + return 0 + elif [[ "$status" == '"status":"pending"' ]]; then + if (( $(date +%s) - start_time > 600 + delay )); then + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job timed out] $1" + break 2 + fi + elif [[ "$status" == '"status":"error"' ]]; then + echo "$request" >> error-json.log + status_ext=$(echo "$request" | grep -Eo '"status_ext":"([^"\\]|\\["\\])*"' | head -1 | sed -Ee 's/"status_ext":"(.*)"/\1/g') + if [[ -z "$status_ext" ]]; then + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Unknown error] $1" + break 2 + fi + if [[ "$status_ext" == 'error:filesize-limit' ]]; then + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [File size limit of 2 GB exceeded] $1" + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1" + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$status_ext] $1" >> failed.log + return 1 + elif [[ "$status_ext" == 'error:proxy-error' ]]; then + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [SPN proxy error] $1" + else + 
message=$(echo "$request" | grep -Eo '"message":"([^"\\]|\\["\\])*"' | sed -Ee 's/"message":"(.*)"/\1/g') + if [[ -z "$message" ]]; then + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Unknown error: $status_ext] $1" + break 2 + fi + if [[ "$message" == "Live page is not available: chrome-error://chromewebdata/" ]]; then + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [SPN internal error] $1" + elif [[ "$message" =~ ' (HTTP status='(40[89]|429|50[023478])').'$ ]] || [[ "$message" =~ "The server didn't respond in time" ]]; then + # HTTP status 408, 409, 429, 500, 502, 503, 504, 507 or 508, or didn't respond in time + # URL may become available later + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$message] $1" + break 2 + elif [[ "$message" =~ ' (HTTP status='[45][0-9]*').'$ ]]; then + # HTTP error; assume the URL cannot be archived + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$message] $1" + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1" + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$status_ext] $1" >> failed.log + return 1 + else + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [$message] $1" + break 2 + fi + fi + break + else + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Unknown error] $1" + break 2 + fi + done + ((tries++)) + done + echo "$(date -u '+%Y-%m-%d %H:%M:%S') [Job failed] $1" + echo "$1" >> failed.txt + return 1 +} + +function get_list(){ + local failed_file=failed-$(date +%s).txt + mv failed.txt $failed_file + touch failed.txt + local failed_list=$(<$failed_file) + + if [[ -n "$outlinks" ]]; then + local outlinks_file=outlinks-$(date +%s).txt + mv outlinks.txt $outlinks_file + touch outlinks.txt + # Remove duplicate lines; reading into string prevents awk from emptying the file + awk '!seen [$0]++' <<< "$(<$outlinks_file)" > $outlinks_file + # Convert links to HTTPS + if [[ -n "$ssl_only" ]]; then + sed -Ee 's|^[[:blank:]]*(https?://)?[[:blank:]]*([^[:blank:]]+)|https://\2|g;s|^https://ftp://|ftp://|g' <<< "$(<$outlinks_file)" > $outlinks_file + fi + # Remove lines that are already in index.txt + local outlinks_list=$(awk '{if (f==1) { r[$0] } else if (! ($0 in r)) { print $0 } } ' f=1 index.txt f=2 $outlinks_file) + + if [[ -n "$outlinks_list" ]]; then + echo "$outlinks_list" >> index.txt + + if [[ -n "$failed_list" ]]; then + echo "$failed_list +$outlinks_list" + else + echo "$outlinks_list" + fi + fi + if [[ -z "$(<$outlinks_file)" ]]; then + rm $outlinks_file + fi + else + echo "$failed_list" + fi + if [[ -z "$failed_list" ]]; then + rm $failed_file + fi +} + +# Track the number of loops in which no URLs from the list are archived +repeats=0 + +# Parallel loop +if [[ -n "$parallel" ]]; then + if ((parallel > 60)); then + parallel=60 + echo "Setting maximum parallel jobs to 60" + elif ((parallel < 2)); then + parallel=2 + echo "Setting maximum parallel jobs to 2" + fi + echo "$parallel" > max_parallel_jobs$f.txt + # Overall request rate stays at around 60 per minute + echo "$parallel" > status_rate$f.txt + while [[ ! -f quit$f.txt ]]; do + ( + hour=`date -u +%H` + while IFS='' read -r line || [[ -n "$line" ]]; do + capture "$line" & ((children > 2)) && sleep 2.5 + children_wait=0 + children=`jobs -p | wc -l` + while ! 
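+				# Throttle: block until the number of running capture jobs drops
+				# below the limit in max_parallel_jobs$f.txt; the file is re-read
+				# on every check, so the limit can change mid-session.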
+ > max_parallel_jobs$f.txt
+				fi
+			done
+			lock_wait=0
+			while [[ -f lock$f.txt ]]; do
+				sleep 2
+				((lock_wait+=2))
+				if ((lock_wait > 300)); then
+					rm lock$f.txt
+				fi
+			done
+			if [[ -f daily_limit$f.txt ]]; then
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') Pausing for $(( (3600 - $(date +%s) % 3600) / 60 )) minutes"
+				sleep $(( 3600 - $(date +%s) % 3600 ))
+				rm daily_limit$f.txt
+			fi
+			((counter++))
+			# Check failures and outlinks approximately every hour
+			if ! ((counter % 50)) && ! [[ `date -u +%H` == "$hour" || -f quit$f.txt ]]; then
+				hour=`date -u +%H`
+				new_list=$(get_list)
+				if [[ -n "$new_list" ]]; then
+					while IFS='' read -r line2 || [[ -n "$line2" ]]; do
+						capture "$line2" & ((children > 2)) && sleep 2.5
+						children_wait=0
+						children=`jobs -p | wc -l`
+						while ! ((children < $(<max_parallel_jobs$f.txt) )); do
+ > max_parallel_jobs$f.txt
+							fi
+						done
+						lock_wait=0
+						while [[ -f lock$f.txt ]]; do
+							sleep 2
+							((lock_wait+=2))
+							if ((lock_wait > 300)); then
+								rm lock$f.txt
+							fi
+						done
+						if [[ -f daily_limit$f.txt ]]; then
+							echo "$(date -u '+%Y-%m-%d %H:%M:%S') Pausing for $(( (3600 - $(date +%s) % 3600) / 60 )) minutes"
+							sleep $(( 3600 - $(date +%s) % 3600 ))
+							rm daily_limit$f.txt
+						fi
+					done <<< "$new_list"
+					unset new_list
+				fi
+			fi
+		done <<< "$list"
+
+		for job in `jobs -p`; do wait $job; done
+		)
+
+		new_list=$(get_list)
+		if [[ "$new_list" == "$list" ]]; then
+			((repeats++))
+			if ((repeats > 1)); then
+				if ((repeats > 3)); then
+					break
+				else
+					echo "$(date -u '+%Y-%m-%d %H:%M:%S') Pausing for 30 minutes"
+					sleep 1800
+				fi
+			fi
+		fi
+		list="$new_list"
+		unset new_list
+		if [[ -z "$list" && -z "$(
+ > status_rate$f.txt
+fi
+
+# Linear loop
+while [[ ! -f quit$f.txt ]]; do
+	hour=`date -u +%H`
+	while IFS='' read -r line || [[ -n "$line" ]]; do
+		capture "$line"
+		((counter++))
+		# Check failures and outlinks approximately every hour
+		if ! ((counter % 50)) && ! [[ `date -u +%H` == "$hour" || -f quit$f.txt ]]; then
+			hour=`date -u +%H`
+			new_list=$(get_list)
+			if [[ -n "$new_list" ]]; then
+				while IFS='' read -r line2 || [[ -n "$line2" ]]; do
+					capture "$line2"
+				done <<< "$new_list"
+			fi
+			unset new_list
+		fi
+	done <<< "$list"
+	new_list=$(get_list)
+	if [[ "$new_list" == "$list" ]]; then
+		((repeats++))
+		if ((repeats > 1)); then
+			if ((repeats > 4)); then
+				# Give up
+				touch quit$f.txt
+			else
+				echo "$(date -u '+%Y-%m-%d %H:%M:%S') Pausing for 30 minutes"
+				sleep 1800
+			fi
+		fi
+	fi
+	list="$new_list"
+	unset new_list
+	if [[ -z "$list" && -z "$(