// Source: metaforecast/src/platforms/infer-fetch.js
// (255 lines, 8.7 KiB, JavaScript)

/* Imports */
import axios from "axios"
import { getCookie, applyIfCookieExists } from "../utils/getCookies.js"
import { Tabletojson } from "tabletojson"
import toMarkdown from "../utils/toMarkdown.js"
import { calculateStars } from "../utils/stars.js"
import { upsert } from "../utils/mongo-wrapper.js"
/* Definitions */
// Base URL of the paginated question listing; the page number is appended to it.
const htmlEndPoint = 'https://www.cset-foretell.com/questions?page='

// NOTE(review): this unconditionally replaces String.prototype.replaceAll for the
// whole process with a split/join substitution. Unlike the native method it does
// not interpret `$`-patterns in the replacement string. It is kept as-is because
// code elsewhere may depend on this behaviour -- confirm before removing.
String.prototype.replaceAll = function replaceAll(search, replace) { return this.split(search).join(replace); }

// Verbose logging switch: "on" or "off".
const DEBUG_MODE = "on"// "off"

// Delay between requests is Math.random() * SLEEP_TIME_RANDOM + SLEEP_TIME_EXTRA.
const SLEEP_TIME_RANDOM = 100 // milliseconds; previously 5000
const SLEEP_TIME_EXTRA = 0 // milliseconds; previously 1000
/* Support functions */
/**
 * Downloads one page of the question listing as raw HTML.
 *
 * @param {number} page - Page number appended to `htmlEndPoint`.
 * @param {string} cookie - Session cookie sent in the `Cookie` header.
 * @returns {Promise<string>} The response body (axios `data`).
 */
async function fetchPage(page, cookie) {
  console.log(`Page #${page}`)

  // For the first page only the first `;`-separated segment of the cookie is
  // sent; per the original author the request otherwise doesn't work.
  const cookieHeader = page === 1 ? cookie.split(";")[0] : cookie

  const urlEndpoint = `${htmlEndPoint}${page}`
  console.log(urlEndpoint)

  const response = await axios({
    url: urlEndpoint,
    method: 'GET',
    headers: {
      'Content-Type': 'text/html',
      'Cookie': cookieHeader,
    },
  })
  return response.data
}
/**
 * Extracts the answer options from one of the tables of a stats page.
 *
 * The page is split on the literal "tbody"; for `<tbody>rows</tbody>` the chunk
 * between the two occurrences is `>rows</`, so wrapping it in `<table` ... `table>`
 * rebuilds a parseable `<table>rows</table>` element.
 *
 * @param {string} html - Full HTML of the stats page.
 * @param {number} tbodyIndex - Which chunk of `html.split("tbody")` to parse.
 * @returns {Array<{name: string, probability: number, type: string}>|null}
 *   The parsed options, or `null` when no table could be read from that chunk.
 */
function parseOptionsTable(html, tbodyIndex) {
  const optionsBody = html.split("tbody")[tbodyIndex]
  const optionsHtmlElement = "<table" + optionsBody + "table>"
  const firstTable = Tabletojson.convert(optionsHtmlElement)[0]
  if (!firstTable) {
    return null
  }
  return firstTable.map(row => ({
    name: row['0'],
    probability: Number(row['1'].replace("%", "")) / 100,
    type: "PROBABILITY"
  }))
}

/**
 * Fetches the "/stats" page of a question and scrapes its details.
 *
 * @param {string} questionUrl - URL of the question; "/stats" is appended to it.
 * @param {string} cookie - Session cookie sent in the `Cookie` header.
 * @returns {Promise<{description: string, options: Array, timestamp: string, qualityindicators: object}>}
 *   Description, options, an ISO timestamp of the scrape and quality indicators.
 * @throws {Error} "Not logged in" when the page shows the sign-in prompt; also
 *   throws when the expected markers are missing from the page.
 */
async function fetchStats(questionUrl, cookie) {
  const { data: response } = await axios({
    url: questionUrl + "/stats",
    method: 'GET',
    headers: {
      'Content-Type': 'text/html',
      'Cookie': cookie,
      'Referer': questionUrl,
    },
  })

  if (response.includes("Sign up or sign in to forecast")) {
    throw new Error("Not logged in")
  }

  // Is binary? The marker appears HTML-escaped in the page, like the
  // `prediction_sets_count&quot;:` and `predictors_count&quot;:` markers below.
  const isbinary = response.includes("binary?&quot;:true")

  let options = []
  if (isbinary) {
    // Crowd percentage: text content of the first line containing an <h3>.
    const h3Element = response.split("\n").find(str => str.includes("<h3>"))
    if (h3Element === undefined) {
      throw new Error("Could not find the <h3> element holding the crowd percentage")
    }
    const crowdpercentage = h3Element.split(">")[1].split("<")[0]
    const probability = Number(crowdpercentage.replace("%", "")) / 100
    options.push({
      name: "Yes",
      probability: probability,
      type: "PROBABILITY"
    }, {
      name: "No",
      probability: +(1 - probability).toFixed(2), // avoids floating point shenanigans
      type: "PROBABILITY"
    })
  } else {
    // The options table has moved around over time: try chunk 1 first and fall
    // back to chunk 3 if that one is missing or cannot be parsed.
    let parsedOptions = null
    try {
      parsedOptions = parseOptionsTable(response, 1)
    } catch (error) {
      // Deliberate fallback: any failure here is retried on the other position.
      parsedOptions = null
    }
    if (parsedOptions === null) {
      parsedOptions = parseOptionsTable(response, 3)
    }
    if (parsedOptions !== null) {
      options = parsedOptions
    }
    // Otherwise: new type of question, tricky to parse the options.
    // Just leave options = [] for now.
    // https://www.cset-foretell.com/blog/rolling-question-formats
  }

  // Description: content of the <meta name="description"> tag, cleaned up.
  const descriptionraw = response.split(`<meta name="description" content="`)[1]
  const description = toMarkdown(
    descriptionraw
      .split(`">`)[0]
      .replace(">", "")
      .replace("To suggest a change or clarification to this question, please select Request Clarification from the green gear-shaped dropdown button to the right of the question.", ``)
      .replaceAll("\r\n\r\n", "\n")
      .replaceAll("\n\n", "\n")
      .replaceAll("&quot;", `"`)
      .replaceAll("&#39;", "'")
  )

  // Number of forecasts and of predictors, read from the escaped JSON in the page.
  const numforecasts = response.split("prediction_sets_count&quot;:")[1].split(",")[0]
  const numforecasters = response.split("predictors_count&quot;:")[1].split(",")[0]

  return {
    "description": description,
    "options": options,
    "timestamp": new Date().toISOString(),
    "qualityindicators": {
      "numforecasts": Number(numforecasts),
      "numforecasters": Number(numforecasters),
      // calculateStars receives the raw (string) count, as before.
      "stars": calculateStars("CSET-foretell", { numforecasts })
    }
  }
}
/**
 * Tells whether a fetched page was served to a signed-in session.
 *
 * The page counts as signed in when it contains neither of the two
 * logged-out marker texts. The outcome is also written to the console.
 *
 * @param {string} html - HTML of the fetched page.
 * @returns {boolean} `true` when no logged-out marker is present.
 */
function isSignedIn(html) {
  const loggedOutMarkers = [
    "You need to sign in or sign up before continuing",
    "Sign up",
  ]
  const signedIn = loggedOutMarkers.every(marker => !html.includes(marker))
  if (signedIn === false) {
    console.log("Error: Not signed in.")
  }
  console.log(`Signed in? ${signedIn}`)
  return signedIn
}
/**
 * Tells whether a listing page is past the last page of questions.
 *
 * @param {string} html - HTML of the fetched listing page.
 * @returns {boolean} `true` when the page contains the
 *   "No questions match your filter" text. The outcome is also logged.
 */
function isEnd(html) {
  const isEndBool = html.includes("No questions match your filter")
  console.log(`IsEnd? ${isEndBool}`)
  return isEndBool
}
/**
 * Returns a promise that settles once a timer of `ms` milliseconds fires.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<undefined>} Resolves with no value.
 */
function sleep(ms) {
  const timerElapsed = (wake) => {
    setTimeout(wake, ms)
  }
  return new Promise(timerElapsed)
}
/* Body */
/**
 * Walks the question listing page by page, scrapes every question found and
 * stores the collected questions with `upsert`.
 *
 * The loop ends when a page reports the end of the listing, when a page looks
 * signed out, or after several consecutive page downloads have failed.
 *
 * @param {string} cookie - Session cookie forwarded to every request.
 * @returns {Promise<void>}
 */
async function csetforetell_inner(cookie) {
  const debug = process.env.DEBUG_MODE === "on" || DEBUG_MODE === "on"
  // Stop instead of looping forever when the listing cannot be downloaded.
  const maxConsecutivePageFailures = 3

  let i = 1
  let response = await fetchPage(i, cookie)
  const results = []
  const init = Date.now()
  let consecutivePageFailures = 0

  while (!isEnd(response) && isSignedIn(response)) {
    // Question links are recognised as lines mentioning a question URL.
    const questionHrefs = response
      .split("\n")
      .filter(str => str.includes("https://www.cset-foretell.com/questions/"))
    if (debug) {
      console.log("questionHrefs: ")
      console.log(questionHrefs)
    }

    for (const questionHref of questionHrefs) {
      const elementSplit = questionHref.split('"><span>')
      const url = elementSplit[0].split('<a href="')[1]
      if (url === undefined || elementSplit[1] === undefined) {
        // The line mentions a question URL but is not a `<a href="..."><span>` link;
        // skip it rather than letting a TypeError abort the whole run.
        if (debug) {
          console.log(questionHref)
        }
        continue
      }
      const title = elementSplit[1].replace('</h4>', "").replace('</h5>', "").replace("</span></a>", "")

      await sleep(Math.random() * SLEEP_TIME_RANDOM + SLEEP_TIME_EXTRA) // don't be as noticeable
      try {
        const moreinfo = await fetchStats(url, cookie)
        const question = {
          "title": title,
          "url": url,
          "platform": "CSET-foretell",
          ...moreinfo
        }
        // Occasional progress output when the verbose debug logging is off.
        if (i % 30 === 0 && !debug) {
          console.log(`Page #${i}`)
          console.log(question)
        }
        results.push(question)
        if (debug) {
          console.log(url)
          console.log(question)
        }
      } catch (error) {
        console.log(error)
        console.log(`We encountered some error when fetching the URL: ${url}, so it won't appear on the final json`)
      }
    }

    i++
    console.log("Sleeping for ~5secs so as to not be as noticeable to the cset-foretell servers")
    await sleep(Math.random() * SLEEP_TIME_RANDOM + SLEEP_TIME_EXTRA) // don't be as noticeable
    try {
      response = await fetchPage(i, cookie)
      consecutivePageFailures = 0
    } catch (error) {
      console.log(error)
      console.log(`The program encountered some error when fetching page #${i}, so it won't appear on the final json. It is possible that this page wasn't actually a prediction question pages`)
      consecutivePageFailures++
      if (consecutivePageFailures >= maxConsecutivePageFailures) {
        console.log(`Stopping after ${consecutivePageFailures} consecutive page fetch failures`)
        break
      }
      // Blank the stale page so the previous page's questions are not scraped
      // and stored a second time; the loop then moves on to the next page.
      response = ""
    }
  }

  if (results.length > 0) {
    await upsert(results, "csetforetell-questions")
  } else {
    console.log("Not updating results, as process was not signed in")
  }

  const end = Date.now()
  const difference = end - init
  console.log(`Took ${difference / 1000} seconds, or ${difference / (1000 * 60)} minutes.`)
}
/**
 * Entry point of the platform fetcher.
 *
 * Takes the session cookie from the CSETFORETELL_COOKIE environment variable,
 * falling back to `getCookie("csetforetell")` when that is unset or empty, and
 * hands it together with `csetforetell_inner` to `applyIfCookieExists`
 * (presumably that helper runs the scraper only when a cookie is available --
 * confirm in ../utils/getCookies.js).
 *
 * @returns {Promise<void>}
 */
export async function infer() {
  const cookie = process.env.CSETFORETELL_COOKIE || getCookie("csetforetell")
  await applyIfCookieExists(cookie, csetforetell_inner)
}