2021-01-12 12:43:41 +00:00
/* Imports */
2022-02-11 14:21:36 +00:00
import axios from "axios" ;
import { getCookie , applyIfCookieExists } from "../utils/getCookies.js" ;
import { Tabletojson } from "tabletojson" ;
import toMarkdown from "../utils/toMarkdown.js" ;
import { calculateStars } from "../utils/stars.js" ;
2022-02-11 17:19:33 +00:00
import { upsert } from "../database/mongo-wrapper.js" ;
2021-01-12 12:43:41 +00:00
/* Definitions */
2022-02-11 14:21:36 +00:00
// Base URL of the paginated question listing on Infer.
const htmlEndPoint = "https://www.infer-pub.com/questions";

// Literal (non-regex) replace-all based on split/join. Modern runtimes ship a
// native `String.prototype.replaceAll`; this override is kept so existing
// callers retain the exact split/join semantics (the replacement text is
// inserted verbatim, `$&`-style patterns are NOT interpreted).
// It is installed as a non-enumerable property so that, on runtimes without
// the native method, it does not show up in `for...in` loops over strings.
Object.defineProperty(String.prototype, "replaceAll", {
  value: function replaceAll(search, replace) {
    return this.split(search).join(replace);
  },
  writable: true,
  configurable: true,
  enumerable: false,
});

// NOTE: hard-coded to "on", so every `process.env.DEBUG_MODE == "on" || DEBUG_MODE == "on"`
// check in this file is currently always true. Set to "off" to silence debug output.
const DEBUG_MODE = "on"; // "off"
// Pause between requests = random part in [0, SLEEP_TIME_RANDOM) + SLEEP_TIME_EXTRA.
const SLEEP_TIME_RANDOM = 7000; // milliseconds
const SLEEP_TIME_EXTRA = 2000; // milliseconds
2021-01-12 12:43:41 +00:00
/* Support functions */
2021-10-15 09:40:05 +00:00
/**
 * Downloads one page of the Infer question listing.
 *
 * @param {number} page - 1-based page number, sent as the `page` query parameter.
 * @param {string} cookie - Session cookie string sent in the `Cookie` header.
 * @returns {Promise<string>} The response body (expected to be the page HTML).
 * @throws Whatever `axios` rejects with (network error, non-2xx status, ...).
 */
async function fetchPage(page, cookie) {
  console.log(`Page #${page}`);

  // For the first page only the first `;`-separated cookie pair is sent;
  // per the original author's note the request does not work otherwise.
  const cookieHeader = Number(page) === 1 ? cookie.split(";")[0] : cookie;

  const urlEndpoint = `${htmlEndPoint}/?page=${page}`;
  console.log(urlEndpoint);

  const { data } = await axios({
    url: urlEndpoint,
    method: "GET",
    headers: {
      "Content-Type": "text/html",
      Cookie: cookieHeader,
    },
  });
  return data;
}
2021-10-15 09:40:05 +00:00
/**
 * Converts the `tbodyIndex`-th chunk obtained by splitting the page on the
 * text "tbody" into a list of answer options.
 *
 * @param {string} html - Full HTML of a question's stats page.
 * @param {number} tbodyIndex - Index into `html.split("tbody")`.
 * @returns {Array<{name: string, probability: number, type: string}> | null}
 *   The parsed options, or `null` when no table could be extracted.
 */
function parseOptionsTable(html, tbodyIndex) {
  const tbody = html.split("tbody")[tbodyIndex];
  // Re-wrap the fragment so that it forms a `<table ...>...</table>` element.
  const firstTable = Tabletojson.convert(`<table${tbody}table>`)[0];
  if (!firstTable) {
    return null;
  }
  return firstTable.map((row) => ({
    name: row["0"],
    probability: Number(row["1"].replace("%", "")) / 100,
    type: "PROBABILITY",
  }));
}

/**
 * Extracts the question description from the `<meta name="description">` tag
 * and converts it to markdown.
 *
 * @param {string} html - Full HTML of a question's stats page.
 * @returns {string} The cleaned description, passed through `toMarkdown`.
 * @throws {Error} If the page has no description meta tag.
 */
function extractDescription(html) {
  const raw = html.split('<meta name="description" content="')[1];
  if (raw === undefined) {
    throw new Error("Could not find the description meta tag");
  }
  const cleaned = raw
    .split('">')[0]
    .replace(">", "")
    .replace(
      "To suggest a change or clarification to this question, please select Request Clarification from the green gear-shaped dropdown button to the right of the question.",
      ""
    )
    .replaceAll("\r\n\r\n", "\n")
    .replaceAll("\n\n", "\n")
    // The attribute value is HTML-escaped; restore quotes and apostrophes.
    .replaceAll("&quot;", '"')
    .replaceAll("&#39;", "'");
  return toMarkdown(cleaned);
}

/**
 * Reads a counter from the HTML-escaped JSON embedded in the page, i.e. the
 * text between `<key>&quot;:` and the next comma.
 *
 * @param {string} html - Full HTML of a question's stats page.
 * @param {string} key - JSON key, e.g. "prediction_sets_count".
 * @returns {string} The raw (unparsed) value.
 * @throws {Error} If the key is not present in the page.
 */
function extractCount(html, key) {
  const tail = html.split(`${key}&quot;:`)[1];
  if (tail === undefined) {
    throw new Error(`Could not find "${key}" in the page`);
  }
  return tail.split(",")[0];
}

/**
 * Fetches `<questionUrl>/stats` and scrapes description, answer options and
 * quality indicators from the returned HTML.
 *
 * @param {string} questionUrl - URL of the question (without the `/stats` suffix).
 * @param {string} cookie - Session cookie string sent in the `Cookie` header.
 * @returns {Promise<{description: string, options: Array, timestamp: string,
 *   qualityindicators: {numforecasts: number, numforecasters: number, stars: *}}>}
 * @throws {Error} "Not logged in" when the page shows the sign-in prompt, or
 *   when an expected element (crowd forecast, description, counters) is missing.
 */
async function fetchStats(questionUrl, cookie) {
  const { data: response } = await axios({
    url: `${questionUrl}/stats`,
    method: "GET",
    headers: {
      "Content-Type": "text/html",
      Cookie: cookie,
      Referer: questionUrl,
    },
  });

  if (response.includes("Sign up or sign in to forecast")) {
    throw new Error("Not logged in");
  }

  // The page embeds HTML-escaped JSON, hence the `&quot;` in the needle.
  const isBinary = response.includes("binary?&quot;:true");

  let options;
  if (isBinary) {
    // Crowd percentage: text content of the first line containing an <h3>.
    const h3Line = response.split("\n").find((line) => line.includes("<h3>"));
    if (h3Line === undefined) {
      throw new Error(`No <h3> crowd forecast found for ${questionUrl}`);
    }
    const crowdPercentage = h3Line.split(">")[1].split("<")[0];
    const probability = Number(crowdPercentage.replace("%", "")) / 100;
    options = [
      {
        name: "Yes",
        probability: probability,
        type: "PROBABILITY",
      },
      {
        name: "No",
        probability: +(1 - probability).toFixed(2), // avoids floating point shenanigans
        type: "PROBABILITY",
      },
    ];
  } else {
    // The options table has moved around over time: try position 1 first and
    // fall back to position 3 if that yields nothing or fails to parse.
    try {
      options = parseOptionsTable(response, 1);
    } catch {
      options = null;
    }
    if (options === null) {
      // New type of question whose options are tricky to parse end up as [].
      // https://www.cset-foretell.com/blog/rolling-question-formats
      options = parseOptionsTable(response, 3) ?? [];
    }
  }

  const description = extractDescription(response);
  const numforecasts = extractCount(response, "prediction_sets_count");
  const numforecasters = extractCount(response, "predictors_count");

  return {
    description: description,
    options: options,
    timestamp: new Date().toISOString(),
    qualityindicators: {
      numforecasts: Number(numforecasts),
      numforecasters: Number(numforecasters),
      // NOTE: receives the raw string value, as in the original implementation.
      stars: calculateStars("Infer", { numforecasts }),
    },
  };
}
2021-10-15 09:40:05 +00:00
/**
 * Tells whether a listing page was served to a signed-in session, judged by
 * the absence of the sign-in / sign-up prompts. Logs the outcome.
 *
 * @param {string} html - HTML of a listing page.
 * @returns {boolean} `false` if either prompt text is present, `true` otherwise.
 */
function isSignedIn(html) {
  const isSignedInBool = !(
    html.includes("You need to sign in or sign up before continuing") ||
    html.includes("Sign up")
  );
  if (!isSignedInBool) {
    console.log("Error: Not signed in.");
  }
  console.log(`Signed in? ${isSignedInBool}`);
  return isSignedInBool;
}
2021-07-04 21:37:08 +00:00
2021-10-15 09:40:05 +00:00
/**
 * Tells whether a listing page is past the last page of results, i.e. it
 * contains the "No questions match your filter" notice. Logs the outcome.
 *
 * @param {string} html - HTML of a listing page.
 * @returns {boolean} `true` when the notice is present.
 */
function isEnd(html) {
  const isEndBool = html.includes("No questions match your filter");
  console.log(`IsEnd? ${isEndBool}`);
  return isEndBool;
}
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
/* Body */
2022-02-04 19:54:04 +00:00
/**
 * Walks the paginated Infer question listing, scrapes every question's stats
 * page and stores the collected questions with `upsert(results, "infer-questions")`.
 *
 * The loop stops when a page reports the end of the listing (`isEnd`), when a
 * page looks signed-out (`isSignedIn`), or after several listing pages in a
 * row failed to download. Questions whose stats page cannot be fetched or
 * parsed are logged and left out.
 *
 * @param {string} cookie - Session cookie string forwarded to every request.
 * @returns {Promise<void>}
 * @throws If the very first listing page cannot be fetched, or if `upsert` fails.
 */
async function infer_inner(cookie) {
  // Give up after this many listing pages in a row failed to download, so a
  // persistent outage cannot keep the loop running forever.
  const MAX_CONSECUTIVE_PAGE_FAILURES = 5;
  const questionNumRegex = /questions\/([0-9]+)/;
  const debug = process.env.DEBUG_MODE === "on" || DEBUG_MODE === "on";

  const init = Date.now();
  const results = [];
  let i = 1;
  let consecutivePageFailures = 0;
  let response = await fetchPage(i, cookie);

  while (!isEnd(response) && isSignedIn(response)) {
    // Every line that mentions a question URL is a candidate question link.
    const questionHrefs = response
      .split("\n")
      .filter((line) => line.includes("https://www.infer-pub.com/questions/"));

    if (debug) {
      console.log("questionHrefs: ");
      console.log(questionHrefs);
    }

    for (const questionHref of questionHrefs) {
      const [anchorPart, titlePart] = questionHref.split('"><span>');
      const url = anchorPart.split('<a href="')[1];
      if (url === undefined || titlePart === undefined) {
        // The line mentions a question URL but is not a question anchor
        // (some other kind of link); nothing to scrape here.
        continue;
      }
      const title = titlePart
        .replace("</h4>", "")
        .replace("</h5>", "")
        .replace("</span></a>", "");

      await sleep(Math.random() * SLEEP_TIME_RANDOM + SLEEP_TIME_EXTRA); // don't be as noticeable

      try {
        const moreinfo = await fetchStats(url, cookie);
        const questionNum = url.match(questionNumRegex)[1];
        const question = {
          id: `infer-${questionNum}`,
          title: title,
          url: url,
          platform: "Infer",
          ...moreinfo,
        };
        // Occasional progress output when the verbose debug log is off.
        if (i % 30 === 0 && !debug) {
          console.log(`Page #${i}`);
          console.log(question);
        }
        results.push(question);
        if (debug) {
          console.log(url);
          console.log(question);
        }
      } catch (error) {
        console.log(error);
        console.log(
          `We encountered some error when fetching the URL: ${url}, so it won't appear on the final json`
        );
      }
    }

    i++;
    console.log(
      "Sleeping for ~5secs so as to not be as noticeable to the infer servers"
    );
    await sleep(Math.random() * SLEEP_TIME_RANDOM + SLEEP_TIME_EXTRA); // don't be as noticeable

    try {
      response = await fetchPage(i, cookie);
      consecutivePageFailures = 0;
    } catch (error) {
      console.log(error);
      console.log(
        `The program encountered some error when fetching page #${i}, so it won't appear on the final json. It is possible that this page wasn't actually a prediction question pages`
      );
      // Drop the previous page's HTML: keeping it would scrape the same
      // questions again and push duplicates into `results`.
      response = "";
      consecutivePageFailures++;
      if (consecutivePageFailures >= MAX_CONSECUTIVE_PAGE_FAILURES) {
        console.log(
          `Giving up after ${consecutivePageFailures} consecutive page fetch failures`
        );
        break;
      }
    }
  }

  if (results.length > 0) {
    await upsert(results, "infer-questions");
  } else {
    console.log("Not updating results, as process was not signed in");
  }

  const end = Date.now();
  const difference = end - init;
  console.log(
    `Took ${difference / 1000} seconds, or ${difference / (1000 * 60)} minutes.`
  );
}
2021-04-10 18:18:22 +00:00
2022-02-04 18:32:53 +00:00
/**
 * Entry point of the Infer scraper.
 *
 * Takes the session cookie from the `INFER_COOKIE` environment variable,
 * falling back to `getCookie("infer")` when it is unset or empty, and hands
 * it together with `infer_inner` to `applyIfCookieExists`
 * (presumably that helper only runs the scraper when a cookie is available —
 * confirm in ../utils/getCookies.js).
 *
 * @returns {Promise<void>}
 */
export async function infer() {
  const cookie = process.env.INFER_COOKIE || getCookie("infer");
  await applyIfCookieExists(cookie, infer_inner);
}