whoogle-search/app/request.py
Ben Busby 9f435bf8fe Major refactor of requests and session management
- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request
2020-05-28 18:14:10 -06:00

95 lines
3.0 KiB
Python

from io import BytesIO
from lxml import etree
import random
import requests
import urllib.parse as urlparse
# Core Google search URLs
SEARCH_URL = 'https://www.google.com/search?gbv=1&q='
AUTOCOMPLETE_URL = 'https://suggestqueries.google.com/complete/search?client=toolbar&'
MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
# Valid query params
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near']
def gen_user_agent(is_mobile):
mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla'
firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'
if is_mobile:
return MOBILE_UA.format(mozilla, firefox)
else:
return DESKTOP_UA.format(mozilla, linux, firefox)
def gen_query(query, args, config, near_city=None):
param_dict = {key: '' for key in VALID_PARAMS}
# Use :past(hour/day/week/month/year) if available
# example search "new restaurants :past month"
if ':past' in query:
time_range = str.strip(query.split(':past', 1)[-1])
param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0])
# Ensure search query is parsable
query = urlparse.quote(query)
# Pass along type of results (news, images, books, etc)
if 'tbm' in args:
param_dict['tbm'] = '&tbm=' + args.get('tbm')
# Get results page start value (10 per page, ie page 2 start val = 20)
if 'start' in args:
param_dict['start'] = '&start=' + args.get('start')
# Search for results near a particular city, if available
if near_city:
param_dict['near'] = '&near=' + urlparse.quote(near_city)
# Set language for results (lr) and interface (hl)
param_dict['lr'] = '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '')
param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else ''
param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')
for val in param_dict.values():
if not val or val is None:
continue
query += val
return query
class Request:
def __init__(self, normal_ua, language='lang_en'):
self.language = language
self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua
self.modified_user_agent = gen_user_agent(self.mobile)
def __getitem__(self, name):
return getattr(self, name)
def autocomplete(self, query):
ac_query = dict(hl=self.language, q=query)
response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query))
if response:
dom = etree.fromstring(response)
return dom.xpath('//suggestion/@data')
return []
def send(self, base_url=SEARCH_URL, query='', return_bytes=False):
headers = {
'User-Agent': self.modified_user_agent
}
response = requests.get(base_url + query, headers=headers)
if return_bytes:
return response.content
else:
return response.text