whoogle-search/app/request.py
Ben Busby b6fb4723f9
Project refactor (#85)
* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-Latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
2020-06-02 12:54:47 -06:00

90 lines
2.9 KiB
Python

from lxml import etree
import random
import requests
from requests import Response
import urllib.parse as urlparse
# Core Google search URLs. Request.send appends the generated query string
# directly to whichever of these is passed as the base URL.
SEARCH_URL = 'https://www.google.com/search?gbv=1&q='
AUTOCOMPLETE_URL = 'https://suggestqueries.google.com/complete/search?client=toolbar&'
# User agent templates filled in by gen_user_agent: the mobile template takes
# two substitutions, the desktop template takes three.
MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
# Valid query params. gen_query seeds its parameter dict with these keys (each
# starting out empty), so they are appended ahead of the language, country and
# safe-search params.
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near']
def gen_user_agent(is_mobile):
    """Return a user agent string built from randomly chosen made-up names.

    Args:
        is_mobile: When truthy, the MOBILE_UA template is used; otherwise
            the DESKTOP_UA template is used.

    Returns:
        The selected template with its placeholders filled in.
    """
    # All three tokens are drawn up front, in this order, regardless of
    # which template ends up consuming them.
    product_token = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla'
    browser_token = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
    platform_token = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'

    if not is_mobile:
        return DESKTOP_UA.format(product_token, platform_token, browser_token)

    # The mobile template has no platform placeholder.
    return MOBILE_UA.format(product_token, browser_token)
def gen_query(query, args, config, near_city=None):
    """Build the query string that is appended to the search base URL.

    Args:
        query: Raw search text. If it contains ``:past`` followed by a time
            range (hour/day/week/month/year), results are restricted to that
            range, e.g. "new restaurants :past month". The ``:past`` text
            itself is left in the query.
        args: Mapping of request arguments; only ``tbm`` and ``start`` are
            read from it.
        config: Object exposing ``lang``, ``ctry`` and ``safe`` attributes.
        near_city: Optional city name to search near.

    Returns:
        The URL-quoted query followed by every non-empty ``&key=value``
        parameter.
    """
    param_dict = {key: '' for key in VALID_PARAMS}

    # Use :past(hour/day/week/month/year) if available
    if ':past' in query:
        time_range = query.split(':past', 1)[-1].strip()
        # A bare ":past" with nothing after it carries no range; skip the
        # filter instead of failing on an empty string.
        if time_range:
            param_dict['tbs'] = '&tbs=qdr:' + time_range[0].lower()

    # Ensure search query is parsable
    query = urlparse.quote(query)

    # Pass along type of results (news, images, books, etc). The value comes
    # from the request, so it is quoted to keep it from adding extra params.
    if 'tbm' in args:
        param_dict['tbm'] = '&tbm=' + urlparse.quote(args.get('tbm'))

    # Get results page start value (10 per page, ie page 2 start val = 20)
    if 'start' in args:
        param_dict['start'] = '&start=' + urlparse.quote(args.get('start'))

    # Search for results near a particular city, if available
    if near_city:
        param_dict['near'] = '&near=' + urlparse.quote(near_city)

    # Set language for results (lr) and interface (hl)
    param_dict['lr'] = (
        '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '')
    )
    param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else ''
    param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')

    # Unset params are still empty strings and contribute nothing.
    return query + ''.join(val for val in param_dict.values() if val)
class Request:
    """Issues search and autocomplete requests using a generated user agent.

    Attributes:
        language: Language value given at construction (default 'lang_en').
        mobile: True when the caller's real user agent contains 'Android'
            or 'iPhone'.
        modified_user_agent: User agent string produced by gen_user_agent
            and sent with every request.
    """

    def __init__(self, normal_ua, language='lang_en'):
        """Store the language and pick a generated user agent.

        Args:
            normal_ua: The caller's real user agent string; only inspected
                to decide between the mobile and desktop templates.
            language: Language value used for autocomplete requests.
        """
        self.language = language
        self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua
        self.modified_user_agent = gen_user_agent(self.mobile)

    def __getitem__(self, name):
        """Allow attribute lookup with subscript syntax (``req['mobile']``)."""
        return getattr(self, name)

    def autocomplete(self, query):
        """Return the autocomplete suggestions for ``query``.

        Args:
            query: Partial search text to get suggestions for.

        Returns:
            A list of suggestion strings; empty when the response body is
            empty or is not well-formed XML.
        """
        # hl takes the bare language code, so drop the 'lang_' prefix the
        # same way gen_query does for its hl parameter.
        interface_lang = (self.language or '').replace('lang_', '')
        ac_query = dict(hl=interface_lang, q=query)
        response = self.send(
            base_url=AUTOCOMPLETE_URL,
            query=urlparse.urlencode(ac_query),
        ).text

        if not response:
            return []

        try:
            dom = etree.fromstring(response)
        except etree.XMLSyntaxError:
            # The body was not XML; treat it as "no suggestions" rather
            # than failing the whole autocomplete call.
            return []

        return dom.xpath('//suggestion/@data')

    def send(self, base_url=SEARCH_URL, query='') -> Response:
        """Send a GET request to ``base_url + query``.

        Args:
            base_url: URL prefix; defaults to the search URL.
            query: Already-encoded query string appended to ``base_url``.

        Returns:
            The ``requests`` Response object.
        """
        # NOTE(review): no timeout is passed, so this call waits for as long
        # as the remote end takes to answer.
        headers = {
            'User-Agent': self.modified_user_agent
        }
        return requests.get(base_url + query, headers=headers)