Pushing straight to master since this is an extremely simple fix, with a pretty large performance benefit. The Phyme library used for generating a User Agent rhyme was consuming an absolute unit of memory. Now that it's removed, it's using about 10x less memory, at the cost of User Agents being not as funny anymore.
84 lines
2.6 KiB
Python
84 lines
2.6 KiB
Python
from io import BytesIO
|
|
import pycurl
|
|
import random
|
|
import urllib.parse as urlparse
|
|
|
|
# Base search url
|
|
SEARCH_URL = 'https://www.google.com/search?gbv=1&q='
|
|
|
|
MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
|
|
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
|
|
|
|
# Valid query params
|
|
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near']
|
|
|
|
|
|
def gen_user_agent(normal_ua):
|
|
is_mobile = 'Android' in normal_ua or 'iPhone' in normal_ua
|
|
|
|
mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla'
|
|
firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
|
|
linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'
|
|
|
|
if is_mobile:
|
|
return MOBILE_UA.format(mozilla, firefox)
|
|
else:
|
|
return DESKTOP_UA.format(mozilla, linux, firefox)
|
|
|
|
|
|
def gen_query(query, args, near_city=None):
|
|
param_dict = {key: '' for key in VALID_PARAMS}
|
|
# Use :past(hour/day/week/month/year) if available
|
|
# example search "new restaurants :past month"
|
|
if ':past' in query:
|
|
time_range = str.strip(query.split(':past', 1)[-1])
|
|
param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0])
|
|
|
|
# Ensure search query is parsable
|
|
query = urlparse.quote(query)
|
|
|
|
# Pass along type of results (news, images, books, etc)
|
|
if 'tbm' in args:
|
|
param_dict['tbm'] = '&tbm=' + args.get('tbm')
|
|
|
|
# Get results page start value (10 per page, ie page 2 start val = 20)
|
|
if 'start' in args:
|
|
param_dict['start'] = '&start=' + args.get('start')
|
|
|
|
# Search for results near a particular city, if available
|
|
if near_city is not None:
|
|
param_dict['near'] = '&near=' + urlparse.quote(near_city)
|
|
|
|
for val in param_dict.values():
|
|
if not val or val is None:
|
|
continue
|
|
query += val
|
|
|
|
return query
|
|
|
|
|
|
class Request:
|
|
def __init__(self, normal_ua):
|
|
self.modified_user_agent = gen_user_agent(normal_ua)
|
|
|
|
def __getitem__(self, name):
|
|
return getattr(self, name)
|
|
|
|
def send(self, base_url=SEARCH_URL, query='', return_bytes=False):
|
|
response_header = []
|
|
|
|
b_obj = BytesIO()
|
|
crl = pycurl.Curl()
|
|
crl.setopt(crl.URL, base_url + query)
|
|
crl.setopt(crl.USERAGENT, self.modified_user_agent)
|
|
crl.setopt(crl.WRITEDATA, b_obj)
|
|
crl.setopt(crl.HEADERFUNCTION, response_header.append)
|
|
crl.setopt(pycurl.FOLLOWLOCATION, 1)
|
|
crl.perform()
|
|
crl.close()
|
|
|
|
if return_bytes:
|
|
return b_obj.getvalue()
|
|
else:
|
|
return b_obj.getvalue().decode('unicode-escape', 'ignore')
|