From 9f435bf8fefd1319b69ad46b967ba914df393d6f Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Thu, 28 May 2020 18:14:10 -0600 Subject: [PATCH] Major refactor of requests and session management - Switches from pycurl to requests library - Allows for less janky decoding, especially with non-latin character sets - Adds session level management of user configs - Allows for each session to set its own config (people are probably going to complain about this, though not sure if it'll be the same number of people who are upset that their friends/family have to share their config) - Updates key gen/regen to more aggressively swap out keys after each request --- .gitignore | 1 + app/__init__.py | 13 +++- app/filter.py | 139 +++++++++++++++++++---------------- app/request.py | 30 +++----- app/routes.py | 129 +++++++++++++++----------------- app/static/config/.gitignore | 1 + app/templates/header.html | 4 +- app/templates/index.html | 10 +-- app/utils/__init__.py | 0 app/utils/misc.py | 20 +++++ app/utils/routing_utils.py | 69 +++++++++++++++++ requirements.txt | 3 +- test/test_misc.py | 36 +++++++++ test/test_results.py | 6 +- test/test_routes.py | 6 +- 15 files changed, 302 insertions(+), 165 deletions(-) create mode 100644 app/static/config/.gitignore create mode 100644 app/utils/__init__.py create mode 100644 app/utils/misc.py create mode 100644 app/utils/routing_utils.py create mode 100644 test/test_misc.py diff --git a/.gitignore b/.gitignore index 20747c7..20307d1 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ __pycache__/ *.pem config.json test/static +flask_session/ # pip stuff build/ diff --git a/app/__init__.py b/app/__init__.py index 4b78a8d..4b0739b 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,12 +1,21 @@ +from app.utils.misc import generate_user_keys from cryptography.fernet import Fernet from flask import Flask +from flask_session import Session import os app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static') -app.secret_key = Fernet.generate_key() +app.user_elements = {} +app.config['SECRET_KEY'] = os.urandom(128) +app.config['SESSION_TYPE'] = 'filesystem' app.config['VERSION_NUMBER'] = '0.1.4' app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) -app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json' +app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER'] + '/config') +app.config['SESSION_FILE_DIR'] = app.config['CONFIG_PATH'] +app.config['SESSION_COOKIE_SECURE'] = True + +sess = Session() +sess.init_app(app) from app import routes diff --git a/app/filter.py b/app/filter.py index 5ff46b7..a5bfb19 100644 --- a/app/filter.py +++ b/app/filter.py @@ -17,14 +17,9 @@ data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42m def get_first_link(soup): # Replace hrefs with only the intended destination (no "utm" type tags) for a in soup.find_all('a', href=True): - href = a['href'].replace('https://www.google.com', '') - - result_link = urlparse.urlparse(href) - query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' - # Return the first search result URL - if 'url?q=' in href: - return filter_link_args(href) + if 'url?q=' in a['href']: + return filter_link_args(a['href']) def filter_link_args(query_link): @@ -52,7 +47,7 @@ def filter_link_args(query_link): class Filter: - def __init__(self, mobile=False, config=None, secret_key=''): + def __init__(self, user_keys: dict, mobile=False, config=None): if config is None: config = {} @@ -61,11 +56,16 @@ class Filter: self.nojs = config['nojs'] if 'nojs' in config else False self.new_tab = config['new_tab'] if 'new_tab' in config else False self.mobile = mobile - self.secret_key = secret_key + self.user_keys = user_keys + self._elements = 0 def __getitem__(self, name): return getattr(self, name) + @property + def elements(self): + return self._elements + def reskin(self, page): # Aesthetic only re-skinning page = page.replace('>G<', '>Wh<') @@ -76,11 +76,29 @@ class Filter: return page + def encrypt_path(self, msg, is_element=False): + # Encrypts path to avoid plaintext results in logs + if is_element: + # Element paths are tracked differently in order for the element key to be regenerated + # once all elements have been loaded + enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode() + self._elements += 1 + return enc_path + + return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode() + def clean(self, soup): self.remove_ads(soup) - self.update_image_paths(soup) self.update_styling(soup) - self.update_links(soup) + + for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]: + self.update_element_src(img, 'image/png') + + for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]: + self.update_element_src(audio, 'audio/mpeg') + + for link in soup.find_all('a', href=True): + self.update_link(link) input_form = soup.find('form') if input_form is not None: @@ -116,25 +134,24 @@ class Filter: for div in ad_divs: div.decompose() - def update_image_paths(self, soup): - for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]: - img_src = img['src'] - if img_src.startswith('//'): - img_src = 'https:' + img_src - elif img_src.startswith(LOGO_URL): - # Re-brand with Whoogle logo - img['src'] = '/static/img/logo.png' - img['style'] = 'height:40px;width:162px' - continue - elif img_src.startswith(GOOG_IMG): - img['src'] = BLANK_B64 - continue + def update_element_src(self, element, mimetype): + element_src = element['src'] + if element_src.startswith('//'): + element_src = 'https:' + element_src + elif element_src.startswith(LOGO_URL): + # Re-brand with Whoogle logo + element['src'] = '/static/img/logo.png' + element['style'] = 'height:40px;width:162px' + return + elif element_src.startswith(GOOG_IMG): + element['src'] = BLANK_B64 + return - enc_src = Fernet(self.secret_key).encrypt(img_src.encode()) - img['src'] = '/tmp?image_url=' + enc_src.decode() - # TODO: Non-mobile image results link to website instead of image - # if not self.mobile: - # img.append(BeautifulSoup(FULL_RES_IMG.format(img_src), 'html.parser')) + element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \ + '&type=' + urlparse.quote(mimetype) + # TODO: Non-mobile image results link to website instead of image + # if not self.mobile: + # img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser')) def update_styling(self, soup): # Remove unnecessary button(s) @@ -170,45 +187,43 @@ class Filter: for href_element in soup.findAll('a'): href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else '' - def update_links(self, soup): - # Replace hrefs with only the intended destination (no "utm" type tags) - for a in soup.find_all('a', href=True): - href = a['href'].replace('https://www.google.com', '') - if '/advanced_search' in href: - a.decompose() - continue - elif self.new_tab: - a['target'] = '_blank' + def update_link(self, link): + # Replace href with only the intended destination (no "utm" type tags) + href = link['href'].replace('https://www.google.com', '') + if '/advanced_search' in href: + link.decompose() + return + elif self.new_tab: + link['target'] = '_blank' - result_link = urlparse.urlparse(href) - query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' + result_link = urlparse.urlparse(href) + query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' - if query_link.startswith('/'): - a['href'] = 'https://google.com' + query_link - elif '/search?q=' in href: - enc_result = Fernet(self.secret_key).encrypt(query_link.encode()) - new_search = '/search?q=' + enc_result.decode() + if query_link.startswith('/'): + link['href'] = 'https://google.com' + query_link + elif '/search?q=' in href: + new_search = '/search?q=' + self.encrypt_path(query_link) - query_params = parse_qs(urlparse.urlparse(href).query) - for param in VALID_PARAMS: - param_val = query_params[param][0] if param in query_params else '' - new_search += '&' + param + '=' + param_val - a['href'] = new_search - elif 'url?q=' in href: - # Strip unneeded arguments - a['href'] = filter_link_args(query_link) + query_params = parse_qs(urlparse.urlparse(href).query) + for param in VALID_PARAMS: + param_val = query_params[param][0] if param in query_params else '' + new_search += '&' + param + '=' + param_val + link['href'] = new_search + elif 'url?q=' in href: + # Strip unneeded arguments + link['href'] = filter_link_args(query_link) - # Add no-js option - if self.nojs: - gen_nojs(soup, a['href'], a) - else: - a['href'] = href + # Add no-js option + if self.nojs: + gen_nojs(link) + else: + link['href'] = href -def gen_nojs(soup, link, sibling): - nojs_link = soup.new_tag('a') - nojs_link['href'] = '/window?location=' + link +def gen_nojs(sibling): + nojs_link = BeautifulSoup().new_tag('a') + nojs_link['href'] = '/window?location=' + sibling['href'] nojs_link['style'] = 'display:block;width:100%;' nojs_link.string = 'NoJS Link: ' + nojs_link['href'] sibling.append(BeautifulSoup('


', 'html.parser')) - sibling.append(nojs_link) \ No newline at end of file + sibling.append(nojs_link) diff --git a/app/request.py b/app/request.py index 7ecd887..de60638 100644 --- a/app/request.py +++ b/app/request.py @@ -1,7 +1,7 @@ from io import BytesIO from lxml import etree -import pycurl import random +import requests import urllib.parse as urlparse # Core Google search URLs @@ -15,7 +15,7 @@ DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' VALID_PARAMS = ['tbs', 'tbm', 'start', 'near'] -def gen_user_agent(normal_ua, is_mobile): +def gen_user_agent(is_mobile): mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla' firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox' linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux' @@ -66,17 +66,11 @@ class Request: def __init__(self, normal_ua, language='lang_en'): self.language = language self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua - self.modified_user_agent = gen_user_agent(normal_ua, self.mobile) + self.modified_user_agent = gen_user_agent(self.mobile) def __getitem__(self, name): return getattr(self, name) - def get_decode_value(self): - if 'lang_zh' in self.language: - return 'gb2312' - else: - return 'unicode-escape' - def autocomplete(self, query): ac_query = dict(hl=self.language, q=query) response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)) @@ -88,19 +82,13 @@ class Request: return [] def send(self, base_url=SEARCH_URL, query='', return_bytes=False): - response_header = [] + headers = { + 'User-Agent': self.modified_user_agent + } - b_obj = BytesIO() - crl = pycurl.Curl() - crl.setopt(crl.URL, base_url + query) - crl.setopt(crl.USERAGENT, self.modified_user_agent) - crl.setopt(crl.WRITEDATA, b_obj) - crl.setopt(crl.HEADERFUNCTION, response_header.append) - crl.setopt(pycurl.FOLLOWLOCATION, 1) - crl.perform() - crl.close() + response = requests.get(base_url + query, headers=headers) if return_bytes: - return b_obj.getvalue() + return response.content else: - return b_obj.getvalue().decode(self.get_decode_value(), 'ignore') + return response.text diff --git a/app/routes.py b/app/routes.py index 3f50082..43b6d08 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,18 +1,20 @@ from app import app -from app.filter import Filter, get_first_link from app.models.config import Config -from app.request import Request, gen_query +from app.request import Request +from app.utils.misc import generate_user_keys, valid_user_session +from app.utils.routing_utils import * import argparse import base64 from bs4 import BeautifulSoup -from cryptography.fernet import Fernet, InvalidToken -from flask import g, jsonify, make_response, request, redirect, render_template, send_file +from cryptography.fernet import Fernet +from flask import g, jsonify, make_response, request, redirect, render_template, send_file, session from functools import wraps import io import json import os -from pycurl import error as pycurl_error import urllib.parse as urlparse +from requests import exceptions +import uuid import waitress @@ -34,17 +36,22 @@ def auth_required(f): @app.before_request def before_request_func(): - # Always redirect to https if HTTPS_ONLY is set (otherwise default to false) + # Generate secret key for user if unavailable + if not valid_user_session(session): + session['config'] = {'url': request.url_root} + session['keys'] = generate_user_keys() + session['uuid'] = str(uuid.uuid4()) + + if session['uuid'] not in app.user_elements: + app.user_elements.update({session['uuid']: 0}) + + # Always redirect to https if HTTPS_ONLY is set (otherwise default to False) https_only = os.getenv('HTTPS_ONLY', False) - config_path = app.config['CONFIG_PATH'] if https_only and request.url.startswith('http://'): - https_url = request.url.replace('http://', 'https://', 1) - code = 308 - return redirect(https_url, code=code) + return redirect(request.url.replace('http://', 'https://', 1), code=308) - json_config = json.load(open(config_path)) if os.path.exists(config_path) else {'url': request.url_root} - g.user_config = Config(**json_config) + g.user_config = Config(**session['config']) if not g.user_config.url: g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root @@ -53,6 +60,16 @@ def before_request_func(): g.app_location = g.user_config.url +@app.after_request +def after_request_func(response): + # Regenerate element key if all elements have been served to user + if app.user_elements[session['uuid']] <= 0 and '/element' in request.url: + session['keys']['element_key'] = Fernet.generate_key() + app.user_elements[session['uuid']] = 0 + + return response + + @app.errorhandler(404) def unknown_page(e): return redirect(g.app_location) @@ -62,14 +79,11 @@ def unknown_page(e): @auth_required def index(): return render_template('index.html', - dark_mode=g.user_config.dark, ua=g.user_request.modified_user_agent, languages=Config.LANGUAGES, countries=Config.COUNTRIES, - current_lang=g.user_config.lang, - current_ctry=g.user_config.ctry, - version_number=app.config['VERSION_NUMBER'], - request_type='get' if g.user_config.get_only else 'post') + config=g.user_config, + version_number=app.config['VERSION_NUMBER']) @app.route('/opensearch.xml', methods=['GET']) @@ -103,52 +117,37 @@ def autocomplete(): @app.route('/search', methods=['GET', 'POST']) @auth_required def search(): - request_params = request.args if request.method == 'GET' else request.form - q = request_params.get('q') + # Clear previous elements and generate a new key each time a new search is performed + app.user_elements[session['uuid']] = 0 + session['keys']['element_key'] = Fernet.generate_key() - if q is None or len(q) == 0: + search_util = RoutingUtils(request, g.user_config, session) + query = search_util.new_search_query() + + # Redirect to home if invalid/blank search + if not query: return redirect('/') - else: - # Attempt to decrypt if this is an internal link - try: - q = Fernet(app.secret_key).decrypt(q.encode()).decode() - except InvalidToken: - pass - feeling_lucky = q.startswith('! ') + # Generate response and number of external elements from the page + response, elements = search_util.generate_response() + if search_util.feeling_lucky: + return redirect(response, code=303) - if feeling_lucky: # Well do you, punk? - q = q[2:] - - user_agent = request.headers.get('User-Agent') - mobile = 'Android' in user_agent or 'iPhone' in user_agent - - content_filter = Filter(mobile, g.user_config, secret_key=app.secret_key) - full_query = gen_query(q, request_params, g.user_config, content_filter.near) - get_body = g.user_request.send(query=full_query) - dirty_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser') - - if feeling_lucky: - return redirect(get_first_link(dirty_soup), 303) # Using 303 so the browser performs a GET request for the URL - else: - formatted_results = content_filter.clean(dirty_soup) - - # Set search type to be used in the header template to allow for repeated searches - # in the same category - search_type = request_params.get('tbm') if 'tbm' in request_params else '' + # Keep count of external elements to fetch before element key can be regenerated + app.user_elements[session['uuid']] = elements return render_template( 'display.html', - query=urlparse.unquote(q), - search_type=search_type, + query=urlparse.unquote(query), + search_type=search_util.search_type, dark_mode=g.user_config.dark, - response=formatted_results, + response=response, search_header=render_template( 'header.html', dark_mode=g.user_config.dark, - q=urlparse.unquote(q), - search_type=search_type, - mobile=g.user_request.mobile) if 'isch' not in search_type else '') + query=urlparse.unquote(query), + search_type=search_util.search_type, + mobile=g.user_request.mobile) if 'isch' not in search_util.search_type else '') @app.route('/config', methods=['GET', 'POST']) @@ -161,10 +160,7 @@ def config(): if 'url' not in config_data or not config_data['url']: config_data['url'] = g.user_config.url - with open(app.config['CONFIG_PATH'], 'w') as config_file: - config_file.write(json.dumps(config_data, indent=4)) - config_file.close() - + session['config'] = config_data return redirect(config_data['url']) @@ -187,25 +183,22 @@ def imgres(): return redirect(request.args.get('imgurl')) -@app.route('/tmp') +@app.route('/element') @auth_required -def tmp(): - cipher_suite = Fernet(app.secret_key) - img_url = cipher_suite.decrypt(request.args.get('image_url').encode()).decode() +def element(): + cipher_suite = Fernet(session['keys']['element_key']) + src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode() + src_type = request.args.get('type') try: - file_data = g.user_request.send(base_url=img_url, return_bytes=True) + file_data = g.user_request.send(base_url=src_url, return_bytes=True) + app.user_elements[session['uuid']] -= 1 tmp_mem = io.BytesIO() tmp_mem.write(file_data) tmp_mem.seek(0) - return send_file( - tmp_mem, - as_attachment=True, - attachment_filename='tmp.png', - mimetype='image/png' - ) - except pycurl_error: + return send_file(tmp_mem, mimetype=src_type) + except exceptions.RequestException: pass empty_gif = base64.b64decode('R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==') diff --git a/app/static/config/.gitignore b/app/static/config/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/app/static/config/.gitignore @@ -0,0 +1 @@ +* diff --git a/app/templates/header.html b/app/templates/header.html index 5356ec2..5573b99 100644 --- a/app/templates/header.html +++ b/app/templates/header.html @@ -15,7 +15,7 @@ style="background-color: {{ '#000' if dark_mode else '#fff' }}; color: {{ '#685e79' if dark_mode else '#000' }}; border: {{ '1px solid #685e79' if dark_mode else '' }}" - spellcheck="false" type="text" value="{{ q }}"> + spellcheck="false" type="text" value="{{ query }}">
@@ -37,7 +37,7 @@
diff --git a/app/templates/index.html b/app/templates/index.html index 9279031..cf40a82 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -21,14 +21,14 @@ - + Whoogle Search - +
-
+
@@ -50,7 +50,7 @@ {% for lang in languages %}