diff --git a/.gitignore b/.gitignore
index 20747c7..20307d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ __pycache__/
 *.pem
 config.json
 test/static
+flask_session/
 
 # pip stuff
 build/
diff --git a/app/__init__.py b/app/__init__.py
index 4b78a8d..4b0739b 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -1,12 +1,21 @@
+from app.utils.misc import generate_user_keys
 from cryptography.fernet import Fernet
 from flask import Flask
+from flask_session import Session
 import os
 
 app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static')
-app.secret_key = Fernet.generate_key()
+app.user_elements = {}
+app.config['SECRET_KEY'] = os.urandom(128)
+app.config['SESSION_TYPE'] = 'filesystem'
 app.config['VERSION_NUMBER'] = '0.1.4'
 app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__)))
 app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static'))
-app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json'
+app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER'] + '/config')
+app.config['SESSION_FILE_DIR'] = app.config['CONFIG_PATH']
+app.config['SESSION_COOKIE_SECURE'] = True
+
+sess = Session()
+sess.init_app(app)
 
 from app import routes
diff --git a/app/filter.py b/app/filter.py
index 5ff46b7..a5bfb19 100644
--- a/app/filter.py
+++ b/app/filter.py
@@ -17,14 +17,9 @@ data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42m
 def get_first_link(soup):
     # Replace hrefs with only the intended destination (no "utm" type tags)
     for a in soup.find_all('a', href=True):
-        href = a['href'].replace('https://www.google.com', '')
-
-        result_link = urlparse.urlparse(href)
-        query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''
-
         # Return the first search result URL
-        if 'url?q=' in href:
-            return filter_link_args(href)
+        if 'url?q=' in a['href']:
+            return filter_link_args(a['href'])
 
 
 def filter_link_args(query_link):
@@ -52,7 +47,7 @@
 
 
 class Filter:
-    def __init__(self, mobile=False, config=None, secret_key=''):
+    def __init__(self, user_keys: dict, mobile=False, config=None):
         if config is None:
             config = {}
 
@@ -61,11 +56,16 @@ class Filter:
         self.nojs = config['nojs'] if 'nojs' in config else False
         self.new_tab = config['new_tab'] if 'new_tab' in config else False
         self.mobile = mobile
-        self.secret_key = secret_key
+        self.user_keys = user_keys
+        self._elements = 0
 
     def __getitem__(self, name):
         return getattr(self, name)
 
+    @property
+    def elements(self):
+        return self._elements
+
     def reskin(self, page):
         # Aesthetic only re-skinning
         page = page.replace('>G<', '>Wh<')
@@ -76,11 +76,29 @@ class Filter:
 
         return page
 
+    def encrypt_path(self, msg, is_element=False):
+        # Encrypts path to avoid plaintext results in logs
+        if is_element:
+            # Element paths are tracked differently in order for the element key to be regenerated
+            # once all elements have been loaded
+            enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode()
+            self._elements += 1
+            return enc_path
+
+        return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode()
+
     def clean(self, soup):
         self.remove_ads(soup)
-        self.update_image_paths(soup)
         self.update_styling(soup)
-        self.update_links(soup)
+
+        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
+            self.update_element_src(img, 'image/png')
+
+        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
+            self.update_element_src(audio, 'audio/mpeg')
+
+        for link in soup.find_all('a', href=True):
+            self.update_link(link)
 
         input_form = soup.find('form')
         if input_form is not None:
@@ -116,25 +134,24 @@ class Filter:
         for div in ad_divs:
             div.decompose()
 
-    def update_image_paths(self, soup):
-        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
-            img_src = img['src']
-            if img_src.startswith('//'):
-                img_src = 'https:' + img_src
-            elif img_src.startswith(LOGO_URL):
-                # Re-brand with Whoogle logo
-                img['src'] = '/static/img/logo.png'
-                img['style'] = 'height:40px;width:162px'
-                continue
-            elif img_src.startswith(GOOG_IMG):
-                img['src'] = BLANK_B64
-                continue
+    def update_element_src(self, element, mimetype):
+        element_src = element['src']
+        if element_src.startswith('//'):
+            element_src = 'https:' + element_src
+        elif element_src.startswith(LOGO_URL):
+            # Re-brand with Whoogle logo
+            element['src'] = '/static/img/logo.png'
+            element['style'] = 'height:40px;width:162px'
+            return
+        elif element_src.startswith(GOOG_IMG):
+            element['src'] = BLANK_B64
+            return
 
-            enc_src = Fernet(self.secret_key).encrypt(img_src.encode())
-            img['src'] = '/tmp?image_url=' + enc_src.decode()
-            # TODO: Non-mobile image results link to website instead of image
-            # if not self.mobile:
-            #     img.append(BeautifulSoup(FULL_RES_IMG.format(img_src), 'html.parser'))
+        element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \
+                         '&type=' + urlparse.quote(mimetype)
+        # TODO: Non-mobile image results link to website instead of image
+        # if not self.mobile:
+        #     img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser'))
 
     def update_styling(self, soup):
         # Remove unnecessary button(s)
@@ -170,45 +187,43 @@ class Filter:
         for href_element in soup.findAll('a'):
             href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else ''
 
-    def update_links(self, soup):
-        # Replace hrefs with only the intended destination (no "utm" type tags)
-        for a in soup.find_all('a', href=True):
-            href = a['href'].replace('https://www.google.com', '')
-            if '/advanced_search' in href:
-                a.decompose()
-                continue
-            elif self.new_tab:
-                a['target'] = '_blank'
+    def update_link(self, link):
+        # Replace href with only the intended destination (no "utm" type tags)
+        href = link['href'].replace('https://www.google.com', '')
+        if '/advanced_search' in href:
+            link.decompose()
+            return
+        elif self.new_tab:
+            link['target'] = '_blank'
 
-            result_link = urlparse.urlparse(href)
-            query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''
+        result_link = urlparse.urlparse(href)
+        query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''
 
-            if query_link.startswith('/'):
-                a['href'] = 'https://google.com' + query_link
-            elif '/search?q=' in href:
-                enc_result = Fernet(self.secret_key).encrypt(query_link.encode())
-                new_search = '/search?q=' + enc_result.decode()
+        if query_link.startswith('/'):
+            link['href'] = 'https://google.com' + query_link
+        elif '/search?q=' in href:
+            new_search = '/search?q=' + self.encrypt_path(query_link)
 
-                query_params = parse_qs(urlparse.urlparse(href).query)
-                for param in VALID_PARAMS:
-                    param_val = query_params[param][0] if param in query_params else ''
-                    new_search += '&' + param + '=' + param_val
-                a['href'] = new_search
-            elif 'url?q=' in href:
-                # Strip unneeded arguments
-                a['href'] = filter_link_args(query_link)
+            query_params = parse_qs(urlparse.urlparse(href).query)
+            for param in VALID_PARAMS:
+                param_val = query_params[param][0] if param in query_params else ''
+                new_search += '&' + param + '=' + param_val
+            link['href'] = new_search
+        elif 'url?q=' in href:
+            # Strip unneeded arguments
+            link['href'] = filter_link_args(query_link)
 
-            # Add no-js option
-            if self.nojs:
-                gen_nojs(soup, a['href'], a)
-            else:
-                a['href'] = href
+        # Add no-js option
+        if self.nojs:
+            gen_nojs(link)
+        else:
+            link['href'] = href
 
 
-def gen_nojs(soup, link, sibling):
-    nojs_link = soup.new_tag('a')
-    nojs_link['href'] = '/window?location=' + link
+def gen_nojs(sibling):
+    nojs_link = BeautifulSoup().new_tag('a')
+    nojs_link['href'] = '/window?location=' + sibling['href']
     nojs_link['style'] = 'display:block;width:100%;'
     nojs_link.string = 'NoJS Link: ' + nojs_link['href']
     sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
-    sibling.append(nojs_link)
\ No newline at end of file
+    sibling.append(nojs_link)
diff --git a/app/request.py b/app/request.py
index 7ecd887..de60638 100644
--- a/app/request.py
+++ b/app/request.py
@@ -1,7 +1,7 @@
 from io import BytesIO
 from lxml import etree
-import pycurl
 import random
+import requests
 import urllib.parse as urlparse
 
 # Core Google search URLs
@@ -15,7 +15,7 @@ DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
 VALID_PARAMS = ['tbs', 'tbm', 'start', 'near']
 
 
-def gen_user_agent(normal_ua, is_mobile):
+def gen_user_agent(is_mobile):
     mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla'
     firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
     linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'
@@ -66,17 +66,11 @@ class Request:
     def __init__(self, normal_ua, language='lang_en'):
         self.language = language
         self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua
-        self.modified_user_agent = gen_user_agent(normal_ua, self.mobile)
+        self.modified_user_agent = gen_user_agent(self.mobile)
 
     def __getitem__(self, name):
         return getattr(self, name)
 
-    def get_decode_value(self):
-        if 'lang_zh' in self.language:
-            return 'gb2312'
-        else:
-            return 'unicode-escape'
-
     def autocomplete(self, query):
         ac_query = dict(hl=self.language, q=query)
         response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query))
@@ -88,19 +82,13 @@ class Request:
         return []
 
     def send(self, base_url=SEARCH_URL, query='', return_bytes=False):
-        response_header = []
+        headers = {
+            'User-Agent': self.modified_user_agent
+        }
 
-        b_obj = BytesIO()
-        crl = pycurl.Curl()
-        crl.setopt(crl.URL, base_url + query)
-        crl.setopt(crl.USERAGENT, self.modified_user_agent)
-        crl.setopt(crl.WRITEDATA, b_obj)
-        crl.setopt(crl.HEADERFUNCTION, response_header.append)
-        crl.setopt(pycurl.FOLLOWLOCATION, 1)
-        crl.perform()
-        crl.close()
+        response = requests.get(base_url + query, headers=headers)
 
         if return_bytes:
-            return b_obj.getvalue()
+            return response.content
         else:
-            return b_obj.getvalue().decode(self.get_decode_value(), 'ignore')
+            return response.text
diff --git a/app/routes.py b/app/routes.py
index 3f50082..43b6d08 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -1,18 +1,20 @@
 from app import app
-from app.filter import Filter, get_first_link
 from app.models.config import Config
-from app.request import Request, gen_query
+from app.request import Request
+from app.utils.misc import generate_user_keys, valid_user_session
+from app.utils.routing_utils import *
 import argparse
 import base64
 from bs4 import BeautifulSoup
-from cryptography.fernet import Fernet, InvalidToken
-from flask import g, jsonify, make_response, request, redirect, render_template, send_file
+from cryptography.fernet import Fernet
+from flask import g, jsonify, make_response, request, redirect, render_template, send_file, session
 from functools import wraps
 import io
 import json
 import os
-from pycurl import error as pycurl_error
 import urllib.parse as urlparse
+from requests import exceptions
+import uuid
 import waitress
 
 
@@ -34,17 +36,22 @@ def auth_required(f):
 
 @app.before_request
 def before_request_func():
-    # Always redirect to https if HTTPS_ONLY is set (otherwise default to false)
+    # Generate secret key for user if unavailable
+    if not valid_user_session(session):
+        session['config'] = {'url': request.url_root}
+        session['keys'] = generate_user_keys()
+        session['uuid'] = str(uuid.uuid4())
+
+    if session['uuid'] not in app.user_elements:
+        app.user_elements.update({session['uuid']: 0})
+
+    # Always redirect to https if HTTPS_ONLY is set (otherwise default to False)
     https_only = os.getenv('HTTPS_ONLY', False)
-    config_path = app.config['CONFIG_PATH']
 
     if https_only and request.url.startswith('http://'):
-        https_url = request.url.replace('http://', 'https://', 1)
-        code = 308
-        return redirect(https_url, code=code)
+        return redirect(request.url.replace('http://', 'https://', 1), code=308)
 
-    json_config = json.load(open(config_path)) if os.path.exists(config_path) else {'url': request.url_root}
-    g.user_config = Config(**json_config)
+    g.user_config = Config(**session['config'])
 
     if not g.user_config.url:
         g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root
@@ -53,6 +60,16 @@ def before_request_func():
     g.app_location = g.user_config.url
 
 
+@app.after_request
+def after_request_func(response):
+    # Regenerate element key if all elements have been served to user
+    if app.user_elements[session['uuid']] <= 0 and '/element' in request.url:
+        session['keys']['element_key'] = Fernet.generate_key()
+        app.user_elements[session['uuid']] = 0
+
+    return response
+
+
 @app.errorhandler(404)
 def unknown_page(e):
     return redirect(g.app_location)
@@ -62,14 +79,11 @@ def unknown_page(e):
 @auth_required
 def index():
     return render_template('index.html',
-                           dark_mode=g.user_config.dark,
                            ua=g.user_request.modified_user_agent,
                            languages=Config.LANGUAGES,
                            countries=Config.COUNTRIES,
-                           current_lang=g.user_config.lang,
-                           current_ctry=g.user_config.ctry,
-                           version_number=app.config['VERSION_NUMBER'],
-                           request_type='get' if g.user_config.get_only else 'post')
+                           config=g.user_config,
+                           version_number=app.config['VERSION_NUMBER'])
 
 
 @app.route('/opensearch.xml', methods=['GET'])
@@ -103,52 +117,37 @@ def autocomplete():
 @app.route('/search', methods=['GET', 'POST'])
 @auth_required
 def search():
-    request_params = request.args if request.method == 'GET' else request.form
-    q = request_params.get('q')
+    # Clear previous elements and generate a new key each time a new search is performed
+    app.user_elements[session['uuid']] = 0
+    session['keys']['element_key'] = Fernet.generate_key()
 
-    if q is None or len(q) == 0:
+    search_util = RoutingUtils(request, g.user_config, session)
+    query = search_util.new_search_query()
+
+    # Redirect to home if invalid/blank search
+    if not query:
         return redirect('/')
-    else:
-        # Attempt to decrypt if this is an internal link
-        try:
-            q = Fernet(app.secret_key).decrypt(q.encode()).decode()
-        except InvalidToken:
-            pass
 
-    feeling_lucky = q.startswith('! ')
+    # Generate response and number of external elements from the page
+    response, elements = search_util.generate_response()
+    if search_util.feeling_lucky:
+        return redirect(response, code=303)
 
-    if feeling_lucky:  # Well do you, punk?
-        q = q[2:]
-
-    user_agent = request.headers.get('User-Agent')
-    mobile = 'Android' in user_agent or 'iPhone' in user_agent
-
-    content_filter = Filter(mobile, g.user_config, secret_key=app.secret_key)
-    full_query = gen_query(q, request_params, g.user_config, content_filter.near)
-    get_body = g.user_request.send(query=full_query)
-    dirty_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser')
-
-    if feeling_lucky:
-        return redirect(get_first_link(dirty_soup), 303)  # Using 303 so the browser performs a GET request for the URL
-    else:
-        formatted_results = content_filter.clean(dirty_soup)
-
-        # Set search type to be used in the header template to allow for repeated searches
-        # in the same category
-        search_type = request_params.get('tbm') if 'tbm' in request_params else ''
+    # Keep count of external elements to fetch before element key can be regenerated
+    app.user_elements[session['uuid']] = elements
 
     return render_template(
         'display.html',
-        query=urlparse.unquote(q),
-        search_type=search_type,
+        query=urlparse.unquote(query),
+        search_type=search_util.search_type,
        dark_mode=g.user_config.dark,
-        response=formatted_results,
+        response=response,
        search_header=render_template(
            'header.html',
            dark_mode=g.user_config.dark,
-            q=urlparse.unquote(q),
-            search_type=search_type,
-            mobile=g.user_request.mobile) if 'isch' not in search_type else '')
+            query=urlparse.unquote(query),
+            search_type=search_util.search_type,
+            mobile=g.user_request.mobile) if 'isch' not in search_util.search_type else '')
 
 
 @app.route('/config', methods=['GET', 'POST'])
@@ -161,10 +160,7 @@ def config():
     if 'url' not in config_data or not config_data['url']:
         config_data['url'] = g.user_config.url
 
-    with open(app.config['CONFIG_PATH'], 'w') as config_file:
-        config_file.write(json.dumps(config_data, indent=4))
-        config_file.close()
-
+    session['config'] = config_data
     return redirect(config_data['url'])
 
 
@@ -187,25 +183,22 @@ def imgres():
     return redirect(request.args.get('imgurl'))
 
 
-@app.route('/tmp')
+@app.route('/element')
 @auth_required
-def tmp():
-    cipher_suite = Fernet(app.secret_key)
-    img_url = cipher_suite.decrypt(request.args.get('image_url').encode()).decode()
+def element():
+    cipher_suite = Fernet(session['keys']['element_key'])
+    src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode()
+    src_type = request.args.get('type')
 
     try:
-        file_data = g.user_request.send(base_url=img_url, return_bytes=True)
+        file_data = g.user_request.send(base_url=src_url, return_bytes=True)
+        app.user_elements[session['uuid']] -= 1
         tmp_mem = io.BytesIO()
         tmp_mem.write(file_data)
         tmp_mem.seek(0)
 
-        return send_file(
-            tmp_mem,
-            as_attachment=True,
-            attachment_filename='tmp.png',
-            mimetype='image/png'
-        )
-    except pycurl_error:
+        return send_file(tmp_mem, mimetype=src_type)
+    except exceptions.RequestException:
         pass
 
     empty_gif = base64.b64decode('R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==')
diff --git a/app/static/config/.gitignore b/app/static/config/.gitignore
new file mode 100644
index 0000000..72e8ffc
--- /dev/null
+++ b/app/static/config/.gitignore
@@ -0,0 +1 @@
+*
diff --git a/app/templates/header.html b/app/templates/header.html
index 5356ec2..5573b99 100644
--- a/app/templates/header.html
+++ b/app/templates/header.html
@@ -15,7 +15,7 @@
                    style="background-color: {{ '#000' if dark_mode else '#fff' }};
                            color: {{ '#685e79' if dark_mode else '#000' }};
                            border: {{ '1px solid #685e79' if dark_mode else '' }}"
-                   spellcheck="false" type="text" value="{{ q }}">
+                   spellcheck="false" type="text" value="{{ query }}">
@@ -37,7 +37,7 @@
diff --git a/app/templates/index.html b/app/templates/index.html
index 9279031..cf40a82 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -21,14 +21,14 @@
-
+
     Whoogle Search
-
+
-
+
@@ -50,7 +50,7 @@ {% for lang in languages %}