From b6fb4723f9589e54ed55be2254ed7e70db519322 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Tue, 2 Jun 2020 12:54:47 -0600 Subject: [PATCH 01/22] Project refactor (#85) * Major refactor of requests and session management - Switches from pycurl to requests library - Allows for less janky decoding, especially with non-latin character sets - Adds session level management of user configs - Allows for each session to set its own config (people are probably going to complain about this, though not sure if it'll be the same number of people who are upset that their friends/family have to share their config) - Updates key gen/regen to more aggressively swap out keys after each request * Added ability to save/load configs by name - New PUT method for config allows changing config with specified name - New methods in js controller to handle loading/saving of configs * Result formatting and removal of unused elements - Fixed question section formatting from results page (added appropriate padding and made questions styled as italic) - Removed user agent display from main config settings * Minor change to button label * Fixed issue with "de-pickling" of flask session Having a gitignore-everything ("*") file within a flask session folder seems to cause a weird bug where the state of the app becomes unusable from continuously trying to prune files listed in the gitignore (and it can't prune '*'). 
* Switched to pickling saved configs * Updated ad/sponsored content filter and conf naming Configs are now named with a .conf extension to allow for easier manual cleanup/modification of named config files Sponsored content now removed by basic string matching of span content * Version bump to 0.2.0 * Fixed request.send return style --- .gitignore | 2 + app/__init__.py | 18 +++- app/filter.py | 169 +++++++++++++++++++++--------------- app/request.py | 37 +++----- app/routes.py | 143 +++++++++++++++--------------- app/static/js/controller.js | 35 ++++++++ app/templates/header.html | 4 +- app/templates/index.html | 20 ++--- app/utils/__init__.py | 0 app/utils/misc.py | 20 +++++ app/utils/routing_utils.py | 69 +++++++++++++++ requirements.txt | 3 +- setup.py | 2 +- test/test_misc.py | 36 ++++++++ test/test_results.py | 6 +- test/test_routes.py | 6 +- 16 files changed, 382 insertions(+), 188 deletions(-) create mode 100644 app/utils/__init__.py create mode 100644 app/utils/misc.py create mode 100644 app/utils/routing_utils.py create mode 100644 test/test_misc.py diff --git a/.gitignore b/.gitignore index 20747c7..f6e039f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ __pycache__/ *.pem config.json test/static +flask_session/ +app/static/config # pip stuff build/ diff --git a/app/__init__.py b/app/__init__.py index 4b78a8d..53d4a59 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,12 +1,24 @@ +from app.utils.misc import generate_user_keys from cryptography.fernet import Fernet from flask import Flask +from flask_session import Session import os app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static') -app.secret_key = Fernet.generate_key() -app.config['VERSION_NUMBER'] = '0.1.4' +app.user_elements = {} +app.config['SECRET_KEY'] = os.urandom(16) +app.config['SESSION_TYPE'] = 'filesystem' +app.config['VERSION_NUMBER'] = '0.2.0' app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) 
app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) -app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json' +app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER'] + '/config') +app.config['SESSION_FILE_DIR'] = app.config['CONFIG_PATH'] +app.config['SESSION_COOKIE_SECURE'] = True + +if not os.path.exists(app.config['CONFIG_PATH']): + os.makedirs(app.config['CONFIG_PATH']) + +sess = Session() +sess.init_app(app) from app import routes diff --git a/app/filter.py b/app/filter.py index 8c25fe4..be9809b 100644 --- a/app/filter.py +++ b/app/filter.py @@ -1,5 +1,6 @@ from app.request import VALID_PARAMS from bs4 import BeautifulSoup +from bs4.element import ResultSet from cryptography.fernet import Fernet import re import urllib.parse as urlparse @@ -17,14 +18,9 @@  def get_first_link(soup): # Replace hrefs with only the intended destination (no "utm" type tags) for a in soup.find_all('a', href=True): - href = a['href'].replace('https://www.google.com', '') - - result_link = urlparse.urlparse(href) - query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' - # Return the first search result URL - if 'url?q=' in href: - return filter_link_args(href) + if 'url?q=' in a['href']: + return filter_link_args(a['href']) def filter_link_args(query_link): @@ -51,8 +47,12 @@ def filter_link_args(query_link): return query_link +def has_ad_content(element): + return element == 'ad' or element == 'sponsoredⓘ' + + class Filter: - def __init__(self, mobile=False, config=None, secret_key=''): + def __init__(self, user_keys: dict, mobile=False, config=None): if config is None: config = {} @@ -61,11 +61,17 @@ class Filter: self.nojs = config['nojs'] if 'nojs' in config else False self.new_tab = config['new_tab'] if 'new_tab' in config else False self.mobile = mobile - self.secret_key = secret_key + self.user_keys = user_keys + self.main_divs = 
ResultSet('') + self._elements = 0 def __getitem__(self, name): return getattr(self, name) + @property + def elements(self): + return self._elements + def reskin(self, page): # Aesthetic only re-skinning page = page.replace('>G<', '>Wh<') @@ -76,11 +82,31 @@ class Filter: return page + def encrypt_path(self, msg, is_element=False): + # Encrypts path to avoid plaintext results in logs + if is_element: + # Element paths are tracked differently in order for the element key to be regenerated + # once all elements have been loaded + enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode() + self._elements += 1 + return enc_path + + return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode() + def clean(self, soup): - self.remove_ads(soup) - self.update_image_paths(soup) + self.main_divs = soup.find('div', {'id': 'main'}) + self.remove_ads() + self.fix_question_section() self.update_styling(soup) - self.update_links(soup) + + for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]: + self.update_element_src(img, 'image/png') + + for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]: + self.update_element_src(audio, 'audio/mpeg') + + for link in soup.find_all('a', href=True): + self.update_link(link) input_form = soup.find('form') if input_form is not None: @@ -105,35 +131,42 @@ class Filter: return soup - def remove_ads(self, soup): - main_divs = soup.find('div', {'id': 'main'}) - if main_divs is None: + def remove_ads(self): + if not self.main_divs: return - result_divs = main_divs.find_all('div', recursive=False) - for div in [_ for _ in result_divs]: - has_ad = len([_ for _ in div.find_all('span', recursive=True) if 'ad' == _.text.lower()]) + for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]: + has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text.lower())]) _ = div.decompose() if has_ad else None - def update_image_paths(self, soup): - for img in [_ for _ 
in soup.find_all('img') if 'src' in _.attrs]: - img_src = img['src'] - if img_src.startswith('//'): - img_src = 'https:' + img_src - elif img_src.startswith(LOGO_URL): - # Re-brand with Whoogle logo - img['src'] = '/static/img/logo.png' - img['style'] = 'height:40px;width:162px' - continue - elif img_src.startswith(GOOG_IMG): - img['src'] = BLANK_B64 - continue + def fix_question_section(self): + if not self.main_divs: + return - enc_src = Fernet(self.secret_key).encrypt(img_src.encode()) - img['src'] = '/tmp?image_url=' + enc_src.decode() - # TODO: Non-mobile image results link to website instead of image - # if not self.mobile: - # img.append(BeautifulSoup(FULL_RES_IMG.format(img_src), 'html.parser')) + question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False) if len(_.find_all('h2')) > 0] + for x in question_divs: + questions = [_ for _ in x.find_all('div', recursive=True) if _.text.endswith('?')] + for question in questions: + question['style'] = 'padding: 10px; font-style: italic;' + + def update_element_src(self, element, mimetype): + element_src = element['src'] + if element_src.startswith('//'): + element_src = 'https:' + element_src + elif element_src.startswith(LOGO_URL): + # Re-brand with Whoogle logo + element['src'] = '/static/img/logo.png' + element['style'] = 'height:40px;width:162px' + return + elif element_src.startswith(GOOG_IMG): + element['src'] = BLANK_B64 + return + + element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \ + '&type=' + urlparse.quote(mimetype) + # TODO: Non-mobile image results link to website instead of image + # if not self.mobile: + # img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser')) def update_styling(self, soup): # Remove unnecessary button(s) @@ -169,45 +202,43 @@ class Filter: for href_element in soup.findAll('a'): href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else '' - def update_links(self, soup): - # 
Replace hrefs with only the intended destination (no "utm" type tags) - for a in soup.find_all('a', href=True): - href = a['href'].replace('https://www.google.com', '') - if '/advanced_search' in href: - a.decompose() - continue - elif self.new_tab: - a['target'] = '_blank' + def update_link(self, link): + # Replace href with only the intended destination (no "utm" type tags) + href = link['href'].replace('https://www.google.com', '') + if '/advanced_search' in href: + link.decompose() + return + elif self.new_tab: + link['target'] = '_blank' - result_link = urlparse.urlparse(href) - query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' + result_link = urlparse.urlparse(href) + query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' - if query_link.startswith('/'): - a['href'] = 'https://google.com' + query_link - elif '/search?q=' in href: - enc_result = Fernet(self.secret_key).encrypt(query_link.encode()) - new_search = '/search?q=' + enc_result.decode() + if query_link.startswith('/'): + link['href'] = 'https://google.com' + query_link + elif '/search?q=' in href: + new_search = '/search?q=' + self.encrypt_path(query_link) - query_params = parse_qs(urlparse.urlparse(href).query) - for param in VALID_PARAMS: - param_val = query_params[param][0] if param in query_params else '' - new_search += '&' + param + '=' + param_val - a['href'] = new_search - elif 'url?q=' in href: - # Strip unneeded arguments - a['href'] = filter_link_args(query_link) + query_params = parse_qs(urlparse.urlparse(href).query) + for param in VALID_PARAMS: + param_val = query_params[param][0] if param in query_params else '' + new_search += '&' + param + '=' + param_val + link['href'] = new_search + elif 'url?q=' in href: + # Strip unneeded arguments + link['href'] = filter_link_args(query_link) - # Add no-js option - if self.nojs: - gen_nojs(soup, a['href'], a) - else: - a['href'] = href + # Add no-js option + if self.nojs: + gen_nojs(link) + else: + 
link['href'] = href -def gen_nojs(soup, link, sibling): - nojs_link = soup.new_tag('a') - nojs_link['href'] = '/window?location=' + link +def gen_nojs(sibling): + nojs_link = BeautifulSoup().new_tag('a') + nojs_link['href'] = '/window?location=' + sibling['href'] nojs_link['style'] = 'display:block;width:100%;' nojs_link.string = 'NoJS Link: ' + nojs_link['href'] sibling.append(BeautifulSoup('


', 'html.parser')) - sibling.append(nojs_link) \ No newline at end of file + sibling.append(nojs_link) diff --git a/app/request.py b/app/request.py index 7ecd887..38b47b0 100644 --- a/app/request.py +++ b/app/request.py @@ -1,7 +1,7 @@ -from io import BytesIO from lxml import etree -import pycurl import random +import requests +from requests import Response import urllib.parse as urlparse # Core Google search URLs @@ -15,7 +15,7 @@ DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' VALID_PARAMS = ['tbs', 'tbm', 'start', 'near'] -def gen_user_agent(normal_ua, is_mobile): +def gen_user_agent(is_mobile): mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla' firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox' linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux' @@ -66,20 +66,14 @@ class Request: def __init__(self, normal_ua, language='lang_en'): self.language = language self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua - self.modified_user_agent = gen_user_agent(normal_ua, self.mobile) + self.modified_user_agent = gen_user_agent(self.mobile) def __getitem__(self, name): return getattr(self, name) - def get_decode_value(self): - if 'lang_zh' in self.language: - return 'gb2312' - else: - return 'unicode-escape' - def autocomplete(self, query): ac_query = dict(hl=self.language, q=query) - response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)) + response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)).text if response: dom = etree.fromstring(response) @@ -87,20 +81,9 @@ class Request: return [] - def send(self, base_url=SEARCH_URL, query='', return_bytes=False): - response_header = [] + def send(self, base_url=SEARCH_URL, query='') -> Response: + headers = { + 'User-Agent': self.modified_user_agent + } - b_obj = BytesIO() - crl = pycurl.Curl() - crl.setopt(crl.URL, base_url + query) - crl.setopt(crl.USERAGENT, self.modified_user_agent) - 
crl.setopt(crl.WRITEDATA, b_obj) - crl.setopt(crl.HEADERFUNCTION, response_header.append) - crl.setopt(pycurl.FOLLOWLOCATION, 1) - crl.perform() - crl.close() - - if return_bytes: - return b_obj.getvalue() - else: - return b_obj.getvalue().decode(self.get_decode_value(), 'ignore') + return requests.get(base_url + query, headers=headers) diff --git a/app/routes.py b/app/routes.py index 3f50082..ca3bac4 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,18 +1,21 @@ from app import app -from app.filter import Filter, get_first_link from app.models.config import Config -from app.request import Request, gen_query +from app.request import Request +from app.utils.misc import generate_user_keys, valid_user_session +from app.utils.routing_utils import * import argparse import base64 from bs4 import BeautifulSoup -from cryptography.fernet import Fernet, InvalidToken -from flask import g, jsonify, make_response, request, redirect, render_template, send_file +from cryptography.fernet import Fernet +from flask import g, jsonify, make_response, request, redirect, render_template, send_file, session from functools import wraps import io import json import os -from pycurl import error as pycurl_error +import pickle import urllib.parse as urlparse +from requests import exceptions +import uuid import waitress @@ -34,17 +37,22 @@ def auth_required(f): @app.before_request def before_request_func(): - # Always redirect to https if HTTPS_ONLY is set (otherwise default to false) + # Generate secret key for user if unavailable + if not valid_user_session(session): + session['config'] = {'url': request.url_root} + session['keys'] = generate_user_keys() + session['uuid'] = str(uuid.uuid4()) + + if session['uuid'] not in app.user_elements: + app.user_elements.update({session['uuid']: 0}) + + # Always redirect to https if HTTPS_ONLY is set (otherwise default to False) https_only = os.getenv('HTTPS_ONLY', False) - config_path = app.config['CONFIG_PATH'] if https_only and 
request.url.startswith('http://'): - https_url = request.url.replace('http://', 'https://', 1) - code = 308 - return redirect(https_url, code=code) + return redirect(request.url.replace('http://', 'https://', 1), code=308) - json_config = json.load(open(config_path)) if os.path.exists(config_path) else {'url': request.url_root} - g.user_config = Config(**json_config) + g.user_config = Config(**session['config']) if not g.user_config.url: g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root @@ -53,6 +61,16 @@ def before_request_func(): g.app_location = g.user_config.url +@app.after_request +def after_request_func(response): + # Regenerate element key if all elements have been served to user + if app.user_elements[session['uuid']] <= 0 and '/element' in request.url: + session['keys']['element_key'] = Fernet.generate_key() + app.user_elements[session['uuid']] = 0 + + return response + + @app.errorhandler(404) def unknown_page(e): return redirect(g.app_location) @@ -62,14 +80,10 @@ def unknown_page(e): @auth_required def index(): return render_template('index.html', - dark_mode=g.user_config.dark, - ua=g.user_request.modified_user_agent, languages=Config.LANGUAGES, countries=Config.COUNTRIES, - current_lang=g.user_config.lang, - current_ctry=g.user_config.ctry, - version_number=app.config['VERSION_NUMBER'], - request_type='get' if g.user_config.get_only else 'post') + config=g.user_config, + version_number=app.config['VERSION_NUMBER']) @app.route('/opensearch.xml', methods=['GET']) @@ -103,68 +117,60 @@ def autocomplete(): @app.route('/search', methods=['GET', 'POST']) @auth_required def search(): - request_params = request.args if request.method == 'GET' else request.form - q = request_params.get('q') + # Clear previous elements and generate a new key each time a new search is performed + app.user_elements[session['uuid']] = 0 + session['keys']['element_key'] = Fernet.generate_key() - if q is None or len(q) == 0: + 
search_util = RoutingUtils(request, g.user_config, session) + query = search_util.new_search_query() + + # Redirect to home if invalid/blank search + if not query: return redirect('/') - else: - # Attempt to decrypt if this is an internal link - try: - q = Fernet(app.secret_key).decrypt(q.encode()).decode() - except InvalidToken: - pass - feeling_lucky = q.startswith('! ') + # Generate response and number of external elements from the page + response, elements = search_util.generate_response() + if search_util.feeling_lucky: + return redirect(response, code=303) - if feeling_lucky: # Well do you, punk? - q = q[2:] - - user_agent = request.headers.get('User-Agent') - mobile = 'Android' in user_agent or 'iPhone' in user_agent - - content_filter = Filter(mobile, g.user_config, secret_key=app.secret_key) - full_query = gen_query(q, request_params, g.user_config, content_filter.near) - get_body = g.user_request.send(query=full_query) - dirty_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser') - - if feeling_lucky: - return redirect(get_first_link(dirty_soup), 303) # Using 303 so the browser performs a GET request for the URL - else: - formatted_results = content_filter.clean(dirty_soup) - - # Set search type to be used in the header template to allow for repeated searches - # in the same category - search_type = request_params.get('tbm') if 'tbm' in request_params else '' + # Keep count of external elements to fetch before element key can be regenerated + app.user_elements[session['uuid']] = elements return render_template( 'display.html', - query=urlparse.unquote(q), - search_type=search_type, + query=urlparse.unquote(query), + search_type=search_util.search_type, dark_mode=g.user_config.dark, - response=formatted_results, + response=response, search_header=render_template( 'header.html', dark_mode=g.user_config.dark, - q=urlparse.unquote(q), - search_type=search_type, - mobile=g.user_request.mobile) if 'isch' not in search_type else '') + 
query=urlparse.unquote(query), + search_type=search_util.search_type, + mobile=g.user_request.mobile) if 'isch' not in search_util.search_type else '') -@app.route('/config', methods=['GET', 'POST']) +@app.route('/config', methods=['GET', 'POST', 'PUT']) @auth_required def config(): if request.method == 'GET': return json.dumps(g.user_config.__dict__) + elif request.method == 'PUT': + if 'name' in request.args: + config_pkl = os.path.join(app.config['CONFIG_PATH'], request.args.get('name')) + session['config'] = pickle.load(open(config_pkl, 'rb')) if os.path.exists(config_pkl) else session['config'] + return json.dumps(session['config']) + else: + return json.dumps({}) else: config_data = request.form.to_dict() if 'url' not in config_data or not config_data['url']: config_data['url'] = g.user_config.url - with open(app.config['CONFIG_PATH'], 'w') as config_file: - config_file.write(json.dumps(config_data, indent=4)) - config_file.close() + if 'name' in request.args: + pickle.dump(config_data, open(os.path.join(app.config['CONFIG_PATH'], request.args.get('name')), 'wb')) + session['config'] = config_data return redirect(config_data['url']) @@ -187,25 +193,22 @@ def imgres(): return redirect(request.args.get('imgurl')) -@app.route('/tmp') +@app.route('/element') @auth_required -def tmp(): - cipher_suite = Fernet(app.secret_key) - img_url = cipher_suite.decrypt(request.args.get('image_url').encode()).decode() +def element(): + cipher_suite = Fernet(session['keys']['element_key']) + src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode() + src_type = request.args.get('type') try: - file_data = g.user_request.send(base_url=img_url, return_bytes=True) + file_data = g.user_request.send(base_url=src_url).content + app.user_elements[session['uuid']] -= 1 tmp_mem = io.BytesIO() tmp_mem.write(file_data) tmp_mem.seek(0) - return send_file( - tmp_mem, - as_attachment=True, - attachment_filename='tmp.png', - mimetype='image/png' - ) - except pycurl_error: + 
return send_file(tmp_mem, mimetype=src_type) + except exceptions.RequestException: pass empty_gif = base64.b64decode('R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==') @@ -215,7 +218,7 @@ def tmp(): @app.route('/window') @auth_required def window(): - get_body = g.user_request.send(base_url=request.args.get('location')) + get_body = g.user_request.send(base_url=request.args.get('location')).text get_body = get_body.replace('src="/', 'src="' + request.args.get('location') + '"') get_body = get_body.replace('href="/', 'href="' + request.args.get('location') + '"') diff --git a/app/static/js/controller.js b/app/static/js/controller.js index 4817195..95d917b 100644 --- a/app/static/js/controller.js +++ b/app/static/js/controller.js @@ -71,6 +71,41 @@ const setupConfigLayout = () => { fillConfigValues(); }; +const loadConfig = event => { + event.preventDefault(); + let config = prompt("Enter name of config:"); + if (!config) { + alert("Must specify a name for the config to load"); + return; + } + + let xhrPUT = new XMLHttpRequest(); + xhrPUT.open("PUT", "/config?name=" + config + ".conf"); + xhrPUT.onload = function() { + if (xhrPUT.readyState === 4 && xhrPUT.status !== 200) { + alert("Error loading Whoogle config"); + return; + } + + location.reload(true); + }; + + xhrPUT.send(); +}; + +const saveConfig = event => { + event.preventDefault(); + let config = prompt("Enter name for this config:"); + if (!config) { + alert("Must specify a name for the config to save"); + return; + } + + let configForm = document.getElementById("config-form"); + configForm.action = '/config?name=' + config + ".conf"; + configForm.submit(); +}; + document.addEventListener("DOMContentLoaded", function() { setTimeout(function() { document.getElementById("main").style.display = "block"; diff --git a/app/templates/header.html b/app/templates/header.html index 5356ec2..5573b99 100644 --- a/app/templates/header.html +++ b/app/templates/header.html @@ -15,7 +15,7 @@ 
style="background-color: {{ '#000' if dark_mode else '#fff' }}; color: {{ '#685e79' if dark_mode else '#000' }}; border: {{ '1px solid #685e79' if dark_mode else '' }}" - spellcheck="false" type="text" value="{{ q }}"> + spellcheck="false" type="text" value="{{ query }}">
@@ -37,7 +37,7 @@
diff --git a/app/templates/index.html b/app/templates/index.html index 9279031..7d32b9f 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -21,14 +21,14 @@ - + Whoogle Search - +
-
+
@@ -40,17 +40,13 @@
- -
- - User Agent: {{ ua }} -
+
{% for lang in languages %}
- +   +   +
diff --git a/app/utils/__init__.py b/app/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/utils/misc.py b/app/utils/misc.py new file mode 100644 index 0000000..a70a82a --- /dev/null +++ b/app/utils/misc.py @@ -0,0 +1,20 @@ +from cryptography.fernet import Fernet + +SESSION_VALS = ['uuid', 'config', 'keys'] + + +def generate_user_keys(): + # Generate/regenerate unique key per user + return { + 'element_key': Fernet.generate_key(), + 'text_key': Fernet.generate_key() + } + + +def valid_user_session(session): + # Generate secret key for user if unavailable + for value in SESSION_VALS: + if value not in session: + return False + + return True diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py new file mode 100644 index 0000000..cc3ed1f --- /dev/null +++ b/app/utils/routing_utils.py @@ -0,0 +1,69 @@ +from app import app +from app.filter import Filter, get_first_link +from app.request import gen_query +from bs4 import BeautifulSoup +from cryptography.fernet import Fernet, InvalidToken +from flask import g +from typing import Any, Tuple + + +class RoutingUtils: + def __init__(self, request, config, session): + self.request_params = request.args if request.method == 'GET' else request.form + self.user_agent = request.headers.get('User-Agent') + self.feeling_lucky = False + self.config = config + self.session = session + self.query = '' + self.search_type = self.request_params.get('tbm') if 'tbm' in self.request_params else '' + + def __getitem__(self, name): + return getattr(self, name) + + def __setitem__(self, name, value): + return setattr(self, name, value) + + def __delitem__(self, name): + return delattr(self, name) + + def __contains__(self, name): + return hasattr(self, name) + + def new_search_query(self) -> str: + app.user_elements[self.session['uuid']] = 0 + self.session['keys']['element_key'] = Fernet.generate_key() + + q = self.request_params.get('q') + + if q is None or len(q) == 0: + return '' + else: + # 
Attempt to decrypt if this is an internal link + try: + q = Fernet(self.session['keys']['text_key']).decrypt(q.encode()).decode() + except InvalidToken: + pass + + # Reset text key + self.session['keys']['text_key'] = Fernet.generate_key() + + # Format depending on whether or not the query is a "feeling lucky" query + self.feeling_lucky = q.startswith('! ') + self.query = q[2:] if self.feeling_lucky else q + return self.query + + def generate_response(self) -> Tuple[Any, int]: + mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent + + content_filter = Filter(self.session['keys'], mobile=mobile, config=self.config) + full_query = gen_query(self.query, self.request_params, self.config, content_filter.near) + get_body = g.user_request.send(query=full_query).text + + # Produce cleanable html soup from response + html_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser') + + if self.feeling_lucky: + return get_first_link(html_soup), 1 + else: + formatted_results = content_filter.clean(html_soup) + return formatted_results, content_filter.elements diff --git a/requirements.txt b/requirements.txt index 030780c..702d8ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,15 +4,16 @@ cffi==1.13.2 Click==7.0 cryptography==2.8 Flask==1.1.1 +Flask-Session==0.3.2 itsdangerous==1.1.0 Jinja2==2.10.3 lxml==4.5.1 MarkupSafe==1.1.1 pycparser==2.19 -pycurl==7.43.0.4 pyOpenSSL==19.1.0 pytest==5.4.1 python-dateutil==2.8.1 +requests==2.23.0 six==1.14.0 soupsieve==1.9.5 Werkzeug==0.16.0 diff --git a/setup.py b/setup.py index 3428459..08652bc 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setuptools.setup( author='Ben Busby', author_email='benbusby@protonmail.com', name='whoogle-search', - version='0.1.4', + version='0.2.0', include_package_data=True, install_requires=requirements, description='Self-hosted, ad-free, privacy-respecting Google metasearch engine', diff --git a/test/test_misc.py b/test/test_misc.py new file mode 100644 index 
0000000..296d03a --- /dev/null +++ b/test/test_misc.py @@ -0,0 +1,36 @@ +from app.utils.misc import generate_user_keys, valid_user_session + + +def test_generate_user_keys(): + keys = generate_user_keys() + assert 'text_key' in keys + assert 'element_key' in keys + assert keys['text_key'] not in keys['element_key'] + + +def test_valid_session(client): + with client.session_transaction() as session: + assert not valid_user_session(session) + + session['uuid'] = 'test' + session['keys'] = generate_user_keys() + session['config'] = {} + + assert valid_user_session(session) + + +def test_request_key_generation(client): + text_key = '' + rv = client.get('/search?q=test+1') + assert rv._status_code == 200 + + with client.session_transaction() as session: + assert valid_user_session(session) + text_key = session['keys']['text_key'] + + rv = client.get('/search?q=test+2') + assert rv._status_code == 200 + + with client.session_transaction() as session: + assert valid_user_session(session) + assert text_key not in session['keys']['text_key'] diff --git a/test/test_results.py b/test/test_results.py index 7f500c8..a943de6 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -1,13 +1,13 @@ from bs4 import BeautifulSoup -from cryptography.fernet import Fernet from app.filter import Filter +from app.utils.misc import generate_user_keys from datetime import datetime from dateutil.parser import * def get_search_results(data): - secret_key = Fernet.generate_key() - soup = Filter(secret_key=secret_key).clean(BeautifulSoup(data, 'html.parser')) + secret_key = generate_user_keys() + soup = Filter(user_keys=secret_key).clean(BeautifulSoup(data, 'html.parser')) main_divs = soup.find('div', {'id': 'main'}) assert len(main_divs) > 1 diff --git a/test/test_routes.py b/test/test_routes.py index 91e17be..56c9909 100644 --- a/test/test_routes.py +++ b/test/test_routes.py @@ -1,10 +1,13 @@ +from app.models.config import Config import json import random demo_config = { 'near': 
random.choice(['Seattle', 'New York', 'San Francisco']), 'dark_mode': str(random.getrandbits(1)), - 'nojs': str(random.getrandbits(1)) + 'nojs': str(random.getrandbits(1)), + 'lang': random.choice(Config.LANGUAGES)['value'], + 'ctry': random.choice(Config.COUNTRIES)['value'] } @@ -17,6 +20,7 @@ def test_search(client): rv = client.get('/search?q=test') assert rv._status_code == 200 + def test_feeling_lucky(client): rv = client.get('/search?q=!%20test') assert rv._status_code == 303 From 64af72abb503023b4ca7e45f65d994d166f3ac3c Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Tue, 2 Jun 2020 14:38:29 -0600 Subject: [PATCH 02/22] Moved custom conf files to their own directory --- .gitignore | 2 ++ app/__init__.py | 4 ++++ app/routes.py | 4 ++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index f6e039f..bbffdb4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,10 +3,12 @@ venv/ __pycache__/ *.pyc *.pem +*.conf config.json test/static flask_session/ app/static/config +app/static/custom_config # pip stuff build/ diff --git a/app/__init__.py b/app/__init__.py index 53d4a59..ecd1edd 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -12,12 +12,16 @@ app.config['VERSION_NUMBER'] = '0.2.0' app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER'] + '/config') +app.config['USER_CONFIG'] = os.path.join(app.config['STATIC_FOLDER'], 'custom_config') app.config['SESSION_FILE_DIR'] = app.config['CONFIG_PATH'] app.config['SESSION_COOKIE_SECURE'] = True if not os.path.exists(app.config['CONFIG_PATH']): os.makedirs(app.config['CONFIG_PATH']) +if not os.path.exists(app.config['USER_CONFIG']): + os.makedirs(app.config['USER_CONFIG']) + sess = Session() sess.init_app(app) diff --git 
a/app/routes.py b/app/routes.py index ca3bac4..b473073 100644 --- a/app/routes.py +++ b/app/routes.py @@ -157,7 +157,7 @@ def config(): return json.dumps(g.user_config.__dict__) elif request.method == 'PUT': if 'name' in request.args: - config_pkl = os.path.join(app.config['CONFIG_PATH'], request.args.get('name')) + config_pkl = os.path.join(app.config['USER_CONFIG'], request.args.get('name')) session['config'] = pickle.load(open(config_pkl, 'rb')) if os.path.exists(config_pkl) else session['config'] return json.dumps(session['config']) else: @@ -168,7 +168,7 @@ def config(): config_data['url'] = g.user_config.url if 'name' in request.args: - pickle.dump(config_data, open(os.path.join(app.config['CONFIG_PATH'], request.args.get('name')), 'wb')) + pickle.dump(config_data, open(os.path.join(app.config['USER_CONFIG'], request.args.get('name')), 'wb')) session['config'] = config_data return redirect(config_data['url']) From 32e837a5e08a6d6d44e4fa0a09e3696c28184619 Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Fri, 5 Jun 2020 15:24:44 -0600 Subject: [PATCH 03/22] Refactored whoogle session mgmt Now allows a fallback "default" session to be used if a user's browser is blocking cookies --- app/__init__.py | 20 +++++++------- app/routes.py | 51 ++++++++++++++++++++++++++--------- app/static/js/autocomplete.js | 2 +- app/utils/misc.py | 8 ++++-- app/utils/routing_utils.py | 15 ++++++----- test/test_misc.py | 6 ++--- 6 files changed, 66 insertions(+), 36 deletions(-) diff --git a/app/__init__.py b/app/__init__.py index ecd1edd..c6b8a42 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,28 +1,28 @@ from app.utils.misc import generate_user_keys -from cryptography.fernet import Fernet from flask import Flask from flask_session import Session import os app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static') app.user_elements = {} -app.config['SECRET_KEY'] = os.urandom(16) +app.default_key_set 
= generate_user_keys() +app.no_cookie_ips = [] +app.config['SECRET_KEY'] = os.urandom(32) app.config['SESSION_TYPE'] = 'filesystem' app.config['VERSION_NUMBER'] = '0.2.0' app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) -app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER'] + '/config') -app.config['USER_CONFIG'] = os.path.join(app.config['STATIC_FOLDER'], 'custom_config') -app.config['SESSION_FILE_DIR'] = app.config['CONFIG_PATH'] -app.config['SESSION_COOKIE_SECURE'] = True +app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', os.path.join(app.config['STATIC_FOLDER'], 'config')) +app.config['DEFAULT_CONFIG'] = os.path.join(app.config['CONFIG_PATH'], 'config.json') +app.config['SESSION_FILE_DIR'] = os.path.join(app.config['CONFIG_PATH'], 'session') +app.config['SESSION_COOKIE_SAMESITE'] = 'Strict' if not os.path.exists(app.config['CONFIG_PATH']): os.makedirs(app.config['CONFIG_PATH']) -if not os.path.exists(app.config['USER_CONFIG']): - os.makedirs(app.config['USER_CONFIG']) +if not os.path.exists(app.config['SESSION_FILE_DIR']): + os.makedirs(app.config['SESSION_FILE_DIR']) -sess = Session() -sess.init_app(app) +Session(app) from app import routes diff --git a/app/routes.py b/app/routes.py index b473073..99e05a6 100644 --- a/app/routes.py +++ b/app/routes.py @@ -37,11 +37,19 @@ def auth_required(f): @app.before_request def before_request_func(): - # Generate secret key for user if unavailable + g.request_params = request.args if request.method == 'GET' else request.form + g.cookies_disabled = False + + # Generate session values for user if unavailable if not valid_user_session(session): - session['config'] = {'url': request.url_root} - session['keys'] = generate_user_keys() + session['config'] = json.load(open(app.config['DEFAULT_CONFIG'])) \ + if 
os.path.exists(app.config['DEFAULT_CONFIG']) else {'url': request.url_root} session['uuid'] = str(uuid.uuid4()) + session['fernet_keys'] = generate_user_keys(True) + + # Flag cookies as possibly disabled in order to prevent against + # unnecessary session directory expansion + g.cookies_disabled = True if session['uuid'] not in app.user_elements: app.user_elements.update({session['uuid']: 0}) @@ -63,11 +71,22 @@ def before_request_func(): @app.after_request def after_request_func(response): - # Regenerate element key if all elements have been served to user if app.user_elements[session['uuid']] <= 0 and '/element' in request.url: - session['keys']['element_key'] = Fernet.generate_key() + # Regenerate element key if all elements have been served to user + session['fernet_keys']['element_key'] = '' if not g.cookies_disabled else app.default_key_set['element_key'] app.user_elements[session['uuid']] = 0 + # Check if address consistently has cookies blocked, in which case start removing session + # files after creation. + # Note: This is primarily done to prevent overpopulation of session directories, since browsers that + # block cookies will still trigger Flask's session creation routine with every request. 
+ if g.cookies_disabled and request.remote_addr not in app.no_cookie_ips: + app.no_cookie_ips.append(request.remote_addr) + elif g.cookies_disabled and request.remote_addr in app.no_cookie_ips: + session_list = list(session.keys()) + for key in session_list: + session.pop(key) + return response @@ -79,6 +98,9 @@ def unknown_page(e): @app.route('/', methods=['GET']) @auth_required def index(): + # Reset keys + session['fernet_keys'] = generate_user_keys(g.cookies_disabled) + return render_template('index.html', languages=Config.LANGUAGES, countries=Config.COUNTRIES, @@ -103,8 +125,7 @@ def opensearch(): @app.route('/autocomplete', methods=['GET', 'POST']) def autocomplete(): - request_params = request.args if request.method == 'GET' else request.form - q = request_params.get('q') + q = g.request_params.get('q') if not q and not request.data: return jsonify({'?': []}) @@ -117,11 +138,10 @@ def autocomplete(): @app.route('/search', methods=['GET', 'POST']) @auth_required def search(): - # Clear previous elements and generate a new key each time a new search is performed + # Reset element counter app.user_elements[session['uuid']] = 0 - session['keys']['element_key'] = Fernet.generate_key() - search_util = RoutingUtils(request, g.user_config, session) + search_util = RoutingUtils(request, g.user_config, session, cookies_disabled=g.cookies_disabled) query = search_util.new_search_query() # Redirect to home if invalid/blank search @@ -157,7 +177,7 @@ def config(): return json.dumps(g.user_config.__dict__) elif request.method == 'PUT': if 'name' in request.args: - config_pkl = os.path.join(app.config['USER_CONFIG'], request.args.get('name')) + config_pkl = os.path.join(app.config['CONFIG_PATH'], request.args.get('name')) session['config'] = pickle.load(open(config_pkl, 'rb')) if os.path.exists(config_pkl) else session['config'] return json.dumps(session['config']) else: @@ -167,8 +187,13 @@ def config(): if 'url' not in config_data or not config_data['url']: 
config_data['url'] = g.user_config.url + # Save config by name to allow a user to easily load later if 'name' in request.args: - pickle.dump(config_data, open(os.path.join(app.config['USER_CONFIG'], request.args.get('name')), 'wb')) + pickle.dump(config_data, open(os.path.join(app.config['CONFIG_PATH'], request.args.get('name')), 'wb')) + + # Overwrite default config if user has cookies disabled + if g.cookies_disabled: + open(app.config['DEFAULT_CONFIG'], 'w').write(json.dumps(config_data, indent=4)) session['config'] = config_data return redirect(config_data['url']) @@ -196,7 +221,7 @@ def imgres(): @app.route('/element') @auth_required def element(): - cipher_suite = Fernet(session['keys']['element_key']) + cipher_suite = Fernet(session['fernet_keys']['element_key']) src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode() src_type = request.args.get('type') diff --git a/app/static/js/autocomplete.js b/app/static/js/autocomplete.js index 316f8c4..84e9b23 100644 --- a/app/static/js/autocomplete.js +++ b/app/static/js/autocomplete.js @@ -4,7 +4,7 @@ const handleUserInput = searchBar => { xhrRequest.setRequestHeader("Content-type", "application/x-www-form-urlencoded"); xhrRequest.onload = function() { if (xhrRequest.readyState === 4 && xhrRequest.status !== 200) { - alert("Error fetching autocomplete results"); + // Do nothing if failed to fetch autocomplete results return; } diff --git a/app/utils/misc.py b/app/utils/misc.py index a70a82a..9bd580d 100644 --- a/app/utils/misc.py +++ b/app/utils/misc.py @@ -1,9 +1,13 @@ from cryptography.fernet import Fernet +from flask import current_app as app -SESSION_VALS = ['uuid', 'config', 'keys'] +SESSION_VALS = ['uuid', 'config', 'fernet_keys'] -def generate_user_keys(): +def generate_user_keys(cookies_disabled=False) -> dict: + if cookies_disabled: + return app.default_key_set + # Generate/regenerate unique key per user return { 'element_key': Fernet.generate_key(), diff --git a/app/utils/routing_utils.py 
b/app/utils/routing_utils.py index cc3ed1f..cfe0b64 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -1,5 +1,5 @@ -from app import app from app.filter import Filter, get_first_link +from app.utils.misc import generate_user_keys from app.request import gen_query from bs4 import BeautifulSoup from cryptography.fernet import Fernet, InvalidToken @@ -8,13 +8,14 @@ from typing import Any, Tuple class RoutingUtils: - def __init__(self, request, config, session): + def __init__(self, request, config, session, cookies_disabled=False): self.request_params = request.args if request.method == 'GET' else request.form self.user_agent = request.headers.get('User-Agent') self.feeling_lucky = False self.config = config self.session = session self.query = '' + self.cookies_disabled = cookies_disabled self.search_type = self.request_params.get('tbm') if 'tbm' in self.request_params else '' def __getitem__(self, name): @@ -30,8 +31,8 @@ class RoutingUtils: return hasattr(self, name) def new_search_query(self) -> str: - app.user_elements[self.session['uuid']] = 0 - self.session['keys']['element_key'] = Fernet.generate_key() + # Generate a new element key each time a new search is performed + self.session['fernet_keys']['element_key'] = generate_user_keys(cookies_disabled=self.cookies_disabled)['element_key'] q = self.request_params.get('q') @@ -40,12 +41,12 @@ class RoutingUtils: else: # Attempt to decrypt if this is an internal link try: - q = Fernet(self.session['keys']['text_key']).decrypt(q.encode()).decode() + q = Fernet(self.session['fernet_keys']['text_key']).decrypt(q.encode()).decode() except InvalidToken: pass # Reset text key - self.session['keys']['text_key'] = Fernet.generate_key() + self.session['fernet_keys']['text_key'] = generate_user_keys(cookies_disabled=self.cookies_disabled)['text_key'] # Format depending on whether or not the query is a "feeling lucky" query self.feeling_lucky = q.startswith('! 
') @@ -55,7 +56,7 @@ class RoutingUtils: def generate_response(self) -> Tuple[Any, int]: mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent - content_filter = Filter(self.session['keys'], mobile=mobile, config=self.config) + content_filter = Filter(self.session['fernet_keys'], mobile=mobile, config=self.config) full_query = gen_query(self.query, self.request_params, self.config, content_filter.near) get_body = g.user_request.send(query=full_query).text diff --git a/test/test_misc.py b/test/test_misc.py index 296d03a..96ef373 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -13,7 +13,7 @@ def test_valid_session(client): assert not valid_user_session(session) session['uuid'] = 'test' - session['keys'] = generate_user_keys() + session['fernet_keys'] = generate_user_keys() session['config'] = {} assert valid_user_session(session) @@ -26,11 +26,11 @@ def test_request_key_generation(client): with client.session_transaction() as session: assert valid_user_session(session) - text_key = session['keys']['text_key'] + text_key = session['fernet_keys']['text_key'] rv = client.get('/search?q=test+2') assert rv._status_code == 200 with client.session_transaction() as session: assert valid_user_session(session) - assert text_key not in session['keys']['text_key'] + assert text_key not in session['fernet_keys']['text_key'] From 6ec65f8754c8e4bd40f304102a58ef596b8f8c1a Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Fri, 5 Jun 2020 16:09:04 -0600 Subject: [PATCH 04/22] Reworked pytest client fixture to support new session mgmt --- app/__init__.py | 1 - test/conftest.py | 9 +++++++-- test/test_misc.py | 15 ++++++--------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/app/__init__.py b/app/__init__.py index c6b8a42..22e436d 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -15,7 +15,6 @@ app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config app.config['CONFIG_PATH'] 
= os.getenv('CONFIG_VOLUME', os.path.join(app.config['STATIC_FOLDER'], 'config')) app.config['DEFAULT_CONFIG'] = os.path.join(app.config['CONFIG_PATH'], 'config.json') app.config['SESSION_FILE_DIR'] = os.path.join(app.config['CONFIG_PATH'], 'session') -app.config['SESSION_COOKIE_SAMESITE'] = 'Strict' if not os.path.exists(app.config['CONFIG_PATH']): os.makedirs(app.config['CONFIG_PATH']) diff --git a/test/conftest.py b/test/conftest.py index 3d2aa33..63aec3e 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,8 +1,13 @@ from app import app +from app.utils.misc import generate_user_keys import pytest @pytest.fixture def client(): - client = app.test_client() - yield client + with app.test_client() as client: + with client.session_transaction() as session: + session['uuid'] = 'test' + session['fernet_keys'] = generate_user_keys() + session['config'] = {} + yield client diff --git a/test/test_misc.py b/test/test_misc.py index 96ef373..8eb1d78 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -9,26 +9,23 @@ def test_generate_user_keys(): def test_valid_session(client): + assert not valid_user_session({'fernet_keys': '', 'config': {}}) with client.session_transaction() as session: - assert not valid_user_session(session) - - session['uuid'] = 'test' - session['fernet_keys'] = generate_user_keys() - session['config'] = {} - assert valid_user_session(session) def test_request_key_generation(client): - text_key = '' - rv = client.get('/search?q=test+1') + rv = client.get('/') + cookie = rv.headers['Set-Cookie'] + + rv = client.get('/search?q=test+1', headers={'Cookie': cookie}) assert rv._status_code == 200 with client.session_transaction() as session: assert valid_user_session(session) text_key = session['fernet_keys']['text_key'] - rv = client.get('/search?q=test+2') + rv = client.get('/search?q=test+2', headers={'Cookie': cookie}) assert rv._status_code == 200 with client.session_transaction() as session: From 4324fcd8f8ea274b5e913840fd97c04465feb82a Mon 
Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Sun, 7 Jun 2020 14:06:49 -0600 Subject: [PATCH 05/22] Added better multilingual support, updated filter Results page now includes method for switching to "All Languages" from whichever language is specified as the primary in the config (see #74). Also removes the non-Whoogle links from the page footer, leaving only the page navigation controls Added support for the date range filter on the results page, though I'd still recommend using the ":past " query instead. --- app/filter.py | 19 ++++++++----------- app/request.py | 30 ++++++++++++++++++++++++------ app/routes.py | 1 + 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/app/filter.py b/app/filter.py index be9809b..9944441 100644 --- a/app/filter.py +++ b/app/filter.py @@ -116,14 +116,11 @@ class Filter: for script in soup('script'): script.decompose() - # Remove google's language/time config - st_card = soup.find('div', id='st-card') - if st_card: - st_card.decompose() - - footer = soup.find('div', id='sfooter') + # Update default footer and header + footer = soup.find('footer') if footer: - footer.decompose() + # Remove divs that have multiple links beyond just page navigation + [_.decompose() for _ in footer.find_all('div', recursive=False) if len(_.find_all('a', href=True)) > 2] header = soup.find('header') if header: @@ -144,12 +141,12 @@ class Filter: return question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False) if len(_.find_all('h2')) > 0] - for x in question_divs: - questions = [_ for _ in x.find_all('div', recursive=True) if _.text.endswith('?')] + for question_div in question_divs: + questions = [_ for _ in question_div.find_all('div', recursive=True) if _.text.endswith('?')] for question in questions: question['style'] = 'padding: 10px; font-style: italic;' - def update_element_src(self, element, mimetype): + def update_element_src(self, element, mime): element_src = 
element['src'] if element_src.startswith('//'): element_src = 'https:' + element_src @@ -163,7 +160,7 @@ class Filter: return element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \ - '&type=' + urlparse.quote(mimetype) + '&type=' + urlparse.quote(mime) # TODO: Non-mobile image results link to website instead of image # if not self.mobile: # img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser')) diff --git a/app/request.py b/app/request.py index 38b47b0..398ab71 100644 --- a/app/request.py +++ b/app/request.py @@ -12,7 +12,7 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0' DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' # Valid query params -VALID_PARAMS = ['tbs', 'tbm', 'start', 'near'] +VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source'] def gen_user_agent(is_mobile): @@ -28,11 +28,22 @@ def gen_user_agent(is_mobile): def gen_query(query, args, config, near_city=None): param_dict = {key: '' for key in VALID_PARAMS} + # Use :past(hour/day/week/month/year) if available # example search "new restaurants :past month" - if ':past' in query: + sub_lang = '' + if ':past' in query and 'tbs' not in args: time_range = str.strip(query.split(':past', 1)[-1]) - param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0]) + param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0])) + elif 'tbs' in args: + result_tbs = args.get('tbs') + param_dict['tbs'] = '&tbs=' + result_tbs + + # Occasionally the 'tbs' param provided by google also contains a field for 'lr', but formatted + # strangely. This is a (admittedly not very elegant) solution for this. 
+ # Ex/ &tbs=qdr:h,lr:lang_1pl --> the lr param needs to be extracted and have the "1" digit removed in this case + sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _] + sub_lang = sub_lang[0][sub_lang[0].find('lr:') + 3:len(sub_lang[0])] if len(sub_lang) > 0 else '' # Ensure search query is parsable query = urlparse.quote(query) @@ -49,13 +60,20 @@ def gen_query(query, args, config, near_city=None): if near_city: param_dict['near'] = '&near=' + urlparse.quote(near_city) - # Set language for results (lr) and interface (hl) - param_dict['lr'] = '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '') + # Set language for results (lr) if source isn't set, otherwise use the result + # language param provided by google (but with the strange digit(s) removed) + if 'source' in args: + param_dict['source'] = '&source=' + args.get('source') + param_dict['lr'] = ('&lr=' + ''.join([_ for _ in sub_lang if not _.isdigit()])) if sub_lang else '' + else: + param_dict['lr'] = '&lr=' + config.lang + param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else '' + param_dict['hl'] = '&hl=' + config.lang.replace('lang_', '') param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off') for val in param_dict.values(): - if not val or val is None: + if not val: continue query += val diff --git a/app/routes.py b/app/routes.py index 99e05a6..0139594 100644 --- a/app/routes.py +++ b/app/routes.py @@ -87,6 +87,7 @@ def after_request_func(response): for key in session_list: session.pop(key) + response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate" return response From f86a44b6372de257622821781bc852780bac0eaf Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Thu, 11 Jun 2020 12:14:57 -0600 Subject: [PATCH 06/22] Removed no-cache enforcement, minor styling/formatting improvements --- app/request.py | 4 ++-- app/routes.py | 21 ++++++++++----------- app/utils/misc.py | 4 ++-- app/utils/routing_utils.py | 6 
++++-- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/app/request.py b/app/request.py index 398ab71..fe7d3fb 100644 --- a/app/request.py +++ b/app/request.py @@ -22,8 +22,8 @@ def gen_user_agent(is_mobile): if is_mobile: return MOBILE_UA.format(mozilla, firefox) - else: - return DESKTOP_UA.format(mozilla, linux, firefox) + + return DESKTOP_UA.format(mozilla, linux, firefox) def gen_query(query, args, config, near_city=None): diff --git a/app/routes.py b/app/routes.py index 0139594..c60d4ac 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,22 +1,22 @@ -from app import app -from app.models.config import Config -from app.request import Request -from app.utils.misc import generate_user_keys, valid_user_session -from app.utils.routing_utils import * import argparse import base64 -from bs4 import BeautifulSoup -from cryptography.fernet import Fernet -from flask import g, jsonify, make_response, request, redirect, render_template, send_file, session -from functools import wraps import io import json import os import pickle import urllib.parse as urlparse -from requests import exceptions import uuid +from functools import wraps + import waitress +from flask import jsonify, make_response, request, redirect, render_template, send_file, session +from requests import exceptions + +from app import app +from app.models.config import Config +from app.request import Request +from app.utils.misc import valid_user_session +from app.utils.routing_utils import * def auth_required(f): @@ -87,7 +87,6 @@ def after_request_func(response): for key in session_list: session.pop(key) - response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate" return response diff --git a/app/utils/misc.py b/app/utils/misc.py index 9bd580d..f959abe 100644 --- a/app/utils/misc.py +++ b/app/utils/misc.py @@ -1,7 +1,7 @@ from cryptography.fernet import Fernet from flask import current_app as app -SESSION_VALS = ['uuid', 'config', 'fernet_keys'] +REQUIRED_SESSION_VALUES = 
['uuid', 'config', 'fernet_keys'] def generate_user_keys(cookies_disabled=False) -> dict: @@ -17,7 +17,7 @@ def generate_user_keys(cookies_disabled=False) -> dict: def valid_user_session(session): # Generate secret key for user if unavailable - for value in SESSION_VALS: + for value in REQUIRED_SESSION_VALUES: if value not in session: return False diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py index cfe0b64..40f8a90 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -32,7 +32,8 @@ class RoutingUtils: def new_search_query(self) -> str: # Generate a new element key each time a new search is performed - self.session['fernet_keys']['element_key'] = generate_user_keys(cookies_disabled=self.cookies_disabled)['element_key'] + self.session['fernet_keys']['element_key'] = generate_user_keys( + cookies_disabled=self.cookies_disabled)['element_key'] q = self.request_params.get('q') @@ -46,7 +47,8 @@ class RoutingUtils: pass # Reset text key - self.session['fernet_keys']['text_key'] = generate_user_keys(cookies_disabled=self.cookies_disabled)['text_key'] + self.session['fernet_keys']['text_key'] = generate_user_keys( + cookies_disabled=self.cookies_disabled)['text_key'] # Format depending on whether or not the query is a "feeling lucky" query self.feeling_lucky = q.startswith('! 
') From f7380ae15dbfc8f1d3a71d42c330e7827870f7a3 Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Thu, 11 Jun 2020 13:21:40 -0600 Subject: [PATCH 07/22] Improving ad filtering for non-English languages --- app/filter.py | 7 ++++--- app/utils/misc.py | 5 +++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/app/filter.py b/app/filter.py index 9944441..1cc9f87 100644 --- a/app/filter.py +++ b/app/filter.py @@ -1,4 +1,5 @@ from app.request import VALID_PARAMS +from app.utils.misc import BLACKLIST from bs4 import BeautifulSoup from bs4.element import ResultSet from cryptography.fernet import Fernet @@ -47,8 +48,8 @@ def filter_link_args(query_link): return query_link -def has_ad_content(element): - return element == 'ad' or element == 'sponsoredⓘ' +def has_ad_content(element: str): + return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element class Filter: @@ -133,7 +134,7 @@ class Filter: return for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]: - has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text.lower())]) + has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text)]) _ = div.decompose() if has_ad else None def fix_question_section(self): diff --git a/app/utils/misc.py b/app/utils/misc.py index f959abe..b87941d 100644 --- a/app/utils/misc.py +++ b/app/utils/misc.py @@ -2,6 +2,11 @@ from cryptography.fernet import Fernet from flask import current_app as app REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys'] +BLACKLIST = [ + 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고', + 'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', + 'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés' +] def generate_user_keys(cookies_disabled=False) -> dict: From 
5f8309d2f0aa484b8c32aff70ffbbe75b18555a9 Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Thu, 11 Jun 2020 13:25:23 -0600 Subject: [PATCH 08/22] Added footer to results page --- app/routes.py | 1 + app/templates/display.html | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/app/routes.py b/app/routes.py index c60d4ac..ed288c0 100644 --- a/app/routes.py +++ b/app/routes.py @@ -162,6 +162,7 @@ def search(): search_type=search_util.search_type, dark_mode=g.user_config.dark, response=response, + version_number=app.config['VERSION_NUMBER'], search_header=render_template( 'header.html', dark_mode=g.user_config.dark, diff --git a/app/templates/display.html b/app/templates/display.html index 94ebed2..bd18838 100644 --- a/app/templates/display.html +++ b/app/templates/display.html @@ -11,7 +11,13 @@ {{ query }} - Whoogle Search - {{ search_header|safe }} - {{ response|safe }} + {{ search_header|safe }} + {{ response|safe }} + From bf4bf1ff2cdf318dfb8b7c11dc093dc274893e02 Mon Sep 17 00:00:00 2001 From: "Joao A. Candido Ramos" Date: Sat, 27 Jun 2020 20:23:17 +0000 Subject: [PATCH 09/22] Split interface and results language config (#89) Adding support to choose separately the language of search and the one for the interface (allowing a default given by google). 
Co-authored-by: Joao --- app/models/config.py | 4 +++- app/request.py | 4 ++-- app/routes.py | 4 ++-- app/templates/index.html | 19 ++++++++++++++++--- test/test_routes.py | 3 ++- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/app/models/config.py b/app/models/config.py index 544d2d1..45b1b65 100644 --- a/app/models/config.py +++ b/app/models/config.py @@ -2,6 +2,7 @@ class Config: # Derived from here: # https://sites.google.com/site/tomihasa/google-language-codes#searchlanguage LANGUAGES = [ + {'name': 'Default (use server location)', 'value': ''}, {'name': 'English', 'value': 'lang_en'}, {'name': 'Afrikaans', 'value': 'lang_af'}, {'name': 'Arabic', 'value': 'lang_ar'}, @@ -298,7 +299,8 @@ class Config: def __init__(self, **kwargs): self.url = '' - self.lang = 'lang_en' + self.lang_search = '' + self.lang_interface = '' self.ctry = '' self.safe = False self.dark = False diff --git a/app/request.py b/app/request.py index fe7d3fb..192eedc 100644 --- a/app/request.py +++ b/app/request.py @@ -66,10 +66,10 @@ def gen_query(query, args, config, near_city=None): param_dict['source'] = '&source=' + args.get('source') param_dict['lr'] = ('&lr=' + ''.join([_ for _ in sub_lang if not _.isdigit()])) if sub_lang else '' else: - param_dict['lr'] = '&lr=' + config.lang + param_dict['lr'] = ('&lr=' + config.lang_search) if config.lang_search else '' param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else '' - param_dict['hl'] = '&hl=' + config.lang.replace('lang_', '') + param_dict['hl'] = ('&hl=' + config.lang_interface.replace('lang_', '')) if config.lang_interface else '' param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off') for val in param_dict.values(): diff --git a/app/routes.py b/app/routes.py index ed288c0..7f1869c 100644 --- a/app/routes.py +++ b/app/routes.py @@ -59,13 +59,13 @@ def before_request_func(): if https_only and request.url.startswith('http://'): return redirect(request.url.replace('http://', 'https://', 1), code=308) - + 
g.user_config = Config(**session['config']) if not g.user_config.url: g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root - g.user_request = Request(request.headers.get('User-Agent'), language=g.user_config.lang) + g.user_request = Request(request.headers.get('User-Agent'), language=g.user_config.lang_search) g.app_location = g.user_config.url diff --git a/app/templates/index.html b/app/templates/index.html index 23e3fbd..a541413 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -55,11 +55,24 @@
- - {% for lang in languages %} + {% endfor %} + +
+
+ +
+
+ + +
— Replaces Twitter/YouTube/Instagram links + with Nitter/Invidious/Bibliogram links.
+
diff --git a/app/utils/filter_utils.py b/app/utils/filter_utils.py new file mode 100644 index 0000000..ed05d76 --- /dev/null +++ b/app/utils/filter_utils.py @@ -0,0 +1,79 @@ +from bs4 import BeautifulSoup +import urllib.parse as urlparse +from urllib.parse import parse_qs + +SKIP_ARGS = ['ref_src', 'utm'] +FULL_RES_IMG = '
Full Image' +GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' +LOGO_URL = GOOG_IMG + '_desk' +BLANK_B64 = ''' + +''' + +BLACKLIST = [ + 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고', + 'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', + 'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés' +] + +SITE_ALTS = { + 'twitter.com': 'nitter.net', + 'youtube.com': 'invidio.us', + 'instagram.com': 'bibliogram.art/u' +} + + +def has_ad_content(element: str): + return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element + + +def get_first_link(soup): + # Replace hrefs with only the intended destination (no "utm" type tags) + for a in soup.find_all('a', href=True): + # Return the first search result URL + if 'url?q=' in a['href']: + return filter_link_args(a['href']) + + +def get_site_alt(link: str): + for site_key in SITE_ALTS.keys(): + if site_key not in link: + continue + + link = link.replace(site_key, SITE_ALTS[site_key]) + break + + return link + + +def filter_link_args(query_link): + parsed_link = urlparse.urlparse(query_link) + link_args = parse_qs(parsed_link.query) + safe_args = {} + + if len(link_args) == 0 and len(parsed_link) > 0: + return query_link + + for arg in link_args.keys(): + if arg in SKIP_ARGS: + continue + + safe_args[arg] = link_args[arg] + + # Remove original link query and replace with filtered args + query_link = query_link.replace(parsed_link.query, '') + if len(safe_args) > 0: + query_link = query_link + urlparse.urlencode(safe_args, doseq=True) + else: + query_link = query_link.replace('?', '') + + return query_link + + +def gen_nojs(sibling): + nojs_link = BeautifulSoup().new_tag('a') + nojs_link['href'] = '/window?location=' + sibling['href'] + nojs_link['style'] = 'display:block;width:100%;' + nojs_link.string = 'NoJS Link: ' + nojs_link['href'] + sibling.append(BeautifulSoup('


', 'html.parser')) + sibling.append(nojs_link) \ No newline at end of file diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py index 40f8a90..2a649b4 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -1,5 +1,5 @@ from app.filter import Filter, get_first_link -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys from app.request import gen_query from bs4 import BeautifulSoup from cryptography.fernet import Fernet, InvalidToken diff --git a/app/utils/misc.py b/app/utils/session_utils.py similarity index 62% rename from app/utils/misc.py rename to app/utils/session_utils.py index b87941d..f959abe 100644 --- a/app/utils/misc.py +++ b/app/utils/session_utils.py @@ -2,11 +2,6 @@ from cryptography.fernet import Fernet from flask import current_app as app REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys'] -BLACKLIST = [ - 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고', - 'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', - 'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés' -] def generate_user_keys(cookies_disabled=False) -> dict: diff --git a/test/conftest.py b/test/conftest.py index 63aec3e..7a15f00 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,5 +1,5 @@ from app import app -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys import pytest diff --git a/test/test_misc.py b/test/test_misc.py index 8eb1d78..92fcadb 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -1,4 +1,4 @@ -from app.utils.misc import generate_user_keys, valid_user_session +from app.utils.session_utils import generate_user_keys, valid_user_session def test_generate_user_keys(): diff --git a/test/test_results.py b/test/test_results.py index 463a355..a7aa771 100644 --- a/test/test_results.py +++ 
b/test/test_results.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup from app.filter import Filter -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys from datetime import datetime from dateutil.parser import * From f4eca3711b6ba5b0cd6ff0ea3339f3144bd60a06 Mon Sep 17 00:00:00 2001 From: Spike <19519553+spikecodes@users.noreply.github.com> Date: Wed, 12 Aug 2020 05:06:16 +0000 Subject: [PATCH 13/22] Allow for free deployment to Repl.it (#114) * Update README.md with instructions for deploying via Repl.it * Create .replit --- .replit | 2 ++ README.md | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) create mode 100644 .replit diff --git a/.replit b/.replit new file mode 100644 index 0000000..909eee8 --- /dev/null +++ b/.replit @@ -0,0 +1,2 @@ +language = "python3" +run = "pip install -r requirements.txt && ./run" diff --git a/README.md b/README.md index ecf4a96..e9c27a3 100644 --- a/README.md +++ b/README.md @@ -56,10 +56,19 @@ There are a few different ways to begin using the app, depending on your prefere Provides: - Free deployment of app -- Free https url (https://\.herokuapp.com) +- Free HTTPS url (https://\.herokuapp.com) - Downtime after periods of inactivity \([solution](https://github.com/benbusby/whoogle-search#prevent-downtime-heroku-only)\) -### B) [pipx](https://github.com/pipxproject/pipx#install-pipx) +### B) [Repl.it](https://repl.it) +[![Run on Repl.it](https://repl.it/badge/github/benbusby/whoogle-search)](https://repl.it/github/benbusby/whoogle-search) + +Provides: +- Free deployment of app (can be ran without account) +- Free HTTPS url (https://\.\\.repl\.co) + - Supports custom domains +- Downtime after periods of inactivity \([solution 1](https://repl.it/talk/ask/use-this-pingmat1replco-just-enter/28821/101298), [solution 2](https://repl.it/talk/learn/How-to-use-and-setup-UptimeRobot/9003)\) + +### C) [pipx](https://github.com/pipxproject/pipx#install-pipx) Persistent 
install: `pipx install git+https://github.com/benbusby/whoogle-search.git` @@ -68,7 +77,7 @@ Sandboxed temporary instance: `pipx run git+https://github.com/benbusby/whoogle-search.git whoogle-search` -### C) pip +### D) pip `pip install whoogle-search` ```bash @@ -86,7 +95,7 @@ optional arguments: --https-only Enforces HTTPS redirects for all requests (default False) ``` -### D) Manual +### E) Manual Clone the repo and run the following commands to start the app in a local-only environment: ```bash @@ -125,7 +134,7 @@ sudo systemctl enable whoogle sudo systemctl start whoogle ``` -### E) Manual (Docker) +### F) Manual (Docker) 1. Ensure the Docker daemon is running, and is accessible by your user account - To add user permissions, you can execute `sudo usermod -aG docker yourusername` - Running `docker ps` should return something besides an error. If you encounter an error saying the daemon isn't running, try `sudo systemctl start docker` (Linux) or ensure the docker tool is running (Windows/macOS). From b2ecd8dc789299a26c7809c4931bb88fae06510e Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Sat, 15 Aug 2020 11:58:16 -0600 Subject: [PATCH 14/22] Updated search suggestion behavior (closes #115) Arrow key navigation through search suggestions now populates the input field with text content from the active selection. Navigating "down" past the end of the suggestions list returns the active cursor to position 0, while navigating "up" before the list of suggestions restores the original search query and removes the active highlight from element 0. 
--- app/static/js/autocomplete.js | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/app/static/js/autocomplete.js b/app/static/js/autocomplete.js index 84e9b23..3d179ca 100644 --- a/app/static/js/autocomplete.js +++ b/app/static/js/autocomplete.js @@ -2,7 +2,7 @@ const handleUserInput = searchBar => { let xhrRequest = new XMLHttpRequest(); xhrRequest.open("POST", "/autocomplete"); xhrRequest.setRequestHeader("Content-type", "application/x-www-form-urlencoded"); - xhrRequest.onload = function() { + xhrRequest.onload = function () { if (xhrRequest.readyState === 4 && xhrRequest.status !== 200) { // Do nothing if failed to fetch autocomplete results return; @@ -18,6 +18,7 @@ const handleUserInput = searchBar => { const autocomplete = (searchInput, autocompleteResults) => { let currentFocus; + let originalSearch; searchInput.addEventListener("input", function () { let autocompleteList, autocompleteItem, i, val = this.value; @@ -53,9 +54,11 @@ const autocomplete = (searchInput, autocompleteResults) => { let suggestion = document.getElementById(this.id + "-autocomplete-list"); if (suggestion) suggestion = suggestion.getElementsByTagName("div"); if (e.keyCode === 40) { // down + e.preventDefault(); currentFocus++; addActive(suggestion); } else if (e.keyCode === 38) { //up + e.preventDefault(); currentFocus--; addActive(suggestion); } else if (e.keyCode === 13) { // enter @@ -63,17 +66,36 @@ const autocomplete = (searchInput, autocompleteResults) => { if (currentFocus > -1) { if (suggestion) suggestion[currentFocus].click(); } + } else { + originalSearch = document.getElementById("search-bar").value; } }); const addActive = suggestion => { - if (!suggestion || !suggestion[currentFocus]) return false; + let searchBar = document.getElementById("search-bar"); + + // Handle navigation outside of suggestion list + if (!suggestion || !suggestion[currentFocus]) { + if (currentFocus >= suggestion.length) { + // Move selection 
back to the beginning + currentFocus = 0; + } else if (currentFocus < 0) { + // Retrieve original search and remove active suggestion selection + currentFocus = -1; + searchBar.value = originalSearch; + removeActive(suggestion); + return; + } else { + return; + } + } + removeActive(suggestion); - - if (currentFocus >= suggestion.length) currentFocus = 0; - if (currentFocus < 0) currentFocus = (suggestion.length - 1); - suggestion[currentFocus].classList.add("autocomplete-active"); + + // Autofill search bar with suggestion content + searchBar.value = suggestion[currentFocus].textContent; + searchBar.focus(); }; const removeActive = suggestion => { From 0c0a01b83f424ca5bf8dffcb63e384bbd241a7a3 Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Sat, 15 Aug 2020 13:02:17 -0600 Subject: [PATCH 15/22] Minor opensearch route and description updates Bumped version to 0.2.1 for next release Updated image in opensearch template to use base64 image Updated opensearch route to serve file as attachment --- app/__init__.py | 2 +- app/routes.py | 11 +++++------ app/static/css/main.css | 5 +++-- app/templates/opensearch.xml | 2 +- setup.py | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/app/__init__.py b/app/__init__.py index f21d4b4..8293c44 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -9,7 +9,7 @@ app.default_key_set = generate_user_keys() app.no_cookie_ips = [] app.config['SECRET_KEY'] = os.urandom(32) app.config['SESSION_TYPE'] = 'filesystem' -app.config['VERSION_NUMBER'] = '0.2.0' +app.config['VERSION_NUMBER'] = '0.2.1' app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', os.path.join(app.config['STATIC_FOLDER'], 'config')) diff --git a/app/routes.py b/app/routes.py index fd6278d..2e2eb37 100644 --- 
a/app/routes.py +++ b/app/routes.py @@ -115,12 +115,11 @@ def opensearch(): if opensearch_url.endswith('/'): opensearch_url = opensearch_url[:-1] - template = render_template('opensearch.xml', - main_url=opensearch_url, - request_type='get' if g.user_config.get_only else 'post') - response = make_response(template) - response.headers['Content-Type'] = 'application/xml' - return response + return render_template( + 'opensearch.xml', + main_url=opensearch_url, + request_type='get' if g.user_config.get_only else 'post' + ), 200, {'Content-Disposition': 'attachment; filename="opensearch.xml"'} @app.route('/autocomplete', methods=['GET', 'POST']) diff --git a/app/static/css/main.css b/app/static/css/main.css index 34458f6..1fc9c3d 100644 --- a/app/static/css/main.css +++ b/app/static/css/main.css @@ -16,6 +16,7 @@ body { left: 50%; transform: translate(-50%, -50%); max-width: 600px; + z-index: 15; } .search-items { @@ -127,10 +128,10 @@ footer { bottom: 0%; text-align: center; width: 100%; - z-index: -1; + z-index: 10; } .info-text { font-style: italic; font-size: 12px; -} \ No newline at end of file +} diff --git a/app/templates/opensearch.xml b/app/templates/opensearch.xml index b737be7..8730b8e 100644 --- a/app/templates/opensearch.xml +++ b/app/templates/opensearch.xml @@ -3,7 +3,7 @@ Whoogle Whoogle: A lightweight, deployable Google search proxy for desktop/mobile that removes Javascript, AMP links, and ads UTF-8 - /static/img/favicon/favicon-32x32.png +  diff --git a/setup.py b/setup.py index 08652bc..b2cddd1 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setuptools.setup( author='Ben Busby', author_email='benbusby@protonmail.com', name='whoogle-search', - version='0.2.0', + version='0.2.1', include_package_data=True, install_requires=requirements, description='Self-hosted, ad-free, privacy-respecting Google metasearch engine', From e471b012a0ab302bf00bdd681e6f649f44ec0bd6 Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> 
Date: Sat, 15 Aug 2020 14:03:26 -0600 Subject: [PATCH 16/22] Updated opensearch template Reconfigured template to only use method parameter if set to search via POST request (which is the default). Apparently Chrome/Chromium based browsers don't like non-GET request searches, and specifying a method caused Chrome to reject the template altogether. --- app/routes.py | 2 +- app/templates/opensearch.xml | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/app/routes.py b/app/routes.py index 2e2eb37..56bc6de 100644 --- a/app/routes.py +++ b/app/routes.py @@ -118,7 +118,7 @@ def opensearch(): return render_template( 'opensearch.xml', main_url=opensearch_url, - request_type='get' if g.user_config.get_only else 'post' + request_type='' if g.user_config.get_only else 'method="post"' ), 200, {'Content-Disposition': 'attachment; filename="opensearch.xml"'} diff --git a/app/templates/opensearch.xml b/app/templates/opensearch.xml index 8730b8e..8e2e7b2 100644 --- a/app/templates/opensearch.xml +++ b/app/templates/opensearch.xml @@ -1,13 +1,14 @@ + Whoogle Whoogle: A lightweight, deployable Google search proxy for desktop/mobile that removes Javascript, AMP links, and ads UTF-8  - + - + {{ main_url }}/search From 6ba5e8f165801a5996f539fa0d147954b30a5aa0 Mon Sep 17 00:00:00 2001 From: Chad Smith Date: Thu, 20 Aug 2020 12:40:34 -0700 Subject: [PATCH 17/22] fix pipx run command (#118) Add the required `--spec` argument --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e9c27a3..84a4f44 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Persistent install: Sandboxed temporary instance: -`pipx run git+https://github.com/benbusby/whoogle-search.git whoogle-search` +`pipx run --spec git+https://github.com/benbusby/whoogle-search.git whoogle-search` ### D) pip `pip install whoogle-search` From 481c5d179843123bcb256c7c5e32dc80d26da370 Mon Sep 17 00:00:00 2001 From: Dee-Jay Logozzo Date: Mon, 7 Sep 2020 
23:42:11 +1000 Subject: [PATCH 18/22] Added instructions for Android Firefox >=79.0.0 (#119) * Added instructions for Android Firefox >=79.0.0 Long pressing on the search bar and selecting "Add search engine" no longer works as of Android Firefox 79.0.0 * Update README.md * Corrected search strings to use backticks --- README.md | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 803e3d8..0475393 100644 --- a/README.md +++ b/README.md @@ -194,15 +194,23 @@ Update browser settings: - Firefox (iOS) - In the mobile app Settings page, tap "Search" within the "General" section. There should be an option titled "Add Search Engine" to select. It should prompt you to enter a title and search query url - use the following elements to fill out the form: - Title: "Whoogle" - - URL: "https://\/search?q=%s" + - URL: `http[s]://\/search?q=%s` - Firefox (Android) - - Navigate to your app's url - - Long-press on the search text field - - Click the "Add Search Engine" menu item - - Select a name and click ok - - Click the 3 dot menu in the top right - - Navigate to the settings menu and select the "search" sub-menu - - Select Whoogle and press "Set as default" + - Version <79.0.0 + - Navigate to your app's url + - Long-press on the search text field + - Click the "Add Search Engine" menu item + - Select a name and click ok + - Click the 3 dot menu in the top right + - Navigate to the settings menu and select the "Search" sub-menu + - Select Whoogle and press "Set as default" + - Version >=79.0.0 + - Click the 3 dot menu in the top right + - Navigate to the settings menu and select the "Search" sub-menu + - Click "Add search engine" + - Select the 'Other' radio button + - Name: "Whoogle" + - Search string to use: `https://\/search?q=%s` - [Alfred](https://www.alfredapp.com/) (Mac OS X) 1. Go to `Alfred Preferences` > `Features` > `Web Search` and click `Add Custom Search`. 
Then configure these settings - Search URL: `https://\/search?q={query} From 9afe5f81bdbd9638d99146450518eef1a6ed9921 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Mon, 14 Sep 2020 15:29:58 -0400 Subject: [PATCH 19/22] Updated dark theme (#121) * Implemented new dark theme Now uses a dedicated css file for all dark theme color changes, rather than replacing color codes directly. Color theme is from discussion in #60. * Minor link color update --- app/filter.py | 12 ---------- app/static/css/dark-theme.css | 42 +++++++++++++++++++++++++++++++++++ app/static/css/main.css | 2 +- app/templates/display.html | 3 +++ app/templates/index.html | 3 +++ 5 files changed, 49 insertions(+), 13 deletions(-) create mode 100644 app/static/css/dark-theme.css diff --git a/app/filter.py b/app/filter.py index 41a5cef..e56dc67 100644 --- a/app/filter.py +++ b/app/filter.py @@ -144,18 +144,6 @@ class Filter: except AttributeError: pass - # Set up dark mode if active - if self.dark: - soup.find('html')['style'] = 'scrollbar-color: #333 #111;color:#fff !important;background:#000 !important' - for input_element in soup.findAll('input'): - input_element['style'] = 'color:#fff;background:#000;' - - for span_element in soup.findAll('span'): - span_element['style'] = 'color: white;' - - for href_element in soup.findAll('a'): - href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else '' - def update_link(self, link): # Replace href with only the intended destination (no "utm" type tags) href = link['href'].replace('https://www.google.com', '') diff --git a/app/static/css/dark-theme.css b/app/static/css/dark-theme.css new file mode 100644 index 0000000..36cfada --- /dev/null +++ b/app/static/css/dark-theme.css @@ -0,0 +1,42 @@ +html { + background-color: #000 !important; +} + +body { + background-color: #222 !important; +} + +div { + /*background-color: #111 !important;*/ + color: #fff !important; +} + +a:visited h3 div { + color: #bbbbff !important; +} + +a:link 
h3 div { + color: #4b8eea !important; +} + +a:link div { + color: #aaffaa !important; +} + +div span { + color: #bbb !important; +} + +input { + background-color: #111 !important; + color: #fff !important; +} + +#search-bar { + color: #fff !important; + background-color: #000 !important; +} + +.search-container { + background-color: #000 !important; +} diff --git a/app/static/css/main.css b/app/static/css/main.css index 1fc9c3d..5b35bf6 100644 --- a/app/static/css/main.css +++ b/app/static/css/main.css @@ -46,7 +46,7 @@ body { width: 100%; height: 40px; border: 1px solid #685e79; - background: #685e79; + background: #685e79 !important; text-align: center; color: #fff; cursor: pointer; diff --git a/app/templates/display.html b/app/templates/display.html index bd18838..6a8a609 100644 --- a/app/templates/display.html +++ b/app/templates/display.html @@ -8,6 +8,9 @@ + {% if dark_mode %} + + {% endif %} {{ query }} - Whoogle Search diff --git a/app/templates/index.html b/app/templates/index.html index dd89e32..17193b6 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -23,6 +23,9 @@ + {% if config.dark %} + + {% endif %} Whoogle Search From 9a03b4111dae4ba5a3424a54ff284373d585b2dc Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Thu, 17 Sep 2020 18:59:37 -0400 Subject: [PATCH 20/22] Clarified country filter, updated invidious result URL (closes #123) Improves clarity of the meaning behind the "Country" filter -- Google seemingly uses this value to only return results that are hosted in a particular country, as evidenced in the search differences highlighted in #123. It now mentions that the results are filtered by website hosting location. Also, now that invidio.us is shut down, the fallback URL (invidiou.site) is now used instead. 
--- app/models/config.py | 2 +- app/templates/index.html | 3 ++- app/utils/filter_utils.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/app/models/config.py b/app/models/config.py index d261cd3..2dc0d2b 100644 --- a/app/models/config.py +++ b/app/models/config.py @@ -52,7 +52,7 @@ class Config: ] COUNTRIES = [ - {'name': 'Default (use server location)', 'value': ''}, + {'name': 'Default (none)', 'value': ''}, {'name': 'Afghanistan', 'value': 'countryAF'}, {'name': 'Albania', 'value': 'countryAL'}, {'name': 'Algeria', 'value': 'countryDZ'}, diff --git a/app/templates/index.html b/app/templates/index.html index 17193b6..02b9137 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -45,7 +45,7 @@
- + +
— Note: If enabled, a website will only appear in the results if it is *hosted* in the selected country.
diff --git a/app/utils/filter_utils.py b/app/utils/filter_utils.py index ed05d76..7f9e9a5 100644 --- a/app/utils/filter_utils.py +++ b/app/utils/filter_utils.py @@ -18,7 +18,7 @@ BLACKLIST = [ SITE_ALTS = { 'twitter.com': 'nitter.net', - 'youtube.com': 'invidio.us', + 'youtube.com': 'invidiou.site', 'instagram.com': 'bibliogram.art/u' } From 1f07e4e235576ef0695538ea56bcd6936fed773f Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Wed, 30 Sep 2020 10:13:55 -0400 Subject: [PATCH 21/22] Update issue template Removed the section concerning which parts of the project would need modification, since it's not always fair to expect someone to know that beforehand. --- .github/ISSUE_TEMPLATE/feature_request.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 24bf2f6..9da6d04 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -10,8 +10,5 @@ assignees: '' **Describe the feature you'd like to see added** A short description of the feature, and what it would accomplish. -**Describe which parts of the project this would modify (front end/back end/configuration/etc)** -A short description of which aspects of Whoogle Search would need modification - **Additional context** Add any other context or screenshots about the feature request here. From dfb1e81fa12e2cc9ccc75a193fd4afdcb79d01e9 Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Wed, 30 Sep 2020 10:26:27 -0400 Subject: [PATCH 22/22] Added search input auto focus, updated README The javascript controller has been updated to include a call to focus the cursor on the search field. This previously had only been seen on Firefox, and was assumed to be a weird FF-specific bug. Adding in a timeout to allow elements to finish loading allows the field to be focused as expected. Also updated the README to include clarification for IP address tracking. 
--- README.md | 4 +++- app/static/js/controller.js | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b8c7a6c..3e9c823 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Contents - No ads or sponsored content - No javascript - No cookies -- No tracking/linking of your personal IP address +- No tracking/linking of your personal IP address\* - No AMP links - No URL tracking tags (i.e. utm=%s) - No referrer header @@ -35,6 +35,8 @@ Contents - Optional location-based searching (i.e. results near \) - Optional NoJS mode to disable all Javascript in results +*If deployed to a remote server + ## Dependencies If using Heroku Quick Deploy, **you can skip this section**. diff --git a/app/static/js/controller.js b/app/static/js/controller.js index 1035ff9..156a84d 100644 --- a/app/static/js/controller.js +++ b/app/static/js/controller.js @@ -8,6 +8,7 @@ CONFIG_STRS = [ "near", "url" ]; + const setupSearchLayout = () => { // Setup search field const searchBar = document.getElementById("search-bar"); @@ -114,4 +115,8 @@ document.addEventListener("DOMContentLoaded", function() { setupSearchLayout(); setupConfigLayout(); + + // Focusing on the search input field requires a delay for elements to finish + // loading (seemingly only on FF) + setTimeout(function() { document.getElementById("search-bar").focus(); }, 250); });