From a4382d59f67a3f354c0bbb3fdffaf7792aba4f3f Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Sat, 16 May 2020 09:31:07 -0600 Subject: [PATCH 01/13] Updated redirect code used in https redirects See https://developer.mozilla.org/en-US/docs/Web/HTTP/Redirections 301 redirections do not keep the request method intact, and can occasionally be changed from POST to GET 308 redirections always keep the request method, which is necessary for all POST search requests --- app/routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/routes.py b/app/routes.py index 747b847..a6016b8 100644 --- a/app/routes.py +++ b/app/routes.py @@ -25,7 +25,7 @@ def before_request_func(): if https_only and request.url.startswith('http://'): url = request.url.replace('http://', 'https://', 1) - code = 301 + code = 308 return redirect(url, code=code) json_config = json.load(open(CONFIG_PATH)) if os.path.exists(CONFIG_PATH) else {'url': request.url_root} From 56bf976ecd4fdac4d8e275736c98a89d621956af Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Mon, 18 May 2020 10:17:21 -0600 Subject: [PATCH 02/13] Added question template --- .github/ISSUE_TEMPLATE/question.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/question.md diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..a1d9b21 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,10 @@ +--- +name: Question +about: Ask a (simple) question about Whoogle +title: "[QUESTION] " +labels: question +assignees: '' + +--- + +Type out your question here. Please make sure that this is a topic that isn't already covered in the README. From 0e39b8f97b8be72990e66b8db197045343c0db0d Mon Sep 17 00:00:00 2001 From: Paul Rothrock Date: Mon, 18 May 2020 12:28:23 -0400 Subject: [PATCH 03/13] Added "I'm feeling lucky" function (#46) * Putting '! 
' at the beginning of the query now redirects to the first search result Signed-off-by: Paul Rothrock * Moved get_first_url outside of filter class Signed-off-by: Paul Rothrock --- app/filter.py | 58 ++++++++++++++++++++++++++++----------------- app/routes.py | 19 ++++++++++++--- test/test_routes.py | 4 ++++ 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/app/filter.py b/app/filter.py index 0cda5d3..e2e8168 100644 --- a/app/filter.py +++ b/app/filter.py @@ -13,6 +13,40 @@ BLANK_B64 = ''' data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkwAIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC ''' +def get_first_link(soup): + # Replace hrefs with only the intended destination (no "utm" type tags) + for a in soup.find_all('a', href=True): + href = a['href'].replace('https://www.google.com', '') + + result_link = urlparse.urlparse(href) + query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' + + # Return the first search result URL + if 'url?q=' in href: + return filter_link_args(href) + +def filter_link_args(query_link): + parsed_link = urlparse.urlparse(query_link) + link_args = parse_qs(parsed_link.query) + safe_args = {} + + if len(link_args) == 0 and len(parsed_link) > 0: + return query_link + + for arg in link_args.keys(): + if arg in SKIP_ARGS: + continue + + safe_args[arg] = link_args[arg] + + # Remove original link query and replace with filtered args + query_link = query_link.replace(parsed_link.query, '') + if len(safe_args) > 0: + query_link = query_link + urlparse.urlencode(safe_args, doseq=True) + else: + query_link = query_link.replace('?', '') + + return query_link class Filter: def __init__(self, mobile=False, config=None, secret_key=''): @@ -149,27 +183,7 @@ class Filter: a['href'] = new_search elif 'url?q=' in href: # Strip unneeded arguments - parsed_link = urlparse.urlparse(query_link) - link_args = parse_qs(parsed_link.query) - safe_args = {} - - if len(link_args) == 0 and len(parsed_link) > 0: - 
a['href'] = query_link - continue - - for arg in link_args.keys(): - if arg in SKIP_ARGS: - continue - - safe_args[arg] = link_args[arg] - - # Remove original link query and replace with filtered args - query_link = query_link.replace(parsed_link.query, '') - if len(safe_args) > 0: - query_link = query_link + urlparse.urlencode(safe_args, doseq=True) - else: - query_link = query_link.replace('?', '') - + query_link = filter_link_args(query_link) a['href'] = query_link # Add no-js option @@ -185,4 +199,4 @@ def gen_nojs(soup, link, sibling): nojs_link['style'] = 'display:block;width:100%;' nojs_link.string = 'NoJS Link: ' + nojs_link['href'] sibling.append(BeautifulSoup('


', 'html.parser')) - sibling.append(nojs_link) + sibling.append(nojs_link) \ No newline at end of file diff --git a/app/routes.py b/app/routes.py index a6016b8..81791a0 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,5 +1,5 @@ from app import app -from app.filter import Filter +from app.filter import Filter, get_first_link from app.models.config import Config from app.request import Request, gen_query import argparse @@ -72,7 +72,7 @@ def opensearch(): def search(): request_params = request.args if request.method == 'GET' else request.form q = request_params.get('q') - + if q is None or len(q) == 0: return redirect('/') else: @@ -82,6 +82,11 @@ def search(): except InvalidToken: pass + feeling_lucky = q.startswith('! ') + + if feeling_lucky: # Well do you, punk? + q = q[2:] + user_agent = request.headers.get('User-Agent') mobile = 'Android' in user_agent or 'iPhone' in user_agent @@ -90,7 +95,15 @@ def search(): get_body = g.user_request.send(query=full_query) results = content_filter.reskin(get_body) - formatted_results = content_filter.clean(BeautifulSoup(results, 'html.parser')) + dirty_soup = BeautifulSoup(results, 'html.parser') + + if feeling_lucky: + redirect_url = get_first_link(dirty_soup) + return redirect(redirect_url, 303) # Using 303 so the browser performs a GET request for the URL + else: + formatted_results = content_filter.clean(dirty_soup) + + return render_template('display.html', query=urlparse.unquote(q), response=formatted_results) diff --git a/test/test_routes.py b/test/test_routes.py index 1124f5f..91e17be 100644 --- a/test/test_routes.py +++ b/test/test_routes.py @@ -17,6 +17,10 @@ def test_search(client): rv = client.get('/search?q=test') assert rv._status_code == 200 +def test_feeling_lucky(client): + rv = client.get('/search?q=!%20test') + assert rv._status_code == 303 + def test_config(client): rv = client.post('/config', data=demo_config) From 38b7b19e2adb66f50e665e04da472ed93b032ff9 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: 
Mon, 18 May 2020 10:30:32 -0600 Subject: [PATCH 04/13] Added basic authentication (#51) Username/password can be set either as Dockerfile build arguments or passed into the run script as "--userpass <username:password>" --- Dockerfile | 5 +++++ app/routes.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/Dockerfile b/Dockerfile index fd8c746..61f77b2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,11 @@ RUN mkdir -p $config_dir VOLUME $config_dir ENV CONFIG_VOLUME=$config_dir +ARG username='' +ENV WHOOGLE_USER=$username +ARG password='' +ENV WHOOGLE_PASS=$password + ARG use_https='' ENV HTTPS_ONLY=$use_https diff --git a/app/routes.py b/app/routes.py index 81791a0..7931a14 100644 --- a/app/routes.py +++ b/app/routes.py @@ -6,6 +6,7 @@ import argparse from bs4 import BeautifulSoup from cryptography.fernet import Fernet, InvalidToken from flask import g, make_response, request, redirect, render_template, send_file +from functools import wraps import io import json import os @@ -18,6 +19,21 @@ app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config CONFIG_PATH = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json' +def auth_required(f): + @wraps(f) + def decorated(*args, **kwargs): + auth = request.authorization + + # Skip if username/password not set + whoogle_user = os.getenv('WHOOGLE_USER', '') + whoogle_pass = os.getenv('WHOOGLE_PASS', '') + if (not whoogle_user or not whoogle_pass) or (auth and whoogle_user == auth.username and whoogle_pass == auth.password): + return f(*args, **kwargs) + else: + return make_response('Not logged in', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'}) + return decorated + + @app.before_request def before_request_func(): # Always redirect to https if HTTPS_ONLY is set (otherwise default to false) @@ -44,6 +60,7 @@ def unknown_page(e): @app.route('/', methods=['GET']) +@auth_required def index(): bg = '#000' if g.user_config.dark else '#fff' return 
render_template('index.html', @@ -55,6 +72,7 @@ def index(): @app.route('/opensearch.xml', methods=['GET']) +@auth_required def opensearch(): opensearch_url = g.app_location if opensearch_url.endswith('/'): @@ -69,6 +87,7 @@ def opensearch(): @app.route('/search', methods=['GET', 'POST']) +@auth_required def search(): request_params = request.args if request.method == 'GET' else request.form q = request_params.get('q') @@ -109,6 +128,7 @@ def search(): @app.route('/config', methods=['GET', 'POST']) +@auth_required def config(): if request.method == 'GET': return json.dumps(g.user_config.__dict__) @@ -125,6 +145,7 @@ def config(): @app.route('/url', methods=['GET']) +@auth_required def url(): if 'url' in request.args: return redirect(request.args.get('url')) @@ -137,11 +158,13 @@ def url(): @app.route('/imgres') +@auth_required def imgres(): return redirect(request.args.get('imgurl')) @app.route('/tmp') +@auth_required def tmp(): cipher_suite = Fernet(app.secret_key) img_url = cipher_suite.decrypt(request.args.get('image_url').encode()).decode() @@ -159,6 +182,7 @@ def tmp(): @app.route('/window') +@auth_required def window(): get_body = g.user_request.send(base_url=request.args.get('location')) get_body = get_body.replace('src="/', 'src="' + request.args.get('location') + '"') @@ -185,7 +209,15 @@ def run_app(): help='Activates debug mode for the server (default False)') parser.add_argument('--https-only', default=False, action='store_true', help='Enforces HTTPS redirects for all requests') + parser.add_argument('--userpass', default='', metavar='', + help='Sets a username/password basic auth combo (default None)') args = parser.parse_args() + + if args.userpass: + user_pass = args.userpass.split(':') + os.environ['WHOOGLE_USER'] = user_pass[0] + os.environ['WHOOGLE_PASS'] = user_pass[1] + os.environ['HTTPS_ONLY'] = '1' if args.https_only else '' if args.debug: From c51f1864194abc928135caaacccee0861bad57e4 Mon Sep 17 00:00:00 2001 From: Ben Busby 
<33362396+benbusby@users.noreply.github.com> Date: Wed, 20 May 2020 11:02:30 -0600 Subject: [PATCH 05/13] Added version footer, minor PEP 8 refactoring --- app/__init__.py | 4 ++++ app/filter.py | 21 +++++++++++---------- app/routes.py | 31 ++++++++++++------------------- app/static/css/main.css | 12 ++++++++++++ app/templates/index.html | 4 +++- 5 files changed, 42 insertions(+), 30 deletions(-) diff --git a/app/__init__.py b/app/__init__.py index 82b63a3..3e2309c 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -4,5 +4,9 @@ import os app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static') app.secret_key = Fernet.generate_key() +app.config['VERSION_NUMBER'] = '0.1.1' +app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) +app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) +app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json' from app import routes diff --git a/app/filter.py b/app/filter.py index e2e8168..5e5ec01 100644 --- a/app/filter.py +++ b/app/filter.py @@ -13,6 +13,7 @@ BLANK_B64 = ''' data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkwAIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC ''' + def get_first_link(soup): # Replace hrefs with only the intended destination (no "utm" type tags) for a in soup.find_all('a', href=True): @@ -25,6 +26,7 @@ def get_first_link(soup): if 'url?q=' in href: return filter_link_args(href) + def filter_link_args(query_link): parsed_link = urlparse.urlparse(query_link) link_args = parse_qs(parsed_link.query) @@ -48,6 +50,7 @@ def filter_link_args(query_link): return query_link + class Filter: def __init__(self, mobile=False, config=None, secret_key=''): if config is None: @@ -109,14 +112,13 @@ class Filter: img_src = img['src'] if img_src.startswith('//'): img_src = 'https:' + img_src + elif img_src.startswith(LOGO_URL): + # Re-brand 
with Whoogle logo + img['src'] = '/static/img/logo.png' + img['style'] = 'height:40px;width:162px' + continue elif img_src.startswith(GOOG_IMG): - # Special rebranding for image search results - if img_src.startswith(LOGO_URL): - img['src'] = '/static/img/logo.png' - img['style'] = 'height:40px;width:162px' - else: - img['src'] = BLANK_B64 - + img['src'] = BLANK_B64 continue enc_src = Fernet(self.secret_key).encrypt(img_src.encode()) @@ -183,12 +185,11 @@ class Filter: a['href'] = new_search elif 'url?q=' in href: # Strip unneeded arguments - query_link = filter_link_args(query_link) - a['href'] = query_link + a['href'] = filter_link_args(query_link) # Add no-js option if self.nojs: - gen_nojs(soup, query_link, a) + gen_nojs(soup, a['href'], a) else: a['href'] = href diff --git a/app/routes.py b/app/routes.py index 7931a14..a667bb8 100644 --- a/app/routes.py +++ b/app/routes.py @@ -13,11 +13,6 @@ import os import urllib.parse as urlparse import waitress -app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) -app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) - -CONFIG_PATH = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json' - def auth_required(f): @wraps(f) @@ -27,7 +22,8 @@ def auth_required(f): # Skip if username/password not set whoogle_user = os.getenv('WHOOGLE_USER', '') whoogle_pass = os.getenv('WHOOGLE_PASS', '') - if (not whoogle_user or not whoogle_pass) or (auth and whoogle_user == auth.username and whoogle_pass == auth.password): + if (not whoogle_user or not whoogle_pass) or \ + (auth and whoogle_user == auth.username and whoogle_pass == auth.password): return f(*args, **kwargs) else: return make_response('Not logged in', 401, {'WWW-Authenticate': 'Basic realm="Login Required"'}) @@ -38,13 +34,14 @@ def auth_required(f): def before_request_func(): # Always redirect to https if HTTPS_ONLY is set (otherwise default to false) https_only = 
os.getenv('HTTPS_ONLY', False) + config_path = app.config['CONFIG_PATH'] if https_only and request.url.startswith('http://'): - url = request.url.replace('http://', 'https://', 1) + https_url = request.url.replace('http://', 'https://', 1) code = 308 - return redirect(url, code=code) + return redirect(https_url, code=code) - json_config = json.load(open(CONFIG_PATH)) if os.path.exists(CONFIG_PATH) else {'url': request.url_root} + json_config = json.load(open(config_path)) if os.path.exists(config_path) else {'url': request.url_root} g.user_config = Config(**json_config) if not g.user_config.url: @@ -68,6 +65,7 @@ def index(): ua=g.user_request.modified_user_agent, languages=Config.LANGUAGES, current_lang=g.user_config.lang, + version_number=app.config['VERSION_NUMBER'], request_type='get' if g.user_config.get_only else 'post') @@ -91,7 +89,7 @@ def opensearch(): def search(): request_params = request.args if request.method == 'GET' else request.form q = request_params.get('q') - + if q is None or len(q) == 0: return redirect('/') else: @@ -103,7 +101,7 @@ def search(): feeling_lucky = q.startswith('! ') - if feeling_lucky: # Well do you, punk? + if feeling_lucky: # Well do you, punk? 
q = q[2:] user_agent = request.headers.get('User-Agent') @@ -112,18 +110,13 @@ def search(): content_filter = Filter(mobile, g.user_config, secret_key=app.secret_key) full_query = gen_query(q, request_params, content_filter.near, language=g.user_config.lang) get_body = g.user_request.send(query=full_query) - - results = content_filter.reskin(get_body) - dirty_soup = BeautifulSoup(results, 'html.parser') + dirty_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser') if feeling_lucky: - redirect_url = get_first_link(dirty_soup) - return redirect(redirect_url, 303) # Using 303 so the browser performs a GET request for the URL + return redirect(get_first_link(dirty_soup), 303) # Using 303 so the browser performs a GET request for the URL else: formatted_results = content_filter.clean(dirty_soup) - - return render_template('display.html', query=urlparse.unquote(q), response=formatted_results) @@ -137,7 +130,7 @@ def config(): if 'url' not in config_data or not config_data['url']: config_data['url'] = g.user_config.url - with open(CONFIG_PATH, 'w') as config_file: + with open(app.config['CONFIG_PATH'], 'w') as config_file: config_file.write(json.dumps(config_data, indent=4)) config_file.close() diff --git a/app/static/css/main.css b/app/static/css/main.css index f482373..ef4b557 100644 --- a/app/static/css/main.css +++ b/app/static/css/main.css @@ -1,3 +1,7 @@ +body { + font-family: Avenir, Helvetica, Arial, sans-serif; +} + .logo { width: 80%; display: block; @@ -117,3 +121,11 @@ button::-moz-focus-inner { .hidden { display: none; } + +footer { + position: fixed; + bottom: 0%; + text-align: center; + width: 100%; + z-index: -1; +} diff --git a/app/templates/index.html b/app/templates/index.html index 33f2153..4747dba 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -85,6 +85,8 @@ + - From b15368ac283114247d1264de295e7469e068a47d Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Wed, 20 May 
2020 11:07:01 -0600 Subject: [PATCH 06/13] Updated recent results test w/ +5 day tolerance --- test/test_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_results.py b/test/test_results.py index 6340f32..abf3dcd 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -62,6 +62,6 @@ def test_recent_results(client): try: date = parse(date_span) - assert (current_date - date).days <= num_days + assert (current_date - date).days <= (num_days + 5) # Date can have a little bit of wiggle room except ParserError: assert ' ago' in date_span From d2b60544c5edd30052d000dcfb1ba2d458e886d6 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Wed, 20 May 2020 11:11:07 -0600 Subject: [PATCH 07/13] Update README.md Added instructions for setting default search engine while using a reverse proxy --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 88d38a1..2e09dfa 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,8 @@ To filter by a range of time, append ":past