From dd9d87d25bacc48e0287650c46cea02b71ace81d Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Fri, 26 Jun 2020 00:26:02 +0200 Subject: [PATCH 01/40] Added ddg-style !bang-operators This is a proof of concept! The code works, but uses hardcoded operators and may be placed in the wrong file/class. The best-case scenario would be the possibility to use the 13.000+ ddg operators, but I don't know if that's possible without having to redirect to duckduckgo first. --- app/routes.py | 4 ++++ app/utils/routing_utils.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/app/routes.py b/app/routes.py index ed288c0..7a58e06 100644 --- a/app/routes.py +++ b/app/routes.py @@ -144,6 +144,10 @@ def search(): search_util = RoutingUtils(request, g.user_config, session, cookies_disabled=g.cookies_disabled) query = search_util.new_search_query() + resolved_bangs = search_util.bang_operator() + if resolved_bangs != '': + return redirect(resolved_bangs) + # Redirect to home if invalid/blank search if not query: return redirect('/') diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py index 40f8a90..deb0444 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -17,6 +17,14 @@ class RoutingUtils: self.query = '' self.cookies_disabled = cookies_disabled self.search_type = self.request_params.get('tbm') if 'tbm' in self.request_params else '' + self.bang_operators = { + '!gh': 'https://github.com/search?q=', + '!ddg': 'https://duckduckgo.com/?q=', + '!w': 'https://wikipedia.com/wiki/', + '!so': 'https://stackoverflow.com/search?q=', + '!a': 'https://amazon.com/s?k=', + '!ebay': 'https://ebay.com/sch/i.html?_nkw=', + } def __getitem__(self, name): return getattr(self, name) @@ -55,6 +63,15 @@ class RoutingUtils: self.query = q[2:] if self.feeling_lucky else q return self.query + + def bang_operator(self) -> str: + print(self.query) + for operator, url in self.bang_operators.items(): + if operator in self.query: + return url + 
self.query.replace(operator, '').strip() + return '' + + def generate_response(self) -> Tuple[Any, int]: mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent From 348301f201651c7ebc04db2cfbee8f9ebaa5c1ea Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Sun, 28 Jun 2020 12:03:23 +0200 Subject: [PATCH 02/40] Added bang operator list generator This is, again, just a proof of concept. --- gen_ops.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 gen_ops.py diff --git a/gen_ops.py b/gen_ops.py new file mode 100644 index 0000000..6ecedb6 --- /dev/null +++ b/gen_ops.py @@ -0,0 +1,19 @@ +import csv, json, sys +import requests +import collections + +# Request list +try: + r = requests.get('https://duckduckgo.com/bang.v255.js') + r.raise_for_status() +except requests.exceptions.HTTPError as err: + raise SystemExit(err) + +# Convert to json +data = json.loads(r.text) + +# Output CSV +output = csv.writer(sys.stdout) +output.writerow(['tag', 'url', 'domain', 'name']) +for row in data: + output.writerow([row['t'], row['u'], row['d'], row['s']]) From 558e3e15149efa7bdf3b21c100ec629b20df049e Mon Sep 17 00:00:00 2001 From: curlpipe <11898833+curlpipe@users.noreply.github.com> Date: Sun, 4 Oct 2020 17:53:37 +0000 Subject: [PATCH 03/40] Fixed annoying browser autocomplete (#128) --- app/templates/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/templates/index.html b/app/templates/index.html index 02b9137..4980316 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -34,7 +34,7 @@
- +
From b01b6d8c6968f2439e650ef000d5613e5a778186 Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Sun, 4 Oct 2020 14:11:44 -0400 Subject: [PATCH 04/40] Minor change to wording of language config --- app/models/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/config.py b/app/models/config.py index 2dc0d2b..2fb4088 100644 --- a/app/models/config.py +++ b/app/models/config.py @@ -2,7 +2,7 @@ class Config: # Derived from here: # https://sites.google.com/site/tomihasa/google-language-codes#searchlanguage LANGUAGES = [ - {'name': 'Default (use server location)', 'value': ''}, + {'name': 'Default (none specified)', 'value': ''}, {'name': 'English', 'value': 'lang_en'}, {'name': 'Afrikaans', 'value': 'lang_af'}, {'name': 'Arabic', 'value': 'lang_ar'}, From ae05e8ff8b4100f32990ef4bf6d918046ac15e9e Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Sat, 10 Oct 2020 15:55:14 -0400 Subject: [PATCH 05/40] Finished basic implementation of DDG bang feature Initialization of the app now includes generation of a ddg-bang json file, which is used for all bang style searches afterwards. Also added search suggestion handling for bang json lookup. Queries beginning with "!" now reference the bang json file to pull all keys that match. Updated test suite to include basic tests for bang functionality. Updated gitignore to exclude bang subdir. 
--- .gitignore | 1 + app/__init__.py | 8 ++++++++ app/routes.py | 9 ++++++++- app/static/js/autocomplete.js | 10 ++++++++-- app/utils/gen_ddg_bangs.py | 26 ++++++++++++++++++++++++++ app/utils/routing_utils.py | 19 ++++--------------- gen_ops.py | 19 ------------------- test/test_routes.py | 10 ++++++++++ 8 files changed, 65 insertions(+), 37 deletions(-) create mode 100644 app/utils/gen_ddg_bangs.py delete mode 100644 gen_ops.py diff --git a/.gitignore b/.gitignore index bbffdb4..caa4595 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ test/static flask_session/ app/static/config app/static/custom_config +app/static/bangs # pip stuff build/ diff --git a/app/__init__.py b/app/__init__.py index 8293c44..820edb0 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,4 +1,5 @@ from app.utils.session_utils import generate_user_keys +from app.utils.gen_ddg_bangs import gen_bangs_json from flask import Flask from flask_session import Session import os @@ -15,6 +16,8 @@ app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', os.path.join(app.config['STATIC_FOLDER'], 'config')) app.config['DEFAULT_CONFIG'] = os.path.join(app.config['CONFIG_PATH'], 'config.json') app.config['SESSION_FILE_DIR'] = os.path.join(app.config['CONFIG_PATH'], 'session') +app.config['BANG_PATH'] = os.getenv('CONFIG_VOLUME', os.path.join(app.config['STATIC_FOLDER'], 'bangs')) +app.config['BANG_FILE'] = os.path.join(app.config['BANG_PATH'], 'bangs.json') if not os.path.exists(app.config['CONFIG_PATH']): os.makedirs(app.config['CONFIG_PATH']) @@ -22,6 +25,11 @@ if not os.path.exists(app.config['CONFIG_PATH']): if not os.path.exists(app.config['SESSION_FILE_DIR']): os.makedirs(app.config['SESSION_FILE_DIR']) +# (Re)generate DDG bang filter, and create path if it doesn't exist yet +if not os.path.exists(app.config['BANG_PATH']): + os.makedirs(app.config['BANG_PATH']) +gen_bangs_json(app.config['BANG_FILE']) + 
Session(app) from app import routes diff --git a/app/routes.py b/app/routes.py index 198dc4f..da4be87 100644 --- a/app/routes.py +++ b/app/routes.py @@ -18,6 +18,9 @@ from app.request import Request from app.utils.session_utils import valid_user_session from app.utils.routing_utils import * +# Load DDG bang json files only on init +bang_json = json.load(open(app.config['BANG_FILE'])) + def auth_required(f): @wraps(f) @@ -126,6 +129,10 @@ def opensearch(): def autocomplete(): q = g.request_params.get('q') + # Search bangs if the query begins with "!", but not "! " (feeling lucky) + if q.startswith('!') and len(q) > 1 and not q.startswith('! '): + return jsonify([q, [bang_json[_]['suggestion'] for _ in bang_json if _.startswith(q)]]) + if not q and not request.data: return jsonify({'?': []}) elif request.data: @@ -143,7 +150,7 @@ def search(): search_util = RoutingUtils(request, g.user_config, session, cookies_disabled=g.cookies_disabled) query = search_util.new_search_query() - resolved_bangs = search_util.bang_operator() + resolved_bangs = search_util.bang_operator(bang_json) if resolved_bangs != '': return redirect(resolved_bangs) diff --git a/app/static/js/autocomplete.js b/app/static/js/autocomplete.js index 3d179ca..b8f8bf6 100644 --- a/app/static/js/autocomplete.js +++ b/app/static/js/autocomplete.js @@ -93,8 +93,14 @@ const autocomplete = (searchInput, autocompleteResults) => { removeActive(suggestion); suggestion[currentFocus].classList.add("autocomplete-active"); - // Autofill search bar with suggestion content - searchBar.value = suggestion[currentFocus].textContent; + // Autofill search bar with suggestion content (minus the "bang name" if using a bang operator) + let searchContent = suggestion[currentFocus].textContent; + if (searchContent.indexOf('(') > 0) { + searchBar.value = searchContent.substring(0, searchContent.indexOf('(')); + } else { + searchBar.value = searchContent; + } + searchBar.focus(); }; diff --git a/app/utils/gen_ddg_bangs.py 
b/app/utils/gen_ddg_bangs.py new file mode 100644 index 0000000..0ed3953 --- /dev/null +++ b/app/utils/gen_ddg_bangs.py @@ -0,0 +1,26 @@ +import json +import requests + + +def gen_bangs_json(bangs_file): + # Request list + try: + r = requests.get('https://duckduckgo.com/bang.v255.js') + r.raise_for_status() + except requests.exceptions.HTTPError as err: + raise SystemExit(err) + + # Convert to json + data = json.loads(r.text) + + # Set up a json object (with better formatting) for all available bangs + bangs_data = {} + + for row in data: + bang_command = '!' + row['t'] + bangs_data[bang_command] = { + 'url': row['u'].replace('{{{s}}}', '{}'), + 'suggestion': bang_command + ' (' + row['s'] + ')' + } + + json.dump(bangs_data, open(bangs_file, 'w')) diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py index 3dac09a..c6c960b 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -17,14 +17,6 @@ class RoutingUtils: self.query = '' self.cookies_disabled = cookies_disabled self.search_type = self.request_params.get('tbm') if 'tbm' in self.request_params else '' - self.bang_operators = { - '!gh': 'https://github.com/search?q=', - '!ddg': 'https://duckduckgo.com/?q=', - '!w': 'https://wikipedia.com/wiki/', - '!so': 'https://stackoverflow.com/search?q=', - '!a': 'https://amazon.com/s?k=', - '!ebay': 'https://ebay.com/sch/i.html?_nkw=', - } def __getitem__(self, name): return getattr(self, name) @@ -63,15 +55,12 @@ class RoutingUtils: self.query = q[2:] if self.feeling_lucky else q return self.query - - def bang_operator(self) -> str: - print(self.query) - for operator, url in self.bang_operators.items(): - if operator in self.query: - return url + self.query.replace(operator, '').strip() + def bang_operator(self, bangs_dict: dict) -> str: + for operator in bangs_dict.keys(): + if self.query.split(' ')[0] == operator: + return bangs_dict[operator]['url'].format(self.query.replace(operator, '').strip()) return '' - def 
generate_response(self) -> Tuple[Any, int]: mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent diff --git a/gen_ops.py b/gen_ops.py deleted file mode 100644 index 6ecedb6..0000000 --- a/gen_ops.py +++ /dev/null @@ -1,19 +0,0 @@ -import csv, json, sys -import requests -import collections - -# Request list -try: - r = requests.get('https://duckduckgo.com/bang.v255.js') - r.raise_for_status() -except requests.exceptions.HTTPError as err: - raise SystemExit(err) - -# Convert to json -data = json.loads(r.text) - -# Output CSV -output = csv.writer(sys.stdout) -output.writerow(['tag', 'url', 'domain', 'name']) -for row in data: - output.writerow([row['t'], row['u'], row['d'], row['s']]) diff --git a/test/test_routes.py b/test/test_routes.py index 3d08f0a..995e3c7 100644 --- a/test/test_routes.py +++ b/test/test_routes.py @@ -27,6 +27,16 @@ def test_feeling_lucky(client): assert rv._status_code == 303 +def test_ddg_bang(client): + rv = client.get('/search?q=!gh%20whoogle') + assert rv._status_code == 302 + assert rv.headers.get('Location').startswith('https://github.com') + + rv = client.get('/search?q=!w%20github') + assert rv._status_code == 302 + assert rv.headers.get('Location').startswith('https://en.wikipedia.org') + + def test_config(client): rv = client.post('/config', data=demo_config) assert rv._status_code == 302 From 58a7868d255049b5429ae2ec7d887c44e796297a Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Wed, 21 Oct 2020 13:16:32 -0400 Subject: [PATCH 06/40] Update README.md Added DDG-style bang searches to feature list in the readme, removed Gitter chat badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3e9c823..2547daf 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ [![Build Status](https://travis-ci.com/benbusby/whoogle-search.svg?branch=master)](https://travis-ci.com/benbusby/whoogle-search) [![codebeat 
badge](https://codebeat.co/badges/e96cada2-fb6f-4528-8285-7d72abd74e8d)](https://codebeat.co/projects/github-com-benbusby-shoogle-master) [![Docker Pulls](https://img.shields.io/docker/pulls/benbusby/whoogle-search)](https://hub.docker.com/r/benbusby/whoogle-search) -[![Gitter](https://img.shields.io/gitter/room/benbusby/whoogle-search)](https://gitter.im/whoogle-search/community) Get Google search results, but without any ads, javascript, AMP links, cookies, or IP address tracking. Easily deployable in one click as a Docker app, and customizable with a single config file. Quick and simple to implement as a primary search engine replacement on both desktop and mobile. @@ -32,6 +31,7 @@ Contents - Dark mode - Randomly generated User Agent - Easy to install/deploy +- DDG-style bang (i.e. `!<tag> <query>`) searches - Optional location-based searching (i.e. results near \<city\>) - Optional NoJS mode to disable all Javascript in results From f3bb1e22b476e794b3dc320c37a18adfd724dfef Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Sun, 25 Oct 2020 13:52:30 -0400 Subject: [PATCH 07/40] Fix improper header styling, remove shopping tab links The header template was using Google's classes for the "Whoogle" logo, which meant keeping up with their list of colors used in the logo. The template was updated to only ever use the Whoogle logo color. Accordingly, the logo specific styling in filter.py was removed, since it is no longer needed. Also removes all links to the shopping tab, as it seems that the majority of the links to items are Google specific links (usually google.com/aclk links without any discernible param for determining the true location for the link). The shopping page should be addressed separately with unique filtering/formatting. Further tracking of this task will be followed in #136. 
--- app/filter.py | 8 ++++---- app/templates/header.html | 8 ++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/app/filter.py b/app/filter.py index e56dc67..71ac763 100644 --- a/app/filter.py +++ b/app/filter.py @@ -31,9 +31,6 @@ class Filter: def reskin(self, page): # Aesthetic only re-skinning - page = page.replace('>G<', '>Wh<') - pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE) - page = pattern.sub('685e79', page) if self.dark: page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea') @@ -56,6 +53,7 @@ class Filter: self.fix_question_section() self.update_styling(soup) + for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]: self.update_element_src(img, 'image/png') @@ -147,7 +145,9 @@ class Filter: def update_link(self, link): # Replace href with only the intended destination (no "utm" type tags) href = link['href'].replace('https://www.google.com', '') - if '/advanced_search' in href: + if '/advanced_search' in href or 'tbm=shop' in href: + # TODO: The "Shopping" tab requires further filtering (see #136) + # Temporarily removing all links to that tab for now. link.decompose() return elif self.new_tab: diff --git a/app/templates/header.html b/app/templates/header.html index 5573b99..113cfa4 100644 --- a/app/templates/header.html +++ b/app/templates/header.html @@ -5,9 +5,7 @@
@@ -27,9 +25,7 @@
From 83433432ec5d0bbf6a1d39d68d0f27e630910cf3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 28 Oct 2020 10:57:47 -0400 Subject: [PATCH 08/40] Bump cryptography from 2.8 to 3.2 (#138) Bumps [cryptography](https://github.com/pyca/cryptography) from 2.8 to 3.2. - [Release notes](https://github.com/pyca/cryptography/releases) - [Changelog](https://github.com/pyca/cryptography/blob/master/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/2.8...3.2) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 702d8ba..ba00e00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ beautifulsoup4==4.8.2 bs4==0.0.1 cffi==1.13.2 Click==7.0 -cryptography==2.8 +cryptography==3.2 Flask==1.1.1 Flask-Session==0.3.2 itsdangerous==1.1.0 From 0ef098069e0592b62924196582359357b024b992 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Wed, 28 Oct 2020 20:47:42 -0400 Subject: [PATCH 09/40] Add tor and http/socks proxy support (#137) * Add tor and http/socks proxy support Allows users to enable/disable tor from the config menu, which will forward all requests through Tor. Also adds support for setting environment variables for alternative proxy support. Setting the following variables will forward requests through the proxy: - WHOOGLE_PROXY_USER (optional) - WHOOGLE_PROXY_PASS (optional) - WHOOGLE_PROXY_TYPE (required) - Can be "http", "socks4", or "socks5" - WHOOGLE_PROXY_LOC (required) - Format: ":" See #30 * Refactor acquire_tor_conn -> acquire_tor_identity Also updated travis CI to set up tor * Add check for Tor socket on init, improve Tor error handling Initializing the app sends a heartbeat request to Tor to check for availability, and updates the home page config options accordingly. 
This heartbeat is sent on every request, to ensure Tor support can be reconfigured without restarting the entire app. If Tor support is enabled, and a subsequent request fails, then a new TorError exception is raised, and the Tor feature is disabled until a valid connection is restored. The max attempts has been updated to 10, since 5 seemed a bit too low for how quickly the attempts go by. * Change send_tor_signal arg type, update function doc send_tor_signal now accepts a stem.Signal arg (a bit cleaner tbh). Also added the doc string for the "disable" attribute in TorError. * Fix tor identity logic in Request.send * Update proxy init, change proxyloc var name Proxy is now only initialized if both type and location are specified, as neither have a default fallback and both are required. I suppose the type could fall back to http, but seems safer this way. Also refactored proxyurl -> proxyloc for the runtime args in order to match the Dockerfile args. * Add tor/proxy support for Docker builds, fix opensearch/init The Dockerfile is now updated to include support for Tor configuration, with a working torrc file included in the repo. An issue with opensearch was fixed as well, which was uncovered during testing and was simple enough to fix here. Likewise, DDG bang gen was updated to only ever happen if the file didn't exist previously, as testing with the file being regenerated every time was tedious. 
* Add missing "@" for socks proxy requests --- Dockerfile | 19 +++++- README.md | 3 +- app/__init__.py | 10 ++- app/models/config.py | 1 + app/request.py | 126 ++++++++++++++++++++++++++++++++--- app/routes.py | 44 ++++++++++-- app/static/js/controller.js | 2 +- app/templates/error.html | 1 + app/templates/index.html | 10 +++ app/templates/opensearch.xml | 8 +-- app/utils/routing_utils.py | 10 ++- rc/torrc | 8 +++ requirements.txt | 17 ++++- 13 files changed, 228 insertions(+), 31 deletions(-) create mode 100644 rc/torrc diff --git a/Dockerfile b/Dockerfile index 61f77b2..0882bad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,13 @@ FROM python:3.8-slim WORKDIR /usr/src/app -RUN apt-get update && apt-get install -y build-essential libcurl4-openssl-dev libssl-dev +RUN apt-get update && apt-get install -y \ + build-essential \ + libcurl4-openssl-dev \ + libssl-dev \ + tor + +COPY rc/torrc /etc/tor/torrc COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt @@ -15,6 +21,15 @@ ENV WHOOGLE_USER=$username ARG password='' ENV WHOOGLE_PASS=$password +ARG proxyuser='' +ENV WHOOGLE_PROXY_USER=$proxyuser +ARG proxypass='' +ENV WHOOGLE_PROXY_PASS=$proxypass +ARG proxytype='' +ENV WHOOGLE_PROXY_TYPE=$proxytype +ARG proxyloc='' +ENV WHOOGLE_PROXY_LOC=$proxyloc + ARG use_https='' ENV HTTPS_ONLY=$use_https @@ -25,4 +40,4 @@ COPY . . EXPOSE $EXPOSE_PORT -CMD ["./run"] +CMD service tor start && ./run diff --git a/README.md b/README.md index 2547daf..699d613 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Contents - No AMP links - No URL tracking tags (i.e. utm=%s) - No referrer header +- Tor and HTTP/SOCKS proxy support - Autocomplete/search suggestions - POST request search and suggestion queries (when possible) - View images at full res without site redirect (currently mobile only) @@ -35,7 +36,7 @@ Contents - Optional location-based searching (i.e. 
results near \) - Optional NoJS mode to disable all Javascript in results -*If deployed to a remote server +*If deployed to a remote server, or configured to send requests through a VPN, Tor, proxy, etc. ## Dependencies If using Heroku Quick Deploy, **you can skip this section**. diff --git a/app/__init__.py b/app/__init__.py index 820edb0..a349acc 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,8 +1,10 @@ +from app.request import send_tor_signal from app.utils.session_utils import generate_user_keys from app.utils.gen_ddg_bangs import gen_bangs_json from flask import Flask from flask_session import Session import os +from stem import Signal app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static') app.user_elements = {} @@ -25,11 +27,15 @@ if not os.path.exists(app.config['CONFIG_PATH']): if not os.path.exists(app.config['SESSION_FILE_DIR']): os.makedirs(app.config['SESSION_FILE_DIR']) -# (Re)generate DDG bang filter, and create path if it doesn't exist yet +# Generate DDG bang filter, and create path if it doesn't exist yet if not os.path.exists(app.config['BANG_PATH']): os.makedirs(app.config['BANG_PATH']) -gen_bangs_json(app.config['BANG_FILE']) +if not os.path.exists(app.config['BANG_FILE']): + gen_bangs_json(app.config['BANG_FILE']) Session(app) +# Attempt to acquire tor identity, to determine if Tor config is available +send_tor_signal(Signal.HEARTBEAT) + from app import routes diff --git a/app/models/config.py b/app/models/config.py index 2fb4088..6cfe18f 100644 --- a/app/models/config.py +++ b/app/models/config.py @@ -305,6 +305,7 @@ class Config: self.safe = False self.dark = False self.nojs = False + self.tor = False self.near = '' self.alts = False self.new_tab = False diff --git a/app/request.py b/app/request.py index 4abb9b3..04ae3db 100644 --- a/app/request.py +++ b/app/request.py @@ -1,8 +1,12 @@ +from app.models.config import Config from lxml import etree import random import requests -from requests import 
Response +from requests import Response, ConnectionError import urllib.parse as urlparse +import os +from stem import Signal, SocketError +from stem.control import Controller # Core Google search URLs SEARCH_URL = 'https://www.google.com/search?gbv=1&q=' @@ -15,7 +19,36 @@ DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr'] -def gen_user_agent(is_mobile): +class TorError(Exception): + """Exception raised for errors in Tor requests. + + Attributes: + message -- a message describing the error that occurred + disable -- optionally disables Tor in the user config (note: + this should only happen if the connection has been dropped + altogether). + """ + + def __init__(self, message, disable=False): + self.message = message + self.disable = disable + super().__init__(self.message) + + +def send_tor_signal(signal: Signal) -> bool: + try: + with Controller.from_port(port=9051) as c: + c.authenticate() + c.signal(signal) + os.environ['TOR_AVAILABLE'] = '1' + return True + except (SocketError, ConnectionRefusedError, ConnectionError): + os.environ['TOR_AVAILABLE'] = '0' + + return False + + +def gen_user_agent(is_mobile) -> str: mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla' firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox' linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux' @@ -26,7 +59,7 @@ def gen_user_agent(is_mobile): return DESKTOP_UA.format(mozilla, linux, firefox) -def gen_query(query, args, config, near_city=None): +def gen_query(query, args, config, near_city=None) -> str: param_dict = {key: '' for key in VALID_PARAMS} # Use :past(hour/day/week/month/year) if available @@ -85,15 +118,56 @@ def gen_query(query, args, config, near_city=None): class Request: - def __init__(self, normal_ua, language='lang_en'): - self.language = language + """Class used for handling all outbound requests, including search queries, + search 
suggestions, and loading of external content (images, audio, etc). + + Attributes: + normal_ua -- the user's current user agent + root_path -- the root path of the whoogle instance + config -- the user's current whoogle configuration + """ + def __init__(self, normal_ua, root_path, config: Config): + # Send heartbeat to Tor, used in determining if the user can or cannot + # enable Tor for future requests + send_tor_signal(Signal.HEARTBEAT) + + self.language = config.lang_search self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua self.modified_user_agent = gen_user_agent(self.mobile) + # Set up proxy, if previously configured + if os.environ.get('WHOOGLE_PROXY_LOC'): + auth_str = '' + if os.environ.get('WHOOGLE_PROXY_USER'): + auth_str = os.environ.get('WHOOGLE_PROXY_USER') + \ + ':' + os.environ.get('WHOOGLE_PROXY_PASS') + self.proxies = { + 'http': os.environ.get('WHOOGLE_PROXY_TYPE') + '://' + + auth_str + '@' + os.environ.get('WHOOGLE_PROXY_LOC'), + } + self.proxies['https'] = self.proxies['http'].replace('http', 'https') + else: + self.proxies = { + 'http': 'socks5://127.0.0.1:9050', + 'https': 'socks5://127.0.0.1:9050' + } if config.tor else {} + self.tor = config.tor + self.tor_valid = False + self.root_path = root_path + def __getitem__(self, name): return getattr(self, name) - def autocomplete(self, query): + def autocomplete(self, query) -> list: + """Sends a query to Google's search suggestion service + + Args: + query: The in-progress query to send + + Returns: + list: The list of matches for possible search suggestions + + """ ac_query = dict(hl=self.language, q=query) response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)).text @@ -103,9 +177,45 @@ class Request: return [] - def send(self, base_url=SEARCH_URL, query='') -> Response: + def send(self, base_url=SEARCH_URL, query='', attempt=0) -> Response: + """Sends an outbound request to a URL. Optionally sends the request using Tor, if + enabled by the user. 
+ + Args: + base_url: The URL to use in the request + query: The optional query string for the request + attempt: The number of attempts made for the request (used for cycling + through Tor identities, if enabled) + + Returns: + Response: The Response object returned by the requests call + + """ headers = { 'User-Agent': self.modified_user_agent } - return requests.get(base_url + query, headers=headers) + # Validate Tor connection and request new identity if the last one failed + if self.tor and not send_tor_signal(Signal.NEWNYM if attempt > 0 else Signal.HEARTBEAT): + raise TorError("Tor was previously enabled, but the connection has been dropped. Please check your " + + "Tor configuration and try again.", disable=True) + + # Make sure that the tor connection is valid, if enabled + if self.tor: + tor_check = requests.get('https://check.torproject.org/', proxies=self.proxies, headers=headers) + self.tor_valid = 'Congratulations' in tor_check.text + + if not self.tor_valid: + raise TorError("Tor connection succeeded, but the connection could not be validated by torproject.org", + disable=True) + + response = requests.get(base_url + query, proxies=self.proxies, headers=headers) + + # Retry query with new identity if using Tor (max 10 attempts) + if 'form id="captcha-form"' in response.text and self.tor: + attempt += 1 + if attempt > 10: + raise TorError("Tor query failed -- max attempts exceeded 10") + return self.send(base_url, query, attempt) + + return response diff --git a/app/routes.py b/app/routes.py index da4be87..3916c23 100644 --- a/app/routes.py +++ b/app/routes.py @@ -9,12 +9,12 @@ import uuid from functools import wraps import waitress -from flask import jsonify, make_response, request, redirect, render_template, send_file, session +from flask import jsonify, make_response, request, redirect, render_template, send_file, session, url_for from requests import exceptions from app import app from app.models.config import Config -from app.request import 
Request +from app.request import Request, TorError from app.utils.session_utils import valid_user_session from app.utils.routing_utils import * @@ -62,13 +62,17 @@ def before_request_func(): if https_only and request.url.startswith('http://'): return redirect(request.url.replace('http://', 'https://', 1), code=308) - + g.user_config = Config(**session['config']) if not g.user_config.url: g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root - g.user_request = Request(request.headers.get('User-Agent'), language=g.user_config.lang_search) + g.user_request = Request( + request.headers.get('User-Agent'), + request.url_root, + config=g.user_config) + g.app_location = g.user_config.url @@ -103,11 +107,15 @@ def unknown_page(e): def index(): # Reset keys session['fernet_keys'] = generate_user_keys(g.cookies_disabled) + error_message = session['error_message'] if 'error_message' in session else '' + session['error_message'] = '' return render_template('index.html', languages=Config.LANGUAGES, countries=Config.COUNTRIES, config=g.user_config, + error_message=error_message, + tor_available=int(os.environ.get('TOR_AVAILABLE')), version_number=app.config['VERSION_NUMBER']) @@ -138,7 +146,9 @@ def autocomplete(): elif request.data: q = urlparse.unquote_plus(request.data.decode('utf-8').replace('q=', '')) - return jsonify([q, g.user_request.autocomplete(q)]) + # Return a list of suggestions for the query + # Note: If Tor is enabled, this returns nothing, as the request is almost always rejected + return jsonify([q, g.user_request.autocomplete(q) if not g.user_config.tor else []]) @app.route('/search', methods=['GET', 'POST']) @@ -159,8 +169,14 @@ def search(): return redirect('/') # Generate response and number of external elements from the page - response, elements = search_util.generate_response() - if search_util.feeling_lucky: + try: + response, elements = search_util.generate_response() + except TorError as e: + 
session['error_message'] = e.message + ("\\n\\nTor config is now disabled!" if e.disable else "") + session['config']['tor'] = False if e.disable else session['config']['tor'] + return redirect(url_for('.index')) + + if search_util.feeling_lucky or elements < 0: return redirect(response, code=303) # Keep count of external elements to fetch before element key can be regenerated @@ -281,6 +297,12 @@ def run_app(): help='Enforces HTTPS redirects for all requests') parser.add_argument('--userpass', default='', metavar='', help='Sets a username/password basic auth combo (default None)') + parser.add_argument('--proxyauth', default='', metavar='', + help='Sets a username/password for a HTTP/SOCKS proxy (default None)') + parser.add_argument('--proxytype', default='', metavar='', + help='Sets a proxy type for all connections (default None)') + parser.add_argument('--proxyloc', default='', metavar='', + help='Sets a proxy location for all connections (default None)') args = parser.parse_args() if args.userpass: @@ -288,6 +310,14 @@ def run_app(): os.environ['WHOOGLE_USER'] = user_pass[0] os.environ['WHOOGLE_PASS'] = user_pass[1] + if args.proxytype and args.proxyloc: + if args.proxyauth: + proxy_user_pass = args.proxyauth.split(':') + os.environ['WHOOGLE_PROXY_USER'] = proxy_user_pass[0] + os.environ['WHOOGLE_PROXY_PASS'] = proxy_user_pass[1] + os.environ['WHOOGLE_PROXY_TYPE'] = args.proxytype + os.environ['WHOOGLE_PROXY_LOC'] = args.proxyloc + os.environ['HTTPS_ONLY'] = '1' if args.https_only else '' if args.debug: diff --git a/app/static/js/controller.js b/app/static/js/controller.js index 156a84d..8775122 100644 --- a/app/static/js/controller.js +++ b/app/static/js/controller.js @@ -1,6 +1,6 @@ // Whoogle configurations that use boolean values and checkboxes CONFIG_BOOLS = [ - "nojs", "dark", "safe", "alts", "new_tab", "get_only" + "nojs", "dark", "safe", "alts", "new_tab", "get_only", "tor" ]; // Whoogle configurations that use string values and input fields diff --git 
a/app/templates/error.html b/app/templates/error.html index 003623d..9546e23 100644 --- a/app/templates/error.html +++ b/app/templates/error.html @@ -3,3 +3,4 @@

Error parsing "{{ query }}"

+Return Home diff --git a/app/templates/index.html b/app/templates/index.html index 4980316..2f996a3 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -29,6 +29,12 @@ Whoogle Search +
@@ -110,6 +116,10 @@
+
+ + +
diff --git a/app/templates/opensearch.xml b/app/templates/opensearch.xml index 8e2e7b2..c1d2898 100644 --- a/app/templates/opensearch.xml +++ b/app/templates/opensearch.xml @@ -5,12 +5,8 @@ Whoogle: A lightweight, deployable Google search proxy for desktop/mobile that removes Javascript, AMP links, and ads UTF-8  - - - - - - + + {{ main_url }}/search diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py index c6c960b..a083da2 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -7,6 +7,9 @@ from flask import g from typing import Any, Tuple +TOR_BANNER = '

You are using Tor


' + + class RoutingUtils: def __init__(self, request, config, session, cookies_disabled=False): self.request_params = request.args if request.method == 'GET' else request.form @@ -66,10 +69,13 @@ class RoutingUtils: content_filter = Filter(self.session['fernet_keys'], mobile=mobile, config=self.config) full_query = gen_query(self.query, self.request_params, self.config, content_filter.near) - get_body = g.user_request.send(query=full_query).text + get_body = g.user_request.send(query=full_query) # Produce cleanable html soup from response - html_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser') + html_soup = BeautifulSoup(content_filter.reskin(get_body.text), 'html.parser') + html_soup.insert(0, BeautifulSoup( + TOR_BANNER, + features='lxml') if g.user_request.tor_valid else BeautifulSoup("", features="lxml")) if self.feeling_lucky: return get_first_link(html_soup), 1 diff --git a/rc/torrc b/rc/torrc new file mode 100644 index 0000000..b162719 --- /dev/null +++ b/rc/torrc @@ -0,0 +1,8 @@ +DataDirectory /var/lib/tor +ControlPort 9051 +CookieAuthentication 1 +DataDirectoryGroupReadable 1 +CookieAuthFileGroupReadable 1 +ExtORPortCookieAuthFileGroupReadable 1 +CacheDirectoryGroupReadable 1 +CookieAuthFile /var/lib/tor/control_auth_cookie diff --git a/requirements.txt b/requirements.txt index ba00e00..508cbcf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,33 @@ +attrs==19.3.0 beautifulsoup4==4.8.2 bs4==0.0.1 +cachelib==0.1 +certifi==2020.4.5.1 cffi==1.13.2 +chardet==3.0.4 Click==7.0 cryptography==3.2 Flask==1.1.1 Flask-Session==0.3.2 +idna==2.9 itsdangerous==1.1.0 Jinja2==2.10.3 lxml==4.5.1 MarkupSafe==1.1.1 +more-itertools==8.3.0 +packaging==20.4 +pluggy==0.13.1 +py==1.8.1 pycparser==2.19 pyOpenSSL==19.1.0 +pyparsing==2.4.7 +PySocks==1.7.1 pytest==5.4.1 python-dateutil==2.8.1 requests==2.23.0 -six==1.14.0 soupsieve==1.9.5 -Werkzeug==0.16.0 +stem==1.8.0 +urllib3==1.25.9 waitress==1.4.3 +wcwidth==0.1.9 +Werkzeug==0.16.0 From 
7a61220aa509ee6f1895ad4f5d68d9726495344c Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Wed, 28 Oct 2020 22:18:29 -0400 Subject: [PATCH 10/40] Update Dockerfile tor service init, refactor rc/ -> misc/ The tor service is now started by calling a script which runs tor according to the current container user. If the user is root, the script will begin the tor service as normal. Otherwise, it runs tor as the current user. This is primarily meant to address the issue with Heroku builds (which don't have a root user) not being able to start tor as a service. Also refactored the rc/ dir to misc/ (with a tor/ subdir) since that makes more sense. --- Dockerfile | 4 ++-- misc/tor/start-tor.sh | 7 +++++++ {rc => misc/tor}/torrc | 0 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100755 misc/tor/start-tor.sh rename {rc => misc/tor}/torrc (100%) diff --git a/Dockerfile b/Dockerfile index 0882bad..bf038ba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y \ libssl-dev \ tor -COPY rc/torrc /etc/tor/torrc +COPY misc/tor/torrc /etc/tor/torrc COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt @@ -40,4 +40,4 @@ COPY . . 
EXPOSE $EXPOSE_PORT -CMD service tor start && ./run +CMD misc/tor/start-tor.sh & ./run diff --git a/misc/tor/start-tor.sh b/misc/tor/start-tor.sh new file mode 100755 index 0000000..19be24a --- /dev/null +++ b/misc/tor/start-tor.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [ "$(whoami)" != "root" ]; then + tor -f /etc/tor/torrc +else + service tor start +fi diff --git a/rc/torrc b/misc/tor/torrc similarity index 100% rename from rc/torrc rename to misc/tor/torrc From 933ce7e0685148033aa92ed4ed03efb0c062a201 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Wed, 28 Oct 2020 23:02:41 -0400 Subject: [PATCH 11/40] Handle FF sending bad search suggestion param Occasionally, Firefox will send the search suggestion string to the server without a mimetype, resulting in the suggestion only appearing in Flask's `request.data` field. This field is typically not used for parsing arguments, as the documentation states: Contains the incoming request data as string in case it came with a mimetype Flask does not handle. This fix captures the bytes object sent to the server and parses it into a normal query to be used in forming suggestions. --- app/routes.py | 4 ++++ app/templates/opensearch.xml | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/app/routes.py b/app/routes.py index 3916c23..d1734a3 100644 --- a/app/routes.py +++ b/app/routes.py @@ -136,6 +136,10 @@ def opensearch(): @app.route('/autocomplete', methods=['GET', 'POST']) def autocomplete(): q = g.request_params.get('q') + if not q: + # FF will occasionally (incorrectly) send the q field without a + # mimetype in the format "b'q='" through the request.data field + q = str(request.data).replace('q=', '') # Search bangs if the query begins with "!", but not "! " (feeling lucky) if q.startswith('!') and len(q) > 1 and not q.startswith('! 
'): diff --git a/app/templates/opensearch.xml b/app/templates/opensearch.xml index c1d2898..5b533f4 100644 --- a/app/templates/opensearch.xml +++ b/app/templates/opensearch.xml @@ -5,8 +5,12 @@ Whoogle: A lightweight, deployable Google search proxy for desktop/mobile that removes Javascript, AMP links, and ads UTF-8  - - + + + + + + {{ main_url }}/search From 1148a7fb8d796312a4e6cc2f1a498b6da0488863 Mon Sep 17 00:00:00 2001 From: bugbounce <35751394+bugbounce@users.noreply.github.com> Date: Thu, 29 Oct 2020 15:09:31 +0000 Subject: [PATCH 12/40] Use relative links instead of absolute (#139) * Use relative links instead of absolute This allows for hosting under a subpath. For example if you want to host whoogle at example.com/whoogle, it should work better with a reverse proxy. * Use relative link for opensearch.xml --- app/filter.py | 8 +++--- app/static/js/autocomplete.js | 4 +-- app/static/js/controller.js | 6 ++--- app/templates/display.html | 14 +++++----- app/templates/header.html | 6 ++--- app/templates/index.html | 48 +++++++++++++++++------------------ 6 files changed, 43 insertions(+), 43 deletions(-) diff --git a/app/filter.py b/app/filter.py index 71ac763..012ed91 100644 --- a/app/filter.py +++ b/app/filter.py @@ -107,14 +107,14 @@ class Filter: element_src = 'https:' + element_src elif element_src.startswith(LOGO_URL): # Re-brand with Whoogle logo - element['src'] = '/static/img/logo.png' + element['src'] = 'static/img/logo.png' element['style'] = 'height:40px;width:162px' return elif element_src.startswith(GOOG_IMG): element['src'] = BLANK_B64 return - element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \ + element['src'] = 'element?url=' + self.encrypt_path(element_src, is_element=True) + \ '&type=' + urlparse.quote(mime) # TODO: Non-mobile image results link to website instead of image # if not self.mobile: @@ -145,7 +145,7 @@ class Filter: def update_link(self, link): # Replace href with only the intended destination 
(no "utm" type tags) href = link['href'].replace('https://www.google.com', '') - if '/advanced_search' in href or 'tbm=shop' in href: + if 'advanced_search' in href or 'tbm=shop' in href: # TODO: The "Shopping" tab requires further filtering (see #136) # Temporarily removing all links to that tab for now. link.decompose() @@ -163,7 +163,7 @@ class Filter: # "li:1" implies the query should be interpreted verbatim, so we wrap it in double quotes if 'li:1' in href: query_link = '"' + query_link + '"' - new_search = '/search?q=' + self.encrypt_path(query_link) + new_search = 'search?q=' + self.encrypt_path(query_link) query_params = parse_qs(urlparse.urlparse(href).query) for param in VALID_PARAMS: diff --git a/app/static/js/autocomplete.js b/app/static/js/autocomplete.js index b8f8bf6..702ebc4 100644 --- a/app/static/js/autocomplete.js +++ b/app/static/js/autocomplete.js @@ -1,6 +1,6 @@ const handleUserInput = searchBar => { let xhrRequest = new XMLHttpRequest(); - xhrRequest.open("POST", "/autocomplete"); + xhrRequest.open("POST", "autocomplete"); xhrRequest.setRequestHeader("Content-type", "application/x-www-form-urlencoded"); xhrRequest.onload = function () { if (xhrRequest.readyState === 4 && xhrRequest.status !== 200) { @@ -123,4 +123,4 @@ const autocomplete = (searchInput, autocompleteResults) => { document.addEventListener("click", function (e) { closeAllLists(e.target); }); -}; \ No newline at end of file +}; diff --git a/app/static/js/controller.js b/app/static/js/controller.js index 8775122..3ab8ca7 100644 --- a/app/static/js/controller.js +++ b/app/static/js/controller.js @@ -31,7 +31,7 @@ const setupSearchLayout = () => { const fillConfigValues = () => { // Request existing config info let xhrGET = new XMLHttpRequest(); - xhrGET.open("GET", "/config"); + xhrGET.open("GET", "config"); xhrGET.onload = function() { if (xhrGET.readyState === 4 && xhrGET.status !== 200) { alert("Error loading Whoogle config"); @@ -82,7 +82,7 @@ const loadConfig = event => { } 
let xhrPUT = new XMLHttpRequest(); - xhrPUT.open("PUT", "/config?name=" + config + ".conf"); + xhrPUT.open("PUT", "config?name=" + config + ".conf"); xhrPUT.onload = function() { if (xhrPUT.readyState === 4 && xhrPUT.status !== 200) { alert("Error loading Whoogle config"); @@ -104,7 +104,7 @@ const saveConfig = event => { } let configForm = document.getElementById("config-form"); - configForm.action = '/config?name=' + config + ".conf"; + configForm.action = 'config?name=' + config + ".conf"; configForm.submit(); }; diff --git a/app/templates/display.html b/app/templates/display.html index 6a8a609..1c490ef 100644 --- a/app/templates/display.html +++ b/app/templates/display.html @@ -1,15 +1,15 @@ - - - + + + - - - + + + {% if dark_mode %} - + {% endif %} {{ query }} - Whoogle Search diff --git a/app/templates/header.html b/app/templates/header.html index 113cfa4..034b591 100644 --- a/app/templates/header.html +++ b/app/templates/header.html @@ -3,7 +3,7 @@
@@ -24,7 +24,7 @@ {% else %}
@@ -56,4 +56,4 @@ document.getElementById("search-form").submit(); } }); - \ No newline at end of file + diff --git a/app/templates/index.html b/app/templates/index.html index 2f996a3..8d11a5b 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -1,30 +1,30 @@ - - - - - - - - - - - - - - + + + + + + + + + + + + + + - - - - + + + + - - + + {% if config.dark %} - + {% endif %} Whoogle Search @@ -36,8 +36,8 @@ {% endif %}
- - + +
@@ -49,7 +49,7 @@
- +
+
@@ -38,6 +39,7 @@ color: {{ '#685e79' if dark_mode else '#000' }}; border: {{ '1px solid #685e79' if dark_mode else '' }}"> +