From 5a05bfb6debfbb82841676d2d16ef5a6c6ffa5ce Mon Sep 17 00:00:00 2001 From: DUO Labs Date: Tue, 26 Oct 2021 12:28:38 -0400 Subject: [PATCH 01/10] Allow setting number of results per page (#486) Add `WHOOGLE_RESULTS_PER_PAGE` var, allowing users to specify the number of results per page. The default is 10. --- app/request.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/request.py b/app/request.py index 9bb9bd0..0c80f9b 100644 --- a/app/request.py +++ b/app/request.py @@ -9,7 +9,8 @@ import os from stem import Signal, SocketError from stem.control import Controller -SEARCH_URL = 'https://www.google.com/search?gbv=1&q=' +SEARCH_URL = 'https://www.google.com/search?gbv=1&num=' + str( + os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q=' MAPS_URL = 'https://maps.google.com/maps' AUTOCOMPLETE_URL = ('https://suggestqueries.google.com/' 'complete/search?client=toolbar&') From 543f2b2a01d82dff1691e287bb0fe5956aad7180 Mon Sep 17 00:00:00 2001 From: DUO Labs Date: Tue, 26 Oct 2021 12:35:12 -0400 Subject: [PATCH 02/10] Add a "minimal mode" for condensing results (#485) If WHOOGLE_MINIMAL is set, all non-link results are removed from the view. 
--- app/filter.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/app/filter.py b/app/filter.py index 8f241e5..090753b 100644 --- a/app/filter.py +++ b/app/filter.py @@ -7,6 +7,8 @@ from flask import render_template import re import urllib.parse as urlparse from urllib.parse import parse_qs +import os +from app.utils.misc import read_config_bool def extract_q(q_str: str, href: str) -> str: @@ -186,8 +188,12 @@ class Filter: # Loop through results and check for the number of child divs in each for result in self.main_divs: result_children = pull_child_divs(result) - if len(result_children) < self.RESULT_CHILD_LIMIT: - continue + if read_config_bool('WHOOGLE_MINIMAL'): + if len(result_children) in (1, 3): + continue + else: + if len(result_children) < self.RESULT_CHILD_LIMIT: + continue # Find and decompose the first element with an inner HTML text val. # This typically extracts the title of the section (i.e. "Related From 90441b2668b49667409898aa509fabe3b742b2f9 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Tue, 26 Oct 2021 10:38:20 -0600 Subject: [PATCH 03/10] Add WHOOGLE_MINIMAL to docs, tweak min mode logic Activating minimal mode should also remove all collapsed sections, if any are found. WHOOGLE_MINIMAL now documented in readme and app.json (for heroku). 
--- README.md | 7 ++++--- app.json | 5 +++++ app/filter.py | 13 ++++++++++--- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 94a81c5..577ef73 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![Whoogle Search](https://raw.githubusercontent.com/benbusby/whoogle-search/main/docs/banner.png) +![Whoogle Search](docs/banner.png) [![Latest Release](https://img.shields.io/github/v/release/benbusby/whoogle-search)](https://github.com/benbusby/shoogle/releases) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) @@ -321,6 +321,7 @@ There are a few optional environment variables available for customizing a Whoog | WHOOGLE_ALT_TL | The Google Translate alternative to use. This is used for all "translate ____" searches. | | WHOOGLE_ALT_MD | The medium.com alternative to use when site alternatives are enabled in the config. | | WHOOGLE_AUTOCOMPLETE | Controls visibility of autocomplete/search suggestions. Default on -- use '0' to disable | +| WHOOGLE_MINIMAL | Remove everything except basic result cards from all search queries. | ### Config Environment Variables These environment variables allow setting default config values, but can be overwritten manually by using the home page config menu. These allow a shortcut for destroying/rebuilding an instance to the same config state every time. 
@@ -505,7 +506,7 @@ A lot of the app currently piggybacks on Google's existing support for fetching ## Screenshots #### Desktop -![Whoogle Desktop](https://raw.githubusercontent.com/benbusby/whoogle-search/main/docs/screenshot_desktop.jpg) +![Whoogle Desktop](docs/screenshot_desktop.jpg) #### Mobile -![Whoogle Mobile](https://raw.githubusercontent.com/benbusby/whoogle-search/main/docs/screenshot_mobile.jpg) +![Whoogle Mobile](docs/screenshot_mobile.jpg) diff --git a/app.json b/app.json index af3aad2..c3d2dc3 100644 --- a/app.json +++ b/app.json @@ -75,6 +75,11 @@ "value": "scribe.rip", "required": false }, + "WHOOGLE_MINIMAL": { + "description": "Remove everything except basic result cards from all search queries (set to 1 or leave blank)", + "value": "", + "required": false + }, "WHOOGLE_CONFIG_COUNTRY": { "description": "[CONFIG] The country to use for restricting search results (use values from https://raw.githubusercontent.com/benbusby/whoogle-search/develop/app/static/settings/countries.json)", "value": "", diff --git a/app/filter.py b/app/filter.py index 090753b..fa45136 100644 --- a/app/filter.py +++ b/app/filter.py @@ -1,4 +1,5 @@ from app.request import VALID_PARAMS, MAPS_URL +from app.utils.misc import read_config_bool from app.utils.results import * from bs4 import BeautifulSoup from bs4.element import ResultSet, Tag @@ -8,7 +9,6 @@ import re import urllib.parse as urlparse from urllib.parse import parse_qs import os -from app.utils.misc import read_config_bool def extract_q(q_str: str, href: str) -> str: @@ -173,6 +173,8 @@ class Filter: Returns: None (The soup object is modified directly) """ + minimal_mode = read_config_bool('WHOOGLE_MINIMAL') + def pull_child_divs(result_div: BeautifulSoup): try: return result_div.findChildren( @@ -188,7 +190,7 @@ class Filter: # Loop through results and check for the number of child divs in each for result in self.main_divs: result_children = pull_child_divs(result) - if read_config_bool('WHOOGLE_MINIMAL'): + if 
minimal_mode: if len(result_children) in (1, 3): continue else: @@ -212,13 +214,18 @@ class Filter: while not parent and idx < len(result_children): parent = result_children[idx].parent idx += 1 + details = BeautifulSoup(features='html.parser').new_tag('details') summary = BeautifulSoup(features='html.parser').new_tag('summary') summary.string = label details.append(summary) - if parent: + if parent and not minimal_mode: parent.wrap(details) + elif parent and minimal_mode: + # Remove parent element from document if "minimal mode" is + # enabled + parent.decompose() def update_element_src(self, element: Tag, mime: str) -> None: """Encrypts the original src of an element and rewrites the element src From 2c9cf3ecc6835699bcdb1877e7f44549034477d8 Mon Sep 17 00:00:00 2001 From: DUO Labs Date: Tue, 26 Oct 2021 16:59:23 -0400 Subject: [PATCH 04/10] Bold search query in results (#487) This modifies the search result page by bold-ing all appearances of any word in the original query. If portions of the query are in quotes (i.e. "ice cream"), only exact matches of the sequence of words will be made bold. 
Co-authored-by: Ben Busby --- app/routes.py | 3 ++- app/utils/results.py | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/app/routes.py b/app/routes.py index 2139683..56256ed 100644 --- a/app/routes.py +++ b/app/routes.py @@ -14,6 +14,7 @@ from app.request import Request, TorError from app.utils.bangs import resolve_bang from app.utils.misc import read_config_bool from app.utils.results import add_ip_card +from app.utils.results import bold_search_terms from app.utils.search import * from app.utils.session import generate_user_key, valid_user_session from bs4 import BeautifulSoup as bsoup @@ -250,7 +251,7 @@ def search(): # Return 503 if temporarily blocked by captcha resp_code = 503 if has_captcha(str(response)) else 200 - + response = bold_search_terms(response, query) # Feature to display IP address if search_util.check_kw_ip(): html_soup = bsoup(response, "html.parser") diff --git a/app/utils/results.py b/app/utils/results.py index 8141074..befc86d 100644 --- a/app/utils/results.py +++ b/app/utils/results.py @@ -2,6 +2,8 @@ from bs4 import BeautifulSoup import os import urllib.parse as urlparse from urllib.parse import parse_qs +import re +from bs4 import NavigableString SKIP_ARGS = ['ref_src', 'utm'] @@ -34,6 +36,45 @@ SITE_ALTS = { } +def bold_search_terms(response: str, query: str) -> BeautifulSoup: + """Wraps all search terms in bold tags (<b>). If any terms are wrapped + in quotes, only that exact phrase will be made bold. 
+ + Args: + response: The initial response body for the query + query: The original search query + + Returns: + BeautifulSoup: modified soup object with bold items + """ + response = BeautifulSoup(response, 'html.parser') + + def replace_any_case(element: NavigableString, target_word: str) -> None: + # Replace all instances of the word, but maintaining the same case in + # the replacement + element.replace_with( + element.replace( + target_word.lower(), f'{target_word.lower()}' + ).replace( + target_word.capitalize(), f'{target_word.capitalize()}' + ).replace( + target_word.title(), f'{target_word.title()}' + ).replace( + target_word.upper(), f'{target_word.upper()}' + ) + ) + + # Split all words out of query, grouping the ones wrapped in quotes + for word in re.split(r'\s+(?=[^"]*(?:"[^"]*"[^"]*)*$)', query): + word = re.sub(r'[^A-Za-z0-9 ]+', '', word) + target = response.find_all( + text=re.compile(r'' + re.escape(word), re.I)) + for nav_str in target: + replace_any_case(nav_str, word) + + return response + + def has_ad_content(element: str) -> bool: """Inspects an HTML element for ad related content From d16ef6d011c2c78e2907a13dd7ba4c17b5b954e7 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Tue, 26 Oct 2021 15:00:39 -0600 Subject: [PATCH 05/10] Unescape search response before rendering template Fixes a small issue with the previous commit where bolded search terms had the tags escaped, rather than being applied as actual html. 
--- app/routes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/routes.py b/app/routes.py index 56256ed..406207c 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,5 +1,6 @@ import argparse import base64 +import html import io import json import pickle @@ -273,7 +274,7 @@ def search(): is_translation=any( _ in query.lower() for _ in [translation['translate'], 'translate'] ) and not search_util.search_type, # Standard search queries only - response=response, + response=html.unescape(str(response)), version_number=app.config['VERSION_NUMBER'], search_header=(render_template( 'header.html', From 6763c2e99d354f1a5884dc73b60de65f95fd728e Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Tue, 26 Oct 2021 15:04:21 -0600 Subject: [PATCH 06/10] Remove test for deprecated feature Setting config using the URL is a feature that is being deprecated in the next release, so the test for confirming its functionality has been removed. --- test/test_routes.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/test_routes.py b/test/test_routes.py index b894c75..efd1c58 100644 --- a/test/test_routes.py +++ b/test/test_routes.py @@ -53,12 +53,6 @@ def test_config(client): for key in demo_config.keys(): assert config[key] == demo_config[key] - # Test setting config via search - custom_config = '&dark=1&lang_interface=lang_en' - rv = client.get('/search?q=test' + custom_config) - assert rv._status_code == 200 - assert custom_config.replace('&', '&') in str(rv.data) - # Test disabling changing config from client app.config['CONFIG_DISABLE'] = 1 dark_mod = not demo_config['dark'] From 6decab5a5102c8a39ff05398f253819c36817379 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Tue, 26 Oct 2021 16:15:24 -0600 Subject: [PATCH 07/10] Improve regex for bolding search terms Co-authored by @DUOLabs333 --- app/utils/results.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/app/utils/results.py b/app/utils/results.py index 
befc86d..8f79326 100644 --- a/app/utils/results.py +++ b/app/utils/results.py @@ -1,9 +1,8 @@ -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString import os import urllib.parse as urlparse from urllib.parse import parse_qs import re -from bs4 import NavigableString SKIP_ARGS = ['ref_src', 'utm'] @@ -52,16 +51,14 @@ def bold_search_terms(response: str, query: str) -> BeautifulSoup: def replace_any_case(element: NavigableString, target_word: str) -> None: # Replace all instances of the word, but maintaining the same case in # the replacement + if len(element) == len(target_word): + return + element.replace_with( - element.replace( - target_word.lower(), f'{target_word.lower()}' - ).replace( - target_word.capitalize(), f'{target_word.capitalize()}' - ).replace( - target_word.title(), f'{target_word.title()}' - ).replace( - target_word.upper(), f'{target_word.upper()}' - ) + re.sub(r'\b((?![{}<>-])' + target_word + r'(?![{}<>-]))\b', + r'\1', + element, + flags=re.I) ) # Split all words out of query, grouping the ones wrapped in quotes From f154b5f2e2587792ec46cf5b67f6b74aa6f731d5 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Tue, 26 Oct 2021 16:17:38 -0600 Subject: [PATCH 08/10] PEP-8 formatting fix --- app/utils/results.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/app/utils/results.py b/app/utils/results.py index 8f79326..bb17051 100644 --- a/app/utils/results.py +++ b/app/utils/results.py @@ -4,7 +4,6 @@ import urllib.parse as urlparse from urllib.parse import parse_qs import re - SKIP_ARGS = ['ref_src', 'utm'] SKIP_PREFIX = ['//www.', '//mobile.', '//m.'] GOOG_STATIC = 'www.gstatic.com' @@ -14,7 +13,6 @@ BLANK_B64 = ('data:image/png;base64,' 'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkw' 'AIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC') - # Ad keywords BLACKLIST = [ 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', @@ -56,9 +54,9 @@ def bold_search_terms(response: str, query: 
str) -> BeautifulSoup: element.replace_with( re.sub(r'\b((?![{}<>-])' + target_word + r'(?![{}<>-]))\b', - r'\1', - element, - flags=re.I) + r'\1', + element, + flags=re.I) ) # Split all words out of query, grouping the ones wrapped in quotes From 591ed4a6d610a47c771cf74d3e2de81d22b1c84a Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Tue, 26 Oct 2021 16:21:30 -0600 Subject: [PATCH 09/10] Use f-string in bold query regex by @DUOLabs333 --- app/utils/results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/utils/results.py b/app/utils/results.py index bb17051..0c04b76 100644 --- a/app/utils/results.py +++ b/app/utils/results.py @@ -53,7 +53,7 @@ def bold_search_terms(response: str, query: str) -> BeautifulSoup: return element.replace_with( - re.sub(r'\b((?![{}<>-])' + target_word + r'(?![{}<>-]))\b', + re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b', r'\1', element, flags=re.I) From 1abd040428b9bfa8f1a13c1d9ff474bc19b77ddf Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Tue, 26 Oct 2021 21:11:28 -0600 Subject: [PATCH 10/10] Remove redundant loading of variables.css variables.css doesn't need to be loaded by any template, since WHOOGLE_CONFIG_STYLE loads those values by default when not set explicitly. Loading the stylesheet caused the logo colors to be persistent unless set individually. Sorry @gripped for sneaking all of this unnecessary color in... Fixes #492 --- app/templates/display.html | 1 - app/templates/index.html | 1 - 2 files changed, 2 deletions(-) diff --git a/app/templates/display.html b/app/templates/display.html index 288f24d..e9ec20b 100644 --- a/app/templates/display.html +++ b/app/templates/display.html @@ -7,7 +7,6 @@ - {% if config.theme %} {% if config.theme == 'system' %} diff --git a/app/templates/index.html b/app/templates/index.html index 27b1f1b..18f2bcf 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -21,7 +21,6 @@ - {% if config.theme %} {% if config.theme == 'system' %}