Merge remote-tracking branch 'origin/main' into feature/public-instance-sessions

This commit is contained in:
Ben Busby 2021-10-26 21:49:46 -06:00
commit 548dc418bf
No known key found for this signature in database
GPG Key ID: 339B7B7EB5333D14
9 changed files with 70 additions and 20 deletions

View File

@ -1,4 +1,4 @@
![Whoogle Search](https://raw.githubusercontent.com/benbusby/whoogle-search/main/docs/banner.png) ![Whoogle Search](docs/banner.png)
[![Latest Release](https://img.shields.io/github/v/release/benbusby/whoogle-search)](https://github.com/benbusby/shoogle/releases) [![Latest Release](https://img.shields.io/github/v/release/benbusby/whoogle-search)](https://github.com/benbusby/shoogle/releases)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@ -321,6 +321,7 @@ There are a few optional environment variables available for customizing a Whoog
| WHOOGLE_ALT_TL | The Google Translate alternative to use. This is used for all "translate ____" searches. | | WHOOGLE_ALT_TL | The Google Translate alternative to use. This is used for all "translate ____" searches. |
| WHOOGLE_ALT_MD | The medium.com alternative to use when site alternatives are enabled in the config. | | WHOOGLE_ALT_MD | The medium.com alternative to use when site alternatives are enabled in the config. |
| WHOOGLE_AUTOCOMPLETE | Controls visibility of autocomplete/search suggestions. Default on -- use '0' to disable | | WHOOGLE_AUTOCOMPLETE | Controls visibility of autocomplete/search suggestions. Default on -- use '0' to disable |
| WHOOGLE_MINIMAL | Remove everything except basic result cards from all search queries. |
### Config Environment Variables ### Config Environment Variables
These environment variables allow setting default config values, but can be overwritten manually by using the home page config menu. These allow a shortcut for destroying/rebuilding an instance to the same config state every time. These environment variables allow setting default config values, but can be overwritten manually by using the home page config menu. These allow a shortcut for destroying/rebuilding an instance to the same config state every time.
@ -505,7 +506,7 @@ A lot of the app currently piggybacks on Google's existing support for fetching
## Screenshots ## Screenshots
#### Desktop #### Desktop
![Whoogle Desktop](https://raw.githubusercontent.com/benbusby/whoogle-search/main/docs/screenshot_desktop.jpg) ![Whoogle Desktop](docs/screenshot_desktop.jpg)
#### Mobile #### Mobile
![Whoogle Mobile](https://raw.githubusercontent.com/benbusby/whoogle-search/main/docs/screenshot_mobile.jpg) ![Whoogle Mobile](docs/screenshot_mobile.jpg)

View File

@ -75,6 +75,11 @@
"value": "scribe.rip", "value": "scribe.rip",
"required": false "required": false
}, },
"WHOOGLE_MINIMAL": {
"description": "Remove everything except basic result cards from all search queries (set to 1 or leave blank)",
"value": "",
"required": false
},
"WHOOGLE_CONFIG_COUNTRY": { "WHOOGLE_CONFIG_COUNTRY": {
"description": "[CONFIG] The country to use for restricting search results (use values from https://raw.githubusercontent.com/benbusby/whoogle-search/develop/app/static/settings/countries.json)", "description": "[CONFIG] The country to use for restricting search results (use values from https://raw.githubusercontent.com/benbusby/whoogle-search/develop/app/static/settings/countries.json)",
"value": "", "value": "",

View File

@ -1,4 +1,5 @@
from app.request import VALID_PARAMS, MAPS_URL from app.request import VALID_PARAMS, MAPS_URL
from app.utils.misc import read_config_bool
from app.utils.results import * from app.utils.results import *
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag from bs4.element import ResultSet, Tag
@ -7,6 +8,7 @@ from flask import render_template
import re import re
import urllib.parse as urlparse import urllib.parse as urlparse
from urllib.parse import parse_qs from urllib.parse import parse_qs
import os
def extract_q(q_str: str, href: str) -> str: def extract_q(q_str: str, href: str) -> str:
@ -171,6 +173,8 @@ class Filter:
Returns: Returns:
None (The soup object is modified directly) None (The soup object is modified directly)
""" """
minimal_mode = read_config_bool('WHOOGLE_MINIMAL')
def pull_child_divs(result_div: BeautifulSoup): def pull_child_divs(result_div: BeautifulSoup):
try: try:
return result_div.findChildren( return result_div.findChildren(
@ -186,8 +190,12 @@ class Filter:
# Loop through results and check for the number of child divs in each # Loop through results and check for the number of child divs in each
for result in self.main_divs: for result in self.main_divs:
result_children = pull_child_divs(result) result_children = pull_child_divs(result)
if len(result_children) < self.RESULT_CHILD_LIMIT: if minimal_mode:
continue if len(result_children) in (1, 3):
continue
else:
if len(result_children) < self.RESULT_CHILD_LIMIT:
continue
# Find and decompose the first element with an inner HTML text val. # Find and decompose the first element with an inner HTML text val.
# This typically extracts the title of the section (i.e. "Related # This typically extracts the title of the section (i.e. "Related
@ -206,13 +214,18 @@ class Filter:
while not parent and idx < len(result_children): while not parent and idx < len(result_children):
parent = result_children[idx].parent parent = result_children[idx].parent
idx += 1 idx += 1
details = BeautifulSoup(features='html.parser').new_tag('details') details = BeautifulSoup(features='html.parser').new_tag('details')
summary = BeautifulSoup(features='html.parser').new_tag('summary') summary = BeautifulSoup(features='html.parser').new_tag('summary')
summary.string = label summary.string = label
details.append(summary) details.append(summary)
if parent: if parent and not minimal_mode:
parent.wrap(details) parent.wrap(details)
elif parent and minimal_mode:
# Remove parent element from document if "minimal mode" is
# enabled
parent.decompose()
def update_element_src(self, element: Tag, mime: str) -> None: def update_element_src(self, element: Tag, mime: str) -> None:
"""Encrypts the original src of an element and rewrites the element src """Encrypts the original src of an element and rewrites the element src

View File

@ -9,7 +9,8 @@ import os
from stem import Signal, SocketError from stem import Signal, SocketError
from stem.control import Controller from stem.control import Controller
SEARCH_URL = 'https://www.google.com/search?gbv=1&q=' SEARCH_URL = 'https://www.google.com/search?gbv=1&num=' + str(
os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q='
MAPS_URL = 'https://maps.google.com/maps' MAPS_URL = 'https://maps.google.com/maps'
AUTOCOMPLETE_URL = ('https://suggestqueries.google.com/' AUTOCOMPLETE_URL = ('https://suggestqueries.google.com/'
'complete/search?client=toolbar&') 'complete/search?client=toolbar&')

View File

@ -1,5 +1,6 @@
import argparse import argparse
import base64 import base64
import html
import io import io
import json import json
import pickle import pickle
@ -15,6 +16,7 @@ from app.request import Request, TorError
from app.utils.bangs import resolve_bang from app.utils.bangs import resolve_bang
from app.utils.misc import read_config_bool from app.utils.misc import read_config_bool
from app.utils.results import add_ip_card from app.utils.results import add_ip_card
from app.utils.results import bold_search_terms
from app.utils.search import * from app.utils.search import *
from app.utils.session import generate_user_key, valid_user_session from app.utils.session import generate_user_key, valid_user_session
from bs4 import BeautifulSoup as bsoup from bs4 import BeautifulSoup as bsoup
@ -298,7 +300,7 @@ def search():
# Return 503 if temporarily blocked by captcha # Return 503 if temporarily blocked by captcha
resp_code = 503 if has_captcha(str(response)) else 200 resp_code = 503 if has_captcha(str(response)) else 200
response = bold_search_terms(response, query)
# Feature to display IP address # Feature to display IP address
if search_util.check_kw_ip(): if search_util.check_kw_ip():
html_soup = bsoup(response, "html.parser") html_soup = bsoup(response, "html.parser")
@ -320,7 +322,7 @@ def search():
is_translation=any( is_translation=any(
_ in query.lower() for _ in [translation['translate'], 'translate'] _ in query.lower() for _ in [translation['translate'], 'translate']
) and not search_util.search_type, # Standard search queries only ) and not search_util.search_type, # Standard search queries only
response=response, response=html.unescape(str(response)),
version_number=app.config['VERSION_NUMBER'], version_number=app.config['VERSION_NUMBER'],
search_header=(render_template( search_header=(render_template(
'header.html', 'header.html',

View File

@ -7,7 +7,6 @@
<meta name="referrer" content="no-referrer"> <meta name="referrer" content="no-referrer">
<link rel="stylesheet" href="{{ cb_url('input.css') }}"> <link rel="stylesheet" href="{{ cb_url('input.css') }}">
<link rel="stylesheet" href="{{ cb_url('search.css') }}"> <link rel="stylesheet" href="{{ cb_url('search.css') }}">
<link rel="stylesheet" href="{{ cb_url('variables.css') }}">
<link rel="stylesheet" href="{{ cb_url('header.css') }}"> <link rel="stylesheet" href="{{ cb_url('header.css') }}">
{% if config.theme %} {% if config.theme %}
{% if config.theme == 'system' %} {% if config.theme == 'system' %}

View File

@ -21,7 +21,6 @@
<script type="text/javascript" src="{{ cb_url('controller.js') }}"></script> <script type="text/javascript" src="{{ cb_url('controller.js') }}"></script>
<link rel="search" href="opensearch.xml" type="application/opensearchdescription+xml" title="Whoogle Search"> <link rel="search" href="opensearch.xml" type="application/opensearchdescription+xml" title="Whoogle Search">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="{{ cb_url('variables.css') }}">
{% if config.theme %} {% if config.theme %}
{% if config.theme == 'system' %} {% if config.theme == 'system' %}
<style> <style>

View File

@ -1,8 +1,8 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup, NavigableString
import os import os
import urllib.parse as urlparse import urllib.parse as urlparse
from urllib.parse import parse_qs from urllib.parse import parse_qs
import re
SKIP_ARGS = ['ref_src', 'utm'] SKIP_ARGS = ['ref_src', 'utm']
SKIP_PREFIX = ['//www.', '//mobile.', '//m.'] SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
@ -13,7 +13,6 @@ BLANK_B64 = ('data:image/png;base64,'
'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkw' 'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkw'
'AIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC') 'AIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC')
# Ad keywords # Ad keywords
BLACKLIST = [ BLACKLIST = [
'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama',
@ -34,6 +33,43 @@ SITE_ALTS = {
} }
def bold_search_terms(response: str, query: str) -> BeautifulSoup:
    """Wraps all search terms in bold tags (<b>). If any terms are wrapped
    in quotes, only that exact phrase will be made bold.

    Terms that are blank once non-alphanumeric characters are stripped are
    skipped. Leading/trailing whitespace in the query produces empty terms
    when splitting, and an empty pattern would otherwise match every text
    node and insert empty bold tags at every word boundary.

    Args:
        response: The initial response body for the query
        query: The original search query

    Returns:
        BeautifulSoup: modified soup object with bold items
    """
    response = BeautifulSoup(response, 'html.parser')

    def replace_any_case(element: NavigableString, target_word: str) -> None:
        # Replace all instances of the word, but maintaining the same case in
        # the replacement.
        # A text node exactly as long as the term is left untouched.
        if len(element) == len(target_word):
            return

        # NOTE(review): the replacement is handed to replace_with() as a
        # plain string, so the <b> markup presumably ends up entity-escaped
        # when the soup is serialized -- confirm the caller accounts for it.
        element.replace_with(
            re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b',
                   r'<b>\1</b>',
                   element,
                   flags=re.I)
        )

    # Split all words out of query, grouping the ones wrapped in quotes
    for word in re.split(r'\s+(?=[^"]*(?:"[^"]*"[^"]*)*$)', query):
        # Keep only alphanumerics and spaces (this also drops the quotes
        # surrounding an exact phrase)
        word = re.sub(r'[^A-Za-z0-9 ]+', '', word)

        # Nothing searchable left (empty split item, symbol-only term, or a
        # quoted run of spaces): skip rather than match everything
        if not word.strip():
            continue

        target = response.find_all(
            text=re.compile(r'' + re.escape(word), re.I))
        for nav_str in target:
            replace_any_case(nav_str, word)

    return response
def has_ad_content(element: str) -> bool: def has_ad_content(element: str) -> bool:
"""Inspects an HTML element for ad related content """Inspects an HTML element for ad related content

View File

@ -53,12 +53,6 @@ def test_config(client):
for key in demo_config.keys(): for key in demo_config.keys():
assert config[key] == demo_config[key] assert config[key] == demo_config[key]
# Test setting config via search
custom_config = '&dark=1&lang_interface=lang_en'
rv = client.get('/search?q=test' + custom_config)
assert rv._status_code == 200
assert custom_config.replace('&', '&amp;') in str(rv.data)
# Test disabling changing config from client # Test disabling changing config from client
app.config['CONFIG_DISABLE'] = 1 app.config['CONFIG_DISABLE'] = 1
dark_mod = not demo_config['dark'] dark_mod = not demo_config['dark']