whoogle-search/app/utils/search.py
Ben Busby ef98d85dc5
Ensure searches with a leading slash are treated as queries
A user reported a bug where searches with a leading slash (in this case:
"/e/OS apps" were interpreted as a Google specific link when clicking
the next page of results.

This was due to the behavior that Google's search results exhibit, where
internal links for pages like support.google.com are delivered with
params like "?q=/support" rather than a direct link. This fixes that
scenario by checking the "q" param value against the user's original
query to ensure they don't match before assuming that the result is
intended as a redirect.

Fixes #776
2022-06-03 14:03:57 -06:00

177 lines
5.8 KiB
Python

import os
import re
from typing import Any
from app.filter import Filter, get_first_link
from app.request import gen_query
from bs4 import BeautifulSoup as bsoup
from cryptography.fernet import Fernet, InvalidToken
from flask import g
TOR_BANNER = '<hr><h1 style="text-align: center">You are using Tor</h1><hr>'
CAPTCHA = 'div class="g-recaptcha"'
def needs_https(url: str) -> bool:
"""Checks if the current instance needs to be upgraded to HTTPS
Note that all Heroku instances are available by default over HTTPS, but
do not automatically set up a redirect when visited over HTTP.
Args:
url: The instance url
Returns:
bool: True/False representing the need to upgrade
"""
https_only = bool(os.getenv('HTTPS_ONLY', 0))
is_heroku = url.endswith('.herokuapp.com')
is_http = url.startswith('http://')
return (is_heroku and is_http) or (https_only and is_http)
def has_captcha(results: str) -> bool:
"""Checks to see if the search results are blocked by a captcha
Args:
results: The search page html as a string
Returns:
bool: True/False indicating if a captcha element was found
"""
return CAPTCHA in results
class Search:
"""Search query preprocessor - used before submitting the query or
redirecting to another site
Attributes:
request: the incoming flask request
config: the current user config settings
session_key: the flask user fernet key
"""
def __init__(self, request, config, session_key, cookies_disabled=False):
method = request.method
self.request = request
self.request_params = request.args if method == 'GET' else request.form
self.user_agent = request.headers.get('User-Agent')
self.feeling_lucky = False
self.config = config
self.session_key = session_key
self.query = ''
self.cookies_disabled = cookies_disabled
self.search_type = self.request_params.get(
'tbm') if 'tbm' in self.request_params else ''
def __getitem__(self, name) -> Any:
return getattr(self, name)
def __setitem__(self, name, value) -> None:
return setattr(self, name, value)
def __delitem__(self, name) -> None:
return delattr(self, name)
def __contains__(self, name) -> bool:
return hasattr(self, name)
def new_search_query(self) -> str:
"""Parses a plaintext query into a valid string for submission
Also decrypts the query string, if encrypted (in the case of
paginated results).
Returns:
str: A valid query string
"""
q = self.request_params.get('q')
if q is None or len(q) == 0:
return ''
else:
# Attempt to decrypt if this is an internal link
try:
q = Fernet(self.session_key).decrypt(q.encode()).decode()
except InvalidToken:
pass
# Strip leading '! ' for "feeling lucky" queries
self.feeling_lucky = q.startswith('! ')
self.query = q[2:] if self.feeling_lucky else q
return self.query
def generate_response(self) -> str:
"""Generates a response for the user's query
Returns:
str: A string response to the search query, in the form of a URL
or string representation of HTML content.
"""
mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
content_filter = Filter(self.session_key,
root_url=self.request.url_root,
mobile=mobile,
config=self.config,
query=self.query)
full_query = gen_query(self.query,
self.request_params,
self.config)
self.full_query = full_query
# force mobile search when view image is true and
# the request is not already made by a mobile
view_image = ('tbm=isch' in full_query
and self.config.view_image
and not g.user_request.mobile)
get_body = g.user_request.send(query=full_query,
force_mobile=view_image)
# Produce cleanable html soup from response
html_soup = bsoup(get_body.text, 'html.parser')
# Replace current soup if view_image is active
if view_image:
html_soup = content_filter.view_image(html_soup)
# Indicate whether or not a Tor connection is active
if g.user_request.tor_valid:
html_soup.insert(0, bsoup(TOR_BANNER, 'html.parser'))
if self.feeling_lucky:
return get_first_link(html_soup)
else:
formatted_results = content_filter.clean(html_soup)
# Append user config to all search links, if available
param_str = ''.join('&{}={}'.format(k, v)
for k, v in
self.request_params.to_dict(flat=True).items()
if self.config.is_safe_key(k))
for link in formatted_results.find_all('a', href=True):
link['rel'] = "nofollow noopener noreferrer"
if 'search?' not in link['href'] or link['href'].index(
'search?') > 1:
continue
link['href'] += param_str
return str(formatted_results)
def check_kw_ip(self) -> re.Match:
"""Checks for keywords related to 'my ip' in the query
Returns:
bool
"""
return re.search("([^a-z0-9]|^)my *[^a-z0-9] *(ip|internet protocol)" +
"($|( *[^a-z0-9] *(((addres|address|adres|" +
"adress)|a)? *$)))", self.query.lower())