Expand /window endpoint to behave like a proxy

The `/window` endpoint is already used as a proxy, but only
for removing JavaScript from the result page. This expands the existing
functionality to allow users to proxy search result pages through their
Whoogle instance.

This commit temporarily overwrites the nojs feature until I can decide
on how I want to approach separating (or not separating) these two
features.

Note that this feature will likely be merged in without being 100%
finished, since I'm unsure of how users intend to use the feature. With
some community testing, I'll get a better idea of A) how many people are
actually going to use this feature, and B) what aspects of the feature
are useful or not useful.
This commit is contained in:
Ben Busby 2022-03-14 15:46:10 -06:00
parent 2a0ad8796c
commit 7b8a8525d5
No known key found for this signature in database
GPG Key ID: B9B7231E01D924A1
2 changed files with 58 additions and 12 deletions

View File

@ -264,7 +264,7 @@ class Filter:
# enabled
parent.decompose()
def update_element_src(self, element: Tag, mime: str) -> None:
def update_element_src(self, element: Tag, mime: str, attr='src') -> None:
"""Encrypts the original src of an element and rewrites the element src
to use the "/element?src=" pass-through.
@ -272,10 +272,12 @@ class Filter:
None (The soup element is modified directly)
"""
src = element['src']
src = element[attr].split(' ')[0]
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('data:'):
return
if src.startswith(LOGO_URL):
# Re-brand with Whoogle logo
@ -287,7 +289,7 @@ class Filter:
element['src'] = BLANK_B64
return
element['src'] = f'{Endpoint.element}?url=' + self.encrypt_path(
element[attr] = f'{Endpoint.element}?url=' + self.encrypt_path(
src,
is_element=True) + '&type=' + urlparse.quote(mime)

View File

@ -16,6 +16,7 @@ from app.models.config import Config
from app.models.endpoint import Endpoint
from app.request import Request, TorError
from app.utils.bangs import resolve_bang
from app.filter import Filter
from app.utils.misc import read_config_bool, get_client_ip, get_request_url, \
check_for_update
from app.utils.results import add_ip_card, bold_search_terms,\
@ -457,8 +458,11 @@ def imgres():
@session_required
@auth_required
def element():
cipher_suite = Fernet(g.session_key)
src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode()
element_url = src_url = request.args.get('url')
if element_url.startswith('gAAAAA'):
cipher_suite = Fernet(g.session_key)
src_url = cipher_suite.decrypt(element_url.encode()).decode()
src_type = request.args.get('type')
try:
@ -477,18 +481,58 @@ def element():
@app.route(f'/{Endpoint.window}')
@session_required
@auth_required
def window():
get_body = g.user_request.send(base_url=request.args.get('location')).text
get_body = get_body.replace('src="/',
'src="' + request.args.get('location') + '"')
get_body = get_body.replace('href="/',
'href="' + request.args.get('location') + '"')
target_url = request.args.get('location')
if target_url.startswith('gAAAAA'):
cipher_suite = Fernet(g.session_key)
target_url = cipher_suite.decrypt(target_url.encode()).decode()
content_filter = Filter(g.session_key, config=g.user_config)
target = urlparse.urlparse(target_url)
host_url = f'{target.scheme}://{target.netloc}'
get_body = g.user_request.send(base_url=target_url).text
results = bsoup(get_body, 'html.parser')
src_attrs = ['src', 'href', 'srcset', 'data-srcset', 'data-src']
for script in results('script'):
script.decompose()
# Parse HTML response and replace relative links w/ absolute
for element in results.find_all():
for attr in src_attrs:
if not element.has_attr(attr) or not element[attr].startswith('/'):
continue
element[attr] = host_url + element[attr]
# Replace or remove javascript sources
for script in results.find_all('script', {'src': True}):
if 'nojs' in request.args:
script.decompose()
else:
content_filter.update_element_src(script, 'application/javascript')
# Replace all possible image attributes
for img in results.find_all('img'):
_ = [
content_filter.update_element_src(img, 'image/png', attr=_)
for _ in ['src', 'data-src', 'data-srcset', 'srcset'] if img.has_attr(_)
]
# Replace all stylesheet sources
for link in results.find_all('link', {'href': True}):
content_filter.update_element_src(link, 'text/css', attr='href')
# Use anonymous view for all links on page
for a in results.find_all('a', {'href': True}):
a['href'] = '/window?location=' + a['href'] + (
'&nojs=1' if 'nojs' in request.args else '')
# Remove all iframes -- these are commonly used inside of <noscript> tags
# to enforce loading Google Analytics
for iframe in results.find_all('iframe'):
iframe.decompose()
return render_template(
'display.html',