Expand /window endpoint to behave like a proxy

The `/window` endpoint is currently used as a proxy, but only for removing JavaScript from the result page. This commit expands the existing functionality to allow users to proxy search result pages through their Whoogle instance. It temporarily overwrites the nojs feature until I can decide how I want to approach separating (or not separating) these two features. Note that this feature will likely be merged without being 100% finished, since I'm unsure how users intend to use it. With some community testing, I'll get a better idea of A) how many people are actually going to use this feature, and B) which aspects of the feature are useful or not useful.
2022-03-14 15:46:10 -06:00 · 2022-03-14 15:46:10 -06:00 · 7b8a8525d5
commit 7b8a8525d5
parent 2a0ad8796c
2 changed files with 58 additions and 12 deletions
--- a/app/filter.py
+++ b/app/filter.py
@ -264,7 +264,7 @@ class Filter:
                # enabled
                parent.decompose()

-    def update_element_src(self, element: Tag, mime: str) -> None:
+    def update_element_src(self, element: Tag, mime: str, attr='src') -> None:
        """Encrypts the original src of an element and rewrites the element src
        to use the "/element?src=" pass-through.

@ -272,10 +272,12 @@ class Filter:
            None (The soup element is modified directly)

        """
-        src = element['src']
+        src = element[attr].split(' ')[0]

        if src.startswith('//'):
            src = 'https:' + src
+        elif src.startswith('data:'):
+            return

        if src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
@ -287,7 +289,7 @@ class Filter:
            element['src'] = BLANK_B64
            return

-        element['src'] = f'{Endpoint.element}?url=' + self.encrypt_path(
+        element[attr] = f'{Endpoint.element}?url=' + self.encrypt_path(
            src,
            is_element=True) + '&type=' + urlparse.quote(mime)

--- a/app/routes.py
+++ b/app/routes.py
@ -16,6 +16,7 @@ from app.models.config import Config
 from app.models.endpoint import Endpoint
 from app.request import Request, TorError
 from app.utils.bangs import resolve_bang
+from app.filter import Filter
 from app.utils.misc import read_config_bool, get_client_ip, get_request_url, \
    check_for_update
 from app.utils.results import add_ip_card, bold_search_terms,\
@ -457,8 +458,11 @@ def imgres():
@session_required
@auth_required
 def element():
-    cipher_suite = Fernet(g.session_key)
-    src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode()
+    element_url = src_url = request.args.get('url')
+    if element_url.startswith('gAAAAA'):
+        cipher_suite = Fernet(g.session_key)
+        src_url = cipher_suite.decrypt(element_url.encode()).decode()
+
    src_type = request.args.get('type')

    try:
@ -477,18 +481,58 @@ def element():


@app.route(f'/{Endpoint.window}')
+@session_required
@auth_required
 def window():
-    get_body = g.user_request.send(base_url=request.args.get('location')).text
-    get_body = get_body.replace('src="/',
-                                'src="' + request.args.get('location') + '"')
-    get_body = get_body.replace('href="/',
-                                'href="' + request.args.get('location') + '"')
+    target_url = request.args.get('location')
+    if target_url.startswith('gAAAAA'):
+        cipher_suite = Fernet(g.session_key)
+        target_url = cipher_suite.decrypt(target_url.encode()).decode()
+
+    content_filter = Filter(g.session_key, config=g.user_config)
+    target = urlparse.urlparse(target_url)
+    host_url = f'{target.scheme}://{target.netloc}'
+
+    get_body = g.user_request.send(base_url=target_url).text

    results = bsoup(get_body, 'html.parser')
+    src_attrs = ['src', 'href', 'srcset', 'data-srcset', 'data-src']

-    for script in results('script'):
-        script.decompose()
+    # Parse HTML response and replace relative links w/ absolute
+    for element in results.find_all():
+        for attr in src_attrs:
+            if not element.has_attr(attr) or not element[attr].startswith('/'):
+                continue
+
+            element[attr] = host_url + element[attr]
+
+    # Replace or remove javascript sources
+    for script in results.find_all('script', {'src': True}):
+        if 'nojs' in request.args:
+            script.decompose()
+        else:
+            content_filter.update_element_src(script, 'application/javascript')
+
+    # Replace all possible image attributes
+    for img in results.find_all('img'):
+        _ = [
+            content_filter.update_element_src(img, 'image/png', attr=_)
+            for _ in ['src', 'data-src', 'data-srcset', 'srcset'] if img.has_attr(_)
+        ]
+
+    # Replace all stylesheet sources
+    for link in results.find_all('link', {'href': True}):
+        content_filter.update_element_src(link, 'text/css', attr='href')
+
+    # Use anonymous view for all links on page
+    for a in results.find_all('a', {'href': True}):
+        a['href'] = '/window?location=' + a['href'] + (
+            '&nojs=1' if 'nojs' in request.args else '')
+
+    # Remove all iframes -- these are commonly used inside of <noscript> tags
+    # to enforce loading Google Analytics
+    for iframe in results.find_all('iframe'):
+        iframe.decompose()

    return render_template(
        'display.html',