Implement filtering of remote content from css

parent 2abfbc310f
commit e7b70dd34d
--- a/app/filter.py
+++ b/app/filter.py
@@ -2,11 +2,12 @@ from app.models.config import Config
 from app.models.endpoint import Endpoint
 from app.models.g_classes import GClasses
 from app.request import VALID_PARAMS, MAPS_URL
-from app.utils.misc import read_config_bool
+from app.utils.misc import get_abs_url, read_config_bool
 from app.utils.results import *
 from bs4 import BeautifulSoup
 from bs4.element import ResultSet, Tag
 from cryptography.fernet import Fernet
+import cssutils
 from flask import render_template
 import re
 import urllib.parse as urlparse
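Note: cssutils is a third-party CSS parser rather than part of the standard library, so this change presumably also needs a matching entry in the project's dependency list.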
@@ -53,15 +54,45 @@ def clean_query(query: str) -> str:
     return query[:query.find('-site:')] if '-site:' in query else query
 
 
+def clean_css(css: str, page_url: str) -> str:
+    """Removes all remote URLs from a CSS string.
+
+    Args:
+        css: The CSS string
+
+    Returns:
+        str: The filtered CSS, with URLs proxied through Whoogle
+    """
+    sheet = cssutils.parseString(css)
+    urls = cssutils.getUrls(sheet)
+
+    for url in urls:
+        abs_url = get_abs_url(url, page_url)
+        css = css.replace(
+            url,
+            f'/element?type=image/png&url={abs_url}'
+        )
+
+    return css
+
+
 class Filter:
     # Limit used for determining if a result is a "regular" result or a list
     # type result (such as "people also asked", "related searches", etc)
     RESULT_CHILD_LIMIT = 7
 
-    def __init__(self, user_key: str, config: Config, mobile=False) -> None:
+    def __init__(
+            self,
+            user_key: str,
+            config: Config,
+            root_url: str,
+            page_url='',
+            mobile=False) -> None:
         self.config = config
         self.mobile = mobile
         self.user_key = user_key
+        self.root_url = root_url
+        self.page_url = page_url
         self.main_divs = ResultSet('')
         self._elements = 0
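For context, a quick illustration of the new clean_css helper (this example is not part of the commit; the input CSS and page URL are invented):

    # Hypothetical usage of clean_css as defined above.
    css_in = 'body { background: url(//cdn.example.com/bg.png) }'
    print(clean_css(css_in, 'https://example.com/page/'))
    # body { background:
    #     url(/element?type=image/png&url=https://cdn.example.com/bg.png) }

Note that, as written, every matched URL is proxied with a hardcoded type=image/png, and without the encrypt_path step that update_element_src applies to element sources.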
@@ -89,6 +120,7 @@ class Filter:
         self.remove_block_titles()
         self.remove_block_url()
         self.collapse_sections()
+        self.update_css(soup)
         self.update_styling(soup)
         self.remove_block_tabs(soup)
@@ -289,9 +321,28 @@ class Filter:
             element['src'] = BLANK_B64
             return
 
-        element[attr] = f'{Endpoint.element}?url=' + self.encrypt_path(
-            src,
-            is_element=True) + '&type=' + urlparse.quote(mime)
+        element[attr] = f'{self.root_url}/{Endpoint.element}?url=' + (
+            self.encrypt_path(
+                src,
+                is_element=True
+            ) + '&type=' + urlparse.quote(mime)
+        )
+
+    def update_css(self, soup) -> None:
+        """Updates URLs used in inline styles to be proxied by Whoogle
+        using the /element endpoint.
+
+        Returns:
+            None (The soup element is modified directly)
+
+        """
+        # Filter all <style> tags
+        for style in soup.find_all('style'):
+            style.string = clean_css(style.string, self.page_url)
+
+        # Convert remote stylesheets to style tags
+        for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
+            print(link)
+
     def update_styling(self, soup) -> None:
         # Remove unnecessary button(s)
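The second loop is only a stub at this point in the commit: it prints each stylesheet <link> rather than converting it. Purely as a sketch of what the conversion could look like (not code from this repo; the standard library's urljoin stands in for get_abs_url so the sketch stays self-contained):

    # Sketch only: one hypothetical way to inline remote stylesheets.
    from urllib.parse import urljoin
    import requests

    def inline_stylesheets(soup, page_url):
        # Replace each stylesheet <link> with an inline <style> tag whose
        # contents have been rewritten by clean_css.
        for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
            href = link.get('href')
            if not href:
                continue
            css = requests.get(urljoin(page_url, href)).text
            style = soup.new_tag('style')
            style.string = clean_css(css, page_url)
            link.replace_with(style)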
--- a/app/routes.py
+++ b/app/routes.py
@@ -489,7 +489,10 @@ def window():
     cipher_suite = Fernet(g.session_key)
     target_url = cipher_suite.decrypt(target_url.encode()).decode()
 
-    content_filter = Filter(g.session_key, config=g.user_config)
+    content_filter = Filter(
+        g.session_key,
+        root_url=request.url_root,
+        config=g.user_config)
     target = urlparse.urlparse(target_url)
     host_url = f'{target.scheme}://{target.netloc}'
@@ -514,10 +517,11 @@ def window():
         content_filter.update_element_src(script, 'application/javascript')
 
     # Replace all possible image attributes
+    img_sources = ['src', 'data-src', 'data-srcset', 'srcset']
     for img in results.find_all('img'):
         _ = [
             content_filter.update_element_src(img, 'image/png', attr=_)
-            for _ in ['src', 'data-src', 'data-srcset', 'srcset'] if img.has_attr(_)
+            for _ in img_sources if img.has_attr(_)
         ]
 
     # Replace all stylesheet sources
--- a/app/utils/misc.py
+++ b/app/utils/misc.py
@@ -3,6 +3,7 @@ from flask import Request
 import hashlib
 import os
 from requests import exceptions, get
+from urllib.parse import urlparse
 
 
 def gen_file_hash(path: str, static_file: str) -> str:
@@ -47,3 +48,14 @@ def check_for_update(version_url: str, current: str) -> int:
         has_update = ''
 
     return has_update
+
+
+def get_abs_url(url, page_url):
+    # Creates a valid absolute URL using a partial or relative URL
+    if url.startswith('//'):
+        return f'https:{url}'
+    elif url.startswith('/'):
+        return f'{urlparse(page_url).netloc}{url}'
+    elif url.startswith('./'):
+        return f'{page_url}{url[2:]}'
+    return url
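A few invented inputs make the branches concrete; note that the single-slash branch returns netloc plus path without re-attaching a scheme, exactly as written:

    get_abs_url('//cdn.example.com/a.png', 'https://example.com/page/')
    # -> 'https://cdn.example.com/a.png'
    get_abs_url('/img/b.png', 'https://example.com/page/')
    # -> 'example.com/img/b.png' (no scheme is prepended)
    get_abs_url('./c.png', 'https://example.com/page/')
    # -> 'https://example.com/page/c.png'
    get_abs_url('https://example.org/d.png', 'https://example.com/page/')
    # -> returned unchanged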
--- a/app/utils/search.py
+++ b/app/utils/search.py
@@ -56,6 +56,7 @@ class Search:
     """
    def __init__(self, request, config, session_key, cookies_disabled=False):
         method = request.method
+        self.request = request
         self.request_params = request.args if method == 'GET' else request.form
         self.user_agent = request.headers.get('User-Agent')
         self.feeling_lucky = False
@@ -115,6 +116,7 @@ class Search:
         mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
 
         content_filter = Filter(self.session_key,
+                                root_url=self.request.url_root,
                                 mobile=mobile,
                                 config=self.config)
         full_query = gen_query(self.query,