From 4c8ffaa3ba3d61d48c8fcd170e42ac64219c83bb Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Sat, 18 Jul 2020 11:32:52 -0600 Subject: [PATCH] Added backend site alternative redirects/text replacement By default, twitter/instagram/youtube will redirect to nitter/bibliogram/invidious respectively. Currently missing a front end for enabling/disabling this feature, but may want to set this as enabled until a user manually disables it. Also refactored util naming a bit, added filter utils to hold all non-class methods from filter.py --- app/__init__.py | 2 +- app/filter.py | 62 +++---------------- app/routes.py | 2 +- app/utils/filter_utils.py | 79 +++++++++++++++++++++++++ app/utils/routing_utils.py | 2 +- app/utils/{misc.py => session_utils.py} | 5 -- test/conftest.py | 2 +- test/test_misc.py | 2 +- test/test_results.py | 2 +- 9 files changed, 94 insertions(+), 64 deletions(-) create mode 100644 app/utils/filter_utils.py rename app/utils/{misc.py => session_utils.py} (62%) diff --git a/app/__init__.py b/app/__init__.py index 22e436d..f21d4b4 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,4 +1,4 @@ -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys from flask import Flask from flask_session import Session import os diff --git a/app/filter.py b/app/filter.py index 1cc9f87..3948bf2 100644 --- a/app/filter.py +++ b/app/filter.py @@ -1,56 +1,11 @@ from app.request import VALID_PARAMS -from app.utils.misc import BLACKLIST -from bs4 import BeautifulSoup +from app.utils.filter_utils import * from bs4.element import ResultSet from cryptography.fernet import Fernet import re import urllib.parse as urlparse from urllib.parse import parse_qs -SKIP_ARGS = ['ref_src', 'utm'] -FULL_RES_IMG = '
Full Image' -GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' -LOGO_URL = GOOG_IMG + '_desk' -BLANK_B64 = ''' - -''' - - -def get_first_link(soup): - # Replace hrefs with only the intended destination (no "utm" type tags) - for a in soup.find_all('a', href=True): - # Return the first search result URL - if 'url?q=' in a['href']: - return filter_link_args(a['href']) - - -def filter_link_args(query_link): - parsed_link = urlparse.urlparse(query_link) - link_args = parse_qs(parsed_link.query) - safe_args = {} - - if len(link_args) == 0 and len(parsed_link) > 0: - return query_link - - for arg in link_args.keys(): - if arg in SKIP_ARGS: - continue - - safe_args[arg] = link_args[arg] - - # Remove original link query and replace with filtered args - query_link = query_link.replace(parsed_link.query, '') - if len(safe_args) > 0: - query_link = query_link + urlparse.urlencode(safe_args, doseq=True) - else: - query_link = query_link.replace('?', '') - - return query_link - - -def has_ad_content(element: str): - return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element - class Filter: def __init__(self, user_keys: dict, mobile=False, config=None): @@ -61,6 +16,7 @@ class Filter: self.dark = config['dark'] if 'dark' in config else False self.nojs = config['nojs'] if 'nojs' in config else False self.new_tab = config['new_tab'] if 'new_tab' in config else False + self.alt_redirect = config['alts'] if 'alts' in config else True self.mobile = mobile self.user_keys = user_keys self.main_divs = ResultSet('') @@ -232,11 +188,11 @@ class Filter: else: link['href'] = href + # Replace link location + if self.alt_redirect: + link['href'] = get_site_alt(link['href']) + link_desc = link.find_all(text=re.compile('|'.join(SITE_ALTS.keys()))) + if len(link_desc) == 0: + return -def gen_nojs(sibling): - nojs_link = BeautifulSoup().new_tag('a') - nojs_link['href'] = '/window?location=' + sibling['href'] - nojs_link['style'] = 'display:block;width:100%;' - nojs_link.string = 'NoJS Link: ' + nojs_link['href'] - sibling.append(BeautifulSoup('


', 'html.parser')) - sibling.append(nojs_link) + link_desc[0].replace_with(get_site_alt(link_desc[0])) diff --git a/app/routes.py b/app/routes.py index 7f1869c..fd6278d 100644 --- a/app/routes.py +++ b/app/routes.py @@ -15,7 +15,7 @@ from requests import exceptions from app import app from app.models.config import Config from app.request import Request -from app.utils.misc import valid_user_session +from app.utils.session_utils import valid_user_session from app.utils.routing_utils import * diff --git a/app/utils/filter_utils.py b/app/utils/filter_utils.py new file mode 100644 index 0000000..ed05d76 --- /dev/null +++ b/app/utils/filter_utils.py @@ -0,0 +1,79 @@ +from bs4 import BeautifulSoup +import urllib.parse as urlparse +from urllib.parse import parse_qs + +SKIP_ARGS = ['ref_src', 'utm'] +FULL_RES_IMG = '
Full Image' +GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' +LOGO_URL = GOOG_IMG + '_desk' +BLANK_B64 = ''' + +''' + +BLACKLIST = [ + 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고', + 'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', + 'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés' +] + +SITE_ALTS = { + 'twitter.com': 'nitter.net', + 'youtube.com': 'invidio.us', + 'instagram.com': 'bibliogram.art/u' +} + + +def has_ad_content(element: str): + return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element + + +def get_first_link(soup): + # Replace hrefs with only the intended destination (no "utm" type tags) + for a in soup.find_all('a', href=True): + # Return the first search result URL + if 'url?q=' in a['href']: + return filter_link_args(a['href']) + + +def get_site_alt(link: str): + for site_key in SITE_ALTS.keys(): + if site_key not in link: + continue + + link = link.replace(site_key, SITE_ALTS[site_key]) + break + + return link + + +def filter_link_args(query_link): + parsed_link = urlparse.urlparse(query_link) + link_args = parse_qs(parsed_link.query) + safe_args = {} + + if len(link_args) == 0 and len(parsed_link) > 0: + return query_link + + for arg in link_args.keys(): + if arg in SKIP_ARGS: + continue + + safe_args[arg] = link_args[arg] + + # Remove original link query and replace with filtered args + query_link = query_link.replace(parsed_link.query, '') + if len(safe_args) > 0: + query_link = query_link + urlparse.urlencode(safe_args, doseq=True) + else: + query_link = query_link.replace('?', '') + + return query_link + + +def gen_nojs(sibling): + nojs_link = BeautifulSoup().new_tag('a') + nojs_link['href'] = '/window?location=' + sibling['href'] + nojs_link['style'] = 'display:block;width:100%;' + nojs_link.string = 'NoJS Link: ' + nojs_link['href'] + sibling.append(BeautifulSoup('


', 'html.parser')) + sibling.append(nojs_link) \ No newline at end of file diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py index 40f8a90..2a649b4 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -1,5 +1,5 @@ from app.filter import Filter, get_first_link -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys from app.request import gen_query from bs4 import BeautifulSoup from cryptography.fernet import Fernet, InvalidToken diff --git a/app/utils/misc.py b/app/utils/session_utils.py similarity index 62% rename from app/utils/misc.py rename to app/utils/session_utils.py index b87941d..f959abe 100644 --- a/app/utils/misc.py +++ b/app/utils/session_utils.py @@ -2,11 +2,6 @@ from cryptography.fernet import Fernet from flask import current_app as app REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys'] -BLACKLIST = [ - 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고', - 'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', - 'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés' -] def generate_user_keys(cookies_disabled=False) -> dict: diff --git a/test/conftest.py b/test/conftest.py index 63aec3e..7a15f00 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,5 +1,5 @@ from app import app -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys import pytest diff --git a/test/test_misc.py b/test/test_misc.py index 8eb1d78..92fcadb 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -1,4 +1,4 @@ -from app.utils.misc import generate_user_keys, valid_user_session +from app.utils.session_utils import generate_user_keys, valid_user_session def test_generate_user_keys(): diff --git a/test/test_results.py b/test/test_results.py index 463a355..a7aa771 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup from app.filter import Filter -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys from datetime import datetime from dateutil.parser import *