updating results.py for social-media-site-using-.env

This commit is contained in:
Shimul 2021-03-22 03:41:36 +05:30 committed by GitHub
parent 1a1782e3a0
commit 99e60b2f1e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -3,6 +3,10 @@ import os
import urllib.parse as urlparse import urllib.parse as urlparse
from urllib.parse import parse_qs from urllib.parse import parse_qs
# For .env reading
from dotenv import load_dotenv
load_dotenv(os.path.join(os.path.abspath(os.getcwd()), '.env'))
SKIP_ARGS = ['ref_src', 'utm'] SKIP_ARGS = ['ref_src', 'utm']
SKIP_PREFIX = ['//www.', '//mobile.', '//m.'] SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
GOOG_STATIC = 'www.gstatic.com' GOOG_STATIC = 'www.gstatic.com'
@ -12,6 +16,7 @@ BLANK_B64 = ('data:image/png;base64,'
'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkw' 'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkw'
'AIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC') 'AIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC')
# Ad keywords # Ad keywords
BLACKLIST = [ BLACKLIST = [
'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama',
@ -20,20 +25,42 @@ BLACKLIST = [
'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés', 'Anúncio' 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés', 'Anúncio'
] ]
# Alternative front-ends substituted for social media links in results.
# Filled in by get_alt_links() when this module is imported.
SITE_ALTS = {}

# (result domain, short env var, legacy env var, built-in default)
_SITE_ALT_SOURCES = (
    ('twitter.com', 'twitter', 'WHOOGLE_ALT_TW', 'nitter.net'),
    ('youtube.com', 'youtube', 'WHOOGLE_ALT_YT', 'invidious.snopyta.org'),
    ('instagram.com', 'instagram', 'WHOOGLE_ALT_IG', 'bibliogram.art/u'),
    ('reddit.com', 'reddit', 'WHOOGLE_ALT_RD', 'libredd.it'),
)


def get_alt_links() -> None:
    """Fills SITE_ALTS from the environment

    For each supported site the short variable (twitter, youtube,
    instagram, reddit) is used when it is set to a non-empty value.
    Otherwise the matching WHOOGLE_ALT_* variable is read, and the
    built-in default applies only when that variable is not set at all.

    Returns:
        None

    """
    print("Setting up alternative social media site..")
    for site, short_var, legacy_var, default in _SITE_ALT_SOURCES:
        # `or` treats an unset and an empty short variable the same way,
        # so a blank entry in .env cannot wipe out the fallback.
        SITE_ALTS[site] = (os.getenv(short_var)
                           or os.getenv(legacy_var, default))


get_alt_links()
def has_ad_content(element: str) -> bool:
    """Inspects an HTML element for ad related content

    The element counts as an ad when it contains the info marker glyph,
    or when its text equals one of the BLACKLIST keywords ignoring case.

    Args:
        element: The HTML element to inspect

    Returns:
        bool: True/False for the element containing an ad

    """
    # NOTE(review): the rendered source shows an empty string literal
    # here, which is contained in every string and would flag every
    # element as an ad. The circled "i" marker is assumed to be the
    # character that was lost -- confirm against the repository.
    if 'ⓘ' in element:
        return True

    # Whole-string, case-insensitive comparison against the ad keywords
    normalized = element.upper()
    return any(normalized == value.upper() for value in BLACKLIST)
@ -41,10 +68,13 @@ def has_ad_content(element: str) -> bool:
def get_first_link(soup: BeautifulSoup) -> str: def get_first_link(soup: BeautifulSoup) -> str:
"""Retrieves the first result link from the query response """Retrieves the first result link from the query response
Args: Args:
soup: The BeautifulSoup response body soup: The BeautifulSoup response body
Returns: Returns:
str: A str link to the first result str: A str link to the first result
""" """
# Replace hrefs with only the intended destination (no "utm" type tags) # Replace hrefs with only the intended destination (no "utm" type tags)
for a in soup.find_all('a', href=True): for a in soup.find_all('a', href=True):
@ -55,11 +85,15 @@ def get_first_link(soup: BeautifulSoup) -> str:
def get_site_alt(link: str) -> str: def get_site_alt(link: str) -> str:
"""Returns an alternative to a particular site, if one is configured """Returns an alternative to a particular site, if one is configured
Args: Args:
link: A string result URL to check against the SITE_ALTS map link: A string result URL to check against the SITE_ALTS map
Returns: Returns:
str: An updated (or ignored) result link str: An updated (or ignored) result link
""" """
for site_key in SITE_ALTS.keys(): for site_key in SITE_ALTS.keys():
if site_key not in link: if site_key not in link:
continue continue
@ -75,10 +109,13 @@ def get_site_alt(link: str) -> str:
def filter_link_args(link: str) -> str: def filter_link_args(link: str) -> str:
"""Filters out unnecessary URL args from a result link """Filters out unnecessary URL args from a result link
Args: Args:
link: The string result link to check for extraneous URL params link: The string result link to check for extraneous URL params
Returns: Returns:
str: An updated (or ignored) result link str: An updated (or ignored) result link
""" """
parsed_link = urlparse.urlparse(link) parsed_link = urlparse.urlparse(link)
link_args = parse_qs(parsed_link.query) link_args = parse_qs(parsed_link.query)
@ -105,10 +142,13 @@ def filter_link_args(link: str) -> str:
def append_nojs(result: BeautifulSoup) -> None: def append_nojs(result: BeautifulSoup) -> None:
"""Appends a no-Javascript alternative for a search result """Appends a no-Javascript alternative for a search result
Args: Args:
result: The search result to append a no-JS link to result: The search result to append a no-JS link to
Returns: Returns:
None None
""" """
nojs_link = BeautifulSoup(features='html.parser').new_tag('a') nojs_link = BeautifulSoup(features='html.parser').new_tag('a')
nojs_link['href'] = '/window?location=' + result['href'] nojs_link['href'] = '/window?location=' + result['href']