Merge remote-tracking branch 'amazon/master' into Develop

2022-01-27 19:12:33 +01:00 · 2022-01-27 19:12:33 +01:00 · e757be6953
commit e757be6953
parent 4f3c396450 477b202c38
1 changed files with 122 additions and 0 deletions
--- a/cps/metadata_provider/amazon.py
+++ b/cps/metadata_provider/amazon.py
@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+#  This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
+#    Copyright (C) 2022 quarz12
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import concurrent.futures
+import requests
+from bs4 import BeautifulSoup as BS  # requirement
+# import lxml     # requirement: product pages are parsed with BS(..., "lxml"), so lxml must be installed
+try:
+    import cchardet #optional for better speed
+except ImportError:
+    pass
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
+#from time import time
+from operator import itemgetter
+
+class Amazon(Metadata):
+    __name__ = "Amazon"
+    __id__ = "amazon"
+    headers = {'upgrade-insecure-requests': '1',
+               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
+               'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+               'sec-gpc': '1',
+               'sec-fetch-site': 'none',
+               'sec-fetch-mode': 'navigate',
+               'sec-fetch-user': '?1',
+               'sec-fetch-dest': 'document',
+               'accept-encoding': 'gzip, deflate, br',
+               'accept-language': 'en-US,en;q=0.9'}
+    session = requests.Session()
+    session.headers=headers
+
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ):
+        #timer=time()
+        def inner(link,index)->[dict,int]:
+             with self.session as session:
+                r = session.get(f"https://www.amazon.com/{link}")
+                r.raise_for_status()
+                long_soup = BS(r.text, "lxml")  #~4sec :/
+                soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"})
+                if soup2 is None:
+                    return
+                try:
+                    match = MetaRecord(
+                        title = "",
+                        authors = "",
+                        source=MetaSourceInfo(
+                            id=self.__id__,
+                            description="Amazon Books",
+                            link="https://amazon.com/"
+                        ),
+                        url = f"https://www.amazon.com/{link}",
+                        #the more searches the slower, these are too hard to find in reasonable time or might not even exist
+                        publisher= "",  # very unreliable
+                        publishedDate= "",  # very unreliable
+                        id = None,  # ?
+                        tags = []  # dont exist on amazon
+                    )
+
+                    try:
+                        match.description = "\n".join(
+                            soup2.find("div", attrs={"data-feature-name": "bookDescription"}).stripped_strings)\
+                                                .replace("\xa0"," ")[:-9].strip().strip("\n")
+                    except (AttributeError, TypeError):
+                        return None  # if there is no description it is not a book and therefore should be ignored
+                    try:
+                        match.title = soup2.find("span", attrs={"id": "productTitle"}).text
+                    except (AttributeError, TypeError):
+                        match.title = ""
+                    try:
+                        match.authors = [next(
+                            filter(lambda i: i != " " and i != "\n" and not i.startswith("{"),
+                                   x.findAll(text=True))).strip()
+                                        for x in soup2.findAll("span", attrs={"class": "author"})]
+                    except (AttributeError, TypeError, StopIteration):
+                        match.authors = ""
+                    try:
+                        match.rating = int(
+                            soup2.find("span", class_="a-icon-alt").text.split(" ")[0].split(".")[
+                                0])  # first number in string
+                    except (AttributeError, ValueError):
+                        match.rating = 0
+                    try:
+                        match.cover = soup2.find("img", attrs={"class": "a-dynamic-image frontImage"})["src"]
+                    except (AttributeError, TypeError):
+                        match.cover = ""
+                    return match, index
+                except Exception as e:
+                    print(e)
+                    return
+
+        val = list()
+        if self.active:
+            results = self.session.get(
+                f"https://www.amazon.com/s?k={query.replace(' ', '+')}&i=digital-text&sprefix={query.replace(' ', '+')}"
+                f"%2Cdigital-text&ref=nb_sb_noss",
+                headers=self.headers)
+            results.raise_for_status()
+            soup = BS(results.text, 'html.parser')
+            links_list = [next(filter(lambda i: "digital-text" in i["href"], x.findAll("a")))["href"] for x in
+                          soup.findAll("div", attrs={"data-component-type": "s-search-result"})]
+            with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+                fut = {executor.submit(inner, link, index) for index, link in enumerate(links_list[:5])}
+                val=list(map(lambda x : x.result() ,concurrent.futures.as_completed(fut)))
+        result=list(filter(lambda x: x, val))
+        return [x[0] for x in sorted(result, key=itemgetter(1))] #sort by amazons listing order for best relevance