unify scholar

2021-12-13 17:21:41 +01:00 · 2021-12-13 17:21:41 +01:00 · 51bf35c2e4
commit 51bf35c2e4
parent d64589914f
8 changed files with 172 additions and 140 deletions
--- a/cps/metadata_provider/comicvine.py
+++ b/cps/metadata_provider/comicvine.py
@ -17,49 +17,68 @@
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.
 # ComicVine api document: https://comicvine.gamespot.com/api/documentation
 from typing import Dict, List, Optional
 from urllib.parse import quote
 import requests
-from cps.services.Metadata import Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 class ComicVine(Metadata):
    __name__ = "ComicVine"
    __id__ = "comicvine"
    DESCRIPTION = "ComicVine Books"
    META_URL = "https://comicvine.gamespot.com/"
    API_KEY = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
    BASE_URL = (
        f"https://comicvine.gamespot.com/api/search?api_key={API_KEY}"
        f"&resources=issue&query="
    )
    QUERY_PARAMS = "&sort=name:desc&format=json"
    HEADERS = {"User-Agent": "Not Evil Browser"}
-    def search(self, query, generic_cover=""):
+    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        val = list()
        apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
        if self.active:
-            headers = {
+            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
-                'User-Agent': 'Not Evil Browser'
+            if title_tokens:
-            }
+                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
-
+                query = "%20".join(tokens)
-            result = requests.get("https://comicvine.gamespot.com/api/search?api_key="
+            result = requests.get(
-                                  + apikey + "&resources=issue&query=" + query + "&sort=name:desc&format=json", headers=headers)
+                f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}",
-            for r in result.json()['results']:
+                headers=ComicVine.HEADERS,
-                seriesTitle = r['volume'].get('name', "")
+            )
-                if r.get('store_date'):
+            for result in result.json()["results"]:
-                    dateFomers = r.get('store_date')
+                match = self._parse_search_result(
-                else:
+                    result=result, generic_cover=generic_cover, locale=locale
-                    dateFomers = r.get('date_added')
+                )
-                v = dict()
+                val.append(match)
                v['id'] = r['id']
                v['title'] = seriesTitle + " #" + r.get('issue_number', "0") + " - " + ( r.get('name', "") or "")
                v['authors'] = r.get('authors', [])
                v['description'] = r.get('description', "")
                v['publisher'] = ""
                v['publishedDate'] = dateFomers
                v['tags'] = ["Comics", seriesTitle]
                v['rating'] = 0
                v['series'] = seriesTitle
                v['cover'] = r['image'].get('original_url', generic_cover)
                v['source'] = {
                    "id": self.__id__,
                    "description": "ComicVine Books",
                    "link": "https://comicvine.gamespot.com/"
                }
                v['url'] = r.get('site_detail_url', "")
                val.append(v)
        return val
-
+    def _parse_search_result(
        self, result: Dict, generic_cover: str, locale: str
    ) -> MetaRecord:
        series = result["volume"].get("name", "")
        series_index = result.get("issue_number", 0)
        issue_name = result.get("name", "")
        match = MetaRecord(
            id=result["id"],
            title=f"{series}#{series_index} - {issue_name}",
            authors=result.get("authors", []),
            url=result.get("site_detail_url", ""),
            source=MetaSourceInfo(
                id=self.__id__,
                description=ComicVine.DESCRIPTION,
                link=ComicVine.META_URL,
            ),
            series=series,
        )
        match.cover = result["image"].get("original_url", generic_cover)
        match.description = result.get("description", "")
        match.publishedDate = result.get("store_date", result.get("date_added"))
        match.series_index = series_index
        match.tags = ["Comics", series]
        match.identifiers = {"comicvine": match.id}
        return match
--- a/cps/metadata_provider/google.py
+++ b/cps/metadata_provider/google.py
@ -23,7 +23,7 @@ from urllib.parse import quote
 import requests
 from cps.isoLanguages import get_lang3, get_language_name
-from cps.services.Metadata import MetaRecord, Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 class Google(Metadata):
@ -56,38 +56,37 @@ class Google(Metadata):
    def _parse_search_result(
        self, result: Dict, generic_cover: str, locale: str
    ) -> MetaRecord:
-        match = dict()
+        match = MetaRecord(
-        match["id"] = result["id"]
+            id=result["id"],
-        match["title"] = result["volumeInfo"]["title"]
+            title=result["volumeInfo"]["title"],
-        match["authors"] = result["volumeInfo"].get("authors", [])
+            authors=result["volumeInfo"].get("authors", []),
-        match["url"] = Google.BOOK_URL + result["id"]
+            url=Google.BOOK_URL + result["id"],
-        match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover)
+            source=MetaSourceInfo(
-        match["description"] = result["volumeInfo"].get("description", "")
+                id=self.__id__,
-        match["languages"] = self._parse_languages(result=result, locale=locale)
+                description=Google.DESCRIPTION,
-        match["publisher"] = result["volumeInfo"].get("publisher", "")
+                link=Google.META_URL,
-        match["publishedDate"] = result["volumeInfo"].get("publishedDate", "")
+            ),
-        match["rating"] = result["volumeInfo"].get("averageRating", 0)
+        )
        match["series"], match["series_index"] = "", 1
        match["tags"] = result["volumeInfo"].get("categories", [])
-        match["source"] = {
+        match.cover = self._parse_cover(result=result, generic_cover=generic_cover)
-            "id": self.__id__,
+        match.description = result["volumeInfo"].get("description", "")
-            "description": Google.DESCRIPTION,
+        match.languages = self._parse_languages(result=result, locale=locale)
-            "link": Google.META_URL,
+        match.publisher = result["volumeInfo"].get("publisher", "")
-        }
+        match.publishedDate = result["volumeInfo"].get("publishedDate", "")
        match.rating = result["volumeInfo"].get("averageRating", 0)
        match.series, match.series_index = "", 1
        match.tags = result["volumeInfo"].get("categories", [])
-        match["identifiers"] = {
+        match.identifiers = {"google": match.id}
            "google": match.get("id"),
        }
        match = self._parse_isbn(result=result, match=match)
        return match
    @staticmethod
-    def _parse_isbn(result: Dict, match: Dict) -> Dict:
+    def _parse_isbn(result: Dict, match: MetaRecord) -> MetaRecord:
        identifiers = result["volumeInfo"].get("industryIdentifiers", [])
        for identifier in identifiers:
            if identifier.get("type") == Google.ISBN_TYPE:
-                match["identifiers"]["isbn"] = identifier.get("identifier")
+                match.identifiers["isbn"] = identifier.get("identifier")
                break
        return match
@ -100,7 +99,7 @@ class Google(Metadata):
    @staticmethod
    def _parse_languages(result: Dict, locale: str) -> List[str]:
-        language_iso2 = result.get("language", "")
+        language_iso2 = result["volumeInfo"].get("language", "")
        languages = (
            [get_language_name(locale, get_lang3(language_iso2))]
            if language_iso2
--- a/cps/metadata_provider/lubimyczytac.py
+++ b/cps/metadata_provider/lubimyczytac.py
@ -27,7 +27,7 @@ from html2text import HTML2Text
 from lxml.html import HtmlElement, fromstring, tostring
 from markdown2 import Markdown
-from cps.services.Metadata import MetaRecord, Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 SYMBOLS_TO_TRANSLATE = (
    "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
@ -158,61 +158,60 @@ class LubimyCzytacParser:
        self.root = root
        self.metadata = metadata
-    def parse_search_results(self) -> List[Dict]:
+    def parse_search_results(self) -> List[MetaRecord]:
        matches = []
        results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
        for result in results:
            title = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                f"{LubimyCzytac.TITLE_TEXT_PATH}",
+                      f"{LubimyCzytac.TITLE_TEXT_PATH}",
            )
            book_url = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                f"{LubimyCzytac.URL_PATH}",
+                      f"{LubimyCzytac.URL_PATH}",
            )
            authors = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
-                f"{LubimyCzytac.AUTHORS_PATH}",
+                      f"{LubimyCzytac.AUTHORS_PATH}",
                take_first=False,
            )
            if not all([title, book_url, authors]):
                continue
            matches.append(
-                {
+                MetaRecord(
-                    "id": book_url.replace(f"/ksiazka/", "").split("/")[0],
+                    id=book_url.replace(f"/ksiazka/", "").split("/")[0],
-                    "title": title,
+                    title=title,
-                    "authors": [strip_accents(author) for author in authors],
+                    authors=[strip_accents(author) for author in authors],
-                    "url": LubimyCzytac.BASE_URL + book_url,
+                    url=LubimyCzytac.BASE_URL + book_url,
-                }
+                    source=MetaSourceInfo(
                        id=self.metadata.__id__,
                        description=self.metadata.__name__,
                        link=LubimyCzytac.BASE_URL,
                    )
                )
            )
        return matches
-    def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord:
+    def parse_single_book(self, match: MetaRecord, generic_cover: str) -> MetaRecord:
-        response = requests.get(match.get("url"))
+        response = requests.get(match.url)
        self.root = fromstring(response.text)
-        match["cover"] = self._parse_cover(generic_cover=generic_cover)
+        match.cover = self._parse_cover(generic_cover=generic_cover)
-        match["description"] = self._parse_description()
+        match.description = self._parse_description()
-        match["languages"] = self._parse_languages()
+        match.languages = self._parse_languages()
-        match["publisher"] = self._parse_publisher()
+        match.publisher = self._parse_publisher()
-        match["publishedDate"] = self._parse_from_summary(
+        match.publishedDate = self._parse_from_summary(
            attribute_name="datePublished"
        )
-        match["rating"] = self._parse_rating()
+        match.rating = self._parse_rating()
-        match["series"], match["series_index"] = self._parse_series()
+        match.series, match.series_index = self._parse_series()
-        match["tags"] = self._parse_tags()
+        match.tags = self._parse_tags()
-
+        match.identifiers = {
        match["source"] = {
            "id": self.metadata.__id__,
            "description": self.metadata.__name__,
            "link": LubimyCzytac.BASE_URL,
        }
        match["identifiers"] = {
            "isbn": self._parse_isbn(),
-            "lubimyczytac": match["id"],
+            "lubimyczytac": match.id,
        }
        return match
--- a/cps/metadata_provider/scholar.py
+++ b/cps/metadata_provider/scholar.py
@ -15,47 +15,53 @@
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.
 import itertools
 from typing import Dict, List, Optional
 from urllib.parse import quote
 from scholarly import scholarly
-from cps.services.Metadata import Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 class scholar(Metadata):
    __name__ = "Google Scholar"
    __id__ = "googlescholar"
    META_URL = "https://scholar.google.com/"
-    def search(self, query, generic_cover=""):
+    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        val = list()
        if self.active:
-            scholar_gen = scholarly.search_pubs(' '.join(query.split('+')))
+            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
-            i = 0
+            if title_tokens:
-            for publication in scholar_gen:
+                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
-                v = dict()
+                query = " ".join(tokens)
-                v['id'] = "1234" # publication['bib'].get('title')
+            scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
-                v['title'] = publication['bib'].get('title')
+            for result in scholar_gen:
-                v['authors'] = publication['bib'].get('author', [])
+                match = self._parse_search_result(
-                v['description'] = publication['bib'].get('abstract', "")
+                    result=result, generic_cover=generic_cover, locale=locale
-                v['publisher'] = publication['bib'].get('venue', "")
+                )
-                if publication['bib'].get('pub_year'):
+                val.append(match)
                    v['publishedDate'] = publication['bib'].get('pub_year')+"-01-01"
                else:
                    v['publishedDate'] = ""
                v['tags'] = ""
                v['ratings'] = 0
                v['series'] = ""
                v['cover'] = generic_cover
                v['url'] = publication.get('pub_url') or publication.get('eprint_url') or "",
                v['source'] = {
                    "id": self.__id__,
                    "description": "Google Scholar",
                    "link": "https://scholar.google.com/"
                }
                val.append(v)
                i += 1
                if (i >= 10):
                    break
        return val
    def _parse_search_result(
        self, result: Dict, generic_cover: str, locale: str
    ) -> MetaRecord:
        match = MetaRecord(
            id=result.get("pub_url", result.get("eprint_url", "")),
            title=result["bib"].get("title"),
            authors=result["bib"].get("author", []),
            url=result.get("pub_url", result.get("eprint_url", "")),
            source=MetaSourceInfo(
                id=self.__id__, description=self.__name__, link=scholar.META_URL
            ),
        )
-
+        match.cover = result.get("image", {}).get("original_url", generic_cover)
        match.description = result["bib"].get("abstract", "")
        match.publisher = result["bib"].get("venue", "")
        match.publishedDate = result["bib"].get("pub_year") + "-01-01"
        match.identifiers = {"scholar": match.id}
        return match
--- a/cps/search_metadata.py
+++ b/cps/search_metadata.py
@ -22,6 +22,7 @@ import inspect
 import json
 import os
 import sys
 from dataclasses import asdict
 from flask import Blueprint, Response, request, url_for
 from flask_login import current_user
@ -99,11 +100,13 @@ def metadata_change_active_provider(prov_name):
        log.error("Invalid request received: {}".format(request))
        return "Invalid request", 400
    if "initial" in new_state and prov_name:
-        for c in cl:
+        data = []
-            if c.__id__ == prov_name:
+        provider = next((c for c in cl if c.__id__ == prov_name), None)
-                data = c.search(new_state.get("query", ""))
+        if provider is not None:
-                break
+            data = provider.search(new_state.get("query", ""))
-        return Response(json.dumps(data), mimetype="application/json")
+        return Response(
            json.dumps([asdict(x) for x in data]), mimetype="application/json"
        )
    return ""
@ -123,5 +126,5 @@ def metadata_search():
                if active.get(c.__id__, True)
            }
            for future in concurrent.futures.as_completed(meta):
-                data.extend(future.result())
+                data.extend([asdict(x) for x in future.result()])
    return Response(json.dumps(data), mimetype="application/json")
--- a/cps/services/Metadata.py
+++ b/cps/services/Metadata.py
@ -16,32 +16,38 @@
 #  You should have received a copy of the GNU General Public License
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.
 import abc
 import dataclasses
 import os
 import re
-from typing import Dict, Generator, List, Optional, TypedDict, Union
+from typing import Dict, Generator, List, Optional, Union
 from cps import constants
-class MetaSourceInfo(TypedDict):
+@dataclasses.dataclass
 class MetaSourceInfo:
    id: str
    description: str
    link: str
-class MetaRecord(TypedDict):
+@dataclasses.dataclass
 class MetaRecord:
    id: Union[str, int]
    title: str
    authors: List[str]
    url: str
    cover: str
    series: Optional[str]
    series_index: Optional[Union[int, float]]
    tags: Optional[List[str]]
    publisher: Optional[str]
    publishedDate: Optional[str]
    rating: Optional[int]
    description: Optional[str]
    source: MetaSourceInfo
-    languages: Optional[List[str]]
+    cover: str = os.path.join(constants.STATIC_DIR, 'generic_cover.jpg')
-    identifiers: Dict[str, Union[str, int]]
+    description: Optional[str] = ""
    series: Optional[str] = None
    series_index: Optional[Union[int, float]] = 0
    identifiers: Dict[str, Union[str, int]] = dataclasses.field(default_factory=dict)
    publisher: Optional[str] = None
    publishedDate: Optional[str] = None
    rating: Optional[int] = 0
    languages: Optional[List[str]] = dataclasses.field(default_factory=list)
    tags: Optional[List[str]] = dataclasses.field(default_factory=list)
 class Metadata:
--- a/optional-requirements.txt
+++ b/optional-requirements.txt
@ -32,6 +32,9 @@ SQLAlchemy-Utils>=0.33.5,<0.38.0
 # extracting metadata
 rarfile>=2.7
 scholarly>=1.2.0, <1.5
 markdown2==2.4.2
 html2text==2020.1.16
 python-dateutil==2.8.2
 # other
 natsort>=2.2.0,<8.1.0
--- a/requirements.txt
+++ b/requirements.txt
@ -14,6 +14,3 @@ Wand>=0.4.4,<0.7.0
 unidecode>=0.04.19,<1.3.0
 lxml>=3.8.0,<4.7.0
 flask-wtf>=0.14.2,<1.1.0
 markdown2==2.4.2
 html2text==2020.1.16
 python-dateutil==2.8.2