Merge remote-tracking branch 'lubimyczytac/add_lubimyczytac.pl_meta_provider' into Develop

# Conflicts: # optional-requirements.txt
2022-01-27 18:37:02 +01:00 · 2022-01-27 18:37:02 +01:00 · 4f3c396450
commit 4f3c396450
parent 6339d25af0 20b5a9a2c0
11 changed files with 1163 additions and 1173 deletions
--- a/cps/metadata_provider/comicvine.py
+++ b/cps/metadata_provider/comicvine.py
@ -17,49 +17,68 @@
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.

 # ComicVine api document: https://comicvine.gamespot.com/api/documentation
+from typing import Dict, List, Optional
+from urllib.parse import quote

 import requests
-from cps.services.Metadata import Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata


 class ComicVine(Metadata):
    __name__ = "ComicVine"
    __id__ = "comicvine"
+    DESCRIPTION = "ComicVine Books"
+    META_URL = "https://comicvine.gamespot.com/"
+    API_KEY = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
+    BASE_URL = (
+        f"https://comicvine.gamespot.com/api/search?api_key={API_KEY}"
+        f"&resources=issue&query="
+    )
+    QUERY_PARAMS = "&sort=name:desc&format=json"
+    HEADERS = {"User-Agent": "Not Evil Browser"}

-    def search(self, query, generic_cover=""):
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
+        """Search ComicVine issues matching ``query``.
+
+        :param query: free-text title; its tokens are URL-quoted before use.
+        :param generic_cover: cover URL used when a hit has no image.
+        :param locale: passed through to the result parser.
+        :return: one ``MetaRecord`` per hit; empty when the provider is inactive.
+        """
        val = list()
-        apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
        if self.active:
-            headers = {
-                'User-Agent': 'Not Evil Browser'
-            }
-
-            result = requests.get("https://comicvine.gamespot.com/api/search?api_key="
-                                  + apikey + "&resources=issue&query=" + query + "&sort=name:desc&format=json", headers=headers)
-            for r in result.json().get('results'):
-                seriesTitle = r['volume'].get('name', "")
-                if r.get('store_date'):
-                    dateFomers = r.get('store_date')
-                else:
-                    dateFomers = r.get('date_added')
-                v = dict()
-                v['id'] = r['id']
-                v['title'] = seriesTitle + " #" + r.get('issue_number', "0") + " - " + ( r.get('name', "") or "")
-                v['authors'] = r.get('authors', [])
-                v['description'] = r.get('description', "")
-                v['publisher'] = ""
-                v['publishedDate'] = dateFomers
-                v['tags'] = ["Comics", seriesTitle]
-                v['rating'] = 0
-                v['series'] = seriesTitle
-                v['cover'] = r['image'].get('original_url')
-                v['source'] = {
-                    "id": self.__id__,
-                    "description": "ComicVine Books",
-                    "link": "https://comicvine.gamespot.com/"
-                }
-                v['url'] = r.get('site_detail_url', "")
-                val.append(v)
+            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
+            if title_tokens:
+                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
+                query = "%20".join(tokens)
+            response = requests.get(
+                f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}",
+                headers=ComicVine.HEADERS,
+            )
+            # Keep the response and the per-hit dict under separate names, and
+            # tolerate a payload whose "results" entry is missing or null.
+            for entry in response.json().get("results") or []:
+                val.append(
+                    self._parse_search_result(
+                        result=entry, generic_cover=generic_cover, locale=locale
+                    )
+                )
        return val

-
+    def _parse_search_result(
+        self, result: Dict, generic_cover: str, locale: str
+    ) -> MetaRecord:
+        """Convert one ComicVine search hit into a ``MetaRecord``.
+
+        :param result: a single entry of the API's ``results`` list.
+        :param generic_cover: cover URL used when the hit has no image.
+        :param locale: accepted for interface parity; not used here.
+        :return: the populated record.
+        """
+        series = result["volume"].get("name", "")
+        series_index = result.get("issue_number", 0)
+        # "name" may be present but null; avoid a literal "None" in the title.
+        issue_name = result.get("name") or ""
+        match = MetaRecord(
+            id=result["id"],
+            title=f"{series} #{series_index} - {issue_name}",
+            authors=result.get("authors", []),
+            url=result.get("site_detail_url", ""),
+            source=MetaSourceInfo(
+                id=self.__id__,
+                description=ComicVine.DESCRIPTION,
+                link=ComicVine.META_URL,
+            ),
+            series=series,
+        )
+        # A null "image" or a null "original_url" both fall back to the generic cover.
+        match.cover = (result.get("image") or {}).get("original_url") or generic_cover
+        match.description = result.get("description", "")
+        # dict.get's default only covers a missing key; "or" also covers a
+        # store_date that is present but null.
+        match.publishedDate = result.get("store_date") or result.get("date_added")
+        match.series_index = series_index
+        match.tags = ["Comics", series]
+        match.identifiers = {"comicvine": match.id}
+        return match
--- a/cps/metadata_provider/google.py
+++ b/cps/metadata_provider/google.py
@ -17,39 +17,93 @@
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.

 # Google Books api document: https://developers.google.com/books/docs/v1/using
-
+from typing import Dict, List, Optional
+from urllib.parse import quote

 import requests
-from cps.services.Metadata import Metadata
+
+from cps.isoLanguages import get_lang3, get_language_name
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
+

 class Google(Metadata):
    __name__ = "Google"
    __id__ = "google"
+    DESCRIPTION = "Google Books"
+    META_URL = "https://books.google.com/"
+    BOOK_URL = "https://books.google.com/books?id="
+    SEARCH_URL = "https://www.googleapis.com/books/v1/volumes?q="
+    ISBN_TYPE = "ISBN_13"

-    def search(self, query, generic_cover=""):
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
+        """Search Google Books volumes matching ``query``.
+
+        :param query: free-text title; its tokens are URL-quoted and joined by "+".
+        :param generic_cover: cover URL used when a volume has no thumbnail.
+        :param locale: locale used to render language names.
+        :return: one ``MetaRecord`` per hit; empty when inactive or nothing matched.
+        """
+        val = list()
        if self.active:
-            val = list()
-            result = requests.get("https://www.googleapis.com/books/v1/volumes?q="+query.replace(" ","+"))
-            for r in result.json().get('items'):
-                v = dict()
-                v['id'] = r['id']
-                v['title'] = r['volumeInfo'].get('title',"")
-                v['authors'] = r['volumeInfo'].get('authors', [])
-                v['description'] = r['volumeInfo'].get('description', "")
-                v['publisher'] = r['volumeInfo'].get('publisher', "")
-                v['publishedDate'] = r['volumeInfo'].get('publishedDate', "")
-                v['tags'] = r['volumeInfo'].get('categories', [])
-                v['rating'] = r['volumeInfo'].get('averageRating', 0)
-                if r['volumeInfo'].get('imageLinks'):
-                    v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://")
-                else:
-                    v['cover'] = "/../../../static/generic_cover.jpg"
-                v['source'] = {
-                    "id": self.__id__,
-                    "description": "Google Books",
-                    "link": "https://books.google.com/"}
-                v['url'] = "https://books.google.com/books?id=" + r['id']
-                val.append(v)
-            return val

+            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
+            if title_tokens:
+                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
+                query = "+".join(tokens)
+            results = requests.get(Google.SEARCH_URL + query)
+            # A response without hits has no "items" key, so indexing it would
+            # raise KeyError instead of yielding an empty result list.
+            for result in results.json().get("items") or []:
+                val.append(
+                    self._parse_search_result(
+                        result=result, generic_cover=generic_cover, locale=locale
+                    )
+                )
+        return val

+    def _parse_search_result(
+        self, result: Dict, generic_cover: str, locale: str
+    ) -> MetaRecord:
+        """Convert one Google Books volume into a ``MetaRecord``.
+
+        :param result: a single entry of the API's ``items`` list.
+        :param generic_cover: cover URL used when the volume has no thumbnail.
+        :param locale: locale used to render the volume's language name.
+        :return: the populated record, including google and (if any) isbn ids.
+        """
+        volume_info = result["volumeInfo"]
+        match = MetaRecord(
+            id=result["id"],
+            # Fall back to "" rather than raising KeyError on a title-less volume.
+            title=volume_info.get("title", ""),
+            authors=volume_info.get("authors", []),
+            url=Google.BOOK_URL + result["id"],
+            source=MetaSourceInfo(
+                id=self.__id__,
+                description=Google.DESCRIPTION,
+                link=Google.META_URL,
+            ),
+        )
+
+        match.cover = self._parse_cover(result=result, generic_cover=generic_cover)
+        match.description = volume_info.get("description", "")
+        match.languages = self._parse_languages(result=result, locale=locale)
+        match.publisher = volume_info.get("publisher", "")
+        match.publishedDate = volume_info.get("publishedDate", "")
+        match.rating = volume_info.get("averageRating", 0)
+        match.series, match.series_index = "", 1
+        match.tags = volume_info.get("categories", [])
+
+        match.identifiers = {"google": match.id}
+        match = self._parse_isbn(result=result, match=match)
+        return match
+
+    @staticmethod
+    def _parse_isbn(result: Dict, match: MetaRecord) -> MetaRecord:
+        """Store the first ``ISBN_TYPE`` identifier of ``result`` on ``match``.
+
+        ``match.identifiers`` is left untouched when the volume lists no such
+        identifier. The same ``match`` object is returned.
+        """
+        isbn_entry = next(
+            (
+                entry
+                for entry in result["volumeInfo"].get("industryIdentifiers", [])
+                if entry.get("type") == Google.ISBN_TYPE
+            ),
+            None,
+        )
+        if isbn_entry is not None:
+            match.identifiers["isbn"] = isbn_entry.get("identifier")
+        return match
+
+    @staticmethod
+    def _parse_cover(result: Dict, generic_cover: str) -> str:
+        """Return the volume's thumbnail URL forced to https, else ``generic_cover``.
+
+        The generic cover is also used when ``imageLinks`` exists but carries no
+        ``thumbnail`` entry, instead of raising KeyError.
+        """
+        image_links = result["volumeInfo"].get("imageLinks") or {}
+        cover_url = image_links.get("thumbnail")
+        if not cover_url:
+            return generic_cover
+        return cover_url.replace("http://", "https://")
+
+    @staticmethod
+    def _parse_languages(result: Dict, locale: str) -> List[str]:
+        """Return the volume's language name rendered for ``locale``.
+
+        The list is empty when the volume declares no language, otherwise it
+        holds exactly one name.
+        """
+        language_iso2 = result["volumeInfo"].get("language", "")
+        if not language_iso2:
+            return []
+        return [get_language_name(locale, get_lang3(language_iso2))]
--- a/cps/metadata_provider/lubimyczytac.py
+++ b/cps/metadata_provider/lubimyczytac.py
@ -0,0 +1,337 @@
+# -*- coding: utf-8 -*-
+#  This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
+#    Copyright (C) 2021 OzzieIsaacs
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program. If not, see <http://www.gnu.org/licenses/>.
+import datetime
+import json
+import re
+from multiprocessing.pool import ThreadPool
+from typing import List, Optional, Tuple, Union
+from urllib.parse import quote
+
+import requests
+from dateutil import parser
+from html2text import HTML2Text
+from lxml.html import HtmlElement, fromstring, tostring
+from markdown2 import Markdown
+
+from cps.isoLanguages import get_language_name
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
+
+# Accented characters (first string) and their plain replacements (second
+# string), paired position by position.
+SYMBOLS_TO_TRANSLATE = (
+    "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
+    "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
+)
+# Code-point table in the form expected by str.translate.
+SYMBOL_TRANSLATION_MAP = {
+    ord(accented): ord(plain)
+    for accented, plain in zip(*SYMBOLS_TO_TRANSLATE)
+}
+
+
+def get_int_or_float(value: str) -> Union[int, float]:
+    """Parse ``value`` as a number, returning an int when it has no fraction."""
+    as_float = float(value)
+    as_int = int(as_float)
+    if as_float != as_int:
+        return as_float
+    return as_int
+
+
+def strip_accents(s: Optional[str]) -> Optional[str]:
+    """Replace the characters listed in SYMBOL_TRANSLATION_MAP; pass None through."""
+    if s is None:
+        return s
+    return s.translate(SYMBOL_TRANSLATION_MAP)
+
+
+def sanitize_comments_html(html: str) -> str:
+    """Round-trip ``html`` through text (``html2text``) and back via Markdown."""
+    return Markdown().convert(html2text(html))
+
+
+def html2text(html: str) -> str:
+    """Render ``html`` as text with ``HTML2Text``.
+
+    Bytes input is decoded as UTF-8 first. Lines are not wrapped, single line
+    breaks are used and ``*`` is the emphasis mark.
+    """
+    if isinstance(html, bytes):
+        html = html.decode("utf-8")
+    # <u> becomes emphasis in html2text, so rewrite <u>/</u> tags to <span>.
+    underline_tag = re.compile(r"<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>")
+    html = underline_tag.sub(r"<\g<solidus>span\g<rest>>", html)
+    converter = HTML2Text()
+    converter.body_width = 0
+    converter.single_line_break = True
+    converter.emphasis_mark = "*"
+    return converter.handle(html)
+
+
+class LubimyCzytac(Metadata):
+    """Metadata provider that scrapes search and book pages of lubimyczytac.pl.
+
+    The class attributes below are the XPath expressions used by
+    ``LubimyCzytacParser`` to pull fields out of those pages.
+    """
+
+    __name__ = "LubimyCzytac.pl"
+    __id__ = "lubimyczytac"
+
+    BASE_URL = "https://lubimyczytac.pl"
+
+    # Search-result page.
+    BOOK_SEARCH_RESULT_XPATH = (
+        "*//div[@class='listSearch']//div[@class='authorAllBooks__single']"
+    )
+    SINGLE_BOOK_RESULT_XPATH = ".//div[contains(@class,'authorAllBooks__singleText')]"
+    TITLE_PATH = "/div/a[contains(@class,'authorAllBooks__singleTextTitle')]"
+    TITLE_TEXT_PATH = f"{TITLE_PATH}//text()"
+    URL_PATH = f"{TITLE_PATH}/@href"
+    AUTHORS_PATH = "/div/a[contains(@href,'autor')]//text()"
+
+    SIBLINGS = "/following-sibling::dd"
+
+    # Single-book page.
+    CONTAINER = "//section[@class='container book']"
+    PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
+    LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
+    DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
+    SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]/text()"
+
+    DETAILS = "//div[@id='book-details']"
+    # Deliberately unterminated: the two expressions below complete it.
+    PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
+    FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
+    FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
+    TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"
+
+    RATING = "//meta[@property='books:rating:value']/@content"
+    COVER = "//meta[@property='og:image']/@content"
+    ISBN = "//meta[@property='books:isbn']/@content"
+    META_TITLE = "//meta[@property='og:description']/@content"
+
+    SUMMARY = "//script[@type='application/ld+json']//text()"
+
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
+        """Search lubimyczytac.pl for ``query`` and fetch details of every hit.
+
+        :param query: free-text title.
+        :param generic_cover: cover URL used when a book page has no cover.
+        :param locale: locale used to render language names.
+        :return: one ``MetaRecord`` per hit; an empty list when the provider is
+            inactive, the query has no searchable token, or nothing matched.
+        """
+        if not self.active:
+            # Return a list, not None, so callers can always iterate the result.
+            return []
+        search_url = self._prepare_query(title=query)
+        if not search_url:
+            # _prepare_query returns "" when no token survives; requests.get("")
+            # would raise instead of yielding no results.
+            return []
+        result = requests.get(search_url)
+        root = fromstring(result.text)
+        lc_parser = LubimyCzytacParser(root=root, metadata=self)
+        matches = lc_parser.parse_search_results()
+        if not matches:
+            return matches
+        # Each hit needs its own page fetched; do that concurrently.
+        with ThreadPool(processes=10) as pool:
+            return pool.starmap(
+                lc_parser.parse_single_book,
+                [(match, generic_cover, locale) for match in matches],
+            )
+
+    def _prepare_query(self, title: str) -> str:
+        """Build the search URL for ``title``, or "" when no token is left.
+
+        Strips ``?``, ``(``, ``)`` and ``/``, turns underscores into spaces and
+        drops everything from the first ``"`` or ``,,`` onwards before
+        tokenising and URL-quoting.
+        """
+        query = ""
+        # Raw string: "\?" and "\/" are invalid escapes in a normal literal.
+        characters_to_remove = r"\?()\/"
+        pattern = "[" + characters_to_remove + "]"
+        title = re.sub(pattern, "", title)
+        title = title.replace("_", " ")
+        if '"' in title or ",," in title:
+            title = title.split('"')[0].split(",,")[0]
+
+        # NOTE(review): "/" was already stripped above, so this branch looks
+        # unreachable; kept unchanged pending confirmation of the intent.
+        if "/" in title:
+            title_tokens = [
+                token for token in title.lower().split(" ") if len(token) > 1
+            ]
+        else:
+            title_tokens = list(self.get_title_tokens(title, strip_joiners=False))
+        if title_tokens:
+            tokens = [quote(t.encode("utf-8")) for t in title_tokens]
+            query = query + "%20".join(tokens)
+        if not query:
+            return ""
+        return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
+
+
+class LubimyCzytacParser:
+    """Extract ``MetaRecord`` data from lubimyczytac.pl HTML.
+
+    ``root`` is the parsed page the ``_parse_*`` helpers read from; ``metadata``
+    is the provider whose id and name are stamped on every record.
+    """
+
+    PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
+    PUBLISH_DATE_TEMPLATE = "<p id='pierwsze_wydanie'>Data pierwszego wydania: {0}</p>"
+    PUBLISH_DATE_PL_TEMPLATE = (
+        "<p id='pierwsze_wydanie'>Data pierwszego wydania w Polsce: {0}</p>"
+    )
+
+    def __init__(self, root: HtmlElement, metadata: Metadata) -> None:
+        self.root = root
+        self.metadata = metadata
+
+    def parse_search_results(self) -> List[MetaRecord]:
+        """Return a bare record (id, title, authors, url) per search hit.
+
+        Hits missing a title, a URL or authors are skipped.
+        """
+        matches = []
+        results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
+        for result in results:
+            title = self._parse_xpath_node(
+                root=result,
+                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
+                f"{LubimyCzytac.TITLE_TEXT_PATH}",
+            )
+
+            book_url = self._parse_xpath_node(
+                root=result,
+                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
+                f"{LubimyCzytac.URL_PATH}",
+            )
+            authors = self._parse_xpath_node(
+                root=result,
+                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
+                f"{LubimyCzytac.AUTHORS_PATH}",
+                take_first=False,
+            )
+            if not all([title, book_url, authors]):
+                continue
+            matches.append(
+                MetaRecord(
+                    id=book_url.replace("/ksiazka/", "").split("/")[0],
+                    title=title,
+                    authors=[strip_accents(author) for author in authors],
+                    url=LubimyCzytac.BASE_URL + book_url,
+                    source=MetaSourceInfo(
+                        id=self.metadata.__id__,
+                        description=self.metadata.__name__,
+                        link=LubimyCzytac.BASE_URL,
+                    ),
+                )
+            )
+        return matches
+
+    def parse_single_book(
+        self, match: MetaRecord, generic_cover: str, locale: str
+    ) -> MetaRecord:
+        """Download ``match.url`` and fill in the remaining fields of ``match``.
+
+        :param match: record produced by ``parse_search_results``.
+        :param generic_cover: cover URL used when the page has no cover.
+        :param locale: locale used to render language names.
+        :return: the same ``match`` object, now fully populated.
+        """
+        response = requests.get(match.url)
+        # This method is run concurrently on a single shared parser instance.
+        # Reassigning self.root here would let one worker read another worker's
+        # page, so every book gets a parser of its own.
+        book = LubimyCzytacParser(
+            root=fromstring(response.text), metadata=self.metadata
+        )
+        match.cover = book._parse_cover(generic_cover=generic_cover)
+        match.description = book._parse_description()
+        match.languages = book._parse_languages(locale=locale)
+        match.publisher = book._parse_publisher()
+        match.publishedDate = book._parse_from_summary(attribute_name="datePublished")
+        match.rating = book._parse_rating()
+        match.series, match.series_index = book._parse_series()
+        match.tags = book._parse_tags()
+        match.identifiers = {
+            "isbn": book._parse_isbn(),
+            "lubimyczytac": match.id,
+        }
+        return match
+
+    def _parse_xpath_node(
+        self,
+        xpath: str,
+        root: Optional[HtmlElement] = None,
+        take_first: bool = True,
+        strip_element: bool = True,
+    ) -> Optional[Union[str, List[str]]]:
+        """Evaluate ``xpath`` against ``root`` (default: ``self.root``).
+
+        Returns None when nothing matches. Otherwise returns the first result
+        (stripped unless ``strip_element`` is False) or, with
+        ``take_first=False``, the list of all stripped results.
+        """
+        root = root if root is not None else self.root
+        node = root.xpath(xpath)
+        if not node:
+            return None
+        return (
+            (node[0].strip() if strip_element else node[0])
+            if take_first
+            else [x.strip() for x in node]
+        )
+
+    def _parse_cover(self, generic_cover) -> Optional[str]:
+        """Return the page's og:image URL, else ``generic_cover``."""
+        return (
+            self._parse_xpath_node(xpath=LubimyCzytac.COVER, take_first=True)
+            or generic_cover
+        )
+
+    def _parse_publisher(self) -> Optional[str]:
+        """Return the publisher name, or None when the page lists none."""
+        return self._parse_xpath_node(xpath=LubimyCzytac.PUBLISHER, take_first=True)
+
+    def _parse_languages(self, locale: str) -> List[str]:
+        """Return language names for ``locale``; only Polish and English are mapped."""
+        languages = list()
+        lang = self._parse_xpath_node(xpath=LubimyCzytac.LANGUAGES, take_first=True)
+        if lang:
+            if "polski" in lang:
+                languages.append("pol")
+            if "angielski" in lang:
+                languages.append("eng")
+        return [get_language_name(locale, language) for language in languages]
+
+    def _parse_series(self) -> Tuple[Optional[str], Optional[Union[float, int]]]:
+        """Return ``(series name, volume number)`` from a "Name (tom N)" label.
+
+        ``(None, None)`` is returned when the page has no series label or the
+        label carries no " (tom " part. The index is 0 when the volume part is
+        not numeric; for bundles such as "1-3" the first volume is used.
+        """
+        series_index = 0
+        series = self._parse_xpath_node(xpath=LubimyCzytac.SERIES, take_first=True)
+        # Test for the exact separator that is split on: a bare "tom " also
+        # occurs inside ordinary words and would make the split fail.
+        if series and " (tom " in series:
+            series_name, _, series_info = series.partition(" (tom ")
+            series_info = series_info.replace(" ", "").replace(")", "")
+            # Check if book is not a bundle, i.e. chapter 1-3
+            if "-" in series_info:
+                series_info = series_info.split("-", 1)[0]
+            if series_info.replace(".", "").isdigit() is True:
+                series_index = get_int_or_float(series_info)
+            return series_name, series_index
+        return None, None
+
+    def _parse_tags(self) -> List[str]:
+        """Return the breadcrumb categories with accents stripped."""
+        # No breadcrumb match yields None, which must not be iterated.
+        tags = self._parse_xpath_node(xpath=LubimyCzytac.TAGS, take_first=False) or []
+        return [
+            strip_accents(w.replace(", itd.", " itd."))
+            for w in tags
+            if isinstance(w, str)
+        ]
+
+    def _parse_from_summary(self, attribute_name: str) -> Optional[str]:
+        """Return ``attribute_name`` from the page's ld+json block, if present."""
+        value = None
+        summary_text = self._parse_xpath_node(xpath=LubimyCzytac.SUMMARY)
+        if summary_text:
+            data = json.loads(summary_text)
+            value = data.get(attribute_name)
+        # JSON values need not be strings (e.g. a numeric page count).
+        return value.strip() if isinstance(value, str) else value
+
+    def _parse_rating(self) -> Optional[int]:
+        """Return the page rating halved and rounded, or None when absent."""
+        rating = self._parse_xpath_node(xpath=LubimyCzytac.RATING)
+        return round(float(rating.replace(",", ".")) / 2) if rating else rating
+
+    def _parse_date(self, xpath="first_publish") -> Optional[datetime.datetime]:
+        """Parse the first-publication date.
+
+        ``xpath`` selects which one: "first_publish" (original edition) or
+        "first_publish_pl" (Polish edition).
+        """
+        options = {
+            "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
+            "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
+        }
+        date = self._parse_xpath_node(xpath=options.get(xpath))
+        return parser.parse(date) if date else None
+
+    def _parse_isbn(self) -> Optional[str]:
+        """Return the books:isbn meta value, or None when absent."""
+        return self._parse_xpath_node(xpath=LubimyCzytac.ISBN)
+
+    def _parse_description(self) -> str:
+        """Return the book description as sanitised HTML plus extra info.
+
+        Uses the collapsible description block when present (with its "source"
+        paragraphs removed), otherwise the og:description meta value.
+        """
+        description = ""
+        description_node = self._parse_xpath_node(
+            xpath=LubimyCzytac.DESCRIPTION, strip_element=False
+        )
+        if description_node is not None:
+            for source in self.root.xpath('//p[@class="source"]'):
+                source.getparent().remove(source)
+            description = tostring(description_node, method="html")
+            description = sanitize_comments_html(description)
+
+        else:
+            description_node = self._parse_xpath_node(xpath=LubimyCzytac.META_TITLE)
+            if description_node is not None:
+                description = description_node
+                description = sanitize_comments_html(description)
+        description = self._add_extra_info_to_description(description=description)
+        return description
+
+    def _add_extra_info_to_description(self, description: str) -> str:
+        """Append page count and first-publication dates to ``description``."""
+        pages = self._parse_from_summary(attribute_name="numberOfPages")
+        if pages:
+            description += LubimyCzytacParser.PAGES_TEMPLATE.format(pages)
+
+        first_publish_date = self._parse_date()
+        if first_publish_date:
+            description += LubimyCzytacParser.PUBLISH_DATE_TEMPLATE.format(
+                first_publish_date.strftime("%d.%m.%Y")
+            )
+
+        first_publish_date_pl = self._parse_date(xpath="first_publish_pl")
+        if first_publish_date_pl:
+            description += LubimyCzytacParser.PUBLISH_DATE_PL_TEMPLATE.format(
+                first_publish_date_pl.strftime("%d.%m.%Y")
+            )
+
+        return description
--- a/cps/metadata_provider/scholar.py
+++ b/cps/metadata_provider/scholar.py
@ -15,46 +15,52 @@
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.
+import itertools
+from typing import Dict, List, Optional
+from urllib.parse import quote

 from scholarly import scholarly

-from cps.services.Metadata import Metadata
+from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata

 class scholar(Metadata):
    __name__ = "Google Scholar"
    __id__ = "googlescholar"
+    META_URL = "https://scholar.google.com/"

-    def search(self, query, generic_cover=""):
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
+        """Search Google Scholar for ``query`` and return at most ten records.
+
+        :param query: free-text title; tokenised and URL-quoted before use.
+        :param generic_cover: cover URL handed to the result parser.
+        :param locale: passed through to the result parser.
+        :return: parsed hits; empty when the provider is inactive.
+        """
        val = list()
        if self.active:
-            scholar_gen = scholarly.search_pubs(' '.join(query.split('+')))
-            i = 0
-            for publication in scholar_gen:
-                v = dict()
-                v['id'] = publication['url_scholarbib'].split(':')[1]
-                v['title'] = publication['bib'].get('title')
-                v['authors'] = publication['bib'].get('author', [])
-                v['description'] = publication['bib'].get('abstract', "")
-                v['publisher'] = publication['bib'].get('venue', "")
-                if publication['bib'].get('pub_year'):
-                    v['publishedDate'] = publication['bib'].get('pub_year')+"-01-01"
-                else:
-                    v['publishedDate'] = ""
-                v['tags'] = []
-                v['rating'] = 0
-                v['series'] = ""
-                v['cover'] = ""
-                v['url'] = publication.get('pub_url') or publication.get('eprint_url') or "",
-                v['source'] = {
-                    "id": self.__id__,
-                    "description": "Google Scholar",
-                    "link": "https://scholar.google.com/"
-                }
-                val.append(v)
-                i += 1
-                if (i >= 10):
-                    break
+            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
+            if title_tokens:
+                # NOTE(review): tokens are percent-encoded here and then passed
+                # to scholarly.search_pubs; confirm it does not encode the
+                # query a second time.
+                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
+                query = " ".join(tokens)
+            # islice caps the lazy result generator at the first ten hits.
+            scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
+            for result in scholar_gen:
+                match = self._parse_search_result(
+                    result=result, generic_cover=generic_cover, locale=locale
+                )
+                val.append(match)
        return val

+    def _parse_search_result(
+        self, result: Dict, generic_cover: str, locale: str
+    ) -> MetaRecord:
+        """Convert one scholarly publication dict into a ``MetaRecord``.
+
+        :param result: a single publication yielded by the search.
+        :param generic_cover: cover URL used when the hit has no image.
+        :param locale: accepted for interface parity; not used here.
+        :return: the populated record.
+        """
+        # "or" also skips a key that is present but empty, which a dict.get
+        # default does not.
+        publication_url = result.get("pub_url") or result.get("eprint_url") or ""
+        match = MetaRecord(
+            id=publication_url,
+            title=result["bib"].get("title"),
+            authors=result["bib"].get("author", []),
+            url=publication_url,
+            source=MetaSourceInfo(
+                id=self.__id__, description=self.__name__, link=scholar.META_URL
+            ),
+        )

-
+        match.cover = (result.get("image") or {}).get("original_url", generic_cover)
+        match.description = result["bib"].get("abstract", "")
+        match.publisher = result["bib"].get("venue", "")
+        # A hit without a year must yield "" rather than raise on None + str.
+        pub_year = result["bib"].get("pub_year")
+        match.publishedDate = f"{pub_year}-01-01" if pub_year else ""
+        match.identifiers = {"scholar": match.id}
+        return match
--- a/cps/opds.py
+++ b/cps/opds.py
@ -432,17 +432,9 @@ def feed_languagesindex():
    if current_user.filter_language() == u"all":
        languages = calibre_db.speaking_language()
    else:
-        #try:
-        #    cur_l = LC.parse(current_user.filter_language())
-        #except UnknownLocaleError:
-        #    cur_l = None
        languages = calibre_db.session.query(db.Languages).filter(
            db.Languages.lang_code == current_user.filter_language()).all()
        languages[0].name = isoLanguages.get_language_name(get_locale(), languages[0].lang_code)
-        #if cur_l:
-        #    languages[0].name = cur_l.get_language_name(get_locale())
-        #else:
-        #    languages[0].name = _(isoLanguages.get(part3=languages[0].lang_code).name)
    pagination = Pagination((int(off) / (int(config.config_books_per_page)) + 1), config.config_books_per_page,
                            len(languages))
    return render_xml_template('feed.xml', listelements=languages, folder='opds.feed_languages', pagination=pagination)
@ -530,7 +522,8 @@ def feed_search(term):
        entries, __, ___ = calibre_db.get_search_results(term, config_read_column=config.config_read_column)
        entries_count = len(entries) if len(entries) > 0 else 1
        pagination = Pagination(1, entries_count, entries_count)
-        return render_xml_template('feed.xml', searchterm=term, entries=entries, pagination=pagination)
+        items = [entry[0] for entry in entries]
+        return render_xml_template('feed.xml', searchterm=term, entries=items, pagination=pagination)
    else:
        return render_xml_template('feed.xml', searchterm="")

--- a/cps/search_metadata.py
+++ b/cps/search_metadata.py
@ -16,25 +16,27 @@
 #  You should have received a copy of the GNU General Public License
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.

-import os
-import json
-import importlib
-import sys
-import inspect
-import datetime
 import concurrent.futures
+import importlib
+import inspect
+import json
+import os
+import sys
+# from time import time
+from dataclasses import asdict

-from flask import Blueprint, request, Response, url_for
+from flask import Blueprint, Response, request, url_for
 from flask_login import current_user
 from flask_login import login_required
+from sqlalchemy.exc import InvalidRequestError, OperationalError
 from sqlalchemy.orm.attributes import flag_modified
-from sqlalchemy.exc import OperationalError, InvalidRequestError

-from . import constants, logger, ub
 from cps.services.Metadata import Metadata
+from . import constants, get_locale, logger, ub

+# current_milli_time = lambda: int(round(time() * 1000))

-meta = Blueprint('metadata', __name__)
+meta = Blueprint("metadata", __name__)

 log = logger.create()

@ -42,7 +44,7 @@ new_list = list()
 meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider")
 modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider"))
 for f in modules:
-    if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith('__init__.py'):
+    if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith("__init__.py"):
        a = os.path.basename(f)[:-3]
        try:
            importlib.import_module("cps.metadata_provider." + a)
@ -51,34 +53,46 @@ for f in modules:
            log.error("Import error for metadata source: {}".format(a))
            pass

+
 def list_classes(provider_list):
+    """Instantiate every ``Metadata`` subclass found in the provider modules.
+
+    ``provider_list`` holds module names below ``cps.metadata_provider``; the
+    modules are looked up in ``sys.modules``, so they must be imported already.
+    """
    classes = list()
    for element in provider_list:
-        for name, obj in inspect.getmembers(sys.modules["cps.metadata_provider." + element]):
-            if inspect.isclass(obj) and name != "Metadata" and issubclass(obj, Metadata):
+        for name, obj in inspect.getmembers(
+            sys.modules["cps.metadata_provider." + element]
+        ):
+            # Skip the re-exported base class itself; keep only its subclasses.
+            if (
+                inspect.isclass(obj)
+                and name != "Metadata"
+                and issubclass(obj, Metadata)
+            ):
                classes.append(obj())
    return classes

+
 cl = list_classes(new_list)

+
@meta.route("/metadata/provider")
@login_required
 def metadata_provider():
+    """Return all metadata providers with the current user's on/off state as JSON."""
-    active = current_user.view_settings.get('metadata', {})
+    active = current_user.view_settings.get("metadata", {})
    provider = list()
    for c in cl:
+        # Providers the user never toggled default to enabled.
        ac = active.get(c.__id__, True)
-        provider.append({"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__})
-    return Response(json.dumps(provider), mimetype='application/json')
+        provider.append(
+            {"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__}
+        )
+    return Response(json.dumps(provider), mimetype="application/json")

-@meta.route("/metadata/provider", methods=['POST'])
-@meta.route("/metadata/provider/<prov_name>", methods=['POST'])
+
+@meta.route("/metadata/provider", methods=["POST"])
+@meta.route("/metadata/provider/<prov_name>", methods=["POST"])
@login_required
 def metadata_change_active_provider(prov_name):
    new_state = request.get_json()
-    active = current_user.view_settings.get('metadata', {})
-    active[new_state['id']] = new_state['value']
-    current_user.view_settings['metadata'] = active
+    active = current_user.view_settings.get("metadata", {})
+    active[new_state["id"]] = new_state["value"]
+    current_user.view_settings["metadata"] = active
    try:
        try:
            flag_modified(current_user, "view_settings")
@ -89,29 +103,33 @@ def metadata_change_active_provider(prov_name):
        log.error("Invalid request received: {}".format(request))
        return "Invalid request", 400
    if "initial" in new_state and prov_name:
-        for c in cl:
-            if c.__id__ == prov_name:
-                data = c.search(new_state.get('query', ""))
-                break
-        return Response(json.dumps(data), mimetype='application/json')
+        data = []
+        provider = next((c for c in cl if c.__id__ == prov_name), None)
+        if provider is not None:
+            data = provider.search(new_state.get("query", ""))
+        return Response(
+            json.dumps([asdict(x) for x in data]), mimetype="application/json"
+        )
    return ""

-@meta.route("/metadata/search", methods=['POST'])
+
+@meta.route("/metadata/search", methods=["POST"])
@login_required
 def metadata_search():
+    """Query all providers enabled for the current user and merge the results.
+
+    The search term is read from the ``query`` form field. Every provider
+    whose id is not disabled in ``current_user.view_settings["metadata"]``
+    is searched on a worker thread; the records of all providers are
+    converted with ``asdict`` and returned as one JSON list. A missing or
+    empty query yields an empty list.
+
+    A provider that raises, or that returns ``None`` (``search`` is typed
+    ``Optional``), contributes no records instead of failing the request.
+    """
-    query = request.form.to_dict().get('query')
+    query = request.form.to_dict().get("query")
    data = list()
-    active = current_user.view_settings.get('metadata', {})
+    active = current_user.view_settings.get("metadata", {})
+    locale = get_locale()
    if query:
-        generic_cover = ""
+        static_cover = url_for("static", filename="generic_cover.jpg")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-            meta = {executor.submit(c.search, query, generic_cover): c for c in cl if active.get(c.__id__, True)}
+            meta = {
+                executor.submit(c.search, query, static_cover, locale): c
+                for c in cl
+                if active.get(c.__id__, True)
+            }
            for future in concurrent.futures.as_completed(meta):
-                data.extend(future.result())
-    return Response(json.dumps(data), mimetype='application/json')
-
-
-
-
-
-
+                try:
+                    records = future.result()
+                except Exception as ex:
+                    # Request boundary: one broken provider must not discard
+                    # the results already collected from the others.
+                    log.error(
+                        "Metadata provider {} failed: {}".format(
+                            meta[future].__id__, ex
+                        )
+                    )
+                    continue
+                # None means "no results" for providers that return Optional.
+                data.extend([asdict(x) for x in records or []])
+    return Response(json.dumps(data), mimetype="application/json")
--- a/cps/services/Metadata.py
+++ b/cps/services/Metadata.py
@ -15,13 +15,93 @@
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.
+import abc
+import dataclasses
+import os
+import re
+from typing import Dict, Generator, List, Optional, Union
+
+from cps import constants


-class Metadata():
+@dataclasses.dataclass
+class MetaSourceInfo:
+    """Describes where a ``MetaRecord`` came from (see ``MetaRecord.source``)."""
+
+    # NOTE(review): presumably the provider's ``__id__``, its human readable
+    # description and its home page URL - confirm against the providers.
+    id: str
+    description: str
+    link: str
+
+
+@dataclasses.dataclass
+class MetaRecord:
+    """One search result returned by a metadata provider's ``search``.
+
+    ``id``, ``title``, ``authors``, ``url`` and ``source`` are required; all
+    other fields have defaults. List and dict defaults use
+    ``default_factory`` so instances never share a mutable default.
+    """
+
+    id: Union[str, int]
+    title: str
+    authors: List[str]
+    url: str
+    source: MetaSourceInfo
+    # NOTE(review): this default is a filesystem path below STATIC_DIR, while
+    # get_meta.js assigns ``book.cover`` to an <img src>. Providers presumably
+    # always pass a URL here - confirm.
+    cover: str = os.path.join(constants.STATIC_DIR, 'generic_cover.jpg')
+    description: Optional[str] = ""
+    series: Optional[str] = None
+    series_index: Optional[Union[int, float]] = 0
+    # Maps identifier type to identifier value.
+    identifiers: Dict[str, Union[str, int]] = dataclasses.field(default_factory=dict)
+    publisher: Optional[str] = None
+    publishedDate: Optional[str] = None
+    rating: Optional[int] = 0
+    languages: Optional[List[str]] = dataclasses.field(default_factory=list)
+    tags: Optional[List[str]] = dataclasses.field(default_factory=list)
+
+
+class Metadata:
+    """Base class for metadata providers.
+
+    Subclasses override ``__name__`` / ``__id__`` and implement ``search``.
+    The class does not use ``abc.ABCMeta``, so the ``@abc.abstractmethod``
+    marker on ``search`` is informational only and is not enforced when a
+    subclass is instantiated.
+    """
+
    __name__ = "Generic"
+    __id__ = "generic"

    def __init__(self):
        self.active = True

    def set_status(self, state):
        self.active = state
+
+    @abc.abstractmethod
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
+        """Return the records matching *query*; to be overridden by providers.
+
+        :param query: search term.
+        :param generic_cover: cover to use for results without one.
+        :param locale: locale of the requesting user.
+        :return: list of ``MetaRecord`` or ``None``; this base implementation
+            returns ``None``.
+        """
+        pass
+
+    @staticmethod
+    def get_title_tokens(
+        title: str, strip_joiners: bool = True
+    ) -> Generator[str, None, None]:
+        """
+        Yield the significant words of *title* for use in a search query.
+
+        Taken from calibre source code, with two pattern fixes: the
+        "special chars" character class now really includes the CJK/typographic
+        quotes (its closing bracket used to come before them, so the pattern
+        practically never matched), and the dot in ``ed.`` is escaped so that
+        parentheticals such as "(Red Cover)" are no longer dropped.
+
+        :param title: raw book title.
+        :param strip_joiners: when true, the words "a", "and", "the" and "&"
+            are skipped (case-insensitively).
+        :return: generator over the remaining tokens, stripped of
+            surrounding quotes.
+        """
+        # re.IGNORECASE is passed to compile(), no inline (?i) needed.
+        title_patterns = [
+            (re.compile(pat, re.IGNORECASE), repl)
+            for pat, repl in [
+                # Remove things like: (2010) (Omnibus) etc.
+                (
+                    r"[({\[](\d{4}|omnibus|anthology|hardcover|"
+                    r"audiobook|audio\scd|paperback|turtleback|"
+                    r"mass\s*market|edition|ed\.)[\])}]",
+                    "",
+                ),
+                # Remove any strings that contain the substring edition inside
+                # parentheses
+                (r"[({\[].*?(edition|ed\.).*?[\]})]", ""),
+                # Remove commas used as separators in numbers
+                (r"(\d+),(\d+)", r"\1\2"),
+                # Remove hyphens only if they have whitespace before them
+                (r"(\s-)", " "),
+                # Replace other special chars with a space
+                (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
+            ]
+        ]
+
+        for pat, repl in title_patterns:
+            title = pat.sub(repl, title)
+
+        tokens = title.split()
+        for token in tokens:
+            token = token.strip().strip('"').strip("'")
+            if token and (
+                not strip_joiners or token.lower() not in ("a", "and", "the", "&")
+            ):
+                yield token
--- a/cps/static/js/get_meta.js
+++ b/cps/static/js/get_meta.js
@ -26,19 +26,26 @@ $(function () {
       )
    };

+    function getUniqueValues(attribute_name, book){
+        // Values currently in the form field "#<attribute_name>", split on
+        // commas and trimmed.
+        var currentValues = $.map($("#"+attribute_name).val().split(","), $.trim);
+        // An empty field splits into [""]; treat that as "no values yet".
+        var fieldIsEmpty = currentValues.length === 1 && currentValues[0] === "";
+        var merged = fieldIsEmpty ? [] : currentValues;
+        // Append the values from the fetched book that are not present yet,
+        // keeping the existing order.
+        $.each(book[attribute_name], function(i, el) {
+            if ($.inArray(el, merged) === -1) {
+                merged.push(el);
+            }
+        });
+        return merged;
+    }
+
    function populateForm (book) {
        tinymce.get("description").setContent(book.description);
-        var uniqueTags = $.map($("#tags").val().split(","), $.trim);
-        if ( uniqueTags.length == 1 && uniqueTags[0] == "") {
-            uniqueTags = [];
-        }
-        $.each(book.tags, function(i, el) {
-            if ($.inArray(el, uniqueTags) === -1) uniqueTags.push(el);
-        });
+        var uniqueTags = getUniqueValues('tags', book)
+        var uniqueLanguages = getUniqueValues('languages', book)
        var ampSeparatedAuthors = (book.authors || []).join(" & ");
        $("#bookAuthor").val(ampSeparatedAuthors);
        $("#book_title").val(book.title);
        $("#tags").val(uniqueTags.join(", "));
+        $("#languages").val(uniqueLanguages.join(", "));
        $("#rating").data("rating").setValue(Math.round(book.rating));
        if(book.cover && $("#cover_url").length){
            $(".cover img").attr("src", book.cover);
@ -48,7 +55,32 @@ $(function () {
        $("#publisher").val(book.publisher);
        if (typeof book.series !== "undefined") {
            $("#series").val(book.series);
+            $("#series_index").val(book.series_index);
        }
+        if (typeof book.identifiers !== "undefined") {
+            populateIdentifiers(book.identifiers)
+        }
+    }
+
+    function populateIdentifiers(identifiers){
+        // Copy the identifiers of the fetched book into the identifier table:
+        // update the value of a row whose type already exists, otherwise add
+        // a new row.
+
+        // Match inputs by their name property instead of concatenating the
+        // provider-supplied key into a selector string.
+        function inputsNamed(name) {
+            return $("input").filter(function() {
+                return this.name === name;
+            });
+        }
+
+        for (const property in identifiers) {
+            // Skip anything inherited through the prototype chain.
+            if (!Object.prototype.hasOwnProperty.call(identifiers, property)) {
+                continue;
+            }
+            if (inputsNamed("identifier-type-" + property).length) {
+                inputsNamed("identifier-val-" + property).val(identifiers[property]);
+            }
+            else {
+                addIdentifier(property, identifiers[property]);
+            }
+        }
+    }
+
+    function addIdentifier(name, value){
+        // Append a row (type input, value input, remove button) to
+        // #identifier-table. name and value come from an external metadata
+        // provider, so they are set through .attr()/.text() rather than
+        // concatenated into an HTML string.
+        var typeInput = $("<input>").attr({
+            "type": "text",
+            "class": "form-control",
+            "name": "identifier-type-" + name,
+            "required": "required",
+            "placeholder": _("Identifier Type"),
+            "value": name
+        });
+        var valueInput = $("<input>").attr({
+            "type": "text",
+            "class": "form-control",
+            "name": "identifier-val-" + name,
+            "required": "required",
+            "placeholder": _("Identifier Value"),
+            "value": value
+        });
+        var removeLink = $("<a>").attr({
+            "class": "btn btn-default",
+            "onclick": "removeIdentifierLine(this)"
+        }).text(_("Remove"));
+        var line = $("<tr>")
+            .append($("<td>").append(typeInput))
+            .append($("<td>").append(valueInput))
+            .append($("<td>").append(removeLink));
+        $("#identifier-table").append(line);
    }

    function doSearch (keyword) {
--- a/cps/templates/feed.xml
+++ b/cps/templates/feed.xml
@ -40,35 +40,35 @@
  {% if entries and entries[0] %}
  {% for entry in entries %}
  <entry>
-    <title>{{entry[0].title}}</title>
-    <id>urn:uuid:{{entry[0].uuid}}</id>
-    <updated>{{entry[0].atom_timestamp}}</updated>
-    {% if entry[0].authors.__len__() > 0 %}
+    <title>{{entry.title}}</title>
+    <id>urn:uuid:{{entry.uuid}}</id>
+    <updated>{{entry.atom_timestamp}}</updated>
+    {% if entry.authors.__len__() > 0 %}
      <author>
-        <name>{{entry[0].authors[0].name}}</name>
+        <name>{{entry.authors[0].name}}</name>
      </author>
    {% endif %}
-    {% if entry[0].publishers.__len__() > 0 %}
+    {% if entry.publishers.__len__() > 0 %}
      <publisher>
-        <name>{{entry[0].publishers[0].name}}</name>
+        <name>{{entry.publishers[0].name}}</name>
      </publisher>
    {% endif %}
-    {% for lang in entry[0].languages %}
+    {% for lang in entry.languages %}
      <dcterms:language>{{lang.lang_code}}</dcterms:language>
    {% endfor %}
-    {% for tag in entry[0].tags %}
+    {% for tag in entry.tags %}
    <category scheme="http://www.bisg.org/standards/bisac_subject/index.html"
              term="{{tag.name}}"
              label="{{tag.name}}"/>
    {% endfor %}
-    {% if entry[0].comments[0] %}<summary>{{entry[0].comments[0].text|striptags}}</summary>{% endif %}
-    {% if entry[0].has_cover %}
-    <link type="image/jpeg" href="{{url_for('opds.feed_get_cover', book_id=entry[0].id)}}" rel="http://opds-spec.org/image"/>
-    <link type="image/jpeg" href="{{url_for('opds.feed_get_cover', book_id=entry[0].id)}}" rel="http://opds-spec.org/image/thumbnail"/>
+    {% if entry.comments[0] %}<summary>{{entry.comments[0].text|striptags}}</summary>{% endif %}
+    {% if entry.has_cover %}
+    <link type="image/jpeg" href="{{url_for('opds.feed_get_cover', book_id=entry.id)}}" rel="http://opds-spec.org/image"/>
+    <link type="image/jpeg" href="{{url_for('opds.feed_get_cover', book_id=entry.id)}}" rel="http://opds-spec.org/image/thumbnail"/>
    {% endif %}
-    {% for format in entry[0].data %}
-    <link rel="http://opds-spec.org/acquisition" href="{{ url_for('opds.opds_download_link', book_id=entry[0].id, book_format=format.format|lower)}}"
-          length="{{format.uncompressed_size}}" mtime="{{entry[0].atom_timestamp}}" type="{{format.format|lower|mimetype}}"/>
+    {% for format in entry.data %}
+    <link rel="http://opds-spec.org/acquisition" href="{{ url_for('opds.opds_download_link', book_id=entry.id, book_format=format.format|lower)}}"
+          length="{{format.uncompressed_size}}" mtime="{{entry.atom_timestamp}}" type="{{format.format|lower|mimetype}}"/>
    {% endfor %}
  </entry>
  {% endfor %}
--- a/optional-requirements.txt
+++ b/optional-requirements.txt
@ -31,6 +31,9 @@ SQLAlchemy-Utils>=0.33.5,<0.39.0
 # metadata extraction
 rarfile>=2.7
 scholarly>=1.2.0,<1.6
+markdown2==2.4.2
+html2text==2020.1.16
+python-dateutil==2.8.2

 # Comics
 natsort>=2.2.0,<8.1.0
--- a/TestSummary_Linux.html
+++ b/TestSummary_Linux.html