everything working to refactor

2021-12-11 01:06:04 +01:00 · 2021-12-11 01:06:04 +01:00 · 920acaca99
commit 920acaca99
parent 9d9acb058d
4 changed files with 415 additions and 7 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,6 +9,7 @@ __pycache__/
 .python-version
 env/
 venv/
+p38venv/
 eggs/
 dist/
 executable/
--- a/cps/metadata_provider/lubimyczytac.py
+++ b/cps/metadata_provider/lubimyczytac.py
@ -0,0 +1,373 @@
+# -*- coding: utf-8 -*-
+
+#  This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
+#    Copyright (C) 2021 OzzieIsaacs
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program. If not, see <http://www.gnu.org/licenses/>.
+import json
+import re
+from typing import Dict, List
+from urllib.parse import quote
+
+import requests
+from cps.services.Metadata import Metadata
+from lxml.html import fromstring, tostring
+
+
+def get_int_or_float(v):
+    number_as_float = float(v)
+    number_as_int = int(number_as_float)
+    return number_as_int if number_as_float == number_as_int else number_as_float
+
+
+def strip_accents(s):
+    if s is None:
+        return s
+    else:
+        symbols = (
+            "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
+            "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
+        )
+        tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)])
+        return s.translate(tr)  # .lower()
+
+
+def sanitize_comments_html(html):
+    from markdown2 import Markdown
+
+    text = html2text(html)
+    md = Markdown()
+    html = md.convert(text)
+    return html
+
+
+def html2text(html):
+    from html2text import HTML2Text
+    import re
+
+    # replace <u> tags with <span> as <u> becomes emphasis in html2text
+    if isinstance(html, bytes):
+        html = html.decode("utf-8")
+    html = re.sub(
+        r"<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>",
+        r"<\g<solidus>span\g<rest>>",
+        html,
+    )
+    h2t = HTML2Text()
+    h2t.body_width = 0
+    h2t.single_line_break = True
+    h2t.emphasis_mark = "*"
+    return h2t.handle(html)
+
+
+class LubimyCzytac(Metadata):
+    __name__ = "LubimyCzytac.pl"
+    __id__ = "lubimyczytac"
+
+    BASE_URL = "https://lubimyczytac.pl"
+
+    BOOK_SEARCH_RESULT_XPATH = (
+        "*//div[@class='listSearch']//div[@class='authorAllBooks__single']"
+    )
+    SINGLE_BOOK_RESULT_XPATH = ".//div[contains(@class,'authorAllBooks__singleText')]"
+    TITLE_PATH = "/div/a[contains(@class,'authorAllBooks__singleTextTitle')]"
+    TITLE_TEXT_PATH = f"{TITLE_PATH}//text()"
+    URL_PATH = f"{TITLE_PATH}/@href"
+    AUTHORS_PATH = "/div/a[contains(@href,'autor')]//text()"
+
+    SIBLINGS = "/following-sibling::dd"
+
+    CONTAINER = "//section[@class='container book']"
+    PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
+    LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
+    DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
+    SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]"
+
+    DETAILS = "//div[@id='book-details']"
+    PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
+    FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
+    FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
+    TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"
+    RATING = "//meta[@property='books:rating:value']/@content"
+    COVER = "//meta[@property='og:image']/@content"
+
+    SUMMARY = "//script[@type='application/ld+json']//text()"
+
+    def search(self, query, __):
+        if self.active:
+            result = requests.get(self._prepare_query(title=query))
+            root = fromstring(result.text)
+            matches = self._parse_search_results(root=root)
+            if matches:
+                for ind, match in enumerate(matches):
+                    matches[ind] = self._parse_single_book(match=match)
+            return matches
+
+    def _prepare_query(self, title: str) -> str:
+        query = ""
+        characters_to_remove = "\?()\/"
+        pattern = "[" + characters_to_remove + "]"
+        title = re.sub(pattern, "", title)
+        title = title.replace("_", " ")
+        if '"' in title or ",," in title:
+            title = title.split('"')[0].split(",,")[0]
+
+        if "/" in title:
+            title_tokens = [
+                token for token in title.lower().split(" ") if len(token) > 1
+            ]
+        else:
+            title_tokens = list(
+                self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)
+            )
+        if title_tokens:
+            tokens = [quote(t.encode("utf-8")) for t in title_tokens]
+            query = query + "%20".join(tokens)
+        if not query:
+            return ""
+        return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
+
+    def _parse_search_results(self, root) -> List[Dict]:
+        matches = []
+        results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
+        for result in results:
+            title = result.xpath(
+                f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
+                f"{LubimyCzytac.TITLE_TEXT_PATH}"
+            )
+            book_url = result.xpath(
+                f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}"
+            )
+            authors = result.xpath(
+                f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
+                f"{LubimyCzytac.AUTHORS_PATH}"
+            )
+
+            if not title or not book_url or not authors:
+                continue
+            title = title[0].strip()
+            book_url = LubimyCzytac.BASE_URL + book_url[0]
+            book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split(
+                "/"
+            )[0]
+            matches.append(
+                {"id": book_id, "title": title, "authors": authors, "url": book_url}
+            )
+        return matches
+
+    def _parse_single_book(self, match: Dict) -> Dict:
+        url = match.get("url")
+        result = requests.get(url)
+        root = fromstring(result.text)
+        match["series"], match["series_index"] = self._parse_series(root=root)
+        match["tags"] = self._parse_tags(root=root)
+        match["publisher"] = self._parse_publisher(root=root)
+        match["publishedDate"] = self._parse_from_summary(
+            root=root, attribute_name="datePublished"
+        )
+        match["rating"] = self._parse_rating(root=root)
+        match["description"] = self._parse_description(root=root)
+        match["cover"] = self._parse_cover(root=root)
+        match["source"] = {
+            "id": self.__id__,
+            "description": self.__name__,
+            "link": LubimyCzytac.BASE_URL,
+        }
+        match['languages'] = self._parse_languages(root=root)
+        match["identifiers"] = {
+            "isbn": self._parse_isbn(root=root),
+            "lubimyczytac": match["id"],
+        }
+        return match
+
+    def _parse_cover(self, root):
+        imgcol_node = root.xpath('//meta[@property="og:image"]/@content')
+        if imgcol_node:
+            img_url = imgcol_node[0]
+            return img_url
+
+    def _parse_publisher(self, root):
+        publisher = root.xpath(LubimyCzytac.PUBLISHER)
+        if publisher:
+            return publisher[0]
+        else:
+            return None
+
+    def _parse_languages(self, root):
+        lang = root.xpath(LubimyCzytac.LANGUAGES)
+        languages = list()
+        if lang:
+            lang = lang[0].strip()
+            if "polski" in lang:
+                languages.append("Polish")
+            if "angielski" in lang:
+                languages.append("English")
+        if not languages:
+            return ['Polish']
+        return languages
+
+    def _parse_series(self, root):
+        try:
+            series_node = root.xpath(LubimyCzytac.SERIES)
+            if series_node:
+                series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()")
+                if series_lst:
+                    series_txt = series_lst
+                else:
+                    series_txt = None
+            else:
+                return (None, None)
+
+            if series_txt:
+                ser_string = [series_txt[0].replace("\n", "").strip()]
+                ser_nazwa = ser_string
+                for ser in ser_string:
+                    if "tom " in ser:
+                        ser_info = ser.split(" (tom ", 1)
+                        ser_nazwa = ser.split(" (tom ")[0]
+                        break
+
+            if ser_info:
+                series_index_unicode = ser_info[1]
+                series_index_string = str(
+                    series_index_unicode.replace(" ", "").replace(")", "")
+                )
+                # Sprawdzamy, czy cykl nie jest kompletem/pakietem tomów, np. 1-3
+                if "-" in series_index_string:
+                    series_index_string_temp = series_index_string.split("-", 1)
+                    series_index_string = series_index_string_temp[0]
+                if series_index_string.replace(".", "").isdigit() is True:
+                    series_index = get_int_or_float(series_index_string)
+                else:
+                    series_index = 0
+            else:
+                series_index = 0
+            series = ser_nazwa
+            return (series, series_index)
+        except:
+            return (None, None)
+
+    def _parse_tags(self, root):
+        tags = None
+        try:
+            tags_from_genre = root.xpath(LubimyCzytac.TAGS)
+            if tags_from_genre:
+                tags = tags_from_genre
+                tags = [w.replace(", itd.", " itd.") for w in tags]
+                return tags
+            else:
+                return None
+        except:
+            return tags
+
+    def _parse_from_summary(self, root, attribute_name: str) -> str:
+        data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0])
+        value = data.get(attribute_name)
+        return value.strip() if value is not None else value
+
+    def _parse_rating(self, root):
+        rating_node = root.xpath(LubimyCzytac.RATING)
+        if rating_node:
+            rating_value = round(float((rating_node[0]).replace(",", ".")) / 2)
+            return rating_value
+        return None
+
+    def _parse_date(self, root, xpath="first_publish"):
+        options = {
+            "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
+            "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
+        }
+        path = options.get(xpath)
+        from dateutil import parser
+
+        data = root.xpath(path)
+        if data:
+            first_pub_date = data[0].strip()
+            return parser.parse(first_pub_date)
+        return None
+
+    def _parse_isbn(self, root):
+        isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0]
+        return isbn_node
+
+    def _parse_description(self, root):
+        comments = ""
+        description_node = root.xpath(LubimyCzytac.DESCRIPTION)
+        if description_node:
+            for zrodla in root.xpath('//p[@class="source"]'):
+                zrodla.getparent().remove(zrodla)
+            comments = tostring(description_node[0], method="html")
+            comments = sanitize_comments_html(comments)
+
+        else:
+            # try <meta>
+            description_node = root.xpath('//meta[@property="og:description"]/@content')
+            if description_node:
+                comments = description_node[0]
+                comments = sanitize_comments_html(comments)
+
+        pages = self._parse_from_summary(root=root, attribute_name="numberOfPages")
+        if pages:
+            comments += f'<p id="strony">Książka ma {pages} stron(y).</p>'
+
+        first_publish_date = self._parse_date(root=root)
+        if first_publish_date:
+            comments += f'<p id="pierwsze_wydanie">Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}</p>'
+
+        first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl")
+        if first_publish_date_pl:
+            comments += f'<p id="pierwsze_wydanie_pl">Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}</p>'
+
+        return comments
+
+    def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
+        """
+        Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py.
+        """
+        # strip sub-titles
+        if strip_subtitle:
+            subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)")
+            if len(subtitle.sub("", title)) > 1:
+                title = subtitle.sub("", title)
+
+        title_patterns = [
+            (re.compile(pat, re.IGNORECASE), repl)
+            for pat, repl in [
+                # Remove things like: (2010) (Omnibus) etc.
+                (
+                    r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]",
+                    "",
+                ),
+                # Remove any strings that contain the substring edition inside
+                # parentheses
+                (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""),
+                # Remove commas used a separators in numbers
+                (r"(\d+),(\d+)", r"\1\2"),
+                # Remove hyphens only if they have whitespace before them
+                (r"(\s-)", " "),
+                # Replace other special chars with a space
+                (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "),
+            ]
+        ]
+
+        for pat, repl in title_patterns:
+            title = pat.sub(repl, title)
+
+        tokens = title.split()
+        for token in tokens:
+            token = token.strip().strip('"').strip("'")
+            if token and (
+                not strip_joiners or token.lower() not in ("a", "and", "the", "&")
+            ):
+                yield token
--- a/cps/static/js/get_meta.js
+++ b/cps/static/js/get_meta.js
@ -26,19 +26,26 @@ $(function () {
       )
    };

+    function getUniqueValues(attribute_name, book){
+        var presentArray = $.map($("#"+attribute_name).val().split(","), $.trim);
+        if ( presentArray.length === 1 && presentArray[0] === "") {
+            presentArray = [];
+        }
+        $.each(book[attribute_name], function(i, el) {
+            if ($.inArray(el, presentArray) === -1) presentArray.push(el);
+        });
+        return presentArray
+    }
+
    function populateForm (book) {
        tinymce.get("description").setContent(book.description);
-        var uniqueTags = $.map($("#tags").val().split(","), $.trim);
-        if ( uniqueTags.length == 1 && uniqueTags[0] == "") {
-            uniqueTags = [];
-        }
-        $.each(book.tags, function(i, el) {
-            if ($.inArray(el, uniqueTags) === -1) uniqueTags.push(el);
-        });
+        var uniqueTags = getUniqueValues('tags', book)
+        var uniqueLanguages = getUniqueValues('languages', book)
        var ampSeparatedAuthors = (book.authors || []).join(" & ");
        $("#bookAuthor").val(ampSeparatedAuthors);
        $("#book_title").val(book.title);
        $("#tags").val(uniqueTags.join(", "));
+        $("#languages").val(uniqueLanguages.join(", "));
        $("#rating").data("rating").setValue(Math.round(book.rating));
        if(book.cover !== null){
            $(".cover img").attr("src", book.cover);
@ -48,7 +55,32 @@ $(function () {
        $("#publisher").val(book.publisher);
        if (typeof book.series !== "undefined") {
            $("#series").val(book.series);
+            $("#series_index").val(book.series_index);
        }
+        if (typeof book.identifiers !== "undefined") {
+            populateIdentifiers(book.identifiers)
+        }
+    }
+
+    function populateIdentifiers(identifiers){
+       for (const property in identifiers) {
+          console.log(`${property}: ${identifiers[property]}`);
+          if ($('input[name="identifier-type-'+property+'"]').length) {
+              $('input[name="identifier-val-'+property+'"]').val(identifiers[property])
+          }
+          else {
+              addIdentifier(property, identifiers[property])
+          }
+        }
+    }
+
+    function addIdentifier(name, value){
+        var line = '<tr>';
+        line += '<td><input type="text" class="form-control" name="identifier-type-'+ name +'" required="required" placeholder="' + _("Identifier Type") +'" value="'+ name +'"></td>';
+        line += '<td><input type="text" class="form-control" name="identifier-val-'+ name +'" required="required" placeholder="' + _("Identifier Value") +'" value="'+ value +'"></td>';
+        line += '<td><a class="btn btn-default" onclick="removeIdentifierLine(this)">'+_("Remove")+'</a></td>';
+        line += '</tr>';
+        $("#identifier-table").append(line);
    }

    function doSearch (keyword) {
--- a/requirements.txt
+++ b/requirements.txt
@ -14,3 +14,5 @@ Wand>=0.4.4,<0.7.0
 unidecode>=0.04.19,<1.3.0
 lxml>=3.8.0,<4.7.0
 flask-wtf>=0.14.2,<1.1.0
+markdown2==2.4.2
+html2text==2020.1.16