refactor and cleaning
This commit is contained in:
parent
920acaca99
commit
d55626d445
|
@ -26,7 +26,7 @@ class ComicVine(Metadata):
|
||||||
__name__ = "ComicVine"
|
__name__ = "ComicVine"
|
||||||
__id__ = "comicvine"
|
__id__ = "comicvine"
|
||||||
|
|
||||||
def search(self, query, __):
|
def search(self, query, generic_cover=""):
|
||||||
val = list()
|
val = list()
|
||||||
apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
|
apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
|
||||||
if self.active:
|
if self.active:
|
||||||
|
@ -52,7 +52,7 @@ class ComicVine(Metadata):
|
||||||
v['tags'] = ["Comics", seriesTitle]
|
v['tags'] = ["Comics", seriesTitle]
|
||||||
v['rating'] = 0
|
v['rating'] = 0
|
||||||
v['series'] = seriesTitle
|
v['series'] = seriesTitle
|
||||||
v['cover'] = r['image'].get('original_url')
|
v['cover'] = r['image'].get('original_url', generic_cover)
|
||||||
v['source'] = {
|
v['source'] = {
|
||||||
"id": self.__id__,
|
"id": self.__id__,
|
||||||
"description": "ComicVine Books",
|
"description": "ComicVine Books",
|
||||||
|
|
|
@ -17,19 +17,20 @@
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
# Google Books api document: https://developers.google.com/books/docs/v1/using
|
# Google Books api document: https://developers.google.com/books/docs/v1/using
|
||||||
|
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from cps.services.Metadata import Metadata
|
from cps.services.Metadata import Metadata
|
||||||
|
|
||||||
|
|
||||||
class Google(Metadata):
|
class Google(Metadata):
|
||||||
__name__ = "Google"
|
__name__ = "Google"
|
||||||
__id__ = "google"
|
__id__ = "google"
|
||||||
|
BASE_URL = "https://www.googleapis.com/books/v1/volumes?q="
|
||||||
|
|
||||||
def search(self, query, __):
|
def search(self, query, generic_cover=""):
|
||||||
if self.active:
|
if self.active:
|
||||||
val = list()
|
val = list()
|
||||||
result = requests.get("https://www.googleapis.com/books/v1/volumes?q="+query.replace(" ","+"))
|
result = requests.get(Google.BASE_URL + query.replace(" ","+"))
|
||||||
for r in result.json()['items']:
|
for r in result.json()['items']:
|
||||||
v = dict()
|
v = dict()
|
||||||
v['id'] = r['id']
|
v['id'] = r['id']
|
||||||
|
@ -43,7 +44,8 @@ class Google(Metadata):
|
||||||
if r['volumeInfo'].get('imageLinks'):
|
if r['volumeInfo'].get('imageLinks'):
|
||||||
v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://")
|
v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://")
|
||||||
else:
|
else:
|
||||||
v['cover'] = "/../../../static/generic_cover.jpg"
|
# v['cover'] = "/../../../static/generic_cover.jpg"
|
||||||
|
v['cover'] = generic_cover
|
||||||
v['source'] = {
|
v['source'] = {
|
||||||
"id": self.__id__,
|
"id": self.__id__,
|
||||||
"description": "Google Books",
|
"description": "Google Books",
|
||||||
|
|
|
@ -15,47 +15,47 @@
|
||||||
#
|
#
|
||||||
# You should have received a copy of the GNU General Public License
|
# You should have received a copy of the GNU General Public License
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
import datetime
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from typing import Dict, List
|
from typing import Dict, Generator, List, Optional, Tuple, Union
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from cps.services.Metadata import Metadata
|
from dateutil import parser
|
||||||
from lxml.html import fromstring, tostring
|
from html2text import HTML2Text
|
||||||
|
from lxml.html import HtmlElement, fromstring, tostring
|
||||||
|
from markdown2 import Markdown
|
||||||
|
|
||||||
|
from cps.services.Metadata import MetaRecord, Metadata
|
||||||
|
|
||||||
|
# Pair of equally ordered strings: accented letters (first) and the plain
# letters that replace them (second), matched position by position.
SYMBOLS_TO_TRANSLATE = (
    "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
    "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
)
# Code point -> code point table in the shape ``str.translate`` accepts.
SYMBOL_TRANSLATION_MAP = {
    ord(accented): ord(plain) for accented, plain in zip(*SYMBOLS_TO_TRANSLATE)
}
|
||||||
|
|
||||||
|
|
||||||
def get_int_or_float(value: str) -> Union[int, float]:
    """Convert a numeric string to ``int`` when it is whole, else ``float``.

    The value is parsed with ``float`` first, so ``"2.0"`` yields ``2`` and
    ``"2.5"`` yields ``2.5``.
    """
    as_float = float(value)
    as_int = int(as_float)
    if as_float == as_int:
        return as_int
    return as_float
|
||||||
|
|
||||||
|
|
||||||
def strip_accents(s: Optional[str]) -> Optional[str]:
    """Replace the accented letters listed in SYMBOLS_TO_TRANSLATE.

    ``None`` is passed through unchanged; any other value is translated with
    the module-level ``SYMBOL_TRANSLATION_MAP``.
    """
    if s is None:
        return s
    return s.translate(SYMBOL_TRANSLATION_MAP)
|
|
||||||
|
|
||||||
|
|
||||||
def sanitize_comments_html(html: str) -> str:
    """Normalise an HTML fragment by round-tripping it through Markdown.

    The input is first reduced to text by the module's ``html2text`` helper
    (which also decodes ``bytes`` input as UTF-8) and the result is rendered
    back to HTML with ``markdown2.Markdown``.
    """
    markdown_source = html2text(html)
    return Markdown().convert(markdown_source)
|
||||||
|
|
||||||
|
|
||||||
def html2text(html):
|
def html2text(html: str) -> str:
|
||||||
from html2text import HTML2Text
|
|
||||||
import re
|
|
||||||
|
|
||||||
# replace <u> tags with <span> as <u> becomes emphasis in html2text
|
# replace <u> tags with <span> as <u> becomes emphasis in html2text
|
||||||
if isinstance(html, bytes):
|
if isinstance(html, bytes):
|
||||||
html = html.decode("utf-8")
|
html = html.decode("utf-8")
|
||||||
|
@ -92,26 +92,36 @@ class LubimyCzytac(Metadata):
|
||||||
PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
|
PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
|
||||||
LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
|
LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
|
||||||
DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
|
DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
|
||||||
SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]"
|
SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]/text()"
|
||||||
|
|
||||||
DETAILS = "//div[@id='book-details']"
|
DETAILS = "//div[@id='book-details']"
|
||||||
PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
|
PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
|
||||||
FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
|
FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
|
||||||
FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
|
FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
|
||||||
TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"
|
TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"
|
||||||
|
|
||||||
RATING = "//meta[@property='books:rating:value']/@content"
|
RATING = "//meta[@property='books:rating:value']/@content"
|
||||||
COVER = "//meta[@property='og:image']/@content"
|
COVER = "//meta[@property='og:image']/@content"
|
||||||
|
ISBN = "//meta[@property='books:isbn']/@content"
|
||||||
|
META_TITLE = "//meta[@property='og:description']/@content"
|
||||||
|
|
||||||
SUMMARY = "//script[@type='application/ld+json']//text()"
|
SUMMARY = "//script[@type='application/ld+json']//text()"
|
||||||
|
|
||||||
def search(self, query: str, generic_cover: str = "") -> Optional[List]:
    """Search lubimyczytac.pl for *query* and return full book records.

    Args:
        query: Free-text title query typed by the user.
        generic_cover: Cover URL used for books that expose no cover image.

    Returns:
        ``None`` when the provider is switched off, otherwise a list with
        one parsed record per search hit (empty when nothing matched or no
        search URL could be built).
    """
    if not self.active:
        return None

    search_url = self._prepare_query(title=query)
    if not search_url:
        # _prepare_query can return an empty string; handing that to
        # requests.get() would raise instead of yielding "no results".
        return []

    result = requests.get(search_url)
    root = fromstring(result.text)
    lc_parser = LubimyCzytacParser(root=root, metadata=self)

    final_matches = []
    for match in lc_parser.parse_search_results():
        # Every hit needs its own page fetched to obtain the detailed fields.
        response = requests.get(match.get("url"))
        final_matches.append(
            lc_parser.parse_single_book(
                match=match, response=response, generic_cover=generic_cover
            )
        )
    return final_matches
|
||||||
|
|
||||||
def _prepare_query(self, title: str) -> str:
|
def _prepare_query(self, title: str) -> str:
|
||||||
|
@ -128,9 +138,7 @@ class LubimyCzytac(Metadata):
|
||||||
token for token in title.lower().split(" ") if len(token) > 1
|
token for token in title.lower().split(" ") if len(token) > 1
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
title_tokens = list(
|
title_tokens = list(self.get_title_tokens(title, strip_joiners=False))
|
||||||
self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)
|
|
||||||
)
|
|
||||||
if title_tokens:
|
if title_tokens:
|
||||||
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
|
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
|
||||||
query = query + "%20".join(tokens)
|
query = query + "%20".join(tokens)
|
||||||
|
@ -138,215 +146,21 @@ class LubimyCzytac(Metadata):
|
||||||
return ""
|
return ""
|
||||||
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
|
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
|
||||||
|
|
||||||
def _parse_search_results(self, root) -> List[Dict]:
|
@staticmethod
|
||||||
matches = []
|
def get_title_tokens(
|
||||||
results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
|
title: str, strip_joiners: bool = True
|
||||||
for result in results:
|
) -> Generator[str, None, None]:
|
||||||
title = result.xpath(
|
|
||||||
f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
|
|
||||||
f"{LubimyCzytac.TITLE_TEXT_PATH}"
|
|
||||||
)
|
|
||||||
book_url = result.xpath(
|
|
||||||
f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}"
|
|
||||||
)
|
|
||||||
authors = result.xpath(
|
|
||||||
f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
|
|
||||||
f"{LubimyCzytac.AUTHORS_PATH}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if not title or not book_url or not authors:
|
|
||||||
continue
|
|
||||||
title = title[0].strip()
|
|
||||||
book_url = LubimyCzytac.BASE_URL + book_url[0]
|
|
||||||
book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split(
|
|
||||||
"/"
|
|
||||||
)[0]
|
|
||||||
matches.append(
|
|
||||||
{"id": book_id, "title": title, "authors": authors, "url": book_url}
|
|
||||||
)
|
|
||||||
return matches
|
|
||||||
|
|
||||||
def _parse_single_book(self, match: Dict) -> Dict:
|
|
||||||
url = match.get("url")
|
|
||||||
result = requests.get(url)
|
|
||||||
root = fromstring(result.text)
|
|
||||||
match["series"], match["series_index"] = self._parse_series(root=root)
|
|
||||||
match["tags"] = self._parse_tags(root=root)
|
|
||||||
match["publisher"] = self._parse_publisher(root=root)
|
|
||||||
match["publishedDate"] = self._parse_from_summary(
|
|
||||||
root=root, attribute_name="datePublished"
|
|
||||||
)
|
|
||||||
match["rating"] = self._parse_rating(root=root)
|
|
||||||
match["description"] = self._parse_description(root=root)
|
|
||||||
match["cover"] = self._parse_cover(root=root)
|
|
||||||
match["source"] = {
|
|
||||||
"id": self.__id__,
|
|
||||||
"description": self.__name__,
|
|
||||||
"link": LubimyCzytac.BASE_URL,
|
|
||||||
}
|
|
||||||
match['languages'] = self._parse_languages(root=root)
|
|
||||||
match["identifiers"] = {
|
|
||||||
"isbn": self._parse_isbn(root=root),
|
|
||||||
"lubimyczytac": match["id"],
|
|
||||||
}
|
|
||||||
return match
|
|
||||||
|
|
||||||
def _parse_cover(self, root):
|
|
||||||
imgcol_node = root.xpath('//meta[@property="og:image"]/@content')
|
|
||||||
if imgcol_node:
|
|
||||||
img_url = imgcol_node[0]
|
|
||||||
return img_url
|
|
||||||
|
|
||||||
def _parse_publisher(self, root):
|
|
||||||
publisher = root.xpath(LubimyCzytac.PUBLISHER)
|
|
||||||
if publisher:
|
|
||||||
return publisher[0]
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _parse_languages(self, root):
|
|
||||||
lang = root.xpath(LubimyCzytac.LANGUAGES)
|
|
||||||
languages = list()
|
|
||||||
if lang:
|
|
||||||
lang = lang[0].strip()
|
|
||||||
if "polski" in lang:
|
|
||||||
languages.append("Polish")
|
|
||||||
if "angielski" in lang:
|
|
||||||
languages.append("English")
|
|
||||||
if not languages:
|
|
||||||
return ['Polish']
|
|
||||||
return languages
|
|
||||||
|
|
||||||
def _parse_series(self, root):
|
|
||||||
try:
|
|
||||||
series_node = root.xpath(LubimyCzytac.SERIES)
|
|
||||||
if series_node:
|
|
||||||
series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()")
|
|
||||||
if series_lst:
|
|
||||||
series_txt = series_lst
|
|
||||||
else:
|
|
||||||
series_txt = None
|
|
||||||
else:
|
|
||||||
return (None, None)
|
|
||||||
|
|
||||||
if series_txt:
|
|
||||||
ser_string = [series_txt[0].replace("\n", "").strip()]
|
|
||||||
ser_nazwa = ser_string
|
|
||||||
for ser in ser_string:
|
|
||||||
if "tom " in ser:
|
|
||||||
ser_info = ser.split(" (tom ", 1)
|
|
||||||
ser_nazwa = ser.split(" (tom ")[0]
|
|
||||||
break
|
|
||||||
|
|
||||||
if ser_info:
|
|
||||||
series_index_unicode = ser_info[1]
|
|
||||||
series_index_string = str(
|
|
||||||
series_index_unicode.replace(" ", "").replace(")", "")
|
|
||||||
)
|
|
||||||
# Sprawdzamy, czy cykl nie jest kompletem/pakietem tomów, np. 1-3
|
|
||||||
if "-" in series_index_string:
|
|
||||||
series_index_string_temp = series_index_string.split("-", 1)
|
|
||||||
series_index_string = series_index_string_temp[0]
|
|
||||||
if series_index_string.replace(".", "").isdigit() is True:
|
|
||||||
series_index = get_int_or_float(series_index_string)
|
|
||||||
else:
|
|
||||||
series_index = 0
|
|
||||||
else:
|
|
||||||
series_index = 0
|
|
||||||
series = ser_nazwa
|
|
||||||
return (series, series_index)
|
|
||||||
except:
|
|
||||||
return (None, None)
|
|
||||||
|
|
||||||
def _parse_tags(self, root):
|
|
||||||
tags = None
|
|
||||||
try:
|
|
||||||
tags_from_genre = root.xpath(LubimyCzytac.TAGS)
|
|
||||||
if tags_from_genre:
|
|
||||||
tags = tags_from_genre
|
|
||||||
tags = [w.replace(", itd.", " itd.") for w in tags]
|
|
||||||
return tags
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
except:
|
|
||||||
return tags
|
|
||||||
|
|
||||||
def _parse_from_summary(self, root, attribute_name: str) -> str:
|
|
||||||
data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0])
|
|
||||||
value = data.get(attribute_name)
|
|
||||||
return value.strip() if value is not None else value
|
|
||||||
|
|
||||||
def _parse_rating(self, root):
|
|
||||||
rating_node = root.xpath(LubimyCzytac.RATING)
|
|
||||||
if rating_node:
|
|
||||||
rating_value = round(float((rating_node[0]).replace(",", ".")) / 2)
|
|
||||||
return rating_value
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _parse_date(self, root, xpath="first_publish"):
|
|
||||||
options = {
|
|
||||||
"first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
|
|
||||||
"first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
|
|
||||||
}
|
|
||||||
path = options.get(xpath)
|
|
||||||
from dateutil import parser
|
|
||||||
|
|
||||||
data = root.xpath(path)
|
|
||||||
if data:
|
|
||||||
first_pub_date = data[0].strip()
|
|
||||||
return parser.parse(first_pub_date)
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _parse_isbn(self, root):
|
|
||||||
isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0]
|
|
||||||
return isbn_node
|
|
||||||
|
|
||||||
def _parse_description(self, root):
|
|
||||||
comments = ""
|
|
||||||
description_node = root.xpath(LubimyCzytac.DESCRIPTION)
|
|
||||||
if description_node:
|
|
||||||
for zrodla in root.xpath('//p[@class="source"]'):
|
|
||||||
zrodla.getparent().remove(zrodla)
|
|
||||||
comments = tostring(description_node[0], method="html")
|
|
||||||
comments = sanitize_comments_html(comments)
|
|
||||||
|
|
||||||
else:
|
|
||||||
# try <meta>
|
|
||||||
description_node = root.xpath('//meta[@property="og:description"]/@content')
|
|
||||||
if description_node:
|
|
||||||
comments = description_node[0]
|
|
||||||
comments = sanitize_comments_html(comments)
|
|
||||||
|
|
||||||
pages = self._parse_from_summary(root=root, attribute_name="numberOfPages")
|
|
||||||
if pages:
|
|
||||||
comments += f'<p id="strony">Książka ma {pages} stron(y).</p>'
|
|
||||||
|
|
||||||
first_publish_date = self._parse_date(root=root)
|
|
||||||
if first_publish_date:
|
|
||||||
comments += f'<p id="pierwsze_wydanie">Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}</p>'
|
|
||||||
|
|
||||||
first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl")
|
|
||||||
if first_publish_date_pl:
|
|
||||||
comments += f'<p id="pierwsze_wydanie_pl">Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}</p>'
|
|
||||||
|
|
||||||
return comments
|
|
||||||
|
|
||||||
def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
|
|
||||||
"""
|
"""
|
||||||
Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py.
|
Taken from calibre source code
|
||||||
"""
|
"""
|
||||||
# strip sub-titles
|
|
||||||
if strip_subtitle:
|
|
||||||
subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)")
|
|
||||||
if len(subtitle.sub("", title)) > 1:
|
|
||||||
title = subtitle.sub("", title)
|
|
||||||
|
|
||||||
title_patterns = [
|
title_patterns = [
|
||||||
(re.compile(pat, re.IGNORECASE), repl)
|
(re.compile(pat, re.IGNORECASE), repl)
|
||||||
for pat, repl in [
|
for pat, repl in [
|
||||||
# Remove things like: (2010) (Omnibus) etc.
|
# Remove things like: (2010) (Omnibus) etc.
|
||||||
(
|
(
|
||||||
r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]",
|
r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|"
|
||||||
|
r"audiobook|audio\scd|paperback|turtleback|"
|
||||||
|
r"mass\s*market|edition|ed\.)[\])}]",
|
||||||
"",
|
"",
|
||||||
),
|
),
|
||||||
# Remove any strings that contain the substring edition inside
|
# Remove any strings that contain the substring edition inside
|
||||||
|
@ -371,3 +185,193 @@ class LubimyCzytac(Metadata):
|
||||||
not strip_joiners or token.lower() not in ("a", "and", "the", "&")
|
not strip_joiners or token.lower() not in ("a", "and", "the", "&")
|
||||||
):
|
):
|
||||||
yield token
|
yield token
|
||||||
|
|
||||||
|
|
||||||
|
class LubimyCzytacParser:
    """Extract book metadata from lubimyczytac.pl HTML trees.

    The parser is created with the tree of the search-result page.  It is
    later re-pointed at each individual book page by
    :meth:`parse_single_book`, so ``self.root`` always refers to the page
    currently being parsed.
    """

    PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
    PUBLISH_DATE_TEMPLATE = "<p id='pierwsze_wydanie'>Data pierwszego wydania: {0}</p>"
    # Uses its own element id so the two date paragraphs do not end up with
    # duplicate ids inside the same description fragment.
    PUBLISH_DATE_PL_TEMPLATE = (
        "<p id='pierwsze_wydanie_pl'>Data pierwszego wydania w Polsce: {0}</p>"
    )

    def __init__(self, root: HtmlElement, metadata: Metadata) -> None:
        """Store the tree to parse and the provider that owns this parser."""
        self.root = root
        self.metadata = metadata

    def parse_search_results(self) -> List[Dict]:
        """Return id, title, authors and url for every usable search hit.

        Hits lacking a title, a URL or authors are skipped.
        """
        # The XPath expressions do not depend on the hit, so build them once.
        title_xpath = (
            f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
            f"{LubimyCzytac.TITLE_TEXT_PATH}"
        )
        url_xpath = (
            f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}"
        )
        authors_xpath = (
            f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
            f"{LubimyCzytac.AUTHORS_PATH}"
        )

        matches = []
        for result in self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH):
            title = self._parse_xpath_node(root=result, xpath=title_xpath)
            book_url = self._parse_xpath_node(root=result, xpath=url_xpath)
            authors = self._parse_xpath_node(
                root=result, xpath=authors_xpath, take_first=False
            )
            if not all([title, book_url, authors]):
                continue
            matches.append(
                {
                    # The id is the first path segment after "/ksiazka/".
                    "id": book_url.replace("/ksiazka/", "").split("/")[0],
                    "title": title,
                    "authors": [strip_accents(author) for author in authors],
                    "url": LubimyCzytac.BASE_URL + book_url,
                }
            )
        return matches

    def parse_single_book(
        self, match: Dict, response, generic_cover: str
    ) -> MetaRecord:
        """Fill *match* in place with the details found on the book page.

        Args:
            match: Search hit produced by :meth:`parse_search_results`.
            response: HTTP response of the book page; only ``.text`` is read.
            generic_cover: Cover URL used when the page declares no cover.

        Returns:
            The same ``match`` dictionary, extended with the parsed fields.
        """
        self.root = fromstring(response.text)
        match["series"], match["series_index"] = self._parse_series()
        match["tags"] = self._parse_tags()
        match["publisher"] = self._parse_publisher()
        match["publishedDate"] = self._parse_from_summary(
            attribute_name="datePublished"
        )
        match["rating"] = self._parse_rating()
        match["description"] = self._parse_description()
        match["cover"] = self._parse_cover(generic_cover=generic_cover)
        match["source"] = {
            "id": self.metadata.__id__,
            "description": self.metadata.__name__,
            "link": LubimyCzytac.BASE_URL,
        }
        match["languages"] = self._parse_languages()
        match["identifiers"] = {
            "isbn": self._parse_isbn(),
            "lubimyczytac": match["id"],
        }
        return match

    def _parse_xpath_node(
        self,
        xpath: str,
        root: Optional[HtmlElement] = None,
        take_first: bool = True,
        strip_element: bool = True,
    ) -> Optional[Union[str, List[str]]]:
        """Evaluate *xpath* and return its result in a convenient shape.

        Args:
            xpath: Expression to evaluate.
            root: Node to evaluate against; defaults to ``self.root``.
            take_first: Return only the first result instead of all of them.
            strip_element: Strip whitespace from the first result.  Pass
                ``False`` when the result is an element rather than text.
                Only consulted when ``take_first`` is true.

        Returns:
            ``None`` when nothing matched, otherwise the first result or a
            list of all (stripped) results.
        """
        root = root if root is not None else self.root
        node = root.xpath(xpath)
        if not node:
            return None
        if take_first:
            return node[0].strip() if strip_element else node[0]
        return [x.strip() for x in node]

    def _parse_cover(self, generic_cover) -> Optional[str]:
        """Return the cover URL from the page, or *generic_cover* if absent."""
        return (
            self._parse_xpath_node(xpath=LubimyCzytac.COVER, take_first=True)
            or generic_cover
        )

    def _parse_publisher(self) -> Optional[str]:
        """Return the publisher name, or ``None`` when it is not listed."""
        return self._parse_xpath_node(xpath=LubimyCzytac.PUBLISHER, take_first=True)

    def _parse_languages(self) -> List[str]:
        """Return the recognised languages ("Polish", "English") of the book."""
        languages = []
        lang = self._parse_xpath_node(xpath=LubimyCzytac.LANGUAGES, take_first=True)
        if lang:
            if "polski" in lang:
                languages.append("Polish")
            if "angielski" in lang:
                languages.append("English")
        return languages

    def _parse_series(self) -> Tuple[Optional[str], Optional[Union[float, int]]]:
        """Return ``(series name, volume index)`` or ``(None, None)``.

        The series text is expected to look like ``"Name (tom 3)"``.  The
        index is 0 when the volume part is not numeric; for bundles such as
        ``"tom 1-3"`` the first volume is used.
        """
        series = self._parse_xpath_node(xpath=LubimyCzytac.SERIES, take_first=True)
        # Test for the exact separator that is split on below: a looser
        # "tom " check lets unrelated text through and breaks the unpacking.
        if not series or " (tom " not in series:
            return None, None

        series_name, series_info = series.split(" (tom ", 1)
        series_info = series_info.replace(" ", "").replace(")", "")
        # Check if book is not a bundle, i.e. chapter 1-3
        if "-" in series_info:
            series_info = series_info.split("-", 1)[0]

        series_index = 0
        if series_info.replace(".", "").isdigit():
            series_index = get_int_or_float(series_info)
        return series_name, series_index

    def _parse_tags(self) -> List[str]:
        """Return the genre tags with accents stripped (possibly empty)."""
        # _parse_xpath_node yields None, not an empty list, when nothing matched.
        tags = self._parse_xpath_node(xpath=LubimyCzytac.TAGS, take_first=False) or []
        return [
            strip_accents(w.replace(", itd.", " itd."))
            for w in tags
            if isinstance(w, str)
        ]

    def _parse_from_summary(
        self, attribute_name: str
    ) -> Optional[Union[str, int, float]]:
        """Read *attribute_name* from the page's JSON-LD summary script.

        Returns ``None`` when the script is missing, is not valid JSON, is
        not a JSON object, or lacks the attribute.  String values are
        stripped; other values are returned unchanged.
        """
        summary_text = self._parse_xpath_node(xpath=LubimyCzytac.SUMMARY)
        if not summary_text:
            return None
        try:
            data = json.loads(summary_text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None
        if not isinstance(data, dict):
            return None
        value = data.get(attribute_name)
        # Only strings can be stripped; JSON numbers are returned as they are.
        return value.strip() if isinstance(value, str) else value

    def _parse_rating(self) -> Optional[int]:
        """Return the site's rating halved and rounded, or ``None``."""
        rating = self._parse_xpath_node(xpath=LubimyCzytac.RATING)
        if not rating:
            return None
        # The site uses a decimal comma.
        return round(float(rating.replace(",", ".")) / 2)

    def _parse_date(self, xpath="first_publish") -> Optional[datetime.datetime]:
        """Parse one of the first-publication dates shown on the page.

        Args:
            xpath: ``"first_publish"`` for the original edition or
                ``"first_publish_pl"`` for the Polish one.

        Returns:
            The parsed date, or ``None`` when it is absent or unparseable.
        """
        options = {
            "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
            "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
        }
        date = self._parse_xpath_node(xpath=options.get(xpath))
        if not date:
            return None
        try:
            return parser.parse(date)
        except (ValueError, OverflowError):
            # The date only decorates the description; an odd value must not
            # abort parsing of the whole book.
            return None

    def _parse_isbn(self) -> Optional[str]:
        """Return the ISBN declared in the page metadata, if any."""
        return self._parse_xpath_node(xpath=LubimyCzytac.ISBN)

    def _parse_description(self) -> str:
        """Build the HTML description, including the extra info paragraphs.

        The description block of the page is preferred; the ``og:description``
        meta tag is the fallback.
        """
        description = ""
        description_node = self._parse_xpath_node(
            xpath=LubimyCzytac.DESCRIPTION, strip_element=False
        )
        if description_node is not None:
            # Drop source-attribution paragraphs before serialising.
            for source in self.root.xpath('//p[@class="source"]'):
                source.getparent().remove(source)
            description = tostring(description_node, method="html")
            description = sanitize_comments_html(description)
        else:
            description_node = self._parse_xpath_node(xpath=LubimyCzytac.META_TITLE)
            if description_node is not None:
                description = sanitize_comments_html(description_node)
        return self._add_extra_info_to_description(description=description)

    def _add_extra_info_to_description(self, description: str) -> str:
        """Append page count and first-publication dates when available."""
        pages = self._parse_from_summary(attribute_name="numberOfPages")
        if pages:
            description += LubimyCzytacParser.PAGES_TEMPLATE.format(pages)

        first_publish_date = self._parse_date()
        if first_publish_date:
            description += LubimyCzytacParser.PUBLISH_DATE_TEMPLATE.format(
                first_publish_date.strftime("%d.%m.%Y")
            )

        first_publish_date_pl = self._parse_date(xpath="first_publish_pl")
        if first_publish_date_pl:
            description += LubimyCzytacParser.PUBLISH_DATE_PL_TEMPLATE.format(
                first_publish_date_pl.strftime("%d.%m.%Y")
            )

        return description
|
||||||
|
|
|
@ -15,13 +15,44 @@
|
||||||
#
|
#
|
||||||
# You should have received a copy of the GNU General Public License
|
# You should have received a copy of the GNU General Public License
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
import abc
|
||||||
|
from typing import Dict, List, Optional, TypedDict, Union
|
||||||
|
|
||||||
|
|
||||||
class Metadata:
    """Base class for metadata providers.

    Subclasses set ``__name__`` and ``__id__`` and override :meth:`search`.
    """

    __name__ = "Generic"
    __id__ = "generic"

    def __init__(self):
        # Providers start enabled; set_status() toggles them.
        self.active = True

    def set_status(self, state):
        """Enable or disable this provider."""
        self.active = state

    # NOTE: this class does not use abc.ABCMeta, so the decorator marks
    # search() as the method providers override but does not stop Metadata
    # itself from being instantiated.
    @abc.abstractmethod
    def search(self, query: str, generic_cover: str = "") -> Optional[List]:
        """Look up *query* and return the matching records.

        Args:
            query: Free-text search string.
            generic_cover: Cover URL for results without a cover.  It
                defaults to ``""`` so the base signature agrees with the
                provider overrides.

        Returns:
            The base implementation returns ``None``.
        """
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class MetaSourceInfo(TypedDict):
    """Typed dictionary describing the provider a record came from."""

    # Provider identifier.
    id: str
    # Human-readable provider name.
    description: str
    # URL associated with the provider.
    link: str
|
||||||
|
|
||||||
|
|
||||||
|
class MetaRecord(TypedDict):
    """Typed dictionary for a single book record returned by a provider."""

    # Identity of the record.
    id: Union[str, int]
    title: str
    authors: List[str]
    url: str
    cover: str

    # Optional bibliographic details.
    series: Optional[str]
    series_index: Optional[Union[int, float]]
    tags: Optional[List[str]]
    publisher: Optional[str]
    publishedDate: Optional[str]
    rating: Optional[int]
    description: Optional[str]

    # Provider that produced the record.
    source: MetaSourceInfo

    languages: Optional[List[str]]
    # External identifiers keyed by scheme name.
    identifiers: Dict[str, Union[str, int]]
|
||||||
|
|
|
@ -16,3 +16,4 @@ lxml>=3.8.0,<4.7.0
|
||||||
flask-wtf>=0.14.2,<1.1.0
|
flask-wtf>=0.14.2,<1.1.0
|
||||||
markdown2==2.4.2
|
markdown2==2.4.2
|
||||||
html2text==2020.1.16
|
html2text==2020.1.16
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
|
Loading…
Reference in New Issue
Block a user