Merge remote-tracking branch 'douban/metadata_provider/douban'

# Conflicts:
#	cps/metadata_provider/amazon.py
#	cps/metadata_provider/lubimyczytac.py
Author:  Ozzieisaacs
Date:    2022-04-17 10:33:52 +02:00
Commit:  130af069aa
9 changed files with 311 additions and 54 deletions

diff --git a/cps/metadata_provider/amazon.py b/cps/metadata_provider/amazon.py

@@ -19,11 +19,13 @@
 import concurrent.futures
 import requests
 from bs4 import BeautifulSoup as BS  # requirement
+from typing import List, Optional
 try:
     import cchardet  # optional for better speed
 except ImportError:
     pass
+
+from cps import logger
 from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 import cps.logger as logger
@@ -31,6 +33,9 @@ import cps.logger as logger
 from operator import itemgetter
 
 log = logger.create()
+log = logger.create()
+
+
 class Amazon(Metadata):
     __name__ = "Amazon"
     __id__ = "amazon"
@@ -49,17 +54,21 @@ class Amazon(Metadata):
     def search(
         self, query: str, generic_cover: str = "", locale: str = "en"
-    ):
+    ) -> Optional[List[MetaRecord]]:
         #timer=time()
-        def inner(link, index) -> [dict, int]:
-            try:
-                with self.session as session:
-                    r = session.get(f"https://www.amazon.com{link}")
-                    r.raise_for_status()
-                    long_soup = BS(r.text, "lxml")  #~4sec :/
-                    soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"})
-                    if soup2 is None:
-                        return
+        def inner(link, index) -> tuple[dict, int]:
+            with self.session as session:
+                try:
+                    r = session.get(f"https://www.amazon.com/{link}")
+                    r.raise_for_status()
+                except Exception as e:
+                    log.warning(e)
+                    return
+                long_soup = BS(r.text, "lxml")  #~4sec :/
+                soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"})
+                if soup2 is None:
+                    return
                 try:
                     match = MetaRecord(
                         title = "",
                         authors = "",
@@ -109,22 +118,24 @@ class Amazon(Metadata):
                 return
 
         val = list()
-        try:
-            if self.active:
+        if self.active:
+            try:
                 results = self.session.get(
-                    f"https://www.amazon.com/s?k={query.replace(' ', '+')}"
-                    f"&i=digital-text&sprefix={query.replace(' ', '+')}"
+                    f"https://www.amazon.com/s?k={query.replace(' ', '+')}&i=digital-text&sprefix={query.replace(' ', '+')}"
                     f"%2Cdigital-text&ref=nb_sb_noss",
                     headers=self.headers)
                 results.raise_for_status()
-                soup = BS(results.text, 'html.parser')
-                links_list = [next(filter(lambda i: "digital-text" in i["href"], x.findAll("a")))["href"] for x in
-                              soup.findAll("div", attrs={"data-component-type": "s-search-result"})]
-                with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-                    fut = {executor.submit(inner, link, index) for index, link in enumerate(links_list[:5])}
-                    val = list(map(lambda x: x.result(), concurrent.futures.as_completed(fut)))
-                    result = list(filter(lambda x: x, val))
-                    return [x[0] for x in sorted(result, key=itemgetter(1))]  #sort by amazons listing order for best relevance
-        except requests.exceptions.HTTPError as e:
-            log.error_or_exception(e)
-            return []
+            except requests.exceptions.HTTPError as e:
+                log.error_or_exception(e)
+                return None
+            except Exception as e:
+                log.warning(e)
+                return None
+            soup = BS(results.text, 'html.parser')
+            links_list = [next(filter(lambda i: "digital-text" in i["href"], x.findAll("a")))["href"] for x in
+                          soup.findAll("div", attrs={"data-component-type": "s-search-result"})]
+            with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+                fut = {executor.submit(inner, link, index) for index, link in enumerate(links_list[:5])}
+                val = list(map(lambda x: x.result(), concurrent.futures.as_completed(fut)))
+                result = list(filter(lambda x: x, val))
+                return [x[0] for x in sorted(result, key=itemgetter(1))]  #sort by amazons listing order for best relevance
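Note: the reworked search keeps Amazon's listing order even though threads finish out of order: inner returns a (record, index) tuple or None on failure, and the caller filters falsy results and sorts by the saved index. A minimal standalone sketch of that pattern (fetch and links are illustrative stand-ins, not names from the codebase):

    import concurrent.futures

    def fetch(link, index):
        # Stand-in for inner(): network I/O that may fail and return None.
        if link is None:
            return None
        return ({"title": "book for " + link}, index)

    links = ["/dp/1", None, "/dp/3"]
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futs = {executor.submit(fetch, link, i) for i, link in enumerate(links)}
        vals = [f.result() for f in concurrent.futures.as_completed(futs)]

    # Drop failures, then restore the original listing order via the saved index.
    records = [rec for rec, _ in sorted(filter(None, vals), key=lambda t: t[1])]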

diff --git a/cps/metadata_provider/comicvine.py b/cps/metadata_provider/comicvine.py

@@ -21,8 +21,11 @@ from typing import Dict, List, Optional
 from urllib.parse import quote
 
 import requests
 
+from cps import logger
 from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 
+log = logger.create()
+
 
 class ComicVine(Metadata):
     __name__ = "ComicVine"
@@ -46,10 +49,15 @@ class ComicVine(Metadata):
         if title_tokens:
             tokens = [quote(t.encode("utf-8")) for t in title_tokens]
             query = "%20".join(tokens)
-        result = requests.get(
-            f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}",
-            headers=ComicVine.HEADERS,
-        )
+        try:
+            result = requests.get(
+                f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}",
+                headers=ComicVine.HEADERS,
+            )
+            result.raise_for_status()
+        except Exception as e:
+            log.warning(e)
+            return None
         for result in result.json()["results"]:
             match = self._parse_search_result(
                 result=result, generic_cover=generic_cover, locale=locale
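Note: every provider in this commit adopts the same guard: wrap the GET in try, call raise_for_status() so HTTP 4xx/5xx surface as exceptions, log a warning, and return None so the caller can skip the provider. The same shape recurs below in google.py, lubimyczytac.py, and scholar.py. A generic sketch (guarded_get is a name coined here, and the explicit timeout is an addition the diff's calls do not have):

    import requests

    def guarded_get(url, log, **kwargs):
        """Return a Response, or None if the request or status check fails."""
        try:
            r = requests.get(url, timeout=10, **kwargs)
            r.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
        except Exception as e:
            log.warning(e)
            return None
        return r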

diff --git a/cps/metadata_provider/douban.py b/cps/metadata_provider/douban.py
new file mode 100644

@@ -0,0 +1,206 @@
# -*- coding: utf-8 -*-
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
#   Copyright (C) 2022 xlivevil
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import re
from concurrent import futures
from typing import List, Optional

import requests
from html2text import HTML2Text
from lxml import etree

from cps import logger
from cps.services.Metadata import Metadata, MetaRecord, MetaSourceInfo

log = logger.create()


def html2text(html: str) -> str:
    h2t = HTML2Text()
    h2t.body_width = 0
    h2t.single_line_break = True
    h2t.emphasis_mark = "*"
    return h2t.handle(html)


class Douban(Metadata):
    __name__ = "豆瓣"
    __id__ = "douban"
    DESCRIPTION = "豆瓣"
    META_URL = "https://book.douban.com/"
    SEARCH_URL = "https://www.douban.com/j/search"

    ID_PATTERN = re.compile(r"sid: (?P<id>\d+),")
    AUTHORS_PATTERN = re.compile(r"作者|译者")
    PUBLISHER_PATTERN = re.compile(r"出版社")
    SUBTITLE_PATTERN = re.compile(r"副标题")
    PUBLISHED_DATE_PATTERN = re.compile(r"出版年")
    SERIES_PATTERN = re.compile(r"丛书")
    IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号")

    TITLE_XPATH = "//span[@property='v:itemreviewed']"
    COVER_XPATH = "//a[@class='nbg']"
    INFO_XPATH = "//*[@id='info']//span[@class='pl']"
    TAGS_XPATH = "//a[contains(@class, 'tag')]"
    DESCRIPTION_XPATH = "//div[@id='link-report']//div[@class='intro']"
    RATING_XPATH = "//div[@class='rating_self clearfix']/strong"

    session = requests.Session()
    session.headers = {
        'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
    }

    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        if self.active:
            log.debug(f"starting search {query} on douban")
            if title_tokens := list(
                self.get_title_tokens(query, strip_joiners=False)
            ):
                query = "+".join(title_tokens)

            try:
                r = self.session.get(
                    self.SEARCH_URL, params={"cat": 1001, "q": query}
                )
                r.raise_for_status()
            except Exception as e:
                log.warning(e)
                return None

            results = r.json()
            if results["total"] == 0:
                return []

            book_id_list = [
                self.ID_PATTERN.search(item).group("id")
                for item in results["items"][:10] if self.ID_PATTERN.search(item)
            ]

            with futures.ThreadPoolExecutor(max_workers=5) as executor:
                fut = [
                    executor.submit(self._parse_single_book, book_id, generic_cover)
                    for book_id in book_id_list
                ]
                val = [
                    future.result()
                    for future in futures.as_completed(fut) if future.result()
                ]
            return val

    def _parse_single_book(
        self, id: str, generic_cover: str = ""
    ) -> Optional[MetaRecord]:
        url = f"https://book.douban.com/subject/{id}/"
        try:
            r = self.session.get(url)
            r.raise_for_status()
        except Exception as e:
            log.warning(e)
            return None

        match = MetaRecord(
            id=id,
            title="",
            authors=[],
            url=url,
            source=MetaSourceInfo(
                id=self.__id__,
                description=self.DESCRIPTION,
                link=self.META_URL,
            ),
        )

        html = etree.HTML(r.content.decode("utf8"))
        match.title = html.xpath(self.TITLE_XPATH)[0].text
        match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover
        try:
            rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
        except Exception:
            rating_num = 0
        match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0

        tag_elements = html.xpath(self.TAGS_XPATH)
        if len(tag_elements):
            match.tags = [tag_element.text for tag_element in tag_elements]

        description_element = html.xpath(self.DESCRIPTION_XPATH)
        if len(description_element):
            match.description = html2text(etree.tostring(
                description_element[-1], encoding="utf8").decode("utf8"))

        info = html.xpath(self.INFO_XPATH)
        for element in info:
            text = element.text
            if self.AUTHORS_PATTERN.search(text):
                next = element.getnext()
                while next is not None and next.tag != "br":
                    match.authors.append(next.text)
                    next = next.getnext()
            elif self.PUBLISHER_PATTERN.search(text):
                match.publisher = element.tail.strip()
            elif self.SUBTITLE_PATTERN.search(text):
                match.title = f'{match.title}:' + element.tail.strip()
            elif self.PUBLISHED_DATE_PATTERN.search(text):
                match.publishedDate = self._clean_date(element.tail.strip())
            elif self.SERIES_PATTERN.search(text):
                match.series = element.getnext().text
            elif i_type := self.IDENTIFIERS_PATTERN.search(text):
                match.identifiers[i_type.group()] = element.tail.strip()

        return match

    def _clean_date(self, date: str) -> str:
        """
        Clean up the date string to be in the format YYYY-MM-DD

        Examples of possible patterns:
            '2014-7-16', '1988年4月', '1995-04', '2021-8', '2020-12-1', '1996年',
            '1972', '2004/11/01', '1959年3月北京第1版第1印'
        """
        year = date[:4]
        moon = "01"
        day = "01"

        if len(date) > 5:
            digit = []
            ls = []
            for i in range(5, len(date)):
                if date[i].isdigit():
                    digit.append(date[i])
                elif digit:
                    ls.append("".join(digit) if len(digit) == 2 else f"0{digit[0]}")
                    digit = []
            if digit:
                ls.append("".join(digit) if len(digit) == 2 else f"0{digit[0]}")

            moon = ls[0]
            if len(ls) > 1:
                day = ls[1]
        return f"{year}-{moon}-{day}"

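Note: _clean_date pads single-digit months and days to two digits and defaults both to "01". A quick check against the sample patterns in its docstring (this assumes Douban() can be constructed without arguments, as the Metadata subclasses here appear to allow):

    d = Douban()
    assert d._clean_date("2014-7-16") == "2014-07-16"
    assert d._clean_date("1988年4月") == "1988-04-01"
    assert d._clean_date("1996年") == "1996-01-01"
    assert d._clean_date("2004/11/01") == "2004-11-01"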
diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py

@@ -22,9 +22,12 @@ from urllib.parse import quote
 
 import requests
 
+from cps import logger
 from cps.isoLanguages import get_lang3, get_language_name
 from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 
+log = logger.create()
+
 
 class Google(Metadata):
     __name__ = "Google"
@@ -45,7 +48,12 @@ class Google(Metadata):
         if title_tokens:
             tokens = [quote(t.encode("utf-8")) for t in title_tokens]
             query = "+".join(tokens)
-        results = requests.get(Google.SEARCH_URL + query)
+        try:
+            results = requests.get(Google.SEARCH_URL + query)
+            results.raise_for_status()
+        except Exception as e:
+            log.warning(e)
+            return None
         for result in results.json().get("items", []):
             val.append(
                 self._parse_search_result(

diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py

@@ -27,9 +27,12 @@ from html2text import HTML2Text
 from lxml.html import HtmlElement, fromstring, tostring
 from markdown2 import Markdown
 
+from cps import logger
 from cps.isoLanguages import get_language_name
 from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 
+log = logger.create()
+
 SYMBOLS_TO_TRANSLATE = (
     "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
     "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
@@ -112,20 +115,23 @@ class LubimyCzytac(Metadata):
         self, query: str, generic_cover: str = "", locale: str = "en"
     ) -> Optional[List[MetaRecord]]:
         if self.active:
-            result = requests.get(self._prepare_query(title=query))
-            if result.text:
-                root = fromstring(result.text)
-                lc_parser = LubimyCzytacParser(root=root, metadata=self)
-                matches = lc_parser.parse_search_results()
-                if matches:
-                    with ThreadPool(processes=10) as pool:
-                        final_matches = pool.starmap(
-                            lc_parser.parse_single_book,
-                            [(match, generic_cover, locale) for match in matches],
-                        )
-                    return final_matches
-                return matches
-        return []
+            try:
+                result = requests.get(self._prepare_query(title=query))
+                result.raise_for_status()
+            except Exception as e:
+                log.warning(e)
+                return None
+            root = fromstring(result.text)
+            lc_parser = LubimyCzytacParser(root=root, metadata=self)
+            matches = lc_parser.parse_search_results()
+            if matches:
+                with ThreadPool(processes=10) as pool:
+                    final_matches = pool.starmap(
+                        lc_parser.parse_single_book,
+                        [(match, generic_cover, locale) for match in matches],
+                    )
+                return final_matches
+            return matches
@@ -202,7 +208,12 @@ class LubimyCzytacParser:
     def parse_single_book(
         self, match: MetaRecord, generic_cover: str, locale: str
     ) -> MetaRecord:
-        response = requests.get(match.url)
+        try:
+            response = requests.get(match.url)
+            response.raise_for_status()
+        except Exception as e:
+            log.warning(e)
+            return None
         self.root = fromstring(response.text)
         match.cover = self._parse_cover(generic_cover=generic_cover)
         match.description = self._parse_description()
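Note: parse_single_book can now return None for a failed fetch, so the final_matches list built by pool.starmap may contain None entries. Those entries propagate to the web layer, which is why metadata_search (below) now filters falsy results before serializing them.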

diff --git a/cps/metadata_provider/scholar.py b/cps/metadata_provider/scholar.py

@@ -28,8 +28,12 @@ try:
 except FakeUserAgentError:
     raise ImportError("No module named 'scholarly'")
 
+from cps import logger
 from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
 
+log = logger.create()
+
 
 class scholar(Metadata):
     __name__ = "Google Scholar"
     __id__ = "googlescholar"
@@ -44,7 +48,11 @@ class scholar(Metadata):
         if title_tokens:
             tokens = [quote(t.encode("utf-8")) for t in title_tokens]
             query = " ".join(tokens)
-        scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
+        try:
+            scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
+        except Exception as e:
+            log.warning(e)
+            return None
         for result in scholar_gen:
             match = self._parse_search_result(
                 result=result, generic_cover="", locale=locale

diff --git a/cps/search_metadata.py b/cps/search_metadata.py

@@ -138,6 +138,6 @@ def metadata_search():
             if active.get(c.__id__, True)
         }
         for future in concurrent.futures.as_completed(meta):
-            data.extend([asdict(x) for x in future.result()])
+            data.extend([asdict(x) for x in future.result() if x])
     # log.info({'Time elapsed {}'.format(current_milli_time()-start)})
     return Response(json.dumps(data), mimetype="application/json")
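Note: the added "if x" drops None entries inside a provider's result list, but a provider whose search() itself returned None would still raise TypeError when the comprehension iterates it. A more defensive collector would guard the whole result (a sketch with a coined name, not the committed code; the real loop also converts each record with dataclasses.asdict):

    def collect(completed_futures):
        data = []
        for future in completed_futures:
            result = future.result() or []  # treat a None result as "no results"
            data.extend(x for x in result if x)
        return data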

diff --git a/cps/static/js/edit_books.js b/cps/static/js/edit_books.js

@@ -33,7 +33,7 @@ $(".datepicker").datepicker({
     if (results) {
         pubDate = new Date(results[1], parseInt(results[2], 10) - 1, results[3]) || new Date(this.value);
         $(this).next('input')
-            .val(pubDate.toLocaleDateString(language))
+            .val(pubDate.toLocaleDateString(language.replaceAll("_", "-")))
             .removeClass("hidden");
     }
 }).trigger("change");
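Note: Date.prototype.toLocaleDateString expects a BCP 47 language tag such as "en-US". Calibre-Web's locale strings use underscores ("en_US"), which is not a valid tag and makes conforming implementations throw a RangeError, hence the replaceAll before the call.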

diff --git a/cps/static/js/get_meta.js b/cps/static/js/get_meta.js

@@ -92,14 +92,19 @@ $(function () {
             data: {"query": keyword},
             dataType: "json",
             success: function success(data) {
-                $("#meta-info").html("<ul id=\"book-list\" class=\"media-list\"></ul>");
-                data.forEach(function(book) {
-                    var $book = $(templates.bookResult(book));
-                    $book.find("img").on("click", function () {
-                        populateForm(book);
+                if (data.length) {
+                    $("#meta-info").html("<ul id=\"book-list\" class=\"media-list\"></ul>");
+                    data.forEach(function(book) {
+                        var $book = $(templates.bookResult(book));
+                        $book.find("img").on("click", function () {
+                            populateForm(book);
+                        });
+                        $("#book-list").append($book);
                     });
-                    $("#book-list").append($book);
-                });
+                }
+                else {
+                    $("#meta-info").html("<p class=\"text-danger\">" + msg.no_result + "!</p>" + $("#meta-info")[0].innerHTML)
+                }
             },
             error: function error() {
                 $("#meta-info").html("<p class=\"text-danger\">" + msg.search_error + "!</p>" + $("#meta-info")[0].innerHTML);