add series, languages and isbn to google provider

This commit is contained in:
collerek 2021-12-13 15:14:19 +01:00
parent 362fdc5716
commit d64589914f
4 changed files with 119 additions and 66 deletions

View File

@ -17,41 +17,93 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# Google Books api document: https://developers.google.com/books/docs/v1/using # Google Books api document: https://developers.google.com/books/docs/v1/using
from typing import Dict, List, Optional
from urllib.parse import quote
import requests import requests
from cps.services.Metadata import Metadata from cps.isoLanguages import get_lang3, get_language_name
from cps.services.Metadata import MetaRecord, Metadata
class Google(Metadata):
    """Metadata provider backed by the Google Books API.

    API docs: https://developers.google.com/books/docs/v1/using
    """

    __name__ = "Google"
    __id__ = "google"
    DESCRIPTION = "Google Books"
    META_URL = "https://books.google.com/"
    BOOK_URL = "https://books.google.com/books?id="
    SEARCH_URL = "https://www.googleapis.com/books/v1/volumes?q="
    ISBN_TYPE = "ISBN_13"

    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        """Search Google Books for ``query`` and return parsed matches.

        :param query: free-text title query typed by the user
        :param generic_cover: fallback cover URL when the volume has none
        :param locale: UI locale used to localize language names
        :return: list of MetaRecord dicts, or None when the provider is off
        """
        if self.active:
            val = list()
            # Tokenize the title and percent-encode each token so the
            # resulting string is safe to splice into the request URL.
            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
            if title_tokens:
                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
                query = "+".join(tokens)
            results = requests.get(Google.SEARCH_URL + query)
            # A zero-hit search has no "items" key at all -- default to []
            # instead of raising KeyError.
            for result in results.json().get("items", []):
                val.append(
                    self._parse_search_result(
                        result=result, generic_cover=generic_cover, locale=locale
                    )
                )
            return val

    def _parse_search_result(
        self, result: Dict, generic_cover: str, locale: str
    ) -> MetaRecord:
        """Map one raw API volume dict onto the provider-neutral MetaRecord."""
        match = dict()
        match["id"] = result["id"]
        match["title"] = result["volumeInfo"]["title"]
        match["authors"] = result["volumeInfo"].get("authors", [])
        match["url"] = Google.BOOK_URL + result["id"]
        match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover)
        match["description"] = result["volumeInfo"].get("description", "")
        match["languages"] = self._parse_languages(result=result, locale=locale)
        match["publisher"] = result["volumeInfo"].get("publisher", "")
        match["publishedDate"] = result["volumeInfo"].get("publishedDate", "")
        match["rating"] = result["volumeInfo"].get("averageRating", 0)
        # Google Books exposes no series information.
        match["series"], match["series_index"] = "", 1
        match["tags"] = result["volumeInfo"].get("categories", [])
        match["source"] = {
            "id": self.__id__,
            "description": Google.DESCRIPTION,
            "link": Google.META_URL,
        }
        match["identifiers"] = {
            "google": match.get("id"),
        }
        match = self._parse_isbn(result=result, match=match)
        return match

    @staticmethod
    def _parse_isbn(result: Dict, match: Dict) -> Dict:
        """Copy the first ISBN-13 industry identifier into match, if any."""
        identifiers = result["volumeInfo"].get("industryIdentifiers", [])
        for identifier in identifiers:
            if identifier.get("type") == Google.ISBN_TYPE:
                match["identifiers"]["isbn"] = identifier.get("identifier")
                break
        return match

    @staticmethod
    def _parse_cover(result: Dict, generic_cover: str) -> str:
        """Return the https thumbnail URL, or the generic cover fallback."""
        if result["volumeInfo"].get("imageLinks"):
            cover_url = result["volumeInfo"]["imageLinks"]["thumbnail"]
            # The API hands out http:// links; upgrade to https.
            return cover_url.replace("http://", "https://")
        return generic_cover

    @staticmethod
    def _parse_languages(result: Dict, locale: str) -> List[str]:
        """Localize the volume's ISO-639-1 language code, if present.

        Bug fix: the "language" field lives inside "volumeInfo" (as with
        every other field this class reads), not at the top level of the
        result -- the old ``result.get("language")`` always returned "".
        """
        language_iso2 = result["volumeInfo"].get("language", "")
        languages = (
            [get_language_name(locale, get_lang3(language_iso2))]
            if language_iso2
            else []
        )
        return languages

View File

@ -107,7 +107,9 @@ class LubimyCzytac(Metadata):
SUMMARY = "//script[@type='application/ld+json']//text()" SUMMARY = "//script[@type='application/ld+json']//text()"
def search(self, query: str, generic_cover: str = "") -> Optional[List]: def search(
self, query: str, generic_cover: str = "", locale: str = "en"
) -> Optional[List[MetaRecord]]:
if self.active: if self.active:
result = requests.get(self._prepare_query(title=query)) result = requests.get(self._prepare_query(title=query))
root = fromstring(result.text) root = fromstring(result.text)
@ -117,10 +119,7 @@ class LubimyCzytac(Metadata):
with ThreadPool(processes=10) as pool: with ThreadPool(processes=10) as pool:
final_matches = pool.starmap( final_matches = pool.starmap(
lc_parser.parse_single_book, lc_parser.parse_single_book,
[ [(match, generic_cover) for match in matches],
(match, generic_cover)
for match in matches
],
) )
return final_matches return final_matches
return matches return matches
@ -192,26 +191,25 @@ class LubimyCzytacParser:
) )
return matches return matches
def parse_single_book( def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord:
self, match: Dict, generic_cover: str
) -> MetaRecord:
response = requests.get(match.get("url")) response = requests.get(match.get("url"))
self.root = fromstring(response.text) self.root = fromstring(response.text)
match["series"], match["series_index"] = self._parse_series() match["cover"] = self._parse_cover(generic_cover=generic_cover)
match["tags"] = self._parse_tags() match["description"] = self._parse_description()
match["languages"] = self._parse_languages()
match["publisher"] = self._parse_publisher() match["publisher"] = self._parse_publisher()
match["publishedDate"] = self._parse_from_summary( match["publishedDate"] = self._parse_from_summary(
attribute_name="datePublished" attribute_name="datePublished"
) )
match["rating"] = self._parse_rating() match["rating"] = self._parse_rating()
match["description"] = self._parse_description() match["series"], match["series_index"] = self._parse_series()
match["cover"] = self._parse_cover(generic_cover=generic_cover) match["tags"] = self._parse_tags()
match["source"] = { match["source"] = {
"id": self.metadata.__id__, "id": self.metadata.__id__,
"description": self.metadata.__name__, "description": self.metadata.__name__,
"link": LubimyCzytac.BASE_URL, "link": LubimyCzytac.BASE_URL,
} }
match["languages"] = self._parse_languages()
match["identifiers"] = { match["identifiers"] = {
"isbn": self._parse_isbn(), "isbn": self._parse_isbn(),
"lubimyczytac": match["id"], "lubimyczytac": match["id"],

View File

@ -30,7 +30,7 @@ from sqlalchemy.exc import InvalidRequestError, OperationalError
from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.orm.attributes import flag_modified
from cps.services.Metadata import Metadata from cps.services.Metadata import Metadata
from . import constants, logger, ub from . import constants, get_locale, logger, ub
meta = Blueprint("metadata", __name__) meta = Blueprint("metadata", __name__)
@ -113,11 +113,12 @@ def metadata_search():
query = request.form.to_dict().get("query") query = request.form.to_dict().get("query")
data = list() data = list()
active = current_user.view_settings.get("metadata", {}) active = current_user.view_settings.get("metadata", {})
locale = get_locale()
if query: if query:
static_cover = url_for("static", filename="generic_cover.jpg") static_cover = url_for("static", filename="generic_cover.jpg")
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
meta = { meta = {
executor.submit(c.search, query, static_cover): c executor.submit(c.search, query, static_cover, locale): c
for c in cl for c in cl
if active.get(c.__id__, True) if active.get(c.__id__, True)
} }

View File

@ -20,6 +20,30 @@ import re
from typing import Dict, Generator, List, Optional, TypedDict, Union from typing import Dict, Generator, List, Optional, TypedDict, Union
# Identifies which metadata provider a search record came from.
MetaSourceInfo = TypedDict(
    "MetaSourceInfo",
    {
        "id": str,  # provider id, e.g. "google"
        "description": str,  # display name of the provider
        "link": str,  # provider homepage URL
    },
)
# Provider-neutral search result: every metadata backend (Google,
# LubimyCzytac, ...) normalizes its raw response into this shape.
MetaRecord = TypedDict(
    "MetaRecord",
    {
        "id": Union[str, int],  # provider-specific volume/book id
        "title": str,
        "authors": List[str],
        "url": str,  # link to the book's page at the provider
        "cover": str,  # cover image URL (may be the generic fallback)
        "series": Optional[str],
        "series_index": Optional[Union[int, float]],
        "tags": Optional[List[str]],
        "publisher": Optional[str],
        "publishedDate": Optional[str],
        "rating": Optional[int],
        "description": Optional[str],
        "source": MetaSourceInfo,  # which provider produced this record
        "languages": Optional[List[str]],
        "identifiers": Dict[str, Union[str, int]],  # e.g. {"isbn": ...}
    },
)
class Metadata: class Metadata:
__name__ = "Generic" __name__ = "Generic"
__id__ = "generic" __id__ = "generic"
@ -31,7 +55,9 @@ class Metadata:
self.active = state self.active = state
@abc.abstractmethod @abc.abstractmethod
def search(self, query: str, generic_cover: str = ""): def search(
self, query: str, generic_cover: str = "", locale: str = "en"
) -> Optional[List[MetaRecord]]:
pass pass
@staticmethod @staticmethod
@ -73,27 +99,3 @@ class Metadata:
not strip_joiners or token.lower() not in ("a", "and", "the", "&") not strip_joiners or token.lower() not in ("a", "and", "the", "&")
): ):
yield token yield token
class MetaSourceInfo(TypedDict):
id: str
description: str
link: str
class MetaRecord(TypedDict):
id: Union[str, int]
title: str
authors: List[str]
url: str
cover: str
series: Optional[str]
series_index: Optional[Union[int, float]]
tags: Optional[List[str]]
publisher: Optional[str]
publishedDate: Optional[str]
rating: Optional[int]
description: Optional[str]
source: MetaSourceInfo
languages: Optional[List[str]]
identifiers: Dict[str, Union[str, int]]