unify scholar

This commit is contained in:
collerek 2021-12-13 17:21:41 +01:00
parent d64589914f
commit 51bf35c2e4
8 changed files with 172 additions and 140 deletions

View File

@ -17,49 +17,68 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# ComicVine api document: https://comicvine.gamespot.com/api/documentation # ComicVine api document: https://comicvine.gamespot.com/api/documentation
from typing import Dict, List, Optional
from urllib.parse import quote
import requests import requests
from cps.services.Metadata import Metadata from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
class ComicVine(Metadata): class ComicVine(Metadata):
__name__ = "ComicVine" __name__ = "ComicVine"
__id__ = "comicvine" __id__ = "comicvine"
DESCRIPTION = "ComicVine Books"
META_URL = "https://comicvine.gamespot.com/"
API_KEY = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
BASE_URL = (
f"https://comicvine.gamespot.com/api/search?api_key={API_KEY}"
f"&resources=issue&query="
)
QUERY_PARAMS = "&sort=name:desc&format=json"
HEADERS = {"User-Agent": "Not Evil Browser"}
def search(self, query, generic_cover=""): def search(
self, query: str, generic_cover: str = "", locale: str = "en"
) -> Optional[List[MetaRecord]]:
val = list() val = list()
apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
if self.active: if self.active:
headers = { title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
'User-Agent': 'Not Evil Browser' if title_tokens:
} tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = "%20".join(tokens)
result = requests.get("https://comicvine.gamespot.com/api/search?api_key=" result = requests.get(
+ apikey + "&resources=issue&query=" + query + "&sort=name:desc&format=json", headers=headers) f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}",
for r in result.json()['results']: headers=ComicVine.HEADERS,
seriesTitle = r['volume'].get('name', "") )
if r.get('store_date'): for result in result.json()["results"]:
dateFomers = r.get('store_date') match = self._parse_search_result(
else: result=result, generic_cover=generic_cover, locale=locale
dateFomers = r.get('date_added') )
v = dict() val.append(match)
v['id'] = r['id']
v['title'] = seriesTitle + " #" + r.get('issue_number', "0") + " - " + ( r.get('name', "") or "")
v['authors'] = r.get('authors', [])
v['description'] = r.get('description', "")
v['publisher'] = ""
v['publishedDate'] = dateFomers
v['tags'] = ["Comics", seriesTitle]
v['rating'] = 0
v['series'] = seriesTitle
v['cover'] = r['image'].get('original_url', generic_cover)
v['source'] = {
"id": self.__id__,
"description": "ComicVine Books",
"link": "https://comicvine.gamespot.com/"
}
v['url'] = r.get('site_detail_url', "")
val.append(v)
return val return val
def _parse_search_result(
    self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
    """Convert a single ComicVine search hit into a ``MetaRecord``.

    Args:
        result: One entry of the ``"results"`` list from the ComicVine
            search response. Must contain ``"id"``, ``"volume"`` and
            ``"image"``; every other key is optional.
        generic_cover: Cover used when the hit has no ``original_url``.
        locale: Accepted for signature parity with the other providers;
            not used by this parser.

    Returns:
        The populated ``MetaRecord`` for the issue.

    Raises:
        KeyError: If ``"id"``, ``"volume"`` or ``"image"`` is missing.
    """
    series = result["volume"].get("name", "")
    series_index = result.get("issue_number", 0)
    # The "name" key can be present with a null value; the pre-refactor
    # code guarded against that with ``or ""``. Without the guard the
    # title would end in the literal text "None".
    issue_name = result.get("name") or ""
    match = MetaRecord(
        id=result["id"],
        # Keep the space before "#", as the previous title format did.
        title=f"{series} #{series_index} - {issue_name}",
        authors=result.get("authors", []),
        url=result.get("site_detail_url", ""),
        source=MetaSourceInfo(
            id=self.__id__,
            description=ComicVine.DESCRIPTION,
            link=ComicVine.META_URL,
        ),
        series=series,
    )
    match.cover = result["image"].get("original_url", generic_cover)
    match.description = result.get("description", "")
    # Fall back to "date_added" whenever "store_date" is missing OR empty;
    # a plain ``.get(key, default)`` only covers the missing-key case.
    match.publishedDate = result.get("store_date") or result.get("date_added")
    match.series_index = series_index
    match.tags = ["Comics", series]
    match.identifiers = {"comicvine": match.id}
    return match

View File

@ -23,7 +23,7 @@ from urllib.parse import quote
import requests import requests
from cps.isoLanguages import get_lang3, get_language_name from cps.isoLanguages import get_lang3, get_language_name
from cps.services.Metadata import MetaRecord, Metadata from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
class Google(Metadata): class Google(Metadata):
@ -56,38 +56,37 @@ class Google(Metadata):
def _parse_search_result( def _parse_search_result(
self, result: Dict, generic_cover: str, locale: str self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord: ) -> MetaRecord:
match = dict() match = MetaRecord(
match["id"] = result["id"] id=result["id"],
match["title"] = result["volumeInfo"]["title"] title=result["volumeInfo"]["title"],
match["authors"] = result["volumeInfo"].get("authors", []) authors=result["volumeInfo"].get("authors", []),
match["url"] = Google.BOOK_URL + result["id"] url=Google.BOOK_URL + result["id"],
match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover) source=MetaSourceInfo(
match["description"] = result["volumeInfo"].get("description", "") id=self.__id__,
match["languages"] = self._parse_languages(result=result, locale=locale) description=Google.DESCRIPTION,
match["publisher"] = result["volumeInfo"].get("publisher", "") link=Google.META_URL,
match["publishedDate"] = result["volumeInfo"].get("publishedDate", "") ),
match["rating"] = result["volumeInfo"].get("averageRating", 0) )
match["series"], match["series_index"] = "", 1
match["tags"] = result["volumeInfo"].get("categories", [])
match["source"] = { match.cover = self._parse_cover(result=result, generic_cover=generic_cover)
"id": self.__id__, match.description = result["volumeInfo"].get("description", "")
"description": Google.DESCRIPTION, match.languages = self._parse_languages(result=result, locale=locale)
"link": Google.META_URL, match.publisher = result["volumeInfo"].get("publisher", "")
} match.publishedDate = result["volumeInfo"].get("publishedDate", "")
match.rating = result["volumeInfo"].get("averageRating", 0)
match.series, match.series_index = "", 1
match.tags = result["volumeInfo"].get("categories", [])
match["identifiers"] = { match.identifiers = {"google": match.id}
"google": match.get("id"),
}
match = self._parse_isbn(result=result, match=match) match = self._parse_isbn(result=result, match=match)
return match return match
@staticmethod @staticmethod
def _parse_isbn(result: Dict, match: Dict) -> Dict: def _parse_isbn(result: Dict, match: MetaRecord) -> MetaRecord:
identifiers = result["volumeInfo"].get("industryIdentifiers", []) identifiers = result["volumeInfo"].get("industryIdentifiers", [])
for identifier in identifiers: for identifier in identifiers:
if identifier.get("type") == Google.ISBN_TYPE: if identifier.get("type") == Google.ISBN_TYPE:
match["identifiers"]["isbn"] = identifier.get("identifier") match.identifiers["isbn"] = identifier.get("identifier")
break break
return match return match
@ -100,7 +99,7 @@ class Google(Metadata):
@staticmethod @staticmethod
def _parse_languages(result: Dict, locale: str) -> List[str]: def _parse_languages(result: Dict, locale: str) -> List[str]:
language_iso2 = result.get("language", "") language_iso2 = result["volumeInfo"].get("language", "")
languages = ( languages = (
[get_language_name(locale, get_lang3(language_iso2))] [get_language_name(locale, get_lang3(language_iso2))]
if language_iso2 if language_iso2

View File

@ -27,7 +27,7 @@ from html2text import HTML2Text
from lxml.html import HtmlElement, fromstring, tostring from lxml.html import HtmlElement, fromstring, tostring
from markdown2 import Markdown from markdown2 import Markdown
from cps.services.Metadata import MetaRecord, Metadata from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
SYMBOLS_TO_TRANSLATE = ( SYMBOLS_TO_TRANSLATE = (
"öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ", "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
@ -158,61 +158,60 @@ class LubimyCzytacParser:
self.root = root self.root = root
self.metadata = metadata self.metadata = metadata
def parse_search_results(self) -> List[Dict]: def parse_search_results(self) -> List[MetaRecord]:
matches = [] matches = []
results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH) results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
for result in results: for result in results:
title = self._parse_xpath_node( title = self._parse_xpath_node(
root=result, root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.TITLE_TEXT_PATH}", f"{LubimyCzytac.TITLE_TEXT_PATH}",
) )
book_url = self._parse_xpath_node( book_url = self._parse_xpath_node(
root=result, root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.URL_PATH}", f"{LubimyCzytac.URL_PATH}",
) )
authors = self._parse_xpath_node( authors = self._parse_xpath_node(
root=result, root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.AUTHORS_PATH}", f"{LubimyCzytac.AUTHORS_PATH}",
take_first=False, take_first=False,
) )
if not all([title, book_url, authors]): if not all([title, book_url, authors]):
continue continue
matches.append( matches.append(
{ MetaRecord(
"id": book_url.replace(f"/ksiazka/", "").split("/")[0], id=book_url.replace(f"/ksiazka/", "").split("/")[0],
"title": title, title=title,
"authors": [strip_accents(author) for author in authors], authors=[strip_accents(author) for author in authors],
"url": LubimyCzytac.BASE_URL + book_url, url=LubimyCzytac.BASE_URL + book_url,
} source=MetaSourceInfo(
id=self.metadata.__id__,
description=self.metadata.__name__,
link=LubimyCzytac.BASE_URL,
)
)
) )
return matches return matches
def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord: def parse_single_book(self, match: MetaRecord, generic_cover: str) -> MetaRecord:
response = requests.get(match.get("url")) response = requests.get(match.url)
self.root = fromstring(response.text) self.root = fromstring(response.text)
match["cover"] = self._parse_cover(generic_cover=generic_cover) match.cover = self._parse_cover(generic_cover=generic_cover)
match["description"] = self._parse_description() match.description = self._parse_description()
match["languages"] = self._parse_languages() match.languages = self._parse_languages()
match["publisher"] = self._parse_publisher() match.publisher = self._parse_publisher()
match["publishedDate"] = self._parse_from_summary( match.publishedDate = self._parse_from_summary(
attribute_name="datePublished" attribute_name="datePublished"
) )
match["rating"] = self._parse_rating() match.rating = self._parse_rating()
match["series"], match["series_index"] = self._parse_series() match.series, match.series_index = self._parse_series()
match["tags"] = self._parse_tags() match.tags = self._parse_tags()
match.identifiers = {
match["source"] = {
"id": self.metadata.__id__,
"description": self.metadata.__name__,
"link": LubimyCzytac.BASE_URL,
}
match["identifiers"] = {
"isbn": self._parse_isbn(), "isbn": self._parse_isbn(),
"lubimyczytac": match["id"], "lubimyczytac": match.id,
} }
return match return match

View File

@ -15,47 +15,53 @@
# #
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import itertools
from typing import Dict, List, Optional
from urllib.parse import quote
from scholarly import scholarly from scholarly import scholarly
from cps.services.Metadata import Metadata from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
class scholar(Metadata): class scholar(Metadata):
__name__ = "Google Scholar" __name__ = "Google Scholar"
__id__ = "googlescholar" __id__ = "googlescholar"
META_URL = "https://scholar.google.com/"
def search(self, query, generic_cover=""): def search(
self, query: str, generic_cover: str = "", locale: str = "en"
) -> Optional[List[MetaRecord]]:
val = list() val = list()
if self.active: if self.active:
scholar_gen = scholarly.search_pubs(' '.join(query.split('+'))) title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
i = 0 if title_tokens:
for publication in scholar_gen: tokens = [quote(t.encode("utf-8")) for t in title_tokens]
v = dict() query = " ".join(tokens)
v['id'] = "1234" # publication['bib'].get('title') scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
v['title'] = publication['bib'].get('title') for result in scholar_gen:
v['authors'] = publication['bib'].get('author', []) match = self._parse_search_result(
v['description'] = publication['bib'].get('abstract', "") result=result, generic_cover=generic_cover, locale=locale
v['publisher'] = publication['bib'].get('venue', "") )
if publication['bib'].get('pub_year'): val.append(match)
v['publishedDate'] = publication['bib'].get('pub_year')+"-01-01"
else:
v['publishedDate'] = ""
v['tags'] = ""
v['ratings'] = 0
v['series'] = ""
v['cover'] = generic_cover
v['url'] = publication.get('pub_url') or publication.get('eprint_url') or "",
v['source'] = {
"id": self.__id__,
"description": "Google Scholar",
"link": "https://scholar.google.com/"
}
val.append(v)
i += 1
if (i >= 10):
break
return val return val
def _parse_search_result(
    self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
    """Convert a single Google Scholar publication into a ``MetaRecord``.

    Args:
        result: One publication yielded by ``scholarly.search_pubs``. Must
            contain a ``"bib"`` mapping; all other keys are optional.
        generic_cover: Cover used when the publication carries no image.
        locale: Accepted for signature parity with the other providers;
            not used by this parser.

    Returns:
        The populated ``MetaRecord``. ``publishedDate`` is
        ``"<pub_year>-01-01"`` or ``""`` when no year is available.

    Raises:
        KeyError: If ``"bib"`` is missing from ``result``.
    """
    bib = result["bib"]
    # Use an ``or`` chain so an empty/None "pub_url" still falls back to
    # "eprint_url"; nested ``.get`` defaults only cover a missing key.
    url = result.get("pub_url") or result.get("eprint_url") or ""
    match = MetaRecord(
        id=url,
        title=bib.get("title"),
        authors=bib.get("author", []),
        url=url,
        source=MetaSourceInfo(
            id=self.__id__, description=self.__name__, link=scholar.META_URL
        ),
    )
    match.cover = result.get("image", {}).get("original_url", generic_cover)
    match.description = bib.get("abstract", "")
    match.publisher = bib.get("venue", "")
    # "pub_year" is optional: concatenating None with a str would raise
    # TypeError, so emit an empty date when the year is absent.
    pub_year = bib.get("pub_year")
    match.publishedDate = f"{pub_year}-01-01" if pub_year else ""
    match.identifiers = {"scholar": match.id}
    return match

View File

@ -22,6 +22,7 @@ import inspect
import json import json
import os import os
import sys import sys
from dataclasses import asdict
from flask import Blueprint, Response, request, url_for from flask import Blueprint, Response, request, url_for
from flask_login import current_user from flask_login import current_user
@ -99,11 +100,13 @@ def metadata_change_active_provider(prov_name):
log.error("Invalid request received: {}".format(request)) log.error("Invalid request received: {}".format(request))
return "Invalid request", 400 return "Invalid request", 400
if "initial" in new_state and prov_name: if "initial" in new_state and prov_name:
for c in cl: data = []
if c.__id__ == prov_name: provider = next((c for c in cl if c.__id__ == prov_name), None)
data = c.search(new_state.get("query", "")) if provider is not None:
break data = provider.search(new_state.get("query", ""))
return Response(json.dumps(data), mimetype="application/json") return Response(
json.dumps([asdict(x) for x in data]), mimetype="application/json"
)
return "" return ""
@ -123,5 +126,5 @@ def metadata_search():
if active.get(c.__id__, True) if active.get(c.__id__, True)
} }
for future in concurrent.futures.as_completed(meta): for future in concurrent.futures.as_completed(meta):
data.extend(future.result()) data.extend([asdict(x) for x in future.result()])
return Response(json.dumps(data), mimetype="application/json") return Response(json.dumps(data), mimetype="application/json")

View File

@ -16,32 +16,38 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import abc import abc
import dataclasses
import os
import re import re
from typing import Dict, Generator, List, Optional, TypedDict, Union from typing import Dict, Generator, List, Optional, Union
from cps import constants
class MetaSourceInfo(TypedDict): @dataclasses.dataclass
class MetaSourceInfo:
id: str id: str
description: str description: str
link: str link: str
class MetaRecord(TypedDict): @dataclasses.dataclass
class MetaRecord:
id: Union[str, int] id: Union[str, int]
title: str title: str
authors: List[str] authors: List[str]
url: str url: str
cover: str
series: Optional[str]
series_index: Optional[Union[int, float]]
tags: Optional[List[str]]
publisher: Optional[str]
publishedDate: Optional[str]
rating: Optional[int]
description: Optional[str]
source: MetaSourceInfo source: MetaSourceInfo
languages: Optional[List[str]] cover: str = os.path.join(constants.STATIC_DIR, 'generic_cover.jpg')
identifiers: Dict[str, Union[str, int]] description: Optional[str] = ""
series: Optional[str] = None
series_index: Optional[Union[int, float]] = 0
identifiers: Dict[str, Union[str, int]] = dataclasses.field(default_factory=dict)
publisher: Optional[str] = None
publishedDate: Optional[str] = None
rating: Optional[int] = 0
languages: Optional[List[str]] = dataclasses.field(default_factory=list)
tags: Optional[List[str]] = dataclasses.field(default_factory=list)
class Metadata: class Metadata:

View File

@ -32,6 +32,9 @@ SQLAlchemy-Utils>=0.33.5,<0.38.0
# extracting metadata # extracting metadata
rarfile>=2.7 rarfile>=2.7
scholarly>=1.2.0, <1.5 scholarly>=1.2.0, <1.5
markdown2==2.4.2
html2text==2020.1.16
python-dateutil==2.8.2
# other # other
natsort>=2.2.0,<8.1.0 natsort>=2.2.0,<8.1.0

View File

@ -14,6 +14,3 @@ Wand>=0.4.4,<0.7.0
unidecode>=0.04.19,<1.3.0 unidecode>=0.04.19,<1.3.0
lxml>=3.8.0,<4.7.0 lxml>=3.8.0,<4.7.0
flask-wtf>=0.14.2,<1.1.0 flask-wtf>=0.14.2,<1.1.0
markdown2==2.4.2
html2text==2020.1.16
python-dateutil==2.8.2