unify scholar

This commit is contained in:
collerek 2021-12-13 17:21:41 +01:00
parent d64589914f
commit 51bf35c2e4
8 changed files with 172 additions and 140 deletions

View File

@ -17,49 +17,68 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ComicVine api document: https://comicvine.gamespot.com/api/documentation
from typing import Dict, List, Optional
from urllib.parse import quote
import requests
from cps.services.Metadata import Metadata
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
class ComicVine(Metadata):
__name__ = "ComicVine"
__id__ = "comicvine"
DESCRIPTION = "ComicVine Books"
META_URL = "https://comicvine.gamespot.com/"
API_KEY = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
BASE_URL = (
f"https://comicvine.gamespot.com/api/search?api_key={API_KEY}"
f"&resources=issue&query="
)
QUERY_PARAMS = "&sort=name:desc&format=json"
HEADERS = {"User-Agent": "Not Evil Browser"}
def search(self, query, generic_cover=""):
def search(
self, query: str, generic_cover: str = "", locale: str = "en"
) -> Optional[List[MetaRecord]]:
val = list()
apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
if self.active:
headers = {
'User-Agent': 'Not Evil Browser'
}
result = requests.get("https://comicvine.gamespot.com/api/search?api_key="
+ apikey + "&resources=issue&query=" + query + "&sort=name:desc&format=json", headers=headers)
for r in result.json()['results']:
seriesTitle = r['volume'].get('name', "")
if r.get('store_date'):
dateFomers = r.get('store_date')
else:
dateFomers = r.get('date_added')
v = dict()
v['id'] = r['id']
v['title'] = seriesTitle + " #" + r.get('issue_number', "0") + " - " + ( r.get('name', "") or "")
v['authors'] = r.get('authors', [])
v['description'] = r.get('description', "")
v['publisher'] = ""
v['publishedDate'] = dateFomers
v['tags'] = ["Comics", seriesTitle]
v['rating'] = 0
v['series'] = seriesTitle
v['cover'] = r['image'].get('original_url', generic_cover)
v['source'] = {
"id": self.__id__,
"description": "ComicVine Books",
"link": "https://comicvine.gamespot.com/"
}
v['url'] = r.get('site_detail_url', "")
val.append(v)
title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
if title_tokens:
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = "%20".join(tokens)
result = requests.get(
f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}",
headers=ComicVine.HEADERS,
)
for result in result.json()["results"]:
match = self._parse_search_result(
result=result, generic_cover=generic_cover, locale=locale
)
val.append(match)
return val
def _parse_search_result(
self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
series = result["volume"].get("name", "")
series_index = result.get("issue_number", 0)
issue_name = result.get("name", "")
match = MetaRecord(
id=result["id"],
title=f"{series}#{series_index} - {issue_name}",
authors=result.get("authors", []),
url=result.get("site_detail_url", ""),
source=MetaSourceInfo(
id=self.__id__,
description=ComicVine.DESCRIPTION,
link=ComicVine.META_URL,
),
series=series,
)
match.cover = result["image"].get("original_url", generic_cover)
match.description = result.get("description", "")
match.publishedDate = result.get("store_date", result.get("date_added"))
match.series_index = series_index
match.tags = ["Comics", series]
match.identifiers = {"comicvine": match.id}
return match

View File

@ -23,7 +23,7 @@ from urllib.parse import quote
import requests
from cps.isoLanguages import get_lang3, get_language_name
from cps.services.Metadata import MetaRecord, Metadata
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
class Google(Metadata):
@ -56,38 +56,37 @@ class Google(Metadata):
def _parse_search_result(
self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
match = dict()
match["id"] = result["id"]
match["title"] = result["volumeInfo"]["title"]
match["authors"] = result["volumeInfo"].get("authors", [])
match["url"] = Google.BOOK_URL + result["id"]
match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover)
match["description"] = result["volumeInfo"].get("description", "")
match["languages"] = self._parse_languages(result=result, locale=locale)
match["publisher"] = result["volumeInfo"].get("publisher", "")
match["publishedDate"] = result["volumeInfo"].get("publishedDate", "")
match["rating"] = result["volumeInfo"].get("averageRating", 0)
match["series"], match["series_index"] = "", 1
match["tags"] = result["volumeInfo"].get("categories", [])
match = MetaRecord(
id=result["id"],
title=result["volumeInfo"]["title"],
authors=result["volumeInfo"].get("authors", []),
url=Google.BOOK_URL + result["id"],
source=MetaSourceInfo(
id=self.__id__,
description=Google.DESCRIPTION,
link=Google.META_URL,
),
)
match["source"] = {
"id": self.__id__,
"description": Google.DESCRIPTION,
"link": Google.META_URL,
}
match.cover = self._parse_cover(result=result, generic_cover=generic_cover)
match.description = result["volumeInfo"].get("description", "")
match.languages = self._parse_languages(result=result, locale=locale)
match.publisher = result["volumeInfo"].get("publisher", "")
match.publishedDate = result["volumeInfo"].get("publishedDate", "")
match.rating = result["volumeInfo"].get("averageRating", 0)
match.series, match.series_index = "", 1
match.tags = result["volumeInfo"].get("categories", [])
match["identifiers"] = {
"google": match.get("id"),
}
match.identifiers = {"google": match.id}
match = self._parse_isbn(result=result, match=match)
return match
@staticmethod
def _parse_isbn(result: Dict, match: Dict) -> Dict:
def _parse_isbn(result: Dict, match: MetaRecord) -> MetaRecord:
identifiers = result["volumeInfo"].get("industryIdentifiers", [])
for identifier in identifiers:
if identifier.get("type") == Google.ISBN_TYPE:
match["identifiers"]["isbn"] = identifier.get("identifier")
match.identifiers["isbn"] = identifier.get("identifier")
break
return match
@ -100,7 +99,7 @@ class Google(Metadata):
@staticmethod
def _parse_languages(result: Dict, locale: str) -> List[str]:
language_iso2 = result.get("language", "")
language_iso2 = result["volumeInfo"].get("language", "")
languages = (
[get_language_name(locale, get_lang3(language_iso2))]
if language_iso2

View File

@ -27,7 +27,7 @@ from html2text import HTML2Text
from lxml.html import HtmlElement, fromstring, tostring
from markdown2 import Markdown
from cps.services.Metadata import MetaRecord, Metadata
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
SYMBOLS_TO_TRANSLATE = (
"öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
@ -158,61 +158,60 @@ class LubimyCzytacParser:
self.root = root
self.metadata = metadata
def parse_search_results(self) -> List[Dict]:
def parse_search_results(self) -> List[MetaRecord]:
matches = []
results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
for result in results:
title = self._parse_xpath_node(
root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.TITLE_TEXT_PATH}",
f"{LubimyCzytac.TITLE_TEXT_PATH}",
)
book_url = self._parse_xpath_node(
root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.URL_PATH}",
f"{LubimyCzytac.URL_PATH}",
)
authors = self._parse_xpath_node(
root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.AUTHORS_PATH}",
f"{LubimyCzytac.AUTHORS_PATH}",
take_first=False,
)
if not all([title, book_url, authors]):
continue
matches.append(
{
"id": book_url.replace(f"/ksiazka/", "").split("/")[0],
"title": title,
"authors": [strip_accents(author) for author in authors],
"url": LubimyCzytac.BASE_URL + book_url,
}
MetaRecord(
id=book_url.replace(f"/ksiazka/", "").split("/")[0],
title=title,
authors=[strip_accents(author) for author in authors],
url=LubimyCzytac.BASE_URL + book_url,
source=MetaSourceInfo(
id=self.metadata.__id__,
description=self.metadata.__name__,
link=LubimyCzytac.BASE_URL,
)
)
)
return matches
def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord:
response = requests.get(match.get("url"))
def parse_single_book(self, match: MetaRecord, generic_cover: str) -> MetaRecord:
response = requests.get(match.url)
self.root = fromstring(response.text)
match["cover"] = self._parse_cover(generic_cover=generic_cover)
match["description"] = self._parse_description()
match["languages"] = self._parse_languages()
match["publisher"] = self._parse_publisher()
match["publishedDate"] = self._parse_from_summary(
match.cover = self._parse_cover(generic_cover=generic_cover)
match.description = self._parse_description()
match.languages = self._parse_languages()
match.publisher = self._parse_publisher()
match.publishedDate = self._parse_from_summary(
attribute_name="datePublished"
)
match["rating"] = self._parse_rating()
match["series"], match["series_index"] = self._parse_series()
match["tags"] = self._parse_tags()
match["source"] = {
"id": self.metadata.__id__,
"description": self.metadata.__name__,
"link": LubimyCzytac.BASE_URL,
}
match["identifiers"] = {
match.rating = self._parse_rating()
match.series, match.series_index = self._parse_series()
match.tags = self._parse_tags()
match.identifiers = {
"isbn": self._parse_isbn(),
"lubimyczytac": match["id"],
"lubimyczytac": match.id,
}
return match

View File

@ -15,47 +15,53 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import itertools
from typing import Dict, List, Optional
from urllib.parse import quote
from scholarly import scholarly
from cps.services.Metadata import Metadata
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
class scholar(Metadata):
__name__ = "Google Scholar"
__id__ = "googlescholar"
META_URL = "https://scholar.google.com/"
def search(self, query, generic_cover=""):
def search(
self, query: str, generic_cover: str = "", locale: str = "en"
) -> Optional[List[MetaRecord]]:
val = list()
if self.active:
scholar_gen = scholarly.search_pubs(' '.join(query.split('+')))
i = 0
for publication in scholar_gen:
v = dict()
v['id'] = "1234" # publication['bib'].get('title')
v['title'] = publication['bib'].get('title')
v['authors'] = publication['bib'].get('author', [])
v['description'] = publication['bib'].get('abstract', "")
v['publisher'] = publication['bib'].get('venue', "")
if publication['bib'].get('pub_year'):
v['publishedDate'] = publication['bib'].get('pub_year')+"-01-01"
else:
v['publishedDate'] = ""
v['tags'] = ""
v['ratings'] = 0
v['series'] = ""
v['cover'] = generic_cover
v['url'] = publication.get('pub_url') or publication.get('eprint_url') or "",
v['source'] = {
"id": self.__id__,
"description": "Google Scholar",
"link": "https://scholar.google.com/"
}
val.append(v)
i += 1
if (i >= 10):
break
title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
if title_tokens:
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = " ".join(tokens)
scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
for result in scholar_gen:
match = self._parse_search_result(
result=result, generic_cover=generic_cover, locale=locale
)
val.append(match)
return val
def _parse_search_result(
self, result: Dict, generic_cover: str, locale: str
) -> MetaRecord:
match = MetaRecord(
id=result.get("pub_url", result.get("eprint_url", "")),
title=result["bib"].get("title"),
authors=result["bib"].get("author", []),
url=result.get("pub_url", result.get("eprint_url", "")),
source=MetaSourceInfo(
id=self.__id__, description=self.__name__, link=scholar.META_URL
),
)
match.cover = result.get("image", {}).get("original_url", generic_cover)
match.description = result["bib"].get("abstract", "")
match.publisher = result["bib"].get("venue", "")
match.publishedDate = result["bib"].get("pub_year") + "-01-01"
match.identifiers = {"scholar": match.id}
return match

View File

@ -22,6 +22,7 @@ import inspect
import json
import os
import sys
from dataclasses import asdict
from flask import Blueprint, Response, request, url_for
from flask_login import current_user
@ -99,11 +100,13 @@ def metadata_change_active_provider(prov_name):
log.error("Invalid request received: {}".format(request))
return "Invalid request", 400
if "initial" in new_state and prov_name:
for c in cl:
if c.__id__ == prov_name:
data = c.search(new_state.get("query", ""))
break
return Response(json.dumps(data), mimetype="application/json")
data = []
provider = next((c for c in cl if c.__id__ == prov_name), None)
if provider is not None:
data = provider.search(new_state.get("query", ""))
return Response(
json.dumps([asdict(x) for x in data]), mimetype="application/json"
)
return ""
@ -123,5 +126,5 @@ def metadata_search():
if active.get(c.__id__, True)
}
for future in concurrent.futures.as_completed(meta):
data.extend(future.result())
data.extend([asdict(x) for x in future.result()])
return Response(json.dumps(data), mimetype="application/json")

View File

@ -16,32 +16,38 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import abc
import dataclasses
import os
import re
from typing import Dict, Generator, List, Optional, TypedDict, Union
from typing import Dict, Generator, List, Optional, Union
from cps import constants
class MetaSourceInfo(TypedDict):
@dataclasses.dataclass
class MetaSourceInfo:
id: str
description: str
link: str
class MetaRecord(TypedDict):
@dataclasses.dataclass
class MetaRecord:
id: Union[str, int]
title: str
authors: List[str]
url: str
cover: str
series: Optional[str]
series_index: Optional[Union[int, float]]
tags: Optional[List[str]]
publisher: Optional[str]
publishedDate: Optional[str]
rating: Optional[int]
description: Optional[str]
source: MetaSourceInfo
languages: Optional[List[str]]
identifiers: Dict[str, Union[str, int]]
cover: str = os.path.join(constants.STATIC_DIR, 'generic_cover.jpg')
description: Optional[str] = ""
series: Optional[str] = None
series_index: Optional[Union[int, float]] = 0
identifiers: Dict[str, Union[str, int]] = dataclasses.field(default_factory=dict)
publisher: Optional[str] = None
publishedDate: Optional[str] = None
rating: Optional[int] = 0
languages: Optional[List[str]] = dataclasses.field(default_factory=list)
tags: Optional[List[str]] = dataclasses.field(default_factory=list)
class Metadata:

View File

@ -32,6 +32,9 @@ SQLAlchemy-Utils>=0.33.5,<0.38.0
# extracting metadata
rarfile>=2.7
scholarly>=1.2.0, <1.5
markdown2==2.4.2
html2text==2020.1.16
python-dateutil==2.8.2
# other
natsort>=2.2.0,<8.1.0

View File

@ -14,6 +14,3 @@ Wand>=0.4.4,<0.7.0
unidecode>=0.04.19,<1.3.0
lxml>=3.8.0,<4.7.0
flask-wtf>=0.14.2,<1.1.0
markdown2==2.4.2
html2text==2020.1.16
python-dateutil==2.8.2