refactor and cleaning

This commit is contained in:
collerek 2021-12-13 01:23:03 +01:00
parent 920acaca99
commit d55626d445
5 changed files with 278 additions and 240 deletions

View File

@ -26,7 +26,7 @@ class ComicVine(Metadata):
__name__ = "ComicVine"
__id__ = "comicvine"
def search(self, query, __):
def search(self, query, generic_cover=""):
val = list()
apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
if self.active:
@ -52,7 +52,7 @@ class ComicVine(Metadata):
v['tags'] = ["Comics", seriesTitle]
v['rating'] = 0
v['series'] = seriesTitle
v['cover'] = r['image'].get('original_url')
v['cover'] = r['image'].get('original_url', generic_cover)
v['source'] = {
"id": self.__id__,
"description": "ComicVine Books",

View File

@ -17,19 +17,20 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Google Books api document: https://developers.google.com/books/docs/v1/using
import requests
from cps.services.Metadata import Metadata
class Google(Metadata):
__name__ = "Google"
__id__ = "google"
BASE_URL = "https://www.googleapis.com/books/v1/volumes?q="
def search(self, query, __):
def search(self, query, generic_cover=""):
if self.active:
val = list()
result = requests.get("https://www.googleapis.com/books/v1/volumes?q="+query.replace(" ","+"))
result = requests.get(Google.BASE_URL + query.replace(" ","+"))
for r in result.json()['items']:
v = dict()
v['id'] = r['id']
@ -43,7 +44,8 @@ class Google(Metadata):
if r['volumeInfo'].get('imageLinks'):
v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://")
else:
v['cover'] = "/../../../static/generic_cover.jpg"
# v['cover'] = "/../../../static/generic_cover.jpg"
v['cover'] = generic_cover
v['source'] = {
"id": self.__id__,
"description": "Google Books",

View File

@ -15,47 +15,47 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
import json
import re
from typing import Dict, List
from typing import Dict, Generator, List, Optional, Tuple, Union
from urllib.parse import quote
import requests
from cps.services.Metadata import Metadata
from lxml.html import fromstring, tostring
from dateutil import parser
from html2text import HTML2Text
from lxml.html import HtmlElement, fromstring, tostring
from markdown2 import Markdown
from cps.services.Metadata import MetaRecord, Metadata
# Paired strings mapping accented characters (mostly Polish diacritics plus
# a few German/Hungarian vowels) to their plain-ASCII equivalents; the two
# strings are position-aligned.
SYMBOLS_TO_TRANSLATE = (
    "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
    "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
)
# Codepoint -> codepoint table consumed by str.translate() in strip_accents().
SYMBOL_TRANSLATION_MAP = dict(
    [(ord(a), ord(b)) for (a, b) in zip(*SYMBOLS_TO_TRANSLATE)]
)
def get_int_or_float(value: str) -> Union[int, float]:
    """Parse *value* as a number, returning an int when it is integral.

    "3" and "3.0" both yield the int 3; "2.5" yields the float 2.5.
    Raises ValueError when *value* is not numeric.
    """
    number_as_float = float(value)
    number_as_int = int(number_as_float)
    # Prefer the int representation when no information is lost.
    return number_as_int if number_as_float == number_as_int else number_as_float
def strip_accents(s: Optional[str]) -> Optional[str]:
    """Replace accented characters in *s* with ASCII equivalents.

    Uses the module-level SYMBOL_TRANSLATION_MAP; None passes through
    unchanged so callers need not pre-check.
    """
    return s.translate(SYMBOL_TRANSLATION_MAP) if s is not None else s
def sanitize_comments_html(html: str) -> str:
    """Sanitize *html* by round-tripping it through Markdown.

    html2text() strips unsupported/unsafe markup while converting to text;
    Markdown() then re-renders the text as clean HTML.
    """
    text = html2text(html)
    md = Markdown()
    html = md.convert(text)
    return html
def html2text(html):
from html2text import HTML2Text
import re
def html2text(html: str) -> str:
# replace <u> tags with <span> as <u> becomes emphasis in html2text
if isinstance(html, bytes):
html = html.decode("utf-8")
@ -92,26 +92,36 @@ class LubimyCzytac(Metadata):
PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]"
SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]/text()"
DETAILS = "//div[@id='book-details']"
PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"
RATING = "//meta[@property='books:rating:value']/@content"
COVER = "//meta[@property='og:image']/@content"
ISBN = "//meta[@property='books:isbn']/@content"
META_TITLE = "//meta[@property='og:description']/@content"
SUMMARY = "//script[@type='application/ld+json']//text()"
def search(self, query: str, generic_cover: str = "") -> Optional[List]:
    """Search lubimyczytac.pl for *query* and return enriched match dicts.

    Returns None when the provider is inactive. *generic_cover* is the
    fallback cover URL passed through to the per-book parser.
    """
    if self.active:
        result = requests.get(self._prepare_query(title=query))
        root = fromstring(result.text)
        lc_parser = LubimyCzytacParser(root=root, metadata=self)
        matches = lc_parser.parse_search_results()
        if matches:
            # Fetch every matched book page and fill in the full record.
            final_matches = []
            for match in matches:
                response = requests.get(match.get("url"))
                match = lc_parser.parse_single_book(
                    match=match, response=response, generic_cover=generic_cover
                )
                final_matches.append(match)
            return final_matches
        return matches
def _prepare_query(self, title: str) -> str:
@ -128,9 +138,7 @@ class LubimyCzytac(Metadata):
token for token in title.lower().split(" ") if len(token) > 1
]
else:
title_tokens = list(
self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)
)
title_tokens = list(self.get_title_tokens(title, strip_joiners=False))
if title_tokens:
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = query + "%20".join(tokens)
@ -138,215 +146,21 @@ class LubimyCzytac(Metadata):
return ""
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
def _parse_search_results(self, root) -> List[Dict]:
    """Collect {id, title, authors, url} dicts from a search-results page.

    Results missing any of title, url, or authors are skipped.
    """
    matches = []
    results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
    for result in results:
        title = result.xpath(
            f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
            f"{LubimyCzytac.TITLE_TEXT_PATH}"
        )
        book_url = result.xpath(
            f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}"
        )
        authors = result.xpath(
            f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
            f"{LubimyCzytac.AUTHORS_PATH}"
        )
        # All three fields are mandatory for a usable match.
        if not title or not book_url or not authors:
            continue
        title = title[0].strip()
        book_url = LubimyCzytac.BASE_URL + book_url[0]
        # The book id is the first path segment after /ksiazka/.
        book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split(
            "/"
        )[0]
        matches.append(
            {"id": book_id, "title": title, "authors": authors, "url": book_url}
        )
    return matches
def _parse_single_book(self, match: Dict) -> Dict:
    """Fetch the book page at match['url'] and enrich *match* in place.

    Fills series, tags, publisher, dates, rating, description, cover,
    source info, languages, and identifiers; returns the same dict.
    """
    url = match.get("url")
    result = requests.get(url)
    root = fromstring(result.text)
    match["series"], match["series_index"] = self._parse_series(root=root)
    match["tags"] = self._parse_tags(root=root)
    match["publisher"] = self._parse_publisher(root=root)
    match["publishedDate"] = self._parse_from_summary(
        root=root, attribute_name="datePublished"
    )
    match["rating"] = self._parse_rating(root=root)
    match["description"] = self._parse_description(root=root)
    match["cover"] = self._parse_cover(root=root)
    match["source"] = {
        "id": self.__id__,
        "description": self.__name__,
        "link": LubimyCzytac.BASE_URL,
    }
    match['languages'] = self._parse_languages(root=root)
    match["identifiers"] = {
        "isbn": self._parse_isbn(root=root),
        "lubimyczytac": match["id"],
    }
    return match
def _parse_cover(self, root):
imgcol_node = root.xpath('//meta[@property="og:image"]/@content')
if imgcol_node:
img_url = imgcol_node[0]
return img_url
def _parse_publisher(self, root):
    """Return the first publisher name found on the page, else None."""
    nodes = root.xpath(LubimyCzytac.PUBLISHER)
    return nodes[0] if nodes else None
def _parse_languages(self, root):
    """Map the page's Polish language label to English language names."""
    lang = root.xpath(LubimyCzytac.LANGUAGES)
    languages = list()
    if lang:
        lang = lang[0].strip()
        if "polski" in lang:
            languages.append("Polish")
        if "angielski" in lang:
            languages.append("English")
    # NOTE(review): indentation reconstructed from a flattened dump; the
    # Polish default is assumed to apply whenever nothing matched — confirm.
    if not languages:
        return ['Polish']
    return languages
def _parse_series(self, root):
    """Return (series_name, series_index) parsed from the '/cykl/' link.

    Returns (None, None) when no series is present or parsing fails;
    any exception is swallowed by the bare except at the bottom.
    """
    try:
        series_node = root.xpath(LubimyCzytac.SERIES)
        if series_node:
            series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()")
            if series_lst:
                series_txt = series_lst
            else:
                series_txt = None
        else:
            return (None, None)
        if series_txt:
            ser_string = [series_txt[0].replace("\n", "").strip()]
            ser_nazwa = ser_string
            for ser in ser_string:
                if "tom " in ser:
                    # "Name (tom N)" -> split off the volume part.
                    ser_info = ser.split(" (tom ", 1)
                    ser_nazwa = ser.split(" (tom ")[0]
                    break
            # NOTE(review): ser_info is unbound when no entry contains
            # "tom "; the resulting NameError falls into the except below.
            if ser_info:
                series_index_unicode = ser_info[1]
                series_index_string = str(
                    series_index_unicode.replace(" ", "").replace(")", "")
                )
                # Check whether the series is a bundle of volumes, e.g. "1-3";
                # keep only the first volume number.
                if "-" in series_index_string:
                    series_index_string_temp = series_index_string.split("-", 1)
                    series_index_string = series_index_string_temp[0]
                if series_index_string.replace(".", "").isdigit() is True:
                    series_index = get_int_or_float(series_index_string)
                else:
                    series_index = 0
            else:
                series_index = 0
        else:
            series_index = 0
        series = ser_nazwa
        return (series, series_index)
    except:
        return (None, None)
def _parse_tags(self, root):
    """Return breadcrumb genre tags, lightly normalized, or None."""
    tags = None
    try:
        tags_from_genre = root.xpath(LubimyCzytac.TAGS)
        if tags_from_genre:
            tags = tags_from_genre
            # Normalize the trailing ", itd." (Polish "etc.") suffix.
            tags = [w.replace(", itd.", " itd.") for w in tags]
            return tags
        else:
            return None
    except:
        return tags
def _parse_from_summary(self, root, attribute_name: str) -> str:
    """Read *attribute_name* from the page's ld+json summary script.

    Raises IndexError when the page has no ld+json block; returns None
    when the attribute is missing from the parsed JSON.
    """
    data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0])
    value = data.get(attribute_name)
    return value.strip() if value is not None else value
def _parse_rating(self, root):
    """Convert the site's comma-decimal 0-10 rating to a 0-5 integer."""
    rating_node = root.xpath(LubimyCzytac.RATING)
    if rating_node:
        rating_value = round(float((rating_node[0]).replace(",", ".")) / 2)
        return rating_value
    return None
def _parse_date(self, root, xpath="first_publish"):
    """Parse a first-publication date from the page.

    *xpath* selects between the original-edition date ("first_publish")
    and the Polish-edition date ("first_publish_pl"). Returns a datetime
    or None when the field is absent.
    """
    options = {
        "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
        "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
    }
    path = options.get(xpath)
    from dateutil import parser
    data = root.xpath(path)
    if data:
        first_pub_date = data[0].strip()
        return parser.parse(first_pub_date)
    return None
def _parse_isbn(self, root):
isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0]
return isbn_node
def _parse_description(self, root):
    """Build the HTML description for a book.

    Prefers the collapsible description block, falling back to the
    og:description meta tag; then appends page count and first-publish
    dates as extra paragraphs.
    """
    comments = ""
    description_node = root.xpath(LubimyCzytac.DESCRIPTION)
    if description_node:
        # Drop the "source" attribution paragraphs before serializing.
        for zrodla in root.xpath('//p[@class="source"]'):
            zrodla.getparent().remove(zrodla)
        comments = tostring(description_node[0], method="html")
        comments = sanitize_comments_html(comments)
    else:
        # try <meta>
        description_node = root.xpath('//meta[@property="og:description"]/@content')
        if description_node:
            comments = description_node[0]
            comments = sanitize_comments_html(comments)
    pages = self._parse_from_summary(root=root, attribute_name="numberOfPages")
    if pages:
        comments += f'<p id="strony">Książka ma {pages} stron(y).</p>'
    first_publish_date = self._parse_date(root=root)
    if first_publish_date:
        comments += f'<p id="pierwsze_wydanie">Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}</p>'
    first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl")
    if first_publish_date_pl:
        comments += f'<p id="pierwsze_wydanie_pl">Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}</p>'
    return comments
def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
@staticmethod
def get_title_tokens(
title: str, strip_joiners: bool = True
) -> Generator[str, None, None]:
"""
Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py.
Taken from calibre source code
"""
# strip sub-titles
if strip_subtitle:
subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)")
if len(subtitle.sub("", title)) > 1:
title = subtitle.sub("", title)
title_patterns = [
(re.compile(pat, re.IGNORECASE), repl)
for pat, repl in [
# Remove things like: (2010) (Omnibus) etc.
(
r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]",
r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|"
r"audiobook|audio\scd|paperback|turtleback|"
r"mass\s*market|edition|ed\.)[\])}]",
"",
),
# Remove any strings that contain the substring edition inside
@ -371,3 +185,193 @@ class LubimyCzytac(Metadata):
not strip_joiners or token.lower() not in ("a", "and", "the", "&")
):
yield token
class LubimyCzytacParser:
    """Parses lubimyczytac.pl HTML (search results and single-book pages)
    into MetaRecord-shaped dicts for the LubimyCzytac provider."""

    PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
    PUBLISH_DATE_TEMPLATE = "<p id='pierwsze_wydanie'>Data pierwszego wydania: {0}</p>"
    # Fixed: previously reused the element id 'pierwsze_wydanie', producing
    # duplicate ids when both first-publish paragraphs were appended.
    PUBLISH_DATE_PL_TEMPLATE = (
        "<p id='pierwsze_wydanie_pl'>Data pierwszego wydania w Polsce: {0}</p>"
    )

    def __init__(self, root: HtmlElement, metadata: Metadata) -> None:
        # *root* is the parsed search-results page; parse_single_book()
        # later replaces it with each book page's tree.
        self.root = root
        self.metadata = metadata

    def parse_search_results(self) -> List[Dict]:
        """Return {id, title, authors, url} dicts for every usable hit."""
        matches = []
        results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
        for result in results:
            title = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.TITLE_TEXT_PATH}",
            )
            book_url = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.URL_PATH}",
            )
            authors = self._parse_xpath_node(
                root=result,
                xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.AUTHORS_PATH}",
                take_first=False,
            )
            # Title, url, and authors are all mandatory for a match.
            if not all([title, book_url, authors]):
                continue
            matches.append(
                {
                    # Book id is the first path segment after /ksiazka/.
                    "id": book_url.replace("/ksiazka/", "").split("/")[0],
                    "title": title,
                    "authors": [strip_accents(author) for author in authors],
                    "url": LubimyCzytac.BASE_URL + book_url,
                }
            )
        return matches

    def parse_single_book(
        self, match: Dict, response, generic_cover: str
    ) -> MetaRecord:
        """Enrich *match* with full metadata parsed from *response*."""
        # Re-point the parser at the fetched book page.
        self.root = fromstring(response.text)
        match["series"], match["series_index"] = self._parse_series()
        match["tags"] = self._parse_tags()
        match["publisher"] = self._parse_publisher()
        match["publishedDate"] = self._parse_from_summary(
            attribute_name="datePublished"
        )
        match["rating"] = self._parse_rating()
        match["description"] = self._parse_description()
        match["cover"] = self._parse_cover(generic_cover=generic_cover)
        match["source"] = {
            "id": self.metadata.__id__,
            "description": self.metadata.__name__,
            "link": LubimyCzytac.BASE_URL,
        }
        match["languages"] = self._parse_languages()
        match["identifiers"] = {
            "isbn": self._parse_isbn(),
            "lubimyczytac": match["id"],
        }
        return match

    def _parse_xpath_node(
        self,
        xpath: str,
        root: HtmlElement = None,
        take_first: bool = True,
        strip_element: bool = True,
    ) -> Optional[Union[str, List[str]]]:
        """Evaluate *xpath* and return the first (stripped) hit, the full
        stripped list, or None when nothing matched."""
        root = root if root is not None else self.root
        node = root.xpath(xpath)
        if not node:
            return None
        return (
            (node[0].strip() if strip_element else node[0])
            if take_first
            else [x.strip() for x in node]
        )

    def _parse_cover(self, generic_cover) -> Optional[str]:
        """Return the og:image cover URL, falling back to *generic_cover*."""
        return (
            self._parse_xpath_node(xpath=LubimyCzytac.COVER, take_first=True)
            or generic_cover
        )

    def _parse_publisher(self) -> Optional[str]:
        """Return the publisher name, or None when the page lacks one."""
        return self._parse_xpath_node(xpath=LubimyCzytac.PUBLISHER, take_first=True)

    def _parse_languages(self) -> List[str]:
        """Map the page's Polish language label to English language names."""
        languages = list()
        lang = self._parse_xpath_node(xpath=LubimyCzytac.LANGUAGES, take_first=True)
        if lang:
            if "polski" in lang:
                languages.append("Polish")
            if "angielski" in lang:
                languages.append("English")
        return languages

    def _parse_series(self) -> Tuple[Optional[str], Optional[Union[float, int]]]:
        """Return (series_name, series_index) from the '/cykl/' link text,
        or (None, None) when the book is not part of a series."""
        series_index = 0
        series = self._parse_xpath_node(xpath=LubimyCzytac.SERIES, take_first=True)
        if series:
            if "tom " in series:
                # "Name (tom N)" -> name plus volume info.
                series_name, series_info = series.split(" (tom ", 1)
                series_info = series_info.replace(" ", "").replace(")", "")
                # Check if book is not a bundle, i.e. chapter 1-3.
                if "-" in series_info:
                    series_info = series_info.split("-", 1)[0]
                if series_info.replace(".", "").isdigit() is True:
                    series_index = get_int_or_float(series_info)
                return series_name, series_index
        return None, None

    def _parse_tags(self) -> Optional[List[str]]:
        """Return normalized, accent-stripped genre tags, or None."""
        tags = self._parse_xpath_node(xpath=LubimyCzytac.TAGS, take_first=False)
        if not tags:
            # Fixed: previously iterated over None when no tags were found,
            # raising TypeError; the pre-refactor code returned None here.
            return None
        return [
            strip_accents(w.replace(", itd.", " itd."))
            for w in tags
            if isinstance(w, str)
        ]

    def _parse_from_summary(self, attribute_name: str) -> Optional[str]:
        """Read *attribute_name* from the page's ld+json summary block."""
        value = None
        summary_text = self._parse_xpath_node(xpath=LubimyCzytac.SUMMARY)
        if summary_text:
            data = json.loads(summary_text)
            value = data.get(attribute_name)
        return value.strip() if value is not None else value

    def _parse_rating(self) -> Optional[int]:
        """Convert the site's comma-decimal 0-10 rating to a 0-5 integer."""
        rating = self._parse_xpath_node(xpath=LubimyCzytac.RATING)
        return round(float(rating.replace(",", ".")) / 2) if rating else rating

    def _parse_date(self, xpath="first_publish") -> Optional[datetime.datetime]:
        """Parse the original ("first_publish") or Polish
        ("first_publish_pl") first-publication date, or None."""
        options = {
            "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
            "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
        }
        date = self._parse_xpath_node(xpath=options.get(xpath))
        return parser.parse(date) if date else None

    def _parse_isbn(self) -> Optional[str]:
        """Return the ISBN from the books:isbn meta tag, or None."""
        return self._parse_xpath_node(xpath=LubimyCzytac.ISBN)

    def _parse_description(self) -> str:
        """Build the sanitized HTML description, preferring the collapsible
        description block over the og:description meta tag, then append
        page count and first-publication dates."""
        description = ""
        description_node = self._parse_xpath_node(
            xpath=LubimyCzytac.DESCRIPTION, strip_element=False
        )
        if description_node is not None:
            # Drop the "source" attribution paragraphs before serializing.
            for source in self.root.xpath('//p[@class="source"]'):
                source.getparent().remove(source)
            description = tostring(description_node, method="html")
            description = sanitize_comments_html(description)
        else:
            description_node = self._parse_xpath_node(xpath=LubimyCzytac.META_TITLE)
            if description_node is not None:
                description = description_node
                description = sanitize_comments_html(description)
        description = self._add_extra_info_to_description(description=description)
        return description

    def _add_extra_info_to_description(self, description: str) -> str:
        """Append page count and first-publish-date paragraphs."""
        pages = self._parse_from_summary(attribute_name="numberOfPages")
        if pages:
            description += LubimyCzytacParser.PAGES_TEMPLATE.format(pages)
        first_publish_date = self._parse_date()
        if first_publish_date:
            description += LubimyCzytacParser.PUBLISH_DATE_TEMPLATE.format(
                first_publish_date.strftime("%d.%m.%Y")
            )
        first_publish_date_pl = self._parse_date(xpath="first_publish_pl")
        if first_publish_date_pl:
            description += LubimyCzytacParser.PUBLISH_DATE_PL_TEMPLATE.format(
                first_publish_date_pl.strftime("%d.%m.%Y")
            )
        return description

View File

@ -15,13 +15,44 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import abc
from typing import Dict, List, Optional, TypedDict, Union
class Metadata:
    """Base class for metadata providers.

    Subclasses set __name__/__id__ and implement search(). Note that
    without an ABCMeta metaclass the @abc.abstractmethod marker is not
    enforced at instantiation time.
    """

    __name__ = "Generic"
    __id__ = "generic"

    def __init__(self):
        # Providers start enabled; set_status() toggles availability.
        self.active = True

    def set_status(self, state):
        """Enable (True) or disable (False) this provider."""
        self.active = state

    @abc.abstractmethod
    def search(self, query: str, generic_cover: str = ""):
        """Return a list of match records for *query*.

        *generic_cover* defaults to "" to match every concrete
        implementation's signature.
        """
        pass
class MetaSourceInfo(TypedDict):
    """Identifying info for the provider a metadata record came from."""
    # Provider id, e.g. "google" or "lubimyczytac".
    id: str
    # Human-readable provider name shown in the UI.
    description: str
    # Base URL of the provider's site.
    link: str
class MetaRecord(TypedDict):
    """Normalized search-result record returned by Metadata.search()."""
    # Provider-specific book identifier.
    id: Union[str, int]
    title: str
    authors: List[str]
    # Link to the book's page on the provider's site.
    url: str
    # Cover image URL (may be the caller-supplied generic cover).
    cover: str
    series: Optional[str]
    series_index: Optional[Union[int, float]]
    tags: Optional[List[str]]
    publisher: Optional[str]
    publishedDate: Optional[str]
    # Rating normalized to a 0-5 scale.
    rating: Optional[int]
    # Sanitized HTML description.
    description: Optional[str]
    source: MetaSourceInfo
    languages: Optional[List[str]]
    # e.g. {"isbn": ..., "lubimyczytac": ...}.
    identifiers: Dict[str, Union[str, int]]

View File

@ -16,3 +16,4 @@ lxml>=3.8.0,<4.7.0
flask-wtf>=0.14.2,<1.1.0
markdown2==2.4.2
html2text==2020.1.16
python-dateutil==2.8.2