From 86b779f39b3725ecfdb9b8b6e837a219baab26e9 Mon Sep 17 00:00:00 2001 From: xlivevil Date: Fri, 25 Feb 2022 01:01:12 +0800 Subject: [PATCH 1/4] Add douban metadata provider --- cps/metadata_provider/douban.py | 175 ++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 cps/metadata_provider/douban.py diff --git a/cps/metadata_provider/douban.py b/cps/metadata_provider/douban.py new file mode 100644 index 00000000..5eda21ec --- /dev/null +++ b/cps/metadata_provider/douban.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- + +# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web) +# Copyright (C) 2022 xlivevil +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+import re +from concurrent import futures +from typing import List, Optional + +import requests +from html2text import HTML2Text +from lxml import etree + +from cps import logger +from cps.services.Metadata import Metadata, MetaRecord, MetaSourceInfo + +log = logger.create() + + +def html2text(html: str) -> str: + + h2t = HTML2Text() + h2t.body_width = 0 + h2t.single_line_break = True + h2t.emphasis_mark = "*" + return h2t.handle(html) + + +class Douban(Metadata): + __name__ = "豆瓣" + __id__ = "douban" + DESCRIPTION = "豆瓣" + META_URL = "https://book.douban.com/" + SEARCH_URL = "https://www.douban.com/j/search" + + ID_PATTERN = re.compile(r"sid: (?P\d+),") + AUTHORS_PATTERN = re.compile(r"作者|译者") + PUBLISHER_PATTERN = re.compile(r"出版社") + SUBTITLE_PATTERN = re.compile(r"副标题") + PUBLISHED_DATE_PATTERN = re.compile(r"出版年") + SERIES_PATTERN = re.compile(r"丛书") + IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号") + + TITTLE_XPATH = "//span[@property='v:itemreviewed']" + COVER_XPATH = "//a[@class='nbg']" + INFO_XPATH = "//*[@id='info']//span[@class='pl']" + TAGS_XPATH = "//a[contains(@class, 'tag')]" + DESCRIPTION_XPATH = "//div[@id='link-report']//div[@class='intro']" + RATING_XPATH = "//div[@class='rating_self clearfix']/strong" + + session = requests.Session() + session.headers = { + 'user-agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56', + } + + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: + if self.active: + log.debug(f"starting search {query} on douban") + if title_tokens := list( + self.get_title_tokens(query, strip_joiners=False) + ): + query = "+".join(title_tokens) + + try: + r = self.session.get( + self.SEARCH_URL, params={"cat": 1001, "q": query} + ) + r.raise_for_status() + + except Exception as e: + log.warning(e) + return None + + results = r.json() + if results["total"] == 0: + return val + + 
book_id_list = [ + self.ID_PATTERN.search(item).group("id") + for item in results["items"][:10] if self.ID_PATTERN.search(item) + ] + + with futures.ThreadPoolExecutor(max_workers=5) as executor: + + fut = [ + executor.submit(self._parse_single_book, book_id, generic_cover) + for book_id in book_id_list + ] + + val = [ + future.result() + for future in futures.as_completed(fut) if future.result() + ] + + return val + + def _parse_single_book( + self, id: str, generic_cover: str = "" + ) -> Optional[MetaRecord]: + url = f"https://book.douban.com/subject/{id}/" + + try: + r = self.session.get(url) + r.raise_for_status() + except Exception as e: + log.warning(e) + return None + + match = MetaRecord( + id=id, + title="", + authors=[], + url=url, + source=MetaSourceInfo( + id=self.__id__, + description=self.DESCRIPTION, + link=self.META_URL, + ), + ) + + html = etree.HTML(r.content.decode("utf8")) + + match.title = html.xpath(self.TITTLE_XPATH)[0].text + match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover + try: + rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip()) + except ValueError: + rating_num = 0 + match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0 + + tag_elements = html.xpath(self.TAGS_XPATH) + if len(tag_elements): + match.tags = [tag_element.text for tag_element in tag_elements] + + description_element = html.xpath(self.DESCRIPTION_XPATH) + if len(description_element): + match.description = html2text(etree.tostring( + description_element[-1], encoding="utf8").decode("utf8")) + + info = html.xpath(self.INFO_XPATH) + + for element in info: + text = element.text + if self.AUTHORS_PATTERN.search(text): + next = element.getnext() + while next is not None and next.tag != "br": + match.authors.append(next.text) + next = next.getnext() + elif self.PUBLISHER_PATTERN.search(text): + match.publisher = element.tail.strip() + elif self.SUBTITLE_PATTERN.search(text): + match.title = f'{match.title}:' + 
element.tail.strip() + elif self.PUBLISHED_DATE_PATTERN.search(text): + match.publishedDate = element.tail.strip() + elif self.SUBTITLE_PATTERN.search(text): + match.series = element.getnext().text + elif i_type := self.IDENTIFIERS_PATTERN.search(text): + match.identifiers[i_type.group()] = element.tail.strip() + + return match From 695ce836813969aaffb3dac288536cdf7ff3edc9 Mon Sep 17 00:00:00 2001 From: xlivevil Date: Fri, 25 Feb 2022 01:12:22 +0800 Subject: [PATCH 2/4] Fix Uncaught RangeError --- cps/static/js/edit_books.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cps/static/js/edit_books.js b/cps/static/js/edit_books.js index 0bfe078c..c1eb319d 100644 --- a/cps/static/js/edit_books.js +++ b/cps/static/js/edit_books.js @@ -33,7 +33,7 @@ $(".datepicker").datepicker({ if (results) { pubDate = new Date(results[1], parseInt(results[2], 10) - 1, results[3]) || new Date(this.value); $(this).next('input') - .val(pubDate.toLocaleDateString(language)) + .val(pubDate.toLocaleDateString(language.replaceAll("_","-"))) .removeClass("hidden"); } }).trigger("change"); From 97cf20764bcc5ca37a7d58791d350e962c4edbc8 Mon Sep 17 00:00:00 2001 From: xlivevil Date: Fri, 25 Feb 2022 12:18:07 +0800 Subject: [PATCH 3/4] Add exception handling and logger in metadata provider --- cps/metadata_provider/amazon.py | 33 +++++++++++++++++++-------- cps/metadata_provider/comicvine.py | 16 +++++++++---- cps/metadata_provider/google.py | 10 +++++++- cps/metadata_provider/lubimyczytac.py | 17 ++++++++++++-- cps/metadata_provider/scholar.py | 10 +++++++- cps/search_metadata.py | 2 +- cps/static/js/get_meta.js | 19 +++++++++------ 7 files changed, 81 insertions(+), 26 deletions(-) diff --git a/cps/metadata_provider/amazon.py b/cps/metadata_provider/amazon.py index 558edebc..5c74cf71 100644 --- a/cps/metadata_provider/amazon.py +++ b/cps/metadata_provider/amazon.py @@ -19,15 +19,20 @@ import concurrent.futures import requests from bs4 import BeautifulSoup as BS # requirement 
+from typing import List, Optional try: import cchardet #optional for better speed except ImportError: pass +from cps import logger from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata #from time import time from operator import itemgetter +log = logger.create() + + class Amazon(Metadata): __name__ = "Amazon" __id__ = "amazon" @@ -46,12 +51,16 @@ class Amazon(Metadata): def search( self, query: str, generic_cover: str = "", locale: str = "en" - ): + ) -> Optional[List[MetaRecord]]: #timer=time() - def inner(link,index)->[dict,int]: - with self.session as session: - r = session.get(f"https://www.amazon.com/{link}") - r.raise_for_status() + def inner(link,index) -> tuple[dict,int]: + with self.session as session: + try: + r = session.get(f"https://www.amazon.com/{link}") + r.raise_for_status() + except Exception as e: + log.warning(e) + return long_soup = BS(r.text, "lxml") #~4sec :/ soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"}) if soup2 is None: @@ -107,11 +116,15 @@ class Amazon(Metadata): val = list() if self.active: - results = self.session.get( - f"https://www.amazon.com/s?k={query.replace(' ', '+')}&i=digital-text&sprefix={query.replace(' ', '+')}" - f"%2Cdigital-text&ref=nb_sb_noss", - headers=self.headers) - results.raise_for_status() + try: + results = self.session.get( + f"https://www.amazon.com/s?k={query.replace(' ', '+')}&i=digital-text&sprefix={query.replace(' ', '+')}" + f"%2Cdigital-text&ref=nb_sb_noss", + headers=self.headers) + results.raise_for_status() + except Exception as e: + log.warning(e) + return None soup = BS(results.text, 'html.parser') links_list = [next(filter(lambda i: "digital-text" in i["href"], x.findAll("a")))["href"] for x in soup.findAll("div", attrs={"data-component-type": "s-search-result"})] diff --git a/cps/metadata_provider/comicvine.py b/cps/metadata_provider/comicvine.py index 56618d4b..b4d8d34c 100644 --- a/cps/metadata_provider/comicvine.py +++ 
b/cps/metadata_provider/comicvine.py @@ -21,8 +21,11 @@ from typing import Dict, List, Optional from urllib.parse import quote import requests +from cps import logger from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata +log = logger.create() + class ComicVine(Metadata): __name__ = "ComicVine" @@ -46,10 +49,15 @@ class ComicVine(Metadata): if title_tokens: tokens = [quote(t.encode("utf-8")) for t in title_tokens] query = "%20".join(tokens) - result = requests.get( - f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}", - headers=ComicVine.HEADERS, - ) + try: + result = requests.get( + f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}", + headers=ComicVine.HEADERS, + ) + result.raise_for_status() + except Exception as e: + log.warning(e) + return None for result in result.json()["results"]: match = self._parse_search_result( result=result, generic_cover=generic_cover, locale=locale diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py index fbb68965..98fadd37 100644 --- a/cps/metadata_provider/google.py +++ b/cps/metadata_provider/google.py @@ -22,9 +22,12 @@ from urllib.parse import quote import requests +from cps import logger from cps.isoLanguages import get_lang3, get_language_name from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata +log = logger.create() + class Google(Metadata): __name__ = "Google" @@ -45,7 +48,12 @@ class Google(Metadata): if title_tokens: tokens = [quote(t.encode("utf-8")) for t in title_tokens] query = "+".join(tokens) - results = requests.get(Google.SEARCH_URL + query) + try: + results = requests.get(Google.SEARCH_URL + query) + results.raise_for_status() + except Exception as e: + log.warning(e) + return None for result in results.json().get("items", []): val.append( self._parse_search_result( diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py index 814a785e..e4abe9db 100644 --- a/cps/metadata_provider/lubimyczytac.py +++ 
b/cps/metadata_provider/lubimyczytac.py @@ -27,9 +27,12 @@ from html2text import HTML2Text from lxml.html import HtmlElement, fromstring, tostring from markdown2 import Markdown +from cps import logger from cps.isoLanguages import get_language_name from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata +log = logger.create() + SYMBOLS_TO_TRANSLATE = ( "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ", "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ", @@ -112,7 +115,12 @@ class LubimyCzytac(Metadata): self, query: str, generic_cover: str = "", locale: str = "en" ) -> Optional[List[MetaRecord]]: if self.active: - result = requests.get(self._prepare_query(title=query)) + try: + result = requests.get(self._prepare_query(title=query)) + result.raise_for_status() + except Exception as e: + log.warning(e) + return None root = fromstring(result.text) lc_parser = LubimyCzytacParser(root=root, metadata=self) matches = lc_parser.parse_search_results() @@ -200,7 +208,12 @@ class LubimyCzytacParser: def parse_single_book( self, match: MetaRecord, generic_cover: str, locale: str ) -> MetaRecord: - response = requests.get(match.url) + try: + response = requests.get(match.url) + response.raise_for_status() + except Exception as e: + log.warning(e) + return None self.root = fromstring(response.text) match.cover = self._parse_cover(generic_cover=generic_cover) match.description = self._parse_description() diff --git a/cps/metadata_provider/scholar.py b/cps/metadata_provider/scholar.py index b0c10b66..7feb0ee9 100644 --- a/cps/metadata_provider/scholar.py +++ b/cps/metadata_provider/scholar.py @@ -28,8 +28,12 @@ try: except FakeUserAgentError: raise ImportError("No module named 'scholarly'") +from cps import logger from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata +log = logger.create() + + class scholar(Metadata): __name__ = "Google Scholar" __id__ = "googlescholar" @@ -44,7 +48,11 @@ class scholar(Metadata): if title_tokens: tokens = [quote(t.encode("utf-8")) for t 
in title_tokens] query = " ".join(tokens) - scholar_gen = itertools.islice(scholarly.search_pubs(query), 10) + try: + scholar_gen = itertools.islice(scholarly.search_pubs(query), 10) + except Exception as e: + log.warning(e) + return None for result in scholar_gen: match = self._parse_search_result( result=result, generic_cover="", locale=locale diff --git a/cps/search_metadata.py b/cps/search_metadata.py index d72273f6..d02667d5 100644 --- a/cps/search_metadata.py +++ b/cps/search_metadata.py @@ -130,6 +130,6 @@ def metadata_search(): if active.get(c.__id__, True) } for future in concurrent.futures.as_completed(meta): - data.extend([asdict(x) for x in future.result()]) + data.extend([asdict(x) for x in future.result() if x]) # log.info({'Time elapsed {}'.format(current_milli_time()-start)}) return Response(json.dumps(data), mimetype="application/json") diff --git a/cps/static/js/get_meta.js b/cps/static/js/get_meta.js index 6db1a261..43a40fa6 100644 --- a/cps/static/js/get_meta.js +++ b/cps/static/js/get_meta.js @@ -92,14 +92,19 @@ $(function () { data: {"query": keyword}, dataType: "json", success: function success(data) { - $("#meta-info").html("
    "); - data.forEach(function(book) { - var $book = $(templates.bookResult(book)); - $book.find("img").on("click", function () { - populateForm(book); + if (data.length) { + $("#meta-info").html("
      "); + data.forEach(function(book) { + var $book = $(templates.bookResult(book)); + $book.find("img").on("click", function () { + populateForm(book); + }); + $("#book-list").append($book); }); - $("#book-list").append($book); - }); + } + else { + $("#meta-info").html("

      " + msg.no_result + "!

      " + $("#meta-info")[0].innerHTML) + } }, error: function error() { $("#meta-info").html("

      " + msg.search_error + "!

      " + $("#meta-info")[0].innerHTML); From b54a170a00f3b5ecf4d4e2888db7f092522ba552 Mon Sep 17 00:00:00 2001 From: xlivevil Date: Sat, 12 Mar 2022 13:54:37 +0800 Subject: [PATCH 4/4] Add clean_date method in douban metadata_provider --- cps/metadata_provider/douban.py | 37 ++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/cps/metadata_provider/douban.py b/cps/metadata_provider/douban.py index 5eda21ec..ee21f587 100644 --- a/cps/metadata_provider/douban.py +++ b/cps/metadata_provider/douban.py @@ -88,7 +88,7 @@ class Douban(Metadata): results = r.json() if results["total"] == 0: - return val + return [] book_id_list = [ self.ID_PATTERN.search(item).group("id") @@ -139,7 +139,7 @@ class Douban(Metadata): match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover try: rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip()) - except ValueError: + except Exception: rating_num = 0 match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0 @@ -166,10 +166,41 @@ class Douban(Metadata): elif self.SUBTITLE_PATTERN.search(text): match.title = f'{match.title}:' + element.tail.strip() elif self.PUBLISHED_DATE_PATTERN.search(text): - match.publishedDate = element.tail.strip() + match.publishedDate = self._clean_date(element.tail.strip()) elif self.SUBTITLE_PATTERN.search(text): match.series = element.getnext().text elif i_type := self.IDENTIFIERS_PATTERN.search(text): match.identifiers[i_type.group()] = element.tail.strip() return match + + + def _clean_date(self, date: str) -> str: + """ + Clean up the date string to be in the format YYYY-MM-DD + + Examples of possible patterns: + '2014-7-16', '1988年4月', '1995-04', '2021-8', '2020-12-1', '1996年', + '1972', '2004/11/01', '1959年3月北京第1版第1印' + """ + year = date[:4] + moon = "01" + day = "01" + + if len(date) > 5: + digit = [] + ls = [] + for i in range(5, len(date)): + if date[i].isdigit(): + digit.append(date[i]) + elif digit: + 
ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}") + digit = [] + if digit: + ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}") + + moon = ls[0] + if len(ls)>1: + day = ls[1] + + return f"{year}-{moon}-{day}"