\d+),")
+ AUTHORS_PATTERN = re.compile(r"作者|译者")
+ PUBLISHER_PATTERN = re.compile(r"出版社")
+ SUBTITLE_PATTERN = re.compile(r"副标题")
+ PUBLISHED_DATE_PATTERN = re.compile(r"出版年")
+ SERIES_PATTERN = re.compile(r"丛书")
+ IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号")
+
+ TITTLE_XPATH = "//span[@property='v:itemreviewed']"
+ COVER_XPATH = "//a[@class='nbg']"
+ INFO_XPATH = "//*[@id='info']//span[@class='pl']"
+ TAGS_XPATH = "//a[contains(@class, 'tag')]"
+ DESCRIPTION_XPATH = "//div[@id='link-report']//div[@class='intro']"
+ RATING_XPATH = "//div[@class='rating_self clearfix']/strong"
+
+ session = requests.Session()
+ session.headers = {
+ 'user-agent':
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
+ }
+
+ def search(
+ self, query: str, generic_cover: str = "", locale: str = "en"
+ ) -> Optional[List[MetaRecord]]:
+ if self.active:
+ log.debug(f"starting search {query} on douban")
+ if title_tokens := list(
+ self.get_title_tokens(query, strip_joiners=False)
+ ):
+ query = "+".join(title_tokens)
+
+ try:
+ r = self.session.get(
+ self.SEARCH_URL, params={"cat": 1001, "q": query}
+ )
+ r.raise_for_status()
+
+ except Exception as e:
+ log.warning(e)
+ return None
+
+ results = r.json()
+ if results["total"] == 0:
+ return []
+
+ book_id_list = [
+ self.ID_PATTERN.search(item).group("id")
+ for item in results["items"][:10] if self.ID_PATTERN.search(item)
+ ]
+
+ with futures.ThreadPoolExecutor(max_workers=5) as executor:
+
+ fut = [
+ executor.submit(self._parse_single_book, book_id, generic_cover)
+ for book_id in book_id_list
+ ]
+
+ val = [
+ future.result()
+ for future in futures.as_completed(fut) if future.result()
+ ]
+
+ return val
+
+ def _parse_single_book(
+ self, id: str, generic_cover: str = ""
+ ) -> Optional[MetaRecord]:
+ url = f"https://book.douban.com/subject/{id}/"
+
+ try:
+ r = self.session.get(url)
+ r.raise_for_status()
+ except Exception as e:
+ log.warning(e)
+ return None
+
+ match = MetaRecord(
+ id=id,
+ title="",
+ authors=[],
+ url=url,
+ source=MetaSourceInfo(
+ id=self.__id__,
+ description=self.DESCRIPTION,
+ link=self.META_URL,
+ ),
+ )
+
+ html = etree.HTML(r.content.decode("utf8"))
+
+ match.title = html.xpath(self.TITTLE_XPATH)[0].text
+ match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover
+ try:
+ rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
+ except Exception:
+ rating_num = 0
+ match.rating = int(-1 * rating_num // 2 * -1) if rating_num else 0
+
+ tag_elements = html.xpath(self.TAGS_XPATH)
+ if len(tag_elements):
+ match.tags = [tag_element.text for tag_element in tag_elements]
+
+ description_element = html.xpath(self.DESCRIPTION_XPATH)
+ if len(description_element):
+ match.description = html2text(etree.tostring(
+ description_element[-1], encoding="utf8").decode("utf8"))
+
+ info = html.xpath(self.INFO_XPATH)
+
+ for element in info:
+ text = element.text
+ if self.AUTHORS_PATTERN.search(text):
+ next = element.getnext()
+ while next is not None and next.tag != "br":
+ match.authors.append(next.text)
+ next = next.getnext()
+ elif self.PUBLISHER_PATTERN.search(text):
+ match.publisher = element.tail.strip()
+ elif self.SUBTITLE_PATTERN.search(text):
+ match.title = f'{match.title}:' + element.tail.strip()
+ elif self.PUBLISHED_DATE_PATTERN.search(text):
+ match.publishedDate = self._clean_date(element.tail.strip())
+ elif self.SERIES_PATTERN.search(text):
+ match.series = element.getnext().text
+ elif i_type := self.IDENTIFIERS_PATTERN.search(text):
+ match.identifiers[i_type.group()] = element.tail.strip()
+
+ return match
+
+
+ def _clean_date(self, date: str) -> str:
+ """
+ Clean up the date string to be in the format YYYY-MM-DD
+
+ Examples of possible patterns:
+ '2014-7-16', '1988年4月', '1995-04', '2021-8', '2020-12-1', '1996年',
+ '1972', '2004/11/01', '1959年3月北京第1版第1印'
+ """
+ year = date[:4]
+ moon = "01"
+ day = "01"
+
+ if len(date) > 5:
+ digit = []
+ ls = []
+ for i in range(5, len(date)):
+ if date[i].isdigit():
+ digit.append(date[i])
+ elif digit:
+ ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}")
+ digit = []
+ if digit:
+ ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}")
+
+ moon = ls[0]
+ if len(ls)>1:
+ day = ls[1]
+
+ return f"{year}-{moon}-{day}"
diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py
index fbb68965..98fadd37 100644
--- a/cps/metadata_provider/google.py
+++ b/cps/metadata_provider/google.py
@@ -22,9 +22,12 @@ from urllib.parse import quote
import requests
+from cps import logger
from cps.isoLanguages import get_lang3, get_language_name
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
+log = logger.create()
+
class Google(Metadata):
__name__ = "Google"
@@ -45,7 +48,12 @@ class Google(Metadata):
if title_tokens:
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = "+".join(tokens)
- results = requests.get(Google.SEARCH_URL + query)
+ try:
+ results = requests.get(Google.SEARCH_URL + query)
+ results.raise_for_status()
+ except Exception as e:
+ log.warning(e)
+ return None
for result in results.json().get("items", []):
val.append(
self._parse_search_result(
diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py
index 4a77f4ea..e4abe9db 100644
--- a/cps/metadata_provider/lubimyczytac.py
+++ b/cps/metadata_provider/lubimyczytac.py
@@ -27,9 +27,12 @@ from html2text import HTML2Text
from lxml.html import HtmlElement, fromstring, tostring
from markdown2 import Markdown
+from cps import logger
from cps.isoLanguages import get_language_name
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
+log = logger.create()
+
SYMBOLS_TO_TRANSLATE = (
"öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
"oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
@@ -112,20 +115,23 @@ class LubimyCzytac(Metadata):
self, query: str, generic_cover: str = "", locale: str = "en"
) -> Optional[List[MetaRecord]]:
if self.active:
- result = requests.get(self._prepare_query(title=query))
- if result.text:
- root = fromstring(result.text)
- lc_parser = LubimyCzytacParser(root=root, metadata=self)
- matches = lc_parser.parse_search_results()
- if matches:
- with ThreadPool(processes=10) as pool:
- final_matches = pool.starmap(
- lc_parser.parse_single_book,
- [(match, generic_cover, locale) for match in matches],
- )
- return final_matches
- return matches
- return []
+ try:
+ result = requests.get(self._prepare_query(title=query))
+ result.raise_for_status()
+ except Exception as e:
+ log.warning(e)
+ return None
+ root = fromstring(result.text)
+ lc_parser = LubimyCzytacParser(root=root, metadata=self)
+ matches = lc_parser.parse_search_results()
+ if matches:
+ with ThreadPool(processes=10) as pool:
+ final_matches = pool.starmap(
+ lc_parser.parse_single_book,
+ [(match, generic_cover, locale) for match in matches],
+ )
+ return final_matches
+ return matches
def _prepare_query(self, title: str) -> str:
query = ""
@@ -202,7 +208,12 @@ class LubimyCzytacParser:
def parse_single_book(
self, match: MetaRecord, generic_cover: str, locale: str
) -> MetaRecord:
- response = requests.get(match.url)
+ try:
+ response = requests.get(match.url)
+ response.raise_for_status()
+ except Exception as e:
+ log.warning(e)
+ return None
self.root = fromstring(response.text)
match.cover = self._parse_cover(generic_cover=generic_cover)
match.description = self._parse_description()
diff --git a/cps/metadata_provider/scholar.py b/cps/metadata_provider/scholar.py
index b0c10b66..7feb0ee9 100644
--- a/cps/metadata_provider/scholar.py
+++ b/cps/metadata_provider/scholar.py
@@ -28,8 +28,12 @@ try:
except FakeUserAgentError:
raise ImportError("No module named 'scholarly'")
+from cps import logger
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
+log = logger.create()
+
+
class scholar(Metadata):
__name__ = "Google Scholar"
__id__ = "googlescholar"
@@ -44,7 +48,11 @@ class scholar(Metadata):
if title_tokens:
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = " ".join(tokens)
- scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
+ try:
+ scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
+ except Exception as e:
+ log.warning(e)
+ return None
for result in scholar_gen:
match = self._parse_search_result(
result=result, generic_cover="", locale=locale
diff --git a/cps/search_metadata.py b/cps/search_metadata.py
index 79e27554..4bbca58f 100644
--- a/cps/search_metadata.py
+++ b/cps/search_metadata.py
@@ -138,6 +138,6 @@ def metadata_search():
if active.get(c.__id__, True)
}
for future in concurrent.futures.as_completed(meta):
- data.extend([asdict(x) for x in future.result()])
+ data.extend([asdict(x) for x in future.result() if x])
# log.info({'Time elapsed {}'.format(current_milli_time()-start)})
return Response(json.dumps(data), mimetype="application/json")
diff --git a/cps/static/js/edit_books.js b/cps/static/js/edit_books.js
index 0bfe078c..c1eb319d 100644
--- a/cps/static/js/edit_books.js
+++ b/cps/static/js/edit_books.js
@@ -33,7 +33,7 @@ $(".datepicker").datepicker({
if (results) {
pubDate = new Date(results[1], parseInt(results[2], 10) - 1, results[3]) || new Date(this.value);
$(this).next('input')
- .val(pubDate.toLocaleDateString(language))
+ .val(pubDate.toLocaleDateString(language.replaceAll("_","-")))
.removeClass("hidden");
}
}).trigger("change");
diff --git a/cps/static/js/get_meta.js b/cps/static/js/get_meta.js
index 6db1a261..43a40fa6 100644
--- a/cps/static/js/get_meta.js
+++ b/cps/static/js/get_meta.js
@@ -92,14 +92,19 @@ $(function () {
data: {"query": keyword},
dataType: "json",
success: function success(data) {
- $("#meta-info").html("");
- data.forEach(function(book) {
- var $book = $(templates.bookResult(book));
- $book.find("img").on("click", function () {
- populateForm(book);
+ if (data.length) {
+ $("#meta-info").html("");
+ data.forEach(function(book) {
+ var $book = $(templates.bookResult(book));
+ $book.find("img").on("click", function () {
+ populateForm(book);
+ });
+ $("#book-list").append($book);
});
- $("#book-list").append($book);
- });
+ }
+ else {
+ $("#meta-info").html("<p class=\"text-danger\">" + msg.no_result + "!</p>" + $("#meta-info")[0].innerHTML)
+ }
},
error: function error() {
$("#meta-info").html("<p class=\"text-danger\">" + msg.search_error + "!</p>" + $("#meta-info")[0].innerHTML);