From 73567db4fb01b934604f3d17f7d025d9b4e61f6f Mon Sep 17 00:00:00 2001
From: xlivevil
Date: Mon, 10 Oct 2022 01:49:42 +0800
Subject: [PATCH] Update douban metadata provider

Change the search API
Return an empty list when an error occurs
Change the way tags are extracted
Fix series and publisher parse errors
Rename a variable that shadowed the built-in next
---
 cps/metadata_provider/douban.py | 148 +++++++++++++++++++++-----------
 1 file changed, 100 insertions(+), 48 deletions(-)

diff --git a/cps/metadata_provider/douban.py b/cps/metadata_provider/douban.py
index ee21f587..8e27e82e 100644
--- a/cps/metadata_provider/douban.py
+++ b/cps/metadata_provider/douban.py
@@ -43,7 +43,8 @@ class Douban(Metadata):
     __id__ = "douban"
     DESCRIPTION = "豆瓣"
     META_URL = "https://book.douban.com/"
-    SEARCH_URL = "https://www.douban.com/j/search"
+    SEARCH_JSON_URL = "https://www.douban.com/j/search"
+    SEARCH_URL = "https://www.douban.com/search"
 
     ID_PATTERN = re.compile(r"sid: (?P<id>\d+),")
     AUTHORS_PATTERN = re.compile(r"作者|译者")
@@ -52,6 +53,7 @@ class Douban(Metadata):
     PUBLISHED_DATE_PATTERN = re.compile(r"出版年")
     SERIES_PATTERN = re.compile(r"丛书")
     IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号")
+    CRITERIA_PATTERN = re.compile("criteria = '(.+)'")
 
     TITTLE_XPATH = "//span[@property='v:itemreviewed']"
     COVER_XPATH = "//a[@class='nbg']"
@@ -63,56 +65,90 @@ class Douban(Metadata):
     session = requests.Session()
     session.headers = {
         'user-agent':
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
     }
 
-    def search(
-        self, query: str, generic_cover: str = "", locale: str = "en"
-    ) -> Optional[List[MetaRecord]]:
+    def search(self,
+               query: str,
+               generic_cover: str = "",
+               locale: str = "en") -> List[MetaRecord]:
+        val = []
         if self.active:
-            log.debug(f"starting search {query} on douban")
+            log.debug(f"start searching {query} on douban")
             if title_tokens := list(
-                self.get_title_tokens(query, strip_joiners=False)
-            ):
+                    self.get_title_tokens(query, strip_joiners=False)):
                 query = "+".join(title_tokens)
 
-            try:
-                r = self.session.get(
-                    self.SEARCH_URL, params={"cat": 1001, "q": query}
-                )
-                r.raise_for_status()
+            book_id_list = self._get_book_id_list_from_html(query)
 
-            except Exception as e:
-                log.warning(e)
-                return None
-
-            results = r.json()
-            if results["total"] == 0:
+            if not book_id_list:
+                log.debug("No search results in Douban")
                 return []
 
-            book_id_list = [
-                self.ID_PATTERN.search(item).group("id")
-                for item in results["items"][:10] if self.ID_PATTERN.search(item)
-            ]
-
-            with futures.ThreadPoolExecutor(max_workers=5) as executor:
+            with futures.ThreadPoolExecutor(
+                    max_workers=5, thread_name_prefix='douban') as executor:
                 fut = [
-                    executor.submit(self._parse_single_book, book_id, generic_cover)
-                    for book_id in book_id_list
+                    executor.submit(self._parse_single_book, book_id,
+                                    generic_cover) for book_id in book_id_list
                 ]
-
+
                 val = [
-                    future.result()
-                    for future in futures.as_completed(fut) if future.result()
+                    future.result() for future in futures.as_completed(fut)
+                    if future.result()
                 ]
 
         return val
 
-    def _parse_single_book(
-        self, id: str, generic_cover: str = ""
-    ) -> Optional[MetaRecord]:
+    def _get_book_id_list_from_html(self, query: str) -> List[str]:
+        try:
+            r = self.session.get(self.SEARCH_URL,
+                                 params={
+                                     "cat": 1001,
+                                     "q": query
+                                 })
+            r.raise_for_status()
+
+        except Exception as e:
+            log.warning(e)
+            return []
+
+        html = etree.HTML(r.content.decode("utf8"))
+        result_list = html.xpath(self.COVER_XPATH)
+
+        return [
+            self.ID_PATTERN.search(item.get("onclick")).group("id")
+            for item in result_list[:10]
+            if self.ID_PATTERN.search(item.get("onclick"))
+        ]
+
+    def _get_book_id_list_from_json(self, query: str) -> List[str]:
+        try:
+            r = self.session.get(self.SEARCH_JSON_URL,
+                                 params={
+                                     "cat": 1001,
+                                     "q": query
+                                 })
+            r.raise_for_status()
+
+        except Exception as e:
+            log.warning(e)
+            return []
+
+        results = r.json()
+        if results["total"] == 0:
+            return []
+
+        return [
+            self.ID_PATTERN.search(item).group("id")
+            for item in results["items"][:10] if self.ID_PATTERN.search(item)
+        ]
+
+    def _parse_single_book(self,
+                           id: str,
+                           generic_cover: str = "") -> Optional[MetaRecord]:
         url = f"https://book.douban.com/subject/{id}/"
+        log.debug(f"start parsing {url}")
 
         try:
             r = self.session.get(url)
@@ -136,7 +172,8 @@ class Douban(Metadata):
         html = etree.HTML(r.content.decode("utf8"))
 
         match.title = html.xpath(self.TITTLE_XPATH)[0].text
-        match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover
+        match.cover = html.xpath(
+            self.COVER_XPATH)[0].attrib["href"] or generic_cover
         try:
             rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
         except Exception:
@@ -146,35 +183,39 @@ class Douban(Metadata):
         tag_elements = html.xpath(self.TAGS_XPATH)
         if len(tag_elements):
             match.tags = [tag_element.text for tag_element in tag_elements]
+        else:
+            match.tags = self._get_tags(html.text)
 
         description_element = html.xpath(self.DESCRIPTION_XPATH)
         if len(description_element):
-            match.description = html2text(etree.tostring(
-                description_element[-1], encoding="utf8").decode("utf8"))
+            match.description = html2text(
+                etree.tostring(description_element[-1]).decode("utf8"))
 
         info = html.xpath(self.INFO_XPATH)
         for element in info:
             text = element.text
             if self.AUTHORS_PATTERN.search(text):
-                next = element.getnext()
-                while next is not None and next.tag != "br":
-                    match.authors.append(next.text)
-                    next = next.getnext()
+                next_element = element.getnext()
+                while next_element is not None and next_element.tag != "br":
+                    match.authors.append(next_element.text)
+                    next_element = next_element.getnext()
             elif self.PUBLISHER_PATTERN.search(text):
-                match.publisher = element.tail.strip()
+                if publisher := element.tail.strip():
+                    match.publisher = publisher
+                else:
+                    match.publisher = element.getnext().text
            elif self.SUBTITLE_PATTERN.search(text):
-                match.title = f'{match.title}:' + element.tail.strip()
+                match.title = f'{match.title}:{element.tail.strip()}'
            elif self.PUBLISHED_DATE_PATTERN.search(text):
                 match.publishedDate = self._clean_date(element.tail.strip())
-            elif self.SUBTITLE_PATTERN.search(text):
+            elif self.SERIES_PATTERN.search(text):
                 match.series = element.getnext().text
            elif i_type := self.IDENTIFIERS_PATTERN.search(text):
                 match.identifiers[i_type.group()] = element.tail.strip()
 
         return match
 
-
     def _clean_date(self, date: str) -> str:
         """
         Clean up the date string to be in the format YYYY-MM-DD
@@ -194,13 +235,24 @@ class Douban(Metadata):
             if date[i].isdigit():
                 digit.append(date[i])
             elif digit:
-                ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}")
+                ls.append("".join(digit) if len(digit) ==
+                          2 else f"0{digit[0]}")
                 digit = []
         if digit:
-            ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}")
+            ls.append("".join(digit) if len(digit) ==
+                      2 else f"0{digit[0]}")
         moon = ls[0]
-        if len(ls)>1:
-            day = ls[1]
+        if len(ls) > 1:
+            day = ls[1]
         return f"{year}-{moon}-{day}"
+
+    def _get_tags(self, text: str) -> List[str]:
+        tags = []
+        if criteria := self.CRITERIA_PATTERN.search(text):
+            tags.extend(
+                item.replace('7:', '') for item in criteria.group().split('|')
+                if item.startswith('7:'))
+
+        return tags
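
Illustration (not part of the patch): a minimal standalone sketch of the criteria-based tag extraction introduced in _get_tags above. The sample criteria string below is made up; real Douban book pages embed a similar inline-script variable whose '7:'-prefixed entries carry the tags.

    import re
    from typing import List

    CRITERIA_PATTERN = re.compile("criteria = '(.+)'")

    def get_tags(text: str) -> List[str]:
        # Same logic as Douban._get_tags: keep the "7:" entries and strip the prefix.
        tags = []
        if criteria := CRITERIA_PATTERN.search(text):
            tags.extend(
                item.replace('7:', '') for item in criteria.group().split('|')
                if item.startswith('7:'))
        return tags

    # Hypothetical page fragment for illustration only.
    sample = "criteria = '3:/subject_search|7:小说|7:外国文学|5:1000017'"
    print(get_tags(sample))  # ['小说', '外国文学']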