mirror of
				https://github.com/janeczku/calibre-web
				synced 2025-11-04 09:13:02 +00:00 
			
		
		
		
	Merge remote-tracking branch 'douban/metadata_provider/douban'
This commit is contained in:
		@@ -43,7 +43,8 @@ class Douban(Metadata):
 | 
			
		||||
    __id__ = "douban"
    DESCRIPTION = "豆瓣"
    META_URL = "https://book.douban.com/"
    # JSON endpoint (kept for _get_book_id_list_from_json) and the HTML
    # search page used by the default scraper.
    SEARCH_JSON_URL = "https://www.douban.com/j/search"
    SEARCH_URL = "https://www.douban.com/search"

    # Book id embedded in a search-result link's onclick: "sid: <digits>,".
    ID_PATTERN = re.compile(r"sid: (?P<id>\d+),")
    # Matches the author ("作者") / translator ("译者") labels on a book page.
    AUTHORS_PATTERN = re.compile(r"作者|译者")
 | 
			
		||||
@@ -52,6 +53,7 @@ class Douban(Metadata):
 | 
			
		||||
    # Labels on the book info panel: "出版年" = publication year,
    # "丛书" = book series, "统一书号" = unified book number (pre-ISBN id).
    PUBLISHED_DATE_PATTERN = re.compile(r"出版年")
    SERIES_PATTERN = re.compile(r"丛书")
    IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号")
    # Inline script variable holding the '|'-separated tag criteria string.
    CRITERIA_PATTERN = re.compile("criteria = '(.+)'")

    # NOTE(review): "TITTLE" is a typo for "TITLE", but the name is
    # referenced elsewhere in this class, so it is kept unchanged.
    TITTLE_XPATH = "//span[@property='v:itemreviewed']"
    COVER_XPATH = "//a[@class='nbg']"
 | 
			
		||||
@@ -63,56 +65,90 @@ class Douban(Metadata):
 | 
			
		||||
    # Shared HTTP session; Douban rejects requests without a browser-like
    # User-Agent, so a desktop Edge/Chrome UA is pinned here.
    session = requests.Session()
    session.headers = {
        'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
    }
 | 
			
		||||
 | 
			
		||||
    def search(self,
               query: str,
               generic_cover: str = "",
               locale: str = "en") -> List[MetaRecord]:
        """Search Douban for *query* and return matching metadata records.

        The query is tokenized, the HTML search page is scraped for book
        ids, and each matching book page is fetched and parsed concurrently.

        :param query: free-text search terms entered by the user
        :param generic_cover: fallback cover URL forwarded to the per-book
            parser
        :param locale: unused here; kept for the provider interface
        :return: list of parsed records; empty when inactive, on error, or
            when there are no results
        """
        val = []
        if self.active:
            log.debug(f"start searching {query} on douban")

            # Join the title tokens with '+' so they form a single query
            # string for the search URL.
            if title_tokens := list(
                    self.get_title_tokens(query, strip_joiners=False)):
                query = "+".join(title_tokens)

            book_id_list = self._get_book_id_list_from_html(query)
            if not book_id_list:
                log.debug("No search results in Douban")
                return []

            # Fetch/parse the individual book pages in parallel; parses
            # that fail return a falsy result and are filtered out.
            with futures.ThreadPoolExecutor(
                    max_workers=5, thread_name_prefix='douban') as executor:
                fut = [
                    executor.submit(self._parse_single_book, book_id,
                                    generic_cover) for book_id in book_id_list
                ]
                val = [
                    future.result() for future in futures.as_completed(fut)
                    if future.result()
                ]

        return val
 | 
			
		||||
 | 
			
		||||
    def _get_book_id_list_from_html(self, query: str) -> List[str]:
        """Scrape the Douban HTML search page and return up to 10 book ids.

        Network/HTTP errors are logged and swallowed; an empty list is
        returned so the caller can treat failure like "no results".
        """
        try:
            r = self.session.get(self.SEARCH_URL,
                                 params={
                                     "cat": 1001,  # category 1001 = books
                                     "q": query
                                 })
            r.raise_for_status()

        except Exception as e:
            log.warning(e)
            return []

        html = etree.HTML(r.content.decode("utf8"))
        result_list = html.xpath(self.COVER_XPATH)

        # Each cover link carries the subject id in its onclick handler;
        # extract it with ID_PATTERN, skipping entries that do not match.
        return [
            self.ID_PATTERN.search(item.get("onclick")).group("id")
            for item in result_list[:10]
            if self.ID_PATTERN.search(item.get("onclick"))
        ]
 | 
			
		||||
 | 
			
		||||
    def _get_book_id_list_from_json(self, query: str) -> List[str]:
        """Query the Douban JSON search API and return up to 10 book ids.

        Alternative to :meth:`_get_book_id_list_from_html`. Network/HTTP
        errors are logged and swallowed; an empty list signals "no results".
        """
        try:
            r = self.session.get(self.SEARCH_JSON_URL,
                                 params={
                                     "cat": 1001,  # category 1001 = books
                                     "q": query
                                 })
            r.raise_for_status()

        except Exception as e:
            log.warning(e)
            return []

        results = r.json()
        if results["total"] == 0:
            return []

        # Each item is an HTML snippet; the subject id is embedded in it
        # and extracted with ID_PATTERN, skipping items that do not match.
        return [
            self.ID_PATTERN.search(item).group("id")
            for item in results["items"][:10] if self.ID_PATTERN.search(item)
        ]
 | 
			
		||||
 | 
			
		||||
    def _parse_single_book(self,
 | 
			
		||||
                           id: str,
 | 
			
		||||
                           generic_cover: str = "") -> Optional[MetaRecord]:
 | 
			
		||||
        url = f"https://book.douban.com/subject/{id}/"
 | 
			
		||||
        log.debug(f"start parsing {url}")
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            r = self.session.get(url)
 | 
			
		||||
@@ -136,7 +172,8 @@ class Douban(Metadata):
 | 
			
		||||
        html = etree.HTML(r.content.decode("utf8"))
 | 
			
		||||
 | 
			
		||||
        match.title = html.xpath(self.TITTLE_XPATH)[0].text
 | 
			
		||||
        match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover
 | 
			
		||||
        match.cover = html.xpath(
 | 
			
		||||
            self.COVER_XPATH)[0].attrib["href"] or generic_cover
 | 
			
		||||
        try:
 | 
			
		||||
            rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
 | 
			
		||||
        except Exception:
 | 
			
		||||
@@ -146,35 +183,39 @@ class Douban(Metadata):
 | 
			
		||||
        tag_elements = html.xpath(self.TAGS_XPATH)
 | 
			
		||||
        if len(tag_elements):
 | 
			
		||||
            match.tags = [tag_element.text for tag_element in tag_elements]
 | 
			
		||||
        else:
 | 
			
		||||
            match.tags = self._get_tags(html.text)
 | 
			
		||||
 | 
			
		||||
        description_element = html.xpath(self.DESCRIPTION_XPATH)
 | 
			
		||||
        if len(description_element):
 | 
			
		||||
            match.description = html2text(etree.tostring(
 | 
			
		||||
                description_element[-1], encoding="utf8").decode("utf8"))
 | 
			
		||||
            match.description = html2text(
 | 
			
		||||
                etree.tostring(description_element[-1]).decode("utf8"))
 | 
			
		||||
 | 
			
		||||
        info = html.xpath(self.INFO_XPATH)
 | 
			
		||||
 | 
			
		||||
        for element in info:
 | 
			
		||||
            text = element.text
 | 
			
		||||
            if self.AUTHORS_PATTERN.search(text):
 | 
			
		||||
                next = element.getnext()
 | 
			
		||||
                while next is not None and next.tag != "br":
 | 
			
		||||
                    match.authors.append(next.text)
 | 
			
		||||
                    next = next.getnext()
 | 
			
		||||
                next_element = element.getnext()
 | 
			
		||||
                while next_element is not None and next_element.tag != "br":
 | 
			
		||||
                    match.authors.append(next_element.text)
 | 
			
		||||
                    next_element = next_element.getnext()
 | 
			
		||||
            elif self.PUBLISHER_PATTERN.search(text):
 | 
			
		||||
                match.publisher = element.tail.strip()
 | 
			
		||||
                if publisher := element.tail.strip():
 | 
			
		||||
                    match.publisher = publisher
 | 
			
		||||
                else:
 | 
			
		||||
                    match.publisher = element.getnext().text
 | 
			
		||||
            elif self.SUBTITLE_PATTERN.search(text):
 | 
			
		||||
                match.title = f'{match.title}:' + element.tail.strip()
 | 
			
		||||
                match.title = f'{match.title}:{element.tail.strip()}'
 | 
			
		||||
            elif self.PUBLISHED_DATE_PATTERN.search(text):
 | 
			
		||||
                match.publishedDate = self._clean_date(element.tail.strip())
 | 
			
		||||
            elif self.SUBTITLE_PATTERN.search(text):
 | 
			
		||||
            elif self.SERIES_PATTERN.search(text):
 | 
			
		||||
                match.series = element.getnext().text
 | 
			
		||||
            elif i_type := self.IDENTIFIERS_PATTERN.search(text):
 | 
			
		||||
                match.identifiers[i_type.group()] = element.tail.strip()
 | 
			
		||||
 | 
			
		||||
        return match
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    def _clean_date(self, date: str) -> str:
 | 
			
		||||
        """
 | 
			
		||||
        Clean up the date string to be in the format YYYY-MM-DD
 | 
			
		||||
@@ -194,13 +235,24 @@ class Douban(Metadata):
 | 
			
		||||
                if date[i].isdigit():
 | 
			
		||||
                    digit.append(date[i])
 | 
			
		||||
                elif digit:
 | 
			
		||||
                    ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}")
 | 
			
		||||
                    ls.append("".join(digit) if len(digit) ==
 | 
			
		||||
                              2 else f"0{digit[0]}")
 | 
			
		||||
                    digit = []
 | 
			
		||||
            if digit:
 | 
			
		||||
                ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}")
 | 
			
		||||
                ls.append("".join(digit) if len(digit) ==
 | 
			
		||||
                          2 else f"0{digit[0]}")
 | 
			
		||||
 | 
			
		||||
            moon = ls[0]
 | 
			
		||||
            if len(ls)>1:
 | 
			
		||||
            if len(ls) > 1:
 | 
			
		||||
                day = ls[1]
 | 
			
		||||
 | 
			
		||||
        return f"{year}-{moon}-{day}"
 | 
			
		||||
 | 
			
		||||
    def _get_tags(self, text: str) -> List[str]:
 | 
			
		||||
        tags = []
 | 
			
		||||
        if criteria := self.CRITERIA_PATTERN.search(text):
 | 
			
		||||
            tags.extend(
 | 
			
		||||
                item.replace('7:', '') for item in criteria.group().split('|')
 | 
			
		||||
                if item.startswith('7:'))
 | 
			
		||||
 | 
			
		||||
        return tags
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user