>",
+ html,
+ )
+ h2t = HTML2Text()
+ h2t.body_width = 0
+ h2t.single_line_break = True
+ h2t.emphasis_mark = "*"
+ return h2t.handle(html)
+
+
class LubimyCzytac(Metadata):
    """Metadata provider that scrapes book information from lubimyczytac.pl.

    ``search`` queries the site's book search page, collects the basic data
    of every hit (id, title, authors, URL) and then downloads each book page
    to fill in series, tags, publisher, dates, rating, description, cover,
    languages and identifiers.

    All page-structure knowledge lives in the XPath constants below so that a
    change of the site's markup only requires touching these strings.
    """

    __name__ = "LubimyCzytac.pl"
    __id__ = "lubimyczytac"

    BASE_URL = "https://lubimyczytac.pl"
    # Seconds to wait for the remote site before a request is abandoned.
    REQUEST_TIMEOUT = 10

    # --- search result page -------------------------------------------------
    BOOK_SEARCH_RESULT_XPATH = (
        "*//div[@class='listSearch']//div[@class='authorAllBooks__single']"
    )
    SINGLE_BOOK_RESULT_XPATH = ".//div[contains(@class,'authorAllBooks__singleText')]"
    TITLE_PATH = "/div/a[contains(@class,'authorAllBooks__singleTextTitle')]"
    TITLE_TEXT_PATH = f"{TITLE_PATH}//text()"
    URL_PATH = f"{TITLE_PATH}/@href"
    AUTHORS_PATH = "/div/a[contains(@href,'autor')]//text()"

    # --- single book page ---------------------------------------------------
    SIBLINGS = "/following-sibling::dd"

    CONTAINER = "//section[@class='container book']"
    PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
    LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
    DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
    SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]"

    DETAILS = "//div[@id='book-details']"
    # Deliberately unterminated: the two constants below append the rest of
    # the title text and close the predicate.
    PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
    FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
    FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
    TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"
    RATING = "//meta[@property='books:rating:value']/@content"
    COVER = "//meta[@property='og:image']/@content"

    # JSON-LD block embedded in the book page.
    SUMMARY = "//script[@type='application/ld+json']//text()"

    # --- title tokenizer patterns (compiled once, not on every call) ----------
    _SUBTITLE_PATTERN = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)")
    _TITLE_PATTERNS = (
        # Remove things like: (2010) (Omnibus) etc.
        (
            re.compile(
                r"[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]",
                re.IGNORECASE,
            ),
            "",
        ),
        # Remove any bracketed string that contains "edition" or "ed.".
        # The dot is escaped so that e.g. "(Red Book)" is no longer removed.
        (re.compile(r"[({\[].*?(edition|ed\.).*?[\]})]", re.IGNORECASE), ""),
        # Remove commas used as separators in numbers
        (re.compile(r"(\d+),(\d+)"), r"\1\2"),
        # Remove hyphens only if they have whitespace before them
        (re.compile(r"(\s-)"), " "),
        # Replace other special chars with a space. The CJK quote characters
        # used to sit *after* the closing bracket, which made the pattern
        # require that literal sequence and therefore never match in practice.
        (re.compile(r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]"""), " "),
    )

    def search(self, query, __):
        """Search lubimyczytac.pl for ``query`` and return fully parsed books.

        :param query: free-text title to look for.
        :param __: unused by this provider.
        :return: list of book dictionaries; empty when the provider is
            inactive, the query yields no usable search tokens or nothing
            was found.
        """
        if not self.active:
            return []
        url = self._prepare_query(title=query)
        if not url:
            # Nothing searchable is left after cleaning the title; an empty
            # URL would only make ``requests`` raise.
            return []
        result = requests.get(url, timeout=LubimyCzytac.REQUEST_TIMEOUT)
        root = fromstring(result.text)
        matches = self._parse_search_results(root=root)
        return [self._parse_single_book(match=match) for match in matches]

    def _prepare_query(self, title: str) -> str:
        """Build the search URL for ``title``; return "" if no token remains."""
        # Drop characters that are not wanted in the search phrase: ? ( ) /
        title = re.sub(r"[?()/]", "", title)
        title = title.replace("_", " ")
        # Keep only the part in front of a straight double quote or ",,"
        # (presumably a typed Polish opening quotation mark -- confirm).
        title = title.split('"')[0].split(",,")[0]

        # NOTE: the former special case for titles containing "/" was
        # unreachable because every "/" has already been removed above.
        title_tokens = self.get_title_tokens(
            title, strip_joiners=False, strip_subtitle=True
        )
        query = "%20".join(quote(token.encode("utf-8")) for token in title_tokens)
        if not query:
            return ""
        return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"

    def _parse_search_results(self, root) -> List[Dict]:
        """Extract id, title, authors and URL of every hit on a result page.

        Hits that lack a title, a URL or authors are skipped.
        """
        matches = []
        for result in root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH):
            title = result.xpath(
                f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.TITLE_TEXT_PATH}"
            )
            book_url = result.xpath(
                f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.URL_PATH}"
            )
            authors = result.xpath(
                f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
                f"{LubimyCzytac.AUTHORS_PATH}"
            )
            # Text nodes may carry surrounding whitespace or be blank.
            authors = [author.strip() for author in authors if author.strip()]

            if not title or not book_url or not authors:
                continue

            full_url = LubimyCzytac.BASE_URL + book_url[0]
            # The id is the first path segment after "/ksiazka/".
            book_id = full_url.replace(
                f"{LubimyCzytac.BASE_URL}/ksiazka/", ""
            ).split("/")[0]
            matches.append(
                {
                    "id": book_id,
                    "title": title[0].strip(),
                    "authors": authors,
                    "url": full_url,
                }
            )
        return matches

    def _parse_single_book(self, match: Dict) -> Dict:
        """Download the book page of ``match`` and add its details in place.

        :param match: dictionary produced by ``_parse_search_results``.
        :return: the same dictionary, extended with the parsed fields.
        """
        result = requests.get(
            match.get("url"), timeout=LubimyCzytac.REQUEST_TIMEOUT
        )
        root = fromstring(result.text)
        match["series"], match["series_index"] = self._parse_series(root=root)
        match["tags"] = self._parse_tags(root=root)
        match["publisher"] = self._parse_publisher(root=root)
        match["publishedDate"] = self._parse_from_summary(
            root=root, attribute_name="datePublished"
        )
        match["rating"] = self._parse_rating(root=root)
        match["description"] = self._parse_description(root=root)
        match["cover"] = self._parse_cover(root=root)
        match["source"] = {
            "id": self.__id__,
            "description": self.__name__,
            "link": LubimyCzytac.BASE_URL,
        }
        match["languages"] = self._parse_languages(root=root)

        identifiers = {"lubimyczytac": match["id"]}
        isbn = self._parse_isbn(root=root)
        if isbn:
            # Only advertise an ISBN when the page actually carries one.
            identifiers["isbn"] = isbn
        match["identifiers"] = identifiers
        return match

    def _parse_cover(self, root):
        """Return the cover image URL from the page metadata, or None."""
        cover_urls = root.xpath(LubimyCzytac.COVER)
        return cover_urls[0] if cover_urls else None

    def _parse_publisher(self, root):
        """Return the publisher name, or None when the page has none."""
        publisher = root.xpath(LubimyCzytac.PUBLISHER)
        return publisher[0] if publisher else None

    def _parse_languages(self, root):
        """Return the book languages as English names.

        Only Polish and English are recognised; when neither is found the
        result defaults to ``['Polish']``.
        """
        languages = []
        lang = root.xpath(LubimyCzytac.LANGUAGES)
        if lang:
            lang_text = lang[0].strip()
            if "polski" in lang_text:
                languages.append("Polish")
            if "angielski" in lang_text:
                languages.append("English")
        return languages or ["Polish"]

    def _parse_series(self, root):
        """Return ``(series_name, series_index)`` for the book.

        The site renders a series as ``"Name (tom 3)"``. ``(None, None)`` is
        returned when the book belongs to no series; the index is 0 when the
        series carries no parsable volume number.
        """
        series_texts = root.xpath(f"{LubimyCzytac.SERIES}/text()")
        if not series_texts:
            return (None, None)
        series_text = series_texts[0].replace("\n", "").strip()
        if not series_text:
            return (None, None)

        series_name, marker, volume_part = series_text.partition(" (tom ")
        if not marker:
            # Series without a volume number: keep the name instead of
            # discarding the whole series.
            return (series_text, 0)

        volume = volume_part.replace(" ", "").replace(")", "")
        # A bundle of volumes such as "1-3" is indexed by its first volume.
        volume = volume.split("-", 1)[0]
        if not volume.replace(".", "").isdigit():
            return (series_name, 0)
        try:
            return (series_name, get_int_or_float(volume))
        except (TypeError, ValueError):
            # e.g. "1.2.3" passes the digit check but is not a number.
            return (series_name, 0)

    def _parse_tags(self, root):
        """Return the genre breadcrumb entries as tags, or None if absent."""
        tags = root.xpath(LubimyCzytac.TAGS)
        if not tags:
            return None
        # Drop the comma in front of "itd." -- presumably because tags are
        # handled as a comma-separated list downstream (confirm).
        return [tag.replace(", itd.", " itd.") for tag in tags]

    def _parse_from_summary(self, root, attribute_name: str):
        """Return ``attribute_name`` from the page's JSON-LD summary.

        :return: the value (strings are stripped), or None when the page has
            no summary, the summary is not valid JSON / not an object, or the
            attribute is missing.
        """
        summary = root.xpath(LubimyCzytac.SUMMARY)
        if not summary:
            return None
        try:
            data = json.loads(summary[0])
        except ValueError:
            return None
        if not isinstance(data, dict):
            return None
        value = data.get(attribute_name)
        # JSON values such as page counts may be numbers, which cannot be
        # stripped.
        return value.strip() if isinstance(value, str) else value

    def _parse_rating(self, root):
        """Return the rating halved and rounded to an integer, or None.

        The halving presumably maps the site's scale onto a 5-star scale
        (confirm against the site).
        """
        rating_node = root.xpath(LubimyCzytac.RATING)
        if not rating_node:
            return None
        try:
            return round(float(rating_node[0].replace(",", ".")) / 2)
        except ValueError:
            return None

    def _parse_date(self, root, xpath="first_publish"):
        """Return a first-publication date as ``datetime``, or None.

        :param xpath: which date to read -- ``"first_publish"`` (original
            edition) or ``"first_publish_pl"`` (Polish edition).
        :return: the parsed date; None when the key is unknown, the page has
            no such entry or the text cannot be parsed.
        """
        options = {
            "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
            "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
        }
        path = options.get(xpath)
        if path is None:
            return None
        from dateutil import parser

        data = root.xpath(path)
        if not data:
            return None
        try:
            return parser.parse(data[0].strip())
        except (ValueError, OverflowError):
            return None

    def _parse_isbn(self, root):
        """Return the ISBN from the page metadata, or None when missing."""
        isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')
        return isbn_node[0] if isbn_node else None

    def _parse_description(self, root):
        """Build the book description shown to the user.

        Uses the description container of the page (with ``p.source``
        paragraphs removed) and falls back to the ``og:description`` meta
        tag. Page count and first-publication dates are appended when known.
        """
        comments = ""
        description_node = root.xpath(LubimyCzytac.DESCRIPTION)
        if description_node:
            for source_note in root.xpath('//p[@class="source"]'):
                source_note.getparent().remove(source_note)
            comments = tostring(description_node[0], method="html")
            comments = sanitize_comments_html(comments)
        else:
            # Fall back to the short description from the page metadata.
            description_node = root.xpath(
                '//meta[@property="og:description"]/@content'
            )
            if description_node:
                comments = sanitize_comments_html(description_node[0])

        # NOTE(review): the three literals below were broken across lines in
        # the reviewed source, which suggests that markup around them was
        # lost; the visible text is kept and the break written as "\n".
        pages = self._parse_from_summary(root=root, attribute_name="numberOfPages")
        if pages:
            comments += f"Książka ma {pages} stron(y).\n"

        first_publish_date = self._parse_date(root=root)
        if first_publish_date:
            comments += (
                "Data pierwszego wydania: "
                f'{first_publish_date.strftime("%d.%m.%Y")}\n'
            )

        first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl")
        if first_publish_date_pl:
            comments += (
                "Data pierwszego wydania w Polsce: "
                f'{first_publish_date_pl.strftime("%d.%m.%Y")}\n'
            )

        return comments

    def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
        """Yield the search tokens of ``title``.

        Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py.

        :param title: raw book title.
        :param strip_joiners: when true, drop "a", "and", "the" and "&".
        :param strip_subtitle: when true, remove bracketed parts and anything
            after "/", ":" or a backslash, unless that would leave at most
            one character.
        """
        if strip_subtitle:
            without_subtitle = LubimyCzytac._SUBTITLE_PATTERN.sub("", title)
            if len(without_subtitle) > 1:
                title = without_subtitle

        for pattern, replacement in LubimyCzytac._TITLE_PATTERNS:
            title = pattern.sub(replacement, title)

        for token in title.split():
            token = token.strip().strip('"').strip("'")
            if token and (
                not strip_joiners or token.lower() not in ("a", "and", "the", "&")
            ):
                yield token
diff --git a/cps/static/js/get_meta.js b/cps/static/js/get_meta.js
index 51ab740d..a8643065 100644
--- a/cps/static/js/get_meta.js
+++ b/cps/static/js/get_meta.js
@@ -26,19 +26,26 @@ $(function () {
)
};
/**
 * Merge the comma-separated values already present in the form field
 * "#<attribute_name>" with the values found in book[attribute_name].
 *
 * Existing entries keep their order, new entries are appended, and nothing
 * is added twice. Blank entries of the form field are dropped.
 *
 * @param {string} attribute_name  id of the form field and key in `book`.
 * @param {Object} book            metadata search result.
 * @returns {Array} de-duplicated list of values.
 */
function getUniqueValues(attribute_name, book) {
    // .val() is undefined when the page has no such field.
    var fieldValue = $("#" + attribute_name).val() || "";
    // $.map drops entries for which the callback returns null.
    var presentArray = $.map(fieldValue.split(","), function (item) {
        var trimmed = $.trim(item);
        return trimmed === "" ? null : trimmed;
    });
    // Not every metadata source supplies every attribute.
    $.each(book[attribute_name] || [], function (i, el) {
        if ($.inArray(el, presentArray) === -1) {
            presentArray.push(el);
        }
    });
    return presentArray;
}
+
function populateForm (book) {
tinymce.get("description").setContent(book.description);
- var uniqueTags = $.map($("#tags").val().split(","), $.trim);
- if ( uniqueTags.length == 1 && uniqueTags[0] == "") {
- uniqueTags = [];
- }
- $.each(book.tags, function(i, el) {
- if ($.inArray(el, uniqueTags) === -1) uniqueTags.push(el);
- });
+ var uniqueTags = getUniqueValues('tags', book)
+ var uniqueLanguages = getUniqueValues('languages', book)
var ampSeparatedAuthors = (book.authors || []).join(" & ");
$("#bookAuthor").val(ampSeparatedAuthors);
$("#book_title").val(book.title);
$("#tags").val(uniqueTags.join(", "));
+ $("#languages").val(uniqueLanguages.join(", "));
$("#rating").data("rating").setValue(Math.round(book.rating));
if(book.cover !== null){
$(".cover img").attr("src", book.cover);
@@ -48,7 +55,32 @@ $(function () {
$("#publisher").val(book.publisher);
if (typeof book.series !== "undefined") {
$("#series").val(book.series);
+ $("#series_index").val(book.series_index);
}
+ if (typeof book.identifiers !== "undefined") {
+ populateIdentifiers(book.identifiers)
+ }
+ }
+
/**
 * Write the identifiers of a metadata result into the identifier table.
 *
 * For every identifier type that already has a row (an input named
 * "identifier-type-<type>"), the matching value input is updated; for every
 * other type a new row is added via addIdentifier().
 *
 * @param {Object} identifiers  map of identifier type to identifier value.
 */
function populateIdentifiers(identifiers) {
    for (const property in identifiers) {
        // Skip anything inherited through the prototype chain.
        if (!Object.prototype.hasOwnProperty.call(identifiers, property)) {
            continue;
        }
        var value = identifiers[property];
        // A missing identifier must not overwrite or create a row.
        if (value === null || typeof value === "undefined") {
            continue;
        }
        if ($('input[name="identifier-type-' + property + '"]').length) {
            $('input[name="identifier-val-' + property + '"]').val(value);
        } else {
            addIdentifier(property, value);
        }
    }
}
+
/**
 * Append a new row for the identifier `name` / `value` to "#identifier-table".
 *
 * The row holds an input named "identifier-type-<name>", an input named
 * "identifier-val-<name>" (the names populateIdentifiers() looks up) and a
 * "Remove" link.
 *
 * NOTE(review): the HTML literals of this function were garbled in the
 * reviewed source (tags stripped, one literal split across lines), so the
 * row is rebuilt here from what is still visible: three cells and the
 * translated "Remove" label. The Bootstrap class names and the behavior of
 * the remove link are assumptions -- confirm against the existing
 * identifier rows of the edit form.
 *
 * Elements are created through jQuery with attribute maps instead of string
 * concatenation, so identifier names/values coming from an external
 * metadata source cannot inject markup.
 *
 * @param {string} name   identifier type, e.g. "isbn".
 * @param {string} value  identifier value.
 */
function addIdentifier(name, value) {
    var typeInput = $("<input>", {
        type: "text",
        "class": "form-control",
        name: "identifier-type-" + name,
        required: "required",
        value: name
    });
    var valueInput = $("<input>", {
        type: "text",
        "class": "form-control",
        name: "identifier-val-" + name,
        required: "required",
        value: value
    });
    var removeLink = $("<a>", {
        href: "#",
        "class": "btn btn-default",
        text: _("Remove")
    }).on("click", function (event) {
        event.preventDefault();
        $(this).closest("tr").remove();
    });
    var row = $("<tr>").append(
        $("<td>").append(typeInput),
        $("<td>").append(valueInput),
        $("<td>").append(removeLink)
    );
    $("#identifier-table").append(row);
}
function doSearch (keyword) {
diff --git a/requirements.txt b/requirements.txt
index 1db961fe..d1f58a8d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,5 @@ Wand>=0.4.4,<0.7.0
unidecode>=0.04.19,<1.3.0
lxml>=3.8.0,<4.7.0
flask-wtf>=0.14.2,<1.1.0
+markdown2==2.4.2
+html2text==2020.1.16