mirror of
https://github.com/janeczku/calibre-web
synced 2025-01-13 02:40:29 +00:00
Fuzzy match book titles to eliminate duplicates
Titles with a Levenshtein ratio greater than 70% (after removing any parenthesised text from the Goodreads title) are considered duplicates.
This commit is contained in:
parent
53c687251e
commit
58abc1d024
33
cps/web.py
33
cps/web.py
@ -13,6 +13,12 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
goodreads_support = False
|
goodreads_support = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
import Levenshtein
|
||||||
|
levenshtein_support = True
|
||||||
|
except ImportError:
|
||||||
|
levenshtein_support = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -1138,17 +1144,32 @@ def author(book_id, page):
|
|||||||
if goodreads_support and config.config_use_goodreads:
|
if goodreads_support and config.config_use_goodreads:
|
||||||
gc = GoodreadsClient(config.config_goodreads_api_key, config.config_goodreads_api_secret)
|
gc = GoodreadsClient(config.config_goodreads_api_key, config.config_goodreads_api_secret)
|
||||||
author_info = gc.find_author(author_name=name)
|
author_info = gc.find_author(author_name=name)
|
||||||
|
other_books = get_unique_other_books(entries.all(), author_info.books)
|
||||||
# Get all identifiers (ISBN, Goodreads, etc) and filter author's books by that list so we show fewer duplicates
|
|
||||||
# Note: Not all images will be shown, even though they're available on Goodreads.com.
|
|
||||||
# See https://www.goodreads.com/topic/show/18213769-goodreads-book-images
|
|
||||||
identifiers = reduce(lambda acc, book: acc + map(lambda identifier: identifier.val, book.identifiers), entries.all(), [])
|
|
||||||
other_books = filter(lambda book: book.isbn not in identifiers and book.gid["#text"] not in identifiers, author_info.books)
|
|
||||||
|
|
||||||
return render_title_template('author.html', entries=entries, pagination=pagination,
|
return render_title_template('author.html', entries=entries, pagination=pagination,
|
||||||
title=name, author=author_info, other_books=other_books)
|
title=name, author=author_info, other_books=other_books)
|
||||||
|
|
||||||
|
|
||||||
|
def get_unique_other_books(library_books, author_books):
    """Return the author's books that do not appear to be in the library already.

    A book from ``author_books`` is dropped when either

    * its ``isbn`` or its ``gid["#text"]`` equals the ``val`` of any identifier
      attached to a book in ``library_books``, or
    * (only when the optional ``Levenshtein`` module was imported) its title,
      with any parenthesised text removed, has a Levenshtein ratio greater
      than 0.7 against the title of any library book.

    :param library_books: iterable of library books; each must expose
        ``identifiers`` (an iterable of objects with a ``val`` attribute)
        and ``title``.
    :param author_books: iterable of the author's books; each must expose
        ``isbn``, ``gid`` (indexable by ``"#text"``) and ``title``.
    :return: a list of the remaining author books, in their original order.
    """
    # Materialise once: the library is walked twice below, so a one-shot
    # iterator must not be consumed by the first pass.
    library_books = list(library_books)

    # Get all identifiers (ISBN, Goodreads, etc) and filter author's books by that list so we show fewer duplicates
    # Note: Not all images will be shown, even though they're available on Goodreads.com.
    # See https://www.goodreads.com/topic/show/18213769-goodreads-book-images
    #
    # Kept as a list (not a set) on purpose: membership is then tested with
    # ``==`` only, so an unhashable ``isbn``/``gid`` value cannot raise.
    identifiers = [identifier.val
                   for book in library_books
                   for identifier in book.identifiers]
    other_books = [book for book in author_books
                   if book.isbn not in identifiers
                   and book.gid["#text"] not in identifiers]

    # Fuzzy match book titles
    if levenshtein_support:
        library_titles = [book.title for book in library_books]

        def is_duplicate(author_book):
            # Remove items in parentheses before comparing; done once per
            # author book instead of once per (author book, library title).
            stripped_title = re.sub(r"\(.*\)", "", author_book.title)
            # any() gives a real boolean on both Python 2 and 3, whereas
            # ``not filter(...)`` is always False on Python 3 because a
            # filter object is truthy even when it would yield nothing.
            return any(Levenshtein.ratio(stripped_title, library_title) > 0.7
                       for library_title in library_titles)

        other_books = [author_book for author_book in other_books
                       if not is_duplicate(author_book)]

    return other_books
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/series")
|
@app.route("/series")
|
||||||
@login_required_if_no_ano
|
@login_required_if_no_ano
|
||||||
def series_list():
|
def series_list():
|
||||||
|
@ -11,4 +11,5 @@ PyYAML==3.12
|
|||||||
rsa==3.4.2
|
rsa==3.4.2
|
||||||
six==1.10.0
|
six==1.10.0
|
||||||
uritemplate==3.0.0
|
uritemplate==3.0.0
|
||||||
goodreads==0.3.2
|
goodreads>=0.3.2
|
||||||
|
python-Levenshtein>=0.12.0
|
||||||
|
Loading…
Reference in New Issue
Block a user