Fuzzy match book titles to eliminate duplicates

Titles with a Levenshtein ratio greater than 70% (compared after removing any parenthesised text from the Goodreads title) are considered duplicates.
2025-11-11 12:53:04 +00:00 · 2017-08-24 08:53:53 -07:00
parent 53c687251e
commit 58abc1d024
2 changed files with 29 additions and 7 deletions
--- a/cps/web.py
+++ b/cps/web.py
@@ -13,6 +13,12 @@ try:
 except ImportError:
    goodreads_support = False
 try:
    import Levenshtein
    levenshtein_support = True
 except ImportError:
    levenshtein_support = False
 try:
    from functools import reduce
 except ImportError:
@@ -1138,17 +1144,32 @@ def author(book_id, page):
    if goodreads_support and config.config_use_goodreads:
        gc = GoodreadsClient(config.config_goodreads_api_key, config.config_goodreads_api_secret)
        author_info = gc.find_author(author_name=name)
-
+        other_books = get_unique_other_books(entries.all(), author_info.books)
        # Get all identifiers (ISBN, Goodreads, etc) and filter author's books by that list so we show fewer duplicates
        # Note: Not all images will be shown, even though they're available on Goodreads.com.
        #       See https://www.goodreads.com/topic/show/18213769-goodreads-book-images
        identifiers = reduce(lambda acc, book: acc + map(lambda identifier: identifier.val, book.identifiers), entries.all(), [])
        other_books = filter(lambda book: book.isbn not in identifiers and book.gid["#text"] not in identifiers, author_info.books)
    return render_title_template('author.html', entries=entries, pagination=pagination,
                                 title=name, author=author_info, other_books=other_books)
 def get_unique_other_books(library_books, author_books):
     """Return the books in ``author_books`` that do not look like library duplicates.

     A book is dropped when its ``isbn`` or its ``gid["#text"]`` equals any
     identifier value found on a library book. When the optional
     ``Levenshtein`` module was imported successfully, a book is also dropped
     if its title (with parenthesised text removed) has a Levenshtein ratio
     above 0.7 against any library title.

     :param library_books: sequence of objects exposing ``identifiers`` (each
         item having a ``val`` attribute) and ``title``. It is iterated more
         than once, so it must not be a one-shot iterator.
     :param author_books: iterable of objects exposing ``isbn``, ``gid`` (a
         mapping with a ``"#text"`` key) and ``title``.
     :return: a list of the remaining author books, in their original order.
     """
     # Get all identifiers (ISBN, Goodreads, etc) and filter author's books by that list so we show fewer duplicates
     # Note: Not all images will be shown, even though they're available on Goodreads.com.
     #       See https://www.goodreads.com/topic/show/18213769-goodreads-book-images
     # Kept as a list (not a set) so membership relies on equality only, exactly as before.
     identifiers = [identifier.val for book in library_books for identifier in book.identifiers]
     # Build real lists instead of relying on map()/filter() returning lists:
     # on Python 3 those are lazy iterators, which cannot be concatenated to a
     # list and are always truthy.
     other_books = [
         book for book in author_books
         if book.isbn not in identifiers and book.gid["#text"] not in identifiers
     ]

     # Fuzzy match book titles
     if levenshtein_support:
         library_titles = [book.title for book in library_books]
         # Greedy match: removes everything from the first "(" to the last ")".
         parenthesised = re.compile(r"\(.*\)")

         def is_in_library(author_book):
             # Remove items in parentheses once per book before comparing
             title = parenthesised.sub("", author_book.title)
             return any(
                 Levenshtein.ratio(title, library_title) > 0.7
                 for library_title in library_titles
             )

         other_books = [book for book in other_books if not is_in_library(book)]

     return other_books
@app.route("/series")
@login_required_if_no_ano
 def series_list():
--- a/optional-requirements.txt
+++ b/optional-requirements.txt
@@ -11,4 +11,5 @@ PyYAML==3.12
 rsa==3.4.2
 six==1.10.0
 uritemplate==3.0.0
-goodreads==0.3.2
+goodreads>=0.3.2
 python-Levenshtein>=0.12.0