Improved cover extraction for epub files

This commit is contained in:
Ozzie Isaacs 2022-03-12 18:01:11 +01:00
parent 4379669cf8
commit 8e2536c53b
3 changed files with 95 additions and 66 deletions

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
# Copyright (C) 2018 OzzieIsaacs
# Copyright (C) 2018-2022 OzzieIsaacs
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -18,19 +18,16 @@
import os
from . import logger, isoLanguages
from . import logger, isoLanguages, cover
from .constants import BookMeta
log = logger.create()
try:
from wand.image import Image
use_IM = True
except (ImportError, RuntimeError) as e:
use_IM = False
log = logger.create()
try:
from comicapi.comicarchive import ComicArchive, MetaDataStyle
@ -51,29 +48,8 @@ except (ImportError, LookupError) as e:
use_rarfile = False
use_comic_meta = False
NO_JPEG_EXTENSIONS = ['.png', '.webp', '.bmp']
COVER_EXTENSIONS = ['.png', '.webp', '.bmp', '.jpg', '.jpeg']
def _cover_processing(tmp_file_name, img, extension):
tmp_cover_name = os.path.join(os.path.dirname(tmp_file_name), 'cover.jpg')
if extension in NO_JPEG_EXTENSIONS:
if use_IM:
with Image(blob=img) as imgc:
imgc.format = 'jpeg'
imgc.transform_colorspace('rgb')
imgc.save(filename=tmp_cover_name)
return tmp_cover_name
else:
return None
if img:
with open(tmp_cover_name, 'wb') as f:
f.write(img)
return tmp_cover_name
else:
return None
def _extract_Cover_from_archive(original_file_extension, tmp_file_name, rarExecutable):
def _extract_cover_from_archive(original_file_extension, tmp_file_name, rar_executable):
cover_data = extension = None
if original_file_extension.upper() == '.CBZ':
cf = zipfile.ZipFile(tmp_file_name)
@ -81,7 +57,7 @@ def _extract_Cover_from_archive(original_file_extension, tmp_file_name, rarExecu
ext = os.path.splitext(name)
if len(ext) > 1:
extension = ext[1].lower()
if extension in COVER_EXTENSIONS:
if extension in cover.COVER_EXTENSIONS:
cover_data = cf.read(name)
break
elif original_file_extension.upper() == '.CBT':
@ -90,44 +66,44 @@ def _extract_Cover_from_archive(original_file_extension, tmp_file_name, rarExecu
ext = os.path.splitext(name)
if len(ext) > 1:
extension = ext[1].lower()
if extension in COVER_EXTENSIONS:
if extension in cover.COVER_EXTENSIONS:
cover_data = cf.extractfile(name).read()
break
elif original_file_extension.upper() == '.CBR' and use_rarfile:
try:
rarfile.UNRAR_TOOL = rarExecutable
rarfile.UNRAR_TOOL = rar_executable
cf = rarfile.RarFile(tmp_file_name)
for name in cf.namelist():
ext = os.path.splitext(name)
if len(ext) > 1:
extension = ext[1].lower()
if extension in COVER_EXTENSIONS:
if extension in cover.COVER_EXTENSIONS:
cover_data = cf.read(name)
break
except Exception as ex:
log.debug('Rarfile failed with error: %s', ex)
log.debug('Rarfile failed with error: {}'.format(ex))
return cover_data, extension
def _extractCover(tmp_file_name, original_file_extension, rarExecutable):
def _extract_cover(tmp_file_name, original_file_extension, rar_executable):
cover_data = extension = None
if use_comic_meta:
archive = ComicArchive(tmp_file_name, rar_exe_path=rarExecutable)
archive = ComicArchive(tmp_file_name, rar_exe_path=rar_executable)
for index, name in enumerate(archive.getPageNameList()):
ext = os.path.splitext(name)
if len(ext) > 1:
extension = ext[1].lower()
if extension in COVER_EXTENSIONS:
if extension in cover.COVER_EXTENSIONS:
cover_data = archive.getPage(index)
break
else:
cover_data, extension = _extract_Cover_from_archive(original_file_extension, tmp_file_name, rarExecutable)
return _cover_processing(tmp_file_name, cover_data, extension)
cover_data, extension = _extract_cover_from_archive(original_file_extension, tmp_file_name, rar_executable)
return cover.cover_processing(tmp_file_name, cover_data, extension)
def get_comic_info(tmp_file_path, original_file_name, original_file_extension, rarExecutable):
def get_comic_info(tmp_file_path, original_file_name, original_file_extension, rar_executable):
if use_comic_meta:
archive = ComicArchive(tmp_file_path, rar_exe_path=rarExecutable)
archive = ComicArchive(tmp_file_path, rar_exe_path=rar_executable)
if archive.seemsToBeAComicArchive():
if archive.hasMetadata(MetaDataStyle.CIX):
style = MetaDataStyle.CIX
@ -137,23 +113,23 @@ def get_comic_info(tmp_file_path, original_file_name, original_file_extension, r
style = None
# if style is not None:
loadedMetadata = archive.readMetadata(style)
loaded_metadata = archive.readMetadata(style)
lang = loadedMetadata.language or ""
loadedMetadata.language = isoLanguages.get_lang3(lang)
lang = loaded_metadata.language or ""
loaded_metadata.language = isoLanguages.get_lang3(lang)
return BookMeta(
file_path=tmp_file_path,
extension=original_file_extension,
title=loadedMetadata.title or original_file_name,
title=loaded_metadata.title or original_file_name,
author=" & ".join([credit["person"]
for credit in loadedMetadata.credits if credit["role"] == "Writer"]) or u'Unknown',
cover=_extractCover(tmp_file_path, original_file_extension, rarExecutable),
description=loadedMetadata.comments or "",
for credit in loaded_metadata.credits if credit["role"] == "Writer"]) or 'Unknown',
cover=_extract_cover(tmp_file_path, original_file_extension, rar_executable),
description=loaded_metadata.comments or "",
tags="",
series=loadedMetadata.series or "",
series_id=loadedMetadata.issue or "",
languages=loadedMetadata.language,
series=loaded_metadata.series or "",
series_id=loaded_metadata.issue or "",
languages=loaded_metadata.language,
publisher="")
return BookMeta(
@ -161,7 +137,7 @@ def get_comic_info(tmp_file_path, original_file_name, original_file_extension, r
extension=original_file_extension,
title=original_file_name,
author=u'Unknown',
cover=_extractCover(tmp_file_path, original_file_extension, rarExecutable),
cover=_extract_cover(tmp_file_path, original_file_extension, rar_executable),
description="",
tags="",
series="",

48
cps/cover.py Normal file
View File

@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
# Copyright (C) 2022 OzzieIsaacs
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
try:
from wand.image import Image
use_IM = True
except (ImportError, RuntimeError) as e:
use_IM = False
NO_JPEG_EXTENSIONS = ['.png', '.webp', '.bmp']
COVER_EXTENSIONS = ['.png', '.webp', '.bmp', '.jpg', '.jpeg']
def cover_processing(tmp_file_name, img, extension):
tmp_cover_name = os.path.join(os.path.dirname(tmp_file_name), 'cover.jpg')
if extension in NO_JPEG_EXTENSIONS:
if use_IM:
with Image(blob=img) as imgc:
imgc.format = 'jpeg'
imgc.transform_colorspace('rgb')
imgc.save(filename=tmp_cover_name)
return tmp_cover_name
else:
return None
if img:
with open(tmp_cover_name, 'wb') as f:
f.write(img)
return tmp_cover_name
else:
return None

View File

@ -20,23 +20,26 @@ import os
import zipfile
from lxml import etree
from . import isoLanguages
from . import isoLanguages, cover
from .helper import split_authors
from .constants import BookMeta
def extract_cover(zip_file, cover_file, cover_path, tmp_file_name):
def _extract_cover(zip_file, cover_file, cover_path, tmp_file_name):
if cover_file is None:
return None
else:
cf = extension = None
zip_cover_path = os.path.join(cover_path, cover_file).replace('\\', '/')
cf = zip_file.read(zip_cover_path)
prefix = os.path.splitext(tmp_file_name)[0]
tmp_cover_name = prefix + '.' + os.path.basename(zip_cover_path)
image = open(tmp_cover_name, 'wb')
image.write(cf)
image.close()
return tmp_cover_name
ext = os.path.splitext(tmp_cover_name)
if len(ext) > 1:
extension = ext[1].lower()
if extension in cover.COVER_EXTENSIONS:
cf = zip_file.read(zip_cover_path)
return cover.cover_processing(tmp_file_name, cf, extension)
def get_epub_info(tmp_file_path, original_file_name, original_file_extension):
@ -70,9 +73,9 @@ def get_epub_info(tmp_file_path, original_file_name, original_file_extension):
else:
epub_metadata[s] = tmp[0]
else:
epub_metadata[s] = u'Unknown'
epub_metadata[s] = 'Unknown'
if epub_metadata['subject'] == u'Unknown':
if epub_metadata['subject'] == 'Unknown':
epub_metadata['subject'] = ''
if epub_metadata['description'] == u'Unknown':
@ -112,7 +115,7 @@ def parse_epub_cover(ns, tree, epub_zip, cover_path, tmp_file_path):
cover_section = tree.xpath("/pkg:package/pkg:manifest/pkg:item[@id='cover-image']/@href", namespaces=ns)
cover_file = None
if len(cover_section) > 0:
cover_file = extract_cover(epub_zip, cover_section[0], cover_path, tmp_file_path)
cover_file = _extract_cover(epub_zip, cover_section[0], cover_path, tmp_file_path)
else:
meta_cover = tree.xpath("/pkg:package/pkg:metadata/pkg:meta[@name='cover']/@content", namespaces=ns)
if len(meta_cover) > 0:
@ -123,10 +126,10 @@ def parse_epub_cover(ns, tree, epub_zip, cover_path, tmp_file_path):
"/pkg:package/pkg:manifest/pkg:item[@properties='" + meta_cover[0] + "']/@href", namespaces=ns)
else:
cover_section = tree.xpath("/pkg:package/pkg:guide/pkg:reference/@href", namespaces=ns)
if len(cover_section) > 0:
filetype = cover_section[0].rsplit('.', 1)[-1]
for cs in cover_section:
filetype = cs.rsplit('.', 1)[-1]
if filetype == "xhtml" or filetype == "html": # if cover is (x)html format
markup = epub_zip.read(os.path.join(cover_path, cover_section[0]))
markup = epub_zip.read(os.path.join(cover_path, cs))
markup_tree = etree.fromstring(markup)
# no matter xhtml or html with no namespace
img_src = markup_tree.xpath("//*[local-name() = 'img']/@src")
@ -137,9 +140,11 @@ def parse_epub_cover(ns, tree, epub_zip, cover_path, tmp_file_path):
# img_src maybe start with "../"" so fullpath join then relpath to cwd
filename = os.path.relpath(os.path.join(os.path.dirname(os.path.join(cover_path, cover_section[0])),
img_src[0]))
cover_file = extract_cover(epub_zip, filename, "", tmp_file_path)
cover_file = _extract_cover(epub_zip, filename, "", tmp_file_path)
else:
cover_file = extract_cover(epub_zip, cover_section[0], cover_path, tmp_file_path)
cover_file = _extract_cover(epub_zip, cs, cover_path, tmp_file_path)
if cover_file:
break
return cover_file