From 8e2536c53b8978ec05c9cb84615a179c38aa5080 Mon Sep 17 00:00:00 2001 From: Ozzie Isaacs Date: Sat, 12 Mar 2022 18:01:11 +0100 Subject: [PATCH] Improved cover extraction for epub files --- cps/comic.py | 78 ++++++++++++++++++---------------------------------- cps/cover.py | 48 ++++++++++++++++++++++++++++++++ cps/epub.py | 35 +++++++++++++---------- 3 files changed, 95 insertions(+), 66 deletions(-) create mode 100644 cps/cover.py diff --git a/cps/comic.py b/cps/comic.py index 9e7f4f8f..2549579e 100644 --- a/cps/comic.py +++ b/cps/comic.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web) -# Copyright (C) 2018 OzzieIsaacs +# Copyright (C) 2018-2022 OzzieIsaacs # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -18,19 +18,16 @@ import os -from . import logger, isoLanguages +from . import logger, isoLanguages, cover from .constants import BookMeta - -log = logger.create() - - try: from wand.image import Image use_IM = True except (ImportError, RuntimeError) as e: use_IM = False +log = logger.create() try: from comicapi.comicarchive import ComicArchive, MetaDataStyle @@ -51,29 +48,8 @@ except (ImportError, LookupError) as e: use_rarfile = False use_comic_meta = False -NO_JPEG_EXTENSIONS = ['.png', '.webp', '.bmp'] -COVER_EXTENSIONS = ['.png', '.webp', '.bmp', '.jpg', '.jpeg'] -def _cover_processing(tmp_file_name, img, extension): - tmp_cover_name = os.path.join(os.path.dirname(tmp_file_name), 'cover.jpg') - if extension in NO_JPEG_EXTENSIONS: - if use_IM: - with Image(blob=img) as imgc: - imgc.format = 'jpeg' - imgc.transform_colorspace('rgb') - imgc.save(filename=tmp_cover_name) - return tmp_cover_name - else: - return None - if img: - with open(tmp_cover_name, 'wb') as f: - f.write(img) - return tmp_cover_name - else: - return None - - -def _extract_Cover_from_archive(original_file_extension, tmp_file_name, rarExecutable): +def _extract_cover_from_archive(original_file_extension, tmp_file_name, rar_executable): cover_data = extension = None if original_file_extension.upper() == '.CBZ': cf = zipfile.ZipFile(tmp_file_name) @@ -81,7 +57,7 @@ def _extract_Cover_from_archive(original_file_extension, tmp_file_name, rarExecu ext = os.path.splitext(name) if len(ext) > 1: extension = ext[1].lower() - if extension in COVER_EXTENSIONS: + if extension in cover.COVER_EXTENSIONS: cover_data = cf.read(name) break elif original_file_extension.upper() == '.CBT': @@ -90,44 +66,44 @@ def _extract_Cover_from_archive(original_file_extension, tmp_file_name, rarExecu ext = os.path.splitext(name) if len(ext) > 1: extension = ext[1].lower() - if extension in COVER_EXTENSIONS: + if extension in cover.COVER_EXTENSIONS: cover_data = cf.extractfile(name).read() break elif original_file_extension.upper() == '.CBR' and use_rarfile: try: - rarfile.UNRAR_TOOL = rarExecutable + rarfile.UNRAR_TOOL = rar_executable cf = rarfile.RarFile(tmp_file_name) for name in cf.namelist(): ext = os.path.splitext(name) if len(ext) > 1: extension = ext[1].lower() - if extension in COVER_EXTENSIONS: + if extension in cover.COVER_EXTENSIONS: cover_data = cf.read(name) break except Exception as ex: - log.debug('Rarfile failed with error: %s', ex) + log.debug('Rarfile failed with error: {}'.format(ex)) return cover_data, extension -def _extractCover(tmp_file_name, original_file_extension, rarExecutable): +def _extract_cover(tmp_file_name, original_file_extension, rar_executable): cover_data = extension = None if use_comic_meta: - archive = ComicArchive(tmp_file_name, rar_exe_path=rarExecutable) + archive = ComicArchive(tmp_file_name, rar_exe_path=rar_executable) for index, name in enumerate(archive.getPageNameList()): ext = os.path.splitext(name) if len(ext) > 1: extension = ext[1].lower() - if extension in COVER_EXTENSIONS: + if extension in cover.COVER_EXTENSIONS: cover_data = archive.getPage(index) break else: - cover_data, extension = _extract_Cover_from_archive(original_file_extension, tmp_file_name, rarExecutable) - return _cover_processing(tmp_file_name, cover_data, extension) + cover_data, extension = _extract_cover_from_archive(original_file_extension, tmp_file_name, rar_executable) + return cover.cover_processing(tmp_file_name, cover_data, extension) -def get_comic_info(tmp_file_path, original_file_name, original_file_extension, rarExecutable): +def get_comic_info(tmp_file_path, original_file_name, original_file_extension, rar_executable): if use_comic_meta: - archive = ComicArchive(tmp_file_path, rar_exe_path=rarExecutable) + archive = ComicArchive(tmp_file_path, rar_exe_path=rar_executable) if archive.seemsToBeAComicArchive(): if archive.hasMetadata(MetaDataStyle.CIX): style = MetaDataStyle.CIX @@ -137,23 +113,23 @@ def get_comic_info(tmp_file_path, original_file_name, original_file_extension, r style = None # if style is not None: - loadedMetadata = archive.readMetadata(style) + loaded_metadata = archive.readMetadata(style) - lang = loadedMetadata.language or "" - loadedMetadata.language = isoLanguages.get_lang3(lang) + lang = loaded_metadata.language or "" + loaded_metadata.language = isoLanguages.get_lang3(lang) return BookMeta( file_path=tmp_file_path, extension=original_file_extension, - title=loadedMetadata.title or original_file_name, + title=loaded_metadata.title or original_file_name, author=" & ".join([credit["person"] - for credit in loadedMetadata.credits if credit["role"] == "Writer"]) or u'Unknown', - cover=_extractCover(tmp_file_path, original_file_extension, rarExecutable), - description=loadedMetadata.comments or "", + for credit in loaded_metadata.credits if credit["role"] == "Writer"]) or 'Unknown', + cover=_extract_cover(tmp_file_path, original_file_extension, rar_executable), + description=loaded_metadata.comments or "", tags="", - series=loadedMetadata.series or "", - series_id=loadedMetadata.issue or "", - languages=loadedMetadata.language, + series=loaded_metadata.series or "", + series_id=loaded_metadata.issue or "", + languages=loaded_metadata.language, publisher="") return BookMeta( @@ -161,7 +137,7 @@ def get_comic_info(tmp_file_path, original_file_name, original_file_extension, r extension=original_file_extension, title=original_file_name, author=u'Unknown', - cover=_extractCover(tmp_file_path, original_file_extension, rarExecutable), + cover=_extract_cover(tmp_file_path, original_file_extension, rar_executable), description="", tags="", series="", diff --git a/cps/cover.py b/cps/cover.py new file mode 100644 index 00000000..5dd29534 --- /dev/null +++ b/cps/cover.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- + +# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web) +# Copyright (C) 2022 OzzieIsaacs +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import os + +try: + from wand.image import Image + use_IM = True +except (ImportError, RuntimeError) as e: + use_IM = False + + +NO_JPEG_EXTENSIONS = ['.png', '.webp', '.bmp'] +COVER_EXTENSIONS = ['.png', '.webp', '.bmp', '.jpg', '.jpeg'] + + +def cover_processing(tmp_file_name, img, extension): + tmp_cover_name = os.path.join(os.path.dirname(tmp_file_name), 'cover.jpg') + if extension in NO_JPEG_EXTENSIONS: + if use_IM: + with Image(blob=img) as imgc: + imgc.format = 'jpeg' + imgc.transform_colorspace('rgb') + imgc.save(filename=tmp_cover_name) + return tmp_cover_name + else: + return None + if img: + with open(tmp_cover_name, 'wb') as f: + f.write(img) + return tmp_cover_name + else: + return None diff --git a/cps/epub.py b/cps/epub.py index b436a755..563590e8 100644 --- a/cps/epub.py +++ b/cps/epub.py @@ -20,23 +20,26 @@ import os import zipfile from lxml import etree -from . import isoLanguages +from . import isoLanguages, cover from .helper import split_authors from .constants import BookMeta -def extract_cover(zip_file, cover_file, cover_path, tmp_file_name): +def _extract_cover(zip_file, cover_file, cover_path, tmp_file_name): if cover_file is None: return None else: + cf = extension = None zip_cover_path = os.path.join(cover_path, cover_file).replace('\\', '/') - cf = zip_file.read(zip_cover_path) + prefix = os.path.splitext(tmp_file_name)[0] tmp_cover_name = prefix + '.' + os.path.basename(zip_cover_path) - image = open(tmp_cover_name, 'wb') - image.write(cf) - image.close() - return tmp_cover_name + ext = os.path.splitext(tmp_cover_name) + if len(ext) > 1: + extension = ext[1].lower() + if extension in cover.COVER_EXTENSIONS: + cf = zip_file.read(zip_cover_path) + return cover.cover_processing(tmp_file_name, cf, extension) def get_epub_info(tmp_file_path, original_file_name, original_file_extension): @@ -70,9 +73,9 @@ def get_epub_info(tmp_file_path, original_file_name, original_file_extension): else: epub_metadata[s] = tmp[0] else: - epub_metadata[s] = u'Unknown' + epub_metadata[s] = 'Unknown' - if epub_metadata['subject'] == u'Unknown': + if epub_metadata['subject'] == 'Unknown': epub_metadata['subject'] = '' if epub_metadata['description'] == u'Unknown': @@ -112,7 +115,7 @@ def parse_epub_cover(ns, tree, epub_zip, cover_path, tmp_file_path): cover_section = tree.xpath("/pkg:package/pkg:manifest/pkg:item[@id='cover-image']/@href", namespaces=ns) cover_file = None if len(cover_section) > 0: - cover_file = extract_cover(epub_zip, cover_section[0], cover_path, tmp_file_path) + cover_file = _extract_cover(epub_zip, cover_section[0], cover_path, tmp_file_path) else: meta_cover = tree.xpath("/pkg:package/pkg:metadata/pkg:meta[@name='cover']/@content", namespaces=ns) if len(meta_cover) > 0: @@ -123,10 +126,10 @@ def parse_epub_cover(ns, tree, epub_zip, cover_path, tmp_file_path): "/pkg:package/pkg:manifest/pkg:item[@properties='" + meta_cover[0] + "']/@href", namespaces=ns) else: cover_section = tree.xpath("/pkg:package/pkg:guide/pkg:reference/@href", namespaces=ns) - if len(cover_section) > 0: - filetype = cover_section[0].rsplit('.', 1)[-1] + for cs in cover_section: + filetype = cs.rsplit('.', 1)[-1] if filetype == "xhtml" or filetype == "html": # if cover is (x)html format - markup = epub_zip.read(os.path.join(cover_path, cover_section[0])) + markup = epub_zip.read(os.path.join(cover_path, cs)) markup_tree = etree.fromstring(markup) # no matter xhtml or html with no namespace img_src = markup_tree.xpath("//*[local-name() = 'img']/@src") @@ -137,9 +140,11 @@ def parse_epub_cover(ns, tree, epub_zip, cover_path, tmp_file_path): # img_src maybe start with "../"" so fullpath join then relpath to cwd filename = os.path.relpath(os.path.join(os.path.dirname(os.path.join(cover_path, cover_section[0])), img_src[0])) - cover_file = extract_cover(epub_zip, filename, "", tmp_file_path) + cover_file = _extract_cover(epub_zip, filename, "", tmp_file_path) else: - cover_file = extract_cover(epub_zip, cover_section[0], cover_path, tmp_file_path) + cover_file = _extract_cover(epub_zip, cs, cover_path, tmp_file_path) + if cover_file: + break return cover_file