Parse XMP metadata in separate function, add exception, try multiple metadata formats

This commit is contained in:
rra 2021-03-16 17:53:33 +01:00
parent 33e352819c
commit 8abfaf0ffd

View File

@ -22,11 +22,10 @@ import hashlib
from tempfile import gettempdir from tempfile import gettempdir
from flask_babel import gettext as _ from flask_babel import gettext as _
from . import logger, comic, isoLanguages from . import logger, comic, isoLanguages, get_locale
from .constants import BookMeta from .constants import BookMeta
from .helper import split_authors from .helper import split_authors
log = logger.create() log = logger.create()
@ -82,7 +81,7 @@ def process(tmp_file_path, original_file_name, original_file_extension, rarExecu
original_file_name, original_file_name,
original_file_extension, original_file_extension,
rarExecutable) rarExecutable)
except Exception as ex: except Exception as ex:
log.warning('cannot parse metadata, using default: %s', ex) log.warning('cannot parse metadata, using default: %s', ex)
if meta and meta.title.strip() and meta.author.strip(): if meta and meta.title.strip() and meta.author.strip():
@ -106,59 +105,95 @@ def default_meta(tmp_file_path, original_file_name, original_file_extension):
languages="") languages="")
def pdf_meta(tmp_file_path, original_file_name, original_file_extension): def parse_xmp(pdf_file):
doc_info = None """
if use_pdf_meta: Parse XMP Metadata and prepare for BookMeta object
with open(tmp_file_path, 'rb') as f: """
pdf_file = PdfFileReader(f) try:
doc_info = pdf_file.getDocumentInfo() xmp_info = pdf_file.getXmpMetadata()
xmp_info = pdf_file.getXmpMetadata() except Exception as e:
log.debug('Can not read XMP metadata', e)
return None
if xmp_info: if xmp_info:
xmp_author = xmp_info.dc_creator try:
xmp_author = xmp_info.dc_creator # list
except:
xmp_author = ['Unknown']
if xmp_info.dc_title: if xmp_info.dc_title:
xmp_title = xmp_info.dc_title['x-default'] xmp_title = xmp_info.dc_title['x-default']
else: else:
xmp_title = '' xmp_title = ''
if xmp_info.dc_description: if xmp_info.dc_description:
xmp_description = xmp_info.dc_description['x-default'] xmp_description = xmp_info.dc_description['x-default']
else: else:
xmp_description = '' xmp_description = ''
xmp_tags = ', '.join(xmp_info.dc_subject)
xmp_language = xmp_info.dc_language[0]
xmp_publisher = ', '.join(xmp_info.dc_publisher)
if xmp_info or doc_info: languages = []
author = xmp_author or split_authors([doc_info.author]) or u'Unknown' for i in xmp_info.dc_language:
title = xmp_title or doc_info.title or original_file_name #calibre-web currently only takes one language.
subject = xmp_description or doc_info.subject languages.append(isoLanguages.get_lang3(i))
publisher = xmp_publisher
tags = xmp_tags or doc_info['/Keywords'] xmp_tags = ', '.join(xmp_info.dc_subject)
if xmp_language : xmp_publisher = ', '.join(xmp_info.dc_publisher)
lang = xmp_language.split('-', 1)[0].lower() xmp_languages = xmp_info.dc_language
if len(lang) == 2:
language = isoLanguages.get(part1=lang).name return {'author': xmp_author,
elif len(lang) == 3: 'title': xmp_title,
language = isoLanguages.get(part3=lang).name 'subject': xmp_description,
else: 'tags': xmp_tags, 'languages': languages,
language = '' 'publisher': xmp_publisher
else: }
language = ''
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
doc_info = None
xmp_info = None
if use_pdf_meta:
with open(tmp_file_path, 'rb') as f:
pdf_file = PdfFileReader(f)
doc_info = pdf_file.getDocumentInfo()
xmp_info = parse_xmp(pdf_file)
if xmp_info:
author = ' & '.join(split_authors(xmp_info['author']))
title = xmp_info['title']
subject = xmp_info['subject']
tags = xmp_info['tags']
languages = xmp_info['languages']
publisher = xmp_info['publisher']
elif doc_info:
author = ' & '.join(split_authors([doc_info.author]))
title = doc_info.title
subject = doc_info.subject
tags = doc_info['/Keywords']
languages = ""
publisher = ""
else: else:
author = u'Unknown' author= u'Unknown'
title = original_file_name title = original_file_name
subject = "" subject = ""
tags = ""
languages = ""
publisher = ""
return BookMeta( return BookMeta(
file_path=tmp_file_path, file_path=tmp_file_path,
extension=original_file_extension, extension=original_file_extension,
title=title, title=title,
author=' & '.join(author), author=author,
cover=pdf_preview(tmp_file_path, original_file_name), cover=pdf_preview(tmp_file_path, original_file_name),
description=subject, description=subject,
tags=tags, tags=tags,
series="", series="",
series_id="", series_id="",
languages=language) languages=', '.join(languages)
)
def pdf_preview(tmp_file_path, tmp_dir): def pdf_preview(tmp_file_path, tmp_dir):