Parse XMP metadata in separate function, add exception, try multiple metadata formats

2021-03-16 17:53:33 +01:00 · 2021-03-16 17:53:33 +01:00 · 8abfaf0ffd
commit 8abfaf0ffd
parent 33e352819c
1 changed files with 68 additions and 33 deletions
--- a/cps/uploader.py
+++ b/cps/uploader.py
@ -22,11 +22,10 @@ import hashlib
 from tempfile import gettempdir
 from flask_babel import gettext as _
-from . import logger, comic, isoLanguages
+from . import logger, comic, isoLanguages, get_locale
 from .constants import BookMeta
 from .helper import split_authors
 log = logger.create()
@ -106,59 +105,95 @@ def default_meta(tmp_file_path, original_file_name, original_file_extension):
        languages="")
-def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
+def parse_xmp(pdf_file):
-    doc_info = None
+    """
-    if use_pdf_meta:
+    Parse XMP Metadata and prepare for BookMeta object 
-        with open(tmp_file_path, 'rb') as f:
+    """
-            pdf_file = PdfFileReader(f)
+    try:
            doc_info = pdf_file.getDocumentInfo()
        xmp_info = pdf_file.getXmpMetadata()
    except Exception as e:
        log.debug('Can not read XMP metadata', e)
        return None
    if xmp_info:
-        xmp_author = xmp_info.dc_creator
+        try:
            xmp_author = xmp_info.dc_creator # list
        except:
            xmp_author = ['Unknown']
        if xmp_info.dc_title: 
            xmp_title = xmp_info.dc_title['x-default']
        else:
            xmp_title = ''
        if xmp_info.dc_description:
            xmp_description = xmp_info.dc_description['x-default']
        else:
            xmp_description = ''
        xmp_tags = ', '.join(xmp_info.dc_subject)
        xmp_language = xmp_info.dc_language[0]
        xmp_publisher = ', '.join(xmp_info.dc_publisher)
-    if xmp_info or doc_info:
+        languages = []
-        author = xmp_author or split_authors([doc_info.author]) or u'Unknown'
+        for i in xmp_info.dc_language:
-        title = xmp_title or doc_info.title or original_file_name
+            #calibre-web currently only takes one language.
-        subject = xmp_description or doc_info.subject
+            languages.append(isoLanguages.get_lang3(i))
-        publisher = xmp_publisher
+        
-        tags = xmp_tags or doc_info['/Keywords'] 
+        xmp_tags = ', '.join(xmp_info.dc_subject)
-        if xmp_language :
+        xmp_publisher = ', '.join(xmp_info.dc_publisher)
-            lang = xmp_language.split('-', 1)[0].lower()
+        xmp_languages = xmp_info.dc_language
-            if len(lang) == 2:
+
-                language = isoLanguages.get(part1=lang).name
+        return {'author': xmp_author,
-            elif len(lang) == 3:
+                    'title': xmp_title,
-                language = isoLanguages.get(part3=lang).name
+                    'subject': xmp_description,
                    'tags': xmp_tags, 'languages': languages,
                    'publisher': xmp_publisher
                    }
 def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    doc_info = None
    xmp_info = None
    if use_pdf_meta:
        with open(tmp_file_path, 'rb') as f:
            pdf_file = PdfFileReader(f)
            doc_info = pdf_file.getDocumentInfo()
            xmp_info = parse_xmp(pdf_file)
    if xmp_info:
        author = ' & '.join(split_authors(xmp_info['author']))
        title = xmp_info['title']
        subject = xmp_info['subject']
        tags = xmp_info['tags']
        languages = xmp_info['languages']
        publisher = xmp_info['publisher']
    elif doc_info:
        author = ' & '.join(split_authors([doc_info.author]))
        title = doc_info.title
        subject = doc_info.subject
        tags = doc_info['/Keywords']
        languages = ""
        publisher = ""
    else:
-                language = ''
+        author= u'Unknown'
        else:
            language = ''
    else:
        author = u'Unknown'
        title = original_file_name
        subject = ""
        tags = ""
        languages = ""
        publisher = ""
    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
-        author=' & '.join(author),
+        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags=tags,
        series="",
        series_id="",
-        languages=language)
+        languages=', '.join(languages)
        )
 def pdf_preview(tmp_file_path, tmp_dir):