From 027e103ce37b86c3aa3d0e703ddae4aeaa22e0d8 Mon Sep 17 00:00:00 2001 From: Pavel Yakunin Date: Sun, 5 Jun 2016 18:41:47 +0300 Subject: [PATCH 1/7] refactoring to make adding new formats possible --- cps/book_formats.py | 50 +++++++++++++++++++++++++++++++++++++++++++++ cps/uploader.py | 30 +++++++++++++++++++++++++++ cps/web.py | 43 +++++++++++++++++++------------------- 3 files changed, 101 insertions(+), 22 deletions(-) create mode 100644 cps/book_formats.py create mode 100644 cps/uploader.py diff --git a/cps/book_formats.py b/cps/book_formats.py new file mode 100644 index 00000000..4f0d16f4 --- /dev/null +++ b/cps/book_formats.py @@ -0,0 +1,50 @@ +__author__ = 'lemmsh' + +import uploader +import os +try: + from wand.image import Image + use_generic_pdf_cover = False +except ImportError, e: + use_generic_pdf_cover = True + +def process(tmp_file_path, original_file_name, original_file_extension): + if (".PDF" == original_file_extension.upper()): + return pdf_meta(tmp_file_path, original_file_name, original_file_extension) + else: return None + + +def pdf_meta(tmp_file_path, original_file_name, original_file_extension): + from PyPDF2 import PdfFileReader + pdf = PdfFileReader(open(tmp_file_path, 'rb')) + doc_info = pdf.getDocumentInfo() + print("!!!!!!!!!!!!!!") + print(doc_info.producer) + if (doc_info is not None): + author = doc_info.author + title = doc_info.title + subject = doc_info.subject + else: + author = "Unknown" + title = original_file_name + subject = "" + return uploader.BookMeta( + file_path = tmp_file_path, + extension = original_file_extension, + title = title, + author = author, + cover = pdf_preview(tmp_file_path, original_file_name), + description = subject, + tags = "", + series = "", + series_id="") + +def pdf_preview(tmp_file_path, tmp_dir): + if use_generic_pdf_cover: + return None + else: + cover_file_name = os.path.splitext(tmp_file_path)[0] + ".cover.jpg" + with Image(filename=tmp_file_path + "[0]", resolution=150) as img: + img.compression_quality = 88 + img.save(filename=os.path.join(tmp_dir, cover_file_name)) + return cover_file_name diff --git a/cps/uploader.py b/cps/uploader.py new file mode 100644 index 00000000..73d7f538 --- /dev/null +++ b/cps/uploader.py @@ -0,0 +1,30 @@ +import os +import hashlib +from collections import namedtuple +import book_formats + + +tmp_dir = "/tmp/calibre-web" + +BookMeta = namedtuple('BookMeta', 'file_path, extension, title, author, cover, description, tags, series, series_id') + + +""" + :rtype: BookMeta +""" +def upload(file): + if not os.path.isdir(tmp_dir): + os.mkdir(tmp_dir) + + filename = file.filename + filename_root, file_extension = os.path.splitext(filename) + md5 = hashlib.md5() + md5.update(filename) + tmp_file_path = os.path.join(tmp_dir, md5.hexdigest()) + file.save(tmp_file_path) + meta = book_formats.process(tmp_file_path, filename_root, file_extension) + return meta + + + + diff --git a/cps/web.py b/cps/web.py index 85dce9d6..6b71305b 100755 --- a/cps/web.py +++ b/cps/web.py @@ -23,6 +23,7 @@ import base64 from sqlalchemy.sql import * import json import datetime +import book_formats from uuid import uuid4 try: from wand.image import Image @@ -1075,6 +1076,8 @@ def edit_book(book_id): else: return render_template('edit_book.html', book=book, authors=author_names, cc=cc) +import uploader + @app.route("/upload", methods = ["GET", "POST"]) @login_required @upload_required @@ -1086,20 +1089,17 @@ def upload(): db.session.connection().connection.connection.create_function('uuid4', 0, lambda : str(uuid4())) if request.method == 'POST' and 'btn-upload' in request.files: file = request.files['btn-upload'] - filename = file.filename - filename_root, fileextension = os.path.splitext(filename) - if fileextension.upper() == ".PDF": - title = filename_root - author = "Unknown" - else: - flash("Upload is only available for PDF files", category="error") - return redirect(url_for('index')) - + meta = uploader.upload(file) + + title = meta.title + author = meta.author + + title_dir = helper.get_valid_filename(title, False) author_dir = helper.get_valid_filename(author.decode('utf-8'), False) data_name = title_dir filepath = config.DB_ROOT + "/" + author_dir + "/" + title_dir - saved_filename = filepath + "/" + data_name + fileextension + saved_filename = filepath + "/" + data_name + meta.extension if not os.path.exists(filepath): try: os.makedirs(filepath) @@ -1107,21 +1107,20 @@ def upload(): flash("Failed to create path %s (Permission denied)." % filepath, category="error") return redirect(url_for('index')) try: - file.save(saved_filename) + copyfile(meta.file_path, saved_filename) #remove as well except OSError: flash("Failed to store file %s (Permission denied)." % saved_filename, category="error") return redirect(url_for('index')) + file_size = os.path.getsize(saved_filename) - has_cover = 0 - if fileextension.upper() == ".PDF": - if use_generic_pdf_cover: - basedir = os.path.dirname(__file__) - copyfile(os.path.join(basedir, "static/generic_cover.jpg"), os.path.join(filepath, "cover.jpg")) - else: - with Image(filename=saved_filename + "[0]", resolution=150) as img: - img.compression_quality = 88 - img.save(filename=os.path.join(filepath, "cover.jpg")) - has_cover = 1 + if meta.cover is None: + has_cover = 0 + basedir = os.path.dirname(__file__) + copyfile(os.path.join(basedir, "static/generic_cover.jpg"), os.path.join(filepath, "cover.jpg")) + else: + has_cover = 1 + copyfile(meta.cover, os.path.join(filepath, "cover.jpg")) + is_author = db.session.query(db.Authors).filter(db.Authors.name == author).first() if is_author: db_author = is_author @@ -1131,7 +1130,7 @@ def upload(): path = os.path.join(author_dir, title_dir) db_book = db.Books(title, "", "", datetime.datetime.now(), datetime.datetime(101, 01,01), 1, datetime.datetime.now(), path, has_cover, db_author, []) db_book.authors.append(db_author) - db_data = db.Data(db_book, fileextension.upper()[1:], file_size, data_name) + db_data = db.Data(db_book, meta.extension.upper()[1:], file_size, data_name) db_book.data.append(db_data) db.session.add(db_book) From 44df873f331e529e1c6ba11794e953739af4d8e1 Mon Sep 17 00:00:00 2001 From: Pavel Yakunin Date: Sun, 5 Jun 2016 19:42:18 +0300 Subject: [PATCH 2/7] logging, tmp cleanup --- cps/book_formats.py | 22 +++++++++++++++++----- cps/web.py | 14 ++++++-------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/cps/book_formats.py b/cps/book_formats.py index 4f0d16f4..fa1808fe 100644 --- a/cps/book_formats.py +++ b/cps/book_formats.py @@ -1,12 +1,22 @@ __author__ = 'lemmsh' +import logging +logger = logging.getLogger("book_formats") + import uploader import os try: from wand.image import Image use_generic_pdf_cover = False except ImportError, e: + logger.warning('cannot import Image, generating pdf covers for pdf uploads will not work') use_generic_pdf_cover = True +try: + from PyPDF2 import PdfFileReader + use_pdf_meta = True +except ImportError, e: + logger.warning('cannot import PyPDF2, extracting pdf metadata will not work') + use_pdf_meta = False def process(tmp_file_path, original_file_name, original_file_extension): if (".PDF" == original_file_extension.upper()): @@ -15,11 +25,13 @@ def process(tmp_file_path, original_file_name, original_file_extension): def pdf_meta(tmp_file_path, original_file_name, original_file_extension): - from PyPDF2 import PdfFileReader - pdf = PdfFileReader(open(tmp_file_path, 'rb')) - doc_info = pdf.getDocumentInfo() - print("!!!!!!!!!!!!!!") - print(doc_info.producer) + + if (use_pdf_meta): + pdf = PdfFileReader(open(tmp_file_path, 'rb')) + doc_info = pdf.getDocumentInfo() + else: + doc_info = None + if (doc_info is not None): author = doc_info.author title = doc_info.title diff --git a/cps/web.py b/cps/web.py index 6b71305b..e158b9ea 100755 --- a/cps/web.py +++ b/cps/web.py @@ -23,13 +23,7 @@ import base64 from sqlalchemy.sql import * import json import datetime -import book_formats from uuid import uuid4 -try: - from wand.image import Image - use_generic_pdf_cover = False -except ImportError, e: - use_generic_pdf_cover = True from shutil import copyfile app = (Flask(__name__)) @@ -41,6 +35,9 @@ file_handler.setLevel(logging.INFO) file_handler.setFormatter(formatter) app.logger.addHandler(file_handler) app.logger.info('Starting Calibre Web...') +logging.getLogger("book_formats").addHandler(file_handler) +logging.getLogger("book_formats").setLevel(logging.INFO) + Principal(app) @@ -1077,6 +1074,7 @@ def edit_book(book_id): return render_template('edit_book.html', book=book, authors=author_names, cc=cc) import uploader +from shutil import move @app.route("/upload", methods = ["GET", "POST"]) @login_required @@ -1107,7 +1105,7 @@ def upload(): flash("Failed to create path %s (Permission denied)." % filepath, category="error") return redirect(url_for('index')) try: - copyfile(meta.file_path, saved_filename) #remove as well + move(meta.file_path, saved_filename) #remove as well except OSError: flash("Failed to store file %s (Permission denied)." % saved_filename, category="error") return redirect(url_for('index')) @@ -1119,7 +1117,7 @@ def upload(): copyfile(os.path.join(basedir, "static/generic_cover.jpg"), os.path.join(filepath, "cover.jpg")) else: has_cover = 1 - copyfile(meta.cover, os.path.join(filepath, "cover.jpg")) + move(meta.cover, os.path.join(filepath, "cover.jpg")) is_author = db.session.query(db.Authors).filter(db.Authors.name == author).first() if is_author: From dc5074a86574cb77b376da3408ef5837fd951daa Mon Sep 17 00:00:00 2001 From: Pavel Yakunin Date: Sun, 5 Jun 2016 19:52:28 +0300 Subject: [PATCH 3/7] default upload logic --- cps/book_formats.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/cps/book_formats.py b/cps/book_formats.py index fa1808fe..b656fa02 100644 --- a/cps/book_formats.py +++ b/cps/book_formats.py @@ -21,7 +21,21 @@ except ImportError, e: def process(tmp_file_path, original_file_name, original_file_extension): if (".PDF" == original_file_extension.upper()): return pdf_meta(tmp_file_path, original_file_name, original_file_extension) - else: return None + else: return default_meta(tmp_file_path, original_file_name, original_file_extension) + + + +def default_meta(tmp_file_path, original_file_name, original_file_extension): + return uploader.BookMeta( + file_path = tmp_file_path, + extension = original_file_extension, + title = original_file_name, + author = "Unknown", + cover = None, + description = "", + tags = "", + series = "", + series_id="") def pdf_meta(tmp_file_path, original_file_name, original_file_extension): From 18e341d650c6cdb0637464608e23cb185130eaf5 Mon Sep 17 00:00:00 2001 From: Pavel Yakunin Date: Sun, 5 Jun 2016 22:28:30 +0300 Subject: [PATCH 4/7] epub uploading --- cps/book_formats.py | 24 ++++++++++++---- cps/epub.py | 67 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 cps/epub.py diff --git a/cps/book_formats.py b/cps/book_formats.py index b656fa02..4e797113 100644 --- a/cps/book_formats.py +++ b/cps/book_formats.py @@ -9,19 +9,33 @@ try: from wand.image import Image use_generic_pdf_cover = False except ImportError, e: - logger.warning('cannot import Image, generating pdf covers for pdf uploads will not work') + logger.warning('cannot import Image, generating pdf covers for pdf uploads will not work: %s', e) use_generic_pdf_cover = True try: from PyPDF2 import PdfFileReader use_pdf_meta = True except ImportError, e: - logger.warning('cannot import PyPDF2, extracting pdf metadata will not work') + logger.warning('cannot import PyPDF2, extracting pdf metadata will not work: %s', e) use_pdf_meta = False +try: + import epub + use_epub_meta = True +except ImportError, e: + logger.warning('cannot import PyPDF2, extracting pdf metadata will not work: %s', e) + use_epub_meta = False + + def process(tmp_file_path, original_file_name, original_file_extension): - if (".PDF" == original_file_extension.upper()): - return pdf_meta(tmp_file_path, original_file_name, original_file_extension) - else: return default_meta(tmp_file_path, original_file_name, original_file_extension) + try: + if ".PDF" == original_file_extension.upper(): + return pdf_meta(tmp_file_path, original_file_name, original_file_extension) + if ".EPUB" == original_file_extension.upper() and use_pdf_meta == True: + return epub.get_epub_info(tmp_file_path, original_file_name, original_file_extension) + except Exception, e: + logger.warning('cannot parse metadata, using default: %s', e) + + return default_meta(tmp_file_path, original_file_name, original_file_extension) diff --git a/cps/epub.py b/cps/epub.py new file mode 100644 index 00000000..03fb30bb --- /dev/null +++ b/cps/epub.py @@ -0,0 +1,67 @@ +import zipfile +from lxml import etree +import os +import uploader + +def extractCover(zip, coverFile, tmp_file_name): + if (coverFile is None): + return None + else: + cf = zip.read("OPS/" + coverFile) + prefix = os.path.splitext(tmp_file_name)[0] + tmp_cover_name = prefix + "." + coverFile + image = open(tmp_cover_name, 'wb') + image.write(cf) + image.close() + return tmp_cover_name + + + +def get_epub_info(tmp_file_path, original_file_name, original_file_extension): + ns = { + 'n':'urn:oasis:names:tc:opendocument:xmlns:container', + 'pkg':'http://www.idpf.org/2007/opf', + 'dc':'http://purl.org/dc/elements/1.1/' + } + + zip = zipfile.ZipFile(tmp_file_path) + + txt = zip.read('META-INF/container.xml') + tree = etree.fromstring(txt) + cfname = tree.xpath('n:rootfiles/n:rootfile/@full-path',namespaces=ns)[0] + + cf = zip.read(cfname) + tree = etree.fromstring(cf) + + p = tree.xpath('/pkg:package/pkg:metadata',namespaces=ns)[0] + + epub_metadata = {} + for s in ['title', 'description', 'creator']: + tmp = p.xpath('dc:%s/text()'%(s),namespaces=ns) + if (len(tmp) > 0): + epub_metadata[s] = p.xpath('dc:%s/text()'%(s),namespaces=ns)[0] + else: + epub_metadata[s] = "Unknown" + + coversection = tree.xpath("/pkg:package/pkg:manifest/pkg:item[@id='cover']/@href",namespaces=ns) + if (len(coversection) > 0): + coverfile = extractCover(zip, coversection[0], tmp_file_path) + else: + coverfile = None + if epub_metadata['title'] is None: + title = original_file_name + else: + title = epub_metadata['title'] + + + return uploader.BookMeta( + file_path = tmp_file_path, + extension = original_file_extension, + title = title, + author = epub_metadata['creator'], + cover = coverfile, + description = epub_metadata['description'], + tags = "", + series = "", + series_id="") + From 43d60778159894457b414fe891d13e985ccb4c2d Mon Sep 17 00:00:00 2001 From: Pavel Yakunin Date: Sun, 5 Jun 2016 22:32:26 +0300 Subject: [PATCH 5/7] cleanup --- cps/web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cps/web.py b/cps/web.py index e158b9ea..bc496b95 100755 --- a/cps/web.py +++ b/cps/web.py @@ -1105,7 +1105,7 @@ def upload(): flash("Failed to create path %s (Permission denied)." % filepath, category="error") return redirect(url_for('index')) try: - move(meta.file_path, saved_filename) #remove as well + move(meta.file_path, saved_filename) except OSError: flash("Failed to store file %s (Permission denied)." % saved_filename, category="error") return redirect(url_for('index')) From 8b5bd61467d769cd7d1c0a9aab98045f0c275fba Mon Sep 17 00:00:00 2001 From: Pavel Yakunin Date: Sat, 18 Jun 2016 16:50:32 +0300 Subject: [PATCH 6/7] fb2 uploading --- cps/book_formats.py | 13 +++++++-- cps/fb2.py | 64 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 cps/fb2.py diff --git a/cps/book_formats.py b/cps/book_formats.py index 4e797113..b5c48086 100644 --- a/cps/book_formats.py +++ b/cps/book_formats.py @@ -22,16 +22,25 @@ try: import epub use_epub_meta = True except ImportError, e: - logger.warning('cannot import PyPDF2, extracting pdf metadata will not work: %s', e) + logger.warning('cannot import PyPDF2, extracting epub metadata will not work: %s', e) use_epub_meta = False +try: + import fb2 + use_fb2_meta = True +except ImportError, e: + logger.warning('cannot import lxml, extracting fb2 metadata will not work: %s', e) + use_fb2_meta = False + def process(tmp_file_path, original_file_name, original_file_extension): try: if ".PDF" == original_file_extension.upper(): return pdf_meta(tmp_file_path, original_file_name, original_file_extension) - if ".EPUB" == original_file_extension.upper() and use_pdf_meta == True: + if ".EPUB" == original_file_extension.upper() and use_epub_meta == True: return epub.get_epub_info(tmp_file_path, original_file_name, original_file_extension) + if ".FB2" == original_file_extension.upper() and use_fb2_meta == True: + return fb2.get_fb2_info(tmp_file_path, original_file_name, original_file_extension) except Exception, e: logger.warning('cannot parse metadata, using default: %s', e) diff --git a/cps/fb2.py b/cps/fb2.py new file mode 100644 index 00000000..746d041e --- /dev/null +++ b/cps/fb2.py @@ -0,0 +1,64 @@ + +from lxml import etree +import os +import uploader + + +def get_fb2_info(tmp_file_path, original_file_name, original_file_extension): + + ns = { + 'fb':'http://www.gribuser.ru/xml/fictionbook/2.0', + 'l':'ttp://www.w3.org/1999/xlink', + } + + fb2_file = open(tmp_file_path) + tree = etree.fromstring(fb2_file.read()) + + authors = tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:author', namespaces=ns) + def get_author(element): + return element.xpath('fb:first-name/text()', namespaces=ns)[0] + ' ' + element.xpath('fb:middle-name/text()', namespaces=ns)[0] + ' ' + element.xpath('fb:last-name/text()', namespaces=ns)[0] + author = ", ".join(map(get_author, authors)) + + title = unicode(tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:book-title/text()', namespaces=ns)[0]) + description = unicode(tree.xpath('/fb:FictionBook/fb:description/fb:publish-info/fb:book-name/text()', namespaces=ns)[0]) + + # + # + # + # cfname = tree.xpath('n:rootfiles/n:rootfile/@full-path',namespaces=ns)[0] + # + # cf = zip.read(cfname) + # tree = etree.fromstring(cf) + # + # p = tree.xpath('/pkg:package/pkg:metadata',namespaces=ns)[0] + # + # epub_metadata = {} + # for s in ['title', 'description', 'creator']: + # tmp = p.xpath('dc:%s/text()'%(s),namespaces=ns) + # if (len(tmp) > 0): + # epub_metadata[s] = p.xpath('dc:%s/text()'%(s),namespaces=ns)[0] + # else: + # epub_metadata[s] = "Unknown" + # + # coversection = tree.xpath("/pkg:package/pkg:manifest/pkg:item[@id='cover']/@href",namespaces=ns) + # if (len(coversection) > 0): + # coverfile = extractCover(zip, coversection[0], tmp_file_path) + # else: + # coverfile = None + # if epub_metadata['title'] is None: + # title = original_file_name + # else: + # title = epub_metadata['title'] + # + # + return uploader.BookMeta( + file_path = tmp_file_path, + extension = original_file_extension, + title = title, + author = author, + cover = None, + description = description, + tags = "", + series = "", + series_id="") + From 8a9c97bf69856ab8331dd16d51a581496fab9d0e Mon Sep 17 00:00:00 2001 From: Pavel Yakunin Date: Sat, 18 Jun 2016 16:51:16 +0300 Subject: [PATCH 7/7] fb2 uploading --- cps/fb2.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/cps/fb2.py b/cps/fb2.py index 746d041e..a7f0ca14 100644 --- a/cps/fb2.py +++ b/cps/fb2.py @@ -22,35 +22,6 @@ def get_fb2_info(tmp_file_path, original_file_name, original_file_extension): title = unicode(tree.xpath('/fb:FictionBook/fb:description/fb:title-info/fb:book-title/text()', namespaces=ns)[0]) description = unicode(tree.xpath('/fb:FictionBook/fb:description/fb:publish-info/fb:book-name/text()', namespaces=ns)[0]) - # - # - # - # cfname = tree.xpath('n:rootfiles/n:rootfile/@full-path',namespaces=ns)[0] - # - # cf = zip.read(cfname) - # tree = etree.fromstring(cf) - # - # p = tree.xpath('/pkg:package/pkg:metadata',namespaces=ns)[0] - # - # epub_metadata = {} - # for s in ['title', 'description', 'creator']: - # tmp = p.xpath('dc:%s/text()'%(s),namespaces=ns) - # if (len(tmp) > 0): - # epub_metadata[s] = p.xpath('dc:%s/text()'%(s),namespaces=ns)[0] - # else: - # epub_metadata[s] = "Unknown" - # - # coversection = tree.xpath("/pkg:package/pkg:manifest/pkg:item[@id='cover']/@href",namespaces=ns) - # if (len(coversection) > 0): - # coverfile = extractCover(zip, coversection[0], tmp_file_path) - # else: - # coverfile = None - # if epub_metadata['title'] is None: - # title = original_file_name - # else: - # title = epub_metadata['title'] - # - # return uploader.BookMeta( file_path = tmp_file_path, extension = original_file_extension,