diff --git a/vendor/PyPDF2/__init__.py b/vendor/PyPDF2/__init__.py
new file mode 100755
index 00000000..f458c0ea
--- /dev/null
+++ b/vendor/PyPDF2/__init__.py
@@ -0,0 +1,5 @@
+from .pdf import PdfFileReader, PdfFileWriter
+from .merger import PdfFileMerger
+from .pagerange import PageRange, parse_filename_page_ranges
+from ._version import __version__
+# Names exported by ``from PyPDF2 import *``.  Only these two are listed; the
+# other names imported above are still reachable as attributes of the package.
+__all__ = ["pdf", "PdfFileMerger"]
diff --git a/vendor/PyPDF2/_version.py b/vendor/PyPDF2/_version.py
new file mode 100755
index 00000000..5fc7041e
--- /dev/null
+++ b/vendor/PyPDF2/_version.py
@@ -0,0 +1 @@
+# Version string of this PyPDF2 package; re-exported by the package __init__.
+__version__ = '1.26.0'
diff --git a/vendor/PyPDF2/filters.py b/vendor/PyPDF2/filters.py
new file mode 100755
index 00000000..3717fd4c
--- /dev/null
+++ b/vendor/PyPDF2/filters.py
@@ -0,0 +1,362 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Implementation of stream filters for PDF.
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "biziqe@mathieu.fenniak.net"
+
+from .utils import PdfReadError, ord_, chr_
+from sys import version_info
+if version_info < ( 3, 0 ):
+ from cStringIO import StringIO
+else:
+ from io import StringIO
+ import struct
+
+# Pick the (de)compression backend.  zlib is preferred; when importing it
+# fails, the except branch builds decompress()/compress() helpers on top of
+# the .NET System.IO.Compression classes instead (IronPython only).  Either
+# way the module ends up with module-level decompress(data) / compress(data).
+try:
+ import zlib
+
+ def decompress(data):
+ # Delegates directly to zlib.decompress.
+ return zlib.decompress(data)
+
+ def compress(data):
+ # Delegates directly to zlib.compress.
+ return zlib.compress(data)
+
+except ImportError:
+ # Unable to import zlib. Attempt to use the System.IO.Compression
+ # library from the .NET framework. (IronPython only)
+ import System
+ from System import IO, Collections, Array
+
+ def _string_to_bytearr(buf):
+ # Copy buf into a new System.Byte array, storing ord() of each item.
+ retval = Array.CreateInstance(System.Byte, len(buf))
+ for i in range(len(buf)):
+ retval[i] = ord(buf[i])
+ return retval
+
+ def _bytearr_to_string(bytes):
+ # Inverse of _string_to_bytearr: build a str with chr() of each byte.
+ # NOTE: the parameter name shadows the builtin ``bytes`` in this scope.
+ retval = ""
+ for i in range(bytes.Length):
+ retval += chr(bytes[i])
+ return retval
+
+ def _read_bytes(stream):
+ # Drain ``stream`` in 2048-byte chunks into a MemoryStream and return
+ # the accumulated content as an array; a Read() of 0 ends the loop.
+ ms = IO.MemoryStream()
+ buf = Array.CreateInstance(System.Byte, 2048)
+ while True:
+ bytes = stream.Read(buf, 0, buf.Length)
+ if bytes == 0:
+ break
+ else:
+ ms.Write(buf, 0, bytes)
+ retval = ms.ToArray()
+ ms.Close()
+ return retval
+
+ def decompress(data):
+ # Load data into a MemoryStream, rewind it, and read it back through a
+ # DeflateStream opened in Decompress mode.
+ bytes = _string_to_bytearr(data)
+ ms = IO.MemoryStream()
+ ms.Write(bytes, 0, bytes.Length)
+ ms.Position = 0 # fseek 0
+ gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)
+ bytes = _read_bytes(gz)
+ retval = _bytearr_to_string(bytes)
+ gz.Close()
+ return retval
+
+ def compress(data):
+ # Write data through a DeflateStream (Compress mode) into a
+ # MemoryStream.  The third constructor argument (True) presumably keeps
+ # ``ms`` open after gz.Close() so it can still be read -- TODO confirm
+ # against the .NET documentation.
+ bytes = _string_to_bytearr(data)
+ ms = IO.MemoryStream()
+ gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
+ gz.Write(bytes, 0, bytes.Length)
+ gz.Close()
+ ms.Position = 0 # fseek 0
+ bytes = ms.ToArray()
+ retval = _bytearr_to_string(bytes)
+ ms.Close()
+ return retval
+
+
+class FlateDecode(object):
+    """FlateDecode filter: inflate the data, then undo an optional predictor."""
+
+    @staticmethod
+    def decode(data, decodeParms):
+        """Decompress *data* and reverse the predictor named in *decodeParms*.
+
+        Predictor 1 (or no usable parameters) returns the inflated data as
+        is.  Predictors 10-15 (PNG) are reversed row by row for PNG filter
+        bytes 0, 1 and 2.  Any other predictor or PNG filter byte raises
+        PdfReadError.
+        """
+        data = decompress(data)
+
+        predictor = 1
+        if decodeParms:
+            try:
+                predictor = decodeParms.get("/Predictor", 1)
+            except AttributeError:
+                # usually an array with a null object was read
+                pass
+
+        # predictor 1 == no predictor
+        if predictor == 1:
+            return data
+
+        columns = decodeParms["/Columns"]
+        if not (predictor >= 10 and predictor <= 15):
+            # unsupported predictor
+            raise PdfReadError("Unsupported flatedecode predictor %r" % predictor)
+
+        # PNG prediction: each row is one filter byte followed by `columns`
+        # data bytes, and the filter can vary from row to row.
+        decoded = StringIO()
+        row_size = columns + 1
+        assert len(data) % row_size == 0
+        above = (0,) * row_size
+        for row_index in range(len(data) // row_size):
+            begin = row_index * row_size
+            current = [ord_(ch) for ch in data[begin:begin + row_size]]
+            png_filter = current[0]
+            if png_filter == 1:
+                # Sub: add the value to the left
+                for col in range(2, row_size):
+                    current[col] = (current[col] + current[col - 1]) % 256
+            elif png_filter == 2:
+                # Up: add the value from the previous (decoded) row
+                for col in range(1, row_size):
+                    current[col] = (current[col] + above[col]) % 256
+            elif png_filter != 0:
+                # unsupported PNG filter
+                raise PdfReadError("Unsupported PNG filter %r" % png_filter)
+            above = current
+            decoded.write(''.join([chr(value) for value in current[1:]]))
+        return decoded.getvalue()
+
+    @staticmethod
+    def encode(data):
+        """Return *data* compressed with the module-level compress()."""
+        return compress(data)
+
+
+class ASCIIHexDecode(object):
+    """ASCIIHexDecode filter: pairs of hex digits terminated by ``>``."""
+
+    def decode(data, decodeParms=None):
+        """Decode hexadecimal text into a string of characters.
+
+        Whitespace between digits is ignored and decoding stops at the
+        first ``>`` (EOD); anything after it is not examined.  If EOD
+        follows an odd number of digits, the final digit is treated as if
+        it were followed by ``0``.
+
+        :param data: hex-encoded text (a str).
+        :param decodeParms: accepted for interface parity; not used.
+        :return: str with one character per decoded byte.
+        :raises PdfReadError: if the data ends without the ``>`` marker.
+        :raises ValueError: if a non-hexadecimal character is encountered.
+        """
+        decoded = []
+        pending = ""
+        for c in data:
+            if c == ">":
+                break
+            if c.isspace():
+                continue
+            pending += c
+            if len(pending) == 2:
+                decoded.append(chr(int(pending, base=16)))
+                pending = ""
+        else:
+            # Ran off the end of the data without ever seeing EOD.
+            raise PdfReadError("Missing EOD marker '>' in ASCIIHexDecode data")
+        if pending:
+            # Odd number of digits: behave as if a 0 followed the last one.
+            decoded.append(chr(int(pending + "0", base=16)))
+        return "".join(decoded)
+    decode = staticmethod(decode)
+
+
+# LZWDecode filter.  The nested ``decoder`` class holds the bit-reader state
+# and the code table; the static decode() at the bottom is the entry point.
+class LZWDecode(object):
+ """Taken from:
+ http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
+ """
+ class decoder(object):
+ def __init__(self, data):
+ # Codes 256 and 257 are reserved: clear-table and end-of-data.
+ self.STOP=257
+ self.CLEARDICT=256
+ self.data=data
+ # Read cursor: current byte index and bits already consumed in it.
+ self.bytepos=0
+ self.bitpos=0
+ # 4096-entry table; entries 0-255 are the single characters.
+ self.dict=[""]*4096
+ for i in range(256):
+ self.dict[i]=chr(i)
+ self.resetDict()
+
+ def resetDict(self):
+ # Back to the initial state: next free slot is 258, codes are 9 bits.
+ # Existing table entries are not erased, only dictlen is rewound.
+ self.dictlen=258
+ self.bitspercode=9
+
+ def nextCode(self):
+ # Assemble the next code from ``bitspercode`` bits of self.data,
+ # starting at (bytepos, bitpos).  Returns -1 when the data runs out
+ # before the code is complete.
+ fillbits=self.bitspercode
+ value=0
+ while fillbits>0 :
+ if self.bytepos >= len(self.data):
+ return -1
+ nextbits=ord(self.data[self.bytepos])
+ # Take as many bits as remain in this byte, capped at what is needed.
+ bitsfromhere=8-self.bitpos
+ if bitsfromhere>fillbits:
+ bitsfromhere=fillbits
+ value |= (((nextbits >> (8-self.bitpos-bitsfromhere)) &
+ (0xff >> (8-bitsfromhere))) <<
+ (fillbits-bitsfromhere))
+ fillbits -= bitsfromhere
+ self.bitpos += bitsfromhere
+ if self.bitpos >=8:
+ self.bitpos=0
+ self.bytepos = self.bytepos+1
+ return value
+
+ def decode(self):
+ """ algorithm derived from:
+ http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
+ and the PDFReference
+ """
+ # cW is the current code, pW the previous one; output accumulates in
+ # baos.  Raises PdfReadError if the data ends before the STOP code.
+ cW = self.CLEARDICT;
+ baos=""
+ while True:
+ pW = cW;
+ cW = self.nextCode();
+ if cW == -1:
+ raise PdfReadError("Missed the stop code in LZWDecode!")
+ if cW == self.STOP:
+ break;
+ elif cW == self.CLEARDICT:
+ self.resetDict();
+ elif pW == self.CLEARDICT:
+ # First code after a clear: emit it, nothing to add to the table.
+ baos+=self.dict[cW]
+ else:
+ if cW < self.dictlen:
+ # Known code: emit it and record previous entry + its first char.
+ baos += self.dict[cW]
+ p=self.dict[pW]+self.dict[cW][0]
+ self.dict[self.dictlen]=p
+ self.dictlen+=1
+ else:
+ # Code not in the table yet: it must be previous entry + its own
+ # first character.
+ p=self.dict[pW]+self.dict[pW][0]
+ baos+=p
+ self.dict[self.dictlen] = p;
+ self.dictlen+=1
+ # Widen the code size (up to 12 bits) once the table is nearly full
+ # for the current width.
+ if (self.dictlen >= (1 << self.bitspercode) - 1 and
+ self.bitspercode < 12):
+ self.bitspercode+=1
+ return baos
+
+ # Entry point used by decodeStreamData; decodeParams is accepted but unused.
+ @staticmethod
+ def decode(data,decodeParams=None):
+ return LZWDecode.decoder(data).decode()
+
+
+# ASCII85Decode filter.  decode() has two separate implementations selected
+# by interpreter version: the Python 2 branch builds and returns a str, the
+# Python 3 branch returns bytes.  decodeParms is accepted but unused.
+class ASCII85Decode(object):
+ def decode(data, decodeParms=None):
+ if version_info < ( 3, 0 ):
+ # --- Python 2 implementation ---
+ retval = ""
+ group = []
+ x = 0
+ # hitEod is False until "~>" is seen with a partial group pending; it
+ # then holds the number of real bytes in that final group.
+ hitEod = False
+ # remove all whitespace from data
+ data = [y for y in data if not (y in ' \n\r\t')]
+ while not hitEod:
+ c = data[x]
+ # Optional "<~" prefix, only recognised before any output exists.
+ if len(retval) == 0 and c == "<" and data[x+1] == "~":
+ x += 2
+ continue
+ #elif c.isspace():
+ # x += 1
+ # continue
+ elif c == 'z':
+ # "z" is shorthand for four zero bytes; only valid between groups.
+ assert len(group) == 0
+ retval += '\x00\x00\x00\x00'
+ x += 1
+ continue
+ elif c == "~" and data[x+1] == ">":
+ if len(group) != 0:
+ # cannot have a final group of just 1 char
+ assert len(group) > 1
+ cnt = len(group) - 1
+ # Pad the partial group with the highest digit value; the surplus
+ # output bytes are trimmed below using hitEod.
+ group += [ 85, 85, 85 ]
+ hitEod = cnt
+ else:
+ break
+ else:
+ c = ord(c) - 33
+ assert c >= 0 and c < 85
+ group += [ c ]
+ if len(group) >= 5:
+ # Five base-85 digits form one 32-bit big-endian value.
+ b = group[0] * (85**4) + \
+ group[1] * (85**3) + \
+ group[2] * (85**2) + \
+ group[3] * 85 + \
+ group[4]
+ assert b < (2**32 - 1)
+ c4 = chr((b >> 0) % 256)
+ c3 = chr((b >> 8) % 256)
+ c2 = chr((b >> 16) % 256)
+ c1 = chr(b >> 24)
+ retval += (c1 + c2 + c3 + c4)
+ if hitEod:
+ # Drop the bytes that came only from padding.
+ retval = retval[:-4+hitEod]
+ group = []
+ x += 1
+ return retval
+ else:
+ # --- Python 3 implementation ---
+ if isinstance(data, str):
+ data = data.encode('ascii')
+ # n counts digits in the current group, b accumulates its value.
+ n = b = 0
+ out = bytearray()
+ for c in data:
+ if ord('!') <= c and c <= ord('u'):
+ n += 1
+ b = b*85+(c-33)
+ if n == 5:
+ out += struct.pack(b'>L',b)
+ n = b = 0
+ elif c == ord('z'):
+ assert n == 0
+ out += b'\0\0\0\0'
+ elif c == ord('~'):
+ # End of data: pad a partial group with digit 84 and keep n-1 bytes.
+ if n:
+ for _ in range(5-n):
+ b = b*85+84
+ out += struct.pack(b'>L',b)[:n-1]
+ break
+ # Any other byte (e.g. whitespace) is skipped without comment.
+ return bytes(out)
+ decode = staticmethod(decode)
+
+
+def decodeStreamData(stream):
+    """Run the raw data of *stream* through its /Filter chain.
+
+    :param stream: stream dictionary exposing ``get()`` and a ``_data``
+        attribute holding the encoded bytes.
+    :return: the decoded data; falsy ``_data`` is returned untouched.
+    :raises NotImplementedError: for an unknown filter, or a /Crypt filter
+        whose decode parameters carry /Name or /Type.
+    """
+    from .generic import NameObject
+    filters = stream.get("/Filter", ())
+    if len(filters) and not isinstance(filters[0], NameObject):
+        # we have a single filter instance
+        filters = (filters,)
+    data = stream._data
+    # If there is not data to decode we should not try to decode the data.
+    if not data:
+        return data
+    for filterType in filters:
+        if filterType == "/FlateDecode" or filterType == "/Fl":
+            data = FlateDecode.decode(data, stream.get("/DecodeParms"))
+        elif filterType == "/ASCIIHexDecode" or filterType == "/AHx":
+            data = ASCIIHexDecode.decode(data)
+        elif filterType == "/LZWDecode" or filterType == "/LZW":
+            data = LZWDecode.decode(data, stream.get("/DecodeParms"))
+        elif filterType == "/ASCII85Decode" or filterType == "/A85":
+            data = ASCII85Decode.decode(data)
+        elif filterType == "/Crypt":
+            # The key is /DecodeParms (as used by the other branches); the
+            # previous spelling "/DecodeParams" never matched, so this check
+            # could not fire.
+            decodeParams = stream.get("/DecodeParms", {})
+            # get() does not resolve references, so do it here when possible.
+            if hasattr(decodeParams, "getObject"):
+                decodeParams = decodeParams.getObject()
+            # Only a dictionary can carry /Name or /Type; any other shape is
+            # passed through, as before.
+            if isinstance(decodeParams, dict) and (
+                    "/Name" in decodeParams or "/Type" in decodeParams):
+                raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
+        else:
+            # unsupported filter
+            raise NotImplementedError("unsupported filter %s" % filterType)
+    return data
diff --git a/vendor/PyPDF2/generic.py b/vendor/PyPDF2/generic.py
new file mode 100755
index 00000000..c4332297
--- /dev/null
+++ b/vendor/PyPDF2/generic.py
@@ -0,0 +1,1226 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+"""
+Implementation of generic PDF objects (dictionary, number, string, and so on)
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "biziqe@mathieu.fenniak.net"
+
+import re
+from .utils import readNonWhitespace, RC4_encrypt, skipOverComment
+from .utils import b_, u_, chr_, ord_
+from .utils import PdfStreamError
+import warnings
+from . import filters
+from . import utils
+import decimal
+import codecs
+import sys
+#import debugging
+
+ObjectPrefix = b_('/<[tf(n%')
+NumberSigns = b_('+-')
+IndirectPattern = re.compile(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
+
+
+def readObject(stream, pdf):
+    """Read the next PDF object from *stream*, dispatching on its first byte.
+
+    The first byte is peeked (the stream is rewound before dispatch) and
+    looked up in ObjectPrefix; bytes not found there are treated as the
+    start of a number or an indirect reference.
+
+    :param stream: seekable binary stream positioned at the object.
+    :param pdf: reader passed through to the readers that need it.
+    :return: the object produced by the matching ``readFromStream``.
+    :raises PdfStreamError: if the stream ends inside a comment.
+    """
+    tok = stream.read(1)
+    stream.seek(-1, 1)  # reset to start
+    idx = ObjectPrefix.find(tok)
+    if idx == 0:
+        # name object
+        return NameObject.readFromStream(stream, pdf)
+    elif idx == 1:
+        # hexadecimal string OR dictionary
+        peek = stream.read(2)
+        stream.seek(-2, 1)  # reset to start
+        if peek == b_('<<'):
+            return DictionaryObject.readFromStream(stream, pdf)
+        else:
+            return readHexStringFromStream(stream)
+    elif idx == 2:
+        # array object
+        return ArrayObject.readFromStream(stream, pdf)
+    elif idx == 3 or idx == 4:
+        # boolean object
+        return BooleanObject.readFromStream(stream)
+    elif idx == 5:
+        # string object
+        return readStringFromStream(stream)
+    elif idx == 6:
+        # null object
+        return NullObject.readFromStream(stream)
+    elif idx == 7:
+        # comment: skip to the end of the line, then read what follows
+        while tok not in (b_('\r'), b_('\n')):
+            tok = stream.read(1)
+            if not tok:
+                # Without this check read() keeps returning an empty value
+                # at EOF and the loop never terminates.
+                raise PdfStreamError("Stream has ended unexpectedly")
+        tok = readNonWhitespace(stream)
+        stream.seek(-1, 1)
+        return readObject(stream, pdf)
+    else:
+        # number object OR indirect reference
+        if tok in NumberSigns:
+            # number
+            return NumberObject.readFromStream(stream)
+        peek = stream.read(20)
+        stream.seek(-len(peek), 1)  # reset to start
+        if IndirectPattern.match(peek) is not None:
+            return IndirectObject.readFromStream(stream, pdf)
+        else:
+            return NumberObject.readFromStream(stream)
+
+
+class PdfObject(object):
+    """Base class for the PDF object types defined in this module."""
+
+    def getObject(self):
+        """Resolves indirect references; a plain object is its own target."""
+        resolved = self
+        return resolved
+
+
+class NullObject(PdfObject):
+    """The PDF ``null`` object."""
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the keyword ``null``; *encryption_key* is not used."""
+        stream.write(b_("null"))
+
+    @staticmethod
+    def readFromStream(stream):
+        """Consume four bytes from *stream* and return a NullObject.
+
+        Raises utils.PdfReadError when those bytes are not ``null``.
+        """
+        keyword = stream.read(4)
+        if keyword == b_("null"):
+            return NullObject()
+        raise utils.PdfReadError("Could not read Null object")
+
+
+class BooleanObject(PdfObject):
+    """PDF boolean; the Python truth value is kept in ``self.value``."""
+
+    def __init__(self, value):
+        self.value = value
+
+    def writeToStream(self, stream, encryption_key):
+        """Write ``true`` or ``false``; *encryption_key* is not used."""
+        keyword = b_("true") if self.value else b_("false")
+        stream.write(keyword)
+
+    @staticmethod
+    def readFromStream(stream):
+        """Read a boolean keyword from *stream*.
+
+        Four bytes are read first; for ``fals`` one further byte is
+        consumed (it is not inspected).  Anything else raises
+        utils.PdfReadError.
+        """
+        head = stream.read(4)
+        if head == b_("true"):
+            return BooleanObject(True)
+        if head != b_("fals"):
+            raise utils.PdfReadError('Could not read Boolean object')
+        stream.read(1)
+        return BooleanObject(False)
+
+
+class ArrayObject(list, PdfObject):
+    """PDF array: a Python list whose items are PDF objects."""
+
+    def writeToStream(self, stream, encryption_key):
+        """Write ``[ item item ... ]``, delegating each item to its own
+        writeToStream with the same *encryption_key*."""
+        stream.write(b_("["))
+        for data in self:
+            stream.write(b_(" "))
+            data.writeToStream(stream, encryption_key)
+        stream.write(b_(" ]"))
+
+    def readFromStream(stream, pdf):
+        """Parse an array starting at ``[`` and return it as an ArrayObject.
+
+        :raises utils.PdfReadError: if the stream is not positioned on ``[``.
+        :raises PdfStreamError: if the stream ends before the closing ``]``.
+        """
+        arr = ArrayObject()
+        tmp = stream.read(1)
+        if tmp != b_("["):
+            raise utils.PdfReadError("Could not read array")
+        while True:
+            # skip leading whitespace
+            tok = stream.read(1)
+            while tok.isspace():
+                tok = stream.read(1)
+            if not tok:
+                # EOF inside the array.  Without this check the code seeks
+                # back and re-parses the last byte, which surfaces as an
+                # unrelated parse error.
+                raise PdfStreamError("Stream has ended unexpectedly")
+            # check for array ending
+            if tok == b_("]"):
+                break
+            # not the end: un-read the byte, then read and append the object
+            stream.seek(-1, 1)
+            arr.append(readObject(stream, pdf))
+        return arr
+    readFromStream = staticmethod(readFromStream)
+
+
+class IndirectObject(PdfObject):
+    """Reference to the object numbered (idnum, generation) held by *pdf*."""
+
+    def __init__(self, idnum, generation, pdf):
+        self.idnum = idnum
+        self.generation = generation
+        self.pdf = pdf
+
+    def getObject(self):
+        """Look the target up via ``self.pdf.getObject`` and resolve the
+        result in turn, so chains of references are followed."""
+        return self.pdf.getObject(self).getObject()
+
+    def __repr__(self):
+        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)
+
+    def __eq__(self, other):
+        """Equal when both are IndirectObjects with the same numbers that
+        belong to the very same pdf instance."""
+        return (
+            isinstance(other, IndirectObject) and
+            self.idnum == other.idnum and
+            self.generation == other.generation and
+            self.pdf is other.pdf
+        )
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __hash__(self):
+        # Defining __eq__ alone makes instances unhashable on Python 3.
+        # Hash the same fields __eq__ compares; the pdf is compared by
+        # identity, hence id().
+        return hash((self.idnum, self.generation, id(self.pdf)))
+
+    def writeToStream(self, stream, encryption_key):
+        """Write ``<idnum> <generation> R``; *encryption_key* is not used."""
+        stream.write(b_("%s %s R" % (self.idnum, self.generation)))
+
+    def readFromStream(stream, pdf):
+        """Parse ``<idnum> <generation> R`` from *stream*.
+
+        :raises PdfStreamError: if the stream ends inside either number.
+        :raises utils.PdfReadError: if the token after the numbers is not ``R``.
+        """
+        idnum = b_("")
+        while True:
+            tok = stream.read(1)
+            if not tok:
+                # stream has truncated prematurely
+                raise PdfStreamError("Stream has ended unexpectedly")
+            if tok.isspace():
+                break
+            idnum += tok
+        generation = b_("")
+        while True:
+            tok = stream.read(1)
+            if not tok:
+                # stream has truncated prematurely
+                raise PdfStreamError("Stream has ended unexpectedly")
+            if tok.isspace():
+                # extra whitespace before the generation number is skipped
+                if not generation:
+                    continue
+                break
+            generation += tok
+        r = readNonWhitespace(stream)
+        if r != b_("R"):
+            raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
+        return IndirectObject(int(idnum), int(generation), pdf)
+    readFromStream = staticmethod(readFromStream)
+
+
+class FloatObject(decimal.Decimal, PdfObject):
+    """PDF real number, stored as a decimal.Decimal."""
+
+    def __new__(cls, value="0", context=None):
+        """Build from *value*, first via ``utils.str_(value)`` with the given
+        *context*, then, if that fails, via plain ``str(value)``."""
+        try:
+            return decimal.Decimal.__new__(cls, utils.str_(value), context)
+        except Exception:
+            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
+            # and SystemExit.  The fallback itself is unchanged.
+            return decimal.Decimal.__new__(cls, str(value))
+
+    def __repr__(self):
+        """Integral values print without a fraction; others print with up
+        to five decimals and trailing zeros removed."""
+        if self == self.to_integral():
+            return str(self.quantize(decimal.Decimal(1)))
+        else:
+            # Standard formatting adds useless extraneous zeros.
+            o = "%.5f" % self
+            # Remove the zeros.
+            while o and o[-1] == '0':
+                o = o[:-1]
+            return o
+
+    def as_numeric(self):
+        """Return the value as a float, parsed from repr(self)."""
+        return float(b_(repr(self)))
+
+    def writeToStream(self, stream, encryption_key):
+        """Write repr(self); *encryption_key* is not used."""
+        stream.write(b_(repr(self)))
+
+
+class NumberObject(int, PdfObject):
+    """PDF integer; readFromStream also hands out FloatObject for reals."""
+
+    # Matches the first byte that cannot belong to a number token.
+    NumberPattern = re.compile(b_('[^+-.0-9]'))
+    ByteDot = b_(".")
+
+    def __new__(cls, value):
+        """Convert *value* with int(); an OverflowError while creating the
+        instance yields 0 instead."""
+        as_int = int(value)
+        try:
+            instance = int.__new__(cls, as_int)
+        except OverflowError:
+            instance = int.__new__(cls, 0)
+        return instance
+
+    def as_numeric(self):
+        """Return the value as a plain int, parsed from repr(self)."""
+        text = b_(repr(self))
+        return int(text)
+
+    def writeToStream(self, stream, encryption_key):
+        """Write repr(self); *encryption_key* is not used."""
+        text = b_(repr(self))
+        stream.write(text)
+
+    @staticmethod
+    def readFromStream(stream):
+        """Read a number token; one containing ``.`` becomes a FloatObject,
+        any other a NumberObject."""
+        token = utils.readUntilRegex(stream, NumberObject.NumberPattern)
+        if token.find(NumberObject.ByteDot) == -1:
+            return NumberObject(token)
+        return FloatObject(token)
+
+
+##
+# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
+# TextStringObject to represent the string.
+def createStringObject(string):
+    """Wrap *string* in the matching PDF string class.
+
+    Text input becomes a TextStringObject.  Byte input is decoded (UTF-16
+    when it starts with the big-endian BOM, PDFDocEncoding otherwise) and
+    tagged with the detection used; if decoding fails the bytes are kept
+    as a ByteStringObject.  Any other type raises TypeError.
+    """
+    if isinstance(string, utils.string_type):
+        return TextStringObject(string)
+    if not isinstance(string, utils.bytes_type):
+        raise TypeError("createStringObject should have str or unicode arg")
+
+    try:
+        if string.startswith(codecs.BOM_UTF16_BE):
+            text = TextStringObject(string.decode("utf-16"))
+            text.autodetect_utf16 = True
+        else:
+            # This is probably a big performance hit, but the only way to
+            # find out whether the bytes are text is to try decoding them.
+            # Some strings are strings, some are just byte arrays.
+            text = TextStringObject(decode_pdfdocencoding(string))
+            text.autodetect_pdfdocencoding = True
+    except UnicodeDecodeError:
+        return ByteStringObject(string)
+    return text
+
+
+def readHexStringFromStream(stream):
+    """Read a ``<...>`` hexadecimal string and return a PDF string object.
+
+    Whitespace between digits is skipped.  A lone trailing digit is
+    completed with ``0``.  Raises PdfStreamError if the stream ends before
+    the closing ``>``.
+    """
+    stream.read(1)  # opening "<"
+    chars = []
+    pair = b_("")
+    tok = readNonWhitespace(stream)
+    while tok != b_(">"):
+        if not tok:
+            # stream has truncated prematurely
+            raise PdfStreamError("Stream has ended unexpectedly")
+        pair += tok
+        if len(pair) == 2:
+            chars.append(chr(int(pair, base=16)))
+            pair = b_("")
+        tok = readNonWhitespace(stream)
+    if len(pair) == 1:
+        pair += b_("0")
+    if len(pair) == 2:
+        chars.append(chr(int(pair, base=16)))
+    return createStringObject(b_("".join(chars)))
+
+
+# Escaped characters that map to a fixed replacement.  The second group are
+# odd/unnecessary escape sequences encountered in real files; the escaped
+# character is kept unchanged.  "c" is not a PDF escape: it keeps producing
+# backslash + "c", which is what the old literal "\c" evaluated to.
+_STRING_ESCAPES = {
+    b_("n"): b_("\n"),
+    b_("r"): b_("\r"),
+    b_("t"): b_("\t"),
+    b_("b"): b_("\b"),
+    b_("f"): b_("\f"),
+    b_("c"): b_("\\c"),
+    b_("("): b_("("),
+    b_(")"): b_(")"),
+    b_("/"): b_("/"),
+    b_("\\"): b_("\\"),
+    b_(" "): b_(" "),
+    b_("%"): b_("%"),
+    b_("<"): b_("<"),
+    b_(">"): b_(">"),
+    b_("["): b_("["),
+    b_("]"): b_("]"),
+    b_("#"): b_("#"),
+    b_("_"): b_("_"),
+    b_("&"): b_("&"),
+    b_("$"): b_("$"),
+}
+
+
+def readStringFromStream(stream):
+    """Read a ``(...)`` literal string and return a PDF string object.
+
+    Nested parentheses are tracked and kept in the text.  Backslash
+    escapes are expanded, including one- to three-digit octal codes and
+    escaped line breaks (which contribute nothing).
+
+    :raises PdfStreamError: if the stream ends before the closing ``)``.
+    :raises utils.PdfReadError: for an unrecognised escape sequence.
+    """
+    stream.read(1)  # opening "("
+    parens = 1
+    txt = b_("")
+    while True:
+        tok = stream.read(1)
+        if not tok:
+            # stream has truncated prematurely
+            raise PdfStreamError("Stream has ended unexpectedly")
+        if tok == b_("("):
+            parens += 1
+        elif tok == b_(")"):
+            parens -= 1
+            if parens == 0:
+                break
+        elif tok == b_("\\"):
+            tok = stream.read(1)
+            if tok in _STRING_ESCAPES:
+                tok = _STRING_ESCAPES[tok]
+            elif tok.isdigit():
+                # "The number ddd may consist of one, two, or three
+                # octal digits; high-order overflow shall be ignored.
+                # Three octal digits shall be used, with leading zeros
+                # as needed, if the next character of the string is also
+                # a digit." (PDF reference 7.3.4.2, p 16)
+                for _ in range(2):
+                    ntok = stream.read(1)
+                    if ntok.isdigit():
+                        tok += ntok
+                    else:
+                        # Not part of the escape: push it back so the main
+                        # loop handles it.  Previously this byte was lost.
+                        if ntok:
+                            stream.seek(-1, 1)
+                        break
+                # Mask to one byte so codes above \377 do not fail to encode.
+                tok = b_(chr(int(tok, base=8) & 0xFF))
+            elif tok in b_("\n\r"):
+                # This case is hit when a backslash followed by a line
+                # break occurs.  If it's a multi-char EOL, consume the
+                # second character:
+                tok = stream.read(1)
+                if not tok in b_("\n\r"):
+                    stream.seek(-1, 1)
+                # Then don't add anything to the actual string, since this
+                # line break was escaped:
+                tok = b_('')
+            else:
+                raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
+        txt += tok
+    return createStringObject(txt)
+
+
+##
+# Represents a string object where the text encoding could not be determined.
+# This occurs quite often, as the PDF spec doesn't provide an alternate way to
+# represent strings -- for example, the encryption data stored in files (like
+# /O) is clearly not text, but is still stored in a "String" object.
+class ByteStringObject(utils.bytes_type, PdfObject):
+    """String object kept as raw bytes."""
+
+    @property
+    def original_bytes(self):
+        """For compatibility with TextStringObject.original_bytes; returns
+        the object itself."""
+        return self
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the bytes as a hex string ``<...>``, RC4-encrypting them
+        first when *encryption_key* is truthy."""
+        payload = RC4_encrypt(encryption_key, self) if encryption_key else self
+        stream.write(b_("<"))
+        stream.write(utils.hexencode(payload))
+        stream.write(b_(">"))
+
+
+##
+# Represents a string object that has been decoded into a real unicode string.
+# If read from a PDF document, this string appeared to match the
+# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
+# occur.
+class TextStringObject(utils.string_type, PdfObject):
+    """String object that has been decoded to text."""
+
+    # Set by createStringObject to record how the text was decoded.
+    autodetect_pdfdocencoding = False
+    autodetect_utf16 = False
+
+    @property
+    def original_bytes(self):
+        """The encoded bytes this text was created from; used when the
+        autodetection produced text where bytes were expected."""
+        return self.get_original_bytes()
+
+    def get_original_bytes(self):
+        """Re-encode the text according to the recorded autodetect flag.
+
+        Raises Exception when neither flag is set.
+        """
+        if self.autodetect_utf16:
+            return codecs.BOM_UTF16_BE + self.encode("utf-16be")
+        if self.autodetect_pdfdocencoding:
+            return encode_pdfdocencoding(self)
+        raise Exception("no information about original bytes")
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the string, preferring PDFDocEncoding over UTF-16BE.
+
+        With an *encryption_key* the bytes are RC4-encrypted and written as
+        a hex string; otherwise they are written as a ``(...)`` literal.
+        """
+        # PDFDocEncoding is nicer to look at in the PDF file, so try it
+        # first (at the cost of a performance hit when it fails).
+        try:
+            encoded = encode_pdfdocencoding(self)
+        except UnicodeEncodeError:
+            encoded = codecs.BOM_UTF16_BE + self.encode("utf-16be")
+
+        if encryption_key:
+            encrypted = RC4_encrypt(encryption_key, encoded)
+            ByteStringObject(encrypted).writeToStream(stream, None)
+            return
+
+        stream.write(b_("("))
+        for c in encoded:
+            if chr_(c).isalnum() or c == b_(' '):
+                stream.write(b_(chr_(c)))
+            else:
+                # everything else goes out as a three-digit octal escape
+                stream.write(b_("\\%03o" % ord_(c)))
+        stream.write(b_(")"))
+
+
+class NameObject(str, PdfObject):
+    """PDF name object, e.g. ``/Type``."""
+
+    # A name ends at whitespace or at any of these delimiter characters.
+    delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]"))
+    surfix = b_("/")
+
+    def writeToStream(self, stream, encryption_key):
+        """Write the name as is; *encryption_key* is not used."""
+        stream.write(b_(self))
+
+    @staticmethod
+    def readFromStream(stream, pdf):
+        """Read a name starting at ``/`` up to the next delimiter (or EOF).
+
+        Raises utils.PdfReadError if the first byte is not ``/``.  A name
+        that cannot be decoded as UTF-8 raises utils.PdfReadError when
+        ``pdf.strict`` is set; otherwise a PdfReadWarning is issued and the
+        undecoded value is wrapped.
+        """
+        raw = stream.read(1)
+        if raw != NameObject.surfix:
+            raise utils.PdfReadError("name read error")
+        raw += utils.readUntilRegex(stream, NameObject.delimiterPattern,
+                                    ignore_eof=True)
+        try:
+            return NameObject(raw.decode('utf-8'))
+        except (UnicodeEncodeError, UnicodeDecodeError):
+            # Name objects should represent irregular characters
+            # with a '#' followed by the symbol's hex number
+            if pdf.strict:
+                raise utils.PdfReadError("Illegal character in Name Object")
+            warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
+            return NameObject(raw)
+
+
+# PDF dictionary.  A dict whose keys and values must be PdfObject instances;
+# item access via [] resolves indirect references, raw_get() does not.
+class DictionaryObject(dict, PdfObject):
+ def raw_get(self, key):
+ # Return the stored value without calling getObject() on it.
+ return dict.__getitem__(self, key)
+
+ def __setitem__(self, key, value):
+ # Both key and value must be PdfObject instances, else ValueError.
+ if not isinstance(key, PdfObject):
+ raise ValueError("key must be PdfObject")
+ if not isinstance(value, PdfObject):
+ raise ValueError("value must be PdfObject")
+ return dict.__setitem__(self, key, value)
+
+ def setdefault(self, key, value=None):
+ # Same type checks as __setitem__.  Because None is not a PdfObject,
+ # calling this without an explicit value raises ValueError.
+ if not isinstance(key, PdfObject):
+ raise ValueError("key must be PdfObject")
+ if not isinstance(value, PdfObject):
+ raise ValueError("value must be PdfObject")
+ return dict.setdefault(self, key, value)
+
+ def __getitem__(self, key):
+ # Resolve the stored value through its getObject() before returning it.
+ return dict.__getitem__(self, key).getObject()
+
+ ##
+ # Retrieves XMP (Extensible Metadata Platform) data relevant to the
+ # this object, if available.
+ #
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance
+ # that can be used to access XMP metadata from the document. Can also
+ # return None if no metadata was found on the document root.
+ def getXmpMetadata(self):
+ metadata = self.get("/Metadata", None)
+ if metadata == None:
+ return None
+ metadata = metadata.getObject()
+ from . import xmp
+ # Wrap the raw metadata once and store the wrapper back under /Metadata,
+ # so later calls find an XmpInformation already in place.
+ if not isinstance(metadata, xmp.XmpInformation):
+ metadata = xmp.XmpInformation(metadata)
+ self[NameObject("/Metadata")] = metadata
+ return metadata
+
+ ##
+ # Read-only property that accesses the {@link
+ # #DictionaryObject.getXmpData getXmpData} function.
+ #
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
+
+ def writeToStream(self, stream, encryption_key):
+ # Emit "<<", then one "key value" pair per line, then ">>".  Keys and
+ # values write themselves and receive the same encryption_key.
+ stream.write(b_("<<\n"))
+ for key, value in list(self.items()):
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value.writeToStream(stream, encryption_key)
+ stream.write(b_("\n"))
+ stream.write(b_(">>"))
+
+ # Parse a dictionary starting at "<<".  If the keyword "stream" follows the
+ # closing ">>", the payload is read as well and the result is built with
+ # StreamObject.initializeFromDictionary; otherwise a DictionaryObject is
+ # returned.  Raises utils.PdfReadError on malformed input and
+ # PdfStreamError when the stream ends inside the dictionary.
+ def readFromStream(stream, pdf):
+ debug = False
+ tmp = stream.read(2)
+ if tmp != b_("<<"):
+ raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell()))
+ data = {}
+ while True:
+ tok = readNonWhitespace(stream)
+ # NUL bytes and comments between entries are skipped.
+ if tok == b_('\x00'):
+ continue
+ elif tok == b_('%'):
+ stream.seek(-1, 1)
+ skipOverComment(stream)
+ continue
+ if not tok:
+ # stream has truncated prematurely
+ raise PdfStreamError("Stream has ended unexpectedly")
+
+ if debug: print(("Tok:", tok))
+ if tok == b_(">"):
+ # First ">" of the closing ">>"; consume the second one and stop.
+ stream.read(1)
+ break
+ stream.seek(-1, 1)
+ key = readObject(stream, pdf)
+ tok = readNonWhitespace(stream)
+ stream.seek(-1, 1)
+ value = readObject(stream, pdf)
+ # NOTE(review): the duplicate check is truthiness-based, so a key whose
+ # stored value is falsy is treated as absent and silently overwritten.
+ if not data.get(key):
+ data[key] = value
+ elif pdf.strict:
+ # multiple definitions of key not permitted
+ raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \
+ % (utils.hexStr(stream.tell()), key))
+ else:
+ # Non-strict mode: warn and keep the first definition.
+ warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \
+ % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning)
+
+ # Remember where the dictionary ended so the stream can be rewound if no
+ # "stream" keyword follows.
+ pos = stream.tell()
+ s = readNonWhitespace(stream)
+ if s == b_('s') and stream.read(5) == b_('tream'):
+ eol = stream.read(1)
+ # odd PDF file output has spaces after 'stream' keyword but before EOL.
+ # patch provided by Danial Sandler
+ while eol == b_(' '):
+ eol = stream.read(1)
+ assert eol in (b_("\n"), b_("\r"))
+ if eol == b_("\r"):
+ # read \n after
+ if stream.read(1) != b_('\n'):
+ stream.seek(-1, 1)
+ # this is a stream object, not a dictionary
+ assert "/Length" in data
+ length = data["/Length"]
+ if debug: print(data)
+ if isinstance(length, IndirectObject):
+ # Resolving the length may move the stream position; save and restore.
+ t = stream.tell()
+ length = pdf.getObject(length)
+ stream.seek(t, 0)
+ data["__streamdata__"] = stream.read(length)
+ if debug: print("here")
+ #if debug: print(binascii.hexlify(data["__streamdata__"]))
+ e = readNonWhitespace(stream)
+ ndstream = stream.read(8)
+ if (e + ndstream) != b_("endstream"):
+ # (sigh) - the odd PDF file has a length that is too long, so
+ # we need to read backwards to find the "endstream" ending.
+ # ReportLab (unknown version) generates files with this bug,
+ # and Python users into PDF files tend to be our audience.
+ # we need to do this to correct the streamdata and chop off
+ # an extra character.
+ pos = stream.tell()
+ stream.seek(-10, 1)
+ end = stream.read(9)
+ if end == b_("endstream"):
+ # we found it by looking back one character further.
+ data["__streamdata__"] = data["__streamdata__"][:-1]
+ else:
+ # NOTE: ``debugging`` is only referenced here; its import at the top of
+ # the file is commented out, so this line works only while debug is False.
+ if debug: print(("E", e, ndstream, debugging.toHex(end)))
+ stream.seek(pos, 0)
+ raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." % utils.hexStr(stream.tell()))
+ else:
+ # Plain dictionary: rewind to just after ">>".
+ stream.seek(pos, 0)
+ if "__streamdata__" in data:
+ return StreamObject.initializeFromDictionary(data)
+ else:
+ retval = DictionaryObject()
+ retval.update(data)
+ return retval
+ readFromStream = staticmethod(readFromStream)
+
+
+class TreeObject(DictionaryObject):
+    """Dictionary acting as the parent of a doubly linked list of children.
+
+    The parent keeps /First, /Last and /Count; every child keeps /Parent
+    plus /Prev and /Next links to its siblings.
+    """
+
+    def __init__(self):
+        DictionaryObject.__init__(self)
+
+    def hasChildren(self):
+        """Return True when the tree has a /First entry."""
+        return '/First' in self
+
+    def __iter__(self):
+        """Iterate over the children (not over the dictionary keys)."""
+        return self.children()
+
+    def children(self):
+        """Yield each child from /First to /Last by following /Next."""
+        # A generator must finish with ``return``: under PEP 479 a
+        # ``raise StopIteration`` inside it is turned into RuntimeError.
+        if not self.hasChildren():
+            return
+
+        child = self['/First']
+        while True:
+            yield child
+            if child == self['/Last']:
+                return
+            child = child['/Next']
+
+    def _decrementCount(self):
+        """Lower /Count by one, keeping the stored value a NumberObject."""
+        # NumberObject - 1 is a plain int, which __setitem__ rejects with
+        # ValueError, so the result has to be wrapped again.
+        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] - 1)
+
+    def addChild(self, child, pdf):
+        """Append *child* as the new last child of this tree.
+
+        *pdf* must provide ``getReference``; it is used to obtain the
+        references stored in the link entries.
+        """
+        childObj = child.getObject()
+        child = pdf.getReference(childObj)
+        assert isinstance(child, IndirectObject)
+
+        if '/First' not in self:
+            self[NameObject('/First')] = child
+            self[NameObject('/Count')] = NumberObject(0)
+            prev = None
+        else:
+            prev = self['/Last']
+
+        self[NameObject('/Last')] = child
+        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1)
+
+        if prev:
+            prevRef = pdf.getReference(prev)
+            assert isinstance(prevRef, IndirectObject)
+            childObj[NameObject('/Prev')] = prevRef
+            prev[NameObject('/Next')] = child
+
+        parentRef = pdf.getReference(self)
+        assert isinstance(parentRef, IndirectObject)
+        childObj[NameObject('/Parent')] = parentRef
+
+    def removeChild(self, child):
+        """Unlink *child* from this tree and strip its link entries.
+
+        :raises ValueError: if *child* has no /Parent, belongs to another
+            tree, or is not found while walking the children.
+        """
+        childObj = child.getObject()
+
+        if NameObject('/Parent') not in childObj:
+            raise ValueError("Removed child does not appear to be a tree item")
+        elif childObj[NameObject('/Parent')] != self:
+            raise ValueError("Removed child is not a member of this tree")
+
+        # NOTE(review): self[...] resolves references (see
+        # DictionaryObject.__getitem__), so the *Ref names below appear to
+        # hold resolved objects rather than references -- confirm whether
+        # raw_get() was intended.
+        found = False
+        prevRef = None
+        prev = None
+        curRef = self[NameObject('/First')]
+        cur = curRef.getObject()
+        lastRef = self[NameObject('/Last')]
+        last = lastRef.getObject()
+        while cur is not None:
+            if cur == childObj:
+                if prev is None:
+                    if NameObject('/Next') in cur:
+                        # Removing first tree node
+                        nextRef = cur[NameObject('/Next')]
+                        next = nextRef.getObject()
+                        del next[NameObject('/Prev')]
+                        self[NameObject('/First')] = nextRef
+                        self._decrementCount()
+                    else:
+                        # Removing only tree node
+                        assert self[NameObject('/Count')] == 1
+                        del self[NameObject('/Count')]
+                        del self[NameObject('/First')]
+                        if NameObject('/Last') in self:
+                            del self[NameObject('/Last')]
+                else:
+                    if NameObject('/Next') in cur:
+                        # Removing middle tree node
+                        nextRef = cur[NameObject('/Next')]
+                        next = nextRef.getObject()
+                        next[NameObject('/Prev')] = prevRef
+                        prev[NameObject('/Next')] = nextRef
+                        self._decrementCount()
+                    else:
+                        # Removing last tree node
+                        assert cur == last
+                        del prev[NameObject('/Next')]
+                        self[NameObject('/Last')] = prevRef
+                        self._decrementCount()
+                found = True
+                break
+
+            prevRef = curRef
+            prev = cur
+            if NameObject('/Next') in cur:
+                curRef = cur[NameObject('/Next')]
+                cur = curRef.getObject()
+            else:
+                curRef = None
+                cur = None
+
+        if not found:
+            raise ValueError("Removal couldn't find item in tree")
+
+        del childObj[NameObject('/Parent')]
+        if NameObject('/Next') in childObj:
+            del childObj[NameObject('/Next')]
+        if NameObject('/Prev') in childObj:
+            del childObj[NameObject('/Prev')]
+
+    def emptyTree(self):
+        """Detach every child and drop /Count, /First and /Last."""
+        for child in self:
+            childObj = child.getObject()
+            del childObj[NameObject('/Parent')]
+            if NameObject('/Next') in childObj:
+                del childObj[NameObject('/Next')]
+            if NameObject('/Prev') in childObj:
+                del childObj[NameObject('/Prev')]
+
+        if NameObject('/Count') in self:
+            del self[NameObject('/Count')]
+        if NameObject('/First') in self:
+            del self[NameObject('/First')]
+        if NameObject('/Last') in self:
+            del self[NameObject('/Last')]
+
+
+class StreamObject(DictionaryObject):
+ def __init__(self):
+ self._data = None
+ self.decodedSelf = None
+
+ def writeToStream(self, stream, encryption_key):
+ self[NameObject("/Length")] = NumberObject(len(self._data))
+ DictionaryObject.writeToStream(self, stream, encryption_key)
+ del self["/Length"]
+ stream.write(b_("\nstream\n"))
+ data = self._data
+ if encryption_key:
+ data = RC4_encrypt(encryption_key, data)
+ stream.write(data)
+ stream.write(b_("\nendstream"))
+
+ def initializeFromDictionary(data):
+ if "/Filter" in data:
+ retval = EncodedStreamObject()
+ else:
+ retval = DecodedStreamObject()
+ retval._data = data["__streamdata__"]
+ del data["__streamdata__"]
+ del data["/Length"]
+ retval.update(data)
+ return retval
+ initializeFromDictionary = staticmethod(initializeFromDictionary)
+
+ def flateEncode(self):
+ if "/Filter" in self:
+ f = self["/Filter"]
+ if isinstance(f, ArrayObject):
+ f.insert(0, NameObject("/FlateDecode"))
+ else:
+ newf = ArrayObject()
+ newf.append(NameObject("/FlateDecode"))
+ newf.append(f)
+ f = newf
+ else:
+ f = NameObject("/FlateDecode")
+ retval = EncodedStreamObject()
+ retval[NameObject("/Filter")] = f
+ retval._data = filters.FlateDecode.encode(self._data)
+ return retval
+
+
+class DecodedStreamObject(StreamObject):
+    """Stream object whose ``_data`` is returned and stored as given."""
+    def getData(self):
+        """Return the stored stream data."""
+        return self._data
+
+    def setData(self, data):
+        """Replace the stored stream data with ``data``."""
+        self._data = data
+
+
+class EncodedStreamObject(StreamObject):
+ def __init__(self):
+ self.decodedSelf = None
+
+ def getData(self):
+ if self.decodedSelf:
+ # cached version of decoded object
+ return self.decodedSelf.getData()
+ else:
+ # create decoded object
+ decoded = DecodedStreamObject()
+
+ decoded._data = filters.decodeStreamData(self)
+ for key, value in list(self.items()):
+ if not key in ("/Length", "/Filter", "/DecodeParms"):
+ decoded[key] = value
+ self.decodedSelf = decoded
+ return decoded._data
+
+ def setData(self, data):
+ raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported")
+
+
+class RectangleObject(ArrayObject):
+ """
+ This class is used to represent *page boxes* in PyPDF2. These boxes include:
+
+ * :attr:`artBox `
+ * :attr:`bleedBox `
+ * :attr:`cropBox `
+ * :attr:`mediaBox `
+ * :attr:`trimBox `
+ """
+ def __init__(self, arr):
+ # must have four points
+ assert len(arr) == 4
+ # automatically convert arr[x] into NumberObject(arr[x]) if necessary
+ ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr])
+
+ def ensureIsNumber(self, value):
+ if not isinstance(value, (NumberObject, FloatObject)):
+ value = FloatObject(value)
+ return value
+
+ def __repr__(self):
+ return "RectangleObject(%s)" % repr(list(self))
+
+ def getLowerLeft_x(self):
+ return self[0]
+
+ def getLowerLeft_y(self):
+ return self[1]
+
+ def getUpperRight_x(self):
+ return self[2]
+
+ def getUpperRight_y(self):
+ return self[3]
+
+ def getUpperLeft_x(self):
+ return self.getLowerLeft_x()
+
+ def getUpperLeft_y(self):
+ return self.getUpperRight_y()
+
+ def getLowerRight_x(self):
+ return self.getUpperRight_x()
+
+ def getLowerRight_y(self):
+ return self.getLowerLeft_y()
+
+ def getLowerLeft(self):
+ return self.getLowerLeft_x(), self.getLowerLeft_y()
+
+ def getLowerRight(self):
+ return self.getLowerRight_x(), self.getLowerRight_y()
+
+ def getUpperLeft(self):
+ return self.getUpperLeft_x(), self.getUpperLeft_y()
+
+ def getUpperRight(self):
+ return self.getUpperRight_x(), self.getUpperRight_y()
+
+ def setLowerLeft(self, value):
+ self[0], self[1] = [self.ensureIsNumber(x) for x in value]
+
+ def setLowerRight(self, value):
+ self[2], self[1] = [self.ensureIsNumber(x) for x in value]
+
+ def setUpperLeft(self, value):
+ self[0], self[3] = [self.ensureIsNumber(x) for x in value]
+
+ def setUpperRight(self, value):
+ self[2], self[3] = [self.ensureIsNumber(x) for x in value]
+
+ def getWidth(self):
+ return self.getUpperRight_x() - self.getLowerLeft_x()
+
+ def getHeight(self):
+ return self.getUpperRight_y() - self.getLowerLeft_y()
+
+ lowerLeft = property(getLowerLeft, setLowerLeft, None, None)
+ """
+ Property to read and modify the lower left coordinate of this box
+ in (x,y) form.
+ """
+ lowerRight = property(getLowerRight, setLowerRight, None, None)
+ """
+ Property to read and modify the lower right coordinate of this box
+ in (x,y) form.
+ """
+ upperLeft = property(getUpperLeft, setUpperLeft, None, None)
+ """
+ Property to read and modify the upper left coordinate of this box
+ in (x,y) form.
+ """
+ upperRight = property(getUpperRight, setUpperRight, None, None)
+ """
+ Property to read and modify the upper right coordinate of this box
+ in (x,y) form.
+ """
+
+
+class Field(TreeObject):
+ """
+ A class representing a field dictionary. This class is accessed through
+ :meth:`getFields()`
+ """
+ def __init__(self, data):
+ DictionaryObject.__init__(self)
+ attributes = ("/FT", "/Parent", "/Kids", "/T", "/TU", "/TM", "/Ff",
+ "/V", "/DV", "/AA")
+ for attr in attributes:
+ try:
+ self[NameObject(attr)] = data[attr]
+ except KeyError:
+ pass
+
+ fieldType = property(lambda self: self.get("/FT"))
+ """
+ Read-only property accessing the type of this field.
+ """
+
+ parent = property(lambda self: self.get("/Parent"))
+ """
+ Read-only property accessing the parent of this field.
+ """
+
+ kids = property(lambda self: self.get("/Kids"))
+ """
+ Read-only property accessing the kids of this field.
+ """
+
+ name = property(lambda self: self.get("/T"))
+ """
+ Read-only property accessing the name of this field.
+ """
+
+ altName = property(lambda self: self.get("/TU"))
+ """
+ Read-only property accessing the alternate name of this field.
+ """
+
+ mappingName = property(lambda self: self.get("/TM"))
+ """
+ Read-only property accessing the mapping name of this field. This
+ name is used by PyPDF2 as a key in the dictionary returned by
+ :meth:`getFields()`
+ """
+
+ flags = property(lambda self: self.get("/Ff"))
+ """
+ Read-only property accessing the field flags, specifying various
+ characteristics of the field (see Table 8.70 of the PDF 1.7 reference).
+ """
+
+ value = property(lambda self: self.get("/V"))
+ """
+ Read-only property accessing the value of this field. Format
+ varies based on field type.
+ """
+
+ defaultValue = property(lambda self: self.get("/DV"))
+ """
+ Read-only property accessing the default value of this field.
+ """
+
+ additionalActions = property(lambda self: self.get("/AA"))
+ """
+ Read-only property accessing the additional actions dictionary.
+ This dictionary defines the field's behavior in response to trigger events.
+ See Section 8.5.2 of the PDF 1.7 reference.
+ """
+
+
+class Destination(TreeObject):
+ """
+ A class representing a destination within a PDF file.
+ See section 8.2.1 of the PDF 1.6 reference.
+
+ :param str title: Title of this destination.
+ :param int page: Page number of this destination.
+ :param str typ: How the destination is displayed.
+ :param args: Additional arguments may be necessary depending on the type.
+ :raises PdfReadError: If destination type is invalid.
+
+ Valid ``typ`` arguments (see PDF spec for details):
+ /Fit No additional arguments
+ /XYZ [left] [top] [zoomFactor]
+ /FitH [top]
+ /FitV [left]
+ /FitR [left] [bottom] [right] [top]
+ /FitB No additional arguments
+ /FitBH [top]
+ /FitBV [left]
+ """
+ def __init__(self, title, page, typ, *args):
+ DictionaryObject.__init__(self)
+ self[NameObject("/Title")] = title
+ self[NameObject("/Page")] = page
+ self[NameObject("/Type")] = typ
+
+ # from table 8.2 of the PDF 1.7 reference.
+ if typ == "/XYZ":
+ (self[NameObject("/Left")], self[NameObject("/Top")],
+ self[NameObject("/Zoom")]) = args
+ elif typ == "/FitR":
+ (self[NameObject("/Left")], self[NameObject("/Bottom")],
+ self[NameObject("/Right")], self[NameObject("/Top")]) = args
+ elif typ in ["/FitH", "/FitBH"]:
+ self[NameObject("/Top")], = args
+ elif typ in ["/FitV", "/FitBV"]:
+ self[NameObject("/Left")], = args
+ elif typ in ["/Fit", "/FitB"]:
+ pass
+ else:
+ raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
+
+ def getDestArray(self):
+ return ArrayObject([self.raw_get('/Page'), self['/Type']] + [self[x] for x in ['/Left', '/Bottom', '/Right', '/Top', '/Zoom'] if x in self])
+
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_("<<\n"))
+ key = NameObject('/D')
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value = self.getDestArray()
+ value.writeToStream(stream, encryption_key)
+
+ key = NameObject("/S")
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value = NameObject("/GoTo")
+ value.writeToStream(stream, encryption_key)
+
+ stream.write(b_("\n"))
+ stream.write(b_(">>"))
+
+ title = property(lambda self: self.get("/Title"))
+ """
+ Read-only property accessing the destination title.
+
+ :rtype: str
+ """
+
+ page = property(lambda self: self.get("/Page"))
+ """
+ Read-only property accessing the destination page number.
+
+ :rtype: int
+ """
+
+ typ = property(lambda self: self.get("/Type"))
+ """
+ Read-only property accessing the destination type.
+
+ :rtype: str
+ """
+
+ zoom = property(lambda self: self.get("/Zoom", None))
+ """
+ Read-only property accessing the zoom factor.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+ left = property(lambda self: self.get("/Left", None))
+ """
+ Read-only property accessing the left horizontal coordinate.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+ right = property(lambda self: self.get("/Right", None))
+ """
+ Read-only property accessing the right horizontal coordinate.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+ top = property(lambda self: self.get("/Top", None))
+ """
+ Read-only property accessing the top vertical coordinate.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+ bottom = property(lambda self: self.get("/Bottom", None))
+ """
+ Read-only property accessing the bottom vertical coordinate.
+
+ :rtype: int, or ``None`` if not available.
+ """
+
+
+class Bookmark(Destination):
+ def writeToStream(self, stream, encryption_key):
+ stream.write(b_("<<\n"))
+ for key in [NameObject(x) for x in ['/Title', '/Parent', '/First', '/Last', '/Next', '/Prev'] if x in self]:
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value = self.raw_get(key)
+ value.writeToStream(stream, encryption_key)
+ stream.write(b_("\n"))
+ key = NameObject('/Dest')
+ key.writeToStream(stream, encryption_key)
+ stream.write(b_(" "))
+ value = self.getDestArray()
+ value.writeToStream(stream, encryption_key)
+ stream.write(b_("\n"))
+ stream.write(b_(">>"))
+
+
+def encode_pdfdocencoding(unicode_string):
+ retval = b_('')
+ for c in unicode_string:
+ try:
+ retval += b_(chr(_pdfDocEncoding_rev[c]))
+ except KeyError:
+ raise UnicodeEncodeError("pdfdocencoding", c, -1, -1,
+ "does not exist in translation table")
+ return retval
+
+
+def decode_pdfdocencoding(byte_array):
+ retval = u_('')
+ for b in byte_array:
+ c = _pdfDocEncoding[ord_(b)]
+ if c == u_('\u0000'):
+ raise UnicodeDecodeError("pdfdocencoding", utils.barray(b), -1, -1,
+ "does not exist in translation table")
+ retval += c
+ return retval
+
+# PDFDocEncoding decode table: the entry at index N is the character for byte
+# value N. An entry of u'\u0000' marks a byte with no character assigned;
+# decode_pdfdocencoding() rejects such bytes and the reverse table skips them.
+_pdfDocEncoding = (
+  u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
+  u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
+  u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
+  u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'),
+  u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'),
+  u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'),
+  u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'),
+  u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'),
+  u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'),
+  u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'),
+  u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'),
+  u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'),
+  u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'),
+  u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'),
+  u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'),
+  u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'),
+  u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'),
+  u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'),
+  u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'),
+  u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'),
+  u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'),
+  u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'),
+  u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'),
+  u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'),
+  u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'),
+  u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'),
+  u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'),
+  u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'),
+  u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'),
+  u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'),
+  u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'),
+  u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff')
+)
+
+# One entry per possible byte value; checked once at import time.
+assert len(_pdfDocEncoding) == 256
+
+_pdfDocEncoding_rev = {}
+for i in range(256):
+ char = _pdfDocEncoding[i]
+ if char == u_("\u0000"):
+ continue
+ assert char not in _pdfDocEncoding_rev
+ _pdfDocEncoding_rev[char] = i
diff --git a/vendor/PyPDF2/merger.py b/vendor/PyPDF2/merger.py
new file mode 100755
index 00000000..27702add
--- /dev/null
+++ b/vendor/PyPDF2/merger.py
@@ -0,0 +1,553 @@
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+from .generic import *
+from .utils import isString, str_
+from .pdf import PdfFileReader, PdfFileWriter
+from .pagerange import PageRange
+from sys import version_info
+if version_info < ( 3, 0 ):
+ from cStringIO import StringIO
+ StreamIO = StringIO
+else:
+ from io import BytesIO
+ from io import FileIO as file
+ StreamIO = BytesIO
+
+
+class _MergedPage(object):
+ """
+ _MergedPage is used internally by PdfFileMerger to collect necessary
+ information on each page that is being merged.
+ """
+ def __init__(self, pagedata, src, id):
+ self.src = src
+ self.pagedata = pagedata
+ self.out_pagedata = None
+ self.id = id
+
+
+class PdfFileMerger(object):
+ """
+ Initializes a PdfFileMerger object. PdfFileMerger merges multiple PDFs
+ into a single PDF. It can concatenate, slice, insert, or any combination
+ of the above.
+
+ See the functions :meth:`merge()` (or :meth:`append()`)
+ and :meth:`write()` for usage information.
+
+ :param bool strict: Determines whether user should be warned of all
+ problems and also causes some correctable problems to be fatal.
+ Defaults to ``True``.
+ """
+
+ def __init__(self, strict=True):
+ self.inputs = []
+ self.pages = []
+ self.output = PdfFileWriter()
+ self.bookmarks = []
+ self.named_dests = []
+ self.id_count = 0
+ self.strict = strict
+
+ def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
+ """
+ Merges the pages from the given file into the output file at the
+ specified page number.
+
+ :param int position: The *page number* to insert this file. File will
+ be inserted after the given number.
+
+ :param fileobj: A File Object or an object that supports the standard read
+ and seek methods similar to a File Object. Could also be a
+ string representing a path to a PDF file.
+
+ :param str bookmark: Optionally, you may specify a bookmark to be applied at
+ the beginning of the included file by supplying the text of the bookmark.
+
+ :param pages: can be a :ref:`Page Range ` or a ``(start, stop[, step])`` tuple
+ to merge only the specified range of pages from the source
+ document into the output document.
+
+ :param bool import_bookmarks: You may prevent the source document's bookmarks
+ from being imported by specifying this as ``False``.
+ """
+
+ # This parameter is passed to self.inputs.append and means
+ # that the stream used was created in this method.
+ my_file = False
+
+ # If the fileobj parameter is a string, assume it is a path
+ # and create a file object at that location. If it is a file,
+ # copy the file's contents into a BytesIO (or StreamIO) stream object; if
+ # it is a PdfFileReader, copy that reader's stream into a
+ # BytesIO (or StreamIO) stream.
+ # If fileobj is none of the above types, it is not modified
+ decryption_key = None
+ if isString(fileobj):
+ fileobj = file(fileobj, 'rb')
+ my_file = True
+ elif isinstance(fileobj, file):
+ fileobj.seek(0)
+ filecontent = fileobj.read()
+ fileobj = StreamIO(filecontent)
+ my_file = True
+ elif isinstance(fileobj, PdfFileReader):
+ orig_tell = fileobj.stream.tell()
+ fileobj.stream.seek(0)
+ filecontent = StreamIO(fileobj.stream.read())
+ fileobj.stream.seek(orig_tell) # reset the stream to its original location
+ fileobj = filecontent
+ if hasattr(fileobj, '_decryption_key'):
+ decryption_key = fileobj._decryption_key
+ my_file = True
+
+ # Create a new PdfFileReader instance using the stream
+ # (either file or BytesIO or StringIO) created above
+ pdfr = PdfFileReader(fileobj, strict=self.strict)
+ if decryption_key is not None:
+ pdfr._decryption_key = decryption_key
+
+ # Find the range of pages to merge.
+ if pages == None:
+ pages = (0, pdfr.getNumPages())
+ elif isinstance(pages, PageRange):
+ pages = pages.indices(pdfr.getNumPages())
+ elif not isinstance(pages, tuple):
+ raise TypeError('"pages" must be a tuple of (start, stop[, step])')
+
+ srcpages = []
+ if bookmark:
+ bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))
+
+ outline = []
+ if import_bookmarks:
+ outline = pdfr.getOutlines()
+ outline = self._trim_outline(pdfr, outline, pages)
+
+ if bookmark:
+ self.bookmarks += [bookmark, outline]
+ else:
+ self.bookmarks += outline
+
+ dests = pdfr.namedDestinations
+ dests = self._trim_dests(pdfr, dests, pages)
+ self.named_dests += dests
+
+ # Gather all the pages that are going to be merged
+ for i in range(*pages):
+ pg = pdfr.getPage(i)
+
+ id = self.id_count
+ self.id_count += 1
+
+ mp = _MergedPage(pg, pdfr, id)
+
+ srcpages.append(mp)
+
+ self._associate_dests_to_pages(srcpages)
+ self._associate_bookmarks_to_pages(srcpages)
+
+ # Slice to insert the pages at the specified position
+ self.pages[position:position] = srcpages
+
+ # Keep track of our input files so we can close them later
+ self.inputs.append((fileobj, pdfr, my_file))
+
+    def append(self, fileobj, bookmark=None, pages=None, import_bookmarks=True):
+        """
+        Identical to the :meth:`merge()<merge>` method, but assumes you want to concatenate
+        all pages onto the end of the file instead of specifying a position.
+
+        :param fileobj: A File Object or an object that supports the standard read
+            and seek methods similar to a File Object. Could also be a
+            string representing a path to a PDF file.
+
+        :param str bookmark: Optionally, you may specify a bookmark to be applied at
+            the beginning of the included file by supplying the text of the bookmark.
+
+        :param pages: can be a :ref:`Page Range <page-range>` or a ``(start, stop[, step])`` tuple
+            to merge only the specified range of pages from the source
+            document into the output document.
+
+        :param bool import_bookmarks: You may prevent the source document's bookmarks
+            from being imported by specifying this as ``False``.
+        """
+
+        # The insert position is the current number of collected pages.
+        self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)
+
+ def write(self, fileobj):
+ """
+ Writes all data that has been merged to the given output file.
+
+ :param fileobj: Output file. Can be a filename or any kind of
+ file-like object.
+ """
+ my_file = False
+ if isString(fileobj):
+ fileobj = file(fileobj, 'wb')
+ my_file = True
+
+ # Add pages to the PdfFileWriter
+ # The commented out line below was replaced with the two lines below it to allow PdfFileMerger to work with PyPdf 1.13
+ for page in self.pages:
+ self.output.addPage(page.pagedata)
+ page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject())
+ #idnum = self.output._objects.index(self.output._pages.getObject()["/Kids"][-1].getObject()) + 1
+ #page.out_pagedata = IndirectObject(idnum, 0, self.output)
+
+ # Once all pages are added, create bookmarks to point at those pages
+ self._write_dests()
+ self._write_bookmarks()
+
+ # Write the output to the file
+ self.output.write(fileobj)
+
+ if my_file:
+ fileobj.close()
+
+ def close(self):
+ """
+ Shuts all file descriptors (input and output) and clears all memory
+ usage.
+ """
+ self.pages = []
+ for fo, pdfr, mine in self.inputs:
+ if mine:
+ fo.close()
+
+ self.inputs = []
+ self.output = None
+
+    def addMetadata(self, infos):
+        """
+        Add custom metadata to the output.
+
+        :param dict infos: a Python dictionary where each key is a field
+            and each value is your new metadata.
+            Example: ``{u'/Title': u'My title'}``
+        """
+        # Delegates to the underlying output writer.
+        self.output.addMetadata(infos)
+
+    def setPageLayout(self, layout):
+        """
+        Set the page layout
+
+        :param str layout: The page layout to be used
+
+        Valid layouts are:
+             /NoLayout        Layout explicitly not specified
+             /SinglePage      Show one page at a time
+             /OneColumn       Show one column at a time
+             /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
+             /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
+             /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
+             /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
+        """
+        # Delegates to the underlying output writer.
+        self.output.setPageLayout(layout)
+
+    def setPageMode(self, mode):
+        """
+        Set the page mode.
+
+        :param str mode: The page mode to use.
+
+        Valid modes are:
+            /UseNone         Do not show outlines or thumbnails panels
+            /UseOutlines     Show outlines (aka bookmarks) panel
+            /UseThumbs       Show page thumbnails panel
+            /FullScreen      Fullscreen view
+            /UseOC           Show Optional Content Group (OCG) panel
+            /UseAttachments  Show attachments panel
+        """
+        # Delegates to the underlying output writer.
+        self.output.setPageMode(mode)
+
+ def _trim_dests(self, pdf, dests, pages):
+ """
+ Removes any named destinations that are not a part of the specified
+ page set.
+ """
+ new_dests = []
+ prev_header_added = True
+ for k, o in list(dests.items()):
+ for j in range(*pages):
+ if pdf.getPage(j).getObject() == o['/Page'].getObject():
+ o[NameObject('/Page')] = o['/Page'].getObject()
+ assert str_(k) == str_(o['/Title'])
+ new_dests.append(o)
+ break
+ return new_dests
+
+    def _trim_outline(self, pdf, outline, pages):
+        """
+        Removes any outline/bookmark entries that are not a part of the
+        specified page set.
+
+        ``outline`` is a list whose items are either bookmark entries or
+        nested lists; a nested list is trimmed recursively. Returns a new
+        list; kept entries have ``/Page`` replaced by the resolved page object.
+        """
+        new_outline = []
+        # Tracks whether the most recent non-list entry was kept, so that a
+        # dropped entry can still be restored as the header of a surviving
+        # sub-list that follows it.
+        prev_header_added = True
+        for i, o in enumerate(outline):
+            if isinstance(o, list):
+                sub = self._trim_outline(pdf, o, pages)
+                if sub:
+                    if not prev_header_added:
+                        # The preceding entry was dropped but its children
+                        # survive: keep it after all.
+                        new_outline.append(outline[i-1])
+                    new_outline.append(sub)
+            else:
+                prev_header_added = False
+                for j in range(*pages):
+                    if pdf.getPage(j).getObject() == o['/Page'].getObject():
+                        o[NameObject('/Page')] = o['/Page'].getObject()
+                        new_outline.append(o)
+                        prev_header_added = True
+                        break
+        return new_outline
+
+ def _write_dests(self):
+ dests = self.named_dests
+
+ for v in dests:
+ pageno = None
+ pdf = None
+ if '/Page' in v:
+ for i, p in enumerate(self.pages):
+ if p.id == v['/Page']:
+ v[NameObject('/Page')] = p.out_pagedata
+ pageno = i
+ pdf = p.src
+ break
+ if pageno != None:
+ self.output.addNamedDestinationObject(v)
+
+ def _write_bookmarks(self, bookmarks=None, parent=None):
+
+ if bookmarks == None:
+ bookmarks = self.bookmarks
+
+ last_added = None
+ for b in bookmarks:
+ if isinstance(b, list):
+ self._write_bookmarks(b, last_added)
+ continue
+
+ pageno = None
+ pdf = None
+ if '/Page' in b:
+ for i, p in enumerate(self.pages):
+ if p.id == b['/Page']:
+ #b[NameObject('/Page')] = p.out_pagedata
+ args = [NumberObject(p.id), NameObject(b['/Type'])]
+ #nothing more to add
+ #if b['/Type'] == '/Fit' or b['/Type'] == '/FitB'
+ if b['/Type'] == '/FitH' or b['/Type'] == '/FitBH':
+ if '/Top' in b and not isinstance(b['/Top'], NullObject):
+ args.append(FloatObject(b['/Top']))
+ else:
+ args.append(FloatObject(0))
+ del b['/Top']
+ elif b['/Type'] == '/FitV' or b['/Type'] == '/FitBV':
+ if '/Left' in b and not isinstance(b['/Left'], NullObject):
+ args.append(FloatObject(b['/Left']))
+ else:
+ args.append(FloatObject(0))
+ del b['/Left']
+ elif b['/Type'] == '/XYZ':
+ if '/Left' in b and not isinstance(b['/Left'], NullObject):
+ args.append(FloatObject(b['/Left']))
+ else:
+ args.append(FloatObject(0))
+ if '/Top' in b and not isinstance(b['/Top'], NullObject):
+ args.append(FloatObject(b['/Top']))
+ else:
+ args.append(FloatObject(0))
+ if '/Zoom' in b and not isinstance(b['/Zoom'], NullObject):
+ args.append(FloatObject(b['/Zoom']))
+ else:
+ args.append(FloatObject(0))
+ del b['/Top'], b['/Zoom'], b['/Left']
+ elif b['/Type'] == '/FitR':
+ if '/Left' in b and not isinstance(b['/Left'], NullObject):
+ args.append(FloatObject(b['/Left']))
+ else:
+ args.append(FloatObject(0))
+ if '/Bottom' in b and not isinstance(b['/Bottom'], NullObject):
+ args.append(FloatObject(b['/Bottom']))
+ else:
+ args.append(FloatObject(0))
+ if '/Right' in b and not isinstance(b['/Right'], NullObject):
+ args.append(FloatObject(b['/Right']))
+ else:
+ args.append(FloatObject(0))
+ if '/Top' in b and not isinstance(b['/Top'], NullObject):
+ args.append(FloatObject(b['/Top']))
+ else:
+ args.append(FloatObject(0))
+ del b['/Left'], b['/Right'], b['/Bottom'], b['/Top']
+
+ b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})
+
+ pageno = i
+ pdf = p.src
+ break
+ if pageno != None:
+ del b['/Page'], b['/Type']
+ last_added = self.output.addBookmarkDict(b, parent)
+
+ def _associate_dests_to_pages(self, pages):
+ for nd in self.named_dests:
+ pageno = None
+ np = nd['/Page']
+
+ if isinstance(np, NumberObject):
+ continue
+
+ for p in pages:
+ if np.getObject() == p.pagedata.getObject():
+ pageno = p.id
+
+ if pageno != None:
+ nd[NameObject('/Page')] = NumberObject(pageno)
+ else:
+ raise ValueError("Unresolved named destination '%s'" % (nd['/Title'],))
+
+ def _associate_bookmarks_to_pages(self, pages, bookmarks=None):
+ if bookmarks == None:
+ bookmarks = self.bookmarks
+
+ for b in bookmarks:
+ if isinstance(b, list):
+ self._associate_bookmarks_to_pages(pages, b)
+ continue
+
+ pageno = None
+ bp = b['/Page']
+
+ if isinstance(bp, NumberObject):
+ continue
+
+ for p in pages:
+ if bp.getObject() == p.pagedata.getObject():
+ pageno = p.id
+
+ if pageno != None:
+ b[NameObject('/Page')] = NumberObject(pageno)
+ else:
+ raise ValueError("Unresolved bookmark '%s'" % (b['/Title'],))
+
+ def findBookmark(self, bookmark, root=None):
+ if root == None:
+ root = self.bookmarks
+
+ for i, b in enumerate(root):
+ if isinstance(b, list):
+ res = self.findBookmark(bookmark, b)
+ if res:
+ return [i] + res
+ elif b == bookmark or b['/Title'] == bookmark:
+ return [i]
+
+ return None
+
+    def addBookmark(self, title, pagenum, parent=None):
+        """
+        Add a bookmark to this PDF file.
+
+        :param str title: Title to use for this bookmark.
+        :param int pagenum: Page number this bookmark will point to.
+        :param parent: A reference to a parent bookmark to create nested
+            bookmarks. May be an index path (list) or a value accepted by
+            ``findBookmark``.
+        :return: the newly created bookmark.
+        """
+        # iloc is the index path of the parent within self.bookmarks.
+        if parent == None:
+            # Not used below: a bookmark without a parent is simply appended.
+            iloc = [len(self.bookmarks)-1]
+        elif isinstance(parent, list):
+            iloc = parent
+        else:
+            # NOTE(review): findBookmark returns None when nothing matches,
+            # in which case the slicing below fails - confirm callers only
+            # pass existing parents.
+            iloc = self.findBookmark(parent)
+
+        dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
+
+        if parent == None:
+            self.bookmarks.append(dest)
+        else:
+            # Walk down to the list that contains the parent.
+            bmparent = self.bookmarks
+            for i in iloc[:-1]:
+                bmparent = bmparent[i]
+            # Children live in a list placed directly after their parent.
+            npos = iloc[-1]+1
+            if npos < len(bmparent) and isinstance(bmparent[npos], list):
+                bmparent[npos].append(dest)
+            else:
+                bmparent.insert(npos, [dest])
+        return dest
+
+    def addNamedDestination(self, title, pagenum):
+        """
+        Add a destination to the output.
+
+        :param str title: Title to use
+        :param int pagenum: Page number this destination points at.
+        """
+
+        # Created as a '/FitH' destination with a fixed top coordinate of 826.
+        dest = Destination(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))
+        self.named_dests.append(dest)
+
+
+class OutlinesObject(list):
+ def __init__(self, pdf, tree, parent=None):
+ list.__init__(self)
+ self.tree = tree
+ self.pdf = pdf
+ self.parent = parent
+
+ def remove(self, index):
+ obj = self[index]
+ del self[index]
+ self.tree.removeChild(obj)
+
+ def add(self, title, pagenum):
+ pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
+ action = DictionaryObject()
+ action.update({
+ NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
+ NameObject('/S') : NameObject('/GoTo')
+ })
+ actionRef = self.pdf._addObject(action)
+ bookmark = TreeObject()
+
+ bookmark.update({
+ NameObject('/A'): actionRef,
+ NameObject('/Title'): createStringObject(title),
+ })
+
+ self.pdf._addObject(bookmark)
+
+ self.tree.addChild(bookmark)
+
+ def removeAll(self):
+ for child in [x for x in self.tree.children()]:
+ self.tree.removeChild(child)
+ self.pop()
diff --git a/vendor/PyPDF2/pagerange.py b/vendor/PyPDF2/pagerange.py
new file mode 100755
index 00000000..ce96ec5f
--- /dev/null
+++ b/vendor/PyPDF2/pagerange.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+"""
+Representation and utils for ranges of PDF file pages.
+
+Copyright (c) 2014, Steve Witham .
+All rights reserved. This software is available under a BSD license;
+see https://github.com/mstamy2/PyPDF2/blob/master/LICENSE
+"""
+
+import re
+from .utils import isString
+
+_INT_RE = r"(0|-?[1-9]\d*)" # A decimal int, don't allow "-0".
+PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
+# groups: 12 34 5 6 7 8
+
+
class ParseError(Exception):
    """Raised when a value cannot be parsed as a page range."""
+
+
PAGE_RANGE_HELP = """Remember, page indices start with zero.
        Page range expression examples:
            :     all pages.                   -1    last page.
            22    just the 23rd page.          :-1   all but the last page.
            0:3   the first three pages.       -2    second-to-last page.
            :3    the first three pages.       -2:   last two pages.
            5:    from the sixth page onward.  -3:-1 third & second to last.
        The third, "stride" or "step" number is also recognized.
            ::2       0 2 4 ... to the end.    3:0:-1    3 2 1 but not 0.
            1:10:2    1 3 5 7 9                2::-1     2 1 0.
            ::-1      all pages in reverse order.
"""


class PageRange(object):
    """
    A slice-like representation of a range of page indices,
    i.e. page numbers, only starting at zero.
    The syntax is like what you would put between brackets [ ].
    The slice is one of the few Python types that can't be subclassed,
    but this class converts to and from slices, and allows similar use.
      o  PageRange(str) parses a string representing a page range.
      o  PageRange(slice) directly "imports" a slice.
      o  to_slice() gives the equivalent slice.
      o  str() and repr() allow printing.
      o  indices(n) is like slice.indices(n).
    """

    def __init__(self, arg):
        """
        Initialize with either a slice -- giving the equivalent page range,
        or a PageRange object -- making a copy,
        or a string like
            "int", "[int]:[int]" or "[int]:[int]:[int]",
            where the brackets indicate optional ints.
        {page_range_help}
        Note the difference between this notation and arguments to slice():
            slice(3) means the first three pages;
            PageRange("3") means the range of only the fourth page.
            However PageRange(slice(3)) means the first three pages.

        Raises ParseError for any other argument.
        """
        if isinstance(arg, slice):
            self._slice = arg
            return

        if isinstance(arg, PageRange):
            self._slice = arg.to_slice()
            return

        m = isString(arg) and re.match(PAGE_RANGE_RE, arg)
        if not m:
            raise ParseError(arg)
        elif m.group(2):
            # Special case: just an int means a range of one page.
            # "-1" cannot use stop = start + 1 (that would be 0, an empty
            # range), so the last page is expressed with an open stop.
            start = int(m.group(2))
            stop = start + 1 if start != -1 else None
            self._slice = slice(start, stop)
        else:
            # Groups 4, 6 and 8 are start, stop and step; empty means None.
            self._slice = slice(*[int(g) if g else None
                                  for g in m.group(4, 6, 8)])

    # Just formatting this when there is __doc__ for __init__
    # (docstrings are stripped under ``python -OO``).
    if __init__.__doc__:
        __init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP)

    @staticmethod
    def valid(input):
        """ True if input is a valid initializer for a PageRange. """
        return isinstance(input, slice) or \
            isinstance(input, PageRange) or \
            (isString(input)
             and bool(re.match(PAGE_RANGE_RE, input)))

    def to_slice(self):
        """ Return the slice equivalent of this page range. """
        return self._slice

    def __str__(self):
        """
        A string like "1:2:3".

        A step-less range covering exactly one page is shortened to the
        bare index, except when start is -1: the string "-1" parses to
        slice(-1, None), so slice(-1, 0) must be written out as "-1:0"
        for str() to name the same range.
        """
        s = self._slice
        if s.step is None:
            if s.start is not None and s.start != -1 \
                    and s.stop == s.start + 1:
                return str(s.start)

            indices = s.start, s.stop
        else:
            indices = s.start, s.stop, s.step
        return ':'.join("" if i is None else str(i) for i in indices)

    def __repr__(self):
        """ A string like "PageRange('1:2:3')". """
        return "PageRange(" + repr(str(self)) + ")"

    def indices(self, n):
        """
        n is the length of the list of pages to choose from.
        Returns arguments for range().  See help(slice.indices).
        """
        return self._slice.indices(n)
+
+
+PAGE_RANGE_ALL = PageRange(":") # The range of all pages.
+
+
def parse_filename_page_ranges(args):
    """
    Given a list of filenames and page ranges, return a list of
    (filename, page_range) pairs.
    First arg must be a filename; other args are filenames, page-range
    expressions, slice objects, or PageRange objects.
    A filename not followed by a page range indicates all pages of the file.

    Raises ValueError if a page range appears before any filename.
    """
    result = []
    current_file = None
    has_range = False
    # The trailing None is an end-of-list sentinel: it is not a valid page
    # range, so it flushes the last filename through the "new filename" path.
    for item in args + [None]:
        if not PageRange.valid(item):
            # New filename or end of list: the previous file gets all of its
            # pages if no explicit range followed it.
            if current_file and not has_range:
                result.append((current_file, PAGE_RANGE_ALL))
            current_file = item
            has_range = False
            continue

        if not current_file:
            raise ValueError("The first argument must be a filename, "
                             "not a page range.")
        result.append((current_file, PageRange(item)))
        has_range = True
    return result
diff --git a/vendor/PyPDF2/pdf.py b/vendor/PyPDF2/pdf.py
new file mode 100755
index 00000000..9979414f
--- /dev/null
+++ b/vendor/PyPDF2/pdf.py
@@ -0,0 +1,3004 @@
+# -*- coding: utf-8 -*-
+#
+# vim: sw=4:expandtab:foldmethod=marker
+#
+# Copyright (c) 2006, Mathieu Fenniak
+# Copyright (c) 2007, Ashish Kulkarni
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+"""
+A pure-Python PDF library with an increasing number of capabilities.
+See README for links to FAQ, documentation, homepage, etc.
+"""
+
+__author__ = "Mathieu Fenniak"
+__author_email__ = "biziqe@mathieu.fenniak.net"
+
+__maintainer__ = "Phaseit, Inc."
+__maintainer_email = "PyPDF2@phaseit.net"
+
+import string
+import math
+import struct
+import sys
+import uuid
+from sys import version_info
+if version_info < ( 3, 0 ):
+ from cStringIO import StringIO
+else:
+ from io import StringIO
+
+if version_info < ( 3, 0 ):
+ BytesIO = StringIO
+else:
+ from io import BytesIO
+
+from . import filters
+from . import utils
+import warnings
+import codecs
+from .generic import *
+from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
+from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning
+
+if version_info < ( 2, 4 ):
+ from sets import ImmutableSet as frozenset
+
+if version_info < ( 2, 5 ):
+ from md5 import md5
+else:
+ from hashlib import md5
+import uuid
+
+
+class PdfFileWriter(object):
+ """
+ This class supports writing PDF files out, given pages produced by another
+ class (typically :class:`PdfFileReader`).
+ """
def __init__(self):
    """
    Set up an empty document: header, object table, page tree root,
    info dictionary and catalog.

    The page tree and the info dictionary are registered as indirect
    objects here; the catalog is only kept in ``_root_object`` and
    ``_root`` stays ``None``.
    """
    self._header = b_("%PDF-1.3")
    self._objects = []  # array of indirect objects

    # The root of our page tree node.
    page_tree = DictionaryObject()
    page_tree.update({
        NameObject("/Type"): NameObject("/Pages"),
        NameObject("/Count"): NumberObject(0),
        NameObject("/Kids"): ArrayObject(),
    })
    self._pages = self._addObject(page_tree)

    # Info object; the producer name is stored as BOM-prefixed UTF-16BE.
    producer = codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be')
    info_dict = DictionaryObject()
    info_dict.update({
        NameObject("/Producer"): createStringObject(producer)
    })
    self._info = self._addObject(info_dict)

    # Root (catalog) object.
    catalog = DictionaryObject()
    catalog.update({
        NameObject("/Type"): NameObject("/Catalog"),
        NameObject("/Pages"): self._pages,
    })
    self._root = None
    self._root_object = catalog
+
def _addObject(self, obj):
    """
    Append *obj* to the object table and return an indirect reference to it.

    The reference's id is the 1-based position of *obj* in ``_objects``;
    its generation is 0.
    """
    table = self._objects
    table.append(obj)
    return IndirectObject(len(table), 0, self)
+
def getObject(self, ido):
    """
    Resolve an indirect reference created by this writer.

    :param ido: reference with ``pdf`` and ``idnum`` attributes.
    :return: the object stored at position ``idnum - 1``.
    :raises ValueError: if the reference belongs to another document.
    """
    if ido.pdf != self:
        raise ValueError("pdf must be self")
    # Object ids are 1-based; the table is 0-based.
    slot = ido.idnum - 1
    return self._objects[slot]
+
def _addPage(self, page, action):
    """
    Register *page* and place its reference in the page tree.

    :param page: dictionary whose ``/Type`` is ``/Page``.
    :param action: callable ``action(kids, page_ref)`` that puts the new
        reference into the ``/Kids`` array (append, insert, ...).
    """
    assert page["/Type"] == "/Page"
    page[NameObject("/Parent")] = self._pages
    page_ref = self._addObject(page)

    page_tree = self.getObject(self._pages)
    action(page_tree["/Kids"], page_ref)
    new_count = page_tree["/Count"] + 1
    page_tree[NameObject("/Count")] = NumberObject(new_count)
+
def addPage(self, page):
    """
    Append a page to this PDF file.  The page is usually acquired from a
    :class:`PdfFileReader` instance.

    :param PageObject page: The page to add to the document. Should be
        an instance of :class:`PageObject`
    """
    append_to_kids = list.append
    self._addPage(page, append_to_kids)
+
def insertPage(self, page, index=0):
    """
    Insert a page into this PDF file.  The page is usually acquired from a
    :class:`PdfFileReader` instance.

    :param PageObject page: The page to add to the document.  This
        argument should be an instance of :class:`PageObject`.
    :param int index: Position at which the page will be inserted.
    """
    def insert_at_index(kids, page_ref):
        return kids.insert(index, page_ref)

    self._addPage(page, insert_at_index)
+
def getPage(self, pageNumber):
    """
    Retrieves a page by number from this PDF file.

    Only the direct ``/Kids`` of the writer's page tree root are indexed.

    :param int pageNumber: The page number to retrieve
        (pages begin at zero)
    :return: the page at the index given by *pageNumber*
    :rtype: :class:`PageObject`
    """
    kids = self.getObject(self._pages)["/Kids"]
    # XXX: crude hack
    page_ref = kids[pageNumber]
    return page_ref.getObject()
+
def getNumPages(self):
    """
    :return: the number of pages, read from the page tree's ``/Count``.
    :rtype: int
    """
    page_tree = self.getObject(self._pages)
    count = page_tree[NameObject("/Count")]
    return int(count)
+
def addBlankPage(self, width=None, height=None):
    """
    Appends a blank page to this PDF file and returns it.  Creating the
    page, including choosing a size when none is given, is delegated to
    ``PageObject.createBlankPage``.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :return: the newly appended page
    :rtype: :class:`PageObject`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    blank = PageObject.createBlankPage(self, width, height)
    self.addPage(blank)
    return blank
+
def insertBlankPage(self, width=None, height=None, index=0):
    """
    Inserts a blank page to this PDF file and returns it.

    If either dimension is missing and a page already exists at *index*,
    both width and height are taken from that page's media box.
    Otherwise the values are passed unchanged to
    ``PageObject.createBlankPage``.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :param int index: Position to add the page.
    :return: the newly inserted page
    :rtype: :class:`PageObject`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    # The size test must be parenthesised: ``and`` binds tighter than
    # ``or``, so without the parentheses a missing width would skip the
    # page-count check and getPage(index) could fail on a document with
    # no page at that index.
    size_missing = width is None or height is None
    if size_missing and (self.getNumPages() - 1) >= index:
        oldpage = self.getPage(index)
        width = oldpage.mediaBox.getWidth()
        height = oldpage.mediaBox.getHeight()
    page = PageObject.createBlankPage(self, width, height)
    self.insertPage(page, index)
    return page
+
def addJS(self, javascript):
    """
    Add Javascript which will launch upon opening this PDF.

    A ``/JavaScript`` action is registered and set on the catalog both as
    ``/OpenAction`` and inside a ``/Names`` dictionary; an existing
    ``/Names`` entry on the catalog is replaced.

    :param str javascript: Your Javascript.

    >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    # Example: This will launch the print window when the PDF is opened.
    """
    action = DictionaryObject()
    action.update({
        NameObject("/Type"): NameObject("/Action"),
        NameObject("/S"): NameObject("/JavaScript"),
        NameObject("/JS"): NameObject("(%s)" % javascript)
    })
    action_ref = self._addObject(action)

    # We need a name for parameterized javascript in the pdf file, but it can be anything.
    entry_name = str(uuid.uuid4())

    names_array = ArrayObject([createStringObject(entry_name), action_ref])
    name_tree = DictionaryObject()
    name_tree.update({
        NameObject("/JavaScript"): DictionaryObject({
            NameObject("/Names"): names_array
        })
    })
    self._addObject(name_tree)

    self._root_object.update({
        NameObject("/OpenAction"): action_ref,
        NameObject("/Names"): name_tree
    })
+
def addAttachment(self, fname, fdata):
    """
    Embed a file inside the PDF.

    The catalog's ``/Names`` entry is replaced by a dictionary holding
    this one embedded file.

    :param str fname: The filename to display.
    :param str fdata: The data in the file.

    Reference:
    https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Section 7.11.3
    """
    # We need 3 entries:
    # * The file's data
    # * The /Filespec entry
    # * The file's name, which goes in the Catalog

    # The entry for the file.  Sample:
    #   8 0 obj
    #   <<
    #    /Length 12
    #    /Type /EmbeddedFile
    #   >>
    #   stream
    #   Hello world!
    #   endstream
    #   endobj
    embedded_stream = DecodedStreamObject()
    embedded_stream.setData(fdata)
    embedded_stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile")
    })

    # The Filespec entry.  Sample:
    #   7 0 obj
    #   <<
    #    /Type /Filespec
    #    /F (hello.txt)
    #    /EF << /F 8 0 R >>
    #   >>
    ef_dict = DictionaryObject()
    ef_dict.update({NameObject("/F"): embedded_stream})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),  # Perhaps also try TextStringObject
        NameObject("/EF"): ef_dict
    })

    # Then create the entry for the root, as it needs a reference to the
    # Filespec.  Sample:
    #   1 0 obj
    #   <<
    #    /Type /Catalog
    #    /Outlines 2 0 R
    #    /Pages 3 0 R
    #    /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    #   >>
    #   endobj
    names_dict = DictionaryObject()
    names_dict.update({
        NameObject("/Names"): ArrayObject([createStringObject(fname), filespec])
    })

    embedded_files = DictionaryObject()
    embedded_files.update({
        NameObject("/EmbeddedFiles"): names_dict
    })

    # Update the root
    self._root_object.update({
        NameObject("/Names"): embedded_files
    })
+
def appendPagesFromReader(self, reader, after_page_append=None):
    """
    Copy every page from *reader* to the end of this writer.

    :param reader: a PdfFileReader object whose pages are appended to
        this writer object.
    :callback after_page_append (function): Optional.  When callable, it
        is invoked after each page is appended.  Callback signature:

        :param writer_pageref (PDF page reference): The page as fetched
            back from the writer after the append.
    """
    pages_in_reader = reader.getNumPages()
    # Taken before any append, so base + i is where reader page i lands.
    base = self.getNumPages()

    for i in range(pages_in_reader):
        self.addPage(reader.getPage(i))
        appended = self.getPage(base + i)
        # Trigger callback, pass writer page as parameter
        if callable(after_page_append):
            after_page_append(appended)
+
def updatePageFormFieldValues(self, page, fields):
    '''
    Update the form field values for a given page from a fields dictionary.

    For every annotation on the page whose ``/T`` equals a key in
    *fields*, ``/V`` is set to the matching value as a text string.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.  It must have an ``/Annots`` entry.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    '''
    for annot_ref in page['/Annots']:
        annotation = annot_ref.getObject()
        for field_name in fields:
            if annotation.get('/T') == field_name:
                new_value = TextStringObject(fields[field_name])
                annotation.update({NameObject("/V"): new_value})
+
def cloneReaderDocumentRoot(self, reader):
    '''
    Use the reader's document root as this writer's root object.

    The ``/Root`` entry of the reader's trailer is assigned as-is; no
    copy is made here.

    :param reader: PdfFileReader whose document root should be used.
    '''
    source_root = reader.trailer['/Root']
    self._root_object = source_root
+
def cloneDocumentFromReader(self, reader, after_page_append=None):
    '''
    Create a copy (clone) of a document from a PDF file reader.

    Runs :meth:`cloneReaderDocumentRoot` first and then
    :meth:`appendPagesFromReader`.

    :param reader: PDF file reader instance from which the clone
        should be created.
    :callback after_page_append (function): Passed through to
        ``appendPagesFromReader``, which documents when it is invoked.
        Callback signature:

        :param writer_pageref (PDF page reference): Reference to the page just
            appended to the document.
    '''
    # Root first, then pages.
    steps = (
        lambda: self.cloneReaderDocumentRoot(reader),
        lambda: self.appendPagesFromReader(reader, after_page_append),
    )
    for step in steps:
        step()
+
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    The encryption dictionary is registered as an indirect object in
    ``self._encrypt``; the key is kept in ``self._encrypt_key`` and the
    generated file identifiers in ``self._ID``.

    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions.  By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption.  When false, 40bit encryption will be used.  By default,
        this flag is on.
    """
    import time, random
    if owner_pwd is None:
        owner_pwd = user_pwd

    # (/V, /R, key length in bytes) for the two supported strengths.
    if use_128bit:
        V, rev, keylen = 2, 3, int(128 / 8)
    else:
        V, rev, keylen = 1, 2, int(40 / 8)

    # permit everything:
    P = -1
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))

    # Two identifiers: MD5 of the current time and MD5 of a random float.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    self._ID = ArrayObject((ID_1, ID_2))

    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        assert rev == 3
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)

    enc_dict = DictionaryObject()
    enc_dict[NameObject("/Filter")] = NameObject("/Standard")
    enc_dict[NameObject("/V")] = NumberObject(V)
    if V == 2:
        # /Length is given in bits and only written for /V 2.
        enc_dict[NameObject("/Length")] = NumberObject(keylen * 8)
    enc_dict[NameObject("/R")] = NumberObject(rev)
    enc_dict[NameObject("/O")] = ByteStringObject(O)
    enc_dict[NameObject("/U")] = ByteStringObject(U)
    enc_dict[NameObject("/P")] = NumberObject(P)

    self._encrypt = self._addObject(enc_dict)
    self._encrypt_key = key
+
+ def write(self, stream):
+ """
+ Writes the collection of pages added to this object out as a PDF file.
+
+ :param stream: An object to write the file to. The object must support
+ the write method and the tell method, similar to a file object.
+ """
+ if hasattr(stream, 'mode') and 'b' not in stream.mode:
+ warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name)
+ debug = False
+ import struct
+
+ if not self._root:
+ self._root = self._addObject(self._root_object)
+
+ externalReferenceMap = {}
+
+ # PDF objects sometimes have circular references to their /Page objects
+ # inside their object tree (for example, annotations). Those will be
+ # indirect references to objects that we've recreated in this PDF. To
+ # address this problem, PageObject's store their original object
+ # reference number, and we add it to the external reference map before
+ # we sweep for indirect references. This forces self-page-referencing
+ # trees to reference the correct new object location, rather than
+ # copying in a new copy of the page object.
+ for objIndex in range(len(self._objects)):
+ obj = self._objects[objIndex]
+ if isinstance(obj, PageObject) and obj.indirectRef != None:
+ data = obj.indirectRef
+ if data.pdf not in externalReferenceMap:
+ externalReferenceMap[data.pdf] = {}
+ if data.generation not in externalReferenceMap[data.pdf]:
+ externalReferenceMap[data.pdf][data.generation] = {}
+ externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self)
+
+ self.stack = []
+ if debug: print(("ERM:", externalReferenceMap, "root:", self._root))
+ self._sweepIndirectReferences(externalReferenceMap, self._root)
+ del self.stack
+
+ # Begin writing:
+ object_positions = []
+ stream.write(self._header + b_("\n"))
+ for i in range(len(self._objects)):
+ idnum = (i + 1)
+ obj = self._objects[i]
+ object_positions.append(stream.tell())
+ stream.write(b_(str(idnum) + " 0 obj\n"))
+ key = None
+ if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
+ pack1 = struct.pack("` for details.
+ """
+ pageRef = self.getObject(self._pages)['/Kids'][pagenum]
+ action = DictionaryObject()
+ zoomArgs = []
+ for a in args:
+ if a is not None:
+ zoomArgs.append(NumberObject(a))
+ else:
+ zoomArgs.append(NullObject())
+ dest = Destination(NameObject("/"+title + " bookmark"), pageRef, NameObject(fit), *zoomArgs)
+ destArray = dest.getDestArray()
+ action.update({
+ NameObject('/D') : destArray,
+ NameObject('/S') : NameObject('/GoTo')
+ })
+ actionRef = self._addObject(action)
+
+ outlineRef = self.getOutlineRoot()
+
+ if parent == None:
+ parent = outlineRef
+
+ bookmark = TreeObject()
+
+ bookmark.update({
+ NameObject('/A'): actionRef,
+ NameObject('/Title'): createStringObject(title),
+ })
+
+ if color is not None:
+ bookmark.update({NameObject('/C'): ArrayObject([FloatObject(c) for c in color])})
+
+ format = 0
+ if italic:
+ format += 1
+ if bold:
+ format += 2
+ if format:
+ bookmark.update({NameObject('/F'): NumberObject(format)})
+
+ bookmarkRef = self._addObject(bookmark)
+
+ parent = parent.getObject()
+ parent.addChild(bookmarkRef, self)
+
+ return bookmarkRef
+
def addNamedDestinationObject(self, dest):
    """
    Register *dest* and list it in the named-destination root.

    The destination's ``/Title`` followed by its new reference is
    appended to the array returned by ``getNamedDestRoot()``.

    :param dest: destination object with a ``/Title`` entry.
    :return: the indirect reference created for *dest*.
    """
    dest_ref = self._addObject(dest)

    names = self.getNamedDestRoot()
    names.extend([dest['/Title'], dest_ref])

    return dest_ref
+
def addNamedDestination(self, title, pagenum):
    """
    Create a ``/GoTo`` entry for page *pagenum* and list it under *title*
    in the named-destination root.

    The target uses a ``/FitH`` fit with a fixed top coordinate of 826.

    :param title: name appended to the named-destination array.
    :param int pagenum: index into the page tree's ``/Kids`` array.
    :return: the indirect reference created for the new entry.
    """
    page_ref = self.getObject(self._pages)['/Kids'][pagenum]

    goto = DictionaryObject()
    goto.update({
        NameObject('/D'): ArrayObject([page_ref, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S'): NameObject('/GoTo'),
    })
    goto_ref = self._addObject(goto)

    names = self.getNamedDestRoot()
    names.extend([title, goto_ref])

    return goto_ref
+
def removeLinks(self):
    """
    Removes links and annotations from this output.

    Deletes the ``/Annots`` entry from every page in the page tree
    root's ``/Kids`` array.
    """
    for kid in self.getObject(self._pages)['/Kids']:
        page_dict = self.getObject(kid)
        if "/Annots" not in page_dict:
            continue
        del page_dict['/Annots']
+
def removeImages(self, ignoreByteStringObject=False):
    """
    Removes images from this output.

    Each page's content stream is rebuilt without the graphics operators
    listed below; the filtered stream replaces the page's ``/Contents``.
    Every page is expected to have a ``/Contents`` entry.

    :param bool ignoreByteStringObject: optional parameter
        to ignore ByteString Objects.  When true, operands of the text
        showing operators that are not ``TextStringObject`` instances are
        replaced with an empty ``TextStringObject``.
    """
    pages = self.getObject(self._pages)['/Kids']
    for j in range(len(pages)):
        page = pages[j]
        pageRef = self.getObject(page)
        content = pageRef['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pageRef)

        # Operations that survive the filter, in their original order.
        _operations = []
        # True between a 'q' operator and the next 'Q'.  This is a plain
        # flag, not a nesting counter.
        seq_graphics = False
        for operands, operator in content.operations:
            # Text operators: the string operand is operands[0], except
            # for '"' where it is operands[2], and 'TJ' where operands[0]
            # is an array of items.
            if operator == b_('Tj'):
                text = operands[0]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[0] = TextStringObject()
            elif operator == b_("'"):
                text = operands[0]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[0] = TextStringObject()
            elif operator == b_('"'):
                text = operands[2]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                for i in range(len(operands[0])):
                    if ignoreByteStringObject:
                        if not isinstance(operands[0][i], TextStringObject):
                            operands[0][i] = TextStringObject()

            if operator == b_('q'):
                seq_graphics = True
            if operator == b_('Q'):
                seq_graphics = False
            if seq_graphics:
                # Inside a q ... Q section, drop the listed operators.
                if operator in [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'),
                        b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'),
                        b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]:
                    continue
            # 're' is dropped everywhere, not only inside q ... Q.
            if operator == b_('re'):
                continue
            _operations.append((operands, operator))

        content.operations = _operations
        pageRef.__setitem__(NameObject('/Contents'), content)
+
def removeText(self, ignoreByteStringObject=False):
    """
    Removes text from this output.

    The string operands of the text showing operators (``Tj``, ``'``,
    ``"`` and ``TJ``) are replaced with an empty ``TextStringObject``;
    the operators themselves stay in the content stream.  Every page is
    expected to have a ``/Contents`` entry.

    :param bool ignoreByteStringObject: optional parameter
        to ignore ByteString Objects.  When false (the default) only
        ``TextStringObject`` operands are blanked; when true,
        ``ByteStringObject`` operands are blanked as well.
    """
    pages = self.getObject(self._pages)['/Kids']
    for j in range(len(pages)):
        page = pages[j]
        pageRef = self.getObject(page)
        content = pageRef['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pageRef)
        for operands,operator in content.operations:
            # The string operand is operands[0], except for '"' where it
            # is operands[2], and 'TJ' where operands[0] is an array.
            if operator == b_('Tj'):
                text = operands[0]
                if not ignoreByteStringObject:
                    if isinstance(text, TextStringObject):
                        operands[0] = TextStringObject()
                else:
                    if isinstance(text, TextStringObject) or \
                            isinstance(text, ByteStringObject):
                        operands[0] = TextStringObject()
            elif operator == b_("'"):
                text = operands[0]
                if not ignoreByteStringObject:
                    if isinstance(text, TextStringObject):
                        operands[0] = TextStringObject()
                else:
                    if isinstance(text, TextStringObject) or \
                            isinstance(text, ByteStringObject):
                        operands[0] = TextStringObject()
            elif operator == b_('"'):
                text = operands[2]
                if not ignoreByteStringObject:
                    if isinstance(text, TextStringObject):
                        operands[2] = TextStringObject()
                else:
                    if isinstance(text, TextStringObject) or \
                            isinstance(text, ByteStringObject):
                        operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                for i in range(len(operands[0])):
                    if not ignoreByteStringObject:
                        if isinstance(operands[0][i], TextStringObject):
                            operands[0][i] = TextStringObject()
                    else:
                        if isinstance(operands[0][i], TextStringObject) or \
                                isinstance(operands[0][i], ByteStringObject):
                            operands[0][i] = TextStringObject()

        # Store the ContentStream back on the page, operands edited in place.
        pageRef.__setitem__(NameObject('/Contents'), content)
+
def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args):
    """
    Add an internal link from a rectangular area to the specified page.

    The link is registered as an annotation object and appended to the
    source page's ``/Annots`` array, which is created if absent.

    :param int pagenum: index of the page on which to place the link.
    :param int pagedest: index of the page to which the link should go.
    :param rect: :class:`RectangleObject` or array of four
        integers specifying the clickable rectangular area
        ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
    :param border: if provided, an array describing border-drawing
        properties. See the PDF spec for details. No border will be
        drawn if this argument is omitted.
    :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need
        to be supplied. Passing ``None`` will be read as a null value for that coordinate.

    Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details):
         /Fit       No additional arguments
         /XYZ       [left] [top] [zoomFactor]
         /FitH      [top]
         /FitV      [left]
         /FitR      [left] [bottom] [right] [top]
         /FitB      No additional arguments
         /FitBH     [top]
         /FitBV     [left]
    """
    kids = self.getObject(self._pages)['/Kids']
    link_page_ref = kids[pagenum]
    dest_page_ref = kids[pagedest]  # TODO: switch for external link
    source_page = self.getObject(link_page_ref)

    if border is None:
        # No border given: three zeros.
        border_items = [NumberObject(0)] * 3
    else:
        border_items = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            # The optional fourth element is a dash pattern array.
            border_items.append(ArrayObject([NameObject(n) for n in border[3]]))

    if isString(rect):
        rect = NameObject(rect)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    # None arguments become PDF nulls, everything else a number.
    zoom_args = [NullObject() if a is None else NumberObject(a) for a in args]
    # TODO: create a better name for the link
    destination = Destination(NameObject("/LinkName"), dest_page_ref, NameObject(fit), *zoom_args)
    dest_array = destination.getDestArray()

    link = DictionaryObject()
    link.update({
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Subtype'): NameObject('/Link'),
        NameObject('/P'): link_page_ref,
        NameObject('/Rect'): rect,
        NameObject('/Border'): ArrayObject(border_items),
        NameObject('/Dest'): dest_array
    })
    link_ref = self._addObject(link)

    if "/Annots" in source_page:
        source_page['/Annots'].append(link_ref)
    else:
        source_page[NameObject('/Annots')] = ArrayObject([link_ref])
+
+ _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight']
+
def getPageLayout(self):
    """
    Get the page layout.
    See :meth:`setPageLayout()` for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: str, None if not specified
    """
    try:
        layout = self._root_object['/PageLayout']
    except KeyError:
        # The catalog has no /PageLayout entry.
        layout = None
    return layout
+
def setPageLayout(self, layout):
    """
    Set the page layout

    A value that is not already a ``NameObject`` is converted to one; if
    it is not among the valid layouts a warning is issued, but the value
    is still stored.

    :param str layout: The page layout to be used

    Valid layouts are:
         /NoLayout        Layout explicitly not specified
         /SinglePage      Show one page at a time
         /OneColumn       Show one column at a time
         /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
         /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
         /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
         /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
    """
    already_name = isinstance(layout, NameObject)
    if not already_name:
        known = self._valid_layouts
        if layout not in known:
            warnings.warn("Layout should be one of: {}".format(', '.join(known)))
        layout = NameObject(layout)
    self._root_object.update({NameObject('/PageLayout'): layout})
+
+ pageLayout = property(getPageLayout, setPageLayout)
+ """Read and write property accessing the :meth:`getPageLayout()`
+ and :meth:`setPageLayout()` methods."""
+
+ _valid_modes = ['/UseNone', '/UseOutlines', '/UseThumbs', '/FullScreen', '/UseOC', '/UseAttachments']
+
def getPageMode(self):
    """
    Get the page mode.
    See :meth:`setPageMode()` for a description
    of valid modes.

    :return: Page mode currently being used.
    :rtype: str, None if not specified
    """
    try:
        mode = self._root_object['/PageMode']
    except KeyError:
        # The catalog has no /PageMode entry.
        mode = None
    return mode
+
def setPageMode(self, mode):
    """
    Set the page mode.

    A value that is not already a ``NameObject`` is converted to one; if
    it is not among the valid modes a warning is issued, but the value
    is still stored.

    :param str mode: The page mode to use.

    Valid modes are:
        /UseNone         Do not show outlines or thumbnails panels
        /UseOutlines     Show outlines (aka bookmarks) panel
        /UseThumbs       Show page thumbnails panel
        /FullScreen      Fullscreen view
        /UseOC           Show Optional Content Group (OCG) panel
        /UseAttachments  Show attachments panel
    """
    already_name = isinstance(mode, NameObject)
    if not already_name:
        known = self._valid_modes
        if mode not in known:
            warnings.warn("Mode should be one of: {}".format(', '.join(known)))
        mode = NameObject(mode)
    self._root_object.update({NameObject('/PageMode'): mode})
+
+ pageMode = property(getPageMode, setPageMode)
+ """Read and write property accessing the :meth:`getPageMode()`
+ and :meth:`setPageMode()` methods."""
+
+
+class PdfFileReader(object):
+ """
+ Initializes a PdfFileReader object. This operation can take some time, as
+ the PDF stream's cross-reference tables are read into memory.
+
+ :param stream: A File object or an object that supports the standard read
+ and seek methods similar to a File object. Could also be a
+ string representing a path to a PDF file.
+ :param bool strict: Determines whether user should be warned of all
+ problems and also causes some correctable problems to be fatal.
+ Defaults to ``True``.
+ :param warndest: Destination for logging warnings (defaults to
+ ``sys.stderr``).
+ :param bool overwriteWarnings: Determines whether to override Python's
+ ``warnings.py`` module with a custom implementation (defaults to
+ ``True``).
+ """
+    def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True):
+        """
+        Set up reader state and parse *stream*.
+
+        :param stream: file-like object, or a path string; a path is read
+            fully into an in-memory ``BytesIO`` before parsing.
+        :param bool strict: stored on the instance and consulted by the
+            parsing code.
+        :param warndest: file object that warnings are written to when
+            *overwriteWarnings* is true (``sys.stderr`` if ``None``).
+        :param bool overwriteWarnings: if true, replace
+            ``warnings.showwarning`` with a writer targeting *warndest*.
+        """
+        if overwriteWarnings:
+            # have to dynamically override the default showwarning since there are no
+            # public methods that specify the 'file' parameter
+            def _showwarning(message, category, filename, lineno, file=warndest, line=None):
+                if file is None:
+                    file = sys.stderr
+                try:
+                    file.write(formatWarning(message, category, filename, lineno, line))
+                except IOError:
+                    # best effort: a broken warning destination must not abort parsing
+                    pass
+            warnings.showwarning = _showwarning
+        self.strict = strict
+        self.flattenedPages = None
+        self.resolvedObjects = {}
+        self.xrefIndex = 0
+        self._pageId2Num = None # map page IndirectRef number to Page Number
+        # getObject() reads this flag and self.stream, and read() passes
+        # `self` to readObject(); both must therefore exist before read()
+        # runs, otherwise an object lookup during parsing fails with
+        # AttributeError instead of a PDF error.
+        self._override_encryption = False
+        if hasattr(stream, 'mode') and 'b' not in stream.mode:
+            warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
+        if isString(stream):
+            # the context manager closes the handle even if reading fails
+            with open(stream, 'rb') as fileobj:
+                stream = BytesIO(b_(fileobj.read()))
+        self.stream = stream
+        self.read(stream)
+
+    def getDocumentInfo(self):
+        """
+        Return a copy of the document information dictionary, if present.
+
+        Only the trailer's ``/Info`` dictionary is consulted; metadata
+        streams, which some PDF files use instead, are not read here.
+
+        :return: the document information of this PDF file
+        :rtype: :class:`DocumentInformation` or ``None`` if none exists.
+        """
+        if "/Info" in self.trailer:
+            rawInfo = self.trailer['/Info']
+            docInfo = DocumentInformation()
+            docInfo.update(rawInfo)
+            return docInfo
+        return None
+
+    documentInfo = property(lambda self: self.getDocumentInfo(), None, None)
+    """Read-only property that accesses the :meth:`getDocumentInfo()` function."""
+
+    def getXmpMetadata(self):
+        """
+        Fetch the XMP (Extensible Metadata Platform) data attached to the
+        document root.
+
+        The encryption override flag is raised for the duration of the
+        lookup and always lowered again afterwards.
+
+        :return: a :class:`XmpInformation`
+            instance that can be used to access XMP metadata from the document.
+        :rtype: :class:`XmpInformation` or
+            ``None`` if no metadata was found on the document root.
+        """
+        try:
+            self._override_encryption = True
+            root = self.trailer["/Root"]
+            xmp = root.getXmpMetadata()
+            return xmp
+        finally:
+            self._override_encryption = False
+
+    xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None)
+    """
+    Read-only property that accesses the
+    :meth:`getXmpMetadata()` function.
+    """
+
+    def getNumPages(self):
+        """
+        Calculates the number of pages in this PDF file.
+
+        For an encrypted file, decryption with an empty password is
+        attempted and the ``/Count`` entry of the page tree root is
+        returned. Otherwise the flattened page list is built on first use
+        and its length is returned.
+
+        :return: number of pages
+        :rtype: int
+        :raises PdfReadError: if file is encrypted and restrictions prevent
+            this action.
+        """
+        if self.isEncrypted:
+            # Flattened pages will not work on an Encrypted PDF;
+            # the PDF file's page count is used in this case.
+            try:
+                self._override_encryption = True
+                self.decrypt('')
+                return self.trailer["/Root"]["/Pages"]["/Count"]
+            except Exception:
+                # Not a bare ``except``: KeyboardInterrupt and SystemExit
+                # must propagate rather than be reported as a PDF error.
+                raise utils.PdfReadError("File has not been decrypted")
+            finally:
+                self._override_encryption = False
+        # Unencrypted: the original method (flattened page count) is used.
+        if self.flattenedPages is None:
+            self._flatten()
+        return len(self.flattenedPages)
+
+    numPages = property(lambda self: self.getNumPages(), None, None)
+    """
+    Read-only property that accesses the
+    :meth:`getNumPages()` function.
+    """
+
+    def getPage(self, pageNumber):
+        """
+        Look up a single page of this PDF file by index.
+
+        The flattened page list is built on the first call.
+
+        :param int pageNumber: The page number to retrieve
+            (pages begin at zero)
+        :return: a :class:`PageObject` instance.
+        :rtype: :class:`PageObject`
+        """
+        ## ensure that we're not trying to access an encrypted PDF
+        #assert not self.trailer.has_key("/Encrypt")
+        if self.flattenedPages is None:
+            self._flatten()
+        allPages = self.flattenedPages
+        return allPages[pageNumber]
+
+ # The lambda looks up getNamedDestinations at call time, so the method can
+ # be defined further down in the class body.
+ namedDestinations = property(lambda self:
+ self.getNamedDestinations(), None, None)
+ """
+ Read-only property that accesses the
+ :meth:`getNamedDestinations()` function.
+ """
+
+ # A select group of relevant field attributes. For the complete list,
+ # see section 8.6.2 of the PDF 1.7 reference.
+
+    def getFields(self, tree = None, retval = None, fileobj = None):
+        """
+        Collect interactive form field data from this PDF, if it has any.
+        The *tree* and *retval* parameters are for recursive use.
+
+        :param fileobj: A file object (usually a text file) to write
+            a report to on all interactive form fields found.
+        :return: A dictionary where each key is a field name, and each
+            value is a :class:`Field` object. By
+            default, the mapping name is used for keys.
+        :rtype: dict, or ``None`` if form data could not be located.
+        """
+        fieldAttributes = {"/FT" : "Field Type", "/Parent" : "Parent",
+                       "/T" : "Field Name", "/TU" : "Alternate Field Name",
+                       "/TM" : "Mapping Name", "/Ff" : "Field Flags",
+                       "/V" : "Value", "/DV" : "Default Value"}
+        if retval is None:
+            # top-level call: start from the catalog's AcroForm tree
+            retval = {}
+            catalog = self.trailer["/Root"]
+            if "/AcroForm" not in catalog:
+                return None
+            tree = catalog["/AcroForm"]
+        if tree is None:
+            return retval
+
+        self._checkKids(tree, retval, fileobj)
+        # Tree is itself a field if it carries any known field attribute
+        if any(attr in tree for attr in fieldAttributes):
+            self._buildField(tree, retval, fileobj, fieldAttributes)
+
+        if "/Fields" in tree:
+            for fieldRef in tree["/Fields"]:
+                self._buildField(fieldRef.getObject(), retval, fileobj, fieldAttributes)
+
+        return retval
+
+    def _buildField(self, field, retval, fileobj, fieldAttributes):
+        """
+        Add *field* to *retval* after descending into its kids.
+
+        The key is the field's ``/TM`` entry, or its ``/T`` entry when
+        ``/TM`` is missing; a field with neither is skipped. When *fileobj*
+        is truthy a report of the field is written to it.
+        """
+        self._checkKids(field, retval, fileobj)
+        for nameAttr in ("/TM", "/T"):
+            try:
+                key = field[nameAttr]
+            except KeyError:
+                continue
+            break
+        else:
+            # Ignore no-name field for now
+            return
+        if fileobj:
+            self._writeField(fileobj, field, fieldAttributes)
+            fileobj.write("\n")
+        retval[key] = Field(field)
+
+    def _checkKids(self, tree, retval, fileobj):
+        """Run :meth:`getFields()` on every ``/Kids`` entry of *tree*, if any."""
+        if "/Kids" not in tree:
+            return
+        # recurse down the tree
+        for child in tree["/Kids"]:
+            self.getFields(child.getObject(), retval, fileobj)
+
+    def _writeField(self, fileobj, field, fieldAttributes):
+        """
+        Write one ``label: value`` line to *fileobj* for each attribute
+        that *field* has, in a fixed order. Labels come from
+        *fieldAttributes*; missing attributes produce no line.
+        """
+        # Make the field type value more clear
+        typeNames = {"/Btn":"Button", "/Tx":"Text", "/Ch": "Choice",
+                     "/Sig":"Signature"}
+        for attr in ("/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"):
+            label = fieldAttributes[attr]
+            try:
+                if attr == "/FT":
+                    fieldType = field[attr]
+                    if fieldType in typeNames:
+                        fileobj.write(label + ": " + typeNames[fieldType] + "\n")
+                elif attr == "/Parent":
+                    # Let's just write the name of the parent
+                    parent = field["/Parent"]
+                    try:
+                        parentName = parent["/TM"]
+                    except KeyError:
+                        parentName = parent["/T"]
+                    fileobj.write(label + ": " + parentName + "\n")
+                else:
+                    fileobj.write(label + ": " + str(field[attr]) + "\n")
+            except KeyError:
+                # Field attribute is N/A or unknown, so don't write anything
+                pass
+
+    def getFormTextFields(self):
+        """
+        Retrieves form fields from the document with textual data (inputs, dropdowns).
+
+        :return: a dictionary mapping the ``/T`` entry of every field whose
+            ``/FT`` is ``/Tx`` to its ``/V`` entry (``None`` when the field
+            has no ``/V``). Empty when the document has no form fields.
+        :rtype: dict
+        """
+        # Retrieve document form fields
+        formfields = self.getFields()
+        if formfields is None:
+            # getFields() returns None when the catalog has no /AcroForm;
+            # iterating over that would raise TypeError.
+            return {}
+        return dict(
+            (field['/T'], field.get('/V'))
+            for field in formfields.values()
+            if field.get('/FT') == '/Tx'
+        )
+
+    def getNamedDestinations(self, tree=None, retval=None):
+        """
+        Collect the named destinations defined in the document.
+
+        *tree* and *retval* are supplied by the recursive calls made while
+        walking ``/Kids``; a call without *retval* starts from the catalog.
+
+        :return: a dictionary which maps names to
+            :class:`Destinations`.
+        :rtype: dict
+        """
+        if retval is None:
+            retval = {}
+            catalog = self.trailer["/Root"]
+
+            # get the name tree
+            if "/Dests" in catalog:
+                tree = catalog["/Dests"]
+            elif "/Names" in catalog:
+                nameDict = catalog['/Names']
+                if "/Dests" in nameDict:
+                    tree = nameDict['/Dests']
+
+        if tree is None:
+            return retval
+
+        if "/Kids" in tree:
+            # recurse down the tree
+            for child in tree["/Kids"]:
+                self.getNamedDestinations(child.getObject(), retval)
+
+        if "/Names" in tree:
+            # /Names is a flat array of alternating key and value entries
+            entries = tree["/Names"]
+            for pos in range(0, len(entries), 2):
+                key = entries[pos].getObject()
+                target = entries[pos+1].getObject()
+                if isinstance(target, DictionaryObject) and '/D' in target:
+                    target = target['/D']
+                dest = self._buildDestination(key, target)
+                if dest is not None:
+                    retval[key] = dest
+
+        return retval
+
+ outlines = property(lambda self: self.getOutlines(), None, None)
+ """
+ Read-only property that accesses the
+ :meth:`getOutlines()` function.
+ """
+
+ # Builds a nested list: each outline entry is appended to the list, and an
+ # entry's children (found via its /First) are appended as a sub-list.
+ # *node* and *outlines* are passed by the recursive call further down; the
+ # property above calls this method with no arguments.
+ def getOutlines(self, node=None, outlines=None):
+ """
+ Retrieves the document outline present in the document.
+
+ :return: a nested list of :class:`Destinations`.
+ """
+ if outlines == None:
+ outlines = []
+ catalog = self.trailer["/Root"]
+
+ # get the outline dictionary and named destinations
+ if "/Outlines" in catalog:
+ try:
+ lines = catalog["/Outlines"]
+ except utils.PdfReadError:
+ # this occurs if the /Outlines object reference is incorrect
+ # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf
+ # so continue to load the file without the Bookmarks
+ return outlines
+
+ if "/First" in lines:
+ node = lines["/First"]
+ # cached on the instance because _buildOutline() looks names up in it
+ self._namedDests = self.getNamedDestinations()
+
+ if node == None:
+ return outlines
+
+ # see if there are any more outlines
+ # (siblings are chained through /Next; the loop ends at the first node without one)
+ while True:
+ outline = self._buildOutline(node)
+ if outline:
+ outlines.append(outline)
+
+ # check for sub-outlines
+ if "/First" in node:
+ subOutlines = []
+ self.getOutlines(node["/First"], subOutlines)
+ if subOutlines:
+ outlines.append(subOutlines)
+
+ if "/Next" not in node:
+ break
+ node = node["/Next"]
+
+ return outlines
+
+    def _getPageNumberByIndirect(self, indirectRef):
+        """
+        Map an indirect reference (or a bare object id) to a page index.
+
+        The id-to-index table is generated from ``self.pages`` on first use
+        and kept in ``_pageId2Num``. Returns -1 for an unknown id.
+        """
+        if self._pageId2Num is None:
+            self._pageId2Num = dict(
+                (page.indirectRef.idnum, pageNum)
+                for pageNum, page in enumerate(self.pages)
+            )
+
+        idnum = indirectRef if isinstance(indirectRef, int) else indirectRef.idnum
+        return self._pageId2Num.get(idnum, -1)
+
+    def getPageNumber(self, page):
+        """
+        Find the index of *page* within this document.
+
+        :param PageObject page: The page to get page number. Should be
+            an instance of :class:`PageObject`
+        :return: the page number or -1 if page not found
+        :rtype: int
+        """
+        return self._getPageNumberByIndirect(page.indirectRef)
+
+    def getDestinationPageNumber(self, destination):
+        """
+        Find the index of the page that *destination* points at.
+
+        :param Destination destination: The destination to get page number.
+             Should be an instance of
+             :class:`Destination`
+        :return: the page number or -1 if page not found
+        :rtype: int
+        """
+        return self._getPageNumberByIndirect(destination.page)
+
+    def _buildDestination(self, title, array):
+        """
+        Create a ``Destination`` named *title* from a destination array:
+        the first two items are the page and the type, anything after them
+        is passed on as extra positional arguments.
+        """
+        page, typ = array[0:2]
+        extraArgs = array[2:]
+        return Destination(title, page, typ, *extraArgs)
+
+    def _buildOutline(self, node):
+        """
+        Turn one outline *node* into a destination, or ``None`` if the node
+        has no title or no usable destination.
+
+        :raises PdfReadError: if the destination is neither an array nor
+            the name of a known named destination.
+        """
+        dest = title = None
+
+        if "/Title" in node:
+            if "/A" in node:
+                # Action, section 8.5 (only type GoTo supported)
+                title = node["/Title"]
+                action = node["/A"]
+                if action["/S"] == "/GoTo":
+                    dest = action["/D"]
+            elif "/Dest" in node:
+                # Destination, section 8.2.1
+                title = node["/Title"]
+                dest = node["/Dest"]
+
+        # if destination found, then create outline
+        if not dest:
+            return None
+        if isinstance(dest, ArrayObject):
+            return self._buildDestination(title, dest)
+        if isString(dest) and dest in self._namedDests:
+            outline = self._namedDests[dest]
+            outline[NameObject("/Title")] = title
+            return outline
+        raise utils.PdfReadError("Unexpected destination %r" % dest)
+
+ # Each access builds a new virtual list from the bound getNumPages and
+ # getPage methods.
+ pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage),
+ None, None)
+ """
+ Read-only property that emulates a list based upon the
+ :meth:`getNumPages()` and
+ :meth:`getPage()` methods.
+ """
+
+    def getPageLayout(self):
+        """
+        Return the page layout recorded in the document root.
+        The valid layouts are described under :meth:`setPageLayout()`.
+
+        :return: Page layout currently being used.
+        :rtype: ``str``, ``None`` if not specified
+        """
+        try:
+            layout = self.trailer['/Root']['/PageLayout']
+        except KeyError:
+            layout = None
+        return layout
+
+    pageLayout = property(getPageLayout)
+    """Read-only property accessing the
+    :meth:`getPageLayout()` method."""
+
+    def getPageMode(self):
+        """
+        Return the page mode recorded in the document root.
+        The valid modes are described under :meth:`setPageMode()`.
+
+        :return: Page mode currently being used.
+        :rtype: ``str``, ``None`` if not specified
+        """
+        try:
+            mode = self.trailer['/Root']['/PageMode']
+        except KeyError:
+            mode = None
+        return mode
+
+    pageMode = property(getPageMode)
+    """Read-only property accessing the
+    :meth:`getPageMode()` method."""
+
+    def _flatten(self, pages=None, inherit=None, indirectRef=None):
+        """
+        Walk the page tree depth-first and fill ``self.flattenedPages``.
+
+        Called without arguments it resets ``flattenedPages`` and starts at
+        the catalog's ``/Pages`` node; the arguments are used by the
+        recursion.
+
+        :param pages: the tree node being visited (``/Pages`` or ``/Page``).
+        :param dict inherit: inheritable attributes collected from the
+            ancestors of *pages*.
+        :param indirectRef: the indirect reference through which *pages* was
+            reached, stored on the resulting ``PageObject``.
+        """
+        inheritablePageAttributes = (
+            NameObject("/Resources"), NameObject("/MediaBox"),
+            NameObject("/CropBox"), NameObject("/Rotate")
+            )
+        if inherit is None:
+            inherit = dict()
+        if pages is None:
+            self.flattenedPages = []
+            catalog = self.trailer["/Root"].getObject()
+            pages = catalog["/Pages"].getObject()
+
+        # a node without /Type is treated as an intermediate /Pages node
+        t = "/Pages"
+        if "/Type" in pages:
+            t = pages["/Type"]
+
+        if t == "/Pages":
+            # Work on a copy: attributes contributed by this node must reach
+            # only its own descendants. Mutating the dict received from the
+            # parent would leak them into sibling subtrees visited later.
+            inherit = dict(inherit)
+            for attr in inheritablePageAttributes:
+                if attr in pages:
+                    inherit[attr] = pages[attr]
+            for page in pages["/Kids"]:
+                addt = {}
+                if isinstance(page, IndirectObject):
+                    addt["indirectRef"] = page
+                self._flatten(page.getObject(), inherit, **addt)
+        elif t == "/Page":
+            for attr, value in list(inherit.items()):
+                # if the page has it's own value, it does not inherit the
+                # parent's value:
+                if attr not in pages:
+                    pages[attr] = value
+            pageObj = PageObject(self, indirectRef)
+            pageObj.update(pages)
+            self.flattenedPages.append(pageObj)
+
+ # Resolves an object that the xref data places inside an object stream.
+ # self.xref_objStm maps the object id to (object stream number, index).
+ # Returns the parsed object. If the object cannot be read or is not listed
+ # in the stream, raises PdfReadError in strict mode and returns a
+ # NullObject otherwise.
+ def _getObjectFromStream(self, indirectReference):
+ # indirect reference to object in object stream
+ # read the entire object stream into memory
+ debug = False
+ stmnum, idx = self.xref_objStm[indirectReference.idnum]
+ if debug: print(("Here1: %s %s"%(stmnum, idx)))
+ objStm = IndirectObject(stmnum, 0, self).getObject()
+ if debug: print(("Here2: objStm=%s.. stmnum=%s data=%s"%(objStm, stmnum, objStm.getData())))
+ # This is an xref to a stream, so its type better be a stream
+ assert objStm['/Type'] == '/ObjStm'
+ # /N is the number of indirect objects in the stream
+ assert idx < objStm['/N']
+ streamData = BytesIO(b_(objStm.getData()))
+ # each iteration reads one (object number, offset) pair from the stream
+ for i in range(objStm['/N']):
+ readNonWhitespace(streamData)
+ streamData.seek(-1, 1)
+ objnum = NumberObject.readFromStream(streamData)
+ readNonWhitespace(streamData)
+ streamData.seek(-1, 1)
+ offset = NumberObject.readFromStream(streamData)
+ readNonWhitespace(streamData)
+ streamData.seek(-1, 1)
+ if objnum != indirectReference.idnum:
+ # We're only interested in one object
+ continue
+ if self.strict and idx != i:
+ raise utils.PdfReadError("Object is in wrong index.")
+ # the offset is relative to the /First entry of the object stream
+ streamData.seek(objStm['/First']+offset, 0)
+ if debug:
+ pos = streamData.tell()
+ streamData.seek(0, 0)
+ lines = streamData.readlines()
+ # NOTE(review): this debug-only loop rebinds the outer loop variable `i`
+ for i in range(0, len(lines)):
+ print((lines[i]))
+ streamData.seek(pos, 0)
+ try:
+ obj = readObject(streamData, self)
+ except utils.PdfStreamError as e:
+ # Stream object cannot be read. Normally, a critical error, but
+ # Adobe Reader doesn't complain, so continue (in strict mode?)
+ # (rebinds `e` to the exception already bound by the `as e` clause)
+ e = sys.exc_info()[1]
+ warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \
+ (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning)
+
+ if self.strict:
+ raise utils.PdfReadError("Can't read object stream: %s"%e)
+ # Replace with null. Hopefully it's nothing important.
+ obj = NullObject()
+ return obj
+
+ # reached only when no header pair in the stream matched the requested id
+ if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.")
+ return NullObject()
+
+ def getObject(self, indirectReference):
+ debug = False
+ if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation))
+ retval = self.cacheGetIndirectObject(indirectReference.generation,
+ indirectReference.idnum)
+ if retval != None:
+ return retval
+ if indirectReference.generation == 0 and \
+ indirectReference.idnum in self.xref_objStm:
+ retval = self._getObjectFromStream(indirectReference)
+ elif indirectReference.generation in self.xref and \
+ indirectReference.idnum in self.xref[indirectReference.generation]:
+ start = self.xref[indirectReference.generation][indirectReference.idnum]
+ if debug: print((" Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start))
+ self.stream.seek(start, 0)
+ idnum, generation = self.readObjectHeader(self.stream)
+ if idnum != indirectReference.idnum and self.xrefIndex:
+ # Xref table probably had bad indexes due to not being zero-indexed
+ if self.strict:
+ raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \
+ % (indirectReference.idnum, indirectReference.generation, idnum, generation))
+ else: pass # xref table is corrected in non-strict mode
+ elif idnum != indirectReference.idnum:
+ # some other problem
+ raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \
+ % (indirectReference.idnum, indirectReference.generation, idnum, generation))
+ assert generation == indirectReference.generation
+ retval = readObject(self.stream, self)
+
+ # override encryption is used for the /Encrypt dictionary
+ if not self._override_encryption and self.isEncrypted:
+ # if we don't have the encryption key:
+ if not hasattr(self, '_decryption_key'):
+ raise utils.PdfReadError("file has not been decrypted")
+ # otherwise, decrypt here...
+ import struct
+ pack1 = struct.pack(">read", stream)
+ # start at the end:
+ stream.seek(-1, 2)
+ if not stream.tell():
+ raise utils.PdfReadError('Cannot read an empty file')
+ last1K = stream.tell() - 1024 + 1 # offset of last 1024 bytes of stream
+ line = b_('')
+ while line[:5] != b_("%%EOF"):
+ if stream.tell() < last1K:
+ raise utils.PdfReadError("EOF marker not found")
+ line = self.readNextEndLine(stream)
+ if debug: print(" line:",line)
+
+ # find startxref entry - the location of the xref table
+ line = self.readNextEndLine(stream)
+ try:
+ startxref = int(line)
+ except ValueError:
+ # 'startxref' may be on the same line as the location
+ if not line.startswith(b_("startxref")):
+ raise utils.PdfReadError("startxref not found")
+ startxref = int(line[9:].strip())
+ warnings.warn("startxref on same line as offset")
+ else:
+ line = self.readNextEndLine(stream)
+ if line[:9] != b_("startxref"):
+ raise utils.PdfReadError("startxref not found")
+
+ # read all cross reference tables and their trailers
+ self.xref = {}
+ self.xref_objStm = {}
+ self.trailer = DictionaryObject()
+ while True:
+ # load the xref table
+ stream.seek(startxref, 0)
+ x = stream.read(1)
+ if x == b_("x"):
+ # standard cross-reference table
+ ref = stream.read(4)
+ if ref[:3] != b_("ref"):
+ raise utils.PdfReadError("xref table read error")
+ readNonWhitespace(stream)
+ stream.seek(-1, 1)
+ firsttime = True; # check if the first time looking at the xref table
+ while True:
+ num = readObject(stream, self)
+ if firsttime and num != 0:
+ self.xrefIndex = num
+ if self.strict:
+ warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning)
+ #if table not zero indexed, could be due to error from when PDF was created
+ #which will lead to mismatched indices later on, only warned and corrected if self.strict=True
+ firsttime = False
+ readNonWhitespace(stream)
+ stream.seek(-1, 1)
+ size = readObject(stream, self)
+ readNonWhitespace(stream)
+ stream.seek(-1, 1)
+ cnt = 0
+ while cnt < size:
+ line = stream.read(20)
+
+ # It's very clear in section 3.4.3 of the PDF spec
+ # that all cross-reference table lines are a fixed
+ # 20 bytes (as of PDF 1.7). However, some files have
+ # 21-byte entries (or more) due to the use of \r\n
+ # (CRLF) EOL's. Detect that case, and adjust the line
+ # until it does not begin with a \r (CR) or \n (LF).
+ while line[0] in b_("\x0D\x0A"):
+ stream.seek(-20 + 1, 1)
+ line = stream.read(20)
+
+ # On the other hand, some malformed PDF files
+ # use a single character EOL without a preceeding
+ # space. Detect that case, and seek the stream
+ # back one character. (0-9 means we've bled into
+ # the next xref entry, t means we've bled into the
+ # text "trailer"):
+ if line[-1] in b_("0123456789t"):
+ stream.seek(-1, 1)
+
+ offset, generation = line[:16].split(b_(" "))
+ offset, generation = int(offset), int(generation)
+ if generation not in self.xref:
+ self.xref[generation] = {}
+ if num in self.xref[generation]:
+ # It really seems like we should allow the last
+ # xref table in the file to override previous
+ # ones. Since we read the file backwards, assume
+ # any existing key is already set correctly.
+ pass
+ else:
+ self.xref[generation][num] = offset
+ cnt += 1
+ num += 1
+ readNonWhitespace(stream)
+ stream.seek(-1, 1)
+ trailertag = stream.read(7)
+ if trailertag != b_("trailer"):
+ # more xrefs!
+ stream.seek(-7, 1)
+ else:
+ break
+ readNonWhitespace(stream)
+ stream.seek(-1, 1)
+ newTrailer = readObject(stream, self)
+ for key, value in list(newTrailer.items()):
+ if key not in self.trailer:
+ self.trailer[key] = value
+ if "/Prev" in newTrailer:
+ startxref = newTrailer["/Prev"]
+ else:
+ break
+ elif x.isdigit():
+ # PDF 1.5+ Cross-Reference Stream
+ stream.seek(-1, 1)
+ idnum, generation = self.readObjectHeader(stream)
+ xrefstream = readObject(stream, self)
+ assert xrefstream["/Type"] == "/XRef"
+ self.cacheIndirectObject(generation, idnum, xrefstream)
+ streamData = BytesIO(b_(xrefstream.getData()))
+ # Index pairs specify the subsections in the dictionary. If
+ # none create one subsection that spans everything.
+ idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
+ if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs))))
+ entrySizes = xrefstream.get("/W")
+ assert len(entrySizes) >= 3
+ if self.strict and len(entrySizes) > 3:
+ raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes)
+
+ def getEntry(i):
+ # Reads the correct number of bytes for each entry. See the
+ # discussion of the W parameter in PDF spec table 17.
+ if entrySizes[i] > 0:
+ d = streamData.read(entrySizes[i])
+ return convertToInt(d, entrySizes[i])
+
+ # PDF Spec Table 17: A value of zero for an element in the
+ # W array indicates...the default value shall be used
+ if i == 0: return 1 # First value defaults to 1
+ else: return 0
+
+ def used_before(num, generation):
+ # We move backwards through the xrefs, don't replace any.
+ return num in self.xref.get(generation, []) or \
+ num in self.xref_objStm
+
+ # Iterate through each subsection
+ last_end = 0
+ for start, size in self._pairs(idx_pairs):
+ # The subsections must increase
+ assert start >= last_end
+ last_end = start + size
+ for num in range(start, start+size):
+ # The first entry is the type
+ xref_type = getEntry(0)
+ # The rest of the elements depend on the xref_type
+ if xref_type == 0:
+ # linked list of free objects
+ next_free_object = getEntry(1)
+ next_generation = getEntry(2)
+ elif xref_type == 1:
+ # objects that are in use but are not compressed
+ byte_offset = getEntry(1)
+ generation = getEntry(2)
+ if generation not in self.xref:
+ self.xref[generation] = {}
+ if not used_before(num, generation):
+ self.xref[generation][num] = byte_offset
+ if debug: print(("XREF Uncompressed: %s %s"%(
+ num, generation)))
+ elif xref_type == 2:
+ # compressed objects
+ objstr_num = getEntry(1)
+ obstr_idx = getEntry(2)
+ generation = 0 # PDF spec table 18, generation is 0
+ if not used_before(num, generation):
+ if debug: print(("XREF Compressed: %s %s %s"%(
+ num, objstr_num, obstr_idx)))
+ self.xref_objStm[num] = (objstr_num, obstr_idx)
+ elif self.strict:
+ raise utils.PdfReadError("Unknown xref type: %s"%
+ xref_type)
+
+ trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
+ for key in trailerKeys:
+ if key in xrefstream and key not in self.trailer:
+ self.trailer[NameObject(key)] = xrefstream.raw_get(key)
+ if "/Prev" in xrefstream:
+ startxref = xrefstream["/Prev"]
+ else:
+ break
+ else:
+ # bad xref character at startxref. Let's see if we can find
+ # the xref table nearby, as we've observed this error with an
+ # off-by-one before.
+ stream.seek(-11, 1)
+ tmp = stream.read(20)
+ xref_loc = tmp.find(b_("xref"))
+ if xref_loc != -1:
+ startxref -= (10 - xref_loc)
+ continue
+ # No explicit xref table, try finding a cross-reference stream.
+ stream.seek(startxref, 0)
+ found = False
+ for look in range(5):
+ if stream.read(1).isdigit():
+ # This is not a standard PDF, consider adding a warning
+ startxref += look
+ found = True
+ break
+ if found:
+ continue
+ # no xref table found at specified location
+ raise utils.PdfReadError("Could not find xref table at specified location")
+ #if not zero-indexed, verify that the table is correct; change it if necessary
+ if self.xrefIndex and not self.strict:
+ loc = stream.tell()
+ for gen in self.xref:
+ if gen == 65535: continue
+ for id in self.xref[gen]:
+ stream.seek(self.xref[gen][id], 0)
+ try:
+ pid, pgen = self.readObjectHeader(stream)
+ except ValueError:
+ break
+ if pid == id - self.xrefIndex:
+ self._zeroXref(gen)
+ break
+ #if not, then either it's just plain wrong, or the non-zero-index is actually correct
+ stream.seek(loc, 0) #return to where it was
+
+    def _zeroXref(self, generation):
+        """
+        Replace the xref table of *generation* with one whose object ids
+        are all lowered by ``self.xrefIndex``.
+        """
+        shift = self.xrefIndex
+        rebased = {}
+        for objnum, offset in list(self.xref[generation].items()):
+            rebased[objnum - shift] = offset
+        self.xref[generation] = rebased
+
+    def _pairs(self, array):
+        """
+        Yield consecutive, non-overlapping ``(array[i], array[i+1])``
+        pairs. The first pair is always attempted; a trailing unpaired
+        item is ignored.
+        """
+        yield array[0], array[1]
+        for pos in range(2, len(array) - 1, 2):
+            yield array[pos], array[pos+1]
+
+ # Scans *stream* backwards from its current position: each step reads one
+ # byte and then seeks two bytes back. Non-EOL bytes are prepended to the
+ # result; the scan stops after the first run of CR/LF bytes it meets.
+ # Returns the collected bytes. Raises PdfReadError if the start of the
+ # stream is reached before an EOL is found.
+ def readNextEndLine(self, stream):
+ debug = False
+ if debug: print(">>readNextEndLine")
+ line = b_("")
+ while True:
+ # Prevent infinite loops in malformed PDFs
+ if stream.tell() == 0:
+ raise utils.PdfReadError("Could not read malformed PDF file")
+ x = stream.read(1)
+ if debug: print((" x:", x, "%x"%ord(x)))
+ if stream.tell() < 2:
+ raise utils.PdfReadError("EOL marker not found")
+ # step back over the byte just read plus one more, i.e. to the previous byte
+ stream.seek(-2, 1)
+ if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR
+ crlf = False
+ # consume the whole run of consecutive EOL bytes, still moving backwards
+ while x == b_('\n') or x == b_('\r'):
+ if debug:
+ if ord(x) == 0x0D: print(" x is CR 0D")
+ elif ord(x) == 0x0A: print(" x is LF 0A")
+ x = stream.read(1)
+ if x == b_('\n') or x == b_('\r'): # account for CR+LF
+ stream.seek(-1, 1)
+ crlf = True
+ if stream.tell() < 2:
+ raise utils.PdfReadError("EOL marker not found")
+ stream.seek(-2, 1)
+ stream.seek(2 if crlf else 1, 1) # seeks forward (positive offset, whence=1): 2 bytes if CR+LF was seen, else 1
+ break
+ else:
+ if debug: print(" x is neither")
+ line = x + line
+ if debug: print((" RNEL line:", line))
+ if debug: print("leaving RNEL")
+ return line
+
+    def decrypt(self, password):
+        """
+        Try to unlock an encrypted / secured PDF file that uses the PDF
+        Standard encryption handler.
+
+        The given password is checked against the document's user password
+        and owner password; if either one matches, the resulting decryption
+        key is stored on the reader.
+
+        It does not matter which password was matched. Both passwords provide
+        the correct decryption key that will allow the document to be used with
+        this library.
+
+        :param str password: The password to match.
+        :return: ``0`` if the password failed, ``1`` if the password matched the user
+            password, and ``2`` if the password matched the owner password.
+        :rtype: int
+        :raises NotImplementedError: if document uses an unsupported encryption
+            method.
+        """
+        # the override flag is raised while the passwords are checked and
+        # always lowered again, even when _decrypt() raises
+        self._override_encryption = True
+        try:
+            outcome = self._decrypt(password)
+        finally:
+            self._override_encryption = False
+        return outcome
+
+    def _decrypt(self, password):
+        """
+        Check *password* first as the user password, then as the owner
+        password, storing the decryption key on success.
+
+        :return: ``1`` for a user-password match, ``2`` for an
+            owner-password match, ``0`` otherwise.
+        :raises NotImplementedError: if the filter is not ``/Standard`` or
+            ``/V`` is not 1 or 2.
+        """
+        encrypt = self.trailer['/Encrypt'].getObject()
+        if encrypt['/Filter'] != '/Standard':
+            raise NotImplementedError("only Standard PDF encryption handler is available")
+        if encrypt['/V'] not in (1, 2):
+            raise NotImplementedError("only algorithm code 1 and 2 are supported")
+
+        matchedUser, key = self._authenticateUserPassword(password)
+        if matchedUser:
+            self._decryption_key = key
+            return 1
+
+        # Not the user password: derive a candidate user password from the
+        # /O entry and authenticate with that instead.
+        rev = encrypt['/R'].getObject()
+        keylen = 5 if rev == 2 else encrypt['/Length'].getObject() // 8
+        key = _alg33_1(password, rev, keylen)
+        real_O = encrypt["/O"].getObject()
+        if rev == 2:
+            userpass = utils.RC4_encrypt(key, real_O)
+        else:
+            userpass = real_O
+            for step in range(19, -1, -1):
+                # every key byte is XORed with the step counter
+                stepKey = b_('')
+                for keyByte in key:
+                    stepKey += b_(chr(utils.ord_(keyByte) ^ step))
+                userpass = utils.RC4_encrypt(stepKey, userpass)
+        matchedOwner, key = self._authenticateUserPassword(userpass)
+        if matchedOwner:
+            self._decryption_key = key
+            return 2
+        return 0
+
+    def _authenticateUserPassword(self, password):
+        """
+        Compute the ``/U`` value that *password* would produce and compare
+        it with the one stored in the encryption dictionary.
+
+        :return: a ``(matched, key)`` tuple; *key* is the key computed
+            alongside the ``/U`` value.
+        """
+        encrypt = self.trailer['/Encrypt'].getObject()
+        rev = encrypt['/R'].getObject()
+        owner_entry = encrypt['/O'].getObject()
+        p_entry = encrypt['/P'].getObject()
+        id1_entry = self.trailer['/ID'].getObject()[0].getObject()
+        real_U = encrypt['/U'].getObject().original_bytes
+        if rev == 2:
+            U, key = _alg34(password, owner_entry, p_entry, id1_entry)
+        elif rev >= 3:
+            keylen = encrypt["/Length"].getObject() // 8
+            metadata_encrypt = encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()
+            U, key = _alg35(password, rev, keylen, owner_entry,
+                    p_entry, id1_entry, metadata_encrypt)
+            # for these revisions only the first 16 bytes are compared
+            U = U[:16]
+            real_U = real_U[:16]
+        return U == real_U, key
+
+    def getIsEncrypted(self):
+        """Return whether the trailer has an ``/Encrypt`` entry."""
+        hasEncryptEntry = "/Encrypt" in self.trailer
+        return hasEncryptEntry
+
+    isEncrypted = property(lambda self: self.getIsEncrypted(), None, None)
+    """
+    Read-only boolean property showing whether this PDF file is encrypted.
+    Note that this property, if true, will remain true even after the
+    :meth:`decrypt()` method is called.
+    """
+
+
+def getRectangle(self, name, defaults):
+    """
+    Return the rectangle stored under *name*, as a ``RectangleObject``.
+
+    If *name* is missing, the first of *defaults* that has a value is used
+    instead. An indirect reference is resolved through ``self.pdf``. The
+    converted rectangle is stored back under *name* before it is returned.
+    """
+    rect = self.get(name)
+    if isinstance(rect, RectangleObject):
+        return rect
+    if rect is None:
+        for fallbackName in defaults:
+            rect = self.get(fallbackName)
+            if rect is not None:
+                break
+    if isinstance(rect, IndirectObject):
+        rect = self.pdf.getObject(rect)
+    rect = RectangleObject(rect)
+    setRectangle(self, name, rect)
+    return rect
+
+
+def setRectangle(self, name, value):
+    """Store *value* under *name*, wrapping *name* in a ``NameObject`` if needed."""
+    key = name if isinstance(name, NameObject) else NameObject(name)
+    self[key] = value
+
+
+def deleteRectangle(self, name):
+    """Remove the entry stored under *name*."""
+    self.__delitem__(name)
+
+
+def createRectangleAccessor(name, fallback):
+    """
+    Build a property whose getter, setter and deleter forward to
+    ``getRectangle``, ``setRectangle`` and ``deleteRectangle`` for *name*.
+    *fallback* is passed to ``getRectangle`` as its defaults.
+    """
+    def _getter(self):
+        return getRectangle(self, name, fallback)
+
+    def _setter(self, value):
+        return setRectangle(self, name, value)
+
+    def _deleter(self):
+        return deleteRectangle(self, name)
+
+    return property(_getter, _setter, _deleter)
+
+
+class PageObject(DictionaryObject):
+ """
+ This class represents a single page within a PDF file. Typically this
+ object will be created by accessing the
+ :meth:`getPage()` method of the
+ :class:`PdfFileReader` class, but it is
+ also possible to create an empty page with the
+ :meth:`createBlankPage()` static method.
+
+ :param pdf: PDF file the page belongs to.
+ :param indirectRef: Stores the original indirect reference to
+ this object in its source PDF
+ """
+    def __init__(self, pdf=None, indirectRef=None):
+        """Create an empty page dictionary tied to *pdf* and *indirectRef*."""
+        DictionaryObject.__init__(self)
+        self.pdf, self.indirectRef = pdf, indirectRef
+
+    @staticmethod
+    def createBlankPage(pdf=None, width=None, height=None):
+        """
+        Build and return a new, empty page.
+        When ``width`` or ``height`` is ``None``, both are taken from the
+        media box of the last page of *pdf*.
+
+        :param pdf: PDF file the page belongs to
+        :param float width: The width of the new page expressed in default user
+            space units.
+        :param float height: The height of the new page expressed in default user
+            space units.
+        :return: the new blank page:
+        :rtype: :class:`PageObject`
+        :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
+            no page
+        """
+        page = PageObject(pdf)
+
+        # Creates a new page (cf PDF Reference  7.7.3.3)
+        page[NameObject('/Type')] = NameObject('/Page')
+        page[NameObject('/Parent')] = NullObject()
+        page[NameObject('/Resources')] = DictionaryObject()
+        if width is None or height is None:
+            if pdf is None or not pdf.getNumPages() > 0:
+                raise utils.PageSizeNotDefinedError()
+            lastpage = pdf.getPage(pdf.getNumPages() - 1)
+            width = lastpage.mediaBox.getWidth()
+            height = lastpage.mediaBox.getHeight()
+        page[NameObject('/MediaBox')] = RectangleObject([0, 0, width, height])
+
+        return page
+
+ def rotateClockwise(self, angle):
+ """
+ Rotates a page clockwise by increments of 90 degrees.
+
+ :param int angle: Angle to rotate the page. Must be an increment
+ of 90 deg.
+ """
+ assert angle % 90 == 0
+ self._rotate(angle)
+ return self
+
+ def rotateCounterClockwise(self, angle):
+ """
+ Rotates a page counter-clockwise by increments of 90 degrees.
+
+ :param int angle: Angle to rotate the page. Must be an increment
+ of 90 deg.
+ """
+ assert angle % 90 == 0
+ self._rotate(-angle)
+ return self
+
+ def _rotate(self, angle):
+ currentAngle = self.get("/Rotate", 0)
+ self[NameObject("/Rotate")] = NumberObject(currentAngle + angle)
+
+ def _mergeResources(res1, res2, resource):
+ newRes = DictionaryObject()
+ newRes.update(res1.get(resource, DictionaryObject()).getObject())
+ page2Res = res2.get(resource, DictionaryObject()).getObject()
+ renameRes = {}
+ for key in list(page2Res.keys()):
+ if key in newRes and newRes.raw_get(key) != page2Res.raw_get(key):
+ newname = NameObject(key + str(uuid.uuid4()))
+ renameRes[key] = newname
+ newRes[newname] = page2Res[key]
+ elif key not in newRes:
+ newRes[key] = page2Res.raw_get(key)
+ return newRes, renameRes
+ _mergeResources = staticmethod(_mergeResources)
+
+ def _contentStreamRename(stream, rename, pdf):
+ if not rename:
+ return stream
+ stream = ContentStream(stream, pdf)
+ for operands, operator in stream.operations:
+ for i in range(len(operands)):
+ op = operands[i]
+ if isinstance(op, NameObject):
+ operands[i] = rename.get(op,op)
+ return stream
+ _contentStreamRename = staticmethod(_contentStreamRename)
+
+ def _pushPopGS(contents, pdf):
+ # adds a graphics state "push" and "pop" to the beginning and end
+ # of a content stream. This isolates it from changes such as
+ # transformation matricies.
+ stream = ContentStream(contents, pdf)
+ stream.operations.insert(0, [[], "q"])
+ stream.operations.append([[], "Q"])
+ return stream
+ _pushPopGS = staticmethod(_pushPopGS)
+
+ def _addTransformationMatrix(contents, pdf, ctm):
+ # adds transformation matrix at the beginning of the given
+ # contents stream.
+ a, b, c, d, e, f = ctm
+ contents = ContentStream(contents, pdf)
+ contents.operations.insert(0, [[FloatObject(a), FloatObject(b),
+ FloatObject(c), FloatObject(d), FloatObject(e),
+ FloatObject(f)], " cm"])
+ return contents
+ _addTransformationMatrix = staticmethod(_addTransformationMatrix)
+
+ def getContents(self):
+ """
+ Accesses the page contents.
+
+ :return: the ``/Contents`` object, or ``None`` if it doesn't exist.
+ ``/Contents`` is optional, as described in PDF Reference 7.7.3.3
+ """
+ if "/Contents" in self:
+ return self["/Contents"].getObject()
+ else:
+ return None
+
+ def mergePage(self, page2):
+ """
+ Merges the content streams of two pages into one. Resource references
+ (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc
+ of this page are not altered. The parameter page's content stream will
+ be added to the end of this page's content stream, meaning that it will
+ be drawn after, or "on top" of this page.
+
+ :param PageObject page2: The page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ """
+ self._mergePage(page2)
+
+ def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False):
+ # First we work on merging the resource dictionaries. This allows us
+ # to find out what symbols in the content streams we might need to
+ # rename.
+
+ newResources = DictionaryObject()
+ rename = {}
+ originalResources = self["/Resources"].getObject()
+ page2Resources = page2["/Resources"].getObject()
+ newAnnots = ArrayObject()
+
+ for page in (self, page2):
+ if "/Annots" in page:
+ annots = page["/Annots"]
+ if isinstance(annots, ArrayObject):
+ for ref in annots:
+ newAnnots.append(ref)
+
+ for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading", "/Properties":
+ new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
+ if new:
+ newResources[NameObject(res)] = new
+ rename.update(newrename)
+
+ # Combine /ProcSet sets.
+ newResources[NameObject("/ProcSet")] = ArrayObject(
+ frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union(
+ frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject())
+ )
+ )
+
+ newContentArray = ArrayObject()
+
+ originalContent = self.getContents()
+ if originalContent is not None:
+ newContentArray.append(PageObject._pushPopGS(
+ originalContent, self.pdf))
+
+ page2Content = page2.getContents()
+ if page2Content is not None:
+ if page2transformation is not None:
+ page2Content = page2transformation(page2Content)
+ page2Content = PageObject._contentStreamRename(
+ page2Content, rename, self.pdf)
+ page2Content = PageObject._pushPopGS(page2Content, self.pdf)
+ newContentArray.append(page2Content)
+
+ # if expanding the page to fit a new page, calculate the new media box size
+ if expand:
+ corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(),
+ self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()]
+ corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(),
+ page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(),
+ page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(),
+ page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()]
+ if ctm is not None:
+ ctm = [float(x) for x in ctm]
+ new_x = [ctm[0]*corners2[i] + ctm[2]*corners2[i+1] + ctm[4] for i in range(0, 8, 2)]
+ new_y = [ctm[1]*corners2[i] + ctm[3]*corners2[i+1] + ctm[5] for i in range(0, 8, 2)]
+ else:
+ new_x = corners2[0:8:2]
+ new_y = corners2[1:8:2]
+ lowerleft = [min(new_x), min(new_y)]
+ upperright = [max(new_x), max(new_y)]
+ lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])]
+ upperright = [max(corners1[2], upperright[0]), max(corners1[3], upperright[1])]
+
+ self.mediaBox.setLowerLeft(lowerleft)
+ self.mediaBox.setUpperRight(upperright)
+
+ self[NameObject('/Contents')] = ContentStream(newContentArray, self.pdf)
+ self[NameObject('/Resources')] = newResources
+ self[NameObject('/Annots')] = newAnnots
+
+ def mergeTransformedPage(self, page2, ctm, expand=False):
+ """
+ This is similar to mergePage, but a transformation matrix is
+ applied to the merged stream.
+
+ :param PageObject page2: The page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ :param tuple ctm: a 6-element tuple containing the operands of the
+ transformation matrix
+ :param bool expand: Whether the page should be expanded to fit the dimensions
+ of the page to be merged.
+ """
+ self._mergePage(page2, lambda page2Content:
+ PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm), ctm, expand)
+
+ def mergeScaledPage(self, page2, scale, expand=False):
+ """
+ This is similar to mergePage, but the stream to be merged is scaled
+ by appling a transformation matrix.
+
+ :param PageObject page2: The page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ :param float scale: The scaling factor
+ :param bool expand: Whether the page should be expanded to fit the
+ dimensions of the page to be merged.
+ """
+ # CTM to scale : [ sx 0 0 sy 0 0 ]
+ return self.mergeTransformedPage(page2, [scale, 0,
+ 0, scale,
+ 0, 0], expand)
+
+ def mergeRotatedPage(self, page2, rotation, expand=False):
+ """
+ This is similar to mergePage, but the stream to be merged is rotated
+ by appling a transformation matrix.
+
+ :param PageObject page2: the page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ :param float rotation: The angle of the rotation, in degrees
+ :param bool expand: Whether the page should be expanded to fit the
+ dimensions of the page to be merged.
+ """
+ rotation = math.radians(rotation)
+ return self.mergeTransformedPage(page2,
+ [math.cos(rotation), math.sin(rotation),
+ -math.sin(rotation), math.cos(rotation),
+ 0, 0], expand)
+
+ def mergeTranslatedPage(self, page2, tx, ty, expand=False):
+ """
+ This is similar to mergePage, but the stream to be merged is translated
+ by appling a transformation matrix.
+
+ :param PageObject page2: the page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ :param float tx: The translation on X axis
+ :param float ty: The translation on Y axis
+ :param bool expand: Whether the page should be expanded to fit the
+ dimensions of the page to be merged.
+ """
+ return self.mergeTransformedPage(page2, [1, 0,
+ 0, 1,
+ tx, ty], expand)
+
+ def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False):
+ """
+ This is similar to mergePage, but the stream to be merged is rotated
+ and translated by appling a transformation matrix.
+
+ :param PageObject page2: the page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ :param float tx: The translation on X axis
+ :param float ty: The translation on Y axis
+ :param float rotation: The angle of the rotation, in degrees
+ :param bool expand: Whether the page should be expanded to fit the
+ dimensions of the page to be merged.
+ """
+
+ translation = [[1, 0, 0],
+ [0, 1, 0],
+ [-tx, -ty, 1]]
+ rotation = math.radians(rotation)
+ rotating = [[math.cos(rotation), math.sin(rotation), 0],
+ [-math.sin(rotation), math.cos(rotation), 0],
+ [0, 0, 1]]
+ rtranslation = [[1, 0, 0],
+ [0, 1, 0],
+ [tx, ty, 1]]
+ ctm = utils.matrixMultiply(translation, rotating)
+ ctm = utils.matrixMultiply(ctm, rtranslation)
+
+ return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
+ ctm[1][0], ctm[1][1],
+ ctm[2][0], ctm[2][1]], expand)
+
+ def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False):
+ """
+ This is similar to mergePage, but the stream to be merged is rotated
+ and scaled by appling a transformation matrix.
+
+ :param PageObject page2: the page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ :param float rotation: The angle of the rotation, in degrees
+ :param float scale: The scaling factor
+ :param bool expand: Whether the page should be expanded to fit the
+ dimensions of the page to be merged.
+ """
+ rotation = math.radians(rotation)
+ rotating = [[math.cos(rotation), math.sin(rotation), 0],
+ [-math.sin(rotation), math.cos(rotation), 0],
+ [0, 0, 1]]
+ scaling = [[scale, 0, 0],
+ [0, scale, 0],
+ [0, 0, 1]]
+ ctm = utils.matrixMultiply(rotating, scaling)
+
+ return self.mergeTransformedPage(page2,
+ [ctm[0][0], ctm[0][1],
+ ctm[1][0], ctm[1][1],
+ ctm[2][0], ctm[2][1]], expand)
+
+ def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False):
+ """
+ This is similar to mergePage, but the stream to be merged is translated
+ and scaled by appling a transformation matrix.
+
+ :param PageObject page2: the page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ :param float scale: The scaling factor
+ :param float tx: The translation on X axis
+ :param float ty: The translation on Y axis
+ :param bool expand: Whether the page should be expanded to fit the
+ dimensions of the page to be merged.
+ """
+
+ translation = [[1, 0, 0],
+ [0, 1, 0],
+ [tx, ty, 1]]
+ scaling = [[scale, 0, 0],
+ [0, scale, 0],
+ [0, 0, 1]]
+ ctm = utils.matrixMultiply(scaling, translation)
+
+ return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
+ ctm[1][0], ctm[1][1],
+ ctm[2][0], ctm[2][1]], expand)
+
+ def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False):
+ """
+ This is similar to mergePage, but the stream to be merged is translated,
+ rotated and scaled by appling a transformation matrix.
+
+ :param PageObject page2: the page to be merged into this one. Should be
+ an instance of :class:`PageObject`.
+ :param float tx: The translation on X axis
+ :param float ty: The translation on Y axis
+ :param float rotation: The angle of the rotation, in degrees
+ :param float scale: The scaling factor
+ :param bool expand: Whether the page should be expanded to fit the
+ dimensions of the page to be merged.
+ """
+ translation = [[1, 0, 0],
+ [0, 1, 0],
+ [tx, ty, 1]]
+ rotation = math.radians(rotation)
+ rotating = [[math.cos(rotation), math.sin(rotation), 0],
+ [-math.sin(rotation), math.cos(rotation), 0],
+ [0, 0, 1]]
+ scaling = [[scale, 0, 0],
+ [0, scale, 0],
+ [0, 0, 1]]
+ ctm = utils.matrixMultiply(rotating, scaling)
+ ctm = utils.matrixMultiply(ctm, translation)
+
+ return self.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
+ ctm[1][0], ctm[1][1],
+ ctm[2][0], ctm[2][1]], expand)
+
+ ##
+ # Applys a transformation matrix the page.
+ #
+ # @param ctm A 6 elements tuple containing the operands of the
+ # transformation matrix
+ def addTransformation(self, ctm):
+ """
+ Applies a transformation matrix to the page.
+
+ :param tuple ctm: A 6-element tuple containing the operands of the
+ transformation matrix.
+ """
+ originalContent = self.getContents()
+ if originalContent is not None:
+ newContent = PageObject._addTransformationMatrix(
+ originalContent, self.pdf, ctm)
+ newContent = PageObject._pushPopGS(newContent, self.pdf)
+ self[NameObject('/Contents')] = newContent
+
+ def scale(self, sx, sy):
+ """
+ Scales a page by the given factors by appling a transformation
+ matrix to its content and updating the page size.
+
+ :param float sx: The scaling factor on horizontal axis.
+ :param float sy: The scaling factor on vertical axis.
+ """
+ self.addTransformation([sx, 0,
+ 0, sy,
+ 0, 0])
+ self.mediaBox = RectangleObject([
+ float(self.mediaBox.getLowerLeft_x()) * sx,
+ float(self.mediaBox.getLowerLeft_y()) * sy,
+ float(self.mediaBox.getUpperRight_x()) * sx,
+ float(self.mediaBox.getUpperRight_y()) * sy])
+ if "/VP" in self:
+ viewport = self["/VP"]
+ if isinstance(viewport, ArrayObject):
+ bbox = viewport[0]["/BBox"]
+ else:
+ bbox = viewport["/BBox"]
+ scaled_bbox = RectangleObject([
+ float(bbox[0]) * sx,
+ float(bbox[1]) * sy,
+ float(bbox[2]) * sx,
+ float(bbox[3]) * sy])
+ if isinstance(viewport, ArrayObject):
+ self[NameObject("/VP")][NumberObject(0)][NameObject("/BBox")] = scaled_bbox
+ else:
+ self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox
+
+ def scaleBy(self, factor):
+ """
+ Scales a page by the given factor by appling a transformation
+ matrix to its content and updating the page size.
+
+ :param float factor: The scaling factor (for both X and Y axis).
+ """
+ self.scale(factor, factor)
+
+ def scaleTo(self, width, height):
+ """
+ Scales a page to the specified dimentions by appling a
+ transformation matrix to its content and updating the page size.
+
+ :param float width: The new width.
+ :param float height: The new heigth.
+ """
+ sx = width / float(self.mediaBox.getUpperRight_x() -
+ self.mediaBox.getLowerLeft_x ())
+ sy = height / float(self.mediaBox.getUpperRight_y() -
+ self.mediaBox.getLowerLeft_y ())
+ self.scale(sx, sy)
+
+ def compressContentStreams(self):
+ """
+ Compresses the size of this page by joining all content streams and
+ applying a FlateDecode filter.
+
+ However, it is possible that this function will perform no action if
+ content stream compression becomes "automatic" for some reason.
+ """
+ content = self.getContents()
+ if content is not None:
+ if not isinstance(content, ContentStream):
+ content = ContentStream(content, self.pdf)
+ self[NameObject("/Contents")] = content.flateEncode()
+
+ def extractText(self):
+ """
+ Locate all text drawing commands, in the order they are provided in the
+ content stream, and extract the text. This works well for some PDF
+ files, but poorly for others, depending on the generator used. This will
+ be refined in the future. Do not rely on the order of text coming out of
+ this function, as it will change if this function is made more
+ sophisticated.
+
+ :return: a unicode string object.
+ """
+ text = u_("")
+ content = self["/Contents"].getObject()
+ if not isinstance(content, ContentStream):
+ content = ContentStream(content, self.pdf)
+ # Note: we check all strings are TextStringObjects. ByteStringObjects
+ # are strings where the byte->string encoding was unknown, so adding
+ # them to the text here would be gibberish.
+ for operands, operator in content.operations:
+ if operator == b_("Tj"):
+ _text = operands[0]
+ if isinstance(_text, TextStringObject):
+ text += _text
+ elif operator == b_("T*"):
+ text += "\n"
+ elif operator == b_("'"):
+ text += "\n"
+ _text = operands[0]
+ if isinstance(_text, TextStringObject):
+ text += operands[0]
+ elif operator == b_('"'):
+ _text = operands[2]
+ if isinstance(_text, TextStringObject):
+ text += "\n"
+ text += _text
+ elif operator == b_("TJ"):
+ for i in operands[0]:
+ if isinstance(i, TextStringObject):
+ text += i
+ text += "\n"
+ return text
+
+ mediaBox = createRectangleAccessor("/MediaBox", ())
+ """
+ A :class:`RectangleObject`, expressed in default user space units,
+ defining the boundaries of the physical medium on which the page is
+ intended to be displayed or printed.
+ """
+
+ cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))
+ """
+ A :class:`RectangleObject`, expressed in default user space units,
+ defining the visible region of default user space. When the page is
+ displayed or printed, its contents are to be clipped (cropped) to this
+ rectangle and then imposed on the output medium in some
+ implementation-defined manner. Default value: same as :attr:`mediaBox`.
+ """
+
+ bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))
+ """
+ A :class:`RectangleObject`, expressed in default user space units,
+ defining the region to which the contents of the page should be clipped
+ when output in a production enviroment.
+ """
+
+ trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))
+ """
+ A :class:`RectangleObject`, expressed in default user space units,
+ defining the intended dimensions of the finished page after trimming.
+ """
+
+ artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))
+ """
+ A :class:`RectangleObject`, expressed in default user space units,
+ defining the extent of the page's meaningful content as intended by the
+ page's creator.
+ """
+
+
+class ContentStream(DecodedStreamObject):
+    """
+    A content stream parsed into a list of operations.
+
+    ``operations`` holds ``(operands, operator)`` pairs in stream order.
+    An inline image is stored as a single pair whose operator is
+    ``b"INLINE IMAGE"`` and whose operands is a dict with the keys
+    ``"settings"`` and ``"data"``.
+
+    :param stream: a stream object, or an array of stream objects whose
+        data is concatenated before parsing.
+    :param pdf: stored as ``self.pdf`` and passed to ``readObject`` while
+        reading inline-image settings.
+    """
+    def __init__(self, stream, pdf):
+        self.pdf = pdf
+        self.operations = []
+        # stream may be a StreamObject or an ArrayObject containing
+        # multiple StreamObjects to be cat'd together.
+        stream = stream.getObject()
+        if isinstance(stream, ArrayObject):
+            data = b_("")
+            for s in stream:
+                data += s.getObject().getData()
+            stream = BytesIO(b_(data))
+        else:
+            stream = BytesIO(b_(stream.getData()))
+        self.__parseContentStream(stream)
+
+    def __parseContentStream(self, stream):
+        # Tokenise ``stream`` from its start and append the operations
+        # found to self.operations.
+        stream.seek(0, 0)
+        operands = []
+        while True:
+            peek = readNonWhitespace(stream)
+            # End of data, or a NUL byte, terminates parsing.
+            if peek == b_('') or ord_(peek) == 0:
+                break
+            stream.seek(-1, 1)
+            if peek.isalpha() or peek == b_("'") or peek == b_('"'):
+                operator = utils.readUntilRegex(stream,
+                        NameObject.delimiterPattern, True)
+                if operator == b_("BI"):
+                    # begin inline image - a completely different parsing
+                    # mechanism is required, of course... thanks buddy...
+                    assert operands == []
+                    ii = self._readInlineImage(stream)
+                    self.operations.append((ii, b_("INLINE IMAGE")))
+                else:
+                    self.operations.append((operands, operator))
+                    operands = []
+            elif peek == b_('%'):
+                # If we encounter a comment in the content stream, we have to
+                # handle it here.  Typically, readObject will handle
+                # encountering a comment -- but readObject assumes that
+                # following the comment must be the object we're trying to
+                # read.  In this case, it could be an operator instead.
+                # b'' is included so that a comment running up to the end
+                # of the stream does not spin forever on empty reads.
+                while peek not in (b_('\r'), b_('\n'), b_('')):
+                    peek = stream.read(1)
+            else:
+                operands.append(readObject(stream, None))
+
+    def _readInlineImage(self, stream):
+        # Parse an inline image, starting just after the "BI" operator.
+        # Returns {"settings": DictionaryObject, "data": bytes}.
+        # Raises utils.PdfReadError if the stream ends before the image
+        # is terminated.
+
+        # first read the dictionary of settings.
+        settings = DictionaryObject()
+        while True:
+            tok = readNonWhitespace(stream)
+            if tok == b_(""):
+                raise utils.PdfReadError("Unexpected end of stream")
+            stream.seek(-1, 1)
+            if tok == b_("I"):
+                # "ID" - begin of image data
+                break
+            key = readObject(stream, self.pdf)
+            tok = readNonWhitespace(stream)
+            stream.seek(-1, 1)
+            value = readObject(stream, self.pdf)
+            settings[key] = value
+        # left at beginning of ID
+        tmp = stream.read(3)
+        assert tmp[:2] == b_("ID")
+        data = b_("")
+        while True:
+            # Read the inline image, while checking for EI (End Image) operator.
+            tok = stream.read(1)
+            if tok == b_(""):
+                # Without this check an unterminated image would loop
+                # forever appending empty reads.
+                raise utils.PdfReadError("Unexpected end of stream")
+            if tok == b_("E"):
+                # Check for End Image
+                tok2 = stream.read(1)
+                if tok2 == b_("I"):
+                    # Data can contain EI, so check for the Q operator.
+                    tok3 = stream.read(1)
+                    info = tok + tok2
+                    # We need to find whitespace between EI and Q.
+                    has_q_whitespace = False
+                    while tok3 in utils.WHITESPACES:
+                        has_q_whitespace = True
+                        info += tok3
+                        tok3 = stream.read(1)
+                    if tok3 == b_("Q") and has_q_whitespace:
+                        # Leave the "Q" in the stream for the main parser.
+                        stream.seek(-1, 1)
+                        break
+                    else:
+                        # "EI" was part of the image data after all.
+                        stream.seek(-1, 1)
+                        data += info
+                else:
+                    stream.seek(-1, 1)
+                    data += tok
+            else:
+                data += tok
+        return {"settings": settings, "data": data}
+
+    def _getData(self):
+        # Serialise self.operations back into content-stream bytes.
+        newdata = BytesIO()
+        for operands, operator in self.operations:
+            if operator == b_("INLINE IMAGE"):
+                newdata.write(b_("BI"))
+                dicttext = BytesIO()
+                operands["settings"].writeToStream(dicttext, None)
+                # Drop the first and last two bytes of the serialised
+                # dictionary (presumably its "<<" / ">>" delimiters).
+                newdata.write(dicttext.getvalue()[2:-2])
+                newdata.write(b_("ID "))
+                newdata.write(operands["data"])
+                newdata.write(b_("EI"))
+            else:
+                for op in operands:
+                    op.writeToStream(newdata, None)
+                    newdata.write(b_(" "))
+                newdata.write(b_(operator))
+            newdata.write(b_("\n"))
+        return newdata.getvalue()
+
+    def _setData(self, value):
+        # Parse ``value`` and append its operations to the existing list.
+        self.__parseContentStream(BytesIO(b_(value)))
+
+    _data = property(_getData, _setData)
+
+
+class DocumentInformation(DictionaryObject):
+    """
+    A class representing the basic document metadata provided in a PDF File.
+    This class is accessible through :meth:`getDocumentInfo()`.
+
+    Every text entry is exposed twice, e.g. ``author`` and ``author_raw``.
+    The non-raw property only ever yields a ``TextStringObject`` (or
+    ``None``), which makes it suitable for display.  The raw property hands
+    back whatever is stored, which can be a ``ByteStringObject`` if PyPDF2
+    was unable to decode the string's text encoding; callers of the raw
+    form need to cope with that themselves.
+    """
+
+    def __init__(self):
+        DictionaryObject.__init__(self)
+
+    def getText(self, key):
+        """
+        Return the entry stored under ``key`` if it is a
+        ``TextStringObject``, otherwise ``None``.
+        """
+        candidate = self.get(key, None)
+        if not isinstance(candidate, TextStringObject):
+            return None
+        return candidate
+
+    @property
+    def title(self):
+        """Read-only property accessing the document's **title**.
+        Returns a unicode string (``TextStringObject``) or ``None``
+        if the title is not specified."""
+        return self.getText("/Title")
+
+    @property
+    def title_raw(self):
+        """The "raw" version of title; can return a ``ByteStringObject``."""
+        return self.get("/Title")
+
+    @property
+    def author(self):
+        """Read-only property accessing the document's **author**.
+        Returns a unicode string (``TextStringObject``) or ``None``
+        if the author is not specified."""
+        return self.getText("/Author")
+
+    @property
+    def author_raw(self):
+        """The "raw" version of author; can return a ``ByteStringObject``."""
+        return self.get("/Author")
+
+    @property
+    def subject(self):
+        """Read-only property accessing the document's **subject**.
+        Returns a unicode string (``TextStringObject``) or ``None``
+        if the subject is not specified."""
+        return self.getText("/Subject")
+
+    @property
+    def subject_raw(self):
+        """The "raw" version of subject; can return a ``ByteStringObject``."""
+        return self.get("/Subject")
+
+    @property
+    def creator(self):
+        """Read-only property accessing the document's **creator**. If the
+        document was converted to PDF from another format, this is the name
+        of the application (e.g. OpenOffice) that created the original
+        document from which it was converted. Returns a unicode string
+        (``TextStringObject``) or ``None`` if the creator is not specified."""
+        return self.getText("/Creator")
+
+    @property
+    def creator_raw(self):
+        """The "raw" version of creator; can return a ``ByteStringObject``."""
+        return self.get("/Creator")
+
+    @property
+    def producer(self):
+        """Read-only property accessing the document's **producer**.
+        If the document was converted to PDF from another format, this is
+        the name of the application (for example, OSX Quartz) that converted
+        it to PDF. Returns a unicode string (``TextStringObject``)
+        or ``None`` if the producer is not specified."""
+        return self.getText("/Producer")
+
+    @property
+    def producer_raw(self):
+        """The "raw" version of producer; can return a ``ByteStringObject``."""
+        return self.get("/Producer")
+
+
+def convertToInt(d, size):
+    """
+    Interpret ``d`` as a big-endian signed 64-bit integer.
+
+    ``d`` is left-padded with eight zero bytes and the last eight bytes of
+    the result are unpacked with the ``">q"`` format.
+
+    :param d: the bytes to convert.
+    :param size: declared width in bytes; only used for the range check.
+    :return: the decoded integer.
+    :raises utils.PdfReadError: if ``size`` is greater than 8.
+    """
+    if size > 8:
+        raise utils.PdfReadError("invalid size in convertToInt")
+    padded = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d)
+    (number,) = struct.unpack(">q", padded[-8:])
+    return number
+
+# ref: pdf1.8 spec section 3.5.2 algorithm 3.2
+# Fixed 32-byte padding string (12 + 14 + 6 bytes across the three literals
+# below).  The password helpers in this module append it to a password and
+# keep the first 32 characters of the result.
+_encryption_padding = b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56') + \
+ b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c') + \
+ b_('\xa9\xfe\x64\x53\x69\x7a')
+
+
+# Implementation of algorithm 3.2 of the PDF standard security handler,
+# section 3.5.2 of the PDF 1.6 reference.
+def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
+    """
+    Compute the encryption key from a password (algorithm 3.2).
+
+    :param password: the password, as str or bytes.
+    :param rev: revision number; the revision-3 steps run when it is >= 3.
+    :param keylen: number of key bytes to keep.
+    :param owner_entry: the /O entry; its ``original_bytes`` are hashed.
+    :param p_entry: the /P entry as an integer.
+    :param id1_entry: first element of the file identifier array; its
+        ``original_bytes`` are hashed.
+    :param metadata_encrypt: when false (and ``rev`` >= 3), four 0xFF bytes
+        are added to the hash.
+    :return: the first ``keylen`` bytes of the final MD5 digest.
+    """
+    # 1. Pad or truncate the password string to exactly 32 bytes.  If the
+    # password string is more than 32 bytes long, use only its first 32 bytes;
+    # if it is less than 32 bytes long, pad it by appending the required number
+    # of additional bytes from the beginning of the padding string
+    # (_encryption_padding).
+    password = b_((str_(password) + str_(_encryption_padding))[:32])
+    # 2. Initialize the MD5 hash function and pass the result of step 1 as
+    # input to this function.
+    import struct
+    m = md5(password)
+    # 3. Pass the value of the encryption dictionary's /O entry to the MD5 hash
+    # function.
+    m.update(owner_entry.original_bytes)
+    # 4. Treat the value of the /P entry as an unsigned 4-byte integer and pass
+    # these bytes to the MD5 hash function, low-order byte first.
+    # ('<i' is little-endian *signed*: it yields the same four bytes and
+    # accepts a /P value held as a negative Python int.)
+    p_entry = struct.pack('<i', p_entry)
+    m.update(p_entry)
+    # 5. Pass the first element of the file's file identifier array to the MD5
+    # hash function.
+    m.update(id1_entry.original_bytes)
+    # 6. (Revision 3 or greater) If document metadata is not being encrypted,
+    # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function.
+    if rev >= 3 and not metadata_encrypt:
+        m.update(b_("\xff\xff\xff\xff"))
+    # 7. Finish the hash.
+    md5_hash = m.digest()
+    # 8. (Revision 3 or greater) Do the following 50 times: Take the output
+    # from the previous MD5 hash and pass the first n bytes of the output as
+    # input into a new MD5 hash, where n is the number of bytes of the
+    # encryption key as defined by the value of the encryption dictionary's
+    # /Length entry.
+    if rev >= 3:
+        for i in range(50):
+            md5_hash = md5(md5_hash[:keylen]).digest()
+    # 9. Set the encryption key to the first n bytes of the output from the
+    # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or
+    # greater, depends on the value of the encryption dictionary's /Length
+    # entry.
+    return md5_hash[:keylen]
+
+
+# Implementation of algorithm 3.3 of the PDF standard security handler,
+# section 3.5.2 of the PDF 1.6 reference.
+def _alg33(owner_pwd, user_pwd, rev, keylen):
+    """
+    Compute the /O (owner password) entry (algorithm 3.3).
+
+    :param owner_pwd: owner password, passed on to ``_alg33_1``.
+    :param user_pwd: user password, as str or bytes.
+    :param rev: revision number; the extra 19 RC4 rounds run when it is
+        >= 3.
+    :param keylen: number of key bytes, passed on to ``_alg33_1``.
+    :return: the output of the final RC4 invocation.
+    """
+    # steps 1 - 4
+    key = _alg33_1(owner_pwd, rev, keylen)
+    # 5. Pad or truncate the user password string as described in step 1 of
+    # algorithm 3.2.  str_() is applied first, as _alg32 does, so that a
+    # bytes password can be concatenated with the padding string.
+    user_pwd = b_((str_(user_pwd) + str_(_encryption_padding))[:32])
+    # 6. Encrypt the result of step 5, using an RC4 encryption function with
+    # the encryption key obtained in step 4.
+    val = utils.RC4_encrypt(key, user_pwd)
+    # 7. (Revision 3 or greater) Do the following 19 times: Take the output
+    # from the previous invocation of the RC4 function and pass it as input to
+    # a new invocation of the function; use an encryption key generated by
+    # taking each byte of the encryption key obtained in step 4 and performing
+    # an XOR operation between that byte and the single-byte value of the
+    # iteration counter (from 1 to 19).
+    if rev >= 3:
+        for i in range(1, 20):
+            # Built as bytes, the same way _alg35 builds its round keys.
+            new_key = b_('')
+            for pos in range(len(key)):
+                new_key += b_(chr(ord_(key[pos]) ^ i))
+            val = utils.RC4_encrypt(new_key, val)
+    # 8. Store the output from the final invocation of the RC4 as the value of
+    # the /O entry in the encryption dictionary.
+    return val
+
+
+# Steps 1-4 of algorithm 3.3
+def _alg33_1(password, rev, keylen):
+    """
+    Derive the RC4 key used for the /O entry (steps 1-4 of algorithm 3.3).
+
+    :param password: the owner password, as str or bytes.
+    :param rev: revision number; the 50 extra MD5 rounds run when it is
+        >= 3.
+    :param keylen: number of key bytes to keep.
+    :return: the first ``keylen`` bytes of the final MD5 digest.
+    """
+    # 1. Pad or truncate the owner password string as described in step 1 of
+    # algorithm 3.2.  If there is no owner password, use the user password
+    # instead.  str_() is applied first, as _alg32 does, so that a bytes
+    # password can be concatenated with the padding string.
+    password = b_((str_(password) + str_(_encryption_padding))[:32])
+    # 2. Initialize the MD5 hash function and pass the result of step 1 as
+    # input to this function.
+    m = md5(password)
+    # 3. (Revision 3 or greater) Do the following 50 times: Take the output
+    # from the previous MD5 hash and pass it as input into a new MD5 hash.
+    md5_hash = m.digest()
+    if rev >= 3:
+        for i in range(50):
+            md5_hash = md5(md5_hash).digest()
+    # 4. Create an RC4 encryption key using the first n bytes of the output
+    # from the final MD5 hash, where n is always 5 for revision 2 but, for
+    # revision 3 or greater, depends on the value of the encryption
+    # dictionary's /Length entry.
+    return md5_hash[:keylen]
+
+
+# Implementation of algorithm 3.4 of the PDF standard security handler,
+# section 3.5.2 of the PDF 1.6 reference.
+def _alg34(password, owner_entry, p_entry, id1_entry):
+    """
+    Compute the /U (user password) entry for revision 2 (algorithm 3.4).
+
+    The key comes from ``_alg32`` with revision 2 and a 5-byte key length;
+    the padding string is then RC4-encrypted with it.
+
+    :return: a ``(U, key)`` tuple: the encrypted padding string and the
+        encryption key.
+    """
+    # Step 1: encryption key from the user password (algorithm 3.2).
+    enc_key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
+    # Steps 2-3: the RC4-encrypted padding string is the /U value.
+    user_entry = utils.RC4_encrypt(enc_key, _encryption_padding)
+    return user_entry, enc_key
+
+
+# Implementation of algorithm 3.5 of the PDF standard security handler,
+# section 3.5.2 of the PDF 1.6 reference.
+def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
+    """
+    Compute the /U (user password) entry for revision 3 or greater.
+
+    :return: a ``(U, key)`` tuple: the 32-byte /U value (16 bytes of RC4
+        output followed by 16 null bytes) and the encryption key.
+
+    NOTE(review): ``metadata_encrypt`` is accepted but not forwarded to
+    ``_alg32``, which therefore always uses its default.  Left as is,
+    because changing it would alter the keys callers currently obtain.
+    """
+    # 1. Create an encryption key based on the user password string, as
+    # described in Algorithm 3.2.
+    enc_key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
+    # 2. Initialize the MD5 hash function and pass the 32-byte padding string
+    # shown in step 1 of Algorithm 3.2 as input to this function.
+    hasher = md5()
+    hasher.update(_encryption_padding)
+    # 3. Pass the first element of the file's file identifier array to the
+    # hash function and finish the hash.
+    hasher.update(id1_entry.original_bytes)
+    digest = hasher.digest()
+    # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
+    # function with the encryption key from step 1.
+    result = utils.RC4_encrypt(enc_key, digest)
+    # 5. Do the following 19 times: feed the previous RC4 output into a new
+    # RC4 invocation whose key is the step 1 key with every byte XORed
+    # with the iteration counter (from 1 to 19).
+    for counter in range(1, 20):
+        round_key = b_('').join(
+            [b_(chr(ord_(enc_key[pos]) ^ counter)) for pos in range(len(enc_key))])
+        result = utils.RC4_encrypt(round_key, result)
+    # 6. Append 16 bytes of arbitrary padding to the output from the final
+    # invocation of the RC4 function and store the 32-byte result as the value
+    # of the U entry in the encryption dictionary.
+    # (implementator note: I don't know what "arbitrary padding" is supposed to
+    # mean, so I have used null bytes.  This seems to match a few other
+    # people's implementations)
+    return result + (b_('\x00') * 16), enc_key
diff --git a/vendor/PyPDF2/utils.py b/vendor/PyPDF2/utils.py
new file mode 100755
index 00000000..718a875c
--- /dev/null
+++ b/vendor/PyPDF2/utils.py
@@ -0,0 +1,295 @@
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+"""
+Utility functions for PDF library.
+"""
+__author__ = "Mathieu Fenniak"
+__author_email__ = "biziqe@mathieu.fenniak.net"
+
+
+import sys
+
+try:
+ import __builtin__ as builtins
+except ImportError: # Py3
+ import builtins
+
+
+# Python 2/3 compatibility aliases. Each getattr() falls back to the
+# Python 3 builtin when the Python 2 name does not exist.
+xrange_fn = getattr(builtins, "xrange", range)
+_basestring = getattr(builtins, "basestring", str)
+string_type = getattr(builtins, "unicode", str)
+
+# ``bytes`` names the byte-string type on both major versions.
+bytes_type = bytes
+
+if sys.version_info[0] < 3:
+    int_types = (int, long)
+else:
+    int_types = (int,)
+
+
+# Make basic type tests more consistent
+def isString(s):
+    """
+    Report whether ``s`` is an instance of the module's ``_basestring`` alias.
+
+    :param s: object to test.
+    :return: result of the ``isinstance`` check.
+    :rtype: bool
+    """
+    is_string = isinstance(s, _basestring)
+    return is_string
+
+
+def isInt(n):
+    """
+    Report whether ``n`` is an instance of one of the module's ``int_types``.
+
+    :param n: object to test.
+    :return: result of the ``isinstance`` check.
+    :rtype: bool
+    """
+    is_int = isinstance(n, int_types)
+    return is_int
+
+
+def isBytes(b):
+    """
+    Report whether ``b`` is an instance of the module's ``bytes_type``.
+
+    :param b: object to test.
+    :return: result of the ``isinstance`` check.
+    :rtype: bool
+    """
+    is_bytes = isinstance(b, bytes_type)
+    return is_bytes
+
+
+#custom implementation of warnings.formatwarning
+def formatWarning(message, category, filename, lineno, line=None):
+    """
+    Format a warning as ``<Category>: <message> [<file>:<lineno>]`` followed
+    by a newline.
+
+    Same signature as ``warnings.formatwarning``; only the final component
+    of ``filename`` is shown.
+
+    :param message: warning message.
+    :param category: warning class; its ``__name__`` is printed.
+    :param str filename: path (POSIX or Windows separators) or bare name of
+        the file that issued the warning.
+    :param lineno: line number that issued the warning.
+    :param line: accepted for signature compatibility; not used.
+    :return: the formatted warning text.
+    :rtype: str
+    """
+    # Normalise separators so both path styles split the same way. Taking
+    # the last piece (not index 1) keeps names without any separator, such
+    # as "<stdin>", from raising IndexError.
+    base_name = filename.replace("/", "\\").rsplit("\\", 1)[-1]
+    return "%s: %s [%s:%s]\n" % (category.__name__, message, base_name, lineno)
+
+
+def readUntilWhitespace(stream, maxchars=None):
+    """
+    Read one character at a time and return the leading non-whitespace run.
+
+    Reading stops at the first whitespace character (which is consumed but
+    not returned), at end of stream, or once ``maxchars`` characters have
+    been collected.
+
+    :param stream: object whose ``read(1)`` returns one character.
+    :param maxchars: optional cap on the number of characters returned.
+    :return: the characters read before the stop condition.
+    """
+    collected = b_("")
+    char = stream.read(1)
+    while not (char.isspace() or not char):
+        collected += char
+        if len(collected) == maxchars:
+            break
+        char = stream.read(1)
+    return collected
+
+
+def readNonWhitespace(stream):
+    """
+    Skip characters listed in ``WHITESPACES`` and return the first one
+    that is not.
+
+    :param stream: object whose ``read(1)`` returns one character.
+    :return: the first character read that is not in ``WHITESPACES``;
+        whatever ``read`` returns at end of stream is returned as-is.
+    """
+    char = stream.read(1)
+    while char in WHITESPACES:
+        char = stream.read(1)
+    return char
+
+
+def skipOverWhitespace(stream):
+    """
+    Consume characters up to and including the first one that is not in
+    ``WHITESPACES``.
+
+    The stream is not rewound, so the first non-whitespace character is
+    consumed as well.
+
+    :param stream: object whose ``read(1)`` returns one character.
+    :return: True if more than one read was needed, i.e. at least one
+        whitespace character preceded the stopping character.
+    :rtype: bool
+    """
+    reads = 1
+    char = stream.read(1)
+    while char in WHITESPACES:
+        char = stream.read(1)
+        reads += 1
+    return reads > 1
+
+
+def skipOverComment(stream):
+    """
+    If the next character is ``%``, consume the comment it starts.
+
+    Characters are consumed up to and including the first newline or
+    carriage return. When the next character is not ``%`` the stream
+    position is left unchanged.
+
+    :param stream: seekable stream whose ``read(1)`` returns one character.
+    """
+    tok = stream.read(1)
+    if not tok:
+        # End of stream: nothing was read, so there is nothing to rewind.
+        return
+    # Peek only: step back over the character just read.
+    stream.seek(-1, 1)
+    if tok == b_('%'):
+        while tok not in (b_('\n'), b_('\r')):
+            tok = stream.read(1)
+            if not tok:
+                # Comment ran to end of stream without a line terminator;
+                # stop instead of looping forever on empty reads.
+                break
+
+
+def readUntilRegex(stream, regex, ignore_eof=False):
+    """
+    Read until ``regex`` matches and return everything before the match.
+
+    The stream is read in 16-byte chunks and each chunk is searched on its
+    own, so a match that straddles a chunk boundary is not detected. After
+    a match the stream is repositioned to the start of the match; the
+    matched text itself is not consumed.
+
+    :param stream: seekable stream to read from.
+    :param regex: compiled pattern providing ``search``.
+    :param bool ignore_eof: If true, return what was read so far when the
+        end of the stream is reached instead of raising.
+    :return: the data read before the match.
+    :raises PdfStreamError: on premature end-of-file, unless ``ignore_eof``.
+    """
+    collected = b_('')
+    while True:
+        chunk = stream.read(16)
+        if not chunk:
+            # stream has truncated prematurely
+            if ignore_eof == True:
+                return collected
+            raise PdfStreamError("Stream has ended unexpectedly")
+        found = regex.search(chunk)
+        if found is None:
+            collected += chunk
+            continue
+        cut = found.start()
+        collected += chunk[:cut]
+        # Rewind over the unread tail of this chunk, match included.
+        stream.seek(cut - len(chunk), 1)
+        return collected
+
+
+class ConvertFunctionsToVirtualList(object):
+    """
+    Read-only, list-like view backed by two callables.
+
+    ``lengthFunction()`` supplies the length and ``getFunction(index)``
+    supplies the item at a non-negative index. Negative indices and slices
+    are handled here; a slice yields another instance of this class.
+    """
+
+    def __init__(self, lengthFunction, getFunction):
+        self.getFunction = getFunction
+        self.lengthFunction = lengthFunction
+
+    def __len__(self):
+        return self.lengthFunction()
+
+    def __getitem__(self, index):
+        if isinstance(index, slice):
+            # Lazy slice: a new virtual list that maps its own positions
+            # back onto the selected positions of this one.
+            selected = xrange_fn(*index.indices(len(self)))
+            return type(self)(selected.__len__, lambda pos: self[selected[pos]])
+        if not isInt(index):
+            raise TypeError("sequence indices must be integers")
+        size = len(self)
+        # support negative indexes
+        position = size + index if index < 0 else index
+        if not 0 <= position < size:
+            raise IndexError("sequence index out of range")
+        return self.getFunction(position)
+
+
+def RC4_encrypt(key, plaintext):
+    """
+    Apply the RC4 stream cipher to ``plaintext`` using ``key``.
+
+    :param key: non-empty key; each element is passed through ``ord_``.
+    :param plaintext: data to process; each element is passed through
+        ``ord_``.
+    :return: output of the same length as ``plaintext``, built with ``b_``.
+    """
+    # Key scheduling: permute the 256-entry state under control of the key.
+    state = list(range(256))
+    key_length = len(key)
+    j = 0
+    for i in range(256):
+        j = (j + state[i] + ord_(key[i % key_length])) % 256
+        state[i], state[j] = state[j], state[i]
+    # Keystream generation, XORed element by element with the input.
+    i = j = 0
+    output = []
+    for symbol in plaintext:
+        i = (i + 1) % 256
+        j = (j + state[i]) % 256
+        state[i], state[j] = state[j], state[i]
+        keystream_byte = state[(state[i] + state[j]) % 256]
+        output.append(b_(chr(ord_(symbol) ^ keystream_byte)))
+    return b_("").join(output)
+
+
+def matrixMultiply(a, b):
+    """
+    Multiply two matrices given as sequences of rows.
+
+    Every element is converted with ``float`` before multiplying.
+
+    :param a: left matrix, a sequence of rows.
+    :param b: right matrix, a sequence of rows.
+    :return: the product as a list of row lists.
+    :rtype: list
+    """
+    product = []
+    for row in a:
+        product_row = []
+        # zip(*b) walks the columns of b.
+        for column in zip(*b):
+            product_row.append(sum(float(x) * float(y) for x, y in zip(row, column)))
+        product.append(product_row)
+    return product
+
+
+def markLocation(stream):
+    """
+    Dump the data surrounding the current stream position to a file.
+
+    Debugging aid: writes up to RADIUS bytes before the current position,
+    the marker ``HERE``, and up to RADIUS bytes after it to
+    ``PyPDF2_pdfLocation.txt``, then restores the stream position.
+
+    :param stream: seekable stream supporting ``tell``, ``seek`` and
+        ``read``.
+    """
+    RADIUS = 5000
+    here = stream.tell()
+    # Clamp so a position closer than RADIUS to the start does not seek
+    # before the beginning of the stream.
+    start = max(here - RADIUS, 0)
+    stream.seek(start, 0)
+    # Binary mode: a stream opened in binary mode yields bytes, which a
+    # text-mode file rejects on Python 3. b_() passes bytes through and
+    # encodes str as latin-1. The with-block closes the file even if a
+    # read or write fails.
+    with open('PyPDF2_pdfLocation.txt', 'wb') as outputDoc:
+        outputDoc.write(b_(stream.read(here - start)))
+        outputDoc.write(b_('HERE'))
+        outputDoc.write(b_(stream.read(RADIUS)))
+    # Return to the exact starting position, even if a read came up short.
+    stream.seek(here, 0)
+
+
+class PyPdfError(Exception):
+    """Base class of the exception types defined in this module."""
+
+
+class PdfReadError(PyPdfError):
+    """Error type for failures while reading a PDF; see raising call sites."""
+
+
+class PageSizeNotDefinedError(PyPdfError):
+    """Error type named for a page size that is not defined."""
+
+
+class PdfReadWarning(UserWarning):
+    """Warning category for PDF reading; not part of the error hierarchy."""
+
+
+class PdfStreamError(PdfReadError):
+    """Read error for stream problems, e.g. a stream that ends unexpectedly."""
+
+
+if sys.version_info[0] < 3:
+    def b_(s):
+        """Return ``s`` unchanged (Python 2 variant)."""
+        return s
+else:
+    # Encoded forms of strings shorter than two characters.
+    B_CACHE = {}
+
+    def b_(s):
+        """
+        Return ``s`` as bytes.
+
+        :param s: ``bytes`` (returned unchanged, subclasses included) or a
+            ``str`` to encode as latin-1.
+        :return: the bytes form of ``s``.
+        :rtype: bytes
+        """
+        # Check for bytes first: nothing to convert, and isinstance() also
+        # accepts bytes subclasses, which have no encode() method.
+        if isinstance(s, bytes):
+            return s
+        bc = B_CACHE
+        if s in bc:
+            return bc[s]
+        r = s.encode('latin-1')
+        # Only very short strings are cached, keeping the cache small.
+        if len(s) < 2:
+            bc[s] = r
+        return r
+
+
+def u_(s):
+    """
+    Return ``s`` as text.
+
+    On Python 3 the argument is returned unchanged; on Python 2 it is
+    decoded with the ``unicode_escape`` codec.
+
+    :param s: string to convert.
+    :return: the text form of ``s``.
+    """
+    if sys.version_info[0] >= 3:
+        return s
+    return unicode(s, 'unicode_escape')
+
+
+def str_(b):
+    """
+    Return ``b`` as a native string.
+
+    On Python 3 an object whose type is exactly ``bytes`` is decoded as
+    latin-1; anything else, and everything on Python 2, is returned
+    unchanged.
+
+    :param b: value to convert.
+    :return: decoded text or the original object.
+    """
+    if sys.version_info[0] >= 3 and type(b) == bytes:
+        return b.decode('latin-1')
+    return b
+
+
+def ord_(b):
+    """
+    Return the integer code of ``b``.
+
+    ``ord`` is applied on Python 2 and to ``str`` arguments; any other
+    value (for example an int obtained by indexing bytes on Python 3) is
+    returned unchanged.
+
+    :param b: a one-character string or an integer.
+    :return: the integer code.
+    """
+    needs_ord = sys.version_info[0] < 3 or type(b) == str
+    return ord(b) if needs_ord else b
+
+
+def chr_(c):
+    """
+    Return ``chr(c)`` on Python 3; on Python 2 return ``c`` unchanged.
+
+    :param c: value to convert.
+    :return: the converted value.
+    """
+    return c if sys.version_info[0] < 3 else chr(c)
+
+
+def barray(b):
+    """
+    Return ``bytearray(b)`` on Python 3; on Python 2 return ``b`` unchanged.
+
+    :param b: value to convert.
+    :return: the converted value.
+    """
+    return b if sys.version_info[0] < 3 else bytearray(b)
+
+
+def hexencode(b):
+    """
+    Return the hexadecimal encoding of ``b``.
+
+    Python 2 uses ``b.encode('hex')``; Python 3 uses the ``hex_codec``
+    encoder from ``codecs``.
+
+    :param b: data to encode.
+    :return: the hex-encoded data.
+    """
+    if sys.version_info[0] >= 3:
+        import codecs
+        encoded, _consumed = codecs.getencoder('hex_codec')(b)
+        return encoded
+    return b.encode('hex')
+
+
+def hexStr(num):
+    """
+    Return ``hex(num)`` with every ``L`` removed (the suffix Python 2 adds
+    to long integers).
+
+    :param num: integer to format.
+    :return: hexadecimal text such as ``0xff``.
+    :rtype: str
+    """
+    return "".join(hex(num).split('L'))
+
+
+# Characters the whitespace helpers above skip over: space, newline,
+# carriage return, tab and NUL, each converted with b_().
+WHITESPACES = list(map(b_, (' ', '\n', '\r', '\t', '\x00')))
diff --git a/vendor/PyPDF2/xmp.py b/vendor/PyPDF2/xmp.py
new file mode 100755
index 00000000..7ba62f0d
--- /dev/null
+++ b/vendor/PyPDF2/xmp.py
@@ -0,0 +1,358 @@
+import re
+import datetime
+import decimal
+from .generic import PdfObject
+from xml.dom import getDOMImplementation
+from xml.dom.minidom import parseString
+from .utils import u_
+
+# XML namespace URIs used below: RDF_NAMESPACE locates the RDF container
+# elements (RDF, Description, Bag, Seq, Alt, li); the others are the
+# namespaces of the dc_*, xmp_*, pdf_* and xmpmm_* properties.
+RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
+XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
+PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
+XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
+
+# What is the PDFX namespace, you might ask? I might ask that too. It's
+# a completely undocumented namespace used to place "custom metadata"
+# properties, which are arbitrary metadata properties with no semantic or
+# documented meaning. Elements in the namespace are key/value-style storage,
+# where the element name is the key and the content is the value. The keys
+# are transformed into valid XML identifiers by substituting an invalid
+# identifier character with \u2182 followed by the unicode hex ID of the
+# original character. A key like "my car" is therefore "my\u21820020car".
+#
+# \u2182, in case you're wondering, is the unicode character
+# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
+# escaping characters.
+#
+# Intentional use of the pdfx namespace is strongly discouraged. A custom
+# data schema and sensible XML elements could be used instead, as is
+# suggested by Adobe's own documentation on XMP (under "Extensibility of
+# Schemas").
+#
+# Information presented here on the /pdfx/ schema is a result of limited
+# reverse engineering, and does not constitute a full specification.
+PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
+
+# Date/time pattern consumed by XmpInformation._converter_date, which reads
+# the named groups year, month, day, hour, minute, second and tzd. Only the
+# year is mandatory; each later part is optional, but a time part must end
+# in a zone designator ("Z" or a +hh:mm / -hh:mm offset).
+iso8601 = re.compile(r"""
+        (?P<year>[0-9]{4})
+        (-
+            (?P<month>[0-9]{2})
+            (-
+                (?P<day>[0-9]+)
+                (T
+                    (?P<hour>[0-9]{2}):
+                    (?P<minute>[0-9]{2})
+                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
+                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
+                )?
+            )?
+        )?
+        """, re.VERBOSE)
+
+
+class XmpInformation(PdfObject):
+    """
+    An object that represents Adobe XMP metadata.
+    Usually accessed by :meth:`getXmpMetadata()`
+
+    The XML held by the wrapped stream is parsed once at construction;
+    property values are extracted on first access and kept in
+    ``self.cache``, keyed by namespace and then by property name.
+    """
+
+    def __init__(self, stream):
+        """
+        :param stream: object whose ``getData()`` returns the XMP packet and
+            whose ``writeToStream`` is used to serialise this object.
+        """
+        self.stream = stream
+        docRoot = parseString(self.stream.getData())
+        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
+        self.cache = {}
+
+    def writeToStream(self, stream, encryption_key):
+        """Delegate serialisation to the wrapped stream object."""
+        self.stream.writeToStream(stream, encryption_key)
+
+    def getElement(self, aboutUri, namespace, name):
+        """
+        Yield the attribute node and/or child elements called ``name`` in
+        ``namespace`` from every rdf:Description whose rdf:about equals
+        ``aboutUri``.
+        """
+        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
+            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
+                attr = desc.getAttributeNodeNS(namespace, name)
+                if attr is not None:
+                    yield attr
+                for element in desc.getElementsByTagNameNS(namespace, name):
+                    yield element
+
+    def getNodesInNamespace(self, aboutUri, namespace):
+        """
+        Yield every attribute and direct child node belonging to
+        ``namespace`` from each rdf:Description whose rdf:about equals
+        ``aboutUri``.
+        """
+        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
+            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
+                for i in range(desc.attributes.length):
+                    attr = desc.attributes.item(i)
+                    if attr.namespaceURI == namespace:
+                        yield attr
+                for child in desc.childNodes:
+                    if child.namespaceURI == namespace:
+                        yield child
+
+    def _getText(self, element):
+        """Return the concatenated data of the direct text children of ``element``."""
+        text = ""
+        for child in element.childNodes:
+            if child.nodeType == child.TEXT_NODE:
+                text += child.data
+        return text
+
+    # The helpers below take no ``self``: they are called as plain functions
+    # while the class body executes, to build the property getters.
+
+    def _converter_string(value):
+        """Identity converter for plain text values."""
+        return value
+
+    def _converter_date(value):
+        """
+        Convert a date/time string matched by ``iso8601`` into a naive
+        ``datetime.datetime`` expressed in UTC.
+
+        Missing month and day default to 1, missing time fields to 0, and a
+        missing zone designator is treated as ``Z``.
+        """
+        m = iso8601.match(value)
+        year = int(m.group("year"))
+        month = int(m.group("month") or "1")
+        day = int(m.group("day") or "1")
+        hour = int(m.group("hour") or "0")
+        minute = int(m.group("minute") or "0")
+        second = decimal.Decimal(m.group("second") or "0")
+        whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
+        # datetime() needs integers; recent Python 3 releases reject Decimal
+        # arguments, so convert both values explicitly.
+        microseconds = int((second - whole_seconds) * 1000000)
+        tzd = m.group("tzd") or "Z"
+        dt = datetime.datetime(year, month, day, hour, minute,
+                               int(whole_seconds), microseconds)
+        if tzd != "Z":
+            # tzd is "+hh:mm" or "-hh:mm". Take the sign from the leading
+            # character so that offsets with a zero hour field (for example
+            # "+00:30") are applied in the right direction.
+            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
+            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
+            if tzd[0] == "-":
+                dt = dt + offset
+            else:
+                dt = dt - offset
+        return dt
+    _test_converter_date = staticmethod(_converter_date)
+
+    def _getter_bag(namespace, name, converter):
+        """Build a getter returning the converted rdf:li items of every rdf:Bag."""
+        def get(self):
+            cached = self.cache.get(namespace, {}).get(name)
+            if cached:
+                return cached
+            retval = []
+            for element in self.getElement("", namespace, name):
+                bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
+                if len(bags):
+                    for bag in bags:
+                        for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
+                            value = self._getText(item)
+                            value = converter(value)
+                            retval.append(value)
+            ns_cache = self.cache.setdefault(namespace, {})
+            ns_cache[name] = retval
+            return retval
+        return get
+
+    def _getter_seq(namespace, name, converter):
+        """
+        Build a getter returning the converted rdf:li items of every rdf:Seq,
+        or the element's own text when it contains no rdf:Seq.
+        """
+        def get(self):
+            cached = self.cache.get(namespace, {}).get(name)
+            if cached:
+                return cached
+            retval = []
+            for element in self.getElement("", namespace, name):
+                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
+                if len(seqs):
+                    for seq in seqs:
+                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
+                            value = self._getText(item)
+                            value = converter(value)
+                            retval.append(value)
+                else:
+                    value = converter(self._getText(element))
+                    retval.append(value)
+            ns_cache = self.cache.setdefault(namespace, {})
+            ns_cache[name] = retval
+            return retval
+        return get
+
+    def _getter_langalt(namespace, name, converter):
+        """
+        Build a getter returning a dict keyed by each rdf:li's xml:lang, or
+        ``{"x-default": text}`` when the element contains no rdf:Alt.
+        """
+        def get(self):
+            cached = self.cache.get(namespace, {}).get(name)
+            if cached:
+                return cached
+            retval = {}
+            for element in self.getElement("", namespace, name):
+                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
+                if len(alts):
+                    for alt in alts:
+                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
+                            value = self._getText(item)
+                            value = converter(value)
+                            retval[item.getAttribute("xml:lang")] = value
+                else:
+                    retval["x-default"] = converter(self._getText(element))
+            ns_cache = self.cache.setdefault(namespace, {})
+            ns_cache[name] = retval
+            return retval
+        return get
+
+    def _getter_single(namespace, name, converter):
+        """
+        Build a getter returning the converted value of the first matching
+        attribute or element, or None when there is no match.
+        """
+        def get(self):
+            cached = self.cache.get(namespace, {}).get(name)
+            if cached:
+                return cached
+            value = None
+            for element in self.getElement("", namespace, name):
+                if element.nodeType == element.ATTRIBUTE_NODE:
+                    value = element.nodeValue
+                else:
+                    value = self._getText(element)
+                break
+            if value is not None:
+                value = converter(value)
+            ns_cache = self.cache.setdefault(namespace, {})
+            ns_cache[name] = value
+            return value
+        return get
+
+    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
+    """
+    Contributors to the resource (other than the authors). An unsorted
+    array of names.
+    """
+
+    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
+    """
+    Text describing the extent or scope of the resource.
+    """
+
+    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
+    """
+    A sorted array of names of the authors of the resource, listed in order
+    of precedence.
+    """
+
+    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
+    """
+    A sorted array of dates (datetime.datetime instances) of significance to
+    the resource. The dates and times are in UTC.
+    """
+
+    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
+    """
+    A language-keyed dictionary of textual descriptions of the content of the
+    resource.
+    """
+
+    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
+    """
+    The mime-type of the resource.
+    """
+
+    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
+    """
+    Unique identifier of the resource.
+    """
+
+    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
+    """
+    An unordered array specifying the languages used in the resource.
+    """
+
+    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
+    """
+    An unordered array of publisher names.
+    """
+
+    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
+    """
+    An unordered array of text descriptions of relationships to other
+    documents.
+    """
+
+    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
+    """
+    A language-keyed dictionary of textual descriptions of the rights the
+    user has to this resource.
+    """
+
+    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
+    """
+    Unique identifier of the work from which this resource was derived.
+    """
+
+    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
+    """
+    An unordered array of descriptive phrases or keywords that specify the
+    topic of the content of the resource.
+    """
+
+    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
+    """
+    A language-keyed dictionary of the title of the resource.
+    """
+
+    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
+    """
+    An unordered array of textual descriptions of the document type.
+    """
+
+    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
+    """
+    An unformatted text string representing document keywords.
+    """
+
+    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
+    """
+    The PDF file version, for example 1.0, 1.3.
+    """
+
+    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
+    """
+    The name of the tool that created the PDF document.
+    """
+
+    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
+    """
+    The date and time the resource was originally created. The date and
+    time are returned as a UTC datetime.datetime object.
+    """
+
+    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
+    """
+    The date and time the resource was last modified. The date and time
+    are returned as a UTC datetime.datetime object.
+    """
+
+    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
+    """
+    The date and time that any metadata for this resource was last
+    changed. The date and time are returned as a UTC datetime.datetime
+    object.
+    """
+
+    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
+    """
+    The name of the first known tool used to create the resource.
+    """
+
+    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
+    """
+    The common identifier for all versions and renditions of this resource.
+    """
+
+    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
+    """
+    An identifier for a specific incarnation of a document, updated each
+    time a file is saved.
+    """
+
+    def custom_properties(self):
+        if not hasattr(self, "_custom_properties"):
+            self._custom_properties = {}
+            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
+                key = node.localName
+                while True:
+                    # see documentation about PDFX_NAMESPACE earlier in file
+                    idx = key.find(u_("\u2182"))
+                    if idx == -1:
+                        break
+                    # Replace the escape marker and the four hex digits that
+                    # follow it with the character they encode.
+                    key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
+                if node.nodeType == node.ATTRIBUTE_NODE:
+                    value = node.nodeValue
+                else:
+                    value = self._getText(node)
+                self._custom_properties[key] = value
+        return self._custom_properties
+
+    custom_properties = property(custom_properties)
+    """
+    Retrieves custom metadata properties defined in the undocumented pdfx
+    metadata schema.
+
+    :return: a dictionary of key/value items for custom metadata properties.
+    :rtype: dict
+    """