From 829e926770d74ee03b9106dff4ed39ffc9c532a0 Mon Sep 17 00:00:00 2001 From: Cervinko Cera Date: Sun, 7 Aug 2016 22:12:32 +0200 Subject: [PATCH] Add PyPDF2 to vendor --- vendor/PyPDF2/__init__.py | 5 + vendor/PyPDF2/_version.py | 1 + vendor/PyPDF2/filters.py | 362 +++++ vendor/PyPDF2/generic.py | 1226 +++++++++++++++ vendor/PyPDF2/merger.py | 553 +++++++ vendor/PyPDF2/pagerange.py | 152 ++ vendor/PyPDF2/pdf.py | 3004 ++++++++++++++++++++++++++++++++++++ vendor/PyPDF2/utils.py | 295 ++++ vendor/PyPDF2/xmp.py | 358 +++++ 9 files changed, 5956 insertions(+) create mode 100755 vendor/PyPDF2/__init__.py create mode 100755 vendor/PyPDF2/_version.py create mode 100755 vendor/PyPDF2/filters.py create mode 100755 vendor/PyPDF2/generic.py create mode 100755 vendor/PyPDF2/merger.py create mode 100755 vendor/PyPDF2/pagerange.py create mode 100755 vendor/PyPDF2/pdf.py create mode 100755 vendor/PyPDF2/utils.py create mode 100755 vendor/PyPDF2/xmp.py diff --git a/vendor/PyPDF2/__init__.py b/vendor/PyPDF2/__init__.py new file mode 100755 index 00000000..f458c0ea --- /dev/null +++ b/vendor/PyPDF2/__init__.py @@ -0,0 +1,5 @@ +from .pdf import PdfFileReader, PdfFileWriter +from .merger import PdfFileMerger +from .pagerange import PageRange, parse_filename_page_ranges +from ._version import __version__ +__all__ = ["pdf", "PdfFileMerger"] diff --git a/vendor/PyPDF2/_version.py b/vendor/PyPDF2/_version.py new file mode 100755 index 00000000..5fc7041e --- /dev/null +++ b/vendor/PyPDF2/_version.py @@ -0,0 +1 @@ +__version__ = '1.26.0' diff --git a/vendor/PyPDF2/filters.py b/vendor/PyPDF2/filters.py new file mode 100755 index 00000000..3717fd4c --- /dev/null +++ b/vendor/PyPDF2/filters.py @@ -0,0 +1,362 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. 
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


"""
Implementation of stream filters for PDF.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

from .utils import PdfReadError, ord_, chr_
from sys import version_info
if version_info < ( 3, 0 ):
    from cStringIO import StringIO
else:
    from io import StringIO
    import struct

try:
    import zlib

    def decompress(data):
        """Inflate zlib-compressed *data* and return the result."""
        return zlib.decompress(data)

    def compress(data):
        """Deflate *data* with zlib and return the result."""
        return zlib.compress(data)

except ImportError:
    # Unable to import zlib.  Attempt to use the System.IO.Compression
    # library from the .NET framework. (IronPython only)
    import System
    from System import IO, Collections, Array

    def _string_to_bytearr(buf):
        """Copy the characters of *buf* into a new .NET byte array."""
        retval = Array.CreateInstance(System.Byte, len(buf))
        for i in range(len(buf)):
            retval[i] = ord(buf[i])
        return retval

    def _bytearr_to_string(bytes):
        """Build a string with one character per element of *bytes*."""
        retval = ""
        for i in range(bytes.Length):
            retval += chr(bytes[i])
        return retval

    def _read_bytes(stream):
        """Drain *stream* in 2048-byte reads and return everything read."""
        ms = IO.MemoryStream()
        buf = Array.CreateInstance(System.Byte, 2048)
        while True:
            bytes = stream.Read(buf, 0, buf.Length)
            if bytes == 0:
                break
            else:
                ms.Write(buf, 0, bytes)
        retval = ms.ToArray()
        ms.Close()
        return retval

    def decompress(data):
        """Inflate *data* through System.IO.Compression.DeflateStream."""
        bytes = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        ms.Write(bytes, 0, bytes.Length)
        ms.Position = 0  # fseek 0
        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Decompress)
        bytes = _read_bytes(gz)
        retval = _bytearr_to_string(bytes)
        gz.Close()
        return retval

    def compress(data):
        """Deflate *data* through System.IO.Compression.DeflateStream."""
        bytes = _string_to_bytearr(data)
        ms = IO.MemoryStream()
        gz = IO.Compression.DeflateStream(ms, IO.Compression.CompressionMode.Compress, True)
        gz.Write(bytes, 0, bytes.Length)
        gz.Close()
        ms.Position = 0  # fseek 0
        bytes = ms.ToArray()
        retval = _bytearr_to_string(bytes)
        ms.Close()
        return retval


class FlateDecode(object):
    """Encoder/decoder for the /FlateDecode stream filter."""

    def decode(data, decodeParms):
        """Decompress *data* and undo the predictor named in *decodeParms*.

        *decodeParms* may be falsy or an object without ``get`` (both mean
        "no predictor").  Predictor 1 returns the decompressed data as is.
        Predictors 10-15 are treated as per-row PNG prediction, where only
        row filter bytes 0, 1 and 2 are handled.

        Raises PdfReadError for any other predictor or row filter byte.
        """
        data = decompress(data)
        predictor = 1
        if decodeParms:
            try:
                predictor = decodeParms.get("/Predictor", 1)
            except AttributeError:
                pass    # usually an array with a null object was read

        # predictor 1 == no predictor
        if predictor != 1:
            columns = decodeParms["/Columns"]
            # PNG prediction:
            if predictor >= 10 and predictor <= 15:
                output = StringIO()
                # PNG prediction can vary from row to row
                # Each row is one filter byte followed by `columns` data bytes.
                rowlength = columns + 1
                assert len(data) % rowlength == 0
                prev_rowdata = (0,) * rowlength
                for row in range(len(data) // rowlength):
                    rowdata = [ord_(x) for x in data[(row*rowlength):((row+1)*rowlength)]]
                    filterByte = rowdata[0]
                    if filterByte == 0:
                        pass
                    elif filterByte == 1:
                        # add the already-decoded byte to the left
                        for i in range(2, rowlength):
                            rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256
                    elif filterByte == 2:
                        # add the byte at the same position in the previous row
                        for i in range(1, rowlength):
                            rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
                    else:
                        # unsupported PNG filter
                        raise PdfReadError("Unsupported PNG filter %r" % filterByte)
                    prev_rowdata = rowdata
                    # the filter byte itself is not part of the output
                    output.write(''.join([chr(x) for x in rowdata[1:]]))
                data = output.getvalue()
            else:
                # unsupported predictor
                raise PdfReadError("Unsupported flatedecode predictor %r" % predictor)
        return data
    decode = staticmethod(decode)

    def encode(data):
        """Return *data* compressed with ``compress``."""
        return compress(data)
    encode = staticmethod(encode)


class ASCIIHexDecode(object):
    """Decoder for the /ASCIIHexDecode stream filter."""

    def decode(data, decodeParms=None):
        """Turn pairs of hexadecimal digits in *data* into characters.

        Whitespace is skipped and decoding stops at the ``>`` end-of-data
        marker.  Reaching the end of *data* without a marker is treated the
        same as hitting the marker (this used to raise IndexError).  A
        trailing unpaired digit is padded with ``0`` instead of tripping an
        assertion.  *decodeParms* is accepted and ignored.
        """
        retval = ""
        char = ""
        for c in data:
            if c == ">":
                break
            if c.isspace():
                continue
            char += c
            if len(char) == 2:
                retval += chr(int(char, base=16))
                char = ""
        if char:
            # odd number of digits: behave as if a 0 followed the last one
            retval += chr(int(char + "0", base=16))
        return retval
    decode = staticmethod(decode)


class LZWDecode(object):
    """Taken from:
    http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
    """
    class decoder(object):
        """Stateful bit reader and dictionary for one LZW-encoded buffer."""

        def __init__(self, data):
            self.STOP=257
            self.CLEARDICT=256
            self.data=data
            self.bytepos=0
            self.bitpos=0
            self.dict=[""]*4096
            for i in range(256):
                self.dict[i]=chr(i)
            self.resetDict()

        def resetDict(self):
            """Go back to the initial dictionary size and 9-bit codes."""
            self.dictlen=258
            self.bitspercode=9

        def nextCode(self):
            """Return the next ``bitspercode``-bit code, or -1 when the data
            runs out before a full code could be read."""
            fillbits=self.bitspercode
            value=0
            while fillbits>0 :
                if self.bytepos >= len(self.data):
                    return -1
                # ord_ (already used by FlateDecode) accepts both a 1-char
                # string and the int that indexing bytes yields on Python 3.
                nextbits=ord_(self.data[self.bytepos])
                bitsfromhere=8-self.bitpos
                if bitsfromhere>fillbits:
                    bitsfromhere=fillbits
                value |= (((nextbits >> (8-self.bitpos-bitsfromhere)) &
                           (0xff >> (8-bitsfromhere))) <<
                          (fillbits-bitsfromhere))
                fillbits -= bitsfromhere
                self.bitpos += bitsfromhere
                if self.bitpos >=8:
                    self.bitpos=0
                    self.bytepos = self.bytepos+1
            return value

        def decode(self):
            """ algorithm derived from:
            http://www.rasip.fer.hr/research/compress/algorithms/fund/lz/lzw.html
            and the PDFReference

            Returns the decoded string; raises PdfReadError when the data
            ends before the stop code (257) is seen.
            """
            cW = self.CLEARDICT;
            baos=""
            while True:
                pW = cW;
                cW = self.nextCode();
                if cW == -1:
                    raise PdfReadError("Missed the stop code in LZWDecode!")
                if cW == self.STOP:
                    break;
                elif cW == self.CLEARDICT:
                    self.resetDict();
                elif pW == self.CLEARDICT:
                    baos+=self.dict[cW]
                else:
                    if cW < self.dictlen:
                        baos += self.dict[cW]
                        p=self.dict[pW]+self.dict[cW][0]
                        self.dict[self.dictlen]=p
                        self.dictlen+=1
                    else:
                        # code not in the dictionary yet: it must be the
                        # previous entry plus that entry's first character
                        p=self.dict[pW]+self.dict[pW][0]
                        baos+=p
                        self.dict[self.dictlen] = p;
                        self.dictlen+=1
                    if (self.dictlen >= (1 << self.bitspercode) - 1 and
                        self.bitspercode < 12):
                        self.bitspercode+=1
            return baos

    @staticmethod
    def decode(data,decodeParams=None):
        """Decode LZW-compressed *data*; *decodeParams* is ignored."""
        return LZWDecode.decoder(data).decode()


class ASCII85Decode(object):
    """Decoder for the /ASCII85Decode stream filter."""

    def decode(data, decodeParms=None):
        """Decode ASCII85 *data* up to the ``~>`` end marker.

        Two separate implementations are used: a string-based one on
        Python 2 and a bytes/struct-based one on Python 3 (which returns
        ``bytes``).  *decodeParms* is accepted and ignored.
        """
        if version_info < ( 3, 0 ):
            retval = ""
            group = []
            x = 0
            hitEod = False
            # remove all whitespace from data
            data = [y for y in data if not (y in ' \n\r\t')]
            while not hitEod:
                c = data[x]
                if len(retval) == 0 and c == "<" and data[x+1] == "~":
                    x += 2
                    continue
                #elif c.isspace():
                #    x += 1
                #    continue
                elif c == 'z':
                    assert len(group) == 0
                    retval += '\x00\x00\x00\x00'
                    x += 1
                    continue
                elif c == "~" and data[x+1] == ">":
                    if len(group) != 0:
                        # cannot have a final group of just 1 char
                        assert len(group) > 1
                        cnt = len(group) - 1
                        group += [ 85, 85, 85 ]
                        hitEod = cnt
                    else:
                        break
                else:
                    c = ord(c) - 33
                    assert c >= 0 and c < 85
                    group += [ c ]
                if len(group) >= 5:
                    b = group[0] * (85**4) + \
                        group[1] * (85**3) + \
                        group[2] * (85**2) + \
                        group[3] * 85 + \
                        group[4]
                    assert b < (2**32 - 1)
                    c4 = chr((b >> 0) % 256)
                    c3 = chr((b >> 8) % 256)
                    c2 = chr((b >> 16) % 256)
                    c1 = chr(b >> 24)
                    retval += (c1 + c2 + c3 + c4)
                    if hitEod:
                        # drop the bytes produced by the padding
                        retval = retval[:-4+hitEod]
                    group = []
                x += 1
            return retval
        else:
            if isinstance(data, str):
                data = data.encode('ascii')
            n = b = 0
            out = bytearray()
            for c in data:
                if ord('!') <= c and c <= ord('u'):
                    n += 1
                    b = b*85+(c-33)
                    if n == 5:
                        out += struct.pack(b'>L',b)
                        n = b = 0
                elif c == ord('z'):
                    assert n == 0
                    out += b'\0\0\0\0'
                elif c == ord('~'):
                    if n:
                        # pad the partial group, then keep only n-1 bytes
                        for _ in range(5-n):
                            b = b*85+84
                        out += struct.pack(b'>L',b)[:n-1]
                    break
            return bytes(out)
    decode = staticmethod(decode)


def decodeStreamData(stream):
    """Run ``stream._data`` through every filter listed under ``/Filter``.

    Filters are applied in the order they are listed.  Empty data is
    returned untouched.  A ``/Crypt`` filter is accepted only when its
    decode parameters carry neither ``/Name`` nor ``/Type``.

    Raises NotImplementedError for any other ``/Crypt`` filter and for
    filter names this module does not know.
    """
    from .generic import NameObject
    filters = stream.get("/Filter", ())
    if len(filters) and not isinstance(filters[0], NameObject):
        # we have a single filter instance
        filters = (filters,)
    data = stream._data
    # If there is not data to decode we should not try to decode the data.
    if data:
        for filterType in filters:
            if filterType == "/FlateDecode" or filterType == "/Fl":
                data = FlateDecode.decode(data, stream.get("/DecodeParms"))
            elif filterType == "/ASCIIHexDecode" or filterType == "/AHx":
                data = ASCIIHexDecode.decode(data)
            elif filterType == "/LZWDecode" or filterType == "/LZW":
                data = LZWDecode.decode(data, stream.get("/DecodeParms"))
            elif filterType == "/ASCII85Decode" or filterType == "/A85":
                data = ASCII85Decode.decode(data)
            elif filterType == "/Crypt":
                # The key is spelled /DecodeParms everywhere else in this
                # function; the former "/DecodeParams" lookup never matched,
                # so the check below could not fire.
                decodeParams = stream.get("/DecodeParms", {})
                if isinstance(decodeParams, dict) and \
                        ("/Name" in decodeParams or "/Type" in decodeParams):
                    raise NotImplementedError("/Crypt filter with /Name or /Type not supported yet")
            else:
                # unsupported filter
                raise NotImplementedError("unsupported filter %s" % filterType)
    return data
# diff --git a/vendor/PyPDF2/generic.py b/vendor/PyPDF2/generic.py
# new file mode 100755
# index 00000000..c4332297
# --- /dev/null
# +++ b/vendor/PyPDF2/generic.py
# @@ -0,0 +1,1226 @@
# vim: sw=4:expandtab:foldmethod=marker
#
# Copyright (c) 2006, Mathieu
# Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


"""
Implementation of generic PDF objects (dictionary, number, string, and so on)
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

import re
from .utils import readNonWhitespace, RC4_encrypt, skipOverComment
from .utils import b_, u_, chr_, ord_
from .utils import PdfStreamError
import warnings
from . import filters
from . import utils
import decimal
import codecs
import sys
#import debugging

# First byte of each object kind; readObject dispatches on the index found.
ObjectPrefix = b_('/<[tf(n%')
NumberSigns = b_('+-')
IndirectPattern = re.compile(b_(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"))


def readObject(stream, pdf):
    """Read one PDF object starting at the current position of *stream*.

    The first byte selects the reader: name, hex string or dictionary,
    array, boolean, literal string, null, or (after skipping a ``%``
    comment) whatever follows the comment.  Anything else is read as a
    number, or as an indirect reference when the next 20 bytes look like
    ``<int> <int> R``.

    Raises PdfStreamError when a comment runs to the end of the stream;
    previously that case looped forever.
    """
    tok = stream.read(1)
    stream.seek(-1, 1) # reset to start
    idx = ObjectPrefix.find(tok)
    if idx == 0:
        # name object
        return NameObject.readFromStream(stream, pdf)
    elif idx == 1:
        # hexadecimal string OR dictionary
        peek = stream.read(2)
        stream.seek(-2, 1) # reset to start
        if peek == b_('<<'):
            return DictionaryObject.readFromStream(stream, pdf)
        else:
            return readHexStringFromStream(stream)
    elif idx == 2:
        # array object
        return ArrayObject.readFromStream(stream, pdf)
    elif idx == 3 or idx == 4:
        # boolean object
        return BooleanObject.readFromStream(stream)
    elif idx == 5:
        # string object
        return readStringFromStream(stream)
    elif idx == 6:
        # null object
        return NullObject.readFromStream(stream)
    elif idx == 7:
        # comment
        while tok not in (b_('\r'), b_('\n')):
            tok = stream.read(1)
            if not tok:
                # read() keeps returning an empty value at end of stream,
                # which would never satisfy the loop condition
                raise PdfStreamError("Stream has ended unexpectedly")
        tok = readNonWhitespace(stream)
        stream.seek(-1, 1)
        return readObject(stream, pdf)
    else:
        # number object OR indirect reference
        if tok in NumberSigns:
            # number
            return NumberObject.readFromStream(stream)
        peek = stream.read(20)
        stream.seek(-len(peek), 1) # reset to start
        if IndirectPattern.match(peek) is not None:
            return IndirectObject.readFromStream(stream, pdf)
        else:
            return NumberObject.readFromStream(stream)


class PdfObject(object):
    """Base class of every object type defined in this module."""

    def getObject(self):
        """Resolves indirect references."""
        return self


class NullObject(PdfObject):
    """The PDF ``null`` object."""

    def writeToStream(self, stream, encryption_key):
        """Write ``null``; *encryption_key* is unused."""
        stream.write(b_("null"))

    def readFromStream(stream):
        """Consume four bytes and return a NullObject.

        Raises utils.PdfReadError when those bytes are not ``null``.
        """
        nulltxt = stream.read(4)
        if nulltxt != b_("null"):
            raise utils.PdfReadError("Could not read Null object")
        return NullObject()
    readFromStream = staticmethod(readFromStream)


class BooleanObject(PdfObject):
    """A PDF boolean; the Python value is kept in ``self.value``."""

    def __init__(self, value):
        self.value = value

    def writeToStream(self, stream, encryption_key):
        """Write ``true`` or ``false``; *encryption_key* is unused."""
        if self.value:
            stream.write(b_("true"))
        else:
            stream.write(b_("false"))

    def readFromStream(stream):
        """Read ``true`` or ``false`` and return the matching object.

        Raises utils.PdfReadError when neither keyword is found.
        """
        word = stream.read(4)
        if word == b_("true"):
            return BooleanObject(True)
        elif word == b_("fals"):
            # consume the remaining "e"
            stream.read(1)
            return BooleanObject(False)
        else:
            raise utils.PdfReadError('Could not read Boolean object')
    readFromStream = staticmethod(readFromStream)


class ArrayObject(list, PdfObject):
    """A PDF array, stored as a Python list of PdfObjects."""

    def writeToStream(self, stream, encryption_key):
        """Write ``[ item item ... ]``, delegating to each item."""
        stream.write(b_("["))
        for data in self:
            stream.write(b_(" "))
            data.writeToStream(stream, encryption_key)
        stream.write(b_(" ]"))

    def readFromStream(stream, pdf):
        """Read an array starting at ``[`` and return it.

        Raises utils.PdfReadError when the first byte is not ``[`` and
        PdfStreamError when the stream ends before the closing ``]``.
        """
        arr = ArrayObject()
        tmp = stream.read(1)
        if tmp != b_("["):
            raise utils.PdfReadError("Could not read array")
        while True:
            # skip leading whitespace
            tok = stream.read(1)
            while tok.isspace():
                tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely; without this check the
                # seek below would step back onto the last byte and re-read it
                raise PdfStreamError("Stream has ended unexpectedly")
            stream.seek(-1, 1)
            # check for array ending
            peekahead = stream.read(1)
            if peekahead == b_("]"):
                break
            stream.seek(-1, 1)
            # read and append obj
            arr.append(readObject(stream, pdf))
        return arr
    readFromStream = staticmethod(readFromStream)


class IndirectObject(PdfObject):
    """A reference (``idnum generation R``) to an object held by *pdf*."""

    def __init__(self, idnum, generation, pdf):
        self.idnum = idnum
        self.generation = generation
        self.pdf = pdf

    def getObject(self):
        """Ask ``self.pdf`` for the referenced object and resolve it fully."""
        return self.pdf.getObject(self).getObject()

    def __repr__(self):
        return "IndirectObject(%r, %r)" % (self.idnum, self.generation)

    def __eq__(self, other):
        # Equal only to another reference with the same numbers that
        # belongs to the very same pdf object.
        return (
            other != None and
            isinstance(other, IndirectObject) and
            self.idnum == other.idnum and
            self.generation == other.generation and
            self.pdf is other.pdf
            )

    def __ne__(self, other):
        return not self.__eq__(other)

    def writeToStream(self, stream, encryption_key):
        """Write ``<idnum> <generation> R``; *encryption_key* is unused."""
        stream.write(b_("%s %s R" % (self.idnum, self.generation)))

    def readFromStream(stream, pdf):
        """Parse ``<idnum> <generation> R`` and return an IndirectObject.

        Raises PdfStreamError when the stream ends inside either number and
        utils.PdfReadError when the trailing ``R`` is missing.
        """
        idnum = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                break
            idnum += tok
        generation = b_("")
        while True:
            tok = stream.read(1)
            if not tok:
                # stream has truncated prematurely
                raise PdfStreamError("Stream has ended unexpectedly")
            if tok.isspace():
                # tolerate extra whitespace before the generation number
                if not generation:
                    continue
                break
            generation += tok
        r = readNonWhitespace(stream)
        if r != b_("R"):
            raise utils.PdfReadError("Error reading indirect object reference at byte %s" % utils.hexStr(stream.tell()))
        return IndirectObject(int(idnum), int(generation), pdf)
    readFromStream = staticmethod(readFromStream)


class FloatObject(decimal.Decimal, PdfObject):
    """A PDF real number backed by decimal.Decimal."""

    def __new__(cls, value="0", context=None):
        try:
            return decimal.Decimal.__new__(cls, utils.str_(value), context)
        except Exception:
            # Fall back to the plain string form.  Narrowed from a bare
            # "except:" so KeyboardInterrupt/SystemExit are not swallowed.
            return decimal.Decimal.__new__(cls, str(value))

    def __repr__(self):
        if self == self.to_integral():
            return str(self.quantize(decimal.Decimal(1)))
        else:
            # Standard formatting adds useless extraneous zeros.
            o = "%.5f" % self
            # Remove the zeros.
            while o and o[-1] == '0':
                o = o[:-1]
            return o

    def as_numeric(self):
        """Return the value as a Python float (via its repr)."""
        return float(b_(repr(self)))

    def writeToStream(self, stream, encryption_key):
        """Write the repr of the number; *encryption_key* is unused."""
        stream.write(b_(repr(self)))


class NumberObject(int, PdfObject):
    """A PDF integer."""
    # Matches the first byte that cannot be part of a number.
    NumberPattern = re.compile(b_('[^+-.0-9]'))
    ByteDot = b_(".")

    def __new__(cls, value):
        val = int(value)
        try:
            return int.__new__(cls, val)
        except OverflowError:
            return int.__new__(cls, 0)

    def as_numeric(self):
        """Return the value as a plain Python int (via its repr)."""
        return int(b_(repr(self)))

    def writeToStream(self, stream, encryption_key):
        """Write the repr of the number; *encryption_key* is unused."""
        stream.write(b_(repr(self)))

    def readFromStream(stream):
        """Read a number; returns a FloatObject when it contains a dot,
        otherwise a NumberObject."""
        num = utils.readUntilRegex(stream, NumberObject.NumberPattern)
        if num.find(NumberObject.ByteDot) != -1:
            return FloatObject(num)
        else:
            return NumberObject(num)
    readFromStream = staticmethod(readFromStream)


##
# Given a string (either a "str" or "unicode"), create a ByteStringObject or a
# TextStringObject to represent the string.
def createStringObject(string):
    """Wrap *string* in a TextStringObject or ByteStringObject.

    Text input becomes a TextStringObject directly.  Byte input is decoded
    as UTF-16 when it starts with the UTF-16BE BOM, otherwise with
    ``decode_pdfdocencoding``; the object records which path was taken.
    Bytes that fail to decode are returned as a ByteStringObject.

    Raises TypeError for any other argument type.
    """
    if isinstance(string, utils.string_type):
        return TextStringObject(string)
    elif isinstance(string, utils.bytes_type):
        try:
            if string.startswith(codecs.BOM_UTF16_BE):
                retval = TextStringObject(string.decode("utf-16"))
                retval.autodetect_utf16 = True
                return retval
            else:
                # This is probably a big performance hit here, but we need to
                # convert string objects into the text/unicode-aware version if
                # possible... and the only way to check if that's possible is
                # to try.  Some strings are strings, some are just byte arrays.
                retval = TextStringObject(decode_pdfdocencoding(string))
                retval.autodetect_pdfdocencoding = True
                return retval
        except UnicodeDecodeError:
            return ByteStringObject(string)
    else:
        raise TypeError("createStringObject should have str or unicode arg")


def readHexStringFromStream(stream):
    """Read a ``<...>`` hexadecimal string and return a string object.

    Whitespace between digits is skipped and a trailing unpaired digit is
    padded with ``0``.  Raises PdfStreamError when the stream ends before
    the closing ``>``.
    """
    stream.read(1)
    txt = ""
    x = b_("")
    while True:
        tok = readNonWhitespace(stream)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_(">"):
            break
        x += tok
        if len(x) == 2:
            txt += chr(int(x, base=16))
            x = b_("")
    if len(x) == 1:
        x += b_("0")
    if len(x) == 2:
        txt += chr(int(x, base=16))
    return createStringObject(b_(txt))


def readStringFromStream(stream):
    """Read a ``(...)`` literal string and return a string object.

    Balanced parentheses are kept, backslash escapes are translated and an
    escaped line break is dropped.  Raises PdfStreamError when the stream
    ends before the closing parenthesis and utils.PdfReadError for an
    escape character that is not recognised.
    """
    tok = stream.read(1)
    parens = 1
    txt = b_("")
    while True:
        tok = stream.read(1)
        if not tok:
            # stream has truncated prematurely
            raise PdfStreamError("Stream has ended unexpectedly")
        if tok == b_("("):
            parens += 1
        elif tok == b_(")"):
            parens -= 1
            if parens == 0:
                break
        elif tok == b_("\\"):
            tok = stream.read(1)
            if tok == b_("n"):
                tok = b_("\n")
            elif tok == b_("r"):
                tok = b_("\r")
            elif tok == b_("t"):
                tok = b_("\t")
            elif tok == b_("b"):
                tok = b_("\b")
            elif tok == b_("f"):
                tok = b_("\f")
            elif tok == b_("c"):
                # Same two characters as before (backslash + "c"), now
                # spelled without relying on an invalid escape sequence.
                tok = b_("\\c")
            elif tok == b_("("):
                tok = b_("(")
            elif tok == b_(")"):
                tok = b_(")")
            elif tok == b_("/"):
                tok = b_("/")
            elif tok == b_("\\"):
                tok = b_("\\")
            elif tok in (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["),
                         b_("]"), b_("#"), b_("_"), b_("&"), b_('$')):
                # odd/unnecessary escape sequences we have encountered
                tok = b_(tok)
            elif tok.isdigit():
                # "The number ddd may consist of one, two, or three
                # octal digits; high-order overflow shall be ignored.
                # Three octal digits shall be used, with leading zeros
                # as needed, if the next character of the string is also
                # a digit." (PDF reference 7.3.4.2, p 16)
                for i in range(2):
                    ntok = stream.read(1)
                    if ntok.isdigit():
                        tok += ntok
                    else:
                        break
                # Mask to one byte so that values above 0o377 drop their
                # high-order bits, as the quoted rule requires.
                tok = b_(chr(int(tok, base=8) & 0xFF))
            elif tok in b_("\n\r"):
                # This case is hit when a backslash followed by a line
                # break occurs.  If it's a multi-char EOL, consume the
                # second character:
                tok = stream.read(1)
                if not tok in b_("\n\r"):
                    stream.seek(-1, 1)
                # Then don't add anything to the actual string, since this
                # line break was escaped:
                tok = b_('')
            else:
                raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok)
        txt += tok
    return createStringObject(txt)


##
# Represents a string object where the text encoding could not be determined.
# This occurs quite often, as the PDF spec doesn't provide an alternate way to
# represent strings -- for example, the encryption data stored in files (like
# /O) is clearly not text, but is still stored in a "String" object.
class ByteStringObject(utils.bytes_type, PdfObject):

    ##
    # For compatibility with TextStringObject.original_bytes.  This method
    # returns self.
    original_bytes = property(lambda self: self)

    def writeToStream(self, stream, encryption_key):
        """Write the bytes as a ``<hex>`` string, RC4-encrypting them first
        when *encryption_key* is truthy."""
        bytearr = self
        if encryption_key:
            bytearr = RC4_encrypt(encryption_key, bytearr)
        stream.write(b_("<"))
        stream.write(utils.hexencode(bytearr))
        stream.write(b_(">"))


##
# Represents a string object that has been decoded into a real unicode string.
# If read from a PDF document, this string appeared to match the
# PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to
# occur.
+class TextStringObject(utils.string_type, PdfObject): + autodetect_pdfdocencoding = False + autodetect_utf16 = False + + ## + # It is occasionally possible that a text string object gets created where + # a byte string object was expected due to the autodetection mechanism -- + # if that occurs, this "original_bytes" property can be used to + # back-calculate what the original encoded bytes were. + original_bytes = property(lambda self: self.get_original_bytes()) + + def get_original_bytes(self): + # We're a text string object, but the library is trying to get our raw + # bytes. This can happen if we auto-detected this string as text, but + # we were wrong. It's pretty common. Return the original bytes that + # would have been used to create this object, based upon the autodetect + # method. + if self.autodetect_utf16: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + elif self.autodetect_pdfdocencoding: + return encode_pdfdocencoding(self) + else: + raise Exception("no information about original bytes") + + def writeToStream(self, stream, encryption_key): + # Try to write the string out as a PDFDocEncoding encoded string. It's + # nicer to look at in the PDF file. Sadly, we take a performance hit + # here for trying... 
+ try: + bytearr = encode_pdfdocencoding(self) + except UnicodeEncodeError: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if encryption_key: + bytearr = RC4_encrypt(encryption_key, bytearr) + obj = ByteStringObject(bytearr) + obj.writeToStream(stream, None) + else: + stream.write(b_("(")) + for c in bytearr: + if not chr_(c).isalnum() and c != b_(' '): + stream.write(b_("\\%03o" % ord_(c))) + else: + stream.write(b_(chr_(c))) + stream.write(b_(")")) + + +class NameObject(str, PdfObject): + delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) + surfix = b_("/") + + def writeToStream(self, stream, encryption_key): + stream.write(b_(self)) + + def readFromStream(stream, pdf): + debug = False + if debug: print((stream.tell())) + name = stream.read(1) + if name != NameObject.surfix: + raise utils.PdfReadError("name read error") + name += utils.readUntilRegex(stream, NameObject.delimiterPattern, + ignore_eof=True) + if debug: print(name) + try: + return NameObject(name.decode('utf-8')) + except (UnicodeEncodeError, UnicodeDecodeError) as e: + # Name objects should represent irregular characters + # with a '#' followed by the symbol's hex number + if not pdf.strict: + warnings.warn("Illegal character in Name Object", utils.PdfReadWarning) + return NameObject(name) + else: + raise utils.PdfReadError("Illegal character in Name Object") + + readFromStream = staticmethod(readFromStream) + + +class DictionaryObject(dict, PdfObject): + def raw_get(self, key): + return dict.__getitem__(self, key) + + def __setitem__(self, key, value): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.__setitem__(self, key, value) + + def setdefault(self, key, value=None): + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return 
dict.setdefault(self, key, value) + + def __getitem__(self, key): + return dict.__getitem__(self, key).getObject() + + ## + # Retrieves XMP (Extensible Metadata Platform) data relevant to the + # this object, if available. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance + # that can be used to access XMP metadata from the document. Can also + # return None if no metadata was found on the document root. + def getXmpMetadata(self): + metadata = self.get("/Metadata", None) + if metadata == None: + return None + metadata = metadata.getObject() + from . import xmp + if not isinstance(metadata, xmp.XmpInformation): + metadata = xmp.XmpInformation(metadata) + self[NameObject("/Metadata")] = metadata + return metadata + + ## + # Read-only property that accesses the {@link + # #DictionaryObject.getXmpData getXmpData} function. + #

+ # Stability: Added in v1.12, will exist for all future v1.x releases. + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("<<\n")) + for key, value in list(self.items()): + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value.writeToStream(stream, encryption_key) + stream.write(b_("\n")) + stream.write(b_(">>")) + + def readFromStream(stream, pdf): + debug = False + tmp = stream.read(2) + if tmp != b_("<<"): + raise utils.PdfReadError("Dictionary read error at byte %s: stream must begin with '<<'" % utils.hexStr(stream.tell())) + data = {} + while True: + tok = readNonWhitespace(stream) + if tok == b_('\x00'): + continue + elif tok == b_('%'): + stream.seek(-1, 1) + skipOverComment(stream) + continue + if not tok: + # stream has truncated prematurely + raise PdfStreamError("Stream has ended unexpectedly") + + if debug: print(("Tok:", tok)) + if tok == b_(">"): + stream.read(1) + break + stream.seek(-1, 1) + key = readObject(stream, pdf) + tok = readNonWhitespace(stream) + stream.seek(-1, 1) + value = readObject(stream, pdf) + if not data.get(key): + data[key] = value + elif pdf.strict: + # multiple definitions of key not permitted + raise utils.PdfReadError("Multiple definitions in dictionary at byte %s for key %s" \ + % (utils.hexStr(stream.tell()), key)) + else: + warnings.warn("Multiple definitions in dictionary at byte %s for key %s" \ + % (utils.hexStr(stream.tell()), key), utils.PdfReadWarning) + + pos = stream.tell() + s = readNonWhitespace(stream) + if s == b_('s') and stream.read(5) == b_('tream'): + eol = stream.read(1) + # odd PDF file output has spaces after 'stream' keyword but before EOL. 
+ # patch provided by Danial Sandler + while eol == b_(' '): + eol = stream.read(1) + assert eol in (b_("\n"), b_("\r")) + if eol == b_("\r"): + # read \n after + if stream.read(1) != b_('\n'): + stream.seek(-1, 1) + # this is a stream object, not a dictionary + assert "/Length" in data + length = data["/Length"] + if debug: print(data) + if isinstance(length, IndirectObject): + t = stream.tell() + length = pdf.getObject(length) + stream.seek(t, 0) + data["__streamdata__"] = stream.read(length) + if debug: print("here") + #if debug: print(binascii.hexlify(data["__streamdata__"])) + e = readNonWhitespace(stream) + ndstream = stream.read(8) + if (e + ndstream) != b_("endstream"): + # (sigh) - the odd PDF file has a length that is too long, so + # we need to read backwards to find the "endstream" ending. + # ReportLab (unknown version) generates files with this bug, + # and Python users into PDF files tend to be our audience. + # we need to do this to correct the streamdata and chop off + # an extra character. + pos = stream.tell() + stream.seek(-10, 1) + end = stream.read(9) + if end == b_("endstream"): + # we found it by looking back one character further. + data["__streamdata__"] = data["__streamdata__"][:-1] + else: + if debug: print(("E", e, ndstream, debugging.toHex(end))) + stream.seek(pos, 0) + raise utils.PdfReadError("Unable to find 'endstream' marker after stream at byte %s." 
class TreeObject(DictionaryObject):
    """
    Dictionary object that acts as the parent of a doubly linked list of
    children.

    The parent stores ``/First``, ``/Last`` and ``/Count``; every child stores
    ``/Parent`` and, where applicable, ``/Next`` and ``/Prev``.
    """

    def __init__(self):
        DictionaryObject.__init__(self)

    def hasChildren(self):
        """Return ``True`` if this node has a ``/First`` entry."""
        return '/First' in self

    def __iter__(self):
        # Iterating a TreeObject walks its children, not its dictionary keys.
        return self.children()

    def children(self):
        """
        Generator yielding each child in order, following ``/Next`` links
        from ``/First`` until the node equal to ``/Last`` has been yielded.
        """
        # A plain ``return`` ends a generator. Raising StopIteration inside
        # a generator body is converted to RuntimeError under PEP 479.
        if not self.hasChildren():
            return

        child = self['/First']
        while True:
            yield child
            if child == self['/Last']:
                return
            child = child['/Next']

    def addChild(self, child, pdf):
        """
        Append ``child`` to the end of this node's list of children.

        :param child: object to add; ``child.getObject()`` is the dictionary
            that receives the ``/Parent`` and ``/Prev`` entries.
        :param pdf: object providing ``getReference()``, used to obtain
            indirect references for the child, the previous sibling and
            this node.
        """
        childObj = child.getObject()
        child = pdf.getReference(childObj)
        assert isinstance(child, IndirectObject)

        if '/First' not in self:
            self[NameObject('/First')] = child
            self[NameObject('/Count')] = NumberObject(0)
            prev = None
        else:
            prev = self['/Last']

        self[NameObject('/Last')] = child
        self[NameObject('/Count')] = NumberObject(self[NameObject('/Count')] + 1)

        if prev:
            prevRef = pdf.getReference(prev)
            assert isinstance(prevRef, IndirectObject)
            childObj[NameObject('/Prev')] = prevRef
            prev[NameObject('/Next')] = child

        parentRef = pdf.getReference(self)
        assert isinstance(parentRef, IndirectObject)
        childObj[NameObject('/Parent')] = parentRef

    def removeChild(self, child):
        """
        Unlink ``child`` from this node's list of children.

        :param child: the child to remove (``child.getObject()`` is compared
            against each node in the list).
        :raises ValueError: if the child has no ``/Parent`` entry, its
            ``/Parent`` is not this node, or it is not found in the list.
        """
        childObj = child.getObject()

        if NameObject('/Parent') not in childObj:
            raise ValueError("Removed child does not appear to be a tree item")
        elif childObj[NameObject('/Parent')] != self:
            raise ValueError("Removed child is not a member of this tree")

        found = False
        prevRef = None
        prev = None
        curRef = self[NameObject('/First')]
        cur = curRef.getObject()
        lastRef = self[NameObject('/Last')]
        last = lastRef.getObject()
        while cur is not None:
            if cur == childObj:
                if prev is None:
                    if NameObject('/Next') in cur:
                        # Removing first tree node
                        nextRef = cur[NameObject('/Next')]
                        nextObj = nextRef.getObject()
                        del nextObj[NameObject('/Prev')]
                        self[NameObject('/First')] = nextRef
                        self[NameObject('/Count')] = self[NameObject('/Count')] - 1
                    else:
                        # Removing only tree node
                        assert self[NameObject('/Count')] == 1
                        del self[NameObject('/Count')]
                        del self[NameObject('/First')]
                        if NameObject('/Last') in self:
                            del self[NameObject('/Last')]
                else:
                    if NameObject('/Next') in cur:
                        # Removing middle tree node
                        nextRef = cur[NameObject('/Next')]
                        nextObj = nextRef.getObject()
                        nextObj[NameObject('/Prev')] = prevRef
                        prev[NameObject('/Next')] = nextRef
                        self[NameObject('/Count')] = self[NameObject('/Count')] - 1
                    else:
                        # Removing last tree node
                        assert cur == last
                        del prev[NameObject('/Next')]
                        self[NameObject('/Last')] = prevRef
                        self[NameObject('/Count')] = self[NameObject('/Count')] - 1
                found = True
                break

            # Advance to the next sibling, or stop at the end of the list.
            prevRef = curRef
            prev = cur
            if NameObject('/Next') in cur:
                curRef = cur[NameObject('/Next')]
                cur = curRef.getObject()
            else:
                curRef = None
                cur = None

        if not found:
            raise ValueError("Removal couldn't find item in tree")

        # Strip the linkage entries from the removed child itself.
        del childObj[NameObject('/Parent')]
        if NameObject('/Next') in childObj:
            del childObj[NameObject('/Next')]
        if NameObject('/Prev') in childObj:
            del childObj[NameObject('/Prev')]

    def emptyTree(self):
        """Detach every child and drop ``/Count``, ``/First`` and ``/Last``."""
        # Snapshot the children first: the loop deletes the ``/Next`` links
        # that children() follows, so unlinking while the generator is still
        # running would break the traversal after the first child.
        for child in list(self.children()):
            childObj = child.getObject()
            del childObj[NameObject('/Parent')]
            if NameObject('/Next') in childObj:
                del childObj[NameObject('/Next')]
            if NameObject('/Prev') in childObj:
                del childObj[NameObject('/Prev')]

        if NameObject('/Count') in self:
            del self[NameObject('/Count')]
        if NameObject('/First') in self:
            del self[NameObject('/First')]
        if NameObject('/Last') in self:
            del self[NameObject('/Last')]
class EncodedStreamObject(StreamObject):
    """
    Stream whose ``_data`` is still encoded by the filters named in its
    ``/Filter`` entry. Decoding happens on first access and is cached.
    """

    def __init__(self):
        # Cached DecodedStreamObject built by getData(); None until then.
        self.decodedSelf = None

    def getData(self):
        """
        Return the decoded stream data.

        The first call decodes via ``filters.decodeStreamData`` and stores a
        DecodedStreamObject in ``self.decodedSelf``. That object carries the
        decoded data and every entry of this stream except ``/Length``,
        ``/Filter`` and ``/DecodeParms``. Later calls reuse it.
        """
        # Compare against None rather than relying on truthiness: the cached
        # object is dictionary-like and holds no entries when the stream only
        # had /Length, /Filter and /DecodeParms. Presumably it then evaluates
        # as false (dict semantics), which would defeat the cache.
        if self.decodedSelf is not None:
            # cached version of decoded object
            return self.decodedSelf.getData()

        # create decoded object
        decoded = DecodedStreamObject()
        decoded._data = filters.decodeStreamData(self)
        for key, value in list(self.items()):
            if key not in ("/Length", "/Filter", "/DecodeParms"):
                decoded[key] = value
        self.decodedSelf = decoded
        return decoded._data

    def setData(self, data):
        """Always raises: encoding new data is not implemented."""
        raise utils.PdfReadError("Creating EncodedStreamObject is not currently supported")
These boxes include: + + * :attr:`artBox ` + * :attr:`bleedBox ` + * :attr:`cropBox ` + * :attr:`mediaBox ` + * :attr:`trimBox ` + """ + def __init__(self, arr): + # must have four points + assert len(arr) == 4 + # automatically convert arr[x] into NumberObject(arr[x]) if necessary + ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) + + def ensureIsNumber(self, value): + if not isinstance(value, (NumberObject, FloatObject)): + value = FloatObject(value) + return value + + def __repr__(self): + return "RectangleObject(%s)" % repr(list(self)) + + def getLowerLeft_x(self): + return self[0] + + def getLowerLeft_y(self): + return self[1] + + def getUpperRight_x(self): + return self[2] + + def getUpperRight_y(self): + return self[3] + + def getUpperLeft_x(self): + return self.getLowerLeft_x() + + def getUpperLeft_y(self): + return self.getUpperRight_y() + + def getLowerRight_x(self): + return self.getUpperRight_x() + + def getLowerRight_y(self): + return self.getLowerLeft_y() + + def getLowerLeft(self): + return self.getLowerLeft_x(), self.getLowerLeft_y() + + def getLowerRight(self): + return self.getLowerRight_x(), self.getLowerRight_y() + + def getUpperLeft(self): + return self.getUpperLeft_x(), self.getUpperLeft_y() + + def getUpperRight(self): + return self.getUpperRight_x(), self.getUpperRight_y() + + def setLowerLeft(self, value): + self[0], self[1] = [self.ensureIsNumber(x) for x in value] + + def setLowerRight(self, value): + self[2], self[1] = [self.ensureIsNumber(x) for x in value] + + def setUpperLeft(self, value): + self[0], self[3] = [self.ensureIsNumber(x) for x in value] + + def setUpperRight(self, value): + self[2], self[3] = [self.ensureIsNumber(x) for x in value] + + def getWidth(self): + return self.getUpperRight_x() - self.getLowerLeft_x() + + def getHeight(self): + return self.getUpperRight_y() - self.getLowerLeft_y() + + lowerLeft = property(getLowerLeft, setLowerLeft, None, None) + """ + Property to read and modify the lower left 
coordinate of this box + in (x,y) form. + """ + lowerRight = property(getLowerRight, setLowerRight, None, None) + """ + Property to read and modify the lower right coordinate of this box + in (x,y) form. + """ + upperLeft = property(getUpperLeft, setUpperLeft, None, None) + """ + Property to read and modify the upper left coordinate of this box + in (x,y) form. + """ + upperRight = property(getUpperRight, setUpperRight, None, None) + """ + Property to read and modify the upper right coordinate of this box + in (x,y) form. + """ + + +class Field(TreeObject): + """ + A class representing a field dictionary. This class is accessed through + :meth:`getFields()` + """ + def __init__(self, data): + DictionaryObject.__init__(self) + attributes = ("/FT", "/Parent", "/Kids", "/T", "/TU", "/TM", "/Ff", + "/V", "/DV", "/AA") + for attr in attributes: + try: + self[NameObject(attr)] = data[attr] + except KeyError: + pass + + fieldType = property(lambda self: self.get("/FT")) + """ + Read-only property accessing the type of this field. + """ + + parent = property(lambda self: self.get("/Parent")) + """ + Read-only property accessing the parent of this field. + """ + + kids = property(lambda self: self.get("/Kids")) + """ + Read-only property accessing the kids of this field. + """ + + name = property(lambda self: self.get("/T")) + """ + Read-only property accessing the name of this field. + """ + + altName = property(lambda self: self.get("/TU")) + """ + Read-only property accessing the alternate name of this field. + """ + + mappingName = property(lambda self: self.get("/TM")) + """ + Read-only property accessing the mapping name of this field. This + name is used by PyPDF2 as a key in the dictionary returned by + :meth:`getFields()` + """ + + flags = property(lambda self: self.get("/Ff")) + """ + Read-only property accessing the field flags, specifying various + characteristics of the field (see Table 8.70 of the PDF 1.7 reference). 
+ """ + + value = property(lambda self: self.get("/V")) + """ + Read-only property accessing the value of this field. Format + varies based on field type. + """ + + defaultValue = property(lambda self: self.get("/DV")) + """ + Read-only property accessing the default value of this field. + """ + + additionalActions = property(lambda self: self.get("/AA")) + """ + Read-only property accessing the additional actions dictionary. + This dictionary defines the field's behavior in response to trigger events. + See Section 8.5.2 of the PDF 1.7 reference. + """ + + +class Destination(TreeObject): + """ + A class representing a destination within a PDF file. + See section 8.2.1 of the PDF 1.6 reference. + + :param str title: Title of this destination. + :param int page: Page number of this destination. + :param str typ: How the destination is displayed. + :param args: Additional arguments may be necessary depending on the type. + :raises PdfReadError: If destination type is invalid. + + Valid ``typ`` arguments (see PDF spec for details): + /Fit No additional arguments + /XYZ [left] [top] [zoomFactor] + /FitH [top] + /FitV [left] + /FitR [left] [bottom] [right] [top] + /FitB No additional arguments + /FitBH [top] + /FitBV [left] + """ + def __init__(self, title, page, typ, *args): + DictionaryObject.__init__(self) + self[NameObject("/Title")] = title + self[NameObject("/Page")] = page + self[NameObject("/Type")] = typ + + # from table 8.2 of the PDF 1.7 reference. 
+ if typ == "/XYZ": + (self[NameObject("/Left")], self[NameObject("/Top")], + self[NameObject("/Zoom")]) = args + elif typ == "/FitR": + (self[NameObject("/Left")], self[NameObject("/Bottom")], + self[NameObject("/Right")], self[NameObject("/Top")]) = args + elif typ in ["/FitH", "/FitBH"]: + self[NameObject("/Top")], = args + elif typ in ["/FitV", "/FitBV"]: + self[NameObject("/Left")], = args + elif typ in ["/Fit", "/FitB"]: + pass + else: + raise utils.PdfReadError("Unknown Destination Type: %r" % typ) + + def getDestArray(self): + return ArrayObject([self.raw_get('/Page'), self['/Type']] + [self[x] for x in ['/Left', '/Bottom', '/Right', '/Top', '/Zoom'] if x in self]) + + def writeToStream(self, stream, encryption_key): + stream.write(b_("<<\n")) + key = NameObject('/D') + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = self.getDestArray() + value.writeToStream(stream, encryption_key) + + key = NameObject("/S") + key.writeToStream(stream, encryption_key) + stream.write(b_(" ")) + value = NameObject("/GoTo") + value.writeToStream(stream, encryption_key) + + stream.write(b_("\n")) + stream.write(b_(">>")) + + title = property(lambda self: self.get("/Title")) + """ + Read-only property accessing the destination title. + + :rtype: str + """ + + page = property(lambda self: self.get("/Page")) + """ + Read-only property accessing the destination page number. + + :rtype: int + """ + + typ = property(lambda self: self.get("/Type")) + """ + Read-only property accessing the destination type. + + :rtype: str + """ + + zoom = property(lambda self: self.get("/Zoom", None)) + """ + Read-only property accessing the zoom factor. + + :rtype: int, or ``None`` if not available. + """ + + left = property(lambda self: self.get("/Left", None)) + """ + Read-only property accessing the left horizontal coordinate. + + :rtype: int, or ``None`` if not available. 
def decode_pdfdocencoding(byte_array):
    """
    Decode ``byte_array`` through the ``_pdfDocEncoding`` table.

    Each item is converted to an index with ``ord_`` and looked up in the
    table. Entries stored as ``\\u0000`` mark bytes with no mapping.

    :param byte_array: iterable of bytes (or single-character strings).
    :return: the decoded text, built with ``u_``.
    :raises UnicodeDecodeError: if a byte maps to the ``\\u0000`` placeholder.
    """
    undefined = u_('\u0000')  # hoisted: same sentinel for every byte
    chars = []
    for b in byte_array:
        c = _pdfDocEncoding[ord_(b)]
        if c == undefined:
            raise UnicodeDecodeError("pdfdocencoding", utils.barray(b), -1, -1,
                                     "does not exist in translation table")
        chars.append(c)
    # Join once instead of growing the result with += for every character.
    return u_('').join(chars)
u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), + u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'), + u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'), + u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'), + u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'), + u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'), + u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'), + u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'), + u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'), + u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'), + u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'), + u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'), + u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'), + u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'), + u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'), + u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'), + u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'), + u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), 
u_('\u017e'), u_('\u0000'), + u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'), + u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'), + u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'), + u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'), + u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'), + u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'), + u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'), + u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'), + u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'), + u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'), + u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'), + u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff') +) + +assert len(_pdfDocEncoding) == 256 + +_pdfDocEncoding_rev = {} +for i in range(256): + char = _pdfDocEncoding[i] + if char == u_("\u0000"): + continue + assert char not in _pdfDocEncoding_rev + _pdfDocEncoding_rev[char] = i diff --git a/vendor/PyPDF2/merger.py b/vendor/PyPDF2/merger.py new file mode 100755 index 00000000..27702add --- /dev/null +++ b/vendor/PyPDF2/merger.py @@ -0,0 +1,553 @@ +# vim: sw=4:expandtab:foldmethod=marker +# +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from .generic import * +from .utils import isString, str_ +from .pdf import PdfFileReader, PdfFileWriter +from .pagerange import PageRange +from sys import version_info +if version_info < ( 3, 0 ): + from cStringIO import StringIO + StreamIO = StringIO +else: + from io import BytesIO + from io import FileIO as file + StreamIO = BytesIO + + +class _MergedPage(object): + """ + _MergedPage is used internally by PdfFileMerger to collect necessary + information on each page that is being merged. 
def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=True):
    """
    Merges the pages from the given file into the output file at the
    specified page number.

    :param int position: The *page number* to insert this file. File will
        be inserted after the given number.

    :param fileobj: A File Object or an object that supports the standard read
        and seek methods similar to a File Object. Could also be a
        string representing a path to a PDF file, or a PdfFileReader.

    :param str bookmark: Optionally, you may specify a bookmark to be applied at
        the beginning of the included file by supplying the text of the bookmark.

    :param pages: can be a PageRange or a ``(start, stop[, step])`` tuple
        to merge only the specified range of pages from the source
        document into the output document.

    :param bool import_bookmarks: You may prevent the source document's bookmarks
        from being imported by specifying this as ``False``.

    :raises TypeError: if ``pages`` is neither ``None``, a PageRange nor a tuple.
    """

    # This flag is stored in self.inputs and means that the stream
    # used was created in this method (so close() may close it).
    my_file = False

    # If the fileobj parameter is a string, assume it is a path
    # and create a file object at that location. If it is a file,
    # copy the file's contents into a BytesIO (or StreamIO) stream object; if
    # it is a PdfFileReader, copy that reader's stream into a
    # BytesIO (or StreamIO) stream.
    # If fileobj is none of the above types, it is not modified
    decryption_key = None
    if isString(fileobj):
        fileobj = file(fileobj, 'rb')
        my_file = True
    elif isinstance(fileobj, file):
        fileobj.seek(0)
        filecontent = fileobj.read()
        fileobj = StreamIO(filecontent)
        my_file = True
    elif isinstance(fileobj, PdfFileReader):
        # Fetch the key from the reader BEFORE fileobj is rebound to the
        # copied stream below; the stream copy has no _decryption_key.
        decryption_key = getattr(fileobj, '_decryption_key', None)
        orig_tell = fileobj.stream.tell()
        fileobj.stream.seek(0)
        filecontent = StreamIO(fileobj.stream.read())
        fileobj.stream.seek(orig_tell)  # reset the stream to its original location
        fileobj = filecontent
        my_file = True

    # Create a new PdfFileReader instance using the stream
    # (either file or BytesIO or StringIO) created above
    pdfr = PdfFileReader(fileobj, strict=self.strict)
    if decryption_key is not None:
        pdfr._decryption_key = decryption_key

    # Find the range of pages to merge.
    if pages is None:
        pages = (0, pdfr.getNumPages())
    elif isinstance(pages, PageRange):
        pages = pages.indices(pdfr.getNumPages())
    elif not isinstance(pages, tuple):
        raise TypeError('"pages" must be a tuple of (start, stop[, step])')

    srcpages = []
    if bookmark:
        bookmark = Bookmark(TextStringObject(bookmark), NumberObject(self.id_count), NameObject('/Fit'))

    outline = []
    if import_bookmarks:
        outline = pdfr.getOutlines()
        outline = self._trim_outline(pdfr, outline, pages)

    if bookmark:
        # The imported outline is nested under the new bookmark.
        self.bookmarks += [bookmark, outline]
    else:
        self.bookmarks += outline

    dests = pdfr.namedDestinations
    dests = self._trim_dests(pdfr, dests, pages)
    self.named_dests += dests

    # Gather all the pages that are going to be merged
    for i in range(*pages):
        pg = pdfr.getPage(i)

        page_id = self.id_count
        self.id_count += 1

        mp = _MergedPage(pg, pdfr, page_id)

        srcpages.append(mp)

    self._associate_dests_to_pages(srcpages)
    self._associate_bookmarks_to_pages(srcpages)

    # Slice to insert the pages at the specified position
    self.pages[position:position] = srcpages

    # Keep track of our input files so we can close them later
    self.inputs.append((fileobj, pdfr, my_file))
def write(self, fileobj):
    """
    Writes all data that has been merged to the given output file.

    :param fileobj: Output file. Can be a filename or any kind of
        file-like object. When a filename is given, the file is opened
        here and closed again before returning, even if writing fails.
    """
    my_file = False
    if isString(fileobj):
        fileobj = file(fileobj, 'wb')
        my_file = True

    try:
        # Add pages to the PdfFileWriter and remember the reference of each
        # page as it appears in the output (the last entry of /Kids after
        # addPage), so destinations and bookmarks can point at it.
        for page in self.pages:
            self.output.addPage(page.pagedata)
            page.out_pagedata = self.output.getReference(self.output._pages.getObject()["/Kids"][-1].getObject())

        # Once all pages are added, create bookmarks to point at those pages
        self._write_dests()
        self._write_bookmarks()

        # Write the output to the file
        self.output.write(fileobj)
    finally:
        # Only close what this method opened; caller-supplied objects stay open.
        if my_file:
            fileobj.close()
def _trim_dests(self, pdf, dests, pages):
    """
    Removes any named destinations that are not a part of the specified
    page set.

    :param pdf: reader providing ``getPage(index)``.
    :param dests: mapping of destination name to destination object.
    :param pages: ``(start, stop[, step])`` tuple, expanded with ``range``.
    :return: list of the destinations whose page is in the range. Each kept
        destination has its ``/Page`` entry replaced by the resolved page
        object.
    """
    new_dests = []
    for k, o in list(dests.items()):
        for j in range(*pages):
            page_obj = pdf.getPage(j).getObject()
            target = o['/Page'].getObject()
            if page_obj == target:
                # Reuse the already-resolved object instead of resolving twice.
                o[NameObject('/Page')] = target
                assert str_(k) == str_(o['/Title'])
                new_dests.append(o)
                break
    return new_dests
def _write_bookmarks(self, bookmarks=None, parent=None):
    """
    Add the collected bookmarks to the output writer.

    :param bookmarks: list of bookmark dictionaries; a nested list holds the
        children of the bookmark added just before it. Defaults to
        ``self.bookmarks``.
    :param parent: value passed through to ``self.output.addBookmarkDict``
        as the parent of every bookmark on this level.

    For each bookmark whose ``/Page`` matches the id of a merged page, a
    ``/GoTo`` action is stored under ``/A``. The coordinate entries used for
    it, plus ``/Page`` and ``/Type``, are removed from the bookmark before
    it is handed to the writer. Bookmarks with no matching page are skipped.
    """
    if bookmarks is None:
        bookmarks = self.bookmarks

    # Coordinate keys consumed by each destination type, in the order they
    # are appended to the /D array. /Fit and /FitB take none.
    coord_keys = {
        '/FitH': ('/Top',),
        '/FitBH': ('/Top',),
        '/FitV': ('/Left',),
        '/FitBV': ('/Left',),
        '/XYZ': ('/Left', '/Top', '/Zoom'),
        '/FitR': ('/Left', '/Bottom', '/Right', '/Top'),
    }

    last_added = None
    for b in bookmarks:
        if isinstance(b, list):
            # Children of the bookmark that was added last.
            self._write_bookmarks(b, last_added)
            continue

        pageno = None
        if '/Page' in b:
            for i, p in enumerate(self.pages):
                if p.id == b['/Page']:
                    args = [NumberObject(p.id), NameObject(b['/Type'])]
                    for key in coord_keys.get(b['/Type'], ()):
                        # A missing or null coordinate is written as 0.
                        if key in b and not isinstance(b[key], NullObject):
                            args.append(FloatObject(b[key]))
                        else:
                            args.append(FloatObject(0))
                        # Only delete keys that exist: an unconditional del
                        # raises KeyError in the very case (key absent) that
                        # the check above is there to handle.
                        if key in b:
                            del b[key]

                    b[NameObject('/A')] = DictionaryObject({NameObject('/S'): NameObject('/GoTo'), NameObject('/D'): ArrayObject(args)})

                    pageno = i
                    break
        if pageno is not None:
            del b['/Page'], b['/Type']
            last_added = self.output.addBookmarkDict(b, parent)
def addBookmark(self, title, pagenum, parent=None):
    """
    Add a bookmark to this PDF file.

    :param str title: Title to use for this bookmark.
    :param int pagenum: Page number this bookmark will point to.
    :param parent: A reference to a parent bookmark to create nested
        bookmarks. May be a bookmark object or title (looked up with
        :meth:`findBookmark`), or an index path (list) as returned by
        :meth:`findBookmark`.
    :return: the newly created bookmark.
    :raises ValueError: if ``parent`` is given but cannot be found.
    """
    if parent is None:
        iloc = None
    elif isinstance(parent, list):
        iloc = parent
    else:
        iloc = self.findBookmark(parent)
        if iloc is None:
            # findBookmark returns None when nothing matches; fail with a
            # clear message instead of a TypeError on iloc[:-1] below.
            raise ValueError("Parent bookmark %r not found" % (parent,))

    dest = Bookmark(TextStringObject(title), NumberObject(pagenum), NameObject('/FitH'), NumberObject(826))

    if parent is None:
        self.bookmarks.append(dest)
    else:
        # Walk down the index path to the list that contains the parent.
        bmparent = self.bookmarks
        for i in iloc[:-1]:
            bmparent = bmparent[i]
        # Children live in a list placed directly after their parent.
        npos = iloc[-1] + 1
        if npos < len(bmparent) and isinstance(bmparent[npos], list):
            bmparent[npos].append(dest)
        else:
            bmparent.insert(npos, [dest])
    return dest
class OutlinesObject(list):
    """
    A list of outline entries kept alongside an outline tree.

    :param pdf: object providing ``getObject``, ``_pages`` and
        ``_addObject`` (used by :meth:`add`).
    :param tree: object providing ``addChild``, ``removeChild`` and
        ``children``.
    :param parent: stored as ``self.parent``; not used by this class.
    """
    def __init__(self, pdf, tree, parent=None):
        list.__init__(self)
        self.tree = tree
        self.pdf = pdf
        self.parent = parent

    def remove(self, index):
        """
        Remove the entry at *index* from the list and from the tree.

        Note that, unlike ``list.remove``, the argument is an index and
        not a value.
        """
        obj = self[index]
        del self[index]
        self.tree.removeChild(obj)

    def add(self, title, pagenum):
        """
        Create a ``/GoTo`` bookmark titled *title* that points at page
        *pagenum* and attach it to the tree.

        The new bookmark is not appended to this list.
        """
        pageRef = self.pdf.getObject(self.pdf._pages)['/Kids'][pagenum]
        action = DictionaryObject()
        action.update({
            NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
            NameObject('/S') : NameObject('/GoTo')
        })
        actionRef = self.pdf._addObject(action)
        bookmark = TreeObject()

        bookmark.update({
            NameObject('/A'): actionRef,
            NameObject('/Title'): createStringObject(title),
        })

        bookmarkRef = self.pdf._addObject(bookmark)

        # NOTE(review): pass the indirect reference plus the owning writer,
        # the same call shape PdfFileWriter.addBookmark uses for
        # ``addChild``; confirm against TreeObject.addChild.
        self.tree.addChild(bookmarkRef, self.pdf)

    def removeAll(self):
        """
        Detach every child from the tree, dropping one list entry (from
        the end) per removed child while the list still has entries.
        """
        # Snapshot the children first: the tree is mutated while we walk.
        for child in list(self.tree.children()):
            self.tree.removeChild(child)
            # The list may hold fewer entries than the tree has children
            # (add() never appends); popping unconditionally would raise
            # IndexError.
            if self:
                self.pop()
-2: last two pages. + 5: from the sixth page onward. -3:-1 third & second to last. + The third, "stride" or "step" number is also recognized. + ::2 0 2 4 ... to the end. 3:0:-1 3 2 1 but not 0. + 1:10:2 1 3 5 7 9 2::-1 2 1 0. + ::-1 all pages in reverse order. +""" + + +class PageRange(object): + """ + A slice-like representation of a range of page indices, + i.e. page numbers, only starting at zero. + The syntax is like what you would put between brackets [ ]. + The slice is one of the few Python types that can't be subclassed, + but this class converts to and from slices, and allows similar use. + o PageRange(str) parses a string representing a page range. + o PageRange(slice) directly "imports" a slice. + o to_slice() gives the equivalent slice. + o str() and repr() allow printing. + o indices(n) is like slice.indices(n). + """ + + def __init__(self, arg): + """ + Initialize with either a slice -- giving the equivalent page range, + or a PageRange object -- making a copy, + or a string like + "int", "[int]:[int]" or "[int]:[int]:[int]", + where the brackets indicate optional ints. + {page_range_help} + Note the difference between this notation and arguments to slice(): + slice(3) means the first three pages; + PageRange("3") means the range of only the fourth page. + However PageRange(slice(3)) means the first three pages. + """ + if isinstance(arg, slice): + self._slice = arg + return + + if isinstance(arg, PageRange): + self._slice = arg.to_slice() + return + + m = isString(arg) and re.match(PAGE_RANGE_RE, arg) + if not m: + raise ParseError(arg) + elif m.group(2): + # Special case: just an int means a range of one page. 
def parse_filename_page_ranges(args):
    """
    Given a sequence of filenames and page ranges, return a list of
    (filename, page_range) pairs.

    The first arg must be a filename; other args are filenames, page-range
    expressions, slice objects, or PageRange objects.
    A filename not followed by a page range indicates all pages of the file.

    :param args: any iterable (list, tuple, ...) of arguments.
    :return: list of ``(filename, PageRange)`` tuples.
    :raises ValueError: if a page range appears before any filename.
    """
    pairs = []
    pdf_filename = None
    did_page_range = False
    # list(args) lets tuples and other iterables through (``args + [None]``
    # only worked for lists).  The trailing None acts as an end marker so
    # the last filename is flushed by the "new filename" branch.
    for arg in list(args) + [None]:
        if PageRange.valid(arg):
            if not pdf_filename:
                raise ValueError("The first argument must be a filename, "
                                 "not a page range.")

            pairs.append((pdf_filename, PageRange(arg)))
            did_page_range = True
        else:
            # New filename or end of list--do all of the previous file?
            if pdf_filename and not did_page_range:
                pairs.append((pdf_filename, PAGE_RANGE_ALL))

            pdf_filename = arg
            did_page_range = False
    return pairs
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" +A pure-Python PDF library with an increasing number of capabilities. +See README for links to FAQ, documentation, homepage, etc. +""" + +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +__maintainer__ = "Phaseit, Inc." +__maintainer_email = "PyPDF2@phaseit.net" + +import string +import math +import struct +import sys +import uuid +from sys import version_info +if version_info < ( 3, 0 ): + from cStringIO import StringIO +else: + from io import StringIO + +if version_info < ( 3, 0 ): + BytesIO = StringIO +else: + from io import BytesIO + +from . import filters +from . import utils +import warnings +import codecs +from .generic import * +from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList +from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning + +if version_info < ( 2, 4 ): + from sets import ImmutableSet as frozenset + +if version_info < ( 2, 5 ): + from md5 import md5 +else: + from hashlib import md5 +import uuid + + +class PdfFileWriter(object): + """ + This class supports writing PDF files out, given pages produced by another + class (typically :class:`PdfFileReader`). + """ + def __init__(self): + self._header = b_("%PDF-1.3") + self._objects = [] # array of indirect objects + + # The root of our page tree node. 
def getObject(self, ido):
    """
    Return the object this writer stores for the indirect reference *ido*.

    :param ido: reference exposing ``pdf`` and ``idnum``.
    :raises ValueError: if *ido* belongs to a different pdf.
    """
    owner = ido.pdf
    if owner != self:
        raise ValueError("pdf must be self")
    # _addObject hands out idnum == len(self._objects) after appending,
    # so ids are offset by one from list positions.
    slot = ido.idnum - 1
    return self._objects[slot]
def insertBlankPage(self, width=None, height=None, index=0):
    """
    Inserts a blank page to this PDF file and returns it. If the page size
    is not fully specified and a page already exists at *index*, the size
    of that page is used.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :param int index: Position to add the page.
    :return: the newly inserted page
    :rtype: :class:`PageObject`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    # The parentheses matter: ``a or b and c`` parses as ``a or (b and c)``,
    # which skipped the bounds check whenever width was None and then
    # called getPage() with an index that may not exist.
    if (width is None or height is None) and \
            (self.getNumPages() - 1) >= index:
        oldpage = self.getPage(index)
        width = oldpage.mediaBox.getWidth()
        height = oldpage.mediaBox.getHeight()
    page = PageObject.createBlankPage(self, width, height)
    self.insertPage(page, index)
    return page
+ endstream + endobj + """ + file_entry = DecodedStreamObject() + file_entry.setData(fdata) + file_entry.update({ + NameObject("/Type"): NameObject("/EmbeddedFile") + }) + + # The Filespec entry + """ Sample: + 7 0 obj + << + /Type /Filespec + /F (hello.txt) + /EF << /F 8 0 R >> + >> + """ + efEntry = DictionaryObject() + efEntry.update({ NameObject("/F"):file_entry }) + + filespec = DictionaryObject() + filespec.update({ + NameObject("/Type"): NameObject("/Filespec"), + NameObject("/F"): createStringObject(fname), # Perhaps also try TextStringObject + NameObject("/EF"): efEntry + }) + + # Then create the entry for the root, as it needs a reference to the Filespec + """ Sample: + 1 0 obj + << + /Type /Catalog + /Outlines 2 0 R + /Pages 3 0 R + /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> + >> + endobj + + """ + embeddedFilesNamesDictionary = DictionaryObject() + embeddedFilesNamesDictionary.update({ + NameObject("/Names"): ArrayObject([createStringObject(fname), filespec]) + }) + + embeddedFilesDictionary = DictionaryObject() + embeddedFilesDictionary.update({ + NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary + }) + # Update the root + self._root_object.update({ + NameObject("/Names"): embeddedFilesDictionary + }) + + def appendPagesFromReader(self, reader, after_page_append=None): + """ + Copy pages from reader to writer. Includes an optional callback parameter + which is invoked after pages are appended to the writer. + + :param reader: a PdfFileReader object from which to copy page + annotations to this writer object. The writer's annots + will then be updated + :callback after_page_append (function): Callback function that is invoked after + each page is appended to the writer. Callback signature: + + :param writer_pageref (PDF page reference): Reference to the page + appended to the writer. 
def updatePageFormFieldValues(self, page, fields):
    '''
    Update the form field values for a given page from a fields dictionary.
    Copy field texts and values from fields to page.

    A page that has no ``/Annots`` entry is left untouched.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    '''
    if '/Annots' not in page:
        # Nothing to update; indexing would raise KeyError.
        return

    for annot_ref in page['/Annots']:
        writer_annot = annot_ref.getObject()
        annot_name = writer_annot.get('/T')
        # Compare with == rather than a dict lookup so the field name
        # read from the annotation never has to be hashable.
        for field in fields:
            if annot_name == field:
                writer_annot.update({
                    NameObject("/V"): TextStringObject(fields[field])
                })
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Stores the document id in ``self._ID``, the encryption dictionary
    reference in ``self._encrypt`` and the key in ``self._encrypt_key``.

    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions.  By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption.  When false, 40bit encryption will be used.  By default,
        this flag is on.
    """
    import random
    import time
    if owner_pwd is None:
        owner_pwd = user_pwd
    if use_128bit:
        V = 2
        rev = 3
        keylen = 128 // 8
    else:
        V = 1
        rev = 2
        keylen = 40 // 8
    # permit everything:
    P = -1
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
    # The two file identifiers are digests of the current time and of a
    # random number.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    self._ID = ArrayObject((ID_1, ID_2))
    if rev == 2:
        U, key = _alg34(user_pwd, O, P, ID_1)
    else:
        assert rev == 3
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
    encrypt = DictionaryObject()
    encrypt[NameObject("/Filter")] = NameObject("/Standard")
    encrypt[NameObject("/V")] = NumberObject(V)
    if V == 2:
        # /Length is written in bits and only for the V == 2 case.
        encrypt[NameObject("/Length")] = NumberObject(keylen * 8)
    encrypt[NameObject("/R")] = NumberObject(rev)
    encrypt[NameObject("/O")] = ByteStringObject(O)
    encrypt[NameObject("/U")] = ByteStringObject(U)
    encrypt[NameObject("/P")] = NumberObject(P)
    self._encrypt = self._addObject(encrypt)
    self._encrypt_key = key
The object must support + the write method and the tell method, similar to a file object. + """ + if hasattr(stream, 'mode') and 'b' not in stream.mode: + warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name) + debug = False + import struct + + if not self._root: + self._root = self._addObject(self._root_object) + + externalReferenceMap = {} + + # PDF objects sometimes have circular references to their /Page objects + # inside their object tree (for example, annotations). Those will be + # indirect references to objects that we've recreated in this PDF. To + # address this problem, PageObject's store their original object + # reference number, and we add it to the external reference map before + # we sweep for indirect references. This forces self-page-referencing + # trees to reference the correct new object location, rather than + # copying in a new copy of the page object. + for objIndex in range(len(self._objects)): + obj = self._objects[objIndex] + if isinstance(obj, PageObject) and obj.indirectRef != None: + data = obj.indirectRef + if data.pdf not in externalReferenceMap: + externalReferenceMap[data.pdf] = {} + if data.generation not in externalReferenceMap[data.pdf]: + externalReferenceMap[data.pdf][data.generation] = {} + externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self) + + self.stack = [] + if debug: print(("ERM:", externalReferenceMap, "root:", self._root)) + self._sweepIndirectReferences(externalReferenceMap, self._root) + del self.stack + + # Begin writing: + object_positions = [] + stream.write(self._header + b_("\n")) + for i in range(len(self._objects)): + idnum = (i + 1) + obj = self._objects[i] + object_positions.append(stream.tell()) + stream.write(b_(str(idnum) + " 0 obj\n")) + key = None + if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: + pack1 = struct.pack("` for details. 
def addNamedDestination(self, title, pagenum):
    """
    Add a named ``/GoTo`` destination pointing at page *pagenum*.

    :param title: name of the destination; a plain Python string is
        converted to a PDF string object before it is stored.
    :param int pagenum: index into the page tree's ``/Kids`` array.
    :return: indirect reference to the new destination dictionary.
    """
    pageRef = self.getObject(self._pages)['/Kids'][pagenum]
    dest = DictionaryObject()
    dest.update({
        NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S') : NameObject('/GoTo')
    })

    destRef = self._addObject(dest)
    nd = self.getNamedDestRoot()

    # addNamedDestinationObject stores dest['/Title'], a PDF string object;
    # store the same kind of object here.
    # NOTE(review): a bare Python str in the array presumably cannot be
    # serialised when the file is written -- confirm.
    if not isinstance(title, (TextStringObject, ByteStringObject)):
        title = createStringObject(title)
    nd.extend([title, destRef])

    return destRef
def removeText(self, ignoreByteStringObject=False):
    """
    Removes text from this output.

    String operands of the text-showing operators ``Tj``, ``'``, ``"`` and
    ``TJ`` are replaced with empty text strings.  By default only
    :class:`TextStringObject` operands are blanked.

    :param bool ignoreByteStringObject: when true,
        :class:`ByteStringObject` operands are blanked as well.
    """
    def is_text(obj):
        # Decide whether an operand counts as text to be blanked.
        if isinstance(obj, TextStringObject):
            return True
        return bool(ignoreByteStringObject) and isinstance(obj, ByteStringObject)

    # Operator -> position of its string operand.
    string_operand = {b_('Tj'): 0, b_("'"): 0, b_('"'): 2}
    array_operator = b_("TJ")

    pages = self.getObject(self._pages)['/Kids']
    for page in pages:
        pageRef = self.getObject(page)
        content = pageRef['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pageRef)
        for operands, operator in content.operations:
            if operator == array_operator:
                # TJ takes an array; blank each text element in place.
                items = operands[0]
                for i, item in enumerate(items):
                    if is_text(item):
                        items[i] = TextStringObject()
            elif operator in string_operand:
                pos = string_operand[operator]
                if is_text(operands[pos]):
                    operands[pos] = TextStringObject()

        pageRef.__setitem__(NameObject('/Contents'), content)
+ :param rect: :class:`RectangleObject` or array of four + integers specifying the clickable rectangular area + ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``. + :param border: if provided, an array describing border-drawing + properties. See the PDF spec for details. No border will be + drawn if this argument is omitted. + :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need + to be supplied. Passing ``None`` will be read as a null value for that coordinate. + + Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details): + /Fit No additional arguments + /XYZ [left] [top] [zoomFactor] + /FitH [top] + /FitV [left] + /FitR [left] [bottom] [right] [top] + /FitB No additional arguments + /FitBH [top] + /FitBV [left] + """ + + pageLink = self.getObject(self._pages)['/Kids'][pagenum] + pageDest = self.getObject(self._pages)['/Kids'][pagedest] #TODO: switch for external link + pageRef = self.getObject(pageLink) + + if border is not None: + borderArr = [NameObject(n) for n in border[:3]] + if len(border) == 4: + dashPattern = ArrayObject([NameObject(n) for n in border[3]]) + borderArr.append(dashPattern) + else: + borderArr = [NumberObject(0)] * 3 + + if isString(rect): + rect = NameObject(rect) + elif isinstance(rect, RectangleObject): + pass + else: + rect = RectangleObject(rect) + + zoomArgs = [] + for a in args: + if a is not None: + zoomArgs.append(NumberObject(a)) + else: + zoomArgs.append(NullObject()) + dest = Destination(NameObject("/LinkName"), pageDest, NameObject(fit), *zoomArgs) #TODO: create a better name for the link + destArray = dest.getDestArray() + + lnk = DictionaryObject() + lnk.update({ + NameObject('/Type'): NameObject('/Annot'), + NameObject('/Subtype'): NameObject('/Link'), + NameObject('/P'): pageLink, + NameObject('/Rect'): rect, + NameObject('/Border'): ArrayObject(borderArr), + NameObject('/Dest'): destArray + }) + lnkRef = self._addObject(lnk) + + if "/Annots" in 
def getPageLayout(self):
    """
    Get the page layout.
    See :meth:`setPageLayout()` for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: str, None if not specified
    """
    catalog = self._root_object
    try:
        layout = catalog['/PageLayout']
    except KeyError:
        # No layout has been set on the root object.
        layout = None
    return layout
+ + :param str mode: The page mode to use. + + Valid modes are: + /UseNone Do not show outlines or thumbnails panels + /UseOutlines Show outlines (aka bookmarks) panel + /UseThumbs Show page thumbnails panel + /FullScreen Fullscreen view + /UseOC Show Optional Content Group (OCG) panel + /UseAttachments Show attachments panel + """ + if not isinstance(mode, NameObject): + if mode not in self._valid_modes: + warnings.warn("Mode should be one of: {}".format(', '.join(self._valid_modes))) + mode = NameObject(mode) + self._root_object.update({NameObject('/PageMode'): mode}) + + pageMode = property(getPageMode, setPageMode) + """Read and write property accessing the :meth:`getPageMode()` + and :meth:`setPageMode()` methods.""" + + +class PdfFileReader(object): + """ + Initializes a PdfFileReader object. This operation can take some time, as + the PDF stream's cross-reference tables are read into memory. + + :param stream: A File object or an object that supports the standard read + and seek methods similar to a File object. Could also be a + string representing a path to a PDF file. + :param bool strict: Determines whether user should be warned of all + problems and also causes some correctable problems to be fatal. + Defaults to ``True``. + :param warndest: Destination for logging warnings (defaults to + ``sys.stderr``). + :param bool overwriteWarnings: Determines whether to override Python's + ``warnings.py`` module with a custom implementation (defaults to + ``True``). 
def __init__(self, stream, strict=True, warndest=None, overwriteWarnings=True):
    """
    Set up the reader state and parse the cross-reference data of *stream*.

    :param stream: A file-like object supporting ``read`` and ``seek``, or
        a string holding the path of a PDF file.  A path is read fully
        into an in-memory ``BytesIO`` buffer.
    :param bool strict: Stored as ``self.strict``; consulted by the parsing
        code to decide whether correctable problems are fatal.
    :param warndest: Destination that warnings are written to when
        *overwriteWarnings* is true (``sys.stderr`` when ``None``).
    :param bool overwriteWarnings: When true, ``warnings.showwarning`` is
        replaced process-wide by a writer targeting *warndest*.
    """
    if overwriteWarnings:
        # have to dynamically override the default showwarning since there are no
        # public methods that specify the 'file' parameter
        def _showwarning(message, category, filename, lineno, file=warndest, line=None):
            if file is None:
                file = sys.stderr
            try:
                file.write(formatWarning(message, category, filename, lineno, line))
            except IOError:
                # Best effort: a broken warning destination must not abort parsing.
                pass
        warnings.showwarning = _showwarning
    self.strict = strict
    self.flattenedPages = None
    self.resolvedObjects = {}
    self.xrefIndex = 0
    self._pageId2Num = None  # map page IndirectRef number to Page Number
    # getObject() reads this flag; define it before read() so the attribute
    # exists whenever parsing resolves an object.
    self._override_encryption = False
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
    if isString(stream):
        # The context manager closes the file even if reading it fails.
        with open(stream, 'rb') as fileobj:
            stream = BytesIO(b_(fileobj.read()))
    # getObject() seeks on self.stream.  read() hands this reader to
    # readObject(), which presumably may resolve indirect references through
    # getObject() -- TODO confirm against generic.py -- so publish the stream
    # before parsing starts.
    self.stream = stream
    self.read(stream)
def getNumPages(self):
    """
    Calculates the number of pages in this PDF file.

    For an encrypted file the ``/Count`` entry of the page tree root is
    returned after attempting to decrypt with an empty password; otherwise
    the pages are flattened (once) and counted.

    :return: number of pages
    :rtype: int
    :raises PdfReadError: if file is encrypted and restrictions prevent
        this action.
    """
    # Flattened pages will not work on an Encrypted PDF;
    # the PDF file's page count is used in this case. Otherwise,
    # the original method (flattened page count) is used.
    if self.isEncrypted:
        try:
            self._override_encryption = True
            self.decrypt('')
            return self.trailer["/Root"]["/Pages"]["/Count"]
        except Exception:
            # Only ordinary errors are translated; KeyboardInterrupt and
            # SystemExit must propagate instead of being reported as a
            # decryption failure.
            raise utils.PdfReadError("File has not been decrypted")
        finally:
            self._override_encryption = False
    if self.flattenedPages is None:
        self._flatten()
    return len(self.flattenedPages)
def getFields(self, tree=None, retval=None, fileobj=None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use.

    When *retval* is ``None`` (the top-level call) the walk starts at the
    catalog's ``/AcroForm`` entry and any *tree* argument is ignored.

    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    # A select group of relevant field attributes; a node carrying any of
    # these keys is itself treated as a field.
    fieldAttributes = {"/FT": "Field Type", "/Parent": "Parent",
                       "/T": "Field Name", "/TU": "Alternate Field Name",
                       "/TM": "Mapping Name", "/Ff": "Field Flags",
                       "/V": "Value", "/DV": "Default Value"}
    if retval is None:
        retval = {}
        catalog = self.trailer["/Root"]
        # get the AcroForm tree
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]
    if tree is None:
        return retval

    self._checkKids(tree, retval, fileobj)
    if any(attr in tree for attr in fieldAttributes):
        # Tree is a field
        self._buildField(tree, retval, fileobj, fieldAttributes)

    if "/Fields" in tree:
        for f in tree["/Fields"]:
            self._buildField(f.getObject(), retval, fileobj, fieldAttributes)

    return retval
def getFormTextFields(self):
    """
    Retrieves the text form fields of the document.

    Only fields whose ``/FT`` entry equals ``/Tx`` are kept.

    :return: A dictionary mapping each text field's ``/T`` name to its
        ``/V`` value (``None`` when the field has no value).  Empty when
        :meth:`getFields()` reports no form data.
    :rtype: dict
    """
    # Retrieve document form fields; getFields() returns None when the
    # document has no /AcroForm, which must not be iterated.
    formfields = self.getFields()
    if not formfields:
        return {}
    return dict(
        (field['/T'], field.get('/V'))
        for field in formfields.values()
        if field.get('/FT') == '/Tx'
    )
+ :rtype: dict + """ + if retval == None: + retval = {} + catalog = self.trailer["/Root"] + + # get the name tree + if "/Dests" in catalog: + tree = catalog["/Dests"] + elif "/Names" in catalog: + names = catalog['/Names'] + if "/Dests" in names: + tree = names['/Dests'] + + if tree == None: + return retval + + if "/Kids" in tree: + # recurse down the tree + for kid in tree["/Kids"]: + self.getNamedDestinations(kid.getObject(), retval) + + if "/Names" in tree: + names = tree["/Names"] + for i in range(0, len(names), 2): + key = names[i].getObject() + val = names[i+1].getObject() + if isinstance(val, DictionaryObject) and '/D' in val: + val = val['/D'] + dest = self._buildDestination(key, val) + if dest != None: + retval[key] = dest + + return retval + + outlines = property(lambda self: self.getOutlines(), None, None) + """ + Read-only property that accesses the + :meth:`getOutlines()` function. + """ + + def getOutlines(self, node=None, outlines=None): + """ + Retrieves the document outline present in the document. + + :return: a nested list of :class:`Destinations`. 
+ """ + if outlines == None: + outlines = [] + catalog = self.trailer["/Root"] + + # get the outline dictionary and named destinations + if "/Outlines" in catalog: + try: + lines = catalog["/Outlines"] + except utils.PdfReadError: + # this occurs if the /Outlines object reference is incorrect + # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf + # so continue to load the file without the Bookmarks + return outlines + + if "/First" in lines: + node = lines["/First"] + self._namedDests = self.getNamedDestinations() + + if node == None: + return outlines + + # see if there are any more outlines + while True: + outline = self._buildOutline(node) + if outline: + outlines.append(outline) + + # check for sub-outlines + if "/First" in node: + subOutlines = [] + self.getOutlines(node["/First"], subOutlines) + if subOutlines: + outlines.append(subOutlines) + + if "/Next" not in node: + break + node = node["/Next"] + + return outlines + + def _getPageNumberByIndirect(self, indirectRef): + """Generate _pageId2Num""" + if self._pageId2Num is None: + id2num = {} + for i, x in enumerate(self.pages): + id2num[x.indirectRef.idnum] = i + self._pageId2Num = id2num + + if isinstance(indirectRef, int): + idnum = indirectRef + else: + idnum = indirectRef.idnum + + ret = self._pageId2Num.get(idnum, -1) + return ret + + def getPageNumber(self, page): + """ + Retrieve page number of a given PageObject + + :param PageObject page: The page to get page number. Should be + an instance of :class:`PageObject` + :return: the page number or -1 if page not found + :rtype: int + """ + indirectRef = page.indirectRef + ret = self._getPageNumberByIndirect(indirectRef) + return ret + + def getDestinationPageNumber(self, destination): + """ + Retrieve page number of a given Destination object + + :param Destination destination: The destination to get page number. 
+ Should be an instance of + :class:`Destination` + :return: the page number or -1 if page not found + :rtype: int + """ + indirectRef = destination.page + ret = self._getPageNumberByIndirect(indirectRef) + return ret + + def _buildDestination(self, title, array): + page, typ = array[0:2] + array = array[2:] + return Destination(title, page, typ, *array) + + def _buildOutline(self, node): + dest, title, outline = None, None, None + + if "/A" in node and "/Title" in node: + # Action, section 8.5 (only type GoTo supported) + title = node["/Title"] + action = node["/A"] + if action["/S"] == "/GoTo": + dest = action["/D"] + elif "/Dest" in node and "/Title" in node: + # Destination, section 8.2.1 + title = node["/Title"] + dest = node["/Dest"] + + # if destination found, then create outline + if dest: + if isinstance(dest, ArrayObject): + outline = self._buildDestination(title, dest) + elif isString(dest) and dest in self._namedDests: + outline = self._namedDests[dest] + outline[NameObject("/Title")] = title + else: + raise utils.PdfReadError("Unexpected destination %r" % dest) + return outline + + pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), + None, None) + """ + Read-only property that emulates a list based upon the + :meth:`getNumPages()` and + :meth:`getPage()` methods. + """ + + def getPageLayout(self): + """ + Get the page layout. + See :meth:`setPageLayout()` + for a description of valid layouts. + + :return: Page layout currently being used. + :rtype: ``str``, ``None`` if not specified + """ + try: + return self.trailer['/Root']['/PageLayout'] + except KeyError: + return None + + pageLayout = property(getPageLayout) + """Read-only property accessing the + :meth:`getPageLayout()` method.""" + + def getPageMode(self): + """ + Get the page mode. + See :meth:`setPageMode()` + for a description of valid modes. + + :return: Page mode currently being used. 
def _flatten(self, pages=None, inherit=None, indirectRef=None):
    """
    Walk the page tree and collect its leaves into ``self.flattenedPages``.

    Called without arguments it resets ``self.flattenedPages`` and starts
    at the ``/Pages`` entry of the catalog; it then recurses through
    ``/Kids``.

    :param pages: The page-tree node to process (``None`` for the root).
    :param dict inherit: Inheritable attributes collected from the
        ancestors of *pages*.
    :param indirectRef: The indirect reference through which *pages* was
        reached, stored on the resulting :class:`PageObject`.
    """
    inheritablePageAttributes = (
        NameObject("/Resources"), NameObject("/MediaBox"),
        NameObject("/CropBox"), NameObject("/Rotate")
    )
    if inherit is None:
        inherit = dict()
    if pages is None:
        self.flattenedPages = []
        catalog = self.trailer["/Root"].getObject()
        pages = catalog["/Pages"].getObject()

    # A node without /Type is treated as an intermediate /Pages node.
    t = "/Pages"
    if "/Type" in pages:
        t = pages["/Type"]

    if t == "/Pages":
        # Work on a private copy: attributes declared by this node apply
        # to its own descendants only and must not leak into siblings
        # that share the parent's dictionary.
        inherit = dict(inherit)
        for attr in inheritablePageAttributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages["/Kids"]:
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirectRef"] = page
            self._flatten(page.getObject(), inherit, **addt)
    elif t == "/Page":
        for attr, value in list(inherit.items()):
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr not in pages:
                pages[attr] = value
        pageObj = PageObject(self, indirectRef)
        pageObj.update(pages)
        self.flattenedPages.append(pageObj)
stmnum=%s data=%s"%(objStm, stmnum, objStm.getData()))) + # This is an xref to a stream, so its type better be a stream + assert objStm['/Type'] == '/ObjStm' + # /N is the number of indirect objects in the stream + assert idx < objStm['/N'] + streamData = BytesIO(b_(objStm.getData())) + for i in range(objStm['/N']): + readNonWhitespace(streamData) + streamData.seek(-1, 1) + objnum = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + offset = NumberObject.readFromStream(streamData) + readNonWhitespace(streamData) + streamData.seek(-1, 1) + if objnum != indirectReference.idnum: + # We're only interested in one object + continue + if self.strict and idx != i: + raise utils.PdfReadError("Object is in wrong index.") + streamData.seek(objStm['/First']+offset, 0) + if debug: + pos = streamData.tell() + streamData.seek(0, 0) + lines = streamData.readlines() + for i in range(0, len(lines)): + print((lines[i])) + streamData.seek(pos, 0) + try: + obj = readObject(streamData, self) + except utils.PdfStreamError as e: + # Stream object cannot be read. Normally, a critical error, but + # Adobe Reader doesn't complain, so continue (in strict mode?) + e = sys.exc_info()[1] + warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \ + (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning) + + if self.strict: + raise utils.PdfReadError("Can't read object stream: %s"%e) + # Replace with null. Hopefully it's nothing important. 
+ obj = NullObject() + return obj + + if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.") + return NullObject() + + def getObject(self, indirectReference): + debug = False + if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation)) + retval = self.cacheGetIndirectObject(indirectReference.generation, + indirectReference.idnum) + if retval != None: + return retval + if indirectReference.generation == 0 and \ + indirectReference.idnum in self.xref_objStm: + retval = self._getObjectFromStream(indirectReference) + elif indirectReference.generation in self.xref and \ + indirectReference.idnum in self.xref[indirectReference.generation]: + start = self.xref[indirectReference.generation][indirectReference.idnum] + if debug: print((" Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start)) + self.stream.seek(start, 0) + idnum, generation = self.readObjectHeader(self.stream) + if idnum != indirectReference.idnum and self.xrefIndex: + # Xref table probably had bad indexes due to not being zero-indexed + if self.strict: + raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \ + % (indirectReference.idnum, indirectReference.generation, idnum, generation)) + else: pass # xref table is corrected in non-strict mode + elif idnum != indirectReference.idnum: + # some other problem + raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \ + % (indirectReference.idnum, indirectReference.generation, idnum, generation)) + assert generation == indirectReference.generation + retval = readObject(self.stream, self) + + # override encryption is used for the /Encrypt dictionary + if not self._override_encryption and self.isEncrypted: + # if we don't have the encryption key: + if not hasattr(self, '_decryption_key'): + raise utils.PdfReadError("file has not been decrypted") + # otherwise, decrypt here... 
+ import struct + pack1 = struct.pack(">read", stream) + # start at the end: + stream.seek(-1, 2) + if not stream.tell(): + raise utils.PdfReadError('Cannot read an empty file') + last1K = stream.tell() - 1024 + 1 # offset of last 1024 bytes of stream + line = b_('') + while line[:5] != b_("%%EOF"): + if stream.tell() < last1K: + raise utils.PdfReadError("EOF marker not found") + line = self.readNextEndLine(stream) + if debug: print(" line:",line) + + # find startxref entry - the location of the xref table + line = self.readNextEndLine(stream) + try: + startxref = int(line) + except ValueError: + # 'startxref' may be on the same line as the location + if not line.startswith(b_("startxref")): + raise utils.PdfReadError("startxref not found") + startxref = int(line[9:].strip()) + warnings.warn("startxref on same line as offset") + else: + line = self.readNextEndLine(stream) + if line[:9] != b_("startxref"): + raise utils.PdfReadError("startxref not found") + + # read all cross reference tables and their trailers + self.xref = {} + self.xref_objStm = {} + self.trailer = DictionaryObject() + while True: + # load the xref table + stream.seek(startxref, 0) + x = stream.read(1) + if x == b_("x"): + # standard cross-reference table + ref = stream.read(4) + if ref[:3] != b_("ref"): + raise utils.PdfReadError("xref table read error") + readNonWhitespace(stream) + stream.seek(-1, 1) + firsttime = True; # check if the first time looking at the xref table + while True: + num = readObject(stream, self) + if firsttime and num != 0: + self.xrefIndex = num + if self.strict: + warnings.warn("Xref table not zero-indexed. 
ID numbers for objects will be corrected.", utils.PdfReadWarning) + #if table not zero indexed, could be due to error from when PDF was created + #which will lead to mismatched indices later on, only warned and corrected if self.strict=True + firsttime = False + readNonWhitespace(stream) + stream.seek(-1, 1) + size = readObject(stream, self) + readNonWhitespace(stream) + stream.seek(-1, 1) + cnt = 0 + while cnt < size: + line = stream.read(20) + + # It's very clear in section 3.4.3 of the PDF spec + # that all cross-reference table lines are a fixed + # 20 bytes (as of PDF 1.7). However, some files have + # 21-byte entries (or more) due to the use of \r\n + # (CRLF) EOL's. Detect that case, and adjust the line + # until it does not begin with a \r (CR) or \n (LF). + while line[0] in b_("\x0D\x0A"): + stream.seek(-20 + 1, 1) + line = stream.read(20) + + # On the other hand, some malformed PDF files + # use a single character EOL without a preceeding + # space. Detect that case, and seek the stream + # back one character. (0-9 means we've bled into + # the next xref entry, t means we've bled into the + # text "trailer"): + if line[-1] in b_("0123456789t"): + stream.seek(-1, 1) + + offset, generation = line[:16].split(b_(" ")) + offset, generation = int(offset), int(generation) + if generation not in self.xref: + self.xref[generation] = {} + if num in self.xref[generation]: + # It really seems like we should allow the last + # xref table in the file to override previous + # ones. Since we read the file backwards, assume + # any existing key is already set correctly. + pass + else: + self.xref[generation][num] = offset + cnt += 1 + num += 1 + readNonWhitespace(stream) + stream.seek(-1, 1) + trailertag = stream.read(7) + if trailertag != b_("trailer"): + # more xrefs! 
+ stream.seek(-7, 1) + else: + break + readNonWhitespace(stream) + stream.seek(-1, 1) + newTrailer = readObject(stream, self) + for key, value in list(newTrailer.items()): + if key not in self.trailer: + self.trailer[key] = value + if "/Prev" in newTrailer: + startxref = newTrailer["/Prev"] + else: + break + elif x.isdigit(): + # PDF 1.5+ Cross-Reference Stream + stream.seek(-1, 1) + idnum, generation = self.readObjectHeader(stream) + xrefstream = readObject(stream, self) + assert xrefstream["/Type"] == "/XRef" + self.cacheIndirectObject(generation, idnum, xrefstream) + streamData = BytesIO(b_(xrefstream.getData())) + # Index pairs specify the subsections in the dictionary. If + # none create one subsection that spans everything. + idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) + if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs)))) + entrySizes = xrefstream.get("/W") + assert len(entrySizes) >= 3 + if self.strict and len(entrySizes) > 3: + raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes) + + def getEntry(i): + # Reads the correct number of bytes for each entry. See the + # discussion of the W parameter in PDF spec table 17. + if entrySizes[i] > 0: + d = streamData.read(entrySizes[i]) + return convertToInt(d, entrySizes[i]) + + # PDF Spec Table 17: A value of zero for an element in the + # W array indicates...the default value shall be used + if i == 0: return 1 # First value defaults to 1 + else: return 0 + + def used_before(num, generation): + # We move backwards through the xrefs, don't replace any. 
+ return num in self.xref.get(generation, []) or \ + num in self.xref_objStm + + # Iterate through each subsection + last_end = 0 + for start, size in self._pairs(idx_pairs): + # The subsections must increase + assert start >= last_end + last_end = start + size + for num in range(start, start+size): + # The first entry is the type + xref_type = getEntry(0) + # The rest of the elements depend on the xref_type + if xref_type == 0: + # linked list of free objects + next_free_object = getEntry(1) + next_generation = getEntry(2) + elif xref_type == 1: + # objects that are in use but are not compressed + byte_offset = getEntry(1) + generation = getEntry(2) + if generation not in self.xref: + self.xref[generation] = {} + if not used_before(num, generation): + self.xref[generation][num] = byte_offset + if debug: print(("XREF Uncompressed: %s %s"%( + num, generation))) + elif xref_type == 2: + # compressed objects + objstr_num = getEntry(1) + obstr_idx = getEntry(2) + generation = 0 # PDF spec table 18, generation is 0 + if not used_before(num, generation): + if debug: print(("XREF Compressed: %s %s %s"%( + num, objstr_num, obstr_idx))) + self.xref_objStm[num] = (objstr_num, obstr_idx) + elif self.strict: + raise utils.PdfReadError("Unknown xref type: %s"% + xref_type) + + trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" + for key in trailerKeys: + if key in xrefstream and key not in self.trailer: + self.trailer[NameObject(key)] = xrefstream.raw_get(key) + if "/Prev" in xrefstream: + startxref = xrefstream["/Prev"] + else: + break + else: + # bad xref character at startxref. Let's see if we can find + # the xref table nearby, as we've observed this error with an + # off-by-one before. + stream.seek(-11, 1) + tmp = stream.read(20) + xref_loc = tmp.find(b_("xref")) + if xref_loc != -1: + startxref -= (10 - xref_loc) + continue + # No explicit xref table, try finding a cross-reference stream. 
+ stream.seek(startxref, 0) + found = False + for look in range(5): + if stream.read(1).isdigit(): + # This is not a standard PDF, consider adding a warning + startxref += look + found = True + break + if found: + continue + # no xref table found at specified location + raise utils.PdfReadError("Could not find xref table at specified location") + #if not zero-indexed, verify that the table is correct; change it if necessary + if self.xrefIndex and not self.strict: + loc = stream.tell() + for gen in self.xref: + if gen == 65535: continue + for id in self.xref[gen]: + stream.seek(self.xref[gen][id], 0) + try: + pid, pgen = self.readObjectHeader(stream) + except ValueError: + break + if pid == id - self.xrefIndex: + self._zeroXref(gen) + break + #if not, then either it's just plain wrong, or the non-zero-index is actually correct + stream.seek(loc, 0) #return to where it was + + def _zeroXref(self, generation): + self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) ) + + def _pairs(self, array): + i = 0 + while True: + yield array[i], array[i+1] + i += 2 + if (i+1) >= len(array): + break + + def readNextEndLine(self, stream): + debug = False + if debug: print(">>readNextEndLine") + line = b_("") + while True: + # Prevent infinite loops in malformed PDFs + if stream.tell() == 0: + raise utils.PdfReadError("Could not read malformed PDF file") + x = stream.read(1) + if debug: print((" x:", x, "%x"%ord(x))) + if stream.tell() < 2: + raise utils.PdfReadError("EOL marker not found") + stream.seek(-2, 1) + if x == b_('\n') or x == b_('\r'): ## \n = LF; \r = CR + crlf = False + while x == b_('\n') or x == b_('\r'): + if debug: + if ord(x) == 0x0D: print(" x is CR 0D") + elif ord(x) == 0x0A: print(" x is LF 0A") + x = stream.read(1) + if x == b_('\n') or x == b_('\r'): # account for CR+LF + stream.seek(-1, 1) + crlf = True + if stream.tell() < 2: + raise utils.PdfReadError("EOL marker not found") + stream.seek(-2, 1) + 
stream.seek(2 if crlf else 1, 1) #if using CR+LF, go back 2 bytes, else 1 + break + else: + if debug: print(" x is neither") + line = x + line + if debug: print((" RNEL line:", line)) + if debug: print("leaving RNEL") + return line + + def decrypt(self, password): + """ + When using an encrypted / secured PDF file with the PDF Standard + encryption handler, this function will allow the file to be decrypted. + It checks the given password against the document's user password and + owner password, and then stores the resulting decryption key if either + password is correct. + + It does not matter which password was matched. Both passwords provide + the correct decryption key that will allow the document to be used with + this library. + + :param str password: The password to match. + :return: ``0`` if the password failed, ``1`` if the password matched the user + password, and ``2`` if the password matched the owner password. + :rtype: int + :raises NotImplementedError: if document uses an unsupported encryption + method. 
+ """ + + self._override_encryption = True + try: + return self._decrypt(password) + finally: + self._override_encryption = False + + def _decrypt(self, password): + encrypt = self.trailer['/Encrypt'].getObject() + if encrypt['/Filter'] != '/Standard': + raise NotImplementedError("only Standard PDF encryption handler is available") + if not (encrypt['/V'] in (1, 2)): + raise NotImplementedError("only algorithm code 1 and 2 are supported") + user_password, key = self._authenticateUserPassword(password) + if user_password: + self._decryption_key = key + return 1 + else: + rev = encrypt['/R'].getObject() + if rev == 2: + keylen = 5 + else: + keylen = encrypt['/Length'].getObject() // 8 + key = _alg33_1(password, rev, keylen) + real_O = encrypt["/O"].getObject() + if rev == 2: + userpass = utils.RC4_encrypt(key, real_O) + else: + val = real_O + for i in range(19, -1, -1): + new_key = b_('') + for l in range(len(key)): + new_key += b_(chr(utils.ord_(key[l]) ^ i)) + val = utils.RC4_encrypt(new_key, val) + userpass = val + owner_password, key = self._authenticateUserPassword(userpass) + if owner_password: + self._decryption_key = key + return 2 + return 0 + + def _authenticateUserPassword(self, password): + encrypt = self.trailer['/Encrypt'].getObject() + rev = encrypt['/R'].getObject() + owner_entry = encrypt['/O'].getObject() + p_entry = encrypt['/P'].getObject() + id_entry = self.trailer['/ID'].getObject() + id1_entry = id_entry[0].getObject() + real_U = encrypt['/U'].getObject().original_bytes + if rev == 2: + U, key = _alg34(password, owner_entry, p_entry, id1_entry) + elif rev >= 3: + U, key = _alg35(password, rev, + encrypt["/Length"].getObject() // 8, owner_entry, + p_entry, id1_entry, + encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) + U, real_U = U[:16], real_U[:16] + return U == real_U, key + + def getIsEncrypted(self): + return "/Encrypt" in self.trailer + + isEncrypted = property(lambda self: self.getIsEncrypted(), None, None) + """ + 
def getRectangle(self, name, defaults):
    """
    Fetch the rectangle stored under *name*, falling back to *defaults*.

    The value found is converted to a :class:`RectangleObject`, stored
    back under *name* via ``setRectangle`` and returned, so later lookups
    hit the ``RectangleObject`` fast path.

    :param self: The dictionary-like object holding the entry; it must
        also expose ``pdf`` for resolving indirect references.
    :param name: Key of the rectangle to read.
    :param defaults: Keys tried in order when *name* is absent.
    :return: The rectangle for *name*.
    :rtype: :class:`RectangleObject`
    """
    retval = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    if retval is None:
        # Use the first fallback key that is present.
        for d in defaults:
            retval = self.get(d)
            if retval is not None:
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.getObject(retval)
    retval = RectangleObject(retval)
    setRectangle(self, name, retval)
    return retval
def rotateClockwise(self, angle):
    """
    Rotates a page clockwise by increments of 90 degrees.

    :param int angle: Angle to rotate the page. Must be an increment
        of 90 deg.
    :return: This page, so calls can be chained.
    :raises ValueError: if *angle* is not a multiple of 90.
    """
    # Explicit check instead of ``assert``: assertions are removed when
    # Python runs with -O, which would let invalid angles through.
    if angle % 90 != 0:
        raise ValueError("Rotation angle must be a multiple of 90")
    self._rotate(angle)
    return self
def _rotate(self, angle):
    """
    Adds ``angle`` to the page's current ``/Rotate`` value and stores the sum.

    A missing ``/Rotate`` entry counts as 0.

    :param int angle: signed number of degrees to add to the rotation.
    """
    rotateObj = self.get("/Rotate", 0)
    # The stored entry is not necessarily a plain number (it can be, for
    # instance, an indirect reference); resolve it before doing arithmetic.
    if not isinstance(rotateObj, int):
        rotateObj = rotateObj.getObject()
    self[NameObject("/Rotate")] = NumberObject(rotateObj + angle)
def getContents(self):
    """
    Gives access to the content of the page.

    :return: the resolved ``/Contents`` object, or ``None`` when the page
        has no such entry (``/Contents`` is optional, see PDF Reference
        7.7.3.3).
    """
    if "/Contents" not in self:
        return None
    contents = self["/Contents"]
    return contents.getObject()
+ newResources[NameObject("/ProcSet")] = ArrayObject( + frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union( + frozenset(page2Resources.get("/ProcSet", ArrayObject()).getObject()) + ) + ) + + newContentArray = ArrayObject() + + originalContent = self.getContents() + if originalContent is not None: + newContentArray.append(PageObject._pushPopGS( + originalContent, self.pdf)) + + page2Content = page2.getContents() + if page2Content is not None: + if page2transformation is not None: + page2Content = page2transformation(page2Content) + page2Content = PageObject._contentStreamRename( + page2Content, rename, self.pdf) + page2Content = PageObject._pushPopGS(page2Content, self.pdf) + newContentArray.append(page2Content) + + # if expanding the page to fit a new page, calculate the new media box size + if expand: + corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(), + self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()] + corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(), + page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(), + page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(), + page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()] + if ctm is not None: + ctm = [float(x) for x in ctm] + new_x = [ctm[0]*corners2[i] + ctm[2]*corners2[i+1] + ctm[4] for i in range(0, 8, 2)] + new_y = [ctm[1]*corners2[i] + ctm[3]*corners2[i+1] + ctm[5] for i in range(0, 8, 2)] + else: + new_x = corners2[0:8:2] + new_y = corners2[1:8:2] + lowerleft = [min(new_x), min(new_y)] + upperright = [max(new_x), max(new_y)] + lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])] + upperright = [max(corners1[2], upperright[0]), max(corners1[3], upperright[1])] + + 
def mergeScaledPage(self, page2, scale, expand=False):
    """
    Same as mergePage, except that the merged stream is scaled by
    applying a transformation matrix.

    :param PageObject page2: The page to be merged into this one. Should be
        an instance of :class:`PageObject`.
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    # Uniform scaling matrix: [ sx 0 0 sy 0 0 ] with sx == sy == scale.
    scaling_ctm = [scale, 0, 0, scale, 0, 0]
    return self.mergeTransformedPage(page2, scaling_ctm, expand)
def mergeTranslatedPage(self, page2, tx, ty, expand=False):
    """
    Same as mergePage, except that the merged stream is translated by
    applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    # Identity matrix with the offsets in the last two operands.
    translation_ctm = [1, 0, 0, 1, tx, ty]
    return self.mergeTransformedPage(page2, translation_ctm, expand)
def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False):
    """
    Same as mergePage, except that the merged stream is rotated and then
    scaled by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject`.
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    angle = math.radians(rotation)
    cos_a = math.cos(angle)
    sin_a = math.sin(angle)
    rotate_matrix = [[cos_a, sin_a, 0],
                     [-sin_a, cos_a, 0],
                     [0, 0, 1]]
    scale_matrix = [[scale, 0, 0],
                    [0, scale, 0],
                    [0, 0, 1]]
    ctm = utils.matrixMultiply(rotate_matrix, scale_matrix)

    # Only the first two columns of the 3x3 matrix are PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)
def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False):
    """
    Same as mergePage, except that the merged stream is rotated, scaled
    and translated (in that order) by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    angle = math.radians(rotation)
    cos_a = math.cos(angle)
    sin_a = math.sin(angle)
    rotate_matrix = [[cos_a, sin_a, 0],
                     [-sin_a, cos_a, 0],
                     [0, 0, 1]]
    scale_matrix = [[scale, 0, 0],
                    [0, scale, 0],
                    [0, 0, 1]]
    translate_matrix = [[1, 0, 0],
                        [0, 1, 0],
                        [tx, ty, 1]]
    ctm = utils.matrixMultiply(
        utils.matrixMultiply(rotate_matrix, scale_matrix),
        translate_matrix)

    # Only the first two columns of the 3x3 matrix are PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)
def scale(self, sx, sy):
    """
    Scales the page by the given factors: a transformation matrix is
    applied to its content, then the media box (and the ``/BBox`` of the
    ``/VP`` viewport, when present) is resized accordingly.

    :param float sx: The scaling factor on horizontal axis.
    :param float sy: The scaling factor on vertical axis.
    """
    self.addTransformation([sx, 0, 0, sy, 0, 0])

    old_box = self.mediaBox
    self.mediaBox = RectangleObject([
        float(old_box.getLowerLeft_x()) * sx,
        float(old_box.getLowerLeft_y()) * sy,
        float(old_box.getUpperRight_x()) * sx,
        float(old_box.getUpperRight_y()) * sy])

    if "/VP" not in self:
        return

    viewport = self["/VP"]
    # When /VP is an array, only its first entry is rescaled.
    is_array = isinstance(viewport, ArrayObject)
    bbox = viewport[0]["/BBox"] if is_array else viewport["/BBox"]
    factors = (sx, sy, sx, sy)
    scaled_bbox = RectangleObject(
        [float(bbox[i]) * factors[i] for i in range(4)])
    if is_array:
        self[NameObject("/VP")][NumberObject(0)][NameObject("/BBox")] = scaled_bbox
    else:
        self[NameObject("/VP")][NameObject("/BBox")] = scaled_bbox
def extractText(self):
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text. This works well for some PDF
    files, but poorly for others, depending on the generator used. Do not
    rely on the order of text coming out of this function, as it will
    change if this function is made more sophisticated.

    A page without a ``/Contents`` entry yields an empty string.

    :return: a unicode string object.
    """
    content = self.getContents()
    if content is None:
        # /Contents is optional (e.g. a freshly created blank page).
        return u_("")
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    # Fragments are collected and joined once at the end rather than
    # growing a string with repeated ``+=``.
    pieces = []
    # Note: we check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                pieces.append(_text)
        elif operator == b_("T*"):
            pieces.append("\n")
        elif operator == b_("'"):
            # Newline is emitted even if the string itself is skipped.
            pieces.append("\n")
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                pieces.append(_text)
        elif operator == b_('"'):
            # The string is the third operand of this operator.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                pieces.append("\n")
                pieces.append(_text)
        elif operator == b_("TJ"):
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    pieces.append(i)
            pieces.append("\n")
    return u_("").join(pieces)
def __parseContentStream(self, stream):
    """
    Parses ``stream`` from its beginning and appends every operation found
    to ``self.operations``.

    Regular operations are stored as ``(operands, operator)`` pairs. An
    inline image (``BI`` operator) is read by :meth:`_readInlineImage` and
    stored as ``(image, b"INLINE IMAGE")``. Parsing stops at the end of the
    stream or at a NUL byte.

    :param stream: seekable binary stream holding the content data.
    """
    stream.seek(0, 0)
    operands = []
    while True:
        peek = readNonWhitespace(stream)
        if peek == b_('') or ord_(peek) == 0:
            break
        # Put the peeked byte back for the reader that handles it.
        stream.seek(-1, 1)
        if peek.isalpha() or peek == b_("'") or peek == b_('"'):
            operator = utils.readUntilRegex(stream,
                    NameObject.delimiterPattern, True)
            if operator == b_("BI"):
                # begin inline image - a completely different parsing
                # mechanism is required
                assert operands == []
                ii = self._readInlineImage(stream)
                self.operations.append((ii, b_("INLINE IMAGE")))
            else:
                self.operations.append((operands, operator))
                operands = []
        elif peek == b_('%'):
            # If we encounter a comment in the content stream, we have to
            # handle it here. Typically, readObject will handle
            # encountering a comment -- but readObject assumes that
            # following the comment must be the object we're trying to
            # read. In this case, it could be an operator instead.
            # An empty read means end of stream: stop there too, otherwise
            # a comment without a trailing newline would loop forever.
            while peek not in (b_('\r'), b_('\n'), b_('')):
                peek = stream.read(1)
        else:
            operands.append(readObject(stream, None))
def _getData(self):
    """
    Serialises ``self.operations`` back into content-stream bytes.

    Each operation is written on its own line. Inline images are emitted
    as ``BI <settings> ID <data> EI``; any other operation is emitted as
    its operands, each followed by a space, then the operator.

    :return: the encoded content stream.
    """
    out = BytesIO()
    emit = out.write
    for operands, operator in self.operations:
        if operator != b_("INLINE IMAGE"):
            for operand in operands:
                operand.writeToStream(out, None)
                emit(b_(" "))
            emit(b_(operator))
        else:
            emit(b_("BI"))
            settings_buf = BytesIO()
            operands["settings"].writeToStream(settings_buf, None)
            # Drop the first and last two bytes of the serialised settings
            # dictionary before copying it out.
            emit(settings_buf.getvalue()[2:-2])
            emit(b_("ID "))
            emit(operands["data"])
            emit(b_("EI"))
        emit(b_("\n"))
    return out.getvalue()
+ This class is accessible through + :meth:`getDocumentInfo()` + + All text properties of the document metadata have + *two* properties, eg. author and author_raw. The non-raw property will + always return a ``TextStringObject``, making it ideal for a case where + the metadata is being displayed. The raw property can sometimes return + a ``ByteStringObject``, if PyPDF2 was unable to decode the string's + text encoding; this requires additional safety in the caller and + therefore is not as commonly accessed. + """ + + def __init__(self): + DictionaryObject.__init__(self) + + def getText(self, key): + retval = self.get(key, None) + if isinstance(retval, TextStringObject): + return retval + return None + + title = property(lambda self: self.getText("/Title")) + """Read-only property accessing the document's **title**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the title is not specified.""" + title_raw = property(lambda self: self.get("/Title")) + """The "raw" version of title; can return a ``ByteStringObject``.""" + + author = property(lambda self: self.getText("/Author")) + """Read-only property accessing the document's **author**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the author is not specified.""" + author_raw = property(lambda self: self.get("/Author")) + """The "raw" version of author; can return a ``ByteStringObject``.""" + + subject = property(lambda self: self.getText("/Subject")) + """Read-only property accessing the document's **subject**. + Returns a unicode string (``TextStringObject``) or ``None`` + if the subject is not specified.""" + subject_raw = property(lambda self: self.get("/Subject")) + """The "raw" version of subject; can return a ``ByteStringObject``.""" + + creator = property(lambda self: self.getText("/Creator")) + """Read-only property accessing the document's **creator**. If the + document was converted to PDF from another format, this is the name of the + application (e.g. 
def convertToInt(d, size):
    """
    Decodes ``d`` as a big-endian signed 64-bit integer.

    ``d`` is left-padded with NUL bytes and only its last 8 bytes are
    decoded.

    :param d: the bytes to decode.
    :param int size: declared width of ``d``; only checked against the
        8-byte limit.
    :return: the decoded integer.
    :raises PdfReadError: if ``size`` is greater than 8.
    """
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    padded = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d)
    (value,) = struct.unpack(">q", padded[-8:])
    return value
# Implementation of algorithm 3.3 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
def _alg33(owner_pwd, user_pwd, rev, keylen):
    """
    Computes the value of the encryption dictionary's /O entry.

    :param owner_pwd: owner password, used to derive the RC4 key.
    :param user_pwd: user password; this is the data that gets encrypted.
    :param int rev: revision of the security handler.
    :param int keylen: length of the encryption key, in bytes.
    :return: the output of the final RC4 invocation.
    """
    # steps 1 - 4
    key = _alg33_1(owner_pwd, rev, keylen)
    # 5. Pad or truncate the user password string as described in step 1 of
    # algorithm 3.2.
    # The password goes through str_() exactly as in _alg32.
    user_pwd = b_((str_(user_pwd) + str_(_encryption_padding))[:32])
    # 6. Encrypt the result of step 5, using an RC4 encryption function with
    # the encryption key obtained in step 4.
    val = utils.RC4_encrypt(key, user_pwd)
    # 7. (Revision 3 or greater) Do the following 19 times: Take the output
    # from the previous invocation of the RC4 function and pass it as input to
    # a new invocation of the function; use an encryption key generated by
    # taking each byte of the encryption key obtained in step 4 and performing
    # an XOR operation between that byte and the single-byte value of the
    # iteration counter (from 1 to 19).
    if rev >= 3:
        for i in range(1, 20):
            # Built byte by byte the same way _alg35 builds its round key.
            new_key = b_('')
            for l in range(len(key)):
                new_key += b_(chr(ord_(key[l]) ^ i))
            val = utils.RC4_encrypt(new_key, val)
    # 8. Store the output from the final invocation of the RC4 as the value of
    # the /O entry in the encryption dictionary.
    return val
# Implementation of algorithm 3.5 of the PDF standard security handler,
# section 3.5.2 of the PDF 1.6 reference.
#
# Returns a ``(U, key)`` pair: the 32-byte value computed for the /U entry and
# the encryption key it was derived with.
#
# NOTE(review): ``metadata_encrypt`` is accepted but never used; it is not
# forwarded to _alg32, which therefore runs with its own default. Forwarding
# it would change the derived key, so confirm against the callers before
# touching this.
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    # 1. Create an encryption key based on the user password string, as
    # described in Algorithm 3.2.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2. Initialize the MD5 hash function and pass the 32-byte padding string
    # shown in step 1 of Algorithm 3.2 as input to this function.
    m = md5()
    m.update(_encryption_padding)
    # 3. Pass the first element of the file's file identifier array (the value
    # of the ID entry in the document's trailer dictionary; see Table 3.13 on
    # page 73) to the hash function and finish the hash. (See implementation
    # note 25 in Appendix H.)
    m.update(id1_entry.original_bytes)
    md5_hash = m.digest()
    # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
    # function with the encryption key from step 1.
    val = utils.RC4_encrypt(key, md5_hash)
    # 5. Do the following 19 times: Take the output from the previous
    # invocation of the RC4 function and pass it as input to a new invocation
    # of the function; use an encryption key generated by taking each byte of
    # the original encryption key (obtained in step 1) and performing an XOR
    # operation between that byte and the single-byte value of the iteration
    # counter (from 1 to 19).
    for i in range(1, 20):
        new_key = b_('')
        for l in range(len(key)):
            new_key += b_(chr(ord_(key[l]) ^ i))
        val = utils.RC4_encrypt(new_key, val)
    # 6. Append 16 bytes of arbitrary padding to the output from the final
    # invocation of the RC4 function and store the 32-byte result as the value
    # of the U entry in the encryption dictionary.
    # (implementer's note: I don't know what "arbitrary padding" is supposed
    # to mean, so I have used null bytes. This seems to match a few other
    # people's implementations)
    return val + (b_('\x00') * 16), key
# IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""
Utility functions for PDF library.
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"


import sys

try:
    import __builtin__ as builtins
except ImportError:  # Py3
    import builtins


xrange_fn = getattr(builtins, "xrange", range)
_basestring = getattr(builtins, "basestring", str)

bytes_type = type(bytes())  # Works the same in Python 2.X and 3.X
string_type = getattr(builtins, "unicode", str)
int_types = (int, long) if sys.version_info[0] < 3 else (int,)


# Make basic type tests more consistent
def isString(s):
    """Test if arg is a string. Compatible with Python 2 and 3."""
    return isinstance(s, _basestring)


def isInt(n):
    """Test if arg is an int. Compatible with Python 2 and 3."""
    return isinstance(n, int_types)


def isBytes(b):
    """Test if arg is a bytes instance. Compatible with Python 2 and 3."""
    return isinstance(b, bytes_type)


# custom implementation of warnings.formatwarning
def formatWarning(message, category, filename, lineno, line=None):
    """
    Format a warning as ``"<Category>: <message> [<file name>:<lineno>]\\n"``.

    Only the last path component of ``filename`` is shown; both ``/`` and
    ``\\`` are treated as separators.  ``line`` is accepted for signature
    compatibility with ``warnings.formatwarning`` and is ignored.
    """
    # Take the LAST component: a bare file name with no separator yields a
    # one-element list, so indexing [1] would raise IndexError.
    basename = filename.replace("/", "\\").rsplit("\\", 1)[-1]
    return "%s: %s [%s:%s]\n" % (category.__name__, message, basename, lineno)


def readUntilWhitespace(stream, maxchars=None):
    """
    Reads non-whitespace characters and returns them.
    Stops upon encountering whitespace, at end of stream, or when maxchars
    is reached.  The terminating whitespace character is consumed.
    """
    parts = []
    while True:
        tok = stream.read(1)
        if tok.isspace() or not tok:
            break
        parts.append(tok)
        # Each token is a single character, so the part count is the length.
        if len(parts) == maxchars:
            break
    return b_("").join(parts)


def readNonWhitespace(stream):
    """
    Finds and reads the next non-whitespace character (ignores whitespace).
    Returns whatever ``stream.read(1)`` yields at end of stream.
    """
    tok = WHITESPACES[0]
    while tok in WHITESPACES:
        tok = stream.read(1)
    return tok


def skipOverWhitespace(stream):
    """
    Similar to readNonWhitespace, but returns a Boolean telling whether at
    least one whitespace character was consumed.  The first non-whitespace
    character is consumed as well.
    """
    tok = WHITESPACES[0]
    cnt = 0
    while tok in WHITESPACES:
        tok = stream.read(1)
        cnt += 1
    # The final read is the non-whitespace character (or EOF), so more than
    # one read means whitespace was skipped.
    return cnt > 1


def skipOverComment(stream):
    """
    If the stream is positioned at a ``%`` comment, consume it up to and
    including the first ``\\n`` or ``\\r``; otherwise leave the position
    where ``stream.seek(-1, 1)`` puts it after the one-character peek.
    """
    tok = stream.read(1)
    stream.seek(-1, 1)
    if tok == b_('%'):
        while tok not in (b_('\n'), b_('\r')):
            tok = stream.read(1)
            if not tok:
                # Comment runs to end of stream without a newline; without
                # this check the loop would never terminate.
                break


def readUntilRegex(stream, regex, ignore_eof=False):
    """
    Reads until the regular expression pattern matched (ignore the match)
    Raise PdfStreamError on premature end-of-file.
    :param bool ignore_eof: If true, ignore end-of-file and return what has
        been read so far instead of raising.

    The stream is read in 16-byte chunks and each chunk is searched on its
    own, so a match that straddles a chunk boundary is not detected.
    """
    parts = []
    while True:
        tok = stream.read(16)
        if not tok:
            # stream has truncated prematurely
            if ignore_eof:
                return b_('').join(parts)
            raise PdfStreamError("Stream has ended unexpectedly")
        m = regex.search(tok)
        if m is not None:
            parts.append(tok[:m.start()])
            # Rewind so the stream is positioned at the start of the match.
            stream.seek(m.start() - len(tok), 1)
            break
        parts.append(tok)
    return b_('').join(parts)


class ConvertFunctionsToVirtualList(object):
    """
    Read-only sequence view backed by two callables: ``lengthFunction()``
    returns the length and ``getFunction(index)`` returns one item.
    """

    def __init__(self, lengthFunction, getFunction):
        self.lengthFunction = lengthFunction
        self.getFunction = getFunction

    def __len__(self):
        return self.lengthFunction()

    def __getitem__(self, index):
        if isinstance(index, slice):
            # A slice yields another virtual list that maps into this one.
            indices = xrange_fn(*index.indices(len(self)))
            cls = type(self)
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isInt(index):
            raise TypeError("sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index = len_self + index
        if index < 0 or index >= len_self:
            raise IndexError("sequence index out of range")
        return self.getFunction(index)


def RC4_encrypt(key, plaintext):
    """
    Apply the RC4 stream cipher to ``plaintext`` with ``key`` and return the
    result as a byte string.  RC4 is symmetric, so this also decrypts.
    Both arguments may be byte strings or text strings; an empty key raises
    ZeroDivisionError.
    """
    # Key-scheduling: permute S under control of the key.
    S = list(range(256))
    key_length = len(key)
    j = 0
    for i in range(256):
        j = (j + S[i] + ord_(key[i % key_length])) % 256
        S[i], S[j] = S[j], S[i]
    # Keystream generation, XORed with the input one byte at a time.
    i, j = 0, 0
    out = bytearray()
    for ch in plaintext:
        i = (i + 1) % 256
        j = (j + S[i]) % 256
        S[i], S[j] = S[j], S[i]
        t = S[(S[i] + S[j]) % 256]
        out.append(ord_(ch) ^ t)
    return bytes(out)


def matrixMultiply(a, b):
    """Return the product of matrices ``a`` and ``b`` as lists of floats."""
    return [[sum(float(i) * float(j) for i, j in zip(row, col))
             for col in zip(*b)]
            for row in a]


def markLocation(stream):
    """Creates text file showing current location in context."""
    # Mainly for debugging
    RADIUS = 5000
    stream.seek(-RADIUS, 1)
    before = stream.read(RADIUS)
    after = stream.read(RADIUS)
    # A binary stream yields bytes, which cannot be written to a text-mode
    # file on Python 3; match the file mode and marker to the data.
    binary = isinstance(before, bytes_type)
    with open('PyPDF2_pdfLocation.txt', 'wb' if binary else 'w') as outputDoc:
        outputDoc.write(before)
        outputDoc.write(b_('HERE') if binary else 'HERE')
        outputDoc.write(after)
    stream.seek(-RADIUS, 1)


class PyPdfError(Exception):
    """Base class for the errors defined in this module."""
    pass


class PdfReadError(PyPdfError):
    pass


class PageSizeNotDefinedError(PyPdfError):
    pass


class PdfReadWarning(UserWarning):
    pass


class PdfStreamError(PdfReadError):
    pass


if sys.version_info[0] < 3:
    def b_(s):
        """Return ``s`` unchanged (Python 2 ``str`` is already bytes)."""
        return s
else:
    B_CACHE = {}

    def b_(s):
        """
        Return ``s`` as bytes: bytes pass through, text is encoded as
        latin-1.  Results for strings shorter than two characters are cached.
        """
        # Check for bytes first so bytes are never looked up in the
        # str-keyed cache (a bytes/str comparison under ``python -bb``).
        if isinstance(s, bytes):
            return s
        bc = B_CACHE
        if s in bc:
            return bc[s]
        r = s.encode('latin-1')
        if len(s) < 2:
            bc[s] = r
        return r


def u_(s):
    """Return ``s`` as text; on Python 2 it is decoded with unicode_escape."""
    if sys.version_info[0] < 3:
        return unicode(s, 'unicode_escape')
    else:
        return s


def str_(b):
    """Return ``b`` as a native string; Python 3 bytes are decoded as latin-1."""
    if sys.version_info[0] < 3:
        return b
    if isinstance(b, bytes):
        return b.decode('latin-1')
    return b


def ord_(b):
    """
    Return the integer value of one element of a byte or text string.
    On Python 3, indexing bytes already yields an int, which is returned
    unchanged.
    """
    if sys.version_info[0] < 3 or isinstance(b, str):
        return ord(b)
    return b


def chr_(c):
    """Return ``c`` unchanged on Python 2, ``chr(c)`` on Python 3."""
    if sys.version_info[0] < 3:
        return c
    else:
        return chr(c)


def barray(b):
    """Return ``b`` unchanged on Python 2, ``bytearray(b)`` on Python 3."""
    if sys.version_info[0] < 3:
        return b
    else:
        return bytearray(b)


def hexencode(b):
    """Return the hexadecimal encoding of the byte string ``b``."""
    if sys.version_info[0] < 3:
        return b.encode('hex')
    else:
        import codecs
        coder = codecs.getencoder('hex_codec')
        return coder(b)[0]


def hexStr(num):
    """Return ``hex(num)`` without the Python 2 long-integer ``L`` suffix."""
    return hex(num).replace('L', '')


WHITESPACES = [b_(x) for x in [' ', '\n', '\r', '\t', '\x00']]
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# Date pattern used by the XMP date converter.  The group names are the ones
# XmpInformation._converter_date reads via m.group(...).
iso8601 = re.compile(r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """, re.VERBOSE)

# chr() on Python 2 only covers 0-255; custom_properties needs code points
# beyond that when decoding escaped key characters.
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3


class XmpInformation(PdfObject):
    """
    An object that represents Adobe XMP metadata.
    Usually accessed by :meth:`getXmpMetadata()`
    """

    def __init__(self, stream):
        """
        Parse the XML returned by ``stream.getData()`` and keep its first
        rdf:RDF element.  Raises IndexError if the document has none.
        """
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        # Per-namespace cache of converted property values:
        # {namespace: {name: value}}
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        """Delegate serialization to the wrapped stream object."""
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        """
        Yield every attribute node and element named ``name`` in
        ``namespace`` found on/under the rdf:Description elements whose
        rdf:about attribute equals ``aboutUri``.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                for element in desc.getElementsByTagNameNS(namespace, name):
                    yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        """
        Yield every attribute and direct child node belonging to
        ``namespace`` on the rdf:Description elements whose rdf:about
        attribute equals ``aboutUri``.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _getText(self, element):
        """Return the concatenated data of ``element``'s direct text nodes."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    # The converters and getter factories below are plain functions used
    # while the class body executes; they take no ``self`` on purpose.

    def _converter_string(value):
        """Identity converter for textual properties."""
        return value

    def _converter_date(value):
        """
        Convert a date string matching ``iso8601`` into a naive
        ``datetime.datetime`` expressed in UTC.

        Missing month/day default to 1 and missing time fields to 0.
        Raises AttributeError if ``value`` does not match the pattern.
        """
        m = iso8601.match(value)
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        second = decimal.Decimal(m.group("second") or "0")
        seconds = second.to_integral(decimal.ROUND_FLOOR)
        microseconds = (second - seconds) * 1000000
        tzd = m.group("tzd") or "Z"
        # datetime() only accepts integers, so convert the Decimal parts.
        dt = datetime.datetime(year, month, day, hour, minute,
                               int(seconds), int(microseconds))
        if tzd != "Z":
            # Take the sign from the string itself: int("+00") and
            # int("-00") are both 0, which would lose the direction of a
            # sub-hour offset.
            sign = -1 if tzd[0] == "-" else 1
            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
            # local time = UTC + offset, hence UTC = local time - offset
            dt = dt - sign * offset
        return dt
    _test_converter_date = staticmethod(_converter_date)

    def _getter_bag(namespace, name, converter):
        """
        Build a getter returning the converted rdf:li items of every rdf:Bag
        under the matching elements, as a list.
        """
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached is not None:
                return cached
            retval = []
            for element in self.getElement("", namespace, name):
                bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
                if len(bags):
                    for bag in bags:
                        for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval.append(value)
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_seq(namespace, name, converter):
        """
        Build a getter returning the converted rdf:li items of every rdf:Seq
        under the matching elements, as a list.  An element without an
        rdf:Seq contributes its own converted text instead.
        """
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached is not None:
                return cached
            retval = []
            for element in self.getElement("", namespace, name):
                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
                if len(seqs):
                    for seq in seqs:
                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval.append(value)
                else:
                    value = converter(self._getText(element))
                    retval.append(value)
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_langalt(namespace, name, converter):
        """
        Build a getter returning a dict keyed by each rdf:li item's xml:lang
        attribute for every rdf:Alt under the matching elements.  An element
        without an rdf:Alt is stored under the key "x-default".
        """
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached is not None:
                return cached
            retval = {}
            for element in self.getElement("", namespace, name):
                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
                if len(alts):
                    for alt in alts:
                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval[item.getAttribute("xml:lang")] = value
                else:
                    retval["x-default"] = converter(self._getText(element))
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_single(namespace, name, converter):
        """
        Build a getter returning the converted value of the first matching
        attribute or element, or None when there is no match.
        """
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached is not None:
                return cached
            value = None
            for element in self.getElement("", namespace, name):
                if element.nodeType == element.ATTRIBUTE_NODE:
                    value = element.nodeValue
                else:
                    value = self._getText(element)
                break
            if value is not None:
                value = converter(value)
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = value
            return value
        return get

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
    """
    Contributors to the resource (other than the authors). An unsorted
    array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
    """
    Text describing the extent or scope of the resource.
    """

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
    """
    A sorted array of names of the authors of the resource, listed in order
    of precedence.
    """

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.  The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
    """
    A language-keyed dictionary of textual descriptions of the content of the
    resource.
    """

    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
    """
    The mime-type of the resource.
    """

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
    """
    Unique identifier of the resource.
    """

    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
    """
    An unordered array specifying the languages used in the resource.
    """

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
    """
    An unordered array of publisher names.
    """

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
    """
    An unordered array of text descriptions of relationships to other
    documents.
    """

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
    """
    A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource.
    """

    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
    """
    Unique identifier of the work from which this resource was derived.
    """

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
    """
    An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource.
    """

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
    """
    A language-keyed dictionary of the title of the resource.
    """

    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
    """
    An unordered array of textual descriptions of the document type.
    """

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
    """
    An unformatted text string representing document keywords.
    """

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
    """
    The PDF file version, for example 1.0, 1.3.
    """

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
    """
    The name of the tool that created the PDF document.
    """

    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
    """
    The date and time the resource was originally created.  The date and
    time are returned as a UTC datetime.datetime object.
    """

    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
    """
    The date and time the resource was last modified.  The date and time
    are returned as a UTC datetime.datetime object.
    """

    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
    """
    The date and time that any metadata for this resource was last
    changed.  The date and time are returned as a UTC datetime.datetime
    object.
    """

    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
    """
    The name of the first known tool used to create the resource.
    """

    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
    """
    The common identifier for all versions and renditions of this resource.
    """

    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
    """
    An identifier for a specific incarnation of a document, updated each
    time a file is saved.
    """

    def custom_properties(self):
        # Computed once and memoized on the instance.
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find(u_("\u2182"))
                    if idx == -1:
                        break
                    # Replace the marker and the four hex digits after it
                    # with the character they encode.
                    key = key[:idx] + _unichr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._getText(node)
                self._custom_properties[key] = value
        return self._custom_properties

    custom_properties = property(custom_properties)
    """
    Retrieves custom metadata properties defined in the undocumented pdfx
    metadata schema.

    :return: a dictionary of key/value items for custom metadata properties.
    :rtype: dict
    """