diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index 33ee8380aa..c5e6f06b1e 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -106,7 +106,8 @@ def __init__(self): NameObject("/Type"): NameObject("/Catalog"), NameObject("/Pages"): self._pages, }) - self._root = self._addObject(root) + self._root = None + self.root = root def _addObject(self, obj): self._objects.append(obj) @@ -209,6 +210,17 @@ def insertBlankPage(self, width=None, height=None, index=0): self.insertPage(page, index) return page + def addJS(self, javascript): + js = DictionaryObject() + js.update({ + NameObject("/Type"): NameObject("/Action"), + NameObject("/S"): NameObject("/JavaScript"), + NameObject("/JS"): NameObject("(%s)" % javascript) + }) + self.root.update({ + NameObject("/OpenAction"): self._addObject(js) + }) + def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True): """ Encrypt this PDF file with the PDF Standard encryption handler. @@ -268,6 +280,9 @@ def write(self, stream): debug = False import struct + if(not self._root): + self._root = self._addObject(self.root) + externalReferenceMap = {} # PDF objects sometimes have circular references to their /Page objects @@ -333,7 +348,7 @@ def write(self, stream): if hasattr(self, "_encrypt"): trailer[NameObject("/Encrypt")] = self._encrypt trailer.writeToStream(stream, None) - + # eof stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location))) @@ -399,13 +414,13 @@ def _sweepIndirectReferences(self, externMap, data): return newobj else: return data - + def getReference(self, obj): idnum = self._objects.index(obj) + 1 ref = IndirectObject(idnum, 0, self) assert ref.getObject() == obj return ref - + def getOutlineRoot(self): root = self.getObject(self._root) @@ -413,15 +428,15 @@ def getOutlineRoot(self): outline = root['/Outlines'] idnum = self._objects.index(outline) + 1 outlineRef = IndirectObject(idnum, 0, self) - assert outlineRef.getObject() == outline + assert outlineRef.getObject() == outline else: - outline = TreeObject() + 
outline = TreeObject() outline.update({ }) outlineRef = self._addObject(outline) root[NameObject('/Outlines')] = outlineRef - + return outline - + def getNamedDestRoot(self): root = self.getObject(self._root) @@ -429,12 +444,12 @@ def getNamedDestRoot(self): names = root['/Names'] idnum = self._objects.index(names) + 1 namesRef = IndirectObject(idnum, 0, self) - assert namesRef.getObject() == names + assert namesRef.getObject() == names if '/Dests' in names and isinstance(names['/Dests'], DictionaryObject): dests = names['/Dests'] idnum = self._objects.index(dests) + 1 destsRef = IndirectObject(idnum, 0, self) - assert destsRef.getObject() == dests + assert destsRef.getObject() == dests if '/Names' in dests: nd = dests['/Names'] else: @@ -446,7 +461,7 @@ def getNamedDestRoot(self): names[NameObject('/Dests')] = destsRef nd = ArrayObject() dests[NameObject('/Names')] = nd - + else: names = DictionaryObject() namesRef = self._addObject(names) @@ -456,49 +471,49 @@ def getNamedDestRoot(self): names[NameObject('/Dests')] = destsRef nd = ArrayObject() dests[NameObject('/Names')] = nd - + return nd - + def addBookmarkDestination(self, dest, parent=None): destRef = self._addObject(dest) outlineRef = self.getOutlineRoot() - + if parent == None: parent = outlineRef parent = parent.getObject() #print parent.__class__.__name__ parent.addChild(destRef, self) - + return destRef - + def addBookmarkDict(self, bookmark, parent=None): bookmarkObj = TreeObject() for k, v in list(bookmark.items()): bookmarkObj[NameObject(str(k))] = v bookmarkObj.update(bookmark) - + if '/A' in bookmark: action = DictionaryObject() for k, v in list(bookmark['/A'].items()): action[NameObject(str(k))] = v actionRef = self._addObject(action) bookmarkObj[NameObject('/A')] = actionRef - + bookmarkRef = self._addObject(bookmarkObj) outlineRef = self.getOutlineRoot() - + if parent == None: parent = outlineRef - + parent = parent.getObject() parent.addChild(bookmarkRef, self) - - return bookmarkRef - - + + 
return bookmarkRef + + def addBookmark(self, title, pagenum, parent=None): """ Add a bookmark to this PDF file. @@ -517,10 +532,10 @@ def addBookmark(self, title, pagenum, parent=None): actionRef = self._addObject(action) outlineRef = self.getOutlineRoot() - + if parent == None: parent = outlineRef - + bookmark = TreeObject() @@ -530,10 +545,10 @@ def addBookmark(self, title, pagenum, parent=None): }) bookmarkRef = self._addObject(bookmark) - + parent = parent.getObject() parent.addChild(bookmarkRef, self) - + return bookmarkRef def addNamedDestinationObject(self, dest): @@ -541,8 +556,8 @@ def addNamedDestinationObject(self, dest): nd = self.getNamedDestRoot() nd.extend([dest['/Title'], destRef]) - - return destRef + + return destRef def addNamedDestination(self, title, pagenum): pageRef = self.getObject(self._pages)['/Kids'][pagenum] @@ -551,12 +566,12 @@ def addNamedDestination(self, title, pagenum): NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]), NameObject('/S') : NameObject('/GoTo') }) - + destRef = self._addObject(dest) nd = self.getNamedDestRoot() nd.extend([title, destRef]) - + return destRef def removeLinks(self): @@ -714,7 +729,7 @@ def addLink(self, pagenum, pagedest, rect, zoom='/FitV', border=None): borderArr.append(dashPattern) else: borderArr = [NumberObject(0)] * 3 - + if isinstance(rect, Str): rect = NameObject(rect) elif isinstance(rect, RectangleObject): @@ -739,12 +754,12 @@ def addLink(self, pagenum, pagedest, rect, zoom='/FitV', border=None): pageRef[NameObject('/Annots')] = ArrayObject([lnkRef]) _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight'] - + def getPageLayout(self): """ Get the page layout. See :meth:`setPageLayout()` for a description of valid layouts. - + :return: Page layout currently being used. 
:rtype: str, None if not specified """ @@ -752,13 +767,13 @@ def getPageLayout(self): return self.getObject(self._root)['/PageLayout'] except KeyError: return None - + def setPageLayout(self, layout): """ Set the page layout :param str layout: The page layout to be used - + Valid layouts are: /NoLayout Layout explicitly not specified /SinglePage Show one page at a time @@ -774,7 +789,7 @@ def setPageLayout(self, layout): layout = NameObject(layout) root = self.getObject(self._root) root.update({NameObject('/PageLayout'): layout}) - + pageLayout = property(getPageLayout, setPageLayout) """Read and write property accessing the :meth:`getPageLayout()` and :meth:`setPageLayout()` methods.""" @@ -786,7 +801,7 @@ def getPageMode(self): Get the page mode. See :meth:`setPageMode()` for a description of valid modes. - + :return: Page mode currently being used. :rtype: str, None if not specified """ @@ -800,7 +815,7 @@ def setPageMode(self, mode): Set the page mode. :param str mode: The page mode to use. - + Valid modes are: /UseNone Do not show outlines or thumbnails panels /UseOutlines Show outlines (aka bookmarks) panel @@ -815,7 +830,7 @@ def setPageMode(self, mode): mode = NameObject(mode) root = self.getObject(self._root) root.update({NameObject('/PageMode'): mode}) - + pageMode = property(getPageMode, setPageMode) """Read and write property accessing the :meth:`getPageMode()` and :meth:`setPageMode()` methods.""" @@ -915,8 +930,8 @@ def getNumPages(self): :raises PdfReadError: if file is encrypted and restrictions prevent this action. """ - - # Flattened pages will not work on an Encrypted PDF; + + # Flattened pages will not work on an Encrypted PDF; # the PDF file's page count is used in this case. Otherwise, # the original method (flattened page count) is used. 
if self.isEncrypted: @@ -971,7 +986,7 @@ def getNamedDestinations(self, tree=None, retval=None): if retval == None: retval = {} catalog = self.trailer["/Root"] - + # get the name tree if "/Dests" in catalog: tree = catalog["/Dests"] @@ -979,7 +994,7 @@ def getNamedDestinations(self, tree=None, retval=None): names = catalog['/Names'] if "/Dests" in names: tree = names['/Dests'] - + if tree == None: return retval @@ -1016,17 +1031,17 @@ def getOutlines(self, node=None, outlines=None): if outlines == None: outlines = [] catalog = self.trailer["/Root"] - + # get the outline dictionary and named destinations if "/Outlines" in catalog: lines = catalog["/Outlines"] if "/First" in lines: node = lines["/First"] self._namedDests = self.getNamedDestinations() - + if node == None: return outlines - + # see if there are any more outlines while True: outline = self._buildOutline(node) @@ -1050,10 +1065,10 @@ def _buildDestination(self, title, array): page, typ = array[0:2] array = array[2:] return Destination(title, page, typ, *array) - + def _buildOutline(self, node): dest, title, outline = None, None, None - + if "/A" in node and "/Title" in node: # Action, section 8.5 (only type GoTo supported) title = node["/Title"] @@ -1097,7 +1112,7 @@ def getPageLayout(self): return self.trailer['/Root']['/PageLayout'] except KeyError: return None - + pageLayout = property(getPageLayout) """Read-only property accessing the :meth:`getPageLayout()` method.""" @@ -1107,7 +1122,7 @@ def getPageMode(self): Get the page mode. See :meth:`setPageMode()` for a description of valid modes. - + :return: Page mode currently being used. 
:rtype: ``str``, ``None`` if not specified """ @@ -1197,20 +1212,20 @@ def _getObjectFromStream(self, indirectReference): warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \ (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning) - if self.strict: + if self.strict: raise utils.PdfReadError("Can't read object stream: %s"%e) # Replace with null. Hopefully it's nothing important. obj = NullObject() return obj - + if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.") return NullObject() - - + + def getObject(self, indirectReference): debug = False if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation)) - retval = self.cacheGetIndirectObject(indirectReference.generation, + retval = self.cacheGetIndirectObject(indirectReference.generation, indirectReference.idnum) if retval != None: return retval @@ -1225,11 +1240,11 @@ def getObject(self, indirectReference): idnum, generation = self.readObjectHeader(self.stream) if idnum != indirectReference.idnum and self.xrefIndex: # Xref table probably had bad indexes due to not being zero-indexed - if self.strict: + if self.strict: raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \ % (indirectReference.idnum, indirectReference.generation, idnum, generation)) else: pass # xref table is corrected in non-strict mode - elif idnum != indirectReference.idnum: + elif idnum != indirectReference.idnum: # some other problem raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." 
\ % (indirectReference.idnum, indirectReference.generation, idnum, generation)) @@ -1253,9 +1268,9 @@ def getObject(self, indirectReference): else: warnings.warn("Object %d %d not defined."%(indirectReference.idnum, indirectReference.generation), utils.PdfReadWarning) - #if self.strict: + #if self.strict: raise utils.PdfReadError("Could not find object.") - self.cacheIndirectObject(indirectReference.generation, + self.cacheIndirectObject(indirectReference.generation, indirectReference.idnum, retval) return retval @@ -1286,7 +1301,7 @@ def readObjectHeader(self, stream): obj = stream.read(3) readNonWhitespace(stream) stream.seek(-1, 1) - if (extra and self.strict): + if (extra and self.strict): #not a fatal error warnings.warn("Superfluous whitespace found in object header %s %s" % \ (idnum, generation), utils.PdfReadWarning) @@ -1298,7 +1313,7 @@ def cacheGetIndirectObject(self, generation, idnum): if debug and out: print(("cache hit: %d %d"%(idnum, generation))) elif debug: print(("cache miss: %d %d"%(idnum, generation))) return out - + def cacheIndirectObject(self, generation, idnum, obj): # return None # Sometimes we want to turn off cache for debugging. if (generation, idnum) in self.resolvedObjects: @@ -1371,17 +1386,17 @@ def read(self, stream): cnt = 0 while cnt < size: line = stream.read(20) - + # It's very clear in section 3.4.3 of the PDF spec # that all cross-reference table lines are a fixed # 20 bytes (as of PDF 1.7). However, some files have # 21-byte entries (or more) due to the use of \r\n - # (CRLF) EOL's. Detect that case, and adjust the line + # (CRLF) EOL's. Detect that case, and adjust the line # until it does not begin with a \r (CR) or \n (LF). while line[0] in b_("\x0D\x0A"): stream.seek(-20 + 1, 1) line = stream.read(20) - + # On the other hand, some malformed PDF files # use a single character EOL without a preceeding # space. 
Detect that case, and seek the stream @@ -1390,7 +1405,7 @@ def read(self, stream): # text "trailer"): if line[-1] in b_("0123456789t"): stream.seek(-1, 1) - + offset, generation = line[:16].split(b_(" ")) offset, generation = int(offset), int(generation) if generation not in self.xref: @@ -1431,7 +1446,7 @@ def read(self, stream): assert xrefstream["/Type"] == "/XRef" self.cacheIndirectObject(generation, idnum, xrefstream) streamData = BytesIO(b_(xrefstream.getData())) - # Index pairs specify the subsections in the dictionary. If + # Index pairs specify the subsections in the dictionary. If # none create one subsection that spans everything. idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs)))) @@ -1445,17 +1460,17 @@ def getEntry(i): if entrySizes[i] > 0: d = streamData.read(entrySizes[i]) return convertToInt(d, entrySizes[i]) - - # PDF Spec Table 17: A value of zero for an element in the + + # PDF Spec Table 17: A value of zero for an element in the # W array indicates...the default value shall be used if i == 0: return 1 # First value defaults to 1 else: return 0 - + def used_before(num, generation): # We move backwards through the xrefs, don't replace any. 
return num in self.xref.get(generation, []) or \ num in self.xref_objStm - + # Iterate through each subsection last_end = 0 for start, size in self._pairs(idx_pairs): @@ -1492,7 +1507,7 @@ def used_before(num, generation): elif self.strict: raise utils.PdfReadError("Unknown xref type: %s"% xref_type) - + trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" for key in trailerKeys: if key in xrefstream and key not in self.trailer: @@ -1542,10 +1557,10 @@ def used_before(num, generation): #if not, then either it's just plain wrong, or the non-zero-index is actually correct stream.seek(loc, 0) #return to where it was - + def _zeroXref(self, generation): self.xref[generation] = dict( (k-self.xrefIndex, v) for (k, v) in list(self.xref[generation].items()) ) - + def _pairs(self, array): i = 0 while True: @@ -1810,7 +1825,7 @@ def _contentStreamRename(stream, rename, pdf): def _pushPopGS(contents, pdf): # adds a graphics state "push" and "pop" to the beginning and end - # of a content stream. This isolates it from changes such as + # of a content stream. This isolates it from changes such as # transformation matricies. 
stream = ContentStream(contents, pdf) stream.operations.insert(0, [[], "q"]) @@ -1892,12 +1907,12 @@ def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False): page2Content, rename, self.pdf) page2Content = PageObject._pushPopGS(page2Content, self.pdf) newContentArray.append(page2Content) - + # if expanding the page to fit a new page, calculate the new media box size if expand: - corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(), + corners1 = [self.mediaBox.getLowerLeft_x().as_numeric(), self.mediaBox.getLowerLeft_y().as_numeric(), self.mediaBox.getUpperRight_x().as_numeric(), self.mediaBox.getUpperRight_y().as_numeric()] - corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(), + corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(), page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(), page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(), page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()] @@ -2554,24 +2569,24 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr # described in Algorithm 3.2. key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) # 2. Initialize the MD5 hash function and pass the 32-byte padding string - # shown in step 1 of Algorithm 3.2 as input to this function. + # shown in step 1 of Algorithm 3.2 as input to this function. m = md5() m.update(_encryption_padding) # 3. Pass the first element of the file's file identifier array (the value # of the ID entry in the document's trailer dictionary; see Table 3.13 on # page 73) to the hash function and finish the hash. (See implementation - # note 25 in Appendix H.) + # note 25 in Appendix H.) m.update(id1_entry.original_bytes) md5_hash = m.digest() # 4. 
Encrypt the 16-byte result of the hash, using an RC4 encryption - # function with the encryption key from step 1. + # function with the encryption key from step 1. val = utils.RC4_encrypt(key, md5_hash) # 5. Do the following 19 times: Take the output from the previous # invocation of the RC4 function and pass it as input to a new invocation # of the function; use an encryption key generated by taking each byte of # the original encryption key (obtained in step 2) and performing an XOR # operation between that byte and the single-byte value of the iteration - # counter (from 1 to 19). + # counter (from 1 to 19). for i in range(1, 20): new_key = b_('') for l in range(len(key)): @@ -2579,7 +2594,7 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr val = utils.RC4_encrypt(new_key, val) # 6. Append 16 bytes of arbitrary padding to the output from the final # invocation of the RC4 function and store the 32-byte result as the value - # of the U entry in the encryption dictionary. + # of the U entry in the encryption dictionary. # (implementator note: I don't know what "arbitrary padding" is supposed to # mean, so I have used null bytes. This seems to match a few other # people's implementations) diff --git a/Sample_Code/basic_features.py b/Sample_Code/basic_features.py index 3ac5b8621b..5e874970da 100644 --- a/Sample_Code/basic_features.py +++ b/Sample_Code/basic_features.py @@ -2,7 +2,7 @@ output = PdfFileWriter() input1 = PdfFileReader(open("document1.pdf", "rb")) - + # print how many pages input1 has: print "document1.pdf has %d pages." % input1.getNumPages() @@ -21,7 +21,7 @@ watermark = PdfFileReader(open("watermark.pdf", "rb")) page4.mergePage(watermark.getPage(0)) output.addPage(page4) - + # add page 5 from input1, but crop it to half size: page5 = input1.getPage(4) @@ -31,6 +31,11 @@ ) output.addPage(page5) +# add some Javascript to launch the print window on opening this PDF. 
+# the password dialog may prevent the print dialog from being shown, +# comment out the encryption lines, if that's the case, to try this out +output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") + # encrypt your new PDF and add a password password = "secret" output.encrypt(password)