deflation tool to help with debugging

The deflate function expands some of the FlateDecode streams in a PDF
file. Its output is not always correct and the function is still very
buggy. Even so, it is a useful tool for poking around in foreign PDFs
under investigation.
This commit is contained in:
Bryan Bishop 2013-02-07 20:51:10 -06:00
parent e108a43e26
commit 5c8a194445
2 changed files with 40 additions and 0 deletions

View File

@ -25,3 +25,5 @@ __copyright__ = "Copyright 2013 Bryan Bishop"
from . import utils
from .core import scrub
from .parser import deflate

View File

@ -17,6 +17,8 @@ from pdfminer.pdfparser import (
PDFDocument,
)
from .eraser import replace_object_with
def parse_pdf(handler):
"""
Parses a PDF via pdfminer.
@ -43,3 +45,39 @@ def parse_content(content):
stream = StringIO(content)
return parse_pdf(stream)
def deflate(content, max_size=1000):
    """
    Expands small FlateDecode streams of a pdf into plaintext streams.

    This increases the size of a pdf, but it's useful for debugging and
    searching for how watermarks are implemented.

    Not all elements are preserved in the resulting document. This is for
    debugging only. In particular:

    * only the object ids listed in the first xref returned by the parser
      are scanned;
    * only objects whose Filter entry stringifies to exactly "/FlateDecode"
      are considered;
    * decoded streams whose length is max_size or more are left untouched.

    :param content: the pdf document, in the form accepted by parse_content
    :param max_size: upper bound (exclusive) on the length of decoded data
        that will be substituted into the document. Defaults to 1000, the
        limit this function has always used. Pass None to substitute every
        matching stream regardless of size.
    :return: content after each selected object has been passed through
        replace_object_with together with its decoded stream data
    """
    # parse the pdf
    pdf = parse_content(content)

    # get a list of all object ids (only the first xref is consulted)
    xrefs = pdf._parser.read_xref()
    objids = xrefs[0].get_objids()

    # store new replacements as (objid, decoded data) pairs
    replacements = []

    # scan through each object looking for things to deflate
    for objid in objids:
        obj = pdf.getobj(objid)

        # objects without attrs have no Filter entry to inspect
        if not hasattr(obj, "attrs"):
            continue

        # use `in` rather than dict.has_key(), which only exists on Python 2
        if "Filter" not in obj.attrs:
            continue
        if str(obj.attrs["Filter"]) != "/FlateDecode":
            continue

        obj.decode()
        data = obj.data

        # large streams are skipped unless the caller lifted the limit
        if max_size is None or len(data) < max_size:
            replacements.append((objid, data))

    # apply the replacements to the document
    for objid, replacement in replacements:
        content = replace_object_with(content, objid, replacement)

    return content