deflation tool to help with debugging

The deflate function expands some of the FlateDecode streams in a PDF file. Its output is not always correct, and the function itself is very buggy. Still, it is a useful tool for poking around in foreign PDFs under investigation.
2025-07-04 20:37:38 +02:00 · 2013-02-07 20:51:10 -06:00 · 2013-02-07 20:51:10 -06:00 · 5c8a194445
commit 5c8a194445
parent e108a43e26
2 changed files with 40 additions and 0 deletions
--- a/pdfparanoia/__init__.py
+++ b/pdfparanoia/__init__.py
@@ -25,3 +25,5 @@ __copyright__ = "Copyright 2013 Bryan Bishop"

 from . import utils
 from .core import scrub
+from .parser import deflate
+
--- a/pdfparanoia/parser.py
+++ b/pdfparanoia/parser.py
@@ -17,6 +17,8 @@ from pdfminer.pdfparser import (
    PDFDocument,
 )

+from .eraser import replace_object_with
+
 def parse_pdf(handler):
    """
    Parses a PDF via pdfminer.
@@ -43,3 +45,39 @@ def parse_content(content):
    stream = StringIO(content)
    return parse_pdf(stream)

+def deflate(content):
+    """
+    Converts small FlateDecode streams into plaintext streams. This increases
+    the size of a pdf, but it's useful for debugging and searching for how
+    watermarks are implemented.
+
+    Only objects in the first xref section are scanned, and a stream is only
+    replaced when its decoded data has a length under 1000. Not all elements
+    are preserved in the resulting document. This is for debugging only.
+
+    Takes the raw pdf as a string and returns the modified string.
+    """
+    # parse the pdf
+    pdf = parse_content(content)
+
+    # get a list of all object ids (only the first xref section is used)
+    objids = pdf._parser.read_xref()[0].get_objids()
+
+    # store new replacements
+    replacements = []
+
+    # scan through each object looking for things to deflate
+    for objid in objids:
+        obj = pdf.getobj(objid)
+        if hasattr(obj, "attrs"):
+            # "in" replaces dict.has_key(), which python 3 removed
+            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
+                obj.decode()
+                if len(obj.data) < 1000:
+                    replacements.append([objid, obj.data])
+
+    # apply the replacements to the document
+    for (objid, replacement) in replacements:
+        content = replace_object_with(content, objid, replacement)
+    return content
+