mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-06-02 00:58:05 +02:00
deflation tool to help with debugging
The deflate function expands some of the FlateDecode streams in a PDF file. Its output is not always correct, and the implementation is still very buggy. Even so, it is a useful tool for poking around in foreign PDFs under investigation.
This commit is contained in:
parent
e108a43e26
commit
5c8a194445
|
@ -25,3 +25,5 @@ __copyright__ = "Copyright 2013 Bryan Bishop"
|
||||||
|
|
||||||
from . import utils
|
from . import utils
|
||||||
from .core import scrub
|
from .core import scrub
|
||||||
|
from .parser import deflate
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,8 @@ from pdfminer.pdfparser import (
|
||||||
PDFDocument,
|
PDFDocument,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .eraser import replace_object_with
|
||||||
|
|
||||||
def parse_pdf(handler):
|
def parse_pdf(handler):
|
||||||
"""
|
"""
|
||||||
Parses a PDF via pdfminer.
|
Parses a PDF via pdfminer.
|
||||||
|
@ -43,3 +45,39 @@ def parse_content(content):
|
||||||
stream = StringIO(content)
|
stream = StringIO(content)
|
||||||
return parse_pdf(stream)
|
return parse_pdf(stream)
|
||||||
|
|
||||||
|
def deflate(content):
    """
    Expands small FlateDecode streams of a pdf into plaintext streams. This
    increases the size of a pdf, but it's useful for debugging and searching
    for how watermarks are implemented.

    Only the object ids listed in the first xref section returned by the
    parser are examined, and only streams whose decoded data has a length
    below 1000 are replaced; every other object is left untouched.

    Not all elements are preserved in the resulting document. This is for
    debugging only.

    :param content: the pdf content, in the form accepted by parse_content
    :return: the content after every collected replacement has been applied
        through replace_object_with
    """
    # parse the pdf
    pdf = parse_content(content)

    # get a list of all object ids
    # NOTE(review): only the first xref section is consulted; objects listed
    # solely in later sections are never visited -- confirm this is intended.
    xrefs = pdf._parser.read_xref()
    xref = xrefs[0]
    objids = xref.get_objids()

    # store new replacements
    replacements = []

    # scan through each object looking for things to deflate
    for objid in objids:
        obj = pdf.getobj(objid)

        # objects without "attrs" cannot be checked for a Filter entry
        if not hasattr(obj, "attrs"):
            continue

        # use "in" rather than has_key(), which is gone from python3 dicts
        if "Filter" not in obj.attrs:
            continue
        if str(obj.attrs["Filter"]) != "/FlateDecode":
            continue

        # decode() is called before reading obj.data, as the decoded payload
        # is what gets written back into the document
        obj.decode()
        data = obj.data

        # larger streams are deliberately skipped
        if len(data) < 1000:
            replacements.append([objid, data])

    # apply the replacements to the document
    for (objid, replacement) in replacements:
        content = replace_object_with(content, objid, replacement)

    return content
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user