mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
deflation tool to help with debugging
The deflate function expands some of the FlateDecode streams in a PDF file. Its output is not always correct, and the function itself is very buggy. Still, it is a useful tool for poking around in unfamiliar PDFs under investigation.
This commit is contained in:
parent
e108a43e26
commit
5c8a194445
@ -25,3 +25,5 @@ __copyright__ = "Copyright 2013 Bryan Bishop"
|
||||
|
||||
from . import utils
|
||||
from .core import scrub
|
||||
from .parser import deflate
|
||||
|
||||
|
@ -17,6 +17,8 @@ from pdfminer.pdfparser import (
|
||||
PDFDocument,
|
||||
)
|
||||
|
||||
from .eraser import replace_object_with
|
||||
|
||||
def parse_pdf(handler):
|
||||
"""
|
||||
Parses a PDF via pdfminer.
|
||||
@ -43,3 +45,39 @@ def parse_content(content):
|
||||
stream = StringIO(content)
|
||||
return parse_pdf(stream)
|
||||
|
||||
def deflate(content, max_size=1000):
    """
    Replaces small FlateDecode streams in a pdf with their decoded contents.

    This increases the size of a pdf, but it's useful for debugging and
    searching for how watermarks are implemented.

    Not all elements are preserved in the resulting document, and not every
    stream is expanded: only objects listed in the first cross-reference
    section returned by the parser are examined, only objects whose Filter
    entry is exactly /FlateDecode are considered, and only streams whose
    decoded length is below max_size are substituted. This is for debugging
    only.

    content  -- the raw pdf document, as accepted by parse_content
    max_size -- decoded streams of this length or longer are left untouched
                (defaults to 1000, the previously hard-coded limit)

    Returns the document content with the selected objects replaced.
    """
    # parse the pdf
    pdf = parse_content(content)

    # get a list of all object ids; note that only the first xref section
    # is consulted, so objects listed elsewhere are never visited
    xrefs = pdf._parser.read_xref()
    objids = xrefs[0].get_objids()

    # store new replacements as (objid, decoded data) pairs
    replacements = []

    # scan through each object looking for things to deflate
    for objid in objids:
        obj = pdf.getobj(objid)

        # objects without an attrs mapping have no Filter entry to inspect
        if not hasattr(obj, "attrs"):
            continue

        attrs = obj.attrs

        # "in" instead of dict.has_key, which no longer exists on Python 3
        if "Filter" not in attrs or str(attrs["Filter"]) != "/FlateDecode":
            continue

        # decode in place, then read the decoded payload from the object
        obj.decode()
        data = obj.data

        if len(data) < max_size:
            replacements.append((objid, data))

    # apply the replacements to the document
    for objid, replacement in replacements:
        content = replace_object_with(content, objid, replacement)

    return content
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user