2013-02-05 10:10:14 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
|
|
pdfparanoia.eraser
|
|
|
|
~~~~~~~~~~~~~~~~~~
|
|
|
|
|
|
|
|
Tools to erase things from pdfs by direct manipulation of the pdf format.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
2013-02-07 00:27:12 +01:00
|
|
|
def manipulate_pdf(content, objid, callback, *args):
    """
    Iterates through a pdf looking for the object with the objid id. When the
    object is found, callback is called with a reference to the current list of
    output lines, and every line of the original object (up to and including
    its "endobj" line) is left out of the output.

    :param content: the pdf as a single string; "\\r\\n" line endings are
        normalized to "\\n" before processing.
    :param objid: id of the object to look for; compared via ``str(objid)``.
    :param callback: called as ``callback(outlines, *args)`` each time a
        matching object header is found; it may append replacement lines.
    :param args: extra positional arguments forwarded to callback.
    :returns: the remaining lines joined with "\\n".
    """
    object_prefix = str(objid) + " "
    object_terminators = ("endobj", "endobj ")

    outlines = []
    last_line = None
    skip_mode = False

    for line in content.replace("\r\n", "\n").split("\n"):
        if line == "":
            # Blank lines never update last_line, so an empty line between
            # "endobj" and the next header does not hide that header. Blank
            # lines inside the object being skipped are dropped with the rest
            # of that object instead of leaking into the output.
            if not skip_mode:
                outlines.append("")
            continue

        if skip_mode:
            # Still inside the matched object: swallow lines until it ends.
            if line in object_terminators:
                skip_mode = False
        else:
            # A header is only considered at the start of the content or
            # directly after the previous object was closed.
            after_object_end = last_line is None or last_line in object_terminators
            looks_like_header = (
                line.endswith(("obj", "obj "))
                or " obj <<" in line[0:50]
                or " obj<<" in line[0:50]
            )
            if after_object_end and looks_like_header and line.startswith(object_prefix):
                skip_mode = True
                last_line = line
                callback(outlines, *args)
                continue
            outlines.append(line)

        last_line = line

    return "\n".join(outlines)
|
|
|
|
|
2013-02-07 00:27:12 +01:00
|
|
|
def remove_object_by_id(content, objid):
    """
    Deletes an object from a pdf. Mostly streams and FlateDecode stuff.

    Delegates to manipulate_pdf with a callback that emits nothing, and
    returns whatever manipulate_pdf returns.
    """
    def _emit_nothing(outlines):
        # Deliberately leaves the output lines untouched.
        return None

    return manipulate_pdf(content, objid, _emit_nothing)
|
|
|
|
|
|
|
|
def replace_object_with(content, objid, replacement):
    """
    Replaces an object from a pdf. Mostly streams. This is useful for replacing
    an encoded object with a plaintext object.

    The new object is written as "<objid> 0 obj" followed by a stream
    dictionary, the replacement text, and the "endstream"/"endobj" trailer.
    Returns whatever manipulate_pdf returns.
    """
    def _emit_replacement(outlines, details):
        new_id = details["objid"]
        body = details["replacement"]

        # NOTE(review): the declared /Length is len(replacement) + 2; the
        # reason for the extra 2 is not visible in this file -- confirm.
        rendered = "".join([
            str(new_id), " 0 obj\n",
            "<</Length ", str(len(body) + 2), ">>stream\n",
            body,
            "\nendstream\nendobj\n",
        ])

        # The rendered text ends with "\n", so the split also yields a final
        # empty entry, which is appended like every other line.
        outlines.extend(rendered.split("\n"))

    details = {"objid": objid, "replacement": replacement}
    return manipulate_pdf(content, objid, _emit_replacement, details)
|
|
|
|
|