1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-06-24 12:42:55 +02:00
pdfparanoia/pdfparanoia/plugins/spie.py
Zooko O'Whielacronx 56cc7719da add a "--verbose" option that writes to stderr if it finds anything to omit
Also cleaned up some flakes noticed by pyflakes, and made scrub() a @classmethod instead of a @staticmethod so I could use the class for the verbose output.

caveats:

* there are no unit tests of this patch
* your stderr logs may now contain potentially sensitive information
* the implementation of arg parsing is very low-tech (a *good* way to do arg parsing is the "argparse" module)
2013-02-13 19:58:47 +00:00

55 lines
1.6 KiB
Python

# -*- coding: utf-8 -*-
from copy import copy
import sys
from ..parser import parse_content
from ..plugin import Plugin
class SPIE(Plugin):
    """
    Society of Photo-Optical Instrumentation Engineers
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    These watermarks are shown on each page, but are only defined in one
    place.

    Also, there seems to be some interference from some of the other
    pdfparanoia plugins causing the deletion of images in the document.
    Side-effects need to be better accounted for.
    """

    @classmethod
    def scrub(cls, content, verbose=False):
        """
        Look for SPIE "Downloaded From:" watermark objects in a PDF.

        Every object id listed in the first xref table is inspected. An
        object is considered a watermark when its ``Filter`` attribute is
        ``/FlateDecode`` and its data contains ``Downloaded From:``.

        Removal of the matched objects is currently disabled (it was seen
        to randomly remove images from SPIE pdfs), so the document is
        returned unchanged; the only observable effect is the optional
        report on stderr.

        :param content: the PDF document, as accepted by ``parse_content``
        :param verbose: when true, write a line to stderr for each match
        :returns: ``content``, unmodified
        """
        marker = "Downloaded From:"
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids (first xref table only)
        xref = pdf._parser.read_xref()[0]

        # check each object in the pdf
        for objid in xref.get_objids():
            obj = pdf.getobj(objid)
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            # (`in` instead of dict.has_key(), which Python 3 removed)
            if "Filter" not in obj.attrs:
                continue
            if str(obj.attrs["Filter"]) != "/FlateDecode":
                continue

            data = obj.get_data()

            # NOTE(review): get_data() presumably returns bytes on Python 3;
            # match the marker's type to the data so `in` does not raise
            # TypeError there. On Python 2 `bytes is str`, so this is a no-op.
            needle = marker
            if isinstance(data, bytes) and not isinstance(needle, bytes):
                needle = needle.encode("ascii")

            if needle in data:
                if verbose:
                    # trailing newline keeps successive reports on separate lines
                    sys.stderr.write(
                        "%s: found object with %r; omitting...\n"
                        % (cls.__name__, data)
                    )
                evil_ids.append(objid)

        # Removal is intentionally disabled: for some reason SPIE pdfs are
        # broken by it (images are randomly removed). `evil_ids` is still
        # collected so that re-enabling is a matter of calling
        # remove_object_by_id(content, objid) for each collected id.
        return content