1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-06-24 12:42:55 +02:00
pdfparanoia/pdfparanoia/plugins/aip.py
Zooko O'Whielacronx 56cc7719da add a "--verbose" option that writes to stderr if it finds anything to omit
Also cleaned up some flakes noticed by pyflakes, and made scrub() a @classmethod instead of a @staticmethod so I could use the class for the verbose output.

caveats:

* there are no unit tests of this patch
* now your logs of your stderr have potentially sensitive information in them
* the implementation of arg parsing is very low-tech (a *good* way to do arg parsing is the "argparse" module)
2013-02-13 19:58:47 +00:00

58 lines
1.7 KiB
Python

# -*- coding: utf-8 -*-
import sys
from copy import copy
from ..parser import parse_content
from ..eraser import remove_object_by_id
from ..plugin import Plugin
class AmericanInstituteOfPhysics(Plugin):
    """
    American Institute of Physics
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    These watermarks are pretty basic, but sometimes they don't have indexes
    attached for whatever reason.
    """

    @classmethod
    def scrub(cls, content, verbose=False):
        """
        Remove AIP watermark objects from a PDF.

        Every object listed in the first xref section is inspected; a
        FlateDecode stream shorter than 1000 bytes whose data contains the
        AIP redistribution notice is treated as a watermark and removed.

        :param content: PDF content; handed to ``parse_content`` for
            inspection and to ``remove_object_by_id`` for removal.
        :param verbose: when true, write a line to stderr for each object
            that is about to be omitted. Note that the line includes the
            object's data.
        :returns: ``content`` as returned by ``remove_object_by_id`` after
            all matching objects were removed (unchanged if none matched).
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        # NOTE(review): only the first xref section is consulted -- objects
        # listed solely in later sections would not be checked; confirm this
        # is intended.
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            obj = pdf.getobj(objid)

            # only objects carrying an attribute dictionary are candidates
            if not hasattr(obj, "attrs"):
                continue

            attrs = obj.attrs

            # watermarks tend to be in FlateDecode elements
            # (`in` rather than dict.has_key(), which Python 3 removed)
            if "Filter" not in attrs or str(attrs["Filter"]) != "/FlateDecode":
                continue

            # the watermark is never very long
            if attrs["Length"] < 1000:
                data = obj.get_data()
                if "Redistribution subject to AIP license or copyright" in data:
                    if verbose:
                        # trailing newline keeps successive reports on
                        # separate lines
                        sys.stderr.write(
                            "%s: Found object with %r; omitting...\n"
                            % (cls.__name__, data)
                        )
                    evil_ids.append(objid)

        # strip the collected objects only after the scan has finished
        for objid in evil_ids:
            content = remove_object_by_id(content, objid)

        return content