From 503b8aead5dfc899f2b2944af6b0d789f211a698 Mon Sep 17 00:00:00 2001 From: Zooko O'Whielacronx Date: Wed, 13 Feb 2013 21:08:49 +0000 Subject: [PATCH] add -v -v mode which prints out the details (potentially sensitive, potentially bulky) remove spie, which appears to do nothing --- bin/pdfparanoia | 6 +++--- pdfparanoia/{plugins => plugins-broken}/spie.py | 0 pdfparanoia/plugins/__init__.py | 1 - pdfparanoia/plugins/aip.py | 6 ++++-- pdfparanoia/plugins/ieee.py | 6 ++++-- pdfparanoia/plugins/jstor.py | 14 ++++++++++---- 6 files changed, 21 insertions(+), 12 deletions(-) rename pdfparanoia/{plugins => plugins-broken}/spie.py (100%) diff --git a/bin/pdfparanoia b/bin/pdfparanoia index 122e59a..749fa96 100755 --- a/bin/pdfparanoia +++ b/bin/pdfparanoia @@ -13,13 +13,13 @@ if __name__ == "__main__": import fileinput from StringIO import StringIO - verbose = False + verbose = 0 while '--verbose' in sys.argv: - verbose = True + verbose += 1 sys.argv.pop(sys.argv.index('--verbose')) while '-v' in sys.argv: - verbose = True + verbose += 1 sys.argv.pop(sys.argv.index('-v')) import pdfparanoia diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins-broken/spie.py similarity index 100% rename from pdfparanoia/plugins/spie.py rename to pdfparanoia/plugins-broken/spie.py diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py index 93a425b..10179eb 100644 --- a/pdfparanoia/plugins/__init__.py +++ b/pdfparanoia/plugins/__init__.py @@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory. from .aip import * from .ieee import * from .jstor import * -from .spie import * diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py index 685b545..d9d995d 100644 --- a/pdfparanoia/plugins/aip.py +++ b/pdfparanoia/plugins/aip.py @@ -18,7 +18,7 @@ class AmericanInstituteOfPhysics(Plugin): """ @classmethod - def scrub(cls, content, verbose=False): + def scrub(cls, content, verbose=0): evil_ids = [] # parse the pdf into a pdfminer document @@ -46,7 +46,9 @@ class AmericanInstituteOfPhysics(Plugin): phrase="Redistribution subject to AIP license or copyright" if phrase in data: - if verbose: + if verbose >= 2: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data)) + elif verbose >= 1: sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) evil_ids.append(objid) diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py index 847b1d0..0a8691b 100644 --- a/pdfparanoia/plugins/ieee.py +++ b/pdfparanoia/plugins/ieee.py @@ -15,7 +15,7 @@ class IEEEXplore(Plugin): """ @classmethod - def scrub(cls, content, verbose=False): + def scrub(cls, content, verbose=0): evil_ids = [] # parse the pdf into a pdfminer document @@ -40,7 +40,9 @@ class IEEEXplore(Plugin): phrase= "Authorized licensed use limited to: " if phrase in data: - if verbose: + if verbose >= 2: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000])) + elif verbose >= 1: sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) evil_ids.append(objid) diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index d368fee..0ca971d 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -34,7 +34,7 @@ class JSTOR(Plugin): ] @classmethod - def scrub(cls, content, verbose=False): + def scrub(cls, content, verbose=0): replacements = [] # jstor has certain watermarks only on the first page @@ -61,13 +61,13 @@ class JSTOR(Plugin): if all([requirement in data for requirement in JSTOR.requirements]): better_content = data - if verbose: - sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, cls.requirements)) - # remove the date startpos = better_content.find("This content downloaded ") endpos = better_content.find(")", startpos) segment = better_content[startpos:endpos] + if verbose >= 2 and replacements: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment)) + better_content = better_content.replace(segment, "") # it looks like all of the watermarks are at the end? @@ -85,12 +85,18 @@ class JSTOR(Plugin): startpos = better_content.rfind("/F2 11 Tf\n") endpos = better_content.find("Tf\n", startpos+5) + if verbose >= 2 and replacements: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) + better_content = better_content[0:startpos] + better_content[endpos:] replacements.append([objid, better_content]) page_id += 1 + if verbose >= 1 and replacements: + sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements)) + for deets in replacements: objid = deets[0] replacement = deets[1]