From 56cc7719dab683507d95a4ce5d38590f076255c1 Mon Sep 17 00:00:00 2001 From: Zooko O'Whielacronx Date: Wed, 13 Feb 2013 19:58:47 +0000 Subject: [PATCH 1/3] add a "--verbose" option that writes to stderr if it finds anything to omit Also cleaned up some flakes noticed by pyflakes, and make the scrub() be @classmethod instead of @staticmethod so I could use the class for the verbose output. caveats: * there are no unit tests of this patch * now your logs of your stderr have potentially sensitive information in them * the implementation of arg parsing is very low-tech; (a *good* way to do arg parsing is the "argparse" module) --- bin/pdfparanoia | 11 ++++++++++- pdfparanoia/core.py | 4 ++-- pdfparanoia/plugin.py | 4 ++-- pdfparanoia/plugins/aip.py | 9 +++++++-- pdfparanoia/plugins/ieee.py | 8 ++++++-- pdfparanoia/plugins/jstor.py | 13 ++++++++----- pdfparanoia/plugins/spie.py | 8 +++++--- 7 files changed, 40 insertions(+), 17 deletions(-) diff --git a/bin/pdfparanoia b/bin/pdfparanoia index 2705bc9..122e59a 100755 --- a/bin/pdfparanoia +++ b/bin/pdfparanoia @@ -13,6 +13,15 @@ if __name__ == "__main__": import fileinput from StringIO import StringIO + verbose = False + while '--verbose' in sys.argv: + verbose = True + sys.argv.pop(sys.argv.index('--verbose')) + + while '-v' in sys.argv: + verbose = True + sys.argv.pop(sys.argv.index('-v')) + import pdfparanoia # read in all lines @@ -21,7 +30,7 @@ if __name__ == "__main__": content += line # scrub the pdf to get rid of watermarks - output = pdfparanoia.scrub(StringIO(content)) + output = pdfparanoia.scrub(StringIO(content), verbose=verbose) # dump to output sys.stdout.write(output) diff --git a/pdfparanoia/core.py b/pdfparanoia/core.py index 800c52e..1cfc0f2 100644 --- a/pdfparanoia/core.py +++ b/pdfparanoia/core.py @@ -32,7 +32,7 @@ def find_plugins(): plugins = [each[1] for each in plugins] return plugins -def scrub(obj): +def scrub(obj, verbose=False): """ Removes watermarks from a pdf and returns the resulting pdf as a string. """ @@ -50,7 +50,7 @@ def scrub(obj): # clean this pdf as much as possible for plugin in plugins: - content = plugin.scrub(content) + content = plugin.scrub(content, verbose=verbose) return content diff --git a/pdfparanoia/plugin.py b/pdfparanoia/plugin.py index 867ee48..20c28bf 100644 --- a/pdfparanoia/plugin.py +++ b/pdfparanoia/plugin.py @@ -8,8 +8,8 @@ Defines how plugins work. """ class Plugin: - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): """ Removes watermarks from the given pdf. """ diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py index 20457f2..145f67a 100644 --- a/pdfparanoia/plugins/aip.py +++ b/pdfparanoia/plugins/aip.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +import sys + from copy import copy from ..parser import parse_content @@ -15,8 +17,8 @@ class AmericanInstituteOfPhysics(Plugin): attached for whatever reason. """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): evil_ids = [] # parse the pdf into a pdfminer document @@ -43,6 +45,9 @@ class AmericanInstituteOfPhysics(Plugin): data = copy(obj.get_data()) if "Redistribution subject to AIP license or copyright" in data: + if verbose: + sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,)) + evil_ids.append(objid) for objid in evil_ids: diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py index b142ac0..4c04636 100644 --- a/pdfparanoia/plugins/ieee.py +++ b/pdfparanoia/plugins/ieee.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from copy import copy +import sys from ..parser import parse_content from ..eraser import remove_object_by_id @@ -13,8 +14,8 @@ class IEEEXplore(Plugin): """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): evil_ids = [] # parse the pdf into a pdfminer document @@ -38,6 +39,9 @@ class IEEEXplore(Plugin): data = copy(obj.get_data()) if "Authorized licensed use limited to: " in data: + if verbose: + sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,)) + evil_ids.append(objid) for objid in evil_ids: diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index f983a2e..c183e38 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -2,9 +2,10 @@ from copy import copy +import sys + from ..parser import parse_content from ..eraser import ( - remove_object_by_id, replace_object_with, ) from ..plugin import Plugin @@ -32,8 +33,8 @@ class JSTOR(Plugin): "This content downloaded on", ] - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): replacements = [] # jstor has certain watermarks only on the first page @@ -54,8 +55,6 @@ class JSTOR(Plugin): if hasattr(obj, "attrs"): if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": - length = obj.attrs["Length"] - rawdata = copy(obj.rawdata) data = copy(obj.get_data()) # make sure all of the requirements are in there @@ -82,6 +81,10 @@ class JSTOR(Plugin): if page_id == 0 and "/F2 11 Tf\n" in better_content: startpos = better_content.rfind("/F2 11 Tf\n") endpos = better_content.find("Tf\n", startpos+5) + + if verbose: + sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, better_content[startpos:endpos],)) + better_content = better_content[0:startpos] + better_content[endpos:] replacements.append([objid, better_content]) diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins/spie.py index 2f8ea2e..7150267 100644 --- a/pdfparanoia/plugins/spie.py +++ b/pdfparanoia/plugins/spie.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- from copy import copy +import sys from ..parser import parse_content -from ..eraser import remove_object_by_id from ..plugin import Plugin class SPIE(Plugin): @@ -18,8 +18,8 @@ class SPIE(Plugin): """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): evil_ids = [] # parse the pdf into a pdfminer document @@ -41,6 +41,8 @@ class SPIE(Plugin): data = copy(obj.get_data()) if "Downloaded From:" in data: + if verbose: + sys.stderr.write("%s: found object with %r; omitting..." % (cls.__name__, data)) evil_ids.append(objid) for objid in evil_ids: From 9204b2e17e25c7912bb2f386ced743324b4afad7 Mon Sep 17 00:00:00 2001 From: Zooko O'Whielacronx Date: Wed, 13 Feb 2013 20:56:33 +0000 Subject: [PATCH 2/3] fix up verbose printouts, don't print out large data --- pdfparanoia/plugins/aip.py | 5 +++-- pdfparanoia/plugins/ieee.py | 5 +++-- pdfparanoia/plugins/jstor.py | 6 +++--- pdfparanoia/plugins/spie.py | 5 +++-- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py index 145f67a..685b545 100644 --- a/pdfparanoia/plugins/aip.py +++ b/pdfparanoia/plugins/aip.py @@ -44,9 +44,10 @@ class AmericanInstituteOfPhysics(Plugin): #rawdata = copy(obj.rawdata) data = copy(obj.get_data()) - if "Redistribution subject to AIP license or copyright" in data: + phrase="Redistribution subject to AIP license or copyright" + if phrase in data: if verbose: - sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,)) + sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) evil_ids.append(objid) diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py index 4c04636..847b1d0 100644 --- a/pdfparanoia/plugins/ieee.py +++ b/pdfparanoia/plugins/ieee.py @@ -38,9 +38,10 @@ class IEEEXplore(Plugin): #rawdata = copy(obj.rawdata) data = copy(obj.get_data()) - if "Authorized licensed use limited to: " in data: + phrase= "Authorized licensed use limited to: " + if phrase in data: if verbose: - sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,)) + sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) evil_ids.append(objid) diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index c183e38..d368fee 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -61,6 +61,9 @@ class JSTOR(Plugin): if all([requirement in data for requirement in JSTOR.requirements]): better_content = data + if verbose: + sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, cls.requirements)) + # remove the date startpos = better_content.find("This content downloaded ") endpos = better_content.find(")", startpos) @@ -82,9 +85,6 @@ class JSTOR(Plugin): startpos = better_content.rfind("/F2 11 Tf\n") endpos = better_content.find("Tf\n", startpos+5) - if verbose: - sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, better_content[startpos:endpos],)) - better_content = better_content[0:startpos] + better_content[endpos:] replacements.append([objid, better_content]) diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins/spie.py index 7150267..8d49c11 100644 --- a/pdfparanoia/plugins/spie.py +++ b/pdfparanoia/plugins/spie.py @@ -40,9 +40,10 @@ class SPIE(Plugin): if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": data = copy(obj.get_data()) - if "Downloaded From:" in data: + phrase="Downloaded From:" + if phrase in data: if verbose: - sys.stderr.write("%s: found object with %r; omitting..." % (cls.__name__, data)) + sys.stderr.write("%s: found object %s with %r; omitting..." % (cls.__name__, objid, phrase)) evil_ids.append(objid) for objid in evil_ids: From 503b8aead5dfc899f2b2944af6b0d789f211a698 Mon Sep 17 00:00:00 2001 From: Zooko O'Whielacronx Date: Wed, 13 Feb 2013 21:08:49 +0000 Subject: [PATCH 3/3] add -v -v mode which prints out the details (potentially sensitive, potentially bulky) remove spie, which appears to do nothing --- bin/pdfparanoia | 6 +++--- pdfparanoia/{plugins => plugins-broken}/spie.py | 0 pdfparanoia/plugins/__init__.py | 1 - pdfparanoia/plugins/aip.py | 6 ++++-- pdfparanoia/plugins/ieee.py | 6 ++++-- pdfparanoia/plugins/jstor.py | 14 ++++++++++---- 6 files changed, 21 insertions(+), 12 deletions(-) rename pdfparanoia/{plugins => plugins-broken}/spie.py (100%) diff --git a/bin/pdfparanoia b/bin/pdfparanoia index 122e59a..749fa96 100755 --- a/bin/pdfparanoia +++ b/bin/pdfparanoia @@ -13,13 +13,13 @@ if __name__ == "__main__": import fileinput from StringIO import StringIO - verbose = False + verbose = 0 while '--verbose' in sys.argv: - verbose = True + verbose += 1 sys.argv.pop(sys.argv.index('--verbose')) while '-v' in sys.argv: - verbose = True + verbose += 1 sys.argv.pop(sys.argv.index('-v')) import pdfparanoia diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins-broken/spie.py similarity index 100% rename from pdfparanoia/plugins/spie.py rename to pdfparanoia/plugins-broken/spie.py diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py index 93a425b..10179eb 100644 --- a/pdfparanoia/plugins/__init__.py +++ b/pdfparanoia/plugins/__init__.py @@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory. from .aip import * from .ieee import * from .jstor import * -from .spie import * diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py index 685b545..d9d995d 100644 --- a/pdfparanoia/plugins/aip.py +++ b/pdfparanoia/plugins/aip.py @@ -18,7 +18,7 @@ class AmericanInstituteOfPhysics(Plugin): """ @classmethod - def scrub(cls, content, verbose=False): + def scrub(cls, content, verbose=0): evil_ids = [] # parse the pdf into a pdfminer document @@ -46,7 +46,9 @@ class AmericanInstituteOfPhysics(Plugin): phrase="Redistribution subject to AIP license or copyright" if phrase in data: - if verbose: + if verbose >= 2: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data)) + elif verbose >= 1: sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) evil_ids.append(objid) diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py index 847b1d0..0a8691b 100644 --- a/pdfparanoia/plugins/ieee.py +++ b/pdfparanoia/plugins/ieee.py @@ -15,7 +15,7 @@ class IEEEXplore(Plugin): """ @classmethod - def scrub(cls, content, verbose=False): + def scrub(cls, content, verbose=0): evil_ids = [] # parse the pdf into a pdfminer document @@ -40,7 +40,9 @@ class IEEEXplore(Plugin): phrase= "Authorized licensed use limited to: " if phrase in data: - if verbose: + if verbose >= 2: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000])) + elif verbose >= 1: sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) evil_ids.append(objid) diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index d368fee..0ca971d 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -34,7 +34,7 @@ class JSTOR(Plugin): ] @classmethod - def scrub(cls, content, verbose=False): + def scrub(cls, content, verbose=0): replacements = [] # jstor has certain watermarks only on the first page @@ -61,13 +61,13 @@ class JSTOR(Plugin): if all([requirement in data for requirement in JSTOR.requirements]): better_content = data - if verbose: - sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, cls.requirements)) - # remove the date startpos = better_content.find("This content downloaded ") endpos = better_content.find(")", startpos) segment = better_content[startpos:endpos] + if verbose >= 2 and replacements: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment)) + better_content = better_content.replace(segment, "") # it looks like all of the watermarks are at the end? @@ -85,12 +85,18 @@ class JSTOR(Plugin): startpos = better_content.rfind("/F2 11 Tf\n") endpos = better_content.find("Tf\n", startpos+5) + if verbose >= 2 and replacements: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) + better_content = better_content[0:startpos] + better_content[endpos:] replacements.append([objid, better_content]) page_id += 1 + if verbose >= 1 and replacements: + sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements)) + for deets in replacements: objid = deets[0] replacement = deets[1]