diff --git a/bin/pdfparanoia b/bin/pdfparanoia index 2705bc9..749fa96 100755 --- a/bin/pdfparanoia +++ b/bin/pdfparanoia @@ -13,6 +13,15 @@ if __name__ == "__main__": import fileinput from StringIO import StringIO + verbose = 0 + while '--verbose' in sys.argv: + verbose += 1 + sys.argv.pop(sys.argv.index('--verbose')) + + while '-v' in sys.argv: + verbose += 1 + sys.argv.pop(sys.argv.index('-v')) + import pdfparanoia # read in all lines @@ -21,7 +30,7 @@ if __name__ == "__main__": content += line # scrub the pdf to get rid of watermarks - output = pdfparanoia.scrub(StringIO(content)) + output = pdfparanoia.scrub(StringIO(content), verbose=verbose) # dump to output sys.stdout.write(output) diff --git a/pdfparanoia/core.py b/pdfparanoia/core.py index 800c52e..1cfc0f2 100644 --- a/pdfparanoia/core.py +++ b/pdfparanoia/core.py @@ -32,7 +32,7 @@ def find_plugins(): plugins = [each[1] for each in plugins] return plugins -def scrub(obj): +def scrub(obj, verbose=False): """ Removes watermarks from a pdf and returns the resulting pdf as a string. """ @@ -50,7 +50,7 @@ def scrub(obj): # clean this pdf as much as possible for plugin in plugins: - content = plugin.scrub(content) + content = plugin.scrub(content, verbose=verbose) return content diff --git a/pdfparanoia/plugin.py b/pdfparanoia/plugin.py index 867ee48..20c28bf 100644 --- a/pdfparanoia/plugin.py +++ b/pdfparanoia/plugin.py @@ -8,8 +8,8 @@ Defines how plugins work. """ class Plugin: - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): """ Removes watermarks from the given pdf. """ diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins-broken/spie.py similarity index 81% rename from pdfparanoia/plugins/spie.py rename to pdfparanoia/plugins-broken/spie.py index 2f8ea2e..8d49c11 100644 --- a/pdfparanoia/plugins/spie.py +++ b/pdfparanoia/plugins-broken/spie.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- from copy import copy +import sys from ..parser import parse_content -from ..eraser import remove_object_by_id from ..plugin import Plugin class SPIE(Plugin): @@ -18,8 +18,8 @@ class SPIE(Plugin): """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): evil_ids = [] # parse the pdf into a pdfminer document @@ -40,7 +40,10 @@ class SPIE(Plugin): if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": data = copy(obj.get_data()) - if "Downloaded From:" in data: + phrase="Downloaded From:" + if phrase in data: + if verbose: + sys.stderr.write("%s: found object %s with %r; omitting..." % (cls.__name__, objid, phrase)) evil_ids.append(objid) for objid in evil_ids: diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py index 93a425b..10179eb 100644 --- a/pdfparanoia/plugins/__init__.py +++ b/pdfparanoia/plugins/__init__.py @@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory. from .aip import * from .ieee import * from .jstor import * -from .spie import * diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py index 20457f2..d9d995d 100644 --- a/pdfparanoia/plugins/aip.py +++ b/pdfparanoia/plugins/aip.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +import sys + from copy import copy from ..parser import parse_content @@ -15,8 +17,8 @@ class AmericanInstituteOfPhysics(Plugin): attached for whatever reason. """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=0): evil_ids = [] # parse the pdf into a pdfminer document @@ -42,7 +44,13 @@ class AmericanInstituteOfPhysics(Plugin): #rawdata = copy(obj.rawdata) data = copy(obj.get_data()) - if "Redistribution subject to AIP license or copyright" in data: + phrase="Redistribution subject to AIP license or copyright" + if phrase in data: + if verbose >= 2: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data)) + elif verbose >= 1: + sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) + evil_ids.append(objid) for objid in evil_ids: diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py index b142ac0..0a8691b 100644 --- a/pdfparanoia/plugins/ieee.py +++ b/pdfparanoia/plugins/ieee.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from copy import copy +import sys from ..parser import parse_content from ..eraser import remove_object_by_id @@ -13,8 +14,8 @@ class IEEEXplore(Plugin): """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=0): evil_ids = [] # parse the pdf into a pdfminer document @@ -37,7 +38,13 @@ class IEEEXplore(Plugin): #rawdata = copy(obj.rawdata) data = copy(obj.get_data()) - if "Authorized licensed use limited to: " in data: + phrase= "Authorized licensed use limited to: " + if phrase in data: + if verbose >= 2: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000])) + elif verbose >= 1: + sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) + evil_ids.append(objid) for objid in evil_ids: diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index f983a2e..0ca971d 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -2,9 +2,10 @@ from copy import copy +import sys + from ..parser import parse_content from ..eraser import ( - remove_object_by_id, replace_object_with, ) from ..plugin import Plugin @@ -32,8 +33,8 @@ class JSTOR(Plugin): "This content downloaded on", ] - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=0): replacements = [] # jstor has certain watermarks only on the first page @@ -54,8 +55,6 @@ class JSTOR(Plugin): if hasattr(obj, "attrs"): if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": - length = obj.attrs["Length"] - rawdata = copy(obj.rawdata) data = copy(obj.get_data()) # make sure all of the requirements are in there @@ -66,6 +65,9 @@ class JSTOR(Plugin): startpos = better_content.find("This content downloaded ") endpos = better_content.find(")", startpos) segment = better_content[startpos:endpos] + if verbose >= 2 and replacements: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment)) + better_content = better_content.replace(segment, "") # it looks like all of the watermarks are at the end? @@ -82,12 +84,19 @@ class JSTOR(Plugin): if page_id == 0 and "/F2 11 Tf\n" in better_content: startpos = better_content.rfind("/F2 11 Tf\n") endpos = better_content.find("Tf\n", startpos+5) + + if verbose >= 2 and replacements: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) + better_content = better_content[0:startpos] + better_content[endpos:] replacements.append([objid, better_content]) page_id += 1 + if verbose >= 1 and replacements: + sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements)) + for deets in replacements: objid = deets[0] replacement = deets[1]