diff --git a/bin/pdfparanoia b/bin/pdfparanoia index 2705bc9..122e59a 100755 --- a/bin/pdfparanoia +++ b/bin/pdfparanoia @@ -13,6 +13,15 @@ if __name__ == "__main__": import fileinput from StringIO import StringIO + verbose = False + while '--verbose' in sys.argv: + verbose = True + sys.argv.pop(sys.argv.index('--verbose')) + + while '-v' in sys.argv: + verbose = True + sys.argv.pop(sys.argv.index('-v')) + import pdfparanoia # read in all lines @@ -21,7 +30,7 @@ if __name__ == "__main__": content += line # scrub the pdf to get rid of watermarks - output = pdfparanoia.scrub(StringIO(content)) + output = pdfparanoia.scrub(StringIO(content), verbose=verbose) # dump to output sys.stdout.write(output) diff --git a/pdfparanoia/core.py b/pdfparanoia/core.py index 800c52e..1cfc0f2 100644 --- a/pdfparanoia/core.py +++ b/pdfparanoia/core.py @@ -32,7 +32,7 @@ def find_plugins(): plugins = [each[1] for each in plugins] return plugins -def scrub(obj): +def scrub(obj, verbose=False): """ Removes watermarks from a pdf and returns the resulting pdf as a string. """ @@ -50,7 +50,7 @@ def scrub(obj): # clean this pdf as much as possible for plugin in plugins: - content = plugin.scrub(content) + content = plugin.scrub(content, verbose=verbose) return content diff --git a/pdfparanoia/plugin.py b/pdfparanoia/plugin.py index 867ee48..20c28bf 100644 --- a/pdfparanoia/plugin.py +++ b/pdfparanoia/plugin.py @@ -8,8 +8,8 @@ Defines how plugins work. """ class Plugin: - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): """ Removes watermarks from the given pdf. """ diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py index 20457f2..145f67a 100644 --- a/pdfparanoia/plugins/aip.py +++ b/pdfparanoia/plugins/aip.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +import sys + from copy import copy from ..parser import parse_content @@ -15,8 +17,8 @@ class AmericanInstituteOfPhysics(Plugin): attached for whatever reason. """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): evil_ids = [] # parse the pdf into a pdfminer document @@ -43,6 +45,9 @@ class AmericanInstituteOfPhysics(Plugin): data = copy(obj.get_data()) if "Redistribution subject to AIP license or copyright" in data: + if verbose: + sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,)) + evil_ids.append(objid) for objid in evil_ids: diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py index b142ac0..4c04636 100644 --- a/pdfparanoia/plugins/ieee.py +++ b/pdfparanoia/plugins/ieee.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from copy import copy +import sys from ..parser import parse_content from ..eraser import remove_object_by_id @@ -13,8 +14,8 @@ class IEEEXplore(Plugin): """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): evil_ids = [] # parse the pdf into a pdfminer document @@ -38,6 +39,9 @@ class IEEEXplore(Plugin): data = copy(obj.get_data()) if "Authorized licensed use limited to: " in data: + if verbose: + sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,)) + evil_ids.append(objid) for objid in evil_ids: diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index f983a2e..c183e38 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -2,9 +2,10 @@ from copy import copy +import sys + from ..parser import parse_content from ..eraser import ( - remove_object_by_id, replace_object_with, ) from ..plugin import Plugin @@ -32,8 +33,8 @@ class JSTOR(Plugin): "This content downloaded on", ] - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): replacements = [] # jstor has certain watermarks only on the first page @@ -54,8 +55,6 @@ class JSTOR(Plugin): if hasattr(obj, "attrs"): if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": - length = obj.attrs["Length"] - rawdata = copy(obj.rawdata) data = copy(obj.get_data()) # make sure all of the requirements are in there @@ -82,6 +81,10 @@ class JSTOR(Plugin): if page_id == 0 and "/F2 11 Tf\n" in better_content: startpos = better_content.rfind("/F2 11 Tf\n") endpos = better_content.find("Tf\n", startpos+5) + + if verbose: + sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, better_content[startpos:endpos],)) + better_content = better_content[0:startpos] + better_content[endpos:] replacements.append([objid, better_content]) diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins/spie.py index 2f8ea2e..7150267 100644 --- a/pdfparanoia/plugins/spie.py +++ b/pdfparanoia/plugins/spie.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- from copy import copy +import sys from ..parser import parse_content -from ..eraser import remove_object_by_id from ..plugin import Plugin class SPIE(Plugin): @@ -18,8 +18,8 @@ class SPIE(Plugin): """ - @staticmethod - def scrub(content): + @classmethod + def scrub(cls, content, verbose=False): evil_ids = [] # parse the pdf into a pdfminer document @@ -41,6 +41,8 @@ class SPIE(Plugin): data = copy(obj.get_data()) if "Downloaded From:" in data: + if verbose: + sys.stderr.write("%s: found object with %r; omitting..." % (cls.__name__, data)) evil_ids.append(objid) for objid in evil_ids: