Merge pull request #21 from zooko/verbose-option

Verbosity argument.
2025-04-13 10:02:05 +02:00 · 2013-02-14 01:39:19 -08:00 · 2013-02-14 01:39:19 -08:00 · ee483ab986
commit ee483ab986
parent caed396870 503b8aead5
8 changed files with 56 additions and 21 deletions
--- a/bin/pdfparanoia
+++ b/bin/pdfparanoia
@@ -13,6 +13,15 @@ if __name__ == "__main__":
    import fileinput
    from StringIO import StringIO

+    verbose = 0
+    while '--verbose' in sys.argv:
+        verbose += 1
+        sys.argv.pop(sys.argv.index('--verbose'))
+
+    while '-v' in sys.argv:
+        verbose += 1
+        sys.argv.pop(sys.argv.index('-v'))
+
    import pdfparanoia

    # read in all lines
@@ -21,7 +30,7 @@ if __name__ == "__main__":
        content += line

    # scrub the pdf to get rid of watermarks
-    output = pdfparanoia.scrub(StringIO(content))
+    output = pdfparanoia.scrub(StringIO(content), verbose=verbose)

    # dump to output
    sys.stdout.write(output)
--- a/pdfparanoia/core.py
+++ b/pdfparanoia/core.py
@@ -32,7 +32,7 @@ def find_plugins():
    plugins = [each[1] for each in plugins]
    return plugins

-def scrub(obj):
+def scrub(obj, verbose=False):
    """
    Removes watermarks from a pdf and returns the resulting pdf as a string.
    """
@@ -50,7 +50,7 @@ def scrub(obj):

    # clean this pdf as much as possible
    for plugin in plugins:
-        content = plugin.scrub(content)
+        content = plugin.scrub(content, verbose=verbose)

    return content

--- a/pdfparanoia/plugin.py
+++ b/pdfparanoia/plugin.py
@@ -8,8 +8,8 @@ Defines how plugins work.
 """

 class Plugin:
-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=False):
        """
        Removes watermarks from the given pdf.
        """
--- a/pdfparanoia/plugins-broken/spie.py
+++ b/pdfparanoia/plugins-broken/spie.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-

 from copy import copy
+import sys

 from ..parser import parse_content
-from ..eraser import remove_object_by_id
 from ..plugin import Plugin

 class SPIE(Plugin):
@@ -18,8 +18,8 @@ class SPIE(Plugin):

    """

-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=False):
        evil_ids = []

        # parse the pdf into a pdfminer document
@@ -40,7 +40,10 @@ class SPIE(Plugin):
                if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
                    data = copy(obj.get_data())

-                    if "Downloaded From:" in data:
+                    phrase="Downloaded From:"
+                    if phrase in data:
+                        if verbose:
+                            sys.stderr.write("%s: found object %s with %r; omitting..." % (cls.__name__, objid, phrase))
                        evil_ids.append(objid)

        for objid in evil_ids:
--- a/pdfparanoia/plugins/__init__.py
+++ b/pdfparanoia/plugins/__init__.py
@@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory.
 from .aip import *
 from .ieee import *
 from .jstor import *
-from .spie import *

--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-

+import sys
+
 from copy import copy

 from ..parser import parse_content
@@ -15,8 +17,8 @@ class AmericanInstituteOfPhysics(Plugin):
    attached for whatever reason.
    """

-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=0):
        evil_ids = []

        # parse the pdf into a pdfminer document
@@ -42,7 +44,13 @@ class AmericanInstituteOfPhysics(Plugin):
                        #rawdata = copy(obj.rawdata)
                        data = copy(obj.get_data())

-                        if "Redistribution subject to AIP license or copyright" in data:
+                        phrase="Redistribution subject to AIP license or copyright"
+                        if phrase in data:
+                            if verbose >= 2:
+                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
+                            elif verbose >= 1:
+                                sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
+
                            evil_ids.append(objid)

        for objid in evil_ids:
--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-

 from copy import copy
+import sys

 from ..parser import parse_content
 from ..eraser import remove_object_by_id
@@ -13,8 +14,8 @@ class IEEEXplore(Plugin):

    """

-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=0):
        evil_ids = []

        # parse the pdf into a pdfminer document
@@ -37,7 +38,13 @@ class IEEEXplore(Plugin):
                    #rawdata = copy(obj.rawdata)
                    data = copy(obj.get_data())

-                    if "Authorized licensed use limited to: " in data:
+                    phrase= "Authorized licensed use limited to: "
+                    if phrase in data:
+                        if verbose >= 2:
+                            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
+                        elif verbose >= 1:
+                            sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
+
                        evil_ids.append(objid)

        for objid in evil_ids:
--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@@ -2,9 +2,10 @@

 from copy import copy

+import sys
+
 from ..parser import parse_content
 from ..eraser import (
-    remove_object_by_id,
    replace_object_with,
 )
 from ..plugin import Plugin
@@ -32,8 +33,8 @@ class JSTOR(Plugin):
        "This content downloaded  on",
    ]

-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=0):
        replacements = []

        # jstor has certain watermarks only on the first page
@@ -54,8 +55,6 @@ class JSTOR(Plugin):

            if hasattr(obj, "attrs"):
                if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
-                    length = obj.attrs["Length"]
-                    rawdata = copy(obj.rawdata)
                    data = copy(obj.get_data())

                    # make sure all of the requirements are in there
@@ -66,6 +65,9 @@ class JSTOR(Plugin):
                        startpos = better_content.find("This content downloaded ")
                        endpos = better_content.find(")", startpos)
                        segment = better_content[startpos:endpos]
+                        if verbose >= 2 and replacements:
+                            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
+
                        better_content = better_content.replace(segment, "")

                        # it looks like all of the watermarks are at the end?
@@ -82,12 +84,19 @@ class JSTOR(Plugin):
                        if page_id == 0 and "/F2 11 Tf\n" in better_content:
                            startpos = better_content.rfind("/F2 11 Tf\n")
                            endpos = better_content.find("Tf\n", startpos+5)
+
+                            if verbose >= 2 and replacements:
+                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
+
                            better_content = better_content[0:startpos] + better_content[endpos:]

                        replacements.append([objid, better_content])

                        page_id += 1

+        if verbose >= 1 and replacements:
+            sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
+
        for deets in replacements:
            objid = deets[0]
            replacement = deets[1]