add -v -v mode (verbosity level 2) which prints out the details of each matched object (potentially sensitive, potentially bulky)

remove spie, which appears to do nothing
2025-07-15 14:08:21 +02:00 · 2013-02-13 21:08:49 +00:00 · 2013-02-13 21:08:49 +00:00 · 503b8aead5
commit 503b8aead5
parent 9204b2e17e
6 changed files with 21 additions and 12 deletions
--- a/bin/pdfparanoia
+++ b/bin/pdfparanoia
@@ -13,13 +13,13 @@ if __name__ == "__main__":
    import fileinput
    from StringIO import StringIO

-    verbose = False
+    verbose = 0
    while '--verbose' in sys.argv:
-        verbose = True
+        verbose += 1
        sys.argv.pop(sys.argv.index('--verbose'))

    while '-v' in sys.argv:
-        verbose = True
+        verbose += 1
        sys.argv.pop(sys.argv.index('-v'))

    import pdfparanoia
--- a/pdfparanoia/plugins-broken/spie.py
+++ b/pdfparanoia/plugins-broken/spie.py
--- a/pdfparanoia/plugins/__init__.py
+++ b/pdfparanoia/plugins/__init__.py
@@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory.
 from .aip import *
 from .ieee import *
 from .jstor import *
-from .spie import *

--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@@ -18,7 +18,7 @@ class AmericanInstituteOfPhysics(Plugin):
    """

    @classmethod
-    def scrub(cls, content, verbose=False):
+    def scrub(cls, content, verbose=0):
        evil_ids = []

        # parse the pdf into a pdfminer document
@@ -46,7 +46,9 @@ class AmericanInstituteOfPhysics(Plugin):

                        phrase="Redistribution subject to AIP license or copyright"
                        if phrase in data:
-                            if verbose:
+                            if verbose >= 2:
+                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
+                            elif verbose >= 1:
                                sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))

                            evil_ids.append(objid)
--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@@ -15,7 +15,7 @@ class IEEEXplore(Plugin):
    """

    @classmethod
-    def scrub(cls, content, verbose=False):
+    def scrub(cls, content, verbose=0):
        evil_ids = []

        # parse the pdf into a pdfminer document
@@ -40,7 +40,9 @@ class IEEEXplore(Plugin):

                    phrase= "Authorized licensed use limited to: "
                    if phrase in data:
-                        if verbose:
+                        if verbose >= 2:
+                            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
+                        elif verbose >= 1:
                            sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))

                        evil_ids.append(objid)
--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@@ -34,7 +34,7 @@ class JSTOR(Plugin):
    ]

    @classmethod
-    def scrub(cls, content, verbose=False):
+    def scrub(cls, content, verbose=0):
        replacements = []

        # jstor has certain watermarks only on the first page
@@ -61,13 +61,13 @@ class JSTOR(Plugin):
                    if all([requirement in data for requirement in JSTOR.requirements]):
                        better_content = data

-                        if verbose:
-                            sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, cls.requirements))
-
                        # remove the date
                        startpos = better_content.find("This content downloaded ")
                        endpos = better_content.find(")", startpos)
                        segment = better_content[startpos:endpos]
+                        if verbose >= 2 and replacements:
+                            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
+
                        better_content = better_content.replace(segment, "")

                        # it looks like all of the watermarks are at the end?
@@ -85,12 +85,18 @@ class JSTOR(Plugin):
                            startpos = better_content.rfind("/F2 11 Tf\n")
                            endpos = better_content.find("Tf\n", startpos+5)

+                            if verbose >= 2 and replacements:
+                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
+
                            better_content = better_content[0:startpos] + better_content[endpos:]

                        replacements.append([objid, better_content])

                        page_id += 1

+        if verbose >= 1 and replacements:
+            sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
+
        for deets in replacements:
            objid = deets[0]
            replacement = deets[1]