From 56cc7719dab683507d95a4ce5d38590f076255c1 Mon Sep 17 00:00:00 2001
From: Zooko O'Whielacronx <zooko@zooko.com>
Date: Wed, 13 Feb 2013 19:58:47 +0000
Subject: [PATCH 1/3] add a "--verbose" option that writes to stderr if it
 finds anything to omit

Also cleaned up some flakes noticed by pyflakes, and made scrub() a @classmethod instead of a @staticmethod so that the class name can be used in the verbose output.

caveats:

* there are no unit tests of this patch
* your stderr logs may now contain potentially sensitive information
* the implementation of arg parsing is very low-tech (a *good* way to do arg parsing is the "argparse" module)
---
 bin/pdfparanoia              | 11 ++++++++++-
 pdfparanoia/core.py          |  4 ++--
 pdfparanoia/plugin.py        |  4 ++--
 pdfparanoia/plugins/aip.py   |  9 +++++++--
 pdfparanoia/plugins/ieee.py  |  8 ++++++--
 pdfparanoia/plugins/jstor.py | 13 ++++++++-----
 pdfparanoia/plugins/spie.py  |  8 +++++---
 7 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/bin/pdfparanoia b/bin/pdfparanoia
index 2705bc9..122e59a 100755
--- a/bin/pdfparanoia
+++ b/bin/pdfparanoia
@@ -13,6 +13,15 @@ if __name__ == "__main__":
     import fileinput
     from StringIO import StringIO
 
+    verbose = False
+    while '--verbose' in sys.argv:
+        verbose = True
+        sys.argv.pop(sys.argv.index('--verbose'))
+
+    while '-v' in sys.argv:
+        verbose = True
+        sys.argv.pop(sys.argv.index('-v'))
+
     import pdfparanoia
 
     # read in all lines
@@ -21,7 +30,7 @@ if __name__ == "__main__":
         content += line
 
     # scrub the pdf to get rid of watermarks
-    output = pdfparanoia.scrub(StringIO(content))
+    output = pdfparanoia.scrub(StringIO(content), verbose=verbose)
 
     # dump to output
     sys.stdout.write(output)
diff --git a/pdfparanoia/core.py b/pdfparanoia/core.py
index 800c52e..1cfc0f2 100644
--- a/pdfparanoia/core.py
+++ b/pdfparanoia/core.py
@@ -32,7 +32,7 @@ def find_plugins():
     plugins = [each[1] for each in plugins]
     return plugins
 
-def scrub(obj):
+def scrub(obj, verbose=False):
     """
     Removes watermarks from a pdf and returns the resulting pdf as a string.
     """
@@ -50,7 +50,7 @@ def scrub(obj):
 
     # clean this pdf as much as possible
     for plugin in plugins:
-        content = plugin.scrub(content)
+        content = plugin.scrub(content, verbose=verbose)
 
     return content
 
diff --git a/pdfparanoia/plugin.py b/pdfparanoia/plugin.py
index 867ee48..20c28bf 100644
--- a/pdfparanoia/plugin.py
+++ b/pdfparanoia/plugin.py
@@ -8,8 +8,8 @@ Defines how plugins work.
 """
 
 class Plugin:
-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=False):
         """
         Removes watermarks from the given pdf.
         """
diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py
index 20457f2..145f67a 100644
--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 
+import sys
+
 from copy import copy
 
 from ..parser import parse_content
@@ -15,8 +17,8 @@ class AmericanInstituteOfPhysics(Plugin):
     attached for whatever reason.
     """
 
-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=False):
         evil_ids = []
 
         # parse the pdf into a pdfminer document
@@ -43,6 +45,9 @@ class AmericanInstituteOfPhysics(Plugin):
                         data = copy(obj.get_data())
 
                         if "Redistribution subject to AIP license or copyright" in data:
+                            if verbose:
+                                sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,))
+
                             evil_ids.append(objid)
 
         for objid in evil_ids:
diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py
index b142ac0..4c04636 100644
--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from copy import copy
+import sys
 
 from ..parser import parse_content
 from ..eraser import remove_object_by_id
@@ -13,8 +14,8 @@ class IEEEXplore(Plugin):
 
     """
 
-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=False):
         evil_ids = []
 
         # parse the pdf into a pdfminer document
@@ -38,6 +39,9 @@ class IEEEXplore(Plugin):
                     data = copy(obj.get_data())
 
                     if "Authorized licensed use limited to: " in data:
+                        if verbose:
+                            sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,))
+
                         evil_ids.append(objid)
 
         for objid in evil_ids:
diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py
index f983a2e..c183e38 100644
--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@@ -2,9 +2,10 @@
 
 from copy import copy
 
+import sys
+
 from ..parser import parse_content
 from ..eraser import (
-    remove_object_by_id,
     replace_object_with,
 )
 from ..plugin import Plugin
@@ -32,8 +33,8 @@ class JSTOR(Plugin):
         "This content downloaded  on",
     ]
 
-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=False):
         replacements = []
 
         # jstor has certain watermarks only on the first page
@@ -54,8 +55,6 @@ class JSTOR(Plugin):
 
             if hasattr(obj, "attrs"):
                 if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
-                    length = obj.attrs["Length"]
-                    rawdata = copy(obj.rawdata)
                     data = copy(obj.get_data())
 
                     # make sure all of the requirements are in there
@@ -82,6 +81,10 @@ class JSTOR(Plugin):
                         if page_id == 0 and "/F2 11 Tf\n" in better_content:
                             startpos = better_content.rfind("/F2 11 Tf\n")
                             endpos = better_content.find("Tf\n", startpos+5)
+
+                            if verbose:
+                                sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, better_content[startpos:endpos],))
+
                             better_content = better_content[0:startpos] + better_content[endpos:]
 
                         replacements.append([objid, better_content])
diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins/spie.py
index 2f8ea2e..7150267 100644
--- a/pdfparanoia/plugins/spie.py
+++ b/pdfparanoia/plugins/spie.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 
 from copy import copy
+import sys
 
 from ..parser import parse_content
-from ..eraser import remove_object_by_id
 from ..plugin import Plugin
 
 class SPIE(Plugin):
@@ -18,8 +18,8 @@ class SPIE(Plugin):
 
     """
 
-    @staticmethod
-    def scrub(content):
+    @classmethod
+    def scrub(cls, content, verbose=False):
         evil_ids = []
 
         # parse the pdf into a pdfminer document
@@ -41,6 +41,8 @@ class SPIE(Plugin):
                     data = copy(obj.get_data())
 
                     if "Downloaded From:" in data:
+                        if verbose:
+                            sys.stderr.write("%s: found object with %r; omitting..." % (cls.__name__, data))
                         evil_ids.append(objid)
 
         for objid in evil_ids:

From 9204b2e17e25c7912bb2f386ced743324b4afad7 Mon Sep 17 00:00:00 2001
From: Zooko O'Whielacronx <zooko@zooko.com>
Date: Wed, 13 Feb 2013 20:56:33 +0000
Subject: [PATCH 2/3] fix up verbose printouts, don't print out large data

---
 pdfparanoia/plugins/aip.py   | 5 +++--
 pdfparanoia/plugins/ieee.py  | 5 +++--
 pdfparanoia/plugins/jstor.py | 6 +++---
 pdfparanoia/plugins/spie.py  | 5 +++--
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py
index 145f67a..685b545 100644
--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@@ -44,9 +44,10 @@ class AmericanInstituteOfPhysics(Plugin):
                         #rawdata = copy(obj.rawdata)
                         data = copy(obj.get_data())
 
-                        if "Redistribution subject to AIP license or copyright" in data:
+                        phrase="Redistribution subject to AIP license or copyright"
+                        if phrase in data:
                             if verbose:
-                                sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,))
+                                sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
 
                             evil_ids.append(objid)
 
diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py
index 4c04636..847b1d0 100644
--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@@ -38,9 +38,10 @@ class IEEEXplore(Plugin):
                     #rawdata = copy(obj.rawdata)
                     data = copy(obj.get_data())
 
-                    if "Authorized licensed use limited to: " in data:
+                    phrase= "Authorized licensed use limited to: "
+                    if phrase in data:
                         if verbose:
-                            sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,))
+                            sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
 
                         evil_ids.append(objid)
 
diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py
index c183e38..d368fee 100644
--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@@ -61,6 +61,9 @@ class JSTOR(Plugin):
                     if all([requirement in data for requirement in JSTOR.requirements]):
                         better_content = data
 
+                        if verbose:
+                            sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, cls.requirements))
+
                         # remove the date
                         startpos = better_content.find("This content downloaded ")
                         endpos = better_content.find(")", startpos)
@@ -82,9 +85,6 @@ class JSTOR(Plugin):
                             startpos = better_content.rfind("/F2 11 Tf\n")
                             endpos = better_content.find("Tf\n", startpos+5)
 
-                            if verbose:
-                                sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, better_content[startpos:endpos],))
-
                             better_content = better_content[0:startpos] + better_content[endpos:]
 
                         replacements.append([objid, better_content])
diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins/spie.py
index 7150267..8d49c11 100644
--- a/pdfparanoia/plugins/spie.py
+++ b/pdfparanoia/plugins/spie.py
@@ -40,9 +40,10 @@ class SPIE(Plugin):
                 if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
                     data = copy(obj.get_data())
 
-                    if "Downloaded From:" in data:
+                    phrase="Downloaded From:"
+                    if phrase in data:
                         if verbose:
-                            sys.stderr.write("%s: found object with %r; omitting..." % (cls.__name__, data))
+                            sys.stderr.write("%s: found object %s with %r; omitting..." % (cls.__name__, objid, phrase))
                         evil_ids.append(objid)
 
         for objid in evil_ids:

From 503b8aead5dfc899f2b2944af6b0d789f211a698 Mon Sep 17 00:00:00 2001
From: Zooko O'Whielacronx <zooko@zooko.com>
Date: Wed, 13 Feb 2013 21:08:49 +0000
Subject: [PATCH 3/3] add -v -v mode which prints out the details (potentially
 sensitive, potentially bulky)

disable the spie plugin, which appears to do nothing: move it to
plugins-broken/ and stop importing it from plugins/__init__.py
---
 bin/pdfparanoia                                 |  6 +++---
 pdfparanoia/{plugins => plugins-broken}/spie.py |  0
 pdfparanoia/plugins/__init__.py                 |  1 -
 pdfparanoia/plugins/aip.py                      |  6 ++++--
 pdfparanoia/plugins/ieee.py                     |  6 ++++--
 pdfparanoia/plugins/jstor.py                    | 14 ++++++++++----
 6 files changed, 21 insertions(+), 12 deletions(-)
 rename pdfparanoia/{plugins => plugins-broken}/spie.py (100%)

diff --git a/bin/pdfparanoia b/bin/pdfparanoia
index 122e59a..749fa96 100755
--- a/bin/pdfparanoia
+++ b/bin/pdfparanoia
@@ -13,13 +13,13 @@ if __name__ == "__main__":
     import fileinput
     from StringIO import StringIO
 
-    verbose = False
+    verbose = 0
     while '--verbose' in sys.argv:
-        verbose = True
+        verbose += 1
         sys.argv.pop(sys.argv.index('--verbose'))
 
     while '-v' in sys.argv:
-        verbose = True
+        verbose += 1
         sys.argv.pop(sys.argv.index('-v'))
 
     import pdfparanoia
diff --git a/pdfparanoia/plugins/spie.py b/pdfparanoia/plugins-broken/spie.py
similarity index 100%
rename from pdfparanoia/plugins/spie.py
rename to pdfparanoia/plugins-broken/spie.py
diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py
index 93a425b..10179eb 100644
--- a/pdfparanoia/plugins/__init__.py
+++ b/pdfparanoia/plugins/__init__.py
@@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory.
 from .aip import *
 from .ieee import *
 from .jstor import *
-from .spie import *
 
diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py
index 685b545..d9d995d 100644
--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@@ -18,7 +18,7 @@ class AmericanInstituteOfPhysics(Plugin):
     """
 
     @classmethod
-    def scrub(cls, content, verbose=False):
+    def scrub(cls, content, verbose=0):
         evil_ids = []
 
         # parse the pdf into a pdfminer document
@@ -46,7 +46,9 @@ class AmericanInstituteOfPhysics(Plugin):
 
                         phrase="Redistribution subject to AIP license or copyright"
                         if phrase in data:
-                            if verbose:
+                            if verbose >= 2:
+                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
+                            elif verbose >= 1:
                                 sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
 
                             evil_ids.append(objid)
diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py
index 847b1d0..0a8691b 100644
--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@@ -15,7 +15,7 @@ class IEEEXplore(Plugin):
     """
 
     @classmethod
-    def scrub(cls, content, verbose=False):
+    def scrub(cls, content, verbose=0):
         evil_ids = []
 
         # parse the pdf into a pdfminer document
@@ -40,7 +40,9 @@ class IEEEXplore(Plugin):
 
                     phrase= "Authorized licensed use limited to: "
                     if phrase in data:
-                        if verbose:
+                        if verbose >= 2:
+                            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
+                        elif verbose >= 1:
                             sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
 
                         evil_ids.append(objid)
diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py
index d368fee..0ca971d 100644
--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@@ -34,7 +34,7 @@ class JSTOR(Plugin):
     ]
 
     @classmethod
-    def scrub(cls, content, verbose=False):
+    def scrub(cls, content, verbose=0):
         replacements = []
 
         # jstor has certain watermarks only on the first page
@@ -61,13 +61,13 @@ class JSTOR(Plugin):
                     if all([requirement in data for requirement in JSTOR.requirements]):
                         better_content = data
 
-                        if verbose:
-                            sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, cls.requirements))
-
                         # remove the date
                         startpos = better_content.find("This content downloaded ")
                         endpos = better_content.find(")", startpos)
                         segment = better_content[startpos:endpos]
+                        if verbose >= 2 and replacements:
+                            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
+
                         better_content = better_content.replace(segment, "")
 
                         # it looks like all of the watermarks are at the end?
@@ -85,12 +85,18 @@ class JSTOR(Plugin):
                             startpos = better_content.rfind("/F2 11 Tf\n")
                             endpos = better_content.find("Tf\n", startpos+5)
 
+                            if verbose >= 2 and replacements:
+                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
+
                             better_content = better_content[0:startpos] + better_content[endpos:]
 
                         replacements.append([objid, better_content])
 
                         page_id += 1
 
+        if verbose >= 1 and replacements:
+            sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
+
         for deets in replacements:
             objid = deets[0]
             replacement = deets[1]