SPIE watermark removal

This is slightly broken because the SPIE plugin removes more than just watermarks. For some reason, erasing the watermark also seems to remove images and large blocks of text from the paper, even though the object being removed is tiny. In the unit-test sample, the removed object is PDF stream 55. For now, the SPIE plugin is partially disabled (watermark objects are detected but not erased) until this is fixed. The problem does not originate from the other plugins. Fixes #20.
2025-07-04 04:17:34 +02:00 · 2013-02-11 23:52:59 -06:00 · 2013-02-11 23:52:59 -06:00 · caed396870
commit caed396870
parent 9d7fd1dbb6
7 changed files with 74 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -45,9 +45,11 @@ cat input.pdf | pdfparanoia > output.pdf
 * AIP
 * IEEE
 * JSTOR
+* SPIE (sort of)

 ## Changelog

+* 0.0.12 - SPIE
 * 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot.
 * 0.0.10 - JSTOR
 * 0.0.9 - AIP: better checks for false-positives; IEEE: remove stdout garbage.
--- a/pdfparanoia/init.py
+++ b/pdfparanoia/init.py
@ -17,8 +17,8 @@ usage:
 """

 __title__ = "pdfparanoia"
-__version__ = "0.0.11"
-__build__ = 0x000011
+__version__ = "0.0.12"
+__build__ = 0x000012
 __author__ = "Bryan Bishop <kanzure@gmail.com>"
 __license__ = "BSD"
 __copyright__ = "Copyright 2013 Bryan Bishop"
--- a/pdfparanoia/eraser.py
+++ b/pdfparanoia/eraser.py
@ -19,6 +19,9 @@ def manipulate_pdf(content, objid, callback, *args):
    last_line = None
    skip_mode = False
    for line in lines:
+        if line == "":
+            outlines.append("")
+            continue
        if not skip_mode:
            if last_line in ["endobj", "endobj ", None]:
                if line[-3:] == "obj" or line[-4:] == "obj " or " obj <<" in line[0:50] or " obj<<" in line[0:50]:
--- a/pdfparanoia/plugins/init.py
+++ b/pdfparanoia/plugins/init.py
@ -10,4 +10,5 @@ Scrubbing machines. Bubbles mandatory.
 from .aip import *
 from .ieee import *
 from .jstor import *
+from .spie import *

--- a/pdfparanoia/plugins/spie.py
+++ b/pdfparanoia/plugins/spie.py
@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+from copy import copy
+
+from ..parser import parse_content
+from ..eraser import remove_object_by_id
+from ..plugin import Plugin
+
+class SPIE(Plugin):
+    """
+    Society of Photo-Optical Instrumentation Engineers
+    ~~~~~~~~~~~~~~~
+
+    These watermarks are shown on each page, but are only defined in one place.
+    Erasing the watermark object currently also makes images and blocks of
+    text disappear from the document; the other pdfparanoia plugins are not
+    the cause. Removal stays disabled until these side effects are understood.
+
+    """
+
+    @staticmethod
+    def scrub(content):
+        evil_ids = []
+
+        # parse the pdf into a pdfminer document
+        pdf = parse_content(content)
+
+        # get a list of all object ids
+        xrefs = pdf._parser.read_xref()
+        xref = xrefs[0]
+        objids = xref.get_objids()
+
+        # check each object in the pdf
+        for objid in objids:
+            # get an object by id
+            obj = pdf.getobj(objid)
+
+            if hasattr(obj, "attrs"):
+                # watermarks tend to be in FlateDecode elements
+                if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
+                    data = copy(obj.get_data())
+
+                    if "Downloaded From:" in data:
+                        evil_ids.append(objid)
+
+        for objid in evil_ids:
+            # for some reason SPIE pdfs are broken by this, images are randomly removed
+            #content = remove_object_by_id(content, objid)
+            continue
+
+        return content
+
--- a/tests/samples/spie/266c86e6f47e39415584450f5a3af4d0.pdf
+++ b/tests/samples/spie/266c86e6f47e39415584450f5a3af4d0.pdf
--- a/tests/test_spie.py
+++ b/tests/test_spie.py
@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import pdfparanoia
+
+class SPIETestCase(unittest.TestCase):
+    def test_spie(self):
+        file_handler = open("tests/samples/spie/266c86e6f47e39415584450f5a3af4d0.pdf", "rb")
+        content = file_handler.read()
+        self.assertIn("\n46 0 obj", content)
+
+        output = pdfparanoia.plugins.SPIE.scrub(content)
+        self.assertNotIn("\n55 0 obj", output)
+