ieee watermark removal

2025-07-04 20:37:38 +02:00 · 2013-02-05 04:49:56 -06:00 · 2013-02-05 04:49:56 -06:00 · 14f1439c76
commit 14f1439c76
parent 0adec6c74e
6 changed files with 73 additions and 4 deletions
--- a/pdfparanoia/__init__.py
+++ b/pdfparanoia/__init__.py
@@ -7,7 +7,7 @@ pdfparanoia is a pdf watermark remover library for academic papers. Basic
 usage:

    >>> import pdfparanoia
-    >>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "r"))
+    >>> pdf = pdfparanoia.scrub(open("nmat.pdf", "r"))
    >>> file_handler = open("output.pdf", "w")
    >>> file_handler.write(pdf)
    >>> file_handler.close()
@@ -17,8 +17,8 @@ usage:
 """

 __title__ = "pdfparanoia"
-__version__ = "0.0.7"
-__build__ = 0x000007
+__version__ = "0.0.8"
+__build__ = 0x000008
 __author__ = "Bryan Bishop <kanzure@gmail.com>"
 __license__ = "BSD"
 __copyright__ = "Copyright 2013 Bryan Bishop"
--- a/pdfparanoia/eraser.py
+++ b/pdfparanoia/eraser.py
@@ -18,7 +18,7 @@ def remove_object_by_id(content, objid):
    for line in lines:
        if not skip_mode:
            if last_line in ["endobj", None]:
-                if line[-3:] == "obj":
+                if line[-3:] == "obj" or " obj<<" in line[0:50]:
                    if line.startswith(str(objid) + " "):
                        skip_mode = True
                        last_line = line
--- a/pdfparanoia/plugins/__init__.py
+++ b/pdfparanoia/plugins/__init__.py
@@ -8,4 +8,5 @@ Scrubbing machines. Bubbles mandatory.
 """

 from .aip import *
+from .ieee import *

--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+
+from copy import copy
+
+from ..parser import parse_content
+from ..eraser import remove_object_by_id
+from ..plugin import Plugin
+
+class IEEEXplore(Plugin):
+    """
+    IEEE Xplore
+    ~~~~~~~~~~~~~~~
+
+    Removes pdf objects whose data contains the
+    "Authorized licensed use limited to: " watermark text.
+    """
+
+    @staticmethod
+    def scrub(content):
+        """
+        Return content (raw pdf data) with every watermark object removed.
+        """
+        # parse the pdf into a pdfminer document
+        pdf = parse_content(content)
+
+        # get a list of all object ids
+        # NOTE(review): only the first xref section is consulted -- confirm
+        # that is enough for pdfs with several xref sections
+        objids = pdf._parser.read_xref()[0].get_objids()
+
+        evil_ids = []
+        for objid in objids:
+            obj = pdf.getobj(objid)
+            # only objects exposing "attrs" are inspected
+            if not hasattr(obj, "attrs"):
+                continue
+
+            # watermarks tend to be in FlateDecode elements
+            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
+                if "Authorized licensed use limited to: " in obj.get_data():
+                    evil_ids.append(objid)
+
+        for objid in evil_ids:
+            print("evil id: " + str(objid))
+            content = remove_object_by_id(content, objid)
+
+        return content
+
--- a/tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
+++ b/tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
--- a/tests/test_ieee.py
+++ b/tests/test_ieee.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import pdfparanoia
+
+class IEEEXploreTestCase(unittest.TestCase):
+    """Checks IEEEXplore.scrub against a sample pdf from tests/samples/ieee."""
+
+    def test_ieee(self):
+        # the with-block closes the sample even if reading fails
+        with open("tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf", "rb") as file_handler:
+            content = file_handler.read()
+        self.assertIn("\n4 0 obj", content)
+        self.assertIn("\n7 0 obj", content)
+
+        # these object ids must no longer appear once the sample is scrubbed
+        output = pdfparanoia.plugins.IEEEXplore.scrub(content)
+        for objid in (19, 37, 43, 53, 64, 73):
+            self.assertNotIn("\n%d 0 obj" % objid, output)
+