ieee watermark removal

2025-06-14 03:52:02 +02:00 · 2013-02-05 04:49:56 -06:00 · 2013-02-05 04:49:56 -06:00 · 14f1439c76
commit 14f1439c76
parent 0adec6c74e
6 changed files with 73 additions and 4 deletions
--- a/pdfparanoia/init.py
+++ b/pdfparanoia/init.py
@ -7,7 +7,7 @@ pdfparanoia is a pdf watermark remover library for academic papers. Basic
 usage:
    >>> import pdfparanoia
-    >>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "r"))
+    >>> pdf = pdfparanoia.scrub(open("nmat.pdf", "r"))
    >>> file_handler = open("output.pdf", "w")
    >>> file_handler.write(pdf)
    >>> file_handler.close()
@ -17,8 +17,8 @@ usage:
 """
 __title__ = "pdfparanoia"
-__version__ = "0.0.7"
+__version__ = "0.0.8"
-__build__ = 0x000007
+__build__ = 0x000008
 __author__ = "Bryan Bishop <kanzure@gmail.com>"
 __license__ = "BSD"
 __copyright__ = "Copyright 2013 Bryan Bishop"
--- a/pdfparanoia/eraser.py
+++ b/pdfparanoia/eraser.py
@ -18,7 +18,7 @@ def remove_object_by_id(content, objid):
    for line in lines:
        if not skip_mode:
            if last_line in ["endobj", None]:
-                if line[-3:] == "obj":
+                if line[-3:] == "obj" or " obj<<" in line[0:50]:
                    if line.startswith(str(objid) + " "):
                        skip_mode = True
                        last_line = line
--- a/pdfparanoia/plugins/init.py
+++ b/pdfparanoia/plugins/init.py
@ -8,4 +8,5 @@ Scrubbing machines. Bubbles mandatory.
 """
 from .aip import *
 from .ieee import *
--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@ -0,0 +1,48 @@
 # -*- coding: utf-8 -*-
 from copy import copy
 from ..parser import parse_content
 from ..eraser import remove_object_by_id
 from ..plugin import Plugin
 class IEEEXplore(Plugin):
    """
    IEEE Xplore
    ~~~~~~~~~~~~~~~

    Scrubs the IEEE Xplore watermark: every FlateDecode stream object whose
    decoded data contains the text "Authorized licensed use limited to: "
    is removed from the pdf content.
    """

    @staticmethod
    def scrub(content):
        """
        Remove the watermark objects from ``content`` and return the result.

        :param content: raw contents of a pdf file, as accepted by
            ``parse_content`` and ``remove_object_by_id``.
        :return: ``content`` after ``remove_object_by_id`` has been applied
            for each object identified as a watermark; returned unchanged
            when no such object is found.
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        # NOTE(review): only the first xref section is inspected -- confirm
        # that this is sufficient for pdfs with several xref sections.
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # only objects that expose an attribute dictionary are candidates
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            # (``in`` replaces the deprecated ``dict.has_key``)
            if "Filter" not in obj.attrs:
                continue
            if str(obj.attrs["Filter"]) != "/FlateDecode":
                continue

            data = obj.get_data()
            if "Authorized licensed use limited to: " in data:
                evil_ids.append(objid)

        # removal happens after the scan so that the parsed document is not
        # consulted while ``content`` is being rewritten
        for objid in evil_ids:
            # parenthesised single argument: same output with the python 2
            # print statement and the python 3 print function
            print("evil id: " + str(objid))
            content = remove_object_by_id(content, objid)

        return content
--- a/tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
+++ b/tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
--- a/tests/test_ieee.py
+++ b/tests/test_ieee.py
@ -0,0 +1,20 @@
 # -*- coding: utf-8 -*-
 import unittest
 import pdfparanoia
 class IEEEXploreTestCase(unittest.TestCase):
    """Checks the IEEEXplore plugin against a sample pdf."""

    def test_ieee(self):
        """Scrubbing the sample must remove the listed watermark objects."""
        sample_path = "tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf"

        # the context manager closes the sample file even if reading fails
        with open(sample_path, "rb") as file_handler:
            content = file_handler.read()

        # sanity check: the sample contains the expected object headers
        self.assertIn("\n4 0 obj", content)
        self.assertIn("\n7 0 obj", content)

        output = pdfparanoia.plugins.IEEEXplore.scrub(content)

        # none of these object headers may remain in the scrubbed output
        for objid in (19, 37, 43, 53, 64, 73):
            self.assertNotIn("\n%d 0 obj" % objid, output)