jstor watermark removal

fixes #1
2024-12-04 23:15:52 +01:00 · 2013-02-06 17:31:19 -06:00 · 2013-02-06 17:31:19 -06:00 · b7b5a4ef65
commit b7b5a4ef65
parent 47bc734318
6 changed files with 128 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -28,11 +28,17 @@ file_handler.write(pdf)
 file_handler.close()
 ```

+## Supported
+
+* AIP
+* IEEE
+* JSTOR
+
 ## Changelog

+* 0.0.10 - JSTOR
 * 0.0.9 - AIP: better checks for false-positives; IEEE: remove stdout garbage.
-* 0.0.8 - ieee support
-* 0.0.1 - initial commit
+* 0.0.8 - IEEE

 ## License

--- a/pdfparanoia/__init__.py
+++ b/pdfparanoia/__init__.py
@@ -17,8 +17,8 @@ usage:
 """

 __title__ = "pdfparanoia"
-__version__ = "0.0.9"
-__build__ = 0x000009
+__version__ = "0.0.10"
+__build__ = 0x000010
 __author__ = "Bryan Bishop <kanzure@gmail.com>"
 __license__ = "BSD"
 __copyright__ = "Copyright 2013 Bryan Bishop"
--- a/pdfparanoia/plugins/__init__.py
+++ b/pdfparanoia/plugins/__init__.py
@@ -9,4 +9,5 @@ Scrubbing machines. Bubbles mandatory.

 from .aip import *
 from .ieee import *
+from .jstor import *

--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+from copy import copy
+
+from ..parser import parse_content
+from ..eraser import (
+    remove_object_by_id,
+    replace_object_with,
+)
+from ..plugin import Plugin
+
+class JSTOR(Plugin):
+    r"""
+    JSTOR
+    ~~~~~~~~~~~~~~~
+
+    JSTOR watermarks a first page with an "Accessed" date, lots of TC barf, and
+    then also a watermark at the bottom of each page with a timestamp.
+
+    Watermarks removed:
+        * "Accessed" timestamp on the front page
+        * footer watermarks on each page
+
+    This was primarily written for JSTOR pdfs generated by:
+         /Producer (itext-paulo-155 \(itextpdf.sf.net-lowagie.com\))
+    """
+
+    # a stream must contain every one of these terms to count as watermarked
+    requirements = [
+        "All use subject to ",
+        "JSTOR Terms and Conditions",
+        "This content downloaded  on",
+    ]
+
+    @staticmethod
+    def scrub(content):
+        """
+        Strip JSTOR watermarks from a pdf and return the modified pdf.
+
+        Each matching FlateDecode stream is swapped, via replace_object_with,
+        for its scrubbed decoded data.
+        """
+        replacements = []
+
+        # jstor has certain watermarks only on the first page
+        page_id = 0
+
+        # parse the pdf into a pdfminer document
+        pdf = parse_content(content)
+
+        # get a list of all object ids (first xref section only)
+        objids = pdf._parser.read_xref()[0].get_objids()
+
+        for objid in objids:
+            obj = pdf.getobj(objid)
+            # only FlateDecode-filtered objects are examined
+            if not hasattr(obj, "attrs") or "Filter" not in obj.attrs:
+                continue
+            if str(obj.attrs["Filter"]) != "/FlateDecode":
+                continue
+
+            better_content = obj.get_data()
+
+            # make sure all of the requirements are in there
+            if not all(term in better_content for term in JSTOR.requirements):
+                continue
+
+            # remove the date: everything from the marker up to the next ")".
+            # Without a ")" find() gives -1 and the slice would swallow nearly
+            # the whole stream, so leave the text alone in that case.
+            startpos = better_content.find("This content downloaded ")
+            endpos = better_content.find(")", startpos)
+            if endpos != -1:
+                segment = better_content[startpos:endpos]
+                better_content = better_content.replace(segment, "")
+
+            # it looks like all of the watermarks are at the end?
+            better_content = better_content[:-160]
+
+            # "Accessed on dd/mm/yyyy hh:mm" is only on the first page and is
+            # set in /F2. This would be better if it could be decoded to
+            # actually search for the "Accessed" text.
+            # NOTE(review): the search from startpos+5 matches the "Tf\n" of
+            # this same marker, so only "/F2 11 " is cut -- confirm intent.
+            if page_id == 0 and "/F2 11 Tf\n" in better_content:
+                startpos = better_content.rfind("/F2 11 Tf\n")
+                endpos = better_content.find("Tf\n", startpos + 5)
+                better_content = better_content[:startpos] + better_content[endpos:]
+
+            replacements.append((objid, better_content))
+            page_id += 1
+
+        for objid, replacement in replacements:
+            content = replace_object_with(content, objid, replacement)
+
+        return content
+
--- a/tests/samples/jstor/231a515256115368c142f528cee7f727.pdf
+++ b/tests/samples/jstor/231a515256115368c142f528cee7f727.pdf
--- a/tests/test_jstor.py
+++ b/tests/test_jstor.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import pdfparanoia
+
+class JSTORTestCase(unittest.TestCase):
+    """Regression test for the JSTOR watermark scrubber."""
+    def test_jstor(self):
+        """Object 19 of the sample is rewritten as an unfiltered stream."""
+        path = "tests/samples/jstor/231a515256115368c142f528cee7f727.pdf"
+        with open(path, "rb") as file_handler:  # closed even if read() raises
+            content = file_handler.read()
+        # objects 18 and 19 exist in the input; 19 is the one rewritten
+        self.assertIn("\n18 0 obj \n", content)
+        self.assertIn("\n19 0 obj \n", content)
+
+        output = pdfparanoia.plugins.JSTOR.scrub(content)
+        # FlateDecode should be replaced with a decompressed section
+        self.assertIn("\n19 0 obj\n<</Length 2862>>stream", output)
+