Adding support for PDFs from pubs.rsc.org

2013-05-13 20:28:35 +01:00 · 2013-05-13 20:28:35 +01:00 · 18140d838d
parent 9d26a0aa01
commit 18140d838d
6 changed files with 99 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -47,10 +47,12 @@ cat input.pdf | pdfparanoia > output.pdf
 * AIP
 * IEEE
 * JSTOR
+* RSC
 * SPIE (sort of)

 ## Changelog

+* 0.0.13 - RSC
 * 0.0.12 - SPIE
 * 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot.
 * 0.0.10 - JSTOR
--- a/pdfparanoia/__init__.py
+++ b/pdfparanoia/__init__.py
@ -17,8 +17,8 @@ usage:
 """

 __title__ = "pdfparanoia"
-__version__ = "0.0.12"
-__build__ = 0x000012
+__version__ = "0.0.13"
+__build__ = 0x000013
 __author__ = "Bryan Bishop <kanzure@gmail.com>"
 __license__ = "BSD"
 __copyright__ = "Copyright 2013 Bryan Bishop"
--- a/pdfparanoia/plugins/__init__.py
+++ b/pdfparanoia/plugins/__init__.py
@ -10,4 +10,5 @@ Scrubbing machines. Bubbles mandatory.
 from .aip import *
 from .ieee import *
 from .jstor import *
+from .rsc import *

--- a/pdfparanoia/plugins/rsc.py
+++ b/pdfparanoia/plugins/rsc.py
@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+from copy import copy
+import sys
+from ..parser import parse_content
+from ..plugin import Plugin
+import base64
+
+class RoyalSocietyOfChemistry(Plugin):
+    """
+    RoyalSocietyOfChemistry
+    ~~~~~~~~~~~~~~~~~~~~~~~
+
+    RSC watermarks each PDF with a "Downloaded" date and the name
+    of the institution from which the PDF was downloaded.
+
+    Watermarks removed:
+        * "Downloaded by" watermark and timestamp on each page
+        * "Published on" watermark on the side of each page
+
+    This was primarily written for RSC PDFs from http://pubs.rsc.org
+    """
+
+    @classmethod
+    def scrub(cls, content, verbose=0):
+        """
+        Blank out every FlateDecode stream that contains a watermark phrase.
+
+        :param content: the PDF file data to scrub
+        :param verbose: 0 is silent; at 1 each matching object is reported
+            on stderr; at 2 the report also includes up to 1000 characters
+            of the decoded stream, starting at the matched phrase
+        :returns: ``content`` with the ``rawdata`` of each matching stream
+            replaced by an empty string
+        """
+        # Pairs of (raw stream data, replacement) applied once parsing is done
+        replacements = []
+
+        # List of watermark strings to remove
+        watermarks = [
+            "Downloaded by ",
+            "Downloaded on ",
+            "Published on ",
+            #"View Article Online",
+            #"Journal Homepage",
+            #"Table of Contents for this issue",
+        ]
+
+        # parse the pdf into a pdfminer document
+        pdf = parse_content(content)
+
+        # get a list of all object ids
+        # NOTE(review): only the first xref section is consulted -- confirm
+        # that this is enough for PDFs carrying several xref sections.
+        xrefs = pdf._parser.read_xref()
+        xref = xrefs[0]
+        objids = xref.get_objids()
+
+        # check each object in the pdf
+        for objid in objids:
+            # get an object by id
+            obj = pdf.getobj(objid)
+
+            # only objects carrying an attribute dictionary are candidates
+            if not hasattr(obj, "attrs"):
+                continue
+
+            # watermarks tend to be in FlateDecode elements
+            # (``in`` replaces dict.has_key(), which Python 3 removed)
+            attrs = obj.attrs
+            if "Filter" not in attrs or str(attrs["Filter"]) != "/FlateDecode":
+                continue
+
+            rawdata = copy(obj.rawdata)
+            data = copy(obj.get_data())
+
+            # Report every phrase found in the stream, but queue the stream
+            # for removal only once: replacing the same rawdata once per
+            # matching phrase would just repeat an identical substitution.
+            matched = False
+            for phrase in watermarks:
+                if phrase not in data:
+                    continue
+
+                matched = True
+                if verbose >= 2:
+                    start = data.index(phrase)
+                    sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[start:start + 1000]))
+                elif verbose >= 1:
+                    sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase))
+
+            if matched:
+                # We had a match so replace the watermark data with an empty string
+                replacements.append((rawdata, ""))
+
+        for rawdata, replacement in replacements:
+            # Directly replace the stream data in binary encoded object
+            content = content.replace(rawdata, replacement)
+
+        return content
+
+
--- a/tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf
+++ b/tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf
--- a/tests/test_rsc.py
+++ b/tests/test_rsc.py
@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import pdfparanoia
+
+class RoyalSocietyOfChemistryTestCase(unittest.TestCase):
+    """Smoke test: scrub a sample RSC PDF and check the output."""
+
+    def test_rsc(self):
+        # ``with`` guarantees the sample file is closed even if read() raises
+        with open("tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf", "rb") as file_handler:
+            content = file_handler.read()
+
+        # Check the PDF is from the RSC
+        self.assertIn("pubs.rsc.org", content)
+
+        output = pdfparanoia.plugins.RoyalSocietyOfChemistry.scrub(content)
+
+        # Check the PDF was output correctly and still
+        # contains the RSC url.
+        self.assertIn("pubs.rsc.org", output)
+