diff --git a/README.md b/README.md
index 96819d7..6fe9c95 100644
--- a/README.md
+++ b/README.md
@@ -47,10 +47,12 @@ cat input.pdf | pdfparanoia > output.pdf
 * AIP
 * IEEE
 * JSTOR
+* RSC
 * SPIE (sort of)
 
 ## Changelog
 
+* 0.0.13 - RSC
 * 0.0.12 - SPIE
 * 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot.
 * 0.0.10 - JSTOR
diff --git a/pdfparanoia/__init__.py b/pdfparanoia/__init__.py
index 8194a4a..37e4978 100644
--- a/pdfparanoia/__init__.py
+++ b/pdfparanoia/__init__.py
@@ -17,8 +17,8 @@ usage:
 """
 
 __title__ = "pdfparanoia"
-__version__ = "0.0.12"
-__build__ = 0x000012
+__version__ = "0.0.13"
+__build__ = 0x000013
 __author__ = "Bryan Bishop "
 __license__ = "BSD"
 __copyright__ = "Copyright 2013 Bryan Bishop"
diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py
index 10179eb..2717682 100644
--- a/pdfparanoia/plugins/__init__.py
+++ b/pdfparanoia/plugins/__init__.py
@@ -10,4 +10,5 @@ Scrubbing machines. Bubbles mandatory.
 from .aip import *
 from .ieee import *
 from .jstor import *
+from .rsc import *
 
diff --git a/pdfparanoia/plugins/rsc.py b/pdfparanoia/plugins/rsc.py
new file mode 100644
index 0000000..f07f876
--- /dev/null
+++ b/pdfparanoia/plugins/rsc.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+
+from copy import copy
+import sys
+from ..parser import parse_content
+from ..plugin import Plugin
+import base64
+
+class RoyalSocietyOfChemistry(Plugin):
+    """
+    RoyalSocietyOfChemistry
+    ~~~~~~~~~~~~~~~~~~~~~~~
+
+    RSC watermarks each PDF with a "Downloaded" date and the name
+    of the institution from which the PDF was downloaded.
+
+    Watermarks removed:
+        * "Downloaded by" watermark and timestamp on each page
+        * "Published on" watermark on the side of each page
+
+    This was primarily written for RSC PDFs from http://pubs.rsc.org
+    """
+
+    @classmethod
+    def scrub(cls, content, verbose=0):
+        """
+        Blank out every object whose decoded data holds an RSC watermark.
+
+        :param content: raw contents of the PDF
+        :param verbose: 0 is silent; 1 reports each omitted object on
+            stderr; 2 or more also prints up to 1000 characters of the
+            decoded data, starting at the matched phrase
+        :returns: content with the raw data of each matching object
+            replaced by an empty string
+        """
+        replacements = []
+
+        # List of watermark strings to remove
+        watermarks = [
+            "Downloaded by ",
+            "Downloaded on ",
+            "Published on ",
+            #"View Article Online",
+            #"Journal Homepage",
+            #"Table of Contents for this issue",
+        ]
+
+        # parse the pdf into a pdfminer document
+        pdf = parse_content(content)
+
+        # get a list of all object ids
+        xrefs = pdf._parser.read_xref()
+        xref = xrefs[0]
+        objids = xref.get_objids()
+
+        # check each object in the pdf
+        for objid in objids:
+            # get an object by id
+            obj = pdf.getobj(objid)
+
+            # objects without attrs cannot be inspected below
+            if not hasattr(obj, "attrs"):
+                continue
+
+            # watermarks tend to be in FlateDecode elements; test with "in"
+            # because dict.has_key() no longer exists on Python 3
+            if "Filter" not in obj.attrs:
+                continue
+            if str(obj.attrs["Filter"]) != "/FlateDecode":
+                continue
+
+            # NOTE(review): rawdata is captured before get_data() runs;
+            # presumably decoding can discard it -- confirm against pdfminer
+            rawdata = copy(obj.rawdata)
+            data = copy(obj.get_data())
+
+            # Check if any of the watermarks are in the current object
+            for phrase in watermarks:
+                if phrase not in data:
+                    continue
+
+                if verbose >= 2:
+                    start = data.index(phrase)
+                    sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[start:start + 1000]))
+                elif verbose >= 1:
+                    sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase))
+
+                # We had a match so replace the watermark data with an empty string
+                replacements.append([rawdata, ""])
+
+                # One hit is enough: further phrases in the same object would
+                # only queue (and report) the identical replacement again
+                break
+
+        for deets in replacements:
+            # Directly replace the stream data in binary encoded object
+            content = content.replace(deets[0], deets[1])
+
+        return content
diff --git a/tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf b/tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf
new file mode 100644
index 0000000..676266f
Binary files /dev/null and b/tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf differ
diff --git a/tests/test_rsc.py b/tests/test_rsc.py
new file mode 100644
index 0000000..cedcc47
--- /dev/null
+++ b/tests/test_rsc.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import pdfparanoia
+
+class RoyalSocietyOfChemistryTestCase(unittest.TestCase):
+    def test_rsc(self):
+        # "with" closes the sample file even if read() raises
+        with open("tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf", "rb") as file_handler:
+            content = file_handler.read()
+
+        # Check the PDF is from the RSC
+        self.assertIn("pubs.rsc.org", content)
+
+        output = pdfparanoia.plugins.RoyalSocietyOfChemistry.scrub(content)
+
+        # Check the PDF was output correctly and still
+        # contains the RSC url.
+        self.assertIn("pubs.rsc.org", output)