# -*- coding: utf-8 -*-

from copy import copy
import sys

from ..parser import parse_content
from ..plugin import Plugin

import base64


class RoyalSocietyOfChemistry(Plugin):
    """
    RoyalSocietyOfChemistry
    ~~~~~~~~~~~~~~~~~~~~~~~

    RSC watermarks each PDF with a "Downloaded" date and the name of the
    institution from which the PDF was downloaded.

    Watermarks removed:
        * "Downloaded by" watermark and timestamp on each page
        * "Published on" watermark on the side of each page

    This was primarily written for RSC PDFs from http://pubs.rsc.org
    """

    @classmethod
    def scrub(cls, content, verbose=0):
        """Return *content* with every watermark-bearing stream blanked out.

        Each object listed in the first xref table is inspected; when it is
        a ``/FlateDecode`` stream whose decoded data contains one of the
        known watermark phrases, the stream's raw (still encoded) bytes are
        removed from *content*.

        :param content: raw bytes of the PDF file.
        :param verbose: ``>= 1`` reports each matching object on stderr;
            ``>= 2`` additionally dumps up to 1000 bytes of the decoded
            stream starting at the matched phrase.
        :returns: the PDF bytes with the matching stream bodies replaced by
            an empty byte string.
        """
        # Byte-string literals: they compare against the decoded stream data
        # on both Python 2 (where bytes is str) and Python 3.
        watermarks = [
            b"Downloaded by ",
            b"Downloaded on ",
            b"Published on ",
            # Currently disabled phrases:
            # b"View Article Online",
            # b"Journal Homepage",
            # b"Table of Contents for this issue",
        ]

        # Raw stream payloads that must be cut out of the document.
        doomed_streams = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids (only the first xref table is used)
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            obj = pdf.getobj(objid)

            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            # ("in" replaces dict.has_key(), which no longer exists on
            # Python 3)
            if "Filter" not in obj.attrs:
                continue
            if str(obj.attrs["Filter"]) != "/FlateDecode":
                continue

            # NOTE(review): rawdata is captured *before* get_data(), same
            # order as before; pdfminer streams appear to discard rawdata
            # once decoded -- confirm before reordering.
            rawdata = copy(obj.rawdata)
            if not rawdata:
                # Nothing to search for in the document; replacing None would
                # raise and replacing an empty string is a no-op.
                continue
            data = copy(obj.get_data())

            matched = False
            for phrase in watermarks:
                if phrase not in data:
                    continue
                matched = True

                if verbose >= 2:
                    start = data.index(phrase)
                    sys.stderr.write(
                        "%s: Found object %s with %r: %r; omitting...\n"
                        % (cls.__name__, objid, phrase,
                           data[start:start + 1000])
                    )
                elif verbose >= 1:
                    sys.stderr.write(
                        "%s: Found object %s with %r; omitting...\n"
                        % (cls.__name__, objid, phrase)
                    )

            # Queue the stream once, however many phrases it contained, so
            # the document is not rescanned for the same payload repeatedly.
            if matched:
                doomed_streams.append(rawdata)

        # Directly replace the stream data in the binary encoded document.
        for raw in doomed_streams:
            content = content.replace(raw, b"")

        return content