From c673d77ec6eedd13a9f4bdc5cabea07124ddcef2 Mon Sep 17 00:00:00 2001 From: Donncha O'Cearbhaill Date: Mon, 13 May 2013 21:01:52 +0100 Subject: [PATCH] Check PDF is from the RSC before cleaning --- pdfparanoia/plugins/rsc.py | 53 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/pdfparanoia/plugins/rsc.py b/pdfparanoia/plugins/rsc.py index f07f876..524fc32 100644 --- a/pdfparanoia/plugins/rsc.py +++ b/pdfparanoia/plugins/rsc.py @@ -35,35 +35,38 @@ class RoyalSocietyOfChemistry(Plugin): #"Table of Contents for this issue", ] - # parse the pdf into a pdfminer document - pdf = parse_content(content) + # Confirm the PDF is from the RSC + if "pubs.rsc.org" in content: + + # parse the pdf into a pdfminer document + pdf = parse_content(content) - # get a list of all object ids - xrefs = pdf._parser.read_xref() - xref = xrefs[0] - objids = xref.get_objids() + # get a list of all object ids + xrefs = pdf._parser.read_xref() + xref = xrefs[0] + objids = xref.get_objids() - # check each object in the pdf - for objid in objids: - # get an object by id - obj = pdf.getobj(objid) + # check each object in the pdf + for objid in objids: + # get an object by id + obj = pdf.getobj(objid) - if hasattr(obj, "attrs"): - # watermarks tend to be in FlateDecode elements - if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": - rawdata = copy(obj.rawdata) - data = copy(obj.get_data()) + if hasattr(obj, "attrs"): + # watermarks tend to be in FlateDecode elements + if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": + rawdata = copy(obj.rawdata) + data = copy(obj.get_data()) - # Check if any of the watermarks are in the current object - for phrase in watermarks: - if phrase in data: - if verbose >= 2: - sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000])) - elif verbose >= 1: - sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase)) - - # We had a match so replace the watermark data with an empty string - replacements.append([rawdata, ""]) + # Check if any of the watermarks are in the current object + for phrase in watermarks: + if phrase in data: + if verbose >= 2: + sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000])) + elif verbose >= 1: + sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase)) + + # We had a match so replace the watermark data with an empty string + replacements.append([rawdata, ""]) for deets in replacements: # Directly replace the stream data in binary encoded object