1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2025-06-14 12:02:00 +02:00

Check that the PDF is from the Royal Society of Chemistry (RSC) before cleaning

This commit is contained in:
Donncha O'Cearbhaill 2013-05-13 21:01:52 +01:00
parent 18140d838d
commit c673d77ec6

View File

@ -35,35 +35,38 @@ class RoyalSocietyOfChemistry(Plugin):
#"Table of Contents for this issue", #"Table of Contents for this issue",
] ]
# parse the pdf into a pdfminer document # Confirm the PDF is from the RSC
pdf = parse_content(content) if "pubs.rsc.org" in content:
# get a list of all object ids # parse the pdf into a pdfminer document
xrefs = pdf._parser.read_xref() pdf = parse_content(content)
xref = xrefs[0]
objids = xref.get_objids()
# check each object in the pdf # get a list of all object ids
for objid in objids: xrefs = pdf._parser.read_xref()
# get an object by id xref = xrefs[0]
obj = pdf.getobj(objid) objids = xref.get_objids()
if hasattr(obj, "attrs"): # check each object in the pdf
# watermarks tend to be in FlateDecode elements for objid in objids:
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": # get an object by id
rawdata = copy(obj.rawdata) obj = pdf.getobj(objid)
data = copy(obj.get_data())
# Check if any of the watermarks are in the current object if hasattr(obj, "attrs"):
for phrase in watermarks: # watermarks tend to be in FlateDecode elements
if phrase in data: if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
if verbose >= 2: rawdata = copy(obj.rawdata)
sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000])) data = copy(obj.get_data())
elif verbose >= 1:
sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase))
# We had a match so replace the watermark data with an empty string # Check if any of the watermarks are in the current object
replacements.append([rawdata, ""]) for phrase in watermarks:
if phrase in data:
if verbose >= 2:
sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
elif verbose >= 1:
sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase))
# We had a match so replace the watermark data with an empty string
replacements.append([rawdata, ""])
for deets in replacements: for deets in replacements:
# Directly replace the stream data in binary encoded object # Directly replace the stream data in binary encoded object