mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
Check that the PDF is from the RSC before cleaning
This commit is contained in:
parent
18140d838d
commit
c673d77ec6
@ -35,35 +35,38 @@ class RoyalSocietyOfChemistry(Plugin):
|
|||||||
#"Table of Contents for this issue",
|
#"Table of Contents for this issue",
|
||||||
]
|
]
|
||||||
|
|
||||||
# parse the pdf into a pdfminer document
|
# Confirm the PDF is from the RSC
|
||||||
pdf = parse_content(content)
|
if "pubs.rsc.org" in content:
|
||||||
|
|
||||||
|
# parse the pdf into a pdfminer document
|
||||||
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xrefs = pdf._parser.read_xref()
|
||||||
xref = xrefs[0]
|
xref = xrefs[0]
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
for objid in objids:
|
for objid in objids:
|
||||||
# get an object by id
|
# get an object by id
|
||||||
obj = pdf.getobj(objid)
|
obj = pdf.getobj(objid)
|
||||||
|
|
||||||
if hasattr(obj, "attrs"):
|
if hasattr(obj, "attrs"):
|
||||||
# watermarks tend to be in FlateDecode elements
|
# watermarks tend to be in FlateDecode elements
|
||||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||||
rawdata = copy(obj.rawdata)
|
rawdata = copy(obj.rawdata)
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
# Check if any of the watermarks are in the current object
|
# Check if any of the watermarks are in the current object
|
||||||
for phrase in watermarks:
|
for phrase in watermarks:
|
||||||
if phrase in data:
|
if phrase in data:
|
||||||
if verbose >= 2:
|
if verbose >= 2:
|
||||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
|
||||||
elif verbose >= 1:
|
elif verbose >= 1:
|
||||||
sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase))
|
sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase))
|
||||||
|
|
||||||
# We had a match so replace the watermark data with an empty string
|
# We had a match so replace the watermark data with an empty string
|
||||||
replacements.append([rawdata, ""])
|
replacements.append([rawdata, ""])
|
||||||
|
|
||||||
for deets in replacements:
|
for deets in replacements:
|
||||||
# Directly replace the stream data in binary encoded object
|
# Directly replace the stream data in binary encoded object
|
||||||
|
Loading…
Reference in New Issue
Block a user