Remove a couple of residual metadata fields in PDF files
This commit removes the residual metadata that mat2 itself adds while cleaning PDF files.
This commit is contained in:
parent
5312603a88
commit
5270071b94
@ -122,6 +122,17 @@ class PDFParser(abstract.AbstractParser):
|
||||
document.set_creator('')
|
||||
document.set_creation_date(-1)
|
||||
document.save('file://' + os.path.abspath(out_file))
|
||||
|
||||
# Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
|
||||
# fails to remove them, we have to use this terrible regex.
|
||||
# It should(tm) be alright though, because cairo's output format
|
||||
# for metadata is fixed.
|
||||
with open(out_file, 'rb') as f:
|
||||
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0,
|
||||
re.DOTALL | re.IGNORECASE)
|
||||
with open(out_file, 'wb') as f:
|
||||
f.write(out)
|
||||
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
|
Loading…
Reference in New Issue
Block a user