1
0
Fork 0

Remove a couple of residual metadata fields in PDF files

This commit removes the residual metadata that mat2 itself
adds while cleaning PDF files.
This commit is contained in:
jvoisin 2020-02-08 16:08:32 +01:00
parent 5312603a88
commit 5270071b94
1 changed file with 11 additions and 0 deletions

View File

@ -122,6 +122,17 @@ class PDFParser(abstract.AbstractParser):
document.set_creator('')
document.set_creation_date(-1)
document.save('file://' + os.path.abspath(out_file))
# Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
# fails to remove them, so we have to use this terrible regex.
# It should(tm) be alright though, because Cairo's output format
# for metadata is fixed.
with open(out_file, 'rb') as f:
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0,
re.DOTALL | re.IGNORECASE)
with open(out_file, 'wb') as f:
f.write(out)
return True
@staticmethod