Remove a couple of residual metadata fields in PDF
This commit removes the residual metadata that mat2 itself adds while cleaning PDF files.
This commit is contained in:
parent
5312603a88
commit
5270071b94
@ -122,6 +122,17 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
document.set_creator('')
|
document.set_creator('')
|
||||||
document.set_creation_date(-1)
|
document.set_creation_date(-1)
|
||||||
document.save('file://' + os.path.abspath(out_file))
|
document.save('file://' + os.path.abspath(out_file))
|
||||||
|
|
||||||
|
# Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
|
||||||
|
# fails to remove them, so we have to use this terrible regex.
|
||||||
|
# It should(tm) be alright though, because cairo's output format
|
||||||
|
# for metadata is fixed.
|
||||||
|
with open(out_file, 'rb') as f:
|
||||||
|
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0,
|
||||||
|
re.DOTALL | re.IGNORECASE)
|
||||||
|
with open(out_file, 'wb') as f:
|
||||||
|
f.write(out)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
Loading…
Reference in New Issue
Block a user