1
0
mirror of synced 2024-06-05 13:57:48 +02:00

Clean deep metadata for zip files

This commit is contained in:
jvoisin 2018-04-01 00:17:06 +02:00
parent 6d506b8757
commit c186fc4292
3 changed files with 38 additions and 5 deletions

View File

@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser):
zipin.close() zipin.close()
return metadata return metadata
def __clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
    """Reset the per-entry header fields of a zip member to neutral values.

    The entry is modified in place and also returned, so callers can write
    ``item = self.__clean_zipinfo(item)``.

    :param zipinfo: header of the archive member to normalise.
    :return: the very same ``ZipInfo`` object, after normalisation.
    """
    zipinfo.compress_type = zipfile.ZIP_DEFLATED
    zipinfo.create_system = 3  # Linux
    zipinfo.comment = b''
    zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
    # The "extra" field is free-form per-entry data; archivers commonly
    # store extended timestamps or owner ids there, so resetting only
    # `date_time` and `comment` would still leave metadata behind.
    zipinfo.extra = b''
    return zipinfo
def remove_all(self): def remove_all(self):
zin = zipfile.ZipFile(self.filename, 'r') zin = zipfile.ZipFile(self.filename, 'r')
zout = zipfile.ZipFile(self.output_filename, 'w') zout = zipfile.ZipFile(self.output_filename, 'w')
@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser):
print("%s isn't supported" % item.filename) print("%s isn't supported" % item.filename)
continue continue
tmp_parser.remove_all() tmp_parser.remove_all()
zout.write(tmp_parser.output_filename, item.filename) zinfo = zipfile.ZipInfo(item.filename)
item = self.__clean_zipinfo(item)
with open(tmp_parser.output_filename, 'rb') as f:
zout.writestr(zinfo, f.read())
shutil.rmtree(temp_folder) shutil.rmtree(temp_folder)
zout.close() zout.close()
zin.close() zin.close()

View File

@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser):
zipin.close() zipin.close()
return metadata return metadata
def __clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
    """Neutralise the header metadata of a single zip archive member.

    Mutates ``zipinfo`` and hands the same object back to the caller.

    :param zipinfo: the member header whose fields should be reset.
    :return: ``zipinfo`` itself, with its metadata fields overwritten.
    """
    zipinfo.compress_type = zipfile.ZIP_DEFLATED
    zipinfo.create_system = 3  # Linux
    zipinfo.comment = b''
    zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
    # Also drop the free-form "extra" data: it can hold extended
    # timestamps or owner ids that the fields above do not cover.
    zipinfo.extra = b''
    return zipinfo
def remove_all(self): def remove_all(self):
zin = zipfile.ZipFile(self.filename, 'r') zin = zipfile.ZipFile(self.filename, 'r')
zout = zipfile.ZipFile(self.output_filename, 'w') zout = zipfile.ZipFile(self.output_filename, 'w')
@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser):
if not item.filename.endswith('.rels'): if not item.filename.endswith('.rels'):
continue # don't keep metadata files continue # don't keep metadata files
if item.filename in self.files_to_keep: if item.filename in self.files_to_keep:
item = self.__clean_zipinfo(item)
zout.writestr(item, zin.read(item)) zout.writestr(item, zin.read(item))
continue continue
@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser):
print("%s isn't supported" % item.filename) print("%s isn't supported" % item.filename)
continue continue
tmp_parser.remove_all() tmp_parser.remove_all()
zout.write(tmp_parser.output_filename, item.filename) zinfo = zipfile.ZipInfo(item.filename)
item = self.__clean_zipinfo(item)
with open(tmp_parser.output_filename, 'rb') as f:
zout.writestr(zinfo, f.read())
shutil.rmtree(temp_folder) shutil.rmtree(temp_folder)
zout.close() zout.close()
zin.close() zin.close()

View File

@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase):
class TestDeepCleaning(unittest.TestCase): class TestDeepCleaning(unittest.TestCase):
def __check_zip_clean(self, p): def __check_deep_meta(self, p):
tempdir = tempfile.mkdtemp() tempdir = tempfile.mkdtemp()
zipin = zipfile.ZipFile(p.filename) zipin = zipfile.ZipFile(p.filename)
zipin.extractall(tempdir) zipin.extractall(tempdir)
@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase):
self.assertEqual(inside_p.get_meta(), {}) self.assertEqual(inside_p.get_meta(), {})
shutil.rmtree(tempdir) shutil.rmtree(tempdir)
def __check_zip_meta(self, p):
    """Assert that every member of the archive at ``p.filename`` has
    neutral header metadata: an empty comment, the 1980-01-01 00:00:00
    timestamp, and a creating system of 3.

    :param p: object exposing the archive location as ``p.filename``.
    """
    # Use a context manager so the archive handle is released even when
    # one of the assertions below fails.
    with zipfile.ZipFile(p.filename) as zipin:
        for item in zipin.infolist():
            self.assertEqual(item.comment, b'')
            self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
            self.assertEqual(item.create_system, 3)  # 3 is UNIX
def test_office(self): def test_office(self):
shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx') shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
p = office.OfficeParser('./tests/data/clean.docx') p = office.OfficeParser('./tests/data/clean.docx')
@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase):
p = office.OfficeParser('./tests/data/clean.docx.cleaned') p = office.OfficeParser('./tests/data/clean.docx.cleaned')
self.assertEqual(p.get_meta(), {}) self.assertEqual(p.get_meta(), {})
self.__check_zip_clean(p) self.__check_zip_meta(p)
self.__check_deep_meta(p)
os.remove('./tests/data/clean.docx') os.remove('./tests/data/clean.docx')
@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase):
p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
self.assertEqual(p.get_meta(), {}) self.assertEqual(p.get_meta(), {})
self.__check_zip_clean(p) self.__check_zip_meta(p)
self.__check_deep_meta(p)
os.remove('./tests/data/clean.odt') os.remove('./tests/data/clean.odt')