Normalise per-entry zip metadata (compression, creator system, comment,
timestamp) in archives written by the LibreOffice and Office parsers, and
test for it. The "index" blob ids below are carried over unchanged.

diff --git a/src/libreoffice.py b/src/libreoffice.py
index a3481a1..809ae3c 100644
--- a/src/libreoffice.py
+++ b/src/libreoffice.py
@@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser):
         zipin.close()
         return metadata
 
+    def __clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
+        zipinfo.compress_type = zipfile.ZIP_DEFLATED
+        zipinfo.create_system = 3  # 3 is UNIX
+        zipinfo.comment = b''
+        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # same value a fresh ZipInfo defaults to
+        return zipinfo
+
     def remove_all(self):
         zin = zipfile.ZipFile(self.filename, 'r')
         zout = zipfile.ZipFile(self.output_filename, 'w')
@@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser):
                 print("%s isn't supported" % item.filename)
                 continue
             tmp_parser.remove_all()
-            zout.write(tmp_parser.output_filename, item.filename)
+            zinfo = zipfile.ZipInfo(item.filename)
+            clean_zinfo = self.__clean_zipinfo(zinfo)  # clean the entry that is written, not the source one
+            with open(tmp_parser.output_filename, 'rb') as f:
+                zout.writestr(clean_zinfo, f.read())
         shutil.rmtree(temp_folder)
         zout.close()
         zin.close()
diff --git a/src/office.py b/src/office.py
index 5de0597..a729f2f 100644
--- a/src/office.py
+++ b/src/office.py
@@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser):
         zipin.close()
         return metadata
 
+    def __clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
+        zipinfo.compress_type = zipfile.ZIP_DEFLATED
+        zipinfo.create_system = 3  # 3 is UNIX
+        zipinfo.comment = b''
+        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # same value a fresh ZipInfo defaults to
+        return zipinfo
+
     def remove_all(self):
         zin = zipfile.ZipFile(self.filename, 'r')
         zout = zipfile.ZipFile(self.output_filename, 'w')
@@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser):
                 if not item.filename.endswith('.rels'):
                     continue  # don't keep metadata files
             if item.filename in self.files_to_keep:
-                zout.writestr(item, zin.read(item))
+                data = zin.read(item)  # read before cleaning: read() decompresses per item.compress_type
+                zout.writestr(self.__clean_zipinfo(item), data)
                 continue
 
@@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser):
                 print("%s isn't supported" % item.filename)
                 continue
             tmp_parser.remove_all()
-            zout.write(tmp_parser.output_filename, item.filename)
+            zinfo = zipfile.ZipInfo(item.filename)
+            clean_zinfo = self.__clean_zipinfo(zinfo)  # clean the entry that is written, not the source one
+            with open(tmp_parser.output_filename, 'rb') as f:
+                zout.writestr(clean_zinfo, f.read())
+
         shutil.rmtree(temp_folder)
         zout.close()
         zin.close()
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index c065237..888c782 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase):
 
 
 class TestDeepCleaning(unittest.TestCase):
-    def __check_zip_clean(self, p):
+    def __check_deep_meta(self, p):
         tempdir = tempfile.mkdtemp()
         zipin = zipfile.ZipFile(p.filename)
         zipin.extractall(tempdir)
@@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase):
                 self.assertEqual(inside_p.get_meta(), {})
         shutil.rmtree(tempdir)
 
+
+    def __check_zip_meta(self, p):
+        with zipfile.ZipFile(p.filename) as zipin:  # closed even if an assertion fails
+            for item in zipin.infolist():
+                self.assertEqual(item.comment, b'')
+                self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
+                self.assertEqual(item.create_system, 3)  # 3 is UNIX
+
+
     def test_office(self):
         shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
         p = office.OfficeParser('./tests/data/clean.docx')
@@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase):
         p = office.OfficeParser('./tests/data/clean.docx.cleaned')
         self.assertEqual(p.get_meta(), {})
 
-        self.__check_zip_clean(p)
+        self.__check_zip_meta(p)
+        self.__check_deep_meta(p)
 
         os.remove('./tests/data/clean.docx')
 
@@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase):
         p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
         self.assertEqual(p.get_meta(), {})
 
-        self.__check_zip_clean(p)
+        self.__check_zip_meta(p)
+        self.__check_deep_meta(p)
 
         os.remove('./tests/data/clean.odt')
 