Clean deep metadata for zip files
This commit is contained in:
parent
6d506b8757
commit
c186fc4292
@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser):
|
|||||||
zipin.close()
|
zipin.close()
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
def __clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
    """Overwrite the per-member ZIP header metadata with fixed values.

    The object is modified in place and the same object is returned, so
    the result can be handed directly to ``ZipFile.writestr``.

    NOTE(review): ``compress_type`` is overridden on the passed-in object.
    If a caller later reads the member from the source archive through this
    same object, the stdlib will presumably pick the decompressor from the
    overridden value -- verify call sites read the payload before cleaning.

    :param zipinfo: header record of one archive member.
    :return: ``zipinfo`` itself, with its metadata fields reset.
    """
    zipinfo.compress_type = zipfile.ZIP_DEFLATED
    # Host code 3 means "UNIX" in the ZIP format, not Linux specifically.
    zipinfo.create_system = 3
    zipinfo.comment = b''
    # The ZIP format cannot represent timestamps earlier than 1980.
    zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
    # Extra fields can carry extended timestamps and uid/gid records that
    # would survive the date_time reset above, so drop them as well.
    zipinfo.extra = b''
    return zipinfo
|
||||||
|
|
||||||
def remove_all(self):
|
def remove_all(self):
|
||||||
zin = zipfile.ZipFile(self.filename, 'r')
|
zin = zipfile.ZipFile(self.filename, 'r')
|
||||||
zout = zipfile.ZipFile(self.output_filename, 'w')
|
zout = zipfile.ZipFile(self.output_filename, 'w')
|
||||||
@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser):
|
|||||||
print("%s isn't supported" % item.filename)
|
print("%s isn't supported" % item.filename)
|
||||||
continue
|
continue
|
||||||
tmp_parser.remove_all()
|
tmp_parser.remove_all()
|
||||||
zout.write(tmp_parser.output_filename, item.filename)
|
zinfo = zipfile.ZipInfo(item.filename)
|
||||||
|
item = self.__clean_zipinfo(item)
|
||||||
|
with open(tmp_parser.output_filename, 'rb') as f:
|
||||||
|
zout.writestr(zinfo, f.read())
|
||||||
shutil.rmtree(temp_folder)
|
shutil.rmtree(temp_folder)
|
||||||
zout.close()
|
zout.close()
|
||||||
zin.close()
|
zin.close()
|
||||||
|
@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser):
|
|||||||
zipin.close()
|
zipin.close()
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
def __clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
    """Reset the header metadata of one archive member to neutral values.

    Mutates ``zipinfo`` in place and returns that same object so it can be
    passed straight on to ``ZipFile.writestr``.

    NOTE(review): because ``compress_type`` is replaced on the object the
    caller passed in, reading the member from the input archive through
    this object afterwards presumably uses the replaced value to choose a
    decompressor -- confirm callers fetch the data before cleaning.

    :param zipinfo: header record of one archive member.
    :return: ``zipinfo`` itself, with its metadata fields reset.
    """
    zipinfo.compress_type = zipfile.ZIP_DEFLATED
    # Host code 3 means "UNIX" in the ZIP format, not Linux specifically.
    zipinfo.create_system = 3
    zipinfo.comment = b''
    # The ZIP format cannot represent timestamps earlier than 1980.
    zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
    # Extra fields may embed extended timestamps or uid/gid records; clear
    # them so they do not outlive the date_time reset above.
    zipinfo.extra = b''
    return zipinfo
|
||||||
|
|
||||||
def remove_all(self):
|
def remove_all(self):
|
||||||
zin = zipfile.ZipFile(self.filename, 'r')
|
zin = zipfile.ZipFile(self.filename, 'r')
|
||||||
zout = zipfile.ZipFile(self.output_filename, 'w')
|
zout = zipfile.ZipFile(self.output_filename, 'w')
|
||||||
@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser):
|
|||||||
if not item.filename.endswith('.rels'):
|
if not item.filename.endswith('.rels'):
|
||||||
continue # don't keep metadata files
|
continue # don't keep metadata files
|
||||||
if item.filename in self.files_to_keep:
|
if item.filename in self.files_to_keep:
|
||||||
|
item = self.__clean_zipinfo(item)
|
||||||
zout.writestr(item, zin.read(item))
|
zout.writestr(item, zin.read(item))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser):
|
|||||||
print("%s isn't supported" % item.filename)
|
print("%s isn't supported" % item.filename)
|
||||||
continue
|
continue
|
||||||
tmp_parser.remove_all()
|
tmp_parser.remove_all()
|
||||||
zout.write(tmp_parser.output_filename, item.filename)
|
zinfo = zipfile.ZipInfo(item.filename)
|
||||||
|
item = self.__clean_zipinfo(item)
|
||||||
|
with open(tmp_parser.output_filename, 'rb') as f:
|
||||||
|
zout.writestr(zinfo, f.read())
|
||||||
|
|
||||||
shutil.rmtree(temp_folder)
|
shutil.rmtree(temp_folder)
|
||||||
zout.close()
|
zout.close()
|
||||||
zin.close()
|
zin.close()
|
||||||
|
@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class TestDeepCleaning(unittest.TestCase):
|
class TestDeepCleaning(unittest.TestCase):
|
||||||
def __check_zip_clean(self, p):
|
def __check_deep_meta(self, p):
|
||||||
tempdir = tempfile.mkdtemp()
|
tempdir = tempfile.mkdtemp()
|
||||||
zipin = zipfile.ZipFile(p.filename)
|
zipin = zipfile.ZipFile(p.filename)
|
||||||
zipin.extractall(tempdir)
|
zipin.extractall(tempdir)
|
||||||
@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase):
|
|||||||
self.assertEqual(inside_p.get_meta(), {})
|
self.assertEqual(inside_p.get_meta(), {})
|
||||||
shutil.rmtree(tempdir)
|
shutil.rmtree(tempdir)
|
||||||
|
|
||||||
|
|
||||||
|
def __check_zip_meta(self, p):
    """Assert that every member of the archive at ``p.filename`` is scrubbed.

    For each entry the comment must be empty, the timestamp must be
    1980-01-01 00:00:00 and the creating system must be 3.

    :param p: object exposing the archive path as ``p.filename``.
    """
    # Context manager so the archive handle is released even when an
    # assertion fails part-way through the member list.
    with zipfile.ZipFile(p.filename) as zipin:
        for item in zipin.infolist():
            self.assertEqual(item.comment, b'')
            self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
            self.assertEqual(item.create_system, 3)  # 3 is UNIX
|
||||||
|
|
||||||
|
|
||||||
def test_office(self):
|
def test_office(self):
|
||||||
shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
|
shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
|
||||||
p = office.OfficeParser('./tests/data/clean.docx')
|
p = office.OfficeParser('./tests/data/clean.docx')
|
||||||
@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase):
|
|||||||
p = office.OfficeParser('./tests/data/clean.docx.cleaned')
|
p = office.OfficeParser('./tests/data/clean.docx.cleaned')
|
||||||
self.assertEqual(p.get_meta(), {})
|
self.assertEqual(p.get_meta(), {})
|
||||||
|
|
||||||
self.__check_zip_clean(p)
|
self.__check_zip_meta(p)
|
||||||
|
self.__check_deep_meta(p)
|
||||||
|
|
||||||
os.remove('./tests/data/clean.docx')
|
os.remove('./tests/data/clean.docx')
|
||||||
|
|
||||||
@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase):
|
|||||||
p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
|
p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
|
||||||
self.assertEqual(p.get_meta(), {})
|
self.assertEqual(p.get_meta(), {})
|
||||||
|
|
||||||
self.__check_zip_clean(p)
|
self.__check_zip_meta(p)
|
||||||
|
self.__check_deep_meta(p)
|
||||||
|
|
||||||
os.remove('./tests/data/clean.odt')
|
os.remove('./tests/data/clean.odt')
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user