From 513d897ea0cf3e006a2b33a89cdbf33cae3592cd Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Thu, 25 Oct 2018 11:29:50 +0200
Subject: [PATCH] Implement get_meta() for archives

---
 libmat2/archive.py          | 36 ++++++++++++++++++++++++++++++++++++
 libmat2/office.py           |  2 +-
 tests/test_deep_cleaning.py |  1 +
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/libmat2/archive.py b/libmat2/archive.py
index f788ecc..80e0bf2 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -67,6 +67,42 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
 
         return metadata
 
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
+        """Return the metadata of every supported file in the archive.
+
+        Each member is extracted to a scratch folder and handed to the
+        parser that `parser_factory` selects for it. Members without a
+        parser, or whose parser reports no metadata, are left out.
+
+        The result maps member names to the metadata of that member.
+        """
+        meta = dict()  # type: Dict[str, Union[str, dict]]
+
+        temp_folder = tempfile.mkdtemp()
+        try:
+            with zipfile.ZipFile(self.filename) as zin:
+                for item in zin.infolist():
+                    if item.filename.endswith('/'):  # pragma: no cover
+                        # `is_dir` is added in Python3.6
+                        continue  # don't keep empty folders
+
+                    # `extract` returns the sanitised path it really wrote
+                    # to, which can differ from a naive join of the name.
+                    full_path = zin.extract(member=item, path=temp_folder)
+
+                    tmp_parser, _ = parser_factory.get_parser(full_path)  # type: ignore
+                    if not tmp_parser:
+                        continue
+
+                    local_meta = tmp_parser.get_meta()
+                    if local_meta:
+                        meta[item.filename] = local_meta
+        finally:
+            # Remove the extracted files even if a member failed to parse.
+            shutil.rmtree(temp_folder)
+
+        return meta
+
     def remove_all(self) -> bool:
         # pylint: disable=too-many-branches
 
diff --git a/libmat2/office.py b/libmat2/office.py
index c10664f..e6370e7 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -301,7 +301,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         Yes, I know that parsing xml with regexp ain't pretty,
         be my guest and fix it if you want.
         """
-        metadata = {}
+        metadata = super().get_meta()
         zipin = zipfile.ZipFile(self.filename)
         for item in zipin.infolist():
             if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 03db6c5..8466127 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -36,6 +36,7 @@ class TestZipMetadata(unittest.TestCase):
 
         meta = p.get_meta()
         self.assertIsNotNone(meta)
+        self.assertEqual(meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
 
         ret = p.remove_all()
         self.assertTrue(ret)