From 513d897ea0cf3e006a2b33a89cdbf33cae3592cd Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Thu, 25 Oct 2018 11:29:50 +0200
Subject: [PATCH] Implement get_meta() for archives

---
 libmat2/archive.py          | 25 +++++++++++++++++++++++++
 libmat2/office.py           |  2 +-
 tests/test_deep_cleaning.py |  1 +
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/libmat2/archive.py b/libmat2/archive.py
index f788ecc..80e0bf2 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -67,6 +67,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
 
         return metadata
 
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
+        """ Return the metadata of every member of the archive that has a
+        parser and some metadata, keyed by the member's name. """
+        meta = dict()  # type: Dict[str, Union[str, dict]]
+        temp_folder = tempfile.mkdtemp()
+        try:
+            with zipfile.ZipFile(self.filename) as zin:
+                for item in zin.infolist():
+                    if item.filename[-1] == '/':  # pragma: no cover
+                        # `is_dir` is added in Python3.6
+                        continue  # don't keep empty folders
+
+                    # `extract` sanitises the name: use the path it returns
+                    full_path = zin.extract(member=item, path=temp_folder)
+                    tmp_parser, _ = parser_factory.get_parser(full_path)  # type: ignore
+                    if not tmp_parser:
+                        continue
+
+                    local_meta = tmp_parser.get_meta()
+                    if local_meta:
+                        meta[item.filename] = local_meta
+        finally:
+            shutil.rmtree(temp_folder)  # never leave extracted files behind
+        return meta
+
     def remove_all(self) -> bool:
         # pylint: disable=too-many-branches
 
diff --git a/libmat2/office.py b/libmat2/office.py
index c10664f..e6370e7 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -301,7 +301,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         Yes, I know that parsing xml with regexp ain't pretty,
         be my guest and fix it if you want.
         """
-        metadata = {}
+        metadata = super().get_meta()
         zipin = zipfile.ZipFile(self.filename)
         for item in zipin.infolist():
             if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 03db6c5..8466127 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -36,6 +36,7 @@ class TestZipMetadata(unittest.TestCase):
 
         meta = p.get_meta()
         self.assertIsNotNone(meta)
+        self.assertEqual(meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
 
         ret = p.remove_all()
         self.assertTrue(ret)