1
0
mirror of synced 2024-11-22 09:14:23 +01:00

Implement get_meta() for archives

This commit is contained in:
jvoisin 2018-10-25 11:29:50 +02:00
parent 5a9dc388ad
commit 513d897ea0
3 changed files with 27 additions and 1 deletion

View File

@ -67,6 +67,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
return metadata
def get_meta(self) -> Dict[str, Union[str, dict]]:
    """Collect the metadata of every file stored in the archive.

    Each member is extracted into a throw-away temporary folder and handed
    to ``parser_factory.get_parser``. Members for which no parser is
    returned, or whose parser reports no metadata, are left out.

    Returns:
        A dict mapping each member's name inside the archive to the
        metadata returned by that member's parser.
    """
    meta = dict()  # type: Dict[str, Union[str, dict]]

    with zipfile.ZipFile(self.filename) as zin:
        temp_folder = tempfile.mkdtemp()
        try:
            for item in zin.infolist():
                if item.filename.endswith('/'):  # pragma: no cover
                    # `is_dir` was only added in Python 3.6
                    continue  # don't keep empty folders

                # Use the path reported by `extract` instead of re-joining
                # the member name: `extract` sanitises names (leading
                # slashes, `..` components), so a hand-built path could
                # point at a different file, possibly outside temp_folder.
                full_path = zin.extract(member=item, path=temp_folder)
                tmp_parser, _ = parser_factory.get_parser(full_path)  # type: ignore
                if not tmp_parser:
                    continue

                local_meta = tmp_parser.get_meta()
                if local_meta:
                    meta[item.filename] = local_meta
        finally:
            # Always drop the extracted files, even if a member could not
            # be extracted or a sub-parser raised.
            shutil.rmtree(temp_folder)
    return meta
def remove_all(self) -> bool:
# pylint: disable=too-many-branches

View File

@ -301,7 +301,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
"""
metadata = {}
metadata = super().get_meta()
zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):

View File

@ -36,6 +36,7 @@ class TestZipMetadata(unittest.TestCase):
meta = p.get_meta()
self.assertIsNotNone(meta)
self.assertEqual(meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
ret = p.remove_all()
self.assertTrue(ret)