From b9a62d798af14ea799ae5fceab1ed7a537d1cbdd Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 3 Feb 2019 22:55:15 +0100 Subject: [PATCH] Refactor a bit office get_meta handling This should make it easier to get more metadata from archive-based file formats. --- libmat2/archive.py | 24 +++++++++++++++++++----- libmat2/office.py | 29 ++++++++++++----------------- tests/test_libmat2.py | 6 +++--- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/libmat2/archive.py b/libmat2/archive.py index b2483fc..d155664 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py @@ -4,7 +4,7 @@ import tempfile import os import logging import shutil -from typing import Dict, Set, Pattern, Union +from typing import Dict, Set, Pattern, Union, Any from . import abstract, UnknownMemberPolicy, parser_factory @@ -42,6 +42,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): # pylint: disable=unused-argument,no-self-use return True # pragma: no cover + def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: + """ This method can be used to extract specific metadata + from files present in the archive.""" + # pylint: disable=unused-argument,no-self-use + return {} # pragma: no cover + @staticmethod def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: zipinfo.create_system = 3 # Linux @@ -74,6 +80,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): temp_folder = tempfile.mkdtemp() for item in zin.infolist(): + local_meta = dict() # type: Dict[str, Union[str, Dict]] + for k, v in self._get_zipinfo_meta(item).items(): + local_meta[k] = v + if item.filename[-1] == '/': # pragma: no cover # `is_dir` is added in Python3.6 continue # don't keep empty folders @@ -81,11 +91,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): zin.extract(member=item, path=temp_folder) full_path = os.path.join(temp_folder, item.filename) - tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore - if not tmp_parser: - 
continue + specific_meta = self._specific_get_meta(full_path, item.filename) + for (k, v) in specific_meta.items(): + local_meta[k] = v + + tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore + if tmp_parser: + for k, v in tmp_parser.get_meta().items(): + local_meta[k] = v - local_meta = tmp_parser.get_meta() if local_meta: meta[item.filename] = local_meta diff --git a/libmat2/office.py b/libmat2/office.py index 365c230..dfad3b3 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -2,7 +2,7 @@ import logging import os import re import zipfile -from typing import Dict, Set, Pattern, Tuple, Union +from typing import Dict, Set, Pattern, Tuple, Union, Any import xml.etree.ElementTree as ET # type: ignore @@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser): return True - def get_meta(self) -> Dict[str, Union[str, dict]]: + def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: """ Yes, I know that parsing xml with regexp ain't pretty, be my guest and fix it if you want. 
""" - metadata = super().get_meta() - zipin = zipfile.ZipFile(self.filename) - for item in zipin.infolist(): - if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): - try: - content = zipin.read(item).decode('utf-8') - results = re.findall(r"<(.+)>(.+)", content, re.I|re.M) - for (key, value) in results: - metadata[key] = value - except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file - metadata[item.filename] = 'harmful content' - for key, value in self._get_zipinfo_meta(item).items(): - metadata[key] = value - zipin.close() - return metadata + if not file_path.startswith('docProps/') or not file_path.endswith('.xml'): + return {} + + with open(full_path, encoding='utf-8') as f: + try: + results = re.findall(r"<(.+)>(.+)", f.read(), re.I|re.M) + return {k:v for (k, v) in results} + except (TypeError, UnicodeDecodeError): + # We didn't manage to parse the xml file + return {file_path: 'harmful content', } class LibreOfficeParser(ArchiveBasedAbstractParser): diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 9354286..d692181 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -131,9 +131,9 @@ class TestGetMeta(unittest.TestCase): def test_docx(self): p = office.MSOfficeParser('./tests/data/dirty.docx') meta = p.get_meta() - self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') - self.assertEqual(meta['dc:creator'], 'julien voisin') - self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') + self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin') + self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin') + self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') def test_libreoffice(self): p = office.LibreOfficeParser('./tests/data/dirty.odt')