From e1dd439fc86ba15816e2331e8bed67dd7147e368 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Thu, 7 Feb 2019 21:58:10 +0100 Subject: [PATCH] Use of the archive refactoring for the office documents too --- libmat2/office.py | 28 +++++++++++----------------- tests/test_corrupted_files.py | 7 +------ tests/test_libmat2.py | 14 +++++++------- 3 files changed, 19 insertions(+), 30 deletions(-) diff --git a/libmat2/office.py b/libmat2/office.py index dfad3b3..0c9caa8 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -2,7 +2,7 @@ import logging import os import re import zipfile -from typing import Dict, Set, Pattern, Tuple, Union, Any +from typing import Dict, Set, Pattern, Tuple, Any import xml.etree.ElementTree as ET # type: ignore @@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): return False return True - def get_meta(self) -> Dict[str, Union[str, dict]]: + def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: """ Yes, I know that parsing xml with regexp ain't pretty, be my guest and fix it if you want. """ - metadata = {} - zipin = zipfile.ZipFile(self.filename) - for item in zipin.infolist(): - if item.filename == 'meta.xml': - try: - content = zipin.read(item).decode('utf-8') - results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)", content, re.I|re.M) - for (key, value) in results: - metadata[key] = value - except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file - metadata[item.filename] = 'harmful content' - for key, value in self._get_zipinfo_meta(item).items(): - metadata[key] = value - zipin.close() - return metadata + if file_path != 'meta.xml': + return {} + with open(full_path, encoding='utf-8') as f: + try: + results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)", f.read(), re.I|re.M) + return {k:v for (k, v) in results} + except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file + # We didn't manage to parse the xml file + return {file_path: 'harmful content', } diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index e7d3c2a..b2e7798 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -67,15 +67,10 @@ class TestCorruptedEmbedded(unittest.TestCase): os.remove('./tests/data/clean.docx') def test_odt(self): - expected = { - 'create_system': 'Weird', - 'date_time': '2018-06-10 17:18:18', - 'meta.xml': 'harmful content' - } shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt') parser, _ = parser_factory.get_parser('./tests/data/clean.odt') self.assertFalse(parser.remove_all()) - self.assertEqual(parser.get_meta(), expected) + self.assertTrue(parser.get_meta()) os.remove('./tests/data/clean.odt') diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index d692181..548b076 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -138,14 +138,14 @@ class TestGetMeta(unittest.TestCase): def test_libreoffice(self): p = office.LibreOfficeParser('./tests/data/dirty.odt') meta = p.get_meta() - self.assertEqual(meta['meta:initial-creator'], 'jvoisin ') - self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48') - self.assertEqual(meta['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') + self.assertEqual(meta['meta.xml']['meta:initial-creator'], 'jvoisin ') + self.assertEqual(meta['meta.xml']['meta:creation-date'], '2011-07-26T03:27:48') + self.assertEqual(meta['meta.xml']['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') p = office.LibreOfficeParser('./tests/data/weird_producer.odt') meta = p.get_meta() - self.assertEqual(meta['create_system'], 'Windows') - self.assertEqual(meta['comment'], b'YAY FOR COMMENTS') + self.assertEqual(meta['mimetype']['create_system'], 'Windows') + self.assertEqual(meta['mimetype']['comment'], b'YAY FOR COMMENTS') def test_txt(self): p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt') @@ -440,7 +440,7 @@ class TestCleaning(unittest.TestCase): p = office.LibreOfficeParser('./tests/data/clean.odf') meta = p.get_meta() - self.assertEqual(meta['meta:creation-date'], '2018-04-23T00:18:59.438231281') + self.assertEqual(meta['meta.xml']['meta:creation-date'], '2018-04-23T00:18:59.438231281') ret = p.remove_all() self.assertTrue(ret) @@ -458,7 +458,7 @@ class TestCleaning(unittest.TestCase): p = office.LibreOfficeParser('./tests/data/clean.odg') meta = p.get_meta() - self.assertEqual(meta['dc:date'], '2018-04-23T00:26:59.385838550') + self.assertEqual(meta['meta.xml']['dc:date'], '2018-04-23T00:26:59.385838550') ret = p.remove_all() self.assertTrue(ret)