diff --git a/libmat2/office.py b/libmat2/office.py index 5165056..6087c47 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser): """ In this function, we're changing the XML document in two times, since we don't want to change the tree we're iterating on.""" - tree, ns = _parse_xml(full_path) + try: + tree, ns = _parse_xml(full_path) + except ET.ParseError: + return False # No revisions are present del_presence = tree.find('.//w:del', ns) @@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser): zipin = zipfile.ZipFile(self.filename) for item in zipin.infolist(): if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): - content = zipin.read(item).decode('utf-8') try: + content = zipin.read(item).decode('utf-8') results = re.findall(r"<(.+)>(.+)", content, re.I|re.M) for (key, value) in results: metadata[key] = value - except TypeError: # We didn't manage to parse the xml file - pass - if not metadata: # better safe than sorry - metadata[item] = 'harmful content' + except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file + metadata[item.filename] = 'harmful content' for key, value in self._get_zipinfo_meta(item).items(): metadata[key] = value zipin.close() @@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): def __remove_revisions(self, full_path: str) -> bool: - tree, ns = _parse_xml(full_path) + try: + tree, ns = _parse_xml(full_path) + except ET.ParseError: + return False if 'office' not in ns.keys(): # no revisions in the current file return True @@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): zipin = zipfile.ZipFile(self.filename) for item in zipin.infolist(): if item.filename == 'meta.xml': - content = zipin.read(item).decode('utf-8') try: + content = zipin.read(item).decode('utf-8') results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)", content, re.I|re.M) for (key, value) in results: metadata[key] = value - except TypeError: # We didn't manage to parse the xml file - pass - if not metadata: # better safe than sorry - metadata[item] = 'harmful content' + except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file + metadata[item.filename] = 'harmful content' for key, value in self._get_zipinfo_meta(item).items(): metadata[key] = value zipin.close() diff --git a/tests/data/embedded_corrupted.docx b/tests/data/embedded_corrupted.docx new file mode 100644 index 0000000..989bdb8 Binary files /dev/null and b/tests/data/embedded_corrupted.docx differ diff --git a/tests/data/embedded_corrupted.odt b/tests/data/embedded_corrupted.odt new file mode 100644 index 0000000..1e4a844 Binary files /dev/null and b/tests/data/embedded_corrupted.odt differ diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index a77acbc..2bb1c76 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -15,6 +15,21 @@ class TestUnsupportedFiles(unittest.TestCase): self.assertEqual(parser, None) os.remove('./tests/clean.py') +class TestCorruptedEmbedded(unittest.TestCase): + def test_docx(self): + shutil.copy('./tests/data/embedded_corrupted.docx', './tests/data/clean.docx') + parser, mimetype = parser_factory.get_parser('./tests/data/clean.docx') + self.assertFalse(parser.remove_all()) + self.assertIsNotNone(parser.get_meta()) + os.remove('./tests/data/clean.docx') + + def test_odt(self): + shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt') + parser, mimetype = parser_factory.get_parser('./tests/data/clean.odt') + self.assertFalse(parser.remove_all()) + self.assertEqual(parser.get_meta(), {'create_system': 'Weird', 'date_time': '2018-06-10 17:18:18', 'meta.xml': 'harmful content'}) + os.remove('./tests/data/clean.odt') + class TestExplicitelyUnsupportedFiles(unittest.TestCase): def test_pdf(self):