1
0
mirror of synced 2024-11-22 01:04:23 +01:00

Bump coverage for office files and fix some related crashes

This commit is contained in:
jvoisin 2018-07-08 21:35:45 +02:00
parent ca01484126
commit ad3e7ccee8
4 changed files with 29 additions and 12 deletions

View File

@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
""" In this function, we're changing the XML """ In this function, we're changing the XML
document in two times, since we don't want document in two times, since we don't want
to change the tree we're iterating on.""" to change the tree we're iterating on."""
tree, ns = _parse_xml(full_path) try:
tree, ns = _parse_xml(full_path)
except ET.ParseError:
return False
# No revisions are present # No revisions are present
del_presence = tree.find('.//w:del', ns) del_presence = tree.find('.//w:del', ns)
@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
zipin = zipfile.ZipFile(self.filename) zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist(): for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
content = zipin.read(item).decode('utf-8')
try: try:
content = zipin.read(item).decode('utf-8')
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results: for (key, value) in results:
metadata[key] = value metadata[key] = value
except TypeError: # We didn't manage to parse the xml file except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
pass metadata[item.filename] = 'harmful content'
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items(): for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value metadata[key] = value
zipin.close() zipin.close()
@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
def __remove_revisions(self, full_path: str) -> bool: def __remove_revisions(self, full_path: str) -> bool:
tree, ns = _parse_xml(full_path) try:
tree, ns = _parse_xml(full_path)
except ET.ParseError:
return False
if 'office' not in ns.keys(): # no revisions in the current file if 'office' not in ns.keys(): # no revisions in the current file
return True return True
@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
zipin = zipfile.ZipFile(self.filename) zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist(): for item in zipin.infolist():
if item.filename == 'meta.xml': if item.filename == 'meta.xml':
content = zipin.read(item).decode('utf-8')
try: try:
content = zipin.read(item).decode('utf-8')
results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results: for (key, value) in results:
metadata[key] = value metadata[key] = value
except TypeError: # We didn't manage to parse the xml file except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
pass metadata[item.filename] = 'harmful content'
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items(): for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value metadata[key] = value
zipin.close() zipin.close()

Binary file not shown.

Binary file not shown.

View File

@ -15,6 +15,21 @@ class TestUnsupportedFiles(unittest.TestCase):
self.assertEqual(parser, None) self.assertEqual(parser, None)
os.remove('./tests/clean.py') os.remove('./tests/clean.py')
class TestCorruptedEmbedded(unittest.TestCase):
    """Exercise the parsers on the `embedded_corrupted` docx/odt fixtures.

    Both tests expect `remove_all()` to return a falsy value for these
    fixtures while `get_meta()` still returns a result.
    """

    def _copy_fixture(self, source: str, target: str) -> None:
        """Copy *source* to *target* and schedule the copy for removal.

        The removal is registered with `addCleanup` right after the copy so
        that the working file is deleted even when an assertion fails or the
        parser raises, instead of being left behind in `tests/data`.
        """
        shutil.copy(source, target)
        self.addCleanup(os.remove, target)

    def test_docx(self):
        """`remove_all()` is falsy and `get_meta()` is not None for the docx fixture."""
        self._copy_fixture('./tests/data/embedded_corrupted.docx', './tests/data/clean.docx')
        parser, _ = parser_factory.get_parser('./tests/data/clean.docx')
        self.assertFalse(parser.remove_all())
        self.assertIsNotNone(parser.get_meta())

    def test_odt(self):
        """`remove_all()` is falsy and `meta.xml` is reported as harmful for the odt fixture."""
        self._copy_fixture('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt')
        parser, _ = parser_factory.get_parser('./tests/data/clean.odt')
        self.assertFalse(parser.remove_all())
        self.assertEqual(parser.get_meta(), {'create_system': 'Weird', 'date_time': '2018-06-10 17:18:18', 'meta.xml': 'harmful content'})

class TestExplicitelyUnsupportedFiles(unittest.TestCase): class TestExplicitelyUnsupportedFiles(unittest.TestCase):
def test_pdf(self): def test_pdf(self):