Bump coverage for office files and fix some related crashes
This commit is contained in:
parent
ca01484126
commit
ad3e7ccee8
@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
""" In this function, we're changing the XML
|
""" In this function, we're changing the XML
|
||||||
document in two times, since we don't want
|
document in two times, since we don't want
|
||||||
to change the tree we're iterating on."""
|
to change the tree we're iterating on."""
|
||||||
|
try:
|
||||||
tree, ns = _parse_xml(full_path)
|
tree, ns = _parse_xml(full_path)
|
||||||
|
except ET.ParseError:
|
||||||
|
return False
|
||||||
|
|
||||||
# No revisions are present
|
# No revisions are present
|
||||||
del_presence = tree.find('.//w:del', ns)
|
del_presence = tree.find('.//w:del', ns)
|
||||||
@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
zipin = zipfile.ZipFile(self.filename)
|
zipin = zipfile.ZipFile(self.filename)
|
||||||
for item in zipin.infolist():
|
for item in zipin.infolist():
|
||||||
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
|
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
|
||||||
content = zipin.read(item).decode('utf-8')
|
|
||||||
try:
|
try:
|
||||||
|
content = zipin.read(item).decode('utf-8')
|
||||||
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
|
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
|
||||||
for (key, value) in results:
|
for (key, value) in results:
|
||||||
metadata[key] = value
|
metadata[key] = value
|
||||||
except TypeError: # We didn't manage to parse the xml file
|
except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
|
||||||
pass
|
metadata[item.filename] = 'harmful content'
|
||||||
if not metadata: # better safe than sorry
|
|
||||||
metadata[item] = 'harmful content'
|
|
||||||
for key, value in self._get_zipinfo_meta(item).items():
|
for key, value in self._get_zipinfo_meta(item).items():
|
||||||
metadata[key] = value
|
metadata[key] = value
|
||||||
zipin.close()
|
zipin.close()
|
||||||
@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
|
|
||||||
|
|
||||||
def __remove_revisions(self, full_path: str) -> bool:
|
def __remove_revisions(self, full_path: str) -> bool:
|
||||||
|
try:
|
||||||
tree, ns = _parse_xml(full_path)
|
tree, ns = _parse_xml(full_path)
|
||||||
|
except ET.ParseError:
|
||||||
|
return False
|
||||||
|
|
||||||
if 'office' not in ns.keys(): # no revisions in the current file
|
if 'office' not in ns.keys(): # no revisions in the current file
|
||||||
return True
|
return True
|
||||||
@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
zipin = zipfile.ZipFile(self.filename)
|
zipin = zipfile.ZipFile(self.filename)
|
||||||
for item in zipin.infolist():
|
for item in zipin.infolist():
|
||||||
if item.filename == 'meta.xml':
|
if item.filename == 'meta.xml':
|
||||||
content = zipin.read(item).decode('utf-8')
|
|
||||||
try:
|
try:
|
||||||
|
content = zipin.read(item).decode('utf-8')
|
||||||
results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
|
results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
|
||||||
for (key, value) in results:
|
for (key, value) in results:
|
||||||
metadata[key] = value
|
metadata[key] = value
|
||||||
except TypeError: # We didn't manage to parse the xml file
|
except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
|
||||||
pass
|
metadata[item.filename] = 'harmful content'
|
||||||
if not metadata: # better safe than sorry
|
|
||||||
metadata[item] = 'harmful content'
|
|
||||||
for key, value in self._get_zipinfo_meta(item).items():
|
for key, value in self._get_zipinfo_meta(item).items():
|
||||||
metadata[key] = value
|
metadata[key] = value
|
||||||
zipin.close()
|
zipin.close()
|
||||||
|
BIN
tests/data/embedded_corrupted.docx
Normal file
BIN
tests/data/embedded_corrupted.docx
Normal file
Binary file not shown.
BIN
tests/data/embedded_corrupted.odt
Normal file
BIN
tests/data/embedded_corrupted.odt
Normal file
Binary file not shown.
@ -15,6 +15,21 @@ class TestUnsupportedFiles(unittest.TestCase):
|
|||||||
self.assertEqual(parser, None)
|
self.assertEqual(parser, None)
|
||||||
os.remove('./tests/clean.py')
|
os.remove('./tests/clean.py')
|
||||||
|
|
||||||
|
class TestCorruptedEmbedded(unittest.TestCase):
    """Exercise the office parsers on the ``embedded_corrupted.*`` fixtures.

    Each test works on a scratch copy of its fixture so the original file in
    ``tests/data`` is never modified. The scratch copy is removed through
    ``addCleanup`` so that it does not linger when an assertion fails or the
    parser raises.
    """

    def test_docx(self):
        """``remove_all()`` reports failure, yet ``get_meta()`` still returns a value."""
        scratch = './tests/data/clean.docx'
        shutil.copy('./tests/data/embedded_corrupted.docx', scratch)
        # Registered right after the copy: cleanups run even if the test fails.
        self.addCleanup(os.remove, scratch)

        parser, _ = parser_factory.get_parser(scratch)
        self.assertFalse(parser.remove_all())
        self.assertIsNotNone(parser.get_meta())

    def test_odt(self):
        """``remove_all()`` reports failure and ``meta.xml`` is flagged as harmful."""
        scratch = './tests/data/clean.odt'
        shutil.copy('./tests/data/embedded_corrupted.odt', scratch)
        # Registered right after the copy: cleanups run even if the test fails.
        self.addCleanup(os.remove, scratch)

        parser, _ = parser_factory.get_parser(scratch)
        self.assertFalse(parser.remove_all())
        expected_meta = {
            'create_system': 'Weird',
            'date_time': '2018-06-10 17:18:18',
            'meta.xml': 'harmful content',
        }
        self.assertEqual(parser.get_meta(), expected_meta)
|
||||||
|
|
||||||
|
|
||||||
class TestExplicitelyUnsupportedFiles(unittest.TestCase):
|
class TestExplicitelyUnsupportedFiles(unittest.TestCase):
|
||||||
def test_pdf(self):
|
def test_pdf(self):
|
||||||
|
Loading…
Reference in New Issue
Block a user