1
0
Fork 0
mirror of synced 2025-07-03 11:57:26 +02:00

Bump coverage for office files and fix some related crashes

This commit is contained in:
jvoisin 2018-07-08 21:35:45 +02:00
parent ca01484126
commit ad3e7ccee8
4 changed files with 29 additions and 12 deletions

View file

@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
""" In this function, we're changing the XML
document in two passes, since we don't want
to change the tree we're iterating on."""
tree, ns = _parse_xml(full_path)
try:
tree, ns = _parse_xml(full_path)
except ET.ParseError:
return False
# No revisions are present
del_presence = tree.find('.//w:del', ns)
@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
content = zipin.read(item).decode('utf-8')
try:
content = zipin.read(item).decode('utf-8')
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results:
metadata[key] = value
except TypeError: # We didn't manage to parse the xml file
pass
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
metadata[item.filename] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value
zipin.close()
@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
def __remove_revisions(self, full_path: str) -> bool:
tree, ns = _parse_xml(full_path)
try:
tree, ns = _parse_xml(full_path)
except ET.ParseError:
return False
if 'office' not in ns.keys(): # no revisions in the current file
return True
@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist():
if item.filename == 'meta.xml':
content = zipin.read(item).decode('utf-8')
try:
content = zipin.read(item).decode('utf-8')
results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results:
metadata[key] = value
except TypeError: # We didn't manage to parse the xml file
pass
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
metadata[item.filename] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value
zipin.close()