From 7dad77a7857990787551e99b2a09bdf908e67553 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 10 Jun 2018 20:20:00 +0200 Subject: [PATCH] Make the parsing of office format's metadata more robust --- libmat2/office.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/libmat2/office.py b/libmat2/office.py index 914fd39..6ab7e80 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -78,8 +78,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser): for item in zipin.infolist(): if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): content = zipin.read(item).decode('utf-8') - for (key, value) in re.findall(r"<(.+)>(.+)", content, re.I): - metadata[key] = value + try: + results = re.findall(r"<(.+)>(.+)", content, re.I|re.M) + for (key, value) in results: + metadata[key] = value + except TypeError: # We didn't manage to parse the xml file + pass if not metadata: # better safe than sorry metadata[item] = 'harmful content' @@ -140,8 +144,12 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): for item in zipin.infolist(): if item.filename == 'meta.xml': content = zipin.read(item).decode('utf-8') - for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)", content, re.I): - metadata[key] = value + try: + results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)", content, re.I|re.M) + for (key, value) in results: + metadata[key] = value + except TypeError: # We didn't manage to parse the xml file + pass if not metadata: # better safe than sorry metadata[item] = 'harmful content' for key, value in self._get_zipinfo_meta(item).items():