diff --git a/libmat2/archive.py b/libmat2/archive.py index d812531..b29d690 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py @@ -17,7 +17,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): """ Office files (.docx, .odt, …) are zipped files. """ # Those are the files that have a format that _isn't_ # supported by MAT2, but that we want to keep anyway. - files_to_keep = set() # type: Set[str] + files_to_keep = set() # type: Set[Pattern] # Those are the files that we _do not_ want to keep, # no matter if they are supported or not. @@ -89,7 +89,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): abort = True continue - if item.filename in self.files_to_keep: + if any(map(lambda r: r.search(item.filename), self.files_to_keep)): # those files aren't supported, but we want to add them anyway pass elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): diff --git a/libmat2/office.py b/libmat2/office.py index 91bf2a6..3abf108 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -50,24 +50,74 @@ class MSOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.openxmlformats-officedocument.presentationml.presentation' } - files_to_keep = { - '[Content_Types].xml', - '_rels/.rels', - 'word/_rels/document.xml.rels', - 'word/document.xml', - 'word/fontTable.xml', - 'word/settings.xml', - 'word/styles.xml', - 'docProps/app.xml', - 'docProps/core.xml', + content_types_to_keep = { + 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', # /word/endnotes.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml', # /word/footnotes.xml + 'application/vnd.openxmlformats-officedocument.extended-properties+xml', # /docProps/app.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', # /word/document.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml', 
# /word/fontTable.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml + 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml + + # Do we want to keep the following ones? + 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', + + # See https://0xacab.org/jvoisin/mat2/issues/71 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml + } + files_to_keep = set(map(re.compile, { # type: ignore + r'^\[Content_Types\]\.xml$', + r'^_rels/\.rels$', + r'^word/_rels/document\.xml\.rels$', + r'^word/_rels/footer[0-9]*\.xml\.rels$', + r'^word/_rels/header[0-9]*\.xml\.rels$', # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx - 'word/stylesWithEffects.xml', - } - files_to_omit = set(map(re.compile, { # type: ignore - 'word/webSettings.xml', - 'word/theme', + r'^word/stylesWithEffects\.xml$', })) + files_to_omit = set(map(re.compile, { # type: ignore + r'^customXml/', + r'webSettings\.xml$', + r'^docProps/custom\.xml$', + r'^word/printerSettings/', + r'^word/theme', + + # we have a whitelist in self.files_to_keep, + # so we can trash everything else + r'^word/_rels/', + })) + + def __init__(self, filename): + super().__init__(filename) + if self.__fill_files_to_keep_via_content_types() is False: + raise ValueError + + def __fill_files_to_keep_via_content_types(self) -> bool: + """ There is a super-handy `[Content_Types].xml` file + in MS Office archives, describing what each other file contains. + The self.content_types_to_keep member contains a type whitelist, + so we're using it to fill the self.files_to_keep one. 
+ """ + with zipfile.ZipFile(self.filename) as zin: + if '[Content_Types].xml' not in zin.namelist(): + return False + xml_data = zin.read('[Content_Types].xml') + + self.content_types = dict() # type: Dict[str, str] + try: + tree = ET.fromstring(xml_data) + except ET.ParseError: + return False + for c in tree: + if 'PartName' not in c.attrib or 'ContentType' not in c.attrib: + continue + elif c.attrib['ContentType'] in self.content_types_to_keep: + fname = c.attrib['PartName'][1:] # remove leading `/` + re_fname = re.compile('^' + re.escape(fname) + '$') + self.files_to_keep.add(re_fname) # type: ignore + return True @staticmethod def __remove_rsid(full_path: str) -> bool: @@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.oasis.opendocument.formula', 'application/vnd.oasis.opendocument.image', } - files_to_keep = { - 'META-INF/manifest.xml', - 'content.xml', - 'manifest.rdf', - 'mimetype', - 'settings.xml', - 'styles.xml', - } + files_to_keep = set(map(re.compile, { # type: ignore + r'^META-INF/manifest\.xml$', + r'^content\.xml$', + r'^manifest\.rdf$', + r'^mimetype$', + r'^settings\.xml$', + r'^styles\.xml$', + })) files_to_omit = set(map(re.compile, { # type: ignore r'^meta\.xml$', - '^Configurations2/', - '^Thumbnails/', + r'^Configurations2/', + r'^Thumbnails/', })) @staticmethod diff --git a/tests/data/broken_xml_content_types.docx b/tests/data/broken_xml_content_types.docx new file mode 100644 index 0000000..41e0e49 Binary files /dev/null and b/tests/data/broken_xml_content_types.docx differ diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx index 43ac743..cc5caf3 100644 Binary files a/tests/data/malformed_content_types.docx and b/tests/data/malformed_content_types.docx differ diff --git a/tests/data/no_content_types.docx b/tests/data/no_content_types.docx new file mode 100644 index 0000000..d0e0330 Binary files /dev/null and b/tests/data/no_content_types.docx differ diff 
--git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 4ac2678..8d7c252 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -86,14 +86,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase): os.remove('./tests/data/clean.py') -class TestCorruptedContentTypesOffice(unittest.TestCase): - def test_office(self): +class TestWrongContentTypesFileOffice(unittest.TestCase): + def test_office_incomplete(self): shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx') p = office.MSOfficeParser('./tests/data/clean.docx') self.assertIsNotNone(p) self.assertFalse(p.remove_all()) os.remove('./tests/data/clean.docx') + def test_office_broken(self): + shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx') + with self.assertRaises(ValueError): + office.MSOfficeParser('./tests/data/clean.docx') + os.remove('./tests/data/clean.docx') + + def test_office_absent(self): + shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx') + with self.assertRaises(ValueError): + office.MSOfficeParser('./tests/data/clean.docx') + os.remove('./tests/data/clean.docx') + class TestCorruptedFiles(unittest.TestCase): def test_pdf(self): shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')