Use [Content_Types].xml to improve MS Office coverage
This commit is contained in:
parent
5b606f939d
commit
c67bbafb2c
@ -17,7 +17,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
|||||||
""" Office files (.docx, .odt, …) are zipped files. """
|
""" Office files (.docx, .odt, …) are zipped files. """
|
||||||
# Those are the files that have a format that _isn't_
|
# Those are the files that have a format that _isn't_
|
||||||
# supported by MAT2, but that we want to keep anyway.
|
# supported by MAT2, but that we want to keep anyway.
|
||||||
files_to_keep = set() # type: Set[str]
|
files_to_keep = set() # type: Set[Pattern]
|
||||||
|
|
||||||
# Those are the files that we _do not_ want to keep,
|
# Those are the files that we _do not_ want to keep,
|
||||||
# no matter if they are supported or not.
|
# no matter if they are supported or not.
|
||||||
@ -89,7 +89,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
|||||||
abort = True
|
abort = True
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if item.filename in self.files_to_keep:
|
if any(map(lambda r: r.search(item.filename), self.files_to_keep)):
|
||||||
# those files aren't supported, but we want to add them anyway
|
# those files aren't supported, but we want to add them anyway
|
||||||
pass
|
pass
|
||||||
elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
|
elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
|
||||||
|
@ -50,24 +50,74 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation'
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation'
|
||||||
}
|
}
|
||||||
files_to_keep = {
|
content_types_to_keep = {
|
||||||
'[Content_Types].xml',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', # /word/endnotes.xml
|
||||||
'_rels/.rels',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml', # /word/footnotes.xml
|
||||||
'word/_rels/document.xml.rels',
|
'application/vnd.openxmlformats-officedocument.extended-properties+xml', # /docProps/app.xml
|
||||||
'word/document.xml',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', # /word/document.xml
|
||||||
'word/fontTable.xml',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml', # /word/fontTable.xml
|
||||||
'word/settings.xml',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml
|
||||||
'word/styles.xml',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml
|
||||||
'docProps/app.xml',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml
|
||||||
'docProps/core.xml',
|
'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml
|
||||||
|
|
||||||
|
# Do we want to keep the following ones?
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
|
||||||
|
|
||||||
|
# See https://0xacab.org/jvoisin/mat2/issues/71
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
|
||||||
|
}
|
||||||
|
files_to_keep = set(map(re.compile, { # type: ignore
|
||||||
|
r'^\[Content_Types\]\.xml$',
|
||||||
|
r'^_rels/\.rels$',
|
||||||
|
r'^word/_rels/document\.xml\.rels$',
|
||||||
|
r'^word/_rels/footer[0-9]*\.xml\.rels$',
|
||||||
|
r'^word/_rels/header[0-9]*\.xml\.rels$',
|
||||||
|
|
||||||
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
|
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
|
||||||
'word/stylesWithEffects.xml',
|
r'^word/stylesWithEffects\.xml$',
|
||||||
}
|
|
||||||
files_to_omit = set(map(re.compile, { # type: ignore
|
|
||||||
'word/webSettings.xml',
|
|
||||||
'word/theme',
|
|
||||||
}))
|
}))
|
||||||
|
files_to_omit = set(map(re.compile, { # type: ignore
|
||||||
|
r'^customXml/',
|
||||||
|
r'webSettings\.xml$',
|
||||||
|
r'^docProps/custom\.xml$',
|
||||||
|
r'^word/printerSettings/',
|
||||||
|
r'^word/theme',
|
||||||
|
|
||||||
|
# we have a whitelist in self.files_to_keep,
|
||||||
|
# so we can trash everything else
|
||||||
|
r'^word/_rels/',
|
||||||
|
}))
|
||||||
|
|
||||||
|
def __init__(self, filename):
|
||||||
|
super().__init__(filename)
|
||||||
|
if self.__fill_files_to_keep_via_content_types() is False:
|
||||||
|
raise ValueError
|
||||||
|
|
||||||
|
def __fill_files_to_keep_via_content_types(self) -> bool:
|
||||||
|
""" There is a super-handy `[Content_Types].xml` file
|
||||||
|
in MS Office archives, describing what each other file contains.
|
||||||
|
The self.content_types_to_keep member contains a type whitelist,
|
||||||
|
so we're using it to fill the self.files_to_keep one.
|
||||||
|
"""
|
||||||
|
with zipfile.ZipFile(self.filename) as zin:
|
||||||
|
if '[Content_Types].xml' not in zin.namelist():
|
||||||
|
return False
|
||||||
|
xml_data = zin.read('[Content_Types].xml')
|
||||||
|
|
||||||
|
self.content_types = dict() # type: Dict[str, str]
|
||||||
|
try:
|
||||||
|
tree = ET.fromstring(xml_data)
|
||||||
|
except ET.ParseError:
|
||||||
|
return False
|
||||||
|
for c in tree:
|
||||||
|
if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
|
||||||
|
continue
|
||||||
|
elif c.attrib['ContentType'] in self.content_types_to_keep:
|
||||||
|
fname = c.attrib['PartName'][1:] # remove leading `/`
|
||||||
|
re_fname = re.compile('^' + re.escape(fname) + '$')
|
||||||
|
self.files_to_keep.add(re_fname) # type: ignore
|
||||||
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __remove_rsid(full_path: str) -> bool:
|
def __remove_rsid(full_path: str) -> bool:
|
||||||
@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
'application/vnd.oasis.opendocument.formula',
|
'application/vnd.oasis.opendocument.formula',
|
||||||
'application/vnd.oasis.opendocument.image',
|
'application/vnd.oasis.opendocument.image',
|
||||||
}
|
}
|
||||||
files_to_keep = {
|
files_to_keep = set(map(re.compile, { # type: ignore
|
||||||
'META-INF/manifest.xml',
|
r'^META-INF/manifest\.xml$',
|
||||||
'content.xml',
|
r'^content\.xml$',
|
||||||
'manifest.rdf',
|
r'^manifest\.rdf$',
|
||||||
'mimetype',
|
r'^mimetype$',
|
||||||
'settings.xml',
|
r'^settings\.xml$',
|
||||||
'styles.xml',
|
r'^styles\.xml$',
|
||||||
}
|
}))
|
||||||
files_to_omit = set(map(re.compile, { # type: ignore
|
files_to_omit = set(map(re.compile, { # type: ignore
|
||||||
r'^meta\.xml$',
|
r'^meta\.xml$',
|
||||||
'^Configurations2/',
|
r'^Configurations2/',
|
||||||
'^Thumbnails/',
|
r'^Thumbnails/',
|
||||||
}))
|
}))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
BIN
tests/data/broken_xml_content_types.docx
Normal file
BIN
tests/data/broken_xml_content_types.docx
Normal file
Binary file not shown.
Binary file not shown.
BIN
tests/data/no_content_types.docx
Normal file
BIN
tests/data/no_content_types.docx
Normal file
Binary file not shown.
@ -86,14 +86,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
|
|||||||
os.remove('./tests/data/clean.py')
|
os.remove('./tests/data/clean.py')
|
||||||
|
|
||||||
|
|
||||||
class TestCorruptedContentTypesOffice(unittest.TestCase):
|
class TestWrongContentTypesFileOffice(unittest.TestCase):
|
||||||
def test_office(self):
|
def test_office_incomplete(self):
|
||||||
shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
|
shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
|
||||||
p = office.MSOfficeParser('./tests/data/clean.docx')
|
p = office.MSOfficeParser('./tests/data/clean.docx')
|
||||||
self.assertIsNotNone(p)
|
self.assertIsNotNone(p)
|
||||||
self.assertFalse(p.remove_all())
|
self.assertFalse(p.remove_all())
|
||||||
os.remove('./tests/data/clean.docx')
|
os.remove('./tests/data/clean.docx')
|
||||||
|
|
||||||
|
def test_office_broken(self):
|
||||||
|
shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx')
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
office.MSOfficeParser('./tests/data/clean.docx')
|
||||||
|
os.remove('./tests/data/clean.docx')
|
||||||
|
|
||||||
|
def test_office_absent(self):
|
||||||
|
shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx')
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
office.MSOfficeParser('./tests/data/clean.docx')
|
||||||
|
os.remove('./tests/data/clean.docx')
|
||||||
|
|
||||||
class TestCorruptedFiles(unittest.TestCase):
|
class TestCorruptedFiles(unittest.TestCase):
|
||||||
def test_pdf(self):
|
def test_pdf(self):
|
||||||
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
|
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
|
||||||
|
Loading…
Reference in New Issue
Block a user