Remove dangling references in MS Office's [Content_Types].xml
This commit is contained in:
parent
212d9c472c
commit
e342671ead
@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def __remove_content_type_members(self, full_path: str) -> bool:
    """ Remove the dangling references from the [Content_Types].xml file.

    Every `Override` element whose `PartName` designates an archive member
    matching one of the `self.files_to_omit` patterns is dropped, since
    MS Office doesn't like references to files that aren't in the archive.

    :param full_path: path of the [Content_Types].xml file to process;
        it is rewritten in place.
    :return: True if the file was rewritten, False if it can't be parsed
        or doesn't declare exactly one namespace, the default one.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError:  # pragma: no cover
        return False

    # There should be only one namespace for Types, and it must be the
    # default (unprefixed) one, because it is looked up under the '' key
    # below; a lone prefixed namespace would otherwise raise a KeyError.
    if len(namespace) != 1 or '' not in namespace:
        return False

    # Names of the archive members matching at least one omission pattern.
    with zipfile.ZipFile(self.filename) as zin:
        removed_fnames = {
            fname for fname in zin.namelist()
            if any(pattern.search(fname) for pattern in self.files_to_omit)
        }

    root = tree.getroot()
    # findall() returns a list, so removing children while looping is safe.
    for item in root.findall('{%s}Override' % namespace['']):
        # An Override without a PartName can't point to a removed member,
        # so it is left untouched instead of raising a KeyError.
        part_name = item.attrib.get('PartName', '')
        if part_name[1:] in removed_fnames:  # [1:] removes the leading '/'
            root.remove(item)

    tree.write(full_path, xml_declaration=True)

    return True
|
||||||
|
|
||||||
def _specific_cleanup(self, full_path: str) -> bool:
|
def _specific_cleanup(self, full_path: str) -> bool:
|
||||||
if os.stat(full_path).st_size == 0: # Don't process empty files
|
if os.stat(full_path).st_size == 0: # Don't process empty files
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
if full_path.endswith('/[Content_Types].xml'):
|
||||||
|
# this file contains references to files that we might
|
||||||
|
# remove, and MS Office doesn't like dangling references
|
||||||
|
if self.__remove_content_type_members(full_path) is False:
|
||||||
|
return False
|
||||||
|
|
||||||
if full_path.endswith('/word/document.xml'):
|
if full_path.endswith('/word/document.xml'):
|
||||||
# this file contains the revisions
|
# this file contains the revisions
|
||||||
if self.__remove_revisions(full_path) is False:
|
if self.__remove_revisions(full_path) is False:
|
||||||
|
BIN
tests/data/malformed_content_types.docx
Normal file
BIN
tests/data/malformed_content_types.docx
Normal file
Binary file not shown.
@ -80,6 +80,14 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
|
|||||||
os.remove('./tests/data/clean.py')
|
os.remove('./tests/data/clean.py')
|
||||||
|
|
||||||
|
|
||||||
|
class TestCorruptedContentTypesOffice(unittest.TestCase):
    """ Check that a .docx with a malformed [Content_Types].xml is rejected. """

    def test_office(self):
        """ `remove_all` must return False on the malformed fixture. """
        shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
        # Registered right after the copy, so that the working file is
        # removed even when an assertion below fails or the parser raises.
        self.addCleanup(os.remove, './tests/data/clean.docx')

        p = office.MSOfficeParser('./tests/data/clean.docx')
        self.assertIsNotNone(p)
        self.assertFalse(p.remove_all())
|
||||||
|
|
||||||
class TestCorruptedFiles(unittest.TestCase):
|
class TestCorruptedFiles(unittest.TestCase):
|
||||||
def test_pdf(self):
|
def test_pdf(self):
|
||||||
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
|
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
|
||||||
|
Loading…
Reference in New Issue
Block a user