diff --git a/libmat2/office.py b/libmat2/office.py
index 5381eb9..acd8ca2 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -14,6 +14,29 @@ from . import abstract, parser_factory
 assert Set
 assert Pattern
 
+def _parse_xml(full_path: str):
+    """ Parse the XML file at `full_path`, with namespace support.
+
+    Return a `(tree, ns)` tuple: the parsed tree, and a dict mapping
+    every namespace prefix declared in the file to its URI.
+    """
+    def parse_map(f):  # etree support for ns is a bit rough
+        ns_map = dict()
+        for _event, (k, v) in ET.iterparse(f, ("start-ns", )):
+            # only `start-ns` events are requested: no need to filter
+            ns_map[k] = v
+        return ns_map
+
+    ns = parse_map(full_path)
+
+    # Register the namespaces, so that they are written
+    # back with the prefixes used in the original file
+    for k, v in ns.items():
+        ET.register_namespace(k, v)
+
+    return ET.parse(full_path), ns
+
+
 class ArchiveBasedAbstractParser(abstract.AbstractParser):
     # Those are the files that have a format that _isn't_
     # supported by MAT2, but that we want to keep anyway.
@@ -72,7 +95,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
                 zin.extract(member=item, path=temp_folder)
                 full_path = os.path.join(temp_folder, item.filename)
 
-                self._specific_cleanup(full_path)
+                if self._specific_cleanup(full_path) is False:
+                    shutil.rmtree(temp_folder)
+                    os.remove(self.output_filename)
+                    print("Something went wrong during deep cleaning of %s" % item.filename)
+                    return False
 
                 if item.filename in self.files_to_keep:
                     # those files aren't supported, but we want to add them anyway
@@ -118,6 +145,56 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         '^docProps/',
     }))
 
+    def __remove_revisions(self, full_path:str) -> bool:
+        """ Remove the tracked changes from the XML file at `full_path`.
+
+        Deletions (`w:del`) are dropped along with their content, while
+        insertions (`w:ins`) are replaced by their own children, so that
+        the inserted content stays where it was in the document.
+
+        :param full_path: path of the extracted `word/document.xml`
+        :return: always `True`
+        """
+        tree, ns = _parse_xml(full_path)
+
+        # etree can't resolve the `w:` queries if the prefix is unknown
+        if 'w' not in ns:
+            return True
+
+        # A document can hold only insertions, or only deletions:
+        # there is nothing to clean only when _both_ are missing.
+        no_del = tree.find('.//w:del', ns) is None
+        no_ins = tree.find('.//w:ins', ns) is None
+        if no_del and no_ins:
+            return True
+
+        parent_map = {c:p for p in tree.iter() for c in p}
+
+        # `findall` returns lists, so we're never changing
+        # the tree we're iterating on.
+        for element in tree.findall('.//w:del', ns):
+            parent_map[element].remove(element)
+
+        for element in tree.findall('.//w:ins', ns):
+            parent = parent_map[element]
+            # Index amongst the siblings, not in the whole document
+            position = list(parent).index(element)
+            parent.remove(element)  # only once, however many children
+            for offset, child in enumerate(list(element)):
+                parent.insert(position + offset, child)
+                parent_map[child] = parent  # stay valid for nested `w:ins`
+
+        tree.write(full_path, xml_declaration=True)
+
+        return True
+
+    def _specific_cleanup(self, full_path:str) -> bool:
+        """ Only the main document of the archive is
+        looked at for revisions. """
+        if full_path.endswith('/word/document.xml'):
+            return self.__remove_revisions(full_path)
+        return True
+
     def get_meta(self) -> Dict[str, str]:
         """
         Yes, I know that parsing xml with regexp ain't pretty,
@@ -168,27 +245,16 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
 
 
     def __remove_revisions(self, full_path:str) -> bool:
-        def parse_map(f):  # etree support for ns is a bit rough
-            ns_map = dict()
-            for event, (k, v) in ET.iterparse(f, ("start-ns", )):
-                if event == "start-ns":
-                    ns_map[k] = v
-            return ns_map
+        tree, ns = _parse_xml(full_path)
 
-        ns = parse_map(full_path)
         if 'office' not in ns.keys():  # no revisions in the current file
             return True
 
-        # Register the namespaces
-        for k,v in ns.items():
-            ET.register_namespace(k, v)
-
-        tree = ET.parse(full_path)
         for text in tree.getroot().iterfind('.//office:text', ns):
             for changes in text.iterfind('.//text:tracked-changes', ns):
                 text.remove(changes)
 
-        tree.write(full_path, xml_declaration = True)
+        tree.write(full_path, xml_declaration=True)
 
         return True
 
@@ -219,4 +285,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
                     metadata[key] = value
         zipin.close()
         return metadata
-
diff --git a/tests/data/revision.docx b/tests/data/revision.docx
new file mode 100644
index 0000000..8a2d814
Binary files /dev/null and b/tests/data/revision.docx differ
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 1573790..4df6385 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -121,6 +121,7 @@ class TestRemovingThumbnails(unittest.TestCase):
         zipin.close()
 
         os.remove('./tests/data/clean.cleaned.odt')
+        os.remove('./tests/data/clean.odt')
 
 
 class TestRevisionsCleaning(unittest.TestCase):
@@ -142,6 +143,26 @@ class TestRevisionsCleaning(unittest.TestCase):
         os.remove('./tests/data/clean.odt')
         os.remove('./tests/data/clean.cleaned.odt')
 
+    def test_msoffice(self):
+        # `b''` is found in any bytes object, so it can't tell anything:
+        # look for the opening tag of a tracked insertion instead.
+        # NOTE(review): assumes that revision.docx holds at least one
+        # insertion; confirm against the fixture.
+        marker = b'<w:ins '
+
+        with zipfile.ZipFile('./tests/data/revision.docx') as zipin:
+            self.assertIn(marker, zipin.read('word/document.xml'))
+
+        shutil.copy('./tests/data/revision.docx', './tests/data/revision_clean.docx')
+        p = office.MSOfficeParser('./tests/data/revision_clean.docx')
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile('./tests/data/revision_clean.cleaned.docx') as zipin:
+            self.assertNotIn(marker, zipin.read('word/document.xml'))
+
+        os.remove('./tests/data/revision_clean.docx')
+        os.remove('./tests/data/revision_clean.cleaned.docx')
+
 
 class TestDeepCleaning(unittest.TestCase):
     def __check_deep_meta(self, p):