Strip comment references from document.xml

2025-07-04 20:37:34 +02:00 · 2024-04-03 15:20:00 -04:00 · 2024-04-03 15:20:00 -04:00 · 61f39c4bd0
commit 61f39c4bd0
parent 1b9ce34e2c
2 changed files with 66 additions and 0 deletions
--- a/libmat2/office.py
+++ b/libmat2/office.py
@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser):
        tree.write(full_path, xml_declaration=True)
        return True
    @staticmethod
    def __remove_document_comment_meta(full_path: str) -> bool:
        """ Detach the comment anchors from the XML file at `full_path`.

        Every `w:commentRangeStart`, `w:commentRangeEnd` and
        `w:commentReference` element is removed from its parent, then the
        file is rewritten in place. If none of them is found, the file
        is not rewritten.

        Returns False when the file can't be parsed, True otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError as e:  # pragma: no cover
            logging.error("Unable to parse %s: %s", full_path, e)
            return False

        comment_tags = (
            'w:commentRangeStart',
            'w:commentRangeEnd',
            'w:commentReference',
        )
        # gather everything up front, so the tree isn't mutated while walked
        doomed = [
            node
            for tag in comment_tags
            for node in tree.iterfind('.//' + tag, namespace)
        ]
        if not doomed:
            return True  # No comment meta tags are present

        # elements hold no reference to their parent: build a child -> parent lookup
        parents = {}
        for parent in tree.iter():
            for child in parent:
                parents[child] = parent

        for node in doomed:
            parents[node].remove(node)

        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
    def __remove_content_type_members(self, full_path: str) -> bool:
        """ The method will remove the dangling references
        form the [Content_Types].xml file, since MS office doesn't like them
@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser):
            # this file contains the revisions
            if self.__remove_revisions(full_path) is False:
                return False  # pragma: no cover
            # remove comment references and ranges
            if self.__remove_document_comment_meta(full_path) is False:
                return False  # pragma: no cover
        elif full_path.endswith('/docProps/app.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@ -875,3 +875,33 @@ class TextDocx(unittest.TestCase):
        os.remove('./tests/data/comment_clean.docx')
        os.remove('./tests/data/comment_clean.cleaned.docx')
    def test_comment_references_are_removed(self):
        """ Cleaning a docx must strip the comment ranges and references
        from its word/document.xml.
        """
        markers = (
            b'w:commentRangeStart',
            b'w:commentRangeEnd',
            b'w:commentReference',
        )

        # The fixture has to contain every marker to begin with,
        # otherwise the checks on the cleaned file prove nothing.
        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
            # ZipFile.read() leaves no member handle open behind it
            content = zipin.read('word/document.xml')
        for marker in markers:
            self.assertIn(marker, content)

        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
        # Registered as a cleanup so the copy is removed even if an assertion fails
        self.addCleanup(os.remove, './tests/data/comment_clean.docx')

        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
        self.assertTrue(p.remove_all())
        # Only registered once remove_all() reported success
        self.addCleanup(os.remove, './tests/data/comment_clean.cleaned.docx')

        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
            content = zipin.read('word/document.xml')
        for marker in markers:
            self.assertNotIn(marker, content)