Remove dangling references from document.xml.rels

The file `word/_rels/document.xml.rels` is similar to `[Content_Types].xml` and has references to other files in the archive. If those references aren't removed, Word refuses to open the document.
2025-07-03 20:07:28 +02:00 · 2024-04-05 18:45:58 +02:00 · 2024-04-05 18:45:58 +02:00 · 156855ab7e
commit 156855ab7e
parent 09672a2dcc
2 changed files with 58 additions and 1 deletions
--- a/libmat2/office.py
+++ b/libmat2/office.py
@ -323,6 +323,38 @@ class MSOfficeParser(ZipParser):
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True

+    def __remove_document_xml_rels_members(self, full_path: str) -> bool:
+        """ Remove the dangling references from the word/_rels/document.xml.rels file, since MS office doesn't like them.
+        A Relationship is dropped when 'word/' + its Target names an archive member matching files_to_omit and no files_to_keep pattern; returns False only when the XML cannot be parsed. """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+
+        if len(namespace.items()) != 1:  # pragma: no cover
+            logging.debug("Got several namespaces for Types: %s", namespace.items())  # only logged, processing continues; NOTE(review): "Types" looks copied from the [Content_Types].xml handling - confirm
+
+        removed_fnames = set()  # archive member names matching an omit pattern and no keep pattern
+        with zipfile.ZipFile(self.filename) as zin:  # list the members of the archive at self.filename
+            for fname in [item.filename for item in zin.infolist()]:
+                for file_to_omit in self.files_to_omit:
+                    if file_to_omit.search(fname):
+                        matches = map(lambda r: r.search(fname), self.files_to_keep)  # lazy; consumed by any() below
+                        if any(matches):  # the file is in the allowlist ('continue' only advances to the next omit pattern)
+                            continue
+                        removed_fnames.add(fname)
+                        break  # one matching omit pattern is enough for this member
+
+        root = tree.getroot()
+        for item in root.findall('{%s}Relationship' % namespace['']):  # presumably '' keys the default namespace returned by _parse_xml - confirm
+            name = 'word/' + item.attrib['Target'] # add the word/ prefix to the path, since all document rels are in the word/ directory; NOTE(review): a Target that is absolute, uses '../' or is external would never match - confirm such targets cannot reference omitted files
+            if name in removed_fnames:
+                root.remove(item)  # safe while looping: findall() returned a list
+
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')  # overwrite the file in place
+        return True
+
    @staticmethod
    def __remove_document_comment_meta(full_path: str) -> bool:
        try:
@ -445,7 +477,7 @@ class MSOfficeParser(ZipParser):
        if os.stat(full_path).st_size == 0:  # Don't process empty files
            return True

-        if not full_path.endswith('.xml'):
+        if not full_path.endswith(('.xml', '.xml.rels')):
            return True

        if self.__randomize_creationId(full_path) is False:
@ -465,6 +497,10 @@ class MSOfficeParser(ZipParser):
            # remove comment references and ranges
            if self.__remove_document_comment_meta(full_path) is False:
                return False  # pragma: no cover
+        elif full_path.endswith('/word/_rels/document.xml.rels'):
+            # similar to the above, but for the document.xml.rels file
+            if self.__remove_document_xml_rels_members(full_path) is False:  # pragma: no cover
+                return False
        elif full_path.endswith('/docProps/app.xml'):
            # This file must be present and valid,
            # so we're removing as much as we can.
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@ -931,3 +931,24 @@ class TextDocx(unittest.TestCase):

        os.remove('./tests/data/comment_clean.docx')
        os.remove('./tests/data/comment_clean.cleaned.docx')
+
+    def test_clean_document_xml_rels(self):  # cleaning must drop the comments.xml relationship from word/_rels/document.xml.rels
+        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
+            c = zipin.open('word/_rels/document.xml.rels')
+            content = c.read()
+            r = b'Target="comments.xml"'
+            self.assertIn(r, content)  # precondition: the fixture does reference comments.xml
+
+        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')  # work on a copy so the fixture stays untouched
+        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:  # output archive checked after remove_all()
+            c = zipin.open('word/_rels/document.xml.rels')
+            content = c.read()
+            r = b'Target="comments.xml"'
+            self.assertNotIn(r, content)  # the relationship must be gone after cleaning
+
+        os.remove('./tests/data/comment_clean.docx')  # clean up the copy and the generated output
+        os.remove('./tests/data/comment_clean.cleaned.docx')
+