From f931a0eceed3a89ef7c94a8a7b2bbed208bf0295 Mon Sep 17 00:00:00 2001 From: Alex Marchant Date: Wed, 3 Apr 2024 15:27:48 -0400 Subject: [PATCH 1/2] Make utf-8 explicit in all tree.write calls --- libmat2/office.py | 16 ++++++++-------- tests/test_libmat2.py | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/libmat2/office.py b/libmat2/office.py index 6f69e4a..fa79834 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -38,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool: for c in tree.getroot(): c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True @@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser): for element in elements_to_remove: parent_map[element].remove(element) - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True @staticmethod @@ -250,7 +250,7 @@ class MSOfficeParser(ZipParser): for element in elements_to_remove: parent_map[element].remove(element) - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True @staticmethod @@ -287,7 +287,7 @@ class MSOfficeParser(ZipParser): parent_map[element].insert(position, children) parent_map[element].remove(element) - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True def __remove_content_type_members(self, full_path: str) -> bool: @@ -320,7 +320,7 @@ class MSOfficeParser(ZipParser): if name in removed_fnames: root.remove(item) - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True def _final_checks(self) -> bool: @@ -355,7 +355,7 @@ class MSOfficeParser(ZipParser): for item in tree.iterfind('.//p14:creationId', namespace): item.set('val', '%s' % random.randint(0, 2**32)) - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True @staticmethod @@ -371,7 +371,7 @@ class MSOfficeParser(ZipParser): for item in tree.iterfind('.//p:sldMasterId', namespace): item.set('id', '%s' % random.randint(0, 2**32)) - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True def _specific_cleanup(self, full_path: str) -> bool: @@ -514,7 +514,7 @@ class LibreOfficeParser(ZipParser): for changes in text.iterfind('.//text:tracked-changes', namespace): text.remove(changes) - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True def _specific_cleanup(self, full_path: str) -> bool: diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 32ae543..0435113 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -873,5 +873,31 @@ class TextDocx(unittest.TestCase): # Check if 'word/comments.xml' exists in the zip self.assertNotIn('word/comments.xml', zipin.namelist()) + os.remove('./tests/data/comment_clean.docx') + os.remove('./tests/data/comment_clean.cleaned.docx') + + def test_xml_is_utf8(self): + with zipfile.ZipFile('./tests/data/comment.docx') as zipin: + c = zipin.open('word/document.xml') + content = c.read() + + # ensure encoding is utf-8 + r = b'encoding=(\'|\")UTF-8(\'|\")' + match = re.search(r, content, re.IGNORECASE) + self.assertIsNotNone(match) + + shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx') + p = office.MSOfficeParser('./tests/data/comment_clean.docx') + self.assertTrue(p.remove_all()) + + with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin: + c = zipin.open('word/document.xml') + content = c.read() + + # ensure encoding is still utf-8 + r = b'encoding=(\'|\")UTF-8(\'|\")' + match = re.search(r, content, re.IGNORECASE) + self.assertIsNotNone(match) + os.remove('./tests/data/comment_clean.docx') os.remove('./tests/data/comment_clean.cleaned.docx') \ No newline at end of file From f2c898c92d0422ddc76fa977d60f7345b06a5ad6 Mon Sep 17 00:00:00 2001 From: Alex Marchant Date: Wed, 3 Apr 2024 15:20:00 -0400 Subject: [PATCH 2/2] Strip comment references from document.xml --- libmat2/office.py | 36 ++++++++++++++++++++++++++++++++++++ tests/test_libmat2.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/libmat2/office.py b/libmat2/office.py index fa79834..3a290d8 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser): tree.write(full_path, xml_declaration=True, encoding='utf-8') return True + @staticmethod + def __remove_document_comment_meta(full_path: str) -> bool: + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError as e: # pragma: no cover + logging.error("Unable to parse %s: %s", full_path, e) + return False + + # search the docs to see if we can bail early + range_start = tree.find('.//w:commentRangeStart', namespace) + range_end = tree.find('.//w:commentRangeEnd', namespace) + references = tree.find('.//w:commentReference', namespace) + if range_start is None and range_end is None and references is None: + return True # No comment meta tags are present + + parent_map = {c:p for p in tree.iter() for c in p} + + # iterate over the elements and add them to list + elements_del = list() + for element in tree.iterfind('.//w:commentRangeStart', namespace): + elements_del.append(element) + for element in tree.iterfind('.//w:commentRangeEnd', namespace): + elements_del.append(element) + for element in tree.iterfind('.//w:commentReference', namespace): + elements_del.append(element) + + # remove the elements + for element in elements_del: + parent_map[element].remove(element) + + tree.write(full_path, xml_declaration=True, encoding='utf-8') + return True + def __remove_content_type_members(self, full_path: str) -> bool: """ The method will remove the dangling references form the [Content_Types].xml file, since MS office doesn't like them @@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser): # this file contains the revisions if self.__remove_revisions(full_path) is False: return False # pragma: no cover + # remove comment references and ranges + if self.__remove_document_comment_meta(full_path) is False: + return False # pragma: no cover elif full_path.endswith('/docProps/app.xml'): # This file must be present and valid, # so we're removing as much as we can. diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 0435113..491f396 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -900,4 +900,34 @@ class TextDocx(unittest.TestCase): self.assertIsNotNone(match) os.remove('./tests/data/comment_clean.docx') - os.remove('./tests/data/comment_clean.cleaned.docx') \ No newline at end of file + os.remove('./tests/data/comment_clean.cleaned.docx') + + def test_comment_references_are_removed(self): + with zipfile.ZipFile('./tests/data/comment.docx') as zipin: + c = zipin.open('word/document.xml') + content = c.read() + + r = b'w:commentRangeStart' + self.assertIn(r, content) + r = b'w:commentRangeEnd' + self.assertIn(r, content) + r = b'w:commentReference' + self.assertIn(r, content) + + shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx') + p = office.MSOfficeParser('./tests/data/comment_clean.docx') + self.assertTrue(p.remove_all()) + + with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin: + c = zipin.open('word/document.xml') + content = c.read() + + r = b'w:commentRangeStart' + self.assertNotIn(r, content) + r = b'w:commentRangeEnd' + self.assertNotIn(r, content) + r = b'w:commentReference' + self.assertNotIn(r, content) + + os.remove('./tests/data/comment_clean.docx') + os.remove('./tests/data/comment_clean.cleaned.docx')