1
0
Fork 0

Strip comment references from document.xml

This commit is contained in:
Alex Marchant 2024-04-03 15:20:00 -04:00 committed by jvoisin
parent f931a0ecee
commit f2c898c92d
2 changed files with 67 additions and 1 deletions

View File

@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser):
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
@staticmethod
def __remove_document_comment_meta(full_path: str) -> bool:
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError as e: # pragma: no cover
logging.error("Unable to parse %s: %s", full_path, e)
return False
# search the docs to see if we can bail early
range_start = tree.find('.//w:commentRangeStart', namespace)
range_end = tree.find('.//w:commentRangeEnd', namespace)
references = tree.find('.//w:commentReference', namespace)
if range_start is None and range_end is None and references is None:
return True # No comment meta tags are present
parent_map = {c:p for p in tree.iter() for c in p}
# iterate over the elements and add them to list
elements_del = list()
for element in tree.iterfind('.//w:commentRangeStart', namespace):
elements_del.append(element)
for element in tree.iterfind('.//w:commentRangeEnd', namespace):
elements_del.append(element)
for element in tree.iterfind('.//w:commentReference', namespace):
elements_del.append(element)
# remove the elements
for element in elements_del:
parent_map[element].remove(element)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
def __remove_content_type_members(self, full_path: str) -> bool:
""" The method will remove the dangling references
form the [Content_Types].xml file, since MS office doesn't like them
@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser):
# this file contains the revisions
if self.__remove_revisions(full_path) is False:
return False # pragma: no cover
# remove comment references and ranges
if self.__remove_document_comment_meta(full_path) is False:
return False # pragma: no cover
elif full_path.endswith('/docProps/app.xml'):
# This file must be present and valid,
# so we're removing as much as we can.

View File

@ -900,4 +900,34 @@ class TextDocx(unittest.TestCase):
self.assertIsNotNone(match)
os.remove('./tests/data/comment_clean.docx')
os.remove('./tests/data/comment_clean.cleaned.docx')
os.remove('./tests/data/comment_clean.cleaned.docx')
def test_comment_references_are_removed(self):
with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
c = zipin.open('word/document.xml')
content = c.read()
r = b'w:commentRangeStart'
self.assertIn(r, content)
r = b'w:commentRangeEnd'
self.assertIn(r, content)
r = b'w:commentReference'
self.assertIn(r, content)
shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
p = office.MSOfficeParser('./tests/data/comment_clean.docx')
self.assertTrue(p.remove_all())
with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
c = zipin.open('word/document.xml')
content = c.read()
r = b'w:commentRangeStart'
self.assertNotIn(r, content)
r = b'w:commentRangeEnd'
self.assertNotIn(r, content)
r = b'w:commentReference'
self.assertNotIn(r, content)
os.remove('./tests/data/comment_clean.docx')
os.remove('./tests/data/comment_clean.cleaned.docx')