Strip comment references from document.xml
This commit is contained in:
parent
1b9ce34e2c
commit
61f39c4bd0
@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser):
|
|||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@staticmethod
def __remove_document_comment_meta(full_path: str) -> bool:
    """ Strip the comment anchors from the XML file at `full_path`.

    Every `w:commentRangeStart`, `w:commentRangeEnd` and
    `w:commentReference` element is detached from its parent, and the
    file is then rewritten in place as UTF-8 with an XML declaration.
    When none of these elements is found, the file is left untouched.

    Returns False if the file can not be parsed, True otherwise.

    NOTE(review): ElementTree raises SyntaxError when the `w` prefix is
    missing from the namespace map; presumably callers only pass
    WordprocessingML parts here -- confirm.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    comment_tags = (
        'w:commentRangeStart',
        'w:commentRangeEnd',
        'w:commentReference',
    )

    # Collect first and detach afterwards, so that the tree is never
    # modified while it is being searched.
    doomed = [
        node
        for tag in comment_tags
        for node in tree.iterfind('.//' + tag, namespace)
    ]
    if not doomed:
        return True  # No comment meta tags are present

    # ElementTree nodes have no link to their parent, so build a
    # child -> parent lookup to know where to detach each node from.
    child_to_parent = {
        child: parent for parent in tree.iter() for child in parent
    }
    for node in doomed:
        child_to_parent[node].remove(node)

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
|
||||||
|
|
||||||
def __remove_content_type_members(self, full_path: str) -> bool:
|
def __remove_content_type_members(self, full_path: str) -> bool:
|
||||||
""" The method will remove the dangling references
|
""" The method will remove the dangling references
|
||||||
from the [Content_Types].xml file, since MS office doesn't like them
|
from the [Content_Types].xml file, since MS office doesn't like them
|
||||||
@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser):
|
|||||||
# this file contains the revisions
|
# this file contains the revisions
|
||||||
if self.__remove_revisions(full_path) is False:
|
if self.__remove_revisions(full_path) is False:
|
||||||
return False # pragma: no cover
|
return False # pragma: no cover
|
||||||
|
# remove comment references and ranges
|
||||||
|
if self.__remove_document_comment_meta(full_path) is False:
|
||||||
|
return False # pragma: no cover
|
||||||
elif full_path.endswith('/docProps/app.xml'):
|
elif full_path.endswith('/docProps/app.xml'):
|
||||||
# This file must be present and valid,
|
# This file must be present and valid,
|
||||||
# so we're removing as much as we can.
|
# so we're removing as much as we can.
|
||||||
|
@ -873,5 +873,35 @@ class TextDocx(unittest.TestCase):
|
|||||||
# Check if 'word/comments.xml' exists in the zip
|
# Check if 'word/comments.xml' exists in the zip
|
||||||
self.assertNotIn('word/comments.xml', zipin.namelist())
|
self.assertNotIn('word/comments.xml', zipin.namelist())
|
||||||
|
|
||||||
|
os.remove('./tests/data/comment_clean.docx')
|
||||||
|
os.remove('./tests/data/comment_clean.cleaned.docx')
|
||||||
|
|
||||||
|
def test_comment_references_are_removed(self):
    """ Cleaning a .docx must strip the comment anchors from
    word/document.xml.

    The fixture is first checked to really contain the three comment
    tags, then a copy of it is cleaned and the resulting document is
    checked to contain none of them.
    """
    markers = (
        b'w:commentRangeStart',
        b'w:commentRangeEnd',
        b'w:commentReference',
    )

    # Sanity check: the tags must be present before cleaning, otherwise
    # the assertions made after cleaning would prove nothing.
    with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
        # ZipFile.read() opens *and closes* the member, unlike a bare
        # ZipFile.open() whose handle would otherwise be leaked.
        content = zipin.read('word/document.xml')
    for marker in markers:
        self.assertIn(marker, content)

    # Work on a copy, so that the fixture itself is never modified.
    shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
    p = office.MSOfficeParser('./tests/data/comment_clean.docx')
    self.assertTrue(p.remove_all())

    with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
        content = zipin.read('word/document.xml')
    for marker in markers:
        self.assertNotIn(marker, content)

    os.remove('./tests/data/comment_clean.docx')
    os.remove('./tests/data/comment_clean.cleaned.docx')
|
Loading…
Reference in New Issue
Block a user