Merge branch 'alexmarchant-utf-8-encode-all'
This commit is contained in:
commit
09672a2dcc
@ -38,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
|
|||||||
for c in tree.getroot():
|
for c in tree.getroot():
|
||||||
c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
|
c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
|
||||||
|
|
||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
for element in elements_to_remove:
|
for element in elements_to_remove:
|
||||||
parent_map[element].remove(element)
|
parent_map[element].remove(element)
|
||||||
|
|
||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -250,7 +250,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
for element in elements_to_remove:
|
for element in elements_to_remove:
|
||||||
parent_map[element].remove(element)
|
parent_map[element].remove(element)
|
||||||
|
|
||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -287,7 +287,40 @@ class MSOfficeParser(ZipParser):
|
|||||||
parent_map[element].insert(position, children)
|
parent_map[element].insert(position, children)
|
||||||
parent_map[element].remove(element)
|
parent_map[element].remove(element)
|
||||||
|
|
||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||||
|
return True
|
||||||
|
|
||||||
|
@staticmethod
def __remove_document_comment_meta(full_path: str) -> bool:
    """Strip comment anchors and references from the XML file at `full_path`.

    Every `w:commentRangeStart`, `w:commentRangeEnd` and
    `w:commentReference` element is detached from its parent, and the
    tree is then written back in place with a UTF-8 XML declaration.
    The file is left untouched when none of those elements is present.

    Returns False if the file cannot be parsed, True otherwise.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    # Gather every comment-related element in a single pass
    comment_paths = (
        './/w:commentRangeStart',
        './/w:commentRangeEnd',
        './/w:commentReference',
    )
    doomed = [
        node
        for path in comment_paths
        for node in tree.iterfind(path, namespace)
    ]
    if not doomed:
        return True  # No comment meta tags are present

    # Elements carry no reference to their parent, so build the lookup first
    child_to_parent = {child: parent for parent in tree.iter() for child in parent}
    for node in doomed:
        child_to_parent[node].remove(node)

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -353,7 +386,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
if name in removed_fnames:
|
if name in removed_fnames:
|
||||||
root.remove(item)
|
root.remove(item)
|
||||||
|
|
||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _final_checks(self) -> bool:
|
def _final_checks(self) -> bool:
|
||||||
@ -388,7 +421,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
|
|
||||||
for item in tree.iterfind('.//p14:creationId', namespace):
|
for item in tree.iterfind('.//p14:creationId', namespace):
|
||||||
item.set('val', '%s' % random.randint(0, 2**32))
|
item.set('val', '%s' % random.randint(0, 2**32))
|
||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -404,7 +437,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
|
|
||||||
for item in tree.iterfind('.//p:sldMasterId', namespace):
|
for item in tree.iterfind('.//p:sldMasterId', namespace):
|
||||||
item.set('id', '%s' % random.randint(0, 2**32))
|
item.set('id', '%s' % random.randint(0, 2**32))
|
||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _specific_cleanup(self, full_path: str) -> bool:
|
def _specific_cleanup(self, full_path: str) -> bool:
|
||||||
@ -550,7 +583,7 @@ class LibreOfficeParser(ZipParser):
|
|||||||
for changes in text.iterfind('.//text:tracked-changes', namespace):
|
for changes in text.iterfind('.//text:tracked-changes', namespace):
|
||||||
text.remove(changes)
|
text.remove(changes)
|
||||||
|
|
||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _specific_cleanup(self, full_path: str) -> bool:
|
def _specific_cleanup(self, full_path: str) -> bool:
|
||||||
|
@ -876,6 +876,32 @@ class TextDocx(unittest.TestCase):
|
|||||||
os.remove('./tests/data/comment_clean.docx')
|
os.remove('./tests/data/comment_clean.docx')
|
||||||
os.remove('./tests/data/comment_clean.cleaned.docx')
|
os.remove('./tests/data/comment_clean.cleaned.docx')
|
||||||
|
|
||||||
|
def test_xml_is_utf8(self):
    """Check that cleaning a docx keeps `word/document.xml` declared as UTF-8."""
    # Matches the encoding attribute of the XML declaration, with either
    # quote style and any letter case.
    utf8_declaration = re.compile(b'encoding=(\'|\")UTF-8(\'|\")', re.IGNORECASE)

    # The untouched fixture must declare UTF-8 to begin with
    with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
        # ZipFile.read() opens and closes the member itself, so no
        # file handle is left dangling
        content = zipin.read('word/document.xml')
    self.assertIsNotNone(utf8_declaration.search(content))

    shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
    p = office.MSOfficeParser('./tests/data/comment_clean.docx')
    self.assertTrue(p.remove_all())

    # ensure encoding is still utf-8 once the document has been cleaned
    with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
        content = zipin.read('word/document.xml')
    self.assertIsNotNone(utf8_declaration.search(content))

    os.remove('./tests/data/comment_clean.docx')
    os.remove('./tests/data/comment_clean.cleaned.docx')
|
||||||
|
|
||||||
def test_comment_references_are_removed(self):
|
def test_comment_references_are_removed(self):
|
||||||
with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
|
with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
|
||||||
c = zipin.open('word/document.xml')
|
c = zipin.open('word/document.xml')
|
||||||
|
Loading…
Reference in New Issue
Block a user