Implement rsid stripping for office files

An MS Office XML rsid is a "unique identifier used to track the editing session when the physical character representing this section mark was last formatted." See the following links for details:
- https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
- https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
2025-07-04 20:37:34 +02:00 · 2018-09-20 22:37:53 +02:00 · 2018-09-20 22:37:53 +02:00 · 174d4a0ac0
commit 174d4a0ac0
parent fbcf68c280
3 changed files with 87 additions and 5 deletions
--- a/libmat2/office.py
+++ b/libmat2/office.py
@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET  # type: ignore
 from .archive import ArchiveBasedAbstractParser
 # pylint: disable=line-too-long
 # Make pyflakes happy
 assert Set
 assert Pattern
@ -15,14 +17,12 @@ assert Pattern
 def _parse_xml(full_path: str):
    """ This function parses XML, with namespace support. """
    cpt = 0
    namespace_map = dict()
    for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
        # The ns[0-9]+ namespaces are reserved for interal usage, so
        # we have to use an other nomenclature.
-        if re.match('^ns[0-9]+$', key):
+        if re.match('^ns[0-9]+$', key, re.I):  #pragma: no cover
-            key = 'mat%d' % cpt
+            key = 'mat' + key[2:]
            cpt += 1
        namespace_map[key] = value
        ET.register_namespace(key, value)
@ -59,11 +59,56 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        'word/fontTable.xml',
        'word/settings.xml',
        'word/styles.xml',
        # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
        'word/stylesWithEffects.xml',
    }
    # Each string below is compiled into a regular expression and the
    # resulting patterns are stored in a set. Judging by the name, archive
    # members matching one of these patterns are presumably left out of the
    # cleaned document — TODO confirm against the code consuming this set.
    files_to_omit = set(map(re.compile, {  # type: ignore
        'word/webSettings.xml',
        'word/theme',
        '^docProps/',
    }))
    @staticmethod
    def __remove_rsid(full_path: str) -> bool:
        """ Strip the "revision session IDs" (rsid) out of the XML file
        located at `full_path`, and write the result back to the same path.

        Instead of properly parsing names, we simply look for the substring
        '}rsid' (ignoring case) in tags and attribute names, since rsid can
        take multiple forms, like `rsidRDefault`, `rsidR`, `rsids`, …

        Matching elements are collected first and detached afterwards,
        because the xml can't be modified while we're iterating on it.

        Returns False if the file can't be parsed as XML, True otherwise
        (including when the document doesn't declare the `w` namespace, in
        which case the file is left untouched).

        For more details, see
        - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
        - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError:
            return False

        # rsid, tags or attributes, are always under the `w` namespace
        if 'w' not in namespace.keys():
            return True

        # ElementTree nodes don't know their parent, so record it up front.
        child_to_parent = dict()
        for parent in tree.iter():
            for child in parent:
                child_to_parent[child] = parent

        rsid_elements = list()
        for node in tree.iterfind('.//', namespace):
            if '}rsid' in node.tag.strip().lower():  # rsid as tag
                rsid_elements.append(node)
            else:  # rsid as attribute
                rsid_attributes = [name for name in node.attrib
                                   if '}rsid' in name.lower()]
                for name in rsid_attributes:
                    del node.attrib[name]

        # Second pass: detach the collected elements from their parents.
        for node in rsid_elements:
            child_to_parent[node].remove(node)

        tree.write(full_path, xml_declaration=True)
        return True
    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        """ In this function, we're changing the XML document in several
@ -112,7 +157,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
        if full_path.endswith('/word/document.xml'):
            # this file contains the revisions
-            return self.__remove_revisions(full_path)
+            if self.__remove_revisions(full_path) is False:
                return False
        if full_path.endswith('.xml'):
            if self.__remove_rsid(full_path) is False:
                return False
        return True
    def get_meta(self) -> Dict[str, str]:
--- a/tests/data/office_revision_session_ids.docx
+++ b/tests/data/office_revision_session_ids.docx
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase):
        os.remove('./tests/data/clean.odt')
        os.remove('./tests/data/clean.cleaned.odt')
 class TestRsidRemoval(unittest.TestCase):
    """ Check that revision session IDs (rsid) are removed from the XML
    members of an MS Office document. """

    def test_office(self):
        """ The fixture contains `w:rsid` occurrences before cleaning, and
        none of the XML members of the cleaned file may contain any. """
        shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        # Integer accumulator: start from 0 rather than relying on `False`
        # being silently promoted to an int by `+=`.
        how_many_rsid = 0
        with zipfile.ZipFile('./tests/data/clean.docx') as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
                how_many_rsid += num
        self.assertEqual(how_many_rsid, 11)

        ret = p.remove_all()
        self.assertTrue(ret)

        # Every XML member of the cleaned archive must be free of rsid.
        with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
                self.assertEqual(num, 0)

        os.remove('./tests/data/clean.docx')
        os.remove('./tests/data/clean.cleaned.docx')