Remove nsid fields from MSOffice documents
nsids are random identifiers, usually used to ease merging between documents, and can trivially be used for fingerprinting.
This commit is contained in:
parent
40669186c9
commit
0cf0541ad9
@ -62,9 +62,6 @@ class MSOfficeParser(ZipParser):
|
|||||||
|
|
||||||
# Do we want to keep the following ones?
|
# Do we want to keep the following ones?
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
|
||||||
|
|
||||||
# See https://0xacab.org/jvoisin/mat2/issues/71
|
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -88,6 +85,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
r'^word/printerSettings/',
|
r'^word/printerSettings/',
|
||||||
r'^word/theme',
|
r'^word/theme',
|
||||||
r'^word/people\.xml$',
|
r'^word/people\.xml$',
|
||||||
|
r'^word/numbering\.xml$',
|
||||||
|
|
||||||
# we have an allowlist in self.files_to_keep,
|
# we have an allowlist in self.files_to_keep,
|
||||||
# so we can trash everything else
|
# so we can trash everything else
|
||||||
@ -124,7 +122,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __remove_rsid(full_path: str) -> bool:
|
def __remove_rsid(full_path: str) -> bool:
|
||||||
""" The method will remove "revision session ID". We're '}rsid'
|
""" The method will remove "revision session ID". We're using '}rsid'
|
||||||
instead of proper parsing, since rsid can have multiple forms, like
|
instead of proper parsing, since rsid can have multiple forms, like
|
||||||
`rsidRDefault`, `rsidR`, `rsids`, …
|
`rsidRDefault`, `rsidR`, `rsids`, …
|
||||||
|
|
||||||
@ -137,7 +135,8 @@ class MSOfficeParser(ZipParser):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
tree, namespace = _parse_xml(full_path)
|
tree, namespace = _parse_xml(full_path)
|
||||||
except ET.ParseError:
|
except ET.ParseError as e:
|
||||||
|
logging.error("Unable to parse %s: %s", full_path, e)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# rsid, tags or attributes, are always under the `w` namespace
|
# rsid, tags or attributes, are always under the `w` namespace
|
||||||
@ -161,6 +160,41 @@ class MSOfficeParser(ZipParser):
|
|||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@staticmethod
def __remove_nsid(full_path: str) -> bool:
    """ Strip every `w:nsid` element from the XML file at `full_path`,
    rewriting the file in place.

    NSID are random identifiers that can be used to ease the merging
    of some components of a document. They can also be used for
    fingerprinting.

    See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1

    The work is done in two passes: the elements to delete are collected
    first and only removed afterwards, since we don't want to change the
    tree while we're iterating on it.

    Returns False if the file can't be parsed as XML, True otherwise
    (including when the document doesn't declare the `w` namespace, in
    which case the file is left untouched).
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    # The NSID tag is always under the `w` namespace, so there is
    # nothing to clean when that namespace isn't declared.
    if 'w' in namespace:
        # ElementTree nodes don't know their parent: build the
        # child -> parent lookup ourselves.
        child_to_parent = {}
        for parent in tree.iter():
            for child in parent:
                child_to_parent[child] = parent

        # Snapshot the matches before touching the tree.
        doomed = list(tree.iterfind('.//w:nsid', namespace))
        for node in doomed:
            child_to_parent[node].remove(node)

        tree.write(full_path, xml_declaration=True)

    return True
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __remove_revisions(full_path: str) -> bool:
|
def __remove_revisions(full_path: str) -> bool:
|
||||||
""" In this function, we're changing the XML document in several
|
""" In this function, we're changing the XML document in several
|
||||||
@ -208,7 +242,8 @@ class MSOfficeParser(ZipParser):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
tree, namespace = _parse_xml(full_path)
|
tree, namespace = _parse_xml(full_path)
|
||||||
except ET.ParseError: # pragma: no cover
|
except ET.ParseError as e: # pragma: no cover
|
||||||
|
logging.error("Unable to parse %s: %s", full_path, e)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if len(namespace.items()) != 1:
|
if len(namespace.items()) != 1:
|
||||||
@ -269,6 +304,9 @@ class MSOfficeParser(ZipParser):
|
|||||||
if self.__remove_rsid(full_path) is False:
|
if self.__remove_rsid(full_path) is False:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
if self.__remove_nsid(full_path) is False:
|
||||||
|
return False # pragma: no cover
|
||||||
|
|
||||||
try:
|
try:
|
||||||
_sort_xml_attributes(full_path)
|
_sort_xml_attributes(full_path)
|
||||||
except ET.ParseError as e: # pragma: no cover
|
except ET.ParseError as e: # pragma: no cover
|
||||||
|
Loading…
Reference in New Issue
Block a user