Improve the comments in the code a bit
This is related to the previous commit
This commit is contained in:
parent
0cf0541ad9
commit
0170f0e37e
@ -44,6 +44,12 @@ def _sort_xml_attributes(full_path: str) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
class MSOfficeParser(ZipParser):
|
class MSOfficeParser(ZipParser):
|
||||||
|
"""
|
||||||
|
The methods modifying XML documents usually do so in two loops:
|
||||||
|
1. finding the tag/attributes to remove;
|
||||||
|
2. actually editing the document
|
||||||
|
since it's tricky to modify the XML while iterating on it.
|
||||||
|
"""
|
||||||
mimetypes = {
|
mimetypes = {
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||||
@ -126,9 +132,6 @@ class MSOfficeParser(ZipParser):
|
|||||||
instead of proper parsing, since rsid can have multiple forms, like
|
instead of proper parsing, since rsid can have multiple forms, like
|
||||||
`rsidRDefault`, `rsidR`, `rsids`, …
|
`rsidRDefault`, `rsidR`, `rsids`, …
|
||||||
|
|
||||||
We're removing rsid tags in two passes, because we can't modify
|
|
||||||
the xml while we're iterating on it.
|
|
||||||
|
|
||||||
For more details, see
|
For more details, see
|
||||||
- https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
|
- https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
|
||||||
- https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
|
- https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
|
||||||
@ -163,15 +166,11 @@ class MSOfficeParser(ZipParser):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def __remove_nsid(full_path: str) -> bool:
|
def __remove_nsid(full_path: str) -> bool:
|
||||||
"""
|
"""
|
||||||
NSID are random identifiers that can be used
|
nsid are random identifiers that can be used to ease the merging of
|
||||||
to ease the merging of some components of a document.
|
some components of a document. They can also be used for
|
||||||
They can also be used for fingerprinting.
|
fingerprinting.
|
||||||
|
|
||||||
See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1
|
See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1
|
||||||
|
|
||||||
In this function, we're changing the XML document in several
|
|
||||||
separate passes, since we don't want to change the tree we're currently
|
|
||||||
iterating on.
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
tree, namespace = _parse_xml(full_path)
|
tree, namespace = _parse_xml(full_path)
|
||||||
@ -179,7 +178,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
logging.error("Unable to parse %s: %s", full_path, e)
|
logging.error("Unable to parse %s: %s", full_path, e)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# The NSID tag is always under the `w` namespace
|
# The nsid tag is always under the `w` namespace
|
||||||
if 'w' not in namespace.keys():
|
if 'w' not in namespace.keys():
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -197,10 +196,6 @@ class MSOfficeParser(ZipParser):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __remove_revisions(full_path: str) -> bool:
|
def __remove_revisions(full_path: str) -> bool:
|
||||||
""" In this function, we're changing the XML document in several
|
|
||||||
separate passes, since we don't want to change the tree we're currently
|
|
||||||
iterating on.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
tree, namespace = _parse_xml(full_path)
|
tree, namespace = _parse_xml(full_path)
|
||||||
except ET.ParseError as e:
|
except ET.ParseError as e:
|
||||||
|
Loading…
Reference in New Issue
Block a user