Vastly improve ppt compatibility
This commit is contained in:
parent
a23dc001cd
commit
d7a03d907b
@ -16,7 +16,7 @@ linting:bandit:
|
|||||||
script: # TODO: remove B405 and B314
|
script: # TODO: remove B405 and B314
|
||||||
- bandit ./mat2 --format txt --skip B101
|
- bandit ./mat2 --format txt --skip B101
|
||||||
- bandit -r ./nautilus/ --format txt --skip B101
|
- bandit -r ./nautilus/ --format txt --skip B101
|
||||||
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
|
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108,B311
|
||||||
|
|
||||||
linting:codespell:
|
linting:codespell:
|
||||||
image: $CONTAINER_REGISTRY:linting
|
image: $CONTAINER_REGISTRY:linting
|
||||||
|
@ -152,6 +152,8 @@ Copyright 2016 Marie-Rose for mat2's logo
|
|||||||
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
|
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
|
||||||
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
|
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
|
||||||
|
|
||||||
|
The `narrated_powerpoint_presentation.pptx` file is in the public domain.
|
||||||
|
|
||||||
# Thanks
|
# Thanks
|
||||||
|
|
||||||
mat2 wouldn't exist without:
|
mat2 wouldn't exist without:
|
||||||
|
@ -82,6 +82,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
|||||||
# pylint: disable=unused-argument,no-self-use
|
# pylint: disable=unused-argument,no-self-use
|
||||||
return {} # pragma: no cover
|
return {} # pragma: no cover
|
||||||
|
|
||||||
|
def _final_checks(self) -> bool:
|
||||||
|
""" This method is invoked after the file has been cleaned,
|
||||||
|
allowing to run final verifications.
|
||||||
|
"""
|
||||||
|
# pylint: disable=unused-argument,no-self-use
|
||||||
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
|
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
|
||||||
@ -223,6 +230,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
|||||||
if abort:
|
if abort:
|
||||||
os.remove(self.output_filename)
|
os.remove(self.output_filename)
|
||||||
return False
|
return False
|
||||||
|
if not self._final_checks():
|
||||||
|
return False # pragma: no cover
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import random
|
||||||
import uuid
|
import uuid
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser):
|
|||||||
def __init__(self, filename):
|
def __init__(self, filename):
|
||||||
super().__init__(filename)
|
super().__init__(filename)
|
||||||
|
|
||||||
|
# MSOffice documents are using various counters for cross-references,
|
||||||
|
# we collect them all, to make sure that they're effectively counters,
|
||||||
|
# and not unique id used for fingerprinting.
|
||||||
|
self.__counters = {
|
||||||
|
'cNvPr': set(),
|
||||||
|
'rid': set(),
|
||||||
|
}
|
||||||
|
|
||||||
self.files_to_keep = set(map(re.compile, { # type: ignore
|
self.files_to_keep = set(map(re.compile, { # type: ignore
|
||||||
r'^\[Content_Types\]\.xml$',
|
r'^\[Content_Types\]\.xml$',
|
||||||
r'^_rels/\.rels$',
|
r'^_rels/\.rels$',
|
||||||
@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser):
|
|||||||
r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
|
r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
|
||||||
r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
|
r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
|
||||||
r'^(?:word|ppt)/tableStyles\.xml$',
|
r'^(?:word|ppt)/tableStyles\.xml$',
|
||||||
|
r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
|
||||||
|
r'^ppt/slides/slide[0-9]*\.xml$',
|
||||||
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
|
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
|
||||||
r'^(?:word|ppt)/stylesWithEffects\.xml$',
|
r'^(?:word|ppt)/stylesWithEffects\.xml$',
|
||||||
|
r'^ppt/presentation\.xml$',
|
||||||
|
# TODO: check if p:bgRef can be randomized
|
||||||
|
r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
|
||||||
|
r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
|
||||||
}))
|
}))
|
||||||
self.files_to_omit = set(map(re.compile, { # type: ignore
|
self.files_to_omit = set(map(re.compile, { # type: ignore
|
||||||
r'^customXml/',
|
r'^customXml/',
|
||||||
@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
r'^(?:word|ppt)/theme',
|
r'^(?:word|ppt)/theme',
|
||||||
r'^(?:word|ppt)/people\.xml$',
|
r'^(?:word|ppt)/people\.xml$',
|
||||||
r'^(?:word|ppt)/numbering\.xml$',
|
r'^(?:word|ppt)/numbering\.xml$',
|
||||||
|
r'^(?:word|ppt)/tags/',
|
||||||
# View properties like view mode, last viewed slide etc
|
# View properties like view mode, last viewed slide etc
|
||||||
r'^(?:word|ppt)/viewProps\.xml$',
|
r'^(?:word|ppt)/viewProps\.xml$',
|
||||||
# Additional presentation-wide properties like printing properties,
|
# Additional presentation-wide properties like printing properties,
|
||||||
@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
tree, namespace = _parse_xml(full_path)
|
tree, namespace = _parse_xml(full_path)
|
||||||
except ET.ParseError as e:
|
except ET.ParseError as e: # pragma: no cover
|
||||||
logging.error("Unable to parse %s: %s", full_path, e)
|
logging.error("Unable to parse %s: %s", full_path, e)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
def __remove_revisions(full_path: str) -> bool:
|
def __remove_revisions(full_path: str) -> bool:
|
||||||
try:
|
try:
|
||||||
tree, namespace = _parse_xml(full_path)
|
tree, namespace = _parse_xml(full_path)
|
||||||
except ET.ParseError as e:
|
except ET.ParseError as e: # pragma: no cover
|
||||||
logging.error("Unable to parse %s: %s", full_path, e)
|
logging.error("Unable to parse %s: %s", full_path, e)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser):
|
|||||||
tree.write(full_path, xml_declaration=True)
|
tree.write(full_path, xml_declaration=True)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _final_checks(self) -> bool:
|
||||||
|
for k, v in self.__counters.items():
|
||||||
|
if v and len(v) != max(v):
|
||||||
|
# TODO: make this an error and return False
|
||||||
|
# once the ability to correct the counters is implemented
|
||||||
|
logging.warning("%s contains invalid %s: %s", self.filename, k, v)
|
||||||
|
return True
|
||||||
|
return True
|
||||||
|
|
||||||
|
def __collect_counters(self, full_path: str):
|
||||||
|
with open(full_path, encoding='utf-8') as f:
|
||||||
|
content = f.read()
|
||||||
|
# "relationship Id"
|
||||||
|
for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content):
|
||||||
|
self.__counters['rid'].add(int(i))
|
||||||
|
# "connector for Non-visual property"
|
||||||
|
for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
|
||||||
|
self.__counters['cNvPr'].add(int(i))
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def __randomize_creationId(full_path: str) -> bool:
|
||||||
|
try:
|
||||||
|
tree, namespace = _parse_xml(full_path)
|
||||||
|
except ET.ParseError as e: # pragma: no cover
|
||||||
|
logging.error("Unable to parse %s: %s", full_path, e)
|
||||||
|
return False
|
||||||
|
|
||||||
|
if 'p14' not in namespace.keys():
|
||||||
|
return True # pragma: no cover
|
||||||
|
|
||||||
|
for item in tree.iterfind('.//p14:creationId', namespace):
|
||||||
|
item.set('val', '%s' % random.randint(0, 2**32))
|
||||||
|
tree.write(full_path, xml_declaration=True)
|
||||||
|
return True
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def __randomize_sldMasterId(full_path: str) -> bool:
|
||||||
|
try:
|
||||||
|
tree, namespace = _parse_xml(full_path)
|
||||||
|
except ET.ParseError as e: # pragma: no cover
|
||||||
|
logging.error("Unable to parse %s: %s", full_path, e)
|
||||||
|
return False
|
||||||
|
|
||||||
|
if 'p' not in namespace.keys():
|
||||||
|
return True # pragma: no cover
|
||||||
|
|
||||||
|
for item in tree.iterfind('.//p:sldMasterId', namespace):
|
||||||
|
item.set('id', '%s' % random.randint(0, 2**32))
|
||||||
|
tree.write(full_path, xml_declaration=True)
|
||||||
|
return True
|
||||||
|
|
||||||
def _specific_cleanup(self, full_path: str) -> bool:
|
def _specific_cleanup(self, full_path: str) -> bool:
|
||||||
# pylint: disable=too-many-return-statements
|
# pylint: disable=too-many-return-statements,too-many-branches
|
||||||
if os.stat(full_path).st_size == 0: # Don't process empty files
|
if os.stat(full_path).st_size == 0: # Don't process empty files
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if not full_path.endswith('.xml'):
|
if not full_path.endswith('.xml'):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
if self.__randomize_creationId(full_path) is False:
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.__collect_counters(full_path)
|
||||||
|
|
||||||
if full_path.endswith('/[Content_Types].xml'):
|
if full_path.endswith('/[Content_Types].xml'):
|
||||||
# this file contains references to files that we might
|
# this file contains references to files that we might
|
||||||
# remove, and MS Office doesn't like dangling references
|
# remove, and MS Office doesn't like dangling references
|
||||||
@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser):
|
|||||||
elif full_path.endswith('/word/document.xml'):
|
elif full_path.endswith('/word/document.xml'):
|
||||||
# this file contains the revisions
|
# this file contains the revisions
|
||||||
if self.__remove_revisions(full_path) is False:
|
if self.__remove_revisions(full_path) is False:
|
||||||
return False
|
return False # pragma: no cover
|
||||||
elif full_path.endswith('/docProps/app.xml'):
|
elif full_path.endswith('/docProps/app.xml'):
|
||||||
# This file must be present and valid,
|
# This file must be present and valid,
|
||||||
# so we're removing as much as we can.
|
# so we're removing as much as we can.
|
||||||
@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser):
|
|||||||
f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
|
f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
|
||||||
uid = str(uuid.uuid4()).encode('utf-8')
|
uid = str(uuid.uuid4()).encode('utf-8')
|
||||||
f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
|
f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
|
||||||
|
elif full_path.endswith('ppt/presentation.xml'):
|
||||||
|
if self.__randomize_sldMasterId(full_path) is False:
|
||||||
|
return False # pragma: no cover
|
||||||
|
|
||||||
if self.__remove_rsid(full_path) is False:
|
if self.__remove_rsid(full_path) is False:
|
||||||
return False
|
return False # pragma: no cover
|
||||||
|
|
||||||
if self.__remove_nsid(full_path) is False:
|
if self.__remove_nsid(full_path) is False:
|
||||||
return False # pragma: no cover
|
return False # pragma: no cover
|
||||||
|
BIN
tests/data/narrated_powerpoint_presentation.pptx
Normal file
BIN
tests/data/narrated_powerpoint_presentation.pptx
Normal file
Binary file not shown.
@ -777,3 +777,13 @@ class TestNoSandbox(unittest.TestCase):
|
|||||||
os.remove('./tests/data/clean.png')
|
os.remove('./tests/data/clean.png')
|
||||||
os.remove('./tests/data/clean.cleaned.png')
|
os.remove('./tests/data/clean.cleaned.png')
|
||||||
os.remove('./tests/data/clean.cleaned.cleaned.png')
|
os.remove('./tests/data/clean.cleaned.cleaned.png')
|
||||||
|
|
||||||
|
class TestComplexOfficeFiles(unittest.TestCase):
|
||||||
|
def test_complex_pptx(self):
|
||||||
|
target = './tests/data/clean.pptx'
|
||||||
|
shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target)
|
||||||
|
p = office.MSOfficeParser(target)
|
||||||
|
self.assertTrue(p.remove_all())
|
||||||
|
|
||||||
|
os.remove(target)
|
||||||
|
os.remove(p.output_filename)
|
||||||
|
Loading…
Reference in New Issue
Block a user