diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4ad5c98..38cbe20 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -16,7 +16,7 @@ linting:bandit: script: # TODO: remove B405 and B314 - bandit ./mat2 --format txt --skip B101 - bandit -r ./nautilus/ --format txt --skip B101 - - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108 + - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108,B311 linting:codespell: image: $CONTAINER_REGISTRY:linting diff --git a/README.md b/README.md index 5f902fe..c81daff 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,8 @@ Copyright 2016 Marie-Rose for mat2's logo The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3, and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx +The `narrated_powerpoint_presentation.pptx` file is in the public domain. + # Thanks mat2 wouldn't exist without: diff --git a/libmat2/archive.py b/libmat2/archive.py index de80a35..f6db491 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py @@ -82,6 +82,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): # pylint: disable=unused-argument,no-self-use return {} # pragma: no cover + def _final_checks(self) -> bool: + """ This method is invoked after the file has been cleaned, + allowing to run final verifications. + """ + # pylint: disable=unused-argument,no-self-use + return True + @staticmethod @abc.abstractmethod def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: @@ -223,6 +230,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): if abort: os.remove(self.output_filename) return False + if not self._final_checks(): + return False # pragma: no cover return True diff --git a/libmat2/office.py b/libmat2/office.py index 7f70b72..d122fc8 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -1,3 +1,4 @@ +import random import uuid import logging import os @@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser): def __init__(self, filename): super().__init__(filename) + # MSOffice documents are using various counters for cross-references, + # we collect them all, to make sure that they're effectively counters, + # and not unique id used for fingerprinting. + self.__counters = { + 'cNvPr': set(), + 'rid': set(), + } + self.files_to_keep = set(map(re.compile, { # type: ignore r'^\[Content_Types\]\.xml$', r'^_rels/\.rels$', @@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser): r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$', r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$', r'^(?:word|ppt)/tableStyles\.xml$', + r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$', + r'^ppt/slides/slide[0-9]*\.xml$', # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx r'^(?:word|ppt)/stylesWithEffects\.xml$', + r'^ppt/presentation\.xml$', + # TODO: check if p:bgRef can be randomized + r'^ppt/slideMasters/slideMaster[0-9]+\.xml', + r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels', })) self.files_to_omit = set(map(re.compile, { # type: ignore r'^customXml/', @@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser): r'^(?:word|ppt)/theme', r'^(?:word|ppt)/people\.xml$', r'^(?:word|ppt)/numbering\.xml$', + r'^(?:word|ppt)/tags/', # View properties like view mode, last viewed slide etc r'^(?:word|ppt)/viewProps\.xml$', # Additional presentation-wide properties like printing properties, @@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser): """ try: tree, namespace = _parse_xml(full_path) - except ET.ParseError as e: + except ET.ParseError as e: # pragma: no cover logging.error("Unable to parse %s: %s", full_path, e) return False @@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser): def __remove_revisions(full_path: str) -> bool: try: tree, namespace = _parse_xml(full_path) - except ET.ParseError as e: + except ET.ParseError as e: # pragma: no cover logging.error("Unable to parse %s: %s", full_path, e) return False @@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser): tree.write(full_path, xml_declaration=True) return True + def _final_checks(self) -> bool: + for k, v in self.__counters.items(): + if v and len(v) != max(v): + # TODO: make this an error and return False + # once the ability to correct the counters is implemented + logging.warning("%s contains invalid %s: %s", self.filename, k, v) + return True + return True + + def __collect_counters(self, full_path: str): + with open(full_path, encoding='utf-8') as f: + content = f.read() + # "relationship Id" + for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content): + self.__counters['rid'].add(int(i)) + # "connector for Non-visual property" + for i in re.findall(r' bool: + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError as e: # pragma: no cover + logging.error("Unable to parse %s: %s", full_path, e) + return False + + if 'p14' not in namespace.keys(): + return True # pragma: no cover + + for item in tree.iterfind('.//p14:creationId', namespace): + item.set('val', '%s' % random.randint(0, 2**32)) + tree.write(full_path, xml_declaration=True) + return True + + @staticmethod + def __randomize_sldMasterId(full_path: str) -> bool: + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError as e: # pragma: no cover + logging.error("Unable to parse %s: %s", full_path, e) + return False + + if 'p' not in namespace.keys(): + return True # pragma: no cover + + for item in tree.iterfind('.//p:sldMasterId', namespace): + item.set('id', '%s' % random.randint(0, 2**32)) + tree.write(full_path, xml_declaration=True) + return True + def _specific_cleanup(self, full_path: str) -> bool: - # pylint: disable=too-many-return-statements + # pylint: disable=too-many-return-statements,too-many-branches if os.stat(full_path).st_size == 0: # Don't process empty files return True if not full_path.endswith('.xml'): return True + if self.__randomize_creationId(full_path) is False: + return False + + self.__collect_counters(full_path) + if full_path.endswith('/[Content_Types].xml'): # this file contains references to files that we might # remove, and MS Office doesn't like dangling references @@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser): elif full_path.endswith('/word/document.xml'): # this file contains the revisions if self.__remove_revisions(full_path) is False: - return False + return False # pragma: no cover elif full_path.endswith('/docProps/app.xml'): # This file must be present and valid, # so we're removing as much as we can. @@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser): f.write(b'') uid = str(uuid.uuid4()).encode('utf-8') f.write(b'' % uid) + elif full_path.endswith('ppt/presentation.xml'): + if self.__randomize_sldMasterId(full_path) is False: + return False # pragma: no cover if self.__remove_rsid(full_path) is False: - return False + return False # pragma: no cover if self.__remove_nsid(full_path) is False: return False # pragma: no cover diff --git a/tests/data/narrated_powerpoint_presentation.pptx b/tests/data/narrated_powerpoint_presentation.pptx new file mode 100644 index 0000000..ef04132 Binary files /dev/null and b/tests/data/narrated_powerpoint_presentation.pptx differ diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 9e208ec..a6c3a9a 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -777,3 +777,13 @@ class TestNoSandbox(unittest.TestCase): os.remove('./tests/data/clean.png') os.remove('./tests/data/clean.cleaned.png') os.remove('./tests/data/clean.cleaned.cleaned.png') + +class TestComplexOfficeFiles(unittest.TestCase): + def test_complex_pptx(self): + target = './tests/data/clean.pptx' + shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target) + p = office.MSOfficeParser(target) + self.assertTrue(p.remove_all()) + + os.remove(target) + os.remove(p.output_filename)