mat2/libmat2/office.py

import os
import re
import zipfile
from typing import Dict, Set, Pattern

import xml.etree.ElementTree as ET  # type: ignore

from .archive import ArchiveBasedAbstractParser

# Make pyflakes happy: `Set` and `Pattern` are imported from `typing` above
# but are not otherwise referenced as plain names in the code visible here,
# so they are touched once to avoid "imported but unused" warnings.
assert Set
assert Pattern

def _parse_xml(full_path: str):
    """ Parse an XML file, with namespace support.

    The file is read twice: a first pass collects every namespace
    declaration and registers it globally with ElementTree (so that the
    prefixes are kept when the tree is written back), then a second pass
    builds the actual tree.

    :param full_path: path of the XML file to parse.
    :return: a `(tree, namespace_map)` tuple, where `tree` is the parsed
        `ElementTree` and `namespace_map` maps each declared prefix to its
        URI, ready to be passed as the `namespaces` argument of
        `find`/`iterfind`.
    :raises xml.etree.ElementTree.ParseError: if the file isn't well-formed.
    """
    namespace_map = {}
    for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
        # Prefixes of the form `ns<digits>` are reserved by ElementTree for
        # its own generated prefixes: `register_namespace` raises a
        # ValueError on them. Rename them instead of crashing on documents
        # that happen to declare such a prefix.
        if re.match(r'ns\d+$', key):
            key = 'mat' + key[2:]

        namespace_map[key] = value
        ET.register_namespace(key, value)

    return ET.parse(full_path), namespace_map


class MSOfficeParser(ArchiveBasedAbstractParser):
    """ Parser for Office Open XML archives (docx, xlsx, pptx).

    On top of the generic archive handling inherited from
    `ArchiveBasedAbstractParser`, this class removes tracked revisions from
    `word/document.xml` and exposes the content of the `docProps/` files
    as metadata.
    """
    mimetypes = {
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    }
    # Exact archive member names that are whitelisted.
    # NOTE(review): presumably consumed by the base class; confirm there.
    files_to_keep = {
        '[Content_Types].xml',
        '_rels/.rels',
        'word/_rels/document.xml.rels',
        'word/document.xml',
        'word/fontTable.xml',
        'word/settings.xml',
        'word/styles.xml',
    }
    # Compiled patterns of archive member names that are blacklisted.
    # NOTE(review): presumably consumed by the base class; confirm there.
    files_to_omit = set(map(re.compile, {  # type: ignore
        '^docProps/',
    }))

    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        """ Remove the tracked revisions from a `word/document.xml` file.

        Deletions (`w:del`) are dropped together with their content, while
        insertions (`w:ins`) are unwrapped: their children take their place
        in the parent, and the `w:ins` element itself is removed.

        The elements to modify are always collected before the tree is
        changed, since we don't want to change the tree we're currently
        iterating on.

        :param full_path: path of the extracted XML file, rewritten in place
            when revisions are found.
        :return: `False` if the file can't be parsed, `True` otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError:
            return False

        # Without the `w` prefix, the lookups below would raise instead of
        # simply finding nothing.
        if 'w' not in namespace:
            return True

        # Revisions are either deletions (`w:del`) or
        # insertions (`w:ins`)
        del_presence = tree.find('.//w:del', namespace)
        ins_presence = tree.find('.//w:ins', namespace)
        if del_presence is None and ins_presence is None:
            return True  # No revisions are present

        # ElementTree has no parent pointers, so build the child -> parent
        # mapping once, before touching anything.
        parent_map = {c: p for p in tree.iter() for c in p}

        for element in list(tree.iterfind('.//w:del', namespace)):
            parent_map[element].remove(element)

        # Walk the insertions in reverse document order, so that nested
        # `w:ins` are unwrapped before their ancestors: this way, the parent
        # recorded in `parent_map` is still the actual parent of each
        # element when it is processed.
        insertions = list(tree.iterfind('.//w:ins', namespace))
        for element in reversed(insertions):
            parent = parent_map[element]
            index = list(parent).index(element)
            # Replace the `w:ins` wrapper with its own children, at the very
            # same position; a childless wrapper is simply removed.
            parent[index:index + 1] = list(element)

        tree.write(full_path, xml_declaration=True)

        return True

    def _specific_cleanup(self, full_path: str) -> bool:
        """ Apply the format-specific cleanup to a single extracted file.

        :param full_path: path of the extracted archive member.
        :return: `False` if the cleanup failed, `True` otherwise (including
            when the file doesn't need any specific cleanup).
        """
        if full_path.endswith('/word/document.xml'):
            # this file contains the revisions
            return self.__remove_revisions(full_path)
        return True

    def get_meta(self) -> Dict[str, str]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.

        For every `docProps/*.xml` member of the archive, each
        `<tag>value</tag>` pair found on a single line is reported as
        `tag -> value`; a member that can't be decoded as UTF-8 is reported
        as 'harmful content'. The result of `self._get_zipinfo_meta` for
        every member of the archive is merged in as well.

        :return: the collected metadata, as a flat dictionary.
        """
        metadata = {}
        # The context manager makes sure that the archive is closed, even
        # when reading one of its members raises.
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
                    try:
                        content = zipin.read(item).decode('utf-8')
                        results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
                        for (key, value) in results:
                            metadata[key] = value
                    except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
                        metadata[item.filename] = 'harmful content'
                metadata.update(self._get_zipinfo_meta(item))
        return metadata


class LibreOfficeParser(ArchiveBasedAbstractParser):
    """ Parser for OpenDocument archives (odt, ods, odp, …).

    On top of the generic archive handling inherited from
    `ArchiveBasedAbstractParser`, this class removes the tracked changes
    from `content.xml` and exposes the content of `meta.xml` as metadata.
    """
    mimetypes = {
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.graphics',
        'application/vnd.oasis.opendocument.chart',
        'application/vnd.oasis.opendocument.formula',
        'application/vnd.oasis.opendocument.image',
    }
    # Exact archive member names that are whitelisted.
    # NOTE(review): presumably consumed by the base class; confirm there.
    files_to_keep = {
        'META-INF/manifest.xml',
        'content.xml',
        'manifest.rdf',
        'mimetype',
        'settings.xml',
        'styles.xml',
    }
    # Compiled patterns of archive member names that are blacklisted.
    # NOTE(review): presumably consumed by the base class; confirm there.
    files_to_omit = set(map(re.compile, {  # type: ignore
        r'^meta\.xml$',
        '^Configurations2/',
        '^Thumbnails/',
    }))

    @staticmethod
    def __remove_revisions(full_path: str) -> bool:
        """ Remove the `text:tracked-changes` elements from a `content.xml`.

        Every `text:tracked-changes` element located anywhere below an
        `office:text` element is detached from its actual parent, then the
        file is rewritten in place.

        :param full_path: path of the extracted XML file.
        :return: `False` if the file can't be parsed, `True` otherwise.
        """
        try:
            tree, namespace = _parse_xml(full_path)
        except ET.ParseError:
            return False

        if 'office' not in namespace:  # no revisions in the current file
            return True

        # Without the `text` prefix, the lookup would raise instead of
        # simply finding nothing; the file is still rewritten below.
        if 'text' in namespace:
            # ElementTree has no parent pointers: the matches are found with
            # a descendant search, so they aren't necessarily direct
            # children of `office:text`.
            parent_map = {c: p for p in tree.iter() for c in p}

            # Collect everything before modifying the tree, since we don't
            # want to change it while iterating on it. A set is used, so
            # that an element reachable from several (nested) `office:text`
            # is only removed once.
            tracked = {
                changes
                for text in tree.getroot().iterfind('.//office:text', namespace)
                for changes in text.iterfind('.//text:tracked-changes', namespace)
            }
            for changes in tracked:
                parent_map[changes].remove(changes)

        tree.write(full_path, xml_declaration=True)

        return True

    def _specific_cleanup(self, full_path: str) -> bool:
        """ Apply the format-specific cleanup to a single extracted file.

        :param full_path: path of the extracted archive member.
        :return: `False` if the cleanup failed, `True` otherwise (including
            when the file doesn't need any specific cleanup).
        """
        if os.path.basename(full_path) == 'content.xml':
            return self.__remove_revisions(full_path)
        return True

    def get_meta(self) -> Dict[str, str]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
        be my guest and fix it if you want.

        For the `meta.xml` member of the archive, each `<tag>value</tag>`
        pair found on a single line, and whose tag starts with `meta`, `dc`
        or `cp`, is reported as `tag -> value`; if the file can't be decoded
        as UTF-8, it is reported as 'harmful content'. The result of
        `self._get_zipinfo_meta` for every member of the archive is merged
        in as well.

        :return: the collected metadata, as a flat dictionary.
        """
        metadata = {}
        # The context manager makes sure that the archive is closed, even
        # when reading one of its members raises.
        with zipfile.ZipFile(self.filename) as zipin:
            for item in zipin.infolist():
                if item.filename == 'meta.xml':
                    try:
                        content = zipin.read(item).decode('utf-8')
                        results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
                        for (key, value) in results:
                            metadata[key] = value
                    except (TypeError, UnicodeDecodeError):  # We didn't manage to parse the xml file
                        metadata[item.filename] = 'harmful content'
                metadata.update(self._get_zipinfo_meta(item))
        return metadata
Refactor office document handling 2018-04-01 01:04:06 +02:00			`import os`
Display docx metadata 2018-03-31 20:56:15 +02:00			`import re`
Refactor office document handling 2018-04-01 01:04:06 +02:00			`import zipfile`
Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`from typing import Dict, Set, Pattern`
Remove `print` from libmat, and use the `logging` module instead This should close #28 2018-07-10 21:30:38 +02:00
Remove defusedxml support and document why 2018-09-05 18:41:08 +02:00			`import xml.etree.ElementTree as ET # type: ignore`
MAT2 is now cleaning revisions from odt files! 2018-06-27 23:10:53 +02:00
Split office and archives 2018-09-06 11:32:45 +02:00			`from .archive import ArchiveBasedAbstractParser`
Add support for docx 2018-03-31 15:47:06 +02:00
Fix some linter warnings 2018-06-21 23:07:21 +02:00			`# Make pyflakes happy`
			`assert Set`
			`assert Pattern`
Add some white lines to make the code more compliant 2018-04-04 23:21:48 +02:00
Remove docx revisions 2018-07-01 23:11:10 +02:00			`def _parse_xml(full_path: str):`
Minor simplification in how we're handling xml for office files 2018-07-19 22:52:40 +02:00			`""" This function parse XML, with namespace support. """`
Remove docx revisions 2018-07-01 23:11:10 +02:00
Minor simplification in how we're handling xml for office files 2018-07-19 22:52:40 +02:00			`namespace_map = dict()`
			`for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):`
			`namespace_map[key] = value`
Make pylint even happier 2018-07-09 01:11:44 +02:00			`ET.register_namespace(key, value)`
Remove docx revisions 2018-07-01 23:11:10 +02:00
Make pylint even happier 2018-07-09 01:11:44 +02:00			`return ET.parse(full_path), namespace_map`
Remove docx revisions 2018-07-01 23:11:10 +02:00

Refactor office document handling 2018-04-01 01:04:06 +02:00			`class MSOfficeParser(ArchiveBasedAbstractParser):`
Add support for docx 2018-03-31 15:47:06 +02:00			`mimetypes = {`
Do a pylint pass 2018-05-16 22:36:59 +02:00			`'application/vnd.openxmlformats-officedocument.wordprocessingml.document',`
			`'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',`
			`'application/vnd.openxmlformats-officedocument.presentationml.presentation'`
Add support for docx 2018-03-31 15:47:06 +02:00			`}`
Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`files_to_keep = {`
Improve a bit the formatting of the code thanks to pyflakes3 2018-07-02 00:22:05 +02:00			`'[Content_Types].xml',`
			`'_rels/.rels',`
			`'word/_rels/document.xml.rels',`
			`'word/document.xml',`
			`'word/fontTable.xml',`
			`'word/settings.xml',`
			`'word/styles.xml',`
Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`}`
			`files_to_omit = set(map(re.compile, { # type: ignore`
Improve a bit the formatting of the code thanks to pyflakes3 2018-07-02 00:22:05 +02:00			`'^docProps/',`
Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`}))`
Add support for docx 2018-03-31 15:47:06 +02:00
Make pylint even happier 2018-07-09 01:11:44 +02:00			`@staticmethod`
			`def __remove_revisions(full_path: str) -> bool:`
Improve the code's documentation 2018-07-19 23:10:27 +02:00			`""" In this function, we're changing the XML document in several`
			`different times, since we don't want to change the tree we're currently`
			`iterating on.`
			`"""`
Bump coverage for office files and fix some related crashes 2018-07-08 21:35:45 +02:00			`try:`
Make pylint even happier 2018-07-09 01:11:44 +02:00			`tree, namespace = _parse_xml(full_path)`
Bump coverage for office files and fix some related crashes 2018-07-08 21:35:45 +02:00			`except ET.ParseError:`
			`return False`
Remove docx revisions 2018-07-01 23:11:10 +02:00
Improve the code's documentation 2018-07-19 23:10:27 +02:00			# Revisions are either deletions (`w:del`) or
			# insertions (`w:ins`)
Make pylint even happier 2018-07-09 01:11:44 +02:00			`del_presence = tree.find('.//w:del', namespace)`
			`ins_presence = tree.find('.//w:ins', namespace)`
Fix a mistake in office file revisions handling 2018-07-07 18:05:54 +02:00			`if del_presence is None and ins_presence is None:`
Improve the code's documentation 2018-07-19 23:10:27 +02:00			`return True # No revisions are present`
Remove docx revisions 2018-07-01 23:11:10 +02:00
Improve a bit the formatting of the code thanks to pyflakes3 2018-07-02 00:22:05 +02:00			`parent_map = {c:p for p in tree.iter() for c in p}`
Remove docx revisions 2018-07-01 23:11:10 +02:00
Improve the code's documentation 2018-07-19 23:10:27 +02:00			`elements = list()`
			`for element in tree.iterfind('.//w:del', namespace):`
			`elements.append(element)`
Remove docx revisions 2018-07-01 23:11:10 +02:00			`for element in elements:`
			`parent_map[element].remove(element)`

			`elements = list()`
Make pylint even happier 2018-07-09 01:11:44 +02:00			`for element in tree.iterfind('.//w:ins', namespace):`
Achieve 100% coverage! 2018-07-08 22:27:37 +02:00			`for position, item in enumerate(tree.iter()): #pragma: no cover`
Remove docx revisions 2018-07-01 23:11:10 +02:00			`if item == element:`
			`for children in element.iterfind('./*'):`
			`elements.append((element, position, children))`
			`break`
			`for (element, position, children) in elements:`
			`parent_map[element].insert(position, children)`
			`parent_map[element].remove(element)`

			`tree.write(full_path, xml_declaration=True)`

			`return True`

Improve a bit the formatting of the code thanks to pyflakes3 2018-07-02 00:22:05 +02:00			`def _specific_cleanup(self, full_path: str) -> bool:`
Remove docx revisions 2018-07-01 23:11:10 +02:00			`if full_path.endswith('/word/document.xml'):`
Improve the code's documentation 2018-07-19 23:10:27 +02:00			`# this file contains the revisions`
Remove docx revisions 2018-07-01 23:11:10 +02:00			`return self.__remove_revisions(full_path)`
			`return True`

Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`def get_meta(self) -> Dict[str, str]:`
Display docx metadata 2018-03-31 20:56:15 +02:00			`"""`
			`Yes, I know that parsing xml with regexp ain't pretty,`
			`be my guest and fix it if you want.`
			`"""`
Add support for docx 2018-03-31 15:47:06 +02:00			`metadata = {}`
			`zipin = zipfile.ZipFile(self.filename)`
Implement support in `get_meta` for deep meta in office-related files 2018-04-01 15:08:38 +02:00			`for item in zipin.infolist():`
			`if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):`
Make the parsing of office format's metadata more robust 2018-06-10 20:20:00 +02:00			`try:`
Bump coverage for office files and fix some related crashes 2018-07-08 21:35:45 +02:00			`content = zipin.read(item).decode('utf-8')`
Make the parsing of office format's metadata more robust 2018-06-10 20:20:00 +02:00			`results = re.findall(r"<(.+)>(.+)</\1>", content, re.I\|re.M)`
			`for (key, value) in results:`
			`metadata[key] = value`
Bump coverage for office files and fix some related crashes 2018-07-08 21:35:45 +02:00			`except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file`
			`metadata[item.filename] = 'harmful content'`
Add more typing and use mypy in the CI 2018-06-04 22:54:01 +02:00			`for key, value in self._get_zipinfo_meta(item).items():`
			`metadata[key] = value`
Add support for docx 2018-03-31 15:47:06 +02:00			`zipin.close()`
			`return metadata`

Clean deep metadata for zip files 2018-04-01 00:17:06 +02:00
Refactor office document handling 2018-04-01 01:04:06 +02:00			`class LibreOfficeParser(ArchiveBasedAbstractParser):`
			`mimetypes = {`
Do a pylint pass 2018-05-16 22:36:59 +02:00			`'application/vnd.oasis.opendocument.text',`
			`'application/vnd.oasis.opendocument.spreadsheet',`
			`'application/vnd.oasis.opendocument.presentation',`
			`'application/vnd.oasis.opendocument.graphics',`
			`'application/vnd.oasis.opendocument.chart',`
			`'application/vnd.oasis.opendocument.formula',`
			`'application/vnd.oasis.opendocument.image',`
Refactor office document handling 2018-04-01 01:04:06 +02:00			`}`
Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`files_to_keep = {`
Improve a bit the formatting of the code thanks to pyflakes3 2018-07-02 00:22:05 +02:00			`'META-INF/manifest.xml',`
			`'content.xml',`
			`'manifest.rdf',`
			`'mimetype',`
			`'settings.xml',`
			`'styles.xml',`
Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`}`
			`files_to_omit = set(map(re.compile, { # type: ignore`
Improve a bit the formatting of the code thanks to pyflakes3 2018-07-02 00:22:05 +02:00			`r'^meta\.xml$',`
			`'^Configurations2/',`
			`'^Thumbnails/',`
Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`}))`
Refactor office document handling 2018-04-01 01:04:06 +02:00
MAT2 is now cleaning revisions from odt files! 2018-06-27 23:10:53 +02:00
Make pylint even happier 2018-07-09 01:11:44 +02:00			`@staticmethod`
			`def __remove_revisions(full_path: str) -> bool:`
Bump coverage for office files and fix some related crashes 2018-07-08 21:35:45 +02:00			`try:`
Make pylint even happier 2018-07-09 01:11:44 +02:00			`tree, namespace = _parse_xml(full_path)`
Bump coverage for office files and fix some related crashes 2018-07-08 21:35:45 +02:00			`except ET.ParseError:`
			`return False`
Remove docx revisions 2018-07-01 23:11:10 +02:00
Make pylint even happier 2018-07-09 01:11:44 +02:00			`if 'office' not in namespace.keys(): # no revisions in the current file`
MAT2 is now cleaning revisions from odt files! 2018-06-27 23:10:53 +02:00			`return True`

Make pylint even happier 2018-07-09 01:11:44 +02:00			`for text in tree.getroot().iterfind('.//office:text', namespace):`
			`for changes in text.iterfind('.//text:tracked-changes', namespace):`
MAT2 is now cleaning revisions from odt files! 2018-06-27 23:10:53 +02:00			`text.remove(changes)`

Remove docx revisions 2018-07-01 23:11:10 +02:00			`tree.write(full_path, xml_declaration=True)`
MAT2 is now cleaning revisions from odt files! 2018-06-27 23:10:53 +02:00
			`return True`

Improve a bit the formatting of the code thanks to pyflakes3 2018-07-02 00:22:05 +02:00			`def _specific_cleanup(self, full_path: str) -> bool:`
MAT2 is now cleaning revisions from odt files! 2018-06-27 23:10:53 +02:00			`if os.path.basename(full_path) == 'content.xml':`
			`return self.__remove_revisions(full_path)`
			`return True`

Refactor how offices files are handled - xml files are no longer considered harmless - Factorization of the `remove_all` method for office files - Explicit whitelist are used - Blacklist are used to skip files completely - Non-blacklisted files are _still cleaned_ - Unsupported files are still triggering an error 2018-06-21 23:02:41 +02:00			`def get_meta(self) -> Dict[str, str]:`
Refactor office document handling 2018-04-01 01:04:06 +02:00			`"""`
			`Yes, I know that parsing xml with regexp ain't pretty,`
			`be my guest and fix it if you want.`
			`"""`
			`metadata = {}`
			`zipin = zipfile.ZipFile(self.filename)`
Implement support in `get_meta` for deep meta in office-related files 2018-04-01 15:08:38 +02:00			`for item in zipin.infolist():`
			`if item.filename == 'meta.xml':`
Make the parsing of office format's metadata more robust 2018-06-10 20:20:00 +02:00			`try:`
Bump coverage for office files and fix some related crashes 2018-07-08 21:35:45 +02:00			`content = zipin.read(item).decode('utf-8')`
Make the parsing of office format's metadata more robust 2018-06-10 20:20:00 +02:00			`results = re.findall(r"<((?:meta\|dc\|cp).+?)>(.+)</\1>", content, re.I\|re.M)`
			`for (key, value) in results:`
			`metadata[key] = value`
Bump coverage for office files and fix some related crashes 2018-07-08 21:35:45 +02:00			`except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file`
			`metadata[item.filename] = 'harmful content'`
Add more typing and use mypy in the CI 2018-06-04 22:54:01 +02:00			`for key, value in self._get_zipinfo_meta(item).items():`
			`metadata[key] = value`
Refactor office document handling 2018-04-01 01:04:06 +02:00			`zipin.close()`
			`return metadata`