diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b31d088..29e3553 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,7 +9,7 @@ bandit: script: # TODO: remove B405 and B314 - apt-get -qqy update - apt-get -qqy install --no-install-recommends python3-bandit - - bandit ./mat2 --format txt + - bandit ./mat2 --format txt --skip B101 - bandit -r ./nautilus/ --format txt --skip B101 - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314 diff --git a/libmat2/office.py b/libmat2/office.py index 54347ea..32e7b75 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -2,7 +2,7 @@ import logging import os import re import zipfile -from typing import Dict, Set, Pattern +from typing import Dict, Set, Pattern, Tuple import xml.etree.ElementTree as ET # type: ignore @@ -14,9 +14,8 @@ from .archive import ArchiveBasedAbstractParser assert Set assert Pattern -def _parse_xml(full_path: str): +def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]: """ This function parses XML, with namespace support. """ - namespace_map = dict() for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): # The ns[0-9]+ namespaces are reserved for internal usage, so @@ -183,20 +182,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser): parent_map = {c:p for p in tree.iter() for c in p} - elements = list() + elements_del = list() for element in tree.iterfind('.//w:del', namespace): - elements.append(element) - for element in elements: + elements_del.append(element) + for element in elements_del: parent_map[element].remove(element) - elements = list() + elements_ins = list() for element in tree.iterfind('.//w:ins', namespace): for position, item in enumerate(tree.iter()): # pragma: no cover if item == element: for children in element.iterfind('./*'): - elements.append((element, position, children)) + elements_ins.append((element, position, children)) break - for (element, position, children) in elements: + for (element, position, children) in elements_ins: parent_map[element].insert(position, children) parent_map[element].remove(element) diff --git a/mat2 b/mat2 index 6c23836..987e439 100755 --- a/mat2 +++ b/mat2 @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import os -from typing import Tuple +from typing import Tuple, Generator, List import sys import mimetypes import argparse @@ -16,6 +16,10 @@ except ValueError as e: __version__ = '0.4.0' +# Make pyflakes happy +assert Tuple + + def __check_file(filename: str, mode: int=os.R_OK) -> bool: if not os.path.exists(filename): print("[-] %s is doesn't exist." % filename) @@ -29,7 +33,7 @@ def __check_file(filename: str, mode: int=os.R_OK) -> bool: return True -def create_arg_parser(): +def create_arg_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2') parser.add_argument('files', nargs='*', help='the files to process') parser.add_argument('-v', '--version', action='version', @@ -63,19 +67,18 @@ def show_meta(filename: str): return print("[+] Metadata for %s:" % filename) - meta = p.get_meta().items() - if not meta: + metadata = p.get_meta().items() + if not metadata: print(" No metadata found") return - for k, v in meta: + for k, v in metadata: try: # FIXME this is ugly. print(" %s: %s" % (k, v)) except UnicodeEncodeError: print(" %s: harmful content" % k) -def clean_meta(params: Tuple[str, bool, UnknownMemberPolicy]) -> bool: - filename, is_lightweight, unknown_member_policy = params +def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) -> bool: if not __check_file(filename, os.R_OK|os.W_OK): return False @@ -83,7 +86,7 @@ def clean_meta(params: Tuple[str, bool, UnknownMemberPolicy]) -> bool: if p is None: print("[-] %s's format (%s) is not supported" % (filename, mtype)) return False - p.unknown_member_policy = unknown_member_policy + p.unknown_member_policy = policy if is_lightweight: return p.remove_all_lightweight() return p.remove_all() @@ -91,7 +94,7 @@ def clean_meta(params: Tuple[str, bool, UnknownMemberPolicy]) -> bool: def show_parsers(): print('[+] Supported formats:') - formats = list() + formats = set() for parser in parser_factory._get_parsers(): for mtype in parser.mimetypes: extensions = set() @@ -102,11 +105,11 @@ def show_parsers(): # we're not supporting a single extension in the current # mimetype, so there is not point in showing the mimetype at all continue - formats.append(' - %s (%s)' % (mtype, ', '.join(extensions))) + formats.add(' - %s (%s)' % (mtype, ', '.join(extensions))) print('\n'.join(sorted(formats))) -def __get_files_recursively(files): +def __get_files_recursively(files: List[str]) -> Generator[str, None, None]: for f in files: if os.path.isdir(f): for path, _, _files in os.walk(f): @@ -141,13 +144,13 @@ def main(): return 0 else: - unknown_member_policy = UnknownMemberPolicy(args.unknown_members) - if unknown_member_policy == UnknownMemberPolicy.KEEP: + policy = UnknownMemberPolicy(args.unknown_members) + if policy == UnknownMemberPolicy.KEEP: logging.warning('Keeping unknown member files may leak metadata in the resulting file!') no_failure = True for f in __get_files_recursively(args.files): - if clean_meta([f, args.lightweight, unknown_member_policy]) is False: + if clean_meta(f, args.lightweight, policy) is False: no_failure = False return 0 if no_failure is True else -1