1
0
Fork 0
mat2/mat2

232 lines
8.4 KiB
Plaintext
Raw Permalink Normal View History

#!/usr/bin/env python3
2018-04-02 19:12:10 +02:00
import os
import shutil
2023-01-28 17:22:26 +01:00
from typing import List, Set, Dict
import sys
2018-04-01 17:13:34 +02:00
import mimetypes
2018-03-06 23:20:18 +01:00
import argparse
2018-09-01 14:14:32 +02:00
import logging
import unicodedata
2019-06-05 22:28:34 +02:00
import concurrent.futures
2018-03-06 23:20:18 +01:00
try:
2018-09-06 11:13:11 +02:00
from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS
from libmat2 import check_dependencies, UnknownMemberPolicy
2021-11-23 17:34:14 +01:00
except ValueError as ex:
print(ex)
sys.exit(1)
2018-03-06 23:20:18 +01:00
2023-08-02 21:09:12 +02:00
__version__ = '0.13.4'
2018-04-04 23:18:32 +02:00
2018-10-28 15:41:04 +01:00
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
2022-03-29 22:13:55 +02:00
def __print_without_chars(s: str):
    """ Print `s` after dropping every character whose Unicode general
    category starts with 'C'.

    Filtering on the whole 'C' family instead of only 'Cc' is deliberately
    conservative.
    https://www.unicode.org/reports/tr44/#GC_Values_Table
    """
    kept = []
    for char in s:
        if unicodedata.category(char).startswith('C'):
            continue
        kept.append(char)
    print(''.join(kept))
2019-02-28 16:47:16 +01:00
def __check_file(filename: str, mode: int = os.R_OK) -> bool:
    """ Tell whether `filename` is an existing regular file that can be
    accessed with `mode`.

    When a check fails, a "[-] ..." message describing the first failure is
    printed (with control characters stripped) and False is returned.

    :param filename: path of the file to check
    :param mode: bitmask of os.R_OK / os.W_OK handed to os.access()
    :return: True if every check passed, False otherwise
    """
    if not os.path.exists(filename):
        __print_without_chars("[-] %s doesn't exist." % filename)
        return False
    if not os.path.isfile(filename):
        __print_without_chars("[-] %s is not a regular file." % filename)
        return False
    if not os.access(filename, mode):
        # Collect whole words with append(): `list += str` would extend the
        # list with the individual characters and garble the message.
        mode_str: List[str] = []
        if mode & os.R_OK:
            mode_str.append('readable')
        if mode & os.W_OK:
            mode_str.append('writeable')
        __print_without_chars("[-] %s is not %s." % (filename, ' nor '.join(mode_str)))
        return False
    return True
2018-03-06 23:20:18 +01:00
def create_arg_parser() -> argparse.ArgumentParser:
    """ Build the command-line parser.

    Two mutually exclusive groups are declared: the first one holds the
    positional `files` together with --version, --list and
    --check-dependencies; the second one holds --lightweight and --show.
    """
    known_policies = ', '.join(p.value for p in UnknownMemberPolicy)

    parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')

    # Options that can be combined freely with the others.
    parser.add_argument(
        '-V', '--verbose', action='store_true',
        help='show more verbose status information')
    parser.add_argument(
        '--unknown-members', metavar='policy', default='abort',
        help='how to handle unknown members of archive-style files '
             '(policy should be one of: %s) [Default: abort]' % known_policies)
    parser.add_argument(
        '--inplace', action='store_true',
        help='clean in place, without backup')
    parser.add_argument(
        '--no-sandbox', dest='sandbox', action='store_false', default=True,
        help="Disable bubblewrap's sandboxing")

    # Either some files are given, or one informational action is requested.
    target_group = parser.add_mutually_exclusive_group()
    target_group.add_argument(
        'files', nargs='*', default=[],
        help='the files to process')
    target_group.add_argument(
        '-v', '--version', action='version',
        version='mat2 %s' % __version__)
    target_group.add_argument(
        '-l', '--list', action='store_true', default=False,
        help='list all supported fileformats')
    target_group.add_argument(
        '--check-dependencies', action='store_true', default=False,
        help='check if mat2 has all the dependencies it needs')

    # What to do with the files: lightweight cleaning or showing only.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        '-L', '--lightweight', action='store_true',
        help='remove SOME metadata')
    mode_group.add_argument(
        '-s', '--show', action='store_true',
        help='list harmful metadata detectable by mat2 without removing them')

    return parser
def show_meta(filename: str, sandbox: bool):
    """ Print the metadata of `filename`.

    Nothing but a "[-] ..." message is printed when the file fails the
    readability check, when getting a parser raises ValueError, or when no
    parser is available for the file's format.

    :param filename: path of the file to inspect
    :param sandbox: value assigned to the parser's `sandbox` attribute
    """
    if not __check_file(filename):
        return

    try:
        file_parser, mtype = parser_factory.get_parser(filename)  # type: ignore
    except ValueError as err:
        message = "[-] something went wrong when processing %s: %s" % (filename, err)
        __print_without_chars(message)
        return

    if file_parser is not None:
        file_parser.sandbox = sandbox
        __print_meta(filename, file_parser.get_meta())
    else:
        __print_without_chars("[-] %s's format (%s) is not supported" % (filename, mtype))
2023-01-28 17:22:26 +01:00
def __print_meta(filename: str, metadata: Dict, depth: int = 1):
    """ Print `metadata` as an indented listing, recursing into nested dicts.

    :param filename: name shown in the header (a key of the parent mapping
                     when called recursively)
    :param metadata: mapping of metadata names to values or nested mappings
    :param depth: nesting level; drives the indentation and the number of
                  '+' signs in the header
    """
    indent = " " * (depth * 2)

    if not metadata:
        __print_without_chars(indent + "No metadata found in %s." % filename)
        return

    header = "[%s] Metadata for %s:" % ('+' * depth, filename)
    __print_without_chars(header)

    for key, value in sorted(metadata.items()):
        if isinstance(value, dict):
            # Nested mapping: print it one level deeper, named after its key.
            __print_meta(key, value, depth + 1)
        else:
            try:  # FIXME this is ugly.
                __print_without_chars(indent + " %s: %s" % (key, value))
            except UnicodeEncodeError:
                __print_without_chars(indent + " %s: harmful content" % key)
            except TypeError:
                pass  # for things that aren't iterable
2018-04-02 19:12:10 +02:00
def clean_meta(filename: str, is_lightweight: bool, inplace: bool, sandbox: bool,
               policy: UnknownMemberPolicy) -> bool:
    """ Clean the metadata of `filename`.

    :param filename: path of the file to clean
    :param is_lightweight: value assigned to the parser's
                           `lightweight_cleaning` attribute
    :param inplace: when True, the file must also be writeable and the
                    parser's output file is renamed over `filename`
    :param sandbox: value assigned to the parser's `sandbox` attribute
    :param policy: value assigned to the parser's `unknown_member_policy`
    :return: the result of the parser's `remove_all()`, or False when the
             file can't be accessed, has no parser, or cleaning raised
             RuntimeError
    """
    required_mode = os.R_OK
    if inplace:
        required_mode |= os.W_OK
    if not __check_file(filename, required_mode):
        return False

    try:
        file_parser, mtype = parser_factory.get_parser(filename)  # type: ignore
    except ValueError as err:
        __print_without_chars("[-] something went wrong when cleaning %s: %s" % (filename, err))
        return False

    if file_parser is None:
        __print_without_chars("[-] %s's format (%s) is not supported" % (filename, mtype))
        return False

    file_parser.unknown_member_policy = policy
    file_parser.lightweight_cleaning = is_lightweight
    file_parser.sandbox = sandbox

    try:
        logging.debug('Cleaning %s…', filename)
        outcome = file_parser.remove_all()
        if outcome is not True:
            return outcome
        # Give the cleaned copy the same permission bits as the original.
        shutil.copymode(filename, file_parser.output_filename)
        if inplace is True:
            os.rename(file_parser.output_filename, filename)
        return outcome
    except RuntimeError as err:
        __print_without_chars("[-] %s can't be cleaned: %s" % (filename, err))
    return False
2018-03-06 23:20:18 +01:00
def show_parsers():
    """ Print every supported mimetype with its known file extensions.

    Mimetypes whose extensions are all listed in UNSUPPORTED_EXTENSIONS (or
    that have no known extension at all) are left out. Both the lines and
    the extensions inside each line are sorted, so the output is stable from
    one run to the next.
    """
    print('[+] Supported formats:')
    formats: Set[str] = set()
    for parser in parser_factory._get_parsers():  # type: ignore
        for mtype in parser.mimetypes:
            # Deduplicate through a set, then sort: iterating a set of
            # strings directly yields an order that changes between runs.
            extensions = sorted({
                extension
                for extension in mimetypes.guess_all_extensions(mtype)
                if extension not in UNSUPPORTED_EXTENSIONS
            })
            if not extensions:
                # we're not supporting a single extension in the current
                # mimetype, so there is no point in showing the mimetype at all
                continue
            formats.add(' - %s (%s)' % (mtype, ', '.join(extensions)))
    print('\n'.join(sorted(formats)))
def __get_files_recursively(files: List[str]) -> List[str]:
    """ Expand `files` into a duplicate-free list of usable file paths.

    Directories are walked recursively; every candidate, whether given
    directly or found while walking, is kept only if it passes the
    readability check. The order of the returned list is unspecified.
    """
    found: Set[str] = set()
    for entry in files:
        if not os.path.isdir(entry):
            if __check_file(entry):
                found.add(entry)
            continue
        for dirpath, _, names in os.walk(entry):
            candidates = (os.path.join(dirpath, name) for name in names)
            found.update(path for path in candidates if __check_file(path))
    return list(found)
2018-10-12 11:58:01 +02:00
def main() -> int:
    """ Entry point of the command-line interface.

    Without files, this lists the supported formats (--list), reports the
    dependencies (--check-dependencies) or prints the help. With files, it
    either shows their metadata (--show) or cleans them in parallel.

    :return: 0 on success, -1 if cleaning at least one file failed
    """
    arg_parser = create_arg_parser()
    args = arg_parser.parse_args()

    if args.verbose:
        # NOTE(review): this only changes the level of the `__name__` logger;
        # clean_meta() logs through the module-level logging.debug(), i.e.
        # the root logger, which basicConfig() left at WARNING. Confirm
        # whether that is intended.
        logging.getLogger(__name__).setLevel(logging.DEBUG)

    if not args.files:
        if args.list:
            show_parsers()
            return 0
        elif args.check_dependencies:
            __print_without_chars("Dependencies for mat2 %s:" % __version__)
            for key, value in sorted(check_dependencies().items()):
                __print_without_chars('- %s: %s %s' % (key, 'yes' if value['found'] else 'no',
                                                       '(optional)' if not value['required'] else ''))
        else:
            arg_parser.print_help()
        return 0

    elif args.show:
        for f in __get_files_recursively(args.files):
            show_meta(f, args.sandbox)
        return 0

    else:
        inplace = args.inplace
        try:
            policy = UnknownMemberPolicy(args.unknown_members)
        except ValueError:
            # An unknown policy name is a usage error: report it the argparse
            # way (usage + message, exit status 2) instead of a traceback.
            arg_parser.error("invalid policy %r for --unknown-members (should be one of: %s)" %
                             (args.unknown_members,
                              ', '.join(p.value for p in UnknownMemberPolicy)))
        if policy == UnknownMemberPolicy.KEEP:
            logging.warning('Keeping unknown member files may leak metadata in the resulting file!')

        no_failure = True
        files = __get_files_recursively(args.files)
        # We have to use Processes instead of Threads, since
        # we're using tempfile.mkdtemp, which isn't thread-safe.
        futures = list()
        with concurrent.futures.ProcessPoolExecutor() as executor:
            for f in files:
                future = executor.submit(clean_meta, f, args.lightweight,
                                         inplace, args.sandbox, policy)
                futures.append(future)
            for future in concurrent.futures.as_completed(futures):
                no_failure &= future.result()
        return 0 if no_failure is True else -1
2018-09-06 11:13:11 +02:00
2018-03-06 23:20:18 +01:00
if __name__ == '__main__':
sys.exit(main())