2018-09-11 15:54:53 +02:00
|
|
|
#!/usr/bin/env python3
|
2018-04-04 23:15:00 +02:00
|
|
|
|
2018-04-02 19:12:10 +02:00
|
|
|
import os
|
2018-10-12 11:58:01 +02:00
|
|
|
from typing import Tuple, Generator, List, Union
|
2018-04-29 22:55:26 +02:00
|
|
|
import sys
|
2018-04-01 17:13:34 +02:00
|
|
|
import mimetypes
|
2018-03-06 23:20:18 +01:00
|
|
|
import argparse
|
2018-09-01 05:14:32 -07:00
|
|
|
import logging
|
2018-03-06 23:20:18 +01:00
|
|
|
|
2018-06-24 22:40:57 +02:00
|
|
|
try:
|
2018-09-06 11:13:11 +02:00
|
|
|
from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS
|
|
|
|
from libmat2 import check_dependencies, UnknownMemberPolicy
|
2018-06-24 22:40:57 +02:00
|
|
|
except ValueError as e:
|
|
|
|
print(e)
|
|
|
|
sys.exit(1)
|
2018-03-06 23:20:18 +01:00
|
|
|
|
2018-10-23 17:07:42 +02:00
|
|
|
__version__ = '0.5.0'
|
2018-04-04 23:18:32 +02:00
|
|
|
|
2018-10-05 17:00:59 +02:00
|
|
|
# Make pyflakes happy
|
|
|
|
assert Tuple
|
2018-10-12 11:58:01 +02:00
|
|
|
assert Union
|
2018-10-05 17:00:59 +02:00
|
|
|
|
|
|
|
|
2018-07-23 23:42:56 +02:00
|
|
|
def __check_file(filename: str, mode: int=os.R_OK) -> bool:
    """Check that `filename` is an existing regular file accessible with `mode`.

    A single diagnostic line is printed to stdout for the first check that
    fails.

    :param filename: path of the file to check.
    :param mode: `os.access` bitmask (`os.R_OK`, `os.W_OK`, `os.X_OK`) the
        file has to satisfy; defaults to read access only.
    :return: True if every check passes, False otherwise.
    """
    if not os.path.exists(filename):
        print("[-] %s doesn't exist." % filename)
        return False
    if not os.path.isfile(filename):
        print("[-] %s is not a regular file." % filename)
        return False
    if not os.access(filename, mode):
        # Name only the permissions that were actually requested, instead of
        # always claiming that both read and write access are missing.
        requested = [label for bit, label in ((os.R_OK, 'readable'),
                                              (os.W_OK, 'writeable'),
                                              (os.X_OK, 'executable'))
                     if mode & bit]
        print("[-] %s is not %s." % (filename,
                                     ' and '.join(requested) or 'accessible'))
        return False
    return True
|
2018-03-06 23:20:18 +01:00
|
|
|
|
2018-04-04 23:21:48 +02:00
|
|
|
|
2018-10-05 17:00:59 +02:00
|
|
|
def create_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line interface of MAT2.

    :return: a parser accepting the positional `files`, the informational
        switches (`--version`, `--list`, `--check-dependencies`,
        `--verbose`), the `--unknown-members` policy and the mutually
        exclusive `--show` / `--lightweight` modes.
    """
    policy_names = ', '.join(member.value for member in UnknownMemberPolicy)
    unknown_members_help = (
        'how to handle unknown members of archive-style files (policy should'
        ' be one of: %s)' % policy_names
    )

    cli = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')
    cli.add_argument('files', nargs='*', help='the files to process')
    cli.add_argument('-v', '--version', action='version',
                     version='MAT2 %s' % __version__)

    # Plain boolean switches, registered in the order they show up in --help
    boolean_switches = (
        (('-l', '--list'), 'list all supported fileformats'),
        (('--check-dependencies',),
         'check if MAT2 has all the dependencies it needs'),
        (('-V', '--verbose'), 'show more verbose status information'),
    )
    for flags, description in boolean_switches:
        cli.add_argument(*flags, action='store_true', help=description)

    cli.add_argument('--unknown-members', metavar='policy', default='abort',
                     help=unknown_members_help)

    # Showing metadata and lightweight cleaning cannot be combined
    exclusive_modes = cli.add_mutually_exclusive_group()
    exclusive_modes.add_argument(
        '-s', '--show', action='store_true',
        help='list harmful metadata detectable by MAT2 without removing them')
    exclusive_modes.add_argument(
        '-L', '--lightweight', action='store_true',
        help='remove SOME metadata')

    return cli
|
|
|
|
|
2018-04-04 23:21:48 +02:00
|
|
|
|
2018-05-16 22:36:59 +02:00
|
|
|
def show_meta(filename: str):
    """Print the metadata of `filename` to stdout.

    Nothing is shown beyond a diagnostic line when the file fails
    `__check_file` or when no parser is available for its format.

    :param filename: path of the file to inspect.
    """
    if not __check_file(filename):
        return

    handler, mimetype = parser_factory.get_parser(filename)  # type: ignore
    if handler is not None:
        __print_meta(filename, handler.get_meta())
        return

    print("[-] %s's format (%s) is not supported" % (filename, mimetype))
|
2018-10-11 08:28:02 -07:00
|
|
|
|
|
|
|
|
2018-10-11 19:55:07 +02:00
|
|
|
def _strip_control_chars(value):
    """Return `value` without Unicode control characters if it is a `str`.

    Every character whose general category starts with 'C' is dropped, which
    removes terminal escape sequences and embedded newlines. Values that are
    not `str` are returned untouched.
    """
    import unicodedata  # local import keeps this block self-contained

    if not isinstance(value, str):
        return value
    return ''.join(char for char in value
                   if not unicodedata.category(char).startswith('C'))


def __print_meta(filename: str, metadata: dict, depth: int=1):
    """Pretty-print `metadata`, recursing into nested dictionaries.

    Names, keys and string values come from the inspected file and are thus
    untrusted: control characters are removed before printing so that a
    crafted file can neither inject terminal escape sequences nor forge
    additional output lines.

    :param filename: name shown in the header; for nested dictionaries this
        is the key under which the dictionary was stored.
    :param metadata: mapping of metadata names to values; a value that is
        itself a dict is printed as a nested section.
    :param depth: nesting level, which controls the indentation and the
        number of '+' in the header.
    """
    padding = " " * depth*2
    filename = _strip_control_chars(filename)
    if not metadata:
        # Name the file: when several files are processed, a bare
        # "No metadata found" cannot be attributed to any of them.
        print(padding + "No metadata found in %s." % filename)
        return

    print("[%s] Metadata for %s:" % ('+'*depth, filename))

    for (k, v) in sorted(metadata.items()):
        if isinstance(v, dict):
            __print_meta(k, v, depth+1)
            continue

        k = _strip_control_chars(k)
        v = _strip_control_chars(v)
        try:  # FIXME this is ugly.
            print(padding + " %s: %s" % (k, v))
        except UnicodeEncodeError:
            # stdout's encoding cannot represent the value
            print(padding + " %s: harmful content" % k)
|
2018-10-11 08:28:02 -07:00
|
|
|
|
2018-04-02 19:12:10 +02:00
|
|
|
|
2018-10-05 17:00:59 +02:00
|
|
|
def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) -> bool:
    """Remove the metadata of `filename`.

    :param filename: path of the file to clean; it has to be both readable
        and writeable.
    :param is_lightweight: stored on the parser as `lightweight_cleaning`.
    :param policy: stored on the parser as `unknown_member_policy`.
    :return: the result of the parser's `remove_all()`, or False when the
        file fails `__check_file`, its format is unsupported, or cleaning
        raises `RuntimeError`.
    """
    if not __check_file(filename, os.R_OK|os.W_OK):
        return False

    handler, mimetype = parser_factory.get_parser(filename)  # type: ignore
    if handler is None:
        print("[-] %s's format (%s) is not supported" % (filename, mimetype))
        return False

    handler.unknown_member_policy = policy
    handler.lightweight_cleaning = is_lightweight

    try:
        outcome = handler.remove_all()
    except RuntimeError as error:
        print("[-] %s can't be cleaned: %s" % (filename, error))
        outcome = False
    return outcome
|
|
|
|
|
2018-03-06 23:20:18 +01:00
|
|
|
|
2018-04-04 23:21:48 +02:00
|
|
|
|
2018-10-12 11:58:01 +02:00
|
|
|
def show_parsers() -> bool:
    """Print every supported mimetype along with its file extensions.

    Mimetypes are listed in sorted order, and so are the extensions of each
    mimetype, so that the output is identical from one run to the next.

    :return: always True.
    """
    print('[+] Supported formats:')
    formats = set()  # Set[str]
    for parser in parser_factory._get_parsers():  # type: ignore
        for mtype in parser.mimetypes:
            # Sorting matters: joining a set directly would order the
            # extensions according to the per-process string hash seed.
            extensions = sorted(
                extension
                for extension in set(mimetypes.guess_all_extensions(mtype))
                if extension not in UNSUPPORTED_EXTENSIONS
            )
            if not extensions:
                # we're not supporting a single extension in the current
                # mimetype, so there is no point in showing the mimetype at all
                continue
            formats.add(' - %s (%s)' % (mtype, ', '.join(extensions)))
    print('\n'.join(sorted(formats)))
    return True
|
2018-04-04 23:15:00 +02:00
|
|
|
|
2018-04-04 23:21:48 +02:00
|
|
|
|
2018-10-05 17:00:59 +02:00
|
|
|
def __get_files_recursively(files: List[str]) -> Generator[str, None, None]:
    """Yield every usable file designated by `files`.

    Directories are walked recursively, other entries are considered as
    they are; only paths accepted by `__check_file` are yielded.

    :param files: paths of files and/or directories.
    """
    for entry in files:
        if not os.path.isdir(entry):
            if __check_file(entry):
                yield entry
            continue

        for root, _, names in os.walk(entry):
            for name in names:
                candidate = os.path.join(root, name)
                if __check_file(candidate):
                    yield candidate
|
2018-04-04 23:15:00 +02:00
|
|
|
|
2018-10-12 11:58:01 +02:00
|
|
|
def main() -> int:
    """Run the MAT2 command-line interface.

    Without any file, this lists the supported formats (`--list`), reports
    the state of the dependencies (`--check-dependencies`) or prints the
    help. With files, their metadata are either shown (`--show`) or removed.

    :return: the process exit status: 0 on success, -1 if at least one file
        could not be cleaned.
    """
    arg_parser = create_arg_parser()
    args = arg_parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    if not args.files:
        if args.list:
            # show_parsers() returns True: handing that to sys.exit() would
            # make a successful listing exit with status 1.
            show_parsers()
        elif args.check_dependencies:
            print("Dependencies required for MAT2 %s:" % __version__)
            for key, value in sorted(check_dependencies().items()):
                print('- %s: %s' % (key, 'yes' if value else 'no'))
        else:
            arg_parser.print_help()
        return 0

    if args.show:
        for f in __get_files_recursively(args.files):
            show_meta(f)
        return 0

    try:
        policy = UnknownMemberPolicy(args.unknown_members)
    except ValueError:
        # NOTE(review): UnknownMemberPolicy is presumably an Enum, whose
        # constructor raises ValueError on an unknown value; report it as a
        # usage error (exit status 2) rather than a traceback.
        arg_parser.error('invalid unknown members policy: %r'
                         % args.unknown_members)
    if policy == UnknownMemberPolicy.KEEP:
        logging.warning('Keeping unknown member files may leak metadata in the resulting file!')

    no_failure = True
    for f in __get_files_recursively(args.files):
        # Keep going after a failure so that every file gets processed
        if clean_meta(f, args.lightweight, policy) is False:
            no_failure = False
    return 0 if no_failure is True else -1
|
2018-09-06 11:13:11 +02:00
|
|
|
|
2018-03-06 23:20:18 +01:00
|
|
|
|
|
|
|
if __name__ == '__main__':
|
2018-04-29 22:55:26 +02:00
|
|
|
sys.exit(main())
|