diff --git a/main.py b/main.py index ab07641..a31adaa 100755 --- a/main.py +++ b/main.py @@ -12,7 +12,7 @@ from src import parser_factory, unsupported_extensions __version__ = '0.1.0' -def __check_file(filename:str, mode:int = os.R_OK) -> bool: +def __check_file(filename: str, mode: int = os.R_OK) -> bool: if not os.path.isfile(filename): print("[-] %s is not a regular file." % filename) return False @@ -26,9 +26,9 @@ def create_arg_parser(): parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2') parser.add_argument('files', nargs='*') parser.add_argument('-v', '--version', action='version', - version='MAT2 %s' % __version__) + version='MAT2 %s' % __version__) parser.add_argument('-l', '--list', action='store_true', - help='list all supported fileformats') + help='list all supported fileformats') info = parser.add_mutually_exclusive_group() info.add_argument('-c', '--check', action='store_true', @@ -40,7 +40,7 @@ def create_arg_parser(): return parser -def show_meta(filename:str): +def show_meta(filename: str): if not __check_file(filename): return @@ -48,18 +48,18 @@ def show_meta(filename:str): if p is None: print("[-] %s's format (%s) is not supported" % (filename, mtype)) return + print("[+] Metadata for %s:" % filename) - for k,v in p.get_meta().items(): + for k, v in p.get_meta().items(): try: # FIXME this is ugly. print(" %s: %s" % (k, v)) except UnicodeEncodeError: print(" %s: harmful content" % k) - -def clean_meta(params:Tuple[str, bool]) -> bool: +def clean_meta(params: Tuple[str, bool]) -> bool: filename, is_lightweigth = params if not __check_file(filename, os.R_OK|os.W_OK): - return + return False p, mtype = parser_factory.get_parser(filename) if p is None: @@ -102,12 +102,12 @@ def main(): if not args.list: return arg_parser.print_help() show_parsers() - return + return 0 elif args.show: for f in __get_files_recursively(args.files): show_meta(f) - return + return 0 else: p = multiprocessing.Pool() diff --git a/src/__init__.py b/src/__init__.py index 3f5c478..07d3036 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -2,4 +2,5 @@ # A set of extension that aren't supported, despite matching a supported mimetype unsupported_extensions = set(['bat', 'c', 'h', 'ksh', 'pl', 'txt', 'asc', - 'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl', 'xpdl', 'xsl', 'xsd']) + 'text', 'pot', 'brf', 'srt', 'rdf', 'wsdl', + 'xpdl', 'xsl', 'xsd']) diff --git a/src/audio.py b/src/audio.py index 4a385b2..3a6aa79 100644 --- a/src/audio.py +++ b/src/audio.py @@ -9,7 +9,7 @@ class MutagenParser(abstract.AbstractParser): def get_meta(self): f = mutagen.File(self.filename) if f.tags: - return {k:', '.join(v) for k,v in f.tags.items()} + return {k:', '.join(v) for k, v in f.tags.items()} return {} def remove_all(self): diff --git a/src/harmless.py b/src/harmless.py index fbc2897..aa00582 100644 --- a/src/harmless.py +++ b/src/harmless.py @@ -6,6 +6,7 @@ class HarmlessParser(abstract.AbstractParser): mimetypes = {'application/xml', 'text/plain'} def __init__(self, filename: str): + super().__init__(filename) self.filename = filename self.output_filename = filename diff --git a/src/images.py b/src/images.py index 6cc3dfe..c84952a 100644 --- a/src/images.py +++ b/src/images.py @@ -14,11 +14,12 @@ from . import abstract class PNGParser(abstract.AbstractParser): mimetypes = {'image/png', } meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', - 'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate', - "FileInodeChangeDate", 'FilePermissions', 'FileType', - 'FileTypeExtension', 'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType', - 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', - 'Megapixels', 'ImageHeight'} + 'Directory', 'FileSize', 'FileModifyDate', + 'FileAccessDate', 'FileInodeChangeDate', + 'FilePermissions', 'FileType', 'FileTypeExtension', + 'MIMEType', 'ImageWidth', 'BitDepth', 'ColorType', + 'Compression', 'Filter', 'Interlace', 'BackgroundColor', + 'ImageSize', 'Megapixels', 'ImageHeight'} def __init__(self, filename): super().__init__(filename) @@ -63,36 +64,38 @@ class GdkPixbufAbstractParser(abstract.AbstractParser): class JPGParser(GdkPixbufAbstractParser): mimetypes = {'image/jpeg'} meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', - 'Directory', 'FileSize', 'FileModifyDate', 'FileAccessDate', - "FileInodeChangeDate", 'FilePermissions', 'FileType', - 'FileTypeExtension', 'MIMEType', 'ImageWidth', - 'ImageSize', 'BitsPerSample', 'ColorComponents', 'EncodingProcess', - 'JFIFVersion', 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling', - 'YResolution', 'Megapixels', 'ImageHeight'} + 'Directory', 'FileSize', 'FileModifyDate', + 'FileAccessDate', "FileInodeChangeDate", + 'FilePermissions', 'FileType', 'FileTypeExtension', + 'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample', + 'ColorComponents', 'EncodingProcess', 'JFIFVersion', + 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling', + 'YResolution', 'Megapixels', 'ImageHeight'} class TiffParser(GdkPixbufAbstractParser): mimetypes = {'image/tiff'} meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples', - 'FillOrder', 'PhotometricInterpretation', 'PlanarConfiguration', - 'RowsPerStrip', 'SamplesPerPixel', 'StripByteCounts', - 'StripOffsets', 'BitsPerSample', 'Directory', 'ExifToolVersion', - 'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate', - 'FileName', 'FilePermissions', 'FileSize', 'FileType', - 'FileTypeExtension', 'ImageHeight', 'ImageSize', 'ImageWidth', - 'MIMEType', 'Megapixels', 'SourceFile'} + 'FillOrder', 'PhotometricInterpretation', + 'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel', + 'StripByteCounts', 'StripOffsets', 'BitsPerSample', + 'Directory', 'ExifToolVersion', 'FileAccessDate', + 'FileInodeChangeDate', 'FileModifyDate', 'FileName', + 'FilePermissions', 'FileSize', 'FileType', + 'FileTypeExtension', 'ImageHeight', 'ImageSize', + 'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'} class BMPParser(GdkPixbufAbstractParser): mimetypes = {'image/x-ms-bmp'} meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory', - 'FileSize', 'FileModifyDate', 'FileAccessDate', - 'FileInodeChangeDate', 'FilePermissions', 'FileType', - 'FileTypeExtension', 'MIMEType', 'BMPVersion', 'ImageWidth', - 'ImageHeight', 'Planes', 'BitDepth', 'Compression', 'ImageLength', - 'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors', - 'NumImportantColors', 'RedMask', 'GreenMask', 'BlueMask', - 'AlphaMask', 'ColorSpace', 'RedEndpoint', 'GreenEndpoint', - 'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue', 'ImageSize', - 'Megapixels'} - + 'FileSize', 'FileModifyDate', 'FileAccessDate', + 'FileInodeChangeDate', 'FilePermissions', 'FileType', + 'FileTypeExtension', 'MIMEType', 'BMPVersion', + 'ImageWidth', 'ImageHeight', 'Planes', 'BitDepth', + 'Compression', 'ImageLength', 'PixelsPerMeterX', + 'PixelsPerMeterY', 'NumColors', 'NumImportantColors', + 'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask', + 'ColorSpace', 'RedEndpoint', 'GreenEndpoint', + 'BlueEndpoint', 'GammaRed', 'GammaGreen', 'GammaBlue', + 'ImageSize', 'Megapixels'} diff --git a/src/office.py b/src/office.py index da6168e..749fc7d 100644 --- a/src/office.py +++ b/src/office.py @@ -9,14 +9,14 @@ from . import abstract, parser_factory class ArchiveBasedAbstractParser(abstract.AbstractParser): - def _clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo: + def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: zipinfo.compress_type = zipfile.ZIP_DEFLATED zipinfo.create_system = 3 # Linux zipinfo.comment = b'' zipinfo.date_time = (1980, 1, 1, 0, 0, 0) return zipinfo - def _get_zipinfo_meta(self, zipinfo:zipfile.ZipInfo) -> dict: + def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict: metadata = {} if zipinfo.create_system == 3: #metadata['create_system'] = 'Linux' @@ -35,7 +35,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): return metadata - def _clean_internal_file(self, item:zipfile.ZipInfo, temp_folder:str, zin:zipfile.ZipFile, zout:zipfile.ZipFile): + def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, + zin: zipfile.ZipFile, zout: zipfile.ZipFile): zin.extract(member=item, path=temp_folder) tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) if not tmp_parser: @@ -50,9 +51,9 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): class MSOfficeParser(ArchiveBasedAbstractParser): mimetypes = { - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation' + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation' } files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} @@ -103,13 +104,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser): class LibreOfficeParser(ArchiveBasedAbstractParser): mimetypes = { - 'application/vnd.oasis.opendocument.text', - 'application/vnd.oasis.opendocument.spreadsheet', - 'application/vnd.oasis.opendocument.presentation', - 'application/vnd.oasis.opendocument.graphics', - 'application/vnd.oasis.opendocument.chart', - 'application/vnd.oasis.opendocument.formula', - 'application/vnd.oasis.opendocument.image', + 'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation', + 'application/vnd.oasis.opendocument.graphics', + 'application/vnd.oasis.opendocument.chart', + 'application/vnd.oasis.opendocument.formula', + 'application/vnd.oasis.opendocument.image', } def get_meta(self): diff --git a/src/parser_factory.py b/src/parser_factory.py index 2c30659..48616b0 100644 --- a/src/parser_factory.py +++ b/src/parser_factory.py @@ -2,10 +2,10 @@ import os import mimetypes import importlib import pkgutil +from typing import TypeVar from . import abstract, unsupported_extensions -from typing import TypeVar T = TypeVar('T', bound='abstract.AbstractParser') diff --git a/src/pdf.py b/src/pdf.py index fbc5175..5b99192 100644 --- a/src/pdf.py +++ b/src/pdf.py @@ -21,8 +21,8 @@ logging.basicConfig(level=logging.DEBUG) class PDFParser(abstract.AbstractParser): mimetypes = {'application/pdf', } meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', - 'metadata', 'mod-date', 'producer', 'subject', 'title', - 'viewer-preferences'} + 'metadata', 'mod-date', 'producer', 'subject', 'title', + 'viewer-preferences'} def __init__(self, filename): super().__init__(filename) @@ -103,7 +103,8 @@ class PDFParser(abstract.AbstractParser): return True - def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool: + @staticmethod + def __remove_superficial_meta(in_file: str, out_file: str) -> bool: document = Poppler.Document.new_from_file('file://' + in_file) document.set_producer('') document.set_creator('') @@ -112,7 +113,8 @@ class PDFParser(abstract.AbstractParser): return True - def __parse_metadata_field(self, data:str) -> dict: + @staticmethod + def __parse_metadata_field(data: str) -> dict: metadata = {} for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)", data, re.I): metadata[key] = value @@ -128,6 +130,6 @@ class PDFParser(abstract.AbstractParser): if document.get_property(key): metadata[key] = document.get_property(key) if 'metadata' in metadata: - parsed_meta = self.__parse_metadata_field(metadata['metadata']) + parsed_meta = self.__parse_metadata_field(metadata['metadata']) return {**metadata, **parsed_meta} return metadata diff --git a/src/torrent.py b/src/torrent.py index bdf83ce..cb4b5e3 100644 --- a/src/torrent.py +++ b/src/torrent.py @@ -11,7 +11,7 @@ class TorrentParser(abstract.AbstractParser): d = _BencodeHandler().bdecode(f.read()) if d is None: return {'Unknown meta': 'Unable to parse torrent file "%s".' % self.filename} - for k,v in d.items(): + for k, v in d.items(): if k not in self.whitelist: metadata[k.decode('utf-8')] = v return metadata @@ -23,7 +23,7 @@ class TorrentParser(abstract.AbstractParser): d = _BencodeHandler().bdecode(f.read()) if d is None: return False - for k,v in d.items(): + for k, v in d.items(): if k in self.whitelist: cleaned[k] = v with open(self.output_filename, 'wb') as f: @@ -39,21 +39,22 @@ class _BencodeHandler(object): """ def __init__(self): self.__decode_func = { - ord('d'): self.__decode_dict, - ord('i'): self.__decode_int, - ord('l'): self.__decode_list, - } + ord('d'): self.__decode_dict, + ord('i'): self.__decode_int, + ord('l'): self.__decode_list, + } for i in range(0, 10): self.__decode_func[ord(str(i))] = self.__decode_string self.__encode_func = { - bytes: self.__encode_string, - dict: self.__encode_dict, - int: self.__encode_int, - list: self.__encode_list, + bytes: self.__encode_string, + dict: self.__encode_dict, + int: self.__encode_int, + list: self.__encode_list, } - def __decode_int(self, s:str) -> (int, str): + @staticmethod + def __decode_int(s: str) -> (int, str): s = s[1:] next_idx = s.index(b'e') if s.startswith(b'-0'): @@ -62,7 +63,8 @@ class _BencodeHandler(object): raise ValueError # no leading zero except for zero itself return int(s[:next_idx]), s[next_idx+1:] - def __decode_string(self, s:str) -> (str, str): + @staticmethod + def __decode_string(s: str) -> (str, str): sep = s.index(b':') str_len = int(s[:sep]) if str_len < 0: @@ -72,7 +74,7 @@ class _BencodeHandler(object): s = s[1:] return s[sep:sep+str_len], s[sep+str_len:] - def __decode_list(self, s:str) -> (list, str): + def __decode_list(self, s: str) -> (list, str): r = list() s = s[1:] # skip leading `l` while s[0] != ord('e'): @@ -80,7 +82,7 @@ class _BencodeHandler(object): r.append(v) return r, s[1:] - def __decode_dict(self, s:str) -> (dict, str): + def __decode_dict(self, s: str) -> (dict, str): r = dict() s = s[1:] # skip leading `d` while s[0] != ord(b'e'): @@ -89,30 +91,30 @@ class _BencodeHandler(object): return r, s[1:] @staticmethod - def __encode_int(x:str) -> bytes: + def __encode_int(x: str) -> bytes: return b'i' + bytes(str(x), 'utf-8') + b'e' @staticmethod - def __encode_string(x:str) -> bytes: + def __encode_string(x: str) -> bytes: return bytes((str(len(x))), 'utf-8') + b':' + x - def __encode_list(self, x:str) -> bytes: + def __encode_list(self, x: str) -> bytes: ret = b'' for i in x: ret += self.__encode_func[type(i)](i) return b'l' + ret + b'e' - def __encode_dict(self, x:str) -> bytes: + def __encode_dict(self, x: str) -> bytes: ret = b'' for k, v in sorted(x.items()): ret += self.__encode_func[type(k)](k) ret += self.__encode_func[type(v)](v) return b'd' + ret + b'e' - def bencode(self, s:str) -> bytes: + def bencode(self, s: str) -> bytes: return self.__encode_func[type(s)](s) - def bdecode(self, s:str): + def bdecode(self, s: str): try: r, l = self.__decode_func[s[0]](s) except (IndexError, KeyError, ValueError) as e: