diff --git a/main.py b/main.py index 65158e6..80355ae 100755 --- a/main.py +++ b/main.py @@ -16,6 +16,7 @@ def __check_file(filename:str, mode:int = os.R_OK) -> bool: return False return True + def create_arg_parser(): parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2') parser.add_argument('files', nargs='*') @@ -29,6 +30,7 @@ def create_arg_parser(): help='list all the harmful metadata of a file without removing them') return parser + def show_meta(filename:str): if not __check_file(filename): return @@ -44,6 +46,7 @@ def show_meta(filename:str): except UnicodeEncodeError: print(" %s: harmful content" % k) + def clean_meta(filename:str): if not __check_file(filename, os.R_OK|os.W_OK): return @@ -54,6 +57,7 @@ def clean_meta(filename:str): return p.remove_all() + def show_parsers(): print('[+] Supported formats:') for parser in parser_factory._get_parsers(): @@ -61,6 +65,7 @@ def show_parsers(): extensions = ', '.join(mimetypes.guess_all_extensions(mtype)) print(' - %s (%s)' % (mtype, extensions)) + def __get_files_recursively(files): for f in files: if os.path.isfile(f): diff --git a/src/abstract.py b/src/abstract.py index 4626789..04c1535 100644 --- a/src/abstract.py +++ b/src/abstract.py @@ -1,5 +1,6 @@ import abc + class AbstractParser(abc.ABC): meta_list = set() mimetypes = set() diff --git a/src/audio.py b/src/audio.py index a56828f..4a385b2 100644 --- a/src/audio.py +++ b/src/audio.py @@ -4,6 +4,7 @@ import mutagen from . import abstract + class MutagenParser(abstract.AbstractParser): def get_meta(self): f = mutagen.File(self.filename) @@ -18,6 +19,7 @@ class MutagenParser(abstract.AbstractParser): f.save() return True + class MP3Parser(MutagenParser): mimetypes = {'audio/mpeg', } @@ -28,8 +30,10 @@ class MP3Parser(MutagenParser): metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text)) return metadata + class OGGParser(MutagenParser): mimetypes = {'audio/ogg', } + class FLACParser(MutagenParser): mimetypes = {'audio/flac', } diff --git a/src/harmless.py b/src/harmless.py index 235dabe..9e7c1b4 100644 --- a/src/harmless.py +++ b/src/harmless.py @@ -1,5 +1,6 @@ from . import abstract + class HarmlessParser(abstract.AbstractParser): """ This is the parser for filetypes that do not contain metadata. """ mimetypes = {'application/xml', 'text/plain', 'application/rdf+xml'} diff --git a/src/images.py b/src/images.py index afc0658..2c1fd2e 100644 --- a/src/images.py +++ b/src/images.py @@ -10,6 +10,7 @@ from gi.repository import GdkPixbuf from . import abstract + class PNGParser(abstract.AbstractParser): mimetypes = {'image/png', } meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', @@ -31,6 +32,7 @@ class PNGParser(abstract.AbstractParser): surface.write_to_png(self.output_filename) return True + class GdkPixbufAbstractParser(abstract.AbstractParser): """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it, this has the side-effect of removing metadata completely. diff --git a/src/office.py b/src/office.py index 9729e19..11692c3 100644 --- a/src/office.py +++ b/src/office.py @@ -7,6 +7,7 @@ import zipfile from . import abstract, parser_factory + class ArchiveBasedAbstractParser(abstract.AbstractParser): def _clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo: zipinfo.compress_type = zipfile.ZIP_DEFLATED @@ -46,6 +47,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): with open(tmp_parser.output_filename, 'rb') as f: zout.writestr(clean_zinfo, f.read()) + class MSOfficeParser(ArchiveBasedAbstractParser): mimetypes = { 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', diff --git a/src/parser_factory.py b/src/parser_factory.py index ecec789..68e9e9c 100644 --- a/src/parser_factory.py +++ b/src/parser_factory.py @@ -16,6 +16,7 @@ for module_loader, name, ispkg in pkgutil.walk_packages('.src'): continue importlib.import_module(name) + def _get_parsers() -> list: """ Get all our parsers!""" def __get_parsers(cls): @@ -23,6 +24,7 @@ def _get_parsers() -> list: [g for s in cls.__subclasses__() for g in __get_parsers(s)] return __get_parsers(abstract.AbstractParser) + def get_parser(filename: str) -> (T, str): mtype, _ = mimetypes.guess_type(filename) diff --git a/tests/test_climat2.py b/tests/test_climat2.py index f395001..b9c52b5 100644 --- a/tests/test_climat2.py +++ b/tests/test_climat2.py @@ -13,6 +13,7 @@ class TestHelp(unittest.TestCase): stdout, _ = proc.communicate() self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout) + class TestGetMeta(unittest.TestCase): def test_pdf(self): proc = subprocess.Popen(['./main.py', '--show', './tests/data/dirty.pdf'], diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index c2864c6..4cfb80a 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -8,6 +8,7 @@ import tempfile from src import pdf, images, audio, office, parser_factory + class TestParserFactory(unittest.TestCase): def test_subsubcalss(self): """ Test that our module auto-detection is handling sub-sub-classes """ @@ -15,6 +16,7 @@ class TestParserFactory(unittest.TestCase): self.assertEqual(mimetype, 'audio/mpeg') self.assertEqual(parser.__class__, audio.MP3Parser) + class TestGetMeta(unittest.TestCase): def test_pdf(self): p = pdf.PDFParser('./tests/data/dirty.pdf') @@ -132,6 +134,7 @@ class TestDeepCleaning(unittest.TestCase): os.remove('./tests/data/clean.odt') + class TestCleaning(unittest.TestCase): def test_pdf(self): shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')