diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4f0a140..32ec086 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -42,7 +42,7 @@ tests:debian: stage: test script: - apt-get -qqy update - - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage + - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg - python3-coverage run --branch -m unittest discover -s tests/ - python3-coverage report --fail-under=100 -m --include 'libmat2/*' @@ -62,5 +62,5 @@ tests:archlinux: tags: - whitewhale script: - - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap + - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg - python3 setup.py test diff --git a/libmat2/__init__.py b/libmat2/__init__.py index f55a14c..399a364 100644 --- a/libmat2/__init__.py +++ b/libmat2/__init__.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 -import os import collections import enum import importlib from typing import Dict, Optional +from . 
import exiftool, video + # make pyflakes happy assert Dict assert Optional @@ -37,24 +38,13 @@ DEPENDENCIES = { 'mutagen': 'Mutagen', } -def _get_exiftool_path() -> str: # pragma: no cover - exiftool_path = '/usr/bin/exiftool' - if os.path.isfile(exiftool_path): - if os.access(exiftool_path, os.X_OK): - return exiftool_path - # ArchLinux - exiftool_path = '/usr/bin/vendor_perl/exiftool' - if os.path.isfile(exiftool_path): - if os.access(exiftool_path, os.X_OK): - return exiftool_path - - raise ValueError def check_dependencies() -> dict: ret = collections.defaultdict(bool) # type: Dict[str, bool] - ret['Exiftool'] = True if _get_exiftool_path() else False + ret['Exiftool'] = True if exiftool._get_exiftool_path() else False + ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False for key, value in DEPENDENCIES.items(): ret[value] = True diff --git a/libmat2/abstract.py b/libmat2/abstract.py index 0084796..414a68b 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py @@ -7,7 +7,8 @@ assert Set # make pyflakes happy class AbstractParser(abc.ABC): """ This is the base class of every parser. - It might yield `ValueError` on instantiation on invalid files. + It might yield `ValueError` on instantiation on invalid files, + and `RuntimeError` when something went wrong in `remove_all`. """ meta_list = set() # type: Set[str] mimetypes = set() # type: Set[str] @@ -27,4 +28,7 @@ class AbstractParser(abc.ABC): @abc.abstractmethod def remove_all(self) -> bool: + """ + :raises RuntimeError: Raised if the cleaning process went wrong. + """ pass # pragma: no cover diff --git a/libmat2/exiftool.py b/libmat2/exiftool.py new file mode 100644 index 0000000..e17d31b --- /dev/null +++ b/libmat2/exiftool.py @@ -0,0 +1,61 @@ +import json +import os +import re +import shutil +import subprocess +import tempfile + +from typing import Dict, Union, Set + +from . 
import abstract + +# Make pyflakes happy +assert Set + + +class ExiftoolParser(abstract.AbstractParser): + """ Exiftool is often the easiest way to get all the metadata + from a file, hence why several parsers are re-using its `get_meta` + method. + """ + meta_whitelist = set() # type: Set[str] + + @staticmethod + def __handle_problematic_filename(filename: str, callback) -> bytes: + """ This method takes a filename with a problematic name, + and safely applies a `callback` to it.""" + tmpdirname = tempfile.mkdtemp() + fname = os.path.join(tmpdirname, "temp_file") + shutil.copy(filename, fname) + out = callback(fname) + shutil.rmtree(tmpdirname) + return out + + def get_meta(self) -> Dict[str, Union[str, dict]]: + """ There is no way to escape the leading dash(es) of the current + self.filename to prevent parameter injections, so we need to take care + of this. + """ + fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f]) + if re.search('^[a-z0-9/]', self.filename) is None: + out = self.__handle_problematic_filename(self.filename, fun) + else: + out = fun(self.filename) + meta = json.loads(out.decode('utf-8'))[0] + for key in self.meta_whitelist: + meta.pop(key, None) + return meta + +def _get_exiftool_path() -> str: # pragma: no cover + exiftool_path = '/usr/bin/exiftool' + if os.path.isfile(exiftool_path): + if os.access(exiftool_path, os.X_OK): + return exiftool_path + + # ArchLinux + exiftool_path = '/usr/bin/vendor_perl/exiftool' + if os.path.isfile(exiftool_path): + if os.access(exiftool_path, os.X_OK): + return exiftool_path + + raise RuntimeError("Unable to find exiftool") diff --git a/libmat2/images.py b/libmat2/images.py index a29cbb7..ad80892 100644 --- a/libmat2/images.py +++ b/libmat2/images.py @@ -1,11 +1,6 @@ -import subprocess import imghdr -import json import os -import shutil -import tempfile -import re -from typing import Set, Dict, Union +from typing import Set import cairo @@ -13,44 +8,12 @@ import gi 
gi.require_version('GdkPixbuf', '2.0') from gi.repository import GdkPixbuf -from . import abstract, _get_exiftool_path +from . import exiftool # Make pyflakes happy assert Set -class _ImageParser(abstract.AbstractParser): - """ Since we use `exiftool` to get metadata from - all images fileformat, `get_meta` is implemented in this class, - and all the image-handling ones are inheriting from it.""" - meta_whitelist = set() # type: Set[str] - - @staticmethod - def __handle_problematic_filename(filename: str, callback) -> bytes: - """ This method takes a filename with a problematic name, - and safely applies it a `callback`.""" - tmpdirname = tempfile.mkdtemp() - fname = os.path.join(tmpdirname, "temp_file") - shutil.copy(filename, fname) - out = callback(fname) - shutil.rmtree(tmpdirname) - return out - - def get_meta(self) -> Dict[str, Union[str, dict]]: - """ There is no way to escape the leading(s) dash(es) of the current - self.filename to prevent parameter injections, so we need to take care - of this. - """ - fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f]) - if re.search('^[a-z0-9/]', self.filename) is None: - out = self.__handle_problematic_filename(self.filename, fun) - else: - out = fun(self.filename) - meta = json.loads(out.decode('utf-8'))[0] - for key in self.meta_whitelist: - meta.pop(key, None) - return meta - -class PNGParser(_ImageParser): +class PNGParser(exiftool.ExiftoolParser): mimetypes = {'image/png', } meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory', 'FileSize', 'FileModifyDate', @@ -77,7 +40,7 @@ class PNGParser(_ImageParser): return True -class GdkPixbufAbstractParser(_ImageParser): +class GdkPixbufAbstractParser(exiftool.ExiftoolParser): """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it, this has the side-effect of completely removing metadata. 
""" diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index 621640b..4a0ca0d 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py @@ -18,6 +18,8 @@ def __load_all_parsers(): continue elif fname.endswith('__init__.py'): continue + elif fname.endswith('exiftool.py'): + continue basename = os.path.basename(fname) name, _ = os.path.splitext(basename) importlib.import_module('.' + name, package='libmat2') diff --git a/libmat2/video.py b/libmat2/video.py new file mode 100644 index 0000000..b9f3687 --- /dev/null +++ b/libmat2/video.py @@ -0,0 +1,58 @@ +import os +import subprocess + +from . import exiftool + + +class AVIParser(exiftool.ExiftoolParser): + mimetypes = {'video/x-msvideo', } + meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory', + 'FileSize', 'FileModifyDate', 'FileAccessDate', + 'FileInodeChangeDate', 'FilePermissions', 'FileType', + 'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate', + 'FrameCount', 'StreamCount', 'StreamType', 'VideoCodec', + 'VideoFrameRate', 'VideoFrameCount', 'Quality', + 'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight', + 'Planes', 'BitDepth', 'Compression', 'ImageLength', + 'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors', + 'NumImportantColors', 'NumColors', 'NumImportantColors', + 'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask', + 'ColorSpace', 'AudioCodec', 'AudioCodecRate', + 'AudioSampleCount', 'AudioSampleCount', + 'AudioSampleRate', 'Encoding', 'NumChannels', + 'SampleRate', 'AvgBytesPerSec', 'BitsPerSample', + 'Duration', 'ImageSize', 'Megapixels'} + + def remove_all(self) -> bool: + """ + TODO: handle problematic filenames starting with `-` and `--`, + check exiftool.py + """ + cmd = [_get_ffmpeg_path(), + '-i', self.filename, # input file + '-y', # overwrite existing output file + '-loglevel', 'panic', # Don't show log + '-hide_banner', # hide the banner + '-codec', 'copy', # don't decode anything, just copy (speed!) 
+ '-map_metadata', '-1', # remove superficial metadata + '-map_chapters', '-1', # remove chapters + '-fflags', '+bitexact', # don't add any metadata + '-flags:v', '+bitexact', # don't add any metadata + '-flags:a', '+bitexact', # don't add any metadata + self.output_filename] + + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError: # pragma: no cover + return False + + return True + + +def _get_ffmpeg_path() -> str: # pragma: no cover + ffmpeg_path = '/usr/bin/ffmpeg' + if os.path.isfile(ffmpeg_path): + if os.access(ffmpeg_path, os.X_OK): + return ffmpeg_path + + raise RuntimeError("Unable to find ffmpeg") diff --git a/mat2 b/mat2 index 1665576..01263b6 100755 --- a/mat2 +++ b/mat2 @@ -97,7 +97,13 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) return False p.unknown_member_policy = policy p.lightweight_cleaning = is_lightweight - return p.remove_all() + + try: + return p.remove_all() + except RuntimeError as e: + print("[-] %s can't be cleaned: %s" % (filename, e)) + return False + def show_parsers() -> bool: diff --git a/tests/data/dirty.avi b/tests/data/dirty.avi new file mode 100644 index 0000000..850feab Binary files /dev/null and b/tests/data/dirty.avi differ diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 665bab0..37adc6a 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -6,12 +6,16 @@ import os import zipfile from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless -from libmat2 import check_dependencies +from libmat2 import check_dependencies, video class TestCheckDependencies(unittest.TestCase): def test_deps(self): - ret = check_dependencies() + try: + ret = check_dependencies() + except RuntimeError: + return # this happens if not every dependency is installed + for value in ret.values(): self.assertTrue(value) @@ -471,3 +475,24 @@ class TestCleaning(unittest.TestCase): os.remove('./tests/data/clean.txt') 
os.remove('./tests/data/clean.cleaned.txt') os.remove('./tests/data/clean.cleaned.cleaned.txt') + + def test_avi(self): + shutil.copy('./tests/data/dirty.avi', './tests/data/clean.avi') + p = video.AVIParser('./tests/data/clean.avi') + + meta = p.get_meta() + self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1') + + try: + ret = p.remove_all() + except RuntimeError: + return # this happens if ffmpeg is not installed + self.assertTrue(ret) + + p = video.AVIParser('./tests/data/clean.cleaned.avi') + self.assertEqual(p.get_meta(), {}) + self.assertTrue(p.remove_all()) + + os.remove('./tests/data/clean.avi') + os.remove('./tests/data/clean.cleaned.avi') + os.remove('./tests/data/clean.cleaned.cleaned.avi')