1
0
Fork 0

Implement support for .avi files, via ffmpeg

- This commit introduces optional dependencies (namely ffmpeg):
  mat2 will spit a warning when trying to process an .avi file
  if ffmpeg isn't installed.
- Since metadata are obtained via exiftool, this commit
  also refactors our exiftool wrapper a bit.
This commit is contained in:
jvoisin 2018-10-18 19:19:56 +02:00
parent 2ae5d909c3
commit e70ea811c9
10 changed files with 170 additions and 61 deletions

View File

@ -42,7 +42,7 @@ tests:debian:
stage: test stage: test
script: script:
- apt-get -qqy update - apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
- python3-coverage run --branch -m unittest discover -s tests/ - python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report --fail-under=100 -m --include 'libmat2/*' - python3-coverage report --fail-under=100 -m --include 'libmat2/*'
@ -62,5 +62,5 @@ tests:archlinux:
tags: tags:
- whitewhale - whitewhale
script: script:
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
- python3 setup.py test - python3 setup.py test

View File

@ -1,11 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os
import collections import collections
import enum import enum
import importlib import importlib
from typing import Dict, Optional from typing import Dict, Optional
from . import exiftool, video
# make pyflakes happy # make pyflakes happy
assert Dict assert Dict
assert Optional assert Optional
@ -37,24 +38,13 @@ DEPENDENCIES = {
'mutagen': 'Mutagen', 'mutagen': 'Mutagen',
} }
def _get_exiftool_path() -> str: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
# ArchLinux
exiftool_path = '/usr/bin/vendor_perl/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
raise ValueError
def check_dependencies() -> dict: def check_dependencies() -> dict:
ret = collections.defaultdict(bool) # type: Dict[str, bool] ret = collections.defaultdict(bool) # type: Dict[str, bool]
ret['Exiftool'] = True if _get_exiftool_path() else False ret['Exiftool'] = True if exiftool._get_exiftool_path() else False
ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False
for key, value in DEPENDENCIES.items(): for key, value in DEPENDENCIES.items():
ret[value] = True ret[value] = True

View File

@ -7,7 +7,8 @@ assert Set # make pyflakes happy
class AbstractParser(abc.ABC): class AbstractParser(abc.ABC):
""" This is the base class of every parser. """ This is the base class of every parser.
It might yield `ValueError` on instantiation on invalid files. It might yield `ValueError` on instantiation on invalid files,
and `RuntimeError` when something went wrong in `remove_all`.
""" """
meta_list = set() # type: Set[str] meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str] mimetypes = set() # type: Set[str]
@ -27,4 +28,7 @@ class AbstractParser(abc.ABC):
@abc.abstractmethod @abc.abstractmethod
def remove_all(self) -> bool: def remove_all(self) -> bool:
"""
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
pass # pragma: no cover pass # pragma: no cover

61
libmat2/exiftool.py Normal file
View File

@ -0,0 +1,61 @@
import json
import os
import re
import shutil
import subprocess
import tempfile
from typing import Dict, Union, Set
from . import abstract
# Make pyflakes happy
assert Set
class ExiftoolParser(abstract.AbstractParser):
    """ Exiftool is often the easiest way to get all the metadata
    from a file, hence why several parsers are re-using its `get_meta`
    method.
    """
    # Keys removed from exiftool's output by `get_meta`; empty here,
    # subclasses override it with their own set.
    meta_whitelist = set()  # type: Set[str]

    @staticmethod
    def __handle_problematic_filename(filename: str, callback) -> bytes:
        """ This method takes a filename with a problematic name,
        and safely applies it a `callback`.

        The file is copied under a fixed name into a temporary directory,
        and `callback` is called with the path of that copy.

        :param filename: path of the file with a problematic name.
        :param callback: callable taking a path, whose result is returned.
        :return: whatever `callback` returned.
        """
        # The context manager removes the directory (and the copy in it)
        # even when the copy or the callback raises.
        with tempfile.TemporaryDirectory() as tmpdirname:
            fname = os.path.join(tmpdirname, "temp_file")
            shutil.copy(filename, fname)
            return callback(fname)

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ Return the metadata that `exiftool -json` reports for
        `self.filename`, without the keys listed in `self.meta_whitelist`.

        There is no way to escape the leading(s) dash(es) of the current
        self.filename to prevent parameter injections, so every filename
        that doesn't start with a lowercase letter, a digit or a slash
        is handed to exiftool through a temporary copy instead.

        :raises RuntimeError: if exiftool can't be found.
        :raises subprocess.CalledProcessError: if exiftool exits with a
            non-zero status.
        """
        def run_exiftool(path: str) -> bytes:
            return subprocess.check_output([_get_exiftool_path(), '-json', path])

        if re.search('^[a-z0-9/]', self.filename) is None:
            out = self.__handle_problematic_filename(self.filename, run_exiftool)
        else:
            out = run_exiftool(self.filename)

        # exiftool outputs a json list with one entry per processed file
        meta = json.loads(out.decode('utf-8'))[0]
        for key in self.meta_whitelist:
            meta.pop(key, None)
        return meta
def _get_exiftool_path() -> str:  # pragma: no cover
    """ Return the path of the first executable exiftool found
    among the known installation locations.

    :raises RuntimeError: if none of the locations holds an executable file.
    """
    known_locations = (
        '/usr/bin/exiftool',
        '/usr/bin/vendor_perl/exiftool',  # ArchLinux
    )
    for candidate in known_locations:
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    raise RuntimeError("Unable to find exiftool")

View File

@ -1,11 +1,6 @@
import subprocess
import imghdr import imghdr
import json
import os import os
import shutil from typing import Set
import tempfile
import re
from typing import Set, Dict, Union
import cairo import cairo
@ -13,44 +8,12 @@ import gi
gi.require_version('GdkPixbuf', '2.0') gi.require_version('GdkPixbuf', '2.0')
from gi.repository import GdkPixbuf from gi.repository import GdkPixbuf
from . import abstract, _get_exiftool_path from . import exiftool
# Make pyflakes happy # Make pyflakes happy
assert Set assert Set
class _ImageParser(abstract.AbstractParser): class PNGParser(exiftool.ExiftoolParser):
""" Since we use `exiftool` to get metadata from
all images fileformat, `get_meta` is implemented in this class,
and all the image-handling ones are inheriting from it."""
meta_whitelist = set() # type: Set[str]
@staticmethod
def __handle_problematic_filename(filename: str, callback) -> bytes:
""" This method takes a filename with a problematic name,
and safely applies it a `callback`."""
tmpdirname = tempfile.mkdtemp()
fname = os.path.join(tmpdirname, "temp_file")
shutil.copy(filename, fname)
out = callback(fname)
shutil.rmtree(tmpdirname)
return out
def get_meta(self) -> Dict[str, Union[str, dict]]:
""" There is no way to escape the leading(s) dash(es) of the current
self.filename to prevent parameter injections, so we need to take care
of this.
"""
fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
if re.search('^[a-z0-9/]', self.filename) is None:
out = self.__handle_problematic_filename(self.filename, fun)
else:
out = fun(self.filename)
meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_whitelist:
meta.pop(key, None)
return meta
class PNGParser(_ImageParser):
mimetypes = {'image/png', } mimetypes = {'image/png', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate', 'Directory', 'FileSize', 'FileModifyDate',
@ -77,7 +40,7 @@ class PNGParser(_ImageParser):
return True return True
class GdkPixbufAbstractParser(_ImageParser): class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it, """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
this has the side-effect of completely removing metadata. this has the side-effect of completely removing metadata.
""" """

View File

@ -18,6 +18,8 @@ def __load_all_parsers():
continue continue
elif fname.endswith('__init__.py'): elif fname.endswith('__init__.py'):
continue continue
elif fname.endswith('exiftool.py'):
continue
basename = os.path.basename(fname) basename = os.path.basename(fname)
name, _ = os.path.splitext(basename) name, _ = os.path.splitext(basename)
importlib.import_module('.' + name, package='libmat2') importlib.import_module('.' + name, package='libmat2')

58
libmat2/video.py Normal file
View File

@ -0,0 +1,58 @@
import os
import subprocess
from . import exiftool
class AVIParser(exiftool.ExiftoolParser):
    """ Parser for `.avi` files: metadata are read with exiftool
    (via the inherited `get_meta`), and removed by having ffmpeg
    copy the streams into a new file.
    """
    mimetypes = {'video/x-msvideo', }
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
                      'FileSize', 'FileModifyDate', 'FileAccessDate',
                      'FileInodeChangeDate', 'FilePermissions', 'FileType',
                      'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate',
                      'FrameCount', 'StreamCount', 'StreamType', 'VideoCodec',
                      'VideoFrameRate', 'VideoFrameCount', 'Quality',
                      'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight',
                      'Planes', 'BitDepth', 'Compression', 'ImageLength',
                      'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',
                      'NumImportantColors', 'RedMask', 'GreenMask', 'BlueMask',
                      'AlphaMask', 'ColorSpace', 'AudioCodec', 'AudioCodecRate',
                      'AudioSampleCount', 'AudioSampleRate', 'Encoding',
                      'NumChannels', 'SampleRate', 'AvgBytesPerSec',
                      'BitsPerSample', 'Duration', 'ImageSize', 'Megapixels'}

    @staticmethod
    def _protect_leading_dash(path: str) -> str:
        """ Return `path` in a form that can't be mistaken for a
        command-line option: a path starting with `-` gets prefixed
        with `./`, which designates the very same file.
        """
        if path.startswith('-'):
            return os.path.join('.', path)
        return path

    def remove_all(self) -> bool:
        """ Write a copy of `self.filename` to `self.output_filename`,
        asking ffmpeg to drop the metadata and chapters on the way.

        :return: True on success, False if ffmpeg exited with a
            non-zero status.
        :raises RuntimeError: if ffmpeg can't be found.
        """
        cmd = [_get_ffmpeg_path(),
               '-i', self._protect_leading_dash(self.filename),  # input file
               '-y',                     # overwrite existing output file
               '-loglevel', 'panic',     # Don't show log
               '-hide_banner',           # hide the banner
               '-codec', 'copy',         # don't decode anything, just copy (speed!)
               '-map_metadata', '-1',    # remove superficial metadata
               '-map_chapters', '-1',    # remove chapters
               '-fflags', '+bitexact',   # don't add any metadata
               '-flags:v', '+bitexact',  # don't add any metadata
               '-flags:a', '+bitexact',  # don't add any metadata
               self._protect_leading_dash(self.output_filename)]

        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError:  # pragma: no cover
            return False
        return True
def _get_ffmpeg_path() -> str:  # pragma: no cover
    """ Return the path of the ffmpeg executable.

    :raises RuntimeError: if `/usr/bin/ffmpeg` isn't an executable file.
    """
    candidate = '/usr/bin/ffmpeg'
    usable = os.path.isfile(candidate) and os.access(candidate, os.X_OK)
    if not usable:
        raise RuntimeError("Unable to find ffmpeg")
    return candidate

8
mat2
View File

@ -97,7 +97,13 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy)
return False return False
p.unknown_member_policy = policy p.unknown_member_policy = policy
p.lightweight_cleaning = is_lightweight p.lightweight_cleaning = is_lightweight
return p.remove_all()
try:
return p.remove_all()
except RuntimeError as e:
print("[-] %s can't be cleaned: %s" % (filename, e))
return False
def show_parsers() -> bool: def show_parsers() -> bool:

BIN
tests/data/dirty.avi Normal file

Binary file not shown.

View File

@ -6,12 +6,16 @@ import os
import zipfile import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies from libmat2 import check_dependencies, video
class TestCheckDependencies(unittest.TestCase): class TestCheckDependencies(unittest.TestCase):
def test_deps(self): def test_deps(self):
ret = check_dependencies() try:
ret = check_dependencies()
except RuntimeError:
return # this happens if not every dependency is installed
for value in ret.values(): for value in ret.values():
self.assertTrue(value) self.assertTrue(value)
@ -471,3 +475,24 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.txt') os.remove('./tests/data/clean.txt')
os.remove('./tests/data/clean.cleaned.txt') os.remove('./tests/data/clean.cleaned.txt')
os.remove('./tests/data/clean.cleaned.cleaned.txt') os.remove('./tests/data/clean.cleaned.cleaned.txt')
def test_avi(self):
    """ A dirty .avi file must expose its metadata before cleaning,
    and none once cleaned; cleaning an already clean file must work too.
    """
    shutil.copy('./tests/data/dirty.avi', './tests/data/clean.avi')
    p = video.AVIParser('./tests/data/clean.avi')

    meta = p.get_meta()
    self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')

    try:
        ret = p.remove_all()
    except RuntimeError:
        # this happens if ffmpeg is not installed: nothing has been
        # produced yet, so only the working copy has to be removed.
        os.remove('./tests/data/clean.avi')
        return
    self.assertTrue(ret)

    p = video.AVIParser('./tests/data/clean.cleaned.avi')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    os.remove('./tests/data/clean.avi')
    os.remove('./tests/data/clean.cleaned.avi')
    os.remove('./tests/data/clean.cleaned.cleaned.avi')