1
0
Fork 0

Implement support for .avi files, via ffmpeg

- This commit introduces optional dependencies (namely ffmpeg):
  mat2 will spit a warning when trying to process an .avi file
  if ffmpeg isn't installed.
- Since metadata are obtained via exiftool, this commit
  also refactors our exiftool wrapper a bit.
This commit is contained in:
jvoisin 2018-10-18 19:19:56 +02:00
parent 2ae5d909c3
commit e70ea811c9
10 changed files with 170 additions and 61 deletions

View File

@ -42,7 +42,7 @@ tests:debian:
stage: test
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
- python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report --fail-under=100 -m --include 'libmat2/*'
@ -62,5 +62,5 @@ tests:archlinux:
tags:
- whitewhale
script:
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
- python3 setup.py test

View File

@ -1,11 +1,12 @@
#!/usr/bin/env python3
import os
import collections
import enum
import importlib
from typing import Dict, Optional
from . import exiftool, video
# make pyflakes happy
assert Dict
assert Optional
@ -37,24 +38,13 @@ DEPENDENCIES = {
'mutagen': 'Mutagen',
}
def _get_exiftool_path() -> str:  # pragma: no cover
    """ Return the path of the first known exiftool location that is
    an executable regular file.

    :raises ValueError: if none of the known locations is usable.
    """
    candidates = (
        '/usr/bin/exiftool',
        '/usr/bin/vendor_perl/exiftool',  # ArchLinux
    )
    for candidate in candidates:
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    raise ValueError
def check_dependencies() -> dict:
ret = collections.defaultdict(bool) # type: Dict[str, bool]
ret['Exiftool'] = True if _get_exiftool_path() else False
ret['Exiftool'] = True if exiftool._get_exiftool_path() else False
ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False
for key, value in DEPENDENCIES.items():
ret[value] = True

View File

@ -7,7 +7,8 @@ assert Set # make pyflakes happy
class AbstractParser(abc.ABC):
""" This is the base class of every parser.
It might yield `ValueError` on instantiation on invalid files.
It might yield `ValueError` on instantiation on invalid files,
and `RuntimeError` when something went wrong in `remove_all`.
"""
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
@ -27,4 +28,7 @@ class AbstractParser(abc.ABC):
@abc.abstractmethod
def remove_all(self) -> bool:
"""
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
pass # pragma: no cover

61
libmat2/exiftool.py Normal file
View File

@ -0,0 +1,61 @@
import json
import os
import re
import shutil
import subprocess
import tempfile
from typing import Dict, Union, Set
from . import abstract
# Make pyflakes happy
assert Set
class ExiftoolParser(abstract.AbstractParser):
    """ Exiftool is often the easiest way to get all the metadata
    from a file, hence why several parsers are re-using its `get_meta`
    method.
    """
    # Keys that `get_meta` strips from exiftool's output.
    meta_whitelist = set()  # type: Set[str]

    @staticmethod
    def __handle_problematic_filename(filename: str, callback) -> bytes:
        """ This method takes a filename with a problematic name,
        and safely applies a `callback` to it.

        The file is copied under the fixed name `temp_file` inside a fresh
        temporary directory, and `callback` is called with the path of
        that copy. The temporary directory is removed afterwards, even
        if the copy or the callback raised.

        :param filename: path of the file with the problematic name.
        :param callback: callable taking a path, whose result is returned.
        :returns: whatever `callback` returned.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            fname = os.path.join(tmpdirname, "temp_file")
            shutil.copy(filename, fname)
            return callback(fname)

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ There is no way to escape the leading(s) dash(es) of the current
        self.filename to prevent parameter injections, so we need to take care
        of this.

        :returns: the first entry of exiftool's json output for the file,
            minus the keys listed in `meta_whitelist`.
        """
        def run_exiftool(path: str) -> bytes:
            return subprocess.check_output([_get_exiftool_path(), '-json', path])

        # Any name not starting with a lowercase letter, a digit or a slash
        # is handed to exiftool through a copy with a harmless name.
        if re.search('^[a-z0-9/]', self.filename) is None:
            out = self.__handle_problematic_filename(self.filename, run_exiftool)
        else:
            out = run_exiftool(self.filename)
        meta = json.loads(out.decode('utf-8'))[0]
        for key in self.meta_whitelist:
            meta.pop(key, None)
        return meta
def _get_exiftool_path() -> str:  # pragma: no cover
    """ Locate the exiftool executable.

    :returns: the first of the known locations that is an executable
        regular file.
    :raises RuntimeError: if exiftool is in none of the known locations.
    """
    def _is_usable(path: str) -> bool:
        return os.path.isfile(path) and os.access(path, os.X_OK)

    default_path = '/usr/bin/exiftool'
    if _is_usable(default_path):
        return default_path

    # ArchLinux
    arch_path = '/usr/bin/vendor_perl/exiftool'
    if _is_usable(arch_path):
        return arch_path

    raise RuntimeError("Unable to find exiftool")

View File

@ -1,11 +1,6 @@
import subprocess
import imghdr
import json
import os
import shutil
import tempfile
import re
from typing import Set, Dict, Union
from typing import Set
import cairo
@ -13,44 +8,12 @@ import gi
gi.require_version('GdkPixbuf', '2.0')
from gi.repository import GdkPixbuf
from . import abstract, _get_exiftool_path
from . import exiftool
# Make pyflakes happy
assert Set
class _ImageParser(abstract.AbstractParser):
    """ Since we use `exiftool` to get metadata from
    all image file formats, `get_meta` is implemented in this class,
    and all the image-handling ones are inheriting from it."""
    # Keys that `get_meta` strips from exiftool's output.
    meta_whitelist = set()  # type: Set[str]

    @staticmethod
    def __handle_problematic_filename(filename: str, callback) -> bytes:
        """ This method takes a filename with a problematic name,
        and safely applies a `callback` to it.

        The file is copied under the fixed name `temp_file` inside a fresh
        temporary directory, which is removed afterwards, even if the
        copy or the callback raised.

        :param filename: path of the file with the problematic name.
        :param callback: callable taking a path, whose result is returned.
        :returns: whatever `callback` returned.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            fname = os.path.join(tmpdirname, "temp_file")
            shutil.copy(filename, fname)
            return callback(fname)

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ There is no way to escape the leading(s) dash(es) of the current
        self.filename to prevent parameter injections, so we need to take care
        of this.

        :returns: the first entry of exiftool's json output for the file,
            minus the keys listed in `meta_whitelist`.
        """
        def run_exiftool(path: str) -> bytes:
            return subprocess.check_output([_get_exiftool_path(), '-json', path])

        # Any name not starting with a lowercase letter, a digit or a slash
        # is handed to exiftool through a copy with a harmless name.
        if re.search('^[a-z0-9/]', self.filename) is None:
            out = self.__handle_problematic_filename(self.filename, run_exiftool)
        else:
            out = run_exiftool(self.filename)
        meta = json.loads(out.decode('utf-8'))[0]
        for key in self.meta_whitelist:
            meta.pop(key, None)
        return meta
class PNGParser(_ImageParser):
class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
@ -77,7 +40,7 @@ class PNGParser(_ImageParser):
return True
class GdkPixbufAbstractParser(_ImageParser):
class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
this has the side-effect of completely removing metadata.
"""

View File

@ -18,6 +18,8 @@ def __load_all_parsers():
continue
elif fname.endswith('__init__.py'):
continue
elif fname.endswith('exiftool.py'):
continue
basename = os.path.basename(fname)
name, _ = os.path.splitext(basename)
importlib.import_module('.' + name, package='libmat2')

58
libmat2/video.py Normal file
View File

@ -0,0 +1,58 @@
import os
import subprocess
from . import exiftool
class AVIParser(exiftool.ExiftoolParser):
    """ Parser for `.avi` files: the metadata are removed by
    re-muxing the file with ffmpeg. """
    mimetypes = {'video/x-msvideo', }
    meta_whitelist = {
        'SourceFile', 'ExifToolVersion', 'FileName', 'Directory', 'FileSize',
        'FileModifyDate', 'FileAccessDate', 'FileInodeChangeDate',
        'FilePermissions', 'FileType', 'FileTypeExtension', 'MIMEType',
        'FrameRate', 'MaxDataRate', 'FrameCount', 'StreamCount',
        'StreamType', 'VideoCodec', 'VideoFrameRate', 'VideoFrameCount',
        'Quality', 'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight',
        'Planes', 'BitDepth', 'Compression', 'ImageLength',
        'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',
        'NumImportantColors', 'RedMask', 'GreenMask', 'BlueMask',
        'AlphaMask', 'ColorSpace', 'AudioCodec', 'AudioCodecRate',
        'AudioSampleCount', 'AudioSampleRate', 'Encoding', 'NumChannels',
        'SampleRate', 'AvgBytesPerSec', 'BitsPerSample', 'Duration',
        'ImageSize', 'Megapixels',
    }

    def remove_all(self) -> bool:
        """
        Run ffmpeg to write a copy of `self.filename`, stripped from its
        metadata and chapters, to `self.output_filename`.

        TODO: handle problematic filenames starting with `-` and `--`,
        check exiftool.py

        :returns: True if ffmpeg succeeded, False if it exited with an error.
        :raises RuntimeError: if ffmpeg can't be found.
        """
        ffmpeg_options = [
            '-y',                      # overwrite existing output file
            '-loglevel', 'panic',      # Don't show log
            '-hide_banner',            # hide the banner
            '-codec', 'copy',          # don't decode anything, just copy (speed!)
            '-map_metadata', '-1',     # remove superficial metadata
            '-map_chapters', '-1',     # remove chapters
            '-fflags', '+bitexact',    # don't add any metadata
            '-flags:v', '+bitexact',   # don't add any metadata
            '-flags:a', '+bitexact',   # don't add any metadata
        ]
        command = [_get_ffmpeg_path(), '-i', self.filename]  # input file
        command += ffmpeg_options
        command.append(self.output_filename)

        try:
            subprocess.check_call(command)
        except subprocess.CalledProcessError:  # pragma: no cover
            return False
        else:
            return True
def _get_ffmpeg_path() -> str:  # pragma: no cover
    """ Locate the ffmpeg executable.

    :returns: `/usr/bin/ffmpeg`, if it is an executable regular file.
    :raises RuntimeError: otherwise.
    """
    candidate = '/usr/bin/ffmpeg'
    usable = os.path.isfile(candidate) and os.access(candidate, os.X_OK)
    if not usable:
        raise RuntimeError("Unable to find ffmpeg")
    return candidate

8
mat2
View File

@ -97,7 +97,13 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy)
return False
p.unknown_member_policy = policy
p.lightweight_cleaning = is_lightweight
return p.remove_all()
try:
return p.remove_all()
except RuntimeError as e:
print("[-] %s can't be cleaned: %s" % (filename, e))
return False
def show_parsers() -> bool:

BIN
tests/data/dirty.avi Normal file

Binary file not shown.

View File

@ -6,12 +6,16 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies
from libmat2 import check_dependencies, video
class TestCheckDependencies(unittest.TestCase):
def test_deps(self):
ret = check_dependencies()
try:
ret = check_dependencies()
except RuntimeError:
return # this happens if not every dependency is installed
for value in ret.values():
self.assertTrue(value)
@ -471,3 +475,24 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.txt')
os.remove('./tests/data/clean.cleaned.txt')
os.remove('./tests/data/clean.cleaned.cleaned.txt')
def test_avi(self):
    """ Clean a copy of `dirty.avi`, check that the cleaned file has no
    metadata left, and that it can itself be cleaned again. """
    shutil.copy('./tests/data/dirty.avi', './tests/data/clean.avi')
    p = video.AVIParser('./tests/data/clean.avi')

    meta = p.get_meta()
    self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')

    try:
        ret = p.remove_all()
    except RuntimeError:
        # this happens if ffmpeg is not installed: remove the working
        # copy, so that the test doesn't leave files behind.
        os.remove('./tests/data/clean.avi')
        return
    self.assertTrue(ret)

    p = video.AVIParser('./tests/data/clean.cleaned.avi')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    os.remove('./tests/data/clean.avi')
    os.remove('./tests/data/clean.cleaned.avi')
    os.remove('./tests/data/clean.cleaned.cleaned.avi')