Implement support for .avi files, via ffmpeg
- This commit introduces optional dependencies (namely ffmpeg): mat2 will print a warning when trying to process an .avi file if ffmpeg isn't installed. - Since metadata is obtained via exiftool, this commit also slightly refactors our exiftool wrapper.
This commit is contained in:
parent
2ae5d909c3
commit
e70ea811c9
@ -42,7 +42,7 @@ tests:debian:
|
|||||||
stage: test
|
stage: test
|
||||||
script:
|
script:
|
||||||
- apt-get -qqy update
|
- apt-get -qqy update
|
||||||
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
|
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
|
||||||
- python3-coverage run --branch -m unittest discover -s tests/
|
- python3-coverage run --branch -m unittest discover -s tests/
|
||||||
- python3-coverage report --fail-under=100 -m --include 'libmat2/*'
|
- python3-coverage report --fail-under=100 -m --include 'libmat2/*'
|
||||||
|
|
||||||
@ -62,5 +62,5 @@ tests:archlinux:
|
|||||||
tags:
|
tags:
|
||||||
- whitewhale
|
- whitewhale
|
||||||
script:
|
script:
|
||||||
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap
|
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
|
||||||
- python3 setup.py test
|
- python3 setup.py test
|
||||||
|
@ -1,11 +1,12 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import os
|
|
||||||
import collections
|
import collections
|
||||||
import enum
|
import enum
|
||||||
import importlib
|
import importlib
|
||||||
from typing import Dict, Optional
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
from . import exiftool, video
|
||||||
|
|
||||||
# make pyflakes happy
|
# make pyflakes happy
|
||||||
assert Dict
|
assert Dict
|
||||||
assert Optional
|
assert Optional
|
||||||
@ -37,24 +38,13 @@ DEPENDENCIES = {
|
|||||||
'mutagen': 'Mutagen',
|
'mutagen': 'Mutagen',
|
||||||
}
|
}
|
||||||
|
|
||||||
def _get_exiftool_path() -> str: # pragma: no cover
|
|
||||||
exiftool_path = '/usr/bin/exiftool'
|
|
||||||
if os.path.isfile(exiftool_path):
|
|
||||||
if os.access(exiftool_path, os.X_OK):
|
|
||||||
return exiftool_path
|
|
||||||
|
|
||||||
# ArchLinux
|
|
||||||
exiftool_path = '/usr/bin/vendor_perl/exiftool'
|
|
||||||
if os.path.isfile(exiftool_path):
|
|
||||||
if os.access(exiftool_path, os.X_OK):
|
|
||||||
return exiftool_path
|
|
||||||
|
|
||||||
raise ValueError
|
|
||||||
|
|
||||||
def check_dependencies() -> dict:
|
def check_dependencies() -> dict:
|
||||||
ret = collections.defaultdict(bool) # type: Dict[str, bool]
|
ret = collections.defaultdict(bool) # type: Dict[str, bool]
|
||||||
|
|
||||||
ret['Exiftool'] = True if _get_exiftool_path() else False
|
ret['Exiftool'] = True if exiftool._get_exiftool_path() else False
|
||||||
|
ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False
|
||||||
|
|
||||||
for key, value in DEPENDENCIES.items():
|
for key, value in DEPENDENCIES.items():
|
||||||
ret[value] = True
|
ret[value] = True
|
||||||
|
@ -7,7 +7,8 @@ assert Set # make pyflakes happy
|
|||||||
|
|
||||||
class AbstractParser(abc.ABC):
|
class AbstractParser(abc.ABC):
|
||||||
""" This is the base class of every parser.
|
""" This is the base class of every parser.
|
||||||
It might yield `ValueError` on instantiation on invalid files.
|
It might yield `ValueError` on instantiation on invalid files,
|
||||||
|
and `RuntimeError` when something went wrong in `remove_all`.
|
||||||
"""
|
"""
|
||||||
meta_list = set() # type: Set[str]
|
meta_list = set() # type: Set[str]
|
||||||
mimetypes = set() # type: Set[str]
|
mimetypes = set() # type: Set[str]
|
||||||
@ -27,4 +28,7 @@ class AbstractParser(abc.ABC):
|
|||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def remove_all(self) -> bool:
|
def remove_all(self) -> bool:
|
||||||
|
"""
|
||||||
|
:raises RuntimeError: Raised if the cleaning process went wrong.
|
||||||
|
"""
|
||||||
pass # pragma: no cover
|
pass # pragma: no cover
|
||||||
|
61
libmat2/exiftool.py
Normal file
61
libmat2/exiftool.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from typing import Dict, Union, Set
|
||||||
|
|
||||||
|
from . import abstract
|
||||||
|
|
||||||
|
# Make pyflakes happy
|
||||||
|
assert Set
|
||||||
|
|
||||||
|
|
||||||
|
class ExiftoolParser(abstract.AbstractParser):
|
||||||
|
""" Exiftool is often the easiest way to get all the metadata
|
||||||
|
from a file, hence why several parsers are re-using its `get_meta`
|
||||||
|
method.
|
||||||
|
"""
|
||||||
|
meta_whitelist = set() # type: Set[str]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def __handle_problematic_filename(filename: str, callback) -> bytes:
|
||||||
|
""" This method takes a filename with a problematic name,
|
||||||
|
and safely applies it a `callback`."""
|
||||||
|
tmpdirname = tempfile.mkdtemp()
|
||||||
|
fname = os.path.join(tmpdirname, "temp_file")
|
||||||
|
shutil.copy(filename, fname)
|
||||||
|
out = callback(fname)
|
||||||
|
shutil.rmtree(tmpdirname)
|
||||||
|
return out
|
||||||
|
|
||||||
|
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||||
|
""" There is no way to escape the leading dash(es) of the current
|
||||||
|
self.filename to prevent parameter injections, so we need to take care
|
||||||
|
of this.
|
||||||
|
"""
|
||||||
|
fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
|
||||||
|
if re.search('^[a-z0-9/]', self.filename) is None:
|
||||||
|
out = self.__handle_problematic_filename(self.filename, fun)
|
||||||
|
else:
|
||||||
|
out = fun(self.filename)
|
||||||
|
meta = json.loads(out.decode('utf-8'))[0]
|
||||||
|
for key in self.meta_whitelist:
|
||||||
|
meta.pop(key, None)
|
||||||
|
return meta
|
||||||
|
|
||||||
|
def _get_exiftool_path() -> str: # pragma: no cover
|
||||||
|
exiftool_path = '/usr/bin/exiftool'
|
||||||
|
if os.path.isfile(exiftool_path):
|
||||||
|
if os.access(exiftool_path, os.X_OK):
|
||||||
|
return exiftool_path
|
||||||
|
|
||||||
|
# ArchLinux
|
||||||
|
exiftool_path = '/usr/bin/vendor_perl/exiftool'
|
||||||
|
if os.path.isfile(exiftool_path):
|
||||||
|
if os.access(exiftool_path, os.X_OK):
|
||||||
|
return exiftool_path
|
||||||
|
|
||||||
|
raise RuntimeError("Unable to find exiftool")
|
@ -1,11 +1,6 @@
|
|||||||
import subprocess
|
|
||||||
import imghdr
|
import imghdr
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import shutil
|
from typing import Set
|
||||||
import tempfile
|
|
||||||
import re
|
|
||||||
from typing import Set, Dict, Union
|
|
||||||
|
|
||||||
import cairo
|
import cairo
|
||||||
|
|
||||||
@ -13,44 +8,12 @@ import gi
|
|||||||
gi.require_version('GdkPixbuf', '2.0')
|
gi.require_version('GdkPixbuf', '2.0')
|
||||||
from gi.repository import GdkPixbuf
|
from gi.repository import GdkPixbuf
|
||||||
|
|
||||||
from . import abstract, _get_exiftool_path
|
from . import exiftool
|
||||||
|
|
||||||
# Make pyflakes happy
|
# Make pyflakes happy
|
||||||
assert Set
|
assert Set
|
||||||
|
|
||||||
class _ImageParser(abstract.AbstractParser):
|
class PNGParser(exiftool.ExiftoolParser):
|
||||||
""" Since we use `exiftool` to get metadata from
|
|
||||||
all images fileformat, `get_meta` is implemented in this class,
|
|
||||||
and all the image-handling ones are inheriting from it."""
|
|
||||||
meta_whitelist = set() # type: Set[str]
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def __handle_problematic_filename(filename: str, callback) -> bytes:
|
|
||||||
""" This method takes a filename with a problematic name,
|
|
||||||
and safely applies it a `callback`."""
|
|
||||||
tmpdirname = tempfile.mkdtemp()
|
|
||||||
fname = os.path.join(tmpdirname, "temp_file")
|
|
||||||
shutil.copy(filename, fname)
|
|
||||||
out = callback(fname)
|
|
||||||
shutil.rmtree(tmpdirname)
|
|
||||||
return out
|
|
||||||
|
|
||||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
|
||||||
""" There is no way to escape the leading(s) dash(es) of the current
|
|
||||||
self.filename to prevent parameter injections, so we need to take care
|
|
||||||
of this.
|
|
||||||
"""
|
|
||||||
fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
|
|
||||||
if re.search('^[a-z0-9/]', self.filename) is None:
|
|
||||||
out = self.__handle_problematic_filename(self.filename, fun)
|
|
||||||
else:
|
|
||||||
out = fun(self.filename)
|
|
||||||
meta = json.loads(out.decode('utf-8'))[0]
|
|
||||||
for key in self.meta_whitelist:
|
|
||||||
meta.pop(key, None)
|
|
||||||
return meta
|
|
||||||
|
|
||||||
class PNGParser(_ImageParser):
|
|
||||||
mimetypes = {'image/png', }
|
mimetypes = {'image/png', }
|
||||||
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
|
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
|
||||||
'Directory', 'FileSize', 'FileModifyDate',
|
'Directory', 'FileSize', 'FileModifyDate',
|
||||||
@ -77,7 +40,7 @@ class PNGParser(_ImageParser):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
class GdkPixbufAbstractParser(_ImageParser):
|
class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
|
||||||
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
|
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
|
||||||
this has the side-effect of completely removing metadata.
|
this has the side-effect of completely removing metadata.
|
||||||
"""
|
"""
|
||||||
|
@ -18,6 +18,8 @@ def __load_all_parsers():
|
|||||||
continue
|
continue
|
||||||
elif fname.endswith('__init__.py'):
|
elif fname.endswith('__init__.py'):
|
||||||
continue
|
continue
|
||||||
|
elif fname.endswith('exiftool.py'):
|
||||||
|
continue
|
||||||
basename = os.path.basename(fname)
|
basename = os.path.basename(fname)
|
||||||
name, _ = os.path.splitext(basename)
|
name, _ = os.path.splitext(basename)
|
||||||
importlib.import_module('.' + name, package='libmat2')
|
importlib.import_module('.' + name, package='libmat2')
|
||||||
|
58
libmat2/video.py
Normal file
58
libmat2/video.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
from . import exiftool
|
||||||
|
|
||||||
|
|
||||||
|
class AVIParser(exiftool.ExiftoolParser):
|
||||||
|
mimetypes = {'video/x-msvideo', }
|
||||||
|
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
|
||||||
|
'FileSize', 'FileModifyDate', 'FileAccessDate',
|
||||||
|
'FileInodeChangeDate', 'FilePermissions', 'FileType',
|
||||||
|
'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate',
|
||||||
|
'FrameCount', 'StreamCount', 'StreamType', 'VideoCodec',
|
||||||
|
'VideoFrameRate', 'VideoFrameCount', 'Quality',
|
||||||
|
'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight',
|
||||||
|
'Planes', 'BitDepth', 'Compression', 'ImageLength',
|
||||||
|
'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',
|
||||||
|
'NumImportantColors', 'NumColors', 'NumImportantColors',
|
||||||
|
'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
|
||||||
|
'ColorSpace', 'AudioCodec', 'AudioCodecRate',
|
||||||
|
'AudioSampleCount', 'AudioSampleCount',
|
||||||
|
'AudioSampleRate', 'Encoding', 'NumChannels',
|
||||||
|
'SampleRate', 'AvgBytesPerSec', 'BitsPerSample',
|
||||||
|
'Duration', 'ImageSize', 'Megapixels'}
|
||||||
|
|
||||||
|
def remove_all(self) -> bool:
|
||||||
|
"""
|
||||||
|
TODO: handle problematic filenames starting with `-` and `--`,
|
||||||
|
check exiftool.py
|
||||||
|
"""
|
||||||
|
cmd = [_get_ffmpeg_path(),
|
||||||
|
'-i', self.filename, # input file
|
||||||
|
'-y', # overwrite existing output file
|
||||||
|
'-loglevel', 'panic', # Don't show log
|
||||||
|
'-hide_banner', # hide the banner
|
||||||
|
'-codec', 'copy', # don't decode anything, just copy (speed!)
|
||||||
|
'-map_metadata', '-1', # remove superficial metadata
|
||||||
|
'-map_chapters', '-1', # remove chapters
|
||||||
|
'-fflags', '+bitexact', # don't add any metadata
|
||||||
|
'-flags:v', '+bitexact', # don't add any metadata
|
||||||
|
'-flags:a', '+bitexact', # don't add any metadata
|
||||||
|
self.output_filename]
|
||||||
|
|
||||||
|
try:
|
||||||
|
subprocess.check_call(cmd)
|
||||||
|
except subprocess.CalledProcessError: # pragma: no cover
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ffmpeg_path() -> str: # pragma: no cover
|
||||||
|
ffmpeg_path = '/usr/bin/ffmpeg'
|
||||||
|
if os.path.isfile(ffmpeg_path):
|
||||||
|
if os.access(ffmpeg_path, os.X_OK):
|
||||||
|
return ffmpeg_path
|
||||||
|
|
||||||
|
raise RuntimeError("Unable to find ffmpeg")
|
6
mat2
6
mat2
@ -97,7 +97,13 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy)
|
|||||||
return False
|
return False
|
||||||
p.unknown_member_policy = policy
|
p.unknown_member_policy = policy
|
||||||
p.lightweight_cleaning = is_lightweight
|
p.lightweight_cleaning = is_lightweight
|
||||||
|
|
||||||
|
try:
|
||||||
return p.remove_all()
|
return p.remove_all()
|
||||||
|
except RuntimeError as e:
|
||||||
|
print("[-] %s can't be cleaned: %s" % (filename, e))
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def show_parsers() -> bool:
|
def show_parsers() -> bool:
|
||||||
|
BIN
tests/data/dirty.avi
Normal file
BIN
tests/data/dirty.avi
Normal file
Binary file not shown.
@ -6,12 +6,16 @@ import os
|
|||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
|
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
|
||||||
from libmat2 import check_dependencies
|
from libmat2 import check_dependencies, video
|
||||||
|
|
||||||
|
|
||||||
class TestCheckDependencies(unittest.TestCase):
|
class TestCheckDependencies(unittest.TestCase):
|
||||||
def test_deps(self):
|
def test_deps(self):
|
||||||
|
try:
|
||||||
ret = check_dependencies()
|
ret = check_dependencies()
|
||||||
|
except RuntimeError:
|
||||||
|
return # this happens if not every dependency is installed
|
||||||
|
|
||||||
for value in ret.values():
|
for value in ret.values():
|
||||||
self.assertTrue(value)
|
self.assertTrue(value)
|
||||||
|
|
||||||
@ -471,3 +475,24 @@ class TestCleaning(unittest.TestCase):
|
|||||||
os.remove('./tests/data/clean.txt')
|
os.remove('./tests/data/clean.txt')
|
||||||
os.remove('./tests/data/clean.cleaned.txt')
|
os.remove('./tests/data/clean.cleaned.txt')
|
||||||
os.remove('./tests/data/clean.cleaned.cleaned.txt')
|
os.remove('./tests/data/clean.cleaned.cleaned.txt')
|
||||||
|
|
||||||
|
def test_avi(self):
|
||||||
|
shutil.copy('./tests/data/dirty.avi', './tests/data/clean.avi')
|
||||||
|
p = video.AVIParser('./tests/data/clean.avi')
|
||||||
|
|
||||||
|
meta = p.get_meta()
|
||||||
|
self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')
|
||||||
|
|
||||||
|
try:
|
||||||
|
ret = p.remove_all()
|
||||||
|
except RuntimeError:
|
||||||
|
return # this happens if ffmpeg is not installed
|
||||||
|
self.assertTrue(ret)
|
||||||
|
|
||||||
|
p = video.AVIParser('./tests/data/clean.cleaned.avi')
|
||||||
|
self.assertEqual(p.get_meta(), {})
|
||||||
|
self.assertTrue(p.remove_all())
|
||||||
|
|
||||||
|
os.remove('./tests/data/clean.avi')
|
||||||
|
os.remove('./tests/data/clean.cleaned.avi')
|
||||||
|
os.remove('./tests/data/clean.cleaned.cleaned.avi')
|
||||||
|
Loading…
Reference in New Issue
Block a user