1
0
mirror of synced 2024-11-25 02:34:24 +01:00

Test for faulty files, and document how MAT2 behaves with respect to them

This commit is contained in:
jvoisin 2018-05-06 21:58:31 +02:00
parent 459e9b82f7
commit b02d72887a
5 changed files with 38 additions and 3 deletions

View File

@ -9,6 +9,14 @@ that only cleans the superficial metadata of your file, but not
the ones that might be in **embedded** resources. Like for example, the ones that might be in **embedded** resources. Like for example,
images in a PDF or an office document. images in a PDF or an office document.
Race conditions
---------------
MAT2 does its very best to avoid crashing at runtime. This is why it checks
whether the file is valid __at parser creation__. MAT2 doesn't take any measures
to ensure that the file is not changed between the time the parser is
instantiated and the call to clean or show the metadata.
Symlink attacks Symlink attacks
--------------- ---------------

View File

@ -20,6 +20,13 @@ class PNGParser(abstract.AbstractParser):
'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize',
'Megapixels', 'ImageHeight'} 'Megapixels', 'ImageHeight'}
def __init__(self, filename):
    """Create the parser and check right away that cairo can load the file.

    Raises:
        ValueError: if loading `filename` with cairo fails with a
            MemoryError (presumably how cairo reports an invalid PNG
            here -- confirm against the pycairo version in use).
    """
    super().__init__(filename)
    try:  # better fail here than later
        cairo.ImageSurface.create_from_png(self.filename)
    except MemoryError as e:
        # Same exception type as before; the message and the explicit
        # chaining only make the failure easier to diagnose.
        raise ValueError("%s is not a valid PNG file" % self.filename) from e
def get_meta(self): def get_meta(self):
out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename]) out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename])
meta = json.loads(out.decode('utf-8'))[0] meta = json.loads(out.decode('utf-8'))[0]

View File

@ -30,5 +30,8 @@ def get_parser(filename: str) -> (T, str):
for c in _get_parsers(): for c in _get_parsers():
if mtype in c.mimetypes: if mtype in c.mimetypes:
return c(filename), mtype try:
return c(filename), mtype
except ValueError:
return None, mtype
return None, mtype return None, mtype

View File

@ -11,7 +11,7 @@ import io
import cairo import cairo
import gi import gi
gi.require_version('Poppler', '0.18') gi.require_version('Poppler', '0.18')
from gi.repository import Poppler from gi.repository import Poppler, GLib
from . import abstract from . import abstract
@ -28,6 +28,10 @@ class PDFParser(abstract.AbstractParser):
super().__init__(filename) super().__init__(filename)
self.uri = 'file://' + os.path.abspath(self.filename) self.uri = 'file://' + os.path.abspath(self.filename)
self.__scale = 2 # how much precision do we want for the render self.__scale = 2 # how much precision do we want for the render
try: # Check now that the file is valid, to avoid surprises later
Poppler.Document.new_from_file(self.uri, None)
except GLib.GError: # Invalid PDF
raise ValueError
def remove_all_lightweight(self): def remove_all_lightweight(self):
""" """
@ -116,8 +120,9 @@ class PDFParser(abstract.AbstractParser):
def get_meta(self): def get_meta(self):
""" Return a dict with all the meta of the file """ Return a dict with all the meta of the file
""" """
document = Poppler.Document.new_from_file(self.uri, None)
metadata = {} metadata = {}
document = Poppler.Document.new_from_file(self.uri, None)
for key in self.meta_list: for key in self.meta_list:
if document.get_property(key): if document.get_property(key):
metadata[key] = document.get_property(key) metadata[key] = document.get_property(key)

View File

@ -16,6 +16,18 @@ class TestParserFactory(unittest.TestCase):
self.assertEqual(mimetype, 'audio/mpeg') self.assertEqual(mimetype, 'audio/mpeg')
self.assertEqual(parser.__class__, audio.MP3Parser) self.assertEqual(parser.__class__, audio.MP3Parser)
class TestCorruptedFiles(unittest.TestCase):
    """Check that parsers raise ValueError for files of the wrong type."""

    def test_pdf(self):
        """Handing a PNG file to PDFParser must raise ValueError."""
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
        # Registered before the assertion so the copy is removed even
        # when the assertion fails.
        self.addCleanup(os.remove, './tests/data/clean.png')
        with self.assertRaises(ValueError):
            pdf.PDFParser('./tests/data/clean.png')

    def test_png(self):
        """Handing a PDF file to PNGParser must raise ValueError."""
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
        # Registered before the assertion so the copy is removed even
        # when the assertion fails.
        self.addCleanup(os.remove, './tests/data/clean.pdf')
        with self.assertRaises(ValueError):
            images.PNGParser('./tests/data/clean.pdf')
class TestGetMeta(unittest.TestCase): class TestGetMeta(unittest.TestCase):
def test_pdf(self): def test_pdf(self):