diff --git a/doc/implementation_notes.md b/doc/implementation_notes.md index 60e9081..59e7d94 100644 --- a/doc/implementation_notes.md +++ b/doc/implementation_notes.md @@ -9,6 +9,14 @@ that only cleans the superficial metadata of your file, but not the ones that might be in **embeded** resources. Like for example, images in a PDF or an office document. +Race conditions +--------------- + +MAT2 does its very best to avoid crashing at runtime. This is why it's checking +if the file is valid __at parser creation__. MAT2 doesn't take any measure to +ensure that the file is not changed between the time the parser is +instantiated, and the call to clean or show the metadata. + Symlink attacks --------------- diff --git a/src/images.py b/src/images.py index 7c1abaa..6cc3dfe 100644 --- a/src/images.py +++ b/src/images.py @@ -20,6 +20,13 @@ class PNGParser(abstract.AbstractParser): 'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize', 'Megapixels', 'ImageHeight'} + def __init__(self, filename): + super().__init__(filename) + try: # better fail here than later + cairo.ImageSurface.create_from_png(self.filename) + except MemoryError: + raise ValueError + def get_meta(self): out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename]) meta = json.loads(out.decode('utf-8'))[0] diff --git a/src/parser_factory.py b/src/parser_factory.py index 68e9e9c..80aedae 100644 --- a/src/parser_factory.py +++ b/src/parser_factory.py @@ -30,5 +30,8 @@ def get_parser(filename: str) -> (T, str): for c in _get_parsers(): if mtype in c.mimetypes: - return c(filename), mtype + try: + return c(filename), mtype + except ValueError: + return None, mtype return None, mtype diff --git a/src/pdf.py b/src/pdf.py index 6e639cd..3ba3d4a 100644 --- a/src/pdf.py +++ b/src/pdf.py @@ -11,7 +11,7 @@ import io import cairo import gi gi.require_version('Poppler', '0.18') -from gi.repository import Poppler +from gi.repository import Poppler, GLib from . import abstract @@ -28,6 +28,10 @@ class PDFParser(abstract.AbstractParser): super().__init__(filename) self.uri = 'file://' + os.path.abspath(self.filename) self.__scale = 2 # how much precision do we want for the render + try: # Check now that the file is valid, to avoid surprises later + Poppler.Document.new_from_file(self.uri, None) + except GLib.GError: # Invalid PDF + raise ValueError def remove_all_lightweight(self): """ @@ -116,8 +120,9 @@ class PDFParser(abstract.AbstractParser): def get_meta(self): """ Return a dict with all the meta of the file """ - document = Poppler.Document.new_from_file(self.uri, None) metadata = {} + document = Poppler.Document.new_from_file(self.uri, None) + for key in self.meta_list: if document.get_property(key): metadata[key] = document.get_property(key) diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 1950444..17afaf4 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -16,6 +16,18 @@ class TestParserFactory(unittest.TestCase): self.assertEqual(mimetype, 'audio/mpeg') self.assertEqual(parser.__class__, audio.MP3Parser) +class TestCorruptedFiles(unittest.TestCase): + def test_pdf(self): + shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') + with self.assertRaises(ValueError): + pdf.PDFParser('./tests/data/clean.png') + os.remove('./tests/data/clean.png') + + def test_png(self): + shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') + with self.assertRaises(ValueError): + images.PNGParser('./tests/data/clean.pdf') + os.remove('./tests/data/clean.pdf') class TestGetMeta(unittest.TestCase): def test_pdf(self):