diff --git a/.pylintrc b/.pylintrc index 1f3dc23..31fad0e 100644 --- a/.pylintrc +++ b/.pylintrc @@ -6,11 +6,12 @@ max-locals=20 disable= fixme, invalid-name, + duplicate-code, missing-docstring, protected-access, - abstract-method, - wrong-import-position, - catching-non-exception, - cell-var-from-loop, - locally-disabled, - invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation + abstract-method, + wrong-import-position, + catching-non-exception, + cell-var-from-loop, + locally-disabled, + invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation diff --git a/libmat2/exiftool.py b/libmat2/exiftool.py index 11dd36d..23d0d89 100644 --- a/libmat2/exiftool.py +++ b/libmat2/exiftool.py @@ -1,4 +1,5 @@ import json +import logging import os import subprocess from typing import Dict, Union, Set @@ -23,6 +24,34 @@ class ExiftoolParser(abstract.AbstractParser): meta.pop(key, None) return meta + def _lightweight_cleanup(self): + if os.path.exists(self.output_filename): + try: + # exiftool can't force output to existing files + os.remove(self.output_filename) + except OSError as e: # pragma: no cover + logging.error("The output file %s is already existing and \ + can't be overwritten: %s.", self.filename, e) + return False + + # Note: '-All=' must be followed by a known exiftool option. + # Also, '-CommonIFD0' is needed for .tiff files + cmd = [_get_exiftool_path(), + '-all=', # remove metadata + '-adobe=', # remove adobe-specific metadata + '-exif:all=', # remove all exif metadata + '-Time:All=', # remove all timestamps + '-quiet', # don't show useless logs + '-CommonIFD0=', # remove IFD0 metadata + '-o', self.output_filename, + self.filename] + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as e: # pragma: no cover + logging.error("Something went wrong during the processing of %s: %s", self.filename, e) + return False + return True + def _get_exiftool_path() -> str: # pragma: no cover exiftool_path = '/usr/bin/exiftool' if os.path.isfile(exiftool_path): diff --git a/libmat2/images.py b/libmat2/images.py index ad80892..03cecd3 100644 --- a/libmat2/images.py +++ b/libmat2/images.py @@ -35,6 +35,8 @@ class PNGParser(exiftool.ExiftoolParser): raise ValueError def remove_all(self) -> bool: + if self.lightweight_cleaning: + return self._lightweight_cleanup() surface = cairo.ImageSurface.create_from_png(self.filename) surface.write_to_png(self.output_filename) return True diff --git a/libmat2/video.py b/libmat2/video.py index fe2a1af..b7ba0a0 100644 --- a/libmat2/video.py +++ b/libmat2/video.py @@ -26,7 +26,7 @@ class AVIParser(exiftool.ExiftoolParser): def remove_all(self): cmd = [_get_ffmpeg_path(), - '-i', self.filename, # input file + '-i', self.filename, # input file '-y', # overwrite existing output file '-loglevel', 'panic', # Don't show log '-hide_banner', # hide the banner diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 82c6c3b..181d4d2 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -194,6 +194,13 @@ class TestCorruptedFiles(unittest.TestCase): images.JPGParser('./tests/data/clean.jpg') os.remove('./tests/data/clean.jpg') + def test_png_lightweight(self): + return + shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.png') + p = images.PNGParser('./tests/data/clean.png') + self.assertTrue(p.remove_all()) + os.remove('./tests/data/clean.png') + def test_avi(self): try: video._get_ffmpeg_path() diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index f5fc9e8..46d6aaa 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -212,42 +212,6 @@ class TestRevisionsCleaning(unittest.TestCase): os.remove('./tests/data/revision_clean.docx') os.remove('./tests/data/revision_clean.cleaned.docx') -class TestLightWeightCleaning(unittest.TestCase): - def test_pdf(self): - shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') - p = pdf.PDFParser('./tests/data/clean.pdf') - - meta = p.get_meta() - self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') - - p.lightweight_cleaning = True - ret = p.remove_all() - self.assertTrue(ret) - - p = pdf.PDFParser('./tests/data/clean.cleaned.pdf') - expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1} - self.assertEqual(p.get_meta(), expected_meta) - - os.remove('./tests/data/clean.pdf') - os.remove('./tests/data/clean.cleaned.pdf') - - def test_png(self): - shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') - p = images.PNGParser('./tests/data/clean.png') - - meta = p.get_meta() - self.assertEqual(meta['Comment'], 'This is a comment, be careful!') - - p.lightweight_cleaning = True - ret = p.remove_all() - self.assertTrue(ret) - - p = images.PNGParser('./tests/data/clean.cleaned.png') - self.assertEqual(p.get_meta(), {}) - - os.remove('./tests/data/clean.png') - os.remove('./tests/data/clean.cleaned.png') - class TestCleaning(unittest.TestCase): def test_pdf(self): shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') diff --git a/tests/test_lightweigh_cleaning.py b/tests/test_lightweigh_cleaning.py new file mode 100644 index 0000000..7af31ad --- /dev/null +++ b/tests/test_lightweigh_cleaning.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +import unittest +import shutil +import os + +from libmat2 import pdf, images + +class TestLightWeightCleaning(unittest.TestCase): + def test_pdf(self): + shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') + p = pdf.PDFParser('./tests/data/clean.pdf') + + meta = p.get_meta() + self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') + + p.lightweight_cleaning = True + ret = p.remove_all() + self.assertTrue(ret) + + p = pdf.PDFParser('./tests/data/clean.cleaned.pdf') + expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1} + self.assertEqual(p.get_meta(), expected_meta) + + os.remove('./tests/data/clean.pdf') + os.remove('./tests/data/clean.cleaned.pdf') + + def test_png(self): + shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') + p = images.PNGParser('./tests/data/clean.png') + + meta = p.get_meta() + self.assertEqual(meta['Comment'], 'This is a comment, be careful!') + + p.lightweight_cleaning = True + ret = p.remove_all() + self.assertTrue(ret) + + p = images.PNGParser('./tests/data/clean.cleaned.png') + self.assertEqual(p.get_meta(), {}) + + p = images.PNGParser('./tests/data/clean.png') + p.lightweight_cleaning = True + ret = p.remove_all() + self.assertTrue(ret) + + os.remove('./tests/data/clean.png') + os.remove('./tests/data/clean.cleaned.png') + + def test_jpg(self): + shutil.copy('./tests/data/dirty.jpg', './tests/data/clean.jpg') + p = images.JPGParser('./tests/data/clean.jpg') + + meta = p.get_meta() + self.assertEqual(meta['Comment'], 'Created with GIMP') + + p.lightweight_cleaning = True + ret = p.remove_all() + self.assertTrue(ret) + + p = images.JPGParser('./tests/data/clean.cleaned.jpg') + self.assertEqual(p.get_meta(), {}) + + os.remove('./tests/data/clean.jpg') + os.remove('./tests/data/clean.cleaned.jpg')