1
0
Fork 0

Implement lightweight cleaning for png and tiff

This commit is contained in:
jvoisin 2018-10-23 16:14:21 +02:00
parent 38df679a88
commit f1a071d460
7 changed files with 111 additions and 43 deletions

View File

@ -6,11 +6,12 @@ max-locals=20
disable=
fixme,
invalid-name,
duplicate-code,
missing-docstring,
protected-access,
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation

View File

@ -1,4 +1,5 @@
import json
import logging
import os
import subprocess
from typing import Dict, Union, Set
@ -23,6 +24,34 @@ class ExiftoolParser(abstract.AbstractParser):
meta.pop(key, None)
return meta
def _lightweight_cleanup(self):
if os.path.exists(self.output_filename):
try:
# exiftool can't force output to existing files
os.remove(self.output_filename)
except OSError as e: # pragma: no cover
logging.error("The output file %s is already existing and \
can't be overwritten: %s.", self.filename, e)
return False
# Note: '-All=' must be followed by a known exiftool option.
# Also, '-CommonIFD0' is needed for .tiff files
cmd = [_get_exiftool_path(),
'-all=', # remove metadata
'-adobe=', # remove adobe-specific metadata
'-exif:all=', # remove all exif metadata
'-Time:All=', # remove all timestamps
'-quiet', # don't show useless logs
'-CommonIFD0=', # remove IFD0 metadata
'-o', self.output_filename,
self.filename]
try:
subprocess.check_call(cmd)
except subprocess.CalledProcessError as e: # pragma: no cover
logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
return False
return True
def _get_exiftool_path() -> str: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):

View File

@ -35,6 +35,8 @@ class PNGParser(exiftool.ExiftoolParser):
raise ValueError
def remove_all(self) -> bool:
if self.lightweight_cleaning:
return self._lightweight_cleanup()
surface = cairo.ImageSurface.create_from_png(self.filename)
surface.write_to_png(self.output_filename)
return True

View File

@ -26,7 +26,7 @@ class AVIParser(exiftool.ExiftoolParser):
def remove_all(self):
cmd = [_get_ffmpeg_path(),
'-i', self.filename, # input file
'-i', self.filename, # input file
'-y', # overwrite existing output file
'-loglevel', 'panic', # Don't show log
'-hide_banner', # hide the banner

View File

@ -194,6 +194,13 @@ class TestCorruptedFiles(unittest.TestCase):
images.JPGParser('./tests/data/clean.jpg')
os.remove('./tests/data/clean.jpg')
def test_png_lightweight(self):
return
shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.png')
p = images.PNGParser('./tests/data/clean.png')
self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.png')
def test_avi(self):
try:
video._get_ffmpeg_path()

View File

@ -212,42 +212,6 @@ class TestRevisionsCleaning(unittest.TestCase):
os.remove('./tests/data/revision_clean.docx')
os.remove('./tests/data/revision_clean.cleaned.docx')
class TestLightWeightCleaning(unittest.TestCase):
def test_pdf(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
p = pdf.PDFParser('./tests/data/clean.pdf')
meta = p.get_meta()
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
self.assertEqual(p.get_meta(), expected_meta)
os.remove('./tests/data/clean.pdf')
os.remove('./tests/data/clean.cleaned.pdf')
def test_png(self):
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
p = images.PNGParser('./tests/data/clean.png')
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = images.PNGParser('./tests/data/clean.cleaned.png')
self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.png')
os.remove('./tests/data/clean.cleaned.png')
class TestCleaning(unittest.TestCase):
def test_pdf(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')

View File

@ -0,0 +1,65 @@
#!/usr/bin/env python3
import unittest
import shutil
import os
from libmat2 import pdf, images
class TestLightWeightCleaning(unittest.TestCase):
def test_pdf(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
p = pdf.PDFParser('./tests/data/clean.pdf')
meta = p.get_meta()
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
self.assertEqual(p.get_meta(), expected_meta)
os.remove('./tests/data/clean.pdf')
os.remove('./tests/data/clean.cleaned.pdf')
def test_png(self):
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
p = images.PNGParser('./tests/data/clean.png')
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = images.PNGParser('./tests/data/clean.cleaned.png')
self.assertEqual(p.get_meta(), {})
p = images.PNGParser('./tests/data/clean.png')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
os.remove('./tests/data/clean.png')
os.remove('./tests/data/clean.cleaned.png')
def test_jpg(self):
shutil.copy('./tests/data/dirty.jpg', './tests/data/clean.jpg')
p = images.JPGParser('./tests/data/clean.jpg')
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'Created with GIMP')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = images.JPGParser('./tests/data/clean.cleaned.jpg')
self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.jpg')
os.remove('./tests/data/clean.cleaned.jpg')