Implement lightweight cleaning for png and tiff
This commit is contained in:
parent
38df679a88
commit
f1a071d460
13
.pylintrc
13
.pylintrc
@ -6,11 +6,12 @@ max-locals=20
|
||||
disable=
|
||||
fixme,
|
||||
invalid-name,
|
||||
duplicate-code,
|
||||
missing-docstring,
|
||||
protected-access,
|
||||
abstract-method,
|
||||
wrong-import-position,
|
||||
catching-non-exception,
|
||||
cell-var-from-loop,
|
||||
locally-disabled,
|
||||
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
|
||||
abstract-method,
|
||||
wrong-import-position,
|
||||
catching-non-exception,
|
||||
cell-var-from-loop,
|
||||
locally-disabled,
|
||||
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
|
||||
|
@ -1,4 +1,5 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from typing import Dict, Union, Set
|
||||
@ -23,6 +24,34 @@ class ExiftoolParser(abstract.AbstractParser):
|
||||
meta.pop(key, None)
|
||||
return meta
|
||||
|
||||
def _lightweight_cleanup(self) -> bool:
    """Strip metadata with exiftool, writing the result to the output file.

    exiftool is asked to copy `self.filename` to `self.output_filename`
    while blanking the tag groups listed in the command below.

    Returns:
        True when exiftool exits successfully; False when a pre-existing
        output file cannot be removed or when exiftool exits with a
        non-zero status (both cases are logged).
    """
    try:
        # exiftool can't force output to existing files
        os.remove(self.output_filename)
    except FileNotFoundError:
        # Nothing to clear out of the way: the usual case.
        pass
    except OSError as e:  # pragma: no cover
        # Report the path that actually blocked us (the output file), and
        # build the message from adjacent literals so that no source
        # indentation leaks into the log line.
        logging.error("The output file %s is already existing and "
                      "can't be overwritten: %s.", self.output_filename, e)
        return False

    # Note: '-All=' must be followed by a known exiftool option.
    # Also, '-CommonIFD0' is needed for .tiff files
    cmd = [_get_exiftool_path(),
           '-all=',         # remove metadata
           '-adobe=',       # remove adobe-specific metadata
           '-exif:all=',    # remove all exif metadata
           '-Time:All=',    # remove all timestamps
           '-quiet',        # don't show useless logs
           '-CommonIFD0=',  # remove IFD0 metadata
           '-o', self.output_filename,
           self.filename]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:  # pragma: no cover
        logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
        return False
    return True
|
||||
|
||||
def _get_exiftool_path() -> str: # pragma: no cover
|
||||
exiftool_path = '/usr/bin/exiftool'
|
||||
if os.path.isfile(exiftool_path):
|
||||
|
@ -35,6 +35,8 @@ class PNGParser(exiftool.ExiftoolParser):
|
||||
raise ValueError
|
||||
|
||||
def remove_all(self) -> bool:
    """Write a cleaned copy of the PNG to `self.output_filename`.

    When `self.lightweight_cleaning` is truthy the work is delegated to
    `self._lightweight_cleanup()` and its result is returned as-is.
    Otherwise the image is loaded into a cairo surface and written back
    out as a new PNG, and True is returned.
    """
    if not self.lightweight_cleaning:
        # Full cleaning: round-trip the image through cairo
        # (presumably only pixel data survives the re-encode -- verify).
        image = cairo.ImageSurface.create_from_png(self.filename)
        image.write_to_png(self.output_filename)
        return True
    return self._lightweight_cleanup()
|
||||
|
@ -26,7 +26,7 @@ class AVIParser(exiftool.ExiftoolParser):
|
||||
|
||||
def remove_all(self):
|
||||
cmd = [_get_ffmpeg_path(),
|
||||
'-i', self.filename, # input file
|
||||
'-i', self.filename, # input file
|
||||
'-y', # overwrite existing output file
|
||||
'-loglevel', 'panic', # Don't show log
|
||||
'-hide_banner', # hide the banner
|
||||
|
@ -194,6 +194,13 @@ class TestCorruptedFiles(unittest.TestCase):
|
||||
images.JPGParser('./tests/data/clean.jpg')
|
||||
os.remove('./tests/data/clean.jpg')
|
||||
|
||||
@unittest.skip("disabled by its author (the body used to sit behind a bare `return`)")
def test_png_lightweight(self):
    """Feed a non-PNG payload with a .png name to PNGParser and clean it.

    The test was switched off with an early `return`, which made it count
    as a pass while running nothing. It is skipped explicitly instead so
    the disabled state is visible in test reports.
    NOTE(review): despite the name, `lightweight_cleaning` is never enabled
    here -- confirm the intended scenario before re-enabling.
    """
    shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.png')
    p = images.PNGParser('./tests/data/clean.png')
    self.assertTrue(p.remove_all())
    os.remove('./tests/data/clean.png')
|
||||
|
||||
def test_avi(self):
|
||||
try:
|
||||
video._get_ffmpeg_path()
|
||||
|
@ -212,42 +212,6 @@ class TestRevisionsCleaning(unittest.TestCase):
|
||||
os.remove('./tests/data/revision_clean.docx')
|
||||
os.remove('./tests/data/revision_clean.cleaned.docx')
|
||||
|
||||
class TestLightWeightCleaning(unittest.TestCase):
    """Check the lightweight cleaning mode of the PDF and PNG parsers."""

    @staticmethod
    def _remove_quietly(path):
        """Delete `path`, tolerating the case where it was never created."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _stage(self, source, working_copy, cleaned_copy):
        """Copy a fixture and schedule removal of every file the test makes.

        Cleanup is registered through `addCleanup` so the files are removed
        even when an assertion fails part-way through the test.
        """
        self.addCleanup(self._remove_quietly, cleaned_copy)
        self.addCleanup(self._remove_quietly, working_copy)
        shutil.copy(source, working_copy)

    def test_pdf(self):
        self._stage('./tests/data/dirty.pdf',
                    './tests/data/clean.pdf',
                    './tests/data/clean.cleaned.pdf')
        p = pdf.PDFParser('./tests/data/clean.pdf')

        # The fixture must carry metadata before cleaning.
        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        # Only these fields are expected to remain after cleaning.
        p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(p.get_meta(), expected_meta)

    def test_png(self):
        self._stage('./tests/data/dirty.png',
                    './tests/data/clean.png',
                    './tests/data/clean.cleaned.png')
        p = images.PNGParser('./tests/data/clean.png')

        # The fixture must carry metadata before cleaning.
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = images.PNGParser('./tests/data/clean.cleaned.png')
        self.assertEqual(p.get_meta(), {})
|
||||
|
||||
class TestCleaning(unittest.TestCase):
|
||||
def test_pdf(self):
|
||||
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
|
||||
|
65
tests/test_lightweigh_cleaning.py
Normal file
65
tests/test_lightweigh_cleaning.py
Normal file
@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import os
|
||||
|
||||
from libmat2 import pdf, images
|
||||
|
||||
class TestLightWeightCleaning(unittest.TestCase):
    """Check the lightweight cleaning mode of the PDF, PNG and JPG parsers."""

    @staticmethod
    def _remove_quietly(path):
        """Delete `path`, tolerating the case where it was never created."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _stage(self, source, working_copy, cleaned_copy):
        """Copy a fixture and schedule removal of every file the test makes.

        Cleanup is registered through `addCleanup` so the files are removed
        even when an assertion fails part-way through the test.
        """
        self.addCleanup(self._remove_quietly, cleaned_copy)
        self.addCleanup(self._remove_quietly, working_copy)
        shutil.copy(source, working_copy)

    def test_pdf(self):
        self._stage('./tests/data/dirty.pdf',
                    './tests/data/clean.pdf',
                    './tests/data/clean.cleaned.pdf')
        p = pdf.PDFParser('./tests/data/clean.pdf')

        # The fixture must carry metadata before cleaning.
        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        # Only these fields are expected to remain after cleaning.
        p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(p.get_meta(), expected_meta)

    def test_png(self):
        self._stage('./tests/data/dirty.png',
                    './tests/data/clean.png',
                    './tests/data/clean.cleaned.png')
        p = images.PNGParser('./tests/data/clean.png')

        # The fixture must carry metadata before cleaning.
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = images.PNGParser('./tests/data/clean.cleaned.png')
        self.assertEqual(p.get_meta(), {})

        # Clean the same input again while the cleaned file from the first
        # pass is still on disk, so an existing output is also handled.
        p = images.PNGParser('./tests/data/clean.png')
        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

    def test_jpg(self):
        self._stage('./tests/data/dirty.jpg',
                    './tests/data/clean.jpg',
                    './tests/data/clean.cleaned.jpg')
        p = images.JPGParser('./tests/data/clean.jpg')

        # The fixture must carry metadata before cleaning.
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'Created with GIMP')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = images.JPGParser('./tests/data/clean.cleaned.jpg')
        self.assertEqual(p.get_meta(), {})
|
Loading…
Reference in New Issue
Block a user