Implement lightweight cleaning for png and tiff
This commit is contained in:
parent
38df679a88
commit
f1a071d460
13
.pylintrc
13
.pylintrc
@ -6,11 +6,12 @@ max-locals=20
|
|||||||
disable=
|
disable=
|
||||||
fixme,
|
fixme,
|
||||||
invalid-name,
|
invalid-name,
|
||||||
|
duplicate-code,
|
||||||
missing-docstring,
|
missing-docstring,
|
||||||
protected-access,
|
protected-access,
|
||||||
abstract-method,
|
abstract-method,
|
||||||
wrong-import-position,
|
wrong-import-position,
|
||||||
catching-non-exception,
|
catching-non-exception,
|
||||||
cell-var-from-loop,
|
cell-var-from-loop,
|
||||||
locally-disabled,
|
locally-disabled,
|
||||||
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
|
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
from typing import Dict, Union, Set
|
from typing import Dict, Union, Set
|
||||||
@ -23,6 +24,34 @@ class ExiftoolParser(abstract.AbstractParser):
|
|||||||
meta.pop(key, None)
|
meta.pop(key, None)
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
|
def _lightweight_cleanup(self) -> bool:
    """Strip metadata with exiftool, writing the result to the output file.

    Any file already present at ``self.output_filename`` is removed first,
    then exiftool is run on ``self.filename`` with tag-deletion options and
    ``-o self.output_filename``.

    Returns:
        True when exiftool exits successfully; False when the pre-existing
        output file cannot be removed or when exiftool exits with a
        non-zero status.
    """
    try:
        # exiftool can't force output to existing files
        os.remove(self.output_filename)
    except FileNotFoundError:
        # Nothing to clear out: the output path is free.
        pass
    except OSError as e:  # pragma: no cover
        # Report the output path (the file that is actually in the way),
        # and build the message by literal concatenation so that no source
        # indentation leaks into the log line.
        logging.error("The output file %s is already existing and "
                      "can't be overwritten: %s.", self.output_filename, e)
        return False

    # Note: '-All=' must be followed by a known exiftool option.
    # Also, '-CommonIFD0' is needed for .tiff files
    cmd = [_get_exiftool_path(),
           '-all=',         # remove metadata
           '-adobe=',       # remove adobe-specific metadata
           '-exif:all=',    # remove all exif metadata
           '-Time:All=',    # remove all timestamps
           '-quiet',        # don't show useless logs
           '-CommonIFD0=',  # remove IFD0 metadata
           '-o', self.output_filename,
           self.filename]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:  # pragma: no cover
        logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
        return False
    return True
|
||||||
|
|
||||||
def _get_exiftool_path() -> str: # pragma: no cover
|
def _get_exiftool_path() -> str: # pragma: no cover
|
||||||
exiftool_path = '/usr/bin/exiftool'
|
exiftool_path = '/usr/bin/exiftool'
|
||||||
if os.path.isfile(exiftool_path):
|
if os.path.isfile(exiftool_path):
|
||||||
|
@ -35,6 +35,8 @@ class PNGParser(exiftool.ExiftoolParser):
|
|||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
def remove_all(self) -> bool:
    """Write a cleaned copy of the PNG to ``self.output_filename``.

    When ``self.lightweight_cleaning`` is set, the work is delegated to
    ``self._lightweight_cleanup()`` and its result is returned. Otherwise
    the image is loaded into a cairo surface and written back out as a PNG,
    and True is returned.
    """
    if not self.lightweight_cleaning:
        # Full cleaning: round-trip the picture through a cairo surface.
        image = cairo.ImageSurface.create_from_png(self.filename)
        image.write_to_png(self.output_filename)
        return True
    return self._lightweight_cleanup()
|
||||||
|
@ -26,7 +26,7 @@ class AVIParser(exiftool.ExiftoolParser):
|
|||||||
|
|
||||||
def remove_all(self):
|
def remove_all(self):
|
||||||
cmd = [_get_ffmpeg_path(),
|
cmd = [_get_ffmpeg_path(),
|
||||||
'-i', self.filename, # input file
|
'-i', self.filename, # input file
|
||||||
'-y', # overwrite existing output file
|
'-y', # overwrite existing output file
|
||||||
'-loglevel', 'panic', # Don't show log
|
'-loglevel', 'panic', # Don't show log
|
||||||
'-hide_banner', # hide the banner
|
'-hide_banner', # hide the banner
|
||||||
|
@ -194,6 +194,13 @@ class TestCorruptedFiles(unittest.TestCase):
|
|||||||
images.JPGParser('./tests/data/clean.jpg')
|
images.JPGParser('./tests/data/clean.jpg')
|
||||||
os.remove('./tests/data/clean.jpg')
|
os.remove('./tests/data/clean.jpg')
|
||||||
|
|
||||||
|
def test_png_lightweight(self):
    """Feed a non-PNG file (a torrent renamed to .png) to ``PNGParser``.

    The scenario is currently disabled. It used to start with a bare
    ``return``, which made the body unreachable while the test was still
    reported as passing; it is now reported as skipped instead.
    """
    # NOTE(review): the reason this test was disabled is not visible here;
    # confirm the expected behaviour before re-enabling it.
    self.skipTest("test_png_lightweight is disabled pending review")
    shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.png')
    p = images.PNGParser('./tests/data/clean.png')
    self.assertTrue(p.remove_all())
    os.remove('./tests/data/clean.png')
|
||||||
|
|
||||||
def test_avi(self):
|
def test_avi(self):
|
||||||
try:
|
try:
|
||||||
video._get_ffmpeg_path()
|
video._get_ffmpeg_path()
|
||||||
|
@ -212,42 +212,6 @@ class TestRevisionsCleaning(unittest.TestCase):
|
|||||||
os.remove('./tests/data/revision_clean.docx')
|
os.remove('./tests/data/revision_clean.docx')
|
||||||
os.remove('./tests/data/revision_clean.cleaned.docx')
|
os.remove('./tests/data/revision_clean.cleaned.docx')
|
||||||
|
|
||||||
class TestLightWeightCleaning(unittest.TestCase):
    """Checks for the lightweight cleaning mode on PDF and PNG samples."""

    def test_pdf(self):
        """Clean a PDF in lightweight mode and compare the resulting metadata."""
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
        parser = pdf.PDFParser('./tests/data/clean.pdf')

        # The dirty sample must expose its producer before cleaning.
        self.assertEqual(parser.get_meta()['producer'], 'pdfTeX-1.40.14')

        parser.lightweight_cleaning = True
        self.assertTrue(parser.remove_all())

        # Re-open the cleaned output and compare what is left.
        cleaned = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
        remaining = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(cleaned.get_meta(), remaining)

        for leftover in ('./tests/data/clean.pdf', './tests/data/clean.cleaned.pdf'):
            os.remove(leftover)

    def test_png(self):
        """Clean a PNG in lightweight mode and expect empty metadata afterwards."""
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
        parser = images.PNGParser('./tests/data/clean.png')

        # The dirty sample carries a comment before cleaning.
        self.assertEqual(parser.get_meta()['Comment'], 'This is a comment, be careful!')

        parser.lightweight_cleaning = True
        self.assertTrue(parser.remove_all())

        cleaned = images.PNGParser('./tests/data/clean.cleaned.png')
        self.assertEqual(cleaned.get_meta(), {})

        for leftover in ('./tests/data/clean.png', './tests/data/clean.cleaned.png'):
            os.remove(leftover)
|
|
||||||
|
|
||||||
class TestCleaning(unittest.TestCase):
|
class TestCleaning(unittest.TestCase):
|
||||||
def test_pdf(self):
|
def test_pdf(self):
|
||||||
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
|
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
|
||||||
|
65
tests/test_lightweigh_cleaning.py
Normal file
65
tests/test_lightweigh_cleaning.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import shutil
|
||||||
|
import os
|
||||||
|
|
||||||
|
from libmat2 import pdf, images
|
||||||
|
|
||||||
|
class TestLightWeightCleaning(unittest.TestCase):
    """Lightweight cleaning checks for the PDF, PNG and JPG parsers."""

    def test_pdf(self):
        """Lightweight-clean a PDF and verify the metadata left in the output."""
        dirty_copy = './tests/data/clean.pdf'
        shutil.copy('./tests/data/dirty.pdf', dirty_copy)
        parser = pdf.PDFParser(dirty_copy)

        before = parser.get_meta()
        self.assertEqual(before['producer'], 'pdfTeX-1.40.14')

        parser.lightweight_cleaning = True
        self.assertTrue(parser.remove_all())

        parser = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
        self.assertEqual(
            parser.get_meta(),
            {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1},
        )

        os.remove(dirty_copy)
        os.remove('./tests/data/clean.cleaned.pdf')

    def test_png(self):
        """Lightweight-clean a PNG, then clean it again over the existing output."""
        dirty_copy = './tests/data/clean.png'
        shutil.copy('./tests/data/dirty.png', dirty_copy)
        parser = images.PNGParser(dirty_copy)

        before = parser.get_meta()
        self.assertEqual(before['Comment'], 'This is a comment, be careful!')

        parser.lightweight_cleaning = True
        self.assertTrue(parser.remove_all())

        parser = images.PNGParser('./tests/data/clean.cleaned.png')
        self.assertEqual(parser.get_meta(), {})

        # Second pass: the cleaned file from the first pass is still on disk.
        parser = images.PNGParser(dirty_copy)
        parser.lightweight_cleaning = True
        self.assertTrue(parser.remove_all())

        os.remove(dirty_copy)
        os.remove('./tests/data/clean.cleaned.png')

    def test_jpg(self):
        """Lightweight-clean a JPG and expect no metadata in the output."""
        dirty_copy = './tests/data/clean.jpg'
        shutil.copy('./tests/data/dirty.jpg', dirty_copy)
        parser = images.JPGParser(dirty_copy)

        before = parser.get_meta()
        self.assertEqual(before['Comment'], 'Created with GIMP')

        parser.lightweight_cleaning = True
        self.assertTrue(parser.remove_all())

        parser = images.JPGParser('./tests/data/clean.cleaned.jpg')
        self.assertEqual(parser.get_meta(), {})

        os.remove(dirty_copy)
        os.remove('./tests/data/clean.cleaned.jpg')
|
Loading…
Reference in New Issue
Block a user