2018-09-11 15:54:53 +02:00
|
|
|
#!/usr/bin/env python3
|
2018-03-13 01:01:07 +01:00
|
|
|
|
|
|
|
import unittest
|
|
|
|
import shutil
|
|
|
|
import os
|
2019-02-27 23:04:38 +01:00
|
|
|
import re
|
2019-04-27 13:05:36 +02:00
|
|
|
import tarfile
|
|
|
|
import tempfile
|
2018-03-31 23:09:54 +02:00
|
|
|
import zipfile
|
2018-03-13 01:01:07 +01:00
|
|
|
|
2018-07-06 00:42:09 +02:00
|
|
|
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
|
2019-02-21 01:28:11 +01:00
|
|
|
from libmat2 import check_dependencies, video, archive, web, epub
|
2018-07-10 20:49:54 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestCheckDependencies(unittest.TestCase):
    """Verify that every dependency flagged as required is reported as found."""

    def test_deps(self):
        # Each value returned by check_dependencies() is indexed with the
        # 'required' and 'found' keys below.
        for name, status in check_dependencies().items():
            if not status['required']:
                continue
            self.assertTrue(status['found'], "The value for %s is False" % name)
|
2018-03-13 01:01:07 +01:00
|
|
|
|
2018-04-04 23:21:48 +02:00
|
|
|
|
2018-04-03 23:29:34 +02:00
|
|
|
class TestParserFactory(unittest.TestCase):
    """Check the (parser, mimetype) pair returned by parser_factory.get_parser."""

    def test_subsubcalss(self):
        """ Test that our module auto-detection is handling sub-sub-classes """
        parser, mimetype = parser_factory.get_parser('./tests/data/dirty.mp3')
        self.assertEqual(mimetype, 'audio/mpeg')
        self.assertEqual(parser.__class__, audio.MP3Parser)

    def test_tarfile_double_extension_handling(self):
        """ Test that a `.tar.bz2` double extension yields the tar+bz2 mimetype """
        tarball = './tests/data/dirty.tar.bz2'
        with tarfile.TarFile.open(tarball, 'w:bz2') as zout:
            # The archive exists on disk from this point on: schedule its
            # removal so that a failing assertion cannot leave it behind.
            self.addCleanup(os.remove, tarball)
            zout.add('./tests/data/dirty.jpg')
        _, mimetype = parser_factory.get_parser(tarball)
        self.assertEqual(mimetype, 'application/x-tar+bz2')
|
|
|
|
|
2018-05-16 22:10:47 +02:00
|
|
|
|
2018-06-06 23:50:25 +02:00
|
|
|
class TestParameterInjection(unittest.TestCase):
    """Check that file names looking like command-line options are parsed as files."""

    def _skip_unless_ffmpeg(self):
        """Skip the running test when video._get_ffmpeg_path() raises RuntimeError."""
        try:
            video._get_ffmpeg_path()
        except RuntimeError:
            self.skipTest('video._get_ffmpeg_path() raised RuntimeError')

    def _copy_fixture(self, source, destination):
        """Copy `source` to `destination` and schedule the copy for removal.

        The removal is registered with addCleanup, so it also happens when a
        later assertion of the test fails.
        """
        shutil.copy(source, destination)
        self.addCleanup(os.remove, destination)

    def test_ver_injection(self):
        self._copy_fixture('./tests/data/dirty.png', './-ver')
        p = images.PNGParser('-ver')
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
        self.assertEqual(meta['ModifyDate'], "2018:03:20 21:59:25")

    def test_ffmpeg_injection(self):
        self._skip_unless_ffmpeg()

        self._copy_fixture('./tests/data/dirty.avi', './--output')
        p = video.AVIParser('--output')
        meta = p.get_meta()
        self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')

    def test_ffmpeg_injection_complete_path(self):
        self._skip_unless_ffmpeg()

        self._copy_fixture('./tests/data/dirty.avi', './tests/data/ --output.avi')
        p = video.AVIParser('./tests/data/ --output.avi')
        meta = p.get_meta()
        self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')
        self.assertTrue(p.remove_all())
        # Registered only once remove_all() has reported success, so a missing
        # output file is still reported as an error, as it was before.
        self.addCleanup(os.remove, './tests/data/ --output.cleaned.avi')
|
|
|
|
|
2018-06-06 23:50:25 +02:00
|
|
|
|
2018-06-10 20:19:35 +02:00
|
|
|
class TestUnsupportedEmbeddedFiles(unittest.TestCase):
    """Check that remove_all() returns a falsy value for the `embedded.*` samples."""

    def test_odt_with_py(self):
        working_copy = './tests/data/clean.odt'
        shutil.copy('./tests/data/embedded.odt', working_copy)
        # Remove the copy even when the assertion below fails.
        self.addCleanup(os.remove, working_copy)
        p = office.LibreOfficeParser(working_copy)
        self.assertFalse(p.remove_all())

    def test_docx_with_py(self):
        working_copy = './tests/data/clean.docx'
        shutil.copy('./tests/data/embedded.docx', working_copy)
        # Remove the copy even when the assertion below fails.
        self.addCleanup(os.remove, working_copy)
        p = office.MSOfficeParser(working_copy)
        self.assertFalse(p.remove_all())
|
|
|
|
|
2018-06-21 23:18:50 +02:00
|
|
|
|
2018-03-13 01:01:07 +01:00
|
|
|
class TestGetMeta(unittest.TestCase):
    """Check the metadata returned by get_meta() for the sample files in tests/data."""

    def test_pdf(self):
        p = pdf.PDFParser('./tests/data/dirty.pdf')
        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
        self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
        self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
        self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version "
                         "3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea "
                         "version 6.1.1")

    def test_torrent(self):
        p = torrent.TorrentParser('./tests/data/dirty.torrent')
        meta = p.get_meta()
        self.assertEqual(meta['created by'], b'mktorrent 1.0')

    def test_png(self):
        p = images.PNGParser('./tests/data/dirty.png')
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
        self.assertEqual(meta['ModifyDate'], "2018:03:20 21:59:25")

    def test_jpg(self):
        p = images.JPGParser('./tests/data/dirty.jpg')
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'Created with GIMP')

    def test_ppm(self):
        p = images.PPMParser('./tests/data/dirty.ppm')
        meta = p.get_meta()
        self.assertEqual(meta['1'], '# A metadata')
        self.assertEqual(meta['4'], '# And an other one')
        self.assertEqual(meta['6'], '# and a final one here')

    def test_tiff(self):
        p = images.TiffParser('./tests/data/dirty.tiff')
        meta = p.get_meta()
        self.assertEqual(meta['Make'], 'OLYMPUS IMAGING CORP.')
        self.assertEqual(meta['Model'], 'C7070WZ')
        self.assertEqual(meta['ModifyDate'], '2005:12:26 17:09:35')

    def test_mp3(self):
        p = audio.MP3Parser('./tests/data/dirty.mp3')
        meta = p.get_meta()
        self.assertEqual(meta['TXXX:I am a'], 'various comment')

    def test_ogg(self):
        p = audio.OGGParser('./tests/data/dirty.ogg')
        meta = p.get_meta()
        self.assertEqual(meta['title'], 'I am so')

    def test_flac(self):
        p = audio.FLACParser('./tests/data/dirty.flac')
        meta = p.get_meta()
        self.assertEqual(meta['title'], 'I am so')
        self.assertEqual(meta['Cover 0'], {'Comment': 'Created with GIMP'})

    def test_docx(self):
        p = office.MSOfficeParser('./tests/data/dirty.docx')
        meta = p.get_meta()
        self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin')
        self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin')
        self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')

    def test_libreoffice(self):
        p = office.LibreOfficeParser('./tests/data/dirty.odt')
        meta = p.get_meta()
        self.assertEqual(meta['meta.xml']['meta:initial-creator'], 'jvoisin ')
        self.assertEqual(meta['meta.xml']['meta:creation-date'], '2011-07-26T03:27:48')
        self.assertEqual(meta['meta.xml']['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202')

        p = office.LibreOfficeParser('./tests/data/weird_producer.odt')
        meta = p.get_meta()
        self.assertEqual(meta['mimetype']['create_system'], 'Windows')
        self.assertEqual(meta['mimetype']['comment'], b'YAY FOR COMMENTS')

    def test_txt(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt')
        self.assertEqual(mimetype, 'text/plain')
        meta = p.get_meta()
        self.assertEqual(meta, {})

    def test_zip(self):
        zip_path = './tests/data/dirty.zip'
        with zipfile.ZipFile(zip_path, 'w') as zout:
            # The archive exists on disk from this point on: schedule its
            # removal so that a failing assertion cannot leave it behind.
            self.addCleanup(os.remove, zip_path)
            zout.write('./tests/data/dirty.flac')
            zout.write('./tests/data/dirty.docx')
            zout.write('./tests/data/dirty.jpg')
        p, mimetype = parser_factory.get_parser(zip_path)
        self.assertEqual(mimetype, 'application/zip')
        meta = p.get_meta()
        self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    def test_wmv(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.wmv')
        self.assertEqual(mimetype, 'video/x-ms-wmv')
        meta = p.get_meta()
        self.assertEqual(meta['EncodingSettings'], 'Lavf52.103.0')

    def test_gif(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.gif')
        self.assertEqual(mimetype, 'image/gif')
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'this is a test comment')

    def test_epub(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
        self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')

    def test_css(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
        self.assertEqual(mimetype, 'text/css')
        meta = p.get_meta()
        self.assertEqual(meta['author'], 'jvoisin')
        self.assertEqual(meta['version'], '1.0')
        self.assertEqual(meta['harmful data'], 'underline is cool')

    def test_tar(self):
        tar_path = './tests/data/dirty.tar'
        with tarfile.TarFile(tar_path, 'w') as tout:
            # Same reasoning as in test_zip: clean up even on failure.
            self.addCleanup(os.remove, tar_path)
            tout.add('./tests/data/dirty.flac')
            tout.add('./tests/data/dirty.docx')
            tout.add('./tests/data/dirty.jpg')
        p, mimetype = parser_factory.get_parser(tar_path)
        self.assertEqual(mimetype, 'application/x-tar')
        meta = p.get_meta()
        self.assertEqual(meta['./tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
        self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
|
|
|
|
|
|
|
|
|
2018-07-01 17:29:05 +02:00
|
|
|
class TestRemovingThumbnails(unittest.TestCase):
    """Check that cleaning an ODT file drops its `Thumbnails/thumbnail.png` member."""

    def test_odt(self):
        working_copy = './tests/data/clean.odt'
        cleaned_copy = './tests/data/clean.cleaned.odt'
        thumbnail = 'Thumbnails/thumbnail.png'

        shutil.copy('./tests/data/revision.odt', working_copy)
        # Remove the copy even when an assertion below fails.
        self.addCleanup(os.remove, working_copy)

        # `with` closes the archive even if the assertion raises.
        with zipfile.ZipFile(os.path.abspath(working_copy)) as zipin:
            self.assertIn(thumbnail, zipin.namelist())

        p = office.LibreOfficeParser(working_copy)
        self.assertTrue(p.remove_all())
        # Registered only once remove_all() has reported success, so a missing
        # output file is still reported as an error, as it was before.
        self.addCleanup(os.remove, cleaned_copy)

        with zipfile.ZipFile(os.path.abspath(cleaned_copy)) as zipin:
            self.assertNotIn(thumbnail, zipin.namelist())
|
2018-07-01 17:29:05 +02:00
|
|
|
|
2018-06-27 23:10:53 +02:00
|
|
|
|
|
|
|
class TestRevisionsCleaning(unittest.TestCase):
    """Check that revision markers present in the samples are absent after cleaning."""

    def test_libreoffice(self):
        marker = b'tracked-changes'

        # ZipFile.read() returns the member's bytes without leaving an open
        # member handle behind, unlike the previous open()/read() pair.
        with zipfile.ZipFile('./tests/data/revision.odt') as zipin:
            self.assertIn(marker, zipin.read('content.xml'))

        shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
        # Remove the copy even when an assertion below fails.
        self.addCleanup(os.remove, './tests/data/clean.odt')
        p = office.LibreOfficeParser('./tests/data/clean.odt')
        self.assertTrue(p.remove_all())
        # Registered only once remove_all() has reported success, so a missing
        # output file is still reported as an error, as it was before.
        self.addCleanup(os.remove, './tests/data/clean.cleaned.odt')

        with zipfile.ZipFile('./tests/data/clean.cleaned.odt') as zipin:
            self.assertNotIn(marker, zipin.read('content.xml'))

    def test_msoffice(self):
        marker = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">'

        with zipfile.ZipFile('./tests/data/revision.docx') as zipin:
            self.assertIn(marker, zipin.read('word/document.xml'))

        shutil.copy('./tests/data/revision.docx', './tests/data/revision_clean.docx')
        # Remove the copy even when an assertion below fails.
        self.addCleanup(os.remove, './tests/data/revision_clean.docx')
        p = office.MSOfficeParser('./tests/data/revision_clean.docx')
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, './tests/data/revision_clean.cleaned.docx')

        with zipfile.ZipFile('./tests/data/revision_clean.cleaned.docx') as zipin:
            self.assertNotIn(marker, zipin.read('word/document.xml'))
|
|
|
|
|
2018-03-20 23:35:02 +01:00
|
|
|
class TestCleaning(unittest.TestCase):
|
2018-03-13 01:01:07 +01:00
|
|
|
def test_pdf(self):
    """Clean a copy of dirty.pdf twice and check the metadata after the first pass."""
    clean = './tests/data/clean.pdf'
    cleaned = './tests/data/clean.cleaned.pdf'
    recleaned = './tests/data/clean.cleaned.cleaned.pdf'

    shutil.copy('./tests/data/dirty.pdf', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = pdf.PDFParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = pdf.PDFParser(cleaned)
    expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
    self.assertEqual(p.get_meta(), expected_meta)
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-03-20 23:35:02 +01:00
|
|
|
|
|
|
|
def test_png(self):
    """Clean a copy of dirty.png twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.png'
    cleaned = './tests/data/clean.cleaned.png'
    recleaned = './tests/data/clean.cleaned.cleaned.png'

    shutil.copy('./tests/data/dirty.png', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = images.PNGParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = images.PNGParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-03-25 15:09:12 +02:00
|
|
|
|
|
|
|
def test_jpg(self):
    """Clean a copy of dirty.jpg twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.jpg'
    cleaned = './tests/data/clean.cleaned.jpg'
    recleaned = './tests/data/clean.cleaned.cleaned.jpg'

    shutil.copy('./tests/data/dirty.jpg', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = images.JPGParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['Comment'], 'Created with GIMP')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = images.JPGParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-03-25 16:17:41 +02:00
|
|
|
|
|
|
|
def test_mp3(self):
    """Clean a copy of dirty.mp3 twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.mp3'
    cleaned = './tests/data/clean.cleaned.mp3'
    recleaned = './tests/data/clean.cleaned.cleaned.mp3'

    shutil.copy('./tests/data/dirty.mp3', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = audio.MP3Parser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['TXXX:I am a'], 'various comment')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = audio.MP3Parser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-03-25 16:17:41 +02:00
|
|
|
|
|
|
|
def test_ogg(self):
    """Clean a copy of dirty.ogg twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.ogg'
    cleaned = './tests/data/clean.cleaned.ogg'
    recleaned = './tests/data/clean.cleaned.cleaned.ogg'

    shutil.copy('./tests/data/dirty.ogg', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = audio.OGGParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['title'], 'I am so')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = audio.OGGParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-03-25 16:20:45 +02:00
|
|
|
|
|
|
|
def test_flac(self):
    """Clean a copy of dirty.flac twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.flac'
    cleaned = './tests/data/clean.cleaned.flac'
    recleaned = './tests/data/clean.cleaned.cleaned.flac'

    shutil.copy('./tests/data/dirty.flac', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = audio.FLACParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['title'], 'I am so')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = audio.FLACParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-03-31 15:47:06 +02:00
|
|
|
|
|
|
|
def test_office(self):
    """Clean a copy of dirty.docx twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.docx'
    cleaned = './tests/data/clean.cleaned.docx'
    recleaned = './tests/data/clean.cleaned.cleaned.docx'

    shutil.copy('./tests/data/dirty.docx', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = office.MSOfficeParser(clean)

    meta = p.get_meta()
    self.assertIsNotNone(meta)

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = office.MSOfficeParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-03-31 21:20:21 +02:00
|
|
|
|
|
|
|
def test_libreoffice(self):
    """Clean a copy of dirty.odt twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.odt'
    cleaned = './tests/data/clean.cleaned.odt'
    recleaned = './tests/data/clean.cleaned.cleaned.odt'

    shutil.copy('./tests/data/dirty.odt', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = office.LibreOfficeParser(clean)

    meta = p.get_meta()
    self.assertIsNotNone(meta)

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = office.LibreOfficeParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-04-01 00:43:36 +02:00
|
|
|
|
|
|
|
def test_tiff(self):
    """Clean a copy of dirty.tiff twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.tiff'
    cleaned = './tests/data/clean.cleaned.tiff'
    recleaned = './tests/data/clean.cleaned.cleaned.tiff'

    shutil.copy('./tests/data/dirty.tiff', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = images.TiffParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['Model'], 'C7070WZ')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = images.TiffParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-04-16 22:27:29 +02:00
|
|
|
|
|
|
|
def test_bmp(self):
    """Run HarmlessParser over a copy of dirty.bmp twice; metadata is empty before and after."""
    clean = './tests/data/clean.bmp'
    cleaned = './tests/data/clean.cleaned.bmp'
    recleaned = './tests/data/clean.cleaned.cleaned.bmp'

    shutil.copy('./tests/data/dirty.bmp', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = harmless.HarmlessParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta, {})  # bmp has no meta :)

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = harmless.HarmlessParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-04-22 22:02:00 +02:00
|
|
|
|
|
|
|
def test_torrent(self):
    """Clean a copy of dirty.torrent twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.torrent'
    cleaned = './tests/data/clean.cleaned.torrent'
    recleaned = './tests/data/clean.cleaned.cleaned.torrent'

    shutil.copy('./tests/data/dirty.torrent', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = torrent.TorrentParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta, {'created by': b'mktorrent 1.0', 'creation date': 1522397702})

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = torrent.TorrentParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-04-23 00:24:39 +02:00
|
|
|
|
|
|
|
def test_odf(self):
    """Clean a copy of dirty.odf twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.odf'
    cleaned = './tests/data/clean.cleaned.odf'
    recleaned = './tests/data/clean.cleaned.cleaned.odf'

    shutil.copy('./tests/data/dirty.odf', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = office.LibreOfficeParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['meta.xml']['meta:creation-date'], '2018-04-23T00:18:59.438231281')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = office.LibreOfficeParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-04-23 00:28:36 +02:00
|
|
|
|
|
|
|
def test_odg(self):
    """Clean a copy of dirty.odg twice and check that the first pass leaves no metadata."""
    clean = './tests/data/clean.odg'
    cleaned = './tests/data/clean.cleaned.odg'
    recleaned = './tests/data/clean.cleaned.cleaned.odg'

    shutil.copy('./tests/data/dirty.odg', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = office.LibreOfficeParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['meta.xml']['dc:date'], '2018-04-23T00:26:59.385838550')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = office.LibreOfficeParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-07-06 00:42:09 +02:00
|
|
|
|
|
|
|
def test_txt(self):
    """Run HarmlessParser over a copy of dirty.txt twice; metadata is empty before and after."""
    clean = './tests/data/clean.txt'
    cleaned = './tests/data/clean.cleaned.txt'
    recleaned = './tests/data/clean.cleaned.cleaned.txt'

    shutil.copy('./tests/data/dirty.txt', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = harmless.HarmlessParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta, {})

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = harmless.HarmlessParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-10-18 19:19:56 +02:00
|
|
|
|
|
|
|
def test_avi(self):
    """Clean a copy of dirty.avi twice; skipped when video._get_ffmpeg_path() fails."""
    try:
        video._get_ffmpeg_path()
    except RuntimeError:
        # skipTest() raises unittest.SkipTest, with a reason shown in reports.
        self.skipTest('video._get_ffmpeg_path() raised RuntimeError')

    clean = './tests/data/clean.avi'
    cleaned = './tests/data/clean.cleaned.avi'
    recleaned = './tests/data/clean.cleaned.cleaned.avi'

    shutil.copy('./tests/data/dirty.avi', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = video.AVIParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = video.AVIParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned file is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2018-10-25 11:56:46 +02:00
|
|
|
|
|
|
|
def test_zip(self):
    """Build a zip from three dirty samples, clean it twice, and check the first pass."""
    dirty = './tests/data/dirty.zip'
    cleaned = './tests/data/dirty.cleaned.zip'
    recleaned = './tests/data/dirty.cleaned.cleaned.zip'

    with zipfile.ZipFile(dirty, 'w') as zout:
        # The archive exists on disk from this point on: schedule its removal
        # so that a failing assertion cannot leave it behind in tests/data.
        self.addCleanup(os.remove, dirty)
        zout.write('./tests/data/dirty.flac')
        zout.write('./tests/data/dirty.docx')
        zout.write('./tests/data/dirty.jpg')
    p = archive.ZipParser(dirty)
    meta = p.get_meta()
    self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = archive.ZipParser(cleaned)
    self.assertEqual(p.get_meta(), {})
    # Cleaning the already-cleaned archive is asserted to succeed too.
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
|
|
|
|
2018-10-28 15:41:04 +01:00
|
|
|
|
|
|
|
def test_mp4(self):
    """Clean a copy of dirty.mp4 twice; skipped when video._get_ffmpeg_path() fails."""
    try:
        video._get_ffmpeg_path()
    except RuntimeError:
        # skipTest() raises unittest.SkipTest, with a reason shown in reports.
        self.skipTest('video._get_ffmpeg_path() raised RuntimeError')

    clean = './tests/data/clean.mp4'
    cleaned = './tests/data/clean.cleaned.mp4'
    recleaned = './tests/data/clean.cleaned.cleaned.mp4'

    shutil.copy('./tests/data/dirty.mp4', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = video.MP4Parser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['Encoder'], 'HandBrake 0.9.4 2009112300')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = video.MP4Parser(cleaned)
    # Only the absence of the key checked above is asserted here; the cleaned
    # metadata is not required to be empty.
    self.assertNotIn('Encoder', p.get_meta())
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2019-02-02 18:44:02 +01:00
|
|
|
|
|
|
|
def test_wmv(self):
    """Clean a copy of dirty.wmv twice; skipped when video._get_ffmpeg_path() fails."""
    try:
        video._get_ffmpeg_path()
    except RuntimeError:
        # skipTest() raises unittest.SkipTest, with a reason shown in reports.
        self.skipTest('video._get_ffmpeg_path() raised RuntimeError')

    clean = './tests/data/clean.wmv'
    cleaned = './tests/data/clean.cleaned.wmv'
    recleaned = './tests/data/clean.cleaned.cleaned.wmv'

    shutil.copy('./tests/data/dirty.wmv', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = video.WMVParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['EncodingSettings'], 'Lavf52.103.0')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = video.WMVParser(cleaned)
    # Only the absence of the key checked above is asserted here; the cleaned
    # metadata is not required to be empty.
    self.assertNotIn('EncodingSettings', p.get_meta())
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2019-02-03 21:01:58 +01:00
|
|
|
|
|
|
|
def test_gif(self):
    """Clean a copy of dirty.gif twice and check that its comment is gone after the first pass."""
    clean = './tests/data/clean.gif'
    cleaned = './tests/data/clean.cleaned.gif'
    recleaned = './tests/data/clean.cleaned.cleaned.gif'

    shutil.copy('./tests/data/dirty.gif', clean)
    # Every file is scheduled for removal as soon as it is expected to exist,
    # so a failing assertion no longer leaves it behind in tests/data.
    self.addCleanup(os.remove, clean)
    p = images.GIFParser(clean)

    meta = p.get_meta()
    self.assertEqual(meta['Comment'], 'this is a test comment')

    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, cleaned)

    p = images.GIFParser(cleaned)
    # Check the key asserted on the dirty file above. The previous
    # 'EncodingSettings' check did not look at the key this test reads.
    self.assertNotIn('Comment', p.get_meta())
    self.assertTrue(p.remove_all())
    self.addCleanup(os.remove, recleaned)
|
2019-02-08 00:26:47 +01:00
|
|
|
|
|
|
|
def test_html(self):
|
|
|
|
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
|
2019-02-21 01:28:11 +01:00
|
|
|
p = web.HTMLParser('./tests/data/clean.html')
|
2019-02-08 00:26:47 +01:00
|
|
|
|
|
|
|
meta = p.get_meta()
|
|
|
|
self.assertEqual(meta['author'], 'jvoisin')
|
|
|
|
|
|
|
|
ret = p.remove_all()
|
|
|
|
self.assertTrue(ret)
|
|
|
|
|
2019-02-21 01:28:11 +01:00
|
|
|
p = web.HTMLParser('./tests/data/clean.cleaned.html')
|
2019-02-08 00:26:47 +01:00
|
|
|
self.assertEqual(p.get_meta(), {})
|
|
|
|
self.assertTrue(p.remove_all())
|
|
|
|
|
|
|
|
os.remove('./tests/data/clean.html')
|
|
|
|
os.remove('./tests/data/clean.cleaned.html')
|
|
|
|
os.remove('./tests/data/clean.cleaned.cleaned.html')
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2019-02-27 23:53:07 +01:00
|
|
|
with open('./tests/data/clean.html', 'w') as f:
|
|
|
|
f.write('<title><title><pouet/><meta/></title></title><test/>')
|
|
|
|
p = web.HTMLParser('./tests/data/clean.html')
|
|
|
|
self.assertTrue(p.remove_all())
|
|
|
|
with open('./tests/data/clean.cleaned.html', 'r') as f:
|
|
|
|
self.assertEqual(f.read(), '<title></title><test/>')
|
|
|
|
os.remove('./tests/data/clean.html')
|
|
|
|
os.remove('./tests/data/clean.cleaned.html')
|
|
|
|
|
|
|
|
with open('./tests/data/clean.html', 'w') as f:
|
|
|
|
f.write('<test><title>Some<b>metadata</b><br/></title></test>')
|
|
|
|
p = web.HTMLParser('./tests/data/clean.html')
|
|
|
|
self.assertTrue(p.remove_all())
|
|
|
|
with open('./tests/data/clean.cleaned.html', 'r') as f:
|
|
|
|
self.assertEqual(f.read(), '<test><title></title></test>')
|
|
|
|
os.remove('./tests/data/clean.html')
|
|
|
|
os.remove('./tests/data/clean.cleaned.html')
|
|
|
|
|
|
|
|
with open('./tests/data/clean.html', 'w') as f:
|
2019-02-28 00:13:28 +01:00
|
|
|
f.write('<meta><meta/><!----><!-- test--></meta>')
|
2019-02-27 23:53:07 +01:00
|
|
|
p = web.HTMLParser('./tests/data/clean.html')
|
|
|
|
self.assertTrue(p.remove_all())
|
|
|
|
with open('./tests/data/clean.cleaned.html', 'r') as f:
|
|
|
|
self.assertEqual(f.read(), '')
|
|
|
|
os.remove('./tests/data/clean.html')
|
|
|
|
os.remove('./tests/data/clean.cleaned.html')
|
|
|
|
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
def test_epub(self):
    """Check that EPUBParser reports and then strips content.opf metadata.

    A copy of the dirty fixture must expose its 'dc:source' entry. After
    remove_all(), the 'metadata' entry of the cleaned file's content.opf
    must match the expected minimal pattern, and cleaning the cleaned
    file again must also succeed.
    """
    shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
    p = epub.EPUBParser('./tests/data/clean.epub')

    meta = p.get_meta()
    self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
    meta = p.get_meta()
    # re.match() takes (pattern, string); the arguments used to be swapped.
    # It returns a match object or None, never False, so the previous
    # assertNotEqual(res, False) could not fail.
    # NOTE(review): because the old assertion was vacuous, this expected
    # pattern has never actually been exercised -- confirm it against the
    # cleaner's real output.
    res = re.match('^<dc:identifier>[0-9a-f-]+</dc:identifier><dc:title /><dc:language />$', meta['OEBPS/content.opf']['metadata'])
    self.assertIsNotNone(res)

    self.assertTrue(p.remove_all())

    os.remove('./tests/data/clean.epub')
    os.remove('./tests/data/clean.cleaned.epub')
    os.remove('./tests/data/clean.cleaned.cleaned.epub')
|
|
|
|
|
|
|
|
|
|
|
|
def test_css(self):
    """Check that CSSParser reports and then strips the fixture's metadata.

    The dirty fixture must yield exactly three entries; the cleaned file
    must yield none and must be cleanable again.
    """
    working_copy = './tests/data/clean.css'
    cleaned_copy = './tests/data/clean.cleaned.css'
    shutil.copy('./tests/data/dirty.css', working_copy)

    expected_meta = {
        'harmful data': 'underline is cool',
        'version': '1.0',
        'author': 'jvoisin',
    }
    dirty_parser = web.CSSParser(working_copy)
    self.assertEqual(dirty_parser.get_meta(), expected_meta)
    self.assertTrue(dirty_parser.remove_all())

    cleaned_parser = web.CSSParser(cleaned_copy)
    self.assertEqual(cleaned_parser.get_meta(), {})
    self.assertTrue(cleaned_parser.remove_all())

    # Remove the working copy and the files both cleaning passes are
    # expected to have left behind.
    for leftover in (working_copy,
                     cleaned_copy,
                     './tests/data/clean.cleaned.cleaned.css'):
        os.remove(leftover)
|
2019-04-27 13:05:36 +02:00
|
|
|
|
|
|
|
def test_tar(self):
    """Clean an uncompressed tar archive and verify each member is clean.

    The archive is built from three dirty fixtures. Its metadata must
    expose the comment of the image embedded in the docx member. After
    remove_all(), the cleaned archive must report empty metadata, be
    cleanable again, and contain exactly three files that each parse
    with empty metadata.
    """
    with tarfile.TarFile.open('./tests/data/dirty.tar', 'w') as zout:
        zout.add('./tests/data/dirty.flac')
        zout.add('./tests/data/dirty.docx')
        zout.add('./tests/data/dirty.jpg')
    p = archive.TarParser('./tests/data/dirty.tar')
    meta = p.get_meta()
    self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = archive.TarParser('./tests/data/dirty.cleaned.tar')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    # TemporaryDirectory removes the extraction directory on exit, even
    # when an assertion fails; the former mkdtemp() directory was leaked.
    with tempfile.TemporaryDirectory() as tmp_dir:
        with tarfile.open('./tests/data/dirty.cleaned.tar') as zout:
            zout.extractall(path=tmp_dir)

        number_of_files = 0
        for root, _, fnames in os.walk(tmp_dir):
            for f in fnames:
                complete_path = os.path.join(root, f)
                member_parser, _mimetype = parser_factory.get_parser(complete_path)
                self.assertIsNotNone(member_parser)
                self.assertEqual(member_parser.get_meta(), {})
                number_of_files += 1
        self.assertEqual(number_of_files, 3)

    os.remove('./tests/data/dirty.tar')
    os.remove('./tests/data/dirty.cleaned.tar')
    os.remove('./tests/data/dirty.cleaned.cleaned.tar')
|
2019-04-27 15:03:09 +02:00
|
|
|
|
|
|
|
def test_targz(self):
    """Clean a gzip-compressed tar archive and verify each member is clean.

    The archive is built from three dirty fixtures. Its metadata must
    expose the comment of the image embedded in the docx member. After
    remove_all(), the cleaned archive must report empty metadata, be
    cleanable again, and contain exactly three files that each parse
    with empty metadata.
    """
    with tarfile.TarFile.open('./tests/data/dirty.tar.gz', 'w:gz') as zout:
        zout.add('./tests/data/dirty.flac')
        zout.add('./tests/data/dirty.docx')
        zout.add('./tests/data/dirty.jpg')
    p = archive.TarParser('./tests/data/dirty.tar.gz')
    meta = p.get_meta()
    self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = archive.TarParser('./tests/data/dirty.cleaned.tar.gz')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    # TemporaryDirectory removes the extraction directory on exit, even
    # when an assertion fails; the former mkdtemp() directory was leaked.
    with tempfile.TemporaryDirectory() as tmp_dir:
        with tarfile.open('./tests/data/dirty.cleaned.tar.gz') as zout:
            zout.extractall(path=tmp_dir)

        number_of_files = 0
        for root, _, fnames in os.walk(tmp_dir):
            for f in fnames:
                complete_path = os.path.join(root, f)
                member_parser, _mimetype = parser_factory.get_parser(complete_path)
                self.assertIsNotNone(member_parser)
                self.assertEqual(member_parser.get_meta(), {})
                number_of_files += 1
        self.assertEqual(number_of_files, 3)

    os.remove('./tests/data/dirty.tar.gz')
    os.remove('./tests/data/dirty.cleaned.tar.gz')
    os.remove('./tests/data/dirty.cleaned.cleaned.tar.gz')
|
|
|
|
|
|
|
|
def test_tarbz2(self):
    """Clean a bzip2-compressed tar archive and verify each member is clean.

    The archive is built from three dirty fixtures. Its metadata must
    expose the comment of the image embedded in the docx member. After
    remove_all(), the cleaned archive must report empty metadata, be
    cleanable again, and contain exactly three files that each parse
    with empty metadata.
    """
    with tarfile.TarFile.open('./tests/data/dirty.tar.bz2', 'w:bz2') as zout:
        zout.add('./tests/data/dirty.flac')
        zout.add('./tests/data/dirty.docx')
        zout.add('./tests/data/dirty.jpg')
    p = archive.TarParser('./tests/data/dirty.tar.bz2')
    meta = p.get_meta()
    self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = archive.TarParser('./tests/data/dirty.cleaned.tar.bz2')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    # TemporaryDirectory removes the extraction directory on exit, even
    # when an assertion fails; the former mkdtemp() directory was leaked.
    with tempfile.TemporaryDirectory() as tmp_dir:
        with tarfile.open('./tests/data/dirty.cleaned.tar.bz2') as zout:
            zout.extractall(path=tmp_dir)

        number_of_files = 0
        for root, _, fnames in os.walk(tmp_dir):
            for f in fnames:
                complete_path = os.path.join(root, f)
                member_parser, _mimetype = parser_factory.get_parser(complete_path)
                self.assertIsNotNone(member_parser)
                self.assertEqual(member_parser.get_meta(), {})
                number_of_files += 1
        self.assertEqual(number_of_files, 3)

    os.remove('./tests/data/dirty.tar.bz2')
    os.remove('./tests/data/dirty.cleaned.tar.bz2')
    os.remove('./tests/data/dirty.cleaned.cleaned.tar.bz2')
|
|
|
|
|
|
|
|
def test_tarxz(self):
    """Clean an xz-compressed tar archive and verify each member is clean.

    The archive is built from three dirty fixtures. Its metadata must
    expose the comment of the image embedded in the docx member. After
    remove_all(), the cleaned archive must report empty metadata, be
    cleanable again, and contain exactly three files that each parse
    with empty metadata.
    """
    with tarfile.TarFile.open('./tests/data/dirty.tar.xz', 'w:xz') as zout:
        zout.add('./tests/data/dirty.flac')
        zout.add('./tests/data/dirty.docx')
        zout.add('./tests/data/dirty.jpg')
    p = archive.TarParser('./tests/data/dirty.tar.xz')
    meta = p.get_meta()
    self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = archive.TarParser('./tests/data/dirty.cleaned.tar.xz')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    # TemporaryDirectory removes the extraction directory on exit, even
    # when an assertion fails; the former mkdtemp() directory was leaked.
    with tempfile.TemporaryDirectory() as tmp_dir:
        with tarfile.open('./tests/data/dirty.cleaned.tar.xz') as zout:
            zout.extractall(path=tmp_dir)

        number_of_files = 0
        for root, _, fnames in os.walk(tmp_dir):
            for f in fnames:
                complete_path = os.path.join(root, f)
                member_parser, _mimetype = parser_factory.get_parser(complete_path)
                self.assertIsNotNone(member_parser)
                self.assertEqual(member_parser.get_meta(), {})
                number_of_files += 1
        self.assertEqual(number_of_files, 3)

    os.remove('./tests/data/dirty.tar.xz')
    os.remove('./tests/data/dirty.cleaned.tar.xz')
    os.remove('./tests/data/dirty.cleaned.cleaned.tar.xz')
|
2019-07-13 21:26:05 +02:00
|
|
|
|
|
|
|
def test_svg(self):
    """Check that SVGParser reports and then strips the fixture's metadata.

    Four entries of the dirty fixture are compared against known values;
    the cleaned file must report empty metadata and be cleanable again.
    Finally the 'Xmlns' entry reported for weird.svg is checked.
    """
    working_copy = './tests/data/clean.svg'
    cleaned_copy = './tests/data/clean.cleaned.svg'
    shutil.copy('./tests/data/dirty.svg', working_copy)

    dirty_parser = images.SVGParser(working_copy)
    dirty_meta = dirty_parser.get_meta()
    self.assertEqual(dirty_meta['WorkCreatorAgentTitle'], 'GNOME Design Team')
    self.assertEqual(dirty_meta['WorkSubject'], ['mat2', 'logo', 'metadata'])
    self.assertEqual(dirty_meta['ID'], 'svg11300')
    self.assertEqual(dirty_meta['Output_extension'],
                     'org.inkscape.output.svg.inkscape')
    self.assertTrue(dirty_parser.remove_all())

    cleaned_parser = images.SVGParser(cleaned_copy)
    self.assertEqual(cleaned_parser.get_meta(), {})
    self.assertTrue(cleaned_parser.remove_all())

    # Remove the working copy and the files both cleaning passes are
    # expected to have left behind.
    for leftover in (working_copy,
                     cleaned_copy,
                     './tests/data/clean.cleaned.cleaned.svg'):
        os.remove(leftover)

    # Separate fixture that is only read, never cleaned.
    weird_parser = images.SVGParser('./tests/data/weird.svg')
    self.assertEqual(weird_parser.get_meta()['Xmlns'], 'http://www.w3.org/1337/svg')
|
2019-09-01 18:28:46 +02:00
|
|
|
|
|
|
|
def test_ppm(self):
    """Check that PPMParser reports and then strips the fixture's comment.

    A copy of the dirty fixture must expose '# A metadata' under key '1';
    the cleaned file must report empty metadata and be cleanable again.
    """
    shutil.copy('./tests/data/dirty.ppm', './tests/data/clean.ppm')
    p = images.PPMParser('./tests/data/clean.ppm')

    meta = p.get_meta()
    # The leftover debugging print(meta) was dropped: the assertion below
    # already reports the offending value on failure.
    self.assertEqual(meta['1'], '# A metadata')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = images.PPMParser('./tests/data/clean.cleaned.ppm')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    os.remove('./tests/data/clean.ppm')
    os.remove('./tests/data/clean.cleaned.ppm')
    os.remove('./tests/data/clean.cleaned.cleaned.ppm')
|
|
|
|
|
|
|
|
|