1
0
mirror of synced 2024-11-24 10:14:24 +01:00
mat2/tests/test_libmat2.py

838 lines
33 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2018-03-13 01:01:07 +01:00
import unittest
import shutil
import os
2019-02-27 23:04:38 +01:00
import re
2019-04-27 13:05:36 +02:00
import tarfile
import tempfile
import zipfile
2018-03-13 01:01:07 +01:00
2018-07-06 00:42:09 +02:00
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
2019-02-21 01:28:11 +01:00
from libmat2 import check_dependencies, video, archive, web, epub
class TestCheckDependencies(unittest.TestCase):
    """Checks the report returned by ``check_dependencies``."""

    def test_deps(self):
        """Every entry flagged as required must also be flagged as found."""
        report = check_dependencies()
        for name, status in report.items():
            if not status['required']:
                # Entries that are not required are allowed to be missing.
                continue
            self.assertTrue(status['found'], "The value for %s is False" % name)
2018-03-13 01:01:07 +01:00
class TestParserFactory(unittest.TestCase):
    """Tests for the parser auto-detection done by ``parser_factory.get_parser``."""

    def test_subsubcalss(self):
        """ Test that our module auto-detection is handling sub-sub-classes """
        parser, mimetype = parser_factory.get_parser('./tests/data/dirty.mp3')
        self.assertEqual(mimetype, 'audio/mpeg')
        self.assertEqual(parser.__class__, audio.MP3Parser)

    def test_tarfile_double_extension_handling(self):
        """ Test that a `.tar.bz2` file is reported with the tar+bz2 mimetype """
        archive_path = './tests/data/dirty.tar.bz2'
        with tarfile.TarFile.open(archive_path, 'w:bz2') as zout:
            zout.add('./tests/data/dirty.jpg')
        # Registered as a cleanup so the generated archive is removed even
        # when the assertion below fails.
        self.addCleanup(os.remove, archive_path)

        _, mimetype = parser_factory.get_parser(archive_path)
        self.assertEqual(mimetype, 'application/x-tar+bz2')
2018-05-16 22:10:47 +02:00
class TestParameterInjection(unittest.TestCase):
    """Runs parsers on files whose names look like command-line options.

    Each test copies a fixture to a name such as ``-ver`` or ``--output`` and
    checks that the metadata of that file is still read correctly.
    """

    @staticmethod
    def _require_ffmpeg():
        """Skip the calling test when ``video._get_ffmpeg_path`` raises RuntimeError."""
        try:
            video._get_ffmpeg_path()
        except RuntimeError:
            raise unittest.SkipTest

    def test_ver_injection(self):
        """A PNG named ``-ver`` must still expose its metadata."""
        shutil.copy('./tests/data/dirty.png', './-ver')
        # Cleanup is registered right away so the oddly-named file never
        # lingers in the working directory when an assertion fails.
        self.addCleanup(os.remove, '-ver')

        p = images.PNGParser('-ver')
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
        self.assertEqual(meta['ModifyDate'], "2018:03:20 21:59:25")

    def test_ffmpeg_injection(self):
        """An AVI named ``--output`` must still expose its metadata."""
        self._require_ffmpeg()

        shutil.copy('./tests/data/dirty.avi', './--output')
        self.addCleanup(os.remove, '--output')

        p = video.AVIParser('--output')
        meta = p.get_meta()
        self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')

    def test_ffmpeg_injection_complete_path(self):
        """Same check with the option-like name at the end of a longer path."""
        self._require_ffmpeg()

        source = './tests/data/ --output.avi'
        shutil.copy('./tests/data/dirty.avi', source)
        self.addCleanup(os.remove, source)

        p = video.AVIParser(source)
        meta = p.get_meta()
        self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')
        self.assertTrue(p.remove_all())
        # Only registered once cleaning succeeded, i.e. once the output is
        # expected to exist.
        self.addCleanup(os.remove, './tests/data/ --output.cleaned.avi')
class TestUnsupportedEmbeddedFiles(unittest.TestCase):
    """Checks that ``remove_all`` reports failure on the ``embedded.*`` fixtures."""

    def test_odt_with_py(self):
        """``remove_all`` must return a falsy value for ``embedded.odt``."""
        target = './tests/data/clean.odt'
        shutil.copy('./tests/data/embedded.odt', target)
        # Remove the working copy even if the assertion below fails.
        self.addCleanup(os.remove, target)

        p = office.LibreOfficeParser(target)
        self.assertFalse(p.remove_all())

    def test_docx_with_py(self):
        """``remove_all`` must return a falsy value for ``embedded.docx``."""
        target = './tests/data/clean.docx'
        shutil.copy('./tests/data/embedded.docx', target)
        self.addCleanup(os.remove, target)

        p = office.MSOfficeParser(target)
        self.assertFalse(p.remove_all())
2018-03-13 01:01:07 +01:00
class TestGetMeta(unittest.TestCase):
    """Checks the values returned by ``get_meta`` for the ``tests/data`` fixtures.

    Most tests only read a fixture. ``test_zip`` and ``test_tar`` build an
    archive from fixtures first and register it for removal, so it is deleted
    even when an assertion fails.
    """

    def test_pdf(self):
        p = pdf.PDFParser('./tests/data/dirty.pdf')
        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
        self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
        self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
        self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version "
                         "3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea "
                         "version 6.1.1")

    def test_torrent(self):
        p = torrent.TorrentParser('./tests/data/dirty.torrent')
        meta = p.get_meta()
        self.assertEqual(meta['created by'], b'mktorrent 1.0')

    def test_png(self):
        p = images.PNGParser('./tests/data/dirty.png')
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
        self.assertEqual(meta['ModifyDate'], "2018:03:20 21:59:25")

    def test_jpg(self):
        p = images.JPGParser('./tests/data/dirty.jpg')
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'Created with GIMP')

    def test_ppm(self):
        p = images.PPMParser('./tests/data/dirty.ppm')
        meta = p.get_meta()
        self.assertEqual(meta['1'], '# A metadata')
        self.assertEqual(meta['4'], '# And an other one')
        self.assertEqual(meta['6'], '# and a final one here')

    def test_tiff(self):
        p = images.TiffParser('./tests/data/dirty.tiff')
        meta = p.get_meta()
        self.assertEqual(meta['Make'], 'OLYMPUS IMAGING CORP.')
        self.assertEqual(meta['Model'], 'C7070WZ')
        self.assertEqual(meta['ModifyDate'], '2005:12:26 17:09:35')

    def test_wav(self):
        p = audio.WAVParser('./tests/data/dirty.wav')
        meta = p.get_meta()
        self.assertEqual(meta['Artist'], 'jvoisin')

    def test_mp3(self):
        p = audio.MP3Parser('./tests/data/dirty.mp3')
        meta = p.get_meta()
        self.assertEqual(meta['TXXX:I am a'], 'various comment')

    def test_ogg(self):
        p = audio.OGGParser('./tests/data/dirty.ogg')
        meta = p.get_meta()
        self.assertEqual(meta['title'], 'I am so')

    def test_flac(self):
        p = audio.FLACParser('./tests/data/dirty.flac')
        meta = p.get_meta()
        self.assertEqual(meta['title'], 'I am so')
        self.assertEqual(meta['Cover 0'], {'Comment': 'Created with GIMP'})

    def test_docx(self):
        p = office.MSOfficeParser('./tests/data/dirty.docx')
        meta = p.get_meta()
        self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin')
        self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin')
        self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')

    def test_libreoffice(self):
        p = office.LibreOfficeParser('./tests/data/dirty.odt')
        meta = p.get_meta()
        self.assertEqual(meta['meta.xml']['meta:initial-creator'], 'jvoisin ')
        self.assertEqual(meta['meta.xml']['meta:creation-date'], '2011-07-26T03:27:48')
        self.assertEqual(meta['meta.xml']['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202')

        p = office.LibreOfficeParser('./tests/data/weird_producer.odt')
        meta = p.get_meta()
        self.assertEqual(meta['mimetype']['create_system'], 'Windows')
        self.assertEqual(meta['mimetype']['comment'], b'YAY FOR COMMENTS')

    def test_txt(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt')
        self.assertEqual(mimetype, 'text/plain')
        meta = p.get_meta()
        self.assertEqual(meta, {})

    def test_zip(self):
        """Build a zip using four compression types and read its members' metadata."""
        archive_path = './tests/data/dirty.zip'
        with zipfile.ZipFile(archive_path, 'w') as zout:
            zout.write('./tests/data/dirty.flac',
                       compress_type=zipfile.ZIP_STORED)
            zout.write('./tests/data/dirty.docx',
                       compress_type=zipfile.ZIP_DEFLATED)
            zout.write('./tests/data/dirty.jpg',
                       compress_type=zipfile.ZIP_BZIP2)
            zout.write('./tests/data/dirty.txt',
                       compress_type=zipfile.ZIP_LZMA)
        # Registered as a cleanup so the archive is removed even when one of
        # the assertions below fails.
        self.addCleanup(os.remove, archive_path)

        p, mimetype = parser_factory.get_parser(archive_path)
        self.assertEqual(mimetype, 'application/zip')
        meta = p.get_meta()
        self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

        # Each member must still carry the compression type it was written with.
        with zipfile.ZipFile(archive_path) as zipin:
            members = {
                'tests/data/dirty.flac': zipfile.ZIP_STORED,
                'tests/data/dirty.docx': zipfile.ZIP_DEFLATED,
                'tests/data/dirty.jpg': zipfile.ZIP_BZIP2,
                'tests/data/dirty.txt': zipfile.ZIP_LZMA,
            }
            for k, v in members.items():
                self.assertEqual(zipin.getinfo(k).compress_type, v)

    def test_wmv(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.wmv')
        self.assertEqual(mimetype, 'video/x-ms-wmv')
        meta = p.get_meta()
        self.assertEqual(meta['EncodingSettings'], 'Lavf52.103.0')

    def test_gif(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.gif')
        self.assertEqual(mimetype, 'image/gif')
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'this is a test comment')

    def test_epub(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
        self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')

    def test_css(self):
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
        self.assertEqual(mimetype, 'text/css')
        meta = p.get_meta()
        self.assertEqual(meta['author'], 'jvoisin')
        self.assertEqual(meta['version'], '1.0')
        self.assertEqual(meta['harmful data'], 'underline is cool')

    def test_tar(self):
        """Build a tar from three fixtures and read its members' metadata."""
        archive_path = './tests/data/dirty.tar'
        with tarfile.TarFile(archive_path, 'w') as tout:
            tout.add('./tests/data/dirty.flac')
            tout.add('./tests/data/dirty.docx')
            tout.add('./tests/data/dirty.jpg')
        # Removed even when an assertion below fails.
        self.addCleanup(os.remove, archive_path)

        p, mimetype = parser_factory.get_parser(archive_path)
        self.assertEqual(mimetype, 'application/x-tar')
        meta = p.get_meta()
        self.assertEqual(meta['./tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
        self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    def test_svg(self):
        p = images.SVGParser('./tests/data/weird.svg')
        self.assertEqual(p.get_meta()['Xmlns'], 'http://www.w3.org/1337/svg')

    def test_aiff(self):
        p = audio.AIFFParser('./tests/data/dirty.aiff')
        meta = p.get_meta()
        self.assertEqual(meta['Name'], 'I am so')
2019-04-27 13:05:36 +02:00
class TestRemovingThumbnails(unittest.TestCase):
    """Checks that cleaning an ODT drops its ``Thumbnails/thumbnail.png`` member."""

    def test_odt(self):
        """The thumbnail is in the input archive and absent from the cleaned one."""
        source = './tests/data/clean.odt'
        cleaned = './tests/data/clean.cleaned.odt'

        shutil.copy('./tests/data/revision.odt', source)
        # Cleanups are registered as soon as each file exists, so nothing is
        # left behind when an assertion fails.
        self.addCleanup(os.remove, source)

        # `with` closes the archive handle even if the assertion raises.
        with zipfile.ZipFile(os.path.abspath(source)) as zipin:
            self.assertIn('Thumbnails/thumbnail.png', zipin.namelist())

        p = office.LibreOfficeParser(source)
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, cleaned)

        with zipfile.ZipFile(os.path.abspath(cleaned)) as zipin:
            self.assertNotIn('Thumbnails/thumbnail.png', zipin.namelist())
class TestRevisionsCleaning(unittest.TestCase):
    """Checks that revision markers present in the fixtures are gone after cleaning."""

    def test_libreoffice(self):
        """``tracked-changes`` must disappear from ``content.xml`` after cleaning."""
        source = './tests/data/clean.odt'
        cleaned = './tests/data/clean.cleaned.odt'

        # ZipFile.read() returns the member's bytes without leaving an open
        # member handle behind.
        with zipfile.ZipFile('./tests/data/revision.odt') as zipin:
            content = zipin.read('content.xml')
        self.assertIn(b'tracked-changes', content)

        shutil.copy('./tests/data/revision.odt', source)
        # Removed even when an assertion below fails.
        self.addCleanup(os.remove, source)

        p = office.LibreOfficeParser(source)
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, cleaned)

        with zipfile.ZipFile(cleaned) as zipin:
            content = zipin.read('content.xml')
        self.assertNotIn(b'tracked-changes', content)

    def test_msoffice(self):
        """The ``<w:ins>`` marker must disappear from ``word/document.xml``."""
        source = './tests/data/revision_clean.docx'
        cleaned = './tests/data/revision_clean.cleaned.docx'
        marker = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">'

        with zipfile.ZipFile('./tests/data/revision.docx') as zipin:
            content = zipin.read('word/document.xml')
        self.assertIn(marker, content)

        shutil.copy('./tests/data/revision.docx', source)
        self.addCleanup(os.remove, source)

        p = office.MSOfficeParser(source)
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, cleaned)

        with zipfile.ZipFile(cleaned) as zipin:
            content = zipin.read('word/document.xml')
        self.assertNotIn(marker, content)
2018-04-23 00:24:39 +02:00
2019-10-12 22:32:04 +02:00
class TestCleaning(unittest.TestCase):
data = [{
'name': 'pdf',
'parser': pdf.PDFParser,
'meta': {'producer': 'pdfTeX-1.40.14'},
'expected_meta': {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1},
}, {
'name': 'png',
'parser': images.PNGParser,
'meta': {'Comment': 'This is a comment, be careful!'},
'expected_meta': {},
}, {
'name': 'jpg',
'parser': images.JPGParser,
'meta': {'Comment': 'Created with GIMP'},
'expected_meta': {},
2020-01-01 17:34:42 +01:00
}, {
'name': 'wav',
'parser': audio.WAVParser,
'meta': {'Comment': 'Zomg, a comment!'},
'expected_meta': {},
2019-10-12 22:32:04 +02:00
}, {
'name': 'aiff',
'parser': audio.AIFFParser,
'meta': {'Annotation': 'Thank you for using MAT !'},
'expected_meta': {},
},
{
2019-10-12 22:32:04 +02:00
'name': 'mp3',
'parser': audio.MP3Parser,
'meta': {'TXXX:I am a': 'various comment'},
'expected_meta': {},
}, {
'name': 'ogg',
'parser': audio.OGGParser,
'meta': {'title': 'I am so'},
'expected_meta': {},
}, {
'name': 'flac',
'parser': audio.FLACParser,
'meta': {'title': 'I am so'},
'expected_meta': {},
}, {
'name': 'docx',
'parser': office.MSOfficeParser,
'meta': {'word/media/image1.png' :
{'Comment': 'This is a comment, be careful!',
'ModifyDate': '2018:03:20 21:59:25',
'PixelUnits': 'meters',
'PixelsPerUnitX': 2835,
'PixelsPerUnitY': 2835,
'create_system': 'Weird',
'date_time': '2018-03-31 13:15:38'} ,
},
'expected_meta': {},
}, {
'name': 'odt',
'parser': office.LibreOfficeParser,
'meta': {
'Pictures/1000000000000032000000311EC5314D.png': {
'create_system': 'Weird',
'date_time': '2011-07-26 02:40:16',
'PixelsPerUnitX': 4847,
'PixelsPerUnitY': 4760,
'PixelUnits': 'meters',
},
},
'expected_meta': {},
},{
'name': 'tiff',
'parser': images.TiffParser,
'meta': {'Model': 'C7070WZ'},
'expected_meta':
{'Orientation': 'Horizontal (normal)',
'ResolutionUnit': 'inches',
'XResolution': 72,
'YResolution': 72}
},{
'name': 'bmp',
'parser': harmless.HarmlessParser,
'meta': {},
'expected_meta': {},
},{
'name': 'torrent',
'parser': torrent.TorrentParser,
'meta': {'created by': b'mktorrent 1.0', 'creation date': 1522397702},
'expected_meta': {},
}, {
'name': 'odf',
'parser': office.LibreOfficeParser,
'meta': {'meta.xml': {'create_system': 'Weird', 'date_time':
'2018-04-22 22:20:24', 'meta:initial-creator': 'Julien Voisin',
'meta:creation-date': '2018-04-23T00:18:59.438231281',
'dc:date': '2018-04-23T00:20:23.978564933', 'dc:creator':
'Julien Voisin', 'meta:editing-duration': 'PT1M24S',
'meta:editing-cycles': '1', 'meta:generator':
'LibreOffice/5.4.6.2$Linux_X86_64 LibreOffice_project/40m0$Build-2'}},
'expected_meta': {},
}, {
'name': 'odg',
'parser': office.LibreOfficeParser,
'meta': {'meta.xml': {'create_system': 'Weird', 'date_time':
'2018-04-22 22:26:58', 'meta:initial-creator': 'Julien Voisin',
'meta:creation-date': '2018-04-23T00:25:59.953271949',
'dc:date': '2018-04-23T00:26:59.385838550', 'dc:creator':
'Julien Voisin', 'meta:editing-duration': 'PT59S',
'meta:editing-cycles': '1', 'meta:generator':
'LibreOffice/5.4.6.2$Linux_X86_64 LibreOffice_project/40m0$Build-2'}},
'expected_meta': {},
}, {
'name': 'txt',
'parser': harmless.HarmlessParser,
'meta': {},
'expected_meta': {},
},{
'name': 'gif',
'parser': images.GIFParser,
'meta': {'Comment': 'this is a test comment'},
2021-12-26 15:45:29 +01:00
'expected_meta': {'TransparentColor': '5'},
2019-10-12 22:32:04 +02:00
},{
'name': 'css',
'parser': web.CSSParser,
'meta': {
'harmful data': 'underline is cool',
'version': '1.0',
'author': 'jvoisin'
},
'expected_meta': {},
},{
'name': 'svg',
'parser': images.SVGParser,
'meta': {
'WorkDescription': "This is a test svg image for mat2's testsuite",
},
'expected_meta': {},
} ,{
'name': 'ppm',
'parser': images.PPMParser,
'meta': {
'1': '# A metadata',
},
'expected_meta': {},
} ,{
'name': 'avi',
'ffmpeg': 1,
'parser': video.AVIParser,
'meta': {
'Software': 'MEncoder SVN-r33148-4.0.1',
},
'expected_meta': {},
} ,{
'name': 'mp4',
'ffmpeg': 1,
'parser': video.MP4Parser,
'meta': {
'Encoder': 'HandBrake 0.9.4 2009112300',
},
2020-02-08 14:44:45 +01:00
'expected_meta': {
2021-05-05 20:39:07 +02:00
'AverageBitrate': 465641,
'BufferSize': 0,
2020-02-08 14:44:45 +01:00
'CompatibleBrands': ['isom', 'iso2', 'avc1', 'mp41'],
2021-05-05 20:39:07 +02:00
'ColorRepresentation': 'nclx 1 1 1',
2019-10-12 22:32:04 +02:00
'CompressorID': 'avc1',
'GraphicsMode': 'srcCopy',
'HandlerDescription': 'SoundHandler',
'HandlerType': 'Metadata',
'HandlerVendorID': 'Apple',
'MajorBrand': 'Base Media v1 [IS0 14496-12:2003]',
2021-05-05 20:39:07 +02:00
'MaxBitrate': 465641,
2020-02-08 14:44:45 +01:00
'MediaDataOffset': 48,
'MediaDataSize': 379872,
2019-10-12 22:32:04 +02:00
'MediaHeaderVersion': 0,
'MinorVersion': '0.2.0',
'MovieDataOffset': 48,
'MovieHeaderVersion': 0,
'NextTrackID': 3,
'PreferredRate': 1,
'Rotation': 0,
'TimeScale': 1000,
'TrackHeaderVersion': 0,
'TrackID': 1,
'TrackLayer': 0},
},{
'name': 'wmv',
'ffmpeg': 1,
'parser': video.WMVParser,
'meta': {
'EncodingSettings': 'Lavf52.103.0',
},
'expected_meta': {},
}
]
def test_all_parametred(self):
for case in self.data:
if 'ffmpeg' in case:
try:
video._get_ffmpeg_path()
except RuntimeError:
raise unittest.SkipTest
print('[+] Testing %s' % case['name'])
target = './tests/data/clean.' + case['name']
shutil.copy('./tests/data/dirty.' + case['name'], target)
p1 = case['parser'](target)
for k, v in p1.get_meta().items():
if k not in case['meta']:
continue
2019-10-12 22:32:04 +02:00
if isinstance(v, dict):
for _k, _v in v.items():
if _k in case['meta'][k]:
self.assertEqual(_v, case['meta'][k][_k])
2019-10-12 22:32:04 +02:00
else:
self.assertEqual(v, case['meta'][k])
2019-10-12 22:32:04 +02:00
p1.lightweight_cleaning = True
self.assertTrue(p1.remove_all())
p2 = case['parser'](p1.output_filename)
2021-12-26 15:45:29 +01:00
meta = p2.get_meta()
if meta:
for k, v in p2.get_meta().items():
self.assertIn(k, case['expected_meta'], '"%s" is not in "%s" (%s)' % (k, case['expected_meta'], case['name']))
self.assertIn(str(case['expected_meta'][k]), str(v))
2019-10-12 22:32:04 +02:00
self.assertTrue(p2.remove_all())
os.remove(target)
os.remove(p1.output_filename)
os.remove(p2.output_filename)
2019-02-03 21:01:58 +01:00
2019-02-08 00:26:47 +01:00
def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
2019-02-21 01:28:11 +01:00
p = web.HTMLParser('./tests/data/clean.html')
2019-02-08 00:26:47 +01:00
meta = p.get_meta()
self.assertEqual(meta['author'], 'jvoisin')
ret = p.remove_all()
self.assertTrue(ret)
2019-02-21 01:28:11 +01:00
p = web.HTMLParser('./tests/data/clean.cleaned.html')
2019-02-08 00:26:47 +01:00
self.assertEqual(p.get_meta(), {})
self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
os.remove('./tests/data/clean.cleaned.cleaned.html')
2019-02-21 01:28:11 +01:00
with open('./tests/data/clean.html', 'w') as f:
f.write('<title><title><pouet/><meta/></title></title><test/>')
p = web.HTMLParser('./tests/data/clean.html')
self.assertTrue(p.remove_all())
with open('./tests/data/clean.cleaned.html', 'r') as f:
self.assertEqual(f.read(), '<title></title><test/>')
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<test><title>Some<b>metadata</b><br/></title></test>')
p = web.HTMLParser('./tests/data/clean.html')
self.assertTrue(p.remove_all())
with open('./tests/data/clean.cleaned.html', 'r') as f:
self.assertEqual(f.read(), '<test><title></title></test>')
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
with open('./tests/data/clean.html', 'w') as f:
2019-02-28 00:13:28 +01:00
f.write('<meta><meta/><!----><!-- test--></meta>')
p = web.HTMLParser('./tests/data/clean.html')
self.assertTrue(p.remove_all())
with open('./tests/data/clean.cleaned.html', 'r') as f:
self.assertEqual(f.read(), '')
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
2019-02-21 01:28:11 +01:00
def test_epub(self):
shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
p = epub.EPUBParser('./tests/data/clean.epub')
meta = p.get_meta()
self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
ret = p.remove_all()
self.assertTrue(ret)
p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
2019-02-27 23:04:38 +01:00
meta = p.get_meta()
res = re.match(meta['OEBPS/content.opf']['metadata'], '^<dc:identifier>[0-9a-f-]+</dc:identifier><dc:title /><dc:language />$')
self.assertNotEqual(res, False)
2019-02-21 01:28:11 +01:00
self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.epub')
os.remove('./tests/data/clean.cleaned.epub')
os.remove('./tests/data/clean.cleaned.cleaned.epub')
2019-10-12 22:32:04 +02:00
class TestCleaningArchives(unittest.TestCase):
    """Builds archives from fixtures, cleans them and checks the result.

    The four tar tests differ only in file suffix and tarfile write mode and
    share ``_check_tar_cleaning``.
    """

    def test_zip(self):
        """Clean a zip built with four compression types."""
        dirty = './tests/data/dirty.zip'
        with zipfile.ZipFile(dirty, 'w') as zout:
            zout.write('./tests/data/dirty.flac',
                       compress_type=zipfile.ZIP_STORED)
            zout.write('./tests/data/dirty.docx',
                       compress_type=zipfile.ZIP_DEFLATED)
            zout.write('./tests/data/dirty.jpg',
                       compress_type=zipfile.ZIP_BZIP2)
            zout.write('./tests/data/dirty.txt',
                       compress_type=zipfile.ZIP_LZMA)
        # Every generated file is registered for removal as soon as it is
        # expected to exist, so a failing assertion leaves nothing behind.
        self.addCleanup(os.remove, dirty)

        p = archive.ZipParser(dirty)
        meta = p.get_meta()
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
        ret = p.remove_all()
        self.assertTrue(ret)
        self.addCleanup(os.remove, './tests/data/dirty.cleaned.zip')

        p = archive.ZipParser('./tests/data/dirty.cleaned.zip')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, './tests/data/dirty.cleaned.cleaned.zip')

        # NOTE(review): this inspects the archive written at the top of the
        # test, not the cleaned output -- confirm that is the intended target.
        with zipfile.ZipFile(dirty) as zipin:
            members = {
                'tests/data/dirty.flac': zipfile.ZIP_STORED,
                'tests/data/dirty.docx': zipfile.ZIP_DEFLATED,
                'tests/data/dirty.jpg': zipfile.ZIP_BZIP2,
                'tests/data/dirty.txt': zipfile.ZIP_LZMA,
            }
            for k, v in members.items():
                self.assertEqual(zipin.getinfo(k).compress_type, v)

    def _check_tar_cleaning(self, suffix, mode):
        """Build, clean and verify a tar archive.

        `suffix` is appended to ``./tests/data/dirty`` (e.g. ``'.tar.gz'``)
        and `mode` is the write mode handed to ``tarfile.TarFile.open``
        (e.g. ``'w:gz'``).
        """
        dirty = './tests/data/dirty' + suffix
        cleaned = './tests/data/dirty.cleaned' + suffix
        cleaned_twice = './tests/data/dirty.cleaned.cleaned' + suffix

        with tarfile.TarFile.open(dirty, mode) as zout:
            zout.add('./tests/data/dirty.flac')
            zout.add('./tests/data/dirty.docx')
            zout.add('./tests/data/dirty.jpg')
        self.addCleanup(os.remove, dirty)

        p = archive.TarParser(dirty)
        meta = p.get_meta()
        self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
        ret = p.remove_all()
        self.assertTrue(ret)
        self.addCleanup(os.remove, cleaned)

        p = archive.TarParser(cleaned)
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, cleaned_twice)

        # Extract the cleaned archive and check every member on its own.
        # TemporaryDirectory deletes the extraction directory on exit, even
        # when an assertion fails.
        number_of_files = 0
        with tempfile.TemporaryDirectory() as tmp_dir:
            with tarfile.open(cleaned) as zin:
                zin.extractall(path=tmp_dir)

            for root, _, fnames in os.walk(tmp_dir):
                for fname in fnames:
                    complete_path = os.path.join(root, fname)
                    member_parser, _mimetype = parser_factory.get_parser(complete_path)
                    self.assertIsNotNone(member_parser)
                    self.assertEqual(member_parser.get_meta(), {})
                    number_of_files += 1
        self.assertEqual(number_of_files, 3)

    def test_tar(self):
        self._check_tar_cleaning('.tar', 'w')

    def test_targz(self):
        self._check_tar_cleaning('.tar.gz', 'w:gz')

    def test_tarbz2(self):
        self._check_tar_cleaning('.tar.bz2', 'w:bz2')

    def test_tarxz(self):
        self._check_tar_cleaning('.tar.xz', 'w:xz')
class TestNoSandbox(unittest.TestCase):
    """Runs the clean cycle with the parser's ``sandbox`` attribute set to False."""

    def test_avi_nosandbox(self):
        """Clean an AVI twice with ``sandbox`` disabled on the first parser."""
        source = './tests/data/clean.avi'
        cleaned = './tests/data/clean.cleaned.avi'

        shutil.copy('./tests/data/dirty.avi', source)
        # Files are registered for removal as soon as they are expected to
        # exist, so a failing assertion leaves nothing behind.
        self.addCleanup(os.remove, source)

        p = video.AVIParser(source)
        p.sandbox = False
        meta = p.get_meta()
        self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')
        ret = p.remove_all()
        self.assertTrue(ret)
        self.addCleanup(os.remove, cleaned)

        p = video.AVIParser(cleaned)
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, './tests/data/clean.cleaned.cleaned.avi')

    def test_png_nosandbox(self):
        """Clean a PNG twice with ``sandbox`` disabled and lightweight cleaning on."""
        source = './tests/data/clean.png'
        cleaned = './tests/data/clean.cleaned.png'

        shutil.copy('./tests/data/dirty.png', source)
        self.addCleanup(os.remove, source)

        p = images.PNGParser(source)
        p.sandbox = False
        p.lightweight_cleaning = True
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
        ret = p.remove_all()
        self.assertTrue(ret)
        self.addCleanup(os.remove, cleaned)

        p = images.PNGParser(cleaned)
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, './tests/data/clean.cleaned.cleaned.png')
2020-03-08 12:17:56 +01:00
class TestComplexOfficeFiles(unittest.TestCase):
    """Checks that ``remove_all`` succeeds on the narrated PowerPoint fixture."""

    def test_complex_pptx(self):
        """``remove_all`` must return a truthy value for the copied presentation."""
        target = './tests/data/clean.pptx'
        shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target)
        # Removed even when the assertion below fails.
        self.addCleanup(os.remove, target)

        p = office.MSOfficeParser(target)
        self.assertTrue(p.remove_all())
        self.addCleanup(os.remove, p.output_filename)