2018-09-20 01:13:59 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import unittest
|
|
|
|
import shutil
|
|
|
|
import os
|
|
|
|
import zipfile
|
|
|
|
import tempfile
|
|
|
|
|
|
|
|
from libmat2 import office, parser_factory
|
|
|
|
|
|
|
|
class TestZipMetadata(unittest.TestCase):
    """Check that cleaned office documents expose no metadata.

    Both the zip container entries and the files stored inside the archive
    are inspected after ``remove_all()`` has produced the cleaned copy.
    """

    def __check_deep_meta(self, p):
        """Assert that no file inside the archive at ``p.filename`` has metadata.

        The archive is extracted to a temporary directory; each extracted
        file is handed to ``parser_factory.get_parser`` and, when a parser
        comes back, its ``get_meta()`` must be an empty dict.
        """
        # TemporaryDirectory removes the extraction directory even when one
        # of the assertions below fails.
        with tempfile.TemporaryDirectory() as tempdir:
            # Close the archive as soon as extraction is done.
            with zipfile.ZipFile(p.filename) as zipin:
                zipin.extractall(tempdir)

            for subdir, _dirs, files in os.walk(tempdir):
                for f in files:
                    complete_path = os.path.join(subdir, f)
                    inside_p, _ = parser_factory.get_parser(complete_path)
                    if inside_p is None:
                        # No parser was returned for this file: nothing to check.
                        continue
                    self.assertEqual(inside_p.get_meta(), {})

    def __check_zip_meta(self, p):
        """Assert that every entry of the archive at ``p.filename`` has an
        empty comment, the fixed 1980-01-01 timestamp and ``create_system`` 3.
        """
        with zipfile.ZipFile(p.filename) as zipin:
            for item in zipin.infolist():
                self.assertEqual(item.comment, b'')
                self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
                self.assertEqual(item.create_system, 3)  # 3 is UNIX

    def test_office(self):
        """Clean a copy of ``dirty.docx`` and verify the result is metadata-free."""
        shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
        # Registered right after the copy so the file is removed even if an
        # assertion fails further down.
        self.addCleanup(os.remove, './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')

        meta = p.get_meta()
        self.assertIsNotNone(meta)
        self.assertEqual(meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

        ret = p.remove_all()
        self.assertTrue(ret)
        # Only registered once remove_all() reported success, i.e. once the
        # cleaned file is expected to exist.
        self.addCleanup(os.remove, './tests/data/clean.cleaned.docx')

        p = office.MSOfficeParser('./tests/data/clean.cleaned.docx')
        self.assertEqual(p.get_meta(), {})

        self.__check_zip_meta(p)
        self.__check_deep_meta(p)

    def test_libreoffice(self):
        """Clean a copy of ``dirty.odt`` and verify the result is metadata-free."""
        shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
        # Registered right after the copy so the file is removed even if an
        # assertion fails further down.
        self.addCleanup(os.remove, './tests/data/clean.odt')
        p = office.LibreOfficeParser('./tests/data/clean.odt')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        ret = p.remove_all()
        self.assertTrue(ret)
        # Only registered once remove_all() reported success, i.e. once the
        # cleaned file is expected to exist.
        self.addCleanup(os.remove, './tests/data/clean.cleaned.odt')

        p = office.LibreOfficeParser('./tests/data/clean.cleaned.odt')
        self.assertEqual(p.get_meta(), {})

        self.__check_zip_meta(p)
        self.__check_deep_meta(p)
|
2018-09-20 14:04:46 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestZipOrder(unittest.TestCase):
    """Check that cleaning rewrites archive members in lexicographic order."""

    @staticmethod
    def _member_names(path):
        """Return the member names of the zip at ``path``, in archive order,
        with any leading ``mimetype`` entries dropped.
        """
        with zipfile.ZipFile(path) as zin:
            names = [item.filename for item in zin.infolist()]
        start = 0
        # Leading 'mimetype' entries are excluded from the ordering check.
        while start < len(names) and names[start] == 'mimetype':
            start += 1
        return names[start:]

    def test_libreoffice(self):
        """The dirty fixture must be unordered, the cleaned output ordered."""
        shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
        # Registered right after the copy so the file is removed even if an
        # assertion fails further down.
        self.addCleanup(os.remove, './tests/data/clean.odt')
        p = office.LibreOfficeParser('./tests/data/clean.odt')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        # Sanity check on the fixture: it must not already be sorted,
        # otherwise the post-clean check below would prove nothing.
        dirty_names = self._member_names('./tests/data/clean.odt')
        is_unordered = dirty_names != sorted(dirty_names)
        self.assertTrue(is_unordered)

        ret = p.remove_all()
        self.assertTrue(ret)
        # Only registered once remove_all() reported success, i.e. once the
        # cleaned file is expected to exist.
        self.addCleanup(os.remove, './tests/data/clean.cleaned.odt')

        # Compare the whole sequence with its sorted form so that every entry
        # is checked against its predecessor, not only against the first one.
        clean_names = self._member_names('./tests/data/clean.cleaned.odt')
        self.assertEqual(clean_names, sorted(clean_names))
|
2018-09-20 22:37:53 +02:00
|
|
|
|
|
|
|
class TestRsidRemoval(unittest.TestCase):
    """Check that ``w:rsid`` markers are stripped from cleaned .docx files."""

    def test_office(self):
        """Count ``w:rsid`` occurrences before cleaning, expect none after."""
        shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
        # Registered right after the copy so the file is removed even if an
        # assertion fails further down.
        self.addCleanup(os.remove, './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        # Integer accumulator (the count is summed over every .xml member).
        how_many_rsid = 0
        with zipfile.ZipFile('./tests/data/clean.docx') as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                # Lower-cased so the match is case-insensitive.
                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
                how_many_rsid += num
        self.assertEqual(how_many_rsid, 11)

        ret = p.remove_all()
        self.assertTrue(ret)
        # Only registered once remove_all() reported success, i.e. once the
        # cleaned file is expected to exist.
        self.addCleanup(os.remove, './tests/data/clean.cleaned.docx')

        with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
                # Checked per member so a failure happens on the offending file.
                self.assertEqual(num, 0)
|
2019-09-01 13:34:26 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestNsidRemoval(unittest.TestCase):
    """Check that ``w:nsid`` markers are stripped from cleaned .docx files."""

    def test_office(self):
        """Verify the fixture's marker count, then expect no ``w:nsid`` after cleaning."""
        shutil.copy('./tests/data/dirty_with_nsid.docx', './tests/data/clean.docx')
        # Registered right after the copy so the file is removed even if an
        # assertion fails further down.
        self.addCleanup(os.remove, './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        # Integer accumulator (the count is summed over every .xml member).
        # NOTE(review): this pre-clean scan counts 'w:rsid' while the
        # post-clean scan below looks for 'w:nsid'. Left as-is because the
        # expected 1190 is tied to the fixture; confirm whether the pre-clean
        # check was meant to count 'w:nsid' instead.
        how_many_rsid = 0
        with zipfile.ZipFile('./tests/data/clean.docx') as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                # Lower-cased so the match is case-insensitive.
                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
                how_many_rsid += num
        self.assertEqual(how_many_rsid, 1190)

        ret = p.remove_all()
        self.assertTrue(ret)
        # Only registered once remove_all() reported success, i.e. once the
        # cleaned file is expected to exist.
        self.addCleanup(os.remove, './tests/data/clean.cleaned.docx')

        with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                num = zin.read(item).decode('utf-8').lower().count('w:nsid')
                # Checked per member so a failure happens on the offending file.
                self.assertEqual(num, 0)
|