diff --git a/src/libreoffice.py b/src/libreoffice.py
new file mode 100644
index 0000000..b7e0dfb
--- /dev/null
+++ b/src/libreoffice.py
@@ -0,0 +1,89 @@
+import re
+import subprocess
+import json
+import zipfile
+import tempfile
+import shutil
+import os
+
+from . import abstract, parser_factory
+
+
+class LibreOfficeParser(abstract.AbstractParser):
+    """Read and strip the metadata of OpenDocument text (.odt) files.
+
+    The document is handled as a zip archive whose `meta.xml` member
+    holds the metadata.
+    """
+
+    mimetypes = {
+        'application/vnd.oasis.opendocument.text',
+    }
+
+    def get_meta(self):
+        """Return the metadata found in the `meta.xml` member.
+
+        The XML is scanned with a regular expression rather than a real
+        parser: every `<meta...>` or `<dc...>` element that has a text
+        value is reported under its tag name.
+
+        Returns:
+            A dict mapping tag names (e.g. `dc:creator`) to their raw
+            text. If `meta.xml` exists but nothing could be extracted
+            from it, the dict is `{'meta.xml': 'harmful content'}`, so
+            that the file is not mistaken for a clean one. An archive
+            without `meta.xml` yields an empty dict.
+        """
+        metadata = {}
+        # The context manager closes the archive even if reading fails.
+        with zipfile.ZipFile(self.filename) as zipin:
+            for item in zipin.namelist():
+                if item != 'meta.xml':
+                    continue
+                content = zipin.read(item).decode('utf-8')
+                # The closing-tag backreference and the lazy value keep
+                # each match inside one element; DOTALL lets a value
+                # span several lines.
+                for key, value in re.findall(
+                        r"<((?:meta|dc)[^>]+)>(.+?)</\1>",
+                        content, re.I | re.S):
+                    metadata[key] = value
+                if not metadata:  # better safe than sorry
+                    metadata[item] = 'harmful content'
+        return metadata
+
+    def remove_all(self):
+        """Write a copy of the document without its metadata.
+
+        Every file member except `meta.xml` is extracted to a temporary
+        folder, cleaned by the parser that `parser_factory` returns
+        for it, and stored in `self.output_filename`. Members without
+        a matching parser are reported on stdout and left out.
+
+        Returns:
+            True once the cleaned archive has been written.
+        """
+        temp_folder = tempfile.mkdtemp()
+        try:
+            with zipfile.ZipFile(self.filename, 'r') as zin, \
+                    zipfile.ZipFile(self.output_filename, 'w') as zout:
+                for item in zin.infolist():
+                    if item.filename.endswith('/'):
+                        continue  # `is_dir` is added in Python3.6
+                    if item.filename == 'meta.xml':
+                        continue  # don't keep metadata files
+
+                    # `extract` returns the path it really wrote to,
+                    # which can differ from a plain join when the
+                    # member name had to be sanitized.
+                    extracted = zin.extract(member=item, path=temp_folder)
+                    tmp_parser = parser_factory.get_parser(extracted)
+                    if tmp_parser is None:
+                        print("%s isn't supported" % item.filename)
+                        continue
+                    tmp_parser.remove_all()
+                    zout.write(tmp_parser.output_filename, item.filename)
+        finally:
+            # Never leave extracted content behind, even on failure.
+            shutil.rmtree(temp_folder)
+        return True
diff --git a/tests/data/dirty.odt b/tests/data/dirty.odt
new file mode 100644
index 0000000..926ebff
Binary files /dev/null and b/tests/data/dirty.odt differ
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 717de3f..743a845 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -4,7 +4,7 @@ import unittest
 import shutil
 import os
 
-from src
import pdf, png, jpg, audio, office
+from src import pdf, png, jpg, audio, office, libreoffice
 
 class TestGetMeta(unittest.TestCase):
     def test_pdf(self):
@@ -46,6 +46,15 @@ class TestGetMeta(unittest.TestCase):
         self.assertEqual(meta['dc:creator'], 'julien voisin')
         self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
 
+    def test_libreoffice(self):
+        # The dirty.odt fixture is expected to carry exactly these values.
+        p = libreoffice.LibreOfficeParser('./tests/data/dirty.odt')
+        meta = p.get_meta()
+        self.assertEqual(meta['meta:initial-creator'], 'jvoisin ')
+        self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48')
+        self.assertEqual(meta['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202')
+
+
 
 class TestCleaning(unittest.TestCase):
     def test_pdf(self):
 
@@ -153,3 +162,22 @@ class TestCleaning(unittest.TestCase):
         self.assertEqual(p.get_meta(), {})
 
         os.remove('./tests/data/clean.docx')
+
+    def test_libreoffice(self):
+        # Clean a copy of the fixture rather than the fixture itself.
+        shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
+        p = libreoffice.LibreOfficeParser('./tests/data/clean.odt')
+
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        # The cleaned copy must not expose any metadata anymore.
+        p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
+        self.assertEqual(p.get_meta(), {})
+
+        # Remove both generated files, not just the working copy.
+        os.remove('./tests/data/clean.odt')
+        os.remove('./tests/data/clean.odt.cleaned')