diff --git a/src/office.py b/src/office.py
new file mode 100644
index 0000000..d6728c8
--- /dev/null
+++ b/src/office.py
@@ -0,0 +1,79 @@
+import subprocess
+import json
+import zipfile
+import tempfile
+import shutil
+import os
+
+from . import abstract, parser_factory
+
+
+class OfficeParser(abstract.AbstractParser):
+    """Handle metadata of Office Open XML archives (docx, xlsx, pptx).
+
+    These documents are zip archives. Every member whose name starts with
+    ``docProps/`` is reported as metadata by ``get_meta``.
+    """
+
+    mimetypes = {
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+    }
+    # Archive members copied verbatim into the cleaned file instead of
+    # being handed to a sub-parser.
+    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
+
+    def get_meta(self):
+        """Return a dict keyed by the name of each ``docProps/`` member.
+
+        Every value is the fixed marker ``'harmful content'``; the members
+        themselves are not parsed.
+        """
+        # The context manager closes the archive even when listing fails.
+        with zipfile.ZipFile(self.filename) as zin:
+            return {name: 'harmful content'
+                    for name in zin.namelist()
+                    if name.startswith('docProps/')}
+
+    def remove_all(self):
+        """Write a cleaned copy of the archive to ``self.output_filename``.
+
+        Directory entries are skipped, as are ``docProps/`` members unless
+        their name ends with ``.rels``. Members listed in ``files_to_keep``
+        are copied unchanged. Every other member is extracted to a
+        temporary folder and cleaned by the parser returned by
+        ``parser_factory.get_parser``; members without a parser are
+        reported on stdout and left out of the output.
+
+        Returns True once the output archive has been written.
+        """
+        temp_folder = tempfile.mkdtemp()
+        try:
+            # Both archives are closed and the scratch folder is removed
+            # even if a member fails to extract or a sub-parser raises.
+            with zipfile.ZipFile(self.filename, 'r') as zin, \
+                    zipfile.ZipFile(self.output_filename, 'w') as zout:
+                for item in zin.infolist():
+                    name = item.filename
+                    if item.is_dir():
+                        continue
+                    if name.startswith('docProps/') and not name.endswith('.rels'):
+                        continue  # don't keep metadata files
+                    if name in self.files_to_keep:
+                        zout.writestr(item, zin.read(item))
+                        continue
+
+                    # extract() returns the path it actually created, which
+                    # can differ from a naive join when the member name is
+                    # sanitised (absolute paths, ".." components).
+                    extracted = zin.extract(member=item, path=temp_folder)
+                    tmp_parser = parser_factory.get_parser(extracted)
+                    if tmp_parser is None:
+                        print("%s isn't supported" % name)
+                        continue
+                    tmp_parser.remove_all()
+                    zout.write(tmp_parser.output_filename, name)
+        finally:
+            shutil.rmtree(temp_folder)
+        return True
diff --git a/tests/data/dirty.docx b/tests/data/dirty.docx
new file mode 100644
index 0000000..97e2c21
Binary files /dev/null and b/tests/data/dirty.docx differ
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index c21185e..02579b0 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -39,6 +39,17 @@ class TestGetMeta(unittest.TestCase):
         meta = p.get_meta()
         self.assertEqual(meta['TITLE'], ['I am so'])
 
+    def test_docx(self):
+        # NOTE(review): this patch adds no `office` import to this module;
+        # confirm one already exists.
+        p = office.OfficeParser('./tests/data/dirty.docx')
+        meta = p.get_meta()
+        # NOTE(review): assumes the dirty fixture ships docProps/* members.
+        self.assertTrue(meta)
+        for name, value in meta.items():
+            self.assertTrue(name.startswith('docProps/'))
+            self.assertEqual(value, 'harmful content')
+
 
 class TestCleaning(unittest.TestCase):
     def test_pdf(self):
@@ -131,3 +142,19 @@ class TestCleaning(unittest.TestCase):
         self.assertEqual(p.get_meta(), {})
 
         os.remove('./tests/data/clean.flac')
+
+    def test_office(self):
+        shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
+        p = office.OfficeParser('./tests/data/clean.docx')
+
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        p = office.OfficeParser('./tests/data/clean.docx.cleaned')
+        self.assertEqual(p.get_meta(), {})
+
+        os.remove('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.docx.cleaned')