1
0
Fork 0

Use of the archive refactoring for the office documents too

This commit is contained in:
jvoisin 2019-02-07 21:58:10 +01:00
parent b9a62d798a
commit e1dd439fc8
3 changed files with 19 additions and 30 deletions

View File

@ -2,7 +2,7 @@ import logging
import os import os
import re import re
import zipfile import zipfile
from typing import Dict, Set, Pattern, Tuple, Union, Any from typing import Dict, Set, Pattern, Tuple, Any
import xml.etree.ElementTree as ET # type: ignore import xml.etree.ElementTree as ET # type: ignore
@ -375,23 +375,17 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
return False return False
return True return True
def get_meta(self) -> Dict[str, Union[str, dict]]: def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
""" """
Yes, I know that parsing xml with regexp ain't pretty, Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want. be my guest and fix it if you want.
""" """
metadata = {} if file_path != 'meta.xml':
zipin = zipfile.ZipFile(self.filename) return {}
for item in zipin.infolist(): with open(full_path, encoding='utf-8') as f:
if item.filename == 'meta.xml': try:
try: results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", f.read(), re.I|re.M)
content = zipin.read(item).decode('utf-8') return {k:v for (k, v) in results}
results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
for (key, value) in results: # We didn't manage to parse the xml file
metadata[key] = value return {file_path: 'harmful content', }
except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
metadata[item.filename] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value
zipin.close()
return metadata

View File

@ -67,15 +67,10 @@ class TestCorruptedEmbedded(unittest.TestCase):
os.remove('./tests/data/clean.docx') os.remove('./tests/data/clean.docx')
def test_odt(self): def test_odt(self):
expected = {
'create_system': 'Weird',
'date_time': '2018-06-10 17:18:18',
'meta.xml': 'harmful content'
}
shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt') shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt')
parser, _ = parser_factory.get_parser('./tests/data/clean.odt') parser, _ = parser_factory.get_parser('./tests/data/clean.odt')
self.assertFalse(parser.remove_all()) self.assertFalse(parser.remove_all())
self.assertEqual(parser.get_meta(), expected) self.assertTrue(parser.get_meta())
os.remove('./tests/data/clean.odt') os.remove('./tests/data/clean.odt')

View File

@ -138,14 +138,14 @@ class TestGetMeta(unittest.TestCase):
def test_libreoffice(self): def test_libreoffice(self):
p = office.LibreOfficeParser('./tests/data/dirty.odt') p = office.LibreOfficeParser('./tests/data/dirty.odt')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['meta:initial-creator'], 'jvoisin ') self.assertEqual(meta['meta.xml']['meta:initial-creator'], 'jvoisin ')
self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48') self.assertEqual(meta['meta.xml']['meta:creation-date'], '2011-07-26T03:27:48')
self.assertEqual(meta['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') self.assertEqual(meta['meta.xml']['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202')
p = office.LibreOfficeParser('./tests/data/weird_producer.odt') p = office.LibreOfficeParser('./tests/data/weird_producer.odt')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['create_system'], 'Windows') self.assertEqual(meta['mimetype']['create_system'], 'Windows')
self.assertEqual(meta['comment'], b'YAY FOR COMMENTS') self.assertEqual(meta['mimetype']['comment'], b'YAY FOR COMMENTS')
def test_txt(self): def test_txt(self):
p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt') p, mimetype = parser_factory.get_parser('./tests/data/dirty.txt')
@ -440,7 +440,7 @@ class TestCleaning(unittest.TestCase):
p = office.LibreOfficeParser('./tests/data/clean.odf') p = office.LibreOfficeParser('./tests/data/clean.odf')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['meta:creation-date'], '2018-04-23T00:18:59.438231281') self.assertEqual(meta['meta.xml']['meta:creation-date'], '2018-04-23T00:18:59.438231281')
ret = p.remove_all() ret = p.remove_all()
self.assertTrue(ret) self.assertTrue(ret)
@ -458,7 +458,7 @@ class TestCleaning(unittest.TestCase):
p = office.LibreOfficeParser('./tests/data/clean.odg') p = office.LibreOfficeParser('./tests/data/clean.odg')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['dc:date'], '2018-04-23T00:26:59.385838550') self.assertEqual(meta['meta.xml']['dc:date'], '2018-04-23T00:26:59.385838550')
ret = p.remove_all() ret = p.remove_all()
self.assertTrue(ret) self.assertTrue(ret)