diff --git a/libmat2/harmless.py b/libmat2/harmless.py index 54737a8..2878571 100644 --- a/libmat2/harmless.py +++ b/libmat2/harmless.py @@ -4,7 +4,7 @@ from . import abstract class HarmlessParser(abstract.AbstractParser): """ This is the parser for filetypes that do not contain metadata. """ - mimetypes = {'application/xml', 'text/plain', 'text/xml', 'application/rdf+xml'} + mimetypes = {'text/plain', } def __init__(self, filename: str) -> None: super().__init__(filename) diff --git a/libmat2/office.py b/libmat2/office.py index 0791b07..fd3cdf4 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -4,17 +4,16 @@ import shutil import tempfile import datetime import zipfile -from typing import Dict, Set +from typing import Dict, Set, Pattern from . import abstract, parser_factory -assert Set # make pyflakes happy class ArchiveBasedAbstractParser(abstract.AbstractParser): - whitelist = set() # type: Set[str] + files_to_keep : Set[str] = set() + files_to_omit : Set[Pattern] = set() def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: - zipinfo.compress_type = zipfile.ZIP_DEFLATED zipinfo.create_system = 3 # Linux zipinfo.comment = b'' zipinfo.date_time = (1980, 1, 1, 0, 0, 0) @@ -34,33 +33,51 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): metadata['comment'] = zipinfo.comment # type: ignore if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): - metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time)) + metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time)) return metadata def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool: - output = '' zin.extract(member=item, path=temp_folder) - if item.filename not in self.whitelist: - full_path = os.path.join(temp_folder, item.filename) - tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore - if not tmp_parser: - zout.close() - os.remove(self.output_filename) - print("%s's format (%s) isn't supported" % (item.filename, mtype)) - return False - tmp_parser.remove_all() - output = tmp_parser.output_filename - else: - output = os.path.join(temp_folder, item.filename) + full_path = os.path.join(temp_folder, item.filename) + tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore + if not tmp_parser: + zout.close() + os.remove(self.output_filename) + print("%s's format (%s) isn't supported" % (item.filename, mtype)) + return False + tmp_parser.remove_all() + zinfo = zipfile.ZipInfo(item.filename) # type: ignore clean_zinfo = self._clean_zipinfo(zinfo) - with open(output, 'rb') as f: + with open(tmp_parser.output_filename, 'rb') as f: zout.writestr(clean_zinfo, f.read()) return True + def remove_all(self) -> bool: + zin = zipfile.ZipFile(self.filename, 'r') + zout = zipfile.ZipFile(self.output_filename, 'w') + temp_folder = tempfile.mkdtemp() + + for item in zin.infolist(): + if item.filename[-1] == '/': # `is_dir` is added in Python3.6 + continue # don't keep empty folders + elif item.filename in self.files_to_keep: + item = self._clean_zipinfo(item) + zout.writestr(item, zin.read(item)) + continue + elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): + continue + elif not self._clean_internal_file(item, temp_folder, zin, zout): + return False + + shutil.rmtree(temp_folder) + zout.close() + zin.close() + return True + class MSOfficeParser(ArchiveBasedAbstractParser): mimetypes = { @@ -68,9 +85,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.openxmlformats-officedocument.presentationml.presentation' } - files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} + files_to_keep = { + '[Content_Types].xml', + '_rels/.rels', + 'word/_rels/document.xml.rels', + 'word/document.xml', + 'word/fontTable.xml', + 'word/settings.xml', + 'word/styles.xml', + } + files_to_omit = set(map(re.compile, { # type: ignore + '^docProps/', + })) - def get_meta(self): + def get_meta(self) -> Dict[str, str]: """ Yes, I know that parsing xml with regexp ain't pretty, be my guest and fix it if you want. @@ -88,38 +116,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser): pass if not metadata: # better safe than sorry metadata[item] = 'harmful content' - for key, value in self._get_zipinfo_meta(item).items(): metadata[key] = value zipin.close() return metadata - def remove_all(self): - zin = zipfile.ZipFile(self.filename, 'r') - zout = zipfile.ZipFile(self.output_filename, 'w') - temp_folder = tempfile.mkdtemp() - - for item in zin.infolist(): - if item.filename[-1] == '/': - continue # `is_dir` is added in Python3.6 - elif item.filename.startswith('docProps/'): - continue # don't keep metadata files - if item.filename in self.files_to_keep: - item = self._clean_zipinfo(item) - zout.writestr(item, zin.read(item)) - continue - - if self._clean_internal_file(item, temp_folder, zin, zout) is False: - return False - - shutil.rmtree(temp_folder) - zout.close() - zin.close() - return True - - - class LibreOfficeParser(ArchiveBasedAbstractParser): mimetypes = { 'application/vnd.oasis.opendocument.text', @@ -130,10 +132,20 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.oasis.opendocument.formula', 'application/vnd.oasis.opendocument.image', } - whitelist = {'mimetype', 'manifest.rdf'} + files_to_keep = { + 'META-INF/manifest.xml', + 'content.xml', + 'manifest.rdf', + 'mimetype', + 'settings.xml', + 'styles.xml', + } + files_to_omit = set(map(re.compile, { # type: ignore + '^meta\.xml$', + '^Configurations2/', + })) - - def get_meta(self): + def get_meta(self) -> Dict[str, str]: """ Yes, I know that parsing xml with regexp ain't pretty, be my guest and fix it if you want. @@ -156,21 +168,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): zipin.close() return metadata - def remove_all(self): - zin = zipfile.ZipFile(self.filename, 'r') - zout = zipfile.ZipFile(self.output_filename, 'w') - temp_folder = tempfile.mkdtemp() - - for item in zin.infolist(): - if item.filename[-1] == '/': - continue # `is_dir` is added in Python3.6 - elif item.filename == 'meta.xml': - continue # don't keep metadata files - - if self._clean_internal_file(item, temp_folder, zin, zout) is False: - return False - - shutil.rmtree(temp_folder) - zout.close() - zin.close() - return True