1
0
mirror of synced 2024-11-22 09:14:23 +01:00

Improve the reliability of mat2's CLI

- Replace some class attributes with instance attributes
- Stop running the cleaning process in parallel (multiprocessing pool) for now; files are cleaned sequentially
This commit is contained in:
jvoisin 2018-10-03 15:22:36 +02:00
parent c67bbafb2c
commit 1b356b8c6f
3 changed files with 59 additions and 53 deletions

View File

@ -15,20 +15,21 @@ assert Pattern
class ArchiveBasedAbstractParser(abstract.AbstractParser): class ArchiveBasedAbstractParser(abstract.AbstractParser):
""" Office files (.docx, .odt, …) are zipped files. """ """ Office files (.docx, .odt, …) are zipped files. """
def __init__(self, filename):
super().__init__(filename)
# Those are the files that have a format that _isn't_ # Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway. # supported by MAT2, but that we want to keep anyway.
files_to_keep = set() # type: Set[Pattern] self.files_to_keep = set() # type: Set[Pattern]
# Those are the files that we _do not_ want to keep, # Those are the files that we _do not_ want to keep,
# no matter if they are supported or not. # no matter if they are supported or not.
files_to_omit = set() # type: Set[Pattern] self.files_to_omit = set() # type: Set[Pattern]
# what should the parser do if it encounters an unknown file in # what should the parser do if it encounters an unknown file in
# the archive? # the archive?
unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
def __init__(self, filename):
super().__init__(filename)
try: # better fail here than later try: # better fail here than later
zipfile.ZipFile(self.filename) zipfile.ZipFile(self.filename)
except zipfile.BadZipFile: except zipfile.BadZipFile:

View File

@ -67,7 +67,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
# See https://0xacab.org/jvoisin/mat2/issues/71 # See https://0xacab.org/jvoisin/mat2/issues/71
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
} }
files_to_keep = set(map(re.compile, { # type: ignore
def __init__(self, filename):
super().__init__(filename)
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^\[Content_Types\]\.xml$', r'^\[Content_Types\]\.xml$',
r'^_rels/\.rels$', r'^_rels/\.rels$',
r'^word/_rels/document\.xml\.rels$', r'^word/_rels/document\.xml\.rels$',
@ -77,7 +82,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
r'^word/stylesWithEffects\.xml$', r'^word/stylesWithEffects\.xml$',
})) }))
files_to_omit = set(map(re.compile, { # type: ignore self.files_to_omit = set(map(re.compile, { # type: ignore
r'^customXml/', r'^customXml/',
r'webSettings\.xml$', r'webSettings\.xml$',
r'^docProps/custom\.xml$', r'^docProps/custom\.xml$',
@ -89,8 +94,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
r'^word/_rels/', r'^word/_rels/',
})) }))
def __init__(self, filename):
super().__init__(filename)
if self.__fill_files_to_keep_via_content_types() is False: if self.__fill_files_to_keep_via_content_types() is False:
raise ValueError raise ValueError
@ -320,7 +323,12 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
'application/vnd.oasis.opendocument.formula', 'application/vnd.oasis.opendocument.formula',
'application/vnd.oasis.opendocument.image', 'application/vnd.oasis.opendocument.image',
} }
files_to_keep = set(map(re.compile, { # type: ignore
def __init__(self, filename):
super().__init__(filename)
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^META-INF/manifest\.xml$', r'^META-INF/manifest\.xml$',
r'^content\.xml$', r'^content\.xml$',
r'^manifest\.rdf$', r'^manifest\.rdf$',
@ -328,7 +336,7 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
r'^settings\.xml$', r'^settings\.xml$',
r'^styles\.xml$', r'^styles\.xml$',
})) }))
files_to_omit = set(map(re.compile, { # type: ignore self.files_to_omit = set(map(re.compile, { # type: ignore
r'^meta\.xml$', r'^meta\.xml$',
r'^Configurations2/', r'^Configurations2/',
r'^Thumbnails/', r'^Thumbnails/',

13
mat2
View File

@ -3,10 +3,8 @@
import os import os
from typing import Tuple from typing import Tuple
import sys import sys
import itertools
import mimetypes import mimetypes
import argparse import argparse
import multiprocessing
import logging import logging
try: try:
@ -142,13 +140,12 @@ def main():
if unknown_member_policy == UnknownMemberPolicy.KEEP: if unknown_member_policy == UnknownMemberPolicy.KEEP:
logging.warning('Keeping unknown member files may leak metadata in the resulting file!') logging.warning('Keeping unknown member files may leak metadata in the resulting file!')
rep_mode = itertools.repeat(args.lightweight is True) success = True
rep_policy = itertools.repeat(unknown_member_policy) for f in __get_files_recursively(args.files):
l = zip(__get_files_recursively(args.files), rep_mode, rep_policy) if clean_meta([f, args.lightweight, unknown_member_policy]) is False:
success = False
return success
p = multiprocessing.Pool()
ret = list(p.imap_unordered(clean_meta, list(l)))
return 0 if all(ret) else -1
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())