Improve mat2's CLI reliability
- Replace some class members with instance members
- Don't parallelize the cleaning process for now (the multiprocessing pool is removed and files are cleaned sequentially)
This commit is contained in:
parent
c67bbafb2c
commit
1b356b8c6f
@ -15,20 +15,21 @@ assert Pattern
|
|||||||
|
|
||||||
class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||||
""" Office files (.docx, .odt, …) are zipped files. """
|
""" Office files (.docx, .odt, …) are zipped files. """
|
||||||
# Those are the files that have a format that _isn't_
|
|
||||||
# supported by MAT2, but that we want to keep anyway.
|
|
||||||
files_to_keep = set() # type: Set[Pattern]
|
|
||||||
|
|
||||||
# Those are the files that we _do not_ want to keep,
|
|
||||||
# no matter if they are supported or not.
|
|
||||||
files_to_omit = set() # type: Set[Pattern]
|
|
||||||
|
|
||||||
# what should the parser do if it encounters an unknown file in
|
|
||||||
# the archive?
|
|
||||||
unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
|
|
||||||
|
|
||||||
def __init__(self, filename):
|
def __init__(self, filename):
|
||||||
super().__init__(filename)
|
super().__init__(filename)
|
||||||
|
|
||||||
|
# Those are the files that have a format that _isn't_
|
||||||
|
# supported by MAT2, but that we want to keep anyway.
|
||||||
|
self.files_to_keep = set() # type: Set[Pattern]
|
||||||
|
|
||||||
|
# Those are the files that we _do not_ want to keep,
|
||||||
|
# no matter if they are supported or not.
|
||||||
|
self.files_to_omit = set() # type: Set[Pattern]
|
||||||
|
|
||||||
|
# what should the parser do if it encounters an unknown file in
|
||||||
|
# the archive?
|
||||||
|
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
|
||||||
|
|
||||||
try: # better fail here than later
|
try: # better fail here than later
|
||||||
zipfile.ZipFile(self.filename)
|
zipfile.ZipFile(self.filename)
|
||||||
except zipfile.BadZipFile:
|
except zipfile.BadZipFile:
|
||||||
|
@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
# See https://0xacab.org/jvoisin/mat2/issues/71
|
# See https://0xacab.org/jvoisin/mat2/issues/71
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
|
||||||
}
|
}
|
||||||
files_to_keep = set(map(re.compile, { # type: ignore
|
|
||||||
r'^\[Content_Types\]\.xml$',
|
|
||||||
r'^_rels/\.rels$',
|
|
||||||
r'^word/_rels/document\.xml\.rels$',
|
|
||||||
r'^word/_rels/footer[0-9]*\.xml\.rels$',
|
|
||||||
r'^word/_rels/header[0-9]*\.xml\.rels$',
|
|
||||||
|
|
||||||
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
|
|
||||||
r'^word/stylesWithEffects\.xml$',
|
|
||||||
}))
|
|
||||||
files_to_omit = set(map(re.compile, { # type: ignore
|
|
||||||
r'^customXml/',
|
|
||||||
r'webSettings\.xml$',
|
|
||||||
r'^docProps/custom\.xml$',
|
|
||||||
r'^word/printerSettings/',
|
|
||||||
r'^word/theme',
|
|
||||||
|
|
||||||
# we have a whitelist in self.files_to_keep,
|
|
||||||
# so we can trash everything else
|
|
||||||
r'^word/_rels/',
|
|
||||||
}))
|
|
||||||
|
|
||||||
def __init__(self, filename):
|
def __init__(self, filename):
|
||||||
super().__init__(filename)
|
super().__init__(filename)
|
||||||
|
|
||||||
|
self.files_to_keep = set(map(re.compile, { # type: ignore
|
||||||
|
r'^\[Content_Types\]\.xml$',
|
||||||
|
r'^_rels/\.rels$',
|
||||||
|
r'^word/_rels/document\.xml\.rels$',
|
||||||
|
r'^word/_rels/footer[0-9]*\.xml\.rels$',
|
||||||
|
r'^word/_rels/header[0-9]*\.xml\.rels$',
|
||||||
|
|
||||||
|
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
|
||||||
|
r'^word/stylesWithEffects\.xml$',
|
||||||
|
}))
|
||||||
|
self.files_to_omit = set(map(re.compile, { # type: ignore
|
||||||
|
r'^customXml/',
|
||||||
|
r'webSettings\.xml$',
|
||||||
|
r'^docProps/custom\.xml$',
|
||||||
|
r'^word/printerSettings/',
|
||||||
|
r'^word/theme',
|
||||||
|
|
||||||
|
# we have a whitelist in self.files_to_keep,
|
||||||
|
# so we can trash everything else
|
||||||
|
r'^word/_rels/',
|
||||||
|
}))
|
||||||
|
|
||||||
if self.__fill_files_to_keep_via_content_types() is False:
|
if self.__fill_files_to_keep_via_content_types() is False:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
'application/vnd.oasis.opendocument.formula',
|
'application/vnd.oasis.opendocument.formula',
|
||||||
'application/vnd.oasis.opendocument.image',
|
'application/vnd.oasis.opendocument.image',
|
||||||
}
|
}
|
||||||
files_to_keep = set(map(re.compile, { # type: ignore
|
|
||||||
r'^META-INF/manifest\.xml$',
|
|
||||||
r'^content\.xml$',
|
def __init__(self, filename):
|
||||||
r'^manifest\.rdf$',
|
super().__init__(filename)
|
||||||
r'^mimetype$',
|
|
||||||
r'^settings\.xml$',
|
self.files_to_keep = set(map(re.compile, { # type: ignore
|
||||||
r'^styles\.xml$',
|
r'^META-INF/manifest\.xml$',
|
||||||
}))
|
r'^content\.xml$',
|
||||||
files_to_omit = set(map(re.compile, { # type: ignore
|
r'^manifest\.rdf$',
|
||||||
r'^meta\.xml$',
|
r'^mimetype$',
|
||||||
r'^Configurations2/',
|
r'^settings\.xml$',
|
||||||
r'^Thumbnails/',
|
r'^styles\.xml$',
|
||||||
}))
|
}))
|
||||||
|
self.files_to_omit = set(map(re.compile, { # type: ignore
|
||||||
|
r'^meta\.xml$',
|
||||||
|
r'^Configurations2/',
|
||||||
|
r'^Thumbnails/',
|
||||||
|
}))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __remove_revisions(full_path: str) -> bool:
|
def __remove_revisions(full_path: str) -> bool:
|
||||||
|
13
mat2
13
mat2
@ -3,10 +3,8 @@
|
|||||||
import os
|
import os
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
import sys
|
import sys
|
||||||
import itertools
|
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import argparse
|
import argparse
|
||||||
import multiprocessing
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -142,13 +140,12 @@ def main():
|
|||||||
if unknown_member_policy == UnknownMemberPolicy.KEEP:
|
if unknown_member_policy == UnknownMemberPolicy.KEEP:
|
||||||
logging.warning('Keeping unknown member files may leak metadata in the resulting file!')
|
logging.warning('Keeping unknown member files may leak metadata in the resulting file!')
|
||||||
|
|
||||||
rep_mode = itertools.repeat(args.lightweight is True)
|
success = True
|
||||||
rep_policy = itertools.repeat(unknown_member_policy)
|
for f in __get_files_recursively(args.files):
|
||||||
l = zip(__get_files_recursively(args.files), rep_mode, rep_policy)
|
if clean_meta([f, args.lightweight, unknown_member_policy]) is False:
|
||||||
|
success = False
|
||||||
|
return success
|
||||||
|
|
||||||
p = multiprocessing.Pool()
|
|
||||||
ret = list(p.imap_unordered(clean_meta, list(l)))
|
|
||||||
return 0 if all(ret) else -1
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
Loading…
Reference in New Issue
Block a user