Massively simplify how we're cleaning office files
This commit is contained in:
parent
f44769df41
commit
177184ac67
@ -47,45 +47,38 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
|||||||
|
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
|
|
||||||
zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool:
|
|
||||||
zin.extract(member=item, path=temp_folder)
|
|
||||||
full_path = os.path.join(temp_folder, item.filename)
|
|
||||||
tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
|
|
||||||
if not tmp_parser:
|
|
||||||
zout.close()
|
|
||||||
os.remove(self.output_filename)
|
|
||||||
print("%s's format (%s) isn't supported" % (item.filename, mtype))
|
|
||||||
return False
|
|
||||||
tmp_parser.remove_all()
|
|
||||||
|
|
||||||
zinfo = zipfile.ZipInfo(item.filename) # type: ignore
|
|
||||||
clean_zinfo = self._clean_zipinfo(zinfo)
|
|
||||||
with open(tmp_parser.output_filename, 'rb') as f:
|
|
||||||
zout.writestr(clean_zinfo, f.read())
|
|
||||||
return True
|
|
||||||
|
|
||||||
def remove_all(self) -> bool:
|
def remove_all(self) -> bool:
|
||||||
zin = zipfile.ZipFile(self.filename, 'r')
|
with zipfile.ZipFile(self.filename) as zin,\
|
||||||
zout = zipfile.ZipFile(self.output_filename, 'w')
|
zipfile.ZipFile(self.output_filename, 'w') as zout:
|
||||||
temp_folder = tempfile.mkdtemp()
|
|
||||||
|
|
||||||
for item in zin.infolist():
|
temp_folder = tempfile.mkdtemp()
|
||||||
if item.filename[-1] == '/': # `is_dir` is added in Python3.6
|
|
||||||
continue # don't keep empty folders
|
for item in zin.infolist():
|
||||||
elif item.filename in self.files_to_keep:
|
if item.filename[-1] == '/': # `is_dir` is added in Python3.6
|
||||||
item = self._clean_zipinfo(item)
|
continue # don't keep empty folders
|
||||||
zout.writestr(item, zin.read(item))
|
elif item.filename in self.files_to_keep:
|
||||||
continue
|
item = self._clean_zipinfo(item)
|
||||||
elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
|
zout.writestr(item, zin.read(item))
|
||||||
continue
|
continue
|
||||||
elif not self._clean_internal_file(item, temp_folder, zin, zout):
|
elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
|
||||||
return False
|
continue
|
||||||
|
|
||||||
|
zin.extract(member=item, path=temp_folder)
|
||||||
|
full_path = os.path.join(temp_folder, item.filename)
|
||||||
|
tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
|
||||||
|
if not tmp_parser:
|
||||||
|
shutil.rmtree(temp_folder)
|
||||||
|
os.remove(self.output_filename)
|
||||||
|
print("%s's format (%s) isn't supported" % (item.filename, mtype))
|
||||||
|
return False
|
||||||
|
tmp_parser.remove_all()
|
||||||
|
|
||||||
|
zinfo = zipfile.ZipInfo(item.filename) # type: ignore
|
||||||
|
clean_zinfo = self._clean_zipinfo(zinfo)
|
||||||
|
with open(tmp_parser.output_filename, 'rb') as f:
|
||||||
|
zout.writestr(clean_zinfo, f.read())
|
||||||
|
|
||||||
shutil.rmtree(temp_folder)
|
shutil.rmtree(temp_folder)
|
||||||
zout.close()
|
|
||||||
zin.close()
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user