
MAT2 is now cleaning revisions from odt files!

jvoisin 2018-06-27 23:10:53 +02:00
parent 80fc4ffb40
commit 02f7605ac1
3 changed files with 85 additions and 16 deletions
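For background: an ODT document is a zip archive, and LibreOffice stores tracked changes ("revisions") inside its content.xml as text:tracked-changes elements. A quick way to check whether a document still carries them, mirroring the check used by the new test in this commit, is a minimal sketch like the following (the sample.odt path is a placeholder):

import zipfile

# Sketch: look for leftover tracked changes in an ODT (path is a placeholder).
with zipfile.ZipFile('sample.odt') as z:
    content = z.read('content.xml')

print(b'tracked-changes' in content)  # True means revisions are still embedded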

View File

@@ -6,10 +6,10 @@ stages:

 bandit:
   stage: linting
-  script:
+  script:  # TODO: remove B405 and B314
   - apt-get -qqy update
   - apt-get -qqy install --no-install-recommends python3-bandit
-  - bandit -r ./libmat2 --format txt --skip B101,B404,B603
+  - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314

 pyflakes:
   stage: linting
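The two newly skipped checks are bandit's xml.etree blacklist entries: B405 fires on the import this commit adds, B314 on the ElementTree parsing calls. A minimal sketch (not part of the commit) of what would otherwise be flagged:

import xml.etree.ElementTree as ET  # bandit B405: blacklisted xml.etree import

root = ET.fromstring('<doc/>')       # bandit B314: parsing XML with xml.etree
print(root.tag)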

View File

@@ -4,8 +4,10 @@ import shutil
 import tempfile
 import datetime
 import zipfile
+import xml.etree.ElementTree as ET
+
 from typing import Dict, Set, Pattern

 from . import abstract, parser_factory

 # Make pyflakes happy
@ -13,7 +15,12 @@ assert Set
assert Pattern assert Pattern
class ArchiveBasedAbstractParser(abstract.AbstractParser): class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway.
files_to_keep = set() # type: Set[str] files_to_keep = set() # type: Set[str]
# Those are the files that we _do not_ want to keep,
# no matter if they are supported or not.
files_to_omit = set() # type: Set[Pattern] files_to_omit = set() # type: Set[Pattern]
def __init__(self, filename): def __init__(self, filename):
@@ -23,6 +30,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
         except zipfile.BadZipFile:
             raise ValueError

+    def _specific_cleanup(self, full_path:str) -> bool:
+        """ This method can be used to apply specific treatment
+        to files present in the archive."""
+        return True
+
     def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
         zipinfo.create_system = 3  # Linux
         zipinfo.comment = b''
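The new _specific_cleanup hook is a no-op in the base class; an archive-based parser can override it to rewrite an extracted file in place before it is re-added to the output archive, as LibreOfficeParser does further down. A hypothetical subclass, purely for illustration (the class name and the notes.txt handling are invented, and the libmat2.office import path is an assumption based on the test file's office.LibreOfficeParser reference):

import os
from libmat2.office import ArchiveBasedAbstractParser  # assumed module path

class NotesBlankingParser(ArchiveBasedAbstractParser):  # hypothetical example
    def _specific_cleanup(self, full_path: str) -> bool:
        # Blank out a specific member before it is re-added to the archive.
        if os.path.basename(full_path) == 'notes.txt':
            open(full_path, 'w').close()
        return True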
@@ -56,26 +68,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
             for item in zin.infolist():
                 if item.filename[-1] == '/':  # `is_dir` is added in Python3.6
                     continue  # don't keep empty folders
-                elif item.filename in self.files_to_keep:
-                    item = self._clean_zipinfo(item)
-                    zout.writestr(item, zin.read(item))
-                    continue
-                elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
-                    continue

                 zin.extract(member=item, path=temp_folder)
                 full_path = os.path.join(temp_folder, item.filename)
-                tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
-                if not tmp_parser:
-                    shutil.rmtree(temp_folder)
-                    os.remove(self.output_filename)
-                    print("%s's format (%s) isn't supported" % (item.filename, mtype))
-                    return False
-                tmp_parser.remove_all()
+
+                self._specific_cleanup(full_path)
+
+                if item.filename in self.files_to_keep:
+                    # those files aren't supported, but we want to add them anyway
+                    pass
+                elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
+                    continue
+                else:
+                    # supported files that we want to clean then add
+                    tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
+                    if not tmp_parser:
+                        shutil.rmtree(temp_folder)
+                        os.remove(self.output_filename)
+                        print("%s's format (%s) isn't supported" % (item.filename, mtype))
+                        return False
+                    tmp_parser.remove_all()
+                    os.rename(tmp_parser.output_filename, full_path)

                 zinfo = zipfile.ZipInfo(item.filename)  # type: ignore
                 clean_zinfo = self._clean_zipinfo(zinfo)
-                with open(tmp_parser.output_filename, 'rb') as f:
+                with open(full_path, 'rb') as f:
                     zout.writestr(clean_zinfo, f.read())

             shutil.rmtree(temp_folder)
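For reference, files_to_omit holds compiled regexes (the LibreOfficeParser hunk below shows '^Thumbnails/' among them), and the any(map(...)) test above simply drops any member whose name matches one of them. A small standalone illustration:

import re

# Sketch of the files_to_omit matching logic, using a pattern visible below.
files_to_omit = set(map(re.compile, {'^Thumbnails/'}))

for name in ('Thumbnails/thumbnail.png', 'content.xml'):
    omitted = any(r.search(name) for r in files_to_omit)
    print(name, '->', 'omitted' if omitted else 'kept')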
@@ -149,6 +166,37 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         r'^Thumbnails/',
     }))

+    def __remove_revisions(self, full_path:str) -> bool:
+        def parse_map(f):  # etree support for ns is a bit rough
+            ns_map = dict()
+            for event, (k, v) in ET.iterparse(f, ("start-ns", )):
+                if event == "start-ns":
+                    ns_map[k] = v
+            return ns_map
+
+        ns = parse_map(full_path)
+
+        if 'office' not in ns.keys():  # no revisions in the current file
+            return True
+
+        # Register the namespaces
+        for k,v in ns.items():
+            ET.register_namespace(k, v)
+
+        tree = ET.parse(full_path)
+        for text in tree.getroot().iterfind('.//office:text', ns):
+            for changes in text.iterfind('.//text:tracked-changes', ns):
+                text.remove(changes)
+
+        tree.write(full_path, xml_declaration = True)
+        return True
+
+    def _specific_cleanup(self, full_path:str) -> bool:
+        if os.path.basename(full_path) == 'content.xml':
+            return self.__remove_revisions(full_path)
+        return True
+
     def get_meta(self) -> Dict[str, str]:
         """
         Yes, I know that parsing xml with regexp ain't pretty,
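The namespace handling above exists because xml.etree resolves prefixes like office: and text: only when given a prefix-to-URI map, and register_namespace is what preserves the original prefixes when the tree is written back. A self-contained sketch of the same removal on an inline, stripped-down content.xml (the document string uses the standard ODF namespace URIs; it is not taken from this commit):

import xml.etree.ElementTree as ET

# Minimal stand-in for an ODT content.xml carrying tracked changes.
SAMPLE = (
    '<office:document-content'
    ' xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"'
    ' xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">'
    '<office:body><office:text>'
    '<text:tracked-changes><text:changed-region/></text:tracked-changes>'
    '<text:p>Hello</text:p>'
    '</office:text></office:body>'
    '</office:document-content>'
)

ns = {
    'office': 'urn:oasis:names:tc:opendocument:xmlns:office:1.0',
    'text': 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',
}

root = ET.fromstring(SAMPLE)
for text_elem in root.iterfind('.//office:text', ns):
    # Materialise the matches before removing, to avoid mutating while iterating.
    for changes in list(text_elem.iterfind('.//text:tracked-changes', ns)):
        text_elem.remove(changes)

print(b'tracked-changes' in ET.tostring(root))  # False once revisions are gone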

View File

@@ -122,6 +122,27 @@ class TestRemovingThumbnails(unittest.TestCase):
         os.remove('./tests/data/clean.cleaned.odt')


+class TestRevisionsCleaning(unittest.TestCase):
+    def test_libreoffice(self):
+        with zipfile.ZipFile('./tests/data/revision.odt') as zipin:
+            c = zipin.open('content.xml')
+            r = c.read()
+            self.assertIn(b'tracked-changes', r)
+
+        shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
+        p = office.LibreOfficeParser('./tests/data/clean.odt')
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile('./tests/data/clean.cleaned.odt') as zipin:
+            c = zipin.open('content.xml')
+            r = c.read()
+            self.assertNotIn(b'tracked-changes', r)
+
+        os.remove('./tests/data/clean.odt')
+        os.remove('./tests/data/clean.cleaned.odt')
+
+
 class TestDeepCleaning(unittest.TestCase):
     def __check_deep_meta(self, p):
         tempdir = tempfile.mkdtemp()