1
0
mirror of synced 2024-11-22 09:14:23 +01:00

Add tar archive support

This commit is contained in:
jvoisin 2019-04-27 04:05:36 -07:00
parent 20ed5eb7d6
commit 82cc822a1d
5 changed files with 274 additions and 69 deletions

View File

@ -1,5 +1,7 @@
import abc
import zipfile import zipfile
import datetime import datetime
import tarfile
import tempfile import tempfile
import os import os
import logging import logging
@ -11,14 +13,37 @@ from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy # Make pyflakes happy
assert Set assert Set
assert Pattern assert Pattern
assert List
assert Union # pylint: disable=not-callable,assignment-from-no-return
# An ArchiveClass is a class representing an archive,
# while an ArchiveMember is a class representing an element
# (usually a file) of an archive.
ArchiveClass = Union[zipfile.ZipFile, tarfile.TarFile]
ArchiveMember = Union[zipfile.ZipInfo, tarfile.TarInfo]
class ArchiveBasedAbstractParser(abstract.AbstractParser): class ArchiveBasedAbstractParser(abstract.AbstractParser):
""" Office files (.docx, .odt, …) are zipped files. """ """Base class for all archive-based formats.
Welcome to a world of frustrating complexity and tediousness:
- A lot of file formats (docx, odt, epubs, …) are archive-based,
so we need to add callbacks everywhere to allow their respective
parsers to apply specific cleanup to the required files.
- Python has two different modules to deal with .tar and .zip files,
with similar-but-yet-o-so-different API, so we need to write
a ghetto-wrapper to avoid duplicating everything
- The combination of @staticmethod and @abstractstaticmethod is
required because for now, mypy doesn't know that
@abstractstaticmethod is, indeed, a static method.
- Mypy is too dumb (yet) to realise that a type A is valid under
the Union[A, B] constraint, hence the weird `# type: ignore`
annotations.
"""
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.archive_class = None # type: Optional[ArchiveClass]
self.member_class = None # type: Optional[ArchiveMember]
# Those are the files that have a format that _isn't_ # Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway. # supported by MAT2, but that we want to keep anyway.
@ -32,10 +57,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# the archive? # the archive?
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
try: # better fail here than later self.is_archive_valid()
zipfile.ZipFile(self.filename)
except zipfile.BadZipFile: def is_archive_valid(self):
raise ValueError """Raise a ValueError if the current archive isn't a valid one."""
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
""" This method can be used to apply specific treatment """ This method can be used to apply specific treatment
@ -50,59 +75,57 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
return {} # pragma: no cover return {} # pragma: no cover
@staticmethod @staticmethod
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: @abc.abstractstaticmethod
zipinfo.create_system = 3 # Linux def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
zipinfo.comment = b'' """Return all the members of the archive."""
zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
return zipinfo
@staticmethod @staticmethod
def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: @abc.abstractstaticmethod
metadata = {} def _clean_member(member: ArchiveMember) -> ArchiveMember:
if zipinfo.create_system == 3: # this is Linux """Remove all the metadata for a given member."""
pass
elif zipinfo.create_system == 2:
metadata['create_system'] = 'Windows'
else:
metadata['create_system'] = 'Weird'
if zipinfo.comment: @staticmethod
metadata['comment'] = zipinfo.comment # type: ignore @abc.abstractstaticmethod
def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
"""Return all the metadata of a given member."""
if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): @staticmethod
metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time)) @abc.abstractstaticmethod
def _get_member_name(member: ArchiveMember) -> str:
"""Return the name of the given member."""
return metadata @staticmethod
@abc.abstractstaticmethod
def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
full_path: str):
"""Add the file at full_path to the archive, via the given member."""
def get_meta(self) -> Dict[str, Union[str, dict]]:
    """Collect the metadata of every member of the archive.

    Each member is extracted into a temporary folder, then three sources
    are merged (later ones win on key collisions):
    the member's own archive-level metadata (`_get_member_meta`),
    the format-specific metadata (`_specific_get_meta`), and the metadata
    reported by the parser that `parser_factory` finds for the extracted
    file, if any.

    Returns:
        A dict mapping member names to their merged metadata. Members
        without any metadata are left out.
    """
    meta = dict()  # type: Dict[str, Union[str, dict]]

    with self.archive_class(self.filename) as zin:
        temp_folder = tempfile.mkdtemp()
        try:
            for item in self._get_all_members(zin):
                local_meta = self._get_member_meta(item)
                member_name = self._get_member_name(item)

                # `endswith` instead of `[-1]`, so that an empty member
                # name doesn't raise an IndexError.
                if member_name.endswith('/'):  # pragma: no cover
                    continue  # don't keep empty folders

                zin.extract(member=item, path=temp_folder)
                full_path = os.path.join(temp_folder, member_name)

                specific_meta = self._specific_get_meta(full_path, member_name)
                local_meta = {**local_meta, **specific_meta}

                member_parser, _ = parser_factory.get_parser(full_path)  # type: ignore
                if member_parser:
                    local_meta = {**local_meta, **member_parser.get_meta()}

                if local_meta:
                    meta[member_name] = local_meta
        finally:
            # Always remove the extracted files, even if a member failed
            # to extract or a nested parser raised.
            shutil.rmtree(temp_folder)
    return meta
@ -110,17 +133,19 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def remove_all(self) -> bool: def remove_all(self) -> bool:
# pylint: disable=too-many-branches # pylint: disable=too-many-branches
with zipfile.ZipFile(self.filename) as zin,\ with self.archive_class(self.filename) as zin,\
zipfile.ZipFile(self.output_filename, 'w') as zout: self.archive_class(self.output_filename, 'w') as zout:
temp_folder = tempfile.mkdtemp() temp_folder = tempfile.mkdtemp()
abort = False abort = False
items = list() # type: List[zipfile.ZipInfo] # Sort the items to process, to reduce fingerprinting,
for item in sorted(zin.infolist(), key=lambda z: z.filename): # and keep them in the `items` variable.
items = list() # type: List[ArchiveMember]
for item in sorted(self._get_all_members(zin), key=self._get_member_name):
# Some fileformats do require to have the `mimetype` file # Some fileformats do require to have the `mimetype` file
# as the first file in the archive. # as the first file in the archive.
if item.filename == 'mimetype': if self._get_member_name(item) == 'mimetype':
items = [item] + items items = [item] + items
else: else:
items.append(item) items.append(item)
@ -128,53 +153,53 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Since files order is a fingerprint factor, # Since files order is a fingerprint factor,
# we're iterating (and thus inserting) them in lexicographic order. # we're iterating (and thus inserting) them in lexicographic order.
for item in items: for item in items:
if item.filename[-1] == '/': # `is_dir` is added in Python3.6 member_name = self._get_member_name(item)
if member_name[-1] == '/': # `is_dir` is added in Python3.6
continue # don't keep empty folders continue # don't keep empty folders
zin.extract(member=item, path=temp_folder) zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, item.filename) full_path = os.path.join(temp_folder, member_name)
if self._specific_cleanup(full_path) is False: if self._specific_cleanup(full_path) is False:
logging.warning("Something went wrong during deep cleaning of %s", logging.warning("Something went wrong during deep cleaning of %s",
item.filename) member_name)
abort = True abort = True
continue continue
if any(map(lambda r: r.search(item.filename), self.files_to_keep)): if any(map(lambda r: r.search(member_name), self.files_to_keep)):
# those files aren't supported, but we want to add them anyway # those files aren't supported, but we want to add them anyway
pass pass
elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): elif any(map(lambda r: r.search(member_name), self.files_to_omit)):
continue continue
else: # supported files that we want to first clean, then add else: # supported files that we want to first clean, then add
tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore member_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
if not tmp_parser: if not member_parser:
if self.unknown_member_policy == UnknownMemberPolicy.OMIT: if self.unknown_member_policy == UnknownMemberPolicy.OMIT:
logging.warning("In file %s, omitting unknown element %s (format: %s)", logging.warning("In file %s, omitting unknown element %s (format: %s)",
self.filename, item.filename, mtype) self.filename, member_name, mtype)
continue continue
elif self.unknown_member_policy == UnknownMemberPolicy.KEEP: elif self.unknown_member_policy == UnknownMemberPolicy.KEEP:
logging.warning("In file %s, keeping unknown element %s (format: %s)", logging.warning("In file %s, keeping unknown element %s (format: %s)",
self.filename, item.filename, mtype) self.filename, member_name, mtype)
else: else:
logging.error("In file %s, element %s's format (%s) " \ logging.error("In file %s, element %s's format (%s) " \
"isn't supported", "isn't supported",
self.filename, item.filename, mtype) self.filename, member_name, mtype)
abort = True abort = True
continue continue
if tmp_parser: else:
if tmp_parser.remove_all() is False: if member_parser.remove_all() is False:
logging.warning("In file %s, something went wrong \ logging.warning("In file %s, something went wrong \
with the cleaning of %s \ with the cleaning of %s \
(format: %s)", (format: %s)",
self.filename, item.filename, mtype) self.filename, member_name, mtype)
abort = True abort = True
continue continue
os.rename(tmp_parser.output_filename, full_path) os.rename(member_parser.output_filename, full_path)
zinfo = zipfile.ZipInfo(item.filename) # type: ignore zinfo = self.member_class(member_name) # type: ignore
clean_zinfo = self._clean_zipinfo(zinfo) clean_zinfo = self._clean_member(zinfo)
with open(full_path, 'rb') as f: self._add_file_to_archive(zout, clean_zinfo, full_path)
zout.writestr(clean_zinfo, f.read())
shutil.rmtree(temp_folder) shutil.rmtree(temp_folder)
if abort: if abort:
@ -183,6 +208,111 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
return True return True
class TarParser(ArchiveBasedAbstractParser):
    """Parser for tar archives, implemented on top of the `tarfile` module."""
    mimetypes = {'application/x-tar'}

    def __init__(self, filename):
        super().__init__(filename)
        # Concrete archive/member types used to open the archive and to
        # create its members.
        self.archive_class = tarfile.TarFile
        self.member_class = tarfile.TarInfo

    def is_archive_valid(self):
        """Raise a ValueError when `tarfile` doesn't recognise the file."""
        if not tarfile.is_tarfile(self.filename):
            raise ValueError

    @staticmethod
    def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
        """Return every member of the given tar archive."""
        assert isinstance(archive, tarfile.TarFile)  # please mypy
        return archive.getmembers()  # type: ignore

    @staticmethod
    def _get_member_name(member: ArchiveMember) -> str:
        """Return the name of the given tar member."""
        assert isinstance(member, tarfile.TarInfo)  # please mypy
        return member.name

    @staticmethod
    def _clean_member(member: ArchiveMember) -> ArchiveMember:
        """Blank the timestamp and ownership fields of a tar member."""
        assert isinstance(member, tarfile.TarInfo)  # please mypy
        member.mtime = 0
        member.uid = 0
        member.gid = 0
        member.uname = ''
        member.gname = ''
        return member

    @staticmethod
    def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
        """Return the fields of a tar member that differ from their
        cleaned value (0 for mtime/uid/gid, '' for uname/gname)."""
        assert isinstance(member, tarfile.TarInfo)  # please mypy
        metadata = {}
        # Numeric fields are reported as strings.
        for field in ('mtime', 'uid', 'gid'):
            value = getattr(member, field)
            if value != 0:
                metadata[field] = str(value)
        # Name fields are reported untouched.
        for field in ('uname', 'gname'):
            value = getattr(member, field)
            if value != '':
                metadata[field] = value
        return metadata

    @staticmethod
    def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
                             full_path: str):
        """Add the file at full_path to the archive under the member's name,
        cleaning the resulting entry through `_clean_member`."""
        assert isinstance(member, tarfile.TarInfo)  # please mypy
        assert isinstance(archive, tarfile.TarFile)  # please mypy
        archive.add(full_path, arcname=member.name,
                    filter=TarParser._clean_member)  # type: ignore
class ZipParser(ArchiveBasedAbstractParser):
    """Parser for zip archives, implemented on top of the `zipfile` module."""
    mimetypes = {'application/zip'}

    def __init__(self, filename):
        super().__init__(filename)
        # Concrete archive/member types used to open the archive and to
        # create its members.
        self.archive_class = zipfile.ZipFile
        self.member_class = zipfile.ZipInfo

    def is_archive_valid(self):
        """Raise a ValueError if the file can't be opened as a zip archive."""
        try:
            # Open inside a `with` so the file handle is closed right away
            # instead of lingering until garbage collection.
            with zipfile.ZipFile(self.filename):
                pass
        except zipfile.BadZipFile as e:
            raise ValueError from e

    @staticmethod
    def _clean_member(member: ArchiveMember) -> ArchiveMember:
        """Reset the creating system, comment and date of a zip member."""
        assert isinstance(member, zipfile.ZipInfo)  # please mypy
        member.create_system = 3  # Linux
        member.comment = b''
        member.date_time = (1980, 1, 1, 0, 0, 0)  # this is as early as a zipfile can be
        return member

    @staticmethod
    def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
        """Return the fields of a zip member that differ from their
        cleaned value (see `_clean_member`)."""
        assert isinstance(member, zipfile.ZipInfo)  # please mypy
        metadata = {}
        if member.create_system == 3:  # this is Linux
            pass
        elif member.create_system == 2:
            metadata['create_system'] = 'Windows'
        else:
            metadata['create_system'] = 'Weird'

        if member.comment:
            metadata['comment'] = member.comment  # type: ignore

        if member.date_time != (1980, 1, 1, 0, 0, 0):
            metadata['date_time'] = str(datetime.datetime(*member.date_time))

        return metadata

    @staticmethod
    def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
                             full_path: str):
        """Write the content of the file at full_path into the archive,
        using the given member as its entry."""
        assert isinstance(archive, zipfile.ZipFile)  # please mypy
        assert isinstance(member, zipfile.ZipInfo)  # please mypy
        with open(full_path, 'rb') as f:
            archive.writestr(member, f.read())

    @staticmethod
    def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
        """Return every member of the given zip archive."""
        assert isinstance(archive, zipfile.ZipFile)  # please mypy
        return archive.infolist()  # type: ignore

    @staticmethod
    def _get_member_name(member: ArchiveMember) -> str:
        """Return the filename of the given zip member."""
        assert isinstance(member, zipfile.ZipInfo)  # please mypy
        return member.filename

View File

@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET # type: ignore
from . import archive, office from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser): class EPUBParser(archive.ZipParser):
mimetypes = {'application/epub+zip', } mimetypes = {'application/epub+zip', }
metadata_namespace = '{http://purl.org/dc/elements/1.1/}' metadata_namespace = '{http://purl.org/dc/elements/1.1/}'

View File

@ -6,7 +6,7 @@ from typing import Dict, Set, Pattern, Tuple, Any
import xml.etree.ElementTree as ET # type: ignore import xml.etree.ElementTree as ET # type: ignore
from .archive import ArchiveBasedAbstractParser from .archive import ZipParser
# pylint: disable=line-too-long # pylint: disable=line-too-long
@ -43,7 +43,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
return True return True
class MSOfficeParser(ArchiveBasedAbstractParser): class MSOfficeParser(ZipParser):
mimetypes = { mimetypes = {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
@ -312,7 +312,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return {file_path: 'harmful content', } return {file_path: 'harmful content', }
class LibreOfficeParser(ArchiveBasedAbstractParser): class LibreOfficeParser(ZipParser):
mimetypes = { mimetypes = {
'application/vnd.oasis.opendocument.text', 'application/vnd.oasis.opendocument.text',
'application/vnd.oasis.opendocument.spreadsheet', 'application/vnd.oasis.opendocument.spreadsheet',

View File

@ -1,13 +1,15 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import unittest import unittest
import time
import shutil import shutil
import os import os
import logging import logging
import zipfile import zipfile
import tarfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video, web from libmat2 import harmless, video, web, archive
# No need to logging messages, should something go wrong, # No need to logging messages, should something go wrong,
# the testsuite _will_ fail. # the testsuite _will_ fail.
@ -278,7 +280,6 @@ class TestCorruptedFiles(unittest.TestCase):
p.remove_all() p.remove_all()
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
def test_epub(self): def test_epub(self):
with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
@ -291,3 +292,27 @@ class TestCorruptedFiles(unittest.TestCase):
self.assertFalse(p.remove_all()) self.assertFalse(p.remove_all())
os.remove('./tests/data/clean.epub') os.remove('./tests/data/clean.epub')
def test_tar(self):
    """A tar with an embedded corrupted docx must fail to clean, and a
    non-tar file must be rejected by TarParser."""
    tar_path = './tests/data/clean.tar'
    fixtures = (
        './tests/data/dirty.flac',
        './tests/data/dirty.docx',
        './tests/data/dirty.jpg',
        './tests/data/embedded_corrupted.docx',
    )

    with tarfile.TarFile(tar_path, 'w') as zout:
        for fixture in fixtures:
            zout.add(fixture)
        # Hand-built member carrying a timestamp and ownership information.
        png_info = tarfile.TarInfo(name='./tests/data/dirty.png')
        png_info.mtime = time.time()
        png_info.uid = 1337
        png_info.gid = 1338
        with open('./tests/data/dirty.png', 'rb') as f:
            zout.addfile(png_info, f)

    p, mimetype = parser_factory.get_parser(tar_path)
    self.assertEqual(mimetype, 'application/x-tar')

    meta = p.get_meta()
    flac_meta = meta['./tests/data/dirty.flac']
    self.assertEqual(flac_meta['comments'], 'Thank you for using MAT !')
    docx_meta = meta['./tests/data/dirty.docx']
    self.assertEqual(docx_meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    self.assertFalse(p.remove_all())
    os.remove(tar_path)

    # A png renamed to .tar isn't a valid archive.
    shutil.copy('./tests/data/dirty.png', tar_path)
    with self.assertRaises(ValueError):
        archive.TarParser(tar_path)
    os.remove(tar_path)

View File

@ -4,6 +4,8 @@ import unittest
import shutil import shutil
import os import os
import re import re
import tarfile
import tempfile
import zipfile import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
@ -195,6 +197,19 @@ class TestGetMeta(unittest.TestCase):
self.assertEqual(meta['version'], '1.0') self.assertEqual(meta['version'], '1.0')
self.assertEqual(meta['harmful data'], 'underline is cool') self.assertEqual(meta['harmful data'], 'underline is cool')
def test_tar(self):
    """Metadata of files stored in a tar must be reported per member."""
    tar_path = './tests/data/dirty.tar'
    fixtures = (
        './tests/data/dirty.flac',
        './tests/data/dirty.docx',
        './tests/data/dirty.jpg',
    )

    with tarfile.TarFile(tar_path, 'w') as tout:
        for fixture in fixtures:
            tout.add(fixture)

    p, mimetype = parser_factory.get_parser(tar_path)
    self.assertEqual(mimetype, 'application/x-tar')

    meta = p.get_meta()
    flac_meta = meta['./tests/data/dirty.flac']
    self.assertEqual(flac_meta['comments'], 'Thank you for using MAT !')
    docx_meta = meta['./tests/data/dirty.docx']
    self.assertEqual(docx_meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    os.remove(tar_path)
class TestRemovingThumbnails(unittest.TestCase): class TestRemovingThumbnails(unittest.TestCase):
def test_odt(self): def test_odt(self):
shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
@ -702,3 +717,38 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.css') os.remove('./tests/data/clean.css')
os.remove('./tests/data/clean.cleaned.css') os.remove('./tests/data/clean.cleaned.css')
os.remove('./tests/data/clean.cleaned.cleaned.css') os.remove('./tests/data/clean.cleaned.cleaned.css')
def test_tar(self):
    """Cleaning a tar must yield an archive whose members, and the
    archive itself, no longer expose any metadata."""
    with tarfile.TarFile('./tests/data/dirty.tar', 'w') as zout:
        zout.add('./tests/data/dirty.flac')
        zout.add('./tests/data/dirty.docx')
        zout.add('./tests/data/dirty.jpg')
    p = archive.TarParser('./tests/data/dirty.tar')
    meta = p.get_meta()
    self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = archive.TarParser('./tests/data/dirty.cleaned.tar')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    tmp_dir = tempfile.mkdtemp()
    try:
        # The `with` block closes the archive; no explicit close() needed.
        with tarfile.open('./tests/data/dirty.cleaned.tar') as zout:
            zout.extractall(path=tmp_dir)

        # Every extracted file must be supported and free of metadata.
        number_of_files = 0
        for root, _, fnames in os.walk(tmp_dir):
            for f in fnames:
                complete_path = os.path.join(root, f)
                member_parser, _ = parser_factory.get_parser(complete_path)
                self.assertIsNotNone(member_parser)
                self.assertEqual(member_parser.get_meta(), {})
                number_of_files += 1
        self.assertEqual(number_of_files, 3)
    finally:
        # Don't leave the extracted files behind, even on failure.
        shutil.rmtree(tmp_dir)

    os.remove('./tests/data/dirty.tar')
    os.remove('./tests/data/dirty.cleaned.tar')
    os.remove('./tests/data/dirty.cleaned.cleaned.tar')