Add support for compressed tar files
This commit is contained in:
parent
82cc822a1d
commit
8e41b098d6
4 changed files with 149 additions and 3 deletions
|
@ -25,6 +25,11 @@ class AbstractParser(abc.ABC):
|
|||
|
||||
self.filename = filename
|
||||
fname, extension = os.path.splitext(filename)
|
||||
|
||||
# Special case for tar.gz, tar.bz2, … files
|
||||
if fname.endswith('.tar') and len(fname) > 4:
|
||||
fname, extension = fname[:-4], '.tar' + extension
|
||||
|
||||
self.output_filename = fname + '.cleaned' + extension
|
||||
self.lightweight_cleaning = False
|
||||
|
||||
|
|
|
@ -40,6 +40,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
|||
the Union[A, B] constraint, hence the weird `# type: ignore`
|
||||
annotations.
|
||||
"""
|
||||
# Tarfiles can optionally support compression
|
||||
# https://docs.python.org/3/library/tarfile.html#tarfile.open
|
||||
compression = ''
|
||||
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
self.archive_class = None # type: Optional[ArchiveClass]
|
||||
|
@ -134,7 +138,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
|||
# pylint: disable=too-many-branches
|
||||
|
||||
with self.archive_class(self.filename) as zin,\
|
||||
self.archive_class(self.output_filename, 'w') as zout:
|
||||
self.archive_class(self.output_filename, 'w' + self.compression) as zout:
|
||||
|
||||
temp_folder = tempfile.mkdtemp()
|
||||
abort = False
|
||||
|
@ -212,7 +216,11 @@ class TarParser(ArchiveBasedAbstractParser):
|
|||
mimetypes = {'application/x-tar'}
|
||||
def __init__(self, filename):
    """Configure the parser to open and write tar archives.

    `filename` is forwarded unchanged to the parent initialiser.
    """
    super().__init__(filename)
    # Yes, it's tarfile.TarFile.open and not tarfile.TarFile,
    # as stated in the documentation:
    # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile
    # This is required to support compressed archives.
    self.archive_class = tarfile.TarFile.open
    self.member_class = tarfile.TarInfo
|
||||
|
||||
def is_archive_valid(self):
|
||||
|
@ -259,6 +267,22 @@ class TarParser(ArchiveBasedAbstractParser):
|
|||
assert isinstance(member, tarfile.TarInfo) # please mypy
|
||||
return member.name
|
||||
|
||||
|
||||
class TarGzParser(TarParser):
    """Tar parser variant for gzip-compressed archives."""

    # Suffix appended to the 'w' mode when the cleaned archive is opened
    # for writing, giving the tarfile mode 'w:gz'.
    compression = ':gz'
    # Mimetype under which this parser is selected.
    mimetypes = {'application/x-tar+gz'}
|
||||
|
||||
|
||||
class TarBz2Parser(TarParser):
    """Tar parser variant for bzip2-compressed archives."""

    # Suffix appended to the 'w' mode when the cleaned archive is opened
    # for writing, giving the tarfile mode 'w:bz2'.
    compression = ':bz2'
    # Mimetype under which this parser is selected.
    mimetypes = {'application/x-tar+bz2'}
|
||||
|
||||
|
||||
class TarXzParser(TarParser):
    """Tar parser variant for xz (lzma)-compressed archives."""

    # Suffix appended to the 'w' mode when the cleaned archive is opened
    # for writing, giving the tarfile mode 'w:xz'.
    compression = ':xz'
    # Mimetype under which this parser is selected.
    mimetypes = {'application/x-tar+xz'}
|
||||
|
||||
|
||||
class ZipParser(ArchiveBasedAbstractParser):
|
||||
mimetypes = {'application/zip'}
|
||||
def __init__(self, filename):
|
||||
|
|
|
@ -50,6 +50,10 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
|
|||
if extension.lower() in UNSUPPORTED_EXTENSIONS:
|
||||
return None, mtype
|
||||
|
||||
if mtype == 'application/x-tar':
|
||||
if extension[1:] in ('bz2', 'gz', 'xz'):
|
||||
mtype = mtype + '+' + extension[1:]
|
||||
|
||||
for parser_class in _get_parsers(): # type: ignore
|
||||
if mtype in parser_class.mimetypes:
|
||||
try:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue