2018-04-01 12:06:50 +02:00
|
|
|
import abc
|
2018-04-30 23:46:37 +02:00
|
|
|
import os
|
2018-10-22 19:12:39 +02:00
|
|
|
import re
|
2018-10-12 11:58:01 +02:00
|
|
|
from typing import Set, Dict, Union
|
2018-04-01 12:06:50 +02:00
|
|
|
|
2018-06-04 20:43:28 +02:00
|
|
|
assert Set  # make pyflakes happy: Set is otherwise only used in type comments
|
|
|
|
|
2018-04-04 23:21:48 +02:00
|
|
|
|
2018-04-01 12:06:50 +02:00
|
|
|
class AbstractParser(abc.ABC):
    """ This is the base class of every parser.

    It might yield `ValueError` on instantiation on invalid files,
    and `RuntimeError` when something went wrong in `remove_all`.

    Concrete parsers must implement `get_meta` and `remove_all`.
    """
    meta_list = set()  # type: Set[str]
    mimetypes = set()  # type: Set[str]

    def __init__(self, filename: str) -> None:
        """
        Store the path of the file to process and derive the output path.

        Sets `self.filename`, `self.output_filename` (the input path with
        `.cleaned` inserted before its extension, e.g. `a.png` becomes
        `a.cleaned.png` and `a.tar.gz` becomes `a.cleaned.tar.gz`) and
        `self.lightweight_cleaning` (initially `False`).

        :param filename: Path of the file to process.
        :raises ValueError: Raised upon an invalid file
        """
        if re.search(r'^[a-z0-9./]', filename) is None:
            # Some parsers are calling external binaries,
            # this prevents shell command injections: any path that does
            # not start with a lowercase letter, a digit, `.` or `/` is
            # made explicitly relative to the current directory.
            filename = os.path.join('.', filename)

        self.filename = filename
        fname, extension = os.path.splitext(filename)

        # Special case for tar.gz, tar.bz2, … files: keep the compound
        # extension together. The check is done on the base name, so that
        # a file literally named `.tar.gz` is handled the same way
        # whatever directory prefix it is given with.
        stem = os.path.basename(fname)
        if stem.endswith('.tar') and len(stem) > 4:
            fname, extension = fname[:-4], '.tar' + extension

        self.output_filename = fname + '.cleaned' + extension
        self.lightweight_cleaning = False

    @abc.abstractmethod
    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """Return all the metadata of the current file"""

    @abc.abstractmethod
    def remove_all(self) -> bool:
        """
        Remove all the metadata of the current file

        :raises RuntimeError: Raised if the cleaning process went wrong.
        """
|