diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 264b710..20426f6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,6 +18,14 @@ pyflakes: - apt-get -qqy install --no-install-recommends pyflakes3 - pyflakes3 ./libmat2 +mypy: + stage: linting + script: + - apt-get -qqy update + - apt-get -qqy install --no-install-recommends python3-pip + - pip3 install mypy + - mypy mat2 libmat2/*.py --ignore-missing-imports + tests: stage: test script: diff --git a/libmat2/harmless.py b/libmat2/harmless.py index a63407f..d25603b 100644 --- a/libmat2/harmless.py +++ b/libmat2/harmless.py @@ -1,17 +1,18 @@ +from typing import Dict from . import abstract class HarmlessParser(abstract.AbstractParser): """ This is the parser for filetypes that do not contain metadata. """ - mimetypes = {'application/xml', 'text/plain'} + mimetypes = {'application/xml', 'text/plain', 'application/rdf+xml'} def __init__(self, filename: str) -> None: super().__init__(filename) self.filename = filename self.output_filename = filename - def get_meta(self): + def get_meta(self) -> Dict[str, str]: return dict() - def remove_all(self): + def remove_all(self) -> bool: return True diff --git a/libmat2/office.py b/libmat2/office.py index 749fc7d..90f7c7a 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -4,11 +4,15 @@ import shutil import tempfile import datetime import zipfile +from typing import Dict, Set from . import abstract, parser_factory +assert Set # make pyflakes happy class ArchiveBasedAbstractParser(abstract.AbstractParser): + whitelist = set() # type: Set[str] + def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: zipinfo.compress_type = zipfile.ZIP_DEFLATED zipinfo.create_system = 3 # Linux @@ -16,7 +20,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): zipinfo.date_time = (1980, 1, 1, 0, 0, 0) return zipinfo - def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> dict: + def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> Dict[str, str]: metadata = {} if zipinfo.create_system == 3: #metadata['create_system'] = 'Linux' @@ -27,25 +31,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): metadata['create_system'] = 'Weird' if zipinfo.comment: - metadata['comment'] = zipinfo.comment + metadata['comment'] = zipinfo.comment # type: ignore if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): - metadata['date_time'] = datetime.datetime(*zipinfo.date_time) + metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time)) return metadata def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str, zin: zipfile.ZipFile, zout: zipfile.ZipFile): + output = '' zin.extract(member=item, path=temp_folder) - tmp_parser, mtype = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) - if not tmp_parser: - print("%s's format (%s) isn't supported" % (item.filename, mtype)) - return - tmp_parser.remove_all() - zinfo = zipfile.ZipInfo(item.filename) + if item.filename not in self.whitelist: + full_path = os.path.join(temp_folder, item.filename) + tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore + if not tmp_parser: + print("%s's format (%s) isn't supported" % (item.filename, mtype)) + return + tmp_parser.remove_all() + output = tmp_parser.output_filename + else: + output = os.path.join(temp_folder, item.filename) + zinfo = zipfile.ZipInfo(item.filename) # type: ignore clean_zinfo = self._clean_zipinfo(zinfo) - with open(tmp_parser.output_filename, 'rb') as f: + with open(output, 'rb') as f: zout.writestr(clean_zinfo, f.read()) @@ -72,7 +82,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser): if not metadata: # better safe than sorry metadata[item] = 'harmful content' - metadata = {**metadata, **self._get_zipinfo_meta(item)} + for key, value in self._get_zipinfo_meta(item).items(): + metadata[key] = value zipin.close() return metadata @@ -112,6 +123,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.oasis.opendocument.formula', 'application/vnd.oasis.opendocument.image', } + whitelist = {'mimetype', 'manifest.rdf'} + def get_meta(self): """ @@ -127,7 +140,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): metadata[key] = value if not metadata: # better safe than sorry metadata[item] = 'harmful content' - metadata = {**metadata, **self._get_zipinfo_meta(item)} + for key, value in self._get_zipinfo_meta(item).items(): + metadata[key] = value zipin.close() return metadata diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index 2f6acc1..42d20de 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py @@ -2,10 +2,12 @@ import glob import os import mimetypes import importlib -from typing import TypeVar, List +from typing import TypeVar, List, Tuple, Optional from . import abstract, unsupported_extensions +assert Tuple # make pyflakes happy + T = TypeVar('T', bound='abstract.AbstractParser') def __load_all_parsers(): @@ -28,14 +30,14 @@ def _get_parsers() -> List[T]: return __get_parsers(abstract.AbstractParser) -def get_parser(filename: str) -> (T, str): +def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: mtype, _ = mimetypes.guess_type(filename) _, extension = os.path.splitext(filename) if extension in unsupported_extensions: return None, mtype - for c in _get_parsers(): + for c in _get_parsers(): # type: ignore if mtype in c.mimetypes: try: return c(filename), mtype diff --git a/libmat2/pdf.py b/libmat2/pdf.py index 5b99192..77710bf 100644 --- a/libmat2/pdf.py +++ b/libmat2/pdf.py @@ -131,5 +131,6 @@ class PDFParser(abstract.AbstractParser): metadata[key] = document.get_property(key) if 'metadata' in metadata: parsed_meta = self.__parse_metadata_field(metadata['metadata']) - return {**metadata, **parsed_meta} + for key, value in parsed_meta.items(): + metadata[key] = value return metadata diff --git a/libmat2/torrent.py b/libmat2/torrent.py index 3a819fb..f5935e6 100644 --- a/libmat2/torrent.py +++ b/libmat2/torrent.py @@ -1,11 +1,12 @@ +from typing import Union, Tuple, Dict from . import abstract class TorrentParser(abstract.AbstractParser): - mimetypes = {b'application/x-bittorrent', } + mimetypes = {'application/x-bittorrent', } whitelist = {b'announce', b'announce-list', b'info'} - def get_meta(self) -> dict: + def get_meta(self) -> Dict[str, str]: metadata = {} with open(self.filename, 'rb') as f: d = _BencodeHandler().bdecode(f.read()) @@ -54,7 +55,7 @@ class _BencodeHandler(object): } @staticmethod - def __decode_int(s: str) -> (int, str): + def __decode_int(s: bytes) -> Tuple[int, bytes]: s = s[1:] next_idx = s.index(b'e') if s.startswith(b'-0'): @@ -64,7 +65,7 @@ class _BencodeHandler(object): return int(s[:next_idx]), s[next_idx+1:] @staticmethod - def __decode_string(s: str) -> (str, str): + def __decode_string(s: bytes) -> Tuple[bytes, bytes]: sep = s.index(b':') str_len = int(s[:sep]) if str_len < 0: @@ -74,7 +75,7 @@ class _BencodeHandler(object): s = s[1:] return s[sep:sep+str_len], s[sep+str_len:] - def __decode_list(self, s: str) -> (list, str): + def __decode_list(self, s: bytes) -> Tuple[list, bytes]: r = list() s = s[1:] # skip leading `l` while s[0] != ord('e'): @@ -82,7 +83,7 @@ class _BencodeHandler(object): r.append(v) return r, s[1:] - def __decode_dict(self, s: str) -> (dict, str): + def __decode_dict(self, s: bytes) -> Tuple[dict, bytes]: r = dict() s = s[1:] # skip leading `d` while s[0] != ord(b'e'): @@ -91,11 +92,11 @@ class _BencodeHandler(object): return r, s[1:] @staticmethod - def __encode_int(x: str) -> bytes: + def __encode_int(x: bytes) -> bytes: return b'i' + bytes(str(x), 'utf-8') + b'e' @staticmethod - def __encode_string(x: str) -> bytes: + def __encode_string(x: bytes) -> bytes: return bytes((str(len(x))), 'utf-8') + b':' + x def __encode_list(self, x: str) -> bytes: @@ -104,17 +105,17 @@ class _BencodeHandler(object): ret += self.__encode_func[type(i)](i) return b'l' + ret + b'e' - def __encode_dict(self, x: str) -> bytes: + def __encode_dict(self, x: dict) -> bytes: ret = b'' for k, v in sorted(x.items()): ret += self.__encode_func[type(k)](k) ret += self.__encode_func[type(v)](v) return b'd' + ret + b'e' - def bencode(self, s: str) -> bytes: + def bencode(self, s: Union[dict, list, bytes, int]) -> bytes: return self.__encode_func[type(s)](s) - def bdecode(self, s: str): + def bdecode(self, s: bytes) -> Union[dict, None]: try: r, l = self.__decode_func[s[0]](s) except (IndexError, KeyError, ValueError) as e: diff --git a/mat2 b/mat2 index aa213ab..11699f4 100755 --- a/mat2 +++ b/mat2 @@ -44,7 +44,7 @@ def show_meta(filename: str): if not __check_file(filename): return - p, mtype = parser_factory.get_parser(filename) + p, mtype = parser_factory.get_parser(filename) # type: ignore if p is None: print("[-] %s's format (%s) is not supported" % (filename, mtype)) return @@ -61,7 +61,7 @@ def clean_meta(params: Tuple[str, bool]) -> bool: if not __check_file(filename, os.R_OK|os.W_OK): return False - p, mtype = parser_factory.get_parser(filename) + p, mtype = parser_factory.get_parser(filename) # type: ignore if p is None: print("[-] %s's format (%s) is not supported" % (filename, mtype)) return False diff --git a/tests/test_climat2.py b/tests/test_climat2.py index fd72278..a119884 100644 --- a/tests/test_climat2.py +++ b/tests/test_climat2.py @@ -67,6 +67,13 @@ class TestCleanMeta(unittest.TestCase): os.remove('./tests/data/clean.jpg') +class TestIsSupported(unittest.TestCase): + def test_pdf(self): + proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.pdf'], + stdout=subprocess.PIPE) + stdout, _ = proc.communicate() + self.assertNotIn(b"isn't supported", stdout) + class TestGetMeta(unittest.TestCase): def test_pdf(self): proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.pdf'],