diff --git a/libmat2/__init__.py b/libmat2/__init__.py index 762686f..2f20265 100644 --- a/libmat2/__init__.py +++ b/libmat2/__init__.py @@ -2,7 +2,7 @@ import enum import importlib -from typing import Optional, Union +from typing import Optional, Union, Dict from . import exiftool, video @@ -66,8 +66,9 @@ CMD_DEPENDENCIES = { }, } -def check_dependencies() -> dict[str, dict[str, bool]]: - ret = dict() # type: dict[str, dict] + +def check_dependencies() -> Dict[str, Dict[str, bool]]: + ret = dict() # type: Dict[str, Dict] for key, value in DEPENDENCIES.items(): ret[key] = { diff --git a/libmat2/abstract.py b/libmat2/abstract.py index 426ccfc..1aff630 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py @@ -1,7 +1,7 @@ import abc import os import re -from typing import Union +from typing import Union, Set, Dict class AbstractParser(abc.ABC): @@ -9,8 +9,8 @@ class AbstractParser(abc.ABC): It might yield `ValueError` on instantiation on invalid files, and `RuntimeError` when something went wrong in `remove_all`. """ - meta_list = set() # type: set[str] - mimetypes = set() # type: set[str] + meta_list = set() # type: Set[str] + mimetypes = set() # type: Set[str] def __init__(self, filename: str) -> None: """ @@ -33,7 +33,7 @@ class AbstractParser(abc.ABC): self.sandbox = True @abc.abstractmethod - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: """Return all the metadata of the current file""" @abc.abstractmethod diff --git a/libmat2/archive.py b/libmat2/archive.py index 25ff7f9..cbedcd2 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py @@ -7,7 +7,7 @@ import tempfile import os import logging import shutil -from typing import Pattern, Union, Any +from typing import Pattern, Union, Any, Set, Dict, List from . 
import abstract, UnknownMemberPolicy, parser_factory @@ -44,16 +44,16 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): def __init__(self, filename): super().__init__(filename) # We ignore typing here because mypy is too stupid - self.archive_class = None # type: ignore - self.member_class = None # type: ignore + self.archive_class = None # type: ignore + self.member_class = None # type: ignore # Those are the files that have a format that _isn't_ # supported by mat2, but that we want to keep anyway. - self.files_to_keep = set() # type: set[Pattern] + self.files_to_keep = set() # type: Set[Pattern] # Those are the files that we _do not_ want to keep, # no matter if they are supported or not. - self.files_to_omit = set() # type: set[Pattern] + self.files_to_omit = set() # type: Set[Pattern] # what should the parser do if it encounters an unknown file in # the archive? @@ -72,7 +72,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): # pylint: disable=unused-argument return True # pragma: no cover - def _specific_get_meta(self, full_path: str, file_path: str) -> dict[str, Any]: + def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: """ This method can be used to extract specific metadata from files present in the archive.""" # pylint: disable=unused-argument @@ -87,7 +87,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): @staticmethod @abc.abstractmethod - def _get_all_members(archive: ArchiveClass) -> list[ArchiveMember]: + def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: """Return all the members of the archive.""" @staticmethod @@ -97,7 +97,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): @staticmethod @abc.abstractmethod - def _get_member_meta(member: ArchiveMember) -> dict[str, str]: + def _get_member_meta(member: ArchiveMember) -> Dict[str, str]: """Return all the metadata of a given member.""" @staticmethod @@ -128,8 +128,8 @@ class 
ArchiveBasedAbstractParser(abstract.AbstractParser): # pylint: disable=unused-argument return member - def get_meta(self) -> dict[str, Union[str, dict]]: - meta = dict() # type: dict[str, Union[str, dict]] + def get_meta(self) -> Dict[str, Union[str, Dict]]: + meta = dict() # type: Dict[str, Union[str, Dict]] with self.archive_class(self.filename) as zin: temp_folder = tempfile.mkdtemp() @@ -264,6 +264,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): class TarParser(ArchiveBasedAbstractParser): mimetypes = {'application/x-tar'} + def __init__(self, filename): super().__init__(filename) # yes, it's tarfile.open and not tarfile.TarFile, @@ -336,7 +337,7 @@ class TarParser(ArchiveBasedAbstractParser): return member @staticmethod - def _get_member_meta(member: ArchiveMember) -> dict[str, str]: + def _get_member_meta(member: ArchiveMember) -> Dict[str, str]: assert isinstance(member, tarfile.TarInfo) # please mypy metadata = {} if member.mtime != 0: @@ -358,7 +359,7 @@ class TarParser(ArchiveBasedAbstractParser): archive.add(full_path, member.name, filter=TarParser._clean_member) # type: ignore @staticmethod - def _get_all_members(archive: ArchiveClass) -> list[ArchiveMember]: + def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: assert isinstance(archive, tarfile.TarFile) # please mypy return archive.getmembers() # type: ignore @@ -391,7 +392,8 @@ class TarXzParser(TarParser): class ZipParser(ArchiveBasedAbstractParser): mimetypes = {'application/zip'} - def __init__(self, filename): + + def __init__(self, filename: str): super().__init__(filename) self.archive_class = zipfile.ZipFile self.member_class = zipfile.ZipInfo @@ -412,7 +414,7 @@ class ZipParser(ArchiveBasedAbstractParser): return member @staticmethod - def _get_member_meta(member: ArchiveMember) -> dict[str, str]: + def _get_member_meta(member: ArchiveMember) -> Dict[str, str]: assert isinstance(member, zipfile.ZipInfo) # please mypy metadata = {} if member.create_system == 3: 
# this is Linux @@ -439,7 +441,7 @@ class ZipParser(ArchiveBasedAbstractParser): compress_type=member.compress_type) @staticmethod - def _get_all_members(archive: ArchiveClass) -> list[ArchiveMember]: + def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: assert isinstance(archive, zipfile.ZipFile) # please mypy return archive.infolist() # type: ignore diff --git a/libmat2/audio.py b/libmat2/audio.py index 366d451..aa4afdb 100644 --- a/libmat2/audio.py +++ b/libmat2/audio.py @@ -2,7 +2,7 @@ import mimetypes import os import shutil import tempfile -from typing import Union +from typing import Union, Dict import mutagen @@ -18,10 +18,10 @@ class MutagenParser(abstract.AbstractParser): except mutagen.MutagenError: raise ValueError - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: f = mutagen.File(self.filename) if f.tags: - return {k:', '.join(map(str, v)) for k, v in f.tags.items()} + return {k: ', '.join(map(str, v)) for k, v in f.tags.items()} return {} def remove_all(self) -> bool: @@ -38,8 +38,8 @@ class MutagenParser(abstract.AbstractParser): class MP3Parser(MutagenParser): mimetypes = {'audio/mpeg', } - def get_meta(self) -> dict[str, Union[str, dict]]: - metadata = {} # type: dict[str, Union[str, dict]] + def get_meta(self) -> Dict[str, Union[str, Dict]]: + metadata = {} # type: Dict[str, Union[str, Dict]] meta = mutagen.File(self.filename).tags if not meta: return metadata @@ -68,12 +68,12 @@ class FLACParser(MutagenParser): f.save(deleteid3=True) return True - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: meta = super().get_meta() for num, picture in enumerate(mutagen.File(self.filename).pictures): name = picture.desc if picture.desc else 'Cover %d' % num extension = mimetypes.guess_extension(picture.mime) - if extension is None: # pragma: no cover + if extension is None: # pragma: no cover meta[name] = 'harmful data' continue @@ 
-98,6 +98,7 @@ class WAVParser(video.AbstractFFmpegParser): 'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile', } + class AIFFParser(video.AbstractFFmpegParser): mimetypes = {'audio/aiff', 'audio/x-aiff'} meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory', diff --git a/libmat2/bubblewrap.py b/libmat2/bubblewrap.py index 0e202b9..e59f111 100644 --- a/libmat2/bubblewrap.py +++ b/libmat2/bubblewrap.py @@ -12,7 +12,7 @@ import shutil import subprocess import tempfile import functools -from typing import Optional +from typing import Optional, List __all__ = ['PIPE', 'run', 'CalledProcessError'] @@ -33,7 +33,7 @@ def _get_bwrap_path() -> str: def _get_bwrap_args(tempdir: str, input_filename: str, - output_filename: Optional[str] = None) -> list[str]: + output_filename: Optional[str] = None) -> List[str]: ro_bind_args = [] cwd = os.getcwd() diff --git a/libmat2/epub.py b/libmat2/epub.py index 7613d35..3c5046a 100644 --- a/libmat2/epub.py +++ b/libmat2/epub.py @@ -3,10 +3,11 @@ import re import uuid import zipfile import xml.etree.ElementTree as ET # type: ignore -from typing import Any +from typing import Any, Dict from . 
import archive, office + class EPUBParser(archive.ZipParser): mimetypes = {'application/epub+zip', } metadata_namespace = '{http://purl.org/dc/elements/1.1/}' @@ -28,7 +29,6 @@ class EPUBParser(archive.ZipParser): })) self.uniqid = uuid.uuid4() - def is_archive_valid(self): super().is_archive_valid() with zipfile.ZipFile(self.filename) as zin: @@ -37,7 +37,7 @@ class EPUBParser(archive.ZipParser): if member_name.endswith('META-INF/encryption.xml'): raise ValueError('the file contains encrypted fonts') - def _specific_get_meta(self, full_path, file_path) -> dict[str, Any]: + def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]: if not file_path.endswith('.opf'): return {} @@ -73,7 +73,6 @@ class EPUBParser(archive.ZipParser): short_empty_elements=False) return True - def __handle_tocncx(self, full_path: str) -> bool: try: tree, namespace = office._parse_xml(full_path) diff --git a/libmat2/exiftool.py b/libmat2/exiftool.py index cdfce3d..5979a64 100644 --- a/libmat2/exiftool.py +++ b/libmat2/exiftool.py @@ -4,7 +4,7 @@ import logging import os import shutil import subprocess -from typing import Union +from typing import Union, Set, Dict from . import abstract from . import bubblewrap @@ -15,9 +15,9 @@ class ExiftoolParser(abstract.AbstractParser): from a import file, hence why several parsers are re-using its `get_meta` method. """ - meta_allowlist = set() # type: set[str] + meta_allowlist = set() # type: Set[str] - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: try: if self.sandbox: out = bubblewrap.run([_get_exiftool_path(), '-json', diff --git a/libmat2/harmless.py b/libmat2/harmless.py index 8688a9d..42b6eda 100644 --- a/libmat2/harmless.py +++ b/libmat2/harmless.py @@ -1,5 +1,5 @@ import shutil -from typing import Union +from typing import Union, Dict from . 
import abstract @@ -7,7 +7,7 @@ class HarmlessParser(abstract.AbstractParser): """ This is the parser for filetypes that can not contain metadata. """ mimetypes = {'text/plain', 'image/x-ms-bmp'} - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: return dict() def remove_all(self) -> bool: diff --git a/libmat2/images.py b/libmat2/images.py index 083ff64..e7cdf5a 100644 --- a/libmat2/images.py +++ b/libmat2/images.py @@ -1,6 +1,6 @@ import os import re -from typing import Union, Any +from typing import Union, Any, Dict import cairo @@ -48,7 +48,7 @@ class SVGParser(exiftool.ExiftoolParser): surface.finish() return True - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: meta = super().get_meta() # The namespace is mandatory, but only the …/2000/svg is valid. @@ -57,6 +57,7 @@ class SVGParser(exiftool.ExiftoolParser): meta.pop('Xmlns') return meta + class PNGParser(exiftool.ExiftoolParser): mimetypes = {'image/png', } meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', @@ -156,11 +157,12 @@ class TiffParser(GdkPixbufAbstractParser): 'FileTypeExtension', 'ImageHeight', 'ImageSize', 'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'} + class PPMParser(abstract.AbstractParser): mimetypes = {'image/x-portable-pixmap'} - def get_meta(self) -> dict[str, Union[str, dict]]: - meta = {} # type: dict[str, Union[str, dict[Any, Any]]] + def get_meta(self) -> Dict[str, Union[str, Dict]]: + meta = {} # type: Dict[str, Union[str, Dict[Any, Any]]] with open(self.filename) as f: for idx, line in enumerate(f): if line.lstrip().startswith('#'): @@ -176,9 +178,10 @@ class PPMParser(abstract.AbstractParser): fout.write(line) return True + class HEICParser(exiftool.ExiftoolParser): mimetypes = {'image/heic'} - meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName','Directory', + meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory', 
'FileSize', 'FileModifyDate', 'FileAccessDate', 'FileInodeChangeDate', 'FilePermissions', 'FileType', 'FileTypeExtension', 'MIMEType', 'MajorBrand', 'MinorVersion', diff --git a/libmat2/office.py b/libmat2/office.py index 8ccaa02..87a0b7e 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -4,7 +4,7 @@ import logging import os import re import zipfile -from typing import Pattern, Any +from typing import Pattern, Any, Tuple, Dict import xml.etree.ElementTree as ET # type: ignore @@ -12,7 +12,8 @@ from .archive import ZipParser # pylint: disable=line-too-long -def _parse_xml(full_path: str) -> tuple[ET.ElementTree, dict[str, str]]: + +def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]: """ This function parses XML, with namespace support. """ namespace_map = dict() for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): @@ -68,7 +69,6 @@ class MSOfficeParser(ZipParser): 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', } - def __init__(self, filename): super().__init__(filename) @@ -218,7 +218,7 @@ class MSOfficeParser(ZipParser): if 'w' not in namespace: return True - parent_map = {c:p for p in tree.iter() for c in p} + parent_map = {c: p for p in tree.iter() for c in p} elements_to_remove = list() for element in tree.iterfind('.//w:nsid', namespace): @@ -229,7 +229,6 @@ class MSOfficeParser(ZipParser): tree.write(full_path, xml_declaration=True) return True - @staticmethod def __remove_revisions(full_path: str) -> bool: try: @@ -319,7 +318,6 @@ class MSOfficeParser(ZipParser): for i in re.findall(r' bool: try: @@ -441,8 +439,8 @@ class MSOfficeParser(ZipParser): with open(full_path, encoding='utf-8') as f: try: - results = re.findall(r"<(.+)>(.+)", f.read(), re.I|re.M) - return {k:v for (k, v) in results} + results = re.findall(r"<(.+)>(.+)", f.read(), re.I | re.M) + return {k: v for (k, v) in results} except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file return {file_path: 
'harmful content', } @@ -459,7 +457,6 @@ class LibreOfficeParser(ZipParser): 'application/vnd.oasis.opendocument.image', } - def __init__(self, filename): super().__init__(filename) diff --git a/libmat2/pdf.py b/libmat2/pdf.py index 63ed9c1..8c3055f 100644 --- a/libmat2/pdf.py +++ b/libmat2/pdf.py @@ -7,7 +7,7 @@ import re import logging import tempfile import io -from typing import Union +from typing import Union, Dict import cairo import gi @@ -18,6 +18,7 @@ from . import abstract FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5 + class PDFParser(abstract.AbstractParser): mimetypes = {'application/pdf', } meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', @@ -140,13 +141,13 @@ class PDFParser(abstract.AbstractParser): return True @staticmethod - def __parse_metadata_field(data: str) -> dict[str, str]: + def __parse_metadata_field(data: str) -> Dict[str, str]: metadata = {} for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)", data, re.I): metadata[key] = value return metadata - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: """ Return a dict with all the meta of the file """ metadata = {} diff --git a/libmat2/torrent.py b/libmat2/torrent.py index c547a20..e6407ff 100644 --- a/libmat2/torrent.py +++ b/libmat2/torrent.py @@ -1,5 +1,5 @@ import logging -from typing import Union +from typing import Union, Dict, List, Tuple from . 
import abstract @@ -15,7 +15,7 @@ class TorrentParser(abstract.AbstractParser): if self.dict_repr is None: raise ValueError - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: metadata = {} for key, value in self.dict_repr.items(): if key not in self.allowlist: @@ -56,7 +56,7 @@ class _BencodeHandler: } @staticmethod - def __decode_int(s: bytes) -> tuple[int, bytes]: + def __decode_int(s: bytes) -> Tuple[int, bytes]: s = s[1:] next_idx = s.index(b'e') if s.startswith(b'-0'): @@ -66,7 +66,7 @@ class _BencodeHandler: return int(s[:next_idx]), s[next_idx+1:] @staticmethod - def __decode_string(s: bytes) -> tuple[bytes, bytes]: + def __decode_string(s: bytes) -> Tuple[bytes, bytes]: colon = s.index(b':') # FIXME Python3 is broken here, the call to `ord` shouldn't be needed, # but apparently it is. This is utterly idiotic. @@ -76,7 +76,7 @@ class _BencodeHandler: s = s[1:] return s[colon:colon+str_len], s[colon+str_len:] - def __decode_list(self, s: bytes) -> tuple[list, bytes]: + def __decode_list(self, s: bytes) -> Tuple[List, bytes]: ret = list() s = s[1:] # skip leading `l` while s[0] != ord('e'): @@ -84,7 +84,7 @@ class _BencodeHandler: ret.append(value) return ret, s[1:] - def __decode_dict(self, s: bytes) -> tuple[dict, bytes]: + def __decode_dict(self, s: bytes) -> Tuple[Dict, bytes]: ret = dict() s = s[1:] # skip leading `d` while s[0] != ord(b'e'): @@ -113,10 +113,10 @@ class _BencodeHandler: ret += self.__encode_func[type(value)](value) return b'd' + ret + b'e' - def bencode(self, s: Union[dict, list, bytes, int]) -> bytes: + def bencode(self, s: Union[Dict, List, bytes, int]) -> bytes: return self.__encode_func[type(s)](s) - def bdecode(self, s: bytes) -> Union[dict, None]: + def bdecode(self, s: bytes) -> Union[Dict, None]: try: ret, trail = self.__decode_func[s[0]](s) except (IndexError, KeyError, ValueError) as e: diff --git a/libmat2/video.py b/libmat2/video.py index 4d33aa4..772a89e 100644 --- 
a/libmat2/video.py +++ b/libmat2/video.py @@ -3,7 +3,7 @@ import functools import shutil import logging -from typing import Union +from typing import Union, Dict from . import exiftool from . import bubblewrap @@ -12,7 +12,7 @@ from . import bubblewrap class AbstractFFmpegParser(exiftool.ExiftoolParser): """ Abstract parser for all FFmpeg-based ones, mainly for video. """ # Some fileformats have mandatory metadata fields - meta_key_value_allowlist = {} # type: dict[str, Union[str, int]] + meta_key_value_allowlist = {} # type: Dict[str, Union[str, int]] def remove_all(self) -> bool: if self.meta_key_value_allowlist: @@ -45,10 +45,10 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser): return False return True - def get_meta(self) -> dict[str, Union[str, dict]]: + def get_meta(self) -> Dict[str, Union[str, Dict]]: meta = super().get_meta() - ret = dict() # type: dict[str, Union[str, dict]] + ret = dict() # type: Dict[str, Union[str, Dict]] for key, value in meta.items(): if key in self.meta_key_value_allowlist: if value == self.meta_key_value_allowlist[key]: diff --git a/libmat2/web.py b/libmat2/web.py index 574bdd7..f2938e2 100644 --- a/libmat2/web.py +++ b/libmat2/web.py @@ -1,5 +1,5 @@ from html import parser, escape -from typing import Any, Optional +from typing import Any, Optional, Dict, List, Tuple, Set import re import string @@ -25,7 +25,7 @@ class CSSParser(abstract.AbstractParser): f.write(cleaned) return True - def get_meta(self) -> dict[str, Any]: + def get_meta(self) -> Dict[str, Any]: metadata = {} with open(self.filename, encoding='utf-8') as f: try: @@ -44,10 +44,10 @@ class CSSParser(abstract.AbstractParser): class AbstractHTMLParser(abstract.AbstractParser): - tags_blocklist = set() # type: set[str] + tags_blocklist = set() # type: Set[str] # In some html/xml-based formats some tags are mandatory, # so we're keeping them, but are discarding their content - tags_required_blocklist = set() # type: set[str] + tags_required_blocklist = set() # type: 
Set[str] def __init__(self, filename): super().__init__(filename) @@ -57,7 +57,7 @@ class AbstractHTMLParser(abstract.AbstractParser): self.__parser.feed(f.read()) self.__parser.close() - def get_meta(self) -> dict[str, Any]: + def get_meta(self) -> Dict[str, Any]: return self.__parser.get_meta() def remove_all(self) -> bool: @@ -112,7 +112,7 @@ class _HTMLParser(parser.HTMLParser): """ raise ValueError(message) - def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]): + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): # Ignore the type, because mypy is too stupid to infer # that get_starttag_text() can't return None. original_tag = self.get_starttag_text() # type: ignore @@ -159,7 +159,7 @@ class _HTMLParser(parser.HTMLParser): self.__textrepr += escape(data) def handle_startendtag(self, tag: str, - attrs: list[tuple[str, Optional[str]]]): + attrs: List[Tuple[str, Optional[str]]]): if tag in self.tag_required_blocklist | self.tag_blocklist: meta = {k:v for k, v in attrs} name = meta.get('name', 'harmful metadata') @@ -184,7 +184,7 @@ class _HTMLParser(parser.HTMLParser): f.write(self.__textrepr) return True - def get_meta(self) -> dict[str, Any]: + def get_meta(self) -> Dict[str, Any]: if self.__validation_queue: raise ValueError("Some tags (%s) were left unclosed in %s" % ( ', '.join(self.__validation_queue),