1
0
mirror of synced 2024-11-22 09:14:23 +01:00

Refactor {black,white}list into {block,allow}list

Closes #96
This commit is contained in:
Brolf 2019-02-20 00:45:27 +01:00 committed by georg
parent c3f097a82b
commit 5ac91cd4f9
No known key found for this signature in database
GPG Key ID: 5AD75B414EA41667
6 changed files with 39 additions and 39 deletions

View File

@ -15,14 +15,14 @@ class ExiftoolParser(abstract.AbstractParser):
from an import file, hence why several parsers are re-using its `get_meta` from an import file, hence why several parsers are re-using its `get_meta`
method. method.
""" """
meta_whitelist = set() # type: Set[str] meta_allowlist = set() # type: Set[str]
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, dict]]:
out = subprocess.run([_get_exiftool_path(), '-json', self.filename], out = subprocess.run([_get_exiftool_path(), '-json', self.filename],
input_filename=self.filename, input_filename=self.filename,
check=True, stdout=subprocess.PIPE).stdout check=True, stdout=subprocess.PIPE).stdout
meta = json.loads(out.decode('utf-8'))[0] meta = json.loads(out.decode('utf-8'))[0]
for key in self.meta_whitelist: for key in self.meta_allowlist:
meta.pop(key, None) meta.pop(key, None)
return meta return meta

View File

@ -15,7 +15,7 @@ assert Set
class PNGParser(exiftool.ExiftoolParser): class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', } mimetypes = {'image/png', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate', 'Directory', 'FileSize', 'FileModifyDate',
'FileAccessDate', 'FileInodeChangeDate', 'FileAccessDate', 'FileInodeChangeDate',
'FilePermissions', 'FileType', 'FileTypeExtension', 'FilePermissions', 'FileType', 'FileTypeExtension',
@ -44,7 +44,7 @@ class PNGParser(exiftool.ExiftoolParser):
class GIFParser(exiftool.ExiftoolParser): class GIFParser(exiftool.ExiftoolParser):
mimetypes = {'image/gif'} mimetypes = {'image/gif'}
meta_whitelist = {'AnimationIterations', 'BackgroundColor', 'BitsPerPixel', meta_allowlist = {'AnimationIterations', 'BackgroundColor', 'BitsPerPixel',
'ColorResolutionDepth', 'Directory', 'Duration', 'ColorResolutionDepth', 'Directory', 'Duration',
'ExifToolVersion', 'FileAccessDate', 'ExifToolVersion', 'FileAccessDate',
'FileInodeChangeDate', 'FileModifyDate', 'FileName', 'FileInodeChangeDate', 'FileModifyDate', 'FileName',
@ -86,7 +86,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
class JPGParser(GdkPixbufAbstractParser): class JPGParser(GdkPixbufAbstractParser):
_type = 'jpeg' _type = 'jpeg'
mimetypes = {'image/jpeg'} mimetypes = {'image/jpeg'}
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate', 'Directory', 'FileSize', 'FileModifyDate',
'FileAccessDate', "FileInodeChangeDate", 'FileAccessDate', "FileInodeChangeDate",
'FilePermissions', 'FileType', 'FileTypeExtension', 'FilePermissions', 'FileType', 'FileTypeExtension',
@ -99,7 +99,7 @@ class JPGParser(GdkPixbufAbstractParser):
class TiffParser(GdkPixbufAbstractParser): class TiffParser(GdkPixbufAbstractParser):
_type = 'tiff' _type = 'tiff'
mimetypes = {'image/tiff'} mimetypes = {'image/tiff'}
meta_whitelist = {'Compression', 'ExifByteOrder', 'ExtraSamples', meta_allowlist = {'Compression', 'ExifByteOrder', 'ExtraSamples',
'FillOrder', 'PhotometricInterpretation', 'FillOrder', 'PhotometricInterpretation',
'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel', 'PlanarConfiguration', 'RowsPerStrip', 'SamplesPerPixel',
'StripByteCounts', 'StripOffsets', 'BitsPerSample', 'StripByteCounts', 'StripOffsets', 'BitsPerSample',

View File

@ -89,7 +89,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
r'^word/theme', r'^word/theme',
r'^word/people\.xml$', r'^word/people\.xml$',
# we have a whitelist in self.files_to_keep, # we have an allowlist in self.files_to_keep,
# so we can trash everything else # so we can trash everything else
r'^word/_rels/', r'^word/_rels/',
})) }))
@ -100,7 +100,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
def __fill_files_to_keep_via_content_types(self) -> bool: def __fill_files_to_keep_via_content_types(self) -> bool:
""" There is a super-handy `[Content_Types].xml` file """ There is a super-handy `[Content_Types].xml` file
in MS Office archives, describing what each other file contains. in MS Office archives, describing what each other file contains.
The self.content_types_to_keep member contains a type whitelist, The self.content_types_to_keep member contains a type allowlist,
so we're using it to fill the self.files_to_keep one. so we're using it to fill the self.files_to_keep one.
""" """
with zipfile.ZipFile(self.filename) as zin: with zipfile.ZipFile(self.filename) as zin:
@ -220,7 +220,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
for file_to_omit in self.files_to_omit: for file_to_omit in self.files_to_omit:
if file_to_omit.search(fname): if file_to_omit.search(fname):
matches = map(lambda r: r.search(fname), self.files_to_keep) matches = map(lambda r: r.search(fname), self.files_to_keep)
if any(matches): # the file is whitelisted if any(matches): # the file is in the allowlist
continue continue
removed_fnames.add(fname) removed_fnames.add(fname)
break break

View File

@ -6,7 +6,7 @@ from . import abstract
class TorrentParser(abstract.AbstractParser): class TorrentParser(abstract.AbstractParser):
mimetypes = {'application/x-bittorrent', } mimetypes = {'application/x-bittorrent', }
whitelist = {b'announce', b'announce-list', b'info'} allowlist = {b'announce', b'announce-list', b'info'}
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
@ -18,14 +18,14 @@ class TorrentParser(abstract.AbstractParser):
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, dict]]:
metadata = {} metadata = {}
for key, value in self.dict_repr.items(): for key, value in self.dict_repr.items():
if key not in self.whitelist: if key not in self.allowlist:
metadata[key.decode('utf-8')] = value metadata[key.decode('utf-8')] = value
return metadata return metadata
def remove_all(self) -> bool: def remove_all(self) -> bool:
cleaned = dict() cleaned = dict()
for key, value in self.dict_repr.items(): for key, value in self.dict_repr.items():
if key in self.whitelist: if key in self.allowlist:
cleaned[key] = value cleaned[key] = value
with open(self.output_filename, 'wb') as f: with open(self.output_filename, 'wb') as f:
f.write(_BencodeHandler().bencode(cleaned)) f.write(_BencodeHandler().bencode(cleaned))

View File

@ -10,10 +10,10 @@ from . import subprocess
class AbstractFFmpegParser(exiftool.ExiftoolParser): class AbstractFFmpegParser(exiftool.ExiftoolParser):
""" Abstract parser for all FFmpeg-based ones, mainly for video. """ """ Abstract parser for all FFmpeg-based ones, mainly for video. """
# Some fileformats have mandatory metadata fields # Some fileformats have mandatory metadata fields
meta_key_value_whitelist = {} # type: Dict[str, Union[str, int]] meta_key_value_allowlist = {} # type: Dict[str, Union[str, int]]
def remove_all(self) -> bool: def remove_all(self) -> bool:
if self.meta_key_value_whitelist: if self.meta_key_value_allowlist:
logging.warning('The format of "%s" (%s) has some mandatory ' logging.warning('The format of "%s" (%s) has some mandatory '
'metadata fields; mat2 filled them with standard ' 'metadata fields; mat2 filled them with standard '
'data.', self.filename, ', '.join(self.mimetypes)) 'data.', self.filename, ', '.join(self.mimetypes))
@ -45,8 +45,8 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
ret = dict() # type: Dict[str, Union[str, dict]] ret = dict() # type: Dict[str, Union[str, dict]]
for key, value in meta.items(): for key, value in meta.items():
if key in self.meta_key_value_whitelist.keys(): if key in self.meta_key_value_allowlist.keys():
if value == self.meta_key_value_whitelist[key]: if value == self.meta_key_value_allowlist[key]:
continue continue
ret[key] = value ret[key] = value
return ret return ret
@ -54,7 +54,7 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
class WMVParser(AbstractFFmpegParser): class WMVParser(AbstractFFmpegParser):
mimetypes = {'video/x-ms-wmv', } mimetypes = {'video/x-ms-wmv', }
meta_whitelist = {'AudioChannels', 'AudioCodecID', 'AudioCodecName', meta_allowlist = {'AudioChannels', 'AudioCodecID', 'AudioCodecName',
'ErrorCorrectionType', 'AudioSampleRate', 'DataPackets', 'ErrorCorrectionType', 'AudioSampleRate', 'DataPackets',
'Directory', 'Duration', 'ExifToolVersion', 'Directory', 'Duration', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate', 'FileLength', 'FileAccessDate', 'FileInodeChangeDate', 'FileLength',
@ -64,7 +64,7 @@ class WMVParser(AbstractFFmpegParser):
'ImageWidth', 'MIMEType', 'MaxBitrate', 'MaxPacketSize', 'ImageWidth', 'MIMEType', 'MaxBitrate', 'MaxPacketSize',
'Megapixels', 'MinPacketSize', 'Preroll', 'SendDuration', 'Megapixels', 'MinPacketSize', 'Preroll', 'SendDuration',
'SourceFile', 'StreamNumber', 'VideoCodecName', } 'SourceFile', 'StreamNumber', 'VideoCodecName', }
meta_key_value_whitelist = { # some metadata are mandatory :/ meta_key_value_allowlist = { # some metadata are mandatory :/
'AudioCodecDescription': '', 'AudioCodecDescription': '',
'CreationDate': '0000:00:00 00:00:00Z', 'CreationDate': '0000:00:00 00:00:00Z',
'FileID': '00000000-0000-0000-0000-000000000000', 'FileID': '00000000-0000-0000-0000-000000000000',
@ -78,7 +78,7 @@ class WMVParser(AbstractFFmpegParser):
class AVIParser(AbstractFFmpegParser): class AVIParser(AbstractFFmpegParser):
mimetypes = {'video/x-msvideo', } mimetypes = {'video/x-msvideo', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory', meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
'FileSize', 'FileModifyDate', 'FileAccessDate', 'FileSize', 'FileModifyDate', 'FileAccessDate',
'FileInodeChangeDate', 'FilePermissions', 'FileType', 'FileInodeChangeDate', 'FilePermissions', 'FileType',
'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate', 'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate',
@ -98,7 +98,7 @@ class AVIParser(AbstractFFmpegParser):
class MP4Parser(AbstractFFmpegParser): class MP4Parser(AbstractFFmpegParser):
mimetypes = {'video/mp4', } mimetypes = {'video/mp4', }
meta_whitelist = {'AudioFormat', 'AvgBitrate', 'Balance', 'TrackDuration', meta_allowlist = {'AudioFormat', 'AvgBitrate', 'Balance', 'TrackDuration',
'XResolution', 'YResolution', 'ExifToolVersion', 'XResolution', 'YResolution', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate', 'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate',
'FileName', 'FilePermissions', 'MIMEType', 'FileType', 'FileName', 'FilePermissions', 'MIMEType', 'FileType',
@ -109,7 +109,7 @@ class MP4Parser(AbstractFFmpegParser):
'MovieDataSize', 'VideoFrameRate', 'MediaTimeScale', 'MovieDataSize', 'VideoFrameRate', 'MediaTimeScale',
'SourceImageHeight', 'SourceImageWidth', 'SourceImageHeight', 'SourceImageWidth',
'MatrixStructure', 'MediaDuration'} 'MatrixStructure', 'MediaDuration'}
meta_key_value_whitelist = { # some metadata are mandatory :/ meta_key_value_allowlist = { # some metadata are mandatory :/
'CreateDate': '0000:00:00 00:00:00', 'CreateDate': '0000:00:00 00:00:00',
'CurrentTime': '0 s', 'CurrentTime': '0 s',
'MediaCreateDate': '0000:00:00 00:00:00', 'MediaCreateDate': '0000:00:00 00:00:00',

View File

@ -37,15 +37,15 @@ class CSSParser(abstract.AbstractParser):
class AbstractHTMLParser(abstract.AbstractParser): class AbstractHTMLParser(abstract.AbstractParser):
tags_blacklist = set() # type: Set[str] tags_blocklist = set() # type: Set[str]
# In some html/xml-based formats some tags are mandatory, # In some html/xml-based formats some tags are mandatory,
# so we're keeping them, but are discarding their content # so we're keeping them, but are discarding their content
tags_required_blacklist = set() # type: Set[str] tags_required_blocklist = set() # type: Set[str]
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.__parser = _HTMLParser(self.filename, self.tags_blacklist, self.__parser = _HTMLParser(self.filename, self.tags_blocklist,
self.tags_required_blacklist) self.tags_required_blocklist)
with open(filename, encoding='utf-8') as f: with open(filename, encoding='utf-8') as f:
self.__parser.feed(f.read()) self.__parser.feed(f.read())
self.__parser.close() self.__parser.close()
@ -59,13 +59,13 @@ class AbstractHTMLParser(abstract.AbstractParser):
class HTMLParser(AbstractHTMLParser): class HTMLParser(AbstractHTMLParser):
mimetypes = {'text/html', } mimetypes = {'text/html', }
tags_blacklist = {'meta', } tags_blocklist = {'meta', }
tags_required_blacklist = {'title', } tags_required_blocklist = {'title', }
class DTBNCXParser(AbstractHTMLParser): class DTBNCXParser(AbstractHTMLParser):
mimetypes = {'application/x-dtbncx+xml', } mimetypes = {'application/x-dtbncx+xml', }
tags_required_blacklist = {'title', 'doctitle', 'meta'} tags_required_blocklist = {'title', 'doctitle', 'meta'}
class _HTMLParser(parser.HTMLParser): class _HTMLParser(parser.HTMLParser):
@ -79,7 +79,7 @@ class _HTMLParser(parser.HTMLParser):
Also, gotcha: the `tag` parameters are always in lowercase. Also, gotcha: the `tag` parameters are always in lowercase.
""" """
def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
super().__init__() super().__init__()
self.filename = filename self.filename = filename
self.__textrepr = '' self.__textrepr = ''
@ -90,24 +90,24 @@ class _HTMLParser(parser.HTMLParser):
self.__in_dangerous_but_required_tag = 0 self.__in_dangerous_but_required_tag = 0
self.__in_dangerous_tag = 0 self.__in_dangerous_tag = 0
if required_blacklisted_tags & blacklisted_tags: # pragma: nocover if required_blocklisted_tags & blocklisted_tags: # pragma: nocover
raise ValueError("There is an overlap between %s and %s" % ( raise ValueError("There is an overlap between %s and %s" % (
required_blacklisted_tags, blacklisted_tags)) required_blocklisted_tags, blocklisted_tags))
self.tag_required_blacklist = required_blacklisted_tags self.tag_required_blocklist = required_blocklisted_tags
self.tag_blacklist = blacklisted_tags self.tag_blocklist = blocklisted_tags
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
original_tag = self.get_starttag_text() original_tag = self.get_starttag_text()
self.__validation_queue.append(original_tag) self.__validation_queue.append(original_tag)
if tag in self.tag_blacklist: if tag in self.tag_blocklist:
self.__in_dangerous_tag += 1 self.__in_dangerous_tag += 1
if self.__in_dangerous_tag == 0: if self.__in_dangerous_tag == 0:
if self.__in_dangerous_but_required_tag == 0: if self.__in_dangerous_but_required_tag == 0:
self.__textrepr += original_tag self.__textrepr += original_tag
if tag in self.tag_required_blacklist: if tag in self.tag_required_blocklist:
self.__in_dangerous_but_required_tag += 1 self.__in_dangerous_but_required_tag += 1
def handle_endtag(self, tag: str): def handle_endtag(self, tag: str):
@ -123,7 +123,7 @@ class _HTMLParser(parser.HTMLParser):
"tag %s in %s" % "tag %s in %s" %
(tag, previous_tag, self.filename)) (tag, previous_tag, self.filename))
if tag in self.tag_required_blacklist: if tag in self.tag_required_blocklist:
self.__in_dangerous_but_required_tag -= 1 self.__in_dangerous_but_required_tag -= 1
if self.__in_dangerous_tag == 0: if self.__in_dangerous_tag == 0:
@ -131,7 +131,7 @@ class _HTMLParser(parser.HTMLParser):
# There is no `get_endtag_text()` method :/ # There is no `get_endtag_text()` method :/
self.__textrepr += '</' + previous_tag + '>' self.__textrepr += '</' + previous_tag + '>'
if tag in self.tag_blacklist: if tag in self.tag_blocklist:
self.__in_dangerous_tag -= 1 self.__in_dangerous_tag -= 1
def handle_data(self, data: str): def handle_data(self, data: str):
@ -141,14 +141,14 @@ class _HTMLParser(parser.HTMLParser):
self.__textrepr += escape(data) self.__textrepr += escape(data)
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
if tag in self.tag_required_blacklist | self.tag_blacklist: if tag in self.tag_required_blocklist | self.tag_blocklist:
meta = {k:v for k, v in attrs} meta = {k:v for k, v in attrs}
name = meta.get('name', 'harmful metadata') name = meta.get('name', 'harmful metadata')
content = meta.get('content', 'harmful data') content = meta.get('content', 'harmful data')
self.__meta[name] = content self.__meta[name] = content
if self.__in_dangerous_tag == 0: if self.__in_dangerous_tag == 0:
if tag in self.tag_required_blacklist: if tag in self.tag_required_blocklist:
self.__textrepr += '<' + tag + ' />' self.__textrepr += '<' + tag + ' />'
return return