# mat2/libmat2/web.py
from html import parser, escape
from typing import Any, Optional, Dict, List, Tuple, Set
import re
import string

from . import abstract

# pylint: disable=too-many-instance-attributes

class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        with open(self.filename, encoding='utf-8') as f:
            try:
                content = f.read()
            except UnicodeDecodeError:  # pragma: no cover
                raise ValueError
            cleaned = re.sub(r'/\*.*?\*/', '', content, 0, self.flags)

        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            try:
                content = f.read()
            except UnicodeDecodeError:  # pragma: no cover
                raise ValueError
            cssdoc = re.findall(r'/\*(.*?)\*/', content, self.flags)

        for match in cssdoc:
            for line in match.splitlines():
                try:
                    k, v = line.split(':')
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
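
# Minimal usage sketch (illustrative, not part of the library; the filename
# below is made up). Given a stylesheet containing a comment such as
# `/* Author: Jane Doe */`, get_meta() splits each comment line on ':' and
# returns {'Author': 'Jane Doe'}, while remove_all() writes a copy of the
# file with every `/* ... */` block stripped to self.output_filename
# (provided by abstract.AbstractParser):
#
#     p = CSSParser('theme.css')
#     p.get_meta()    # {'Author': 'Jane Doe'}
#     p.remove_all()  # True; the comment-free copy is written to p.output_filename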


class AbstractHTMLParser(abstract.AbstractParser):
    tags_blocklist: Set[str] = set()
    # In some html/xml-based formats some tags are mandatory,
    # so we're keeping them, but are discarding their content
    tags_required_blocklist: Set[str] = set()

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename, self.tags_blocklist,
                                    self.tags_required_blocklist)
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        return self.__parser.remove_all(self.output_filename)


class HTMLParser(AbstractHTMLParser):
    mimetypes = {'text/html', 'application/xhtml+xml'}
    tags_blocklist = {'meta', }
    tags_required_blocklist = {'title', }


class DTBNCXParser(AbstractHTMLParser):
    mimetypes = {'application/x-dtbncx+xml', }
    tags_required_blocklist = {'title', 'doctitle', 'meta'}
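
# Minimal usage sketch (illustrative, not part of the library; the filename is
# made up). HTMLParser drops <meta> elements entirely, keeps required tags
# such as <title> as empty pairs, and records the name/content attributes of
# self-closing blocklisted tags so they can be reported:
#
#     p = HTMLParser('page.html')
#     p.get_meta()    # e.g. {'generator': 'some-cms'} for <meta name="generator" content="some-cms"/>
#     p.remove_all()  # True; the stripped markup is written to p.output_filename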


class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    Moreover, the parser.HTMLParser call doesn't provide a get_endtag_text
    method, so we have to use get_starttag_text instead, put its result in a
    LIFO, and transform it in a closing tag when needed.

    Also, gotcha: the `tag` parameters are always in lowercase.
    """
    def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
        super().__init__()
        self.filename = filename
        self.__textrepr = ''
        self.__meta = {}
        self.__validation_queue: List[str] = list()

        # We're using counters instead of booleans, to handle nested tags
        self.__in_dangerous_but_required_tag = 0
        self.__in_dangerous_tag = 0

        if required_blocklisted_tags & blocklisted_tags:  # pragma: nocover
            raise ValueError("There is an overlap between %s and %s" % (
                required_blocklisted_tags, blocklisted_tags))
        self.tag_required_blocklist = required_blocklisted_tags
        self.tag_blocklist = blocklisted_tags

    def error(self, message):  # pragma: no cover
        """ Amusingly, Python's documentation doesn't mention that this
        function needs to be implemented in subclasses of the parent class
        of parser.HTMLParser. This was found by fuzzing,
        triggering the following exception:
            NotImplementedError: subclasses of ParserBase must override error()
        """
        raise ValueError(message)

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        # Ignore the type, because mypy is too stupid to infer
        # that get_starttag_text() can't return None.
        original_tag = self.get_starttag_text()  # type: ignore
        self.__validation_queue.append(original_tag)  # type: ignore

        if tag in self.tag_blocklist:
            self.__in_dangerous_tag += 1

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                self.__textrepr += original_tag

        if tag in self.tag_required_blocklist:
            self.__in_dangerous_but_required_tag += 1

    def handle_endtag(self, tag: str):
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        previous_tag = self.__validation_queue.pop()
        previous_tag = previous_tag[1:-1]  # remove < and >
        previous_tag = previous_tag.split(' ')[0]  # remove attributes

        if tag != previous_tag.lower():
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))

        if tag in self.tag_required_blocklist:
            self.__in_dangerous_but_required_tag -= 1

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                # There is no `get_endtag_text()` method :/
                self.__textrepr += '</' + previous_tag + '>'

        if tag in self.tag_blocklist:
            self.__in_dangerous_tag -= 1

    def handle_data(self, data: str):
        if self.__in_dangerous_but_required_tag == 0:
            if self.__in_dangerous_tag == 0:
                if data.strip():
                    self.__textrepr += escape(data)

    def handle_startendtag(self, tag: str,
                           attrs: List[Tuple[str, Optional[str]]]):
        if tag in self.tag_required_blocklist | self.tag_blocklist:
            meta = {k:v for k, v in attrs}
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content

            if self.__in_dangerous_tag == 0:
                if tag in self.tag_required_blocklist:
                    self.__textrepr += '<' + tag + ' />'
                return

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                self.__textrepr += self.get_starttag_text()
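
    # Illustrative trace (example markup is hypothetical): a self-closing
    # '<meta name="author" content="jvoisin"/>' is dropped from the output
    # but recorded, so get_meta() returns {'author': 'jvoisin'}; a
    # self-closing tag from tag_required_blocklist is re-emitted as a bare
    # '<tag />' instead, as long as it isn't nested inside a blocklisted tag.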

    def remove_all(self, output_filename: str) -> bool:
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(self.__textrepr)
        return True

    def get_meta(self) -> Dict[str, Any]:
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))
        return self.__meta