1
0
mirror of synced 2024-11-22 01:04:23 +01:00

Improve the robustness of the HTML parser

This commit is contained in:
jvoisin 2019-12-15 06:50:54 -08:00
parent f67cd9d7dc
commit efa525c102

View File

@ -104,6 +104,15 @@ class _HTMLParser(parser.HTMLParser):
self.tag_required_blocklist = required_blocklisted_tags self.tag_required_blocklist = required_blocklisted_tags
self.tag_blocklist = blocklisted_tags self.tag_blocklist = blocklisted_tags
def error(self, message):  # pragma: no cover
    """Report a parse failure by raising ``ValueError``.

    The parent class of parser.HTMLParser expects its subclasses to
    supply this method, a requirement that Python's documentation does
    not mention. Fuzzing exposed the gap by triggering:

        NotImplementedError: subclasses of ParserBase must override error()

    Args:
        message: description of the problem, forwarded unchanged to
            the raised exception.

    Raises:
        ValueError: always, carrying ``message``.
    """
    raise ValueError(message)
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
# Ignore the type, because mypy is too stupid to infer # Ignore the type, because mypy is too stupid to infer
# that get_starttag_text() can't return None. # that get_starttag_text() can't return None.