2019-02-27 23:04:38 +01:00
|
|
|
from html import parser, escape
|
2023-01-28 16:57:20 +01:00
|
|
|
from typing import Any, Optional, Dict, List, Tuple, Set
|
2019-02-21 01:28:11 +01:00
|
|
|
import re
|
|
|
|
import string
|
|
|
|
|
|
|
|
from . import abstract
|
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
|
|
|
|
# pylint: disable=too-many-instance-attributes
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter.

    `self.filename` and `self.output_filename` are not set in this class;
    presumably they are provided by `abstract.AbstractParser`.
    """
    mimetypes = {'text/css', }
    # DOTALL lets `.` match newlines, so one comment may span several lines.
    flags = re.MULTILINE | re.DOTALL

    def __read_source(self) -> str:
        """Return the whole content of `self.filename`, decoded as UTF-8.

        :raises ValueError: if the file is not valid UTF-8.
        """
        with open(self.filename, encoding='utf-8') as f:
            try:
                return f.read()
            except UnicodeDecodeError as err:  # pragma: no cover
                raise ValueError from err

    def remove_all(self) -> bool:
        """Write a copy of the file, stripped of every `/* … */` comment,
        to `self.output_filename`.

        :returns: always `True`.
        :raises ValueError: if the input file is not valid UTF-8.
        """
        content = self.__read_source()
        # `count` and `flags` are passed by keyword: passing them
        # positionally is deprecated since Python 3.13.
        cleaned = re.sub(r'/\*.*?\*/', '', content, count=0, flags=self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return what is found in the comments of the file.

        Every line of every comment is inspected: a line containing a
        colon is stored as `key: value` (split on the first colon only, so
        values such as URLs are kept whole), any other line is stored under
        the `'harmful data'` key, the last such line winning.

        :raises ValueError: if the input file is not valid UTF-8.
        """
        metadata = {}
        content = self.__read_source()
        for comment in re.findall(r'/\*(.*?)\*/', content, flags=self.flags):
            for line in comment.splitlines():
                key, colon, value = line.partition(':')
                if colon:
                    # Also strip the decorative `*` that usually starts
                    # the lines of a multi-line comment.
                    metadata[key.strip(string.whitespace + '*')] = value.strip()
                else:
                    metadata['harmful data'] = line.strip()
        return metadata
|
|
|
|
|
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
class AbstractHTMLParser(abstract.AbstractParser):
    """Common base for the html/xml-based formats.

    The whole file is fed to a `_HTMLParser` when the object is built;
    `get_meta` and `remove_all` only delegate to that parser. Subclasses
    select the tags to act upon through the two class attributes below.
    """
    # Tags handed to `_HTMLParser` as its blocklisted tags.
    tags_blocklist: Set[str] = set()
    # In some html/xml-based formats some tags are mandatory,
    # so we're keeping them, but are discarding their content
    tags_required_blocklist: Set[str] = set()

    def __init__(self, filename):
        """Parse `filename` (read as UTF-8) right away."""
        super().__init__(filename)
        html_parser = _HTMLParser(
            self.filename,
            self.tags_blocklist,
            self.tags_required_blocklist,
        )
        self.__parser = html_parser
        with open(filename, encoding='utf-8') as handle:
            markup = handle.read()
            html_parser.feed(markup)
        html_parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected by the underlying parser."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Have the underlying parser write its output to
        `self.output_filename`, and return its result."""
        destination = self.output_filename
        return self.__parser.remove_all(destination)
|
|
|
|
|
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
class HTMLParser(AbstractHTMLParser):
    """HTML and XHTML files: `meta` is blocklisted, while `title` is
    required but blocklisted (kept, with its content discarded)."""
    mimetypes = {
        'application/xhtml+xml',
        'text/html',
    }
    tags_blocklist = {'meta'}
    tags_required_blocklist = {'title'}
|
2019-02-27 23:04:38 +01:00
|
|
|
|
|
|
|
|
|
|
|
class DTBNCXParser(AbstractHTMLParser):
    """`application/x-dtbncx+xml` files: `title`, `doctitle` and `meta`
    are required but blocklisted (kept, with their content discarded)."""
    mimetypes = {'application/x-dtbncx+xml'}
    tags_required_blocklist = {
        'doctitle',
        'meta',
        'title',
    }
|
2019-02-27 23:04:38 +01:00
|
|
|
|
|
|
|
|
2019-02-21 01:28:11 +01:00
|
|
|
class _HTMLParser(parser.HTMLParser):
|
|
|
|
"""Python doesn't have a validating html parser in its stdlib, so
|
|
|
|
we're using an internal queue to track all the opening/closing tags,
|
|
|
|
and hoping for the best.
|
2019-02-27 23:53:07 +01:00
|
|
|
|
|
|
|
Moreover, the parser.HTMLParser call doesn't provide a get_endtag_text
|
|
|
|
method, so we have to use get_starttag_text instead, put its result in a
|
|
|
|
LIFO, and transform it in a closing tag when needed.
|
|
|
|
|
|
|
|
Also, gotcha: the `tag` parameters are always in lowercase.
|
2019-02-21 01:28:11 +01:00
|
|
|
"""
|
2019-02-20 00:45:27 +01:00
|
|
|
def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
|
2019-02-21 01:28:11 +01:00
|
|
|
super().__init__()
|
|
|
|
self.filename = filename
|
|
|
|
self.__textrepr = ''
|
|
|
|
self.__meta = {}
|
2023-05-03 22:28:02 +02:00
|
|
|
self.__validation_queue: List[str] = list()
|
2019-02-27 23:53:07 +01:00
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
# We're using counters instead of booleans, to handle nested tags
|
|
|
|
self.__in_dangerous_but_required_tag = 0
|
2019-02-21 01:28:11 +01:00
|
|
|
self.__in_dangerous_tag = 0
|
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if required_blocklisted_tags & blocklisted_tags: # pragma: nocover
|
2019-02-27 23:04:38 +01:00
|
|
|
raise ValueError("There is an overlap between %s and %s" % (
|
2019-02-20 00:45:27 +01:00
|
|
|
required_blocklisted_tags, blocklisted_tags))
|
|
|
|
self.tag_required_blocklist = required_blocklisted_tags
|
|
|
|
self.tag_blocklist = blocklisted_tags
|
2019-02-27 23:04:38 +01:00
|
|
|
|
2019-12-15 15:50:54 +01:00
|
|
|
def error(self, message): # pragma: no cover
|
|
|
|
""" Amusingly, Python's documentation doesn't mention that this
|
|
|
|
function needs to be implemented in subclasses of the parent class
|
|
|
|
of parser.HTMLParser. This was found by fuzzing,
|
|
|
|
triggering the following exception:
|
|
|
|
NotImplementedError: subclasses of ParserBase must override error()
|
|
|
|
"""
|
|
|
|
raise ValueError(message)
|
|
|
|
|
2023-01-28 16:57:20 +01:00
|
|
|
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
|
2019-07-22 23:28:51 +02:00
|
|
|
# Ignore the type, because mypy is too stupid to infer
|
|
|
|
# that get_starttag_text() can't return None.
|
|
|
|
original_tag = self.get_starttag_text() # type: ignore
|
|
|
|
self.__validation_queue.append(original_tag) # type: ignore
|
2019-02-27 23:04:38 +01:00
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_blocklist:
|
2019-02-21 01:28:11 +01:00
|
|
|
self.__in_dangerous_tag += 1
|
|
|
|
|
|
|
|
if self.__in_dangerous_tag == 0:
|
2019-02-27 23:53:07 +01:00
|
|
|
if self.__in_dangerous_but_required_tag == 0:
|
2019-02-27 23:04:38 +01:00
|
|
|
self.__textrepr += original_tag
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_required_blocklist:
|
2019-02-27 23:53:07 +01:00
|
|
|
self.__in_dangerous_but_required_tag += 1
|
|
|
|
|
2019-02-21 01:28:11 +01:00
|
|
|
def handle_endtag(self, tag: str):
|
|
|
|
if not self.__validation_queue:
|
|
|
|
raise ValueError("The closing tag %s doesn't have a corresponding "
|
|
|
|
"opening one in %s." % (tag, self.filename))
|
|
|
|
|
|
|
|
previous_tag = self.__validation_queue.pop()
|
2019-02-27 23:04:38 +01:00
|
|
|
previous_tag = previous_tag[1:-1] # remove < and >
|
|
|
|
previous_tag = previous_tag.split(' ')[0] # remove attributes
|
|
|
|
if tag != previous_tag.lower():
|
2019-02-21 01:28:11 +01:00
|
|
|
raise ValueError("The closing tag %s doesn't match the previous "
|
|
|
|
"tag %s in %s" %
|
|
|
|
(tag, previous_tag, self.filename))
|
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_required_blocklist:
|
2019-02-27 23:53:07 +01:00
|
|
|
self.__in_dangerous_but_required_tag -= 1
|
|
|
|
|
2019-02-21 01:28:11 +01:00
|
|
|
if self.__in_dangerous_tag == 0:
|
2019-02-27 23:53:07 +01:00
|
|
|
if self.__in_dangerous_but_required_tag == 0:
|
2019-02-27 23:04:38 +01:00
|
|
|
# There is no `get_endtag_text()` method :/
|
|
|
|
self.__textrepr += '</' + previous_tag + '>'
|
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_blocklist:
|
2019-02-27 23:04:38 +01:00
|
|
|
self.__in_dangerous_tag -= 1
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
def handle_data(self, data: str):
|
2019-02-27 23:04:38 +01:00
|
|
|
if self.__in_dangerous_but_required_tag == 0:
|
|
|
|
if self.__in_dangerous_tag == 0:
|
|
|
|
if data.strip():
|
|
|
|
self.__textrepr += escape(data)
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2019-12-29 14:45:20 +01:00
|
|
|
def handle_startendtag(self, tag: str,
|
2023-01-28 16:57:20 +01:00
|
|
|
attrs: List[Tuple[str, Optional[str]]]):
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_required_blocklist | self.tag_blocklist:
|
2019-02-21 01:28:11 +01:00
|
|
|
meta = {k:v for k, v in attrs}
|
|
|
|
name = meta.get('name', 'harmful metadata')
|
|
|
|
content = meta.get('content', 'harmful data')
|
|
|
|
self.__meta[name] = content
|
2019-02-27 23:04:38 +01:00
|
|
|
|
2019-02-27 23:53:07 +01:00
|
|
|
if self.__in_dangerous_tag == 0:
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_required_blocklist:
|
2019-02-27 23:53:07 +01:00
|
|
|
self.__textrepr += '<' + tag + ' />'
|
2019-02-27 23:04:38 +01:00
|
|
|
return
|
|
|
|
|
2019-02-27 23:53:07 +01:00
|
|
|
if self.__in_dangerous_tag == 0:
|
|
|
|
if self.__in_dangerous_but_required_tag == 0:
|
2019-02-21 01:28:11 +01:00
|
|
|
self.__textrepr += self.get_starttag_text()
|
|
|
|
|
|
|
|
def remove_all(self, output_filename: str) -> bool:
|
|
|
|
if self.__validation_queue:
|
|
|
|
raise ValueError("Some tags (%s) were left unclosed in %s" % (
|
|
|
|
', '.join(self.__validation_queue),
|
|
|
|
self.filename))
|
|
|
|
with open(output_filename, 'w', encoding='utf-8') as f:
|
|
|
|
f.write(self.__textrepr)
|
|
|
|
return True
|
|
|
|
|
2023-01-28 16:57:20 +01:00
|
|
|
def get_meta(self) -> Dict[str, Any]:
|
2019-02-21 01:28:11 +01:00
|
|
|
if self.__validation_queue:
|
|
|
|
raise ValueError("Some tags (%s) were left unclosed in %s" % (
|
|
|
|
', '.join(self.__validation_queue),
|
|
|
|
self.filename))
|
|
|
|
return self.__meta
|