from html import parser, escape
from typing import Any, Optional, Dict, List, Tuple, Set
import re
import string

from . import abstract

# pylint: disable=too-many-instance-attributes

class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL
    def remove_all(self) -> bool:
        with open(self.filename, encoding='utf-8') as f:
            try:
                content = f.read()
            except UnicodeDecodeError:  # pragma: no cover
                raise ValueError
        cleaned = re.sub(r'/\*.*?\*/', '', content, 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True
    def get_meta(self) -> Dict[str, Any]:
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            try:
                content = f.read()
            except UnicodeDecodeError:  # pragma: no cover
                raise ValueError
        cssdoc = re.findall(r'/\*(.*?)\*/', content, self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                try:
                    k, v = line.split(':')
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
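
# Usage sketch (illustrative, not part of the library): assuming a local file
# named "style.css" containing `/* Author: Jane Doe */ body { color: red; }`,
# CSSParser would behave roughly like this:
#
#     p = CSSParser("style.css")
#     p.get_meta()    # -> {'Author': 'Jane Doe'}
#     p.remove_all()  # writes a comment-free copy to p.output_filename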

class AbstractHTMLParser(abstract.AbstractParser):
    tags_blocklist = set()  # type: Set[str]

    # In some html/xml-based formats some tags are mandatory,
    # so we're keeping them, but are discarding their content
    tags_required_blocklist = set()  # type: Set[str]

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename, self.tags_blocklist,
                                    self.tags_required_blocklist)
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        return self.__parser.remove_all(self.output_filename)

class HTMLParser(AbstractHTMLParser):
    mimetypes = {'text/html', 'application/xhtml+xml'}
    tags_blocklist = {'meta', }
    tags_required_blocklist = {'title', }


class DTBNCXParser(AbstractHTMLParser):
    mimetypes = {'application/x-dtbncx+xml', }
    tags_required_blocklist = {'title', 'doctitle', 'meta'}
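
# Usage sketch (illustrative, not part of the library): assuming a local file
# named "page.html" containing
#     <html><head><meta name="generator" content="SomeEditor"/>
#     <title>secret</title></head><body>hello</body></html>
# HTMLParser would behave roughly like this:
#
#     p = HTMLParser("page.html")
#     p.get_meta()    # -> {'generator': 'SomeEditor'}
#     p.remove_all()  # writes a copy where the <meta/> tag is dropped and the
#                     # content of <title> is discarded (the empty tag is kept)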

class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    Moreover, parser.HTMLParser doesn't provide a get_endtag_text method,
    so we have to use get_starttag_text instead, put its result in a LIFO,
    and transform it into a closing tag when needed.

    Also, gotcha: the `tag` parameters are always in lowercase.
    """
    def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
        super().__init__()
        self.filename = filename
        self.__textrepr = ''
        self.__meta = {}
        self.__validation_queue = []  # type: list[str]

        # We're using counters instead of booleans, to handle nested tags
        self.__in_dangerous_but_required_tag = 0
        self.__in_dangerous_tag = 0

        if required_blocklisted_tags & blocklisted_tags:  # pragma: nocover
            raise ValueError("There is an overlap between %s and %s" % (
                required_blocklisted_tags, blocklisted_tags))
        self.tag_required_blocklist = required_blocklisted_tags
        self.tag_blocklist = blocklisted_tags

    def error(self, message):  # pragma: no cover
        """ Amusingly, Python's documentation doesn't mention that this
        function needs to be implemented in subclasses of the parent class
        of parser.HTMLParser. This was found by fuzzing,
        triggering the following exception:
            NotImplementedError: subclasses of ParserBase must override error()
        """
        raise ValueError(message)

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        # Ignore the type, because mypy is too stupid to infer
        # that get_starttag_text() can't return None.
        original_tag = self.get_starttag_text()  # type: ignore
        self.__validation_queue.append(original_tag)  # type: ignore

        if tag in self.tag_blocklist:
            self.__in_dangerous_tag += 1

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                self.__textrepr += original_tag

        if tag in self.tag_required_blocklist:
            self.__in_dangerous_but_required_tag += 1

    def handle_endtag(self, tag: str):
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        previous_tag = self.__validation_queue.pop()
        previous_tag = previous_tag[1:-1]  # remove < and >
        previous_tag = previous_tag.split(' ')[0]  # remove attributes
        if tag != previous_tag.lower():
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))

        if tag in self.tag_required_blocklist:
            self.__in_dangerous_but_required_tag -= 1
        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                # There is no `get_endtag_text()` method :/
                self.__textrepr += '</' + previous_tag + '>'

        if tag in self.tag_blocklist:
            self.__in_dangerous_tag -= 1

    def handle_data(self, data: str):
        if self.__in_dangerous_but_required_tag == 0:
            if self.__in_dangerous_tag == 0:
                if data.strip():
                    self.__textrepr += escape(data)

    def handle_startendtag(self, tag: str,
                           attrs: List[Tuple[str, Optional[str]]]):
        if tag in self.tag_required_blocklist | self.tag_blocklist:
            meta = {k: v for k, v in attrs}
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content

            if self.__in_dangerous_tag == 0:
                if tag in self.tag_required_blocklist:
                    self.__textrepr += '<' + tag + ' />'
                return

        if self.__in_dangerous_tag == 0:
            if self.__in_dangerous_but_required_tag == 0:
                self.__textrepr += self.get_starttag_text()

    def remove_all(self, output_filename: str) -> bool:
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(self.__textrepr)
        return True

    def get_meta(self) -> Dict[str, Any]:
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))
        return self.__meta
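
# Validation sketch (illustrative, not part of the library): because every
# opening tag is pushed onto the internal queue, malformed markup surfaces as
# a ValueError instead of silently producing a half-cleaned file. The filename
# passed to _HTMLParser is only used for error messages here.
#
#     p = _HTMLParser("page.html", {'meta'}, {'title'})
#     p.feed("<html><p>hello</div></html>")
#     # -> ValueError: the closing tag doesn't match the previous one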