2019-02-27 23:04:38 +01:00
|
|
|
from html import parser, escape
|
2019-07-13 15:02:01 +02:00
|
|
|
from typing import Dict, Any, List, Tuple, Set, Optional
|
2019-02-21 01:28:11 +01:00
|
|
|
import re
|
|
|
|
import string
|
|
|
|
|
|
|
|
from . import abstract
|
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
assert Set  # silence pyflakes: Set is only referenced in `# type:` comments
|
|
|
|
|
|
|
|
# pylint: disable=too-many-instance-attributes
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files,
    only comments of the form `/* … */`, so we're removing the latter."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Write a copy of the file to `self.output_filename`, with every
        `/* … */` comment removed.

        :raises ValueError: if the file isn't valid utf-8.
        """
        with open(self.filename, encoding='utf-8') as f:
            try:
                content = f.read()
            except UnicodeDecodeError as e:  # pragma: no cover
                raise ValueError from e
        cleaned = re.sub(r'/\*.*?\*/', '', content, 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the `key: value` pairs found in `/* … */` comments;
        comment lines that don't look like `key: value` are reported
        under the 'harmful data' key.

        :raises ValueError: if the file isn't valid utf-8.
        """
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            try:
                content = f.read()
            except UnicodeDecodeError as e:  # pragma: no cover
                raise ValueError from e
        cssdoc = re.findall(r'/\*(.*?)\*/', content, self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                try:
                    # Split on the *first* colon only, so values that
                    # contain colons themselves (urls, timestamps, …)
                    # aren't misclassified as harmful data.
                    k, v = line.split(':', 1)
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                except ValueError:
                    metadata['harmful data'] = line.strip()
        return metadata
|
|
|
|
|
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
class AbstractHTMLParser(abstract.AbstractParser):
    """Base class for html/xml-like formats: the actual work is delegated
    to an internal `_HTMLParser` fed with the whole file at construction
    time."""
    # Tags removed entirely, content included.
    tags_blocklist = set()  # type: Set[str]
    # In some html/xml-based formats some tags are mandatory,
    # so we're keeping them, but are discarding their content
    tags_required_blocklist = set()  # type: Set[str]

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename, self.tags_blocklist,
                                    self.tags_required_blocklist)
        with open(filename, encoding='utf-8') as f:
            content = f.read()
        self.__parser.feed(content)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected while parsing the file."""
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """Write a cleaned-up copy of the file to `self.output_filename`."""
        return self.__parser.remove_all(self.output_filename)
|
|
|
|
|
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
class HTMLParser(AbstractHTMLParser):
    """Parser for html and xhtml documents: `meta` tags are dropped
    outright, while the mandatory `title` tag is kept but emptied."""
    mimetypes = {'text/html', 'application/xhtml+xml'}
    tags_blocklist = {'meta'}
    tags_required_blocklist = {'title'}
|
2019-02-27 23:04:38 +01:00
|
|
|
|
|
|
|
|
|
|
|
class DTBNCXParser(AbstractHTMLParser):
    """Parser for `.ncx` navigation control files (digital talking
    books / epub), whose mandatory tags are kept but emptied."""
    mimetypes = {'application/x-dtbncx+xml', }
    tags_required_blocklist = {'title', 'doctitle', 'meta'}
|
2019-02-27 23:04:38 +01:00
|
|
|
|
|
|
|
|
2019-02-21 01:28:11 +01:00
|
|
|
class _HTMLParser(parser.HTMLParser):
|
|
|
|
"""Python doesn't have a validating html parser in its stdlib, so
|
|
|
|
we're using an internal queue to track all the opening/closing tags,
|
|
|
|
and hoping for the best.
|
2019-02-27 23:53:07 +01:00
|
|
|
|
|
|
|
Moreover, the parser.HTMLParser call doesn't provide a get_endtag_text
|
|
|
|
method, so we have to use get_starttag_text instead, put its result in a
|
|
|
|
LIFO, and transform it in a closing tag when needed.
|
|
|
|
|
|
|
|
Also, gotcha: the `tag` parameters are always in lowercase.
|
2019-02-21 01:28:11 +01:00
|
|
|
"""
|
2019-02-20 00:45:27 +01:00
|
|
|
def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
|
2019-02-21 01:28:11 +01:00
|
|
|
super().__init__()
|
|
|
|
self.filename = filename
|
|
|
|
self.__textrepr = ''
|
|
|
|
self.__meta = {}
|
2019-02-27 23:04:38 +01:00
|
|
|
self.__validation_queue = [] # type: List[str]
|
2019-02-27 23:53:07 +01:00
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
# We're using counters instead of booleans, to handle nested tags
|
|
|
|
self.__in_dangerous_but_required_tag = 0
|
2019-02-21 01:28:11 +01:00
|
|
|
self.__in_dangerous_tag = 0
|
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if required_blocklisted_tags & blocklisted_tags: # pragma: nocover
|
2019-02-27 23:04:38 +01:00
|
|
|
raise ValueError("There is an overlap between %s and %s" % (
|
2019-02-20 00:45:27 +01:00
|
|
|
required_blocklisted_tags, blocklisted_tags))
|
|
|
|
self.tag_required_blocklist = required_blocklisted_tags
|
|
|
|
self.tag_blocklist = blocklisted_tags
|
2019-02-27 23:04:38 +01:00
|
|
|
|
2022-03-29 22:18:06 +02:00
|
|
|
# pylint: disable=R0201
|
2019-12-15 15:50:54 +01:00
|
|
|
def error(self, message): # pragma: no cover
|
|
|
|
""" Amusingly, Python's documentation doesn't mention that this
|
|
|
|
function needs to be implemented in subclasses of the parent class
|
|
|
|
of parser.HTMLParser. This was found by fuzzing,
|
|
|
|
triggering the following exception:
|
|
|
|
NotImplementedError: subclasses of ParserBase must override error()
|
|
|
|
"""
|
|
|
|
raise ValueError(message)
|
|
|
|
|
2019-07-13 15:02:01 +02:00
|
|
|
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
|
2019-07-22 23:28:51 +02:00
|
|
|
# Ignore the type, because mypy is too stupid to infer
|
|
|
|
# that get_starttag_text() can't return None.
|
|
|
|
original_tag = self.get_starttag_text() # type: ignore
|
|
|
|
self.__validation_queue.append(original_tag) # type: ignore
|
2019-02-27 23:04:38 +01:00
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_blocklist:
|
2019-02-21 01:28:11 +01:00
|
|
|
self.__in_dangerous_tag += 1
|
|
|
|
|
|
|
|
if self.__in_dangerous_tag == 0:
|
2019-02-27 23:53:07 +01:00
|
|
|
if self.__in_dangerous_but_required_tag == 0:
|
2019-02-27 23:04:38 +01:00
|
|
|
self.__textrepr += original_tag
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_required_blocklist:
|
2019-02-27 23:53:07 +01:00
|
|
|
self.__in_dangerous_but_required_tag += 1
|
|
|
|
|
2019-02-21 01:28:11 +01:00
|
|
|
def handle_endtag(self, tag: str):
|
|
|
|
if not self.__validation_queue:
|
|
|
|
raise ValueError("The closing tag %s doesn't have a corresponding "
|
|
|
|
"opening one in %s." % (tag, self.filename))
|
|
|
|
|
|
|
|
previous_tag = self.__validation_queue.pop()
|
2019-02-27 23:04:38 +01:00
|
|
|
previous_tag = previous_tag[1:-1] # remove < and >
|
|
|
|
previous_tag = previous_tag.split(' ')[0] # remove attributes
|
|
|
|
if tag != previous_tag.lower():
|
2019-02-21 01:28:11 +01:00
|
|
|
raise ValueError("The closing tag %s doesn't match the previous "
|
|
|
|
"tag %s in %s" %
|
|
|
|
(tag, previous_tag, self.filename))
|
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_required_blocklist:
|
2019-02-27 23:53:07 +01:00
|
|
|
self.__in_dangerous_but_required_tag -= 1
|
|
|
|
|
2019-02-21 01:28:11 +01:00
|
|
|
if self.__in_dangerous_tag == 0:
|
2019-02-27 23:53:07 +01:00
|
|
|
if self.__in_dangerous_but_required_tag == 0:
|
2019-02-27 23:04:38 +01:00
|
|
|
# There is no `get_endtag_text()` method :/
|
|
|
|
self.__textrepr += '</' + previous_tag + '>'
|
|
|
|
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_blocklist:
|
2019-02-27 23:04:38 +01:00
|
|
|
self.__in_dangerous_tag -= 1
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
def handle_data(self, data: str):
|
2019-02-27 23:04:38 +01:00
|
|
|
if self.__in_dangerous_but_required_tag == 0:
|
|
|
|
if self.__in_dangerous_tag == 0:
|
|
|
|
if data.strip():
|
|
|
|
self.__textrepr += escape(data)
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2019-12-29 14:45:20 +01:00
|
|
|
def handle_startendtag(self, tag: str,
|
|
|
|
attrs: List[Tuple[str, Optional[str]]]):
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_required_blocklist | self.tag_blocklist:
|
2019-02-21 01:28:11 +01:00
|
|
|
meta = {k:v for k, v in attrs}
|
|
|
|
name = meta.get('name', 'harmful metadata')
|
|
|
|
content = meta.get('content', 'harmful data')
|
|
|
|
self.__meta[name] = content
|
2019-02-27 23:04:38 +01:00
|
|
|
|
2019-02-27 23:53:07 +01:00
|
|
|
if self.__in_dangerous_tag == 0:
|
2019-02-20 00:45:27 +01:00
|
|
|
if tag in self.tag_required_blocklist:
|
2019-02-27 23:53:07 +01:00
|
|
|
self.__textrepr += '<' + tag + ' />'
|
2019-02-27 23:04:38 +01:00
|
|
|
return
|
|
|
|
|
2019-02-27 23:53:07 +01:00
|
|
|
if self.__in_dangerous_tag == 0:
|
|
|
|
if self.__in_dangerous_but_required_tag == 0:
|
2019-02-21 01:28:11 +01:00
|
|
|
self.__textrepr += self.get_starttag_text()
|
|
|
|
|
|
|
|
def remove_all(self, output_filename: str) -> bool:
|
|
|
|
if self.__validation_queue:
|
|
|
|
raise ValueError("Some tags (%s) were left unclosed in %s" % (
|
|
|
|
', '.join(self.__validation_queue),
|
|
|
|
self.filename))
|
|
|
|
with open(output_filename, 'w', encoding='utf-8') as f:
|
|
|
|
f.write(self.__textrepr)
|
|
|
|
return True
|
|
|
|
|
|
|
|
def get_meta(self) -> Dict[str, Any]:
|
|
|
|
if self.__validation_queue:
|
|
|
|
raise ValueError("Some tags (%s) were left unclosed in %s" % (
|
|
|
|
', '.join(self.__validation_queue),
|
|
|
|
self.filename))
|
|
|
|
return self.__meta
|