mat2/libmat2/web.py

from html import parser, escape
from typing import Dict, Any, List, Tuple, Set, Optional
import re
import string

from . import abstract

assert Set

# pylint: disable=too-many-instance-attributes

class CSSParser(abstract.AbstractParser):
    """Parser for CSS stylesheets.

    There is no such thing as metadata in CSS files, only comments of the
    form `/* … */`, so we're removing the latter.
    """
    mimetypes = {'text/css', }
    # DOTALL lets `.` span newlines, so multi-line comments are matched too.
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """Write a comment-free copy of `self.filename` to
        `self.output_filename`.

        Every `/* … */` span (including the empty `/**/`) is removed; the
        rest of the stylesheet is copied verbatim.

        :return: always `True`.
        """
        with open(self.filename, encoding='utf-8') as f:
            # `count` and `flags` are given by keyword: passing them
            # positionally to `re.sub` is deprecated since Python 3.13.
            cleaned = re.sub(r'/\*.*?\*/', '', f.read(),
                             count=0, flags=self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return what is found inside the comments of `self.filename`.

        Each line of each comment is inspected: a line holding exactly one
        `:` is stored as a `key: value` pair (the key is stripped of
        whitespace and `*`, the value of whitespace). Any other line is
        stored under the `'harmful data'` key, each one overwriting the
        previous.

        :return: a dict of the collected entries, empty when the file
            has no comments.
        """
        metadata = {}  # type: Dict[str, Any]
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.*?)\*/', f.read(), flags=self.flags)
        for match in cssdoc:
            for line in match.splitlines():
                fields = line.split(':')
                if len(fields) == 2:
                    k, v = fields
                    metadata[k.strip(string.whitespace + '*')] = v.strip()
                else:
                    # Not a `key: value` line: keep it as opaque data.
                    metadata['harmful data'] = line.strip()
        return metadata


class AbstractHTMLParser(abstract.AbstractParser):
    """Common base for the HTML/XML-flavoured parsers.

    The file is parsed once, at construction time, by a `_HTMLParser`
    configured with the two class-level tag sets below; `get_meta` and
    `remove_all` simply forward to that parser.
    """
    # Tags handed to `_HTMLParser` as its blocklist.
    tags_blocklist = set()  # type: Set[str]
    # In some html/xml-based formats some tags are mandatory,
    # so we're keeping them, but are discarding their content
    tags_required_blocklist = set()  # type: Set[str]

    def __init__(self, filename):
        """Parse `filename` (read as UTF-8) right away."""
        super().__init__(filename)
        html_parser = _HTMLParser(
            self.filename,
            self.tags_blocklist,
            self.tags_required_blocklist,
        )
        self.__parser = html_parser
        with open(filename, encoding='utf-8') as source:
            markup = source.read()
            html_parser.feed(markup)
        html_parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata gathered by the underlying parser."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Have the underlying parser write the cleaned document to
        `self.output_filename` and return its result."""
        destination = self.output_filename
        return self.__parser.remove_all(destination)


class HTMLParser(AbstractHTMLParser):
    """Parser for HTML and XHTML documents."""
    mimetypes = {'application/xhtml+xml', 'text/html'}
    tags_blocklist = {'meta'}
    tags_required_blocklist = {'title'}


class DTBNCXParser(AbstractHTMLParser):
    """Parser for `application/x-dtbncx+xml` documents."""
    mimetypes = {'application/x-dtbncx+xml'}
    tags_required_blocklist = {'meta', 'doctitle', 'title'}


class _HTMLParser(parser.HTMLParser):
    """Tag-filtering parser that rebuilds a cleaned copy of a document.

    Python doesn't have a validating html parser in its stdlib, so
    we're using an internal stack to track all the opening/closing tags,
    and hoping for the best.

    Moreover, parser.HTMLParser doesn't provide a get_endtag_text
    method, so we have to use get_starttag_text instead, put its result in a
    LIFO, and transform it into a closing tag when needed.

    Also, gotcha: the `tag` parameters are always in lowercase.

    Two sets of tag names drive the filtering:

    - tags in `blocklisted_tags` are dropped along with everything nested
      inside them;
    - tags in `required_blocklisted_tags` are kept, but whatever is nested
      inside them is dropped.
    """
    # Captures the name at the beginning of a raw start tag. The name stops
    # at the first HTML whitespace character, `/` or `>`, so attributes
    # separated from the name by a newline or a tab are handled as well.
    __tag_name_re = re.compile(r'<([^\t\n\r\f />]*)')

    def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
        """
        :param filename: only used in error messages.
        :param blocklisted_tags: set of tag names to drop entirely.
        :param required_blocklisted_tags: set of tag names to keep while
            dropping their content.
        :raises ValueError: if the two sets overlap.
        """
        super().__init__()
        self.filename = filename
        # Pieces of the cleaned document, joined once when written out
        # (cheaper than growing one big string piece by piece).
        self.__chunks = []  # type: List[str]
        self.__meta = {}  # type: Dict[str, Any]
        # Raw text of every start tag that is still open, innermost last.
        self.__validation_queue = []  # type: List[str]

        # We're using counters instead of booleans, to handle nested tags
        self.__in_dangerous_but_required_tag = 0
        self.__in_dangerous_tag = 0

        if required_blocklisted_tags & blocklisted_tags:  # pragma: nocover
            raise ValueError("There is an overlap between %s and %s" % (
                required_blocklisted_tags, blocklisted_tags))
        self.tag_required_blocklist = required_blocklisted_tags
        self.tag_blocklist = blocklisted_tags

    def __is_output_enabled(self) -> bool:
        """Tell whether we are outside of every filtered tag."""
        return (self.__in_dangerous_tag == 0
                and self.__in_dangerous_but_required_tag == 0)

    def __ensure_all_tags_closed(self) -> None:
        """Raise ValueError if some start tags never got closed."""
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed in %s" % (
                ', '.join(self.__validation_queue),
                self.filename))

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Record the opening tag and copy it to the output when allowed."""
        original_tag = self.get_starttag_text()
        if not original_tag:  # empty tag
            return
        self.__validation_queue.append(original_tag)

        # A blocklisted tag hides itself: count it before deciding to emit.
        if tag in self.tag_blocklist:
            self.__in_dangerous_tag += 1

        if self.__is_output_enabled():
            self.__chunks.append(original_tag)

        # A required tag is kept but hides its content: count it only
        # after it was emitted.
        if tag in self.tag_required_blocklist:
            self.__in_dangerous_but_required_tag += 1

    def handle_endtag(self, tag: str):
        """Check that `tag` closes the innermost open tag, and copy the
        closing tag to the output when allowed.

        :raises ValueError: if no tag is open, or if the innermost open
            tag has a different name.
        """
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one in %s." % (tag, self.filename))

        start_text = self.__validation_queue.pop()
        found = self.__tag_name_re.match(start_text)
        # Keep the name as it was spelled in the document, without the
        # brackets and the attributes.
        previous_tag = found.group(1) if found else start_text[1:-1]
        if tag != previous_tag.lower():
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))

        # Mirror of handle_starttag: leave the required tag before deciding
        # to emit, leave the blocklisted tag after.
        if tag in self.tag_required_blocklist:
            self.__in_dangerous_but_required_tag -= 1

        if self.__is_output_enabled():
            # There is no `get_endtag_text()` method :/
            self.__chunks.append('</' + previous_tag + '>')

        if tag in self.tag_blocklist:
            self.__in_dangerous_tag -= 1

    def handle_data(self, data: str):
        """Copy the (escaped) text to the output when allowed.

        Whitespace-only text is never copied.
        """
        if self.__is_output_enabled() and data.strip():
            self.__chunks.append(escape(data))

    def handle_startendtag(self, tag: str,
                           attrs: List[Tuple[str, Optional[str]]]):
        """Handle a self-closing tag (`<tag … />`).

        For a tag in either blocklist, its `name`/`content` attributes are
        recorded as metadata (with `'harmful metadata'`/`'harmful data'` as
        fallbacks); a required one is then emitted stripped of all its
        attributes. Any other tag is copied as is when allowed.
        """
        if tag in self.tag_required_blocklist | self.tag_blocklist:
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content

            if self.__in_dangerous_tag == 0:
                if tag in self.tag_required_blocklist:
                    self.__chunks.append('<' + tag + ' />')
                return

        if self.__is_output_enabled():
            self.__chunks.append(self.get_starttag_text() or '')

    def remove_all(self, output_filename: str) -> bool:
        """Write the cleaned document to `output_filename` as UTF-8.

        :return: always `True`.
        :raises ValueError: if some tags were left unclosed.
        """
        self.__ensure_all_tags_closed()
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(''.join(self.__chunks))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected from the self-closing filtered tags.

        :raises ValueError: if some tags were left unclosed.
        """
        self.__ensure_all_tags_closed()
        return self.__meta
Improve epub support 2019-02-27 23:04:38 +01:00			`from html import parser, escape`
Please mypy 2019-07-13 15:02:01 +02:00			`from typing import Dict, Any, List, Tuple, Set, Optional`
Implement epub support 2019-02-21 01:28:11 +01:00			`import re`
			`import string`

			`from . import abstract`

Improve epub support 2019-02-27 23:04:38 +01:00			`assert Set`

			`# pylint: disable=too-many-instance-attributes`
Implement epub support 2019-02-21 01:28:11 +01:00
			`class CSSParser(abstract.AbstractParser):`
			`"""There is no such things as metadata in CSS files,`
			only comments of the form `/* … */`, so we're removing the laters."""
			`mimetypes = {'text/css', }`
			`flags = re.MULTILINE \| re.DOTALL`

			`def remove_all(self) -> bool:`
			`with open(self.filename, encoding='utf-8') as f:`
Fix a bug in css cleaning It's not mandatory to actually have a comment inside comment delimiter, like `/**/`. 2019-02-23 20:21:11 +01:00			`cleaned = re.sub(r'/\*.*?\*/', '', f.read(), 0, self.flags)`
Implement epub support 2019-02-21 01:28:11 +01:00			`with open(self.output_filename, 'w', encoding='utf-8') as f:`
			`f.write(cleaned)`
			`return True`

			`def get_meta(self) -> Dict[str, Any]:`
			`metadata = {}`
			`with open(self.filename, encoding='utf-8') as f:`
Fix a bug in css cleaning It's not mandatory to actually have a comment inside comment delimiter, like `/**/`. 2019-02-23 20:21:11 +01:00			`cssdoc = re.findall(r'/\*(.*?)\*/', f.read(), self.flags)`
Implement epub support 2019-02-21 01:28:11 +01:00			`for match in cssdoc:`
			`for line in match.splitlines():`
			`try:`
			`k, v = line.split(':')`
			`metadata[k.strip(string.whitespace + '*')] = v.strip()`
			`except ValueError:`
			`metadata['harmful data'] = line.strip()`
			`return metadata`


Improve epub support 2019-02-27 23:04:38 +01:00			`class AbstractHTMLParser(abstract.AbstractParser):`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`tags_blocklist = set() # type: Set[str]`
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00			`# In some html/xml-based formats some tags are mandatory,`
fix typo 2019-03-01 23:00:23 +01:00			`# so we're keeping them, but are discarding their content`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`tags_required_blocklist = set() # type: Set[str]`
Improve epub support 2019-02-27 23:04:38 +01:00
Implement epub support 2019-02-21 01:28:11 +01:00			`def __init__(self, filename):`
			`super().__init__(filename)`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`self.__parser = _HTMLParser(self.filename, self.tags_blocklist,`
			`self.tags_required_blocklist)`
Implement epub support 2019-02-21 01:28:11 +01:00			`with open(filename, encoding='utf-8') as f:`
			`self.__parser.feed(f.read())`
			`self.__parser.close()`

			`def get_meta(self) -> Dict[str, Any]:`
			`return self.__parser.get_meta()`

			`def remove_all(self) -> bool:`
			`return self.__parser.remove_all(self.output_filename)`


Improve epub support 2019-02-27 23:04:38 +01:00			`class HTMLParser(AbstractHTMLParser):`
Add support for xhtml files 2019-04-14 20:36:33 +02:00			`mimetypes = {'text/html', 'application/xhtml+xml'}`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`tags_blocklist = {'meta', }`
			`tags_required_blocklist = {'title', }`
Improve epub support 2019-02-27 23:04:38 +01:00

			`class DTBNCXParser(AbstractHTMLParser):`
			`mimetypes = {'application/x-dtbncx+xml', }`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`tags_required_blocklist = {'title', 'doctitle', 'meta'}`
Improve epub support 2019-02-27 23:04:38 +01:00

Implement epub support 2019-02-21 01:28:11 +01:00			`class _HTMLParser(parser.HTMLParser):`
			`"""Python doesn't have a validating html parser in its stdlib, so`
			`we're using an internal queue to track all the opening/closing tags,`
			`and hoping for the best.`
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00
			`Moreover, the parser.HTMLParser call doesn't provide a get_endtag_text`
			`method, so we have to use get_starttag_text instead, put its result in a`
			`LIFO, and transform it in a closing tag when needed.`

			Also, gotcha: the `tag` parameters are always in lowercase.
Implement epub support 2019-02-21 01:28:11 +01:00			`"""`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):`
Implement epub support 2019-02-21 01:28:11 +01:00			`super().__init__()`
			`self.filename = filename`
			`self.__textrepr = ''`
			`self.__meta = {}`
Improve epub support 2019-02-27 23:04:38 +01:00			`self.__validation_queue = [] # type: List[str]`
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00
Improve epub support 2019-02-27 23:04:38 +01:00			`# We're using counters instead of booleans, to handle nested tags`
			`self.__in_dangerous_but_required_tag = 0`
Implement epub support 2019-02-21 01:28:11 +01:00			`self.__in_dangerous_tag = 0`

Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`if required_blocklisted_tags & blocklisted_tags: # pragma: nocover`
Improve epub support 2019-02-27 23:04:38 +01:00			`raise ValueError("There is an overlap between %s and %s" % (`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`required_blocklisted_tags, blocklisted_tags))`
			`self.tag_required_blocklist = required_blocklisted_tags`
			`self.tag_blocklist = blocklisted_tags`
Improve epub support 2019-02-27 23:04:38 +01:00
Please mypy 2019-07-13 15:02:01 +02:00			`def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):`
Improve epub support 2019-02-27 23:04:38 +01:00			`original_tag = self.get_starttag_text()`
Please mypy 2019-07-13 23:25:44 +02:00			`if not original_tag: # empty tag`
			`return`
Improve epub support 2019-02-27 23:04:38 +01:00			`self.__validation_queue.append(original_tag)`

Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`if tag in self.tag_blocklist:`
Implement epub support 2019-02-21 01:28:11 +01:00			`self.__in_dangerous_tag += 1`

			`if self.__in_dangerous_tag == 0:`
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00			`if self.__in_dangerous_but_required_tag == 0:`
Improve epub support 2019-02-27 23:04:38 +01:00			`self.__textrepr += original_tag`
Implement epub support 2019-02-21 01:28:11 +01:00
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`if tag in self.tag_required_blocklist:`
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00			`self.__in_dangerous_but_required_tag += 1`

Implement epub support 2019-02-21 01:28:11 +01:00			`def handle_endtag(self, tag: str):`
			`if not self.__validation_queue:`
			`raise ValueError("The closing tag %s doesn't have a corresponding "`
			`"opening one in %s." % (tag, self.filename))`

			`previous_tag = self.__validation_queue.pop()`
Improve epub support 2019-02-27 23:04:38 +01:00			`previous_tag = previous_tag[1:-1] # remove < and >`
			`previous_tag = previous_tag.split(' ')[0] # remove attributes`
			`if tag != previous_tag.lower():`
Implement epub support 2019-02-21 01:28:11 +01:00			`raise ValueError("The closing tag %s doesn't match the previous "`
			`"tag %s in %s" %`
			`(tag, previous_tag, self.filename))`

Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`if tag in self.tag_required_blocklist:`
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00			`self.__in_dangerous_but_required_tag -= 1`

Implement epub support 2019-02-21 01:28:11 +01:00			`if self.__in_dangerous_tag == 0:`
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00			`if self.__in_dangerous_but_required_tag == 0:`
Improve epub support 2019-02-27 23:04:38 +01:00			# There is no `get_endtag_text()` method :/
			`self.__textrepr += '</' + previous_tag + '>'`

Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`if tag in self.tag_blocklist:`
Improve epub support 2019-02-27 23:04:38 +01:00			`self.__in_dangerous_tag -= 1`
Implement epub support 2019-02-21 01:28:11 +01:00
			`def handle_data(self, data: str):`
Improve epub support 2019-02-27 23:04:38 +01:00			`if self.__in_dangerous_but_required_tag == 0:`
			`if self.__in_dangerous_tag == 0:`
			`if data.strip():`
			`self.__textrepr += escape(data)`
Implement epub support 2019-02-21 01:28:11 +01:00
			`def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`if tag in self.tag_required_blocklist \| self.tag_blocklist:`
Implement epub support 2019-02-21 01:28:11 +01:00			`meta = {k:v for k, v in attrs}`
			`name = meta.get('name', 'harmful metadata')`
			`content = meta.get('content', 'harmful data')`
			`self.__meta[name] = content`
Improve epub support 2019-02-27 23:04:38 +01:00
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00			`if self.__in_dangerous_tag == 0:`
Refactor {black,white}list into {block,allow}list Closes #96 2019-02-20 00:45:27 +01:00			`if tag in self.tag_required_blocklist:`
Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00			`self.__textrepr += '<' + tag + ' />'`
Improve epub support 2019-02-27 23:04:38 +01:00			`return`

Improve the previous commit - More tests - More documentation - Minor code cleanup 2019-02-27 23:53:07 +01:00			`if self.__in_dangerous_tag == 0:`
			`if self.__in_dangerous_but_required_tag == 0:`
Implement epub support 2019-02-21 01:28:11 +01:00			`self.__textrepr += self.get_starttag_text()`

			`def remove_all(self, output_filename: str) -> bool:`
			`if self.__validation_queue:`
			`raise ValueError("Some tags (%s) were left unclosed in %s" % (`
			`', '.join(self.__validation_queue),`
			`self.filename))`
			`with open(output_filename, 'w', encoding='utf-8') as f:`
			`f.write(self.__textrepr)`
			`return True`

			`def get_meta(self) -> Dict[str, Any]:`
			`if self.__validation_queue:`
			`raise ValueError("Some tags (%s) were left unclosed in %s" % (`
			`', '.join(self.__validation_queue),`
			`self.filename))`
			`return self.__meta`