1
0
mirror of synced 2024-11-26 03:04:22 +01:00
mat2/libmat2/html.py
2019-02-08 23:05:18 +01:00

70 lines
2.2 KiB
Python

from html import parser
from typing import Dict, Any, List, Tuple
from . import abstract
class HTMLParser(abstract.AbstractParser):
    """Parser for `text/html` files.

    The whole file is read once at construction time and handed to an
    internal `_HTMLParser`; `get_meta` and `remove_all` simply delegate
    to that object.
    """

    mimetypes = {'text/html'}

    def __init__(self, filename):
        """Read `filename` and feed its content to the internal parser.

        The internal parser raises `ValueError` while being fed if it meets
        a closing tag that doesn't match the last opened one.
        """
        super().__init__(filename)
        delegate = self.__parser = _HTMLParser()
        with open(filename) as source:
            markup = source.read()
        delegate.feed(markup)
        # Flush whatever the parser is still buffering.
        delegate.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected by the internal parser."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Write the cleaned document and return the internal parser's result.

        The destination is `self.output_filename`, which isn't set in this
        class -- presumably by `abstract.AbstractParser`; verify there.
        """
        destination = self.output_filename
        return self.__parser.remove_all(destination)
class _HTMLParser(parser.HTMLParser):
    """Rebuild an HTML document while stripping its `<meta>` tags.

    Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    Void elements (`<meta>`, `<br>`, `<img>`, ...) never take a closing
    tag, so they are kept out of the queue whether or not they are written
    with a trailing slash. Otherwise a plain `<br>` would make every
    following closing tag look mismatched, and `<meta ...></meta>` would be
    copied untouched into the cleaned output.
    """

    # Elements defined by the HTML standard as having no closing tag.
    _VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        # Pieces of the cleaned document, joined when it is written out.
        self.__chunks = []
        # Metadata found in `<meta>` tags: name -> content.
        self.__meta = {}
        # Names of the tags that are currently open, innermost last.
        self.__validation_queue = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Copy an opening tag to the output and remember it as open."""
        if tag in self._VOID_ELEMENTS:
            # `<meta ...>` and `<meta ... />` are the same element: handle
            # both in one place, and never wait for a closing tag.
            self.handle_startendtag(tag, attrs)
            return
        self.__chunks.append(self.get_starttag_text())
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """Copy a closing tag to the output, checking that it is expected.

        Raises:
            ValueError: if no tag is open, or if `tag` doesn't close the
                most recently opened one.
        """
        if tag in self._VOID_ELEMENTS:
            # The element was fully handled when its start tag was seen;
            # a closing tag for it carries nothing and is dropped.
            return
        if not self.__validation_queue:
            raise ValueError(
                "The closing tag </%s> doesn't have a corresponding "
                "opening one." % tag)
        expected = self.__validation_queue.pop()
        if tag != expected:
            raise ValueError(
                "The closing tag </%s> doesn't match the previously "
                "opened <%s>." % (tag, expected))
        # There is no `get_endtag_text()` method :/
        self.__chunks.append('</' + tag + '>\n')

    def handle_data(self, data: str):
        """Copy text to the output, skipping whitespace-only runs."""
        if data.strip():
            self.__chunks.append(data)

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Record `<meta>` tags as metadata; copy other tags to the output."""
        if tag == 'meta':
            meta = dict(attrs)
            # A meta tag without `name`/`content` is still reported (and
            # removed), under placeholder values.
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__chunks.append(self.get_starttag_text())

    def __check_all_closed(self):
        """Raise `ValueError` if some opened tags were never closed."""
        if self.__validation_queue:
            raise ValueError(
                "Some tags (%s) were left unclosed."
                % ', '.join(self.__validation_queue))

    def remove_all(self, output_filename: str) -> bool:
        """Write the document, without its `<meta>` tags, to `output_filename`.

        Returns:
            Always `True`.

        Raises:
            ValueError: if some opened tags were never closed.
        """
        self.__check_all_closed()
        with open(output_filename, 'w') as f:
            f.write(''.join(self.__chunks))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata found in `<meta>` tags, as name -> content.

        The returned dict is the parser's own, not a copy.

        Raises:
            ValueError: if some opened tags were never closed.
        """
        self.__check_all_closed()
        return self.__meta