from html import parser
from typing import Dict, Any, List, Tuple

from . import abstract


class HTMLParser(abstract.AbstractParser):
    """Parser for `text/html` files, delegating the work to `_HTMLParser`.

    The whole file is read and fed to the internal parser when the object
    is constructed, so a stray or mismatched closing tag makes the
    constructor itself raise `ValueError`. Tags that are still open at the
    end of the file are only reported later, by `get_meta()` and
    `remove_all()`.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser()
        with open(filename) as f:
            self.__parser.feed(f.read())
        # Flush whatever the parser is still buffering.
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected from the file's `meta` tags.

        Raises:
            ValueError: if some opening tags were never closed.
        """
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """Write the cleaned document to `self.output_filename`.

        NOTE(review): `output_filename` is not set in this class; it is
        presumably provided by `abstract.AbstractParser` - confirm.

        Returns:
            True once the cleaned file has been written.

        Raises:
            ValueError: if some opening tags were never closed.
        """
        return self.__parser.remove_all(self.output_filename)


class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    While parsing, the document is rebuilt piece by piece: opening tags,
    closing tags, non-blank text and self-closing tags other than `meta`
    are appended to an output buffer, whereas self-closing `meta` tags are
    recorded as metadata and left out of the output. Nothing else is
    appended to the buffer.
    """
    def __init__(self):
        super().__init__()
        # Fragments of the cleaned document, in order; joined only once,
        # in remove_all(), instead of growing a string on every callback.
        self.__textrepr = []
        # Metadata found in `meta` tags: name -> content.
        self.__meta = {}
        # Names of the tags that are currently open, innermost last.
        self.__validation_queue = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Copy the opening tag verbatim and remember that it is open."""
        self.__textrepr.append(self.get_starttag_text())
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """Check that `tag` closes the innermost open tag, then emit it.

        Raises:
            ValueError: if no tag is open, or if the innermost open tag
                is not `tag`.
        """
        if not self.__validation_queue:
            raise ValueError(
                "closing tag </%s> without a matching opening tag" % tag)
        expected = self.__validation_queue.pop()
        if tag != expected:
            raise ValueError(
                "closing tag </%s> does not match <%s>" % (tag, expected))
        # There is no `get_endtag_text()` method :/
        # The closing tag is therefore rebuilt from its name. It has to be
        # written out: otherwise every tag of the cleaned file would still
        # be open, and parsing that file again would be rejected.
        self.__textrepr.append('</' + tag + '>\n')

    def handle_data(self, data: str):
        """Copy text content, dropping whitespace-only runs.

        NOTE(review): the base class is initialised with its default
        arguments; if those include decoding character references
        (`convert_charrefs`), the text arrives here already decoded and is
        written back without being re-escaped - confirm whether that
        matters for the cleaned output.
        """
        if data.strip():
            self.__textrepr.append(data)

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Record self-closing `meta` tags, copy any other one verbatim."""
        if tag == 'meta':
            meta = dict(attrs)
            # Placeholders keep a tag visible in the metadata even when it
            # lacks a `name` or a `content` attribute.
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__textrepr.append(self.get_starttag_text())

    def remove_all(self, output_filename: str) -> bool:
        """Write the rebuilt document, without its `meta` tags, to disk.

        Args:
            output_filename: path of the file to create or overwrite.

        Returns:
            True once the file has been written.

        Raises:
            ValueError: if some opening tags were never closed.
        """
        if self.__validation_queue:
            raise ValueError("some opening tags were never closed")
        with open(output_filename, 'w') as f:
            f.write(''.join(self.__textrepr))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata gathered from self-closing `meta` tags.

        Raises:
            ValueError: if some opening tags were never closed.
        """
        if self.__validation_queue:
            raise ValueError("some opening tags were never closed")
        return self.__meta

+

Hello

+ I am a web page. + Please love me. + Here, have a pretty picture: a pretty picture +

+ + diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index b2e7798..8728cb2 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -7,7 +7,7 @@ import logging import zipfile from libmat2 import pdf, images, audio, office, parser_factory, torrent -from libmat2 import harmless, video +from libmat2 import harmless, video, html # No need to logging messages, should something go wrong, # the testsuite _will_ fail. @@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase): self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') self.assertFalse(p.remove_all()) os.remove('./tests/data/dirty.zip') + + def test_html(self): + shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') + with open('./tests/data/clean.html', 'a') as f: + f.write('but not') + with self.assertRaises(ValueError): + html.HTMLParser('./tests/data/clean.html') + os.remove('./tests/data/clean.html') + + # Yes, we're able to deal with malformed html :/ + shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') + with open('./tests/data/clean.html', 'a') as f: + f.write('') + p = html.HTMLParser('./tests/data/clean.html') + self.assertTrue(p.remove_all()) + p = html.HTMLParser('./tests/data/clean.cleaned.html') + self.assertEqual(p.get_meta(), {}) + os.remove('./tests/data/clean.html') + os.remove('./tests/data/clean.cleaned.html') + + with open('./tests/data/clean.html', 'w') as f: + f.write('') + with self.assertRaises(ValueError): + html.HTMLParser('./tests/data/clean.html') + os.remove('./tests/data/clean.html') + + with open('./tests/data/clean.html', 'w') as f: + f.write('') + p = html.HTMLParser('./tests/data/clean.html') + with self.assertRaises(ValueError): + p.get_meta() + p = html.HTMLParser('./tests/data/clean.html') + with self.assertRaises(ValueError): + p.remove_all() + os.remove('./tests/data/clean.html') + + diff --git a/tests/test_libmat2.py 
b/tests/test_libmat2.py index 548b076..8753e09 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -6,7 +6,7 @@ import os import zipfile from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless -from libmat2 import check_dependencies, video, archive +from libmat2 import check_dependencies, video, archive, html class TestCheckDependencies(unittest.TestCase): @@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase): os.remove('./tests/data/clean.gif') os.remove('./tests/data/clean.cleaned.gif') os.remove('./tests/data/clean.cleaned.cleaned.gif') + + def test_html(self): + shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') + p = html.HTMLParser('./tests/data/clean.html') + + meta = p.get_meta() + self.assertEqual(meta['author'], 'jvoisin') + + ret = p.remove_all() + self.assertTrue(ret) + + p = html.HTMLParser('./tests/data/clean.cleaned.html') + self.assertEqual(p.get_meta(), {}) + self.assertTrue(p.remove_all()) + + os.remove('./tests/data/clean.html') + os.remove('./tests/data/clean.cleaned.html') + os.remove('./tests/data/clean.cleaned.cleaned.html')