Add support for html files

2025-06-14 20:11:57 +02:00 · 2019-02-08 00:26:47 +01:00 · 2019-02-08 00:26:47 +01:00 · 6cc034e81b
commit 6cc034e81b
parent e1dd439fc8
4 changed files with 140 additions and 2 deletions
--- a/libmat2/html.py
+++ b/libmat2/html.py
@ -0,0 +1,69 @@
 from html import escape, parser
 from typing import Dict, Any, List, Tuple
 from . import abstract
 class HTMLParser(abstract.AbstractParser):
    """Metadata handler for HTML documents.

    The whole file is parsed eagerly in the constructor by an internal
    `_HTMLParser`; `get_meta` and `remove_all` simply delegate to it.
    Because parsing happens at construction time, a document whose tags
    are not properly nested makes the constructor raise `ValueError`.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        """Read `filename` and run its whole content through the parser."""
        super().__init__(filename)
        self.__parser = _HTMLParser()
        # NOTE(review): no explicit encoding is given, so the file is
        # decoded with the platform's default text encoding.
        with open(filename) as source:
            markup = source.read()
        self.__parser.feed(markup)
        # Flush whatever the parser is still buffering.
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected while parsing the document."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Write the cleaned document to `self.output_filename`.

        `output_filename` is not defined in this class; presumably it is
        provided by `abstract.AbstractParser`.
        """
        destination = self.output_filename
        return self.__parser.remove_all(destination)
 class _HTMLParser(parser.HTMLParser):
    """Rebuild an HTML document while collecting and dropping `<meta>` tags.

    Python doesn't have a validating html parser in its stdlib, so
    we're using an internal stack to track all the opening/closing tags,
    and hoping for the best.

    Usage: `feed()` the markup, `close()` the parser, then call
    `get_meta()` and/or `remove_all()`. Mismatched or stray closing tags
    raise `ValueError` while feeding; tags left open raise `ValueError`
    from `get_meta()` and `remove_all()`.
    """
    # Elements that never have content nor a closing tag. They are valid
    # without a trailing slash, so they must not wait for an end tag.
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))
    # Elements whose text the parser may hand over verbatim (no character
    # reference decoding); such text must be written back untouched.
    _RAW_TEXT_ELEMENTS = frozenset((
        'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes',
        'plaintext',
    ))

    def __init__(self):
        super().__init__()
        # Pieces of the cleaned document, joined once when writing.
        self.__chunks = []
        self.__meta = {}
        # Stack of the currently open (non-void) tags.
        self.__validation_queue = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Record an opening tag and copy it verbatim to the output."""
        if tag in self._VOID_ELEMENTS:
            # `<meta ...>` and `<meta .../>` are equivalent: route both
            # through the same code path so metadata can't slip through.
            self.handle_startendtag(tag, attrs)
            return
        self.__chunks.append(self.get_starttag_text())
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """Check that `tag` closes the innermost open tag, then emit it.

        Raises:
            ValueError: if no tag is open, or if `tag` doesn't match the
                innermost open one.
        """
        if tag in self._VOID_ELEMENTS:
            # Void elements are never pushed on the stack; a stray
            # closing tag for one of them carries no content.
            return
        if not self.__validation_queue:
            raise ValueError('unexpected closing tag </%s>' % tag)
        expected = self.__validation_queue.pop()
        if tag != expected:
            raise ValueError('closing tag </%s> does not match <%s>'
                             % (tag, expected))
        # There is no `get_endtag_text()` method :/
        self.__chunks.append('</' + tag + '>\n')

    def handle_data(self, data: str):
        """Copy text to the output, skipping whitespace-only runs."""
        if not data.strip():
            return
        stack = self.__validation_queue
        if stack and stack[-1] in self._RAW_TEXT_ELEMENTS:
            self.__chunks.append(data)
        else:
            # The parser decoded the character references, re-encode the
            # markup-significant ones so that text stays text.
            self.__chunks.append(escape(data, quote=False))

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Collect `<meta>` tags as metadata, copy any other tag as is."""
        if tag == 'meta':
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__chunks.append(self.get_starttag_text())

    def remove_all(self, output_filename: str) -> bool:
        """Write the document, without its `<meta>` tags, to `output_filename`.

        Returns:
            True once the file has been written.

        Raises:
            ValueError: if some tags were left open.
        """
        if self.__validation_queue:
            raise ValueError('unclosed tag <%s>' % self.__validation_queue[-1])
        with open(output_filename, 'w') as f:
            f.write(''.join(self.__chunks))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the `name` -> `content` mapping of the `<meta>` tags seen.

        Raises:
            ValueError: if some tags were left open.
        """
        if self.__validation_queue:
            raise ValueError('unclosed tag <%s>' % self.__validation_queue[-1])
        return self.__meta
--- a/tests/data/dirty.html
+++ b/tests/data/dirty.html
@ -0,0 +1,14 @@
 <html>
 	<head>
 		<meta content="vim" name="generator"/>
 		<meta content="jvoisin" name="author"/>
 </head>
 <body>
 	<p>
 		<h1>Hello</h1>
 		I am a web page.
 		Please <b>love</b> me.
 		Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
 	</p>
 </body>
 </html>
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@ -7,7 +7,7 @@ import logging
 import zipfile
 from libmat2 import pdf, images, audio, office, parser_factory, torrent
-from libmat2 import harmless, video
+from libmat2 import harmless, video, html
 # No need to log messages: should something go wrong,
 # the test suite _will_ fail.
@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
        self.assertFalse(p.remove_all())
        os.remove('./tests/data/dirty.zip')
    def test_html(self):
        # Exercise the html parser against documents with broken tag nesting.

        # A closing tag that doesn't match the open one must be rejected
        # as soon as the file is parsed, i.e. by the constructor.
        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
        with open('./tests/data/clean.html', 'a') as f:
            f.write('<open>but not</closed>')
        with self.assertRaises(ValueError):
            html.HTMLParser('./tests/data/clean.html')
        os.remove('./tests/data/clean.html')
        # Yes, we're able to deal with malformed html :/
        # A `<meta>` tag with broken quoting must neither break the
        # cleaning nor leave metadata in the cleaned file.
        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
        with open('./tests/data/clean.html', 'a') as f:
            f.write('<meta name=\'this" is="weird"/>')
        p = html.HTMLParser('./tests/data/clean.html')
        self.assertTrue(p.remove_all())
        p = html.HTMLParser('./tests/data/clean.cleaned.html')
        self.assertEqual(p.get_meta(), {})
        os.remove('./tests/data/clean.html')
        os.remove('./tests/data/clean.cleaned.html')
        # A closing tag without any opening one is rejected at parse time.
        with open('./tests/data/clean.html', 'w') as f:
            f.write('</close>')
        with self.assertRaises(ValueError):
            html.HTMLParser('./tests/data/clean.html')
        os.remove('./tests/data/clean.html')
        # A tag that is never closed parses fine, but both reading and
        # removing the metadata must then fail.
        with open('./tests/data/clean.html', 'w') as f:
            f.write('<notclosed>')
        p = html.HTMLParser('./tests/data/clean.html')
        with self.assertRaises(ValueError):
            p.get_meta()
        p = html.HTMLParser('./tests/data/clean.html')
        with self.assertRaises(ValueError):
            p.remove_all()
        os.remove('./tests/data/clean.html')
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@ -6,7 +6,7 @@ import os
 import zipfile
 from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
-from libmat2 import check_dependencies, video, archive
+from libmat2 import check_dependencies, video, archive, html
 class TestCheckDependencies(unittest.TestCase):
@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
        os.remove('./tests/data/clean.gif')
        os.remove('./tests/data/clean.cleaned.gif')
        os.remove('./tests/data/clean.cleaned.cleaned.gif')
    def test_html(self):
        # Clean the html fixture, then check that the result is free of
        # metadata and can itself be cleaned again.
        shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
        p = html.HTMLParser('./tests/data/clean.html')
        # The fixture declares its author in a `<meta>` tag.
        meta = p.get_meta()
        self.assertEqual(meta['author'], 'jvoisin')
        ret = p.remove_all()
        self.assertTrue(ret)
        # The cleaned copy must not expose any metadata anymore.
        p = html.HTMLParser('./tests/data/clean.cleaned.html')
        self.assertEqual(p.get_meta(), {})
        # Cleaning an already clean file must succeed too.
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.html')
        os.remove('./tests/data/clean.cleaned.html')
        os.remove('./tests/data/clean.cleaned.cleaned.html')
        os.remove('./tests/data/clean.cleaned.cleaned.html')