Add support for HTML files
This commit is contained in:
parent
e1dd439fc8
commit
6cc034e81b
69
libmat2/html.py
Normal file
69
libmat2/html.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
from html import parser
|
||||||
|
from typing import Dict, Any, List, Tuple
|
||||||
|
|
||||||
|
from . import abstract
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLParser(abstract.AbstractParser):
    """Parser for html files: collects metadata from ``<meta/>`` tags,
    and can write back a metadata-free copy of the document.

    Parsing is delegated to _HTMLParser, a best-effort validating
    subclass of the stdlib html.parser.HTMLParser.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        """Parse `filename` eagerly.

        Raises ValueError (from _HTMLParser) if the document's tags are
        unbalanced or mismatched.
        """
        super().__init__(filename)
        self.__parser = _HTMLParser()
        # Decode explicitly as utf-8 instead of relying on the
        # locale-dependent default of open(); html files with another
        # declared charset would need sniffing — TODO confirm acceptable.
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata gathered from the file's <meta/> tags."""
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """Write a metadata-free copy of the file to self.output_filename."""
        return self.__parser.remove_all(self.output_filename)
|
class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.
    """
    def __init__(self):
        super().__init__()
        # Cleaned-up textual representation of the document, rebuilt tag
        # by tag as the handlers below fire.
        self.__textrepr = ''
        # Metadata harvested from <meta/> tags: {name: content}.
        self.__meta = {}
        # Stack of currently-open tags; must be empty once parsing is done.
        self.__validation_queue = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Keep the opening tag verbatim and push it on the validation stack."""
        self.__textrepr += self.get_starttag_text()
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """Keep the closing tag, checking it matches the innermost open one.

        Raises ValueError on a stray or mismatched closing tag.
        """
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one." % tag)
        elif tag != self.__validation_queue.pop():
            raise ValueError("The closing tag %s doesn't match the last "
                             "opened one." % tag)
        # There is no `get_endtag_text()` method :/
        self.__textrepr += '</' + tag + '>\n'

    def handle_data(self, data: str):
        # Whitespace-only runs are layout noise; drop them.
        if data.strip():
            self.__textrepr += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Strip self-closing <meta/> tags, collecting their metadata;
        keep every other self-closing tag verbatim.
        """
        if tag == 'meta':
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__textrepr += self.get_starttag_text()

    def handle_decl(self, decl: str):
        # Declarations (e.g. `<!DOCTYPE html>`) carry no metadata;
        # dropping them would corrupt the cleaned document, so keep them.
        self.__textrepr += '<!' + decl + '>\n'

    def remove_all(self, output_filename: str) -> bool:
        """Write the metadata-free document to `output_filename`.

        Raises ValueError if some tags were left unclosed.
        """
        if self.__validation_queue:
            raise ValueError('Some tags were left unclosed: %s'
                             % ', '.join(self.__validation_queue))
        with open(output_filename, 'w') as f:
            f.write(self.__textrepr)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected during parsing.

        Raises ValueError if some tags were left unclosed.
        """
        if self.__validation_queue:
            raise ValueError('Some tags were left unclosed: %s'
                             % ', '.join(self.__validation_queue))
        return self.__meta
|
14
tests/data/dirty.html
Normal file
14
tests/data/dirty.html
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta content="vim" name="generator"/>
|
||||||
|
<meta content="jvoisin" name="author"/>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>
|
||||||
|
<h1>Hello</h1>
|
||||||
|
I am a web page.
|
||||||
|
Please <b>love</b> me.
|
||||||
|
Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
|
||||||
|
</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
@ -7,7 +7,7 @@ import logging
|
|||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
from libmat2 import pdf, images, audio, office, parser_factory, torrent
|
from libmat2 import pdf, images, audio, office, parser_factory, torrent
|
||||||
from libmat2 import harmless, video
|
from libmat2 import harmless, video, html
|
||||||
|
|
||||||
# No need to logging messages, should something go wrong,
|
# No need to logging messages, should something go wrong,
|
||||||
# the testsuite _will_ fail.
|
# the testsuite _will_ fail.
|
||||||
@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
|
|||||||
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
|
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
|
||||||
self.assertFalse(p.remove_all())
|
self.assertFalse(p.remove_all())
|
||||||
os.remove('./tests/data/dirty.zip')
|
os.remove('./tests/data/dirty.zip')
|
||||||
|
|
||||||
|
def test_html(self):
    target = './tests/data/clean.html'

    # A closing tag that doesn't match the last opened one is rejected
    # at construction time.
    shutil.copy('./tests/data/dirty.html', target)
    with open(target, 'a') as f:
        f.write('<open>but not</closed>')
    with self.assertRaises(ValueError):
        html.HTMLParser(target)
    os.remove(target)

    # Yes, we're able to deal with malformed html :/
    shutil.copy('./tests/data/dirty.html', target)
    with open(target, 'a') as f:
        f.write('<meta name=\'this" is="weird"/>')
    p = html.HTMLParser(target)
    self.assertTrue(p.remove_all())
    p = html.HTMLParser('./tests/data/clean.cleaned.html')
    self.assertEqual(p.get_meta(), {})
    os.remove(target)
    os.remove('./tests/data/clean.cleaned.html')

    # A stray closing tag is rejected at construction time too.
    with open(target, 'w') as f:
        f.write('</close>')
    with self.assertRaises(ValueError):
        html.HTMLParser(target)
    os.remove(target)

    # An unclosed tag, however, is only detected when the parse
    # result is actually used.
    with open(target, 'w') as f:
        f.write('<notclosed>')
    p = html.HTMLParser(target)
    with self.assertRaises(ValueError):
        p.get_meta()
    p = html.HTMLParser(target)
    with self.assertRaises(ValueError):
        p.remove_all()
    os.remove(target)
|
||||||
|
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import os
|
|||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
|
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
|
||||||
from libmat2 import check_dependencies, video, archive
|
from libmat2 import check_dependencies, video, archive, html
|
||||||
|
|
||||||
|
|
||||||
class TestCheckDependencies(unittest.TestCase):
|
class TestCheckDependencies(unittest.TestCase):
|
||||||
@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
|
|||||||
os.remove('./tests/data/clean.gif')
|
os.remove('./tests/data/clean.gif')
|
||||||
os.remove('./tests/data/clean.cleaned.gif')
|
os.remove('./tests/data/clean.cleaned.gif')
|
||||||
os.remove('./tests/data/clean.cleaned.cleaned.gif')
|
os.remove('./tests/data/clean.cleaned.cleaned.gif')
|
||||||
|
|
||||||
|
def test_html(self):
    # Work on a throwaway copy of the dirty fixture.
    shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
    p = html.HTMLParser('./tests/data/clean.html')

    # The dirty fixture advertises its author...
    self.assertEqual(p.get_meta()['author'], 'jvoisin')

    # ...which cleaning should strip.
    self.assertTrue(p.remove_all())

    # The cleaned copy has no metadata left, and cleaning it
    # a second time still succeeds.
    cleaned = html.HTMLParser('./tests/data/clean.cleaned.html')
    self.assertEqual(cleaned.get_meta(), {})
    self.assertTrue(cleaned.remove_all())

    for leftover in ('./tests/data/clean.html',
                     './tests/data/clean.cleaned.html',
                     './tests/data/clean.cleaned.cleaned.html'):
        os.remove(leftover)
|
||||||
|
Loading…
Reference in New Issue
Block a user