1
0
Fork 0
mirror of synced 2025-07-03 20:07:28 +02:00

Add support for html files

This commit is contained in:
jvoisin 2019-02-08 00:26:47 +01:00
parent e1dd439fc8
commit 6cc034e81b
4 changed files with 140 additions and 2 deletions

14
tests/data/dirty.html Normal file
View file

@ -0,0 +1,14 @@
<html>
<head>
<meta content="vim" name="generator"/>
<meta content="jvoisin" name="author"/>
</head>
<body>
<p>
<h1>Hello</h1>
I am a web page.
Please <b>love</b> me.
Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
</p>
</body>
</html>

View file

@ -7,7 +7,7 @@ import logging
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video
from libmat2 import harmless, video, html
# No need to log messages: should something go wrong,
# the testsuite _will_ fail.
@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
self.assertFalse(p.remove_all())
os.remove('./tests/data/dirty.zip')
def test_html(self):
    """Feed malformed HTML to the parser and check how each case is handled.

    Four scenarios are covered, each working on a scratch copy that is
    removed afterwards: a mismatched closing tag, a strangely quoted
    <meta> attribute, a lone closing tag and a tag that is never closed.
    """
    dirty = './tests/data/dirty.html'
    scratch = './tests/data/clean.html'
    scratch_cleaned = './tests/data/clean.cleaned.html'

    # An element closed under another name: constructing the parser
    # is expected to raise.
    shutil.copy(dirty, scratch)
    with open(scratch, 'a') as handle:
        handle.write('<open>but not</closed>')
    with self.assertRaises(ValueError):
        html.HTMLParser(scratch)
    os.remove(scratch)

    # Yes, we're able to deal with malformed html :/
    shutil.copy(dirty, scratch)
    with open(scratch, 'a') as handle:
        handle.write('<meta name=\'this" is="weird"/>')
    parser = html.HTMLParser(scratch)
    self.assertTrue(parser.remove_all())
    parser = html.HTMLParser(scratch_cleaned)
    self.assertEqual(parser.get_meta(), {})
    os.remove(scratch)
    os.remove(scratch_cleaned)

    # A closing tag without any opening one: expected to raise at
    # construction time.
    with open(scratch, 'w') as handle:
        handle.write('</close>')
    with self.assertRaises(ValueError):
        html.HTMLParser(scratch)
    os.remove(scratch)

    # A tag that is never closed: construction goes through, but both
    # get_meta() and remove_all() are expected to raise. The parser is
    # built outside of the assertRaises block on purpose, so that only
    # the method call is checked.
    with open(scratch, 'w') as handle:
        handle.write('<notclosed>')
    parser = html.HTMLParser(scratch)
    with self.assertRaises(ValueError):
        parser.get_meta()
    parser = html.HTMLParser(scratch)
    with self.assertRaises(ValueError):
        parser.remove_all()
    os.remove(scratch)

View file

@ -6,7 +6,7 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies, video, archive
from libmat2 import check_dependencies, video, archive, html
class TestCheckDependencies(unittest.TestCase):
@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.gif')
os.remove('./tests/data/clean.cleaned.gif')
os.remove('./tests/data/clean.cleaned.cleaned.gif')
def test_html(self):
    """Clean a copy of dirty.html and check that no metadata is left.

    The cleaned output is then cleaned a second time, which must
    succeed as well.
    """
    shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')

    # The untouched copy still carries its author.
    parser = html.HTMLParser('./tests/data/clean.html')
    self.assertEqual(parser.get_meta()['author'], 'jvoisin')
    self.assertTrue(parser.remove_all())

    # The cleaned file reports no metadata, and can be cleaned again.
    parser = html.HTMLParser('./tests/data/clean.cleaned.html')
    self.assertEqual(parser.get_meta(), {})
    self.assertTrue(parser.remove_all())

    # Remove the scratch copy and both generations of cleaned output.
    for leftover in ('./tests/data/clean.html',
                     './tests/data/clean.cleaned.html',
                     './tests/data/clean.cleaned.cleaned.html'):
        os.remove(leftover)