Add support for html files
This commit is contained in:
parent
e1dd439fc8
commit
6cc034e81b
4 changed files with 140 additions and 2 deletions
14
tests/data/dirty.html
Normal file
14
tests/data/dirty.html
Normal file
|
@ -0,0 +1,14 @@
|
|||
<html>
|
||||
<head>
|
||||
<meta content="vim" name="generator"/>
|
||||
<meta content="jvoisin" name="author"/>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
<h1>Hello</h1>
|
||||
I am a web page.
|
||||
Please <b>love</b> me.
|
||||
Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
|
@ -7,7 +7,7 @@ import logging
|
|||
import zipfile
|
||||
|
||||
from libmat2 import pdf, images, audio, office, parser_factory, torrent
|
||||
from libmat2 import harmless, video
|
||||
from libmat2 import harmless, video, html
|
||||
|
||||
# No need to log messages; should something go wrong,
|
||||
# the testsuite _will_ fail.
|
||||
|
@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
|
|||
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
|
||||
self.assertFalse(p.remove_all())
|
||||
os.remove('./tests/data/dirty.zip')
|
||||
|
||||
def test_html(self):
    """Exercise HTMLParser against several malformed HTML documents.

    Four scenarios are checked, each using a scratch file that is
    removed before the next scenario starts:

    * mismatched open/close tags make the constructor raise ValueError;
    * a garbled ``<meta>`` attribute is tolerated and the cleaned copy
      reports no metadata;
    * a lone closing tag makes the constructor raise ValueError;
    * a lone unclosed tag is accepted by the constructor, but both
      ``get_meta()`` and ``remove_all()`` raise ValueError.
    """
    pristine = './tests/data/dirty.html'
    scratch = './tests/data/clean.html'
    scrubbed = './tests/data/clean.cleaned.html'

    # Scenario 1: closing tag does not match the opening one.
    shutil.copy(pristine, scratch)
    with open(scratch, 'a') as handle:
        handle.write('<open>but not</closed>')
    self.assertRaises(ValueError, html.HTMLParser, scratch)
    os.remove(scratch)

    # Scenario 2: yes, we're able to deal with malformed html :/
    shutil.copy(pristine, scratch)
    with open(scratch, 'a') as handle:
        handle.write('<meta name=\'this" is="weird"/>')
    parser = html.HTMLParser(scratch)
    self.assertTrue(parser.remove_all())
    parser = html.HTMLParser(scrubbed)
    self.assertEqual(parser.get_meta(), {})
    os.remove(scratch)
    os.remove(scrubbed)

    # Scenario 3: a closing tag with nothing to close.
    with open(scratch, 'w') as handle:
        handle.write('</close>')
    self.assertRaises(ValueError, html.HTMLParser, scratch)
    os.remove(scratch)

    # Scenario 4: an opening tag that is never closed. Construction
    # succeeds here; the failure only shows up when the parser is used.
    with open(scratch, 'w') as handle:
        handle.write('<notclosed>')
    parser = html.HTMLParser(scratch)
    self.assertRaises(ValueError, parser.get_meta)
    # A fresh parser is built so remove_all() is checked independently.
    parser = html.HTMLParser(scratch)
    self.assertRaises(ValueError, parser.remove_all)
    os.remove(scratch)
|
||||
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import os
|
|||
import zipfile
|
||||
|
||||
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
|
||||
from libmat2 import check_dependencies, video, archive
|
||||
from libmat2 import check_dependencies, video, archive, html
|
||||
|
||||
|
||||
class TestCheckDependencies(unittest.TestCase):
|
||||
|
@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
|
|||
os.remove('./tests/data/clean.gif')
|
||||
os.remove('./tests/data/clean.cleaned.gif')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.gif')
|
||||
|
||||
def test_html(self):
    """Check that HTMLParser reads and then strips metadata from HTML.

    The dirty fixture is copied to a scratch file, its ``author``
    metadata is verified, and ``remove_all()`` must succeed. The
    resulting ``clean.cleaned.html`` must expose no metadata and must
    itself be cleanable again. All three files are deleted at the end.
    """
    scratch = './tests/data/clean.html'
    first_pass = './tests/data/clean.cleaned.html'
    second_pass = './tests/data/clean.cleaned.cleaned.html'

    shutil.copy('./tests/data/dirty.html', scratch)
    parser = html.HTMLParser(scratch)

    # The fixture declares its author in a <meta> tag.
    self.assertEqual(parser.get_meta()['author'], 'jvoisin')
    self.assertTrue(parser.remove_all())

    # Cleaning an already-cleaned file must still work.
    parser = html.HTMLParser(first_pass)
    self.assertEqual(parser.get_meta(), {})
    self.assertTrue(parser.remove_all())

    for leftover in (scratch, first_pass, second_pass):
        os.remove(leftover)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue