Improve the previous commit

- More tests
- More documentation
- Minor code cleanup
parent 73d2966e8c
commit 55214206b5
@@ -38,8 +38,8 @@ class CSSParser(abstract.AbstractParser):
 class AbstractHTMLParser(abstract.AbstractParser):
     tags_blacklist = set()  # type: Set[str]
-    # In some html/xml based formats some tags are mandatory,
-    # so we're keeping them, but are discaring their contents
+    # In some html/xml-based formats some tags are mandatory,
+    # so we're keeping them, but are discaring their content
     tags_required_blacklist = set()  # type: Set[str]
 
     def __init__(self, filename):
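Note on the hunk above: the two sets drive different behaviours. Tags in tags_blacklist are removed together with their content, while tags in tags_required_blacklist are kept but emptied. The concrete values aren't part of this diff, but judging from the new tests further down, the web.HTMLParser subclass plausibly wires them like this:

    # Hypothetical sketch inferred from the tests below, not code from
    # this commit: 'meta' vanishes entirely, 'title' survives but empty.
    class HTMLParser(AbstractHTMLParser):
        tags_blacklist = {'meta'}
        tags_required_blacklist = {'title'}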
@@ -72,6 +72,12 @@ class _HTMLParser(parser.HTMLParser):
     """Python doesn't have a validating html parser in its stdlib, so
     we're using an internal queue to track all the opening/closing tags,
     and hoping for the best.
+
+    Moreover, the parser.HTMLParser call doesn't provide a get_endtag_text
+    method, so we have to use get_starttag_text instead, put its result in a
+    LIFO, and transform it in a closing tag when needed.
+
+    Also, gotcha: the `tag` parameters are always in lowercase.
     """
     def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
         super().__init__()
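Note on the docstring added above: the stdlib parser.HTMLParser really does offer get_starttag_text() and nothing symmetric for end tags, so the closing tag has to be rebuilt from whatever was pushed. A minimal standalone sketch of that LIFO validation trick (illustrative only, not mat2 code):

    from html import parser

    class ValidatingParser(parser.HTMLParser):
        """Toy version of the queue-based validation described above."""
        def __init__(self):
            super().__init__()
            self.queue = []  # LIFO of verbatim start tags

        def handle_starttag(self, tag, attrs):
            self.queue.append(self.get_starttag_text())

        def handle_endtag(self, tag):
            if not self.queue:
                raise ValueError("closing tag %s has no opening tag" % tag)
            previous = self.queue.pop()
            # Rebuild the name from e.g. '<p class="x">' -> 'p'; the `tag`
            # parameter is always lowercase, hence the .lower().
            name = previous[1:-1].split()[0].strip('/')
            if name.lower() != tag:
                raise ValueError("%s closed by %s" % (previous, tag))

        def close(self):
            super().close()
            if self.queue:
                raise ValueError("%d unclosed tag(s) left" % len(self.queue))

    p = ValidatingParser()
    p.feed('<html><body><p>hello</p></body></html>')
    p.close()  # balanced document, so no exception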
@@ -79,6 +85,7 @@ class _HTMLParser(parser.HTMLParser):
         self.__textrepr = ''
         self.__meta = {}
         self.__validation_queue = []  # type: List[str]
+
         # We're using counters instead of booleans, to handle nested tags
         self.__in_dangerous_but_required_tag = 0
         self.__in_dangerous_tag = 0
@@ -93,15 +100,16 @@ class _HTMLParser(parser.HTMLParser):
         original_tag = self.get_starttag_text()
         self.__validation_queue.append(original_tag)
 
-        if tag in self.tag_required_blacklist:
-            self.__in_dangerous_but_required_tag += 1
         if tag in self.tag_blacklist:
             self.__in_dangerous_tag += 1
 
         if self.__in_dangerous_tag == 0:
-            if self.__in_dangerous_but_required_tag <= 1:
+            if self.__in_dangerous_but_required_tag == 0:
                 self.__textrepr += original_tag
 
+        if tag in self.tag_required_blacklist:
+            self.__in_dangerous_but_required_tag += 1
+
     def handle_endtag(self, tag: str):
         if not self.__validation_queue:
             raise ValueError("The closing tag %s doesn't have a corresponding "
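Note on the reordering above: __in_dangerous_tag is now incremented *before* the emission check (so a blacklisted opening tag is never emitted), while __in_dangerous_but_required_tag is incremented *after* it (so a required-but-blacklisted opening tag still is). The "counters instead of booleans" comment is also easy to motivate: a boolean would be reset by the first nested closing tag. A throwaway illustration, padding the <meta><meta/></meta> input from the new cleaning test with hypothetical text:

    # Illustrative only: `depth` plays the role of __in_dangerous_tag.
    # html.parser reports <meta/> as a startendtag, i.e. an open + a close.
    events = [('open', '<meta>'), ('data', 'secret'),
              ('open', '<meta/>'), ('close', 'meta'),
              ('data', 'leaked?'), ('close', 'meta')]
    depth, kept = 0, []
    for kind, token in events:
        if kind == 'open':
            depth += 1
        elif kind == 'close':
            depth -= 1
        elif depth == 0:  # only keep data outside dangerous tags
            kept.append(token)
    assert kept == []  # a single boolean flag would have kept 'leaked?'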
@@ -115,14 +123,15 @@ class _HTMLParser(parser.HTMLParser):
                              "tag %s in %s" %
                              (tag, previous_tag, self.filename))
 
+        if tag in self.tag_required_blacklist:
+            self.__in_dangerous_but_required_tag -= 1
+
         if self.__in_dangerous_tag == 0:
-            if self.__in_dangerous_but_required_tag <= 1:
+            if self.__in_dangerous_but_required_tag == 0:
                 # There is no `get_endtag_text()` method :/
                 self.__textrepr += '</' + previous_tag + '>'
 
-        if tag in self.tag_required_blacklist:
-            self.__in_dangerous_but_required_tag -= 1
-        elif tag in self.tag_blacklist:
+        if tag in self.tag_blacklist:
             self.__in_dangerous_tag -= 1
 
     def handle_data(self, data: str):
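Note: this hunk mirrors handle_starttag with the counter updates in the opposite order, and that symmetry is the whole fix: the required counter now moves *before* the emission check here, so a required-but-blacklisted element keeps its own opening and closing markers while everything in between is dropped. A hand trace, assuming tags_required_blacklist contains 'title' as the new tests suggest:

    # Input: <test><title>Some<b>metadata</b><br/></title></test>
    #
    # <test>    required == 0          -> emit '<test>'
    # <title>   required == 0          -> emit '<title>', then required += 1
    # 'Some', <b>, </b>, <br/>         required == 1 -> dropped
    # </title>  required -= 1 (now 0)  -> emit '</title>'
    # </test>   required == 0          -> emit '</test>'
    #
    # Result: '<test><title></title></test>', exactly what the new
    # TestCleaning assertion below expects.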
@@ -138,14 +147,13 @@ class _HTMLParser(parser.HTMLParser):
             content = meta.get('content', 'harmful data')
             self.__meta[name] = content
 
-        if self.__in_dangerous_tag != 0:
-            return
-        elif tag in self.tag_required_blacklist:
-            self.__textrepr += '<' + tag + ' />'
-            return
+        if self.__in_dangerous_tag == 0:
+            if tag in self.tag_required_blacklist:
+                self.__textrepr += '<' + tag + ' />'
+                return
 
-        if self.__in_dangerous_but_required_tag == 0:
-            if self.__in_dangerous_tag == 0:
+        if self.__in_dangerous_tag == 0:
+            if self.__in_dangerous_but_required_tag == 0:
                 self.__textrepr += self.get_starttag_text()
 
     def remove_all(self, output_filename: str) -> bool:
@@ -269,9 +269,6 @@ class TestCorruptedFiles(unittest.TestCase):
         os.remove('./tests/data/clean.html')
 
         with open('./tests/data/clean.html', 'w') as f:
-            f.write('<meta><meta/></meta>')
-            f.write('<title><title>pouet</title></title>')
-            f.write('<title><mysupertag/></title>')
             f.write('<doctitle><br/></doctitle><br/><notclosed>')
         p = web.HTMLParser('./tests/data/clean.html')
         with self.assertRaises(ValueError):
@@ -281,6 +278,7 @@ class TestCorruptedFiles(unittest.TestCase):
             p.remove_all()
         os.remove('./tests/data/clean.html')
 
+
     def test_epub(self):
         with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
             zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
@@ -633,6 +633,33 @@ class TestCleaning(unittest.TestCase):
         os.remove('./tests/data/clean.cleaned.html')
         os.remove('./tests/data/clean.cleaned.cleaned.html')
 
+        with open('./tests/data/clean.html', 'w') as f:
+            f.write('<title><title><pouet/><meta/></title></title><test/>')
+        p = web.HTMLParser('./tests/data/clean.html')
+        self.assertTrue(p.remove_all())
+        with open('./tests/data/clean.cleaned.html', 'r') as f:
+            self.assertEqual(f.read(), '<title></title><test/>')
+        os.remove('./tests/data/clean.html')
+        os.remove('./tests/data/clean.cleaned.html')
+
+        with open('./tests/data/clean.html', 'w') as f:
+            f.write('<test><title>Some<b>metadata</b><br/></title></test>')
+        p = web.HTMLParser('./tests/data/clean.html')
+        self.assertTrue(p.remove_all())
+        with open('./tests/data/clean.cleaned.html', 'r') as f:
+            self.assertEqual(f.read(), '<test><title></title></test>')
+        os.remove('./tests/data/clean.html')
+        os.remove('./tests/data/clean.cleaned.html')
+
+        with open('./tests/data/clean.html', 'w') as f:
+            f.write('<meta><meta/></meta>')
+        p = web.HTMLParser('./tests/data/clean.html')
+        self.assertTrue(p.remove_all())
+        with open('./tests/data/clean.cleaned.html', 'r') as f:
+            self.assertEqual(f.read(), '')
+        os.remove('./tests/data/clean.html')
+        os.remove('./tests/data/clean.cleaned.html')
+
 
     def test_epub(self):
         shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
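For the record, a condensed, runnable version of what the new TestCleaning block exercises, to be run from the repository root; the `from libmat2 import web` import path is an assumption, since the diff only ever shows web.HTMLParser:

    from libmat2 import web  # assumed import path, not shown in this diff

    with open('./tests/data/clean.html', 'w') as f:
        f.write('<title><title><pouet/><meta/></title></title><test/>')

    p = web.HTMLParser('./tests/data/clean.html')
    assert p.remove_all()  # writes ./tests/data/clean.cleaned.html

    with open('./tests/data/clean.cleaned.html') as f:
        assert f.read() == '<title></title><test/>'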