diff --git a/libmat2/web.py b/libmat2/web.py index 067f5f9..62e7747 100644 --- a/libmat2/web.py +++ b/libmat2/web.py @@ -38,8 +38,8 @@ class CSSParser(abstract.AbstractParser): class AbstractHTMLParser(abstract.AbstractParser): tags_blacklist = set() # type: Set[str] - # In some html/xml based formats some tags are mandatory, - # so we're keeping them, but are discaring their contents + # In some html/xml-based formats some tags are mandatory, + # so we're keeping them, but are discarding their content tags_required_blacklist = set() # type: Set[str] def __init__(self, filename): @@ -72,6 +72,12 @@ class _HTMLParser(parser.HTMLParser): """Python doesn't have a validating html parser in its stdlib, so we're using an internal queue to track all the opening/closing tags, and hoping for the best. + + Moreover, the parser.HTMLParser class doesn't provide a get_endtag_text + method, so we have to use get_starttag_text instead, put its result in a + LIFO, and transform it into a closing tag when needed. + + Also, gotcha: the `tag` parameters are always in lowercase. 
""" def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): super().__init__() @@ -79,6 +85,7 @@ class _HTMLParser(parser.HTMLParser): self.__textrepr = '' self.__meta = {} self.__validation_queue = [] # type: List[str] + # We're using counters instead of booleans, to handle nested tags self.__in_dangerous_but_required_tag = 0 self.__in_dangerous_tag = 0 @@ -93,15 +100,16 @@ class _HTMLParser(parser.HTMLParser): original_tag = self.get_starttag_text() self.__validation_queue.append(original_tag) - if tag in self.tag_required_blacklist: - self.__in_dangerous_but_required_tag += 1 if tag in self.tag_blacklist: self.__in_dangerous_tag += 1 if self.__in_dangerous_tag == 0: - if self.__in_dangerous_but_required_tag <= 1: + if self.__in_dangerous_but_required_tag == 0: self.__textrepr += original_tag + if tag in self.tag_required_blacklist: + self.__in_dangerous_but_required_tag += 1 + def handle_endtag(self, tag: str): if not self.__validation_queue: raise ValueError("The closing tag %s doesn't have a corresponding " @@ -115,14 +123,15 @@ class _HTMLParser(parser.HTMLParser): "tag %s in %s" % (tag, previous_tag, self.filename)) + if tag in self.tag_required_blacklist: + self.__in_dangerous_but_required_tag -= 1 + if self.__in_dangerous_tag == 0: - if self.__in_dangerous_but_required_tag <= 1: + if self.__in_dangerous_but_required_tag == 0: # There is no `get_endtag_text()` method :/ self.__textrepr += '' - if tag in self.tag_required_blacklist: - self.__in_dangerous_but_required_tag -= 1 - elif tag in self.tag_blacklist: + if tag in self.tag_blacklist: self.__in_dangerous_tag -= 1 def handle_data(self, data: str): @@ -138,14 +147,13 @@ class _HTMLParser(parser.HTMLParser): content = meta.get('content', 'harmful data') self.__meta[name] = content - if self.__in_dangerous_tag != 0: - return - elif tag in self.tag_required_blacklist: - self.__textrepr += '<' + tag + ' />' - return - - if self.__in_dangerous_but_required_tag == 0: if 
self.__in_dangerous_tag == 0: + if tag in self.tag_required_blacklist: + self.__textrepr += '<' + tag + ' />' + return + + if self.__in_dangerous_tag == 0: + if self.__in_dangerous_but_required_tag == 0: self.__textrepr += self.get_starttag_text() def remove_all(self, output_filename: str) -> bool: diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index b2cec00..4a16d51 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -269,9 +269,6 @@ class TestCorruptedFiles(unittest.TestCase): os.remove('./tests/data/clean.html') with open('./tests/data/clean.html', 'w') as f: - f.write('') - f.write('<title>pouet') - f.write('<mysupertag/>') f.write('

') p = web.HTMLParser('./tests/data/clean.html') with self.assertRaises(ValueError): @@ -281,6 +278,7 @@ class TestCorruptedFiles(unittest.TestCase): p.remove_all() os.remove('./tests/data/clean.html') + def test_epub(self): with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout: zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf') diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index f4b1890..46e234e 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -633,6 +633,33 @@ class TestCleaning(unittest.TestCase): os.remove('./tests/data/clean.cleaned.html') os.remove('./tests/data/clean.cleaned.cleaned.html') + with open('./tests/data/clean.html', 'w') as f: + f.write('<title><pouet/><meta/>') + p = web.HTMLParser('./tests/data/clean.html') + self.assertTrue(p.remove_all()) + with open('./tests/data/clean.cleaned.html', 'r') as f: + self.assertEqual(f.read(), '') + os.remove('./tests/data/clean.html') + os.remove('./tests/data/clean.cleaned.html') + + with open('./tests/data/clean.html', 'w') as f: + f.write('Some<b>metadata</b><br/>') + p = web.HTMLParser('./tests/data/clean.html') + self.assertTrue(p.remove_all()) + with open('./tests/data/clean.cleaned.html', 'r') as f: + self.assertEqual(f.read(), '') + os.remove('./tests/data/clean.html') + os.remove('./tests/data/clean.cleaned.html') + + with open('./tests/data/clean.html', 'w') as f: + f.write('') + p = web.HTMLParser('./tests/data/clean.html') + self.assertTrue(p.remove_all()) + with open('./tests/data/clean.cleaned.html', 'r') as f: + self.assertEqual(f.read(), '') + os.remove('./tests/data/clean.html') + os.remove('./tests/data/clean.cleaned.html') + def test_epub(self): shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')