From 6cc034e81bd0cea98dffe4d7311f3bd16178b63e Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Fri, 8 Feb 2019 00:26:47 +0100
Subject: [PATCH] Add support for html files
---
libmat2/html.py | 69 +++++++++++++++++++++++++++++++++++
tests/data/dirty.html | 14 +++++++
tests/test_corrupted_files.py | 39 +++++++++++++++++++-
tests/test_libmat2.py | 20 +++++++++-
4 files changed, 140 insertions(+), 2 deletions(-)
create mode 100644 libmat2/html.py
create mode 100644 tests/data/dirty.html
diff --git a/libmat2/html.py b/libmat2/html.py
new file mode 100644
index 0000000..d0e9a2b
--- /dev/null
+++ b/libmat2/html.py
@@ -0,0 +1,69 @@
+from html import parser
+from typing import Dict, Any, List, Tuple
+
+from . import abstract
+
+
class HTMLParser(abstract.AbstractParser):
    """mat2 parser for html files.

    All the actual parsing is delegated to `_HTMLParser`; this class only
    adapts it to the `abstract.AbstractParser` interface.
    """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__handler = _HTMLParser()
        with open(filename) as f:
            content = f.read()
        # Feeding the whole document at once; a malformed document makes
        # the underlying parser raise ValueError here.
        self.__handler.feed(content)
        self.__handler.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected from the document's meta tags."""
        return self.__handler.get_meta()

    def remove_all(self) -> bool:
        """Write a metadata-free copy of the document to `self.output_filename`."""
        return self.__handler.remove_all(self.output_filename)
+
+
class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    Raises ValueError as soon as a closing tag doesn't match the most
    recently opened one, or (from `get_meta`/`remove_all`) when tags are
    left unclosed at the end of the document.
    """
    def __init__(self):
        super().__init__()
        self.__textrepr = ''  # metadata-free reconstruction of the document
        self.__meta = {}      # name -> content, harvested from <meta/> tags
        self.__validation_queue = []  # stack of currently-open tags

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        self.__textrepr += self.get_starttag_text()
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        if not self.__validation_queue:
            raise ValueError("The closing tag %s doesn't have a corresponding "
                             "opening one." % tag)
        previous_tag = self.__validation_queue.pop()
        if tag != previous_tag:
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s." % (tag, previous_tag))
        # There is no `get_endtag_text()` method :/
        self.__textrepr += '</' + tag + '>\n'

    def handle_data(self, data: str):
        # Keep only non-whitespace text nodes.
        if data.strip():
            self.__textrepr += data

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        if tag == 'meta':
            # Self-closing meta tags carry the metadata; record them and
            # deliberately leave them out of the cleaned text representation.
            meta = {k: v for k, v in attrs}
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__textrepr += self.get_starttag_text()

    def remove_all(self, output_filename: str) -> bool:
        """Write the meta-free document to `output_filename`.

        Raises ValueError if the document left tags unclosed.
        """
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed."
                             % ', '.join(self.__validation_queue))
        with open(output_filename, 'w') as f:
            f.write(self.__textrepr)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the collected metadata.

        Raises ValueError if the document left tags unclosed.
        """
        if self.__validation_queue:
            raise ValueError("Some tags (%s) were left unclosed."
                             % ', '.join(self.__validation_queue))
        return self.__meta
diff --git a/tests/data/dirty.html b/tests/data/dirty.html
new file mode 100644
index 0000000..1aa1723
--- /dev/null
+++ b/tests/data/dirty.html
@@ -0,0 +1,14 @@
+<html>
+ <head>
+  <meta content="vim" name="generator"/>
+  <meta content="jvoisin" name="author"/>
+ </head>
+ <body>
+  <p>
+   <h1>Hello</h1>
+   I am a web page.
+   Please <b>love</b> me.
+   Here, have a pretty picture: <img src='dirty.jpg'/>
+  </p>
+ </body>
+</html>
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index b2e7798..8728cb2 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -7,7 +7,7 @@ import logging
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
-from libmat2 import harmless, video
+from libmat2 import harmless, video, html
# No need to logging messages, should something go wrong,
# the testsuite _will_ fail.
@@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
self.assertFalse(p.remove_all())
os.remove('./tests/data/dirty.zip')
+
+ def test_html(self):
+ shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
+ with open('./tests/data/clean.html', 'a') as f:
+ f.write('but not')
+ with self.assertRaises(ValueError):
+ html.HTMLParser('./tests/data/clean.html')
+ os.remove('./tests/data/clean.html')
+
+ # Yes, we're able to deal with malformed html :/
+ shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
+ with open('./tests/data/clean.html', 'a') as f:
+ f.write('')
+ p = html.HTMLParser('./tests/data/clean.html')
+ self.assertTrue(p.remove_all())
+ p = html.HTMLParser('./tests/data/clean.cleaned.html')
+ self.assertEqual(p.get_meta(), {})
+ os.remove('./tests/data/clean.html')
+ os.remove('./tests/data/clean.cleaned.html')
+
+ with open('./tests/data/clean.html', 'w') as f:
+ f.write('')
+ with self.assertRaises(ValueError):
+ html.HTMLParser('./tests/data/clean.html')
+ os.remove('./tests/data/clean.html')
+
+ with open('./tests/data/clean.html', 'w') as f:
+ f.write('')
+ p = html.HTMLParser('./tests/data/clean.html')
+ with self.assertRaises(ValueError):
+ p.get_meta()
+ p = html.HTMLParser('./tests/data/clean.html')
+ with self.assertRaises(ValueError):
+ p.remove_all()
+ os.remove('./tests/data/clean.html')
+
+
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 548b076..8753e09 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -6,7 +6,7 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
-from libmat2 import check_dependencies, video, archive
+from libmat2 import check_dependencies, video, archive, html
class TestCheckDependencies(unittest.TestCase):
@@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.gif')
os.remove('./tests/data/clean.cleaned.gif')
os.remove('./tests/data/clean.cleaned.cleaned.gif')
+
+ def test_html(self):
+ shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
+ p = html.HTMLParser('./tests/data/clean.html')
+
+ meta = p.get_meta()
+ self.assertEqual(meta['author'], 'jvoisin')
+
+ ret = p.remove_all()
+ self.assertTrue(ret)
+
+ p = html.HTMLParser('./tests/data/clean.cleaned.html')
+ self.assertEqual(p.get_meta(), {})
+ self.assertTrue(p.remove_all())
+
+ os.remove('./tests/data/clean.html')
+ os.remove('./tests/data/clean.cleaned.html')
+ os.remove('./tests/data/clean.cleaned.cleaned.html')