From 73d2966e8c10eb6c083a2abacc53f3297d16376e Mon Sep 17 00:00:00 2001 From: jvoisin Date: Wed, 27 Feb 2019 23:04:38 +0100 Subject: [PATCH] Improve epub support --- libmat2/epub.py | 46 +++++++++++++++--- libmat2/web.py | 87 ++++++++++++++++++++++++++--------- tests/test_corrupted_files.py | 7 ++- tests/test_libmat2.py | 6 ++- 4 files changed, 114 insertions(+), 32 deletions(-) diff --git a/libmat2/epub.py b/libmat2/epub.py index 09b7937..d385465 100644 --- a/libmat2/epub.py +++ b/libmat2/epub.py @@ -1,11 +1,13 @@ import logging import re +import uuid import xml.etree.ElementTree as ET # type: ignore from . import archive, office class EPUBParser(archive.ArchiveBasedAbstractParser): mimetypes = {'application/epub+zip', } + metadata_namespace = '{http://purl.org/dc/elements/1.1/}' def __init__(self, filename): super().__init__(filename) @@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser): 'mimetype', 'OEBPS/content.opf', })) + self.uniqid = uuid.uuid4() def _specific_get_meta(self, full_path, file_path): if file_path != 'OEBPS/content.opf': @@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser): f.read(), re.I|re.M) return {k:v for (k, v) in results} except (TypeError, UnicodeDecodeError): - # We didn't manage to parse the xml file return {file_path: 'harmful content', } def _specific_cleanup(self, full_path: str): - if not full_path.endswith('OEBPS/content.opf'): - return True + if full_path.endswith('OEBPS/content.opf'): + return self.__handle_contentopf(full_path) + elif full_path.endswith('OEBPS/toc.ncx'): + return self.__handle_tocncx(full_path) + return True + def __handle_tocncx(self, full_path: str): + try: + tree, namespace = office._parse_xml(full_path) + except ET.ParseError: # pragma: nocover + logging.error("Unable to parse %s in %s.", full_path, self.filename) + return False + + for item in tree.iterfind('.//', namespace): # pragma: nocover + if item.tag.strip().lower().endswith('head'): + item.clear() + 
ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''}) + break + tree.write(full_path, xml_declaration=True, encoding='utf-8', + short_empty_elements=False) + return True + + def __handle_contentopf(self, full_path: str): try: tree, namespace = office._parse_xml(full_path) except ET.ParseError: logging.error("Unable to parse %s in %s.", full_path, self.filename) return False - parent_map = {c:p for p in tree.iter() for c in p} - for item in tree.iterfind('.//', namespace): + for item in tree.iterfind('.//', namespace): # pragma: nocover if item.tag.strip().lower().endswith('metadata'): - parent_map[item].remove(item) + item.clear() + + # item with mandatory content + uniqid = ET.Element(self.metadata_namespace + 'identifier') + uniqid.text = str(self.uniqid) + uniqid.set('id', 'id') + item.append(uniqid) + + # items without mandatory content + for name in {'language', 'title'}: + uniqid = ET.Element(self.metadata_namespace + name) + item.append(uniqid) break # there is only a single block - tree.write(full_path, xml_declaration=True) + tree.write(full_path, xml_declaration=True, encoding='utf-8') return True diff --git a/libmat2/web.py b/libmat2/web.py index c11b47d..067f5f9 100644 --- a/libmat2/web.py +++ b/libmat2/web.py @@ -1,10 +1,13 @@ -from html import parser -from typing import Dict, Any, List, Tuple +from html import parser, escape +from typing import Dict, Any, List, Tuple, Set import re import string from . 
import abstract +assert Set + +# pylint: disable=too-many-instance-attributes class CSSParser(abstract.AbstractParser): """There is no such things as metadata in CSS files, @@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser): return metadata -class HTMLParser(abstract.AbstractParser): - mimetypes = {'text/html', 'application/x-dtbncx+xml', } +class AbstractHTMLParser(abstract.AbstractParser): + tags_blacklist = set() # type: Set[str] + # In some html/xml based formats some tags are mandatory, + # so we're keeping them, but are discarding their contents + tags_required_blacklist = set() # type: Set[str] + def __init__(self, filename): super().__init__(filename) - self.__parser = _HTMLParser(self.filename) + self.__parser = _HTMLParser(self.filename, self.tags_blacklist, + self.tags_required_blacklist) with open(filename, encoding='utf-8') as f: self.__parser.feed(f.read()) self.__parser.close() @@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser): return self.__parser.remove_all(self.output_filename) +class HTMLParser(AbstractHTMLParser): + mimetypes = {'text/html', } + tags_blacklist = {'meta', } + tags_required_blacklist = {'title', } + + +class DTBNCXParser(AbstractHTMLParser): + mimetypes = {'application/x-dtbncx+xml', } + tags_required_blacklist = {'title', 'doctitle', 'meta'} + + class _HTMLParser(parser.HTMLParser): """Python doesn't have a validating html parser in its stdlib, so we're using an internal queue to track all the opening/closing tags, and hoping for the best. 
""" - tag_blacklist = {'doctitle', 'meta', 'title'} # everything is lowercase - def __init__(self, filename): + def __init__(self, filename, blacklisted_tags, required_blacklisted_tags): super().__init__() self.filename = filename self.__textrepr = '' self.__meta = {} - self.__validation_queue = [] - # We're using a counter instead of a boolean to handle nested tags + self.__validation_queue = [] # type: List[str] + # We're using counters instead of booleans, to handle nested tags + self.__in_dangerous_but_required_tag = 0 self.__in_dangerous_tag = 0 + if required_blacklisted_tags & blacklisted_tags: # pragma: nocover + raise ValueError("There is an overlap between %s and %s" % ( + required_blacklisted_tags, blacklisted_tags)) + self.tag_required_blacklist = required_blacklisted_tags + self.tag_blacklist = blacklisted_tags + def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): - self.__validation_queue.append(tag) + original_tag = self.get_starttag_text() + self.__validation_queue.append(original_tag) + + if tag in self.tag_required_blacklist: + self.__in_dangerous_but_required_tag += 1 if tag in self.tag_blacklist: self.__in_dangerous_tag += 1 - return if self.__in_dangerous_tag == 0: - self.__textrepr += self.get_starttag_text() + if self.__in_dangerous_but_required_tag <= 1: + self.__textrepr += original_tag def handle_endtag(self, tag: str): if not self.__validation_queue: @@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser): "opening one in %s." 
% (tag, self.filename)) previous_tag = self.__validation_queue.pop() - if tag != previous_tag: + previous_tag = previous_tag[1:-1] # remove < and > + previous_tag = previous_tag.split(' ')[0] # remove attributes + if tag != previous_tag.lower(): raise ValueError("The closing tag %s doesn't match the previous " "tag %s in %s" % (tag, previous_tag, self.filename)) - elif tag in self.tag_blacklist: - self.__in_dangerous_tag -= 1 - return if self.__in_dangerous_tag == 0: - # There is no `get_endtag_text()` method :/ - self.__textrepr += '\n' + if self.__in_dangerous_but_required_tag <= 1: + # There is no `get_endtag_text()` method :/ + self.__textrepr += '' + + if tag in self.tag_required_blacklist: + self.__in_dangerous_but_required_tag -= 1 + elif tag in self.tag_blacklist: + self.__in_dangerous_tag -= 1 def handle_data(self, data: str): - if self.__in_dangerous_tag == 0 and data.strip(): - self.__textrepr += data + if self.__in_dangerous_but_required_tag == 0: + if self.__in_dangerous_tag == 0: + if data.strip(): + self.__textrepr += escape(data) def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): - if tag in self.tag_blacklist: + if tag in self.tag_required_blacklist | self.tag_blacklist: meta = {k:v for k, v in attrs} name = meta.get('name', 'harmful metadata') content = meta.get('content', 'harmful data') self.__meta[name] = content - else: + + if self.__in_dangerous_tag != 0: + return + elif tag in self.tag_required_blacklist: + self.__textrepr += '<' + tag + ' />' + return + + if self.__in_dangerous_but_required_tag == 0: if self.__in_dangerous_tag == 0: self.__textrepr += self.get_starttag_text() diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 53c856a..b2cec00 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -253,13 +253,13 @@ class TestCorruptedFiles(unittest.TestCase): os.remove('./tests/data/clean.cleaned.html') with open('./tests/data/clean.html', 'w') as f: - f.write('') + 
f.write('') with self.assertRaises(ValueError): web.HTMLParser('./tests/data/clean.html') os.remove('./tests/data/clean.html') with open('./tests/data/clean.html', 'w') as f: - f.write('') + f.write('test') p = web.HTMLParser('./tests/data/clean.html') with self.assertRaises(ValueError): p.get_meta() @@ -269,6 +269,9 @@ class TestCorruptedFiles(unittest.TestCase): os.remove('./tests/data/clean.html') with open('./tests/data/clean.html', 'w') as f: + f.write('') + f.write('<title>pouet') + f.write('<mysupertag/>') f.write('

') p = web.HTMLParser('./tests/data/clean.html') with self.assertRaises(ValueError): diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 249c56d..f4b1890 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -3,6 +3,7 @@ import unittest import shutil import os +import re import zipfile from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless @@ -644,7 +645,10 @@ class TestCleaning(unittest.TestCase): self.assertTrue(ret) p = epub.EPUBParser('./tests/data/clean.cleaned.epub') - self.assertEqual(p.get_meta(), {}) + meta = p.get_meta() + res = re.match('^[0-9a-f-]+$', meta['OEBPS/content.opf']['metadata']) + self.assertIsNotNone(res) + self.assertTrue(p.remove_all()) os.remove('./tests/data/clean.epub')