Improve epub support

2025-05-25 09:04:18 +02:00 · 2019-02-27 23:04:38 +01:00 · 2019-02-27 23:04:38 +01:00 · 73d2966e8c
commit 73d2966e8c
parent eb2e702f37
4 changed files with 114 additions and 32 deletions
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@ -1,11 +1,13 @@
 import logging
 import re
+import uuid
 import xml.etree.ElementTree as ET  # type: ignore

 from . import archive, office

 class EPUBParser(archive.ArchiveBasedAbstractParser):
    mimetypes = {'application/epub+zip', }
+    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'

    def __init__(self, filename):
        super().__init__(filename)
@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
            'mimetype',
            'OEBPS/content.opf',
            }))
+        self.uniqid = uuid.uuid4()

    def _specific_get_meta(self, full_path, file_path):
        if file_path != 'OEBPS/content.opf':
@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
                                     f.read(), re.I|re.M)
                return {k:v for (k, v) in results}
            except (TypeError, UnicodeDecodeError):
-                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
-        if not full_path.endswith('OEBPS/content.opf'):
+        if full_path.endswith('OEBPS/content.opf'):
+            return self.__handle_contentopf(full_path)
+        elif full_path.endswith('OEBPS/toc.ncx'):
+            return self.__handle_tocncx(full_path)
        return True

+    def __handle_tocncx(self, full_path: str):
+        try:
+            tree, namespace = office._parse_xml(full_path)
+        except ET.ParseError:  # pragma: nocover
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
+            if item.tag.strip().lower().endswith('head'):
+                item.clear()
+                ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
+                break
+        tree.write(full_path, xml_declaration=True, encoding='utf-8',
+                   short_empty_elements=False)
+        return True
+
+    def __handle_contentopf(self, full_path: str):
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False
-        parent_map = {c:p for p in tree.iter() for c in p}

-        for item in tree.iterfind('.//', namespace):
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('metadata'):
-                parent_map[item].remove(item)
+                item.clear()
+
+                # item with mandatory content
+                uniqid = ET.Element(self.metadata_namespace + 'identifier')
+                uniqid.text = str(self.uniqid)
+                uniqid.set('id', 'id')
+                item.append(uniqid)
+
+                # items without mandatory content
+                for name in {'language', 'title'}:
+                    uniqid = ET.Element(self.metadata_namespace + name)
+                    item.append(uniqid)
                break  # there is only a single <metadata> block
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True
--- a/libmat2/web.py
+++ b/libmat2/web.py
@ -1,10 +1,13 @@
-from html import parser
-from typing import Dict, Any, List, Tuple
+from html import parser, escape
+from typing import Dict, Any, List, Tuple, Set
 import re
 import string

 from . import abstract

+assert Set
+
+# pylint: disable=too-many-instance-attributes

 class CSSParser(abstract.AbstractParser):
    """There is no such things as metadata in CSS files,
@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser):
        return metadata


-class HTMLParser(abstract.AbstractParser):
-    mimetypes = {'text/html', 'application/x-dtbncx+xml', }
+class AbstractHTMLParser(abstract.AbstractParser):
+    tags_blacklist = set()  # type: Set[str]
+    # In some html/xml based formats some tags are mandatory,
+    # so we're keeping them, but are discaring their contents
+    tags_required_blacklist = set()  # type: Set[str]
+
    def __init__(self, filename):
        super().__init__(filename)
-        self.__parser = _HTMLParser(self.filename)
+        self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
+                                    self.tags_required_blacklist)
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()
@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser):
        return self.__parser.remove_all(self.output_filename)


+class HTMLParser(AbstractHTMLParser):
+    mimetypes = {'text/html', }
+    tags_blacklist = {'meta', }
+    tags_required_blacklist = {'title', }
+
+
+class DTBNCXParser(AbstractHTMLParser):
+    mimetypes = {'application/x-dtbncx+xml', }
+    tags_required_blacklist = {'title', 'doctitle', 'meta'}
+
+
 class _HTMLParser(parser.HTMLParser):
    """Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.
    """
-    tag_blacklist = {'doctitle', 'meta', 'title'}  # everything is lowercase
-    def __init__(self, filename):
+    def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
        super().__init__()
        self.filename = filename
        self.__textrepr = ''
        self.__meta = {}
-        self.__validation_queue = []
-        # We're using a counter instead of a boolean to handle nested tags
+        self.__validation_queue = []  # type: List[str]
+        # We're using counters instead of booleans, to handle nested tags
+        self.__in_dangerous_but_required_tag = 0
        self.__in_dangerous_tag = 0

+        if required_blacklisted_tags & blacklisted_tags:  # pragma: nocover
+            raise ValueError("There is an overlap between %s and %s" % (
+                required_blacklisted_tags, blacklisted_tags))
+        self.tag_required_blacklist = required_blacklisted_tags
+        self.tag_blacklist = blacklisted_tags
+
    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
-        self.__validation_queue.append(tag)
+        original_tag = self.get_starttag_text()
+        self.__validation_queue.append(original_tag)
+
+        if tag in self.tag_required_blacklist:
+            self.__in_dangerous_but_required_tag += 1
        if tag in self.tag_blacklist:
            self.__in_dangerous_tag += 1
-            return

        if self.__in_dangerous_tag == 0:
-            self.__textrepr += self.get_starttag_text()
+            if self.__in_dangerous_but_required_tag <= 1:
+                self.__textrepr += original_tag

    def handle_endtag(self, tag: str):
        if not self.__validation_queue:
@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser):
                             "opening one in %s." % (tag, self.filename))

        previous_tag = self.__validation_queue.pop()
-        if tag != previous_tag:
+        previous_tag = previous_tag[1:-1]  # remove < and >
+        previous_tag = previous_tag.split(' ')[0]  # remove attributes
+        if tag != previous_tag.lower():
            raise ValueError("The closing tag %s doesn't match the previous "
                             "tag %s in %s" %
                             (tag, previous_tag, self.filename))
-        elif tag in self.tag_blacklist:
-            self.__in_dangerous_tag -= 1
-            return

        if self.__in_dangerous_tag == 0:
+            if self.__in_dangerous_but_required_tag <= 1:
                # There is no `get_endtag_text()` method :/
-            self.__textrepr += '</' + tag + '>\n'
+                self.__textrepr += '</' + previous_tag + '>'
+
+        if tag in self.tag_required_blacklist:
+            self.__in_dangerous_but_required_tag -= 1
+        elif tag in self.tag_blacklist:
+            self.__in_dangerous_tag -= 1

    def handle_data(self, data: str):
-        if self.__in_dangerous_tag == 0 and data.strip():
-            self.__textrepr += data
+        if self.__in_dangerous_but_required_tag == 0:
+            if self.__in_dangerous_tag == 0:
+                if data.strip():
+                    self.__textrepr += escape(data)

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
-        if tag in self.tag_blacklist:
+        if tag in self.tag_required_blacklist | self.tag_blacklist:
            meta = {k:v for k, v in attrs}
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
-        else:
+
+            if self.__in_dangerous_tag != 0:
+                return
+            elif tag in self.tag_required_blacklist:
+                self.__textrepr += '<' + tag + ' />'
+            return
+
+        if self.__in_dangerous_but_required_tag == 0:
            if self.__in_dangerous_tag == 0:
                self.__textrepr += self.get_starttag_text()

--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@ -253,13 +253,13 @@ class TestCorruptedFiles(unittest.TestCase):
        os.remove('./tests/data/clean.cleaned.html')

        with open('./tests/data/clean.html', 'w') as f:
-            f.write('</close>')
+            f.write('</meta>')
        with self.assertRaises(ValueError):
            web.HTMLParser('./tests/data/clean.html')
        os.remove('./tests/data/clean.html')

        with open('./tests/data/clean.html', 'w') as f:
-            f.write('<notclosed>')
+            f.write('<meta><a>test</a><set/></meta><title></title><meta>')
        p = web.HTMLParser('./tests/data/clean.html')
        with self.assertRaises(ValueError):
            p.get_meta()
@ -269,6 +269,9 @@ class TestCorruptedFiles(unittest.TestCase):
        os.remove('./tests/data/clean.html')

        with open('./tests/data/clean.html', 'w') as f:
+            f.write('<meta><meta/></meta>')
+            f.write('<title><title>pouet</title></title>')
+            f.write('<title><mysupertag/></title>')
            f.write('<doctitle><br/></doctitle><br/><notclosed>')
        p = web.HTMLParser('./tests/data/clean.html')
        with self.assertRaises(ValueError):
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@ -3,6 +3,7 @@
 import unittest
 import shutil
 import os
+import re
 import zipfile

 from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
@ -644,7 +645,10 @@ class TestCleaning(unittest.TestCase):
        self.assertTrue(ret)

        p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
-        self.assertEqual(p.get_meta(), {})
+        meta = p.get_meta()
+        res = re.match(meta['OEBPS/content.opf']['metadata'], '^<dc:identifier>[0-9a-f-]+</dc:identifier><dc:title /><dc:language />$')
+        self.assertNotEqual(res, False)
+
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.epub')