1
0
Fork 0
mirror of synced 2025-07-03 20:07:28 +02:00

Implement epub support

This commit is contained in:
jvoisin 2019-02-20 16:28:11 -08:00
parent 6b45064c78
commit 02ff21b158
9 changed files with 282 additions and 83 deletions

47
libmat2/epub.py Normal file
View file

@ -0,0 +1,47 @@
import logging
import re
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """ Parser for epub files, which are zip archives of (mostly) XHTML
    content plus an OPF package document holding the metadata. """
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        super().__init__(filename)
        # These members are required for a valid epub, so they must
        # survive the cleaning pass.
        self.files_to_keep = {re.compile(pattern) for pattern in (  # type: ignore
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        )}

    def _specific_get_meta(self, full_path, file_path):
        # Metadata only lives in the OPF package document.
        if file_path != 'OEBPS/content.opf':
            return {}
        with open(full_path, encoding='utf-8') as f:
            try:
                found = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                   f.read(), re.I|re.M)
                return dict(found)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        # Only the OPF package document needs in-place scrubbing.
        if not full_path.endswith('OEBPS/content.opf'):
            return True

        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        # ElementTree elements have no parent pointers, so build a
        # child -> parent map to be able to detach the metadata node.
        parents = {}
        for parent in tree.iter():
            for child in parent:
                parents[child] = parent

        for element in tree.iterfind('.//', namespace):
            if element.tag.strip().lower().endswith('metadata'):
                parents[element].remove(element)
                break  # there is only a single <metadata> block

        tree.write(full_path, xml_declaration=True)
        return True

View file

@ -1,69 +0,0 @@
from html import parser
from typing import Dict, Any, List, Tuple
from . import abstract
class HTMLParser(abstract.AbstractParser):
    """ Parser for html files: delegates to an internal validating
    parser that harvests metadata from <meta/> tags and can write
    back a metadata-free copy of the document. """
    mimetypes = {'text/html', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser()
        # Force utf-8 instead of the locale-dependent default encoding,
        # otherwise perfectly valid html files can raise UnicodeDecodeError.
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """ Return the name -> content pairs found in <meta/> tags. """
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """ Write a metadata-free copy of the document to
        self.output_filename. """
        return self.__parser.remove_all(self.output_filename)
class _HTMLParser(parser.HTMLParser):
"""Python doesn't have a validating html parser in its stdlib, so
we're using an internal queue to track all the opening/closing tags,
and hoping for the best.
"""
def __init__(self):
super().__init__()
self.__textrepr = ''
self.__meta = {}
self.__validation_queue = []
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
self.__textrepr += self.get_starttag_text()
self.__validation_queue.append(tag)
def handle_endtag(self, tag: str):
if not self.__validation_queue:
raise ValueError
elif tag != self.__validation_queue.pop():
raise ValueError
# There is no `get_endtag_text()` method :/
self.__textrepr += '</' + tag + '>\n'
def handle_data(self, data: str):
if data.strip():
self.__textrepr += data
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
if tag == 'meta':
meta = {k:v for k, v in attrs}
name = meta.get('name', 'harmful metadata')
content = meta.get('content', 'harmful data')
self.__meta[name] = content
else:
self.__textrepr += self.get_starttag_text()
def remove_all(self, output_filename: str) -> bool:
if self.__validation_queue:
raise ValueError
with open(output_filename, 'w') as f:
f.write(self.__textrepr)
return True
def get_meta(self) -> Dict[str, Any]:
if self.__validation_queue:
raise ValueError
return self.__meta

View file

@ -1,3 +1,4 @@
import logging
import glob
import os
import mimetypes
@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy
T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub')
# EPUB Navigation Control XML File
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
def __load_all_parsers():
""" Loads every parser in a dynamic way """
@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
if mtype in parser_class.mimetypes:
try:
return parser_class(filename), mtype
except ValueError:
except ValueError as e:
logging.info("Got an exception when trying to instanciate "
"%s for %s: %s", parser_class, filename, e)
return None, mtype
return None, mtype

122
libmat2/web.py Normal file
View file

@ -0,0 +1,122 @@
from html import parser
from typing import Dict, Any, List, Tuple
import re
import string
from . import abstract
class CSSParser(abstract.AbstractParser):
    """There is no such thing as metadata in CSS files per se; the only
    place where information can hide is in `/* ... */` comments, so those
    are what we harvest and strip."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """ Strip every comment from the stylesheet. """
        with open(self.filename, encoding='utf-8') as f:
            content = f.read()
        stripped = re.sub(r'/\*.+?\*/', '', content, 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(stripped)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """ Harvest `key: value` pairs from the stylesheet's comments. """
        with open(self.filename, encoding='utf-8') as f:
            comments = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
        metadata = {}
        for comment in comments:
            for line in comment.splitlines():
                try:
                    key, value = line.split(':')
                    metadata[key.strip(string.whitespace + '*')] = value.strip()
                except ValueError:
                    # Not a `key: value` line (zero or several colons).
                    metadata['harmful data'] = line.strip()
        return metadata
class HTMLParser(abstract.AbstractParser):
    """ Parser for html documents and epub navigation control files:
    delegates to an internal validating parser that collects metadata
    and can write back a scrubbed copy. """
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename)
        with open(filename, encoding='utf-8') as f:
            content = f.read()
        self.__parser.feed(content)
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """ Return the metadata harvested from the document. """
        return self.__parser.get_meta()

    def remove_all(self) -> bool:
        """ Write a metadata-free copy to self.output_filename. """
        return self.__parser.remove_all(self.output_filename)
class _HTMLParser(parser.HTMLParser):
"""Python doesn't have a validating html parser in its stdlib, so
we're using an internal queue to track all the opening/closing tags,
and hoping for the best.
"""
tag_blacklist = {'doctitle', 'meta'} # everything is lowercase
def __init__(self, filename):
super().__init__()
self.filename = filename
self.__textrepr = ''
self.__meta = {}
self.__validation_queue = []
# We're using a counter instead of a boolean to handle nested tags
self.__in_dangerous_tag = 0
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
self.__validation_queue.append(tag)
if tag in self.tag_blacklist:
self.__in_dangerous_tag += 1
return
if self.__in_dangerous_tag == 0:
self.__textrepr += self.get_starttag_text()
def handle_endtag(self, tag: str):
if not self.__validation_queue:
raise ValueError("The closing tag %s doesn't have a corresponding "
"opening one in %s." % (tag, self.filename))
previous_tag = self.__validation_queue.pop()
if tag != previous_tag:
raise ValueError("The closing tag %s doesn't match the previous "
"tag %s in %s" %
(tag, previous_tag, self.filename))
elif tag in self.tag_blacklist:
self.__in_dangerous_tag -= 1
return
if self.__in_dangerous_tag == 0:
# There is no `get_endtag_text()` method :/
self.__textrepr += '</' + tag + '>\n'
def handle_data(self, data: str):
if self.__in_dangerous_tag == 0 and data.strip():
self.__textrepr += data
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
if tag in self.tag_blacklist:
meta = {k:v for k, v in attrs}
name = meta.get('name', 'harmful metadata')
content = meta.get('content', 'harmful data')
self.__meta[name] = content
else:
if self.__in_dangerous_tag == 0:
self.__textrepr += self.get_starttag_text()
def remove_all(self, output_filename: str) -> bool:
if self.__validation_queue:
raise ValueError("Some tags (%s) were left unclosed in %s" % (
', '.join(self.__validation_queue),
self.filename))
with open(output_filename, 'w', encoding='utf-8') as f:
f.write(self.__textrepr)
return True
def get_meta(self) -> Dict[str, Any]:
if self.__validation_queue:
raise ValueError("Some tags (%s) were left unclosed in %s" % (
', '.join(self.__validation_queue),
self.filename))
return self.__meta