From ec082d64833ed209d0bbec9ae5171e9378ffcb87 Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sun, 7 Feb 2021 17:17:16 +0100
Subject: [PATCH] Slightly improve EPUB support

---
 libmat2/epub.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/libmat2/epub.py b/libmat2/epub.py
index fd38411..52fab1c 100644
--- a/libmat2/epub.py
+++ b/libmat2/epub.py
@@ -16,11 +16,17 @@ class EPUBParser(archive.ZipParser):
             'mimetype',
             'OEBPS/content.opf',
             'content.opf',
+            'hmh.opf',
+            'OPS/.+.xml'
             }))
+        self.files_to_omit = set(map(re.compile, {  # type: ignore
+            'iTunesMetadata.plist',
+            'META-INF/calibre_bookmarks.txt',
+            }))  # each entry above is compiled as its own pattern
         self.uniqid = uuid.uuid4()
 
     def _specific_get_meta(self, full_path, file_path):
-        if not file_path.endswith('content.opf'):
+        if not file_path.endswith('.opf'):
             return {}
 
         with open(full_path, encoding='utf-8') as f:
@@ -32,12 +38,30 @@ class EPUBParser(archive.ZipParser):
                 return {file_path: 'harmful content', }
 
     def _specific_cleanup(self, full_path: str):
-        if full_path.endswith('content.opf'):
+        if full_path.endswith(('hmh.opf', 'content.opf')):  # either OPF name
             return self.__handle_contentopf(full_path)
         elif full_path.endswith('OEBPS/toc.ncx'):
             return self.__handle_tocncx(full_path)
+        elif re.search(r'/OPS/[^/]+\.xml$', full_path):  # literal ".xml" only
+            return self.__handle_ops_xml(full_path)
         return True
 
+    def __handle_ops_xml(self, full_path: str):
+        """Empty the first <head> of an OPS XML file in place; False if unparsable."""
+        try:
+            tree, namespace = office._parse_xml(full_path)
+        except ET.ParseError:  # pragma: nocover
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
+            if item.tag.rpartition('}')[2].lower() == 'head':  # exact name, not <thead>
+                item.clear()
+                break
+        tree.write(full_path, xml_declaration=True, encoding='utf-8',
+                   short_empty_elements=False)
+        return True
+
     def __handle_tocncx(self, full_path: str):
         try:
             tree, namespace = office._parse_xml(full_path)