2019-02-21 01:28:11 +01:00
|
|
|
import logging
|
|
|
|
import re
|
2019-02-27 23:04:38 +01:00
|
|
|
import uuid
|
2021-03-07 17:50:25 +01:00
|
|
|
import zipfile
|
2019-02-21 01:28:11 +01:00
|
|
|
import xml.etree.ElementTree as ET # type: ignore
|
2021-03-07 17:50:17 +01:00
|
|
|
from typing import Dict, Any
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
from . import archive, office
|
|
|
|
|
2019-04-27 13:05:36 +02:00
|
|
|
class EPUBParser(archive.ZipParser):
    """Parser for EPUB e-books: zip containers holding XML/XHTML content."""
    # Mimetype(s) handled by this parser.
    mimetypes = {'application/epub+zip', }
    # Dublin Core XML namespace, used when rebuilding scrubbed metadata elements.
    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
def __init__(self, filename):
|
|
|
|
super().__init__(filename)
|
|
|
|
self.files_to_keep = set(map(re.compile, { # type: ignore
|
|
|
|
'META-INF/container.xml',
|
|
|
|
'mimetype',
|
|
|
|
'OEBPS/content.opf',
|
2021-01-30 16:24:42 +01:00
|
|
|
'content.opf',
|
2021-02-07 17:17:16 +01:00
|
|
|
'hmh.opf',
|
|
|
|
'OPS/.+.xml'
|
2019-02-21 01:28:11 +01:00
|
|
|
}))
|
2021-02-07 17:17:16 +01:00
|
|
|
self.files_to_omit = set(map(re.compile, { # type: ignore
|
2021-03-07 16:42:38 +01:00
|
|
|
'iTunesMetadata.plist',
|
|
|
|
'META-INF/calibre_bookmarks.txt',
|
2021-03-07 16:59:18 +01:00
|
|
|
'OEBPS/package.opf',
|
2021-02-07 17:17:16 +01:00
|
|
|
}))
|
2019-02-27 23:04:38 +01:00
|
|
|
self.uniqid = uuid.uuid4()
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2021-03-07 17:50:25 +01:00
|
|
|
|
|
|
|
def is_archive_valid(self):
|
|
|
|
super().is_archive_valid()
|
|
|
|
with zipfile.ZipFile(self.filename) as zin:
|
|
|
|
for item in self._get_all_members(zin):
|
|
|
|
member_name = self._get_member_name(item)
|
|
|
|
if member_name.endswith('META-INF/encryption.xml'):
|
|
|
|
raise ValueError('the file contains encrypted fonts')
|
|
|
|
|
|
|
|
def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
|
2021-02-07 17:17:16 +01:00
|
|
|
if not file_path.endswith('.opf'):
|
2019-02-21 01:28:11 +01:00
|
|
|
return {}
|
|
|
|
|
|
|
|
with open(full_path, encoding='utf-8') as f:
|
|
|
|
try:
|
|
|
|
results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
|
|
|
|
f.read(), re.I|re.M)
|
|
|
|
return {k:v for (k, v) in results}
|
|
|
|
except (TypeError, UnicodeDecodeError):
|
|
|
|
return {file_path: 'harmful content', }
|
|
|
|
|
2021-03-07 17:50:17 +01:00
|
|
|
def _specific_cleanup(self, full_path: str) -> bool:
|
2021-02-07 17:17:16 +01:00
|
|
|
if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
|
2019-02-27 23:04:38 +01:00
|
|
|
return self.__handle_contentopf(full_path)
|
|
|
|
elif full_path.endswith('OEBPS/toc.ncx'):
|
|
|
|
return self.__handle_tocncx(full_path)
|
2021-02-07 17:17:16 +01:00
|
|
|
elif re.search('/OPS/[^/]+.xml$', full_path):
|
|
|
|
return self.__handle_ops_xml(full_path)
|
2019-02-27 23:04:38 +01:00
|
|
|
return True
|
|
|
|
|
2021-03-07 17:50:17 +01:00
|
|
|
def __handle_ops_xml(self, full_path: str) -> bool:
|
2021-02-07 17:17:16 +01:00
|
|
|
try:
|
|
|
|
tree, namespace = office._parse_xml(full_path)
|
|
|
|
except ET.ParseError: # pragma: nocover
|
|
|
|
logging.error("Unable to parse %s in %s.", full_path, self.filename)
|
|
|
|
return False
|
|
|
|
|
|
|
|
for item in tree.iterfind('.//', namespace): # pragma: nocover
|
|
|
|
if item.tag.strip().lower().endswith('head'):
|
|
|
|
item.clear()
|
|
|
|
break
|
|
|
|
tree.write(full_path, xml_declaration=True, encoding='utf-8',
|
|
|
|
short_empty_elements=False)
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
2021-03-07 17:50:17 +01:00
|
|
|
def __handle_tocncx(self, full_path: str) -> bool:
|
2019-02-27 23:04:38 +01:00
|
|
|
try:
|
|
|
|
tree, namespace = office._parse_xml(full_path)
|
|
|
|
except ET.ParseError: # pragma: nocover
|
|
|
|
logging.error("Unable to parse %s in %s.", full_path, self.filename)
|
|
|
|
return False
|
|
|
|
|
|
|
|
for item in tree.iterfind('.//', namespace): # pragma: nocover
|
|
|
|
if item.tag.strip().lower().endswith('head'):
|
|
|
|
item.clear()
|
|
|
|
ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
|
|
|
|
break
|
|
|
|
tree.write(full_path, xml_declaration=True, encoding='utf-8',
|
|
|
|
short_empty_elements=False)
|
|
|
|
return True
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2021-03-07 17:50:17 +01:00
|
|
|
def __handle_contentopf(self, full_path: str) -> bool:
|
2019-02-21 01:28:11 +01:00
|
|
|
try:
|
|
|
|
tree, namespace = office._parse_xml(full_path)
|
|
|
|
except ET.ParseError:
|
|
|
|
logging.error("Unable to parse %s in %s.", full_path, self.filename)
|
|
|
|
return False
|
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
for item in tree.iterfind('.//', namespace): # pragma: nocover
|
2019-02-21 01:28:11 +01:00
|
|
|
if item.tag.strip().lower().endswith('metadata'):
|
2019-02-27 23:04:38 +01:00
|
|
|
item.clear()
|
|
|
|
|
|
|
|
# item with mandatory content
|
|
|
|
uniqid = ET.Element(self.metadata_namespace + 'identifier')
|
|
|
|
uniqid.text = str(self.uniqid)
|
|
|
|
uniqid.set('id', 'id')
|
|
|
|
item.append(uniqid)
|
|
|
|
|
|
|
|
# items without mandatory content
|
|
|
|
for name in {'language', 'title'}:
|
|
|
|
uniqid = ET.Element(self.metadata_namespace + name)
|
|
|
|
item.append(uniqid)
|
2019-02-21 01:28:11 +01:00
|
|
|
break # there is only a single <metadata> block
|
2019-02-27 23:04:38 +01:00
|
|
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
2019-02-21 01:28:11 +01:00
|
|
|
return True
|