1
0
mirror of synced 2025-01-11 14:14:25 +01:00
mat2/libmat2/epub.py

80 lines
3.0 KiB
Python
Raw Normal View History

2019-02-20 16:28:11 -08:00
import logging
import re
2019-02-27 23:04:38 +01:00
import uuid
2019-02-20 16:28:11 -08:00
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """Archive-based parser for EPUB files.

    Metadata is read from ``OEBPS/content.opf``. On cleanup, the
    ``<metadata>`` block of ``OEBPS/content.opf`` and the ``<head>`` block
    of ``OEBPS/toc.ncx`` are emptied and refilled with blank placeholders.
    """
    mimetypes = {'application/epub+zip', }
    # Clark-notation prefix for the Dublin Core element names written back
    # into the cleaned <metadata> block.
    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'

    def __init__(self, filename):
        super().__init__(filename)
        # NOTE: every entry is compiled with re.compile, so it is a regular
        # expression rather than a literal path ('.' matches any character).
        # How the patterns are matched is decided by the parent class.
        self.files_to_keep = set(map(re.compile, {  # type: ignore
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        }))
        # Random identifier written into the cleaned content.opf in place
        # of the original one.
        self.uniqid = uuid.uuid4()

    def _specific_get_meta(self, full_path, file_path):
        """Extract metadata from a single member of the archive.

        :param full_path: path of the member on disk.
        :param file_path: path of the member inside the archive.
        :return: an empty dict for anything but ``OEBPS/content.opf``;
            otherwise a dict mapping each matched ``meta``/``dc``/``cp`` tag
            name to its text content, or ``{file_path: 'harmful content'}``
            when the file cannot be decoded as UTF-8.
        """
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                     f.read(), re.I|re.M)
                # findall yields (tag, content) pairs; for a tag seen
                # several times, the last occurrence wins.
                return dict(results)
            except (TypeError, UnicodeDecodeError):
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str) -> bool:
        """Clean the member at `full_path` if it is a known metadata carrier.

        :return: the result of the dedicated handler for ``content.opf``
            and ``toc.ncx``; ``True`` for every other file, which is left
            untouched by this method.
        """
        if full_path.endswith('OEBPS/content.opf'):
            return self.__handle_contentopf(full_path)
        elif full_path.endswith('OEBPS/toc.ncx'):
            return self.__handle_tocncx(full_path)
        return True

    def __handle_tocncx(self, full_path: str) -> bool:
        """Empty the ``<head>`` block of ``toc.ncx`` and rewrite the file.

        :return: ``False`` if the file is not parseable XML, else ``True``.
        """
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:  # pragma: nocover
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('head'):
                # clear() drops children, attributes and text; a single
                # blank <meta> is then put back as a placeholder.
                item.clear()
                ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
                break
        tree.write(full_path, xml_declaration=True, encoding='utf-8',
                   short_empty_elements=False)
        return True

    def __handle_contentopf(self, full_path: str) -> bool:
        """Replace the ``<metadata>`` block of ``content.opf`` by a blank one.

        The block is emptied, then refilled with an ``identifier`` element
        carrying this parser's random UUID, followed by empty ``language``
        and ``title`` elements.

        :return: ``False`` if the file is not parseable XML, else ``True``.
        """
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('metadata'):
                item.clear()

                # item with mandatory content
                uniqid = ET.Element(self.metadata_namespace + 'identifier')
                uniqid.text = str(self.uniqid)
                uniqid.set('id', 'id')
                item.append(uniqid)

                # items without mandatory content; a tuple (not a set) is
                # iterated so that the element order in the written file
                # is the same on every run.
                for name in ('language', 'title'):
                    placeholder = ET.Element(self.metadata_namespace + name)
                    item.append(placeholder)
                break  # there is only a single <metadata> block
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True