2019-02-21 01:28:11 +01:00
|
|
|
import logging
|
|
|
|
import re
|
2019-02-27 23:04:38 +01:00
|
|
|
import uuid
|
2021-03-07 17:50:25 +01:00
|
|
|
import zipfile
|
2019-02-21 01:28:11 +01:00
|
|
|
import xml.etree.ElementTree as ET # type: ignore
|
2021-03-07 17:50:17 +01:00
|
|
|
from typing import Dict, Any
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
from . import archive, office
|
|
|
|
|
2019-04-27 13:05:36 +02:00
|
|
|
class EPUBParser(archive.ZipParser):
    """Parser for EPUB e-books: zip containers holding XML/XHTML content."""
    # Mimetype(s) handled by this parser.
    mimetypes = {'application/epub+zip', }
    # Dublin Core XML namespace, used when rebuilding scrubbed metadata elements.
    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
|
2019-02-21 01:28:11 +01:00
|
|
|
|
|
|
|
def __init__(self, filename):
|
|
|
|
super().__init__(filename)
|
|
|
|
self.files_to_keep = set(map(re.compile, { # type: ignore
|
|
|
|
'META-INF/container.xml',
|
|
|
|
'mimetype',
|
|
|
|
'OEBPS/content.opf',
|
2021-01-30 16:24:42 +01:00
|
|
|
'content.opf',
|
2021-02-07 17:17:16 +01:00
|
|
|
'hmh.opf',
|
|
|
|
'OPS/.+.xml'
|
2019-02-21 01:28:11 +01:00
|
|
|
}))
|
2021-02-07 17:17:16 +01:00
|
|
|
self.files_to_omit = set(map(re.compile, { # type: ignore
|
2021-03-07 16:42:38 +01:00
|
|
|
'iTunesMetadata.plist',
|
|
|
|
'META-INF/calibre_bookmarks.txt',
|
2021-03-07 16:59:18 +01:00
|
|
|
'OEBPS/package.opf',
|
2021-02-07 17:17:16 +01:00
|
|
|
}))
|
2019-02-27 23:04:38 +01:00
|
|
|
self.uniqid = uuid.uuid4()
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2021-03-07 17:50:25 +01:00
|
|
|
|
|
|
|
def is_archive_valid(self):
|
|
|
|
super().is_archive_valid()
|
|
|
|
with zipfile.ZipFile(self.filename) as zin:
|
|
|
|
for item in self._get_all_members(zin):
|
|
|
|
member_name = self._get_member_name(item)
|
|
|
|
if member_name.endswith('META-INF/encryption.xml'):
|
|
|
|
raise ValueError('the file contains encrypted fonts')
|
|
|
|
|
|
|
|
def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
|
2021-02-07 17:17:16 +01:00
|
|
|
if not file_path.endswith('.opf'):
|
2019-02-21 01:28:11 +01:00
|
|
|
return {}
|
|
|
|
|
|
|
|
with open(full_path, encoding='utf-8') as f:
|
|
|
|
try:
|
|
|
|
results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
|
|
|
|
f.read(), re.I|re.M)
|
|
|
|
return {k:v for (k, v) in results}
|
|
|
|
except (TypeError, UnicodeDecodeError):
|
|
|
|
return {file_path: 'harmful content', }
|
|
|
|
|
2021-03-07 17:50:17 +01:00
|
|
|
def _specific_cleanup(self, full_path: str) -> bool:
|
2021-02-07 17:17:16 +01:00
|
|
|
if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
|
2019-02-27 23:04:38 +01:00
|
|
|
return self.__handle_contentopf(full_path)
|
|
|
|
elif full_path.endswith('OEBPS/toc.ncx'):
|
|
|
|
return self.__handle_tocncx(full_path)
|
2021-02-07 17:17:16 +01:00
|
|
|
elif re.search('/OPS/[^/]+.xml$', full_path):
|
|
|
|
return self.__handle_ops_xml(full_path)
|
2019-02-27 23:04:38 +01:00
|
|
|
return True
|
|
|
|
|
2021-03-07 17:50:17 +01:00
|
|
|
def __handle_ops_xml(self, full_path: str) -> bool:
|
2021-02-07 17:17:16 +01:00
|
|
|
try:
|
|
|
|
tree, namespace = office._parse_xml(full_path)
|
|
|
|
except ET.ParseError: # pragma: nocover
|
|
|
|
logging.error("Unable to parse %s in %s.", full_path, self.filename)
|
|
|
|
return False
|
|
|
|
|
|
|
|
for item in tree.iterfind('.//', namespace): # pragma: nocover
|
|
|
|
if item.tag.strip().lower().endswith('head'):
|
|
|
|
item.clear()
|
|
|
|
break
|
|
|
|
tree.write(full_path, xml_declaration=True, encoding='utf-8',
|
|
|
|
short_empty_elements=False)
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
2021-03-07 17:50:17 +01:00
|
|
|
def __handle_tocncx(self, full_path: str) -> bool:
|
2019-02-27 23:04:38 +01:00
|
|
|
try:
|
|
|
|
tree, namespace = office._parse_xml(full_path)
|
|
|
|
except ET.ParseError: # pragma: nocover
|
|
|
|
logging.error("Unable to parse %s in %s.", full_path, self.filename)
|
|
|
|
return False
|
|
|
|
|
|
|
|
for item in tree.iterfind('.//', namespace): # pragma: nocover
|
|
|
|
if item.tag.strip().lower().endswith('head'):
|
|
|
|
item.clear()
|
|
|
|
ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
|
|
|
|
break
|
|
|
|
tree.write(full_path, xml_declaration=True, encoding='utf-8',
|
|
|
|
short_empty_elements=False)
|
|
|
|
return True
|
2019-02-21 01:28:11 +01:00
|
|
|
|
2021-03-07 17:50:17 +01:00
|
|
|
def __handle_contentopf(self, full_path: str) -> bool:
|
2019-02-21 01:28:11 +01:00
|
|
|
try:
|
|
|
|
tree, namespace = office._parse_xml(full_path)
|
|
|
|
except ET.ParseError:
|
|
|
|
logging.error("Unable to parse %s in %s.", full_path, self.filename)
|
|
|
|
return False
|
|
|
|
|
2019-02-27 23:04:38 +01:00
|
|
|
for item in tree.iterfind('.//', namespace): # pragma: nocover
|
2019-02-21 01:28:11 +01:00
|
|
|
if item.tag.strip().lower().endswith('metadata'):
|
2019-02-27 23:04:38 +01:00
|
|
|
item.clear()
|
|
|
|
|
|
|
|
# item with mandatory content
|
|
|
|
uniqid = ET.Element(self.metadata_namespace + 'identifier')
|
|
|
|
uniqid.text = str(self.uniqid)
|
|
|
|
uniqid.set('id', 'id')
|
|
|
|
item.append(uniqid)
|
|
|
|
|
|
|
|
# items without mandatory content
|
|
|
|
for name in {'language', 'title'}:
|
|
|
|
uniqid = ET.Element(self.metadata_namespace + name)
|
|
|
|
item.append(uniqid)
|
2019-02-21 01:28:11 +01:00
|
|
|
break # there is only a single <metadata> block
|
2019-02-27 23:04:38 +01:00
|
|
|
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
2019-02-21 01:28:11 +01:00
|
|
|
return True
|