1
0
mirror of synced 2024-11-10 19:38:52 +01:00
mat2/libmat2/epub.py

48 lines
1.7 KiB
Python
Raw Normal View History

2019-02-21 01:28:11 +01:00
import logging
import re
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
mimetypes = {'application/epub+zip', }
def __init__(self, filename):
super().__init__(filename)
self.files_to_keep = set(map(re.compile, { # type: ignore
'META-INF/container.xml',
'mimetype',
'OEBPS/content.opf',
}))
def _specific_get_meta(self, full_path, file_path):
if file_path != 'OEBPS/content.opf':
return {}
with open(full_path, encoding='utf-8') as f:
try:
results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
f.read(), re.I|re.M)
return {k:v for (k, v) in results}
except (TypeError, UnicodeDecodeError):
# We didn't manage to parse the xml file
return {file_path: 'harmful content', }
def _specific_cleanup(self, full_path: str):
if not full_path.endswith('OEBPS/content.opf'):
return True
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError:
logging.error("Unable to parse %s in %s.", full_path, self.filename)
return False
parent_map = {c:p for p in tree.iter() for c in p}
for item in tree.iterfind('.//', namespace):
if item.tag.strip().lower().endswith('metadata'):
parent_map[item].remove(item)
break # there is only a single <metadata> block
tree.write(full_path, xml_declaration=True)
return True