1
0
Fork 0

Improve epub support

This commit is contained in:
jvoisin 2019-02-27 23:04:38 +01:00
parent eb2e702f37
commit 73d2966e8c
4 changed files with 114 additions and 32 deletions

View File

@ -1,11 +1,13 @@
import logging
import re
import uuid
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
mimetypes = {'application/epub+zip', }
metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
def __init__(self, filename):
super().__init__(filename)
@ -14,6 +16,7 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
'mimetype',
'OEBPS/content.opf',
}))
self.uniqid = uuid.uuid4()
def _specific_get_meta(self, full_path, file_path):
if file_path != 'OEBPS/content.opf':
@ -25,23 +28,52 @@ class EPUBParser(archive.ArchiveBasedAbstractParser):
f.read(), re.I|re.M)
return {k:v for (k, v) in results}
except (TypeError, UnicodeDecodeError):
# We didn't manage to parse the xml file
return {file_path: 'harmful content', }
def _specific_cleanup(self, full_path: str):
if not full_path.endswith('OEBPS/content.opf'):
return True
if full_path.endswith('OEBPS/content.opf'):
return self.__handle_contentopf(full_path)
elif full_path.endswith('OEBPS/toc.ncx'):
return self.__handle_tocncx(full_path)
return True
def __handle_tocncx(self, full_path: str):
    """Strip metadata from an epub toc.ncx file.

    The `<head>` element is cleared and replaced with a single empty
    `<meta>` element, since the ncx schema requires one to be present.

    :return: True on success, False if the xml could not be parsed.
    """
    try:
        tree, namespace = office._parse_xml(full_path)
    except ET.ParseError:  # pragma: nocover
        logging.error("Unable to parse %s in %s.", full_path, self.filename)
        return False

    for item in tree.iterfind('.//', namespace):  # pragma: nocover
        if item.tag.strip().lower().endswith('head'):
            item.clear()
            # An empty <meta> keeps the document schema-valid.
            ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
            break
    tree.write(full_path, xml_declaration=True, encoding='utf-8',
               short_empty_elements=False)
    return True
def __handle_contentopf(self, full_path: str):
    """Replace the `<metadata>` block of an epub content.opf file.

    The original metadata is discarded and substituted with the bare
    minimum required by the epub spec: a fresh `dc:identifier` (using the
    random uuid generated in __init__) plus empty `dc:title` and
    `dc:language` elements.

    :return: True on success, False if the xml could not be parsed.
    """
    try:
        tree, namespace = office._parse_xml(full_path)
    except ET.ParseError:
        logging.error("Unable to parse %s in %s.", full_path, self.filename)
        return False

    for item in tree.iterfind('.//', namespace):  # pragma: nocover
        if item.tag.strip().lower().endswith('metadata'):
            item.clear()

            # item with mandatory content
            uniqid = ET.Element(self.metadata_namespace + 'identifier')
            uniqid.text = str(self.uniqid)
            uniqid.set('id', 'id')
            item.append(uniqid)

            # items without mandatory content; an ordered tuple (not a
            # set) keeps the element order deterministic, as the test
            # suite's regex expects <dc:title /> before <dc:language />
            for name in ('title', 'language'):
                item.append(ET.Element(self.metadata_namespace + name))
            break  # there is only a single <metadata> block
    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True

View File

@ -1,10 +1,13 @@
from html import parser
from typing import Dict, Any, List, Tuple
from html import parser, escape
from typing import Dict, Any, List, Tuple, Set
import re
import string
from . import abstract
assert Set
# pylint: disable=too-many-instance-attributes
class CSSParser(abstract.AbstractParser):
"""There is no such things as metadata in CSS files,
@ -33,11 +36,16 @@ class CSSParser(abstract.AbstractParser):
return metadata
class AbstractHTMLParser(abstract.AbstractParser):
    """Base class for html/xml-ish formats: the whole file is fed to an
    internal validating `_HTMLParser`, configured with the subclass's
    tag blacklists.
    """
    # Tags removed entirely, along with their content.
    tags_blacklist = set()  # type: Set[str]
    # In some html/xml based formats some tags are mandatory,
    # so we're keeping them, but are discarding their contents
    tags_required_blacklist = set()  # type: Set[str]

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename, self.tags_blacklist,
                                    self.tags_required_blacklist)
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()
@ -49,29 +57,50 @@ class HTMLParser(abstract.AbstractParser):
return self.__parser.remove_all(self.output_filename)
class HTMLParser(AbstractHTMLParser):
    """Parser for plain html: drops <meta> tags, keeps <title> but
    discards its content.
    """
    mimetypes = {'text/html', }
    tags_blacklist = {'meta', }
    tags_required_blacklist = {'title', }
class DTBNCXParser(AbstractHTMLParser):
    """Parser for epub toc.ncx files; all the listed tags are mandatory
    in the ncx schema, so they are kept but their contents discarded.
    """
    mimetypes = {'application/x-dtbncx+xml', }
    tags_required_blacklist = {'title', 'doctitle', 'meta'}
class _HTMLParser(parser.HTMLParser):
"""Python doesn't have a validating html parser in its stdlib, so
we're using an internal queue to track all the opening/closing tags,
and hoping for the best.
"""
tag_blacklist = {'doctitle', 'meta', 'title'} # everything is lowercase
def __init__(self, filename):
def __init__(self, filename, blacklisted_tags, required_blacklisted_tags):
super().__init__()
self.filename = filename
self.__textrepr = ''
self.__meta = {}
self.__validation_queue = []
# We're using a counter instead of a boolean to handle nested tags
self.__validation_queue = [] # type: List[str]
# We're using counters instead of booleans, to handle nested tags
self.__in_dangerous_but_required_tag = 0
self.__in_dangerous_tag = 0
if required_blacklisted_tags & blacklisted_tags: # pragma: nocover
raise ValueError("There is an overlap between %s and %s" % (
required_blacklisted_tags, blacklisted_tags))
self.tag_required_blacklist = required_blacklisted_tags
self.tag_blacklist = blacklisted_tags
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
    """Track the opening tag (full text, with attributes) and append it
    to the output unless we're inside a blacklisted region.
    """
    # Keep the raw tag text so the closing tag can be validated against
    # it later, attributes included.
    original_tag = self.get_starttag_text()
    self.__validation_queue.append(original_tag)

    if tag in self.tag_required_blacklist:
        self.__in_dangerous_but_required_tag += 1
    if tag in self.tag_blacklist:
        self.__in_dangerous_tag += 1
        return

    if self.__in_dangerous_tag == 0:
        # <= 1: the required-but-dangerous tag itself is kept, only
        # what's nested inside it is dropped.
        if self.__in_dangerous_but_required_tag <= 1:
            self.__textrepr += original_tag
def handle_endtag(self, tag: str):
if not self.__validation_queue:
@ -79,29 +108,43 @@ class _HTMLParser(parser.HTMLParser):
"opening one in %s." % (tag, self.filename))
previous_tag = self.__validation_queue.pop()
if tag != previous_tag:
previous_tag = previous_tag[1:-1] # remove < and >
previous_tag = previous_tag.split(' ')[0] # remove attributes
if tag != previous_tag.lower():
raise ValueError("The closing tag %s doesn't match the previous "
"tag %s in %s" %
(tag, previous_tag, self.filename))
elif tag in self.tag_blacklist:
self.__in_dangerous_tag -= 1
return
if self.__in_dangerous_tag == 0:
# There is no `get_endtag_text()` method :/
self.__textrepr += '</' + tag + '>\n'
if self.__in_dangerous_but_required_tag <= 1:
# There is no `get_endtag_text()` method :/
self.__textrepr += '</' + previous_tag + '>'
if tag in self.tag_required_blacklist:
self.__in_dangerous_but_required_tag -= 1
elif tag in self.tag_blacklist:
self.__in_dangerous_tag -= 1
def handle_data(self, data: str):
    """Append text content to the output, html-escaped, unless we're
    inside any blacklisted region.
    """
    if self.__in_dangerous_but_required_tag == 0:
        if self.__in_dangerous_tag == 0:
            if data.strip():
                # escape() prevents the text from being re-interpreted
                # as markup when the cleaned file is written back.
                self.__textrepr += escape(data)
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
    """Handle self-closing tags (`<meta ... />`).

    Blacklisted self-closing tags have their attributes harvested as
    metadata; required-but-blacklisted ones are re-emitted empty.
    """
    if tag in self.tag_required_blacklist | self.tag_blacklist:
        meta = {k: v for k, v in attrs}
        name = meta.get('name', 'harmful metadata')
        content = meta.get('content', 'harmful data')
        self.__meta[name] = content

        if self.__in_dangerous_tag != 0:
            return
        elif tag in self.tag_required_blacklist:
            # The tag is mandatory in the format, keep an empty one.
            self.__textrepr += '<' + tag + ' />'
        return

    if self.__in_dangerous_but_required_tag == 0:
        if self.__in_dangerous_tag == 0:
            self.__textrepr += self.get_starttag_text()

View File

@ -253,13 +253,13 @@ class TestCorruptedFiles(unittest.TestCase):
os.remove('./tests/data/clean.cleaned.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('</close>')
f.write('</meta>')
with self.assertRaises(ValueError):
web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<notclosed>')
f.write('<meta><a>test</a><set/></meta><title></title><meta>')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
@ -269,6 +269,9 @@ class TestCorruptedFiles(unittest.TestCase):
os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<meta><meta/></meta>')
f.write('<title><title>pouet</title></title>')
f.write('<title><mysupertag/></title>')
f.write('<doctitle><br/></doctitle><br/><notclosed>')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):

View File

@ -3,6 +3,7 @@
import unittest
import shutil
import os
import re
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
@ -644,7 +645,10 @@ class TestCleaning(unittest.TestCase):
self.assertTrue(ret)
p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
meta = p.get_meta()
# BUG FIX: re.match(pattern, string) takes the pattern FIRST — the
# arguments were swapped, so the metadata string was being used as the
# regex. Also, re.match returns a Match object or None (never False),
# so assertNotEqual(res, False) always passed; assertIsNotNone is the
# correct check.
res = re.match(r'^<dc:identifier>[0-9a-f-]+</dc:identifier><dc:title /><dc:language />$',
               meta['OEBPS/content.opf']['metadata'])
self.assertIsNotNone(res)
self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.epub')