1
0
mirror of synced 2024-12-22 20:59:58 +01:00

Improve the reliability of the office parser

This commit is contained in:
jvoisin 2018-06-21 23:18:50 +02:00
parent 846a261465
commit 5b38bd7ccd
2 changed files with 12 additions and 0 deletions

View File

@ -16,6 +16,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# NOTE(review): presumably the names of archive members to keep untouched — confirm against the subclasses.
files_to_keep = set() # type: Set[str]
# NOTE(review): presumably compiled patterns matching archive members to leave out — confirm against the subclasses.
files_to_omit = set() # type: Set[Pattern]
def __init__(self, filename):
    """Initialise the parser and check that the file is a valid zip archive.

    The archive is opened once, purely as a validity probe, so that a
    corrupted or mislabelled file is rejected at construction time rather
    than later during processing.

    Args:
        filename: path of the archive, forwarded to the parent initialiser.

    Raises:
        ValueError: if the file cannot be opened as a zip archive.
    """
    super().__init__(filename)
    try:  # better fail here than later
        # The context manager closes the probe handle immediately; a bare
        # ZipFile(...) call would leave the underlying file open.
        with zipfile.ZipFile(self.filename):
            pass
    except zipfile.BadZipFile as err:
        # Callers (and the test-suite) expect ValueError; keep the original
        # error attached as the cause for easier debugging.
        raise ValueError("%s is not a valid zip archive" % self.filename) from err
def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
zipinfo.create_system = 3 # Linux
zipinfo.comment = b''

View File

@ -90,6 +90,11 @@ class TestCorruptedFiles(unittest.TestCase):
os.remove('./tests/data/clean.torrent')
def test_odg(self):
    """A PNG saved under an .odg name must make LibreOfficeParser raise ValueError."""
    bogus_odg = './tests/data/clean.odg'
    shutil.copy('./tests/data/dirty.png', bogus_odg)
    # Registered right after the copy so the fixture is removed even when
    # the assertion below fails, instead of being left in tests/data.
    self.addCleanup(os.remove, bogus_odg)
    with self.assertRaises(ValueError):
        office.LibreOfficeParser(bogus_odg)
class TestGetMeta(unittest.TestCase):
def test_pdf(self):
p = pdf.PDFParser('./tests/data/dirty.pdf')