1
0
mirror of synced 2024-05-29 17:38:03 +02:00

Refactor office get_meta handling a bit

This should make it easier to get more metadata from
archive-based file formats.
This commit is contained in:
jvoisin 2019-02-03 22:55:15 +01:00
parent 54e50450ad
commit b9a62d798a
3 changed files with 34 additions and 25 deletions

View File

@ -4,7 +4,7 @@ import tempfile
import os import os
import logging import logging
import shutil import shutil
from typing import Dict, Set, Pattern, Union from typing import Dict, Set, Pattern, Union, Any
from . import abstract, UnknownMemberPolicy, parser_factory from . import abstract, UnknownMemberPolicy, parser_factory
@ -42,6 +42,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument,no-self-use # pylint: disable=unused-argument,no-self-use
return True # pragma: no cover return True # pragma: no cover
def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
""" This method can be used to extract specific metadata
from files present in the archive."""
# pylint: disable=unused-argument,no-self-use
return {} # pragma: no cover
@staticmethod @staticmethod
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
zipinfo.create_system = 3 # Linux zipinfo.create_system = 3 # Linux
@ -74,6 +80,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
temp_folder = tempfile.mkdtemp() temp_folder = tempfile.mkdtemp()
for item in zin.infolist(): for item in zin.infolist():
local_meta = dict() # type: Dict[str, Union[str, Dict]]
for k, v in self._get_zipinfo_meta(item).items():
local_meta[k] = v
if item.filename[-1] == '/': # pragma: no cover if item.filename[-1] == '/': # pragma: no cover
# `is_dir` is added in Python3.6 # `is_dir` is added in Python3.6
continue # don't keep empty folders continue # don't keep empty folders
@ -81,11 +91,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
zin.extract(member=item, path=temp_folder) zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, item.filename) full_path = os.path.join(temp_folder, item.filename)
tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore specific_meta = self._specific_get_meta(full_path, item.filename)
if not tmp_parser: for (k, v) in specific_meta.items():
continue local_meta[k] = v
tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore
if tmp_parser:
for k, v in tmp_parser.get_meta().items():
local_meta[k] = v
local_meta = tmp_parser.get_meta()
if local_meta: if local_meta:
meta[item.filename] = local_meta meta[item.filename] = local_meta

View File

@ -2,7 +2,7 @@ import logging
import os import os
import re import re
import zipfile import zipfile
from typing import Dict, Set, Pattern, Tuple, Union from typing import Dict, Set, Pattern, Tuple, Union, Any
import xml.etree.ElementTree as ET # type: ignore import xml.etree.ElementTree as ET # type: ignore
@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return True return True
def get_meta(self) -> Dict[str, Union[str, dict]]: def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
""" """
Yes, I know that parsing xml with regexp ain't pretty, Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want. be my guest and fix it if you want.
""" """
metadata = super().get_meta() if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
zipin = zipfile.ZipFile(self.filename) return {}
for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): with open(full_path, encoding='utf-8') as f:
try: try:
content = zipin.read(item).decode('utf-8') results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) return {k:v for (k, v) in results}
for (key, value) in results: except (TypeError, UnicodeDecodeError):
metadata[key] = value # We didn't manage to parse the xml file
except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file return {file_path: 'harmful content', }
metadata[item.filename] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value
zipin.close()
return metadata
class LibreOfficeParser(ArchiveBasedAbstractParser): class LibreOfficeParser(ArchiveBasedAbstractParser):

View File

@ -131,9 +131,9 @@ class TestGetMeta(unittest.TestCase):
def test_docx(self): def test_docx(self):
p = office.MSOfficeParser('./tests/data/dirty.docx') p = office.MSOfficeParser('./tests/data/dirty.docx')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin')
self.assertEqual(meta['dc:creator'], 'julien voisin') self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin')
self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
def test_libreoffice(self): def test_libreoffice(self):
p = office.LibreOfficeParser('./tests/data/dirty.odt') p = office.LibreOfficeParser('./tests/data/dirty.odt')