1
0
mirror of synced 2024-11-25 18:54:22 +01:00

Improve the way we parse/display pdf metadata

This commit is contained in:
jvoisin 2018-04-11 23:20:59 +02:00
parent 0239ab3b6a
commit 7ec1eff96e
2 changed files with 15 additions and 0 deletions

View File

@ -3,6 +3,7 @@
""" """
import os import os
import re
import logging import logging
import tempfile import tempfile
import io import io
@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser):
return True return True
def __parse_metadata_field(self, data: str) -> dict:
    """ Extract simple `<prefix:key>value</prefix:key>` elements from `data`.

    Only elements whose namespace prefix is one of `xmp`, `pdfx`, `pdf` or
    `xmpMM` (matched case-insensitively) are considered. Since `.` does not
    match a newline, an element must be opened and closed on the same line
    to be picked up.

    :param data: the raw text to scan for metadata elements
    :return: a dict mapping each element name (without its prefix) to its
             text content; if a name occurs several times, the last
             occurrence wins
    """
    # The quantifiers are lazy on purpose: with greedy ones, two elements
    # sharing the same name on a single line would be merged into a single
    # value containing the markup sitting between them.
    pattern = r"<(xmp|pdfx|pdf|xmpMM):(.+?)>(.+?)</\1:\2>"
    return {key: value for (_, key, value) in re.findall(pattern, data, re.I)}
def get_meta(self): def get_meta(self):
""" Return a dict with all the meta of the file """ Return a dict with all the meta of the file
""" """
@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser):
for key in self.meta_list: for key in self.meta_list:
if document.get_property(key): if document.get_property(key):
metadata[key] = document.get_property(key) metadata[key] = document.get_property(key)
if 'metadata' in metadata:
parsed_meta = self.__parse_metadata_field(metadata['metadata'])
return {**metadata, **parsed_meta}
return metadata return metadata

View File

@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'") self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \
"3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \
"version 6.1.1")
def test_png(self): def test_png(self):
p = images.PNGParser('./tests/data/dirty.png') p = images.PNGParser('./tests/data/dirty.png')