Improve the way we parse/display pdf metadata
This commit is contained in:
parent
0239ab3b6a
commit
7ec1eff96e
11
src/pdf.py
11
src/pdf.py
@ -3,6 +3,7 @@
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import tempfile
|
||||
import io
|
||||
@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser):
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def __parse_metadata_field(self, data: str) -> dict:
    """Extract flat key/value pairs from a raw XMP-style metadata string.

    Only elements whose namespace prefix is one of ``xmp``, ``pdfx``,
    ``pdf`` or ``xmpMM`` (matched case-insensitively) are considered, and
    only when the opening tag, the value and the matching closing tag all
    sit on the same line (``.`` does not match newlines here).

    :param data: the raw metadata text to scan.
    :return: a dict mapping each tag's local name (the part after the
        namespace prefix) to its text content. If the same local name is
        found several times, the last occurrence wins.
    """
    # The tag name excludes whitespace and angle brackets, and the value is
    # matched lazily, so that two elements sharing a line are reported as
    # two separate entries instead of the first value swallowing everything
    # up to the last closing tag.
    pattern = r"<(xmp|pdfx|pdf|xmpMM):([^\s<>]+)>(.+?)</\1:\2>"
    metadata = {}
    for _, key, value in re.findall(pattern, data, re.I):
        metadata[key] = value
    return metadata
|
||||
|
||||
def get_meta(self):
|
||||
""" Return a dict with all the meta of the file
|
||||
"""
|
||||
@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser):
|
||||
for key in self.meta_list:
|
||||
if document.get_property(key):
|
||||
metadata[key] = document.get_property(key)
|
||||
if 'metadata' in metadata:
|
||||
parsed_meta = self.__parse_metadata_field(metadata['metadata'])
|
||||
return {**metadata, **parsed_meta}
|
||||
return metadata
|
||||
|
@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase):
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
|
||||
self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
|
||||
self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
|
||||
self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \
|
||||
"3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \
|
||||
"version 6.1.1")
|
||||
|
||||
def test_png(self):
|
||||
p = images.PNGParser('./tests/data/dirty.png')
|
||||
|
Loading…
Reference in New Issue
Block a user