Improve the way we parse/display pdf metadata
This commit is contained in:
parent
0239ab3b6a
commit
7ec1eff96e
11
src/pdf.py
11
src/pdf.py
@ -3,6 +3,7 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import logging
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
import io
|
import io
|
||||||
@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def __parse_metadata_field(self, data:str) -> dict:
    """ Extract namespaced key/value pairs from a raw metadata string

    Looks for elements of the form `<ns:Key>value</ns:Key>` where `ns`
    is one of `xmp`, `pdfx`, `pdf` or `xmpMM` (matched case-insensitively)
    and returns a dict mapping each `Key` (namespace prefix stripped) to
    its text value.

    Only elements that open and close on the same line are found, since
    `.` does not match a newline here. If a key appears more than once,
    the last occurrence wins.
    """
    metadata = {}
    # Both groups are non-greedy so that a match stops at the *first*
    # matching closing tag; a greedy value would swallow everything up to
    # the last `</ns:Key>` on the line when the same element is repeated.
    pattern = r"<(xmp|pdfx|pdf|xmpMM):(.+?)>(.+?)</\1:\2>"
    for (_, key, value) in re.findall(pattern, data, re.I):
        metadata[key] = value
    return metadata
|
||||||
|
|
||||||
def get_meta(self):
|
def get_meta(self):
|
||||||
""" Return a dict with all the meta of the file
|
""" Return a dict with all the meta of the file
|
||||||
"""
|
"""
|
||||||
@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
for key in self.meta_list:
|
for key in self.meta_list:
|
||||||
if document.get_property(key):
|
if document.get_property(key):
|
||||||
metadata[key] = document.get_property(key)
|
metadata[key] = document.get_property(key)
|
||||||
|
if 'metadata' in metadata:
|
||||||
|
parsed_meta = self.__parse_metadata_field(metadata['metadata'])
|
||||||
|
return {**metadata, **parsed_meta}
|
||||||
return metadata
|
return metadata
|
||||||
|
@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase):
|
|||||||
meta = p.get_meta()
|
meta = p.get_meta()
|
||||||
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
|
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
|
||||||
self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
|
self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
|
||||||
|
self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
|
||||||
|
self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \
|
||||||
|
"3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \
|
||||||
|
"version 6.1.1")
|
||||||
|
|
||||||
def test_png(self):
|
def test_png(self):
|
||||||
p = images.PNGParser('./tests/data/dirty.png')
|
p = images.PNGParser('./tests/data/dirty.png')
|
||||||
|
Loading…
Reference in New Issue
Block a user