diff --git a/src/pdf.py b/src/pdf.py
index 96eec13..c119449 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -3,6 +3,7 @@
 """
 
 import os
+import re
 import logging
 import tempfile
 import io
@@ -76,6 +77,21 @@ class PDFParser(abstract.AbstractParser):
 
         return True
 
+
+    def __parse_metadata_field(self, data: str) -> dict:
+        """ Extract `key: value` pairs from a raw XMP metadata string.
+
+        Only elements in the xmp, pdfx, pdf and xmpMM namespaces whose
+        opening and closing tags sit on the same line are picked up.
+        The closing tag is part of the pattern, so the captured value
+        does not drag the `</ns:key>` suffix along.
+        """
+        metadata = {}
+        pattern = r"<(xmp|pdfx|pdf|xmpMM):(.+?)>(.+?)</\1:\2>"
+        for (_, key, value) in re.findall(pattern, data, re.I):
+            metadata[key] = value
+        return metadata
+
     def get_meta(self):
         """ Return a dict with all the meta of the file
         """
@@ -84,4 +100,7 @@ class PDFParser(abstract.AbstractParser):
         for key in self.meta_list:
             if document.get_property(key):
                 metadata[key] = document.get_property(key)
+        if 'metadata' in metadata:
+            parsed_meta = self.__parse_metadata_field(metadata['metadata'])
+            return {**metadata, **parsed_meta}
         return metadata
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 4cfb80a..6141dbe 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase):
         meta = p.get_meta()
         self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
         self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
+        self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
+        self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \
+                "3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \
+                "version 6.1.1")
 
     def test_png(self):
         p = images.PNGParser('./tests/data/dirty.png')