From 1ee936420ca1df1ebff14f19de28df5c41602b2b Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Sat, 31 Mar 2018 20:56:15 +0200
Subject: [PATCH] Display docx metadata

---
 src/office.py         | 13 +++++++++++--
 tests/test_libmat2.py |  4 +++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/office.py b/src/office.py
index 2bdeec7..5de0597 100644
--- a/src/office.py
+++ b/src/office.py
@@ -1,3 +1,4 @@
+import re
 import subprocess
 import json
 import zipfile
@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
     files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
 
     def get_meta(self):
+        """
+        Yes, I know that parsing xml with regexp ain't pretty,
+        be my guest and fix it if you want.
+        """
         metadata = {}
         zipin = zipfile.ZipFile(self.filename)
         for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                metadata[item] = 'harmful content'
+            if item.startswith('docProps/') and item.endswith('.xml'):
+                content = zipin.read(item).decode('utf-8')
+                for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
+                    metadata[key] = value
+                if not metadata:  # better safe than sorry
+                    metadata[item] = 'harmful content'
         zipin.close()
         return metadata
 
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 02579b0..717de3f 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase):
     def test_docx(self):
         p = office.OfficeParser('./tests/data/dirty.docx')
         meta = p.get_meta()
-        print(meta)
+        self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
+        self.assertEqual(meta['dc:creator'], 'julien voisin')
+        self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
 
 
 class TestCleaning(unittest.TestCase):