From e342671eadd3f5ff922fe62cae81792d4cd65e83 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 30 Sep 2018 19:52:35 +0200 Subject: [PATCH] Remove dangling references in MS Office's [Content_types].xml --- libmat2/office.py | 34 ++++++++++++++++++++++++ tests/data/malformed_content_types.docx | Bin 0 -> 4131 bytes tests/test_corrupted_files.py | 8 ++++++ 3 files changed, 42 insertions(+) create mode 100644 tests/data/malformed_content_types.docx diff --git a/libmat2/office.py b/libmat2/office.py index bad352b..b220092 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser): return True + def __remove_content_type_members(self, full_path: str) -> bool: + """ The method will remove the dangling references + form the [Content_Types].xml file, since MS office doesn't like them + """ + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError: # pragma: no cover + return False + + if len(namespace.items()) != 1: + return False # there should be only one namespace for Types + + removed_fnames = set() + with zipfile.ZipFile(self.filename) as zin: + for fname in [item.filename for item in zin.infolist()]: + if any(map(lambda r: r.search(fname), self.files_to_omit)): + removed_fnames.add(fname) + + root = tree.getroot() + for item in root.findall('{%s}Override' % namespace['']): + name = item.attrib['PartName'][1:] # remove the leading '/' + if name in removed_fnames: + root.remove(item) + + tree.write(full_path, xml_declaration=True) + + return True + def _specific_cleanup(self, full_path: str) -> bool: if os.stat(full_path).st_size == 0: # Don't process empty files return True + if full_path.endswith('/[Content_Types].xml'): + # this file contains references to files that we might + # remove, and MS Office doesn't like dangling references + if self.__remove_content_type_members(full_path) is False: + return False + if full_path.endswith('/word/document.xml'): # this file contains the revisions if self.__remove_revisions(full_path) is False: diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx new file mode 100644 index 0000000000000000000000000000000000000000..43ac7437618f8f49e52c2006526efa087cb0c011 GIT binary patch literal 4131 zcmaJ^2RxhW7fytltx>d+wnWWVC0tz8C>_>CsMW?EHDlMFwMo?q(ps@wn^aL-)vDMu zl%hJ+C~DUFC%vO?fB)N)-RCb;USB8hfm->6S6Q)(;--$zFgd}D%SAA zlc~^bGZvKxk?nshJ{A*kDSQOIqp95?W6oVw5brWsy_Fg$nw2RsGHTa}d`|InS9$Mu z2_N=Ko>0C!#`%V2Rg%JN8s3%zra9h^U3TomFbrNsHv7IfNe05oO);fDg;MX;TapIV z&oBac$Qz!Awqj0xJ@p+91IDv4z1eL<$a*G1jhg0DKkz(}RIe-r;1`H6k-lN+WPxyX za(086IXjb?m>y4pMT9B*%Rw~n`ndXSE5QWyRgfkXed#DNs>}WN${j9j^5$Cz^46&b zQ&ZMcGwEHSum{z!6HJM#rF>Nh@f2xm<2BWMWxYf03elm1wnSfQaVZ6EHJA3W0qRH_ zE;gx_yoc_QGs%Ow(&eHUq6d+4?b}Djnjh0*-)-Nnj(dR~ikXIRZeU^em(OF=>=)#B(5*3qB6HIh(CxLpnjMEFeqEIk=p8RHik=SbW6fOPyf+nA4H zYvJT-MF#i2wvlEloYj9zU&#M$oruWm^7z6T6_%4+;J0?Z93xNd$6Kf0PM{pJ)}|+9 zx6(3pN9Gr~dQ2H#Mypy0M47V&nX{dG(%J7L&d4uN9(zHfM^zy_)^tFq2`j2@U^U&Y z14f@T1v%vknMq$oeiUnYU3p2_?tB>{9(8CZs=J?|EuUl)mm!zBX?Vs;%Q+}w6KQ?W_Z%AuY$9TBcu?Hb9 zP<0VYm^o>2CVol&Go%IbbaJ(X{izA0eQ*~=5@qr-{izS}>U6Derm(FH?pIU!x=iu; zj@O(oZU98mt6L;!y99iELD9DbW?SFy`F|0CnV9Ux8ue47?gt5}1mm=-2lJVo1)dG@ z7RRVGc4%lD(w2Axw?j@+;;%Wjdm}Dwit>!&q7uiy8EXsqx25aw6S*?;=I`xl9%BXZ zP3ThWm39YVevfD>M;Y0NL5c{nU~v3&u;bZq<)lUhmX*T9PN1By{BD6C>NsZY^~^*| z(MIo!y)yYL*{Sb8p2|os2vZ?Y&#-%v;n;z}o7+jnp8O>oWWIif<0ue$I_gdnaE692 z`lsvotH-iC(b5f+5Dx=pE+QkslE}^%MRC2OH()<^yU0EmY5>cBtkkG9n9Z4KjMgBq zG*BQb{n9J0-3DSR)w=@DHI+&Tt-e6DiyPdkj9Fi_-JP$1U0_O&Pflny^#}4U^s^h| z2c4TlRk^$hZF!qn5o4;I1LYzhwNaK@ZuO2_aYISL*PFB;)X;lU!S9mxL=uTHb)9?Rm80y3vkIY@(Bv_9AV z)`1%yr~IZ$C;ouPI9irM-GA6{w}O`L(h_U9M_adM&C7R!t@EOh?%x>Z*PO-=fMj0y zhbXNRN!w(a@>e>>@;5K;kW7n?nYqJvAuB;BXtcqZ>c7(Ucz(uHhi2!qvCM}0WW5I> zt%nRM$KuC$*0*XjE@kUq3sRYE+FZ6K5NVw?|1H-;0V<^| za4~=J9P_PGea{}8gUoey+w__<7lRV8XRY;}5cwAm12%<@Q+n#2TepN?Y4?B_}zpK>r9UtygdZvHTW6Q6|loeSHCQT}cZ=D<=Gg?~SYlDgm zZc7J|<}3+ZU6Kp@b(8qM>D;W`-EAGM-N@>5JywmS70z&VD~9Hjj_*YIB`w=Gu@!yMJ|-JNp3%&khb}!uGd8sOmhb zGrU}D}|s^E`eLn z^LY5I&G-0SbA-mTWR}^8^%*RsapVcEf+1P5D8!DzV-$$MiNX+`UP6zYiSpaIkx$(= zqct9cD27!9)@T(nzp)$4n|N2Ja`zTpY(@!-rwlCw=4=Uu@mB35eyz4{Aw{n^a%+DA=y)@75CK1iC;)Jp}LA_X`8<&0kxfCOpa4mdukaJozzy}v3Fw!`$fX+ zmrLreizQ+xt&xjK@aF3de&GQ?(|(K7k&5&E-~2t{mSy5^FW0v13a=S2=rhPpnSWaK z_46KfolFJrLBAD?r)vl1VH9U{A){7lLnl`Mkmsymd!P9!phkV`nZ#YePLvpH3#vip zYyn0b-+O2z+<>t=!S|~B*u=R&jlD}lnk6Emhk3?61N^9W9q}Ho0#Y~iUmYydvh}=8 zGp-MsdNO9f=GBB7W!Z<9$h96E+}AiDQI(p;=GsHL08&4^NlT_KkU}2xU*j4NiN56E z3c?4I&;Mt5qXTe}KLG#%rvX7JH^b))hJit{t0CXde_fMr!V35?m>f0exrP z$e8kT;!*cW_U}L9KB=Cc-TcqlN39XLd+h%hk)P9#$}71;{fG*hztaEJuYS%yDqa7+{Rz6i@{daL&*?|$L)Nz+ cvGz~;KWc|iqau|A0ML>irld70IZnR%A1i&pumAu6 literal 0 HcmV?d00001 diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 30039e6..5af0e81 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -80,6 +80,14 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase): os.remove('./tests/data/clean.py') +class TestCorruptedContentTypesOffice(unittest.TestCase): + def test_office(self): + shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx') + p = office.MSOfficeParser('./tests/data/clean.docx') + self.assertIsNotNone(p) + self.assertFalse(p.remove_all()) + os.remove('./tests/data/clean.docx') + class TestCorruptedFiles(unittest.TestCase): def test_pdf(self): shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')