From c67bbafb2c60782096af4f6225d94e18225d2ecf Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 1 Oct 2018 22:26:35 +0200 Subject: [PATCH] Use [Content_Types].xml to improve MS Office coverage --- libmat2/archive.py | 4 +- libmat2/office.py | 100 +++++++++++++++++------ tests/data/broken_xml_content_types.docx | Bin 0 -> 4145 bytes tests/data/malformed_content_types.docx | Bin 4131 -> 4135 bytes tests/data/no_content_types.docx | Bin 0 -> 3651 bytes tests/test_corrupted_files.py | 16 +++- 6 files changed, 91 insertions(+), 29 deletions(-) create mode 100644 tests/data/broken_xml_content_types.docx create mode 100644 tests/data/no_content_types.docx diff --git a/libmat2/archive.py b/libmat2/archive.py index d812531..b29d690 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py @@ -17,7 +17,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): """ Office files (.docx, .odt, …) are zipped files. """ # Those are the files that have a format that _isn't_ # supported by MAT2, but that we want to keep anyway. - files_to_keep = set() # type: Set[str] + files_to_keep = set() # type: Set[Pattern] # Those are the files that we _do not_ want to keep, # no matter if they are supported or not. 
@@ -89,7 +89,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): abort = True continue - if item.filename in self.files_to_keep: + if any(map(lambda r: r.search(item.filename), self.files_to_keep)): # those files aren't supported, but we want to add them anyway pass elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): diff --git a/libmat2/office.py b/libmat2/office.py index 91bf2a6..3abf108 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -50,24 +50,74 @@ class MSOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.openxmlformats-officedocument.presentationml.presentation' } - files_to_keep = { - '[Content_Types].xml', - '_rels/.rels', - 'word/_rels/document.xml.rels', - 'word/document.xml', - 'word/fontTable.xml', - 'word/settings.xml', - 'word/styles.xml', - 'docProps/app.xml', - 'docProps/core.xml', + content_types_to_keep = { + 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', # /word/endnotes.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml', # /word/footnotes.xml + 'application/vnd.openxmlformats-officedocument.extended-properties+xml', # /docProps/app.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', # /word/document.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml', # /word/fontTable.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml + 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml + + # Do we want to keep the following ones? 
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', + + # See https://0xacab.org/jvoisin/mat2/issues/71 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml + } + files_to_keep = set(map(re.compile, { # type: ignore + r'^\[Content_Types\]\.xml$', + r'^_rels/\.rels$', + r'^word/_rels/document\.xml\.rels$', + r'^word/_rels/footer[0-9]*\.xml\.rels$', + r'^word/_rels/header[0-9]*\.xml\.rels$', # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx - 'word/stylesWithEffects.xml', - } - files_to_omit = set(map(re.compile, { # type: ignore - 'word/webSettings.xml', - 'word/theme', + r'^word/stylesWithEffects\.xml$', })) + files_to_omit = set(map(re.compile, { # type: ignore + r'^customXml/', + r'webSettings\.xml$', + r'^docProps/custom\.xml$', + r'^word/printerSettings/', + r'^word/theme', + + # we have a whitelist in self.files_to_keep, + # so we can trash everything else + r'^word/_rels/', + })) + + def __init__(self, filename): + super().__init__(filename) + if self.__fill_files_to_keep_via_content_types() is False: + raise ValueError + + def __fill_files_to_keep_via_content_types(self) -> bool: + """ There is a super-handy `[Content_Types].xml` file + in MS Office archives, describing what each other file contains. + The self.content_types_to_keep member contains a type whitelist, + so we're using it to fill the self.files_to_keep one. 
+ """ + with zipfile.ZipFile(self.filename) as zin: + if '[Content_Types].xml' not in zin.namelist(): + return False + xml_data = zin.read('[Content_Types].xml') + + self.content_types = dict() # type: Dict[str, str] + try: + tree = ET.fromstring(xml_data) + except ET.ParseError: + return False + for c in tree: + if 'PartName' not in c.attrib or 'ContentType' not in c.attrib: + continue + elif c.attrib['ContentType'] in self.content_types_to_keep: + fname = c.attrib['PartName'][1:] # remove leading `/` + re_fname = re.compile('^' + re.escape(fname) + '$') + self.files_to_keep.add(re_fname) # type: ignore + return True @staticmethod def __remove_rsid(full_path: str) -> bool: @@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.oasis.opendocument.formula', 'application/vnd.oasis.opendocument.image', } - files_to_keep = { - 'META-INF/manifest.xml', - 'content.xml', - 'manifest.rdf', - 'mimetype', - 'settings.xml', - 'styles.xml', - } + files_to_keep = set(map(re.compile, { # type: ignore + r'^META-INF/manifest\.xml$', + r'^content\.xml$', + r'^manifest\.rdf$', + r'^mimetype$', + r'^settings\.xml$', + r'^styles\.xml$', + })) files_to_omit = set(map(re.compile, { # type: ignore r'^meta\.xml$', - '^Configurations2/', - '^Thumbnails/', + r'^Configurations2/', + r'^Thumbnails/', })) @staticmethod diff --git a/tests/data/broken_xml_content_types.docx b/tests/data/broken_xml_content_types.docx new file mode 100644 index 0000000000000000000000000000000000000000..41e0e49e9fbb843ce24f4b4538adf6e8991b3e3c GIT binary patch literal 4145 zcmaJ^2|UyP8{f=LIT~rJ961Xc;fF$wa%Dz@oaM+hckZ(!$HFkI+_&5;u?|-v$6O)g z)*(m8eg3z8M}EEjzn;Cm`|i8#*>|7!^ZvY__k+{~QG!7L04<>OsiM}PvX`hK5CGt% z0sxKy003iUXD1J9Cl7Ny9~WzPQ&De}<5QjIh$c8V^{s)5na)&WJu}||;Jk;0>YG<~ zBD--5Ve8*&`R$myud_*LPR)GqGb=rAT(WZ1KXyQ3CNBl!FV^+-bLzd=Gd4=f17d{8 z-ul4|Bf%2TJvE1yHY^Hzk+N^b-_w6taP(BIm1Mapd~1qJPTd+>xts@)rWdJw)=*tB zaowOA#rI(H>G;xT+KqEeiOv!l>)WTnQ5*urxzM%e*S1ryTcw%yYa50pQLOC9E7@mq 
zcgXN<3GgkTH*aQhR3?pEmAMAbXnvYL)7PqK&U_SqRa@zxxW9!>`2%ZpjnE}9aXKT0 zHIQL?iM8hvhH#Z@&r8=yE`<+A4a@N%jC##ABbN=25>09rbM98igmQJ#&w^fu=bs5& zw{;WGj2Pfy@zZyC-%P9RXsIF*|EV0i*ZLZv^g&44O+aqlIv7`+m}jXGS^TC>!XcL; zwDJ-Ed7w5Yl3WM?EiEm;5iPGp!v3FNAiXkovvzcciISg7;xCwo!E{R~7(RFq=V6FWs$cn<+? z?>!<1Ou5 zGNl=bOV=b+yA(@(mW`whtDj*7bB(Wf9oR`Y`?tSnaqKahit5a28iKB-BQO*Ma6XP<{+ zWN5G5kROetj3SSwYx7_aO}H%=n@mIQBaiUO#NHg)a&hdC=g`oWEuRfqWbJvX&eE6u zvAfE$w~Mn}R9qmd5{w};ao&FQ2VwSQuQoKDJB4O0HbBcWlW6bunGbm<1Xv2wCbol? zo-_Mq^L~m7$8Lo$`JYxSio@#5q!*({IG=rsvNs+DXNREz8wCV&&_+_VKc_@b zZ{DbmeU0vmnt*by;b3f!-U{jHTaSMSt}{B%l&9cTPwl!zR0@ z7|7UOG<%IWP1@Z1q~pJ_WjTzkrL&tg8Qh1urrM2gaKMIvaKJc0OzeF*reIQ?^%xgq z+}`g*|5JzI#)^xVy1=E{I4Qs)OTj1bZY{#FpcKb>)2?&-) zpGCB5D27E_^a$7C#I+2qCz|yj=wlWj=N#divdYGvB^usWim2J2DMO}x(7U4hIa8=S zZ$64WSk=}40iSeRVRnreJI6tBjT(OkWn-JfunIhg{c>x_+r(rj5&?yCdJI+3)bg@R0E=M#<$e_#7hIOrXa=>>4MZLPy$dlUaC z+aD|t4}Znf=Z*akznX~d+Ys7cdnGnM)i!g)k6Qkd1&(ES3+g!&6WYS3-%NyEq+u&f zDzQ?N%oi@a!w|Ik5U}sEdfLD(Ky@Wy=RnO;9Ab`s2gPHL%wPV#v63SqTc79fd}ovL z69Gd#VJTojXG&l?*lpQ`k_%K_#Crc0sW>CQCI3lMfxMjEtYCj?0%;%IL6IbyT&6$u z0i#9V2xq>(k;c<8mZ#5*$#YV1IkyH7ORa8@qI)ao>j#RwAvo3eaW~+b7|h&!C)%`| z2K6vlSUn_Dr@A+f#YOO0A762l`l}X%t_fX|nPQ@g~)msDr`R2OW&G5Z0b>bRa-UW7i^Hfep)x-(Y1oXXKh;%3%jtrs-9~mK~eThhBB1gFPZMaeY+)1hH(5{ zPPWo39VZsqKhpagXsmow@&whux4jbFPs}0TNWVO%IniGAhyR1b3$^8voR&Lb-L(;& z;l>2XlRIEKDJQ*X-A8bzTiDhk$8D0wPy~#qM!&+oHQqCYy zAlgHivy3rt&xlG?H&3Y*9de()Q6<5{0qSjzrrSI^$1n^Tx{@B&$1TreMh~$RbxN#Q zxxfAbb1&B6XdQ^P)ITjKLU|!d>V<;iqSUo|rszq%_D8q6M+4A`{rIvE*H<*ctUI%m zC)jhSGfv0=S(2$7Wg!hZUu(~|WWF1w+}5C%jNvtll%voJcxSRxK}RpL01oqPYSXTH zGcD9OD<1B#%{aT_JiHGi^CIA$>go__n@mvtPRCgP=7kN(!sy++<@iI$N|1^q_rm<* zJAJ#?S3FH!|gzjN;LygEKoedVpMA2)t>2i@iMzzYR$>BLvP+=H7cBXfL;C% z63XAu(1-;zDISr3XSxKz#S^ORb-+w*A>2NRbc3y%d18?Cm-{Yiu3$P_?mUJX$y4q9 zGN~LI6iG_o(8}!4ldm6pUBYD;)1n0sOx1Chx%J0Dx969|kKocmh$^FIpX4Y(eo+Ey z4{hqMK2v?R;o0lp>QI)if~$UXF^Z;XSG&ldswK9a-Iy`&ZK=gHf9CPj^njv5tMA51 zOL8ZewW*+f1bJe5q^yGX8WkDbhL$1PQzUTpNuKccP2$IJuyeZYPFA1G(VDD{ 
za7N{g6rO20-tvh)?uBRu01a(P2$9#5UkWTOce7cyH~z}bA~wtC76$XS$!)TpYwaT1 z?VOn`eFSyLt+S-BoIvYUol64a+uW*- zC2=2L)XBMi4S(+QfYg0VMoB#9C#yVu%UR63CSu6m7RapW%vT|k=(bwcE@nB$4~xe~ z4zTM#8et@*j$ST_9$lp4zt_n0H-X$ed>pNRtgurbr9A>?-hXAzvgN0)7@lbltl+{s z6+9N)4K8w57d|=iT_#~=?)G!Ka{(-6z3aIV>EBJ${L=c?+GhjK#9zSk;@jTukHzv8 z)QE;Y5%+Wt5R1DPOU4{!OCvt_BARv+^8yKQh4RF9$+&go2$|-j?S~X1nI%@8ecSedXt0Z``mAjON4l>PQcpS6}eV z->m6w(-G$cx^jzq3fb5O_>{y6-5|a3qzk-2%N={+4DiHb+x&PTL6RgX#Mz~p-0asV zpJW$EFL9WRbAJJ%Vl!;$UD`6nx0%;~a25CXRG4DwZ^q2vqt%i6F`1ALSJNwf3TU=? zFP+!%{`!;yEx;|d0MQ=&2%pPeN{&+Ma)-tCyWt!OGyQY3VZE-cyDg)Q-WM1R3xtzp zmm)G0bhCv^Jhe-4SXjoYBi*#)5<0)n@g2$c=8nSuUWe%|J!As;D_xB$n_DNKwp|w?cL?1}% zhu7%H)CE!~fc|TE<0a9T99+Tp5c2u|jB@kyDN8io!9GChrGBEJ5GB5};Ffc?r=jWBA=9R>UR2HNb z$Ldw&=4=hz?RVHfWbVmum$<-3hYkp+zGWzWd7!VS_;S<{{-3)R{;gi;DaH9K!1cCf z=eeEFrxtCyn0u?yJ=OQ9+$T=&^vg?XZd)CgV$dua-THv5{#@WX2ktX#nVmh7_lh;z z{heg3Xn$1b$K#YwN0yp0`kZjOlCak!QG46BWfCSvYnN=#m_5OBHTP{3uR|Yh#QrGm z`8`=&`}!=M>U#dQ93kxCr+WY0)%@?bJ!jUPsq0Nz!cCW)uWjAobhM>-?Iin2dyK!% zm$-Ss_J3-%Kg%?|7XIS;g`Um(g?nr!P54+}{k8PSLX} pj8$&35Wg1V+{w25c8psm*YMjj3QgVvWPh9dn_r1-BQG#g830g$*INJp delta 502 zcmV@6axSN2mtJyK25JStmH}o000XG000vJ003JNf4bqq3%AclnH^i+NJM?i^rBn8AvzLvRIu=P1{p1?JX)A*8Whi$I@# z#hs8JVg>wuHhbXGngAwbA!@K*q_7p+cvB1-nO4$Uv`xsQ)rD;uC1H5%FZ?;i{mGM+ z8%>tIzlT-?s)K9gM*r@Vf2UhEPj1Q=8p5j@(#O6+T%lsZxmL+yo^ZVp?ztf9`FC!!Au{u&c`NhorX&LhY#ZYr&Z zvn(U=N}i|J{j|OwCLYHj&XwnmE>v3rqB1LzM+kx5j^x>+G0B7>8cRary@kUmMtI^F zBcg>ehokH%6DPs2YvIU0>9H|Dl9W7p;yg*d08mSl6bcx#nE@mW4eXpgO|Lerx8h>e1~AcIvs6 zyRDN4Op10aj@8ozsB?sW{my%KVa(}RL_2X!=!}cdDljy8ZlQ+=JYiqFGp^27hrE~G zR`2_5@n9c{NJ&SeL`A8$6xyr1!F%iA4nCtQFahg+Se=RLg8`jni=&0(p_x}f+A$HW z7v>6NrQPx$L2u|Awkcl|s?5W>ja98D1<9o2r3Z%`+RdLbJRxgspOp9Gs}K$qyo@0LAd&geZ zC7`pu&8gS&T~t?Q3kkAs83vXu|c}KxO%{@y1GjF zpqx^%R9d9D!oO1PiL8w1UAC2o)msASGqINpnMc0zJXx_Rn4GXS0Wmk*A2U9Fb9^fG zRVXZ`3bvOkZmC4HG8W5_vOH2#C0g3m@2P$-w9lU850q0>7t(cW9qt82*a`9~Hsw6@ zjF?L3%T_9r!I8X3q)j`K8<^v(Pg4yRKM#)IKC5)|ai*(=D|khgBV;<($G>qW+@bXO 
zx~@x?#I$k~q%0$m<#xXn$tNCdBgvA`30Qo}<(DP=F)9ML8L=2}SiL9~XRL^Sj2RWE z{uUqab<5t=71>!J09UX23jo~V=@xKTM4_V~t45k*but2xkg_Mi z?`FqP@CgnPEhO=su+E~{Y@?R{s(Pn|c#f;)9+Mmu#eB|79L;qlSxTs!=R1rI$I}w$ z^D~fLg$L3D{2W5zfB|~Wdit4;)z^+&8-#~h0X$-H@aLTUKDf7WD+!qXbqUJqbLqKv z9n*XKnN>epCv%T%LcFG9Lfb@)TZu4bAa6-x@pUb^JjsGv>_IE{=_sx%hfUnkn#=Lz z9W5IfuoY$iLcr~rJNIpUIs3R$W3J;<@&@C_IxO>~jhF@Nl{(Iy?Y3FULmcJeT)PR=T=|j@iiIYp^V75ku6I@Npwr9kr+u3pEGlttk28?5FIsU*e>Y^ z#+|&|P=>nV7zQaKs(``R$q?s5;ac&})j^B-aqaY~QflP90Ms7b@|&sArlQrZ7u%(3 zr!$j2eA=Iunir-`1WxgJ)9BcQ!5dqtgWdN>I%xI!gO1%qjKVpdeKk2yCss9_ z*^W_q#t88;RBS?QqcxgdWVVOTlF;YcAfqGb zlW#B5$b%f#Y40tQ2IvlfYK8RLvgH))0ha(^L{qEiAw#dMOIwPahYjN&Ud$h^?%0f+ z--dm!?7e6a6y?Bas>ld^#(4+s*CS^^N*3*Qwo_Ps&A-6+fz5YEXZh&D($_7O1YJU+Z1sw!m32DXuMPAFdvTdT+3dEmIW0RSoqNDGQQJ0B%qujaHM zf3cA>ddjHt;kBMIG)8@pQ2O@LvQD^dSLWGCzHH`nenoojBqk>%aFfB8+7oRV10#%E zI&5+=!j|_`81&Es7UXhPw&U|W;a)8r`ZX`#OEk~OM0jp-%q+W%Q0QrOfxe@;LZVKS zNyb0v80eq6xZ&dLX>|3P(+?plMyexP;9T|J*gCzx5P+eXxxAovp?>Kf=*@5Tn^p{C zhlN+xYY@kv?HTFgbE%|cl39sbDz6H&eA-b(Jg=dky%j{sXc@ikz9Y`0nGSv|_IR4x zti;5-Gs8*w9G`t^&B3Fjmv3-7<<7nI-2r#w%U%M3(&!fTJ*op&7QxAJb(IbVJe(aN zP~QaB;r5MOX)yj7#Z}9Vht&?6%U&aQsB=&;^?(jT;+e0QvwV<)FCKZHPEI#x!H6L^ zt76YVjmH5u=ayvlB&UTCHO4GIsxSilqr|lC-)Oi{ty#V9)#vEuSemDiT=h^n3fQu( zUu06*_Mnr`oFn&DiS>KYjQy#2bWy?eZ{~@MD*RmfOc4J%RU-bLlA6yd6OG)aHWJGr zD!ImhKauW;n;$^%SA^D#B%x{m}Vnb~+$SaR^b^OH`gm zLFHOY(QE*im( ziNt@qlIEY*zuGy2J`#6|AWZ0ZLm7V{l3ybgS}5b?ftFUd!K#R4(kh1AunT+nEzfiY zXgEW$AbhEvY`5ur3V)8o5*U)MfjJ?&glC(?OSGS(dF^_69O-*&#y6oDIlReZ$^hW0#pEq*P%Sl+^@>Ar?T8~ z2w}-9V}7c0FV-*H1xAYy`V16Ctg23V zvp^r5IzrF?$SyBVLYyi|YO(fd7FP##%BJ|l@WqZ(u^#mRCSKE~zQs**LWfliAi4aG zhz4g&Yd9eQQ2l#B>Vp7zZZV%Xl zLHA^*y6M5$neaZh_U*Q@W}j0WrumXdN{f*hZrm%&#E#E#x0%V7N(yTmWT85|YWL@$ z@;Ju=)C3y=vEz}LH4a;yhMX~lLULDprey)m@A5`&BsKdFzaOT2FOuRrRa}2gHV(&l z(|j%--gwR_AUu%Xy2s`~#F?3%EwnfMda2yRsoEB@)UxHQ35Uw~wa-ib0low7V@Y(P z(5=UEsfIy0xHD76kRjWY{=G{V)C9_TKj1$H*66KQ%ioe{N6GRup`Ixp%EQSKx^}Ke zJ;OQd6@A^oJbE|?v3zgT)`C&$vSR}A&O4eRwfFlo)?HD!U7w{s%JvOajcaM+aB 
zry*kq^c`NT+DxI`M^LDx3S_an}(Ysv>O&5NKGxbORMgL2*1O3D<7?S{0w>E-|$~r$?y2xs`BgS<5>R2?^es-;kz?Slb)a9P0jTW9)1cD UQkRLD_5cfYBU5uE!$G_HAHZGiNB{r; literal 0 HcmV?d00001 diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 4ac2678..8d7c252 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -86,14 +86,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase): os.remove('./tests/data/clean.py') -class TestCorruptedContentTypesOffice(unittest.TestCase): - def test_office(self): +class TestWrongContentTypesFileOffice(unittest.TestCase): + def test_office_incomplete(self): shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx') p = office.MSOfficeParser('./tests/data/clean.docx') self.assertIsNotNone(p) self.assertFalse(p.remove_all()) os.remove('./tests/data/clean.docx') + def test_office_broken(self): + shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx') + with self.assertRaises(ValueError): + office.MSOfficeParser('./tests/data/clean.docx') + os.remove('./tests/data/clean.docx') + + def test_office_absent(self): + shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx') + with self.assertRaises(ValueError): + office.MSOfficeParser('./tests/data/clean.docx') + os.remove('./tests/data/clean.docx') + class TestCorruptedFiles(unittest.TestCase): def test_pdf(self): shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')