From 174d4a0ac09c2e9d4a9aa3677a442c05459b8309 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Thu, 20 Sep 2018 22:37:53 +0200 Subject: [PATCH] Implement rsid stripping for office files MS Office XML rsid is a "unique identifier used to track the editing session when the physical character representing this section mark was last formatted." See the following links for details: - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/. --- libmat2/office.py | 61 ++++++++++++++++++-- tests/data/office_revision_session_ids.docx | Bin 0 -> 12163 bytes tests/test_deep_cleaning.py | 31 ++++++++++ 3 files changed, 87 insertions(+), 5 deletions(-) create mode 100644 tests/data/office_revision_session_ids.docx diff --git a/libmat2/office.py b/libmat2/office.py index 5c2c996..07bbbb9 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET # type: ignore from .archive import ArchiveBasedAbstractParser +# pylint: disable=line-too-long + # Make pyflakes happy assert Set assert Pattern @@ -15,14 +17,12 @@ assert Pattern def _parse_xml(full_path: str): """ This function parses XML, with namespace support. """ - cpt = 0 namespace_map = dict() for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): # The ns[0-9]+ namespaces are reserved for interal usage, so # we have to use an other nomenclature. 
- if re.match('^ns[0-9]+$', key): - key = 'mat%d' % cpt - cpt += 1 + if re.match('^ns[0-9]+$', key, re.I): #pragma: no cover + key = 'mat' + key[2:] namespace_map[key] = value ET.register_namespace(key, value) @@ -59,11 +59,56 @@ class MSOfficeParser(ArchiveBasedAbstractParser): 'word/fontTable.xml', 'word/settings.xml', 'word/styles.xml', + + # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx + 'word/stylesWithEffects.xml', } files_to_omit = set(map(re.compile, { # type: ignore + 'word/webSettings.xml', + 'word/theme', '^docProps/', })) + @staticmethod + def __remove_rsid(full_path: str) -> bool: + """ The method will remove "revision session ID". We're matching on '}rsid' + instead of proper parsing, since rsid can have multiple forms, like + `rsidRDefault`, `rsidR`, `rsids`, … + + We're removing rsid tags in two passes, because we can't modify + the xml while we're iterating on it. + + For more details, see + - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx + - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/ + """ + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError: + return False + + # rsid, tags or attributes, are always under the `w` namespace + if 'w' not in namespace.keys(): + return True + + parent_map = {c:p for p in tree.iter() for c in p} + + elements_to_remove = list() + for item in tree.iterfind('.//', namespace): + if '}rsid' in item.tag.strip().lower(): # rsid as tag + elements_to_remove.append(item) + continue + for key in list(item.attrib.keys()): # rsid as attribute + if '}rsid' in key.lower(): + del item.attrib[key] + + for element in elements_to_remove: + parent_map[element].remove(element) + + tree.write(full_path, xml_declaration=True) + + return True + @staticmethod def __remove_revisions(full_path: str) -> bool: """ In this function, we're changing the XML document in several @@ -112,7 +157,13 @@
class MSOfficeParser(ArchiveBasedAbstractParser): if full_path.endswith('/word/document.xml'): # this file contains the revisions - return self.__remove_revisions(full_path) + if self.__remove_revisions(full_path) is False: + return False + + if full_path.endswith('.xml'): + if self.__remove_rsid(full_path) is False: + return False + return True def get_meta(self) -> Dict[str, str]: diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx new file mode 100644 index 0000000000000000000000000000000000000000..b40a3415ad56150781929feac25049fea41db8e7 GIT binary patch literal 12163 zcmeIYbyOVb@;^MdJ0!RVCrEG$?iMUqaCaHp-Q9u{+})kQU4pw?&>$f|;5XU3dv`Z? z?|I+^f_55KB=}o}2bjma&ADyMryn;IeB!zY1 z7=v-u=fS&VG2FmmQFD}hE0iJe-qDC2;3v5>*C$61Jb@caYJB z_I8Dca8kNUwIXU2e88nz$nkDD-8M&t`W*YO|%zdP(+64PeEMbcIR=ry# zCEHC>+Lan4xpC!aKKHb6D$%nLD)JZ1i*s6wHLTMO6`BDz)OZ1B7h?RK(xd#w#j;c9 z0CbvsW!cUrfBW}`&8laj>Y#_wu6HM4F8;f0_{x_$AcU}I=1B2YEb^&#s15B0Q%5VK zXpI>o*>)$%hu-Jh(>4F%QbT-hH~|_nHPMw&_UL+#i8Hj)(`2Ff?QzZ(;2i=0@caw~ zQ2blu635{;U4h+E9vqlR;KQTu{gImpeQ1NLv+GboQ?gMiu7M1JY-Ci}+xx^fl&k7Sr`4 zHY{DOn9ql! 
zmDGSklm+b#1HKjWuc4^2p8b|MsNPhY zc-uz`iJsSMi{F+AVL#Szr#^H@mxmjF5nj+HZ>q&2Ru_|x>Vk?6*%P%KZL{~D9LVnl zNY`{FqtF;UXzHe-UF9xH!j9wHo0N7Zu)<4NI6qCWAzZA78H!^Z>3CnWhgWTms4#NP zdhcuG_RekVn8ex;R*RO+#>`p=kqCVe!8gh}gPD;!Q$*A=TmQhpW8?eRH(v=dPVPP# zbT|peC&RcoEITXtGHUq443iwn1NdfST-Vm5>Iy*yhu^ghA3e^yD6@6PS$Q^sQE>T( z@7~|N1UuS)`e_K6H=`5S3wOYNit$%JHMDgw{-2}%8Jv+ya<(9_qbe->6g$`^K;oBM zMBI|ei=?p?%C!ac35q9(OXXagr!*=SH}=}pY+Tj-$N;*>idDlIij}ZpOF+Zqs1FBe znjU(wAW_p)N7KniCrJzDbx5VktDA4mCE`b7=|NTHTY-SV91};-hNB#$B~g4L zZg#5pQkiJu_?9D$KonQOA)@3>nxg1!P3GB5avmv%d8y+}T6~BJW^7U+?uZ?lftQs~ z6sOrRC=UIJAfkqiBet^!4pYX(8Z= z>x1H#{ii~Dh{wxCr@N@xDC85d9|T|ZKbIi=5RdXZ&FO}+m72kK6&DCrn2f+1fuqa{ z+!dhQfeA~PrXkxc($+>NlVM)HLSvkG|^3o5D>F_SF_B0Dute zt1h+@#_ zqq13GGH?LxsRO4DaU>V+wVX*>=j3OpHqEX%)oV!Ns5<`wdk}f`$lkyUe|{@lh1VK^ zl-955O@*po#8%Ypn8WrE9Ez~l?P;^fEKLSJkXf=p!lugCaHreT`y08hwTIgErov z6oWyGF@e(k4bcIt)|3TPIJWM9<9`Ma1S%6i&{*(Nv6NH%@?Pv6Z~{kH_`& z_->w3OikZyM~Cg?Ri;kw)FgYz6Vet32w@t6l(ccFvyAB8soD$~iw7lUs255Lrlca0 z$~~^?{!$=)w$Rd4j_&3L+;hKDiiUi@gaxyl7LYnEu2kk}z_75?_*0HYXk`f&?>l%G z;S;kNP0||9M0|etTHMn)xN0I$jWV}_zk6iq))Gsw~<|8Tqy9QPAMaJ&^kL0N}po5LUph{tMrKg3DW;ZEM^Md;wnP;e1F>3bTw`F}C2$08VTw(WJvMY2qwzzq z#r*sD-iy5|edzFeW$c0Jo^yMm`D#syt$vKb^lYh~wWAXWlV{~gOgU`Lg^N%X!yzN4 zh^wmG=~lF(4l3#)?>-bmBRbV0h1U*s*X_f_H7Lz7&F;iEeYmyed1GjkSdus3)eHy5I&*$`v+f2NtvHH<&{VDh@wk^O zYDlN6Bej^bo?eSuSvwEyRb+|0a;0GuR-#$0vT8w6h zaHi`c8e1+^eC=>7e2utrno4;qN>Dh}JVzSVtFVX@-)(Jcl9?RLweHZfR`W4sPeY(^ zx~v^eHHOwC-X*7Eorgr}%W1NU)uR-~MOrJ%*UU5b;Y1`HBt1xvsgbjm8rt;7B`dCN zfmht^Y<)uEyPPU}GFdy#0kG~Qmua`2z7T6HmKdb8+s*NIPT7T;e9}OlS!&l=vGNah zD!4jHLTNlr+wUH+%p|a@+>oHj$0Pw8K_|y(yaj&skW%a_5s#R7H*%J#C=L2kWRjAD zwPQ%OIaV_6TQ-s_T-H;2g4ItOSJSJf2(c3^X2?f&+w>KLW^s?jPr7aNMueMweSvX4 z0#1o(L&lMcq6QN0c)vR4(T+~fLiw;=*M;mu38!#recU<{#y=;v8?j^S!FR#7iW=v9 zBzTaRj_GhB#3NC^M1CcEQjGFK+V|-$PK-!dO?pF zN-VE!G)uJ=G5@;9m)=AplW}99?!qR)`045T_)%)~isgKA)ELj8`bk}N-W$O)cr=2> zjphxWxt*91Qic&h7ga2Jt<;{RAedP8z4>;- zl~^?^_>;o+JUG6Revg|=fs`&s_T#r7 
z-NoJ#byG7G?%-L3Fp1{ev%hN#gjTMrigFPXg<*Tb(I+rK)-eZ~yUfwO^lWlTDk|z_ z3a^s7Gjq9`=0M|k{MvXS#Pb%I`dKQZjrE?9YK`?MIpK`q=z^W# zn17ubU3+bNMlyH7wgKH;@472rdoM=i68$6#H@NwJEhK;5NilMC$|NLhZ7bO*k=HXH zr6_*&dja>`tF62yQi-)TxX$(c%yB4d@h}Qp;v9GM$f+&M=jxYT3B1ujtLWNHwNoi8 zZM!e{CU*3^C|nNefj&WFpPz$A&%u%TpY^mnMN|+k8~|X=0|21?5xb5+H!I_RMPmAf zV>%c5;D+t~6I}4O*_zCO1=%(Ew8n+8`i^?}{Vd~HLpcb0QckJod#ws)GCe)csiKc0 z%Q0$g_mwA-RU7Mrao?0pT4si4vOBqM`BR~JREAGyx>}xGuT#$V#3?_80?AXJ0>*wc z**-n)+^lKX5R<2R=#dUjZtES{MoCRlCGahdsP8fX>9Aazr!^RmwN085)Q#RcAHB=j zQNe9ci(^GgSh~C{(fv3~qjqhQ2l-tGu1%c^c13{|&rd9KS1W1X!_o!Y(uD`z%m_?fZme0>@rZk03^zr!jw54a= z+3M{I6Sb!h)I`YIFx8cnH#1aTOiE;aTy5jUa?6=zN2bLM#aX-uX6=rIP%G{3^S$5R z?5*(nvY9~2iDP-LQv4Y9YQL*{Ujc*h$C8!L0qAyfxI>b%>A{1QyH!KgqxO1K%d)!d zTFkkgr)hxdaR14+f9*Q#$}Z@Glo+4~5WR?tAO*o~z#Ui1&7~#S*&K1s%Ufw+3WN}X z0+gi%jV_0ZR0{5wYeV)bw|p-TAYs+UbFIFPgWCuHgAzeE-=3AFn-!2mata}RX;nNXh9qC9f+BZL0>t+8V(M5Ql1{6|z zQbHV5XR{(=j~$!YV%MuKeurZN*Vx)5(1ab@!v=zIl}muon=kL?AnHetzj!u>C2UdY zpmO>R^I9>y==FeT6<{NO%}J*rg;qN4C)0V%HKyy7Uv51;l)L^SD*p&f#DR&#LoneB zCVam&n)Nj`iBV)^GD7-+R3~$}qhZYrbm_6ARyvh^3!w@sY1r43hWf0jW-3gPEIU>{ z7_=-`pj4F66ivPyO&SI|tniX6W}eP^HB$M6ey$k8DUWv#_U5oVtU4_neBf2qXF4vB z=?qqAX69B0Rgz;2RYVJ?x-EQN7fYOeD!Wy)Nt#}1@zeiNZqqcNWnHs-3zFA@?eL<^ z_%TU0$a0t=Tf|s-jYr*oQU8i*CCVr6({lgAC$e&J`OK_0AZ-!7mu(@3eZj@tZSEb_ z5IN18BQP5<^$Lew^shZ1I0h1*+LW$k9|E7+KsT}vA4%3Ojp@fd?q-hT3K#tOU6_FR zb>pv>C;fEX)T>SH!g5-=v-%3W;w2dt-`s_Hx*rGuHdbleMxy)kqFHH9?6?r+Ia--c za%PQeM=nB%a_}t&^iV!)!iF-oZtN)-`(W+7fKoYfs9567JYb{%yb7%kero(&=!3#< zIn776$Lg0Y<8U->c9oneQGoM(_n3%cpcjz@ime=yRj^l@3~Dx>aR>!Rcyf|b!Cfo#S+{J<%IZ+=5S!BoMqg@o7LGV2w(9ME`P=}#BbhbyLCheqV( zVHaY2SpxlnSCCx|{KJb*kW{dL(D1mu1>q&t|EJzs#L|kY?Fgie9E8}m8IfHde(LF) zwk?-aou`a7t8@U*;OuKruGEgeFqYB-)^&@162_hnU!om>_}^-yygc_Fa^05Gt7-T& zJ6j62l$9Hekm@Cs@8YtZV!M6xqw^(|;v$4aIM^1^@^XqYmsHh@5T!NJB^1>I7~m(v za;ye(-Hp5v7Xi9e^Ho zQM=ie^`-gO+@Oj=N(X~3T;utWOTCz2C@prk#a|w5ejti-3F6Q8c52>q`@!}7Rt)Ot}Ll)4cB6EcgfxhfA zdHr##!>(}idr)L96C_$bUS?w#{&1ANLeZc)gk|CUGA>b5)1+Mh*X?dBgk*??>1u&f 
zF26$jhxTN=jEiI3tEy!+RKBoVwpVL93&|=u(bt9>x7lAg!R?2Cz}f1|MVdA+&g$?3 z0Qi5$+4tr^GcgkrV?*HYkZY|BE`A3$HpaZ*Vm3r1b9P7AP8AQK-dM+QFn`mm!O87u z^zCog@SDc(^xWVAnnC^DnnujOU8s)-Qf^Vey}!fNVv}kh7?6OzxJE2n(l5x zd9qm42x#4Kcm)I)=OC_G{J>_~5AMf^+qB3d-x@Ek%X{j`3?1w;s~PhROsn;6CXMb) z6mdVgDSlrwZm_X;pTUuh!!QQj``T}@*IiQ@k=L4_rl%e6g4}VHHgqvl2j?@@k?WkX z_%*dGG%)#ixfkfG@HM4br_@_VQ6gHt1g`gwi%?=UB6jDUUXQ)QIqHNfxhQZMUtmD1 zRq>LwN!T{LlzP6>5Z5@iVp2awu!qBhR1=1P?a~o1vD+e;m^op^u{uv7xS3zKZD%V} zf1vT)lE-HGfo3I8qqj9-vX?-=*HdN+5kWj+%E!~HtH!U)BYkdY~=CJeRi zu?f@Wk7MNz~ji=ZY^H|j3@y>eU8Die4NV+ zi)L8`v@$kqUS#qnZ^uCQ}qD*RDykqA#(I%qY))|6C*M? z_l2TLNuIRGi>N|8XbC1%WE|_CGf9b@1aR4uF;tmk z;O5t6NYNmlbFMQL&*3dA@TjKpLEJ%+fjJp@5QHjvlDGECU<&g+wM09PqU{uybvLYS z3R;65+abWal|Sj@cA5$S|4+cXs0#pq_M=_}=t&-4p+Hx!6^8Vmkbj zcDw_e1h2J*o4V#Zg~AOC2fp%448dL=QV6zk985vRo9WJj8h#8kymdkZhFXS|*M*~~ z6=ROItXk+5Fp;yI7!Os+b1OPXF|i$ybU@g4^;l_nj50knyum<(WPn9 z2;r-oWu$Fbf}&uor}zo_kiQYXgFc12k#%*NCXOw%WS8)F)eSJ{M*-p*dWt_>a^}^Z zo71m#twxisgM|ypU5YCXEtzB4Ld$dsmjvREN)!3B|4rt97aq!h_5uw0KW{P}+_7Ep z{)g>LTj-#SoI+O$n%$FG=s6oD3fzcTr zTwO!~en;mhJ+P7%7OL$Sq_uqNV9*C2IZkkFJ8*s4eu{)^XjQ$&~tjV!mB> zY1DRQvEQ;$@!%OKDABb>pa`}lXi11xd(J*5Lki{f(V;7m9C=R{0qI_1$R7}KBAh*S zI&`KXF-O_A=Idm)`>lw~eG$HQk8kCI_BLT+(OwE0P1qa3*G5ICcXb#^KX%!ckI>D|P2P`&KHX%(zzW~yxwbt}}oY=u$saJ83bE?TIi!Y!V!Lott` zKq=|+7s`=d9EI?m{fbv5JRzN7x2=@p8Vkuv-4oXnO<#+ye?0%=t2OhQ2yHQVSvH>l z0KoYJ@_}Z?*2YY~?OA>|k=q`U~0cYGmwTlGV57Og|{wftTz7Cf*m@mLW}RaHwU#rLc0#B(f$R|rsIe&2a` zCMGEeDsiTc!#(NxBT^kC%b<9kg-l4N>1V+QJu9 zMGgG*yvP^qS0WOn7gCb*(R$T&_GzW`Y{tYQaoWS}XwmQB4+pHwThz2ZgtN%;C2ryl zt>ed(C`>6PTyr>V^TSq!#k+T#8stb}nFjU1!Z}67DlZvB5*f_j`YLqC1n+i!MekMW z5nU-u&2Fl%G)!C1h7^2j{ z&?KPxJfKi5nRQ&;r(%8BJX!G8KFhgjzMNg#Gwf+3W2{2H=(Ck^t$Hr35eb3gUBj}4 zq2p2yL2N(g!9~=BNmJYLCd$zE`-6)g^c8c#5u7=p)4O`86-3MDrn5*xAL~w~@9?`9 zEbu&o{mp1{W$JLRL>a>ya5r{x(Eun$>eJ9A=F$U;Av9;(u9P(M$QL|41$ex>qb+3P zNK(2}Dh(ZI%E)*udEbt|f2+!RD>jIhMfCRW=x(cb2kxz=@IXG+J5PAT{PG6HIcuf>RD>E&4SDX7y-A{xz+y;IeW 
zke{5b>OA0_xXNbSBM4h1hY-i;0!c9UKVI@=(Gb3mBi#O&gyIcVRxCZ#qjG1gtw47H zl*E@|akrYPwQ-^RkB z=i;8)tHNyT*#sZDTtLacXnjC^72T##1>0nP7|Mo(&i@7jr?XwY8LKORmcY_ zNNteqdmCt+M#W}yYa5X>apa}$+sfDnYsRJYY7f=DvD+}=4tSSZK9?5w!xi1krVGj? zidgh3t=YEi$4V}b=kMF?G}%m@l#`{**k)@Hi{dNwYnPx(4_Q2dVp^^mypkQiyiK?=N+9Aq$tbG<0PK`r-CEXI(n~PVOUvW8g zNAEFfbu1EEL5NxQfp~EXhd(2m9g$VMbc)c-QJg-lTO+kXdpOkJrmDBRGKl9*Q!B$T*=Y5amHRVae$GSbBvv3n#*{-HraK zKvA_aoD$jGi{v;Ils8F102wxF`ez5MG52?fDk^z(QZ==(Jr`9{5YE64G6RO&WwmuO z6wA)H?DPemxuLkCJs9{kVT*&C-bAH23WBe+T|S2A=x*A`Ypuw(^mEI!evSIrvt&Ou z18=&gR%Dhn{K5uNi7dd4J`Ao^b;Kr-w~LNyk4ZAg0rz=QW%>bORiGSt8A?DK1t}6( zSkM1jrSB7B0mjV(BtD!nFRA9H_sc@E;!XeM$CtZ`&rRH><{#^WMe>)6$S|>QDR2;k zZ*v$@4|#*s`B&N)q>?dMF;p?%7pk3La8p_pTW*%Fukd3F6{lOoCqV@8;`y9;Bn#y! za&qr^I?`#>d|M{GE8-eY2B@1+ zn;MyUVi)5&P_v|;&lngT*XFv|ZXaM)XA`AeB_sO*cl(yk)1=G!5XEWe+owkIJJ(NP zLW_bby0uYoGcYr#(K+_D0URIMXf2n^oo3WVIlmyC0g*bXBP(9d*@C2d}cED^PQ^`tU$zEhi>m7cAqe3J7 z&L~j_++q4hXnUFSLLL&_*)9ivvHyTJ7h?nEzcg_or+TEJSkPfN+ctR_TAG%U$ZLU3 zq)M0M{`UZLXJ>*jBao~`xq|I-DXEV8C}nC)s!z+kpl%0LMBY@1iB1`!e?d=BiEW*a>D&WtcKh_{wEFgE%}TF=Wkf2$9PB#ha4#@l49 z8tAkSId9#}wC{}5`rpE58e`&rZfD3-z&e6bT~5Du6|SPIKkda19Un-6cC1tM3X3st zI26#A^BhftQ75U4`j(hYd@l3p*=W@VY?Xhlv_n8Lg6;Fq6U1P%{cZb)d16JGzYF;L zAmd-TTfvJCe;I51mGSS>U4La<2Nx*6&U*dI{r8!hzj6nI=Pm!u{l^5(uk3#xgZL{u zEAsEn!v8W9@vEd?o85oPN(MK_|D*N&ll?8!f9q3H{?1M6TknStZbSZO*Afvx`r9=S z09=5tk=GEh$}GgckNHE`B;fO3Rs6NL`KOA~f867*oz7p`e{IM7$vzK`v)@$wvpMrC z^RH`6e=-XY{*C#U6{lb2{93sFDTf7YwBMBYwT%5$#IFhBPZ7xUzl-=KT`0=HfL##) QKn8yp!QL#$@bm2d0m@8IcmMzZ literal 0 HcmV?d00001 diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py index 3d1c8e1..82579a3 100644 --- a/tests/test_deep_cleaning.py +++ b/tests/test_deep_cleaning.py @@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase): os.remove('./tests/data/clean.odt') os.remove('./tests/data/clean.cleaned.odt') + +class TestRsidRemoval(unittest.TestCase): + def test_office(self): + shutil.copy('./tests/data/office_revision_session_ids.docx', 
'./tests/data/clean.docx') + p = office.MSOfficeParser('./tests/data/clean.docx') + + meta = p.get_meta() + self.assertIsNotNone(meta) + + how_many_rsid = False + with zipfile.ZipFile('./tests/data/clean.docx') as zin: + for item in zin.infolist(): + if not item.filename.endswith('.xml'): + continue + num = zin.read(item).decode('utf-8').lower().count('w:rsid') + how_many_rsid += num + self.assertEqual(how_many_rsid, 11) + + ret = p.remove_all() + self.assertTrue(ret) + + with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin: + for item in zin.infolist(): + if not item.filename.endswith('.xml'): + continue + num = zin.read(item).decode('utf-8').lower().count('w:rsid') + self.assertEqual(num, 0) + + os.remove('./tests/data/clean.docx') + os.remove('./tests/data/clean.cleaned.docx') +