From 12b3b39d4d5520af04233578ec93138eb192621e Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sat, 31 Mar 2018 21:20:21 +0200 Subject: [PATCH] Add support for .odt --- src/libreoffice.py | 54 ++++++++++++++++++++++++++++++++++++++++++ tests/data/dirty.odt | Bin 0 -> 14114 bytes tests/test_libmat2.py | 26 +++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 src/libreoffice.py create mode 100644 tests/data/dirty.odt diff --git a/src/libreoffice.py b/src/libreoffice.py new file mode 100644 index 0000000..b7e0dfb --- /dev/null +++ b/src/libreoffice.py @@ -0,0 +1,54 @@ +import re +import subprocess +import json +import zipfile +import tempfile +import shutil +import os + +from . import abstract, parser_factory + +class LibreOfficeParser(abstract.AbstractParser): + mimetypes = { + 'application/vnd.oasis.opendocument.text', + } + + def get_meta(self): + """ + Yes, I know that parsing xml with regexp ain't pretty, + be my guest and fix it if you want. + """ + metadata = {} + zipin = zipfile.ZipFile(self.filename) + for item in zipin.namelist(): + if item == 'meta.xml': + content = zipin.read(item).decode('utf-8') + for (key, value) in re.findall(r"<((?:meta|dc).+?)>(.+)", content, re.I): + metadata[key] = value + if not metadata: # better safe than sorry + metadata[item] = 'harmful content' + zipin.close() + return metadata + + def remove_all(self): + zin = zipfile.ZipFile(self.filename, 'r') + zout = zipfile.ZipFile(self.output_filename, 'w') + temp_folder = tempfile.mkdtemp() + + for item in zin.infolist(): + if item.filename[-1] == '/': + continue # `is_dir` is added in Python3.6 + elif item.filename == 'meta.xml': + continue # don't keep metadata files + + zin.extract(member=item, path=temp_folder) + tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) + if tmp_parser is None: + print("%s isn't supported" % item.filename) + continue + tmp_parser.remove_all() + zout.write(tmp_parser.output_filename, item.filename) + shutil.rmtree(temp_folder) + zout.close() + zin.close() + return True diff --git a/tests/data/dirty.odt b/tests/data/dirty.odt new file mode 100644 index 0000000000000000000000000000000000000000..926ebff39ef53a4d3cc5caeccb5118383482e8a4 GIT binary patch literal 14114 zcmeHubzD?i*FPX2-2xJl(#_D_AzdN}2*UuQ0|Ud*f;5+wk`g4OyQBo9MH*DPL_$F6 zM))1{diCDN`+VN_^ZWPRaAxN0v)9^j*7~ls*I7sB))jOz6cjAr|JsSPd6xhlD+&tA zLnE)Q;inGO_;bU{FzdHMN%ErJ5y&jk7Tl@x>o`9X?2jt;h=I$A3DI8-$mAqAaQ5{eI$X1C zS$G2En`~$tR(t_}V4`DZ_{m}-0*KngSWizYPSW)@9()$r{X&=QJmn?hRZ{b8znGSw zi;^LbsEyKDlqQ0|o$oFWMX4AK!HG6lhA*`rUG9~Tyxm8U?8!M-zszqb&HNB2^AV?%Ax;qsFjnyjvQAJ_~FtD>@D^p_NMi^LF%#Ds_>lGY5^Z|ZW*$z@s zQ>Q(BYQT}udhd_$@ZFsqgo{g)_ptGF{ z4mP-a@x2I{qxmaWuZB!cn)uV#-7S@+-`d)m-*&gRx5o^SV~JY=dh6)mGJUb}p)*|O z{3shJs>}kZ{a9d0Ny(>ApK9nDC@W*iv(eMvyLWGXyaJr3{VW$sC5(Gxc>X<1m>Mtm z{M;ujYo1&hsFmhR)=*GT*yzq~AjHQ9c+WO@iLkO>WvKS~>REp`JtYNamWfzi1U`#L zK)|9_Sy@TKKz9B5@z#QZic0tDs@-@nx}5I=yOiZ^cdE!sYG8#1SK^qVASw?@b9eWn zj!;teun*hr0?8YnM)Tgke~&JQhmS8NCf1)U7{aXfeg?07%l#!jweZKKCCy|3C3>Q} zEk0j?wc_F7iKdewARthtTnxBMD1fK0r{_#aFxG{Oeyj%C#3>Mf{l$$ zv(MK_m~e~dzB*$J2n3RRbS}xty0YA#O1T(L!J97TteiL6*&jpOj}u&$1dhkJWO%Goi4D@=iBXJ%stK4tGLbx>Jb+kYWva!o01$C} zazcfN>3%EL+R14{p>qjgi4*u3SfNHhINM0$dK9;g&;0yQ1BddS`GgTs zQBmm?7{F&=%3#>-WzXubODIO=*S2#nSmulq%xBn5r915VPPh9j+u#xw~s`VLvKma6!X@Vdw2;q9T|Epy0;)(&O-*Jsc^;#pcYJAOWxVpQ z?Le{2Y5@Em93CFf07a6JXaq`O zA%}p#4lssGaYND8)a2&kl5}3yxOE)ZEbB{&BP?7vC#_#+enje zVJx8zT9RS(M$;m8nYCMT_lI}VsMvV;(J{MmPX=!O~2h%Yc3JVG< zs;d>TloS=;j*ZcYBVW(W(J@%t+UDu76&4l_q)CMEE0^yI3JUUfS@;~^l*iHlW=BO! zi_cb6SV*7z>WL44`hb2|wD@=*ZCU2%9q)Y_1)2bwvAk?kV&t{IW+t9g*FUTQOd==J>)(12M-=RcEid%I&u$aef;D}u3iCO7naSN*RLO|GUEg>3Ja4n zpi9t^be`<5Gr-Y8`0;iD++5Wd1u5yO=Tz3#))p7T=9;}J7r&k!awe!pI6)u~Jv}`Y zm8+>J!6701U4YdF1n21m$WON1^|MKYlbl>@&aK4r4E-dHyixn97n(VGO`iK4H*Z1! zlP{n7m>%m!^w`i4ik-3^p!0x-aFqxECvtM~7cX95U|>)#0^bJ*t`Sb!+Jjk2;ehZF z%YXXxX@~+rpi`TgI`z_H7r>o+!l`spp4D!f=B|t#xtE~LZTbRGjlILext2#VfENIm z8gcQIYZycGi>HR2!;5)eD8 zNE-mSP*G9o`+e7C$qWn)`_m*S7|2LSvWD$|yCi*nvp8aeMKeZ>w=drUjkE;lA;9>rm{9PHbPTeot!Ld0B;G9O1Cf5*J)_*0IK%&cyimDnP$_`_T z)$HLCNXeC(hSrz}+bS^cxWAlfbmKLtemyfoOGej48HPsb%#}DtcI|JCL%m|jMaoe1& zcU~EIDmH2@fU5tP*<*Exp~<|(c|}Y09RMyc&DI6SJx_0CwEW|bKRP-T zbxO#TD&YM#g0GK9M)WF-?;0CpO%IQc*FCy82OK-fv*B!&L>{v@gM)+p{gBY$G$(_4 z*(i?ZqBH}5O;lHZ^Cp4w_~CgZwaA;^-URifYUrrHzrSp&RfJkBIw_iZ-HV!Z2S z6B9kMgr`sO67+R-V}Sdw$k+o6q!Xm5aOci-wOCvV9#Cq(^Ku_M2glmxW^Hx#-DyOzCs#L;5`);s(oWbmK)kA|%CzRbp^J-9o!9gAlHy`T#uyEj08^qP z07_o*f;4aCjKG_Za{+ho`I|=)>c^wk;1a}bL}vhi2}w%g9TCX`tDKXQ0|3}7<6;s1 zuHHo6#NMT}v^2>ZgoUW(;_`oVFZQPht$*x^11|Bhvs-$6@wTj(HzrT!vqV$U1vU}) z01uIshsU8k7$49E-(Vp@IU>M?0vsbsDk`gW#>c~eo_zlN*|X^e_^zob?i5B-Rm`S) zH(vx`f0~-S!C){TmCVWGg$V=rd12+B^MD9F*P-f>v=wG z2becEH#azZ^AZkDH>v;|>-Tx?v*O;If7Hz$Koua?*0O!_)xttTfX5?;6%}Do_kMl< zaC5HZ;V}Z(Uf9~Q2C{dFa&i^{n=-n9uY?!Pi?n-Vb|Ao7UeO8(HRLc&N3*DC!;W$`7`-aRayM6aEXYO&0uBqY8tQJN^_Y&9({ zX~0keaO&6Q&Op6!$*m5cDC(LNr;IdrlV`I>8Co_T7i$wwYV~-^6+&5 zOIlD+z$N8BCVjTmo|Tm~D03mn$EQfxOE_{wz_t+ZrU4V5LF#8zZDQ}@;sSx>Xys8$xN@D;;~1TZ7L|kKTMxYH4x4Vnnj&Ea5Ws}J zaNqvW*QX&GqOPva!NU`bfg2=$@W=dD)UOi8OoT}bXs?BNFAL!~u*JD{4MPPT%@4(T z5O4&6jR0WOE;>veg%hr#Oz*W5%5-#gwkA^-bH95JBz5Undi=D4QsONL#v**zc&(A!&=WI4d&k1GUsR)_2mhAx13m;DdW2By? z8@(d}ZUcou=r02;^iW467zM@ssZ>NOjxQgc!sgd7Z+X*3n1t8Pa;uCZ0lM7w1)(br zi;lcIzUHn#@VfMr4~)!QDTg7|q~gu;d5)Tq5&MmPt3Is^k@2KX4+vJ_9G+Sjj!u|y zLR@BM!G-JD)EZOY9*RCH4V#_)10P@{C~8Vtily@R{J+OPGY}F@;074u`mZQ!~sTr8p9z0A3m&XxY|AW zZt2kN>8MAth$rWY!igvzksB} zYo{h$`SlwH!U2_^9*iPbBp!23@gi##TYAfFvtwoS@rsd7c<#wcdBv?tQbGgmT+BOr z%!M}k+5>-RdIwWY(@TkoNFIs4#1)WwiA%DMW1T9=j?sfYfmhi{oPc2^@Hk}kO#C<1LTCc}-F?!rnZlr%Y^9fQ4EvlvsEyxwo z_~U)&swo&l#^Q^EtMB`i$8kE2wlNBy6c!-V%C#1B`|T^c3mk_T{7%=G6cQrFeI$@$ zdi~Q{L~fyvv9COrL*&1RsE+o^B5#B)-<4<6lX`KKFIvuz>lGG2_~@epkFyFIsL3a1 zl?}a1$}i`gDwQn-RYdBVyp9-&Ysb_X8&YrHSOuIe7xtX0_|GmehA^ISLj2;y^ zj6A3jgNC3yzsUOf5Wb2NNmo>mp=S}5O&m1~Hs=Vs=bSHT#OCc!wfTn2-B@Jk(DUm> zcvLUuED(guw^z7j&<>X4m=?7diVDzC$XblhdYBY@xq=VsQ=G% z)~*>Vb!(%VPm6anGm~HB=eBEPgHE5{l?Vfgm26qVCgk&A$NP9HeI~{qqU5JAOs-C4=!k#dEv)sx5}x83wh})vJaI`488%a+95Q@3A+R1LxNAEc?AwE zRnHRd_&u_KeKd2occ0X*TvjlIqJ-pk1}zr#-)gTbqt@TMCA%zVW$QXOl6k|bKF+!w zH$x_CUe%({ocBpt5}tpPj3q^4uUt8#Vwt!wK&9sJAOmbNPGI5EC)AA!_lF!rEszU-)OEqoA}A0@3F`W7FTUZB9y(e;2?sG9a9c2YlOGIzVk8&PX1F zwM|Y;g+lG(Mc0PNC5gQTB1+1?z)ZJ#M7FQU#Or=pQCy*Kyi<9Nd#Jm4t0+aIK$2ltWwIAPcQ{6m&8#;`j{gfVe=AjOp7)@k0O#~xd| zQ!OuvQ}LrG1c;pys^m#?qOa>SCEd->xmoJ3PJT=7+U;KdWeskZmt;3Nr=RBxHi2$U z5HrD6k?k9U?>90+w3KJ#>XA|pA;7s7^9*5H#7o}-8E>XwE$RFS7<#CZxnBQZwW#&W zqn9{(CA|>JFNdewKEmV(YUS`wM$lX3g}Mb!&gAQ*%mjD39Qm&)FR~$Dij5-keO|8@ z85kU=^t2A?X!5=+be(D?u9!NICCxJ+h{A7wxHfx7V7jTQ<5{YH+QE_a4Ih-OupBB(Pfq0pm6FA zrCAKBEhPp=q8IIaF4H8DUlPtJ)s1ipsFj4RFDlu>%llJ!6`Vp^u``f^3J;9lpuDpvtRVL#P^r7n<>xQo7ui5I}#**U%V3A;| z)o{tmMdFL2*YL@gkmci!65W4Hpj4Us##%6#uGO@b9fl6WqNWmzEf>{?&y8C<1gji~ z$T#E6hR3fDOPDiD;wV3Eh~izDaJqID`9%Kf47l=&+>yTl!G2eg;K^&|xxHJ`;Ns3` zt-wX}N(cs@?1p;G=!d}UkK(}_v${Qz51Qr0WPKhxyb_TWs##HQQm-LjEpzNLOo&pi zIujEAnt1J^)NWfw`s5XuU3t^E#ew$aoMv}9@5^~@q5rHMA zdH40X=kbs2^qlRUmrEHJ#a zi__EMiwdfm9&Q)gtHh5xwu|r}l@vZKf z3(Z2}9M4TFktY=aqX!yR3Yy3DeZhO{a5>ynH3xjPx~?m&?{v_HM!?CHZDeM#!_27F z5j;g1MDvGFMb-!->MA*`%5m zKY?R$55m1qSED1O?~VrPy1l!IiMekCNBe?gJtN-U2%&a{e3Znz*~`w0m9BJ`UW+&B zje3qPSD7mLl)2|V7ubh#LVwdlwGTf+B7c!c@jCZ4<^2Z~EP5M*RBh!1eZ{bcJqmoPp0D9QYqjt+Q8!0c>UExQBhOM37_`09aGM4XB*OCCGyzS2 z%=QTV$|DWdN5x8r4ZZc8$|w5RLRaZI`*TnCtnaJdGR7_0E*843y)39A$d)=&$JfZ; zf3TNt&cP@3t{v21+qsyjnLf7kC@-T-6GAqEIgHynx+u?^TvQ7DCs!M$}`@+jq16527g?JYziO7fmZrqWI72=1OV5jFPm;IJKk`ijtctiS_}2ew^A2e${?T29xR3yeJomIq0?#*QY9qZE9QnugoI}X|2jW8_6wRD=yrB>)d_BoLL8@ zWhm`edr}|3Yj)-O$bHj!e7nSP!6zm!u20L-u^r48><_fxX@UgTT7=pxiNt}JmtzM$poHyQF#HX_e0cw2$6i*SZgW*TjABF*w_JYX9v(iL6% z>S$qr&u6XmsDOGg&-C7zyDg>6;k~oz$n9NjKA#4`h!rWy!AAb{Ikd$UiFWtlfbA^m z#ZGR%^to$Yx~y%w6wSljLuS(y4U0D(cF$l%!~LB=^@5M{r0BI9<3u71i~(qj{Bw)IxegmS1xD3YQAG}mlty3%w^ z4gIVSJkyyU6X173cf+X0a2sW+JNy`Uk=d7>5%6h4Z$RfjvQ@itxxcF^e zfN5v1*3bpK#S2T&dKzRSkgky~MKit2c1O^+U4u`fs`F()sKXXa$G;1uJ73eGdl_dT!1@vd3?(* zfFLa|y$VSg;Dg>3f^Y`%3#1tNdH5LVAr4@;HPpdYiqX(OnOl@mMjA&FZes%lL&Sfi z9B>1<36}H#se`ll_e)ZYE(izlZ^;Sb4wm*1XK^H0{MRf6@n2huf9vRbjXMnL@IZ>u z4vBOW=j8>$NIY(WJaB|9FF){{llR+I#y`8V2LIL0(FFnf*2)^p3xPo{^B{oi1|ELi zA8jv3_&*)gW!qne1c$@_>gaL~->3MkJ0N2M#QXjHM^Cm0YirnF?SGj7&{mMwme<-6 zY02#hg}5;>(*L=Rzbv=_BmFl9;z)$0gR>1BVgD`H<1c=|k^|`;mq{PomqYnBi0@p0 z=?d_=JHwE?aBCX@z7KweRb1Z)y8iCI6q5aI4Gy z{@K*=H!U445tgD<{Abwon=V#=Vmi$ws#V_0Z zJo{@F->QFX?BhuoijeqZq*&5*`-A$7{`N&%H>Y8X3#IOxN?O$IF&hAAh3d$qEWF=YH|#Bu+Vj|ylU)x~*#-tA?cw7V_mXWg4pZi`j>{o5ZV4tZ(l2rvnK znooL8dfd{RL`<45Mq_Z{O`?ZeX9r`?<{lpJtd;P#n0g#ew}T~fb2aZTZwc;Rt=)w6 z(}e0EV3x<}J4rV0{z5o08zcu*(k%MzgyLl^WK@(5~fDiwObORBOzarh2d4-qd zGWAgb?qCD8{S)t<1$ZsNV8Fb818H8c3j*eXMzoSFK;V?%_2MFxH(EV1wWX07t*_KH4-9t7mxlq zO~Ctz9`;pke6S*_h9`bD30rP9Qz{Mx)eXKt$xoy#d#xWhiNiVX2A2mqj@>yWbZ zmu^iu4qhaWL;YiCSUO?OmOmfJKfz*|DBTm-X(y~Y+Gim}$nrpGGZ~~plv<>n+rv(7 z#P12&Rz7?EiWw?uMr)1oA~C>Tk5lvf4`M~}Ej^3O%=`A)lfknB8k>3#zLJIXZLDhw zt2zhXHc?ITZ40$fmCSEi>I6M}2}AU}eN%LUZOxf1H6dF=`x+j0HagLly8L%WS!Au7 zw1R!<%|W^y)>>EGd{x+BDIpok<)Fc^PN@m%i%}JdC#OzaM%q;IHBAr4NHq?Ut)MOh zU~=QD@uh*7S0QH|_l(pDG7ucPa;cp5`Bs?<8XTG`(k#Q2ZzWVXc(XU=7+5&QQ9}db z3d6Z*P;hm_5~$^;qpIP-!HoJol9M!ws{->#94QpJh$Hzpd2;w|D8%Jh0EWY|lH%TF zV;mkzPppMKkw~Ih$neY)=S3Zq0MF5ha6W5ay&qn+wCHg9&|Sz{^UA^li*ghjHY^9) zanF!wyXKpdTRm?ujbxk!z3vHEW)=AM)8;et)y}+mv_ae`mml?|lK+p!{WpXCYMzXD z-KYp^2@5k=;CUKpVm0p~OzgIpk_+cEWR0s*{3aaYAC^kzxSLkpam)Lj10` zmOF=RE3XO8I0-{lxH;<4CQ*mD92$w{C?WOn;?=?pZMk?5?!rdSu6+S8jwX3u9!^|Z zLDo~2G#uJ^>klID%_JqKWSk9uw9J5d|%Ycud#$Chg0mr2r z8p_}ze;s)*4M4{;&I}3?n~I%&J$?q-PK8~Y z`eDDhK#{RRlA7Aa85gEEqF1pqi@29&C)WjXTa3t1!7j9Guet9q`>Ed#ChBbxo(;6f zhz?LG$+5(-rg`{4aGNvUtKei+$cwQ*+FV}`S;Xzm5rS6?O-~Cb*f;N= z4R+ZRqR8SUCN2{5Wy*Zv8P&eLn-_go!oyKfP!517j?OJqG%}Qbe}x3NrN5LPFO>X~ z>Ywk5{3Ou?e*UlC9Qncehk*V15b!dM;b*}o`CC%M55gZ_;Xj`d{v`ZF{`V(_KPZ26 z`D+O5@&Ve%{3pI`myf9G@QQU5GAIR0~ee)`w{Q!WO-;quep{&y~y9{0}@_8Tt$ z;eY=-v%fEN_ivc}>W_ct^FxzxmD1XJMOS|&3IP?9BmC?C{ UfqB`C1o-O(B1&?XZ=9k0KS>F)Q~&?~ literal 0 HcmV?d00001 diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 717de3f..743a845 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -4,7 +4,7 @@ import unittest import shutil import os -from src import pdf, png, jpg, audio, office +from src import pdf, png, jpg, audio, office, libreoffice class TestGetMeta(unittest.TestCase): def test_pdf(self): @@ -46,6 +46,14 @@ class TestGetMeta(unittest.TestCase): self.assertEqual(meta['dc:creator'], 'julien voisin') self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') + def test_libreoffice(self): + p = libreoffice.LibreOfficeParser('./tests/data/dirty.odt') + meta = p.get_meta() + self.assertEqual(meta['meta:initial-creator'], 'jvoisin ') + self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48') + self.assertEqual(meta['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') + + class TestCleaning(unittest.TestCase): def test_pdf(self): @@ -153,3 +161,19 @@ class TestCleaning(unittest.TestCase): self.assertEqual(p.get_meta(), {}) os.remove('./tests/data/clean.docx') + + + def test_libreoffice(self): + shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt') + p = libreoffice.LibreOfficeParser('./tests/data/clean.odt') + + meta = p.get_meta() + self.assertIsNotNone(meta) + + ret = p.remove_all() + self.assertTrue(ret) + + p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') + self.assertEqual(p.get_meta(), {}) + + os.remove('./tests/data/clean.odt')