From b208879e45bf6875bc8f6aab0ce14ea8a61930eb Mon Sep 17 00:00:00 2001 From: NunoSempere Date: Fri, 12 Jan 2024 00:25:58 +0100 Subject: [PATCH] update squiggle.c with speedup after avoiding conflicting cache hits --- README.md | 2 +- squiggle.c/samples | Bin 27544 -> 27552 bytes squiggle.c/samples.c | 8 +- squiggle.c/squiggle_c/squiggle.c | 43 ++++---- squiggle.c/squiggle_c/squiggle.h | 4 + squiggle.c/squiggle_c/squiggle_more.c | 136 ++++++++++++++------------ squiggle.c/squiggle_c/squiggle_more.h | 14 +-- 7 files changed, 112 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index 960eaf2a..0bf64330 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ The name of this repository is a pun on two meanings of "time to": "how much tim | Language | Time | Lines of code | |-----------------------------|-----------|---------------| | C | 5.6ms | 252 | -| squiggle.c | 12.7ms | 29* | +| squiggle.c | 10.5ms | 29* | | Nim | 40.8ms | 84 | | Lua (LuaJIT) | 69.9ms | 82 | | OCaml (flambda) | 187.9ms | 123 | diff --git a/squiggle.c/samples b/squiggle.c/samples index 409e05f14e1cc0d135f5195e272b8750ae053cfa..d2edfd2a7fe194c6ce14fe445759556f2787d02c 100755 GIT binary patch delta 9146 zcmai43se+G*6yBRV0g_m3eKP)qZ#68P(VRsBbyl=v?m$gU@)R$P?SUw6@2WfgNu3e zf!W2Bn`6%I{_E!K!~frG)MNG}X0yp4W+mn!7^Aql5jBs-R}jsLCZ_-IcJ~ls%t@Uy zeXG8E>(;GXw{BH8>^#ZuI>|TL6FLWSYH@3X>)bx_FVS4A-r9aDGv?C~TI16ZYydyT zBgAOUFy#cT&4WbZXP&0D%?s|LMy)Ndt-LM4u1nc^=t{CfENm^uZB;wj z*#YAS{!4bwXyemadxVWY%Mv0oS(C}mFHq4n^ki~`o$ZRS^8;0x7L~&oZ_4C9SL;{R z=RWnhSbe_9wwp3p7W6ztZPtz2A(a!shDEMrzmIG*EnQh&y^)QKDv!(Oruu!8CNCO0 zCOe1ijLL{Maz@Ss6QkJCC|iK%4BS$*$md!3wcU4r1jj{lqc9)B+)iqnxN*9}NS>-; z`P#vi(O4qxL0w+j{yf$!f}7eWAHn@amj~f?sQf5ywkH1=msnXDPwTuuSD-00_9?7M zrU4AxGF@R`6p%KQP@+bKaCB%*u8uXECnxKnrDF0TjYTmP&r+jp*?Od^aBn{@t!KEn zbvz0H&o)Qf*rw>X_?i7sBl{cBmw=Y_LnGOLfSv(b-48Xh_!t|Di;0V`>xag&0-zIt zHuXa-Y$ebYKzH>+1@=3jzXiItA3A`&A7e|_U7?yD>c>rDchKi+^l9sd+V=lpe5~m? zVYoSF2FLX%>9PAq2|wX;Q43DzIBS}h1|Q^mpmS( zze%_#eIcKU! zq>_T2kWiA2J;HG}n#5qiEmXvysfS2@Z%vZeV0xMAPlaltl}>9~9kg>TI)(f@s(kK2 z_GDrTw%D_YLq|RZh^u`7vnhC-s-pC1O`OY_}-g@&(s8{L;y;7vkB{q)Gfo zY;6u z_tH*gvQ6P%&(6dr$j&>rU}s}R#TaWES;)579{awj z^*EX^0(tKTO+DMInYswFKWgeE2+8pn8d|?+YV;vj*I4l2&T9MMeiqMDEq)&Z^|M&p zjo45wu^|ep*4|?+#j2IGL3PrB`^oTy@fuZ^UOFf?pN|)_+eE5ejKZ7}o$Q|}qXV`L zo4>O_6V=Xo%m@@S#P7E{zqzSRc?bmWIc59$N;PN%TAn-Um~*SvMO_$e<|PqMvHp;& zrGnR{Th+Hf0jkPD^tja2w?Nu-=ZMl*n6g6)XSwUYWQc&s1`y$d7nL3};|np=Rg&4b4Vq`Skm&Dj?Vegqc9ZSZ?>`+#cD zJkwtx`~#lZ_^oGxkdjNZE_Zn*uKti?e%EI3kZ4(zJ%xXqeK=@@Wg)pH9`L0)xQ6W< zl49G5G-v%q9WE!BU%WvzQ!p9WyF*3<9s>_EK82~4I_+K0!(eL~S*ETgoH5Vzj4DE& zQ0LTn*PKktz?aY8&{q6$G|>AByxNH@AtOtC0^LiTg7uj-ss(~+imH$nSsoqxr5@^t z1T2#seUD*KiqrVFhLf($cIz*GjWMxgta_+}y_Yf!7o@8xsZj!^iqy!?3>s>QrQp!F zDmH6YYGEl%=z!AEM~A8Fg8IDpfvQT^hp5vwQ0JEi433d8RM+~26e6lK9CF2VvJDd_ z+sN8t6%bN44`c1ALk({aW7kp>`R7>V(1~d=>L}Pq5H4w!AGxCk9`#MV;$Vx1PLAqz zz*CcZ*k6Z^8gRHrQM$INQndTzS9{p!LpP>gP9tNGOysdRCU9=41|WI`Q+wL}&1nq= z!;UXl%|q?uz;mICj><4&K%nO@`=leBDKiB zBGTMYXn*16SNZMM#t-0A>z-D=J(y@lJGh5@Twa^vFK@1wd9h)$;O9ersm0&0loOjf zBc&dnbW?2p%xK-yUf;}%L2r~>Y`zfbvFj*M*g^B?#d zrkTCL)r#-t314uG#N-*Om{$P&Fn}2chkP7wMAT~VctN)kT zqa5$?SQDo7`tq;VUh+y8#rhVvXk39ve6Z9h)P^WO%wOOs^(^o#^ek$r5AoT@x?E~l z1nOuQPq9z>(o^IwKVJ-Y_@xWJ=1+`tji|qDm(N~O6u)%dFI^R-!*VlP(kayHyLjt1 z1MU#kXa4~e*~k1r(X1cAQma^g&0l_SWSc*ja(b~}*5BCIM zIOQ1f?JrM253VgikAjYT$Ro8)3p$&vX>Q01gQk_Y`I^rg5q}5K)wV7PN`!}lUYps2 z8KGezJt7?9%fC_kK8slQEkRE=jvlk*{CW*>W^y4!>`M6g(E3$>+D%z9E*sgE<}5=S0yz03Fao6>hg{ zZf|MAT)tz=Q?(cth9y&%&aQiOVXJc!1CL6rp1HI-cf|7Bo+6+0O)*8QC`vg_3HxD* z*pQfxV;gQOa!$xT_QFjC5ff+Kb69NtJko>pN($SLE1}$hfeN4c?-rjYxKQy)mk}sD z1*gbgYI@5IHNN^oZuvZ)<6tds(BGO&B^gWB+Nt_m_%c!ufX!c7hkmHan?DBgUrlH7b_U7N- zGNJw=?<+qnN*{Zq<6a8t}^hN9D?5Pe2x52CbP+)h_WB;A7-A|d=Kvf`Nv>s)kl zz1w5W==DnHMd>Q0OUcgyZ=x%zeK6hJGH%H?)Vm~|)QCCsZCw!Mq0D};MGd1Obq3On zVw!+%KNOj6!Umeh2-SXJ^jWlN9_We7t7`hCb$MiZh)hS9f)z<*w(C4h?^I1k<2uam zhWV~T=yVf@#K)LI3lzowk2=XfpCc~a!x2MNjrVqh>s@HVwec=T^s1NCdpY7`jnmH& z@IxFqBD{5(GSWMK<7c!HK6wkT)SSoaJL8kymOAP$bT{`#AW~iOJn*%5CG46MG{oRj zTm;)VDs7-!9*(w8ni`TNsx+(*vA>RbIN?=XD=DxiP|nVh&7ZNz2TB5N`HfGIJ|9Hl zuf0O=SARp3Rs+7oCLrhk121XF4Wr%zG`b26a_{$6K@Fh3Rk%4o4W1uefxtY!(*b?q zK)jd#U($5_cp%~g&;wCY@9Mh$HCR3V5s6h``NQY2YzSRoGcv{sXD(pY-(EbLSX0J* zd_H?6;}OG^3ydGV&=9=97LU$!Zzlj2@KV*04Cbn(7zyh4C{JC|b3P z`b5+z_L(~^@uzg2Q}o#}Gw6Vy5tDRX!L>ti~dIJ%Phy?)*RdNKno8`;r+1N3q ziAV7wec%R8w>Rk_N@wL2=h%y5GR9_zTMwCWL-R|W^F5_4qSO<9r(TUN^4W9YxN0!h4FEg03N;*X*R9Xg;BDLU zS}F}5j9+F^4_F+?!Q2l~mn+!KtYSmfSvEa8HD(sR^q_P60Sl|me$~($$Ht6(ZN_w< zno>V%P}*aC=0$qR50*Ns;UV+#N;ip%dGc0%9 z-zOGBvMW`+IC#mhu%^cdH+>)9K=>65DwmvL&*x;}W8I@U1qS&vyFVw>kaw0HMr*@q z7MVNB@ZQI4Z0`7|9j75}JJJwD2CpZcxjyNHe&6v(Ul&7VBj7py zk(AZV@~V#g4~<`Ke$Jg^tlKt*<+&@_%kH6b*Je1}8IH;-NBUZaD}BjGF1;edfd{t$ zzEg{@YTe>Bm5Zxt9P27qtyo;Mbcv&4)fz`dZB=V1x4X_eVN`Stkj)vaq4-{oL>U+U&0}X-7pxteXvIJq6bxcv3&~|*NDEOSmWq~Gw zxp*9M?gHHjDjZjomp~mKDM~A76X+Sx&2nwcx(kQy2GG5@73?wvIBq5`;@tr1KqK&Ru@f`})O|%!azJ;1`a!v?@C4{y&|RS2 z-HLJ?wC%d0c=37BjaRN}(7m9)1$E$6DnxBid|~ArcnK8g?t>Ek(#Nx|Nw(CT@Np)= z-YEL4#yRoJ!7uclqPSV=!w-=YsD|+F6<027i*q8kl)5WnmjJRf&aML1DZi6V8l{jCK_3^I1Kw@ zEas`P>PO`wU^5XbPxg~v4(ugh>6(0&p%kapfT;H{9`4betT^ zIneP9V7&;YTH+O2gnGkNOO`Q#kFf|Ii}~SLd~4)7!7Dn+{?9wgBp~Ltu~Xh5rYzK6 zVYj?P7E+wvK)5Z1C>8Pt#rjO72J$}@!ZrZg1Z<;w>aimXmLgS zc7qr9AGW4&QVun5pxKS+jV9447U5ThNf!Grjgu^nV8kSgtI_1KWIY|}wM?jw@>4?b|SG{Sv#ciC38M{0dhsR=v$Rvw- zQtT7o1vo#tH$s;gTVzw85#Bx{L^8rlMtC$M>Ue`uWE-cX%{zl`6L2Q-)NQ&&Xf%ix z`_o3a+R$OK!|fQ5!ZiXr&4Ha4rK{KF9)aBXupF6@0XZ{v<_>mk%81A+LyToYA?q$o z#6BcXZvwjn8+HWhR4C4yfYktdlz>$hM>+gV_-^2D+yf5*kHBUxCV7fZ(PAd~4&Z*^ zsvZ%BO|h6K0PhCAgF5VGf1f%oG6Si=EVHXq$3&K)ns|{77oUi1Lv`C__NF*I#)Up# zquEN$`y*XQ0dZH@MR9Pz2l6Vv|eSfepI zM0TOnvFK@O2}`vGM=qrD7nTpSBTs9EH?5gfPD|qjCQZxa9qa(=YuTx3nF%g>t3*Mt zsil=APftsjfTM~E@{n3KFyHhUd@FlldK$lj9hkn1x3gh0_U!MNVdtUl+DseZVY8Bq z(OfI2EqQ}k@7a$S@#nw^Y}Tx)d>PvZSu0l!Y8$)@*Py$LkxSq@^4R%VWB9!+VYV&V zHTiB4>>na<;(2^1^ z;I>aCjMIi9Jc})!lZkP*&#@)D^}ebg9-DL-{P9VZxdxf=8ii$!qr$894l7zTOi0&o zb#J3esgXu;+^^x=*&m8V7~&pgM~bFk(1c=Ja`-QtngGYGP}f^^K(TFP_%EGd?sMSs z$hzU>4dcIOTZ_{SZF%g);wgj6QZ$&(O)A&XX#lJ!8_x`Le~P~@dgkUCx^vmj=6VMT z+%DZEYF34GBTJZTV{LQIeX*u0!DBaZ0*4S%q^)zXhN~GD)em$$r0-TWwE#b>G(|btm8O!BSJ)g)ug{$msd9~@JtOK#kp;oBK3?wwLtGsmZjltD5f%#U3=7) zT?TG~VIX(6C)et|LV94Rrx$uXO}+R((KGoqU1rxXRiXM_9a z_}dBsHca6p%-ro$N}?DJUQ z0ymhmm#J{yX^!rZnxJ~Hp!%t%=P@qpE~9Vh132_0 z*{BZ?ffP&z31vE<69G64uuD%y6R{VlB(bgYZONHBw+w%?qq0MHaJL$lNQuf>OsAr*j2|#+{}2n!*xFZTz1pl%wbBGBcU4^nULI zZtdcg)hm`R${sT=XIyr;wQ_OQMgsD}K*iGa)Xve{YZk3py0&&j4X~V?{rgMp2D`4O zYSo&Ri&reFShH&7qWr8y<(2Hn!mrqgMSmW!QvC-9x4Yl}HvGJ7;{GFLFGuhtY~#|* z1P4r`5~gd}0iZkdb1Y29vRg|tqg!>fm1S4tMTbwU4QzEqUf%&2J_uXcfr^0Htp^1= zS7Bq1E=!0m!)ra2GPZhI9$UUlFo&sP@2?N3b6{flA?Dn!TG2!zWhuqlZQ*g*_ zT%MT_&OKpT!wvuqAAMmevs=qE+49OPR#GXj-S};&%!&@54|cY%GOzDgcg@fY?AU*+ T@*$pog5^J%#~);?pB(Z3QEJET delta 8808 zcma)B3wTu3wLWJaVe*BMn&F?(dtk z_Wsw}Yp=c5+K+Rl{XM?zJ-#*8&P8!b$yW(?a!1N@p*i7dD`&^4^srxyQyae+$A

nj;Y%}k$+cYVgCS_wU@b4t8_NX$q(^aG#WfgnU zFni1ojlgLma7Gqptcf~(Vf}}XN2Z2e_*+fq2bp8K|MXVbBDTTkj*f$-L2J3qism@> znsGe;6+35i@F{F}g@gZ;*-h#EaRphSJU1%OQ}kr*rdYmCX*)=oZAJUn)ZS^1WzU;q z`2mIby7F8_PnK_rWv$@UC|E1&GN<$JP~B{SHDlOrLljF2Sr}f=tzGj#1Gnn_<@Ibs z$TlVnTN07Y75lse(-%)lo0Q8o3`-3khRGT^GigsxD)5aVTne>bax{Dx0G>?@b+GEth)8h=YGjLmZUb611Px(9psxd6F$A@+ zKLNc2^uZx$82cP(d{{)})*)ylO9*o?AJDcTsEw5X-2n9YA?R?nG0YJ+LN^+8&k$}j z`yKk6L7(m+sDqsibMUbpeE7SjowhL+!)%Tlkduua6Kp@?qgRl!fxg|}$#UyeVf5GF zY@F5cq&=Kpi0VWvL?N+|<1|+CZS92H+R;PXp7Phcdu4gA_m8r?k9*~ysX}bN^kBW< zHrKFy!}ItIc60c;;-+7=aeFDve1T8Iz#%bkSo(N4$NiabF>qP>3bEZsNg)QOdQvfA zst*V?=2|qQhhaBKb+El7M)McgsSybk^&o5&uqDH4I1H?7nf!X90P*n_E zY={tB@@AkRz3{CpOG`22VRIgPDJpl7=?+Do>OsL4`RxQU-)1J-_Z}9VHkRUH@Goin zw~7CUTlt=rIr(DXD{t@tr!Vkc-)j-9D7t`um2Hhq<{xLjkFNF$TerfZdg1b$1EMhD z2nLEyR20rf-R^T)Fi~OaHdUsRWI6|g`TM2+`bL&fF@K$+^%fR%3ht9aTK83{=&Iw73h7_RfGu%OUoGJ)5{8e@{Jb#Os_nvnPihT=HZ9y2n0 zF(7Fo`h3>R4#mvi?JUBv&HsdI%RlU0>m~;&_C~lAb9bmBL(Ki~E7jcDkX0u8eRCt$ zKMZqCF0wWQV$!IuZ?iUnjCJ5q=!zW>stR2;vH74Iu_v3^rkHqs9{Zn|#Ha;i_ePA^ z_b_T2`!2_V@0&h({J%H-H#bz%1<3vfruXA~l}@A2Esj=9htap1-ggahLSjDi$Ba^K z_bayN0Ulzzx|-(>z-MCM8>~sEni|r@>tvb@Spvh3A)-eqvS3ERiwnA>L*R=oMumCrQ_tU~04U@&pqv%%{resT`83X9z4|>gq z7*K*f`Mrn_Q*4`WUJv0Pa9h0am>>O=B(+i4=R#Jx3mu30&gqJ$xGTOxDp#3Yh`MuO z-#o}EOHqmZEQ-8!4n_Xyzr%a$QDar{@s27NR_xE43^Y!4jp=4l=ezVz~!YKn|O1HC_|ND1ysBQXhJv$1M|fkuNY`#Fa7zVXBH;GnO=`eKvm?*hVlXuHvm;}EYuJ97tsWaNj7?zQjGbjLe!*tC z7V?$s{;_wli>@61Ty?3qrT_a}mIH^R#YDPVrpG>c{bp{DuB_DkJ-PcX`YcJO)f zYu(y-UtO2j^02L0;(dIluVrjtKnxbXVYw%m`i3`fU3jUdxr-OOJ~oO0_{Af<^o|&4 za9KRT^|G(-NJ@{7|EsTMrloLEaE$yQP)dAlwekzD&{1tZedIcR3~SwUZ##nar$D>|Vr zRBZ0FcykXozJ*R0L9mz5Fm~R9LqBg*v?o|9=k_#A#AN#NFmu6Aw;WGUur9+B256>8z{aZ?Vw7_`Ws?hw;1Ra zyLuBvo`^vVgp}l7ZhX5CM&$TfU`ghinaBLdq4;`HxbH7wu)*cT&F2ahOubjQ@31eh z)a9%fR&~LcX|#UbQZ$rP(xG=mZvUoqq_{jpD$eYqbsenD9+E)^%66bCiNV4H^H zF6-X+EbeQCsB{{SA$dz7K-?~P$~}Q=2;RpPV&EoxZn-LKicZIkANer#lUQY|I!Iet zy-@ysy^zK>q(mEDaxj+S_MZd(Ht-|BudJJlTT&`SFC+WiXV87(G6(H~702oNgk0Z$ zDg?r3k>GDZ0-Y<*BhhaZ2F{CtFQlz-BdS2Y1y_D+Lc2+mp$cv z#s0vx3Nl}_9>bc4!D`GRGF!3UHt{mdz=lh+z-C+C18YBBmrd4oLIYN)E${UhWxqZJ zYfqB3kaG{f;(l1{K7dZwaf+RQ#dJ6&|6ZAwd)4tlZG&?^26<)f_bM7I-d40h)BF0| z--X0^XgW7!?%Q$M8#?z*Xx}z>wmW(RTYAUcSzmo3%SBr*>}x_Mdj#(pZ{SeiWb;S; zT{ld~JGXQS{J}-=6dGc1OB2UV-H|xLB0YQk%y7Af^(q}ZLneuQkpz`IvSq!2ongJxE&454X65Tu2!5g=<0cR z%luE=?4Pf!CilN`a)heu-#FP9_yW$A(lJZT$=K#_ibx4SeELx*-F;EN)nPW}F80>M zWMLt&A&X{b+F0JpTxl~68!xgoX*Gs~i>xOt-A|W4-20YYP&AY)_rf=E^HXm87%YPB zh5Mx8|Sbol~D@HV;Ij_X79M_O7*51=NUJ%5X1aP_Iphza``@27BH zv~@UZ%Ghi;U}MHiW=ONKw#*Hao4{N*9PFu^P{YA=Rd}ocFP)w}QDjUo@{oX|^BBF} z2B&_6x>U)ECsi0)d)coiC4@a=Rdh$bDX>G6{$LQTY}4dDvul8=N~=+W(gESIXX#xx zSnXO(+u4f6P)>*cUQl)Nn0IEUFBrKs5)%%Z*Mcbikh~wTEmK~dM&D}s66ml~cIH7% zrG%TmkKaf5=NMGlbC#`}nr?8MW6w>^Glc?;JLmOWsjHko$7czdyS>T z7GBEl%%5UBI=qf$=dZNpr#e^O=S*JfbSE!O!P{v5XiI&nlS{7W7O^N#1ATtzmF0Kn z5=XR{wRz(G^j#6(7`R=3mF0HSxude&4Qd1J1ugB7oHm0igwexvU~%y6*K~P zJE#-16Lbn_FQ^F09mls8(AA*71#SPEEO&z1PRR0c(9-v0x!=I~xz>|VgwK{4@5^!q zXe+#13)&0X1e*T=3&|c6S&~{vg8>oF*mY)ah z9FXPMVKDqEwkFVa(AA(eT=<`$Ht11m<8>vLB~2fnz`>X4a1B>DjDo?r@SB34NcZnqwPUcszvceGw2f%>xfNI^KoGoSI+O``Ctg%63fz`RqQ zPB0-34OVHthQ~~U1J|L79(5-tg-Oz46B6_trv3Buto+*4ufcN zGa>gEU5@5&!J3J}8rs4Z6^{>DY`{=OtaL^c`*m@MzY5H9EUfXQ1cmZ*6|mL7<`Hne z)maBu68;GAJ^uuM3OI*Fxsv25t7@$ll79tw6>vq5Xw5Ld@EqW81E)pYfzK;r#mo>3 z)~M?fwm?h^nTBdcAKNbeB;-X@+dpI9iQ__OF<+qSm^3H)o3K{hSi?(Dr(Ao~a1WGw z)Ko(RX$&`H8ZNP)c_)PxqPY~!?n`X1w;=RIRQu7}j8Dt=8bkFj^zF=El4xH|Dp4qI zR90>3k0BLVZ0uSCVdbYTvJsT}tU?NwX5|RoZ&S&Y_ex zRyJ!kFR^E4CGxvi*R18djd|vbV4VgN%b(rG*LL*GPUNBZN~r?^NoCQ-P)-7M#I;dQ z-&TxVI2KS@SuxvH7Go5+CbqlGF|t$1^;<=aoSoa6#m<+d@vE7Ajw7zO;8p=@897`W zVGsV>nG<7la5lDPj)Qlyz?}7bKjY^*>?A-%-;%~N&s;|mIo*KunW1rYU(s?Syp%1O zn~sm?JLWp#YAHL0lntnpOgrm>jD=l+j2;(V>^LgCVp~%A;-pfSiW{^*(UwFSYLivm z&7LVAZ+Obfj+D=ULVJZHPXAXQgZlsDUTPUipYIy576<0PBR1fPo?WrjG(mko+{S#GOS$3#i*YG9{SC%R87+8&Q zCvZ&;wF5bkic4j1HsxsJxO^y}lFhp2IWmjEwHroow|i`X)~i=b0OgE9ub)ycev|Ti znx1jHbV(;^5&Qh!P z(w{EEF&8lIsEuF&KcD>%nKsj_E2p!dBk z3_%UIX`9J1t?$$994z0haXaatMbTqn1a3oS#MUbC;A6igUz(;u*h`htsryY^Vp<5f zqx$G>RZi06&#L$Wu2ahbBT+;udXa`c-KJYpQYWFAHe@x(`nO^`(91w z3#<$}4=9@dMo;hOn&hrTRb-had0oRf4KLDgBMuOnX=$bk(7K~Cf}O)m({--at4w8b zcQJdl1AnJu0ff%dWL7h8^|->x+}b5SUA=1A;>@(kQzm8Ut)DKr?;!w_C+R@_vh^g8 ztF<3k{J^rcjjI|+Cby%bI@Z8;h74zauaRu@H+z2m;J@qH?Cj#_*sp5u98Q1259Opy z?dYj}$%Nxz^Rje1-HEB_bUo_=x=TB^bUKCc_35FKhDt2AK08!Dm)h7v_1S|5o__H4 zvaWhRU&=ho)7h@&HrBK}!`_6qSSmWrXI;zFhwI6s(?esKVMSV)e*WU{`q7FRWbKM{ zyT1SEbQS9Ysvl`O?PUDQ^ich8d5-0-%w`!YGg$jdTc~~_*w||;vj-1v=WNxo-VXjJ Wck}#@ndjc@(7iua!I5xD;{O9GDLUZ* diff --git a/squiggle.c/samples.c b/squiggle.c/samples.c index a385cb07..33e21674 100644 --- a/squiggle.c/samples.c +++ b/squiggle.c/samples.c @@ -9,8 +9,8 @@ int main() double p_b = 0.5; double p_c = p_a * p_b; - double sample_0(uint64_t * seed) { return 0; } - double sample_1(uint64_t * seed) { return 1; } + double sample_0(uint64_t * seed) { UNUSED(seed); return 0; } + double sample_1(uint64_t * seed) { UNUSED(seed); return 1; } double sample_few(uint64_t * seed) { return sample_to(1, 3, seed); } double sample_many(uint64_t * seed) { return sample_to(2, 10, seed); } @@ -22,8 +22,8 @@ int main() return sample_mixture(samplers, weights, n_dists, seed); } - int n_samples = 1000000, n_threads = 16; - double* results = malloc(n_samples * sizeof(double)); + int n_samples = 1000 * 1000, n_threads = 16; + double* results = malloc((size_t)n_samples * sizeof(double)); sampler_parallel(sampler_result, results, n_threads, n_samples); printf("Avg: %f\n", array_sum(results, n_samples) / n_samples); free(results); diff --git a/squiggle.c/squiggle_c/squiggle.c b/squiggle.c/squiggle_c/squiggle.c index 1d0224c3..ec0d4a23 100644 --- a/squiggle.c/squiggle_c/squiggle.c +++ b/squiggle.c/squiggle_c/squiggle.c @@ -3,12 +3,14 @@ #include #include -// math constants +// Defs #define PI 3.14159265358979323846 // M_PI in gcc gnu99 #define NORMAL90CONFIDENCE 1.6448536269514727 +#define UNUSED(x) (void)(x) +// ^ https://stackoverflow.com/questions/3599160/how-can-i-suppress-unused-parameter-warnings-in-c -// Pseudo Random number generator -static uint64_t xorshift32(uint32_t* seed) +// Pseudo Random number generators +static uint64_t xorshift64(uint64_t* seed) { // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" // See: @@ -19,19 +21,20 @@ static uint64_t xorshift32(uint32_t* seed) // uint64_t x = *seed; x ^= x << 13; - x ^= x >> 17; - x ^= x << 5; - return *seed = x; -} - -static uint64_t xorshift64(uint64_t* seed) -{ - // same as above, but for generating doubles instead of floats - uint64_t x = *seed; - x ^= x << 13; x ^= x >> 7; x ^= x << 17; return *seed = x; + + /* + // if one wanted to generate 32 bit ints, + // from which to generate floats, + // one could do the following: + uint32_t x = *seed; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return *seed = x; + */ } // Distribution & sampling functions @@ -47,7 +50,7 @@ double sample_unit_normal(uint64_t* seed) // // See: double u1 = sample_unit_uniform(seed); double u2 = sample_unit_uniform(seed); - double z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2); + double z = sqrt(-2.0 * log(u1)) * sin(2 * PI * u2); return z; } @@ -67,7 +70,7 @@ double sample_lognormal(double logmean, double logstd, uint64_t* seed) return exp(sample_normal(logmean, logstd, seed)); } -inline double sample_normal_from_90_confidence_interval(double low, double high, uint64_t* seed) +double sample_normal_from_90_ci(double low, double high, uint64_t* seed) { // Explanation of key idea: // 1. We know that the 90% confidence interval of the unit normal is @@ -98,10 +101,10 @@ double sample_to(double low, double high, uint64_t* seed) // returns a sample from a lognorma with a matching 90% c.i. // Key idea: If we want a lognormal with 90% confidence interval [a, b] // we need but get a normal with 90% confidence interval [log(a), log(b)]. - // Then see code for sample_normal_from_90_confidence_interval - double loglow = logf(low); - double loghigh = logf(high); - return exp(sample_normal_from_90_confidence_interval(loglow, loghigh, seed)); + // Then see code for sample_normal_from_90_ci + double loglow = log(low); + double loghigh = log(high); + return exp(sample_normal_from_90_ci(loglow, loghigh, seed)); } double sample_gamma(double alpha, uint64_t* seed) @@ -201,7 +204,7 @@ double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_di { // Sample from samples with frequency proportional to their weights. double sum_weights = array_sum(weights, n_dists); - double* cumsummed_normalized_weights = (double*)malloc(n_dists * sizeof(double)); + double* cumsummed_normalized_weights = (double*)malloc((size_t)n_dists * sizeof(double)); cumsummed_normalized_weights[0] = weights[0] / sum_weights; for (int i = 1; i < n_dists; i++) { cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i] / sum_weights; diff --git a/squiggle.c/squiggle_c/squiggle.h b/squiggle.c/squiggle_c/squiggle.h index 6f180389..100ea68d 100644 --- a/squiggle.c/squiggle_c/squiggle.h +++ b/squiggle.c/squiggle_c/squiggle.h @@ -15,6 +15,7 @@ double sample_unit_normal(uint64_t* seed); double sample_uniform(double start, double end, uint64_t* seed); double sample_normal(double mean, double sigma, uint64_t* seed); double sample_lognormal(double logmean, double logsigma, uint64_t* seed); +double sample_normal_from_90_ci(double low, double high, uint64_t* seed); double sample_to(double low, double high, uint64_t* seed); double sample_gamma(double alpha, uint64_t* seed); @@ -30,4 +31,7 @@ double array_std(double* array, int length); // Mixture function double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_dists, uint64_t* seed); +// Macro to mute "unused variable" warning when -Wall -Wextra is enabled. Useful for nested functions +#define UNUSED(x) (void)(x) + #endif diff --git a/squiggle.c/squiggle_c/squiggle_more.c b/squiggle.c/squiggle_c/squiggle_more.c index 720fdccd..766a66f3 100644 --- a/squiggle.c/squiggle_c/squiggle_more.c +++ b/squiggle.c/squiggle_c/squiggle_more.c @@ -6,19 +6,27 @@ #include #include #include +#include // memcpy /* Parallel sampler */ +#define CACHE_LINE_SIZE 64 +typedef struct seed_cache_box_t { + uint64_t* seed; + char padding[CACHE_LINE_SIZE - sizeof(uint64_t*)]; +} seed_cache_box; +// This avoid false sharing. Dealing with this shaves ~2ms. + void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_threads, int n_samples) { - // Division terminology: - // a = b * quotient + reminder - // a = (a/b)*b + (a%b) + // Terms of the division: + // a = b * quotient + reminder + // a = b * (a/b) + (a%b) // dividend: a // divisor: b - // quotient = a / b - // reminder = a % b - // "divisor's multiple" := (a/b)*b + // quotient = a/b + // reminder = a%b + // "divisor's multiple" := b*(a/b) // now, we have n_samples and n_threads // to make our life easy, each thread will have a number of samples of: a/b (quotient) @@ -26,18 +34,17 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_ // to possibly do by Jorge: improve so that the remainder is included in the threads int quotient = n_samples / n_threads; - /* int remainder = n_samples % n_threads; // not used, comment to avoid lint warning */ int divisor_multiple = quotient * n_threads; - uint64_t** seeds = malloc(n_threads * sizeof(uint64_t*)); - // printf("UINT64_MAX: %lu\n", UINT64_MAX); + // uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*)); + seed_cache_box* cache_box = (seed_cache_box*) malloc(sizeof(seed_cache_box) * (size_t)n_threads); srand(1); - for (uint64_t i = 0; i < n_threads; i++) { - seeds[i] = malloc(sizeof(uint64_t)); + for (int i = 0; i < n_threads; i++) { + cache_box[i].seed = malloc(sizeof(uint64_t*)); // Constraints: // - xorshift can't start with 0 // - the seeds should be reasonably separated and not correlated - *seeds[i] = (uint64_t)rand() * (UINT64_MAX / RAND_MAX); + *(cache_box[i].seed) = (uint64_t)rand() * (UINT64_MAX / RAND_MAX); // printf("#%ld: %lu\n",i, *seeds[i]); // Other initializations tried: @@ -47,33 +54,29 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_ } int i; -#pragma omp parallel private(i) +#pragma omp parallel private(i, quotient) { #pragma omp for for (i = 0; i < n_threads; i++) { int lower_bound_inclusive = i * quotient; int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below, - // printf("Lower bound: %d, upper bound: %d\n", lower_bound, upper_bound); for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) { - results[j] = sampler(seeds[i]); + results[j] = sampler(cache_box[i].seed); } } } for (int j = divisor_multiple; j < n_samples; j++) { - results[j] = sampler(seeds[0]); + results[j] = sampler(cache_box[0].seed); // we can just reuse a seed, this isn't problematic because we are not doing multithreading } - for (uint64_t i = 0; i < n_threads; i++) { - free(seeds[i]); + for (int i = 0; i < n_threads; i++) { + free(cache_box[i].seed); } - free(seeds); + free(cache_box); } /* Get confidence intervals, given a sampler */ -// Not in core yet because I'm not sure how much I like the struct -// and the built-in 100k samples -// to do: add n to function parameters and document typedef struct ci_t { double low; @@ -89,10 +92,12 @@ static void swp(int i, int j, double xs[]) static int partition(int low, int high, double xs[], int length) { - // To understand this function: - // - see the note after gt variable definition - // - go to commit 578bfa27 and the scratchpad/ folder in it, which has printfs sprinkled throughout - int pivot = low + floor((high - low) / 2); + if (low > high || high >= length) { + printf("Invariant violated for function partition in %s (%d)", __FILE__, __LINE__); + exit(1); + } + // Note: the scratchpad/ folder in commit 578bfa27 has printfs sprinkled throughout + int pivot = low + (int)floor((high - low) / 2); double pivot_value = xs[pivot]; swp(pivot, high, xs); int gt = low; /* This pointer will iterate until finding an element which is greater than the pivot. Then it will move elements that are smaller before it--more specifically, it will move elements to its position and then increment. As a result all elements between gt and i will be greater than the pivot. */ @@ -109,15 +114,24 @@ static int partition(int low, int high, double xs[], int length) static double quickselect(int k, double xs[], int n) { // https://en.wikipedia.org/wiki/Quickselect + + double *ys = malloc((size_t)n * sizeof(double)); + memcpy(ys, xs, (size_t)n * sizeof(double)); + // ^: don't rearrange item order in the original array + int low = 0; int high = n - 1; for (;;) { if (low == high) { - return xs[low]; + double result = ys[low]; + free(ys); + return result; } - int pivot = partition(low, high, xs, n); + int pivot = partition(low, high, ys, n); if (pivot == k) { - return xs[pivot]; + double result = ys[pivot]; + free(ys); + return result; } else if (k < pivot) { high = pivot - 1; } else { @@ -129,8 +143,8 @@ static double quickselect(int k, double xs[], int n) ci array_get_ci(ci interval, double* xs, int n) { - int low_k = floor(interval.low * n); - int high_k = ceil(interval.high * n); + int low_k = (int)floor(interval.low * n); + int high_k = (int)ceil(interval.high * n); ci result = { .low = quickselect(low_k, xs, n), .high = quickselect(high_k, xs, n), @@ -144,10 +158,8 @@ ci array_get_90_ci(double xs[], int n) ci sampler_get_ci(ci interval, double (*sampler)(uint64_t*), int n, uint64_t* seed) { - double* xs = malloc(n * sizeof(double)); - /*for (int i = 0; i < n; i++) { - xs[i] = sampler(seed); - }*/ + UNUSED(seed); // don't want to use it right now, but want to preserve ability to do so (e.g., remove parallelism from internals). Also nicer for consistency. + double* xs = malloc((size_t)n * sizeof(double)); sampler_parallel(sampler, xs, 16, n); ci result = array_get_ci(interval, xs, n); free(xs); @@ -159,9 +171,6 @@ ci sampler_get_90_ci(double (*sampler)(uint64_t*), int n, uint64_t* seed) } /* Algebra manipulations */ -// here I discover named structs, -// which mean that I don't have to be typing -// struct blah all the time. #define NORMAL90CONFIDENCE 1.6448536269514727 @@ -195,8 +204,8 @@ lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params lognormal_params convert_ci_to_lognormal_params(ci x) { - double loghigh = logf(x.high); - double loglow = logf(x.low); + double loghigh = log(x.high); + double loglow = log(x.low); double logmean = (loghigh + loglow) / 2.0; double logstd = (loghigh - loglow) / (2.0 * NORMAL90CONFIDENCE); lognormal_params result = { .logmean = logmean, .logstd = logstd }; @@ -221,21 +230,21 @@ ci convert_lognormal_params_to_ci(lognormal_params y) #define EXIT_ON_ERROR 0 #define PROCESS_ERROR(error_msg) process_error(error_msg, EXIT_ON_ERROR, __FILE__, __LINE__) -struct box { +typedef struct box_t { int empty; double content; char* error_msg; -}; +} box; -struct box process_error(const char* error_msg, int should_exit, char* file, int line) +box process_error(const char* error_msg, int should_exit, char* file, int line) { if (should_exit) { - printf("@, in %s (%d)", file, line); + printf("%s, @, in %s (%d)", error_msg, file, line); exit(1); } else { char error_msg[MAX_ERROR_LENGTH]; snprintf(error_msg, MAX_ERROR_LENGTH, "@, in %s (%d)", file, line); // NOLINT: We are being carefull here by considering MAX_ERROR_LENGTH explicitly. - struct box error = { .empty = 1, .error_msg = error_msg }; + box error = { .empty = 1, .error_msg = error_msg }; return error; } } @@ -244,7 +253,7 @@ struct box process_error(const char* error_msg, int should_exit, char* file, int // Version #1: // - input: (cdf: double => double, p) // - output: Box(number|error) -struct box inverse_cdf_double(double cdf(double), double p) +box inverse_cdf_double(double cdf(double), double p) { // given a cdf: [-Inf, Inf] => [0,1] // returns a box with either @@ -257,8 +266,9 @@ struct box inverse_cdf_double(double cdf(double), double p) // 1. Make sure that cdf(low) < p < cdf(high) int interval_found = 0; - while ((!interval_found) && (low > -FLT_MAX / 4) && (high < FLT_MAX / 4)) { - // ^ Using FLT_MIN and FLT_MAX is overkill + while ((!interval_found) && (low > -DBL_MAX / 4) && (high < DBL_MAX / 4)) { + // for floats, use FLT_MAX instead + // Note that this approach is overkill // but it's also the *correct* thing to do. int low_condition = (cdf(low) < p); @@ -299,7 +309,7 @@ struct box inverse_cdf_double(double cdf(double), double p) } if (convergence_condition) { - struct box result = { .empty = 0, .content = low }; + box result = { .empty = 0, .content = low }; return result; } else { return PROCESS_ERROR("Search process did not converge, in function inverse_cdf"); @@ -310,7 +320,7 @@ struct box inverse_cdf_double(double cdf(double), double p) // Version #2: // - input: (cdf: double => Box(number|error), p) // - output: Box(number|error) -struct box inverse_cdf_box(struct box cdf_box(double), double p) +box inverse_cdf_box(box cdf_box(double), double p) { // given a cdf: [-Inf, Inf] => Box([0,1]) // returns a box with either @@ -323,15 +333,16 @@ struct box inverse_cdf_box(struct box cdf_box(double), double p) // 1. Make sure that cdf(low) < p < cdf(high) int interval_found = 0; - while ((!interval_found) && (low > -FLT_MAX / 4) && (high < FLT_MAX / 4)) { - // ^ Using FLT_MIN and FLT_MAX is overkill + while ((!interval_found) && (low > -DBL_MAX / 4) && (high < DBL_MAX / 4)) { + // for floats, use FLT_MAX instead + // Note that this approach is overkill // but it's also the *correct* thing to do. - struct box cdf_low = cdf_box(low); + box cdf_low = cdf_box(low); if (cdf_low.empty) { return PROCESS_ERROR(cdf_low.error_msg); } - struct box cdf_high = cdf_box(high); + box cdf_high = cdf_box(high); if (cdf_high.empty) { return PROCESS_ERROR(cdf_low.error_msg); } @@ -361,7 +372,7 @@ struct box inverse_cdf_box(struct box cdf_box(double), double p) // if ((width < 1e-8) || mid_not_new){ convergence_condition = 1; } else { - struct box cdf_mid = cdf_box(mid); + box cdf_mid = cdf_box(mid); if (cdf_mid.empty) { return PROCESS_ERROR(cdf_mid.error_msg); } @@ -378,7 +389,7 @@ struct box inverse_cdf_box(struct box cdf_box(double), double p) } if (convergence_condition) { - struct box result = { .empty = 0, .content = low }; + box result = { .empty = 0, .content = low }; return result; } else { return PROCESS_ERROR("Search process did not converge, in function inverse_cdf"); @@ -389,22 +400,22 @@ struct box inverse_cdf_box(struct box cdf_box(double), double p) /* Sample from an arbitrary cdf */ // Before: invert an arbitrary cdf at a point // Now: from an arbitrary cdf, get a sample -struct box sampler_cdf_box(struct box cdf(double), uint64_t* seed) +box sampler_cdf_box(box cdf(double), uint64_t* seed) { double p = sample_unit_uniform(seed); - struct box result = inverse_cdf_box(cdf, p); + box result = inverse_cdf_box(cdf, p); return result; } -struct box sampler_cdf_double(double cdf(double), uint64_t* seed) +box sampler_cdf_double(double cdf(double), uint64_t* seed) { double p = sample_unit_uniform(seed); - struct box result = inverse_cdf_double(cdf, p); + box result = inverse_cdf_double(cdf, p); return result; } -double sampler_cdf_danger(struct box cdf(double), uint64_t* seed) +double sampler_cdf_danger(box cdf(double), uint64_t* seed) { double p = sample_unit_uniform(seed); - struct box result = inverse_cdf_box(cdf, p); + box result = inverse_cdf_box(cdf, p); if (result.empty) { exit(1); } else { @@ -413,7 +424,6 @@ double sampler_cdf_danger(struct box cdf(double), uint64_t* seed) } /* array print: potentially useful for debugging */ - void array_print(double xs[], int n) { printf("["); diff --git a/squiggle.c/squiggle_c/squiggle_more.h b/squiggle.c/squiggle_c/squiggle_more.h index 947b164d..2f88349a 100644 --- a/squiggle.c/squiggle_c/squiggle_more.h +++ b/squiggle.c/squiggle_c/squiggle_more.h @@ -32,23 +32,23 @@ lognormal_params convert_ci_to_lognormal_params(ci x); ci convert_lognormal_params_to_ci(lognormal_params y); /* Error handling */ -struct box { +typedef struct box_t { int empty; double content; char* error_msg; -}; +} box; #define MAX_ERROR_LENGTH 500 #define EXIT_ON_ERROR 0 #define PROCESS_ERROR(error_msg) process_error(error_msg, EXIT_ON_ERROR, __FILE__, __LINE__) -struct box process_error(const char* error_msg, int should_exit, char* file, int line); +box process_error(const char* error_msg, int should_exit, char* file, int line); void array_print(double* array, int length); /* Inverse cdf */ -struct box inverse_cdf_double(double cdf(double), double p); -struct box inverse_cdf_box(struct box cdf_box(double), double p); +box inverse_cdf_double(double cdf(double), double p); +box inverse_cdf_box(box cdf_box(double), double p); /* Samplers from cdf */ -struct box sampler_cdf_double(double cdf(double), uint64_t* seed); -struct box sampler_cdf_box(struct box cdf(double), uint64_t* seed); +box sampler_cdf_double(double cdf(double), uint64_t* seed); +box sampler_cdf_box(box cdf(double), uint64_t* seed); #endif