From 3a9a290ba89ffd9e6b129030d03db1b15fa38e33 Mon Sep 17 00:00:00 2001 From: NunoSempere Date: Fri, 12 Jan 2024 17:02:10 +0100 Subject: [PATCH] update squiggle.c to avoid cache sharing --- README.md | 2 +- squiggle.c/samples | Bin 27552 -> 27552 bytes squiggle.c/squiggle_c/squiggle_more.c | 19 ++++++++++++------- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 0bf64330..6e111a1c 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ The name of this repository is a pun on two meanings of "time to": "how much tim | Language | Time | Lines of code | |-----------------------------|-----------|---------------| | C | 5.6ms | 252 | -| squiggle.c | 10.5ms | 29* | +| squiggle.c | 8.2ms | 29* | | Nim | 40.8ms | 84 | | Lua (LuaJIT) | 69.9ms | 82 | | OCaml (flambda) | 187.9ms | 123 | diff --git a/squiggle.c/samples b/squiggle.c/samples index d2edfd2a7fe194c6ce14fe445759556f2787d02c..ebc321484fa99698e05c36b98b892cd5038eb900 100755 GIT binary patch delta 2696 zcmZWq4^R}>8Q-^e9Lov2i-?{XD~Ky_z;Smd=P_1?g#+H2wPQv=6hg2du?Y>*qBYin zDN(Znn>fC4MrWE-JDqW+?Gzf?sWwhLDhOIz!B!;|D*}mYB>urz@egdjcL$wGdoy>x z_kQpDzW2TF_q}(=uAyVsPzQ&i3R0NRr+~yFcNAw2&$vznC9v}>yI5_F(e^8E*N5?| ze?fAF-*~yZbC1O?qC#>Yu0G1Qlwq(Qb0(?Nj7yGf4z$)gjLYMushu@iDRc!isCh9&K|{Gqc-`;ly;$|3t{ve z9}(L76|EO{4O_4lkT_Xrza;BLrdoYLMSW`PI?g!|&>KueugmD-^g>g4a#ck^u(x47 z5SdGz#I?HhF+CWaIBJE20K<^rvW2X6lU!exZ99j&m+vel^OIAAOLt9E)kak99@ct| z-w`!=aLHEcsmE_w$S~d@*0Ybmz!?3~G|wPdF`j-&3R0FKmb{tbMg8Q%6lb$sDwinQ z6s~?I7c1JdtX(YGfLq2f)*2#yGP-pg2TltiqrKJ>yg16qxgQ7hDeP>oWms?|xN5DPUjuE4f|RW*{e zDGOIaQgX{U!mW_K?Ci$+qH_xB{*GSr1$PjKQsQLu%7hS`KdI)7?ieSr&LsPO!!(Tn z_&G>&olERLb}-C@AZ9OjkT;#l=o50tsh|cj$enXC@xzNAJrL-irJf7TezC>cu7qIv_5z`!O=5z2*ZH98=>S?)z z7XAg4)KaoR)+;0d%3lT-tPrjgq9y`NW8AcYAAzJ(sz=cFsw4RpH z2gd6#SM+_?lLr~g zSUrv`a5<5K$gTu7FAmD<{jM950uxo@0@?`H?#&i3P_Jv_6M_2*_h+TF8f^`{fqvVjgVqQH2JM}eO>`&iU;?B9SoXx|iR&YNO z4m3`KpeL8o4fL2u0p69%s&*TmmLCcvKWh_xMY0y#AI5~j#%vNuPu3sVBQd0D`N=u= 
zfDOkRC-RB~u6l?hbg{COmxz8#5>_rM&W3*+qYbRe;S!|7 zWTr#z82_XRhA-D6VevD0qpGi7l?<1&RMD=|_w}({ENd@HAYum48&u~MJ%%BC*7?E8 z29Ahxj(wcl#y0PTKKnh~*hIKROa#NMzYR}3U=_Sgjey(XsrwmV2R!}bfWv@{jbU=( zAIt%G4A29}d<7j3SOr)GDBc49_zQw{WM5MB4kv+isTOtE|6?>nc!uV);4ufI8Tnoc=lkXSim(p1q%pA8& z(@k|Qfawl-bO-$39Tff`ve;PIqJXR~oJT$? zvNi7l_tWsyq<}a5%!6+maD%{Yq(nWRSj}@G{+Ga0J^q(J4g5CXOQ=0;%wnFSasB~( z$E-c1!4~qIANXTZ+GIMY@E!(DgC`}hspMHm`)$uv%mn1o>l znZipm6Tw{q*|vkcTVTu-@Zx@UEyR}XyeRV=s6e-%lAj(YyOpe{08HP7sYp&K`4Jak z5q_~JS;6z`e(`)k{O(Df*5nbxylua#mrXs;SArA8RkQkgrNx1Qv3Df zosDib=q4X;yoyAkm7U2NUJ*9k76xWF6pxxdm8Q-_SvHaUT5EKm&!~^aSxI6St7-D*>2fU4I##BH+f?5nE9Yb^qY7!zk zn55fdBa>%3V`Cbeb|%iWsWzmIX*2PzXhEwHtRjgFDEMa;6a)>FsN3(|LED-3W{&TB z-~0am-uE4?mr?6wbc{n`H_{mY=LU&J)-Ypu;ACu*P$C;!cJ8Hjw-}tSOmEohS$(K? zovZ%$AGI|Eg&UERyff$3u%5*XgVm^wi`(=$snLyE&xo!o?b?1ye4hOHgOlIqb_>m4 zgHz#XC+b#XIXJ6F;f$;56XYsilZLS8O{0vwveGVlUShECszDi-lu7Kn86+I-Zt@|l zxwWGyHTWT| zhHAyPkgk;ZtS65Qr#K8=WWR9ROj1+Rgn?mQmz6DVB)ElJXc=O^Zd6@}zhjZW_T@ zS!PmX6SoC>VpGiiREkVF_6r9mb;v1FINE`Iw+tc#j+^z4l~Y>pZC%fQ>$}~Os8;Hd zGJ>1h5iK91;g%B|xRIJpy0k&ilE95u!;N4qdFo7FT-Aw{PH15wMwGBvF-W<6^=Ggr z!Q_FE)mW%KXTN~eI+M|U+6}!lV|B&P9_c#0X$N;1)xuVvtZX$Ig=pxTb{4{TicFk4 zcY1eD(;XyLbzr4mR4!sQp*`R7NJ1x{U!!ih$q9-3AT{Wm=S>DO=0yF{xDdT+O3ods zAExD`<*U^?R)!&aB>AozBs3x-*!t>%8 zW?V2jCgzbPmQ?fx`JqKZc5=v)jd*h2G9Nu4dis3ArZ0(^oXap1!tS`Vd47}3fw|-d z>5I@-vN=5y{fg{QPiCKrBOh88kaOwt*r{pKWr=6|r{7y*ZWN2fB0qLDSZOdBan%#3 z0ooubj{vG5ILM|XQLQiufNAgyrfyTI-K)KJlcIM01QX0Kq9FB2$|AJqGdOp+v}90~ltQnjP{yV+qZkQ zs1})YT=jR$x032U2g6VljU=_=5>x%}1gVMHk2Q6scTt~a=nr7;<#29qeSfu(=!Wt< z&q_^yb7@~*he6AO*{#2el~L^zI93azrvXxEFGcGwu?!P2E(+;qL6%t?OUg4+*vS~O zH^YLG$Zs2Sw&DU zg4!Y3(k*yt0Kxh>xanj~wP611h8M*O@kOyx+}N)5flc*)>TQ`27Ea^izdT54GM_@Z zWMAfL_U;hr&a7nBA!5(Uc78MH4+66SQG-p?pLQxBi=?OSaiOAfT&Rp%8%^J3Y1g<= z7WWJ}n6=m$A44Vdp@EJMIN4;>B4(3@iXdsY0tN70dIp^Nw2?tFo>dWh9zgx#0W7Ak z=pU>MXj`w5A6X0LZifqP;M(lgdI+4h?Hc*Y8Xx`(!9N&rPmJWMbuarymFUZO*SdIVS$*=8Y$VML97lywO z<#!wP1y544KWC>RNq_e7i045X@F$bQOXp?23|{C^{~2f?b4c3mmDQ;9NQRNs5H}Ei 
zG1~1ua%1Tq^NK<9_R~7u1@NFSOy4%xu3Swym52PVd>579g_N~Ya%q{3{djFn8lGU#v`4qO3cfh=^U&;8A}P-@d-wA&%7`A=kJ0;`$l z!$_qw3HD5*Gok4HFzD+DXl;NA^1xx{&D_Qyn;l>JSj6?Ry5fTr*ajt%(nEh99*RYx z@~9X*HiLMbY=dw>QT80IR_1fZ_Fy1oo(D^M9Ib5GZ|06hk@73kQQu3rS&HKFV7P|$y% z>)U`H19~WwnadD6|EgiPa9!y{lj zDVv;?1g2)dArf%fOkL*l#<$t$cpEE*^G1>9pux|8m||FZm+X+TVT>5Y4c}%E9U*^^ zR)zV21P38)C?Y6`aBE6YZ`!z3?#`9Y%qGPYzI)F)~TyV38F?lG8<(Q5QK} zypi4HBy7nJG)7)4*@=3|{gOh-JJTXRieNkdeU1TRCd==OneE{H8TndkK<@7hutq?Q z>W`7V>#gi28~Jp74_Zr<(i7Qx90Apu0AXH-=26v_8P~He-k*_gT25Xst3=1hH)YlZ zE=#})%@lQGM#+{%lFN65ZCVnzK0pqar=hO4)8(NEwYGI{utu_u6vEbIh4n5EsPvND NnhvD3p_e~m{{@P{_!R&E diff --git a/squiggle.c/squiggle_c/squiggle_more.c b/squiggle.c/squiggle_c/squiggle_more.c index 766a66f3..3211b003 100644 --- a/squiggle.c/squiggle_c/squiggle_more.c +++ b/squiggle.c/squiggle_c/squiggle_more.c @@ -11,10 +11,13 @@ /* Parallel sampler */ #define CACHE_LINE_SIZE 64 typedef struct seed_cache_box_t { - uint64_t* seed; + uint64_t seed; char padding[CACHE_LINE_SIZE - sizeof(uint64_t*)]; } seed_cache_box; -// This avoid false sharing. Dealing with this shaves ~2ms. 
+// This avoids "false sharing", i.e., different threads competing for the same cache line +// It's possible dealing with this shaves ~2ms +// However, it's possible it doesn't, since pointers aren't changed, just their contents (and the location of their contents doesn't necessarily have to be close, since they are malloc'ed separately) +// Still, I thought it was interesting void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_threads, int n_samples) { @@ -40,11 +43,10 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_ seed_cache_box* cache_box = (seed_cache_box*) malloc(sizeof(seed_cache_box) * (size_t)n_threads); srand(1); for (int i = 0; i < n_threads; i++) { - cache_box[i].seed = malloc(sizeof(uint64_t*)); // Constraints: // - xorshift can't start with 0 // - the seeds should be reasonably separated and not correlated - *(cache_box[i].seed) = (uint64_t)rand() * (UINT64_MAX / RAND_MAX); + cache_box[i].seed = (uint64_t)rand() * (UINT64_MAX / RAND_MAX); // printf("#%ld: %lu\n",i, *seeds[i]); // Other initializations tried: @@ -58,21 +60,24 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_ { #pragma omp for for (i = 0; i < n_threads; i++) { + int quotient = n_samples / n_threads; int lower_bound_inclusive = i * quotient; int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below, for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) { - results[j] = sampler(cache_box[i].seed); + results[j] = sampler(&(cache_box[i].seed)); + // Could also result in inefficient cache stuff, but hopefully not too often } } } for (int j = divisor_multiple; j < n_samples; j++) { - results[j] = sampler(cache_box[0].seed); + results[j] = sampler(&(cache_box[0].seed)); // we can just reuse a seed, this isn't problematic because we are not doing multithreading } - + /* for (int i = 0; i < n_threads; i++) { free(cache_box[i].seed); } + */ 
free(cache_box); }