From 3d3a3f0045e248ebe2608ddf55c6fa5ae30e3b93 Mon Sep 17 00:00:00 2001 From: NunoSempere Date: Sat, 18 Nov 2023 22:50:03 +0000 Subject: [PATCH] tweak: try simplest version, but notice it doesn't work $ make time-linux Requires /bin/time, found on GNU/Linux systems Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples Time using 1 thread: 34.50ms Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples Time using 2 threads: 34.60ms Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples Time for 4 threads: 32.00ms Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples Time using 8 threads: 30.40ms Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples Time using 16 threads: 30.80ms --- C/alt/05-refactor-split-array/out/samples | Bin 22560 -> 22688 bytes C/alt/05-refactor-split-array/samples.c | 42 ++++++++++++---------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/C/alt/05-refactor-split-array/out/samples b/C/alt/05-refactor-split-array/out/samples index 7157dc0bada9100dee55a618703db13977e02f3a..a06fd7e01b4dcfbe5c4e4bd18e04fa28185db706 100755 GIT binary patch delta 6346 zcmZ`-4OmpynLhV2gEJ`eKB*Yet!`}u6r)cXgK1;THf=R&>r%2k z^W6E)_kKU;obP<+=iYSHt= z=B|ec{TFe0>fh^1gu9GTb4+Ezp9^pQacOqJ!zHaj;f;sRN9tbxk3qUM#^;>?Z~CZ+ zu!H#)8Ro=uuIB8QJ%UZ3@pa@>3Lwivp zTCKt$QN<_;pE3=>qG44=zcCRlPKH;9eGF=LGLd*nSj&W%=qtt^-_rYwODfk?*VhMY z>dA)Snzd;So73i#%F4R>;HJjf%C+l%L~6DK8|l{A)cA(Rbv5<%r1r`BU}Ft^Db_D6 zrN52M{~;0aBP50xh!Mdzk};@@plAiuto<{MIIbx`z8*ZA>pVQYQ>Gc1bRqhvVP3`? zNdf13eT+Ad0IeRA+-xGn91rc;hq)d@9_0M98XrT}foFxa^DZd!$x5RuXeQ{-X0D$Q z492r``A5z-O=NOW^Uw74q%$!FrJrSY0v->~DqYY{?WV=9u}p`T$o$mi7IH9$r)cYLTaEdo1ApmG%mhw1R6`<13KG0 zE`Iw6)I{$9T@7^i2sDn~V|Kd6=}m(k93i#PN6b!o3^wf}P#X=Ko$>Y&Hl6em^iG?{ zx%iHt$$CfFBz6p#CmHwJCYvVa5^`OgI*HD-U75Gu0?q4vf~u;!2dydJthZ|Xz;`;& zCi8mVC7p-i8;j`c_J^BMdk?GL>JAXXEW!7T;(s%W|K%wDjZyr+kK$h*#s6dk-^>~e zj{q3oI*Q*mif#v0m~)66IvHsJ69|2JG%w#ulyJ^pu-_@69wl@t z5=11FP(%s;gz469v|j79x{1|hQd&A~R^P`~-?>3+#RcoBbLCcFH#lqJIqRu2D!>zX zy76==+1*X=DdBlfB@=R-W!4e)H)ws-*b8LP`cP-Yfet94Ymxg9ai!&?DWmfddG!hU z{&;8klJV$4IR1T1Ji6wCqMG=#dkML*8|)cWl(4azk?f(yM5WF66sxy(Hk#FoR`+%^ zL{4xD{R5i*yuEaxBS|Qy6^@h{8v&7~S%^wbCaWr;zc!9j+H&TDi3IPcYGfUnJY_7P zKXqga+dDpXtQKR-(4&?+^go?x;@Ug(56)TAv~N{aZcFf5ea5RU=A~!QYKwo%MN_2d zmS=&;p#(3YVQg^GQYlUR_BL&iipAmE^aIH&o|ov5bYI--w^jAJ^+98oM2lV1##T!_ zJiWK+Q?8_Gi!{E%_ybTxN$MLILiaYn=xRr8n$`U@8!e$1r3u1KdP^!YpMjkDyF`mz z)8lqvwQJGtr`ue~3u`sD!ng)h^JZt>p2OcrZ!F1MZ$7nNE)Rq^sd8CScb zK|e`O7rPuZK4pfu&q1?N?u!fFhz@s$gN9S4jZMX_ppAF;4f=LUikNqU{wZaV*gl@7 zxIN;o@ifov7gxuvcJeh-eGhn73G>$>$Y4%SH(hKgH$rGV^^_U)J78tA~ zscj4l5`RH+-IZ~hu1B3cW9QD~yUwN~e>G?OuG1r)84G?rfMCyu{?#K^cOFD3-;^q$ zZ{g84Xtfu44UyZ>fH|rF3`nf5$mau8N=;7r6(W++3pC(0)M7=l4zvJ6r{iC=(bCl7 zxFQ&{nO|o^PbN*shf2U}Vxux>idb_E14yb#^k%AM%>I6K#xX$0PChG^_S4@?$)b)a zIpS6eT{$IF9QukrGi9Fm_%#StiAUn;1h1$02-_><2jtM%&^fsryGkj#o)s$d8V=mL zt%MrAw$=}!Qf|V+t?%u5j&#iqNaH zm3eJSc=bOSl(ip*WwoV8upasfg79)x2?yRWg^HTJNk!p%KVo*az>F`$1>Y&RdUa18 zzTu!_NdX}uXZLJNRls-(+tQ1|b0zF_g)OHHvikc$*=k=hT$DZ7bTtsO0iB9j?KKs) zykjcD&}>a_N9E&Y)<0bV_)w8aPu|CYr1BIdUa(Q9sYq?Ag4VNq7dxobIQd%_Qj&{_% z-?3|Iecax-s=a=xd3n>uH8u5~&9$C}>Wy_9Y8pJjhE3BOJ?nx`1RH{DHq?05{MfVU zu2SRr`kLx>4YNJfYc|(61~)y%ulc03X``>9ZbPuK5*N$rA6M4bG&F5!gyQnbQn|RK zsGxL4`rWE0$1}Ayj{Lz|NOS!e`h|+nI)760Fov6gC+p9u>c`kCE~;t;=z7pP&@Go# zwG%1m`&3mW@X}|hx)am~`VwdsXglc6zp83K=;_Z@H3=8d*DkAS7U<51s+NJ8`f#}b zJp%eF=n&`;ku($OimLt=zy(kbPG0G%suqE60bLKeA9O3|Y0%xECQR9@pcg-A5Zy?`Jp$sTZd69G;ytHPey!G(v^Du2$Gl`WBw;v@ml)9I}Mmz_a@^ zdOmYnss}D!{fnxe2SgN$N9>NXm$ktO~j)VH=7-HE>%>JDIV9xyf7f`K3 za9?LeD6H+*fQ>_-=W0k4I|eKZ7<0x-37%eHfy?yGIm=_c2zC2q8asDda|x;^p;^h) z6pLS6fo{lSHP0I$GhymjYOumu+YT%NSh|Kpu@`{(fz9U_YxOl?tATYhj1BJ>(cv*4 zCn2+8>K77BJuA%M0>FC#USI$-I34W>lih%fAT>T2DMPG0;BA=hlaRSFJW^n>wTMeC zo|sr6&SH~UK=T04g-$0@$IbjKvDkKr1r{kBQ(*DXne%2gZ-MwFWX>(-0^;Z4LfRDr zKVd_@#rK?1w)mfkl`UBy%=3$@cO)kUXs&= z2({-sU6G8@8tfG~kw{d|cVh5zeqxN1_~_%{vS=tj9oQ@RiS%rKrrnbewaw>&m`X3f zqaUD%Y|b0`$^pV$U6gXX{F(n?B$vn7RNT}xFRrX)_Yk06h$2Ll*n|p3gidyM0dynJ&9Pj2% zKhnGna_Pm1QJC#4ti+)wY%HC46JrJ2l2)>rzKCSxW8^S#QF)0|n8*GLS8`i2B`Vdn z5L6%4ah`yD<&noYzMc0_-$omlH&+X!cJceuvrlXB=j=b@6m(cjM#t@UdW$^Thk17+ z%qrQ4iTwXOyMTMxcPlLQJxiU1^Ozo@LD(8W!LFp00B{zggFJTnVY7ncbrYg`a}hIE zs_4K{r>lWW36Ee+R7hHwv^fD%8k;2tk}qJVaSp_<9{Z9Kp9 zE5}0Jqp?`86;`8W7r(!XHfs3yvu(%U!S_*NnbX*=`_1@<_pP7zS-+H=W*QhvJ6@_; z(dX}Xw0fD-evNs@tm~iPO!V<(vpYJLH476)ek9Gy)3}T@e@8~q4sj~`I0~$&Fn@7n zW?E+2JpOs~Z!mgz`SY>SucsZE_y0)fI9TElkl9@;a>YE6iYwFX`k|%MRw}Pdx9{Rd zqE7vE2hdKA-len)sD5hd^a2%1)9w1%s?+^cE^QWZmFzA}Gw1PxGmqXbT`<0cb2_yN zG`}p(Ud2;Tr-^h&S-M%@e74Y|WlM1)3gzi`efQSsFqK!hX;rz+uAh}URp^fLbhCa6 zNc3p=0<->2)kANC(@zc`&3<6P-Frh;ZglES(;W|NPS&T&mKQ~BQ4;;9c7ua@R#nAs i=dUGf7tNsIRkPwcmqk@#jI?{zeDjI>qwGY4-2K0IA^AW6 delta 6068 zcmZ`-3v?96m96fXk!JLn5Bkgq^cl$`Aq-|lAV69eP!9vN$Or^TAdE30U<`@)7)vpZ zSq3cQ)kKUJT*o;gpA$LBN#tEN3sH_uauN{cr<`@Xf=xKsu^oZ=jQ9wF_z3gqz13YU z%VM{VreD4LUcGwt-m9u^wH^>!_Y1pSj;;|zZDkrEwR^_=I$^+~aRZh;pN_X&Ow+iF z)FF7og>f3EQ)1V60fdQsevS84e3N#I8Xx0SAIDfWu8fZ>ni2gFPvot`r|g-cX2I;q|0qcuhsl}#?%WRwLXJ!M=W^fH;y zscO_h(z?J%_nF3-?!ZhQY@||4+@I3x)_r@`_?I`1pG;P)*tB`whWa%tR;~Rm?TyQ& zS@B+45g%A266Uvo7>NnLIP^qNLgGQqcoJe!HJH~TSqq+}hJ~u@sRAfk$S5p_VPzi1 z8;OVBkIxFnlW`o6?D-*55l2cm|1TDP}T{>!+?;mrUT@ zvz%`j$>gGHpXuv0VP*{4eiqJD{7QJzb-^b3o_VH_K(Cvf8A(i&x4!vWhZT&2W?7sW zdO{e!mP=VE1o|HqC*5jEO5QRAHPX+4?gQF11dXG~2~PSM(3T;nnfieyB_t)c4M7v> z`UGbNRyo)X4v|{vPcfzrV>*YRcKR2fL1XGKTJz)Hw&$5A%pl~Jl0T6?ZNCym=kw`= z*m8n|^xLw(0`DKjPa4KQGmOt4#%B%V(}wZML-=$yp=k)f_(y(P?pPLIEtWr@FdDVa%c_bgXY>4dv8@Y0*EMOz$kz;b?Wb6jg9=Q^VT#}>y zMJ7UGUpNRU`g74~S%tQvK28QbY8gf9JW1%A+2~NjjI~sfYv7)S@wiIXe zC7gD0GY zsj#*^BlS5kyhK_oRU_1ksjl?md(vTRrv=OCEm4 z(o_GuzVwW|swdK6n|DSIAGv47 z`nci3uo>B0;lnKN%aC?yx%9lW;%H-sP;|QIFRTIvcbHU#8c2DF8jemHC(zf^M~nRr z=)b3rb-jNd;cia#*a}U{Tr7RZQFqDXT(mcRv{i8uB1KX?{k9s@RVRJQIbQtn0A1v) z5O)pGpF2I`JSXjRek-ALKv8blmYRy3G|N?x_(?jeUes&t0G;Q`EO=k#YfQfdg|n`n zJ@-%-5AnF#zGl2F=MCh65W3%)E^bIy4K9U@8C;OA8oZDDST(r)9y2)g3ze@ieF~}? zJa88VGlv;uN6XH;+ty0e!RQ7>stQH(E7IuBjAtfafw~+yD>oh$%1e(ooPiDVVeBZP zg|X*p$iH?>mGq?FZI5NVEQm#S=<-pcMe`l{=BRI{7;nQ;>D7iC%djhvgP#6=`e4*V z+csFx_j_!|ZR*d=7Ekrl<(U=YcSg{CnO?DU1pPd7jA$J}%~{_{xPMDq@J$C@o>h>z z7l((sV0S-#D=S;wTD*OA!w`e-aCE87cOMO{Eq^=oW5 zI+zC!tjzw>WSfmX<*rEh?1tvaU#;+DRC)j^sB^Zgt*Rml%g~297)4LGt??z$j{V{W z{k!{P@ylKs%`2f_=K96kNi;8Sg4lAMF3y`I;*}(tw?yyCe$NY`6zXWIhlIx0sQ-0@l+Z+2(BXZ7 z54y$p)qJ5SeS{WF$Xn`0c=T6IrpewE>+y2|?*`Qy)7gbzD@^h)7xfjj71&RJ>2GW7 zM*basE%e<9Q;KEuZsXU6V15TdB(tsde-#5(*IR~w)g3hktcANwlGXQSoMbI&jGtqz zHI&ialhTuLyhNmH2&jp=)UZ2@+c0ZhL$VXW0GC91+Q2&bXKKKojz2Ez^hImpO0`nWhd zZacaUuhG0o)5FKmJ&T%p2C*2P4|y;jGRsHMx?K?8Pq3B*tv<W_&-ZTWQW#YjEt81pJ3QvO&PlN968w&mVB+10;_F#MN% zC%P@_Ckiw+(Okb*I7Aota~x;$nMp!2G(voIhd)Osq3`;=xoxO&*4P&cYn^PJXzV^^ zZ8g2=_c|))YaF7jwn|zwJtw!#ta0r7g|!&(aF}$^m!{9aJO`%d2ubwR^cRIPDg{z# zhiIggGd8uKo#7HPbO$sO+S7ngMqdI~Nh1L-=6E-dMn4Wr z7}L!^+5&v}ESw@r(uX%9*3{hyI+dOaICE#C$E0NFryel2f7MZR<;)ZsoSC}vB_IMB zN$hN!sC|uYSNrLMUzr+8$gk017W5qJc)O~PweYV}9Y4+d5mc`-&Z#=7#G~&XuB7Tv zU(gl4p(1Hd%nBFrhu93WZecY~o}4vP#Z!p>&kjBj@v3%QM6V?s--?P8k?xa0N5(AZ zjKB)Jc*XMKB-QGFEnZ~LtgT$9FH^IC&mXl#fukP86yYRYFv}^t$F$4;0Mw4k*JI3; z$q{a)J4-|KuzbXEH$RQkg9R#QRBN}m)XwMD&p8V!gZbUZN9jk-e5Nm=68U=c*+@(*^(;YmImbJA9qBj48jkPhp6f^CW@atpbx0lJlhJ=io$*)fS#%sa zEGC54o$l3XZrR6IcYq5nvr$<_;kUEeKg%O$5~#?^w^g4X~S_mscj~3HMgTQ zUcWagIo^uT71oLoGuBqnn{%8Qo4K@>NALwrNM2{sa0;Y$>>$^$mHd#8>flG-89wU2 z_zLy^BI>dGIoI!VYl=DChkYF1&10=!Q6FgY(hp=u#!;@rmdu)ZeuH*04_BV&QXhMN zLQ^ju&_3WSO4Ei#sbAOUGpEW6RqhPub7|{HP5OO}kR{xj zJx>F2uzuExxPBL}E&Yw-pMX2WCwI$i6@Rj-p4IRXPi*#$v}mqVh%o1h?sF&HJn{NV z#Jk*yZoZ53B<-Vn<~nJ|-2B8%D>tvKd!epw-S=N;|8(vTg!b1$+eMl?|E>0;^Op6y}zr%FeL3aSv zZ#tb;(j$w)LLc2xl|yT*>=wI2i>sZ!S~bPt=A2HuRlV*iXz4c // https://en.wikipedia.org/wiki/Xorshift // Also some drama: , @@ -103,10 +103,10 @@ float rand_0_to_1(uint32_t* seed) x ^= x << 13; x ^= x >> 17; x ^= x << 5; - return ((float)(*seed = x))/((float) UINT32_MAX); + return ((float)(*seed = x))/((float) UIN_SAMPLEST32_MAX); */ // previously: - // ((float)rand_r(seed) / (float)RAND_MAX) + // ((float)rand_r(seed) / (float)RAN_SAMPLESD_MAX) // and before that: rand, but it wasn't thread-safe. // See: for why to use rand_r: // rand() is not thread-safe, as it relies on (shared) hidden seed. @@ -142,11 +142,11 @@ float random_lognormal(float logmean, float logsigma, uint32_t* seed) float random_to(float low, float high, uint32_t* seed) { - const float NORMAL95CONFIDENCE = 1.6448536269514722; + const float N_SAMPLESORMAL95CON_SAMPLESFIDEN_SAMPLESCE = 1.6448536269514722; float loglow = logf(low); float loghigh = logf(high); float logmean = (loglow + loghigh) / 2; - float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE); + float logsigma = (loghigh - loglow) / (2.0 * N_SAMPLESORMAL95CON_SAMPLESFIDEN_SAMPLESCE); return random_lognormal(logmean, logsigma, seed); } @@ -179,8 +179,12 @@ float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint3 } // Parallization function -void paralellize(float (*sampler)(uint32_t* seed), float** results, int n_threads){ - +void paralellize(float (*sampler)(uint32_t* seed), float* results, int n_threads, int n_samples){ + if((N_SAMPLES % n_threads) != 0){ + fprintf(stderr, "Number of samples isn't divisible by number of threads, aborting\n"); + exit(1); + } + // int n_samples_per_thread = N_SAMPLES / n_thread; int sample_index, i, split_array_length; uint32_t** seeds = malloc(n_threads * sizeof(uint32_t*)); for (uint32_t i = 0; i < n_threads; i++) { @@ -188,13 +192,16 @@ void paralellize(float (*sampler)(uint32_t* seed), float** results, int n_thread *seeds[i] = i + 1; // xorshift can't start with 0 } - #pragma omp parallel private(i, sample_index, split_array_length) + #pragma omp parallelz private(i, sample_index, split_array_length) { #pragma omp for for (i = 0; i < n_threads; i++) { - split_array_length = split_array_get_length(i, N, n_threads); - for (int j = 0; j < split_array_length; j++) { - results[i][j] = sampler(seeds[i]); + // split_array_length = split_array_get_length(i, N_SAMPLES, n_threads); + int lower_bound = i * (n_samples / n_threads); + int upper_bound = ((i+1) * (n_samples / n_threads)) - 1; + // printf("Lower bound: %d, upper bound: %d\n", lower_bound, upper_bound); + for (int j = lower_bound; j < upper_bound; j++) { + results[j] = sampler(seeds[i]); } } } @@ -247,14 +254,11 @@ float sample_mixture(uint32_t* seed){ int main() { int n_threads = omp_get_max_threads(); - // printf("Max threads: %d\n", n_threads); - // omp_set_num_threads(n_threads); - float** split_array_results = malloc(n_threads * sizeof(float*)); - split_array_allocate(split_array_results, N, n_threads); + float* split_array_results = malloc(N_SAMPLES * sizeof(float)); - paralellize(sample_mixture, split_array_results, n_threads); - printf("Sum(split_array_results, N)/N = %f\n", split_array_sum(split_array_results, N, n_threads) / N); + paralellize(sample_mixture, split_array_results, n_threads, N_SAMPLES); + printf("Sum(split_array_results, N_SAMPLES)/N_SAMPLES = %f\n", array_sum(split_array_results, N_SAMPLES) / N_SAMPLES); - split_array_free(split_array_results, n_threads); + free(split_array_results); return 0; }