From a48b15f171a7ef44606f61ab60879a3c91543e66 Mon Sep 17 00:00:00 2001 From: NunoSempere Date: Sat, 18 Nov 2023 23:25:39 +0000 Subject: [PATCH] reorg, get parsimonious paralellism working, to go into squiggle.c --- .../desiderata.md | 0 C/alt/05-refactor-still-split-array/makefile | 107 +++++++++ .../out/samples | Bin 22688 -> 22688 bytes .../samples.c | 41 +++- C/alt/05-refactor-still-split-array/time.txt | 17 ++ C/alt/06-refactor-one-array/desiderata.md | 37 +++ .../makefile | 1 + C/alt/06-refactor-one-array/out/samples | Bin 0 -> 22472 bytes C/alt/06-refactor-one-array/samples.c | 220 ++++++++++++++++++ C/alt/06-refactor-one-array/time.txt | 17 ++ 10 files changed, 429 insertions(+), 11 deletions(-) rename C/alt/{05-refactor-split-array => 05-refactor-still-split-array}/desiderata.md (100%) create mode 100644 C/alt/05-refactor-still-split-array/makefile rename C/alt/{05-refactor-split-array => 05-refactor-still-split-array}/out/samples (63%) rename C/alt/{05-refactor-split-array => 05-refactor-still-split-array}/samples.c (84%) create mode 100644 C/alt/05-refactor-still-split-array/time.txt create mode 100644 C/alt/06-refactor-one-array/desiderata.md rename C/alt/{05-refactor-split-array => 06-refactor-one-array}/makefile (97%) create mode 100755 C/alt/06-refactor-one-array/out/samples create mode 100644 C/alt/06-refactor-one-array/samples.c create mode 100644 C/alt/06-refactor-one-array/time.txt diff --git a/C/alt/05-refactor-split-array/desiderata.md b/C/alt/05-refactor-still-split-array/desiderata.md similarity index 100% rename from C/alt/05-refactor-split-array/desiderata.md rename to C/alt/05-refactor-still-split-array/desiderata.md diff --git a/C/alt/05-refactor-still-split-array/makefile b/C/alt/05-refactor-still-split-array/makefile new file mode 100644 index 00000000..f7df522c --- /dev/null +++ b/C/alt/05-refactor-still-split-array/makefile @@ -0,0 +1,107 @@ +# Interface: +# make +# make build +# make format +# make run + +# Compiler +CC=gcc +# CC=tcc # <= faster compilation + +# Main file +SRC=samples.c +OUTPUT=out/samples + +SRC_ONE_THREAD=./samples-one-thread.c +OUTPUT_ONE_THREAD=out/samples-one-thread + +## Dependencies +# Has no dependencies +MATH=-lm + +## Flags +DEBUG= #'-g' +STANDARD=-std=c99 +WARNINGS=-Wall +OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version +OPENMP=-fopenmp + +## Formatter +STYLE_BLUEPRINT=webkit +FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) + +## make build +build: $(SRC) + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) + +static: + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) + +format: $(SRC) + $(FORMATTER) $(SRC) + +run: $(SRC) $(OUTPUT) + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + +multi: + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + OMP_NUM_THREADS=2 ./$(OUTPUT) && echo + OMP_NUM_THREADS=4 ./$(OUTPUT) && echo + OMP_NUM_THREADS=8 ./$(OUTPUT) && echo + OMP_NUM_THREADS=16 ./$(OUTPUT) && echo + +## Timing + +time-linux: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo + +time-linux-fastest: + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=32 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo + +time-linux-simple: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=32 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + +## Profiling + +profile-linux: + echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar" + echo "Must be run as sudo" + $(CC) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) + # ./$(OUTPUT) + # gprof: + # gprof $(OUTPUT) gmon.out > analysis.txt + # rm gmon.out + # vim analysis.txt + # rm analysis.txt + # perf: + OMP_NUM_THREADS=16 sudo perf record $(OUTPUT) + sudo perf report + rm perf.data + + +## Install +debian-install-dependencies: + sudo apt-get install libomp-dev + diff --git a/C/alt/05-refactor-split-array/out/samples b/C/alt/05-refactor-still-split-array/out/samples similarity index 63% rename from C/alt/05-refactor-split-array/out/samples rename to C/alt/05-refactor-still-split-array/out/samples index 0c7c222a691a6a0da00145c22eec6a53c3c84441..531df6a89c5ec5ad88787f67e4575b89474f589e 100755 GIT binary patch delta 3396 zcmYjT4RBLc7JlzFfe_lfv}sAw-)IAE0io#+Hnx?%HZ3=pVigK?1;id83b_TSL%Q~z!ZK1$ch3)`#{aHW?uN2y15DF-7zni4$n@PU= z&OP_MbHDR*Z|4Vm=LdYdO_)gNNFFl|TeO^_`>d z8~)t)c~?u!H#++XOGGofI)a7>aP3{23^!-Qex&0#A|EF55!Fqb5^|P{**#+RNipf9 zc;uKC66<>L>%lKb+&#^o6QyuZ8pnOc!^%PS7ZTU%Cm;!lzw1^lnx+wXTz&j5(BEfp zb+3_D_qM-cnN@ov5#Leb9u7lkHWam_o)V6`-cFR#5e$f;KhMBD*_1>A`aNvi*4;Ei zJ1b`ViTT_cMd<%CtY7#c{kHNV={X8ST=TqI;xB!a4T(J56iWi7EAgT}_p_#{ z>+#7W`lslZ7B}BXXC$T=_CYKkRv+n_pGbsOo4WE&_+7gV2nH%_yKnyNJ}K_(fH`sh zQU16RYjz*s`p3XSsU+~M*%zoc_yY6Vj=iv+)DQYqo+u@!pvn=xhbW%Y2BOS=(98cs zWRKIJNSaS6Xam`tm{G>LzHSgRJRDaYC~=a&R!-7RE%EaIBmQ1pND||x``n@C3&ele z;024MxY20<_5MEFNBG|#qBD85DOZPmj3Tl7PlQN6u~j8HLh{D!7AED$rcG3L4O%*e-GHTo&F1&f*HY^f_Fjh> z_h|F@<39NkNF1Tmm{uUB!+O8RX=ZCx=|@_fmd388BZ8r_#vYIaXh%Q&(du! z1^+rQ{+NDMe01^O=sz;%r7wl%=%J}yuwWAZ6iPacFphJEH)mhi*q$4eD zI!w7vKeMX8y6bOEmAEWyTg*X9@9<2tQJ0l|uvh&Gxh+tcN!6&IEEJ zg`U30G<|3szEn)m(R;oS{Nwb^%pw}Px0D~CuFQO)XpA;w7V>Y=H!|x4Lma)4nb|6z zmTILHwaek{79zJeZL-hlTu%bJ+1ZE@tECOD?t@&X$nBPH-eslLHYm2`$Do$#$A}zS zs_3q=XcD=fDVx>7tD5GO`zzaSh`atOV4@z?Xnn#?&0Q95@>;i4v0`YUioUJ>%vrwK4YaU`TO|rvZte~JhG$Gef4xX(hP_m(!y^UZnz6tZUwM=~$w!+@l!CUA!*@}{C z`A((lDv=wV*_E!r-9Av03j!o%rRG&uoG}o2h0~@Gjnq#lHFqWX7 zq~M4^f)@p7IpyP|{+Ks-PRC}<@+l&J=as*d+_y!gvlYAY1}sEbY9{V4h_cKqiL{%z zkBc-Y8IFqE=Ah~OTb#}q@ud_D+18@9Cz@l8VtlNL_=9Gj`&je1q{Pm3H#&=&mW9fZ z_n-LcgBZl?{2oQ(CHLgkMb-6ZD|1fxgF-|!^(dSYrf;YwKc($a}xft zWZ%Jro_w9e_wfC`7{rmpUki`H^fC z+b*$8^Y?A>$=a{2d87~Otnz&F2)X$cQDS%aG+zlV39X&oEMgTw5^ z?7$LlV6oaNV=%fql6Yxw4~fTi(B{Wk#gb|0m(&VpRc|<_T!Uz0b?}AzZDK7$?fsHi zWoky)+PIp9FN_qhbE9n(=kzGDJT8Rd`b5I|HJ`FW9a;Hb+!NGqp&CBw)n1I$1^D~5 z*G}~oX_ZeY?V62hSm|Ze>y>|$R!H(swM4mZFT~4zIX|PC4eB#$*Z$d?46o!KnfAPw zIZFp}J*_r9#~IFRngbu61hxYW-)q`lpbf}w85@Q*Z4y{?5#dEuY`LUq8-aDIrtJh4 zjcD3F;6dO8;08E43ET&aMQb<;bO5)5f+qG3>;z_C(N<}PFd$bn?F4WO@B;84a1yv3 zwU~%XKMKqS?i+&x!0n)R9eYPJ+{Wh7zmEd9;4~`3hSVGMqH8XHmd58j!>^++=2?g7Oi5b@aWw#nU!mmjfIf%gc%2r03LfQS=~D&s zcn9q$NJ&}`64Rq<%-jIfHqu~$W!6s2EQ-!BL)S5LP^Fg(l9L+2bn$PdoFMCUYA;Ml zTMe$iVKcKcX7Z`Dxv;p^2U#Cvlc4r(AXjBFzburSY`bDA zO%6p@Zpz-NmrS`YO_NMT{^=gm0Y2tk!DuS-m~tgkcDcy`-Exz;++^_1#BwKoEL26` zXxe^^nZ2EYWU{>!=}pvS4_{9TWMK^!S0COwWAXp!9YNaYUq;{ zR=%A+SFwaY(G{$?pBGMK(c7LX+#00H&B^O3&`W49%4y04TyTE=RHKOuF1h^Q4jhSetUhhJ?BRL{|K03Q9YN(xdc&);Og}(3o m)ZQ6YnyQ2TsEKu5FD`903a=$mp>~0BC@W$E@q3`QA?E*O+OeYm delta 2878 zcmYjT4Nz3a9e?j}r{{_H?%?3w!Eq2d;(-=CJ`@BJ9~|dii_xekiTKf?H*H4~oq`&x zfee`R&3TD-XIh-jI5d-K)oGg;+Uj(i2qLIcbbx8BR%fcl=)*5CV7Qp*>u=v3ZFlDG zxBK7!{`UX1yRY*&-+7!rAPHRZ!L&zaV=AXNV`^6tzgntl6 z{tKGuJp(~6em$h@O!LKY@iimw=D5>5Xil;}CK=BFNu;D5FfP8{t6MwN+Dt6 z`>emOyJ?1YL(FVtJiT#*{!v}Of)xO;a~v%YI`WJ)K;}AE4hPr<*z;R{B)A%Vj4*i`O51Q&OwG)n)ge0($(4dT8>Q z(qA?t?jHoH`0*uvC}Oab^@x$<7>BD}B>Z!0Z8-aoJo!vkE|SKfwjQ2D=KYg&_{ERa zpwdrzZU{s%cw{V=BZ22j8~bYc?%Hs*$)7tCj5M?wA_0x?gDrAohejg)b0($9V<(E= zV~WVn$>d;+!0(a7%*9;T@3B`!>|TP7R2!*~P|4UfEt$b{&RTy_I8)JXFj z1$;Gq-7!!4`WsD?!*-A8HC~fgO#8+~>#UC?de|{%)=o%d#qL3|7#r!Zqd>S8r3unf zVIWFZNFJfzL3c?%FuxVmw69P!#~t*d^y17{9r2>+jMC?l%aN~->y3Fp7$=PkO(Itn z)aT3*?xgArhah1LL#aB$cAN#h?7Am)hSmDG-nas&Gd%tp3=PzlHk;o{i_`K;@<2e8 zE2OQL50>>cUuHFW+Kl|gA~Et)EC}UKm8(88vbyWnX-z!8ohG`xJfS77#w7m)Xv*$4 zkFKp%?jnXGMf8}fSiFSQ=lmWk_&D|E1bygo^ER58@rrOOi8f>u2*;CXBqNu9m!8O| zol!O(a|XojIc6Jz<0Q7!7YFE?%(?D+k2UT7sVOhUMfX$E_CN|xhknXoEA4i#H@^fG7RepfIGOxk zGRG8Ih%U^^TlkwX#Bds8Jju$Av8^P^w_vo6i?&*iLA?MCtVS0C14^Vv{p%R*$jZq0 zy~uICK}h``Lr5I$W(NEWGT1!pJvyBAirIy=%#Kwe>_|_of(FW+`7~jP`&2!KFi2=3muwWBwTq&r)8J$GOft27mg&-Uu9>v)XA&r zl+RbfSVEKv55a*lZ6aYqju$CnC#%%geUj@GIUMeN!{WXfxY$S|&hI7Cb2mY4f2e6e z zzA-=|O&+Ts+t0(L^(RC#ByQct-X8K`ExE!@=QX3w` z3yz)k(oLi&=0S5k3Yee<|96l_d{8|QGa=MM#5V?+4a^wb1nlm^?z$x~;S9=^g+h;o zHD<+45Zv!j^VpW*S-p82_cs|DpRqe1%a7r>sDH$y{t~DZJuy=+3wcd#q>|1-WL5{2 z%W_>%aXyh($;x+iB=Ws>$XC|pK0_)@>au%XAI#Zq>iWcdga76H{%Ng?IBvI}4$Swr zuxClSrD<8%&h!Ob0ZPD+fLG$Z@hUx3RQ}RD zxXwzGHFobzAS~x0E7m13nF+2>K!#gdZ_DxTf{asXQt=9$p3TK``Br+cI6dVoB8W#` zZ;WmLYJ;?=I63tJW){R}7|{e~q!D_rI4xxW;V%EbDGwqQrOuM{j8h2wbv8pRS;Bd< z#Z)ts%fhe$8m;uzl5BntZ7wOd5XjC#R*$%DWv;vGt1YE%)BToKUbt$ul=?06WsA4U zl2v7qsw~zjizzUZ)vXmluDGsgC(!2t7VCaNwn*$GxKfn)Cxn^ftH$ItIn~LcG zn3Grv-E_DnAEm+v9BG5`TueoG8B-E$Tr`Ik1_Dm2C&sZgZq+*EgmH|~g8>J=Y>oy# z5Ueh`XvzD|Pts$mQ+2T%(PpZx^yZQRp*4plkwCE18B@9(1Mcr}J}&NKL0tY}eDPpb zOpqDh{||BbHu_t#kUv4Mk~xAklM2Cnr#0^GhB!q6k`XgC7%XtMrp3g&#)14TstYCj2MF%k^8f$< diff --git a/C/alt/05-refactor-split-array/samples.c b/C/alt/05-refactor-still-split-array/samples.c similarity index 84% rename from C/alt/05-refactor-split-array/samples.c rename to C/alt/05-refactor-still-split-array/samples.c index e128518f..e999761e 100644 --- a/C/alt/05-refactor-split-array/samples.c +++ b/C/alt/05-refactor-still-split-array/samples.c @@ -6,7 +6,7 @@ const float PI = 3.14159265358979323846; -#define N_SAMPLES 1024 * 1000 +#define N_SAMPLES (1024 * 1000 ) //Array helpers @@ -180,36 +180,54 @@ float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint3 // Parallization function void paralellize(float (*sampler)(uint32_t* seed), float* results, int n_threads, int n_samples){ + if((N_SAMPLES % n_threads) != 0){ fprintf(stderr, "Number of samples isn't divisible by number of threads, aborting\n"); exit(1); } - // int n_samples_per_thread = N_SAMPLES / n_thread; - int sample_index, i, split_array_length; + int n_samples_per_thread = N_SAMPLES / n_threads; + + float** split_results = malloc(n_threads * sizeof(float*)); + for(int i=0; i2AvBnTq$EOE64q9h9a#z{ zo3M0TXthQO-M+|9I!ij+_ja2!?UZJk;0F(fNiv2_Qc{}MVM`-H2*l2VQb7Ct=bj^9 zU8FYKNoRIvd}gHo{NMTi^Z3ty9{1?#{8K}7V~NScWH7T^8Rst75RXW(tdtUrc~~u* zhVL2dV{8)OQhYoj*+`F+XbF;9gq#_a^r|RQfL^P?Oj21xqNG-e!M_a7uaM(P#vl2REIRvnKA{+9Aq3ca+jqmlkr%brOp z%lj!57=>MtkKjp|rd(3yQ+E^YD}43gJ_MJMtto#s^YJ zK3N3+O%Xg%1YcPMzq<&2Ws!FMu!#I8fIILph86&Y+BH@LA1ZDF%&}Gd{^-F#8T=QHD#^ZJ4(VKJ8n_TmA@%hvP zC45|)F-w=>GYKQTgwNQMK`o5FX{=8X)CAfFp5~Yue1@<8IHrp!%!U3Y%;O}!l-&%T zf(iLil&66vt`1LO$aRU3Pk!Yam_8=4&j|iYoHw&Obo$QU4~8_F18FjsauEZ z*S9S?oXVEr%Q{@zj?#AMaCBTQ?9|~jx680ghofV2;rlw=nnM|TLWkRQ_--9ez9&PU z4zG|P=#UQA&u7CroYu{WFkyiS3rtvG!UBI^3yj%5c24bi)1vm4y}E=kb!%U|Bs;A3 z?6nN?g0l-|0v^rYa6O1^1|#`4N*T?(p3P==aGn;%(afP7PYdE`ra#BiLOz;#GRM=x zIGTAp$J2s1n)ynOr-g7d^MxEw3*czxz8p^r-)P34<7vSg&3rb;(*|KQqvUv6_(n7C z98VjC(ad!@o)*B-Oht~Tg>N)t%JDYNzkN}(7uI;@R*e1y z@TUy?aRdLFfq&V+zhL0^8Te-m{Lc;ij}82H4E(nY{MYro(z2RdDD8p4_AusrM@jBY z_ki&xkNpyx*2QXS@m~0Zn%u7@_obhMxt}I6HJJ|CPWP#%6Csyv#us5^7|Cu?d;09Q z>Q`;mhsJDcj@b4eT5YQy0B4(i$hQBvEWkm02Jq=u>jn}psHp|t_$Xt0$P0JV*O=|z zvNVt}+ueQXsi=pVJe^*J-Xphm_uZqc9Sq6x$ba3Znv#d^Rg=fmoL3E#q|_b zlgH!bYVYDkFzN7xY&IQ075A5|25-6tx_SDUjYt&rq5R0``uxXi_Ocz<@-n$UD|ae) zD67${tJLJ5(X-Xw-j1sDzn~cO-XbFX-emgG_b(*(*B!ELA4Y|2KY7t~C{=2$>$jzz z1mC;#TDA8d?ajT{KdQWcK}o)#wjS!~H>s&PzgBm8Z9hpj!58{RO4MYjQ-QLQYP`6- zbzifoueo=r#XBq0lv=x`G}V|@O~Xp+{;Znv9gXvwhXP-qi zU+YvCMVyvIA6%pQbMOr19_3!eH`ue^#TM6!Ck%zc@i2%{MS$l*&+kPAF<8yI_(tO4``4$ zN4oKRHmmejI&IZuv#+7<-iJP#_RiHXbOlQeQwR2sOmhz;hx^kNoQ#yY2gX#}CowGU zDmwr{C0Xgj5Gi|#l;#d?3?YZ*r4$QL%tEnGQLKD7P0z&E&^Vd>XS54OtPQ9)X>CZ5 znAcej6R9+IxPR`ZIlejhMz&?Ieau#Y5AME`8KQh3p8Q?2dl=+I1t-VKh)|>YC}kjt`QWaB{!E4YVDeD^>t*hPly)$A zC5>|=9Yj5votE@-=d#&RSiyu$sgO}5+p3S)Ttj2F&mM))T4(vjM&2e1S?lqu#8hzx ziXmI|kXEa^(I#ZmPwFJLnTxF{djJ%ssLEUi@$`0loqTHCN60@eZQnxZd}!^<{7Gxy zb;$ICw6DKPw9oz*+xJ~Awx;acpuBww%1QqRzKreL`rhU?WwkdI%_^8IZ+PZP##XDz z3#;5m(Q=D6evh1N1TN{3vr_Z7n za_a$4$$Xcv-1@ooJ=??Ep@8Y3=WK~>TN3f)sG2-WMfRU4QTvaTs-|;XUaTjySkMxJ z#e~+5ywzmbR^3maS~r{+PM+x*Iq!LIzd61@Z5;+&`pwizQ{5|@XR57-d-_T^Qn!Ee z+WddmBne=Eoowk$M$EgR?Y@mo&yd>!XFe5tR2V&3?*(D)@^rZGI2 z9(fPGd_ZSiv7hDkJ3MAvJ5;a@WJ7quUDG>J5H|>IOa`=haNHiK(DW6&Dwqv13QnU( z_>5Fejrb7ewsL3st#>ukd>rKzF-_^l#i&Us=6^!p=iKBM+Ud4_+#_E|;y8c|Ns_9?mcy@8WG-oAy<`(W+MeEuEGYOmny zhAW6ims) zZDmcZ=V9cv@aXN6`OrE%mL$+YI`ckpaZGv>`Lz>ZTK6eStZrtyvSEE znzAX2=4X({z3BZlm8_fAeDJE+)M zDs}H{#-_#I{BzoN-T!8D^8IY(7_VnJZNa|(Hg`a6%Vn3lQfpkeGTsH9y%*f zVBaR}*V8U8)o|9EtX1kxCytHTHq*S-Tk3Q_=RO>2vPGX)Q*o!=i_+)d(aQxV#;&@b zo*Do?=&YtOT9saie3aC(Y*Q+-*TVM+YVRWKNG$0&XRte|JDE6b*aNS@&``18mA#?( zGA4*(d{7Pe#-6G`RpB*2;ZLGJctvQdf57Xc=K(YfHPI^ZB=k@2OrJYV8}4aeb>XR4 zfyA>g3_W!|fQQfYLy+OVmu^K&tXImN&8fw$vtbtu%vW*)^ZaTxbz=+QRn4<5&`vBp z<1J*}f59A|;XW$%BhAUt=Hy%6IBMuJ_Aa98vbaYrD0f4D1*S417_H#zWXrVtWub{d9Hy0HZIBIM1Xa?MZ zmkzef{ntmYT5{C(oagfMKfJZ&-^!Pqe(>C_l~3H+bdmcZARUOM_&qB`f-`Q*% z@G8&;L90P`fU1AUW`7I%<>T3`1;2x}{V|($((l`EWV0U7ASn|&Ts1swscI)QRQ*MQRh0cuZXvva5(Z{qQt_*2Ys}-sGG~a-_c&9}C%x01*g}QyjSd#YmE&-)45AJoz3XVY~488Zh~; zp$h(I`0M~yfqJq=tNoFZdTZ6g=6b6mRa$SY-d?6yYqm{NthGIpms!`9ym6Ve7Kmc4 zh6E(*t@e7WMVZR@!3y#Spx@?;ytbFzZmrs8UID#VFSAxDT()s4!;f2xezWaHU%rAk zQ>~7Yfy=Cx1}#lR+AZi`V_Gdl1C%%V`~lMYwAG#}fyuU;k;Ar9ueG|TtkGH%w^la} zS{=%M)bDu|`hwNM-^twtT`zR$aEk)jfbAdK4epIW=9EChoLIBARK_2#`w3SFJg}>66 z$)&+Y4zLlV4Iym?>cAST_MVbuR)=|#=`yQbAwzlqli%-!f6oA=zJ0i)!CI9vH&`9p z$yVE_Z+j*!vwF-S=uxLCx=wAJO8)T@ba!Da&^yP){bMQlhu6BrL>*o0y~IHpr;>j- zFm762%VvLpIOVmyL?Qq1qSfEERxRc3K}lYGnjyOrV=qVm_sb^oOVmA8+CYAZLbgp( zHNW&k3wb5)->)H`KIEf+-hf>2|g+ccQAa}>4+eR>X{ApfI;cfojY zh|RymRZ(GCZ@%Y*q7sfc8j=6YHfC5O`r9MuN@?aO#A*iuF ze~p*_zgxC+>8BjktJf!@@r0w!h5tU@=uU9Xy?I`Zt7g7y-YgCOYdO+?nN7L>_v@wU zpO5nY>G`;s$^W0{Awqqabc&H3j<7%xuh45Vtdr{?Bxx3^SZV&NI4VVIK(B z=n*Y$W|uShe|&wa8K(e4ZapI9r|16nsZTZIACz*w%Ezx{g~p2+vyYr#^5ro*uGAx1 z+^qXwvp&_#R+r~2Y5b}EV2L* zT_~0hi4@BJWv)B@l79aqa1*<<-%ks9x$bcmUnSfim-^v!;FwJ=@sk@cFO(D*U$Y_q z5!lTk)>*kOF5ZJaJIg0Bbe7#GXMwqhZ4uuJ`ObCG-=0d9x9-QdsP;PQ5MMr;J+ zdCubkuMr!70a5agisXMl$jc!h&v_OQUUKO?@Jf+>)x8iT(r5&5hX8#R8;iM*1cK3D>Hkp1-;Azjxkf*y^K0%uqH+8j4*A^2aV>BCcdwKeQ@Y$QJ_?-d{56A}9~1Hpu|ekN zmdNnOMe=`E$k&LCCqL7r3dnp!gZTL+;DZ8}2kmkn|6fJ&KVAg?6Yz?XOXqECRtKss|)gPfKZq8_p#k=AOynNrv#_t^Q#oK+Ybn8eLZr})X_%=j3*83yAK)fT? zs;yb*E-Xa1c!b5zOTlP7wh4E4_}hcNK%%{U6I67Z50dhiC{45@ z6pD1_uG8ZZj&@%n8V*4iRn5OsWxR~}xomWo zx{T+e^8u!LhOb7eO^q+!;d9e%G2@xdt3yemCw%w&(f7ewWkSJNCv!zR;z8GjXu{PQ>*x%|;+u5P`a~EPf`tPDQtF#-jQck* zF5c$vYGbazrYMTgXgsE++#if}g*&2oj1OtCV8l-fBGws+GZ%Ld7ox5W9bhTa6>Md$ zc(5BRO>~g%;A7hrY!g#kTL8I9PRm1^4mAxKM^XOvFf!BB;ZMwk`L!JrVS$PMe+kht zZNdlJ2p+1i&QXw5o?B_CCM)Aaug8auY^1(CACT06G<}fgLlUEXvtECVC|^>i6ciN8 zSYBuXr`MO~4U#(Yr0E$bKhSd>{$$l5>%U8sFKM+9m*;WD`umZtJ(FsJ^87;5YLPM3 zk?bt>#d6NVz^F)BzC7oURG#-iga^t)AMOJ~XK7Mjo`*7GOLUJxU!FTjD)-A$-e|vlLSOd3e2$VdEfoco^_K7p27P&s zC8>NKkn+azj~evld6%T$7vov#OaGDlaYXQk9XV!{^HQJw5l6l&_2oI6yhqe73`_F|1*s?P_#hiDP0KihzlECoHy z`JR@ +#include +#include +#include +#include + +const float PI = 3.14159265358979323846; + +#define N_SAMPLES (1024 * 1000 * 1000) + +//Array helpers +void array_print(float* array, int length) +{ + for (int i = 0; i < length; i++) { + printf("item[%d] = %f\n", i, array[i]); + } + printf("\n"); +} + +float array_sum(float* array, int length) +{ + float output = 0.0; + for (int i = 0; i < length; i++) { + output += array[i]; + } + return output; +} + +void array_cumsum(float* array_to_sum, float* array_cumsummed, int length) +{ + array_cumsummed[0] = array_to_sum[0]; + for (int i = 1; i < length; i++) { + array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i]; + } +} + +// Pseudo Random number generator + +uint32_t xorshift32(uint32_t* seed) +{ + // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RN_SAMPLESGs" + // See + // https://en.wikipedia.org/wiki/Xorshift + // Also some drama: , + + uint32_t x = *seed; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return *seed = x; +} + +// Distribution & sampling functions + +float rand_0_to_1(uint32_t* seed) +{ + return ((float)xorshift32(seed)) / ((float)UINT32_MAX); + /* + uint32_t x = *seed; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return ((float)(*seed = x))/((float) UIN_SAMPLEST32_MAX); + */ + // previously: + // ((float)rand_r(seed) / (float)RAN_SAMPLESD_MAX) + // and before that: rand, but it wasn't thread-safe. + // See: for why to use rand_r: + // rand() is not thread-safe, as it relies on (shared) hidden seed. +} + +float rand_float(float max, uint32_t* seed) +{ + return rand_0_to_1(seed) * max; +} + +float ur_normal(uint32_t* seed) +{ + float u1 = rand_0_to_1(seed); + float u2 = rand_0_to_1(seed); + float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2); + return z; +} + +float random_uniform(float from, float to, uint32_t* seed) +{ + return rand_0_to_1(seed) * (to - from) + from; +} + +float random_normal(float mean, float sigma, uint32_t* seed) +{ + return (mean + sigma * ur_normal(seed)); +} + +float random_lognormal(float logmean, float logsigma, uint32_t* seed) +{ + return expf(random_normal(logmean, logsigma, seed)); +} + +float random_to(float low, float high, uint32_t* seed) +{ + const float N_SAMPLESORMAL95CON_SAMPLESFIDEN_SAMPLESCE = 1.6448536269514722; + float loglow = logf(low); + float loghigh = logf(high); + float logmean = (loglow + loghigh) / 2; + float logsigma = (loghigh - loglow) / (2.0 * N_SAMPLESORMAL95CON_SAMPLESFIDEN_SAMPLESCE); + return random_lognormal(logmean, logsigma, seed); +} + +// Mixture function + +float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint32_t* seed) +{ + + // You can see a slightly simpler version of this function in the git history + // or in alt/C-02-better-algorithm-one-thread/ + float sum_weights = array_sum(weights, n_dists); + float* cumsummed_normalized_weights = malloc(n_dists * sizeof(float)); + cumsummed_normalized_weights[0] = weights[0] / sum_weights; + for (int i = 1; i < n_dists; i++) { + cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i] / sum_weights; + } + + //create var holders + float p1, result; + int sample_index, i, own_length; + p1 = random_uniform(0, 1, seed); + for (int i = 0; i < n_dists; i++) { + if (p1 < cumsummed_normalized_weights[i]) { + result = samplers[i](seed); + break; + } + } + free(cumsummed_normalized_weights); + return result; +} + +// Parallization function +void paralellize(float (*sampler)(uint32_t* seed), float* results, int n_threads, int n_samples){ + if((N_SAMPLES % n_threads) != 0){ + fprintf(stderr, "Number of samples isn't divisible by number of threads, aborting\n"); + exit(1); + } + // int n_samples_per_thread = N_SAMPLES / n_thread; + int sample_index, i, split_array_length; + uint32_t** seeds = malloc(n_threads * sizeof(uint32_t*)); + for (uint32_t i = 0; i < n_threads; i++) { + seeds[i] = malloc(sizeof(uint32_t)); + *seeds[i] = i + 1; // xorshift can't start with 0 + } + + #pragma omp parallel private(i, sample_index, split_array_length) + { + #pragma omp for + for (i = 0; i < n_threads; i++) { + int lower_bound = i * (n_samples / n_threads); + int upper_bound = ((i+1) * (n_samples / n_threads)) - 1; + // printf("Lower bound: %d, upper bound: %d\n", lower_bound, upper_bound); + for (int j = lower_bound; j < upper_bound; j++) { + results[j] = sampler(seeds[i]); + } + } + } + + for (uint32_t i = 0; i < n_threads; i++) { + free(seeds[i]); + } + free(seeds); +} + +// Functions used for the BOTEC. +// Their type has to be the same, as we will be passing them around. + +float sample_0(uint32_t* seed) +{ + return 0; +} + +float sample_1(uint32_t* seed) +{ + return 1; +} + +float sample_few(uint32_t* seed) +{ + return random_to(1, 3, seed); +} + +float sample_many(uint32_t* seed) +{ + return random_to(2, 10, seed); +} + +float sample_mixture(uint32_t* seed){ + float p_a, p_b, p_c; + + // Initialize variables + p_a = 0.8; + p_b = 0.5; + p_c = p_a * p_b; + + // Generate mixture + int n_dists = 4; + float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 }; + float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many }; + + return mixture(samplers, weights, n_dists, seed); +} + +int main() +{ + int n_threads = omp_get_max_threads(); + float* split_array_results = malloc(N_SAMPLES * sizeof(float)); + + paralellize(sample_mixture, split_array_results, n_threads, N_SAMPLES); + printf("Sum(split_array_results, N_SAMPLES)/N_SAMPLES = %f\n", array_sum(split_array_results, N_SAMPLES) / N_SAMPLES); + + free(split_array_results); + return 0; +} diff --git a/C/alt/06-refactor-one-array/time.txt b/C/alt/06-refactor-one-array/time.txt new file mode 100644 index 00000000..df9bde98 --- /dev/null +++ b/C/alt/06-refactor-one-array/time.txt @@ -0,0 +1,17 @@ +Requires /bin/time, found on GNU/Linux systems + +Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples +Time using 1 thread: 32.30ms + +Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples +Time using 2 threads: 30.80ms + +Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples +Time for 4 threads: 30.50ms + +Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples +Time using 8 threads: 12.90ms + +Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples +Time using 16 threads: 9.10ms +