From ff3685766b18c89ffb753de753e16ed38693c31b Mon Sep 17 00:00:00 2001 From: NunoSempere Date: Fri, 2 Jun 2023 16:00:49 -0600 Subject: [PATCH] reorganize C code --- C-optimized/README.md | 32 -- C-optimized/makefile | 82 ---- C-optimized/samples.c | 281 ----------- C/C-01-simple/makefile | 53 +++ C/{ => C-01-simple}/samples | Bin C/C-01-simple/samples.c | 157 +++++++ C/C-02-better-algorithm-one-thread/makefile | 53 +++ .../out/samples-one-thread | Bin 0 -> 18008 bytes .../samples-one-thread.c | 2 +- C/README.md | 15 + C/makefile | 135 +++--- {C-optimized => C}/out/samples | Bin {C-optimized => C}/out/samples-one-thread | Bin {C-optimized => C}/out/samples.exe | Bin {C-optimized => C}/out/samples.txt | 0 C/samples.c | 438 +++++++++++------- README.md | 18 +- 17 files changed, 651 insertions(+), 615 deletions(-) delete mode 100644 C-optimized/README.md delete mode 100644 C-optimized/makefile delete mode 100644 C-optimized/samples.c create mode 100644 C/C-01-simple/makefile rename C/{ => C-01-simple}/samples (100%) create mode 100644 C/C-01-simple/samples.c create mode 100644 C/C-02-better-algorithm-one-thread/makefile create mode 100755 C/C-02-better-algorithm-one-thread/out/samples-one-thread rename {C-optimized => C/C-02-better-algorithm-one-thread}/samples-one-thread.c (95%) create mode 100644 C/README.md rename {C-optimized => C}/out/samples (100%) rename {C-optimized => C}/out/samples-one-thread (100%) rename {C-optimized => C}/out/samples.exe (100%) rename {C-optimized => C}/out/samples.txt (100%) diff --git a/C-optimized/README.md b/C-optimized/README.md deleted file mode 100644 index 222436d0..00000000 --- a/C-optimized/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# C-Optimized - -An optimized version of the original C implementation. - -The main changes are: - -- an optimization of the mixture function (it passes the functions instead of the whole arrays, reducing in great measure the memory usage and the computation time) and -- the implementation of multi-threading with OpenMP. - -## Performance - -The mean time of execution is 6 ms. With the following distribution: - -![Time histogram](https://i.imgur.com/6iT2PkF.png) - -The hardware used has been an AMD 5800x3D and 16GB of DDR4-3200 MHz. - -Also, the time data has been collected by executing the interior of the main() function 1000 times in a for loop, not executing the program itself 1000 times. - -## Multithreading - -Take into account that the multi-threading introduces a bit of dispersion in the execution time due to the creation and destruction of threads. - -In Nuño's machine, multithreading actually introduces a noticeable slowdown factor. - -## To do - -- [ ] Use proper profiling tool to capture timing with 1M samples. -- [ ] Update above with correct timing -- [ ] Add Windows/Powershell time-measuring commands -- [ ] Add CUDA? -- [ ] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used diff --git a/C-optimized/makefile b/C-optimized/makefile deleted file mode 100644 index 3d7d4053..00000000 --- a/C-optimized/makefile +++ /dev/null @@ -1,82 +0,0 @@ -# Interface: -# make -# make build -# make format -# make run - -# Compiler -CC=gcc -# CC=tcc # <= faster compilation - -# Main file -SRC=samples.c -OUTPUT=out/samples - -SRC_ONE_THREAD=./samples-one-thread.c -OUTPUT_ONE_THREAD=out/samples-one-thread - -## Dependencies -# Has no dependencies -MATH=-lm - -## Flags -DEBUG= #'-g' -STANDARD=-std=c99 -WARNINGS=-Wall -OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version -OPENMP=-fopenmp - -## Formatter -STYLE_BLUEPRINT=webkit -FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) - -## make build -build: $(SRC) - $(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) - $(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD) - -format: $(SRC) - $(FORMATTER) $(SRC) - -run: $(SRC) $(OUTPUT) - OMP_NUM_THREADS=1 ./$(OUTPUT) && echo - ./$(OUTPUT_ONE_THREAD) - -multi: - OMP_NUM_THREADS=1 ./$(OUTPUT) && echo - OMP_NUM_THREADS=2 ./$(OUTPUT) && echo - OMP_NUM_THREADS=4 ./$(OUTPUT) && echo - OMP_NUM_THREADS=8 ./$(OUTPUT) && echo - OMP_NUM_THREADS=16 ./$(OUTPUT) && echo - ./$(OUTPUT_ONE_THREAD) && echo - -time-linux-simple: - @echo "Requires /bin/time, found on GNU/Linux systems" && echo - OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - /bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo - -time-linux: - @echo "Requires /bin/time, found on GNU/Linux systems" && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo - -debian-install-dependencies: - sudo apt-get install libomp-dev - diff --git a/C-optimized/samples.c b/C-optimized/samples.c deleted file mode 100644 index 6b779a6a..00000000 --- a/C-optimized/samples.c +++ /dev/null @@ -1,281 +0,0 @@ -#include -#include -#include -#include -#include - -const float PI = 3.14159265358979323846; - -#define N 1000000 - -//Array helpers - -void array_print(float* array, int length) -{ - for (int i = 0; i < length; i++) { - printf("item[%d] = %f\n", i, array[i]); - } - printf("\n"); -} - -void array_fill(float* array, int length, float item) -{ - int i; - #pragma omp private(i) - { - #pragma omp for - for (i = 0; i < length; i++) { - array[i] = item; - } - } -} - -float array_sum(float* array, int length) -{ - float output = 0.0; - for (int i = 0; i < length; i++) { - output += array[i]; - } - return output; -} - -void array_cumsum(float* array_to_sum, float* array_cumsummed, int length) -{ - array_cumsummed[0] = array_to_sum[0]; - for (int i = 1; i < length; i++) { - array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i]; - } -} - -float rand_float(float to, unsigned int* seed) -{ - return ((float)rand_r(seed) / (float)RAND_MAX) * to; - // See: - // rand() is not thread-safe, as it relies on (shared) hidden state. -} - -float ur_normal(unsigned int* seed) -{ - float u1 = rand_float(1.0, seed); - float u2 = rand_float(1.0, seed); - float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2); - return z; -} - -inline float random_uniform(float from, float to, unsigned int* seed) -{ - return ((float) rand_r(seed) / (float)RAND_MAX) * (to - from) + from; -} - -inline float random_normal(float mean, float sigma, unsigned int* seed) -{ - return (mean + sigma * ur_normal(seed)); -} - -inline float random_lognormal(float logmean, float logsigma, unsigned int* seed) -{ - return expf(random_normal(logmean, logsigma, seed)); -} - -inline float random_to(float low, float high, unsigned int* seed) -{ - const float NORMAL95CONFIDENCE = 1.6448536269514722; - float loglow = logf(low); - float loghigh = logf(high); - float logmean = (loglow + loghigh) / 2; - float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE); - return random_lognormal(logmean, logsigma, seed); -} - -int split_array_get_my_length(int index, int total_length, int n_threads) -{ - return (total_length % n_threads > index ? total_length / n_threads + 1 : total_length / n_threads); -} - -//Old version, don't use it!! Optimized version is called mixture_f. This one is just for display -/* -void mixture(float* dists[], float* weights, int n_dists, float* results) -{ - float sum_weights = array_sum(weights, n_dists); - float* normalized_weights = malloc(n_dists * sizeof(float)); - for (int i = 0; i < n_dists; i++) { - normalized_weights[i] = weights[i] / sum_weights; - } - - float* cummulative_weights = malloc(n_dists * sizeof(float)); - array_cumsum(normalized_weights, cummulative_weights, n_dists); - - //create var holders - float p1, p2; - int index_found, index_counter, sample_index, i; - - #pragma omp parallel private(i, p1, p2, index_found, index_counter, sample_index) - { - #pragma omp for - for (i = 0; i < N; i++) { - p1 = random_uniform(0, 1); - p2 = random_uniform(0, 1); - - index_found = 0; - index_counter = 0; - - while ((index_found == 0) && (index_counter < n_dists)) { - if (p1 < cummulative_weights[index_counter]) { - index_found = 1; - } else { - index_counter++; - } - } - if (index_found != 0) { - sample_index = (int)(p2 * N); - results[i] = dists[index_counter][sample_index]; - } else - printf("This shouldn't be able to happen.\n"); - } - } - free(normalized_weights); - free(cummulative_weights); -} -*/ -void mixture_f(float (*samplers[])(unsigned int* ), float* weights, int n_dists, float** results, int n_threads) -{ - float sum_weights = array_sum(weights, n_dists); - float* normalized_weights = malloc(n_dists * sizeof(float)); - for (int i = 0; i < n_dists; i++) { - normalized_weights[i] = weights[i] / sum_weights; - } - - float* cummulative_weights = malloc(n_dists * sizeof(float)); - array_cumsum(normalized_weights, cummulative_weights, n_dists); - - //create var holders - float p1; - int sample_index, i, own_length; - unsigned int* seeds[n_threads]; - for(unsigned int i=0; i +#include +#include +#include +#include +#include + +#define N 1000000 +/* + * For very high values of N, you will want to increase the maximum stack trace, otherwise you will suffer a segmentation fault + * In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs + * And confirm it with $ ulimit -a +*/ + +/* Helpers */ +void print(double* ys) +{ + for (int i = 0; i < N; i++) { + printf("%f\n", ys[i]); + } + printf("\n"); +} + +void fill(double* ys, float f) +{ + for (int i = 0; i < N; i++) { + ys[i] = f; + } +} + +double sum(double* ps, int n) +{ + double result = 0; + for (int i = 0; i < n; i++) { + result += ps[i]; + } + return (result); +} + +void cumsum(double* ps, double* rs, int n) +{ + double counter = 0; + for (int i = 0; i < n; i++) { + counter += ps[i]; + rs[i] = counter; + } +} + +/* Distributions*/ +void normal(gsl_rng* r, double* ys, double mean, double std) +{ + for (int i = 0; i < N; i++) { + ys[i] = mean + gsl_ran_gaussian(r, std); + } +} + +void lognormal(gsl_rng* r, double* ys, double zeta, double sigma) +{ + for (int i = 0; i < N; i++) { + ys[i] = gsl_ran_lognormal(r, zeta, sigma); + } +} + +void to(gsl_rng* r, double* ys, double low, double high) +{ + double normal95confidencePoint = 1.6448536269514722; + double log_low = log(low); + double log_high = log(high); + double zeta = (log_low + log_high) / 2; + double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint); + lognormal(r, ys, zeta, sigma); +} + +/* Mixture of distributions */ +void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* results) +{ + /* Get cummulative, normalized weights */ + double sum_weights = sum(weights, n); + double normalized_weights[n]; + for (int i = 0; i < n; i++) { + normalized_weights[i] = weights[i] / sum_weights; + } + double cummulative_weights[n]; + cumsum(normalized_weights, cummulative_weights, n); + + /* Get N samples, drawn from the different distributions in proportion to their weights. */ + for (int i = 0; i < N; i++) { + double p_1 = gsl_rng_uniform(r); + double p_2 = gsl_rng_uniform(r); + + int index_found = 0; + int index_counter = 0; + while ((index_found == 0) && (index_counter < n)) { + if (p_1 < cummulative_weights[index_counter]) { + index_found = 1; + } else { + index_counter++; + } + } + if (index_found == 0) { + printf("\nThis shouldn't be able to happen"); + // gsl_rng_free (r); + // abort(); // this shouldn't have happened. + + } else { + int sample_index = (int)floor(p_2 * N); + results[i] = dists[index_counter][sample_index]; + } + } +} + +/* Main */ +int main(void) +{ + // Start clock + clock_t start, end; + start = clock(); + + /* Initialize GNU Statistical Library (GSL) stuff */ + const gsl_rng_type* T; + gsl_rng* r; + // gsl_rng_env_setup(); + T = gsl_rng_default; + r = gsl_rng_alloc(T); + + /* Toy example */ + /* Declare variables in play */ + double p_a, p_b, p_c; + double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N]; + + /* Initialize variables */ + p_a = 0.8; + p_b = 0.5; + p_c = p_a * p_b; + + fill(dist_none, 0); + fill(dist_one, 1); + to(r, dist_few, 1, 3); + to(r, dist_many, 2, 10); + + /* Generate mixture */ + int n = 4; + double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 }; + double* dists[] = { dist_none, dist_one, dist_few, dist_many }; + + mixture(r, dists, weights, n, dist_mixture); + printf("%f\n", sum(dist_mixture, N) / N); + + /* Clean up GSL */ + gsl_rng_free(r); + + // End clock + end = clock(); + printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000); + /* Return success*/ + return EXIT_SUCCESS; +} diff --git a/C/C-02-better-algorithm-one-thread/makefile b/C/C-02-better-algorithm-one-thread/makefile new file mode 100644 index 00000000..a9044735 --- /dev/null +++ b/C/C-02-better-algorithm-one-thread/makefile @@ -0,0 +1,53 @@ +# Interface: +# make +# make build +# make format +# make run + +# Compiler +CC=gcc +# CC=tcc # <= faster compilation + +# Main file + +SRC_ONE_THREAD=./samples-one-thread.c +OUTPUT_ONE_THREAD=out/samples-one-thread + +## Dependencies +# Has no dependencies +MATH=-lm + +## Flags +DEBUG= #'-g' +STANDARD=-std=c99 +WARNINGS=-Wall +OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version +OPENMP=-fopenmp + +## Formatter +STYLE_BLUEPRINT=webkit +FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) + +## make build +build: $(SRC_ONE_THREAD) + mkdir -p out + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD) + +format: $(SRC_ONE_THREAD) + $(FORMATTER) $(SRC_ONE_THREAD) + +run: $(SRC_ONE_THREAD) $(OUTPUT_ONE_THREAD) + ./$(OUTPUT_ONE_THREAD) + +time-linux: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + @echo "Running 100x and taking avg time: $(OUTPUT_ONE_THREAD)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT_ONE_THREAD); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo + +time-linux-simple: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + /bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo + +debian-install-dependencies: + sudo apt-get install libomp-dev + diff --git a/C/C-02-better-algorithm-one-thread/out/samples-one-thread b/C/C-02-better-algorithm-one-thread/out/samples-one-thread new file mode 100755 index 0000000000000000000000000000000000000000..b7e78ead09e288d9ec110400f3ec4d1923a9d203 GIT binary patch literal 18008 zcmeHPdvugVmaonOnpb~~5{<8+#oz{%CgEi=yFhnH;ENkH@=);AG)af#n3tVy0oP?S z6N0wTE*@uAXMGIgdXBn#<}AC;>@c%COF|?IjLZo->tXJ+TQwSf zxj(5-=u@Wru~HYxD7oaf5LUf4O3z9wl{{0bgVM_5q540hzFMW%>5^uImA+!Yf+^Q` z7xbuKe%Hjqyh7ENZ5*ueX;T%I0xHkB=ywP2pRqs=`>N|0Yv z2gTH~J5~z}OAleH8|7F`Q9P9&`t;OI`yRe9|G5d%%Rjih=r3M6{@EvNgX$z1N~F&r zK|JNj4#^np{|0gfM)XMF=Pg9DrLj=FB17SS0n3C>0Pe~l@5>_pLKZxn1@F&-*Ji=* z&4PcK1@Fm%KamCh6W}gI1>RhNg~CAh>GzlIFJ7z$*Xx=78||Hng<0`Z_}XV94h)L^BFDw`~v| z4+ldH0=Whe*wo%2I+|Lknhk9&?Nn2#K*qNr5b}i@g8_fNuQl9~f{<5mhgf#!@><_| ze=yh-2$F$1<)^Pb*wh+o@YOXwBszlr)_S-H&*~l`#Mj_&Y8LI`P+g-xC|dl@&24ok zuiNDJH8i#Qo0}e`){r&TnifCYYX}AcMt!C360xkNx?+iMj(d)K{&2p0ID1)(wTRa`LXMemwFOJT zmAn1Sz zS3_s_DP_3in;oGpaj-4n`54(MHP+WTbFlk;Z+ zP9&$_0-{Y7Bu~#hA`$=ZWHPx^^0Xl*;zv?EZN!OqZ;Gb}g+%f}FAXrF#8a+DD-Il zc?j##N&f@Vplhwi0DXnLHY2Zy_+FH~iU6T}tU+sjQ=3s)iP?YGA z6MFQqBBvhd(<8m!J)R0p#P%Q_^vIwdeTJ%R&`$R>xV5QUA&zRCcD<{|sTKcHD?V~Y zTQjKjA6cyx?*pe59MSp@CIR;0vk#wMy?kHzupXU1ZW8>bM(&}nGungse+J}?c5hGY zXP1*neLGrPug^8w9!3psdP;ll>x(Typ{eev9vL5{r>Y}I9@HZr>RlfV)vl_E<}W}* zqTPEDsCsnaYtTv-tiM%=FLr{BAcIc&CXw=Up#n_BMU=->9G5(%P3?qBqEAuCua)xi zR}h(TS&t0qy`L@Cd(Y+SwtoG{<s<>Iw;-!W zj)x}d+ZUb%6B~CanT*xL#s2(L;B5~;H&s9B$2Zcb8-5W_BVe|FMswy5V$P8sm9IqT zIAbrvY@{#tJO~sHk`e?9-hkj4ZRP=dFf3~i;e(->TvKG1-T&!DLp-6a&EE~t1ja-w z9@N}Wxa%kc?=PCTaTyF%lET{jMrjV?c9X29_<&K*y_H21VWOARNvukWt;zR-Vl*uUfJ!09Af{^U}%4X|F*Gq&fIsCl#~# zAC+Qj@*|*z|6Ld0|7CoQ^uPP;mPYUD%4loSi(s5S2|BCw$fcE~M``6Z;v@1NHK-$0wAEukEAAzbd>k0|alZ6%jp5_FTc)v(K;Yw*PcTBQ-G*h(M(fw9K2vjL zykb-fA%c*P&5{E_Gi|03_t-iS|0u#R7WOrm;yVYTi0=!(PM@DwN{tNjbE2G|HAMO@onEYSI$o;tj#}wwOcB7Nkvn9+i8BvSSA~Nu{jVp*-zu=T~!^qxO!#e3>xN= z4Ws7O6kQ*?d=5VEkG)EyH!}F0uP&7z(RP&3cJq%1Y=@!~o#ltLr~^EWM$A?nz4cqV zZQt@}{kQTsKCyBX#=Z>;(lQni0N2O1O2CO_$k;~^erpS+O6Tv#=s%c$wail|LvK}) zJ+_aW=!^X;h_N;pJ^l_JW~61OLTQ!D|EWYV%_kb-N2&h?02=*wKywdbeLEmHBA8{O=p z=^K9lMKbOYOr>wf`>`{{ek{eE`8SNL@aINRO%Yef3+ixX7tX56s>?ssx?hCBlTVuh zRF3C8-n+f4t0MbpVBcCz%WJV7d928;*PYiR7xdnbbGnaePtC?tLiEp*)se5dex0Lj zyAy2WJ#2UA$AhU^n^5ycb7CFraiYZswBiA+e}LxgA<7nj>!DpATg#dQG-=<73}Rs^ zAJn$KE+b>$j8|JT@mX41-q}*9=Bj*GS4E$ zFp85azb(kG_q6D}DvGi0Sp?wa71B#5Ee(cRw}{!~NI0uRb>u%_Nkms(POT&ee&cxw zeNx#iU=i!5v7!uwPgZrG+S1{TRwb85PE~U>&?;~-2d#K2soYGGfkE!nJ&s$!8BK&uH zx*|2YU=#y@FKnAQLX1c30yHswQoRtLiuf<> zi9Z8ziX>#u(*w^t7P+nqHm&<0gyNq`$y9$~J-VkRy71&wA-s22N8a_Wu8w@^UFF3R zrAKG|3h>IB8JB2hjeR!(mwGSdhNhMtRdHGqNz_C>tBic?O-}iM_Ef$6{qQHWrw+H) zyr9#3p*Jz2WRoO~H#6ErM$>2y{Bb=%f5&JF1zPT#UjKk=p=)}>c(F3vQe2NeA^BRG zHig2$z^$%3X3V}r6&}^9JwnLS@{*(4LC@*O&M%tslf8?ty!iKvR(*FOohmQh`Q!?Z zC<;}ZJnP~7-+t$s#fRUjS#=i=JOR2JkKX49$GYI5vkN5tTZHY=6~ea3X}fOXSo==cB%FSuV25uw zArYc-tUP`%LUtZLJ)aqE5f0~4N8#<-gpKx2vH04r&zxH{jl@a+ete!o+s)_pgop6i z3492Y6UorC>=URDZ<8_+whN!F!0_5ClMCNJz^4-!QBmb^cI8w$Tsbd|cR0NcJKlg? zfT_Q)NB?D$DMR^1eD*-*QBZNa!x_y%Q+DL)4%fE4N=I>5{!&Lt$Wgqs&*Ad+I|?h_ zayTmvJM8jrU1jK_i_oRN2U8+nYk)OktX`Lhfvp3Uhw+-HuqmJ+U?sqCR)PmOhbsV^ zE@XEA*J+HAA8yx^sdkkB-VK>IA+sM;EOj_{pzVd*a=i{$SKd;GC%4)*9^N_%z3|mx zzkPxZv5z3T4l(f~j6Lc6bxwt&@TuGihbx*_Hpc5H>l$0>SeH9FXS|~f zkk?TR5r|euTU8TWP^Y+h6JtM(>X&_A?WoP2fLfhP9Cq)7O28CzyAg8%qdoA4_W=K1 zihmzPk9Cx+w9$Z5I6vb~RJeR#L!O`0U%SC5xJH%p@3gK{JdRXlx>iyCy%xQ5P~!40 zFDKiG-=*GH_*t+=WpI`!(*adZ2l$khs|MijATkvyTA(Nsr6zRsyCHf+rqrnP>3N+J zPDN$9TG7{(gN(;ihW+kQa{MmC^?yy_EQdcr$o8Q+3E@me@@2{%4s|5Ye&W?p@(xvw z$K$keW&Zz+@b9y&221FL2g##ot)lA`ZCA8Y(VdF!QgpYXJ&Fz}dQ8zFMTPotm(%D%rx#Fi~t@^x47>h6SSq?vi=#(isN++s)E19t(Lu zOycyLO8$Mi2^P8d$G<;M$MeKv8SUh!;yqn{jNmv=$Hxku59zpFaGa#$;{?Z7IzC=- z9H-+G1jlbsao_hgk5ID z$mik|rZXcxMHJtdNxmTUyHay$uDB}oyV7+0YLRK437^D&-(&cnQHKDGx))iEDdHE@5L3!#heR^jKO}VvMvU)K;5IRG zd_Pk1yst?XUth`0$aou%j>4)jA|5Vc!^sgN=aCEYSD@c^seP39y%LFYZNs!83w{r9 z*RWVB_YmrJiIH*Gk;Tr|Eco-lov>q#+n1%hQw*pLQ=WC9_U|hkT_MvhgNPqxu|JdW zoRRbWohO>Xw<-IzYF(PG@M(}IJJxt} zD?2V#xjdhQ&fF~iEC){gP~tKSb3fE(k@qY6$J7I)JTr!Bi^AL0gRDGb1s+y-t!fuP z7jFSRIcMa2c~asd{_#42L>qZ0A_hI02-B9yOo)e?&eT5%W@yyTfA1mCY+Qt2! z06s0vvccRV$0b8Shrgx0InXf+S94|=_h;O7qAnQf2!-)y@;WhmfyWnW@zv24o(^1i zsc-XbXl`5YZ}!!P+JYTEe|VFq!#$klKqye}E-P73npuc0%{2LN2gkqJ7ibLyH{+^| zza`+S54W^zhKh;vK{Avk+Sbs}+~%)m*L=REE4<6Ad{uW;(jB67d0gVDTziLidG(U? z5_#na5H7g+s&r*RuUsK~%WCed@YeY5T)K2+)hgdAZ$(WNS(R6N>N>*Gb9u4H<1t_8 zQCEY~!QXWEiS96&q;dJjbT0`7#L=xGU!dL}@(a3<4dZjl(N5L5zDN8R&Hye!8H3c&)Z8rZUYW#i<8CqTZ_$k> zQz9UaPVdAG}Fx-T}YMmM)=tLbjq zFh=*ij9!#C%!K*_CXWJ55!{MP#VM~xIGwPy%X~V6zHdVLm z2RX}q4S|iQ(|mU=-C5!8*xZ7uK|?`752!|wKATYDfF+H8W> zhnsLiwW(e~-iqp3A^!#;#T)${jlx~OxfN9yG!!&S9ti|Hn%Y{^7$3@lfo4A`s9bw< zNVw&Yxsi2mXah@`jzFDohXR|x(&7T?HaT(Jfkw5QG}gl!bA}CLT`@E`kE;AFO)zGt zW7vfoYg!A|sSFF}?+4MF03K!V%xENj4`s^dC#qP=a-Qf-_|VY->+}06Qy0q2iO(q* zqxS-{{u=BPl$aJ7Z0ei$RR*e&G3)dDFH;u_Dr(ig2RQ!hO^*jM@wpIFuAk3^tp59< zMQ1{+&+pSr={b=S`AAvT=kuE;VAN#R=l5==eC`AhnUpzZ#ly(anGWmo`#IB@N|OC& zIi_1sPG>~S^Lsqg-AbSQCqLPL1&b&$)TgY^@B2)9lpxp7`rQ6!m41bi<8uL~d~Qbi zq+`|p9x&d^KvOT6hw&?ToD${O0u(as2*}WEhJ_ljS&zUT5?Z5p>pT}Rgl%%4aie>A^ zGqnt{6!a*ACF+HpUMmS=e|Y_&V)k7ss;~BPBjd0H9M{XTG~F}lZ&_$aTx(IVu;PCK Dr8Xqg literal 0 HcmV?d00001 diff --git a/C-optimized/samples-one-thread.c b/C/C-02-better-algorithm-one-thread/samples-one-thread.c similarity index 95% rename from C-optimized/samples-one-thread.c rename to C/C-02-better-algorithm-one-thread/samples-one-thread.c index da6e152c..bac15fea 100644 --- a/C-optimized/samples-one-thread.c +++ b/C/C-02-better-algorithm-one-thread/samples-one-thread.c @@ -5,7 +5,7 @@ const float PI = 3.14159265358979323846; -#define N 10000000 +#define N 1000000 //Array helpers diff --git a/C/README.md b/C/README.md new file mode 100644 index 00000000..23e43fef --- /dev/null +++ b/C/README.md @@ -0,0 +1,15 @@ +# Time to BOTEC in C + +This repository contains a few implementations of a simple botec (back-of-the-envelope) calculation in C: + +- In the folder C-01-simple/, you can see a simple implementation, which passes large arrays +- In the folder C-02-better-algorithm-one-thread/ you can see a better implementations, that passes around pointers to functions, which makes the implementation more efficient +- In the top level, you can see an implementation that uses the better implementation in C-02..., and that also implements multithreading using OpenMP + +## To do + +- [ ] Update repository with correct timing +- [ ] Add Windows/Powershell time-measuring commands +- [ ] Add CUDA? +- [x] Use better profiling approach to capture timing with 1M samples. +- [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used diff --git a/C/makefile b/C/makefile index f6e9010e..94fcb5f2 100644 --- a/C/makefile +++ b/C/makefile @@ -1,53 +1,82 @@ -# Interface: -# make -# make build -# make format -# make run - -# Compiler -CC=gcc -# CC=tcc # <= faster compilation - -# Main file -SRC=samples.c -OUTPUT=samples - -## Dependencies -DEPS='gsl' - -## Flags -INCS=`pkg-config --cflags ${DEPS}` -LIBS=`pkg-config --libs ${DEPS}` -DEBUG= #'-g' -STANDARD=-std=c99 -WARNINGS=-Wall -FAST=-Ofast -## Formatter -STYLE_BLUEPRINT=webkit -FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) - -## make build -build: $(SRC) - $(CC) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS) - -fast: $(SRC) - $(CC) $(FAST) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS) - -format: $(SRC) - $(FORMATTER) $(SRC) - -run: $(SRC) $(OUTPUT) - echo "Increasing stack size limit, because we are dealing with 1M samples" - # ulimit: increase stack size limit - # -Ss: the soft limit. If you set the hard limit, you then can't raise it - # 256000: around 250Mbs, if I'm reading it correctly. - # Then run the program - ulimit -Ss 256000 && ./$(OUTPUT) - - - -# Old: -# Link libraries, for good measure -# LD_LIBRARY_PATH=/usr/local/lib -# export LD_LIBRARY_PATH - +# Interface: +# make +# make build +# make format +# make run + +# Compiler +CC=gcc +# CC=tcc # <= faster compilation + +# Main file +SRC=samples.c +OUTPUT=out/samples + +SRC_ONE_THREAD=./samples-one-thread.c +OUTPUT_ONE_THREAD=out/samples-one-thread + +## Dependencies +# Has no dependencies +MATH=-lm + +## Flags +DEBUG= #'-g' +STANDARD=-std=c99 +WARNINGS=-Wall +OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version +OPENMP=-fopenmp + +## Formatter +STYLE_BLUEPRINT=webkit +FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) + +## make build +build: $(SRC) + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD) + +format: $(SRC) + $(FORMATTER) $(SRC) + +run: $(SRC) $(OUTPUT) + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + ./$(OUTPUT_ONE_THREAD) + +multi: + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + OMP_NUM_THREADS=2 ./$(OUTPUT) && echo + OMP_NUM_THREADS=4 ./$(OUTPUT) && echo + OMP_NUM_THREADS=8 ./$(OUTPUT) && echo + OMP_NUM_THREADS=16 ./$(OUTPUT) && echo + ./$(OUTPUT_ONE_THREAD) && echo + +time-linux: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo + +time-linux-simple: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + /bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo + +debian-install-dependencies: + sudo apt-get install libomp-dev + diff --git a/C-optimized/out/samples b/C/out/samples similarity index 100% rename from C-optimized/out/samples rename to C/out/samples diff --git a/C-optimized/out/samples-one-thread b/C/out/samples-one-thread similarity index 100% rename from C-optimized/out/samples-one-thread rename to C/out/samples-one-thread diff --git a/C-optimized/out/samples.exe b/C/out/samples.exe similarity index 100% rename from C-optimized/out/samples.exe rename to C/out/samples.exe diff --git a/C-optimized/out/samples.txt b/C/out/samples.txt similarity index 100% rename from C-optimized/out/samples.txt rename to C/out/samples.txt diff --git a/C/samples.c b/C/samples.c index b18ec29b..6b779a6a 100644 --- a/C/samples.c +++ b/C/samples.c @@ -1,157 +1,281 @@ -#include -#include -#include -#include -#include -#include - -#define N 1000000 -/* - * For very high values of N, you will want to increase the maximum stack trace, otherwise you will suffer a segmentation fault - * In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs - * And confirm it with $ ulimit -a -*/ - -/* Helpers */ -void print(double* ys) -{ - for (int i = 0; i < N; i++) { - printf("%f\n", ys[i]); - } - printf("\n"); -} - -void fill(double* ys, float f) -{ - for (int i = 0; i < N; i++) { - ys[i] = f; - } -} - -double sum(double* ps, int n) -{ - double result = 0; - for (int i = 0; i < n; i++) { - result += ps[i]; - } - return (result); -} - -void cumsum(double* ps, double* rs, int n) -{ - double counter = 0; - for (int i = 0; i < n; i++) { - counter += ps[i]; - rs[i] = counter; - } -} - -/* Distributions*/ -void normal(gsl_rng* r, double* ys, double mean, double std) -{ - for (int i = 0; i < N; i++) { - ys[i] = mean + gsl_ran_gaussian(r, std); - } -} - -void lognormal(gsl_rng* r, double* ys, double zeta, double sigma) -{ - for (int i = 0; i < N; i++) { - ys[i] = gsl_ran_lognormal(r, zeta, sigma); - } -} - -void to(gsl_rng* r, double* ys, double low, double high) -{ - double normal95confidencePoint = 1.6448536269514722; - double log_low = log(low); - double log_high = log(high); - double zeta = (log_low + log_high) / 2; - double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint); - lognormal(r, ys, zeta, sigma); -} - -/* Mixture of distributions */ -void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* results) -{ - /* Get cummulative, normalized weights */ - double sum_weights = sum(weights, n); - double normalized_weights[n]; - for (int i = 0; i < n; i++) { - normalized_weights[i] = weights[i] / sum_weights; - } - double cummulative_weights[n]; - cumsum(normalized_weights, cummulative_weights, n); - - /* Get N samples, drawn from the different distributions in proportion to their weights. */ - for (int i = 0; i < N; i++) { - double p_1 = gsl_rng_uniform(r); - double p_2 = gsl_rng_uniform(r); - - int index_found = 0; - int index_counter = 0; - while ((index_found == 0) && (index_counter < n)) { - if (p_1 < cummulative_weights[index_counter]) { - index_found = 1; - } else { - index_counter++; - } - } - if (index_found == 0) { - printf("\nThis shouldn't be able to happen"); - // gsl_rng_free (r); - // abort(); // this shouldn't have happened. - - } else { - int sample_index = (int)floor(p_2 * N); - results[i] = dists[index_counter][sample_index]; - } - } -} - -/* Main */ -int main(void) -{ - // Start clock - clock_t start, end; - start = clock(); - - /* Initialize GNU Statistical Library (GSL) stuff */ - const gsl_rng_type* T; - gsl_rng* r; - // gsl_rng_env_setup(); - T = gsl_rng_default; - r = gsl_rng_alloc(T); - - /* Toy example */ - /* Declare variables in play */ - double p_a, p_b, p_c; - double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N]; - - /* Initialize variables */ - p_a = 0.8; - p_b = 0.5; - p_c = p_a * p_b; - - fill(dist_none, 0); - fill(dist_one, 1); - to(r, dist_few, 1, 3); - to(r, dist_many, 2, 10); - - /* Generate mixture */ - int n = 4; - double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 }; - double* dists[] = { dist_none, dist_one, dist_few, dist_many }; - - mixture(r, dists, weights, n, dist_mixture); - printf("%f\n", sum(dist_mixture, N) / N); - - /* Clean up GSL */ - gsl_rng_free(r); - - // End clock - end = clock(); - printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000); - /* Return success*/ - return EXIT_SUCCESS; -} +#include +#include +#include +#include +#include + +const float PI = 3.14159265358979323846; + +#define N 1000000 + +//Array helpers + +void array_print(float* array, int length) +{ + for (int i = 0; i < length; i++) { + printf("item[%d] = %f\n", i, array[i]); + } + printf("\n"); +} + +void array_fill(float* array, int length, float item) +{ + int i; + #pragma omp private(i) + { + #pragma omp for + for (i = 0; i < length; i++) { + array[i] = item; + } + } +} + +float array_sum(float* array, int length) +{ + float output = 0.0; + for (int i = 0; i < length; i++) { + output += array[i]; + } + return output; +} + +void array_cumsum(float* array_to_sum, float* array_cumsummed, int length) +{ + array_cumsummed[0] = array_to_sum[0]; + for (int i = 1; i < length; i++) { + array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i]; + } +} + +float rand_float(float to, unsigned int* seed) +{ + return ((float)rand_r(seed) / (float)RAND_MAX) * to; + // See: + // rand() is not thread-safe, as it relies on (shared) hidden state. +} + +float ur_normal(unsigned int* seed) +{ + float u1 = rand_float(1.0, seed); + float u2 = rand_float(1.0, seed); + float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2); + return z; +} + +inline float random_uniform(float from, float to, unsigned int* seed) +{ + return ((float) rand_r(seed) / (float)RAND_MAX) * (to - from) + from; +} + +inline float random_normal(float mean, float sigma, unsigned int* seed) +{ + return (mean + sigma * ur_normal(seed)); +} + +inline float random_lognormal(float logmean, float logsigma, unsigned int* seed) +{ + return expf(random_normal(logmean, logsigma, seed)); +} + +inline float random_to(float low, float high, unsigned int* seed) +{ + const float NORMAL95CONFIDENCE = 1.6448536269514722; + float loglow = logf(low); + float loghigh = logf(high); + float logmean = (loglow + loghigh) / 2; + float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE); + return random_lognormal(logmean, logsigma, seed); +} + +int split_array_get_my_length(int index, int total_length, int n_threads) +{ + return (total_length % n_threads > index ? total_length / n_threads + 1 : total_length / n_threads); +} + +//Old version, don't use it!! Optimized version is called mixture_f. This one is just for display +/* +void mixture(float* dists[], float* weights, int n_dists, float* results) +{ + float sum_weights = array_sum(weights, n_dists); + float* normalized_weights = malloc(n_dists * sizeof(float)); + for (int i = 0; i < n_dists; i++) { + normalized_weights[i] = weights[i] / sum_weights; + } + + float* cummulative_weights = malloc(n_dists * sizeof(float)); + array_cumsum(normalized_weights, cummulative_weights, n_dists); + + //create var holders + float p1, p2; + int index_found, index_counter, sample_index, i; + + #pragma omp parallel private(i, p1, p2, index_found, index_counter, sample_index) + { + #pragma omp for + for (i = 0; i < N; i++) { + p1 = random_uniform(0, 1); + p2 = random_uniform(0, 1); + + index_found = 0; + index_counter = 0; + + while ((index_found == 0) && (index_counter < n_dists)) { + if (p1 < cummulative_weights[index_counter]) { + index_found = 1; + } else { + index_counter++; + } + } + if (index_found != 0) { + sample_index = (int)(p2 * N); + results[i] = dists[index_counter][sample_index]; + } else + printf("This shouldn't be able to happen.\n"); + } + } + free(normalized_weights); + free(cummulative_weights); +} +*/ +void mixture_f(float (*samplers[])(unsigned int* ), float* weights, int n_dists, float** results, int n_threads) +{ + float sum_weights = array_sum(weights, n_dists); + float* normalized_weights = malloc(n_dists * sizeof(float)); + for (int i = 0; i < n_dists; i++) { + normalized_weights[i] = weights[i] / sum_weights; + } + + float* cummulative_weights = malloc(n_dists * sizeof(float)); + array_cumsum(normalized_weights, cummulative_weights, n_dists); + + //create var holders + float p1; + int sample_index, i, own_length; + unsigned int* seeds[n_threads]; + for(unsigned int i=0; i