diff --git a/C-optimized/README.md b/C-optimized/README.md deleted file mode 100644 index 222436d0..00000000 --- a/C-optimized/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# C-Optimized - -An optimized version of the original C implementation. - -The main changes are: - -- an optimization of the mixture function (it passes the functions instead of the whole arrays, reducing in great measure the memory usage and the computation time) and -- the implementation of multi-threading with OpenMP. - -## Performance - -The mean time of execution is 6 ms. With the following distribution: - -![Time histogram](https://i.imgur.com/6iT2PkF.png) - -The hardware used has been an AMD 5800x3D and 16GB of DDR4-3200 MHz. - -Also, the time data has been collected by executing the interior of the main() function 1000 times in a for loop, not executing the program itself 1000 times. - -## Multithreading - -Take into account that the multi-threading introduces a bit of dispersion in the execution time due to the creation and destruction of threads. - -In Nuño's machine, multithreading actually introduces a noticeable slowdown factor. - -## To do - -- [ ] Use proper profiling tool to capture timing with 1M samples. -- [ ] Update above with correct timing -- [ ] Add Windows/Powershell time-measuring commands -- [ ] Add CUDA? -- [ ] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used diff --git a/C-optimized/makefile b/C-optimized/makefile deleted file mode 100644 index 3d7d4053..00000000 --- a/C-optimized/makefile +++ /dev/null @@ -1,82 +0,0 @@ -# Interface: -# make -# make build -# make format -# make run - -# Compiler -CC=gcc -# CC=tcc # <= faster compilation - -# Main file -SRC=samples.c -OUTPUT=out/samples - -SRC_ONE_THREAD=./samples-one-thread.c -OUTPUT_ONE_THREAD=out/samples-one-thread - -## Dependencies -# Has no dependencies -MATH=-lm - -## Flags -DEBUG= #'-g' -STANDARD=-std=c99 -WARNINGS=-Wall -OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version -OPENMP=-fopenmp - -## Formatter -STYLE_BLUEPRINT=webkit -FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) - -## make build -build: $(SRC) - $(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) - $(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD) - -format: $(SRC) - $(FORMATTER) $(SRC) - -run: $(SRC) $(OUTPUT) - OMP_NUM_THREADS=1 ./$(OUTPUT) && echo - ./$(OUTPUT_ONE_THREAD) - -multi: - OMP_NUM_THREADS=1 ./$(OUTPUT) && echo - OMP_NUM_THREADS=2 ./$(OUTPUT) && echo - OMP_NUM_THREADS=4 ./$(OUTPUT) && echo - OMP_NUM_THREADS=8 ./$(OUTPUT) && echo - OMP_NUM_THREADS=16 ./$(OUTPUT) && echo - ./$(OUTPUT_ONE_THREAD) && echo - -time-linux-simple: - @echo "Requires /bin/time, found on GNU/Linux systems" && echo - OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo - /bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo - -time-linux: - @echo "Requires /bin/time, found on GNU/Linux systems" && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo - - @echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)" - @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo - -debian-install-dependencies: - sudo apt-get install libomp-dev - diff --git a/C-optimized/samples.c b/C-optimized/samples.c deleted file mode 100644 index 6b779a6a..00000000 --- a/C-optimized/samples.c +++ /dev/null @@ -1,281 +0,0 @@ -#include -#include -#include -#include -#include - -const float PI = 3.14159265358979323846; - -#define N 1000000 - -//Array helpers - -void array_print(float* array, int length) -{ - for (int i = 0; i < length; i++) { - printf("item[%d] = %f\n", i, array[i]); - } - printf("\n"); -} - -void array_fill(float* array, int length, float item) -{ - int i; - #pragma omp private(i) - { - #pragma omp for - for (i = 0; i < length; i++) { - array[i] = item; - } - } -} - -float array_sum(float* array, int length) -{ - float output = 0.0; - for (int i = 0; i < length; i++) { - output += array[i]; - } - return output; -} - -void array_cumsum(float* array_to_sum, float* array_cumsummed, int length) -{ - array_cumsummed[0] = array_to_sum[0]; - for (int i = 1; i < length; i++) { - array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i]; - } -} - -float rand_float(float to, unsigned int* seed) -{ - return ((float)rand_r(seed) / (float)RAND_MAX) * to; - // See: - // rand() is not thread-safe, as it relies on (shared) hidden state. -} - -float ur_normal(unsigned int* seed) -{ - float u1 = rand_float(1.0, seed); - float u2 = rand_float(1.0, seed); - float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2); - return z; -} - -inline float random_uniform(float from, float to, unsigned int* seed) -{ - return ((float) rand_r(seed) / (float)RAND_MAX) * (to - from) + from; -} - -inline float random_normal(float mean, float sigma, unsigned int* seed) -{ - return (mean + sigma * ur_normal(seed)); -} - -inline float random_lognormal(float logmean, float logsigma, unsigned int* seed) -{ - return expf(random_normal(logmean, logsigma, seed)); -} - -inline float random_to(float low, float high, unsigned int* seed) -{ - const float NORMAL95CONFIDENCE = 1.6448536269514722; - float loglow = logf(low); - float loghigh = logf(high); - float logmean = (loglow + loghigh) / 2; - float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE); - return random_lognormal(logmean, logsigma, seed); -} - -int split_array_get_my_length(int index, int total_length, int n_threads) -{ - return (total_length % n_threads > index ? total_length / n_threads + 1 : total_length / n_threads); -} - -//Old version, don't use it!! Optimized version is called mixture_f. This one is just for display -/* -void mixture(float* dists[], float* weights, int n_dists, float* results) -{ - float sum_weights = array_sum(weights, n_dists); - float* normalized_weights = malloc(n_dists * sizeof(float)); - for (int i = 0; i < n_dists; i++) { - normalized_weights[i] = weights[i] / sum_weights; - } - - float* cummulative_weights = malloc(n_dists * sizeof(float)); - array_cumsum(normalized_weights, cummulative_weights, n_dists); - - //create var holders - float p1, p2; - int index_found, index_counter, sample_index, i; - - #pragma omp parallel private(i, p1, p2, index_found, index_counter, sample_index) - { - #pragma omp for - for (i = 0; i < N; i++) { - p1 = random_uniform(0, 1); - p2 = random_uniform(0, 1); - - index_found = 0; - index_counter = 0; - - while ((index_found == 0) && (index_counter < n_dists)) { - if (p1 < cummulative_weights[index_counter]) { - index_found = 1; - } else { - index_counter++; - } - } - if (index_found != 0) { - sample_index = (int)(p2 * N); - results[i] = dists[index_counter][sample_index]; - } else - printf("This shouldn't be able to happen.\n"); - } - } - free(normalized_weights); - free(cummulative_weights); -} -*/ -void mixture_f(float (*samplers[])(unsigned int* ), float* weights, int n_dists, float** results, int n_threads) -{ - float sum_weights = array_sum(weights, n_dists); - float* normalized_weights = malloc(n_dists * sizeof(float)); - for (int i = 0; i < n_dists; i++) { - normalized_weights[i] = weights[i] / sum_weights; - } - - float* cummulative_weights = malloc(n_dists * sizeof(float)); - array_cumsum(normalized_weights, cummulative_weights, n_dists); - - //create var holders - float p1; - int sample_index, i, own_length; - unsigned int* seeds[n_threads]; - for(unsigned int i=0; i +#include +#include +#include +#include +#include + +#define N 1000000 +/* + * For very high values of N, you will want to increase the maximum stack trace, otherwise you will suffer a segmentation fault + * In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs + * And confirm it with $ ulimit -a +*/ + +/* Helpers */ +void print(double* ys) +{ + for (int i = 0; i < N; i++) { + printf("%f\n", ys[i]); + } + printf("\n"); +} + +void fill(double* ys, float f) +{ + for (int i = 0; i < N; i++) { + ys[i] = f; + } +} + +double sum(double* ps, int n) +{ + double result = 0; + for (int i = 0; i < n; i++) { + result += ps[i]; + } + return (result); +} + +void cumsum(double* ps, double* rs, int n) +{ + double counter = 0; + for (int i = 0; i < n; i++) { + counter += ps[i]; + rs[i] = counter; + } +} + +/* Distributions*/ +void normal(gsl_rng* r, double* ys, double mean, double std) +{ + for (int i = 0; i < N; i++) { + ys[i] = mean + gsl_ran_gaussian(r, std); + } +} + +void lognormal(gsl_rng* r, double* ys, double zeta, double sigma) +{ + for (int i = 0; i < N; i++) { + ys[i] = gsl_ran_lognormal(r, zeta, sigma); + } +} + +void to(gsl_rng* r, double* ys, double low, double high) +{ + double normal95confidencePoint = 1.6448536269514722; + double log_low = log(low); + double log_high = log(high); + double zeta = (log_low + log_high) / 2; + double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint); + lognormal(r, ys, zeta, sigma); +} + +/* Mixture of distributions */ +void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* results) +{ + /* Get cummulative, normalized weights */ + double sum_weights = sum(weights, n); + double normalized_weights[n]; + for (int i = 0; i < n; i++) { + normalized_weights[i] = weights[i] / sum_weights; + } + double cummulative_weights[n]; + cumsum(normalized_weights, cummulative_weights, n); + + /* Get N samples, drawn from the different distributions in proportion to their weights. */ + for (int i = 0; i < N; i++) { + double p_1 = gsl_rng_uniform(r); + double p_2 = gsl_rng_uniform(r); + + int index_found = 0; + int index_counter = 0; + while ((index_found == 0) && (index_counter < n)) { + if (p_1 < cummulative_weights[index_counter]) { + index_found = 1; + } else { + index_counter++; + } + } + if (index_found == 0) { + printf("\nThis shouldn't be able to happen"); + // gsl_rng_free (r); + // abort(); // this shouldn't have happened. + + } else { + int sample_index = (int)floor(p_2 * N); + results[i] = dists[index_counter][sample_index]; + } + } +} + +/* Main */ +int main(void) +{ + // Start clock + clock_t start, end; + start = clock(); + + /* Initialize GNU Statistical Library (GSL) stuff */ + const gsl_rng_type* T; + gsl_rng* r; + // gsl_rng_env_setup(); + T = gsl_rng_default; + r = gsl_rng_alloc(T); + + /* Toy example */ + /* Declare variables in play */ + double p_a, p_b, p_c; + double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N]; + + /* Initialize variables */ + p_a = 0.8; + p_b = 0.5; + p_c = p_a * p_b; + + fill(dist_none, 0); + fill(dist_one, 1); + to(r, dist_few, 1, 3); + to(r, dist_many, 2, 10); + + /* Generate mixture */ + int n = 4; + double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 }; + double* dists[] = { dist_none, dist_one, dist_few, dist_many }; + + mixture(r, dists, weights, n, dist_mixture); + printf("%f\n", sum(dist_mixture, N) / N); + + /* Clean up GSL */ + gsl_rng_free(r); + + // End clock + end = clock(); + printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000); + /* Return success*/ + return EXIT_SUCCESS; +} diff --git a/C/C-02-better-algorithm-one-thread/makefile b/C/C-02-better-algorithm-one-thread/makefile new file mode 100644 index 00000000..a9044735 --- /dev/null +++ b/C/C-02-better-algorithm-one-thread/makefile @@ -0,0 +1,53 @@ +# Interface: +# make +# make build +# make format +# make run + +# Compiler +CC=gcc +# CC=tcc # <= faster compilation + +# Main file + +SRC_ONE_THREAD=./samples-one-thread.c +OUTPUT_ONE_THREAD=out/samples-one-thread + +## Dependencies +# Has no dependencies +MATH=-lm + +## Flags +DEBUG= #'-g' +STANDARD=-std=c99 +WARNINGS=-Wall +OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version +OPENMP=-fopenmp + +## Formatter +STYLE_BLUEPRINT=webkit +FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) + +## make build +build: $(SRC_ONE_THREAD) + mkdir -p out + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD) + +format: $(SRC_ONE_THREAD) + $(FORMATTER) $(SRC_ONE_THREAD) + +run: $(SRC_ONE_THREAD) $(OUTPUT_ONE_THREAD) + ./$(OUTPUT_ONE_THREAD) + +time-linux: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + @echo "Running 100x and taking avg time: $(OUTPUT_ONE_THREAD)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT_ONE_THREAD); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo + +time-linux-simple: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + /bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo + +debian-install-dependencies: + sudo apt-get install libomp-dev + diff --git a/C/C-02-better-algorithm-one-thread/out/samples-one-thread b/C/C-02-better-algorithm-one-thread/out/samples-one-thread new file mode 100755 index 00000000..b7e78ead Binary files /dev/null and b/C/C-02-better-algorithm-one-thread/out/samples-one-thread differ diff --git a/C-optimized/samples-one-thread.c b/C/C-02-better-algorithm-one-thread/samples-one-thread.c similarity index 95% rename from C-optimized/samples-one-thread.c rename to C/C-02-better-algorithm-one-thread/samples-one-thread.c index da6e152c..bac15fea 100644 --- a/C-optimized/samples-one-thread.c +++ b/C/C-02-better-algorithm-one-thread/samples-one-thread.c @@ -5,7 +5,7 @@ const float PI = 3.14159265358979323846; -#define N 10000000 +#define N 1000000 //Array helpers diff --git a/C/README.md b/C/README.md new file mode 100644 index 00000000..23e43fef --- /dev/null +++ b/C/README.md @@ -0,0 +1,15 @@ +# Time to BOTEC in C + +This repository contains a few implementations of a simple botec (back-of-the-envelope) calculation in C: + +- In the folder C-01-simple/, you can see a simple implementation, which passes large arrays +- In the folder C-02-better-algorithm-one-thread/ you can see a better implementations, that passes around pointers to functions, which makes the implementation more efficient +- In the top level, you can see an implementation that uses the better implementation in C-02..., and that also implements multithreading using OpenMP + +## To do + +- [ ] Update repository with correct timing +- [ ] Add Windows/Powershell time-measuring commands +- [ ] Add CUDA? +- [x] Use better profiling approach to capture timing with 1M samples. +- [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used diff --git a/C/makefile b/C/makefile index f6e9010e..94fcb5f2 100644 --- a/C/makefile +++ b/C/makefile @@ -1,53 +1,82 @@ -# Interface: -# make -# make build -# make format -# make run - -# Compiler -CC=gcc -# CC=tcc # <= faster compilation - -# Main file -SRC=samples.c -OUTPUT=samples - -## Dependencies -DEPS='gsl' - -## Flags -INCS=`pkg-config --cflags ${DEPS}` -LIBS=`pkg-config --libs ${DEPS}` -DEBUG= #'-g' -STANDARD=-std=c99 -WARNINGS=-Wall -FAST=-Ofast -## Formatter -STYLE_BLUEPRINT=webkit -FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) - -## make build -build: $(SRC) - $(CC) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS) - -fast: $(SRC) - $(CC) $(FAST) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS) - -format: $(SRC) - $(FORMATTER) $(SRC) - -run: $(SRC) $(OUTPUT) - echo "Increasing stack size limit, because we are dealing with 1M samples" - # ulimit: increase stack size limit - # -Ss: the soft limit. If you set the hard limit, you then can't raise it - # 256000: around 250Mbs, if I'm reading it correctly. - # Then run the program - ulimit -Ss 256000 && ./$(OUTPUT) - - - -# Old: -# Link libraries, for good measure -# LD_LIBRARY_PATH=/usr/local/lib -# export LD_LIBRARY_PATH - +# Interface: +# make +# make build +# make format +# make run + +# Compiler +CC=gcc +# CC=tcc # <= faster compilation + +# Main file +SRC=samples.c +OUTPUT=out/samples + +SRC_ONE_THREAD=./samples-one-thread.c +OUTPUT_ONE_THREAD=out/samples-one-thread + +## Dependencies +# Has no dependencies +MATH=-lm + +## Flags +DEBUG= #'-g' +STANDARD=-std=c99 +WARNINGS=-Wall +OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version +OPENMP=-fopenmp + +## Formatter +STYLE_BLUEPRINT=webkit +FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) + +## make build +build: $(SRC) + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD) + +format: $(SRC) + $(FORMATTER) $(SRC) + +run: $(SRC) $(OUTPUT) + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + ./$(OUTPUT_ONE_THREAD) + +multi: + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + OMP_NUM_THREADS=2 ./$(OUTPUT) && echo + OMP_NUM_THREADS=4 ./$(OUTPUT) && echo + OMP_NUM_THREADS=8 ./$(OUTPUT) && echo + OMP_NUM_THREADS=16 ./$(OUTPUT) && echo + ./$(OUTPUT_ONE_THREAD) && echo + +time-linux: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo + + @echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo + +time-linux-simple: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + /bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo + +debian-install-dependencies: + sudo apt-get install libomp-dev + diff --git a/C-optimized/out/samples b/C/out/samples similarity index 100% rename from C-optimized/out/samples rename to C/out/samples diff --git a/C-optimized/out/samples-one-thread b/C/out/samples-one-thread similarity index 100% rename from C-optimized/out/samples-one-thread rename to C/out/samples-one-thread diff --git a/C-optimized/out/samples.exe b/C/out/samples.exe similarity index 100% rename from C-optimized/out/samples.exe rename to C/out/samples.exe diff --git a/C-optimized/out/samples.txt b/C/out/samples.txt similarity index 100% rename from C-optimized/out/samples.txt rename to C/out/samples.txt diff --git a/C/samples.c b/C/samples.c index b18ec29b..6b779a6a 100644 --- a/C/samples.c +++ b/C/samples.c @@ -1,157 +1,281 @@ -#include -#include -#include -#include -#include -#include - -#define N 1000000 -/* - * For very high values of N, you will want to increase the maximum stack trace, otherwise you will suffer a segmentation fault - * In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs - * And confirm it with $ ulimit -a -*/ - -/* Helpers */ -void print(double* ys) -{ - for (int i = 0; i < N; i++) { - printf("%f\n", ys[i]); - } - printf("\n"); -} - -void fill(double* ys, float f) -{ - for (int i = 0; i < N; i++) { - ys[i] = f; - } -} - -double sum(double* ps, int n) -{ - double result = 0; - for (int i = 0; i < n; i++) { - result += ps[i]; - } - return (result); -} - -void cumsum(double* ps, double* rs, int n) -{ - double counter = 0; - for (int i = 0; i < n; i++) { - counter += ps[i]; - rs[i] = counter; - } -} - -/* Distributions*/ -void normal(gsl_rng* r, double* ys, double mean, double std) -{ - for (int i = 0; i < N; i++) { - ys[i] = mean + gsl_ran_gaussian(r, std); - } -} - -void lognormal(gsl_rng* r, double* ys, double zeta, double sigma) -{ - for (int i = 0; i < N; i++) { - ys[i] = gsl_ran_lognormal(r, zeta, sigma); - } -} - -void to(gsl_rng* r, double* ys, double low, double high) -{ - double normal95confidencePoint = 1.6448536269514722; - double log_low = log(low); - double log_high = log(high); - double zeta = (log_low + log_high) / 2; - double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint); - lognormal(r, ys, zeta, sigma); -} - -/* Mixture of distributions */ -void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* results) -{ - /* Get cummulative, normalized weights */ - double sum_weights = sum(weights, n); - double normalized_weights[n]; - for (int i = 0; i < n; i++) { - normalized_weights[i] = weights[i] / sum_weights; - } - double cummulative_weights[n]; - cumsum(normalized_weights, cummulative_weights, n); - - /* Get N samples, drawn from the different distributions in proportion to their weights. */ - for (int i = 0; i < N; i++) { - double p_1 = gsl_rng_uniform(r); - double p_2 = gsl_rng_uniform(r); - - int index_found = 0; - int index_counter = 0; - while ((index_found == 0) && (index_counter < n)) { - if (p_1 < cummulative_weights[index_counter]) { - index_found = 1; - } else { - index_counter++; - } - } - if (index_found == 0) { - printf("\nThis shouldn't be able to happen"); - // gsl_rng_free (r); - // abort(); // this shouldn't have happened. - - } else { - int sample_index = (int)floor(p_2 * N); - results[i] = dists[index_counter][sample_index]; - } - } -} - -/* Main */ -int main(void) -{ - // Start clock - clock_t start, end; - start = clock(); - - /* Initialize GNU Statistical Library (GSL) stuff */ - const gsl_rng_type* T; - gsl_rng* r; - // gsl_rng_env_setup(); - T = gsl_rng_default; - r = gsl_rng_alloc(T); - - /* Toy example */ - /* Declare variables in play */ - double p_a, p_b, p_c; - double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N]; - - /* Initialize variables */ - p_a = 0.8; - p_b = 0.5; - p_c = p_a * p_b; - - fill(dist_none, 0); - fill(dist_one, 1); - to(r, dist_few, 1, 3); - to(r, dist_many, 2, 10); - - /* Generate mixture */ - int n = 4; - double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 }; - double* dists[] = { dist_none, dist_one, dist_few, dist_many }; - - mixture(r, dists, weights, n, dist_mixture); - printf("%f\n", sum(dist_mixture, N) / N); - - /* Clean up GSL */ - gsl_rng_free(r); - - // End clock - end = clock(); - printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000); - /* Return success*/ - return EXIT_SUCCESS; -} +#include +#include +#include +#include +#include + +const float PI = 3.14159265358979323846; + +#define N 1000000 + +//Array helpers + +void array_print(float* array, int length) +{ + for (int i = 0; i < length; i++) { + printf("item[%d] = %f\n", i, array[i]); + } + printf("\n"); +} + +void array_fill(float* array, int length, float item) +{ + int i; + #pragma omp private(i) + { + #pragma omp for + for (i = 0; i < length; i++) { + array[i] = item; + } + } +} + +float array_sum(float* array, int length) +{ + float output = 0.0; + for (int i = 0; i < length; i++) { + output += array[i]; + } + return output; +} + +void array_cumsum(float* array_to_sum, float* array_cumsummed, int length) +{ + array_cumsummed[0] = array_to_sum[0]; + for (int i = 1; i < length; i++) { + array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i]; + } +} + +float rand_float(float to, unsigned int* seed) +{ + return ((float)rand_r(seed) / (float)RAND_MAX) * to; + // See: + // rand() is not thread-safe, as it relies on (shared) hidden state. +} + +float ur_normal(unsigned int* seed) +{ + float u1 = rand_float(1.0, seed); + float u2 = rand_float(1.0, seed); + float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2); + return z; +} + +inline float random_uniform(float from, float to, unsigned int* seed) +{ + return ((float) rand_r(seed) / (float)RAND_MAX) * (to - from) + from; +} + +inline float random_normal(float mean, float sigma, unsigned int* seed) +{ + return (mean + sigma * ur_normal(seed)); +} + +inline float random_lognormal(float logmean, float logsigma, unsigned int* seed) +{ + return expf(random_normal(logmean, logsigma, seed)); +} + +inline float random_to(float low, float high, unsigned int* seed) +{ + const float NORMAL95CONFIDENCE = 1.6448536269514722; + float loglow = logf(low); + float loghigh = logf(high); + float logmean = (loglow + loghigh) / 2; + float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE); + return random_lognormal(logmean, logsigma, seed); +} + +int split_array_get_my_length(int index, int total_length, int n_threads) +{ + return (total_length % n_threads > index ? total_length / n_threads + 1 : total_length / n_threads); +} + +//Old version, don't use it!! Optimized version is called mixture_f. This one is just for display +/* +void mixture(float* dists[], float* weights, int n_dists, float* results) +{ + float sum_weights = array_sum(weights, n_dists); + float* normalized_weights = malloc(n_dists * sizeof(float)); + for (int i = 0; i < n_dists; i++) { + normalized_weights[i] = weights[i] / sum_weights; + } + + float* cummulative_weights = malloc(n_dists * sizeof(float)); + array_cumsum(normalized_weights, cummulative_weights, n_dists); + + //create var holders + float p1, p2; + int index_found, index_counter, sample_index, i; + + #pragma omp parallel private(i, p1, p2, index_found, index_counter, sample_index) + { + #pragma omp for + for (i = 0; i < N; i++) { + p1 = random_uniform(0, 1); + p2 = random_uniform(0, 1); + + index_found = 0; + index_counter = 0; + + while ((index_found == 0) && (index_counter < n_dists)) { + if (p1 < cummulative_weights[index_counter]) { + index_found = 1; + } else { + index_counter++; + } + } + if (index_found != 0) { + sample_index = (int)(p2 * N); + results[i] = dists[index_counter][sample_index]; + } else + printf("This shouldn't be able to happen.\n"); + } + } + free(normalized_weights); + free(cummulative_weights); +} +*/ +void mixture_f(float (*samplers[])(unsigned int* ), float* weights, int n_dists, float** results, int n_threads) +{ + float sum_weights = array_sum(weights, n_dists); + float* normalized_weights = malloc(n_dists * sizeof(float)); + for (int i = 0; i < n_dists; i++) { + normalized_weights[i] = weights[i] / sum_weights; + } + + float* cummulative_weights = malloc(n_dists * sizeof(float)); + array_cumsum(normalized_weights, cummulative_weights, n_dists); + + //create var holders + float p1; + int sample_index, i, own_length; + unsigned int* seeds[n_threads]; + for(unsigned int i=0; i