Merge pull request #1 from JJSierraM/JJSierraM-C-optimized

Added C-optimized version of the code
2023-05-29 23:19:18 +02:00 · 2023-05-29 23:19:18 +02:00 · 7a905ae16e
commit 7a905ae16e
parent c0e6b0677a d62fae0c04
5 changed files with 1825 additions and 0 deletions
--- a/C-optimized/README.md
+++ b/C-optimized/README.md
@ -0,0 +1,10 @@
+# C-Optimized
+An optimized version of the original C implementation. <br>
+The main changes are an optimization of the mixture function (it passes the functions instead of the whole arrays, reducing in great measure the memory usage and the computation time) and the implementation of multi-threading with OpenMP. <br>
+
+The mean time of execution is 6 ms. With the following distribution:
+[![Time histogram](https://imgur.com/a/AzjP7P8)]
+Take into account that the multi-threading introduces a bit of dispersion in the execution time due to the creation and destruction of threads. 
+Also, the time data has been collected by executing the interior of the main() function 1000 times in a for loop, not executing the program itself 1000 times. <br>
+
+The hardware used has been an AMD 5800x3D and 16GB of DDR4-3200 MHz.
--- a/C-optimized/makefile
+++ b/C-optimized/makefile
@ -0,0 +1,52 @@
+# Interface: 
+#   make
+#   make build
+#   make format
+#   make run
+
+# Compiler
+CC=gcc
+# CC=tcc # <= faster compilation
+
+# Main file
+SRC=samples.c
+OUTPUT=samples
+
+## Dependencies
+#  Has no dependencies
+
+## Flags
+DEBUG= #'-g'
+STANDARD=-std=c99
+WARNINGS=-Wall
+OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
+OPENMP=-fopenmp
+
+## Formatter
+STYLE_BLUEPRINT=webkit
+FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
+
+## make build
+build: $(SRC)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(OPENMP) $(SRC) -o $(OUTPUT)
+	
+#fast: Has been removed, compilation of "build" is very fast and it outputs optimized code by default
+
+format: $(SRC)
+	$(FORMATTER) $(SRC)
+
+run: $(SRC) $(OUTPUT)
+#	echo "Increasing stack size limit, because we are dealing with 1M samples"
+#	# ulimit: increase stack size limit
+#	# -Ss: the soft limit. If you set the hard limit, you then can't raise it
+#	# 256000: around 250Mbs, if I'm reading it correctly.
+#	# Then run the program
+#	ulimit -Ss 256000 && ./$(OUTPUT)
+
+
+
+# Old:
+# Link libraries, for good measure
+# LD_LIBRARY_PATH=/usr/local/lib
+# export LD_LIBRARY_PATH
+
--- a/C-optimized/samples.c
+++ b/C-optimized/samples.c
@ -0,0 +1,268 @@
+#include <math.h>
+#include <stdlib.h>
+#include <time.h>
+#include <stdio.h>
+#include <omp.h>
+
+const float PI = 3.14159265358979323846;
+
+#define N 1000000
+
+//Array helpers
+
+void array_print(float* array, int length) {
+    for (int i = 0; i < length; i++)
+    {
+        printf("item[%d] = %f\n", i, array[i]);
+    }
+    printf("\n");
+}
+
+void array_fill(float* array, int length, float item) {
+    int i;
+    #pragma omp private (i)
+    {
+        #pragma omp for
+        for (i = 0; i < length; i++)
+        {
+            array[i] = item;
+        }
+    }    
+}
+
+float array_sum(float* array, int length) {
+    float output = 0.0;
+    for (int i = 0; i < length; i++)
+    {
+        output += array[i];
+    }
+    return output;
+}
+
+void array_cumsum(float* array_to_sum, float* array_cumsummed, int length) {
+    array_cumsummed[0] = array_to_sum[0];
+    for (int i = 1; i < length; i++)
+    {
+        array_cumsummed[i] = array_cumsummed[i-1] + array_to_sum[i]; 
+    }
+}
+ 
+float rand_float(float to) {
+    return ((float)rand()/(float)RAND_MAX) * to;
+}
+
+float ur_normal() {
+    float u1 = rand_float(1.0);
+    float u2 = rand_float(1.0);
+    float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
+    return z;
+}
+
+inline float random_uniform(float from, float to) {
+    return ((float)rand()/(float)RAND_MAX)*(to-from)+from;
+}
+
+inline float random_normal(float mean, float sigma) {
+    return (mean + sigma * ur_normal());
+}
+
+inline float random_lognormal(float logmean, float logsigma) {
+    return expf(random_normal(logmean, logsigma));
+}
+
+inline float random_to(float low, float high) {
+    const float NORMAL95CONFIDENCE = 1.6448536269514722;
+    float loglow = logf(low);
+    float loghigh = logf(high);
+    float logmean = (loglow + loghigh) / 2;
+    float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
+    return random_lognormal(logmean, logsigma);
+}
+
+void array_random_to(float* array, int length, float low, float high) {
+    int i;
+    #pragma omp private(i) 
+    {
+        #pragma omp for
+        for (i = 0; i < length; i++)
+        {
+            array[i] = random_to(low, high);
+        }
+    }
+}
+
+int split_array_get_my_length(int index, int total_length, int n_threads) {
+    return (total_length%n_threads > index ? total_length/n_threads+1 : total_length/n_threads);
+}
+
+//Old version, don't use it!! Optimized version is called mixture_f. This one is just for display
+void mixture(float* dists[], float* weights, int n_dists, float* results) {
+    float sum_weights = array_sum(weights, n_dists);
+    float* normalized_weights = malloc(n_dists * sizeof(float));
+    for (int i = 0; i < n_dists; i++) {normalized_weights[i] = weights[i] / sum_weights;}
+    
+    float* cummulative_weights = malloc(n_dists * sizeof(float));
+    array_cumsum(normalized_weights, cummulative_weights, n_dists);
+
+    //create var holders
+    float p1, p2;
+    int index_found, index_counter, sample_index, i;
+
+    #pragma omp parallel private (i, p1, p2, index_found, index_counter, sample_index) 
+    {
+        #pragma omp for
+        for (i = 0; i < N; i++)
+        {
+            p1 = random_uniform(0, 1);
+            p2 = random_uniform(0, 1);
+
+            index_found = 0;
+            index_counter = 0;
+
+            while ((index_found == 0) && (index_counter < n_dists))
+            {
+                if (p1 < cummulative_weights[index_counter])
+                {
+                    index_found = 1;
+                } else
+                {
+                    index_counter++;
+                }
+            }
+            if (index_found != 0)
+            {
+                sample_index = (int) (p2 * N);
+                results[i] = dists[index_counter][sample_index];
+            } 
+            else printf("This shouldn't be able to happen.\n"); 
+        }
+    }
+    free(normalized_weights);
+    free(cummulative_weights);
+}
+
+void mixture_f(float (*samplers[])(void), float* weights, int n_dists, float** results, int n_threads) {
+    float sum_weights = array_sum(weights, n_dists);
+    float* normalized_weights = malloc(n_dists * sizeof(float));
+    for (int i = 0; i < n_dists; i++) {normalized_weights[i] = weights[i] / sum_weights;}
+    
+    float* cummulative_weights = malloc(n_dists * sizeof(float));
+    array_cumsum(normalized_weights, cummulative_weights, n_dists);
+
+    //create var holders
+    float p1;
+    int sample_index, i, own_length;
+
+    #pragma omp parallel private (i, p1, sample_index, own_length) 
+    {
+        #pragma omp for
+        for (i = 0; i < n_threads; i++)
+        {
+            own_length = split_array_get_my_length(i, N, n_threads);
+            for (int j = 0; j < own_length; j++)
+            {
+                p1 = random_uniform(0, 1);
+                for (int k = 0; k < n_dists; k++)
+                {
+                    if (p1 < cummulative_weights[k])
+                    {
+                        results[i][j] = samplers[k]();
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    free(normalized_weights);
+    free(cummulative_weights);
+}
+
+float sample_0() {
+    return 0;
+}
+
+float sample_1() {
+    return 1;
+}
+
+float sample_few() {
+    return random_to(1, 3);
+}
+
+float sample_many() {
+    return random_to(2, 10);
+}
+
+void split_array_allocate(float** meta_array, int length, int divide_into) {
+    int own_length;
+
+    for (int i = 0; i < divide_into; i++)
+    {
+        own_length = split_array_get_my_length(i, length, divide_into);
+        meta_array[i] = malloc(own_length*sizeof(float));
+    }
+    
+}
+
+void split_array_free(float** meta_array, int divided_into) {
+    for (int i = 0; i < divided_into; i++)
+    {
+        free(meta_array[i]);
+    }
+    free(meta_array);
+}
+
+float split_array_sum(float** meta_array, int length, int divided_into) {
+    int i;
+    float output;
+    float* partial_sum = malloc(divided_into*sizeof(float));
+
+    #pragma omp private(i) shared(partial_sum)
+    for (int i = 0; i < divided_into; i++)
+    {
+        float own_partial_sum = 0;
+        int own_length = split_array_get_my_length(i, length, divided_into);
+        for (int j = 0; j < own_length; j++)
+        {
+            own_partial_sum += meta_array[i][j];
+        }
+        partial_sum[i] = own_partial_sum;
+    }
+    for (int i = 0; i < divided_into; i++)
+    {
+        output += partial_sum[i];
+    }
+    return output;
+}
+
+int main() {
+    clock_t start, end;
+    start = clock();
+    //initialize randomness
+    srand(time(NULL));
+    //Toy example
+    // Declare variables in play
+    float p_a, p_b, p_c;
+    int n_threads = omp_get_max_threads();
+    float** dist_mixture = malloc(n_threads*sizeof(float*));
+    split_array_allocate(dist_mixture, N, n_threads);
+
+    // Initialize variables
+    p_a = 0.8;
+    p_b = 0.5;
+    p_c = p_a * p_b;
+
+    // Generate mixture
+    int n_dists = 4;
+    float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
+    float (*samplers[])(void) = {sample_0, sample_1, sample_few, sample_many};
+
+    mixture_f(samplers, weights, n_dists, dist_mixture, n_threads);
+    // printf("Sum(dist_mixture, N)/N = %f\n", split_array_sum(dist_mixture, N, n_threads) / N);
+
+    end = clock();
+    split_array_free(dist_mixture, n_threads);
+
+    printf("Total time (ms): %f\n", ((double) (end-start)) / CLOCKS_PER_SEC * 1000);
+    return 0;
+}
--- a/C-optimized/samples.exe
+++ b/C-optimized/samples.exe
--- a/C-optimized/samples.txt
+++ b/C-optimized/samples.txt