reorg, get parsimonious paralellism working, to go into squiggle.c
This commit is contained in:
		
							parent
							
								
									9a56a63c61
								
							
						
					
					
						commit
						a48b15f171
					
				
							
								
								
									
										107
									
								
								C/alt/05-refactor-still-split-array/makefile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										107
									
								
								C/alt/05-refactor-still-split-array/makefile
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,107 @@ | |||
| # Interface: 
 | ||||
| #   make
 | ||||
| #   make build
 | ||||
| #   make format
 | ||||
| #   make run
 | ||||
| 
 | ||||
| # Compiler
 | ||||
| CC=gcc | ||||
| # CC=tcc # <= faster compilation
 | ||||
| 
 | ||||
| # Main file
 | ||||
| SRC=samples.c | ||||
| OUTPUT=out/samples | ||||
| 
 | ||||
| SRC_ONE_THREAD=./samples-one-thread.c | ||||
| OUTPUT_ONE_THREAD=out/samples-one-thread | ||||
| 
 | ||||
| ## Dependencies
 | ||||
| #  Has no dependencies
 | ||||
| MATH=-lm | ||||
| 
 | ||||
| ## Flags
 | ||||
| DEBUG= #'-g' | ||||
| STANDARD=-std=c99 | ||||
| WARNINGS=-Wall | ||||
| OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version | ||||
| OPENMP=-fopenmp | ||||
| 
 | ||||
| ## Formatter
 | ||||
| STYLE_BLUEPRINT=webkit | ||||
| FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) | ||||
| 
 | ||||
| ## make build
 | ||||
| build: $(SRC) | ||||
| 	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) | ||||
| 
 | ||||
| static: | ||||
| 	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) | ||||
| 
 | ||||
| format: $(SRC) | ||||
| 	$(FORMATTER) $(SRC) | ||||
| 
 | ||||
| run: $(SRC) $(OUTPUT) | ||||
| 	OMP_NUM_THREADS=1 ./$(OUTPUT) && echo | ||||
| 
 | ||||
| multi: | ||||
| 	OMP_NUM_THREADS=1 ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=2 ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=4 ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=8 ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=16 ./$(OUTPUT) && echo | ||||
| 
 | ||||
| ## Timing
 | ||||
| 
 | ||||
| time-linux:  | ||||
| 	@echo "Requires /bin/time, found on GNU/Linux systems" && echo | ||||
| 	 | ||||
| 	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)" | ||||
| 	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo | ||||
| 	 | ||||
| 	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)" | ||||
| 	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo | ||||
| 	 | ||||
| 	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)" | ||||
| 	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo | ||||
| 	 | ||||
| 	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)" | ||||
| 	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo | ||||
| 	 | ||||
| 	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)" | ||||
| 	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo | ||||
| 
 | ||||
| time-linux-fastest:  | ||||
| 	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=32 $(OUTPUT)" | ||||
| 	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo | ||||
| 
 | ||||
| time-linux-simple: | ||||
| 	@echo "Requires /bin/time, found on GNU/Linux systems" && echo | ||||
| 	OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo | ||||
| 	OMP_NUM_THREADS=32 /bin/time -f "Time: %es" ./$(OUTPUT) && echo | ||||
| 
 | ||||
| ## Profiling
 | ||||
| 
 | ||||
| profile-linux:  | ||||
| 	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar" | ||||
| 	echo "Must be run as sudo" | ||||
| 	$(CC) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT) | ||||
| 	# ./$(OUTPUT) | ||||
| 	# gprof:  | ||||
| 	# gprof $(OUTPUT) gmon.out > analysis.txt | ||||
| 	# rm gmon.out | ||||
| 	# vim analysis.txt | ||||
| 	# rm analysis.txt | ||||
| 	# perf:  | ||||
| 	OMP_NUM_THREADS=16 sudo perf record $(OUTPUT) | ||||
| 	sudo perf report | ||||
| 	rm perf.data | ||||
| 	 | ||||
| 
 | ||||
| ## Install
 | ||||
| debian-install-dependencies: | ||||
| 	sudo apt-get install libomp-dev | ||||
| 
 | ||||
										
											Binary file not shown.
										
									
								
							|  | @ -6,7 +6,7 @@ | |||
| 
 | ||||
| const float PI = 3.14159265358979323846; | ||||
| 
 | ||||
| #define N_SAMPLES 1024 * 1000 | ||||
| #define N_SAMPLES (1024 * 1000 ) | ||||
| 
 | ||||
| //Array helpers
 | ||||
| 
 | ||||
|  | @ -180,29 +180,42 @@ float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint3 | |||
| 
 | ||||
| // Parallization function
 | ||||
| void paralellize(float (*sampler)(uint32_t* seed), float* results, int n_threads, int n_samples){ | ||||
| 
 | ||||
|     if((N_SAMPLES % n_threads) != 0){ | ||||
|         fprintf(stderr, "Number of samples isn't divisible by number of threads, aborting\n"); | ||||
|         exit(1); | ||||
|     } | ||||
|     // int n_samples_per_thread = N_SAMPLES / n_thread;
 | ||||
|     int sample_index, i, split_array_length; | ||||
|     int n_samples_per_thread = N_SAMPLES / n_threads; | ||||
| 
 | ||||
|     float** split_results = malloc(n_threads * sizeof(float*)); | ||||
|     for(int i=0; i<n_threads; i++){ | ||||
|         split_results[i] = malloc(n_samples_per_thread * sizeof(float)); | ||||
|     } | ||||
| 
 | ||||
|     uint32_t** seeds = malloc(n_threads * sizeof(uint32_t*)); | ||||
|     for (uint32_t i = 0; i < n_threads; i++) { | ||||
|         seeds[i] = malloc(sizeof(uint32_t)); | ||||
|         *seeds[i] = i + 1; // xorshift can't start with 0
 | ||||
|     } | ||||
|      | ||||
|     #pragma omp parallel private(i, sample_index, split_array_length) | ||||
|     int i; | ||||
|     #pragma omp parallel private(i) | ||||
|     { | ||||
|         #pragma omp for | ||||
|         for (i = 0; i < n_threads; i++) { | ||||
|             // split_array_length = split_array_get_length(i, N_SAMPLES, n_threads);
 | ||||
|             for (int j = 0; j < n_samples_per_thread; j++) { | ||||
|                 split_results[i][j] = sampler(seeds[i]); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     for(int i=0; i<n_threads; i++){ | ||||
|         int lower_bound = i * (n_samples / n_threads); | ||||
|         int upper_bound = ((i+1) * (n_samples / n_threads)) - 1; | ||||
|         // printf("Lower bound: %d, upper bound: %d\n", lower_bound, upper_bound);
 | ||||
|         for(int j=lower_bound; j<upper_bound; j++){ | ||||
|                 results[j] = sampler(seeds[i]); | ||||
|             } | ||||
|             results[j] = split_results[i][j-lower_bound]; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|  | @ -210,6 +223,11 @@ void paralellize(float (*sampler)(uint32_t* seed), float* results, int n_threads | |||
|         free(seeds[i]); | ||||
|     } | ||||
|     free(seeds); | ||||
| 
 | ||||
|     for(int i=0; i<n_threads; i++){ | ||||
|         free(split_results[i]);  | ||||
|     } | ||||
|     free(split_results); | ||||
| } | ||||
| 
 | ||||
| // Functions used for the BOTEC.
 | ||||
|  | @ -257,7 +275,8 @@ int main() | |||
|     float* split_array_results = malloc(N_SAMPLES * sizeof(float)); | ||||
| 
 | ||||
|     paralellize(sample_mixture, split_array_results, n_threads, N_SAMPLES); | ||||
|     printf("Sum(split_array_results, N_SAMPLES)/N_SAMPLES = %f\n", array_sum(split_array_results, N_SAMPLES) / N_SAMPLES); | ||||
|     printf("Sum(split_array_results, N_SAMPLES)/N_SAMPLES = %f\n",  | ||||
|            array_sum(split_array_results, N_SAMPLES) / N_SAMPLES); | ||||
| 
 | ||||
|     free(split_array_results); | ||||
|     return 0; | ||||
							
								
								
									
										17
									
								
								C/alt/05-refactor-still-split-array/time.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								C/alt/05-refactor-still-split-array/time.txt
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,17 @@ | |||
| Requires /bin/time, found on GNU/Linux systems | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples | ||||
| Time using 1 thread: 35.60ms | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples | ||||
| Time using 2 threads: 38.40ms | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples | ||||
| Time for 4 threads: 26.30ms | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples | ||||
| Time using 8 threads: 15.30ms | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples | ||||
| Time using 16 threads: 10.40ms | ||||
| 
 | ||||
							
								
								
									
										37
									
								
								C/alt/06-refactor-one-array/desiderata.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								C/alt/06-refactor-one-array/desiderata.md
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | |||
| Instead of | ||||
| 
 | ||||
| ```C | ||||
| #pragma omp parallel private(i, sample_index, split_array_length) | ||||
| { | ||||
|     #pragma omp for | ||||
|     for (i = 0; i < n_threads; i++) { | ||||
|         split_array_length = split_array_get_length(i, N, n_threads); | ||||
|         for (int j = 0; j < split_array_length; j++) { | ||||
|             results[i][j] = sampler(seeds[i]); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| ``` | ||||
| 
 | ||||
| Algo como | ||||
| 
 | ||||
| ```C | ||||
| #pragma omp parallel private(i, sample_index, bounds) | ||||
| { | ||||
|     #pragma omp for | ||||
|     for (i = 0; i < n_threads; i++) { | ||||
|         int bounds[2] = split_array_get_bounds(i, N, n_threads); | ||||
|         for (int j = bound[0]; j < bounds[1] + 1; j++) { | ||||
|             // o j < bounds[1], no se si el +1 va a ser más elegante | ||||
|             // dentro o fuera, aunque algo me dice que dentro | ||||
|             results[j] = sampler(seeds[i]); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| ``` | ||||
| 
 | ||||
| Por qué? Porque esto hace que la estructura subyacente sea un solo array,  | ||||
| lo cual implica que no *tengo* que utilizar split_array_functions especializadas  | ||||
| si no quiero | ||||
|  | @ -6,6 +6,7 @@ | |||
| 
 | ||||
| # Compiler
 | ||||
| CC=gcc | ||||
| # CC=clang
 | ||||
| # CC=tcc # <= faster compilation
 | ||||
| 
 | ||||
| # Main file
 | ||||
							
								
								
									
										
											BIN
										
									
								
								C/alt/06-refactor-one-array/out/samples
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								C/alt/06-refactor-one-array/out/samples
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										220
									
								
								C/alt/06-refactor-one-array/samples.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										220
									
								
								C/alt/06-refactor-one-array/samples.c
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,220 @@ | |||
| #include <math.h> | ||||
| #include <omp.h> | ||||
| #include <stdint.h> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
| 
 | ||||
| const float PI = 3.14159265358979323846; | ||||
| 
 | ||||
| #define N_SAMPLES (1024 * 1000 * 1000) | ||||
| 
 | ||||
| //Array helpers
 | ||||
| void array_print(float* array, int length) | ||||
| { | ||||
|     for (int i = 0; i < length; i++) { | ||||
|         printf("item[%d] = %f\n", i, array[i]); | ||||
|     } | ||||
|     printf("\n"); | ||||
| } | ||||
| 
 | ||||
| float array_sum(float* array, int length) | ||||
| { | ||||
|     float output = 0.0; | ||||
|     for (int i = 0; i < length; i++) { | ||||
|         output += array[i]; | ||||
|     } | ||||
|     return output; | ||||
| } | ||||
| 
 | ||||
| void array_cumsum(float* array_to_sum, float* array_cumsummed, int length) | ||||
| { | ||||
|     array_cumsummed[0] = array_to_sum[0]; | ||||
|     for (int i = 1; i < length; i++) { | ||||
|         array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i]; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| // Pseudo Random number generator
 | ||||
| 
 | ||||
| uint32_t xorshift32(uint32_t* seed) | ||||
| { | ||||
|     // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RN_SAMPLESGs"
 | ||||
|     // See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
 | ||||
|     // https://en.wikipedia.org/wiki/Xorshift
 | ||||
|     // Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
 | ||||
| 
 | ||||
|     uint32_t x = *seed; | ||||
|     x ^= x << 13; | ||||
|     x ^= x >> 17; | ||||
|     x ^= x << 5; | ||||
|     return *seed = x; | ||||
| } | ||||
| 
 | ||||
| // Distribution & sampling functions
 | ||||
| 
 | ||||
| float rand_0_to_1(uint32_t* seed) | ||||
| { | ||||
|     return ((float)xorshift32(seed)) / ((float)UINT32_MAX); | ||||
|     /* 
 | ||||
| 	uint32_t x = *seed; | ||||
| 	x ^= x << 13; | ||||
| 	x ^= x >> 17; | ||||
| 	x ^= x << 5; | ||||
| 	return ((float)(*seed = x))/((float) UIN_SAMPLEST32_MAX); | ||||
| 	*/ | ||||
|     // previously:
 | ||||
|     // ((float)rand_r(seed) / (float)RAN_SAMPLESD_MAX)
 | ||||
|     // and before that: rand, but it wasn't thread-safe.
 | ||||
|     // See: <https://stackoverflow.com/questions/43151361/how-to-create-thread-safe-random-number-generator-in-c-using-rand-r> for why to use rand_r:
 | ||||
|     // rand() is not thread-safe, as it relies on (shared) hidden seed.
 | ||||
| } | ||||
| 
 | ||||
| float rand_float(float max, uint32_t* seed) | ||||
| { | ||||
|     return rand_0_to_1(seed) * max; | ||||
| } | ||||
| 
 | ||||
| float ur_normal(uint32_t* seed) | ||||
| { | ||||
|     float u1 = rand_0_to_1(seed); | ||||
|     float u2 = rand_0_to_1(seed); | ||||
|     float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2); | ||||
|     return z; | ||||
| } | ||||
| 
 | ||||
| float random_uniform(float from, float to, uint32_t* seed) | ||||
| { | ||||
|     return rand_0_to_1(seed) * (to - from) + from; | ||||
| } | ||||
| 
 | ||||
| float random_normal(float mean, float sigma, uint32_t* seed) | ||||
| { | ||||
|     return (mean + sigma * ur_normal(seed)); | ||||
| } | ||||
| 
 | ||||
| float random_lognormal(float logmean, float logsigma, uint32_t* seed) | ||||
| { | ||||
|     return expf(random_normal(logmean, logsigma, seed)); | ||||
| } | ||||
| 
 | ||||
| float random_to(float low, float high, uint32_t* seed) | ||||
| { | ||||
|     const float N_SAMPLESORMAL95CON_SAMPLESFIDEN_SAMPLESCE = 1.6448536269514722; | ||||
|     float loglow = logf(low); | ||||
|     float loghigh = logf(high); | ||||
|     float logmean = (loglow + loghigh) / 2; | ||||
|     float logsigma = (loghigh - loglow) / (2.0 * N_SAMPLESORMAL95CON_SAMPLESFIDEN_SAMPLESCE); | ||||
|     return random_lognormal(logmean, logsigma, seed); | ||||
| } | ||||
| 
 | ||||
| // Mixture function
 | ||||
| 
 | ||||
| float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint32_t* seed) | ||||
| { | ||||
| 
 | ||||
|     // You can see a slightly simpler version of this function in the git history
 | ||||
|     // or in alt/C-02-better-algorithm-one-thread/
 | ||||
|     float sum_weights = array_sum(weights, n_dists); | ||||
|     float* cumsummed_normalized_weights = malloc(n_dists * sizeof(float)); | ||||
|     cumsummed_normalized_weights[0] = weights[0] / sum_weights; | ||||
|     for (int i = 1; i < n_dists; i++) { | ||||
|         cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i] / sum_weights; | ||||
|     } | ||||
| 
 | ||||
|     //create var holders
 | ||||
|     float p1, result; | ||||
|     int sample_index, i, own_length; | ||||
|     p1 = random_uniform(0, 1, seed); | ||||
|     for (int i = 0; i < n_dists; i++) { | ||||
|         if (p1 < cumsummed_normalized_weights[i]) { | ||||
|             result = samplers[i](seed); | ||||
|             break; | ||||
|         } | ||||
|     } | ||||
|     free(cumsummed_normalized_weights); | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| // Parallization function
 | ||||
| void paralellize(float (*sampler)(uint32_t* seed), float* results, int n_threads, int n_samples){ | ||||
|     if((N_SAMPLES % n_threads) != 0){ | ||||
|         fprintf(stderr, "Number of samples isn't divisible by number of threads, aborting\n"); | ||||
|         exit(1); | ||||
|     } | ||||
|     // int n_samples_per_thread = N_SAMPLES / n_thread;
 | ||||
|     int sample_index, i, split_array_length; | ||||
|     uint32_t** seeds = malloc(n_threads * sizeof(uint32_t*)); | ||||
|     for (uint32_t i = 0; i < n_threads; i++) { | ||||
|         seeds[i] = malloc(sizeof(uint32_t)); | ||||
|         *seeds[i] = i + 1; // xorshift can't start with 0
 | ||||
|     } | ||||
| 
 | ||||
|     #pragma omp parallel private(i, sample_index, split_array_length) | ||||
|     { | ||||
|         #pragma omp for | ||||
|         for (i = 0; i < n_threads; i++) { | ||||
|             int lower_bound = i * (n_samples / n_threads); | ||||
|             int upper_bound = ((i+1) * (n_samples / n_threads)) - 1; | ||||
|             // printf("Lower bound: %d, upper bound: %d\n", lower_bound, upper_bound);
 | ||||
|             for (int j = lower_bound; j < upper_bound; j++) { | ||||
|                 results[j] = sampler(seeds[i]); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     for (uint32_t i = 0; i < n_threads; i++) { | ||||
|         free(seeds[i]); | ||||
|     } | ||||
|     free(seeds); | ||||
| } | ||||
| 
 | ||||
| // Functions used for the BOTEC.
 | ||||
| // Their type has to be the same, as we will be passing them around.
 | ||||
| 
 | ||||
| float sample_0(uint32_t* seed) | ||||
| { | ||||
|     return 0; | ||||
| } | ||||
| 
 | ||||
| float sample_1(uint32_t* seed) | ||||
| { | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| float sample_few(uint32_t* seed) | ||||
| { | ||||
|     return random_to(1, 3, seed); | ||||
| } | ||||
| 
 | ||||
| float sample_many(uint32_t* seed) | ||||
| { | ||||
|     return random_to(2, 10, seed); | ||||
| } | ||||
| 
 | ||||
| float sample_mixture(uint32_t* seed){ | ||||
|     float p_a, p_b, p_c; | ||||
| 
 | ||||
|     // Initialize variables
 | ||||
|     p_a = 0.8; | ||||
|     p_b = 0.5; | ||||
|     p_c = p_a * p_b; | ||||
| 
 | ||||
|     // Generate mixture
 | ||||
|     int n_dists = 4; | ||||
|     float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 }; | ||||
|     float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many }; | ||||
| 
 | ||||
|     return mixture(samplers, weights, n_dists, seed); | ||||
| } | ||||
| 
 | ||||
| int main() | ||||
| { | ||||
|     int n_threads = omp_get_max_threads(); | ||||
|     float* split_array_results = malloc(N_SAMPLES * sizeof(float)); | ||||
| 
 | ||||
|     paralellize(sample_mixture, split_array_results, n_threads, N_SAMPLES); | ||||
|     printf("Sum(split_array_results, N_SAMPLES)/N_SAMPLES = %f\n", array_sum(split_array_results, N_SAMPLES) / N_SAMPLES); | ||||
| 
 | ||||
|     free(split_array_results); | ||||
|     return 0; | ||||
| } | ||||
							
								
								
									
										17
									
								
								C/alt/06-refactor-one-array/time.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								C/alt/06-refactor-one-array/time.txt
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,17 @@ | |||
| Requires /bin/time, found on GNU/Linux systems | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples | ||||
| Time using 1 thread: 32.30ms | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples | ||||
| Time using 2 threads: 30.80ms | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples | ||||
| Time for 4 threads: 30.50ms | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples | ||||
| Time using 8 threads: 12.90ms | ||||
| 
 | ||||
| Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples | ||||
| Time using 16 threads: 9.10ms | ||||
| 
 | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user