add failed example of loop unrolling

This commit is contained in:
NunoSempere 2024-01-12 20:41:06 +01:00
parent a50d776d2c
commit c25e9f916f
17 changed files with 36 additions and 0 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -4,6 +4,9 @@ MAKEFLAGS += --no-print-directory
STYLE_BLUEPRINT=webkit STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## Time to botec
TTB=./examples/more/12_time_to_botec_parallel/example
build-examples: build-examples:
cd examples/core && make all cd examples/core && make all
cd examples/more && make all cd examples/more && make all
@ -25,3 +28,6 @@ profile:
sudo perf report sudo perf report
rm perf.data rm perf.data
time-linux:
@echo "Running 100x and taking avg time: $(TTB)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo

View File

@ -64,6 +64,8 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
{ {
#pragma omp for #pragma omp for
for (i = 0; i < n_threads; i++) { for (i = 0; i < n_threads; i++) {
// Simple version
/*
int quotient = n_samples / n_threads; int quotient = n_samples / n_threads;
int lower_bound_inclusive = i * quotient; int lower_bound_inclusive = i * quotient;
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below, int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
@ -71,6 +73,34 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
results[j] = sampler(&(cache_box[i].seed)); results[j] = sampler(&(cache_box[i].seed));
// Could also result in inefficient cache stuff, but hopefully not too often // Could also result in inefficient cache stuff, but hopefully not too often
} }
*/
// Version with loop unrolling
int quotient = n_samples / n_threads;
int lower_bound_inclusive = i * quotient;
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
int delta = quotient;
int eighth_of_deltas = delta/8; // why 8? a double is 8 bytes, and 8 doubles are the size of a cache line
int k;
// to do: simplify these variables. Maybe divide by n_threads * 8 directly
for(int j=0; j<eighth_of_deltas; j++){
k = lower_bound_inclusive + j*8;
results[k+0] = sampler(&(cache_box[i].seed));
results[k+1] = sampler(&(cache_box[i].seed));
results[k+2] = sampler(&(cache_box[i].seed));
results[k+3] = sampler(&(cache_box[i].seed));
results[k+4] = sampler(&(cache_box[i].seed));
results[k+5] = sampler(&(cache_box[i].seed));
results[k+6] = sampler(&(cache_box[i].seed));
results[k+7] = sampler(&(cache_box[i].seed));
// these all fit in one single cache line
// name of the technique: loop unrolling.
}
for(int k=eighth_of_deltas*8; k<upper_bound_not_inclusive; k++){
results[k] = sampler(&(cache_box[i].seed));
}
} }
} }
for (int j = divisor_multiple; j < n_samples; j++) { for (int j = divisor_multiple; j < n_samples; j++) {