forked from personal/squiggle.c
add failed example of loop unrolling
This commit is contained in:
parent
a50d776d2c
commit
c25e9f916f
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
6
makefile
6
makefile
|
@ -4,6 +4,9 @@ MAKEFLAGS += --no-print-directory
|
|||
STYLE_BLUEPRINT=webkit
|
||||
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
|
||||
|
||||
## Time to botec
|
||||
TTB=./examples/more/12_time_to_botec_parallel/example
|
||||
|
||||
build-examples:
|
||||
cd examples/core && make all
|
||||
cd examples/more && make all
|
||||
|
@ -25,3 +28,6 @@ profile:
|
|||
sudo perf report
|
||||
rm perf.data
|
||||
|
||||
time-linux:
|
||||
@echo "Running 100x and taking avg time: $(TTB)"
|
||||
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
|
||||
|
|
|
@ -64,6 +64,8 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
|||
{
|
||||
#pragma omp for
|
||||
for (i = 0; i < n_threads; i++) {
|
||||
// Simple version
|
||||
/*
|
||||
int quotient = n_samples / n_threads;
|
||||
int lower_bound_inclusive = i * quotient;
|
||||
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
||||
|
@ -71,6 +73,34 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
|||
results[j] = sampler(&(cache_box[i].seed));
|
||||
// Could also result in inefficient cache stuff, but hopefully not too often
|
||||
}
|
||||
*/
|
||||
|
||||
// Version with loop unrolling
|
||||
int quotient = n_samples / n_threads;
|
||||
int lower_bound_inclusive = i * quotient;
|
||||
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
||||
int delta = quotient;
|
||||
int eighth_of_deltas = delta/8; // why 8? a double is 8 bytes, so 8 doubles are the size of a cache line
|
||||
int k;
|
||||
// to do: simplify these variables. Maybe divide by n_threads * 8 directly
|
||||
for(int j=0; j<eighth_of_deltas; j++){
|
||||
k = lower_bound_inclusive + j*8;
|
||||
results[k+0] = sampler(&(cache_box[i].seed));
|
||||
results[k+1] = sampler(&(cache_box[i].seed));
|
||||
results[k+2] = sampler(&(cache_box[i].seed));
|
||||
results[k+3] = sampler(&(cache_box[i].seed));
|
||||
results[k+4] = sampler(&(cache_box[i].seed));
|
||||
results[k+5] = sampler(&(cache_box[i].seed));
|
||||
results[k+6] = sampler(&(cache_box[i].seed));
|
||||
results[k+7] = sampler(&(cache_box[i].seed));
|
||||
// these all fit one single cache line
|
||||
// name of the technique: loop unrolling.
|
||||
}
|
||||
for(int k=eighth_of_deltas*8; k<upper_bound_not_inclusive; k++){
|
||||
results[k] = sampler(&(cache_box[i].seed));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
for (int j = divisor_multiple; j < n_samples; j++) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user