add more comments, undo loop unrolling.

This commit is contained in:
NunoSempere 2024-01-12 23:53:58 +01:00
parent c25e9f916f
commit dd6bb53f1b
17 changed files with 6 additions and 32 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -24,7 +24,7 @@ lint:
clang-tidy squiggle_more.c -- -lm clang-tidy squiggle_more.c -- -lm
profile: profile:
OMP_NUM_THREADS=16 sudo perf record ./examples/more/12_time_to_botec_parallel/example sudo perf record ./examples/more/12_time_to_botec_parallel/example
sudo perf report sudo perf report
rm perf.data rm perf.data

View File

@ -64,41 +64,15 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
{ {
#pragma omp for #pragma omp for
for (i = 0; i < n_threads; i++) { for (i = 0; i < n_threads; i++) {
// Simple version
/*
int quotient = n_samples / n_threads; int quotient = n_samples / n_threads;
int lower_bound_inclusive = i * quotient; int lower_bound_inclusive = i * quotient;
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below, int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) { for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
results[j] = sampler(&(cache_box[i].seed)); results[j] = sampler(&(cache_box[i].seed));
// Could also result in inefficient cache stuff, but hopefully not too often // In principle, these results[j] could also result in two threads competing for the same cache line.
} // In practice, though,
*/ // a) this would happen infrequently
// b)
// Version with loop unrolling
int quotient = n_samples / n_threads;
int lower_bound_inclusive = i * quotient;
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
int delta = quotient;
int eighth_of_deltas = delta/8; // why 8? a double is 8 bytes, so 8 doubles are the size of a cache line
int k;
// to do: simplify these variables. Maybe divide by n_threads * 8 directly
for(int j=0; j<eighth_of_deltas; j++){
k = lower_bound_inclusive + j*8;
results[k+0] = sampler(&(cache_box[i].seed));
results[k+1] = sampler(&(cache_box[i].seed));
results[k+2] = sampler(&(cache_box[i].seed));
results[k+3] = sampler(&(cache_box[i].seed));
results[k+4] = sampler(&(cache_box[i].seed));
results[k+5] = sampler(&(cache_box[i].seed));
results[k+6] = sampler(&(cache_box[i].seed));
results[k+7] = sampler(&(cache_box[i].seed));
// these all fit one single cache line
// name of the technique: loop unrolling.
}
for(int k=eighth_of_deltas*8; k<upper_bound_not_inclusive; k++){
results[k] = sampler(&(cache_box[i].seed));
} }
} }