add more comments, undo loop unrolling.
This commit is contained in:
parent
c25e9f916f
commit
dd6bb53f1b
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2
makefile
2
makefile
|
@ -24,7 +24,7 @@ lint:
|
|||
clang-tidy squiggle_more.c -- -lm
|
||||
|
||||
profile:
|
||||
OMP_NUM_THREADS=16 sudo perf record ./examples/more/12_time_to_botec_parallel/example
|
||||
sudo perf record ./examples/more/12_time_to_botec_parallel/example
|
||||
sudo perf report
|
||||
rm perf.data
|
||||
|
||||
|
|
|
@ -64,41 +64,15 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
|||
{
|
||||
#pragma omp for
|
||||
for (i = 0; i < n_threads; i++) {
|
||||
// Simple version
|
||||
/*
|
||||
int quotient = n_samples / n_threads;
|
||||
int lower_bound_inclusive = i * quotient;
|
||||
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
||||
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
|
||||
results[j] = sampler(&(cache_box[i].seed));
|
||||
// Could also result in inefficient cache stuff, but hopefully not too often
|
||||
}
|
||||
*/
|
||||
|
||||
// Version with loop unrolling
|
||||
int quotient = n_samples / n_threads;
|
||||
int lower_bound_inclusive = i * quotient;
|
||||
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
||||
int delta = quotient;
|
||||
int eighth_of_deltas = delta/8; // why 8? a double is 8 bytes, so 8 doubles are the size of a cache line
|
||||
int k;
|
||||
// to do: simplify these variables. Maybe divide by n_threads * 8 directly
|
||||
for(int j=0; j<eighth_of_deltas; j++){
|
||||
k = lower_bound_inclusive + j*8;
|
||||
results[k+0] = sampler(&(cache_box[i].seed));
|
||||
results[k+1] = sampler(&(cache_box[i].seed));
|
||||
results[k+2] = sampler(&(cache_box[i].seed));
|
||||
results[k+3] = sampler(&(cache_box[i].seed));
|
||||
results[k+4] = sampler(&(cache_box[i].seed));
|
||||
results[k+5] = sampler(&(cache_box[i].seed));
|
||||
results[k+6] = sampler(&(cache_box[i].seed));
|
||||
results[k+7] = sampler(&(cache_box[i].seed));
|
||||
// these all fit one single cache line
|
||||
// name of the technique: loop unrolling.
|
||||
}
|
||||
for(int k=eighth_of_deltas*8; k<upper_bound_not_inclusive; k++){
|
||||
results[k] = sampler(&(cache_box[i].seed));
|
||||
|
||||
// In principle, these results[j] could also result in two threads competing for the same cache line.
|
||||
// In practice, though,
|
||||
// a) this would happen infrequently
|
||||
// b)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user