add more comments, undo loop unrolling.
This commit is contained in:
parent
c25e9f916f
commit
dd6bb53f1b
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2
makefile
2
makefile
|
@ -24,7 +24,7 @@ lint:
|
||||||
clang-tidy squiggle_more.c -- -lm
|
clang-tidy squiggle_more.c -- -lm
|
||||||
|
|
||||||
profile:
|
profile:
|
||||||
OMP_NUM_THREADS=16 sudo perf record ./examples/more/12_time_to_botec_parallel/example
|
sudo perf record ./examples/more/12_time_to_botec_parallel/example
|
||||||
sudo perf report
|
sudo perf report
|
||||||
rm perf.data
|
rm perf.data
|
||||||
|
|
||||||
|
|
|
@ -64,41 +64,15 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
||||||
{
|
{
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
for (i = 0; i < n_threads; i++) {
|
for (i = 0; i < n_threads; i++) {
|
||||||
// Simple version
|
|
||||||
/*
|
|
||||||
int quotient = n_samples / n_threads;
|
int quotient = n_samples / n_threads;
|
||||||
int lower_bound_inclusive = i * quotient;
|
int lower_bound_inclusive = i * quotient;
|
||||||
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
||||||
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
|
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
|
||||||
results[j] = sampler(&(cache_box[i].seed));
|
results[j] = sampler(&(cache_box[i].seed));
|
||||||
// Could also result in inefficient cache stuff, but hopefully not too often
|
// In principle, these results[j] could also result in two threads competing for the same cache line.
|
||||||
}
|
// In practice, though,
|
||||||
*/
|
// a) this would happen infrequently
|
||||||
|
// b)
|
||||||
// Version with loop unrolling
|
|
||||||
int quotient = n_samples / n_threads;
|
|
||||||
int lower_bound_inclusive = i * quotient;
|
|
||||||
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
|
||||||
int delta = quotient;
|
|
||||||
int eighth_of_deltas = delta/8; // why 8? a double is 8 bytes, 8 doubles is the size of a cache line
|
|
||||||
int k;
|
|
||||||
// to do: simplify these variables. Maybe divide by n_threads * 8 directly
|
|
||||||
for(int j=0; j<eighth_of_deltas; j++){
|
|
||||||
k = lower_bound_inclusive + j*8;
|
|
||||||
results[k+0] = sampler(&(cache_box[i].seed));
|
|
||||||
results[k+1] = sampler(&(cache_box[i].seed));
|
|
||||||
results[k+2] = sampler(&(cache_box[i].seed));
|
|
||||||
results[k+3] = sampler(&(cache_box[i].seed));
|
|
||||||
results[k+4] = sampler(&(cache_box[i].seed));
|
|
||||||
results[k+5] = sampler(&(cache_box[i].seed));
|
|
||||||
results[k+6] = sampler(&(cache_box[i].seed));
|
|
||||||
results[k+7] = sampler(&(cache_box[i].seed));
|
|
||||||
// these all fit one single cache line
|
|
||||||
// name of the technique: loop unrolling.
|
|
||||||
}
|
|
||||||
for(int k=eighth_of_deltas*8; k<upper_bound_not_inclusive; k++){
|
|
||||||
results[k] = sampler(&(cache_box[i].seed));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user