Compare commits

...

7 Commits

17 changed files with 20 additions and 8 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -4,6 +4,9 @@ MAKEFLAGS += --no-print-directory
STYLE_BLUEPRINT=webkit STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## Time to botec
TTB=./examples/more/12_time_to_botec_parallel/example
build-examples: build-examples:
cd examples/core && make all cd examples/core && make all
cd examples/more && make all cd examples/more && make all
@ -21,7 +24,12 @@ lint:
clang-tidy squiggle_more.c -- -lm clang-tidy squiggle_more.c -- -lm
profile: profile:
OMP_NUM_THREADS=16 sudo perf record ./examples/more/12_time_to_botec_parallel/example sudo perf record -g ./examples/more/12_time_to_botec_parallel/example
sudo perf report sudo perf report
rm perf.data rm perf.data
sudo perf stat ./examples/more/12_time_to_botec_parallel/example
time-linux:
gcc -O3 -Wall -Wextra -Wdouble-promotion -Wconversion examples/more/12_time_to_botec_parallel/example.c squiggle.c squiggle_more.c -lm -fopenmp -o examples/more/12_time_to_botec_parallel/example
@echo "Running 100x and taking avg time: $(TTB)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_PROC_BIND=TRUE $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo

View File

@ -43,8 +43,8 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
int divisor_multiple = quotient * n_threads; int divisor_multiple = quotient * n_threads;
// uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*)); // uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*));
seed_cache_box* cache_box = (seed_cache_box*) malloc(sizeof(seed_cache_box) * (size_t)n_threads); seed_cache_box* cache_box = (seed_cache_box*)malloc(sizeof(seed_cache_box) * (size_t)n_threads);
// seed_cache_box cache_box[n_threads]; // seed_cache_box cache_box[n_threads]; // we could use the C stack. On normal linux machines, it's 8MB ($ ulimit -s). However, it doesn't quite feel right.
srand(1); srand(1);
for (int i = 0; i < n_threads; i++) { for (int i = 0; i < n_threads; i++) {
// Constraints: // Constraints:
@ -60,16 +60,20 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
} }
int i; int i;
#pragma omp parallel private(i, quotient) #pragma omp parallel private(i)
{ {
#pragma omp for #pragma omp for
for (i = 0; i < n_threads; i++) { for (i = 0; i < n_threads; i++) {
int quotient = n_samples / n_threads;
int lower_bound_inclusive = i * quotient; int lower_bound_inclusive = i * quotient;
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below, int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) { for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
results[j] = sampler(&(cache_box[i].seed)); results[j] = sampler(&(cache_box[i].seed));
// Could also result in inefficient cache stuff, but hopefully not too often // In principle, these results[j] could also result in two threads competing for the same cache line.
// In practice, though,
// a) this would happen infrequently
// b) trying to unroll loops actually makes the code slower
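// c) 8 results[j] are 8 doubles, which fit a cache line. If n_samples/n_threads is a multiple of 8 (and results is cache-line aligned), each thread's chunk starts on a cache-line boundary, so no two threads write to the same line.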
} }
} }
} }
@ -88,7 +92,7 @@ typedef struct ci_t {
double high; double high;
} ci; } ci;
static void swp(int i, int j, double xs[]) inline static void swp(int i, int j, double xs[])
{ {
double tmp = xs[i]; double tmp = xs[i];
xs[i] = xs[j]; xs[i] = xs[j];
@ -120,7 +124,7 @@ static double quickselect(int k, double xs[], int n)
{ {
// https://en.wikipedia.org/wiki/Quickselect // https://en.wikipedia.org/wiki/Quickselect
double *ys = malloc((size_t)n * sizeof(double)); double* ys = malloc((size_t)n * sizeof(double));
memcpy(ys, xs, (size_t)n * sizeof(double)); memcpy(ys, xs, (size_t)n * sizeof(double));
// ^: don't rearrange item order in the original array // ^: don't rearrange item order in the original array