Compare commits
7 Commits
b6bbbc6b2e
...
48f333adfe
Author | SHA1 | Date | |
---|---|---|---|
48f333adfe | |||
b497b5b399 | |||
3bb2804ccf | |||
eb1c592610 | |||
dd6bb53f1b | |||
c25e9f916f | |||
a50d776d2c |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
10
makefile
10
makefile
|
@ -4,6 +4,9 @@ MAKEFLAGS += --no-print-directory
|
||||||
STYLE_BLUEPRINT=webkit
|
STYLE_BLUEPRINT=webkit
|
||||||
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
|
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
|
||||||
|
|
||||||
|
## Time to botec
|
||||||
|
TTB=./examples/more/12_time_to_botec_parallel/example
|
||||||
|
|
||||||
build-examples:
|
build-examples:
|
||||||
cd examples/core && make all
|
cd examples/core && make all
|
||||||
cd examples/more && make all
|
cd examples/more && make all
|
||||||
|
@ -21,7 +24,12 @@ lint:
|
||||||
clang-tidy squiggle_more.c -- -lm
|
clang-tidy squiggle_more.c -- -lm
|
||||||
|
|
||||||
profile:
|
profile:
|
||||||
OMP_NUM_THREADS=16 sudo perf record ./examples/more/12_time_to_botec_parallel/example
|
sudo perf record -g ./examples/more/12_time_to_botec_parallel/example
|
||||||
sudo perf report
|
sudo perf report
|
||||||
rm perf.data
|
rm perf.data
|
||||||
|
sudo perf stat ./examples/more/12_time_to_botec_parallel/example
|
||||||
|
|
||||||
|
time-linux:
|
||||||
|
gcc -O3 -Wall -Wextra -Wdouble-promotion -Wconversion examples/more/12_time_to_botec_parallel/example.c squiggle.c squiggle_more.c -lm -fopenmp -o examples/more/12_time_to_botec_parallel/example
|
||||||
|
@echo "Running 100x and taking avg time: $(TTB)"
|
||||||
|
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_PROC_BIND=TRUE $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
|
||||||
|
|
|
@ -43,8 +43,8 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
||||||
int divisor_multiple = quotient * n_threads;
|
int divisor_multiple = quotient * n_threads;
|
||||||
|
|
||||||
// uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*));
|
// uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*));
|
||||||
seed_cache_box* cache_box = (seed_cache_box*) malloc(sizeof(seed_cache_box) * (size_t)n_threads);
|
seed_cache_box* cache_box = (seed_cache_box*)malloc(sizeof(seed_cache_box) * (size_t)n_threads);
|
||||||
// seed_cache_box cache_box[n_threads];
|
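// seed_cache_box cache_box[n_threads]; // Alternative: allocate the boxes on the C stack instead of the heap. On typical Linux machines the default stack limit is 8MB ($ ulimit -s), which would be enough, but depending on that limit doesn't quite feel right.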
|
||||||
srand(1);
|
srand(1);
|
||||||
for (int i = 0; i < n_threads; i++) {
|
for (int i = 0; i < n_threads; i++) {
|
||||||
// Constraints:
|
// Constraints:
|
||||||
|
@ -60,16 +60,20 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
||||||
}
|
}
|
||||||
|
|
||||||
int i;
|
int i;
|
||||||
#pragma omp parallel private(i, quotient)
|
#pragma omp parallel private(i)
|
||||||
{
|
{
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
for (i = 0; i < n_threads; i++) {
|
for (i = 0; i < n_threads; i++) {
|
||||||
int quotient = n_samples / n_threads;
|
|
||||||
int lower_bound_inclusive = i * quotient;
|
int lower_bound_inclusive = i * quotient;
|
||||||
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
||||||
|
|
||||||
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
|
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
|
||||||
results[j] = sampler(&(cache_box[i].seed));
|
results[j] = sampler(&(cache_box[i].seed));
|
||||||
// Could also result in inefficient cache stuff, but hopefully not too often
|
// In principle, these results[j] could also result in two threads competing for the same cache line.
|
||||||
|
// In practice, though,
|
||||||
|
// a) this would happen infrequently
|
||||||
|
// b) trying to unroll loops actually makes the code slower
|
||||||
|
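// c) 8 results[j] are 8 doubles, which fit a cache line. If n_samples/n_threads is a multiple of 8 (and results is cache-line aligned), neighbouring threads never write to the same cache line. (Presumed ending; the original sentence was left unfinished.)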
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -88,7 +92,7 @@ typedef struct ci_t {
|
||||||
double high;
|
double high;
|
||||||
} ci;
|
} ci;
|
||||||
|
|
||||||
static void swp(int i, int j, double xs[])
|
inline static void swp(int i, int j, double xs[])
|
||||||
{
|
{
|
||||||
double tmp = xs[i];
|
double tmp = xs[i];
|
||||||
xs[i] = xs[j];
|
xs[i] = xs[j];
|
||||||
|
@ -120,7 +124,7 @@ static double quickselect(int k, double xs[], int n)
|
||||||
{
|
{
|
||||||
// https://en.wikipedia.org/wiki/Quickselect
|
// https://en.wikipedia.org/wiki/Quickselect
|
||||||
|
|
||||||
double *ys = malloc((size_t)n * sizeof(double));
|
double* ys = malloc((size_t)n * sizeof(double));
|
||||||
memcpy(ys, xs, (size_t)n * sizeof(double));
|
memcpy(ys, xs, (size_t)n * sizeof(double));
|
||||||
// ^: don't rearrange item order in the original array
|
// ^: don't rearrange item order in the original array
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user