Compare commits
No commits in common. "48f333adfeea46098157593773c47c0604989590" and "b6bbbc6b2e8676a79ae928a7fd47e334ed6af5dc" have entirely different histories.
48f333adfe
...
b6bbbc6b2e
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
10
makefile
10
makefile
|
@ -4,9 +4,6 @@ MAKEFLAGS += --no-print-directory
|
|||
STYLE_BLUEPRINT=webkit
|
||||
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
|
||||
|
||||
## Time to botec
|
||||
TTB=./examples/more/12_time_to_botec_parallel/example
|
||||
|
||||
build-examples:
|
||||
cd examples/core && make all
|
||||
cd examples/more && make all
|
||||
|
@ -24,12 +21,7 @@ lint:
|
|||
clang-tidy squiggle_more.c -- -lm
|
||||
|
||||
profile:
|
||||
sudo perf record -g ./examples/more/12_time_to_botec_parallel/example
|
||||
OMP_NUM_THREADS=16 sudo perf record ./examples/more/12_time_to_botec_parallel/example
|
||||
sudo perf report
|
||||
rm perf.data
|
||||
sudo perf stat ./examples/more/12_time_to_botec_parallel/example
|
||||
|
||||
time-linux:
|
||||
gcc -O3 -Wall -Wextra -Wdouble-promotion -Wconversion examples/more/12_time_to_botec_parallel/example.c squiggle.c squiggle_more.c -lm -fopenmp -o examples/more/12_time_to_botec_parallel/example
|
||||
@echo "Running 100x and taking avg time: $(TTB)"
|
||||
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_PROC_BIND=TRUE $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
|
||||
|
|
|
@ -43,8 +43,8 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
|||
int divisor_multiple = quotient * n_threads;
|
||||
|
||||
// uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*));
|
||||
seed_cache_box* cache_box = (seed_cache_box*)malloc(sizeof(seed_cache_box) * (size_t)n_threads);
|
||||
// seed_cache_box cache_box[n_threads]; // we could use the C stack. On normal linux machines, it's 8MB ($ ulimit -s). However, it doesn't quite feel right.
|
||||
seed_cache_box* cache_box = (seed_cache_box*) malloc(sizeof(seed_cache_box) * (size_t)n_threads);
|
||||
// seed_cache_box cache_box[n_threads];
|
||||
srand(1);
|
||||
for (int i = 0; i < n_threads; i++) {
|
||||
// Constraints:
|
||||
|
@ -60,20 +60,16 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
|||
}
|
||||
|
||||
int i;
|
||||
#pragma omp parallel private(i)
|
||||
#pragma omp parallel private(i, quotient)
|
||||
{
|
||||
#pragma omp for
|
||||
for (i = 0; i < n_threads; i++) {
|
||||
int quotient = n_samples / n_threads;
|
||||
int lower_bound_inclusive = i * quotient;
|
||||
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
||||
|
||||
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
|
||||
results[j] = sampler(&(cache_box[i].seed));
|
||||
// In principle, these results[j] could also result in two threads competing for the same cache line.
|
||||
// In practice, though,
|
||||
// a) this would happen infrequently
|
||||
// b) trying to unroll loops actually makes the code slower
|
||||
// c) 8 results[j] are 8 doubles, which fit a cache line. If n_samples/n_threads
|
||||
// Could also result in inefficient cache stuff, but hopefully not too often
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -92,7 +88,7 @@ typedef struct ci_t {
|
|||
double high;
|
||||
} ci;
|
||||
|
||||
inline static void swp(int i, int j, double xs[])
|
||||
static void swp(int i, int j, double xs[])
|
||||
{
|
||||
double tmp = xs[i];
|
||||
xs[i] = xs[j];
|
||||
|
@ -124,7 +120,7 @@ static double quickselect(int k, double xs[], int n)
|
|||
{
|
||||
// https://en.wikipedia.org/wiki/Quickselect
|
||||
|
||||
double* ys = malloc((size_t)n * sizeof(double));
|
||||
double *ys = malloc((size_t)n * sizeof(double));
|
||||
memcpy(ys, xs, (size_t)n * sizeof(double));
|
||||
// ^: don't rearrange item order in the original array
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user