From 2b5b496c25177b070c247e977d338bb0eea20704 Mon Sep 17 00:00:00 2001 From: NunoSempere Date: Fri, 12 Jan 2024 00:23:01 +0100 Subject: [PATCH] seed cache box name, make "quotient" private var --- squiggle_more.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/squiggle_more.c b/squiggle_more.c index 0146707..766a66f 100644 --- a/squiggle_more.c +++ b/squiggle_more.c @@ -10,10 +10,11 @@ /* Parallel sampler */ #define CACHE_LINE_SIZE 64 -typedef struct padded_seed_t { +typedef struct seed_cache_box_t { uint64_t* seed; char padding[CACHE_LINE_SIZE - sizeof(uint64_t*)]; -} padded_seed; +} seed_cache_box; +// This avoids false sharing. Dealing with this shaves ~2ms. void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_threads, int n_samples) { @@ -36,14 +37,14 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_ int divisor_multiple = quotient * n_threads; // uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*)); - padded_seed* seeds = (padded_seed*) malloc(sizeof(padded_seed) * (size_t)n_threads); + seed_cache_box* cache_box = (seed_cache_box*) malloc(sizeof(seed_cache_box) * (size_t)n_threads); srand(1); for (int i = 0; i < n_threads; i++) { - seeds[i].seed = malloc(sizeof(uint64_t*)); + cache_box[i].seed = malloc(sizeof(uint64_t*)); // Constraints: // - xorshift can't start with 0 // - the seeds should be reasonably separated and not correlated - *(seeds[i].seed) = (uint64_t)rand() * (UINT64_MAX / RAND_MAX); + *(cache_box[i].seed) = (uint64_t)rand() * (UINT64_MAX / RAND_MAX); // printf("#%ld: %lu\n",i, *seeds[i]); // Other initializations tried: @@ -53,26 +54,26 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_ } int i; -#pragma omp parallel private(i) +#pragma omp parallel private(i, quotient) { #pragma omp for for (i = 0; i < n_threads; i++) { int lower_bound_inclusive = i * quotient; int upper_bound_not_inclusive = ((i + 
1) * quotient); // note the < in the for loop below, for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) { - results[j] = sampler(seeds[i].seed); + results[j] = sampler(cache_box[i].seed); } } } for (int j = divisor_multiple; j < n_samples; j++) { - results[j] = sampler(seeds[0].seed); + results[j] = sampler(cache_box[0].seed); // we can just reuse a seed, this isn't problematic because we are not doing multithreading } for (int i = 0; i < n_threads; i++) { - free(seeds[i].seed); + free(cache_box[i].seed); } - free(seeds); + free(cache_box); } /* Get confidence intervals, given a sampler */