forked from personal/squiggle.c
recover better parallelism function from git history, recompile
This commit is contained in:
parent
b22eb34835
commit
ddf76e79b0
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -21,7 +21,7 @@ int main()
|
||||||
|
|
||||||
sampler_parallel(sampler, results, n_threads, n_samples);
|
sampler_parallel(sampler, results, n_threads, n_samples);
|
||||||
double avg = array_sum(results, n_samples) / n_samples;
|
double avg = array_sum(results, n_samples) / n_samples;
|
||||||
printf("Average of 1B lognormal(0,10): %f", avg);
|
printf("Average of 1B lognormal(0,10): %f\n", avg);
|
||||||
|
|
||||||
free(results);
|
free(results);
|
||||||
|
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -10,14 +10,40 @@
|
||||||
/* Parallel sampler */
|
/* Parallel sampler */
|
||||||
void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_threads, int n_samples)
|
void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_threads, int n_samples)
|
||||||
{
|
{
|
||||||
if ((n_samples % n_threads) != 0) {
|
|
||||||
fprintf(stderr, "Number of samples isn't divisible by number of threads, aborting\n");
|
// Division terminology:
|
||||||
exit(1);
|
// a = b * quotient + reminder
|
||||||
}
|
// a = (a/b)*b + (a%b)
|
||||||
|
// dividend: a
|
||||||
|
// divisor: b
|
||||||
|
// quotient = a / b
|
||||||
|
// reminder = a % b
|
||||||
|
// "divisor's multiple" := (a/b)*b
|
||||||
|
|
||||||
|
// now, we have n_samples and n_threads
|
||||||
|
// to make our life easy, each thread will have a number of samples of: a/b (quotient)
|
||||||
|
// and we'll compute the remainder of samples separately
|
||||||
|
// to possibly do by Jorge: improve so that the remainder is included in the threads
|
||||||
|
|
||||||
|
int quotient = n_samples / n_threads;
|
||||||
|
int remainder = n_samples % n_threads;
|
||||||
|
int divisor_multiple = quotient * n_threads;
|
||||||
|
|
||||||
uint64_t** seeds = malloc(n_threads * sizeof(uint64_t*));
|
uint64_t** seeds = malloc(n_threads * sizeof(uint64_t*));
|
||||||
|
// printf("UINT64_MAX: %lu\n", UINT64_MAX);
|
||||||
|
srand(1);
|
||||||
for (uint64_t i = 0; i < n_threads; i++) {
|
for (uint64_t i = 0; i < n_threads; i++) {
|
||||||
seeds[i] = malloc(sizeof(uint64_t));
|
seeds[i] = malloc(sizeof(uint64_t));
|
||||||
*seeds[i] = i + 1; // xorshift can't start with 0
|
// Constraints:
|
||||||
|
// - xorshift can't start with 0
|
||||||
|
// - the seeds should be reasonably separated and not correlated
|
||||||
|
*seeds[i] = (uint64_t)rand() * (UINT64_MAX / RAND_MAX);
|
||||||
|
// printf("#%ld: %lu\n",i, *seeds[i]);
|
||||||
|
|
||||||
|
// Other initializations tried:
|
||||||
|
// *seeds[i] = 1 + i;
|
||||||
|
// *seeds[i] = (i + 0.5)*(UINT64_MAX/n_threads);
|
||||||
|
// *seeds[i] = (i + 0.5)*(UINT64_MAX/n_threads) + constant * i;
|
||||||
}
|
}
|
||||||
|
|
||||||
int i;
|
int i;
|
||||||
|
@ -25,14 +51,18 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
|
||||||
{
|
{
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
for (i = 0; i < n_threads; i++) {
|
for (i = 0; i < n_threads; i++) {
|
||||||
int lower_bound = i * (n_samples / n_threads);
|
int lower_bound_inclusive = i * quotient;
|
||||||
int upper_bound = ((i + 1) * (n_samples / n_threads)) - 1;
|
int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
|
||||||
// printf("Lower bound: %d, upper bound: %d\n", lower_bound, upper_bound);
|
// printf("Lower bound: %d, upper bound: %d\n", lower_bound, upper_bound);
|
||||||
for (int j = lower_bound; j < upper_bound; j++) {
|
for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
|
||||||
results[j] = sampler(seeds[i]);
|
results[j] = sampler(seeds[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for (int j = divisor_multiple; j < n_samples; j++) {
|
||||||
|
results[j] = sampler(seeds[0]);
|
||||||
|
// we can just reuse a seed, this isn't problematic because we are not doing multithreading
|
||||||
|
}
|
||||||
|
|
||||||
for (uint64_t i = 0; i < n_threads; i++) {
|
for (uint64_t i = 0; i < n_threads; i++) {
|
||||||
free(seeds[i]);
|
free(seeds[i]);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user