formatting pass; makefile tweaks

remove loop unrolling again
save squiggle_more tweaks
2024-01-13 01:05:44 +01:00 · 2024-01-13 00:50:51 +01:00 · 2024-01-13 00:50:27 +01:00 · 2024-01-12 23:55:09 +01:00 · 2024-01-12 23:53:58 +01:00 · 2024-01-12 20:41:06 +01:00
17 changed files with 20 additions and 8 deletions
--- a/examples/more/00_example_template/example
+++ b/examples/more/00_example_template/example
--- a/examples/more/01_sample_from_cdf/example
+++ b/examples/more/01_sample_from_cdf/example
--- a/examples/more/02_sample_from_cdf_beta/example
+++ b/examples/more/02_sample_from_cdf_beta/example
--- a/examples/more/03_ci_beta/example
+++ b/examples/more/03_ci_beta/example
--- a/examples/more/04_nuclear_war/example
+++ b/examples/more/04_nuclear_war/example
--- a/examples/more/05_burn_10kg_fat/example
+++ b/examples/more/05_burn_10kg_fat/example
--- a/examples/more/06_nuclear_recovery/example
+++ b/examples/more/06_nuclear_recovery/example
--- a/examples/more/07_algebra/example
+++ b/examples/more/07_algebra/example
--- a/examples/more/08_algebra_and_conversion/example
+++ b/examples/more/08_algebra_and_conversion/example
--- a/examples/more/09_ergonomic_algebra/example
+++ b/examples/more/09_ergonomic_algebra/example
--- a/examples/more/10_twitter_thread_example/example
+++ b/examples/more/10_twitter_thread_example/example
--- a/examples/more/11_billion_lognormals_paralell/example
+++ b/examples/more/11_billion_lognormals_paralell/example
--- a/examples/more/12_time_to_botec_parallel/example
+++ b/examples/more/12_time_to_botec_parallel/example
--- a/examples/more/13_parallelize_min/example
+++ b/examples/more/13_parallelize_min/example
--- a/examples/more/14_check_confidence_interval/example
+++ b/examples/more/14_check_confidence_interval/example
--- a/10
+++ b/10
@ -4,6 +4,9 @@ MAKEFLAGS += --no-print-directory
 STYLE_BLUEPRINT=webkit
 FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)

+## Time to botec
+TTB=./examples/more/12_time_to_botec_parallel/example
+
 build-examples:
 	cd examples/core && make all
 	cd examples/more && make all
@ -21,7 +24,12 @@ lint:
 	clang-tidy squiggle_more.c -- -lm

 profile:
-	OMP_NUM_THREADS=16 sudo perf record ./examples/more/12_time_to_botec_parallel/example
+	sudo perf record -g ./examples/more/12_time_to_botec_parallel/example 
 	sudo perf report
 	rm perf.data
+	sudo perf stat ./examples/more/12_time_to_botec_parallel/example

+time-linux: 
+	gcc -O3 -Wall -Wextra -Wdouble-promotion -Wconversion examples/more/12_time_to_botec_parallel/example.c squiggle.c squiggle_more.c -lm -fopenmp -o examples/more/12_time_to_botec_parallel/example
+	@echo "Running 100x and taking avg time: $(TTB)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_PROC_BIND=TRUE $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
--- a/squiggle_more.c
+++ b/squiggle_more.c
@ -43,8 +43,8 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
    int divisor_multiple = quotient * n_threads;

    // uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*));
-    seed_cache_box* cache_box = (seed_cache_box*) malloc(sizeof(seed_cache_box) * (size_t)n_threads);
-    // seed_cache_box cache_box[n_threads];
+    seed_cache_box* cache_box = (seed_cache_box*)malloc(sizeof(seed_cache_box) * (size_t)n_threads);
+    // seed_cache_box cache_box[n_threads]; // we could use the C stack. On normal linux machines, it's 8MB ($ ulimit -s). However, it doesn't quite feel right.
    srand(1);
    for (int i = 0; i < n_threads; i++) {
        // Constraints:
@ -60,16 +60,20 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
    }

    int i;
-#pragma omp parallel private(i, quotient)
+#pragma omp parallel private(i)
    {
 #pragma omp for
        for (i = 0; i < n_threads; i++) {
-            int quotient = n_samples / n_threads;
            int lower_bound_inclusive = i * quotient;
            int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
+
            for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
                results[j] = sampler(&(cache_box[i].seed));
-                // Could also result in inefficient cache stuff, but hopefully not too often
+                // In principle, these results[j] could also result in two threads competing for the same cache line.
+                // In practice, though,
+                // a) this would happen infrequently
+                // b) trying to unroll loops actually makes the code slower
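+                // c) 8 results[j] are 8 doubles, which fit a (typical 64-byte) cache line. If n_samples/n_threads is much larger than 8, neighbouring threads can only share a cache line at the edges of their contiguous chunks.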
            }
        }
    }
@ -88,7 +92,7 @@ typedef struct ci_t {
    double high;
 } ci;

-static void swp(int i, int j, double xs[])
+inline static void swp(int i, int j, double xs[])
 {
    double tmp = xs[i];
    xs[i] = xs[j];
@ -120,7 +124,7 @@ static double quickselect(int k, double xs[], int n)
 {
    // https://en.wikipedia.org/wiki/Quickselect

-    double *ys = malloc((size_t)n * sizeof(double));
+    double* ys = malloc((size_t)n * sizeof(double));
    memcpy(ys, xs, (size_t)n * sizeof(double));
    // ^: don't rearrange item order in the original array
Author	SHA1	Message	Date
NunoSempere	48f333adfe	formatting pass; makefile tweaks	2024-01-13 01:05:44 +01:00
NunoSempere	b497b5b399	remove loop unrolling again	2024-01-13 00:50:51 +01:00
NunoSempere	3bb2804ccf	save squiggle_more tweaks	2024-01-13 00:50:27 +01:00
NunoSempere	eb1c592610	formatting pass.	2024-01-12 23:55:09 +01:00
NunoSempere	dd6bb53f1b	add more comments, undo loop unrolling.	2024-01-12 23:53:58 +01:00
NunoSempere	c25e9f916f	add failed example of loop unrolling	2024-01-12 20:41:06 +01:00
NunoSempere	a50d776d2c	add comment on size of the C stack	2024-01-12 19:08:12 +01:00