add failed example of loop unrolling

2024-01-12 20:41:06 +01:00 · 2024-01-12 20:41:06 +01:00 · c25e9f916f
commit c25e9f916f
parent a50d776d2c
17 changed files with 36 additions and 0 deletions
--- a/examples/more/00_example_template/example
+++ b/examples/more/00_example_template/example
--- a/examples/more/01_sample_from_cdf/example
+++ b/examples/more/01_sample_from_cdf/example
--- a/examples/more/02_sample_from_cdf_beta/example
+++ b/examples/more/02_sample_from_cdf_beta/example
--- a/examples/more/03_ci_beta/example
+++ b/examples/more/03_ci_beta/example
--- a/examples/more/04_nuclear_war/example
+++ b/examples/more/04_nuclear_war/example
--- a/examples/more/05_burn_10kg_fat/example
+++ b/examples/more/05_burn_10kg_fat/example
--- a/examples/more/06_nuclear_recovery/example
+++ b/examples/more/06_nuclear_recovery/example
--- a/examples/more/07_algebra/example
+++ b/examples/more/07_algebra/example
--- a/examples/more/08_algebra_and_conversion/example
+++ b/examples/more/08_algebra_and_conversion/example
--- a/examples/more/09_ergonomic_algebra/example
+++ b/examples/more/09_ergonomic_algebra/example
--- a/examples/more/10_twitter_thread_example/example
+++ b/examples/more/10_twitter_thread_example/example
--- a/examples/more/11_billion_lognormals_paralell/example
+++ b/examples/more/11_billion_lognormals_paralell/example
--- a/examples/more/12_time_to_botec_parallel/example
+++ b/examples/more/12_time_to_botec_parallel/example
--- a/examples/more/13_parallelize_min/example
+++ b/examples/more/13_parallelize_min/example
--- a/examples/more/14_check_confidence_interval/example
+++ b/examples/more/14_check_confidence_interval/example
--- a/6
+++ b/6
@ -4,6 +4,9 @@ MAKEFLAGS += --no-print-directory
 STYLE_BLUEPRINT=webkit
 FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
 ## Time to botec
 TTB=./examples/more/12_time_to_botec_parallel/example
 build-examples:
 	cd examples/core && make all
 	cd examples/more && make all
@ -25,3 +28,6 @@ profile:
 	sudo perf report
 	rm perf.data
 time-linux: 
 	@echo "Running 100x and taking avg time: $(TTB)"
 	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
--- a/squiggle_more.c
+++ b/squiggle_more.c
@ -64,6 +64,8 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
    {
 #pragma omp for
        for (i = 0; i < n_threads; i++) {
            // Simple version 
            /*
            int quotient = n_samples / n_threads;
            int lower_bound_inclusive = i * quotient;
            int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
@ -71,6 +73,34 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
                results[j] = sampler(&(cache_box[i].seed));
                // Could also result in inefficient cache stuff, but hopefully not too often
            }
            */
            // Version with loop unrolling
            int quotient = n_samples / n_threads;
            int lower_bound_inclusive = i * quotient;
            int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
            int delta = quotient;
            int eighth_of_deltas = delta/8; // why 8? a double is 8 bytes, so 8 doubles fill one 64-byte cache line
            int k;
            // to do: simplify these variables. Maybe divide by n_threads * 8 directly
            for(int j=0; j<eighth_of_deltas; j++){
                k = lower_bound_inclusive + j*8;
                results[k+0] = sampler(&(cache_box[i].seed));
                results[k+1] = sampler(&(cache_box[i].seed));
                results[k+2] = sampler(&(cache_box[i].seed));
                results[k+3] = sampler(&(cache_box[i].seed));
                results[k+4] = sampler(&(cache_box[i].seed));
                results[k+5] = sampler(&(cache_box[i].seed));
                results[k+6] = sampler(&(cache_box[i].seed));
                results[k+7] = sampler(&(cache_box[i].seed));
                // these all fit one single cache line
                // name of the technique: loop unrolling.
            }
            for(int k=eighth_of_deltas*8; k<upper_bound_not_inclusive; k++){
                results[k] = sampler(&(cache_box[i].seed));
            }
        }
    }
    for (int j = divisor_multiple; j < n_samples; j++) {