Compare commits


No commits in common. "3b29ad7e4534fa397136204d41c2823bd9f96df4" and "b624ab46d790ba3833186e4878959dedd252a2cb" have entirely different histories.

60 changed files with 211 additions and 3494 deletions

View File

@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=samples.c
OUTPUT=samples
## Dependencies
DEPS='gsl'
## Flags
INCS=`pkg-config --cflags ${DEPS}`
LIBS=`pkg-config --libs ${DEPS}`
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
FAST=-Ofast
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS)
fast: $(SRC)
$(CC) $(FAST) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
echo "Increasing stack size limit, because we are dealing with 1M samples"
# ulimit: increase stack size limit
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.
# Then run the program
ulimit -Ss 256000 && ./$(OUTPUT)
# Old:
# Link libraries, for good measure
# LD_LIBRARY_PATH=/usr/local/lib
# export LD_LIBRARY_PATH

Binary file not shown.

View File

@ -1,157 +0,0 @@
#include <gsl/gsl_randist.h>
#include <gsl/gsl_rng.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 1000000
/*
* For very high values of N, you will want to increase the maximum stack size, otherwise you will suffer a segmentation fault
* In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs
* And confirm it with $ ulimit -a
*/
/* Helpers */
void print(double* ys)
{
for (int i = 0; i < N; i++) {
printf("%f\n", ys[i]);
}
printf("\n");
}
void fill(double* ys, float f)
{
for (int i = 0; i < N; i++) {
ys[i] = f;
}
}
double sum(double* ps, int n)
{
double result = 0;
for (int i = 0; i < n; i++) {
result += ps[i];
}
return (result);
}
void cumsum(double* ps, double* rs, int n)
{
double counter = 0;
for (int i = 0; i < n; i++) {
counter += ps[i];
rs[i] = counter;
}
}
/* Distributions*/
void normal(gsl_rng* r, double* ys, double mean, double std)
{
for (int i = 0; i < N; i++) {
ys[i] = mean + gsl_ran_gaussian(r, std);
}
}
void lognormal(gsl_rng* r, double* ys, double zeta, double sigma)
{
for (int i = 0; i < N; i++) {
ys[i] = gsl_ran_lognormal(r, zeta, sigma);
}
}
void to(gsl_rng* r, double* ys, double low, double high)
{
double normal95confidencePoint = 1.6448536269514722;
double log_low = log(low);
double log_high = log(high);
double zeta = (log_low + log_high) / 2;
double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint);
lognormal(r, ys, zeta, sigma);
}
/* Mixture of distributions */
void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* results)
{
/* Get cummulative, normalized weights */
double sum_weights = sum(weights, n);
double normalized_weights[n];
for (int i = 0; i < n; i++) {
normalized_weights[i] = weights[i] / sum_weights;
}
double cummulative_weights[n];
cumsum(normalized_weights, cummulative_weights, n);
/* Get N samples, drawn from the different distributions in proportion to their weights. */
for (int i = 0; i < N; i++) {
double p_1 = gsl_rng_uniform(r);
double p_2 = gsl_rng_uniform(r);
int index_found = 0;
int index_counter = 0;
while ((index_found == 0) && (index_counter < n)) {
if (p_1 < cummulative_weights[index_counter]) {
index_found = 1;
} else {
index_counter++;
}
}
if (index_found == 0) {
printf("\nThis shouldn't be able to happen");
// gsl_rng_free (r);
// abort(); // this shouldn't have happened.
} else {
int sample_index = (int)floor(p_2 * N);
results[i] = dists[index_counter][sample_index];
}
}
}
/* Main */
int main(void)
{
// Start clock
clock_t start, end;
start = clock();
/* Initialize GNU Statistical Library (GSL) stuff */
const gsl_rng_type* T;
gsl_rng* r;
// gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc(T);
/* Toy example */
/* Declare variables in play */
double p_a, p_b, p_c;
double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N];
/* Initialize variables */
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
fill(dist_none, 0);
fill(dist_one, 1);
to(r, dist_few, 1, 3);
to(r, dist_many, 2, 10);
/* Generate mixture */
int n = 4;
double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
double* dists[] = { dist_none, dist_one, dist_few, dist_many };
mixture(r, dists, weights, n, dist_mixture);
printf("%f\n", sum(dist_mixture, N) / N);
/* Clean up GSL */
gsl_rng_free(r);
// End clock
end = clock();
printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000);
/* Return success*/
return EXIT_SUCCESS;
}

View File

@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC_ONE_THREAD=./samples-one-thread.c
OUTPUT_ONE_THREAD=out/samples-one-thread
## Dependencies
# Has no dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC_ONE_THREAD)
mkdir -p out
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD)
format: $(SRC_ONE_THREAD)
$(FORMATTER) $(SRC_ONE_THREAD)
run: $(SRC_ONE_THREAD) $(OUTPUT_ONE_THREAD)
./$(OUTPUT_ONE_THREAD)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time: $(OUTPUT_ONE_THREAD)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT_ONE_THREAD); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
/bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo
debian-install-dependencies:
sudo apt-get install libomp-dev

View File

@ -1,183 +0,0 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
const float PI = 3.14159265358979323846;
#define N 1000000
//Array helpers
void array_print(float* array, int length)
{
for (int i = 0; i < length; i++) {
printf("item[%d] = %f\n", i, array[i]);
}
printf("\n");
}
void array_fill(float* array, int length, float item)
{
int i;
{
for (i = 0; i < length; i++) {
array[i] = item;
}
}
}
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
float rand_float(float to)
{
return ((float)rand() / (float)RAND_MAX) * to;
}
float ur_normal()
{
float u1 = rand_float(1.0);
float u2 = rand_float(1.0);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
inline float random_uniform(float from, float to)
{
return ((float)rand() / (float)RAND_MAX) * (to - from) + from;
}
inline float random_normal(float mean, float sigma)
{
return (mean + sigma * ur_normal());
}
inline float random_lognormal(float logmean, float logsigma)
{
return expf(random_normal(logmean, logsigma));
}
inline float random_to(float low, float high)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma);
}
void array_random_to(float* array, int length, float low, float high)
{
int i;
#pragma omp private(i)
{
#pragma omp for
for (i = 0; i < length; i++) {
array[i] = random_to(low, high);
}
}
}
void mixture(float (*samplers[])(void), float* weights, int n_dists, float* results, int results_length)
{
float sum_weights = array_sum(weights, n_dists);
float* normalized_weights = malloc(n_dists * sizeof(float));
for (int i = 0; i < n_dists; i++) {
normalized_weights[i] = weights[i] / sum_weights;
}
float* cummulative_weights = malloc(n_dists * sizeof(float));
array_cumsum(normalized_weights, cummulative_weights, n_dists);
//create var holders
float p1;
int sample_index, i, own_length;
{
for (int i = 0; i < results_length; i++) {
p1 = random_uniform(0, 1);
for (int j = 0; j < n_dists; j++) {
if (p1 < cummulative_weights[j]) {
results[i] = samplers[j]();
break;
}
}
}
}
free(normalized_weights);
free(cummulative_weights);
}
float sample_0()
{
return 0;
}
float sample_1()
{
return 1;
}
float sample_few()
{
return random_to(1, 3);
}
float sample_many()
{
return random_to(2, 10);
}
int main()
{
//initialize randomness
srand(1);
// clock_t start, end;
// start = clock();
// Toy example
// Declare variables in play
float p_a, p_b, p_c;
// printf("Max threads: %d\n", n_threads);
// omp_set_num_threads(n_threads);
// Initialize variables
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
// Generate mixture
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(void) = { sample_0, sample_1, sample_few, sample_many };
float* results = malloc(N * sizeof(float));
mixture(samplers, weights, n_dists, results, N);
printf("Sum(dist_mixture, N)/N = %f\n", array_sum(results, N) / N);
// array_print(dist_mixture[0], N);
// end = clock();
// printf("Time (ms): %f\n", ((double)(end - start)) / (CLOCKS_PER_SEC * 1000));
// ^ Will only measure how long it takes the inner main to run, not the whole program,
// including e.g., loading the program into memory or smth.
// Also CLOCKS_PER_SEC in POSIX is a constant equal to 1000000.
// See: https://stackoverflow.com/questions/10455905/why-is-clocks-per-sec-not-the-actual-number-of-clocks-per-second
return 0;
}

View File

@ -1,18 +0,0 @@
# Time to BOTEC in C
This repository contains a few implementations of a simple BOTEC (back-of-the-envelope calculation) in C:
- In the folder C-01-simple/, you can see a simple implementation, which passes large arrays around
- In the folder C-02-better-algorithm-one-thread/, you can see a better implementation, which passes around pointers to functions and is therefore more efficient (see the sketch below)
- The top level samples.c uses the algorithm improvements in C-02..., and also implements multithreading using OpenMP
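For illustration, here is a minimal sketch of the function-pointer idea; it is not the code in C-02... itself, and the names (`sampler`, `mixture_once`) are made up for this sketch. Each distribution is just a function that returns one sample, and the mixture picks which function to call, so no million-element arrays have to be passed around:
```
// Sketch of the "pass sampler functions around" approach; names are illustrative.
#include <stdio.h>
#include <stdlib.h>

typedef float (*sampler)(void); // a distribution is just a function returning one sample

static float sample_zero(void) { return 0; }
static float sample_one(void) { return 1; }

// Draw one mixture sample: pick a sampler according to cumulative, normalized weights.
static float mixture_once(sampler samplers[], float cum_weights[], int n_dists)
{
    float p = (float)rand() / (float)RAND_MAX;
    for (int i = 0; i < n_dists; i++) {
        if (p < cum_weights[i])
            return samplers[i]();
    }
    return samplers[n_dists - 1](); // p landed exactly on 1
}

int main(void)
{
    srand(1);
    sampler samplers[] = { sample_zero, sample_one };
    float cum_weights[] = { 0.6f, 1.0f }; // already cumulative and normalized
    double sum = 0;
    for (int i = 0; i < 1000000; i++)
        sum += mixture_once(samplers, cum_weights, 2);
    printf("mean: %f\n", sum / 1000000); // ~0.4 for these weights
    return EXIT_SUCCESS;
}
```
The actual implementations also normalize the weights and handle the random seed differently, but the core idea is the same.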
## To do
- [ ] Add Windows/Powershell time-measuring commands
- [ ] Add CUDA?
- [x] Added results of perf. `rand_r` seems like a big chunk of it, but I'm hesitant to use lower-quality random numbers
- [x] used xorshift instead
- [-] Use xorshift with a struct instead of a pointer? idk, could be faster for some reason? => Tested, it takes the same time.
- [x] Update repository with correct timing
- [x] Use better profiling approach to capture timing with 1M samples.
- [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used

C/hello-world/build.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/bash
gcc -std=c99 -Wall -lm hello-world.c -o hello-world

C/hello-world/hello-world.c Executable file
View File

@ -0,0 +1,7 @@
#include <stdlib.h>
#include <stdio.h>
int main(void){
printf("Hello world!\n");
return EXIT_SUCCESS;
}

View File

@ -1,106 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=samples.c
OUTPUT=out/samples
SRC_ONE_THREAD=./samples-one-thread.c
OUTPUT_ONE_THREAD=out/samples-one-thread
## Dependencies
# Has no dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
static:
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
multi:
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
OMP_NUM_THREADS=2 ./$(OUTPUT) && echo
OMP_NUM_THREADS=4 ./$(OUTPUT) && echo
OMP_NUM_THREADS=8 ./$(OUTPUT) && echo
OMP_NUM_THREADS=16 ./$(OUTPUT) && echo
## Timing
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
time-linux-fastest:
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
# ./$(OUTPUT)
# gprof:
# gprof $(OUTPUT) gmon.out > analysis.txt
# rm gmon.out
# vim analysis.txt
# rm analysis.txt
# perf:
OMP_NUM_THREADS=16 sudo perf record $(OUTPUT)
sudo perf report
rm perf.data
## Install
debian-install-dependencies:
sudo apt-get install libomp-dev

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large

View File

@ -1,26 +0,0 @@
Samples: 884 of event 'cycles', Event count (approx.): 551167949
Overhead Command Shared Object Symbol
35.32% samples samples [.] xorshift32
14.09% samples libgomp.so.1.0.0 [.] 0x000000000001d2ea
12.04% samples libgomp.so.1.0.0 [.] 0x000000000001d132
11.53% samples samples [.] mixture._omp_fn.0
4.55% samples libm-2.31.so [.] __sin_fma
4.24% samples samples [.] rand_0_to_1
3.77% samples samples [.] random_to
3.03% samples libm-2.31.so [.] __logf_fma
1.61% samples libm-2.31.so [.] __expf_fma
1.54% samples samples [.] split_array_sum._omp_fn.0
1.38% samples samples [.] random_uniform
0.94% samples samples [.] ur_normal
0.91% samples libm-2.31.so [.] __ieee754_log_fma
0.74% samples libgomp.so.1.0.0 [.] 0x000000000001d13d
0.52% samples samples [.] sample_0
0.41% samples libm-2.31.so [.] __sqrtf_finite@GLIBC_2.15
0.38% samples samples [.] sample_1
0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d139
0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d2f5
0.22% samples [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
0.18% samples [kernel.kallsyms] [k] _raw_spin_lock_irq
0.18% samples samples [.] random_lognormal
0.17% samples libgomp.so.1.0.0 [.] 0x000000000001d2f1

View File

@ -1,251 +0,0 @@
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
const float PI = 3.14159265358979323846;
#define N 1000000
//Array helpers
void array_print(float* array, int length)
{
for (int i = 0; i < length; i++) {
printf("item[%d] = %f\n", i, array[i]);
}
printf("\n");
}
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
// Split array helpers
int split_array_get_length(int index, int total_length, int n_threads)
{
return (total_length % n_threads > index ? total_length / n_threads + 1 : total_length / n_threads);
}
void split_array_allocate(float** meta_array, int length, int divide_into)
{
int split_array_length;
for (int i = 0; i < divide_into; i++) {
split_array_length = split_array_get_length(i, length, divide_into);
meta_array[i] = malloc(split_array_length * sizeof(float));
}
}
void split_array_free(float** meta_array, int divided_into)
{
for (int i = 0; i < divided_into; i++) {
free(meta_array[i]);
}
free(meta_array);
}
float split_array_sum(float** meta_array, int length, int divided_into)
{
int i;
float output = 0;
#pragma omp parallel for reduction(+:output)
for (int i = 0; i < divided_into; i++) {
float own_partial_sum = 0;
int split_array_length = split_array_get_length(i, length, divided_into);
for (int j = 0; j < split_array_length; j++) {
own_partial_sum += meta_array[i][j];
}
output += own_partial_sum;
}
return output;
}
// Pseudo Random number generator
uint32_t xorshift32(uint32_t* seed)
{
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift
// Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *seed = x;
}
// Distribution & sampling functions
float rand_0_to_1(uint32_t* seed){
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
/*
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return ((float)(*seed = x))/((float) UINT32_MAX);
*/
// previously:
// ((float)rand_r(seed) / (float)RAND_MAX)
// and before that: rand, but it wasn't thread-safe.
// See: <https://stackoverflow.com/questions/43151361/how-to-create-thread-safe-random-number-generator-in-c-using-rand-r> for why to use rand_r:
// rand() is not thread-safe, as it relies on (shared) hidden seed.
}
float rand_float(float max, uint32_t* seed)
{
return rand_0_to_1(seed) * max;
}
float ur_normal(uint32_t* seed)
{
float u1 = rand_0_to_1(seed);
float u2 = rand_0_to_1(seed);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
float random_uniform(float from, float to, uint32_t* seed)
{
return rand_0_to_1(seed) * (to - from) + from;
}
float random_normal(float mean, float sigma, uint32_t* seed)
{
return (mean + sigma * ur_normal(seed));
}
float random_lognormal(float logmean, float logsigma, uint32_t* seed)
{
return expf(random_normal(logmean, logsigma, seed));
}
float random_to(float low, float high, uint32_t* seed)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma, seed);
}
// Mixture function
void mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, float** results, int n_threads)
{
// You can see a simpler version of this function in the git history
// or in C-02-better-algorithm-one-thread/
float sum_weights = array_sum(weights, n_dists);
float* cumsummed_normalized_weights = malloc(n_dists * sizeof(float));
cumsummed_normalized_weights[0] = weights[0]/sum_weights;
for (int i = 1; i < n_dists; i++) {
cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i]/sum_weights;
}
//create var holders
float p1;
int sample_index, i, split_array_length;
// uint32_t* seeds[n_threads];
uint32_t** seeds = malloc(n_threads * sizeof(uint32_t*));
for (uint32_t i = 0; i < n_threads; i++) {
seeds[i] = malloc(sizeof(uint32_t));
*seeds[i] = i + 1; // xorshift can't start with 0
}
#pragma omp parallel private(i, p1, sample_index, split_array_length)
{
#pragma omp for
for (i = 0; i < n_threads; i++) {
split_array_length = split_array_get_length(i, N, n_threads);
for (int j = 0; j < split_array_length; j++) {
p1 = random_uniform(0, 1, seeds[i]);
for (int k = 0; k < n_dists; k++) {
if (p1 < cumsummed_normalized_weights[k]) {
results[i][j] = samplers[k](seeds[i]);
break;
}
}
}
}
}
// free(normalized_weights);
// free(cummulative_weights);
free(cumsummed_normalized_weights);
for (uint32_t i = 0; i < n_threads; i++) {
free(seeds[i]);
}
free(seeds);
}
// Functions used for the BOTEC.
// Their type has to be the same, as we will be passing them around.
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main()
{
// Toy example
// Declare variables in play
float p_a, p_b, p_c;
int n_threads = omp_get_max_threads();
// printf("Max threads: %d\n", n_threads);
// omp_set_num_threads(n_threads);
float** dist_mixture = malloc(n_threads * sizeof(float*));
split_array_allocate(dist_mixture, N, n_threads);
// Initialize variables
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
// Generate mixture
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
mixture(samplers, weights, n_dists, dist_mixture, n_threads);
printf("Sum(dist_mixture, N)/N = %f\n", split_array_sum(dist_mixture, N, n_threads) / N);
// array_print(dist_mixture[0], N);
split_array_free(dist_mixture, n_threads);
return 0;
}

C/samples/build-and-run.sh Executable file
View File

@ -0,0 +1,15 @@
#!/bin/bash
root="$1"
gcc -std=c99 -Wall -lm "$root".c -o "$root" -I/usr/local/include -L/usr/local/lib -lgsl -lgslcblas -lm -g
# Link libraries, for good measure
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH
# Increase stack size limit
ulimit -Ss 256000
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.
# run
./samples

C/samples/build-gsl.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash
root="$1"
gcc -std=c99 -Wall -lm "$root".c -o "$root" -I/usr/local/include -L/usr/local/lib -lgsl -lgslcblas -lm -g
# Link libraries, for good measure
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH
# Increase stack size limit
ulimit -Ss 256000
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.

C/samples/samples Executable file

Binary file not shown.

C/samples/samples.c Executable file
View File

@ -0,0 +1,140 @@
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#define N 1000000
/*
* For very high values of N, you will want to increase the maximum stack size, otherwise you will suffer a segmentation fault
* In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs
* And confirm it with $ ulimit -a
*/
/* Helpers */
void print(double *ys){
for(int i=0; i<N; i++){
printf("%f\n", ys[i]);
}
printf("\n");
}
void fill(double *ys, float f){
for(int i=0; i<N; i++){
ys[i] = f;
}
}
double sum(double *ps, int n){
double result = 0;
for(int i=0; i<n; i++){
result += ps[i];
}
return(result);
}
void cumsum(double *ps, double *rs, int n){
double counter = 0;
for(int i=0; i<n; i++){
counter += ps[i];
rs[i] = counter;
}
}
/* Distributions*/
void normal(gsl_rng * r, double *ys, double mean, double std){
for(int i=0; i<N; i++){
ys[i] = mean + gsl_ran_gaussian(r, std);
}
}
void lognormal(gsl_rng * r, double *ys, double zeta, double sigma){
for(int i=0; i<N; i++){
ys[i] = gsl_ran_lognormal(r, zeta, sigma);
}
}
void to(gsl_rng * r, double *ys, double low, double high){
double normal95confidencePoint = 1.6448536269514722;
double log_low = log(low);
double log_high = log(high);
double zeta = (log_low + log_high)/2;
double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint);
lognormal(r, ys, zeta, sigma);
}
/* Mixture of distributions */
void mixture(gsl_rng * r, double *dists[], double *weights, int n, double *results){
/* Get cummulative, normalized weights */
double sum_weights = sum(weights, n);
double normalized_weights[n];
for(int i=0; i<n; i++){
normalized_weights[i] = weights[i]/sum_weights;
}
double cummulative_weights[n];
cumsum(normalized_weights, cummulative_weights, n);
/* Get N samples, drawn from the different distributions in proportion to their weights. */
for(int i=0; i<N; i++){
double p_1 = gsl_rng_uniform(r);
double p_2 = gsl_rng_uniform(r);
int index_found = 0;
int index_counter = 0;
while ((index_found == 0) && (index_counter < n) ){
if(p_1 < cummulative_weights[index_counter]){
index_found = 1;
}else{
index_counter++;
}
}
if(index_found == 0) {
printf("\nThis shouldn't be able to happen");
// gsl_rng_free (r);
// abort(); // this shouldn't have happened.
}else{
int sample_index = (int) floor(p_2 * N);
results[i] = dists[index_counter][sample_index];
}
}
}
/* Main */
int main(void){
/* Initialize GNU Statistical Library (GSL) stuff */
const gsl_rng_type * T;
gsl_rng * r;
// gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc (T);
/* Toy example */
/* Declare variables in play */
double p_a, p_b, p_c;
double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N];
/* Initialize variables */
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
fill(dist_none, 0);
fill(dist_one, 1);
to(r, dist_few, 1, 3);
to(r, dist_many, 2, 10);
/* Generate mixture */
int n = 4;
double weights[] = { 1 - p_c, p_c/2, p_c/4, p_c/4 };
double *dists[] = { dist_none, dist_one, dist_few, dist_many };
mixture(r, dists, weights, n, dist_mixture);
printf("%f\n", sum(dist_mixture, N)/N);
/* Clean up GSL */
gsl_rng_free (r);
/* Return success*/
return EXIT_SUCCESS;
}

View File

@ -1,50 +0,0 @@
#include "../../squiggle.h"
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
// Estimate functions
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main(){
// set randomness seed
uint32_t* seed = malloc(sizeof(uint32_t));
*seed = 1000; // xorshift can't start with 0
float p_a = 0.8;
float p_b = 0.5;
float p_c = p_a * p_b;
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
float result_one = mixture(samplers, weights, n_dists, seed);
printf("result_one: %f\n", result_one);
}
/*
Aggregation mechanisms:
- Quantiles (requires a sort)
- Sum
- Average
- Std
*/

View File

@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=example.c
OUTPUT=example
## Dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-Ofast
# OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(MATH) -o $(OUTPUT)
sudo perf record $(OUTPUT)
sudo perf report
rm perf.data

View File

@ -1,59 +0,0 @@
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include "../../squiggle.h"
// Estimate functions
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main(){
// set randomness seed
uint32_t* seed = malloc(sizeof(uint32_t));
*seed = 1000; // xorshift can't start with 0
float p_a = 0.8;
float p_b = 0.5;
float p_c = p_a * p_b;
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
int n_samples = 1000000;
float* result_many = (float *) malloc(n_samples * sizeof(float));
for(int i=0; i<n_samples; i++){
result_many[i] = mixture(samplers, weights, n_dists, seed);
}
printf("result_many: [");
for(int i=0; i<100; i++){
printf("%.2f, ", result_many[i]);
}
printf("]\n");
}
/*
Aggregation mechanisms:
- Quantiles (requires a sort)
- Sum
- Average
- Std
*/

View File

@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=example.c
OUTPUT=example
## Dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-Ofast
# OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(MATH) -o $(OUTPUT)
sudo perf record $(OUTPUT)
sudo perf report
rm perf.data

View File

@ -1,109 +0,0 @@
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
const float PI = 3.14159265358979323846;
// Pseudo Random number generator
uint32_t xorshift32(uint32_t* seed)
{
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift
// Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *seed = x;
}
// Distribution & sampling functions
float rand_0_to_1(uint32_t* seed){
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
}
float rand_float(float max, uint32_t* seed)
{
return rand_0_to_1(seed) * max;
}
float ur_normal(uint32_t* seed)
{
float u1 = rand_0_to_1(seed);
float u2 = rand_0_to_1(seed);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
float random_uniform(float from, float to, uint32_t* seed)
{
return rand_0_to_1(seed) * (to - from) + from;
}
float random_normal(float mean, float sigma, uint32_t* seed)
{
return (mean + sigma * ur_normal(seed));
}
float random_lognormal(float logmean, float logsigma, uint32_t* seed)
{
return expf(random_normal(logmean, logsigma, seed));
}
float random_to(float low, float high, uint32_t* seed)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma, seed);
}
// Array helpers
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
// Mixture function
float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint32_t* seed)
{
// You can see a simpler version of this function in the git history
// or in C-02-better-algorithm-one-thread/
float sum_weights = array_sum(weights, n_dists);
float* cumsummed_normalized_weights = (float*) malloc(n_dists * sizeof(float));
cumsummed_normalized_weights[0] = weights[0]/sum_weights;
for (int i = 1; i < n_dists; i++) {
cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i]/sum_weights;
}
float p = random_uniform(0, 1, seed);
float result;
for (int k = 0; k < n_dists; k++) {
if (p < cumsummed_normalized_weights[k]) {
result = samplers[k](seed);
break;
}
}
free(cumsummed_normalized_weights);
return result;
}

View File

@ -1,9 +0,0 @@
- [ ] Add example for only one sample
- [ ] Add example for many samples
- [ ] Use gcc extension to define functions nested inside main.
- [ ] Use OpenMP for acceleration
- [ ] Chain various mixture functions
- [ ] Have some more complicated & realistic example
- [ ] Add summarization functions, like mean, std, 90% ci (or all c.i.?)
- [ ] Add beta distribution

View File

@ -1,15 +0,0 @@
build:
gcc xorshift-pointer.c -o out/xorshift-pointer
gcc xorshift-struct.c -o out/xorshift-struct
run-pointer:
./out/xorshift-pointer
run-struct:
./out/xorshift-struct
time-pointer:
/bin/time -f "Time: %es" ./out/xorshift-pointer && echo
time-struct:
/bin/time -f "Time: %es" ./out/xorshift-struct && echo

View File

@ -1 +0,0 @@
Using a pointer or a struct turns out to be pretty equivalent in terms of speed. I don't think there are gains to be found here, but I find the pointer implementation clearer.

View File

@ -1,41 +0,0 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
uint32_t xorshift32(uint32_t* state)
{
/* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
uint32_t x = *state;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *state = x;
}
float rand_xorshift32(uint32_t* state){
return (float) xorshift32(state) / (float) UINT32_MAX;
}
int main(){
uint32_t** states = malloc(4 * sizeof(uint32_t*));
for(int i=0; i<4;i++){
states[i] = malloc(sizeof(uint32_t));
*states[i] = (uint32_t) i + 1;
}
for(int i=0; i<1000000000;i++){
uint32_t x = xorshift32(states[0]);
float y = rand_xorshift32(states[1]);
// printf("%u\n", x);
// printf("%f\n", y);
}
for(int i=0; i<4;i++){
free(states[i]);
}
free(states);
return 0;
}
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift

View File

@ -1,46 +0,0 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
struct xorshift32_state {
uint32_t a;
};
uint32_t xorshift32(struct xorshift32_state *state)
{
/* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
uint32_t x = state->a;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return state->a = x;
}
float rand_xorshift32(struct xorshift32_state *state){
return (float) xorshift32(state) / (float) UINT32_MAX;
}
int main(){
struct xorshift32_state** states = malloc(sizeof(struct xorshift32_state*) * 4);
for(int i=0; i<4;i++){
states[i] = malloc(sizeof(struct xorshift32_state));
states[i]->a = (uint32_t) i + 1;
}
for(int i=0; i<1000000000; i++){
uint32_t x = xorshift32(states[0]);
float y = rand_xorshift32(states[1]);
// printf("%u\n", x);
// printf("%f\n", y);
}
for(int i=0; i<4;i++){
free(states[i]);
}
free(states);
return 0;
}

R/hello-world.R Normal file
View File

@ -0,0 +1 @@
print("Hello world")

View File

@ -48,3 +48,6 @@ weights = c((1 - p_c), p_c/2, p_c/4, p_c/4)
result = mixture(dists, weights)
mean_result = mean(result)
print(mean_result)

README.md
View File

@ -2,161 +2,49 @@
## About
This repository contains examples of very simple code to manipulate samples in various programming languages. As of now, it may be useful for checking the validity of simple estimations. The title of this repository is a pun on two meanings of "time to": "how much time does it take to do x", and "let's do x".
This repository contains examples of very simple code to manipulate samples in various programming languages, and tallies their speed and the complexity of their code. It implements this platonic estimate:
```
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b
dists = [0, 1, 1 to 3, 2 to 10]
weights = [(1 - p_c), p_c/2, p_c/4, p_c/4 ]
result = mixture(dists, weights) # should be 1M samples
mean(result)
```
I don't particularly care about the speed of this particular example, but rather think that the speed of this simple example is indicative of the speed when considering 100x or 1000x more complicated models. As of now, it may also be useful for checking the validity of simple estimations.
The title of this repository is a pun on two meanings of "time to": "how much time does it take to do x", and "let's do x". "BOTEC" stands for "back of the envelope calculation".
## Current languages
- [x] C
- [x] Javascript (NodeJS)
- [x] Squiggle
- [x] R
- [x] Python
- [x] Nim
## Comparison table
| Language | Time | Lines of code |
|-----------------------------|-----------|---------------|
| C (optimized, 16 threads) | 5ms | 249 |
| Nim | 38ms | 84 |
| Lua (LuaJIT) | 68ms | 82 |
| Lua | 278ms | 82 |
| C (naïve implementation) | 292ms | 149 |
| Javascript (NodeJS) | 732ms | 69 |
| Squiggle | 1,536s | 14* |
| SquigglePy | 1.602s | 18* |
| R | 7,000s | 49 |
| Python (CPython) | 16,641s | 56 |
Time measurements were taken with the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples. But different implementations use different algorithms and, occasionally, different time-measuring methodologies (for the C, Nim and Lua implementations, I run the program 100 times and take the mean). Their speed was also measured under different loads on my machine. So I think that these time estimates are accurate to within maybe ~2x or so.
## Performance table
With the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples:
| Language | Time |
|----------|-----------|
| C | 0m0,442s |
| Node | 0m0,732s |
| Squiggle | 0m1,536s |
| R | 0m7,000s |
| Python (CPython) | 0m16,641s |
Note that the number of lines is much shorter for Squiggle and SquigglePy because I'm just counting the lines needed to get Squiggle and SquigglePy to output a model, not the lines inside them.
## Notes
I was very surprised that Node/Squiggle code was almost as fast as the raw C code. For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org).
### Nim
I was really happy trying [Nim](https://nim-lang.org/), and as a result the Nim code is a bit more optimized and engineered:
1. It is using the fastest "danger" compilation mode.
2. It has some optimizations: I don't compute 1M samples for each dist, but instead pass functions around and compute the 1M samples at the end
3. I define the normal function from scratch, using the [Box-Muller transform](https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form) (see the sketch below).
4. I also have a version in which I define the logarithm and sine functions themselves in nim to feed into the Box-Muller method. But it is much slower.
Without 1. and 2., the nim code takes 0m0.183s instead. But I don't think that these are unfair advantages: I liked trying out nim and therefore put more love into the code, and this seems like it could be a recurring factor.
Ultimately, these optimizations were also incorporated into the C code as well.
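For reference, a minimal C sketch of that Box-Muller sampler (point 3 above); the function names here are made up for the sketch, not the ones used in the repository:
```
// Minimal Box-Muller sketch; compile with something like: gcc sketch.c -lm
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define PI 3.14159265358979323846

// Standard normal via the basic Box-Muller transform.
static double unit_normal(void)
{
    // Shift the uniforms away from 0 to avoid log(0).
    double u1 = ((double)rand() + 1.0) / ((double)RAND_MAX + 1.0);
    double u2 = ((double)rand() + 1.0) / ((double)RAND_MAX + 1.0);
    return sqrt(-2.0 * log(u1)) * sin(2.0 * PI * u2);
}

// Shift and scale to get normal(mean, sigma); exponentiate the result for a lognormal.
static double normal_sample(double mean, double sigma)
{
    return mean + sigma * unit_normal();
}

int main(void)
{
    srand(1);
    for (int i = 0; i < 3; i++)
        printf("%f\n", normal_sample(0, 1));
    return 0;
}
```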
Nim also has multithreading support, but I haven't bothered looking into it yet.
### C
The optimizations which make the final C code significantly faster than the naïve implementation are:
- To pass around pointers to functions, instead of large arrays. This is the same as in the nim implementation, but imho leads to more complex code
- To use the Box-Muller transform instead of using libraries, like in nim.
- To use multithreading support
The C code uses the `-Ofast` or `-O3` compilation flags. Initially, without using those flags and without the algorithmic improvements, the code took ~0.4 seconds to run. So I was surprised that the Node and Squiggle code were comparable to the C code. It ended up being the case that the C code could be pushed to be ~100x faster, though :)
In fact, the C code ended up being so fast that I had to measure its time by running the code 100 times in a row and dividing that amount by 100, rather than by just running it once, because running it once was too fast for /bin/time. More sophisticated profiling tools exist that could, e.g., account for how idle a machine is when running the code, but I didn't think that was worth it at this point.
And still, there are some missing optimizations, like tweaking the code to take into account cache misses. I'm not exactly sure how that would go, though.
Once the code was at 6.6ms, there was a 0.6ms gain possible by using OMP better, and a 1ms gain by using a xorshift algorithm instead of rand_r from stdlib. I think there might be further gains to be made by using OpenCL or CUDA, but I haven't gotten into how to do that yet.
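Roughly, the pattern behind those two gains (a per-thread xorshift state, so there is no shared hidden state as with `rand()`, plus an OpenMP parallel loop) looks like the sketch below. This is an illustration under the assumption that OpenMP is available, not the exact code of samples.c:
```
// Sketch: one xorshift32 state per thread, samples filled in parallel with OpenMP.
// Compile with something like: gcc -O3 -fopenmp sketch.c
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define N 1000000

static uint32_t xorshift32(uint32_t* seed)
{
    uint32_t x = *seed;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return *seed = x;
}

int main(void)
{
    float* samples = malloc(N * sizeof(float));
    #pragma omp parallel
    {
        // Each thread owns its seed; xorshift can't start at 0.
        uint32_t seed = (uint32_t)omp_get_thread_num() + 1;
        #pragma omp for
        for (int i = 0; i < N; i++)
            samples[i] = (float)xorshift32(&seed) / (float)UINT32_MAX;
    }
    double sum = 0;
    for (int i = 0; i < N; i++)
        sum += samples[i];
    printf("mean: %f\n", sum / N); // ~0.5 for uniform samples
    free(samples);
    return 0;
}
```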
Although the above paragraphs were written in the first person, the C code was written together with Jorge Sierra, who translated the algorithmic improvements from nim to it and added the initial multithreading support.
### NodeJS and Squiggle
Using [bun](https://bun.sh/) instead of node is actually a bit slower. Also, both the NodeJS and the Squiggle code use [stdlib](https://stdlib.io/) in their innards, which has a bunch of interleaved functions that make the code slower. It's possible that not using that external library could make the code faster, but at the same time, the js approach does seem to be to use external libraries whenever possible.
I am not particularly sure that the Squiggle code is actually producing 1M samples, but also have no particular plan to debug this.
### Python
For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org).
### R
R has a warm place in my heart from back in the day, and it has predefined functions to do everything. It was particularly fast to write for me, though not particularly fast to run :) However, I do recall that R does have some multithreading support; it wasn't used.
### Lua
I was also really pleased with Lua. I liked the simplicity. And coming from javascript, I appreciated that the program did not fail silently, but rather gave me useful errors, like:
```
lua samples.lua
lua: samples.lua:42: table index is nil
stack traceback:
samples.lua:42: in function 'mixture'
samples.lua:49: in main chunk
[C]: in ?
make: *** [makefile:14: run] Error 1
```
I also appreciated the speedup when using the LuaJIT interpreter. You can install it with commands similar to
```
git clone https://luajit.org/git/luajit.git
make
sudo make install
sudo ln -sf luajit-2.1.0-beta3 /usr/local/bin/luajit
```
Overall I'm thinking that a combination of lua at least for scripting and ¿nim/C/tbd? for more serious programs could be quite powerful.
### Overall thoughts
Overall I don't think that this is a fair comparison of the languages intrinsically, because I'm just differentially good at them, having chosen to put more effort into some than into others. But it is still useful to me personally, and perhaps mildly informative to others.
## Languages I may add later
- [ ] Julia (TuringML)
- [ ] Rust
- [ ] Lisp
- [ ] Go
- [ ] Zig
- [ ] Forth
- [ ] sh/bash, lol?
- [ ] OCaml
- [ ] Haskell
- [ ] OpenCL/CUDA (e.g., as in <https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/>). Seems like it would be overkill, and also the code would be way more complex. But maybe worth trying?
- [-] Stan => As far as I can tell, Stan is designed to generate samples from the posterior distribution given some data, not to create data by drawing samples from an arbitrary distribution.
- [ ] Maybe still worth reversing the process?
- ... and suggestions welcome
- Stan
## Languages added so far
- [x] Squiggle
- [x] Javascript (NodeJS)
- [x] Python (CPython)
- [x] R
- [x] C
- [x] Nim
- [x] SquigglePy
- [x] Lua
## Roadmap
The future of this project is uncertain. In most worlds, I simply forget about this repository.
To do:
- [ ] Check whether the Squiggle code is producing 1M samples. Still not too sure.
- Differentiate between initial startup time (e.g., compiling, loading environment) and runtime. This matters because startup time could be ~constant, so for larger projects only the runtime matters.
## Other similar projects
- Squigglepy: <https://github.com/rethinkpriorities/squigglepy>

js/hello-world.js Normal file
View File

@ -0,0 +1 @@
console.log("Hello world")

View File

@ -20,7 +20,7 @@
// MAIN //
var obj = ( typeof window === 'object' ) ? window : {};
var obj = ( typeof window === 'object' ) ? window : null;
// EXPORTS //

View File

@ -1 +0,0 @@
print("Hello world")

View File

@ -1,32 +0,0 @@
# Interface:
# make
# make build
# make run
# make time-linux
# make install
# make format
# make time-linux-simple
# make profile-linux
SRC=samples.lua
# INTERPETER=lua# < original
INTERPETER=luajit# < faster
run: $(SRC)
$(INTERPETER) $(SRC)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time of: $(INTERPETER) $(SRC)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(INTERPETER) $(SRC); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
/bin/time -f "Time: %es" $(INTERPETER) $(SRC) && echo
profile-linux:
@echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
@echo "Must be run as sudo"
sudo perf record $(INTERPETER) $(SRC)
sudo perf report
rm perf.data

View File

@ -1,82 +0,0 @@
-- Consts and prep
PI = 3.14159265358979323846;
NORMAL95CONFIDENCE = 1.6448536269514722;
math.randomseed(1234)
-- Random distribution functions
function sample_normal_0_1()
local u1 = math.random()
local u2 = math.random()
local result = math.sqrt(-2 * math.log(u1)) * math.sin(2 * PI * u2)
return result
end
function sample_normal(mean, sigma)
return mean + (sigma * sample_normal_0_1())
end
function sample_uniform(min, max)
return math.random() * (max - min) + min
end
function sample_lognormal(logmean, logsigma)
return math.exp(sample_normal(logmean, logsigma))
end
function sample_to(low, high)
local loglow = math.log(low);
local loghigh = math.log(high);
local logmean = (loglow + loghigh) / 2;
local logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return sample_lognormal(logmean, logsigma, seed);
end
-- Mixture
function mixture(samplers, weights, n)
assert(#samplers == #weights)
local l = #weights
local sum_weights = 0
for i = 1, l, 1 do
-- ^ arrays start at 1
sum_weights = sum_weights + weights[i]
end
local cumsummed_normalized_weights = {}
table.insert(cumsummed_normalized_weights, weights[1]/sum_weights)
for i = 2, l, 1 do
table.insert(cumsummed_normalized_weights, cumsummed_normalized_weights[i-1] + weights[i]/sum_weights)
end
local result = {}
for i = 1, n, 1 do
r = math.random()
local i = 1
while r > cumsummed_normalized_weights[i] do
i = i+1
end
table.insert(result, samplers[i]())
end
return result
end
-- Main
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b
function sample_0() return 0 end
function sample_1() return 1 end
function sample_few() return sample_to(1, 3) end
function sample_many() return sample_to(2, 10) end
samplers = {sample_0, sample_1, sample_few, sample_many}
weights = { (1 - p_c), p_c/2, p_c/4, p_c/4 }
n = 1000000
result = mixture(samplers, weights, n)
sum = 0
for i = 1, n, 1 do
sum = sum + result[i]
-- print(result[i])
end
print(sum/n)

View File

@ -1,18 +0,0 @@
SHELL := /bin/bash ## <= required to use time
VERBOSE=--verbosity:0
build: samples.nim
nim c $(VERBOSE) samples.nim
run: samples
./samples $(VERBOSE)
examine: samples
nim c $(VERBOSE) samples.nim && time ./samples $(VERBOSE) && echo
nim c $(VERBOSE) -d:release samples.nim && time ./samples $(VERBOSE) && echo
nim c $(VERBOSE) -d:danger samples.nim && time ./samples $(VERBOSE)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time of: $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do ./samples; done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo

Binary file not shown.

View File

@ -1,4 +0,0 @@
This is a version of time-to-botec in Nim for which I define logarithms, normals & lognormals pretty much
from scratch, without relying on libraries except for random numbers, square roots, and raising a to the power of b.
This is interesting because it leads to better understanding. But it doesn't help compare the efficiency of the various languages. So I'm setting it aside for now, even though I think it was really interesting.

View File

@ -1,11 +0,0 @@
SHELL := /bin/bash
build: samples.nim
nim c --verbosity:0 samples.nim
run: samples
./samples --verbosity:0
examine: samples
# nim c --verbosity:0 --opt:speed -d:release -d:danger --checks:off samples.nim && time ./samples --verbosity:0 --checks:off
nim c -d:release samples.nim && time ./samples

Binary file not shown.

View File

@ -1,145 +0,0 @@
import std/math
import std/sugar
import std/random
import std/sequtils
randomize()
## Basic math functions
proc factorial(n: int): int =
if n == 0 or n < 0:
return 1
else:
return n * factorial(n - 1)
proc sine(x: float): float =
let n = 8
# ^ Taylor will converge really quickly
# notice that the factorial of 17 is
# already pretty gigantic
var acc = 0.0
for i in 0..n:
var k = 2*i + 1
var taylor = pow(-1, i.float) * pow(x, k.float) / factorial(k).float
acc = acc + taylor
return acc
## Log function
## <https://en.wikipedia.org/wiki/Natural_logarithm#High_precision>
## Arithmetic-geomtric mean
proc ag(x: float, y: float): float =
let n = 32 # just some high number
var a = (x + y)/2.0
var b = sqrt(x * y)
for i in 0..n:
let temp = a
a = (a+b)/2.0
b = sqrt(b*temp)
return a
## Find m such that x * 2^m > 2^precision/2
proc find_m(x:float): float =
var m = 0.0;
let precision = 64 # bits
let c = pow(2.0, precision.float / 2.0)
while x * pow(2.0, m) < c:
m = m + 1
return m
proc log(x: float): float =
let m = find_m(x)
let s = x * pow(2.0, m)
let ln2 = 0.6931471805599453
return ( PI / (2.0 * ag(1, 4.0/s)) ) - m * ln2
## Test these functions
## echo factorial(5)
## echo sine(1.0)
## echo log(0.1)
## echo log(2.0)
## echo log(3.0)
## echo pow(2.0, 32.float)
## Distribution functions
## Normal
## <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form>
proc ur_normal(): float =
let u1 = rand(1.0)
let u2 = rand(1.0)
let z = sqrt(-2.0 * log(u1)) * sine(2 * PI * u2)
return z
proc normal(mean: float, sigma: float): float =
return (mean + sigma * ur_normal())
proc lognormal(logmean: float, logsigma: float): float =
let answer = pow(E, normal(logmean, logsigma))
return answer
proc to(low: float, high: float): float =
let normal95confidencePoint = 1.6448536269514722
let loglow = log(low)
let loghigh = log(high)
let logmean = (loglow + loghigh)/2
let logsigma = (loghigh - loglow) / (2.0 * normal95confidencePoint);
return lognormal(logmean, logsigma)
## echo ur_normal()
## echo normal(10, 20)
## echo lognormal(2, 4)
## echo to(10, 90)
## Manipulate samples
proc make_samples(f: () -> float, n: int): seq[float] =
result = toSeq(1..n).map(_ => f())
return result
proc mixture(sxs: seq[seq[float]], ps: seq[float], n: int): seq[float] =
assert sxs.len == ps.len
var ws: seq[float]
var sum = 0.0
for i, p in ps:
sum = sum + p
ws.add(sum)
ws = ws.map(w => w/sum)
proc get_mixture_sample(): float =
let r = rand(1.0)
var i = ws.len - 1
for j, w in ws:
if r < w:
i = j
break
## only occasion when ^ doesn't assign i
## is when r is exactly 1
## which would correspond to choosing the last item in ws
## which is why i is initialized to ws.len - 1
let xs = sxs[i]
let l = xs.len-1
let k = rand(0..l)
return xs[k]
return toSeq(1..n).map(_ => get_mixture_sample())
## Actual model
let n = 1000000
let p_a = 0.8
let p_b = 0.5
let p_c = p_a * p_b
let weights = @[ 1.0 - p_c, p_c/2.0, p_c/4.0, p_c/4.0 ]
let fs = [ () => 0.0, () => 1.0, () => to(1.0, 3.0), () => to(2.0, 10.0) ]
let dists = fs.map(f => make_samples(f, n))
let result = mixture(dists, weights, n)
let mean_result = foldl(result, a + b, 0.0) / float(result.len)
# echo result
echo mean_result

View File

@ -1,2 +0,0 @@
build: sums.nim
nim c sums.nim

View File

@ -1,22 +0,0 @@
import std/math
proc factorial(n: int): int =
if n == 0 or n < 0:
return 1
else:
return n * factorial(n - 1)
proc sine(x: float): float =
let n = 8
# ^ Taylor will converge really quickly
# notice that the factorial of 17 is
# already pretty gigantic
var acc = 0.0
for i in 0..n:
var k = 2*i + 1
var taylor = pow(-1, i.float) * pow(x, k.float) / factorial(k).float
acc = acc + taylor
return acc
echo factorial(17)
echo sine(1.0)

View File

@ -1,84 +0,0 @@
import std/math
import std/sugar
import std/random
import std/sequtils
randomize()
## Distribution functions
## Normal
## <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form>
proc ur_normal(): float =
let u1 = rand(1.0)
let u2 = rand(1.0)
let z = sqrt(-2.0 * ln(u1)) * sin(2 * PI * u2)
return z
proc normal(mean: float, sigma: float): float =
return (mean + sigma * ur_normal())
proc lognormal(logmean: float, logsigma: float): float =
let answer = pow(E, normal(logmean, logsigma))
return answer
proc to(low: float, high: float): float =
let normal95confidencePoint = 1.6448536269514722
let loglow = ln(low)
let loghigh = ln(high)
let logmean = (loglow + loghigh)/2
let logsigma = (loghigh - loglow) / (2.0 * normal95confidencePoint)
return lognormal(logmean, logsigma)
## echo ur_normal()
## echo normal(10, 20)
## echo lognormal(2, 4)
## echo to(10, 90)
## Manipulate samples
proc mixture(fs: seq[proc (): float{.nimcall.}], ps: seq[float], n: int): seq[float] =
assert fs.len == ps.len
var ws: seq[float]
var sum = 0.0
for i, p in ps:
sum = sum + p
ws.add(sum)
ws = ws.map(w => w/sum)
var samples: seq[float]
let rs = toSeq(1..n).map(_=>rand(1.0))
for i in 0..(n-1):
let r = rs[i]
var j = ws.len - 1
for k, w in ws:
if r < w:
j = k
break
## only occasion when ^ doesn't assign j
## is when r is exactly 1
## which would correspond to choosing the last item in ws
## which is why j is initialized to ws.len - 1
let f = fs[j]
samples.add(f())
return samples
## Actual model
let n = 1000000
let p_a = 0.8
let p_b = 0.5
let p_c = p_a * p_b
let weights = @[ 1.0 - p_c, p_c/2.0, p_c/4.0, p_c/4.0 ]
let fs = @[ proc (): float = 0.0, proc (): float = 1.0, proc (): float = to(1.0, 3.0), proc (): float = to(2.0, 10.0)]
let result = mixture(fs, weights, n)
let mean_result = foldl(result, a + b, 0.0) / float(result.len)
# echo result
echo mean_result

python/hello-world.py Normal file
View File

@ -0,0 +1 @@
print("Hello world!")

View File

@ -1,10 +0,0 @@
install:
pip3 install "squigglepy>=0.26"
pip3 install numpy
run:
python3 samples.py
time:
time python3 samples.py

View File

@ -1,18 +0,0 @@
import squigglepy as sq
import numpy as np
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b
dist_0 = 0
dist_1 = 1
dist_some = sq.to(1, 3)
dist_many = sq.to(2, 10)
dists = [dist_0, dist_1, dist_some, dist_many]
weights = [(1 - p_c), p_c/2, p_c/4, p_c/4 ]
result = sq.mixture(dists, weights)
result_samples = sq.sample(result, 1000000)
print(np.mean(result_samples))

View File

@ -1,41 +1,11 @@
# Optimized C
$ make && make time-linux
gcc -O3 samples.c -fopenmp -lm -o out/samples
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples
Time using 1 thread: 20.20ms
Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples
Time using 2 threads: 17.50ms
Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples
Time for 4 threads: 17.00ms
Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples
Time using 8 threads: 8.40ms
Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples
Time using 16 threads: 5.00ms
# C
## normal compilation
0.888458
real 0m0,442s
user 0m0,378s
sys 0m0,064s
## -Ofast
0.888458
real 0m0.292s
user 0m0.266s
sys 0m0.026s
# Squiggle
real 0m1,536s
@ -66,65 +36,3 @@ real 0m7,000s
user 0m6,944s
sys 0m0,052s
## Nim
nim c --verbosity:0 samples.nim && time ./samples --verbosity:0 && echo
0.8860780498240779
real 0m0.234s
user 0m0.214s
sys 0m0.020s
nim c --verbosity:0 -d:release samples.nim && time ./samples --verbosity:0 && echo
0.884035098700204
real 0m0.074s
user 0m0.043s
sys 0m0.031s
nim c --verbosity:0 -d:danger samples.nim && time ./samples --verbosity:0
0.8892827195895541
real 0m0.068s
user 0m0.048s
sys 0m0.020s
— make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of:
Time: 38.90ms
## Squigglepy
— time make run
python3 samples.py
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.68it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:00<00:00, 1298266.78it/s]
0.8854926010232937
real 0m1.602s
user 0m1.582s
sys 0m0.331s
## Lua
— make time-linux-simple
Requires /bin/time, found on GNU/Linux systems
/bin/time -f "Time: %es" lua samples.lua && echo
0.88932198462254
Time: 0.28s
— make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of: lua samples.lua
Time: 275.80ms
— make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of: luajit samples.lua
Time: 68.60ms

View File

@ -1,49 +0,0 @@
# Interface:
# make
# make build
# make run
# make time-linux
# make install
# make format
# make time-linux-simple
# make profile-linux
# Main file
SRC=samples.lang
OUTPUT=out/samples
# Compiler
COMPILER=lang-compiler
FLAGS=
## Formatter
FORMATTER=lang-formatter --options
## make build
build: $(SRC)
$(COMPILER) $(FLAGS) -o $(OUTPUT)
install:
lang-package-manager install dependencies
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
./$(OUTPUT)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time of: $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
/bin/time -f "Time: %es" ./$(OUTPUT) && echo
profile-linux:
@echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
@echo "Must be run as sudo"
sudo perf record $(OUTPUT)
sudo perf report
rm perf.data

View File

@ -1,5 +0,0 @@
const std = @import("std");
pub fn main() void {
std.debug.print("Hello, {s}!\n", .{"World"});
}

View File

@ -1 +0,0 @@
See <https://ziglang.org/learn/getting-started/#tagged-release-or-nightly-build>

View File

@ -1 +0,0 @@
https://ziglearn.org/chapter-1/