Compare commits


92 Commits

Author SHA1 Message Date
3b29ad7e45 more work on squiggle_c 2023-06-26 18:44:04 +01:00
39652b3bfe add squiggle_c beginnings. 2023-06-26 18:05:37 +01:00
8106ab7c11 add opencl pointer 2023-06-17 14:47:37 -06:00
fac9550488 reword lack of warts in lua 2023-06-10 19:43:36 -06:00
541aed0c97 add LuaJIT timing, redo nim, add warning about timings 2023-06-10 19:35:05 -06:00
c9eeaf5b5d feat: add lua. 2023-06-10 19:07:04 -06:00
82ef16b55d recompile C, static => dynamic compilation, I think 2023-06-10 18:25:47 -06:00
b9ee250369 start with lua 2023-06-10 18:24:03 -06:00
2bc028b29c add makefile template 2023-06-10 18:23:03 -06:00
d1013758c8 update squigglepy timing in readme 2023-06-09 19:12:33 -06:00
7dd1c17b42 update squigglepy version number 2023-06-09 19:11:09 -06:00
0ce76b3f23 Merge branch 'master' of github.com:NunoSempere/time-to-botec 2023-06-09 19:10:33 -06:00
edd8590835
Merge pull request #2 from peterhurford/patch-1
Fix squigglepy
2023-06-09 19:09:42 -06:00
Peter Wildeford
05566faf75
Fix squigglepy
Corrects the "ValueError: You cannot nest discrete distributions within mixture distributions"

Runs in 4.6s on my laptop (M2 chip)
2023-06-09 22:00:20 +01:00
ff361186ec add clarification about stan 2023-06-08 21:28:52 -06:00
3dac5f35cd add SquigglePy 2023-06-08 19:12:07 -06:00
60ea376a19 README typo. 2023-06-08 19:01:07 -06:00
88b627b62a makefile perf tweak 2023-06-03 11:06:44 -06:00
1ff35f6235 Revert "tweak: try using array instead of array of pointers"
it makes code 3x slower
2023-06-03 10:56:09 -06:00
00e6b67bf6 tweak: try using array instead of array of pointers 2023-06-03 10:54:45 -06:00
1436ee4e42 compare using a struct instead of a pointer, reorg 2023-06-03 10:50:06 -06:00
a7bb3bc812 xorshift minor tweaks 2023-06-03 10:02:01 -06:00
c73476e5aa finish xorshift updating. 2023-06-03 04:21:59 -06:00
eaee16f205 finish integrating xorshift rng 2023-06-03 04:08:23 -06:00
80f98830da start adding xorshift prng. 2023-06-03 03:52:04 -06:00
b128793fc3 tweak: add static compilation option 2023-06-03 03:47:58 -06:00
00684179e1 add 0 to 1 float to xorshift implementation 2023-06-03 03:47:10 -06:00
131ea138ae simplify xorshift implementation; struct not needed. 2023-06-03 03:42:30 -06:00
a2e1a48d82 tweak: add xorshift example 2023-06-03 03:38:40 -06:00
58cfe378e5 perf tweaks 2023-06-03 01:42:48 -06:00
d229021625 tweak perf makefile command 2023-06-03 01:37:42 -06:00
a13a042492 tweak: don't use inline functions, add profiling 2023-06-03 01:29:16 -06:00
396170d0a9 rename split_array_get_my_length to split_array_get_length 2023-06-03 01:04:59 -06:00
8b441e104c rename own_length to split_array_length 2023-06-03 01:04:09 -06:00
75b9e38694 use omp reductions to shave off 0.6ms 2023-06-03 01:01:25 -06:00
15e65534e4 normalize & cumsum array in one for loop. 2023-06-03 00:41:09 -06:00
52260630de add reference to nim multithread 2023-06-02 16:55:20 -06:00
5c59c6b1e2 readme typo 2023-06-02 16:49:33 -06:00
6273ba69a0 make format 2023-06-02 16:37:57 -06:00
93a502552e readme tweaks 2023-06-02 16:31:54 -06:00
3a417fd733 add motivation to README 2023-06-02 16:29:20 -06:00
fa4311c131 README.md grammar 2023-06-02 16:26:05 -06:00
d2bca155b8 mark to-do as done. 2023-06-02 16:25:03 -06:00
3378d1b9e7 update README, time.txt tally 2023-06-02 16:24:08 -06:00
76fc0c817d update README 2023-06-02 16:14:04 -06:00
cdec5b6fce remove old code from samples.c 2023-06-02 16:06:17 -06:00
ff3685766b reorganize C code 2023-06-02 16:00:49 -06:00
ef04e0349a add better timing to makefile 2023-06-02 15:44:52 -06:00
e1b180bd5b feat: add timing across 10 runs. 2023-06-02 13:56:50 -06:00
331c7566f0 savepoint 2023-06-02 13:17:12 -06:00
6b34d9abdb feat: add more threads, document rand_r in code. 2023-06-02 12:50:51 -06:00
58c74ce37d feat: rand not thread safe, use rand_r throughout 2023-06-02 12:44:36 -06:00
3f0ec8be0e tweak: add to-dos in C. 2023-05-30 18:44:29 -04:00
e2558b05ba clearly signal what makefile commands are linux only. 2023-05-30 13:01:31 -04:00
2d4eea8956 tweak: time-printing tweaks. 2023-05-29 20:05:18 -04:00
7be18ff7cb tweak: change number of lines 2023-05-29 19:58:59 -04:00
03421f953b add one-threaded C example 2023-05-29 19:55:57 -04:00
160e824108 time measuring tweaks. 2023-05-29 19:40:03 -04:00
c35ddcc358 C-optimized tweaks. 2023-05-29 19:04:21 -04:00
28d443a6cf formatting tweaks 2023-05-29 18:48:25 -04:00
f64fedc398 makefile tweaks 2023-05-29 17:59:17 -04:00
5dead1a2c1 make format 2023-05-29 17:51:24 -04:00
7724115933 reorg: put output in its own folder. 2023-05-29 17:50:32 -04:00
5cfc4ab468 tweak: link math library. 2023-05-29 17:47:52 -04:00
7d1919dc3d
Merge pull request #1 from JJSierraM/master
Created a C-optimized version of the code
2023-05-29 14:36:00 -07:00
JJSierraM
792e03a5cc
Update README.md 2023-05-29 23:30:07 +02:00
JJSierraM
7a905ae16e
Merge pull request #1 from JJSierraM/JJSierraM-C-optimized
Added C-optimized version of the code
2023-05-29 23:19:18 +02:00
JJSierraM
d62fae0c04
Added C-optimized version of the code 2023-05-29 23:17:07 +02:00
c0e6b0677a fix: remove semicolon. 2023-05-24 22:39:16 -07:00
0bdb94a2d4 remove old files, simplify outline 2023-05-22 19:21:21 -04:00
4419798c18 README: performance => comparison 2023-05-21 12:29:44 -04:00
8174e8a49e README: add lines of code, more comments. 2023-05-21 12:23:43 -04:00
47e2a25490 improve nim code, change README 2023-05-21 12:05:15 -04:00
8acdc283a2 look at the R code 2023-05-21 12:04:27 -04:00
3e70318e36 add fast output to C. 2023-05-21 12:02:53 -04:00
e9ab827320 tweak: nim/hardcore -> nim/samples-from-scratch 2023-05-21 11:07:00 -04:00
2a41138478 tweaks. 2023-05-21 01:54:03 -04:00
2cf684da56 move nim to top level, add to README 2023-05-21 01:46:45 -04:00
a84b6b9cc0 tweak C makefile 2023-05-21 01:46:22 -04:00
3050f7adee tweak nim makefile 2023-05-21 01:34:17 -04:00
1ebc3ce7b9 move hardcore defs to a different folder, use stdlib math
Makes this way faster. Don't roll your stdlib, Nuño
2023-05-21 01:29:57 -04:00
6454b2eeab tweak: decrease convergence constants for faster speed. 2023-05-21 01:24:02 -04:00
da9a10791f feat: add the actual model 2023-05-21 01:22:02 -04:00
76968afc79 add mixture implementation in nim. 2023-05-21 00:46:10 -04:00
ccdeb77f3f feat: remove odd log implementation, get normal, lognormal & to 2023-05-20 23:06:34 -04:00
6e22e78d4f add cool implementation of the logarithm 2023-05-20 22:38:38 -04:00
7e2d2b95a1 tweak: save some progress. 2023-05-20 22:24:30 -04:00
dc27673887 tweak: nim scratchpad 2023-05-20 21:45:28 -04:00
88c079235e add starting version for nim 2023-05-20 21:13:51 -04:00
f0493f6955 clean up compilation of C example 2023-05-20 20:20:43 -04:00
eaf1915bdb add wip folder. 2023-05-20 20:00:58 -04:00
77680a8590 tweak: make null window an object in order for this to run with bun
<https://bun.sh/>
2022-12-07 19:14:31 +00:00
60 changed files with 3494 additions and 211 deletions

53
C/C-01-simple/makefile Normal file
@@ -0,0 +1,53 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=samples.c
OUTPUT=samples
## Dependencies
DEPS='gsl'
## Flags
INCS=`pkg-config --cflags ${DEPS}`
LIBS=`pkg-config --libs ${DEPS}`
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
FAST=-Ofast
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS)
fast: $(SRC)
$(CC) $(FAST) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
echo "Increasing stack size limit, because we are dealing with 1M samples"
# ulimit: increase stack size limit
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.
# Then run the program
ulimit -Ss 256000 && ./$(OUTPUT)
# Old:
# Link libraries, for good measure
# LD_LIBRARY_PATH=/usr/local/lib
# export LD_LIBRARY_PATH

BIN
C/C-01-simple/samples Executable file

Binary file not shown.

157
C/C-01-simple/samples.c Normal file
@@ -0,0 +1,157 @@
#include <gsl/gsl_randist.h>
#include <gsl/gsl_rng.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 1000000
/*
 * For very high values of N, you will want to increase the maximum stack size, otherwise you will suffer a segmentation fault
* In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs
* And confirm it with $ ulimit -a
*/
/* Helpers */
void print(double* ys)
{
for (int i = 0; i < N; i++) {
printf("%f\n", ys[i]);
}
printf("\n");
}
void fill(double* ys, float f)
{
for (int i = 0; i < N; i++) {
ys[i] = f;
}
}
double sum(double* ps, int n)
{
double result = 0;
for (int i = 0; i < n; i++) {
result += ps[i];
}
return (result);
}
void cumsum(double* ps, double* rs, int n)
{
double counter = 0;
for (int i = 0; i < n; i++) {
counter += ps[i];
rs[i] = counter;
}
}
/* Distributions*/
void normal(gsl_rng* r, double* ys, double mean, double std)
{
for (int i = 0; i < N; i++) {
ys[i] = mean + gsl_ran_gaussian(r, std);
}
}
void lognormal(gsl_rng* r, double* ys, double zeta, double sigma)
{
for (int i = 0; i < N; i++) {
ys[i] = gsl_ran_lognormal(r, zeta, sigma);
}
}
void to(gsl_rng* r, double* ys, double low, double high)
{
double normal95confidencePoint = 1.6448536269514722;
double log_low = log(low);
double log_high = log(high);
double zeta = (log_low + log_high) / 2;
double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint);
lognormal(r, ys, zeta, sigma);
}
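/* Worked example (illustrative note, not in the original): to(r, dist, 1, 3) interprets
 * [1, 3] as a 90% confidence interval of a lognormal: log_low = 0, log_high ≈ 1.0986,
 * so zeta ≈ 0.5493 and sigma ≈ 1.0986 / (2 * 1.6449) ≈ 0.334; the median of the
 * resulting samples is e^zeta = sqrt(3) ≈ 1.73. */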
/* Mixture of distributions */
void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* results)
{
/* Get cumulative, normalized weights */
double sum_weights = sum(weights, n);
double normalized_weights[n];
for (int i = 0; i < n; i++) {
normalized_weights[i] = weights[i] / sum_weights;
}
double cummulative_weights[n];
cumsum(normalized_weights, cummulative_weights, n);
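    /* Illustrative: weights {2, 1, 1} normalize to {0.5, 0.25, 0.25} and cumsum to
       {0.5, 0.75, 1.0}; below, p_1 picks a distribution by finding its bucket. */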
/* Get N samples, drawn from the different distributions in proportion to their weights. */
for (int i = 0; i < N; i++) {
double p_1 = gsl_rng_uniform(r);
double p_2 = gsl_rng_uniform(r);
int index_found = 0;
int index_counter = 0;
while ((index_found == 0) && (index_counter < n)) {
if (p_1 < cummulative_weights[index_counter]) {
index_found = 1;
} else {
index_counter++;
}
}
if (index_found == 0) {
printf("\nThis shouldn't be able to happen");
// gsl_rng_free (r);
// abort(); // this shouldn't have happened.
} else {
int sample_index = (int)floor(p_2 * N);
results[i] = dists[index_counter][sample_index];
}
}
}
/* Main */
int main(void)
{
// Start clock
clock_t start, end;
start = clock();
/* Initialize GNU Statistical Library (GSL) stuff */
const gsl_rng_type* T;
gsl_rng* r;
// gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc(T);
/* Toy example */
/* Declare variables in play */
double p_a, p_b, p_c;
double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N];
/* Initialize variables */
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
fill(dist_none, 0);
fill(dist_one, 1);
to(r, dist_few, 1, 3);
to(r, dist_many, 2, 10);
/* Generate mixture */
int n = 4;
double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
double* dists[] = { dist_none, dist_one, dist_few, dist_many };
mixture(r, dists, weights, n, dist_mixture);
printf("%f\n", sum(dist_mixture, N) / N);
/* Clean up GSL */
gsl_rng_free(r);
// End clock
end = clock();
printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000);
/* Return success*/
return EXIT_SUCCESS;
}

@@ -0,0 +1,53 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC_ONE_THREAD=./samples-one-thread.c
OUTPUT_ONE_THREAD=out/samples-one-thread
## Dependencies
# Has no dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC_ONE_THREAD)
mkdir -p out
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD)
format: $(SRC_ONE_THREAD)
$(FORMATTER) $(SRC_ONE_THREAD)
run: $(SRC_ONE_THREAD) $(OUTPUT_ONE_THREAD)
./$(OUTPUT_ONE_THREAD)
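# Breakdown of the time-linux recipe below (explanatory note): /usr/bin/time -p prints
# a "real <seconds>" line to stderr; `2>&1 >/dev/null` routes that line into the pipe
# while discarding the program's stdout; grep/awk extract the seconds for the 100 runs;
# bc then converts the total into an average in milliseconds.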
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time: $(OUTPUT_ONE_THREAD)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT_ONE_THREAD); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
/bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo
debian-install-dependencies:
sudo apt-get install libomp-dev

Binary file not shown.

@@ -0,0 +1,183 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
const float PI = 3.14159265358979323846;
#define N 1000000
//Array helpers
void array_print(float* array, int length)
{
for (int i = 0; i < length; i++) {
printf("item[%d] = %f\n", i, array[i]);
}
printf("\n");
}
void array_fill(float* array, int length, float item)
{
int i;
{
for (i = 0; i < length; i++) {
array[i] = item;
}
}
}
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
float rand_float(float to)
{
return ((float)rand() / (float)RAND_MAX) * to;
}
float ur_normal()
{
float u1 = rand_float(1.0);
float u2 = rand_float(1.0);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
inline float random_uniform(float from, float to)
{
return ((float)rand() / (float)RAND_MAX) * (to - from) + from;
}
inline float random_normal(float mean, float sigma)
{
return (mean + sigma * ur_normal());
}
inline float random_lognormal(float logmean, float logsigma)
{
return expf(random_normal(logmean, logsigma));
}
inline float random_to(float low, float high)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma);
}
void array_random_to(float* array, int length, float low, float high)
{
int i;
#pragma omp private(i)
{
#pragma omp for
for (i = 0; i < length; i++) {
array[i] = random_to(low, high);
}
}
}
void mixture(float (*samplers[])(void), float* weights, int n_dists, float* results, int results_length)
{
float sum_weights = array_sum(weights, n_dists);
float* normalized_weights = malloc(n_dists * sizeof(float));
for (int i = 0; i < n_dists; i++) {
normalized_weights[i] = weights[i] / sum_weights;
}
float* cummulative_weights = malloc(n_dists * sizeof(float));
array_cumsum(normalized_weights, cummulative_weights, n_dists);
//create var holders
float p1;
int sample_index, i, own_length;
{
for (int i = 0; i < results_length; i++) {
p1 = random_uniform(0, 1);
for (int j = 0; j < n_dists; j++) {
if (p1 < cummulative_weights[j]) {
results[i] = samplers[j]();
break;
}
}
}
}
free(normalized_weights);
free(cummulative_weights);
}
float sample_0()
{
return 0;
}
float sample_1()
{
return 1;
}
float sample_few()
{
return random_to(1, 3);
}
float sample_many()
{
return random_to(2, 10);
}
int main()
{
//initialize randomness
srand(1);
// clock_t start, end;
// start = clock();
// Toy example
// Declare variables in play
float p_a, p_b, p_c;
// printf("Max threads: %d\n", n_threads);
// omp_set_num_threads(n_threads);
// Initialize variables
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
// Generate mixture
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(void) = { sample_0, sample_1, sample_few, sample_many };
float* results = malloc(N * sizeof(float));
mixture(samplers, weights, n_dists, results, N);
printf("Sum(dist_mixture, N)/N = %f\n", array_sum(results, N) / N);
// array_print(dist_mixture[0], N);
// end = clock();
// printf("Time (ms): %f\n", ((double)(end - start)) / (CLOCKS_PER_SEC * 1000));
// ^ Will only measure how long it takes the inner main to run, not the whole program,
// including e.g., loading the program into memory or smth.
// Also CLOCKS_PER_SEC in POSIX is a constant equal to 1000000.
// See: https://stackoverflow.com/questions/10455905/why-is-clocks-per-sec-not-the-actual-number-of-clocks-per-second
return 0;
}

18
C/README.md Normal file
@@ -0,0 +1,18 @@
# Time to BOTEC in C
This repository contains a few implementations of a simple BOTEC (back-of-the-envelope calculation) in C:
- In the folder C-01-simple/, you can see a simple implementation, which passes large arrays around
- In the folder C-02-better-algorithm-one-thread/, you can see a better implementation, which passes around pointers to functions and is therefore more efficient
- The top-level samples.c uses the algorithmic improvements from C-02..., and also implements multithreading using OpenMP
## To do
- [ ] Add Windows/Powershell time-measuring commands
- [ ] Add CUDA?
- [x] Added results of perf. `rand_r` seems like a big chunk of it, but I'm hesitant to use lower-quality random numbers
- [x] used xorshift instead
- [-] Use xorshift with a struct instead of a pointer? idk, could be faster for some reason? => Tested, it takes the same time.
- [x] Update repository with correct timing
- [x] Use better profiling approach to capture timing with 1M samples.
- [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used

@@ -1,3 +0,0 @@
#!/bin/bash
gcc -std=c99 -Wall -lm hello-world.c -o hello-world

@@ -1,7 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
int main(void){
printf("Hello world!\n");
return EXIT_SUCCESS;
}

106
C/makefile Normal file
@@ -0,0 +1,106 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=samples.c
OUTPUT=out/samples
SRC_ONE_THREAD=./samples-one-thread.c
OUTPUT_ONE_THREAD=out/samples-one-thread
## Dependencies
# Has no dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
static:
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
multi:
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
OMP_NUM_THREADS=2 ./$(OUTPUT) && echo
OMP_NUM_THREADS=4 ./$(OUTPUT) && echo
OMP_NUM_THREADS=8 ./$(OUTPUT) && echo
OMP_NUM_THREADS=16 ./$(OUTPUT) && echo
## Timing
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
time-linux-fastest:
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
# ./$(OUTPUT)
# gprof:
# gprof $(OUTPUT) gmon.out > analysis.txt
# rm gmon.out
# vim analysis.txt
# rm analysis.txt
# perf:
OMP_NUM_THREADS=16 sudo perf record $(OUTPUT)
sudo perf report
rm perf.data
## Install
debian-install-dependencies:
sudo apt-get install libomp-dev

BIN
C/out/samples Executable file

Binary file not shown.

BIN
C/out/samples-one-thread Executable file

Binary file not shown.

BIN
C/out/samples.exe Normal file

Binary file not shown.

1495
C/out/samples.txt Normal file

File diff suppressed because it is too large

26
C/perf.txt Normal file
@@ -0,0 +1,26 @@
Samples: 884 of event 'cycles', Event count (approx.): 551167949
Overhead Command Shared Object Symbol
35.32% samples samples [.] xorshift32
14.09% samples libgomp.so.1.0.0 [.] 0x000000000001d2ea
12.04% samples libgomp.so.1.0.0 [.] 0x000000000001d132
11.53% samples samples [.] mixture._omp_fn.0
4.55% samples libm-2.31.so [.] __sin_fma
4.24% samples samples [.] rand_0_to_1
3.77% samples samples [.] random_to
3.03% samples libm-2.31.so [.] __logf_fma
1.61% samples libm-2.31.so [.] __expf_fma
1.54% samples samples [.] split_array_sum._omp_fn.0
1.38% samples samples [.] random_uniform
0.94% samples samples [.] ur_normal
0.91% samples libm-2.31.so [.] __ieee754_log_fma
0.74% samples libgomp.so.1.0.0 [.] 0x000000000001d13d
0.52% samples samples [.] sample_0
0.41% samples libm-2.31.so [.] __sqrtf_finite@GLIBC_2.15
0.38% samples samples [.] sample_1
0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d139
0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d2f5
0.22% samples [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
0.18% samples [kernel.kallsyms] [k] _raw_spin_lock_irq
0.18% samples samples [.] random_lognormal
0.17% samples libgomp.so.1.0.0 [.] 0x000000000001d2f1

251
C/samples.c Normal file
@@ -0,0 +1,251 @@
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
const float PI = 3.14159265358979323846;
#define N 1000000
//Array helpers
void array_print(float* array, int length)
{
for (int i = 0; i < length; i++) {
printf("item[%d] = %f\n", i, array[i]);
}
printf("\n");
}
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
// Split array helpers
int split_array_get_length(int index, int total_length, int n_threads)
{
return (total_length % n_threads > index ? total_length / n_threads + 1 : total_length / n_threads);
}
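// Illustrative: with total_length = 10 and n_threads = 4 the lengths are 3, 3, 2, 2;
// the remainder (10 % 4 = 2) is spread over the first two threads.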
void split_array_allocate(float** meta_array, int length, int divide_into)
{
int split_array_length;
for (int i = 0; i < divide_into; i++) {
split_array_length = split_array_get_length(i, length, divide_into);
meta_array[i] = malloc(split_array_length * sizeof(float));
}
}
void split_array_free(float** meta_array, int divided_into)
{
for (int i = 0; i < divided_into; i++) {
free(meta_array[i]);
}
free(meta_array);
}
float split_array_sum(float** meta_array, int length, int divided_into)
{
int i;
float output = 0;
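    // reduction(+:output) gives each thread a private copy of output
    // and adds the copies together once the parallel for finishes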
#pragma omp parallel for reduction(+:output)
for (int i = 0; i < divided_into; i++) {
float own_partial_sum = 0;
int split_array_length = split_array_get_length(i, length, divided_into);
for (int j = 0; j < split_array_length; j++) {
own_partial_sum += meta_array[i][j];
}
output += own_partial_sum;
}
return output;
}
// Pseudo Random number generator
uint32_t xorshift32(uint32_t* seed)
{
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift
// Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *seed = x;
}
// Distribution & sampling functions
float rand_0_to_1(uint32_t* seed){
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
/*
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return ((float)(*seed = x))/((float) UINT32_MAX);
*/
// previously:
// ((float)rand_r(seed) / (float)RAND_MAX)
// and before that: rand, but it wasn't thread-safe.
// See: <https://stackoverflow.com/questions/43151361/how-to-create-thread-safe-random-number-generator-in-c-using-rand-r> for why to use rand_r:
// rand() is not thread-safe, as it relies on (shared) hidden seed.
}
float rand_float(float max, uint32_t* seed)
{
return rand_0_to_1(seed) * max;
}
float ur_normal(uint32_t* seed)
{
float u1 = rand_0_to_1(seed);
float u2 = rand_0_to_1(seed);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
float random_uniform(float from, float to, uint32_t* seed)
{
return rand_0_to_1(seed) * (to - from) + from;
}
float random_normal(float mean, float sigma, uint32_t* seed)
{
return (mean + sigma * ur_normal(seed));
}
float random_lognormal(float logmean, float logsigma, uint32_t* seed)
{
return expf(random_normal(logmean, logsigma, seed));
}
float random_to(float low, float high, uint32_t* seed)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma, seed);
}
// Mixture function
void mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, float** results, int n_threads)
{
// You can see a simpler version of this function in the git history
// or in C-02-better-algorithm-one-thread/
float sum_weights = array_sum(weights, n_dists);
float* cumsummed_normalized_weights = malloc(n_dists * sizeof(float));
cumsummed_normalized_weights[0] = weights[0]/sum_weights;
for (int i = 1; i < n_dists; i++) {
cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i]/sum_weights;
}
//create var holders
float p1;
int sample_index, i, split_array_length;
// uint32_t* seeds[n_threads];
uint32_t** seeds = malloc(n_threads * sizeof(uint32_t*));
for (uint32_t i = 0; i < n_threads; i++) {
seeds[i] = malloc(sizeof(uint32_t));
*seeds[i] = i + 1; // xorshift can't start with 0
}
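    // Each thread reads and writes only its own seed, so the xorshift state
    // is never shared across threads (cf. the rand_r notes above)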
#pragma omp parallel private(i, p1, sample_index, split_array_length)
{
#pragma omp for
for (i = 0; i < n_threads; i++) {
split_array_length = split_array_get_length(i, N, n_threads);
for (int j = 0; j < split_array_length; j++) {
p1 = random_uniform(0, 1, seeds[i]);
for (int k = 0; k < n_dists; k++) {
if (p1 < cumsummed_normalized_weights[k]) {
results[i][j] = samplers[k](seeds[i]);
break;
}
}
}
}
}
// free(normalized_weights);
// free(cummulative_weights);
free(cumsummed_normalized_weights);
for (uint32_t i = 0; i < n_threads; i++) {
free(seeds[i]);
}
free(seeds);
}
// Functions used for the BOTEC.
// Their type has to be the same, as we will be passing them around.
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main()
{
// Toy example
// Declare variables in play
float p_a, p_b, p_c;
int n_threads = omp_get_max_threads();
// printf("Max threads: %d\n", n_threads);
// omp_set_num_threads(n_threads);
float** dist_mixture = malloc(n_threads * sizeof(float*));
split_array_allocate(dist_mixture, N, n_threads);
// Initialize variables
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
// Generate mixture
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
mixture(samplers, weights, n_dists, dist_mixture, n_threads);
printf("Sum(dist_mixture, N)/N = %f\n", split_array_sum(dist_mixture, N, n_threads) / N);
// array_print(dist_mixture[0], N);
split_array_free(dist_mixture, n_threads);
return 0;
}

@@ -1,15 +0,0 @@
#!/bin/bash
root="$1"
gcc -std=c99 -Wall -lm "$root".c -o "$root" -I/usr/local/include -L/usr/local/lib -lgsl -lgslcblas -lm -g
# Link libraries, for good measure
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH
# Increase stack size limit
ulimit -Ss 256000
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.
# run
./samples

@@ -1,13 +0,0 @@
#!/bin/bash
root="$1"
gcc -std=c99 -Wall -lm "$root".c -o "$root" -I/usr/local/include -L/usr/local/lib -lgsl -lgslcblas -lm -g
# Link libraries, for good measure
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH
# Increase stack size limit
ulimit -Ss 256000
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.

Binary file not shown.

@@ -1,140 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#define N 1000000
/*
* For very high values of N, you will want to increase the maximum stack trace, otherwise you will suffer a segmentation fault
* In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs
* And confirm it with $ ulimit -a
*/
/* Helpers */
void print(double *ys){
for(int i=0; i<N; i++){
printf("%f\n", ys[i]);
}
printf("\n");
}
void fill(double *ys, float f){
for(int i=0; i<N; i++){
ys[i] = f;
}
}
double sum(double *ps, int n){
double result = 0;
for(int i=0; i<n; i++){
result += ps[i];
}
return(result);
}
void cumsum(double *ps, double *rs, int n){
double counter = 0;
for(int i=0; i<n; i++){
counter += ps[i];
rs[i] = counter;
}
}
/* Distributions*/
void normal(gsl_rng * r, double *ys, double mean, double std){
for(int i=0; i<N; i++){
ys[i] = mean + gsl_ran_gaussian(r, std);
}
}
void lognormal(gsl_rng * r, double *ys, double zeta, double sigma){
for(int i=0; i<N; i++){
ys[i] = gsl_ran_lognormal(r, zeta, sigma);
}
}
void to(gsl_rng * r, double *ys, double low, double high){
double normal95confidencePoint = 1.6448536269514722;
double log_low = log(low);
double log_high = log(high);
double zeta = (log_low + log_high)/2;
double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint);
lognormal(r, ys, zeta, sigma);
}
/* Mixture of distributions */
void mixture(gsl_rng * r, double *dists[], double *weights, int n, double *results){
/* Get cummulative, normalized weights */
double sum_weights = sum(weights, n);
double normalized_weights[n];
for(int i=0; i<n; i++){
normalized_weights[i] = weights[i]/sum_weights;
}
double cummulative_weights[n];
cumsum(normalized_weights, cummulative_weights, n);
/* Get N samples, drawn from the different distributions in proportion to their weights. */
for(int i=0; i<N; i++){
double p_1 = gsl_rng_uniform(r);
double p_2 = gsl_rng_uniform(r);
int index_found = 0;
int index_counter = 0;
while ((index_found == 0) && (index_counter < n) ){
if(p_1 < cummulative_weights[index_counter]){
index_found = 1;
}else{
index_counter++;
}
}
if(index_found == 0) {
printf("\nThis shouldn't be able to happen");
// gsl_rng_free (r);
// abort(); // this shouldn't have happened.
}else{
int sample_index = (int) floor(p_2 * N);
results[i] = dists[index_counter][sample_index];
}
}
}
/* Main */
int main(void){
/* Initialize GNU Statistical Library (GSL) stuff */
const gsl_rng_type * T;
gsl_rng * r;
// gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc (T);
/* Toy example */
/* Declare variables in play */
double p_a, p_b, p_c;
double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N];
/* Initialize variables */
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
fill(dist_none, 0);
fill(dist_one, 1);
to(r, dist_few, 1, 3);
to(r, dist_many, 2, 10);
/* Generate mixture */
int n = 4;
double weights[] = { 1 - p_c, p_c/2, p_c/4, p_c/4 };
double *dists[] = { dist_none, dist_one, dist_few, dist_many };
mixture(r, dists, weights, n, dist_mixture);
printf("%f\n", sum(dist_mixture, N)/N);
/* Clean up GSL */
gsl_rng_free (r);
/* Return success*/
return EXIT_SUCCESS;
}

Binary file not shown.

@@ -0,0 +1,50 @@
#include "../../squiggle.h"
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
// Estimate functions
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main(){
// set randomness seed
uint32_t* seed = malloc(sizeof(uint32_t));
*seed = 1000; // xorshift can't start with 0
float p_a = 0.8;
float p_b = 0.5;
float p_c = p_a * p_b;
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
float result_one = mixture(samplers, weights, n_dists, seed);
printf("result_one: %f\n", result_one);
}
/*
Aggregation mechanisms:
- Quantiles (requires a sort)
- Sum
- Average
- Std
*/

@@ -0,0 +1,53 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=example.c
OUTPUT=example
## Dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-Ofast
# OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(MATH) -o $(OUTPUT)
sudo perf record $(OUTPUT)
sudo perf report
rm perf.data

Binary file not shown.

@@ -0,0 +1,59 @@
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include "../../squiggle.h"
// Estimate functions
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main(){
// set randomness seed
uint32_t* seed = malloc(sizeof(uint32_t));
*seed = 1000; // xorshift can't start with 0
float p_a = 0.8;
float p_b = 0.5;
float p_c = p_a * p_b;
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
int n_samples = 1000000;
float* result_many = (float *) malloc(n_samples * sizeof(float));
for(int i=0; i<n_samples; i++){
result_many[i] = mixture(samplers, weights, n_dists, seed);
}
printf("result_many: [");
for(int i=0; i<100; i++){
printf("%.2f, ", result_many[i]);
}
printf("]\n");
}
/*
Aggregation mechanisms:
- Quantiles (requires a sort)
- Sum
- Average
- Std
*/

@@ -0,0 +1,53 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=example.c
OUTPUT=example
## Dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-Ofast
# OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(MATH) -o $(OUTPUT)
sudo perf record $(OUTPUT)
sudo perf report
rm perf.data

109
C/squiggle_c/squiggle.h Normal file
@@ -0,0 +1,109 @@
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
const float PI = 3.14159265358979323846;
// Pseudo Random number generator
uint32_t xorshift32(uint32_t* seed)
{
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift
// Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *seed = x;
}
// Distribution & sampling functions
float rand_0_to_1(uint32_t* seed){
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
}
float rand_float(float max, uint32_t* seed)
{
return rand_0_to_1(seed) * max;
}
float ur_normal(uint32_t* seed)
{
float u1 = rand_0_to_1(seed);
float u2 = rand_0_to_1(seed);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
float random_uniform(float from, float to, uint32_t* seed)
{
return rand_0_to_1(seed) * (to - from) + from;
}
float random_normal(float mean, float sigma, uint32_t* seed)
{
return (mean + sigma * ur_normal(seed));
}
float random_lognormal(float logmean, float logsigma, uint32_t* seed)
{
return expf(random_normal(logmean, logsigma, seed));
}
float random_to(float low, float high, uint32_t* seed)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma, seed);
}
// Array helpers
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
// Mixture function
float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint32_t* seed)
{
// You can see a simpler version of this function in the git history
// or in C-02-better-algorithm-one-thread/
float sum_weights = array_sum(weights, n_dists);
float* cumsummed_normalized_weights = (float*) malloc(n_dists * sizeof(float));
cumsummed_normalized_weights[0] = weights[0]/sum_weights;
for (int i = 1; i < n_dists; i++) {
cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i]/sum_weights;
}
float p = random_uniform(0, 1, seed);
float result = 0; // default, in case float rounding leaves p above the last cumulative weight
for (int k = 0; k < n_dists; k++) {
if (p < cumsummed_normalized_weights[k]) {
result = samplers[k](seed);
break;
}
}
free(cumsummed_normalized_weights);
return result;
}

9
C/squiggle_c/to-do.md Normal file
@@ -0,0 +1,9 @@
- [ ] Add example for only one sample
- [ ] Add example for many samples
- [ ] Use gcc extension to define functions nested inside main.
- [ ] Use OpenMP for acceleration
- [ ] Chain various mixture functions
- [ ] Have some more complicated & realistic example
- [ ] Add summarization functions, like mean, std, 90% ci (or all c.i.?)
- [ ] Add beta distribution

@@ -0,0 +1,15 @@
build:
gcc xorshift-pointer.c -o out/xorshift-pointer
gcc xorshift-struct.c -o out/xorshift-struct
run-pointer:
./out/xorshift-pointer
run-struct:
./out/xorshift-struct
time-pointer:
/bin/time -f "Time: %es" ./out/xorshift-pointer && echo
time-struct:
/bin/time -f "Time: %es" ./out/xorshift-struct && echo

@@ -0,0 +1 @@
Using a pointer or a struct turns out to be pretty equivalent in terms of speed. I don't think there are gains to be found here, but I find the pointer implementation clearer.

Binary file not shown.

@@ -0,0 +1,41 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
uint32_t xorshift32(uint32_t* state)
{
/* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
uint32_t x = *state;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *state = x;
}
float rand_xorshift32(uint32_t* state){
return (float) xorshift32(state) / (float) UINT32_MAX;
}
int main(){
uint32_t** states = malloc(4 * sizeof(uint32_t*));
for(int i=0; i<4;i++){
states[i] = malloc(sizeof(uint32_t));
*states[i] = (uint32_t) i + 1;
}
for(int i=0; i<1000000000;i++){
uint32_t x = xorshift32(states[0]);
float y = rand_xorshift32(states[1]);
// printf("%u\n", x);
// printf("%f\n", y);
}
for(int i=0; i<4;i++){
free(states[i]);
}
free(states);
return 0;
}
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift

@@ -0,0 +1,46 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
struct xorshift32_state {
uint32_t a;
};
uint32_t xorshift32(struct xorshift32_state *state)
{
/* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
uint32_t x = state->a;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return state->a = x;
}
float rand_xorshift32(struct xorshift32_state *state){
return (float) xorshift32(state) / (float) UINT32_MAX;
}
int main(){
struct xorshift32_state** states = malloc(sizeof(struct xorshift32_state*) * 4);
for(int i=0; i<4;i++){
states[i] = malloc(sizeof(struct xorshift32_state));
states[i]->a = (uint32_t) i + 1;
}
for(int i=0; i<1000000000; i++){
uint32_t x = xorshift32(states[0]);
float y = rand_xorshift32(states[1]);
// printf("%u\n", x);
// printf("%f\n", y);
}
for(int i=0; i<4;i++){
free(states[i]);
}
free(states);
return 0;
}

@@ -1 +0,0 @@
print("Hello world")

@@ -47,7 +47,4 @@ weights = c((1 - p_c), p_c/2, p_c/4, p_c/4)
# print(weights)
result = mixture(dists, weights)
mean_result = mean(result)
print(mean_result)
print(mean_result)

162
README.md
@@ -2,49 +2,161 @@
## About
This repository contains examples of very simple code to manipulate samples in various programming languages. As of now, it may be useful for checking the validity of simple estimations.
This repository contains examples of very simple code to manipulate samples in various programming languages, and tallies their speed and the complexity of their code. It implements this platonic estimate:
The title of this repository is a pun on two meanings of "time to": "how much time does it take to do x", and "let's do x".
```
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b
dists = [0, 1, 1 to 3, 2 to 10]
weights = [(1 - p_c), p_c/2, p_c/4, p_c/4 ]
result = mixture(dists, weights) # should be 1M samples
mean(result)
```
I don't particularly care about the speed of this particular example, but I do think that the speed of this simple example is indicative of the speed when considering models 100x or 1000x more complicated. As of now, the repository may also be useful for checking the validity of simple estimations.
The title of this repository is a pun on two meanings of "time to": "how much time does it take to do x", and "let's do x". "BOTEC" stands for "back of the envelope calculation".
## Current languages
- [x] Python
- [x] R
- [x] Squiggle
- [x] Javascript (NodeJS)
- [x] C
- [x] Javascript (NodeJS)
- [x] Squiggle
- [x] R
- [x] Python
- [x] Nim
## Performance table
## Comparison table
With the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples:
| Language | Time | Lines of code |
|-----------------------------|-----------|---------------|
| C (optimized, 16 threads) | 5ms | 249 |
| Nim | 38ms | 84 |
| Lua (LuaJIT) | 68ms | 82 |
| Lua | 278ms | 82 |
| C (naïve implementation) | 292ms | 149 |
| Javascript (NodeJS) | 732ms | 69 |
| Squiggle | 1.536s | 14* |
| SquigglePy | 1.602s | 18* |
| R | 7.000s | 49 |
| Python (CPython) | 16.641s | 56 |
| Language | Time |
|----------|-----------|
| C | 0m0,442s |
| Node | 0m0,732s |
| Squiggle | 0m1,536s |
| R | 0m7,000s |
| Python (CPython) | 0m16,641s |
Time measurements taken with the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples. But different implementations use different algorithms and, occasionally, different time-measuring methodologies (for the C, Nim and Lua implementations, I run the program 100 times and take the mean). Their speed was also measured under different loads on my machine. So I think that these time estimates are accurate only within ~2x or so.
Note that the number of lines is much smaller for Squiggle and SquigglePy because I'm just counting the lines needed to get Squiggle and SquigglePy to output a model, not the lines inside the libraries themselves.
## Notes
### Nim
I was really happy trying [Nim](https://nim-lang.org/), and as a result the Nim code is a bit more optimized and engineered:
1. It is using the fastest "danger" compilation mode.
2. It has some optimizations: I don't compute 1M samples for each dist, but instead pass functions around and compute the 1M samples at the end
3. I define the normal function from scratch, using the [Box-Muller transform](https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form).
4. I also have a version in which I define the logarithm and sine functions themselves in nim to feed into the Box-Muller method. But it is much slower.
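For reference, the basic form of the Box-Muller transform mentioned in 3., which matches the `ur_normal` functions elsewhere in this diff: given two independent uniform samples u1, u2 in (0, 1),

```
z = sqrt(-2 * ln(u1)) * sin(2 * pi * u2)
```

is a sample from a standard normal, which can then be scaled and shifted.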
Without 1. and 2., the Nim code takes 0m0.183s instead. But I don't think that these are unfair advantages: I liked trying out Nim and therefore put more love into the code, and this seems like it could be a recurring factor.
Ultimately, these optimizations were also incorporated into the C code as well.
Nim also has multithreading support, but I haven't bothered looking into it yet.
### C
The optimizations which make the final C code significantly faster than the naïve implementation are:
- To pass around pointers to functions, instead of large arrays. This is the same as in the Nim implementation, but imho leads to more complex code
- To use the Box-Muller transform instead of using libraries, like in Nim.
- To use multithreading support
The C code uses the `-Ofast` or `-O3` compilation flags. Initially, without using those flags and without the algorithmic improvements, the code took ~0.4 seconds to run. So I was surprised that the Node and Squiggle code were comparable to the C code. It ended up being the case that the C code could be pushed to be ~100x faster, though :)
In fact, the C code ended up being so fast that I had to measure its time by running it 100 times in a row and dividing that total by 100, rather than by just running it once, because a single run was too fast for /bin/time. More sophisticated profiling tools exist that could, e.g., account for how idle a machine is when running the code, but I didn't think that was worth it at this point.
And still, there are some missing optimizations, like tweaking the code to take into account cache misses. I'm not exactly sure how that would go, though.
Once the code was at 6.6ms, there was a 0.6ms gain possible by using OMP better, and a 1ms gain from using a xorshift algorithm instead of rand_r from stdlib. I think there might be further gains to be made by using OpenCL or CUDA, but I haven't gotten into how to do that yet.
Although the above paragraphs were written in the first person, the C code was written together with Jorge Sierra, who translated the algorithmic improvements from nim to it and added the initial multithreading support.
### NodeJS and Squiggle
Using [bun](https://bun.sh/) instead of node is actually a bit slower. Also, both the NodeJS and the Squiggle code use [stdlib](https://stdlib.io/) in their innards, which has a bunch of interleaved functions that make the code slower. It's possible that not using that external library could make the code faster, but at the same time, the js approach does seem to be to use external libraries whenever possible.
I am not particularly sure that the Squiggle code is actually producing 1M samples, but also have no particular plan to debug this.
### Python
For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org).
### R
R has a warm place in my heart from back in the day, and it has predefined functions to do everything. It was particularly fast to write for me, though not particularly fast to run :) However, I do recall that R does have some multithreading support; it wasn't used.
### Lua
I was also really pleased with Lua. I liked the simplicity. And coming from javascript, I appreciated that the program did not fail silently, but rather gave me useful errors, like:
```
lua samples.lua
lua: samples.lua:42: table index is nil
stack traceback:
samples.lua:42: in function 'mixture'
samples.lua:49: in main chunk
[C]: in ?
make: *** [makefile:14: run] Error 1
```
I also appreciated the speedup when using the LuaJIT interpreter. You can install it with commands similar to
```
git clone https://luajit.org/git/luajit.git
cd luajit
make
sudo make install
sudo ln -sf luajit-2.1.0-beta3 /usr/local/bin/luajit
```
Overall I'm thinking that a combination of lua at least for scripting and ¿nim/C/tbd? for more serious programs could be quite powerful.
### Overall thoughts
Overall I don't think that this is a fair comparison of the languages intrinsically, because I'm just differentially good at them, having chosen to put more effort into some than into others. But it is still useful to me personally, and perhaps mildly informative to others.
I was very surprised that Node/Squiggle code was almost as fast as the raw C code. For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org)
## Languages I may add later
- Julia (TuringML)
- Rust
- Lisp
- [ ] Julia (TuringML)
- [ ] Rust
- [ ] Lisp
- [ ] Go
- [ ] Zig
- [ ] Forth
- [ ] sh/bash, lol?
- [ ] OCaml
- [ ] Haskell
- [ ] OpenCL/CUDA (e.g., as in <https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/>). Seems like it would be overkill, and also the code would be way more complex. But maybe worth trying?
- [-] Stan => As far as I can tell, Stan is designed to generate samples from the posterior distribution given some data, not to create data by drawing samples from an arbitrary distribution.
- [ ] Maybe still worth reversing the process?
- ... and suggestions welcome
- Stan
## Languages added so far
- [x] Squiggle
- [x] Javascript (NodeJS)
- [x] Python (CPython)
- [x] R
- [x] C
- [x] Nim
- [x] SquigglePy
- [x] Lua
## Roadmap
The future of this project is uncertain. In most worlds, I simply forget about this repository.
To do:
- [ ] Check whether the Squiggle code is producing 1M samples. Still not too sure.
- Differentiate between initial startup time (e.g., compiling, loading environment) and runtime. This matters because startup time could be ~constant, so for larger projects only the runtime matters.
## Other similar projects
- Squigglepy: <https://github.com/rethinkpriorities/squigglepy>

@@ -1 +0,0 @@
console.log("Hello world")

@@ -20,7 +20,7 @@
// MAIN //
var obj = ( typeof window === 'object' ) ? window : null;
var obj = ( typeof window === 'object' ) ? window : {};
// EXPORTS //

@@ -0,0 +1 @@
print("Hello world")

32
lua/makefile Normal file
@@ -0,0 +1,32 @@
# Interface:
# make
# make build
# make run
# make time-linux
# make install
# make format
# make time-linux-simple
# make profile-linux
SRC=samples.lua
# INTERPETER=lua# < original
INTERPETER=luajit# < faster
run: $(SRC)
$(INTERPETER) $(SRC)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time of: $(INTERPETER) $(SRC)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(INTERPETER) $(SRC); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
/bin/time -f "Time: %es" $(INTERPETER) $(SRC) && echo
profile-linux:
@echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
@echo "Must be run as sudo"
sudo perf record $(INTERPETER) $(SRC)
sudo perf report
rm perf.data

82
lua/samples.lua Normal file
@@ -0,0 +1,82 @@
-- Consts and prep
PI = 3.14159265358979323846;
NORMAL95CONFIDENCE = 1.6448536269514722;
math.randomseed(1234)
-- Random distribution functions
function sample_normal_0_1()
local u1 = math.random()
local u2 = math.random()
local result = math.sqrt(-2 * math.log(u1)) * math.sin(2 * PI * u2)
return result
end
function sample_normal(mean, sigma)
return mean + (sigma * sample_normal_0_1())
end
function sample_uniform(min, max)
return math.random() * (max - min) + min
end
function sample_lognormal(logmean, logsigma)
return math.exp(sample_normal(logmean, logsigma))
end
function sample_to(low, high)
local loglow = math.log(low);
local loghigh = math.log(high);
local logmean = (loglow + loghigh) / 2;
local logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return sample_lognormal(logmean, logsigma);
end
-- Mixture
function mixture(samplers, weights, n)
assert(#samplers == #weights)
local l = #weights
local sum_weights = 0
for i = 1, l, 1 do
-- ^ arrays start at 1
sum_weights = sum_weights + weights[i]
end
local cumsummed_normalized_weights = {}
table.insert(cumsummed_normalized_weights, weights[1]/sum_weights)
for i = 2, l, 1 do
table.insert(cumsummed_normalized_weights, cumsummed_normalized_weights[i-1] + weights[i]/sum_weights)
end
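-- n.b. (added note): float rounding can leave cumsummed_normalized_weights[l]
-- slightly below 1.0; if math.random() returns a value above it, the while loop
-- below walks past the end of the table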
local result = {}
for i = 1, n, 1 do
r = math.random()
local i = 1
while r > cumsummed_normalized_weights[i] do
i = i+1
end
table.insert(result, samplers[i]())
end
return result
end
-- Main
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b
function sample_0() return 0 end
function sample_1() return 1 end
function sample_few() return sample_to(1, 3) end
function sample_many() return sample_to(2, 10) end
samplers = {sample_0, sample_1, sample_few, sample_many}
weights = { (1 - p_c), p_c/2, p_c/4, p_c/4 }
n = 1000000
result = mixture(samplers, weights, n)
sum = 0
for i = 1, n, 1 do
sum = sum + result[i]
-- print(result[i])
end
print(sum/n)

18
nim/makefile Normal file
@@ -0,0 +1,18 @@
SHELL := /bin/bash ## <= required to use time

VERBOSE=--verbosity:0

build: samples.nim
	nim c $(VERBOSE) samples.nim

run: samples
	./samples $(VERBOSE)

examine: samples
	nim c $(VERBOSE) samples.nim && time ./samples $(VERBOSE) && echo
	nim c $(VERBOSE) -d:release samples.nim && time ./samples $(VERBOSE) && echo
	nim c $(VERBOSE) -d:danger samples.nim && time ./samples $(VERBOSE)

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	@echo "Running 100x and taking avg time of: ./samples"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do ./samples; done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo

BIN
nim/samples Executable file

Binary file not shown.

View File

@ -0,0 +1,4 @@
This is a version of time-to-botec for Nim in which I define logarithms, normals & lognormals pretty much
from scratch, relying on libraries only for random numbers, square roots, and raising a to the power of b.
This is interesting because it leads to better understanding, but it doesn't help compare the efficiency of the various languages, so I'm setting it aside for now, even though I found it really interesting.

View File

@ -0,0 +1,11 @@
SHELL := /bin/bash

build: samples.nim
	nim c --verbosity:0 samples.nim

run: samples
	./samples --verbosity:0

examine: samples
	# nim c --verbosity:0 --opt:speed -d:release -d:danger --checks:off samples.nim && time ./samples --verbosity:0 --checks:off
	nim c -d:release samples.nim && time ./samples

BIN
nim/samples-from-scratch/samples Executable file

Binary file not shown.

View File

@ -0,0 +1,145 @@
import std/math
import std/sugar
import std/random
import std/sequtils

randomize()

## Basic math functions
proc factorial(n: int): int =
  if n <= 0:
    return 1
  else:
    return n * factorial(n - 1)

proc sine(x: float): float =
  let n = 8
  # ^ Taylor will converge really quickly
  # notice that the factorial of 17 is
  # already pretty gigantic
  var acc = 0.0
  for i in 0..n:
    var k = 2*i + 1
    var taylor = pow(-1, i.float) * pow(x, k.float) / factorial(k).float
    acc = acc + taylor
  return acc

## Log function
## <https://en.wikipedia.org/wiki/Natural_logarithm#High_precision>
## Arithmetic-geometric mean
proc ag(x: float, y: float): float =
  let n = 32 # just some high number
  var a = (x + y)/2.0
  var b = sqrt(x * y)
  for i in 0..n:
    let temp = a
    a = (a+b)/2.0
    b = sqrt(b*temp)
  return a

## Find m such that x * 2^m > 2^(precision/2)
proc find_m(x: float): float =
  var m = 0.0
  let precision = 64 # bits
  let c = pow(2.0, precision.float / 2.0)
  while x * pow(2.0, m) < c:
    m = m + 1
  return m

proc log(x: float): float =
  let m = find_m(x)
  let s = x * pow(2.0, m)
  let ln2 = 0.6931471805599453
  return ( PI / (2.0 * ag(1.0, 4.0/s)) ) - m * ln2

## Test these functions
## echo factorial(5)
## echo sine(1.0)
## echo log(0.1)
## echo log(2.0)
## echo log(3.0)
## echo pow(2.0, 32.float)

## Distribution functions

## Normal
## <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form>
proc ur_normal(): float =
  let u1 = rand(1.0)
  let u2 = rand(1.0)
  let z = sqrt(-2.0 * log(u1)) * sine(2 * PI * u2)
  return z

proc normal(mean: float, sigma: float): float =
  return (mean + sigma * ur_normal())

proc lognormal(logmean: float, logsigma: float): float =
  let answer = pow(E, normal(logmean, logsigma))
  return answer

proc to(low: float, high: float): float =
  let normal95confidencePoint = 1.6448536269514722
  let loglow = log(low)
  let loghigh = log(high)
  let logmean = (loglow + loghigh)/2
  let logsigma = (loghigh - loglow) / (2.0 * normal95confidencePoint)
  return lognormal(logmean, logsigma)

## echo ur_normal()
## echo normal(10, 20)
## echo lognormal(2, 4)
## echo to(10, 90)

## Manipulate samples
proc make_samples(f: () -> float, n: int): seq[float] =
  return toSeq(1..n).map(_ => f())

proc mixture(sxs: seq[seq[float]], ps: seq[float], n: int): seq[float] =
  assert sxs.len == ps.len
  var ws: seq[float]
  var sum = 0.0
  for i, p in ps:
    sum = sum + p
    ws.add(sum)
  ws = ws.map(w => w/sum)

  proc get_mixture_sample(): float =
    let r = rand(1.0)
    var i = ws.len - 1
    for j, w in ws:
      if r < w:
        i = j
        break
    ## the loop ^ fails to assign i only
    ## when r is at least the last cumulative weight,
    ## which corresponds to choosing the last item in ws,
    ## which is why i is initialized to ws.len - 1
    let xs = sxs[i]
    let l = xs.len - 1
    let k = rand(0..l)
    return xs[k]

  return toSeq(1..n).map(_ => get_mixture_sample())

## Actual model
let n = 1000000
let p_a = 0.8
let p_b = 0.5
let p_c = p_a * p_b
let weights = @[ 1.0 - p_c, p_c/2.0, p_c/4.0, p_c/4.0 ]
let fs = [ () => 0.0, () => 1.0, () => to(1.0, 3.0), () => to(2.0, 10.0) ]
let dists = fs.map(f => make_samples(f, n))
let result = mixture(dists, weights, n)
let mean_result = foldl(result, a + b, 0.0) / float(result.len)
# echo result
echo mean_result
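
The from-scratch log above follows the high-precision recipe referenced in the code: pick m so that s = x * 2^m exceeds 2^(precision/2), then ln(x) ≈ pi / (2 * M(1, 4/s)) - m * ln(2), where M is the arithmetic-geometric mean. A quick cross-check of that formula in Python, as a sketch separate from the repo's code:

import math

def agm(a, b, iterations=32):
    # arithmetic-geometric mean
    for _ in range(iterations):
        a, b = (a + b) / 2, math.sqrt(a * b)
    return a

def ln(x, precision=64):
    m = 0
    while x * 2**m < 2**(precision / 2):
        m += 1
    s = x * 2**m
    return math.pi / (2 * agm(1, 4 / s)) - m * math.log(2)

print(ln(2.0), math.log(2.0))  # both ~0.693147...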

View File

@ -0,0 +1,2 @@
build: sums.nim
	nim c sums.nim

Binary file not shown.

View File

@ -0,0 +1,22 @@
import std/math

proc factorial(n: int): int =
  if n <= 0:
    return 1
  else:
    return n * factorial(n - 1)

proc sine(x: float): float =
  let n = 8
  # ^ Taylor will converge really quickly
  # notice that the factorial of 17 is
  # already pretty gigantic
  var acc = 0.0
  for i in 0..n:
    var k = 2*i + 1
    var taylor = pow(-1, i.float) * pow(x, k.float) / factorial(k).float
    acc = acc + taylor
  return acc

echo factorial(17)
echo sine(1.0)
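
The sine above is the Taylor polynomial sin(x) = sum over i of (-1)^i * x^(2i+1) / (2i+1)!, truncated at i = 8, i.e. degree 17. The same truncation in Python, as a sketch:

import math

def taylor_sine(x, n=8):
    # sum of (-1)^i * x^(2i+1) / (2i+1)! for i in 0..n
    return sum((-1)**i * x**(2*i + 1) / math.factorial(2*i + 1) for i in range(n + 1))

print(taylor_sine(1.0), math.sin(1.0))  # agree closely for x near 0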

84
nim/samples.nim Normal file
View File

@ -0,0 +1,84 @@
import std/math
import std/sugar
import std/random
import std/sequtils

randomize()

## Distribution functions

## Normal
## <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form>
proc ur_normal(): float =
  let u1 = rand(1.0)
  let u2 = rand(1.0)
  let z = sqrt(-2.0 * ln(u1)) * sin(2 * PI * u2)
  return z

proc normal(mean: float, sigma: float): float =
  return (mean + sigma * ur_normal())

proc lognormal(logmean: float, logsigma: float): float =
  let answer = pow(E, normal(logmean, logsigma))
  return answer

proc to(low: float, high: float): float =
  let normal95confidencePoint = 1.6448536269514722
  let loglow = ln(low)
  let loghigh = ln(high)
  let logmean = (loglow + loghigh)/2
  let logsigma = (loghigh - loglow) / (2.0 * normal95confidencePoint)
  return lognormal(logmean, logsigma)

## echo ur_normal()
## echo normal(10, 20)
## echo lognormal(2, 4)
## echo to(10, 90)

## Manipulate samples
proc mixture(fs: seq[proc (): float {.nimcall.}], ps: seq[float], n: int): seq[float] =
  assert fs.len == ps.len
  var ws: seq[float]
  var sum = 0.0
  for i, p in ps:
    sum = sum + p
    ws.add(sum)
  ws = ws.map(w => w/sum)

  var samples: seq[float]
  let rs = toSeq(1..n).map(_ => rand(1.0))
  for i in 0..(n-1):
    let r = rs[i]
    var j = ws.len - 1
    for k, w in ws:
      if r < w:
        j = k
        break
    ## only occasion when ^ doesn't assign j
    ## is when r is exactly 1,
    ## which would correspond to choosing the last item in ws,
    ## which is why j is initialized to ws.len - 1
    let f = fs[j]
    samples.add(f())
  return samples

## Actual model
let n = 1000000
let p_a = 0.8
let p_b = 0.5
let p_c = p_a * p_b
let weights = @[ 1.0 - p_c, p_c/2.0, p_c/4.0, p_c/4.0 ]
let fs = @[ proc (): float = 0.0, proc (): float = 1.0, proc (): float = to(1.0, 3.0), proc (): float = to(2.0, 10.0) ]
let result = mixture(fs, weights, n)
let mean_result = foldl(result, a + b, 0.0) / float(result.len)
# echo result
echo mean_result
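
Both the Lua and Nim versions sample the mixture the same way: normalize the weights, take their cumulative sum, draw r uniformly from [0, 1), and pick the first sampler whose cumulative weight exceeds r. The same idea in Python, as a sketch (bisect gives an O(log k) lookup in place of the linear scan above):

import bisect
import itertools
import random

def mixture(samplers, weights, n):
    total = sum(weights)
    cumulative = list(itertools.accumulate(w / total for w in weights))
    # bisect_right returns the first index whose cumulative weight exceeds r
    return [samplers[bisect.bisect_right(cumulative, random.random())]() for _ in range(n)]

# e.g.: mixture([lambda: 0.0, lambda: 1.0], [0.5, 0.5], 10)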

View File

@ -1 +0,0 @@
print("Hello world!")

10
squigglepy/makefile Normal file
View File

@ -0,0 +1,10 @@
install:
	pip3 install "squigglepy>=0.26"
	pip3 install numpy

run:
	python3 samples.py

time:
	time python3 samples.py

18
squigglepy/samples.py Normal file
View File

@ -0,0 +1,18 @@
import squigglepy as sq
import numpy as np

p_a = 0.8
p_b = 0.5
p_c = p_a * p_b

dist_0 = 0
dist_1 = 1
dist_some = sq.to(1, 3)
dist_many = sq.to(2, 10)

dists = [dist_0, dist_1, dist_some, dist_many]
weights = [(1 - p_c), p_c/2, p_c/4, p_c/4]
result = sq.mixture(dists, weights)
result_samples = sq.sample(result, 1000000)
print(np.mean(result_samples))
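
The same estimate can be cross-checked with plain numpy, without squigglepy. A sketch, with to(low, high) reimplemented as the 90%-confidence-interval lognormal used elsewhere in this repo:

import numpy as np

rng = np.random.default_rng(1)
n = 1_000_000
p_c = 0.8 * 0.5

def to(low, high, size):
    z95 = 1.6448536269514722
    logmean = (np.log(low) + np.log(high)) / 2
    logsigma = (np.log(high) - np.log(low)) / (2 * z95)
    return rng.lognormal(logmean, logsigma, size)

choice = rng.choice(4, size=n, p=[1 - p_c, p_c/2, p_c/4, p_c/4])
samples = np.zeros(n)
samples[choice == 1] = 1.0
samples[choice == 2] = to(1, 3, (choice == 2).sum())
samples[choice == 3] = to(2, 10, (choice == 3).sum())
print(samples.mean())  # roughly 0.885, matching samples.py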

View File

@ -1,11 +1,41 @@
# Optimized C
$ make && make time-linux
gcc -O3 samples.c -fopenmp -lm -o out/samples
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples
Time using 1 thread: 20.20ms
Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples
Time using 2 threads: 17.50ms
Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples
Time for 4 threads: 17.00ms
Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples
Time using 8 threads: 8.40ms
Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples
Time using 16 threads: 5.00ms
# C
## normal compilation
0.888458
real 0m0,442s
user 0m0,378s
sys 0m0,064s
## -Ofast
0.888458
real 0m0.292s
user 0m0.266s
sys 0m0.026s
# Squiggle
real 0m1,536s
@ -36,3 +66,65 @@ real 0m7,000s
user 0m6,944s
sys 0m0,052s
## Nim
nim c --verbosity:0 samples.nim && time ./samples --verbosity:0 && echo
0.8860780498240779
real 0m0.234s
user 0m0.214s
sys 0m0.020s
nim c --verbosity:0 -d:release samples.nim && time ./samples --verbosity:0 && echo
0.884035098700204
real 0m0.074s
user 0m0.043s
sys 0m0.031s
nim c --verbosity:0 -d:danger samples.nim && time ./samples --verbosity:0
0.8892827195895541
real 0m0.068s
user 0m0.048s
sys 0m0.020s
$ make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of:
Time: 38.90ms
## Squigglepy
$ time make run
python3 samples.py
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.68it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:00<00:00, 1298266.78it/s]
0.8854926010232937
real 0m1.602s
user 0m1.582s
sys 0m0.331s
## Lua
$ make time-linux-simple
Requires /bin/time, found on GNU/Linux systems
/bin/time -f "Time: %es" lua samples.lua && echo
0.88932198462254
Time: 0.28s
$ make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of: lua samples.lua
Time: 275.80ms
$ make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of: luajit samples.lua
Time: 68.60ms

49
wip/template/makefile Normal file
View File

@ -0,0 +1,49 @@
# Interface:
# make
# make build
# make run
# make time-linux
# make install
# make format
# make time-linux-simple
# make profile-linux

# Main file
SRC=samples.lang
OUTPUT=out/samples

# Compiler
COMPILER=lang-compiler
FLAGS=

## Formatter
FORMATTER=lang-formatter --options

## make build
build: $(SRC)
	$(COMPILER) $(FLAGS) $(SRC) -o $(OUTPUT)

install:
	lang-package-manager install dependencies

format: $(SRC)
	$(FORMATTER) $(SRC)

run: $(SRC) $(OUTPUT)
	./$(OUTPUT)

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	@echo "Running 100x and taking avg time of: $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo

time-linux-simple:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	/bin/time -f "Time: %es" ./$(OUTPUT) && echo

profile-linux:
	@echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
	@echo "Must be run as sudo"
	sudo perf record $(OUTPUT)
	sudo perf report
	rm perf.data

5
wip/zig/hello-world.zig Normal file
View File

@ -0,0 +1,5 @@
const std = @import("std");

pub fn main() void {
    std.debug.print("Hello, {s}!\n", .{"World"});
}

1
wip/zig/install-zig.sh Normal file
View File

@ -0,0 +1 @@
See <https://ziglang.org/learn/getting-started/#tagged-release-or-nightly-build>

1
wip/zig/notes.md Normal file
View File

@ -0,0 +1 @@
https://ziglearn.org/chapter-1/