diff --git a/examples/core/00_example_template/example b/C/examples/core/00_example_template/example
similarity index 100%
rename from examples/core/00_example_template/example
rename to C/examples/core/00_example_template/example
diff --git a/examples/core/00_example_template/example.c b/C/examples/core/00_example_template/example.c
similarity index 100%
rename from examples/core/00_example_template/example.c
rename to C/examples/core/00_example_template/example.c
diff --git a/examples/core/01_one_sample/example b/C/examples/core/01_one_sample/example
similarity index 100%
rename from examples/core/01_one_sample/example
rename to C/examples/core/01_one_sample/example
diff --git a/examples/core/01_one_sample/example.c b/C/examples/core/01_one_sample/example.c
similarity index 100%
rename from examples/core/01_one_sample/example.c
rename to C/examples/core/01_one_sample/example.c
diff --git a/examples/core/02_time_to_botec/example b/C/examples/core/02_time_to_botec/example
similarity index 100%
rename from examples/core/02_time_to_botec/example
rename to C/examples/core/02_time_to_botec/example
diff --git a/examples/core/02_time_to_botec/example.c b/C/examples/core/02_time_to_botec/example.c
similarity index 100%
rename from examples/core/02_time_to_botec/example.c
rename to C/examples/core/02_time_to_botec/example.c
diff --git a/examples/core/03_gcc_nested_function/example b/C/examples/core/03_gcc_nested_function/example
similarity index 100%
rename from examples/core/03_gcc_nested_function/example
rename to C/examples/core/03_gcc_nested_function/example
diff --git a/examples/core/03_gcc_nested_function/example.c b/C/examples/core/03_gcc_nested_function/example.c
similarity index 100%
rename from examples/core/03_gcc_nested_function/example.c
rename to C/examples/core/03_gcc_nested_function/example.c
diff --git a/examples/core/04_gamma_beta/example b/C/examples/core/04_gamma_beta/example
similarity index 100%
rename from examples/core/04_gamma_beta/example
rename to C/examples/core/04_gamma_beta/example
diff --git a/examples/core/04_gamma_beta/example.c b/C/examples/core/04_gamma_beta/example.c
similarity index 100%
rename from examples/core/04_gamma_beta/example.c
rename to C/examples/core/04_gamma_beta/example.c
diff --git a/examples/core/05_hundred_lognormals/example b/C/examples/core/05_hundred_lognormals/example
similarity index 100%
rename from examples/core/05_hundred_lognormals/example
rename to C/examples/core/05_hundred_lognormals/example
diff --git a/examples/core/05_hundred_lognormals/example.c b/C/examples/core/05_hundred_lognormals/example.c
similarity index 100%
rename from examples/core/05_hundred_lognormals/example.c
rename to C/examples/core/05_hundred_lognormals/example.c
diff --git a/examples/core/05_hundred_lognormals/run-sorted.sh b/C/examples/core/05_hundred_lognormals/run-sorted.sh
similarity index 100%
rename from examples/core/05_hundred_lognormals/run-sorted.sh
rename to C/examples/core/05_hundred_lognormals/run-sorted.sh
diff --git a/examples/core/06_dissolving_fermi_paradox/example b/C/examples/core/06_dissolving_fermi_paradox/example
similarity index 100%
rename from examples/core/06_dissolving_fermi_paradox/example
rename to C/examples/core/06_dissolving_fermi_paradox/example
diff --git a/examples/core/06_dissolving_fermi_paradox/example.c b/C/examples/core/06_dissolving_fermi_paradox/example.c
similarity index 100%
rename from examples/core/06_dissolving_fermi_paradox/example.c
rename to C/examples/core/06_dissolving_fermi_paradox/example.c
diff --git a/examples/core/06_dissolving_fermi_paradox/fermi.pdf b/C/examples/core/06_dissolving_fermi_paradox/fermi.pdf
similarity index 100%
rename from examples/core/06_dissolving_fermi_paradox/fermi.pdf
rename to C/examples/core/06_dissolving_fermi_paradox/fermi.pdf
diff --git a/examples/core/06_dissolving_fermi_paradox/naive.c b/C/examples/core/06_dissolving_fermi_paradox/naive.c
similarity index 100%
rename from examples/core/06_dissolving_fermi_paradox/naive.c
rename to C/examples/core/06_dissolving_fermi_paradox/naive.c
diff --git a/examples/core/06_dissolving_fermi_paradox/scratchpad b/C/examples/core/06_dissolving_fermi_paradox/scratchpad
similarity index 100%
rename from examples/core/06_dissolving_fermi_paradox/scratchpad
rename to C/examples/core/06_dissolving_fermi_paradox/scratchpad
diff --git a/examples/core/makefile b/C/examples/core/makefile
similarity index 100%
rename from examples/core/makefile
rename to C/examples/core/makefile
diff --git a/examples/more/00_example_template/example b/C/examples/more/00_example_template/example
similarity index 100%
rename from examples/more/00_example_template/example
rename to C/examples/more/00_example_template/example
diff --git a/examples/more/00_example_template/example.c b/C/examples/more/00_example_template/example.c
similarity index 100%
rename from examples/more/00_example_template/example.c
rename to C/examples/more/00_example_template/example.c
diff --git a/examples/more/02_ci_beta/example b/C/examples/more/02_ci_beta/example
similarity index 100%
rename from examples/more/02_ci_beta/example
rename to C/examples/more/02_ci_beta/example
diff --git a/examples/more/02_ci_beta/example.c b/C/examples/more/02_ci_beta/example.c
similarity index 100%
rename from examples/more/02_ci_beta/example.c
rename to C/examples/more/02_ci_beta/example.c
diff --git a/examples/more/03_ci_beta_parallel/example b/C/examples/more/03_ci_beta_parallel/example
similarity index 100%
rename from examples/more/03_ci_beta_parallel/example
rename to C/examples/more/03_ci_beta_parallel/example
diff --git a/examples/more/03_ci_beta_parallel/example.c b/C/examples/more/03_ci_beta_parallel/example.c
similarity index 100%
rename from examples/more/03_ci_beta_parallel/example.c
rename to C/examples/more/03_ci_beta_parallel/example.c
diff --git a/examples/more/04_nuclear_war/example b/C/examples/more/04_nuclear_war/example
similarity index 100%
rename from examples/more/04_nuclear_war/example
rename to C/examples/more/04_nuclear_war/example
diff --git a/examples/more/04_nuclear_war/example.c b/C/examples/more/04_nuclear_war/example.c
similarity index 100%
rename from examples/more/04_nuclear_war/example.c
rename to C/examples/more/04_nuclear_war/example.c
diff --git a/examples/more/04_nuclear_war/scratchpad/example b/C/examples/more/04_nuclear_war/scratchpad/example
similarity index 100%
rename from examples/more/04_nuclear_war/scratchpad/example
rename to C/examples/more/04_nuclear_war/scratchpad/example
diff --git a/examples/more/04_nuclear_war/scratchpad/example.c b/C/examples/more/04_nuclear_war/scratchpad/example.c
similarity index 100%
rename from examples/more/04_nuclear_war/scratchpad/example.c
rename to C/examples/more/04_nuclear_war/scratchpad/example.c
diff --git a/examples/more/04_nuclear_war/scratchpad/makefile b/C/examples/more/04_nuclear_war/scratchpad/makefile
similarity index 100%
rename from examples/more/04_nuclear_war/scratchpad/makefile
rename to C/examples/more/04_nuclear_war/scratchpad/makefile
diff --git a/examples/more/05_burn_10kg_fat/example b/C/examples/more/05_burn_10kg_fat/example
similarity index 100%
rename from examples/more/05_burn_10kg_fat/example
rename to C/examples/more/05_burn_10kg_fat/example
diff --git a/examples/more/05_burn_10kg_fat/example.c b/C/examples/more/05_burn_10kg_fat/example.c
similarity index 100%
rename from examples/more/05_burn_10kg_fat/example.c
rename to C/examples/more/05_burn_10kg_fat/example.c
diff --git a/examples/more/06_nuclear_recovery/example b/C/examples/more/06_nuclear_recovery/example
similarity index 100%
rename from examples/more/06_nuclear_recovery/example
rename to C/examples/more/06_nuclear_recovery/example
diff --git a/examples/more/06_nuclear_recovery/example.c b/C/examples/more/06_nuclear_recovery/example.c
similarity index 100%
rename from examples/more/06_nuclear_recovery/example.c
rename to C/examples/more/06_nuclear_recovery/example.c
diff --git a/examples/more/07_algebra/example b/C/examples/more/07_algebra/example
similarity index 100%
rename from examples/more/07_algebra/example
rename to C/examples/more/07_algebra/example
diff --git a/examples/more/07_algebra/example.c b/C/examples/more/07_algebra/example.c
similarity index 100%
rename from examples/more/07_algebra/example.c
rename to C/examples/more/07_algebra/example.c
diff --git a/examples/more/08_algebra_and_conversion/example b/C/examples/more/08_algebra_and_conversion/example
similarity index 100%
rename from examples/more/08_algebra_and_conversion/example
rename to C/examples/more/08_algebra_and_conversion/example
diff --git a/examples/more/08_algebra_and_conversion/example.c b/C/examples/more/08_algebra_and_conversion/example.c
similarity index 100%
rename from examples/more/08_algebra_and_conversion/example.c
rename to C/examples/more/08_algebra_and_conversion/example.c
diff --git a/examples/more/09_ergonomic_algebra/example b/C/examples/more/09_ergonomic_algebra/example
similarity index 100%
rename from examples/more/09_ergonomic_algebra/example
rename to C/examples/more/09_ergonomic_algebra/example
diff --git a/examples/more/09_ergonomic_algebra/example.c b/C/examples/more/09_ergonomic_algebra/example.c
similarity index 100%
rename from examples/more/09_ergonomic_algebra/example.c
rename to C/examples/more/09_ergonomic_algebra/example.c
diff --git a/examples/more/10_twitter_thread_example/example b/C/examples/more/10_twitter_thread_example/example
similarity index 100%
rename from examples/more/10_twitter_thread_example/example
rename to C/examples/more/10_twitter_thread_example/example
diff --git a/examples/more/10_twitter_thread_example/example.c b/C/examples/more/10_twitter_thread_example/example.c
similarity index 100%
rename from examples/more/10_twitter_thread_example/example.c
rename to C/examples/more/10_twitter_thread_example/example.c
diff --git a/examples/more/11_billion_lognormals_paralell/example b/C/examples/more/11_billion_lognormals_paralell/example
similarity index 100%
rename from examples/more/11_billion_lognormals_paralell/example
rename to C/examples/more/11_billion_lognormals_paralell/example
diff --git a/examples/more/11_billion_lognormals_paralell/example.c b/C/examples/more/11_billion_lognormals_paralell/example.c
similarity index 100%
rename from examples/more/11_billion_lognormals_paralell/example.c
rename to C/examples/more/11_billion_lognormals_paralell/example.c
diff --git a/examples/more/12_time_to_botec_parallel/example b/C/examples/more/12_time_to_botec_parallel/example
similarity index 100%
rename from examples/more/12_time_to_botec_parallel/example
rename to C/examples/more/12_time_to_botec_parallel/example
diff --git a/examples/more/12_time_to_botec_parallel/example.c b/C/examples/more/12_time_to_botec_parallel/example.c
similarity index 100%
rename from examples/more/12_time_to_botec_parallel/example.c
rename to C/examples/more/12_time_to_botec_parallel/example.c
diff --git a/examples/more/13_parallelize_min/example b/C/examples/more/13_parallelize_min/example
similarity index 100%
rename from examples/more/13_parallelize_min/example
rename to C/examples/more/13_parallelize_min/example
diff --git a/examples/more/13_parallelize_min/example.c b/C/examples/more/13_parallelize_min/example.c
similarity index 100%
rename from examples/more/13_parallelize_min/example.c
rename to C/examples/more/13_parallelize_min/example.c
diff --git a/examples/more/14_check_confidence_interval/example b/C/examples/more/14_check_confidence_interval/example
similarity index 100%
rename from examples/more/14_check_confidence_interval/example
rename to C/examples/more/14_check_confidence_interval/example
diff --git a/examples/more/14_check_confidence_interval/example.c b/C/examples/more/14_check_confidence_interval/example.c
similarity index 100%
rename from examples/more/14_check_confidence_interval/example.c
rename to C/examples/more/14_check_confidence_interval/example.c
diff --git a/examples/more/15_time_to_botec_custom_mixture/example b/C/examples/more/15_time_to_botec_custom_mixture/example
similarity index 100%
rename from examples/more/15_time_to_botec_custom_mixture/example
rename to C/examples/more/15_time_to_botec_custom_mixture/example
diff --git a/examples/more/15_time_to_botec_custom_mixture/example.c b/C/examples/more/15_time_to_botec_custom_mixture/example.c
similarity index 100%
rename from examples/more/15_time_to_botec_custom_mixture/example.c
rename to C/examples/more/15_time_to_botec_custom_mixture/example.c
diff --git a/examples/more/makefile b/C/examples/more/makefile
similarity index 100%
rename from examples/more/makefile
rename to C/examples/more/makefile
diff --git a/makefile b/C/makefile
similarity index 100%
rename from makefile
rename to C/makefile
diff --git a/squiggle.c b/C/squiggle.c
similarity index 100%
rename from squiggle.c
rename to C/squiggle.c
diff --git a/squiggle.h b/C/squiggle.h
similarity index 100%
rename from squiggle.h
rename to C/squiggle.h
diff --git a/squiggle_more.c b/C/squiggle_more.c
similarity index 100%
rename from squiggle_more.c
rename to C/squiggle_more.c
diff --git a/squiggle_more.h b/C/squiggle_more.h
similarity index 100%
rename from squiggle_more.h
rename to C/squiggle_more.h
diff --git a/test/makefile b/C/test/makefile
similarity index 100%
rename from test/makefile
rename to C/test/makefile
diff --git a/test/test b/C/test/test
similarity index 100%
rename from test/test
rename to C/test/test
diff --git a/test/test.c b/C/test/test.c
similarity index 100%
rename from test/test.c
rename to C/test/test.c
diff --git a/CUDA/examples/core/00_example_template/example b/CUDA/examples/core/00_example_template/example
new file mode 100755
index 0000000..e4195e1
Binary files /dev/null and b/CUDA/examples/core/00_example_template/example differ
diff --git a/CUDA/examples/core/00_example_template/example.c b/CUDA/examples/core/00_example_template/example.c
new file mode 100644
index 0000000..41c7ef5
--- /dev/null
+++ b/CUDA/examples/core/00_example_template/example.c
@@ -0,0 +1,14 @@
+#include "../../../squiggle.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    // ...
+
+    free(seed);
+}
diff --git a/CUDA/examples/core/01_one_sample/example b/CUDA/examples/core/01_one_sample/example
new file mode 100755
index 0000000..a65e937
Binary files /dev/null and b/CUDA/examples/core/01_one_sample/example differ
diff --git a/CUDA/examples/core/01_one_sample/example.c b/CUDA/examples/core/01_one_sample/example.c
new file mode 100644
index 0000000..a74718b
--- /dev/null
+++ b/CUDA/examples/core/01_one_sample/example.c
@@ -0,0 +1,34 @@
+#include "../../../squiggle.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+// Estimate functions
+
+double sample_0(uint64_t* seed) { UNUSED(seed); return 0; }
+double sample_1(uint64_t* seed) { UNUSED(seed); return 1; }
+double sample_few(uint64_t* seed) { return sample_to(1, 3, seed); }
+double sample_many(uint64_t* seed) { return sample_to(2, 10, seed); }
+
+double sample_model(uint64_t* seed){
+
+    double p_a = 0.8;
+    double p_b = 0.5;
+    double p_c = p_a * p_b;
+
+    int n_dists = 4;
+    double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
+    double (*samplers[])(uint64_t*) = { sample_0, sample_1, sample_few, sample_many };
+    double result = sample_mixture(samplers, weights, n_dists, seed);
+
+    return result;
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    printf("result_one: %f\n", sample_model(seed));
+    free(seed);
+}
diff --git a/CUDA/examples/core/02_time_to_botec/example b/CUDA/examples/core/02_time_to_botec/example
new file mode 100755
index 0000000..3735046
Binary files /dev/null and b/CUDA/examples/core/02_time_to_botec/example differ
diff --git a/CUDA/examples/core/02_time_to_botec/example.c b/CUDA/examples/core/02_time_to_botec/example.c
new file mode 100644
index 0000000..de3b3e9
--- /dev/null
+++ b/CUDA/examples/core/02_time_to_botec/example.c
@@ -0,0 +1,38 @@
+#include "../../../squiggle.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+double sample_0(uint64_t* seed) { UNUSED(seed); return 0; }
+double sample_1(uint64_t* seed) { UNUSED(seed); return 1; }
+double sample_few(uint64_t* seed) { return sample_to(1, 3, seed); }
+double sample_many(uint64_t* seed) { return sample_to(2, 10, seed); }
+
+double sample_model(uint64_t* seed){
+
+    double p_a = 0.8;
+    double p_b = 0.5;
+    double p_c = p_a * p_b;
+
+    int n_dists = 4;
+    double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
+    double (*samplers[])(uint64_t*) = { sample_0, sample_1, sample_few, sample_many };
+    double result = sample_mixture(samplers, weights, n_dists, seed);
+
+    return result;
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n_samples = 1000000;
+    double* result_many = (double*)malloc((size_t)n_samples * sizeof(double));
+    for (int i = 0; i < n_samples; i++) {
+        result_many[i] = sample_model(seed);
+    }
+    printf("Mean: %f\n", array_mean(result_many, n_samples));
+
+    free(seed);
+}
diff --git a/CUDA/examples/core/03_gcc_nested_function/example b/CUDA/examples/core/03_gcc_nested_function/example
new file mode 100755
index 0000000..42ec59c
Binary files /dev/null and b/CUDA/examples/core/03_gcc_nested_function/example differ
diff --git a/CUDA/examples/core/03_gcc_nested_function/example.c b/CUDA/examples/core/03_gcc_nested_function/example.c
new file mode 100644
index 0000000..828aaea
--- /dev/null
+++ b/CUDA/examples/core/03_gcc_nested_function/example.c
@@ -0,0 +1,44 @@
+#include "../../../squiggle.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+double sample_model(uint64_t* seed){
+
+    double sample_0(uint64_t* seed) { UNUSED(seed); return 0; }
+    // Using a gcc extension, you can define a function inside another function
+    double sample_1(uint64_t* seed) { UNUSED(seed); return 1; }
+    double sample_few(uint64_t* seed) { return sample_to(1, 3, seed); }
+    double sample_many(uint64_t* seed) { return sample_to(2, 10, seed); }
+
+    double p_a = 0.8;
+    double p_b = 0.5;
+    double p_c = p_a * p_b;
+
+    int n_dists = 4;
+    double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
+    double (*samplers[])(uint64_t*) = { sample_0, sample_1, sample_few, sample_many };
+    double result = sample_mixture(samplers, weights, n_dists, seed);
+
+    return result;
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n_samples = 1000000;
+    double* result_many = (double*)malloc((size_t)n_samples * sizeof(double));
+    for (int i = 0; i < n_samples; i++) {
+        result_many[i] = sample_model(seed);
+    }
+
+    printf("result_many: [");
+    for (int i = 0; i < 100; i++) {
+        printf("%.2f, ", result_many[i]);
+    }
+    printf("]\n");
+
+    free(seed);
+}
diff --git a/CUDA/examples/core/04_gamma_beta/example b/CUDA/examples/core/04_gamma_beta/example
new file mode 100755
index 0000000..d38f27f
Binary files /dev/null and b/CUDA/examples/core/04_gamma_beta/example differ
diff --git a/CUDA/examples/core/04_gamma_beta/example.c b/CUDA/examples/core/04_gamma_beta/example.c
new file mode 100644
index 0000000..1243235
--- /dev/null
+++ b/CUDA/examples/core/04_gamma_beta/example.c
@@ -0,0 +1,29 @@
+#include "../../../squiggle.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n = 1000 * 1000;
+    double* gamma_array = malloc(sizeof(double) * (size_t)n);
+    for (int i = 0; i < n; i++) {
+        gamma_array[i] = sample_gamma(1.0, seed);
+    }
+    printf("gamma(1) summary statistics = mean: %f, std: %f\n", array_mean(gamma_array, n), array_std(gamma_array, n));
+    printf("\n");
+
+    double* beta_array = malloc(sizeof(double) * (size_t)n);
+    for (int i = 0; i < n; i++) {
+        beta_array[i] = sample_beta(1, 2.0, seed);
+    }
+    printf("beta(1,2) summary statistics: mean: %f, std: %f\n", array_mean(beta_array, n), array_std(beta_array, n));
+    printf("\n");
+
+    free(gamma_array);
+    free(beta_array);
+    free(seed);
+}
diff --git a/CUDA/examples/core/05_hundred_lognormals/example b/CUDA/examples/core/05_hundred_lognormals/example
new file mode 100755
index 0000000..a25f8ba
Binary files /dev/null and b/CUDA/examples/core/05_hundred_lognormals/example differ
diff --git a/CUDA/examples/core/05_hundred_lognormals/example.c b/CUDA/examples/core/05_hundred_lognormals/example.c
new file mode 100644
index 0000000..eb8e07e
--- /dev/null
+++ b/CUDA/examples/core/05_hundred_lognormals/example.c
@@ -0,0 +1,17 @@
+#include "../../../squiggle.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+// Estimate functions
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    for (int i = 0; i < 100; i++) {
+        double sample = sample_lognormal(0, 10, seed);
+        printf("%f\n", sample);
+    }
+    free(seed);
+}
diff --git a/CUDA/examples/core/05_hundred_lognormals/run-sorted.sh b/CUDA/examples/core/05_hundred_lognormals/run-sorted.sh
new file mode 100644
index 0000000..b7c3750
--- /dev/null
+++ b/CUDA/examples/core/05_hundred_lognormals/run-sorted.sh
@@ -0,0 +1 @@
+./example | sort -h
diff --git a/CUDA/examples/core/06_dissolving_fermi_paradox/example b/CUDA/examples/core/06_dissolving_fermi_paradox/example
new file mode 100755
index 0000000..0d13c3f
Binary files /dev/null and b/CUDA/examples/core/06_dissolving_fermi_paradox/example differ
diff --git a/CUDA/examples/core/06_dissolving_fermi_paradox/example.c b/CUDA/examples/core/06_dissolving_fermi_paradox/example.c
new file mode 100644
index 0000000..1fd185c
--- /dev/null
+++ b/CUDA/examples/core/06_dissolving_fermi_paradox/example.c
@@ -0,0 +1,99 @@
+#include "../../../squiggle.h"
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+double sample_fermi_logspace(uint64_t * seed)
+{
+    // Replicate <https://arxiv.org/pdf/1806.02404.pdf>, and in particular the red line in page 11.
+    // You can see a simple version of this function in naive.c in this same folder
+    double log_rate_of_star_formation = sample_uniform(log(1), log(100), seed);
+    double log_fraction_of_stars_with_planets = sample_uniform(log(0.1), log(1), seed);
+    double log_number_of_habitable_planets_per_star_system = sample_uniform(log(0.1), log(1), seed);
+
+    double log_rate_of_life_formation_in_habitable_planets = sample_normal(1, 50, seed);
+    double log_fraction_of_habitable_planets_in_which_any_life_appears;
+    /* 
+        Consider:
+        a = underlying normal 
+        b = rate_of_life_formation_in_habitable_planets = exp(underlying normal) = exp(a)
+        c = 1 - exp(-b) = fraction_of_habitable_planets_in_which_any_life_appears
+        d = log(c)
+
+        Looking at the Taylor expansion for c = 1 - exp(-b), it's 
+        b - b^2/2 + b^3/6 - x^b/24, etc.
+        <https://www.wolframalpha.com/input?i=1-exp%28-x%29>
+        When b ~ 0 (as is often the case), this is close to b.
+
+        But now, if b ~ 0, c ~ b
+        and d = log(c) ~ log(b) = log(exp(a)) = a
+
+        Now, we could play around with estimating errors,
+        and indeed if we want b^2/2 = exp(a)^2/2 < 10^(-n), i.e., to have n decimal digits of precision,
+        we could compute this as e.g., a < (nlog(10) + log(2))/2
+        so for example if we want ten digits of precision, that's a < -11
+        
+        Empirically, the two numbers as calculated in C do become really close around 11 or so, 
+        and at 38 that calculation results in a -inf (so probably a floating point error or similar.)
+        So we should be using that formula for somewhere between -38 << a < -11
+
+        I chose -16 as a happy medium after playing around with 
+        double invert(double x){
+            return log(1-exp(-exp(-x)));
+        }
+        for(int i=0; i<64; i++){
+            double j = i;
+            printf("for %lf, log(1-exp(-exp(-x))) is calculated as... %lf\n", j, invert(j));
+        }
+        and <https://www.wolframalpha.com/input?i=log%281-exp%28-exp%28-16%29%29%29>
+    */
+    if (log_rate_of_life_formation_in_habitable_planets < -16) {
+        log_fraction_of_habitable_planets_in_which_any_life_appears = log_rate_of_life_formation_in_habitable_planets;
+    } else {
+        double rate_of_life_formation_in_habitable_planets = exp(log_rate_of_life_formation_in_habitable_planets);
+        double fraction_of_habitable_planets_in_which_any_life_appears = -expm1(-rate_of_life_formation_in_habitable_planets);
+        log_fraction_of_habitable_planets_in_which_any_life_appears = log(fraction_of_habitable_planets_in_which_any_life_appears);
+    }
+
+    double log_fraction_of_planets_with_life_in_which_intelligent_life_appears = sample_uniform(log(0.001), log(1), seed);
+    double log_fraction_of_intelligent_planets_which_are_detectable_as_such = sample_uniform(log(0.01), log(1), seed);
+    double log_longevity_of_detectable_civilizations = sample_uniform(log(100), log(10000000000), seed);
+
+    double log_n = 
+        log_rate_of_star_formation + 
+        log_fraction_of_stars_with_planets + 
+        log_number_of_habitable_planets_per_star_system + 
+        log_fraction_of_habitable_planets_in_which_any_life_appears + 
+        log_fraction_of_planets_with_life_in_which_intelligent_life_appears + 
+        log_fraction_of_intelligent_planets_which_are_detectable_as_such + 
+        log_longevity_of_detectable_civilizations;
+    return log_n;
+}
+
+double sample_are_we_alone_logspace(uint64_t * seed)
+{
+    double log_n = sample_fermi_logspace(seed);
+    return ((log_n > 0) ? 1 : 0);
+    // log_n > 0 => n > 1
+}
+
+
+int main()
+{
+
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1001; // xorshift can't start with a seed of 0
+
+    double logspace_fermi_proportion = 0;
+    int n_samples = 1000 * 1000;
+    for (int i = 0; i < n_samples; i++) {
+        double result = sample_are_we_alone_logspace(seed);
+        logspace_fermi_proportion += result;
+    }
+    double p_not_alone = logspace_fermi_proportion / n_samples;
+    printf("Probability that we are not alone: %lf (%.lf%%)\n", p_not_alone, p_not_alone * 100);
+
+    free(seed);
+}
diff --git a/CUDA/examples/core/06_dissolving_fermi_paradox/fermi.pdf b/CUDA/examples/core/06_dissolving_fermi_paradox/fermi.pdf
new file mode 100644
index 0000000..df2e828
Binary files /dev/null and b/CUDA/examples/core/06_dissolving_fermi_paradox/fermi.pdf differ
diff --git a/CUDA/examples/core/06_dissolving_fermi_paradox/naive.c b/CUDA/examples/core/06_dissolving_fermi_paradox/naive.c
new file mode 100644
index 0000000..0990a43
--- /dev/null
+++ b/CUDA/examples/core/06_dissolving_fermi_paradox/naive.c
@@ -0,0 +1,79 @@
+#include "../../../squiggle.h"
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define VERBOSE 0
+
+double sample_loguniform(double a, double b, uint64_t* seed)
+{
+    return exp(sample_uniform(log(a), log(b), seed));
+}
+
+int main()
+{
+    // Replicate <https://arxiv.org/pdf/1806.02404.pdf>, and in particular the red line in page 11.
+    // Could also be interesting to just produce and save many samples.
+
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = UINT64_MAX / 64; // xorshift can't start with a seed of 0
+
+    // Do this naïvely, without worrying that much about numerical precision
+    double sample_fermi_naive(uint64_t * seed)
+    {
+        double rate_of_star_formation = sample_loguniform(1, 100, seed);
+        double fraction_of_stars_with_planets = sample_loguniform(0.1, 1, seed);
+        double number_of_habitable_planets_per_star_system = sample_loguniform(0.1, 1, seed);
+        double rate_of_life_formation_in_habitable_planets = sample_lognormal(1, 50, seed);
+        double fraction_of_habitable_planets_in_which_any_life_appears = -expm1(-rate_of_life_formation_in_habitable_planets);
+        // double fraction_of_habitable_planets_in_which_any_life_appears = 1-exp(-rate_of_life_formation_in_habitable_planets);
+        // but with more precision
+        double fraction_of_planets_with_life_in_which_intelligent_life_appears = sample_loguniform(0.001, 1, seed);
+        double fraction_of_intelligent_planets_which_are_detectable_as_such = sample_loguniform(0.01, 1, seed);
+        double longevity_of_detectable_civilizations = sample_loguniform(100, 10000000000, seed);
+
+        if(VERBOSE) printf(" rate_of_star_formation = %lf\n", rate_of_star_formation);
+        if(VERBOSE) printf(" fraction_of_stars_with_planets = %lf\n", fraction_of_stars_with_planets);
+        if(VERBOSE) printf(" number_of_habitable_planets_per_star_system = %lf\n", number_of_habitable_planets_per_star_system);
+        if(VERBOSE) printf(" rate_of_life_formation_in_habitable_planets = %.16lf\n", rate_of_life_formation_in_habitable_planets);
+        if(VERBOSE) printf(" fraction_of_habitable_planets_in_which_any_life_appears = %lf\n", fraction_of_habitable_planets_in_which_any_life_appears);
+        if(VERBOSE) printf(" fraction_of_planets_with_life_in_which_intelligent_life_appears = %lf\n", fraction_of_planets_with_life_in_which_intelligent_life_appears);
+        if(VERBOSE) printf(" fraction_of_intelligent_planets_which_are_detectable_as_such = %lf\n", fraction_of_intelligent_planets_which_are_detectable_as_such);
+        if(VERBOSE) printf(" longevity_of_detectable_civilizations = %lf\n", longevity_of_detectable_civilizations);
+
+        // Expected number of civilizations in the Milky way;
+        // see footnote 3 (p. 5)
+        double n = rate_of_star_formation * fraction_of_stars_with_planets * number_of_habitable_planets_per_star_system * fraction_of_habitable_planets_in_which_any_life_appears * fraction_of_planets_with_life_in_which_intelligent_life_appears * fraction_of_intelligent_planets_which_are_detectable_as_such * longevity_of_detectable_civilizations;
+
+        return n;
+    }
+
+    double sample_are_we_alone_naive(uint64_t * seed)
+    {
+        double n = sample_fermi_naive(seed);
+        return ((n > 1) ? 1 : 0);
+    }
+
+    double n = 1000000;
+    double naive_fermi_proportion = 0;
+    for (int i = 0; i < n; i++) {
+        double result = sample_are_we_alone_naive(seed);
+        if(VERBOSE) printf("result: %lf\n", result);
+        naive_fermi_proportion += result;
+    }
+    printf("Naïve %% that we are not alone: %lf\n", naive_fermi_proportion / n);
+
+    free(seed);
+
+    /* 
+        double invert(double x){
+            return log(1-exp(-exp(-x)));
+        }
+        for(int i=0; i<64; i++){
+            double j = i;
+            printf("for %lf, log(1-exp(-exp(-x))) is calculated as... %lf\n", j, invert(j));
+        }
+    */
+}
diff --git a/CUDA/examples/core/06_dissolving_fermi_paradox/scratchpad b/CUDA/examples/core/06_dissolving_fermi_paradox/scratchpad
new file mode 100755
index 0000000..f3010c5
Binary files /dev/null and b/CUDA/examples/core/06_dissolving_fermi_paradox/scratchpad differ
diff --git a/CUDA/examples/core/makefile b/CUDA/examples/core/makefile
new file mode 100644
index 0000000..f89d0e4
--- /dev/null
+++ b/CUDA/examples/core/makefile
@@ -0,0 +1,91 @@
+# Interface: 
+# make all 
+# make format-all
+# make run-all
+# make one             DIR=01_one_sample
+# make format-one      DIR=01_one_sample
+# make run-one         DIR=01_one_sample
+# make time-linux-one  DIR=01_one_sample
+# make profile-one     DIR=01_one_sample
+
+# Compiler
+CC=gcc
+# CC=tcc # <= faster compilation
+
+# Main file
+SRC=example.c
+OUTPUT=example
+
+## Dependencies
+SQUIGGLE=../../squiggle.c
+MATH=-lm
+DEPS=$(SQUIGGLE) $(MATH)
+
+## Flags
+# DEBUG=-fsanitize=address,undefined -fanalyzer
+# DEBUG=-g
+# DEBUG=
+WARN=-Wall -Wextra -Wdouble-promotion -Wconversion
+STANDARD=-std=c99
+OPTIMIZED=-O3  #-Ofast
+
+## Formatter
+STYLE_BLUEPRINT=webkit
+FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
+
+## make all
+all:
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 00_example_template/$(SRC) $(DEPS) -o 00_example_template/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 01_one_sample/$(SRC) $(DEPS) -o 01_one_sample/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 02_time_to_botec/$(SRC) $(DEPS) -o 02_time_to_botec/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN)  03_gcc_nested_function/$(SRC) $(DEPS) -o 03_gcc_nested_function/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 04_gamma_beta/$(SRC)          $(DEPS) -o 04_gamma_beta/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 05_hundred_lognormals/$(SRC)  $(DEPS) -o 05_hundred_lognormals/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 06_dissolving_fermi_paradox/$(SRC)  $(DEPS) -o 06_dissolving_fermi_paradox/$(OUTPUT)
+
+
+format-all:
+	$(FORMATTER) 00_example_template/$(SRC)
+	$(FORMATTER) 01_one_sample/$(SRC)
+	$(FORMATTER) 02_time_to_botec/$(SRC)
+	$(FORMATTER) 03_gcc_nested_function/$(SRC)
+	$(FORMATTER) 04_gamma_beta/$(SRC)
+	$(FORMATTER) 05_hundred_lognormals/$(SRC)
+	$(FORMATTER) 06_dissolving_fermi_paradox/$(SRC)
+
+run-all:
+	00_example_template/$(OUTPUT)
+	01_one_sample/$(OUTPUT)
+	02_time_to_botec/$(OUTPUT)
+	03_gcc_nested_function/$(OUTPUT)
+	04_gamma_beta/$(OUTPUT)
+	05_hundred_lognormals/$(OUTPUT)
+	06_dissolving_fermi_paradox/$(OUTPUT)
+	
+## make one DIR=01_one_sample
+one: $(DIR)/$(SRC)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) $(DIR)/$(SRC) $(DEPS) -o $(DIR)/$(OUTPUT)
+
+## make format-one DIR=01_one_sample
+format-one: $(DIR)/$(SRC)
+	$(FORMATTER) $(DIR)/$(SRC)
+
+## make run-one DIR=01_one_sample
+run-one: $(DIR)/$(OUTPUT)
+	$(DIR)/$(OUTPUT) && echo
+
+## make time-linux-one DIR=01_one_sample
+time-linux-one: $(DIR)/$(OUTPUT)
+	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
+	@echo "Running 100x and taking avg time $(DIR)/$(OUTPUT)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(DIR)/$(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
+
+## e.g., make profile-linux-one DIR=01_one_sample
+profile-linux-one: 
+	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
+	echo "Must be run as sudo"
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) $(DIR)/$(SRC) $(DEPS) -o $(DIR)/$(OUTPUT)
+	# $(CC) $(SRC) $(DEPS) -o $(OUTPUT)
+	sudo perf record $(DIR)/$(OUTPUT)
+	sudo perf report
+	rm perf.data
diff --git a/CUDA/examples/more/00_example_template/example b/CUDA/examples/more/00_example_template/example
new file mode 100755
index 0000000..ac90d49
Binary files /dev/null and b/CUDA/examples/more/00_example_template/example differ
diff --git a/CUDA/examples/more/00_example_template/example.c b/CUDA/examples/more/00_example_template/example.c
new file mode 100644
index 0000000..f5ee07e
--- /dev/null
+++ b/CUDA/examples/more/00_example_template/example.c
@@ -0,0 +1,19 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+double sample_model(uint64_t* seed){
+    return sample_to(1, 10, seed);
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    // ...
+
+    free(seed);
+}
diff --git a/CUDA/examples/more/02_ci_beta/example b/CUDA/examples/more/02_ci_beta/example
new file mode 100755
index 0000000..27effbb
Binary files /dev/null and b/CUDA/examples/more/02_ci_beta/example differ
diff --git a/CUDA/examples/more/02_ci_beta/example.c b/CUDA/examples/more/02_ci_beta/example.c
new file mode 100644
index 0000000..fd9581c
--- /dev/null
+++ b/CUDA/examples/more/02_ci_beta/example.c
@@ -0,0 +1,30 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+// Estimate functions
+double sample_beta_3_2(uint64_t* seed)
+{
+    return sample_beta(3.0, 2.0, seed);
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n_samples = 1 * MILLION;
+    double* xs = malloc(sizeof(double) * (size_t)n_samples);
+    for (int i = 0; i < n_samples; i++) {
+        xs[i] = sample_beta_3_2(seed);
+    }
+
+    printf("\n# Stats\n");
+    array_print_stats(xs, n_samples);
+    printf("\n# Histogram\n");
+    array_print_histogram(xs, n_samples, 23);
+
+    free(seed);
+}
diff --git a/CUDA/examples/more/03_ci_beta_parallel/example b/CUDA/examples/more/03_ci_beta_parallel/example
new file mode 100755
index 0000000..73819d5
Binary files /dev/null and b/CUDA/examples/more/03_ci_beta_parallel/example differ
diff --git a/CUDA/examples/more/03_ci_beta_parallel/example.c b/CUDA/examples/more/03_ci_beta_parallel/example.c
new file mode 100644
index 0000000..ebe28d9
--- /dev/null
+++ b/CUDA/examples/more/03_ci_beta_parallel/example.c
@@ -0,0 +1,28 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+// Estimate functions
+double sample_beta_3_2(uint64_t* seed)
+{
+    return sample_beta(3.0, 2.0, seed);
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n_samples = 1 * MILLION;
+    double* xs = malloc(sizeof(double) * (size_t)n_samples);
+    sampler_parallel(sample_beta_3_2, xs, 16, n_samples);
+
+    printf("\n# Stats\n");
+    array_print_stats(xs, n_samples);
+    printf("\n# Histogram\n");
+    array_print_histogram(xs, n_samples, 23);
+
+    free(seed);
+}
diff --git a/CUDA/examples/more/04_nuclear_war/example b/CUDA/examples/more/04_nuclear_war/example
new file mode 100755
index 0000000..a22a449
Binary files /dev/null and b/CUDA/examples/more/04_nuclear_war/example differ
diff --git a/CUDA/examples/more/04_nuclear_war/example.c b/CUDA/examples/more/04_nuclear_war/example.c
new file mode 100644
index 0000000..065815c
--- /dev/null
+++ b/CUDA/examples/more/04_nuclear_war/example.c
@@ -0,0 +1,63 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+double probability_of_dying_nuno(uint64_t* seed)
+{
+    double first_year_russian_nuclear_weapons = 1953;
+    double current_year = 2022;
+    double laplace_probability_nuclear_exchange_year = sample_beta(1, current_year - first_year_russian_nuclear_weapons + 1, seed);
+    double laplace_probability_nuclear_exchange_month = 1 - pow(1 - laplace_probability_nuclear_exchange_year, (1.0 / 12.0));
+
+    double london_hit_conditional_on_russia_nuclear_weapon_usage = sample_beta(7.67, 69.65, seed);
+    // I.e., a beta distribution with a range of 0.05 to 0.16 into: https://nunosempere.com/blog/2023/03/15/fit-beta/
+    // 0.05 were my estimate and Samotsvety's estimate in March 2022, respectively:
+    // https://forum.effectivealtruism.org/posts/KRFXjCqqfGQAYirm5/samotsvety-nuclear-risk-forecasts-march-2022#Nu_o_Sempere
+    double informed_actor_not_able_to_escape = sample_beta(3.26212166586967, 3.26228162008564, seed);
+    // 0.2 to 0.8, i.e., 20% to 80%, again using the previous tool
+    double proportion_which_die_if_bomb_drops_in_london = sample_beta(10.00, 2.45, seed); // 60% to 95%
+
+    double probability_of_dying = laplace_probability_nuclear_exchange_month * london_hit_conditional_on_russia_nuclear_weapon_usage * informed_actor_not_able_to_escape * proportion_which_die_if_bomb_drops_in_london;
+    return probability_of_dying;
+}
+
+double probability_of_dying_eli(uint64_t* seed)
+{
+    double russia_nato_nuclear_exchange_in_next_month = sample_beta(1.30, 1182.99, seed); // .0001 to .003
+    double london_hit_conditional = sample_beta(3.47, 8.97, seed); // 0.1 to 0.5
+    double informed_actors_not_able_to_escape = sample_beta(2.73, 5.67, seed); // .1 to .6
+    double proportion_which_die_if_bomb_drops_in_london = sample_beta(3.00, 1.46, seed); // 0.3 to 0.95;
+
+    double probability_of_dying = russia_nato_nuclear_exchange_in_next_month * london_hit_conditional * informed_actors_not_able_to_escape * proportion_which_die_if_bomb_drops_in_london;
+    return probability_of_dying;
+}
+
+double sample_nuclear_model(uint64_t* seed)
+{
+    double (*samplers[])(uint64_t*) = { probability_of_dying_nuno, probability_of_dying_eli };
+    double weights[] = { 0.5, 0.5 };
+    return sample_mixture(samplers, weights, 2, seed);
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n = 1 * MILLION;
+    double* xs = malloc(sizeof(double) * (size_t)n);
+    for (int i = 0; i < n; i++) {
+        xs[i] = sample_nuclear_model(seed);
+    }
+
+    printf("\n# Stats\n");
+    array_print_stats(xs, n);
+    printf("\n# Histogram\n");
+    array_print_90_ci_histogram(xs, n, 20);
+
+    free(xs);
+    free(seed);
+}
diff --git a/CUDA/examples/more/04_nuclear_war/scratchpad/example b/CUDA/examples/more/04_nuclear_war/scratchpad/example
new file mode 100755
index 0000000..5ed948d
Binary files /dev/null and b/CUDA/examples/more/04_nuclear_war/scratchpad/example differ
diff --git a/CUDA/examples/more/04_nuclear_war/scratchpad/example.c b/CUDA/examples/more/04_nuclear_war/scratchpad/example.c
new file mode 100644
index 0000000..3c3e45c
--- /dev/null
+++ b/CUDA/examples/more/04_nuclear_war/scratchpad/example.c
@@ -0,0 +1,20 @@
+#include "../../../squiggle.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+		double firstYearRussianNuclearWeapons = 1953;
+		double	currentYear = 2023;
+
+		for(int i=0; i<10; i++){
+			double laplace_beta = sample_beta(currentYear-firstYearRussianNuclearWeapons + 1, 1, seed);
+			printf("%f\n", laplace_beta);
+		}
+		
+    free(seed);
+}
diff --git a/CUDA/examples/more/04_nuclear_war/scratchpad/makefile b/CUDA/examples/more/04_nuclear_war/scratchpad/makefile
new file mode 100644
index 0000000..989c057
--- /dev/null
+++ b/CUDA/examples/more/04_nuclear_war/scratchpad/makefile
@@ -0,0 +1,53 @@
+# Interface: 
+#   make
+#   make build
+#   make format
+#   make run
+
+# Compiler
+CC=gcc
+# CC=tcc # <= faster compilation
+
+# Main file
+SRC=example.c ../../../squiggle.c
+OUTPUT=example
+
+## Dependencies
+MATH=-lm
+
+## Flags
+DEBUG= #'-g'
+STANDARD=-std=c99
+WARNINGS=-Wall
+OPTIMIZED=-O3  #-Ofast
+# OPENMP=-fopenmp
+
+## Formatter
+STYLE_BLUEPRINT=webkit
+FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
+
+## make build
+build: $(SRC)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
+
+format: $(SRC)
+	$(FORMATTER) $(SRC)
+
+run: $(SRC) $(OUTPUT)
+	OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
+
+time-linux: 
+	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
+	
+	@echo "Running 100x and taking avg time $(OUTPUT)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
+
+## Profiling
+
+profile-linux: 
+	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
+	echo "Must be run as sudo"
+	$(CC) $(SRC) $(MATH) -o $(OUTPUT)
+	sudo perf record ./$(OUTPUT)
+	sudo perf report
+	rm perf.data
diff --git a/CUDA/examples/more/05_burn_10kg_fat/example b/CUDA/examples/more/05_burn_10kg_fat/example
new file mode 100755
index 0000000..ba9d49b
Binary files /dev/null and b/CUDA/examples/more/05_burn_10kg_fat/example differ
diff --git a/CUDA/examples/more/05_burn_10kg_fat/example.c b/CUDA/examples/more/05_burn_10kg_fat/example.c
new file mode 100644
index 0000000..b690ec5
--- /dev/null
+++ b/CUDA/examples/more/05_burn_10kg_fat/example.c
@@ -0,0 +1,43 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+double sample_minutes_per_day_jumping_rope_needed_to_burn_10kg(uint64_t* seed)
+{
+    double kcal_jumping_rope_minute = sample_to(15, 20, seed);
+    double kcal_jumping_rope_hour = kcal_jumping_rope_minute * 60;
+
+    double kcal_in_kg_of_fat = 7700;
+    double num_kg_of_fat_to_lose = 10;
+
+    double hours_jumping_rope_needed = kcal_in_kg_of_fat * num_kg_of_fat_to_lose / kcal_jumping_rope_hour;
+
+    double days_until_end_of_year = 152; // as of 2023-08-01
+    double hours_per_day = hours_jumping_rope_needed / days_until_end_of_year;
+    double minutes_per_day = hours_per_day * 60;
+    return minutes_per_day;
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n = 1000 * 1000;
+    double* xs = malloc(sizeof(double) * (size_t)n);
+    for (int i = 0; i < n; i++) {
+        xs[i] = sample_minutes_per_day_jumping_rope_needed_to_burn_10kg(seed);
+    }
+
+    printf("## How many minutes per day do I have to jump rope to lose 10kg of fat by the end of the year?\n");
+
+    printf("\n# Stats\n");
+    array_print_stats(xs, n);
+    printf("\n# Histogram\n");
+    array_print_histogram(xs, n, 23);
+
+    free(seed);
+}
diff --git a/CUDA/examples/more/06_nuclear_recovery/example b/CUDA/examples/more/06_nuclear_recovery/example
new file mode 100755
index 0000000..c5ade09
Binary files /dev/null and b/CUDA/examples/more/06_nuclear_recovery/example differ
diff --git a/CUDA/examples/more/06_nuclear_recovery/example.c b/CUDA/examples/more/06_nuclear_recovery/example.c
new file mode 100644
index 0000000..9d3e3ae
--- /dev/null
+++ b/CUDA/examples/more/06_nuclear_recovery/example.c
@@ -0,0 +1,84 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+double yearly_probability_nuclear_collapse(double year, uint64_t* seed)
+{
+    double successes = 0;
+    double failures = (year - 1960);
+    return sample_laplace(successes, failures, seed);
+    // ^ can change to (successes + 1)/(trials + 2)
+    // to get a probability,
+    // rather than sampling from a distribution over probabilities.
+}
+double yearly_probability_nuclear_collapse_2023(uint64_t* seed)
+{
+    return yearly_probability_nuclear_collapse(2023, seed);
+}
+
+double yearly_probability_nuclear_collapse_after_recovery(double year, double rebuilding_period_length_years, uint64_t* seed)
+{
+    // assumption: nuclear
+    double successes = 1.0;
+    double failures = (year - rebuilding_period_length_years - 1960 - 1);
+    return sample_laplace(successes, failures, seed);
+}
+double yearly_probability_nuclear_collapse_after_recovery_example(uint64_t* seed)
+{
+    double year = 2070;
+    double rebuilding_period_length_years = 30;
+    // So, there was a nuclear collapse in 2040,
+    // then a recovery period of 30 years
+    // and it's now 2070
+    return yearly_probability_nuclear_collapse_after_recovery(year, rebuilding_period_length_years, seed);
+}
+
+double yearly_probability_nuclear_collapse_after_recovery_antiinductive(uint64_t* seed)
+{
+    return yearly_probability_nuclear_collapse(2023, seed) / 2;
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n_samples = 1000000;
+
+    // Before a first nuclear collapse
+    printf("## Before the first nuclear collapse\n");
+    double* yearly_probability_nuclear_collapse_2023_samples = malloc(sizeof(double) * (size_t)n_samples);
+    for (int i = 0; i < n_samples; i++) {
+        yearly_probability_nuclear_collapse_2023_samples[i] = yearly_probability_nuclear_collapse_2023(seed);
+    }
+    ci ci_90_2023 = array_get_90_ci(yearly_probability_nuclear_collapse_2023_samples, n_samples);
+    printf("90%% confidence interval: [%f, %f]\n", ci_90_2023.low, ci_90_2023.high);
+
+    // After the first nuclear collapse
+    printf("\n## After the first nuclear collapse\n");
+
+    double* yearly_probability_nuclear_collapse_after_recovery_samples = malloc(sizeof(double) * (size_t)n_samples);
+    for (int i = 0; i < n_samples; i++) {
+        yearly_probability_nuclear_collapse_after_recovery_samples[i] = yearly_probability_nuclear_collapse_after_recovery_example(seed);
+    }
+    ci ci_90_2070 = array_get_90_ci(yearly_probability_nuclear_collapse_after_recovery_samples, 1000000);
+    printf("90%% confidence interval: [%f, %f]\n", ci_90_2070.low, ci_90_2070.high);
+
+    // After the first nuclear collapse (antiinductive)
+    printf("\n## After the first nuclear collapse (antiinductive)\n");
+    double* yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples = malloc(sizeof(double) * (size_t)n_samples);
+    for (int i = 0; i < n_samples; i++) {
+        yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples[i] = yearly_probability_nuclear_collapse_after_recovery_antiinductive(seed);
+    }
+    ci ci_90_antiinductive = array_get_90_ci(yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples, 1000000);
+    printf("90%% confidence interval: [%f, %f]\n", ci_90_antiinductive.low, ci_90_antiinductive.high);
+
+    // free seeds
+    free(yearly_probability_nuclear_collapse_2023_samples);
+    free(yearly_probability_nuclear_collapse_after_recovery_samples);
+    free(yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples);
+    free(seed);
+}
diff --git a/CUDA/examples/more/07_algebra/example b/CUDA/examples/more/07_algebra/example
new file mode 100755
index 0000000..6b15091
Binary files /dev/null and b/CUDA/examples/more/07_algebra/example differ
diff --git a/CUDA/examples/more/07_algebra/example.c b/CUDA/examples/more/07_algebra/example.c
new file mode 100644
index 0000000..65b5cd6
--- /dev/null
+++ b/CUDA/examples/more/07_algebra/example.c
@@ -0,0 +1,26 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    normal_params n1 = { .mean = 1.0, .std = 3.0 };
+    normal_params n2 = { .mean = 2.0, .std = 4.0 };
+    normal_params sn = algebra_sum_normals(n1, n2);
+    printf("The sum of Normal(%f, %f) and Normal(%f, %f) is Normal(%f, %f)\n",
+        n1.mean, n1.std, n2.mean, n2.std, sn.mean, sn.std);
+
+    lognormal_params ln1 = { .logmean = 1.0, .logstd = 3.0 };
+    lognormal_params ln2 = { .logmean = 2.0, .logstd = 4.0 };
+    lognormal_params sln = algebra_product_lognormals(ln1, ln2);
+    printf("The product of Lognormal(%f, %f) and Lognormal(%f, %f) is Lognormal(%f, %f)\n",
+        ln1.logmean, ln1.logstd, ln2.logmean, ln2.logstd, sln.logmean, sln.logstd);
+
+    free(seed);
+}
diff --git a/CUDA/examples/more/08_algebra_and_conversion/example b/CUDA/examples/more/08_algebra_and_conversion/example
new file mode 100755
index 0000000..e676eb3
Binary files /dev/null and b/CUDA/examples/more/08_algebra_and_conversion/example differ
diff --git a/CUDA/examples/more/08_algebra_and_conversion/example.c b/CUDA/examples/more/08_algebra_and_conversion/example.c
new file mode 100644
index 0000000..3f2a2c7
--- /dev/null
+++ b/CUDA/examples/more/08_algebra_and_conversion/example.c
@@ -0,0 +1,33 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    // Convert to 90% confidence interval form and back
+    lognormal_params ln1 = { .logmean = 1.0, .logstd = 3.0 };
+    ci ln1_ci = convert_lognormal_params_to_ci(ln1);
+    printf("The 90%% confidence interval of Lognormal(%f, %f) is [%f, %f]\n",
+        ln1.logmean, ln1.logstd,
+        ln1_ci.low, ln1_ci.high);
+    lognormal_params ln1_params2 = convert_ci_to_lognormal_params(ln1_ci);
+    printf("The lognormal which has 90%% confidence interval [%f, %f] is Lognormal(%f, %f)\n",
+        ln1_ci.low, ln1_ci.high,
+        ln1_params2.logmean, ln1_params2.logstd);
+
+    lognormal_params ln2 = convert_ci_to_lognormal_params((ci) { .low = 1, .high = 10 });
+    lognormal_params ln3 = convert_ci_to_lognormal_params((ci) { .low = 5, .high = 50 });
+
+    lognormal_params sln = algebra_product_lognormals(ln2, ln3);
+    ci sln_ci = convert_lognormal_params_to_ci(sln);
+
+    printf("Result of some lognormal products: to(%f, %f)\n", sln_ci.low, sln_ci.high);
+
+    free(seed);
+}
diff --git a/CUDA/examples/more/09_ergonomic_algebra/example b/CUDA/examples/more/09_ergonomic_algebra/example
new file mode 100755
index 0000000..cfddc7e
Binary files /dev/null and b/CUDA/examples/more/09_ergonomic_algebra/example differ
diff --git a/CUDA/examples/more/09_ergonomic_algebra/example.c b/CUDA/examples/more/09_ergonomic_algebra/example.c
new file mode 100644
index 0000000..0cd9adf
--- /dev/null
+++ b/CUDA/examples/more/09_ergonomic_algebra/example.c
@@ -0,0 +1,26 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define ln lognormal_params
+#define to(...) convert_ci_to_lognormal_params((ci)__VA_ARGS__)
+#define from(...) convert_lognormal_params_to_ci((ln)__VA_ARGS__)
+#define times(a, b) algebra_product_lognormals(a, b)
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    ln a = to({ .low = 1, .high = 10 });
+    ln b = to({ .low = 5, .high = 500 });
+    ln c = times(a, b);
+
+    printf("Result: to(%f, %f)\n", from(c).low, from(c).high);
+    printf("One sample from it is: %f\n", sample_lognormal(c.logmean, c.logstd, seed));
+
+    free(seed);
+}
diff --git a/CUDA/examples/more/10_twitter_thread_example/example b/CUDA/examples/more/10_twitter_thread_example/example
new file mode 100755
index 0000000..1f9061d
Binary files /dev/null and b/CUDA/examples/more/10_twitter_thread_example/example differ
diff --git a/CUDA/examples/more/10_twitter_thread_example/example.c b/CUDA/examples/more/10_twitter_thread_example/example.c
new file mode 100644
index 0000000..7ddd0ec
--- /dev/null
+++ b/CUDA/examples/more/10_twitter_thread_example/example.c
@@ -0,0 +1,49 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+double sample_0(uint64_t* seed)
+{
+    UNUSED(seed);
+    return 0;
+}
+
+double sample_1(uint64_t* seed)
+{
+    UNUSED(seed);
+    return 1;
+}
+
+double sample_normal_mean_1_std_2(uint64_t* seed)
+{
+    return sample_normal(1, 2, seed);
+}
+
+double sample_1_to_3(uint64_t* seed)
+{
+    return sample_to(1, 3, seed);
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+    int n_dists = 4;
+    double weights[] = { 1, 2, 3, 4 };
+    double (*samplers[])(uint64_t*) = {
+        sample_0,
+        sample_1,
+        sample_normal_mean_1_std_2,
+        sample_1_to_3
+    };
+
+    int n_samples = 10;
+    for (int i = 0; i < n_samples; i++) {
+        printf("Sample #%d: %f\n", i, sample_mixture(samplers, weights, n_dists, seed));
+    }
+
+    free(seed);
+}
diff --git a/CUDA/examples/more/11_billion_lognormals_paralell/example b/CUDA/examples/more/11_billion_lognormals_paralell/example
new file mode 100755
index 0000000..e1eb2f8
Binary files /dev/null and b/CUDA/examples/more/11_billion_lognormals_paralell/example differ
diff --git a/CUDA/examples/more/11_billion_lognormals_paralell/example.c b/CUDA/examples/more/11_billion_lognormals_paralell/example.c
new file mode 100644
index 0000000..4840304
--- /dev/null
+++ b/CUDA/examples/more/11_billion_lognormals_paralell/example.c
@@ -0,0 +1,30 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+double sample_model(uint64_t * seed)
+{
+    return sample_lognormal(0, 10, seed);
+}
+// Estimate functions
+int main()
+{
+    // set randomness seed
+    // uint64_t* seed = malloc(sizeof(uint64_t));
+    // *seed = 1000; // xorshift can't start with 0
+    // ^ not necessary, because sampler_parallel takes care of the seed.
+
+    int n_samples = 1000 * 1000 * 1000;
+    int n_threads = 16;
+    double* results = malloc((size_t)n_samples * sizeof(double));
+
+    sampler_parallel(sample_model, results, n_threads, n_samples);
+    double avg = array_sum(results, n_samples) / n_samples;
+    printf("Average of 1B lognormal(0,10): %f\n", avg);
+
+    free(results);
+
+    // free(seed);
+    // ^ not necessary, because sampler_parallel takes care of the seed.
+}
diff --git a/CUDA/examples/more/12_time_to_botec_parallel/example b/CUDA/examples/more/12_time_to_botec_parallel/example
new file mode 100755
index 0000000..7cfdd49
Binary files /dev/null and b/CUDA/examples/more/12_time_to_botec_parallel/example differ
diff --git a/CUDA/examples/more/12_time_to_botec_parallel/example.c b/CUDA/examples/more/12_time_to_botec_parallel/example.c
new file mode 100644
index 0000000..07d181c
--- /dev/null
+++ b/CUDA/examples/more/12_time_to_botec_parallel/example.c
@@ -0,0 +1,31 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+double sampler_result(uint64_t * seed)
+{
+    double p_a = 0.8;
+    double p_b = 0.5;
+    double p_c = p_a * p_b;
+
+    double sample_0(uint64_t * seed) { UNUSED(seed); return 0; }
+    double sample_1(uint64_t * seed) { UNUSED(seed); return 1; }
+    double sample_few(uint64_t * seed) { return sample_to(1, 3, seed); }
+    double sample_many(uint64_t * seed) { return sample_to(2, 10, seed); }
+
+    int n_dists = 4;
+    double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
+    double (*samplers[])(uint64_t*) = { sample_0, sample_1, sample_few, sample_many };
+    return sample_mixture(samplers, weights, n_dists, seed);
+}
+
+int main()
+{
+
+    int n_samples = 1000 * 1000, n_threads = 16;
+    double* results = malloc((size_t)n_samples * sizeof(double));
+    sampler_parallel(sampler_result, results, n_threads, n_samples);
+    printf("Avg: %f\n", array_sum(results, n_samples) / n_samples);
+    free(results);
+}
diff --git a/CUDA/examples/more/13_parallelize_min/example b/CUDA/examples/more/13_parallelize_min/example
new file mode 100755
index 0000000..aba62d9
Binary files /dev/null and b/CUDA/examples/more/13_parallelize_min/example differ
diff --git a/CUDA/examples/more/13_parallelize_min/example.c b/CUDA/examples/more/13_parallelize_min/example.c
new file mode 100644
index 0000000..288b281
--- /dev/null
+++ b/CUDA/examples/more/13_parallelize_min/example.c
@@ -0,0 +1,70 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int main()
+{
+    /* Question: can we parallelize this?
+    A = normal(5,2)
+    B = min(A)
+    B * 20
+    */
+
+    /* Option 1: parallelize taking from n samples */
+    // Question being asked: what is the distribution of sampling 1000 times and taking the min?
+    double sample_min_of_n(uint64_t * seed, int n)
+    {
+        double min = sample_normal(5, 2, seed);
+        for (int i = 0; i < (n - 2); i++) {
+            double sample = sample_normal(5, 2, seed);
+            if (sample < min) {
+                min = sample;
+            }
+        }
+        return min;
+    }
+    double sample_min_of_1000(uint64_t * seed)
+    {
+        return sample_min_of_n(seed, 1000);
+    }
+
+    int n_samples = 1000000, n_threads = 16;
+    double* results = malloc((size_t)n_samples * sizeof(double));
+    sampler_parallel(sample_min_of_1000, results, n_threads, n_samples);
+    printf("Mean of the distribution of (taking the min of 1000 samples of a normal(5,2)): %f\n", array_mean(results, n_samples));
+    free(results);
+
+    /* Option 2: take the min from n samples cleverly using parallelism */
+    // Question being asked: can we take the min of n samples cleverly?
+    double sample_n_parallel(int n)
+    {
+
+        int n_threads = 16;
+        int quotient = n / 16;
+        int remainder = n % 16;
+
+        uint64_t seed = 1000;
+        double result_remainder = sample_min_of_n(&seed, remainder);
+
+        double sample_min_of_quotient(uint64_t * seed)
+        {
+            return sample_min_of_n(seed, quotient);
+        }
+        double* results_quotient = malloc((size_t)quotient * sizeof(double));
+        sampler_parallel(sample_min_of_quotient, results_quotient, n_threads, quotient);
+
+        double min = results_quotient[0];
+        for (int i = 1; i < quotient; i++) {
+            if (min > results_quotient[i]) {
+                min = results_quotient[i];
+            }
+        }
+        if (min > result_remainder) {
+            min = result_remainder;
+        }
+        free(results_quotient);
+        return min;
+    }
+    printf("Minimum of 1M samples of normal(5,2): %f\n", sample_n_parallel(1000000));
+}
diff --git a/CUDA/examples/more/14_check_confidence_interval/example b/CUDA/examples/more/14_check_confidence_interval/example
new file mode 100755
index 0000000..b890551
Binary files /dev/null and b/CUDA/examples/more/14_check_confidence_interval/example differ
diff --git a/CUDA/examples/more/14_check_confidence_interval/example.c b/CUDA/examples/more/14_check_confidence_interval/example.c
new file mode 100644
index 0000000..400b814
--- /dev/null
+++ b/CUDA/examples/more/14_check_confidence_interval/example.c
@@ -0,0 +1,22 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with a seed of 0
+
+    int n = 1000000;
+    double* xs = malloc(sizeof(double) * (size_t)n);
+    for (int i = 0; i < n; i++) {
+        xs[i] = sample_to(10, 100, seed);
+    }
+    ci ci_90 = array_get_90_ci(xs, n);
+    printf("Recovering confidence interval of sample_to(10, 100):\n  low: %f, high: %f\n", ci_90.low, ci_90.high);
+
+    free(xs);
+    free(seed);
+}
diff --git a/CUDA/examples/more/15_time_to_botec_custom_mixture/example b/CUDA/examples/more/15_time_to_botec_custom_mixture/example
new file mode 100755
index 0000000..de627d4
Binary files /dev/null and b/CUDA/examples/more/15_time_to_botec_custom_mixture/example differ
diff --git a/CUDA/examples/more/15_time_to_botec_custom_mixture/example.c b/CUDA/examples/more/15_time_to_botec_custom_mixture/example.c
new file mode 100644
index 0000000..47577f8
--- /dev/null
+++ b/CUDA/examples/more/15_time_to_botec_custom_mixture/example.c
@@ -0,0 +1,34 @@
+#include "../../../squiggle.h"
+#include "../../../squiggle_more.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+double cumsum_p0 = 0.6;
+double cumsum_p1 = 0.8;
+double cumsum_p2 = 0.9;
+double cumsum_p3 = 1.0;
+
+double sampler_result(uint64_t * seed)
+{
+
+    double p = sample_uniform(0, 1, seed);
+    if(p< cumsum_p0){
+        return 0;
+    } else if (p < cumsum_p1){
+        return 1;
+    } else if (p < cumsum_p2){
+        return sample_to(1,3, seed);
+    } else {
+        return sample_to(2, 10, seed);
+    } 
+}
+
+int main()
+{
+
+    int n_samples = 1000 * 1000, n_threads = 16;
+    double* results = malloc((size_t)n_samples * sizeof(double));
+    sampler_parallel(sampler_result, results, n_threads, n_samples);
+    printf("Avg: %f\n", array_sum(results, n_samples) / n_samples);
+    free(results);
+}
diff --git a/CUDA/examples/more/makefile b/CUDA/examples/more/makefile
new file mode 100644
index 0000000..6b4b7dc
--- /dev/null
+++ b/CUDA/examples/more/makefile
@@ -0,0 +1,117 @@
+# Interface: 
+# make all 
+# make format-all
+# make run-all
+# make one             DIR=06_nuclear_recovery
+# make format-one      DIR=06_nuclear_recovery
+# make run-one         DIR=06_nuclear_recovery
+# make time-linux-one  DIR=06_nuclear_recovery
+# make profile-one     DIR=06_nuclear_recovery
+
+# Compiler
+CC=gcc
+# CC=tcc # <= faster compilation
+
+# Main file
+SRC=example.c
+OUTPUT=example
+
+## Dependencies
+SQUIGGLE=../../squiggle.c 
+SQUIGGLE_MORE=../../squiggle_more.c
+MATH=-lm
+OPENMP=-fopenmp
+DEPS=$(SQUIGGLE) $(SQUIGGLE_MORE) $(MATH) $(OPENMP)
+
+## Flags
+# DEBUG=-fsanitize=address,undefined
+# DEBUG=-g
+DEBUG=
+WARN=-Wall -Wextra -Wdouble-promotion -Wconversion
+STANDARD=-std=c99
+WARNINGS=-Wall
+OPTIMIZED=-O3  #-Ofast
+
+## Formatter
+STYLE_BLUEPRINT=webkit
+FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
+
+## make all
+all:
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 00_example_template/$(SRC)             $(DEPS) -o 00_example_template/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 02_ci_beta/$(SRC)                      $(DEPS) -o 02_ci_beta/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 03_ci_beta_parallel/$(SRC)             $(DEPS) -o 03_ci_beta_parallel/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 04_nuclear_war/$(SRC)                  $(DEPS) -o 04_nuclear_war/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 05_burn_10kg_fat/$(SRC)                $(DEPS) -o 05_burn_10kg_fat/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 06_nuclear_recovery/$(SRC)             $(DEPS) -o 06_nuclear_recovery/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 07_algebra/$(SRC)                      $(DEPS) -o 07_algebra/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 08_algebra_and_conversion/$(SRC)       $(DEPS) -o 08_algebra_and_conversion/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 09_ergonomic_algebra/$(SRC)            $(DEPS) -o 09_ergonomic_algebra/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 10_twitter_thread_example/$(SRC)       $(DEPS) -o 10_twitter_thread_example/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 11_billion_lognormals_paralell/$(SRC)  $(DEPS) -o 11_billion_lognormals_paralell/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 12_time_to_botec_parallel/$(SRC)       $(DEPS) -o 12_time_to_botec_parallel/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 13_parallelize_min/$(SRC)              $(DEPS) -o 13_parallelize_min/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 14_check_confidence_interval/$(SRC)    $(DEPS) -o 14_check_confidence_interval/$(OUTPUT)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 15_time_to_botec_custom_mixture/$(SRC)    $(DEPS) -o 15_time_to_botec_custom_mixture/$(OUTPUT)
+
+format-all:
+	$(FORMATTER) 00_example_template/$(SRC)
+	$(FORMATTER) 01_sample_from_cdf/$(SRC)
+	$(FORMATTER) 02_ci_beta/$(SRC)
+	$(FORMATTER) 03_ci_beta_parallel/$(SRC)
+	$(FORMATTER) 04_nuclear_war/$(SRC)
+	$(FORMATTER) 05_burn_10kg_fat/$(SRC)
+	$(FORMATTER) 06_nuclear_recovery/$(SRC)
+	$(FORMATTER) 07_algebra/$(SRC)
+	$(FORMATTER) 08_algebra_and_conversion/$(SRC)
+	$(FORMATTER) 09_ergonomic_algebra/$(SRC)
+	$(FORMATTER) 10_twitter_thread_example/$(SRC)
+	$(FORMATTER) 11_billion_lognormals_paralell/$(SRC)
+	$(FORMATTER) 12_time_to_botec_parallel/$(SRC)
+	$(FORMATTER) 13_parallelize_min/$(SRC)
+	$(FORMATTER) 14_check_confidence_interval/$(SRC)
+
+run-all:
+	00_example_template/$(OUTPUT)
+	01_sample_from_cdf/$(OUTPUT)
+	02_ci_beta/$(OUTPUT)
+	03_ci_beta_parallel/$(OUTPUT)
+	04_nuclear_war/$(OUTPUT)
+	05_burn_10kg_fat/$(OUTPUT)
+	06_nuclear_recovery/$(OUTPUT)
+	07_algebra/$(OUTPUT)
+	08_algebra_and_conversion/$(OUTPUT)
+	09_ergonomic_algebra/$(OUTPUT)
+	10_twitter_thread_example/$(OUTPUT)
+	11_billion_lognormals_paralell/$(OUTPUT)
+	12_time_to_botec_parallel/$(OUTPUT)
+	13_parallelize_min/$(OUTPUT)
+	14_check_confidence_interval/$(OUTPUT)
+
+## make one DIR=06_nuclear_recovery
+one: $(DIR)/$(SRC)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) $(DIR)/$(SRC) $(DEPS) -o $(DIR)/$(OUTPUT)
+
+## make format-one DIR=06_nuclear_recovery
+format-one: $(DIR)/$(SRC)
+	$(FORMATTER) $(DIR)/$(SRC)
+
+## make run-one DIR=06_nuclear_recovery
+run-one: $(DIR)/$(OUTPUT)
+	$(DIR)/$(OUTPUT) && echo
+
+## make time-linux-one DIR=06_nuclear_recovery
+time-linux-one: $(DIR)/$(OUTPUT)
+	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
+	@echo "Running 100x and taking avg time $(DIR)/$(OUTPUT)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(DIR)/$(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
+
+## e.g., make profile-linux-one DIR=06_nuclear_recovery
+profile-linux-one: 
+	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
+	echo "Must be run as sudo"
+	$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) $(DIR)/$(SRC) $(DEPS) -o $(DIR)/$(OUTPUT)
+	# $(CC) $(SRC) $(DEPS) -o $(OUTPUT)
+	sudo perf record $(DIR)/$(OUTPUT)
+	sudo perf report
+	rm perf.data
diff --git a/CUDA/makefile b/CUDA/makefile
new file mode 100644
index 0000000..5c21cd9
--- /dev/null
+++ b/CUDA/makefile
@@ -0,0 +1,35 @@
+MAKEFLAGS += --no-print-directory
+
+## Formatter
+STYLE_BLUEPRINT=webkit
+FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
+
+## Time to botec
+TTB=./examples/more/12_time_to_botec_parallel/example
+
+build-examples:
+	cd examples/core && make all
+	cd examples/more && make all
+
+format-examples:
+	cd examples/core && make format-all
+	cd examples/more && make format-all
+
+format: squiggle.c squiggle.h
+	$(FORMATTER) squiggle.c squiggle.h
+	$(FORMATTER) squiggle_more.c squiggle_more.h
+
+lint:
+	clang-tidy squiggle.c -- -lm
+	clang-tidy squiggle_more.c -- -lm
+
+profile:
+	sudo perf record -g ./examples/more/12_time_to_botec_parallel/example 
+	sudo perf report
+	rm perf.data
+	sudo perf stat ./examples/more/12_time_to_botec_parallel/example
+
+time-linux: 
+	gcc -O3 -Wall -Wextra -Wdouble-promotion -Wconversion examples/more/12_time_to_botec_parallel/example.c squiggle.c squiggle_more.c -lm -fopenmp -o examples/more/12_time_to_botec_parallel/example
+	@echo "Running 100x and taking avg time: $(TTB)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_PROC_BIND=TRUE $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
diff --git a/CUDA/squiggle.c b/CUDA/squiggle.c
new file mode 100644
index 0000000..3f2f4d3
--- /dev/null
+++ b/CUDA/squiggle.c
@@ -0,0 +1,228 @@
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+// Defs
+#define PI 3.14159265358979323846 // M_PI in gcc gnu99
+#define NORMAL90CONFIDENCE 1.6448536269514727
+#define UNUSED(x) (void)(x)
+// ^ https://stackoverflow.com/questions/3599160/how-can-i-suppress-unused-parameter-warnings-in-c
+
+// Pseudo Random number generators
+static uint64_t xorshift64(uint64_t* seed)
+{
+    // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
+    // See:
+    //   <https://en.wikipedia.org/wiki/Xorshift>
+    //   <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>,
+    // Also some drama:
+    //   <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>,
+    //   <https://prng.di.unimi.it/>
+    uint64_t x = *seed;
+    x ^= x << 13;
+    x ^= x >> 7;
+    x ^= x << 17;
+    return *seed = x;
+
+    /* 
+    // if one wanted to generate 32 bit ints, 
+    // from which to generate floats,
+    // one could do the following: 
+    uint32_t x = *seed;
+    x ^= x << 13;
+    x ^= x >> 17;
+    x ^= x << 5;
+    return *seed = x;
+    */
+}
+
+// Distribution & sampling functions
+// Unit distributions
+double sample_unit_uniform(uint64_t* seed)
+{
+    // samples uniform from [0,1] interval.
+    return ((double)xorshift64(seed)) / ((double)UINT64_MAX);
+}
+
+double sample_unit_normal(uint64_t* seed)
+{
+    // // See: <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform>
+    double u1 = sample_unit_uniform(seed);
+    double u2 = sample_unit_uniform(seed);
+    double z = sqrt(-2.0 * log(u1)) * sin(2.0 * PI * u2);
+    return z;
+}
+
+// Composite distributions
+double sample_uniform(double start, double end, uint64_t* seed)
+{
+    return sample_unit_uniform(seed) * (end - start) + start;
+}
+
+double sample_normal(double mean, double sigma, uint64_t* seed)
+{
+    return (mean + sigma * sample_unit_normal(seed));
+}
+
+double sample_lognormal(double logmean, double logstd, uint64_t* seed)
+{
+    return exp(sample_normal(logmean, logstd, seed));
+}
+
+double sample_normal_from_90_ci(double low, double high, uint64_t* seed)
+{
+    // Explanation of key idea:
+    // 1. We know that the 90% confidence interval of the unit normal is
+    // [-1.6448536269514722, 1.6448536269514722]
+    // see e.g.: https://stackoverflow.com/questions/20626994/how-to-calculate-the-inverse-of-the-normal-cumulative-distribution-function-in-p
+    // or https://www.wolframalpha.com/input?i=N%5BInverseCDF%28normal%280%2C1%29%2C+0.05%29%2C%7B%E2%88%9E%2C100%7D%5D
+    // 2. So if we take a unit normal and multiply it by
+    // L / 1.6448536269514722, its new 90% confidence interval will be
+    // [-L, L], i.e., length 2 * L
+    // 3. Instead, if we want to get a confidence interval of length L,
+    // we should multiply the unit normal by
+    // L / (2 * 1.6448536269514722)
+    // Meaning that its standard deviation should be multiplied by that amount
+    // see: https://en.wikipedia.org/wiki/Normal_distribution?lang=en#Operations_on_a_single_normal_variable
+    // 4. So we have learnt that Normal(0, L / (2 * 1.6448536269514722))
+    // has a 90% confidence interval of length L
+    // 5. If we want a 90% confidence interval from high to low,
+    // we can set mean = (high + low)/2; the midpoint, and L = high-low,
+    // Normal([high + low]/2, [high - low]/(2 * 1.6448536269514722))
+    double mean = (high + low) * 0.5;
+    double std = (high - low) / (2.0 * NORMAL90CONFIDENCE);
+    return sample_normal(mean, std, seed);
+}
+
+double sample_to(double low, double high, uint64_t* seed)
+{
+    // Given a (positive) 90% confidence interval,
+    // returns a sample from a lognorma with a matching 90% c.i.
+    // Key idea: If we want a lognormal with 90% confidence interval [a, b]
+    // we need but get a normal with 90% confidence interval [log(a), log(b)].
+    // Then see code for sample_normal_from_90_ci
+    double loglow = log(low);
+    double loghigh = log(high);
+    return exp(sample_normal_from_90_ci(loglow, loghigh, seed));
+}
+
+double sample_gamma(double alpha, uint64_t* seed)
+{
+
+    // A Simple Method for Generating Gamma Variables, Marsaglia and Wan Tsang, 2001
+    // https://dl.acm.org/doi/pdf/10.1145/358407.358414
+    // see also the references/ folder
+    // Note that the Wikipedia page for the gamma distribution includes a scaling parameter
+    // k or beta
+    // https://en.wikipedia.org/wiki/Gamma_distribution
+    // such that gamma_k(alpha, k) = k * gamma(alpha)
+    // or gamma_beta(alpha, beta) = gamma(alpha) / beta
+    // So far I have not needed to use this, and thus the second parameter is by default 1.
+    if (alpha >= 1) {
+        double d, c, x, v, u;
+        d = alpha - 1.0 / 3.0;
+        c = 1.0 / sqrt(9.0 * d);
+        while (1) {
+
+            do {
+                x = sample_unit_normal(seed);
+                v = 1.0 + c * x;
+            } while (v <= 0.0);
+
+            v = v * v * v;
+            u = sample_unit_uniform(seed);
+            if (u < 1.0 - 0.0331 * (x * x * x * x)) { // Condition 1
+                // the 0.0331 doesn't inspire much confidence
+                // however, this isn't the whole story
+                // by knowing that Condition 1 implies condition 2
+                // we realize that this is just a way of making the algorithm faster
+                // i.e., of not using the logarithms
+                return d * v;
+            }
+            if (log(u) < 0.5 * (x * x) + d * (1.0 - v + log(v))) { // Condition 2
+                return d * v;
+            }
+        }
+    } else {
+        return sample_gamma(1 + alpha, seed) * pow(sample_unit_uniform(seed), 1 / alpha);
+        // see note in p. 371 of https://dl.acm.org/doi/pdf/10.1145/358407.358414
+    }
+}
+
+double sample_beta(double a, double b, uint64_t* seed)
+{
+    // See: https://en.wikipedia.org/wiki/Gamma_distribution#Related_distributions
+    double gamma_a = sample_gamma(a, seed);
+    double gamma_b = sample_gamma(b, seed);
+    return gamma_a / (gamma_a + gamma_b);
+}
+
+double sample_laplace(double successes, double failures, uint64_t* seed)
+{
+    // see <https://en.wikipedia.org/wiki/Beta_distribution?lang=en#Rule_of_succession>
+    return sample_beta(successes + 1, failures + 1, seed);
+}
+
+// Array helpers
+double array_sum(double* array, int length)
+{
+    double sum = 0.0;
+    for (int i = 0; i < length; i++) {
+        sum += array[i];
+    }
+    return sum;
+}
+
+void array_cumsum(double* array_to_sum, double* array_cumsummed, int length)
+{
+    array_cumsummed[0] = array_to_sum[0];
+    for (int i = 1; i < length; i++) {
+        array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
+    }
+}
+
+double array_mean(double* array, int length)
+{
+    double sum = array_sum(array, length);
+    return sum / length;
+}
+
+double array_std(double* array, int length)
+{
+    double mean = array_mean(array, length);
+    double std = 0.0;
+    for (int i = 0; i < length; i++) {
+        std += (array[i] - mean) * (array[i] - mean);
+    }
+    std = sqrt(std / length);
+    return std;
+}
+
+// Mixture function
+double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_dists, uint64_t* seed)
+{
+    // Sample from samples with frequency proportional to their weights.
+    double sum_weights = array_sum(weights, n_dists);
+    double* cumsummed_normalized_weights = (double*)malloc((size_t)n_dists * sizeof(double));
+    cumsummed_normalized_weights[0] = weights[0] / sum_weights;
+    for (int i = 1; i < n_dists; i++) {
+        cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i] / sum_weights;
+    }
+
+    double result;
+    int result_set_flag = 0;
+    double p = sample_uniform(0, 1, seed);
+    for (int k = 0; k < n_dists; k++) {
+        if (p < cumsummed_normalized_weights[k]) {
+            result = samplers[k](seed);
+            result_set_flag = 1;
+            break;
+        }
+    }
+    if (result_set_flag == 0)
+        result = samplers[n_dists - 1](seed);
+
+    free(cumsummed_normalized_weights);
+    return result;
+}
diff --git a/CUDA/squiggle.h b/CUDA/squiggle.h
new file mode 100644
index 0000000..100ea68
--- /dev/null
+++ b/CUDA/squiggle.h
@@ -0,0 +1,37 @@
+#ifndef SQUIGGLE_C_CORE
+#define SQUIGGLE_C_CORE
+
+// uint64_t header
+#include <stdint.h>
+
+// Pseudo Random number generator
+uint64_t xorshift64(uint64_t* seed);
+
+// Basic distribution sampling functions
+double sample_unit_uniform(uint64_t* seed);
+double sample_unit_normal(uint64_t* seed);
+
+// Composite distribution sampling functions
+double sample_uniform(double start, double end, uint64_t* seed);
+double sample_normal(double mean, double sigma, uint64_t* seed);
+double sample_lognormal(double logmean, double logsigma, uint64_t* seed);
+double sample_normal_from_90_ci(double low, double high, uint64_t* seed);
+double sample_to(double low, double high, uint64_t* seed);
+
+double sample_gamma(double alpha, uint64_t* seed);
+double sample_beta(double a, double b, uint64_t* seed);
+double sample_laplace(double successes, double failures, uint64_t* seed);
+
+// Array helpers
+double array_sum(double* array, int length);
+void array_cumsum(double* array_to_sum, double* array_cumsummed, int length);
+double array_mean(double* array, int length);
+double array_std(double* array, int length);
+
+// Mixture function
+double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_dists, uint64_t* seed);
+
+// Macro to mute "unused variable" warning when -Wall -Wextra is enabled. Useful for nested functions
+#define UNUSED(x) (void)(x)
+
+#endif
diff --git a/CUDA/squiggle_more.c b/CUDA/squiggle_more.c
new file mode 100644
index 0000000..4441ce2
--- /dev/null
+++ b/CUDA/squiggle_more.c
@@ -0,0 +1,459 @@
+#include "squiggle.h"
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+#include <omp.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> // memcpy
+
+/* Cache optimizations */
+#define CACHE_LINE_SIZE 64
+// getconf LEVEL1_DCACHE_LINESIZE
+// <https://stackoverflow.com/questions/794632/programmatically-get-the-cache-line-size>
+typedef struct seed_cache_box_t {
+    uint64_t seed;
+    char padding[CACHE_LINE_SIZE - sizeof(uint64_t)];
+    // Cache line size is 64 *bytes*, uint64_t is 64 *bits* (8 bytes). Different units!
+} seed_cache_box;
+// This avoids "false sharing", i.e., different threads competing for the same cache line
+// Dealing with this shaves 4ms from a 12ms process, or a third of runtime
+// <http://www.nic.uoregon.edu/~khuck/ts/acumem-report/manual_html/ch06s07.html>
+
+/* Parallel sampler */
+void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_threads, int n_samples)
+{
+
+    // Terms of the division:
+    // a = b * quotient + reminder
+    // a = b * (a/b)    + (a%b)
+    // dividend: a
+    // divisor: b
+    // quotient = a/b
+    // reminder = a%b
+    // "divisor's multiple" := b*(a/b)
+
+    // now, we have n_samples and n_threads
+    // to make our life easy, each thread will have a number of samples of: a/b (quotient)
+    // and we'll compute the remainder of samples separately
+    // to possibly do by Jorge: improve so that the remainder is included in the threads
+
+    int quotient = n_samples / n_threads;
+    int divisor_multiple = quotient * n_threads;
+
+    // uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*));
+    seed_cache_box* cache_box = (seed_cache_box*)malloc(sizeof(seed_cache_box) * (size_t)n_threads);
+    // seed_cache_box cache_box[n_threads]; // we could use the C stack. On normal linux machines, it's 8MB ($ ulimit -s). However, it doesn't quite feel right.
+    srand(1);
+    for (int i = 0; i < n_threads; i++) {
+        // Constraints:
+        // - xorshift can't start with 0
+        // - the seeds should be reasonably separated and not correlated
+        cache_box[i].seed = (uint64_t)rand() * (UINT64_MAX / RAND_MAX);
+
+        // Other initializations tried:
+        // *seeds[i] = 1 + i;
+        // *seeds[i] = (i + 0.5)*(UINT64_MAX/n_threads);
+        // *seeds[i] = (i + 0.5)*(UINT64_MAX/n_threads) + constant * i;
+    }
+
+    int i;
+#pragma omp parallel private(i)
+    {
+#pragma omp for
+        for (i = 0; i < n_threads; i++) {
+            // It's possible I don't need the for, and could instead call omp
+            // in some different way and get  the thread number with omp_get_thread_num()
+            int lower_bound_inclusive = i * quotient;
+            int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
+
+            for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
+                results[j] = sampler(&(cache_box[i].seed));
+                /* 
+                t starts at 0 and ends at T
+                at t=0, 
+                  thread i accesses: results[i*quotient +0], 
+                  thread i+1 acccesses: results[(i+1)*quotient +0]
+                at t=T
+                  thread i accesses: results[(i+1)*quotient -1]
+                  thread i+1 acccesses: results[(i+2)*quotient -1]
+                The results[j] that are directly adjacent are 
+                  results[(i+1)*quotient -1] (accessed by thread i at time T)
+                  results[(i+1)*quotient +0] (accessed by thread i+1 at time 0)
+                and these are themselves adjacent to
+                  results[(i+1)*quotient -2] (accessed by thread i at time T-1)
+                  results[(i+1)*quotient +1] (accessed by thread i+1 at time 2)
+                If T is large enough, which it is, two threads won't access the same
+                cache line at the same time.
+                Pictorially:
+                at t=0 ....i.........I.........
+                at t=T .............i.........I
+                and the two never overlap
+                Note that results[j] is a double, a double has 8 bytes (64 bits)
+                8 doubles fill a cache line of 64 bytes.
+                So we specifically won't get problems as long as n_samples/n_threads > 8
+                n_threads is normally 16, so n_samples > 128 
+                Note also that this is only a problem in terms of speed, if n_samples<128
+                the results are still computed, it'll just be slower
+                */
+            }
+        }
+    }
+    for (int j = divisor_multiple; j < n_samples; j++) {
+        results[j] = sampler(&(cache_box[0].seed));
+        // we can just reuse a seed,
+        // this isn't problematic because we;ve now stopped doing multithreading
+    }
+
+    free(cache_box);
+}
+
+/* Get confidence intervals, given a sampler */
+
+typedef struct ci_t {
+    double low;
+    double high;
+} ci;
+
+inline static void swp(int i, int j, double xs[])
+{
+    double tmp = xs[i];
+    xs[i] = xs[j];
+    xs[j] = tmp;
+}
+
+static int partition(int low, int high, double xs[], int length)
+{
+    if (low > high || high >= length) {
+        printf("Invariant violated for function partition in %s (%d)", __FILE__, __LINE__);
+        exit(1);
+    }
+    // Note: the scratchpad/ folder in commit 578bfa27 has printfs sprinkled throughout
+    int pivot = low + (int)floor((high - low) / 2);
+    double pivot_value = xs[pivot];
+    swp(pivot, high, xs);
+    int gt = low; /* This pointer will iterate until finding an element which is greater than the pivot. Then it will move elements that are smaller before it--more specifically, it will move elements to its position and then increment. As a result all elements between gt and i will be greater than the pivot. */
+    for (int i = low; i < high; i++) {
+        if (xs[i] < pivot_value) {
+            swp(gt, i, xs);
+            gt++;
+        }
+    }
+    swp(high, gt, xs);
+    return gt;
+}
+
+static double quickselect(int k, double xs[], int n)
+{
+    // https://en.wikipedia.org/wiki/Quickselect
+
+    double* ys = malloc((size_t)n * sizeof(double));
+    memcpy(ys, xs, (size_t)n * sizeof(double));
+    // ^: don't rearrange item order in the original array
+
+    int low = 0;
+    int high = n - 1;
+    for (;;) {
+        if (low == high) {
+            double result = ys[low];
+            free(ys);
+            return result;
+        }
+        int pivot = partition(low, high, ys, n);
+        if (pivot == k) {
+            double result = ys[pivot];
+            free(ys);
+            return result;
+        } else if (k < pivot) {
+            high = pivot - 1;
+        } else {
+            low = pivot + 1;
+        }
+    }
+}
+
+ci array_get_ci(ci interval, double* xs, int n)
+{
+
+    int low_k = (int)floor(interval.low * n);
+    int high_k = (int)ceil(interval.high * n);
+    ci result = {
+        .low = quickselect(low_k, xs, n),
+        .high = quickselect(high_k, xs, n),
+    };
+    return result;
+}
+ci array_get_90_ci(double xs[], int n)
+{
+    return array_get_ci((ci) { .low = 0.05, .high = 0.95 }, xs, n);
+}
+
+double array_get_median(double xs[], int n)
+{
+    int median_k = (int)floor(0.5 * n);
+    return quickselect(median_k, xs, n);
+}
+
+/* array print: potentially useful for debugging */
+void array_print(double xs[], int n)
+{
+    printf("[");
+    for (int i = 0; i < n - 1; i++) {
+        printf("%f, ", xs[i]);
+    }
+    printf("%f", xs[n - 1]);
+    printf("]\n");
+}
+
+void array_print_stats(double xs[], int n)
+{
+    ci ci_90 = array_get_ci((ci) { .low = 0.05, .high = 0.95 }, xs, n);
+    ci ci_80 = array_get_ci((ci) { .low = 0.1, .high = 0.9 }, xs, n);
+    ci ci_50 = array_get_ci((ci) { .low = 0.25, .high = 0.75 }, xs, n);
+    double median = array_get_median(xs, n);
+    double mean = array_mean(xs, n);
+    double std = array_std(xs, n);
+    printf("| Statistic | Value |\n"
+           "| --- | --- |\n"
+           "| Mean   | %lf |\n"
+           "| Median | %lf |\n"
+           "| Std    | %lf |\n"
+           "| 90%% confidence interval | %lf to %lf |\n"
+           "| 80%% confidence interval | %lf to %lf |\n"
+           "| 50%% confidence interval | %lf to %lf |\n",
+           mean, median, std, ci_90.low, ci_90.high, ci_80.low, ci_80.high, ci_50.low, ci_50.high);
+}
+
+void array_print_histogram(double* xs, int n_samples, int n_bins)
+{
+    // Interface inspired by <https://github.com/red-data-tools/YouPlot>
+    if (n_bins <= 1) {
+        fprintf(stderr, "Number of bins must be greater than 1.\n");
+        return;
+    } else if (n_samples <= 1) {
+        fprintf(stderr, "Number of samples must be higher than 1.\n");
+        return;
+    }
+
+    int* bins = (int*)calloc((size_t)n_bins, sizeof(int));
+    if (bins == NULL) {
+        fprintf(stderr, "Memory allocation for bins failed.\n");
+        return;
+    }
+
+    // Find the minimum and maximum values from the samples
+    double min_value = xs[0], max_value = xs[0];
+    for (int i = 0; i < n_samples; i++) {
+        if (xs[i] < min_value) {
+            min_value = xs[i];
+        }
+        if (xs[i] > max_value) {
+            max_value = xs[i];
+        }
+    }
+
+    // Avoid division by zero for a single unique value
+    if (min_value == max_value) {
+        max_value++;
+    }
+
+    // Calculate bin width
+    double bin_width = (max_value - min_value) / n_bins;
+
+    // Fill the bins with sample counts
+    for (int i = 0; i < n_samples; i++) {
+        int bin_index = (int)((xs[i] - min_value) / bin_width);
+        if (bin_index == n_bins) {
+            bin_index--; // Last bin includes max_value
+        }
+        bins[bin_index]++;
+    }
+
+    // Calculate the scaling factor based on the maximum bin count
+    int max_bin_count = 0;
+    for (int i = 0; i < n_bins; i++) {
+        if (bins[i] > max_bin_count) {
+            max_bin_count = bins[i];
+        }
+    }
+    const int MAX_WIDTH = 50; // Adjust this to your terminal width
+    double scale = max_bin_count > MAX_WIDTH ? (double)MAX_WIDTH / max_bin_count : 1.0;
+
+    // Print the histogram
+    for (int i = 0; i < n_bins; i++) {
+        double bin_start = min_value + i * bin_width;
+        double bin_end = bin_start + bin_width;
+
+        int decimalPlaces = 1;
+        if ((0 < bin_width) && (bin_width < 1)) {
+            int magnitude = (int)floor(log10(bin_width));
+            decimalPlaces = -magnitude;
+            decimalPlaces = decimalPlaces > 10 ? 10 : decimalPlaces;
+        }
+        printf("[%*.*f, %*.*f", 4 + decimalPlaces, decimalPlaces, bin_start, 4 + decimalPlaces, decimalPlaces, bin_end);
+        char interval_delimiter = ')';
+        if (i == (n_bins - 1)) {
+            interval_delimiter = ']'; // last bucket is inclusive
+        }
+        printf("%c: ", interval_delimiter);
+
+        int marks = (int)(bins[i] * scale);
+        for (int j = 0; j < marks; j++) {
+            printf("█");
+        }
+        printf(" %d\n", bins[i]);
+    }
+
+    // Free the allocated memory for bins
+    free(bins);
+}
+
+void array_print_90_ci_histogram(double* xs, int n_samples, int n_bins)
+{
+    // Code duplicated from previous function
+    // I'll consider simplifying it at some future point
+    // Possible ideas:
+    // - having only one function that takes any confidence interval?
+    // - having a utility function that is called by both functions?
+    ci ci_90 = array_get_90_ci(xs, n_samples);
+
+    if (n_bins <= 1) {
+        fprintf(stderr, "Number of bins must be greater than 1.\n");
+        return;
+    } else if (n_samples <= 10) {
+        fprintf(stderr, "Number of samples must be higher than 10.\n");
+        return;
+    }
+
+    int* bins = (int*)calloc((size_t)n_bins, sizeof(int));
+    if (bins == NULL) {
+        fprintf(stderr, "Memory allocation for bins failed.\n");
+        return;
+    }
+
+    double min_value = ci_90.low, max_value = ci_90.high;
+
+    // Avoid division by zero for a single unique value
+    if (min_value == max_value) {
+        max_value++;
+    }
+    double bin_width = (max_value - min_value) / n_bins;
+
+    // Fill the bins with sample counts
+    int below_min = 0, above_max = 0;
+    for (int i = 0; i < n_samples; i++) {
+        if (xs[i] < min_value) {
+            below_min++;
+        } else if (xs[i] > max_value) {
+            above_max++;
+        } else {
+            int bin_index = (int)((xs[i] - min_value) / bin_width);
+            if (bin_index == n_bins) {
+                bin_index--; // Last bin includes max_value
+            }
+            bins[bin_index]++;
+        }
+    }
+
+    // Calculate the scaling factor based on the maximum bin count
+    int max_bin_count = 0;
+    for (int i = 0; i < n_bins; i++) {
+        if (bins[i] > max_bin_count) {
+            max_bin_count = bins[i];
+        }
+    }
+    const int MAX_WIDTH = 40; // Adjust this to your terminal width
+    double scale = max_bin_count > MAX_WIDTH ? (double)MAX_WIDTH / max_bin_count : 1.0;
+
+    // Print the histogram
+    int decimalPlaces = 1;
+    if ((0 < bin_width) && (bin_width < 1)) {
+        int magnitude = (int)floor(log10(bin_width));
+        decimalPlaces = -magnitude;
+        decimalPlaces = decimalPlaces > 10 ? 10 : decimalPlaces;
+    }
+    printf("(%*s, %*.*f): ", 6 + decimalPlaces, "-∞", 4 + decimalPlaces, decimalPlaces, min_value);
+    int marks_below_min = (int)(below_min * scale);
+    for (int j = 0; j < marks_below_min; j++) {
+        printf("█");
+    }
+    printf(" %d\n", below_min);
+    for (int i = 0; i < n_bins; i++) {
+        double bin_start = min_value + i * bin_width;
+        double bin_end = bin_start + bin_width;
+
+        printf("[%*.*f, %*.*f", 4 + decimalPlaces, decimalPlaces, bin_start, 4 + decimalPlaces, decimalPlaces, bin_end);
+        char interval_delimiter = ')';
+        if (i == (n_bins - 1)) {
+            interval_delimiter = ']'; // last bucket is inclusive
+        }
+        printf("%c: ", interval_delimiter);
+
+        int marks = (int)(bins[i] * scale);
+        for (int j = 0; j < marks; j++) {
+            printf("█");
+        }
+        printf(" %d\n", bins[i]);
+    }
+    printf("(%*.*f, %*s): ", 4 + decimalPlaces, decimalPlaces, max_value, 6 + decimalPlaces, "+∞");
+    int marks_above_max = (int)(above_max * scale);
+    for (int j = 0; j < marks_above_max; j++) {
+        printf("█");
+    }
+    printf(" %d\n", above_max);
+
+    // Free the allocated memory for bins
+    free(bins);
+}
+
+/* Algebra manipulations */
+
+#define NORMAL90CONFIDENCE 1.6448536269514727
+
+typedef struct normal_params_t {
+    double mean;
+    double std;
+} normal_params;
+
+normal_params algebra_sum_normals(normal_params a, normal_params b)
+{
+    normal_params result = {
+        .mean = a.mean + b.mean,
+        .std = sqrt((a.std * a.std) + (b.std * b.std)),
+    };
+    return result;
+}
+
+typedef struct lognormal_params_t {
+    double logmean;
+    double logstd;
+} lognormal_params;
+
+lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params b)
+{
+    lognormal_params result = {
+        .logmean = a.logmean + b.logmean,
+        .logstd = sqrt((a.logstd * a.logstd) + (b.logstd * b.logstd)),
+    };
+    return result;
+}
+
+lognormal_params convert_ci_to_lognormal_params(ci x)
+{
+    double loghigh = log(x.high);
+    double loglow = log(x.low);
+    double logmean = (loghigh + loglow) / 2.0;
+    double logstd = (loghigh - loglow) / (2.0 * NORMAL90CONFIDENCE);
+    lognormal_params result = { .logmean = logmean, .logstd = logstd };
+    return result;
+}
+
+ci convert_lognormal_params_to_ci(lognormal_params y)
+{
+    double h = y.logstd * NORMAL90CONFIDENCE;
+    double loghigh = y.logmean + h;
+    double loglow = y.logmean - h;
+    ci result = { .low = exp(loglow), .high = exp(loghigh) };
+    return result;
+}
diff --git a/CUDA/squiggle_more.h b/CUDA/squiggle_more.h
new file mode 100644
index 0000000..6ff601b
--- /dev/null
+++ b/CUDA/squiggle_more.h
@@ -0,0 +1,42 @@
+#ifndef SQUIGGLE_C_EXTRA
+#define SQUIGGLE_C_EXTRA
+
+/* Parallel sampling */
+void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_threads, int n_samples);
+
+/* Stats */
+double array_get_median(double xs[], int n);
+typedef struct ci_t {
+    double low;
+    double high;
+} ci;
+ci array_get_ci(ci interval, double* xs, int n);
+ci array_get_90_ci(double xs[], int n);
+
+void array_print_stats(double xs[], int n);
+void array_print_histogram(double* xs, int n_samples, int n_bins);
+void array_print_90_ci_histogram(double* xs, int n, int n_bins);
+
+/* Algebra manipulations */
+
+typedef struct normal_params_t {
+    double mean;
+    double std;
+} normal_params;
+normal_params algebra_sum_normals(normal_params a, normal_params b);
+
+typedef struct lognormal_params_t {
+    double logmean;
+    double logstd;
+} lognormal_params;
+lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params b);
+
+lognormal_params convert_ci_to_lognormal_params(ci x);
+ci convert_lognormal_params_to_ci(lognormal_params y);
+
+/* Utilities */
+
+#define THOUSAND 1000
+#define MILLION 1000000
+
+#endif
diff --git a/CUDA/test/makefile b/CUDA/test/makefile
new file mode 100644
index 0000000..af3137a
--- /dev/null
+++ b/CUDA/test/makefile
@@ -0,0 +1,56 @@
+# Interface: 
+#   make
+#   make build
+#   make format
+#   make run
+
+# Compiler
+CC=gcc
+# CC=tcc # <= faster compilation
+
+# Main file
+SRC=test.c ../squiggle.c
+OUTPUT=test
+
+## Dependencies
+MATH=-lm
+
+## Flags
+DEBUG= #'-g'
+STANDARD=-std=c99
+WARNINGS=-Wall
+OPTIMIZED=-O3  #-Ofast
+# OPENMP=-fopenmp
+
+## Formatter
+STYLE_BLUEPRINT=webkit
+FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
+
+## make build
+build: $(SRC)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
+
+format: $(SRC)
+	$(FORMATTER) $(SRC)
+
+run: $(SRC) $(OUTPUT)
+	./$(OUTPUT)
+
+verify: $(SRC) $(OUTPUT)
+	./$(OUTPUT) | grep "NOT passed" -A 2 --group-separator='' || true
+
+time-linux: 
+	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
+	
+	@echo "Running 100x and taking avg time $(OUTPUT)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
+
+## Profiling
+
+profile-linux: 
+	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
+	echo "Must be run as sudo"
+	$(CC) $(SRC) $(MATH) -o $(OUTPUT)
+	sudo perf record ./$(OUTPUT)
+	sudo perf report
+	rm perf.data
diff --git a/CUDA/test/test b/CUDA/test/test
new file mode 100755
index 0000000..495bacc
Binary files /dev/null and b/CUDA/test/test differ
diff --git a/CUDA/test/test.c b/CUDA/test/test.c
new file mode 100644
index 0000000..cb9fab3
--- /dev/null
+++ b/CUDA/test/test.c
@@ -0,0 +1,328 @@
+#include "../squiggle.h"
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define TOLERANCE 5.0 / 1000.0
+#define MAX_NAME_LENGTH 500
+
+// Structs
+
+struct array_expectations {
+    double* array;
+    int n;
+    char* name;
+    double expected_mean;
+    double expected_std;
+    double tolerance;
+};
+
+void test_array_expectations(struct array_expectations e)
+{
+    double mean = array_mean(e.array, e.n);
+    double delta_mean = mean - e.expected_mean;
+
+    double std = array_std(e.array, e.n);
+    double delta_std = std - e.expected_std;
+
+    if ((fabs(delta_mean) / fabs(mean) > e.tolerance) && (fabs(delta_mean) > e.tolerance)) {
+        printf("[-] Mean test for %s NOT passed.\n", e.name);
+        printf("Mean of %s: %f, vs expected mean: %f\n", e.name, mean, e.expected_mean);
+        printf("delta: %f, relative delta: %f\n", delta_mean, delta_mean / fabs(mean));
+    } else {
+        printf("[x] Mean test for %s PASSED\n", e.name);
+    }
+
+    if ((fabs(delta_std) / fabs(std) > e.tolerance) && (fabs(delta_std) > e.tolerance)) {
+        printf("[-] Std test for %s NOT passed.\n", e.name);
+        printf("Std of %s: %f, vs expected std: %f\n", e.name, std, e.expected_std);
+        printf("delta: %f, relative delta: %f\n", delta_std, delta_std / fabs(std));
+    } else {
+        printf("[x] Std test for %s PASSED\n", e.name);
+    }
+
+    printf("\n");
+}
+
+// Test unit uniform
+void test_unit_uniform(uint64_t* seed)
+{
+    int n = 1000 * 1000;
+    double* unit_uniform_array = malloc(sizeof(double) * n);
+
+    for (int i = 0; i < n; i++) {
+        unit_uniform_array[i] = sample_unit_uniform(seed);
+    }
+
+    struct array_expectations expectations = {
+        .array = unit_uniform_array,
+        .n = n,
+        .name = "unit uniform",
+        .expected_mean = 0.5,
+        .expected_std = sqrt(1.0 / 12.0),
+        .tolerance = TOLERANCE,
+    };
+
+    test_array_expectations(expectations);
+    free(unit_uniform_array);
+}
+
+// Test uniforms
+void test_uniform(double start, double end, uint64_t* seed)
+{
+    int n = 1000 * 1000;
+    double* uniform_array = malloc(sizeof(double) * n);
+
+    for (int i = 0; i < n; i++) {
+        uniform_array[i] = sample_uniform(start, end, seed);
+    }
+
+    char* name = malloc(MAX_NAME_LENGTH * sizeof(char));
+    snprintf(name, MAX_NAME_LENGTH, "[%f, %f] uniform", start, end);
+    struct array_expectations expectations = {
+        .array = uniform_array,
+        .n = n,
+        .name = name,
+        .expected_mean = (start + end) / 2,
+        .expected_std = sqrt(1.0 / 12.0) * fabs(end - start),
+        .tolerance = fabs(end - start) * TOLERANCE,
+    };
+
+    test_array_expectations(expectations);
+    free(name);
+    free(uniform_array);
+}
+
+// Test unit normal
+void test_unit_normal(uint64_t* seed)
+{
+    int n = 1000 * 1000;
+    double* unit_normal_array = malloc(sizeof(double) * n);
+
+    for (int i = 0; i < n; i++) {
+        unit_normal_array[i] = sample_unit_normal(seed);
+    }
+
+    struct array_expectations expectations = {
+        .array = unit_normal_array,
+        .n = n,
+        .name = "unit normal",
+        .expected_mean = 0,
+        .expected_std = 1,
+        .tolerance = TOLERANCE,
+    };
+
+    test_array_expectations(expectations);
+    free(unit_normal_array);
+}
+
+// Test normal
+void test_normal(double mean, double std, uint64_t* seed)
+{
+    int n = 10 * 1000 * 1000;
+    double* normal_array = malloc(sizeof(double) * n);
+
+    for (int i = 0; i < n; i++) {
+        normal_array[i] = sample_normal(mean, std, seed);
+    }
+
+    char* name = malloc(MAX_NAME_LENGTH * sizeof(char));
+    snprintf(name, MAX_NAME_LENGTH, "normal(%f, %f)", mean, std);
+    struct array_expectations expectations = {
+        .array = normal_array,
+        .n = n,
+        .name = name,
+        .expected_mean = mean,
+        .expected_std = std,
+        .tolerance = TOLERANCE,
+    };
+
+    test_array_expectations(expectations);
+    free(name);
+    free(normal_array);
+}
+
+// Test lognormal
+void test_lognormal(double logmean, double logstd, uint64_t* seed)
+{
+    int n = 10 * 1000 * 1000;
+    double* lognormal_array = malloc(sizeof(double) * n);
+
+    for (int i = 0; i < n; i++) {
+        lognormal_array[i] = sample_lognormal(logmean, logstd, seed);
+    }
+
+    char* name = malloc(MAX_NAME_LENGTH * sizeof(char));
+    snprintf(name, MAX_NAME_LENGTH, "lognormal(%f, %f)", logmean, logstd);
+    struct array_expectations expectations = {
+        .array = lognormal_array,
+        .n = n,
+        .name = name,
+        .expected_mean = exp(logmean + pow(logstd, 2) / 2),
+        .expected_std = sqrt((exp(pow(logstd, 2)) - 1) * exp(2 * logmean + pow(logstd, 2))),
+        .tolerance = TOLERANCE,
+    };
+
+    test_array_expectations(expectations);
+    free(name);
+    free(lognormal_array);
+}
+
+// Test lognormal to
+void test_to(double low, double high, uint64_t* seed)
+{
+    int n = 10 * 1000 * 1000;
+    double* lognormal_array = malloc(sizeof(double) * n);
+
+    for (int i = 0; i < n; i++) {
+        lognormal_array[i] = sample_to(low, high, seed);
+    }
+
+
+    char* name = malloc(MAX_NAME_LENGTH * sizeof(char));
+    snprintf(name, MAX_NAME_LENGTH, "to(%f, %f)", low, high);
+    
+		const double NORMAL95CONFIDENCE = 1.6448536269514722;
+    double loglow = logf(low);
+    double loghigh = logf(high);
+    double logmean = (loglow + loghigh) / 2;
+    double logstd = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
+    
+		struct array_expectations expectations = {
+        .array = lognormal_array,
+        .n = n,
+        .name = name,
+        .expected_mean = exp(logmean + pow(logstd, 2) / 2),
+        .expected_std = sqrt((exp(pow(logstd, 2)) - 1) * exp(2 * logmean + pow(logstd, 2))),
+        .tolerance = TOLERANCE,
+    };
+
+    test_array_expectations(expectations);
+    free(name);
+    free(lognormal_array);
+}
+
+// Test beta
+
+void test_beta(double a, double b, uint64_t* seed)
+{
+    int n = 10 * 1000 * 1000;
+    double* beta_array = malloc(sizeof(double) * n);
+
+    for (int i = 0; i < n; i++) {
+        beta_array[i] = sample_beta(a, b, seed);
+    }
+
+    char* name = malloc(MAX_NAME_LENGTH * sizeof(char));
+    snprintf(name, MAX_NAME_LENGTH, "beta(%f, %f)", a, b);
+    struct array_expectations expectations = {
+        .array = beta_array,
+        .n = n,
+        .name = name,
+        .expected_mean = a / (a + b),
+        .expected_std = sqrt((a * b) / (pow(a + b, 2) * (a + b + 1))),
+        .tolerance = TOLERANCE,
+    };
+
+    test_array_expectations(expectations);
+    free(name);
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with a seed of 0
+
+    printf("Testing unit uniform\n");
+    test_unit_uniform(seed);
+
+    printf("Testing small uniforms\n");
+    for (int i = 0; i < 100; i++) {
+        double start = sample_uniform(-10, 10, seed);
+        double end = sample_uniform(-10, 10, seed);
+        if (end > start) {
+            test_uniform(start, end, seed);
+        }
+    }
+
+    printf("Testing wide uniforms\n");
+    for (int i = 0; i < 100; i++) {
+        double start = sample_uniform(-1000 * 1000, 1000 * 1000, seed);
+        double end = sample_uniform(-1000 * 1000, 1000 * 1000, seed);
+        if (end > start) {
+            test_uniform(start, end, seed);
+        }
+    }
+
+    printf("Testing unit normal\n");
+    test_unit_normal(seed);
+
+    printf("Testing small normals\n");
+    for (int i = 0; i < 100; i++) {
+        double mean = sample_uniform(-10, 10, seed);
+        double std = sample_uniform(0, 10, seed);
+        if (std > 0) {
+            test_normal(mean, std, seed);
+        }
+    }
+
+    printf("Testing larger normals\n");
+    for (int i = 0; i < 100; i++) {
+        double mean = sample_uniform(-1000 * 1000, 1000 * 1000, seed);
+        double std = sample_uniform(0, 1000 * 1000, seed);
+        if (std > 0) {
+            test_normal(mean, std, seed);
+        }
+    }
+
+    printf("Testing smaller lognormals\n");
+    for (int i = 0; i < 100; i++) {
+        double mean = sample_uniform(-1, 1, seed);
+        double std = sample_uniform(0, 1, seed);
+        if (std > 0) {
+            test_lognormal(mean, std, seed);
+        }
+    }
+
+    printf("Testing larger lognormals\n");
+    for (int i = 0; i < 100; i++) {
+        double mean = sample_uniform(-1, 5, seed);
+        double std = sample_uniform(0, 5, seed);
+        if (std > 0) {
+            test_lognormal(mean, std, seed);
+        }
+    }
+
+    printf("Testing lognormals — sample_to(low, high) syntax\n");
+    for (int i = 0; i < 100; i++) {
+        double low = sample_uniform(0, 1000 * 1000, seed);
+        double high = sample_uniform(0, 1000 * 1000, seed);
+        if (low < high) {
+            test_to(low, high, seed);
+        }
+    }
+		// Bonus example
+		test_to(10, 10 * 1000, seed);
+
+    printf("Testing beta distribution\n");
+    for (int i = 0; i < 100; i++) {
+        double a = sample_uniform(0, 1000, seed);
+        double b = sample_uniform(0, 1000, seed);
+        if ((a > 0) && (b > 0)) {
+            test_beta(a, b, seed);
+        }
+    }
+
+    printf("Testing larger beta distributions\n");
+    for (int i = 0; i < 100; i++) {
+        double a = sample_uniform(0, 1000 * 1000, seed);
+        double b = sample_uniform(0, 1000 * 1000, seed);
+        if ((a > 0) && (b > 0)) {
+            test_beta(a, b, seed);
+        }
+    }
+
+    free(seed);
+}