diff --git a/README.md b/README.md index 45a3e34..d241d17 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,8 @@ A self-contained C99 library that provides a subset of [Squiggle](https://www.sq - Because it can fit in my head - Because if you can implement something in C, you can implement it anywhere else - Because it can be made faster if need be - - e.g., with a multi-threading library like OpenMP, or by adding more algorithmic complexity + - e.g., with a multi-threading library like OpenMP, + - or by implementing faster but more complex algorithms - or more simply, by inlining the sampling functions (adding an `inline` directive before their function declaration) - **Because there are few abstractions between it and machine code** (C => assembly => machine code with gcc, or C => machine code, with tcc), leading to fewer errors beyond the programmer's control. @@ -184,16 +185,11 @@ int main(){ ## To do list -- [ ] Test summary statistics for each of the distributions. +- [ ] Pontificate about lognormal tests - [ ] Have some more complicated & realistic example - [ ] Add summarization functions: 90% ci (or all c.i.?) - [ ] Systematize references - [ ] Publish online -- [ ] Add efficient sampling from a beta distribution - - https://dl.acm.org/doi/10.1145/358407.358414 - - https://link.springer.com/article/10.1007/bf02293108 - - https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution - - https://github.com/numpy/numpy/blob/5cae51e794d69dd553104099305e9f92db237c53/numpy/random/src/distributions/distributions.c - [ ] Support all distribution functions in - [ ] Support all distribution functions in , and do so efficiently @@ -224,3 +220,15 @@ int main(){ - https://dl.acm.org/doi/pdf/10.1145/358407.358414 - [x] Explain correlated samples - [-] ~~Add tests in Stan?~~ +- [x] Test summary statistics for each of the distributions. + - [x] For uniform + - [x] For normal + - [x] For lognormal + - [x] For lognormal (to syntax) + - [x] For beta distribution +- [x] Clarify gamma/standard gamma +- [x] Add efficient sampling from a beta distribution + - https://dl.acm.org/doi/10.1145/358407.358414 + - https://link.springer.com/article/10.1007/bf02293108 + - https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution + - https://github.com/numpy/numpy/blob/5cae51e794d69dd553104099305e9f92db237c53/numpy/random/src/distributions/distributions.c diff --git a/examples/01_one_sample/example b/examples/01_one_sample/example index a8bdd5d..a22d80e 100755 Binary files a/examples/01_one_sample/example and b/examples/01_one_sample/example differ diff --git a/examples/02_many_samples/example b/examples/02_many_samples/example index 1e63c34..627577c 100755 Binary files a/examples/02_many_samples/example and b/examples/02_many_samples/example differ diff --git a/examples/03_gcc_nested_function/example b/examples/03_gcc_nested_function/example index 5bf8d86..7508692 100755 Binary files a/examples/03_gcc_nested_function/example and b/examples/03_gcc_nested_function/example differ diff --git a/examples/04_sample_from_cdf_simple/example b/examples/04_sample_from_cdf_simple/example index a11109c..5dbcfc6 100755 Binary files a/examples/04_sample_from_cdf_simple/example and b/examples/04_sample_from_cdf_simple/example differ diff --git a/examples/04_sample_from_cdf_simple/example.c b/examples/04_sample_from_cdf_simple/example.c index 7734adb..fc8e219 100644 --- a/examples/04_sample_from_cdf_simple/example.c +++ b/examples/04_sample_from_cdf_simple/example.c @@ -87,9 +87,9 @@ int main() printf("\nGetting some samples from sample_unit_normal\n"); clock_t begin_2 = clock(); - + double* normal_samples = malloc(NUM_SAMPLES * sizeof(double)); for (int i = 0; i < NUM_SAMPLES; i++) { - double normal_sample = sample_unit_normal(seed); + normal_samples[i] = sample_unit_normal(seed); // printf("%f\n", normal_sample); } diff --git a/examples/05_sample_from_cdf_beta/example b/examples/05_sample_from_cdf_beta/example index 8a07292..2381ed0 100755 Binary files a/examples/05_sample_from_cdf_beta/example and b/examples/05_sample_from_cdf_beta/example differ diff --git a/examples/06_gamma_beta/example b/examples/06_gamma_beta/example index a82f6b9..246d397 100755 Binary files a/examples/06_gamma_beta/example and b/examples/06_gamma_beta/example differ diff --git a/examples/06_gamma_beta/example.c b/examples/06_gamma_beta/example.c index 3a7d422..00330f2 100644 --- a/examples/06_gamma_beta/example.c +++ b/examples/06_gamma_beta/example.c @@ -43,10 +43,3 @@ int main() free(seed); } -/* -Aggregation mechanisms: -- Quantiles (requires a sort) -- Sum -- Average -- Std -*/ diff --git a/examples/07_ci_beta/example b/examples/07_ci_beta/example new file mode 100755 index 0000000..d5a3462 Binary files /dev/null and b/examples/07_ci_beta/example differ diff --git a/examples/07_ci_beta/example.c b/examples/07_ci_beta/example.c new file mode 100644 index 0000000..c69d51c --- /dev/null +++ b/examples/07_ci_beta/example.c @@ -0,0 +1,21 @@ +#include "../../squiggle.h" +#include +#include +#include + +// Estimate functions +double beta_1_2_sampler(uint64_t* seed){ + return sample_beta(1, 2.0, seed); +} + +int main() +{ + // set randomness seed + uint64_t* seed = malloc(sizeof(uint64_t)); + *seed = 1000; // xorshift can't start with 0 + + struct c_i beta_1_2_ci_90 = get_90_confidence_interval(beta_1_2_sampler, seed); + printf("90%% confidence interval of beta(1,2) is [%f, %f]\n", beta_1_2_ci_90.low, beta_1_2_ci_90.high); + + free(seed); +} diff --git a/examples/07_ci_beta/makefile b/examples/07_ci_beta/makefile new file mode 100644 index 0000000..ef385f7 --- /dev/null +++ b/examples/07_ci_beta/makefile @@ -0,0 +1,53 @@ +# Interface: +# make +# make build +# make format +# make run + +# Compiler +CC=gcc +# CC=tcc # <= faster compilation + +# Main file +SRC=example.c ../../squiggle.c +OUTPUT=example + +## Dependencies +MATH=-lm + +## Flags +DEBUG= #'-g' +STANDARD=-std=c99 +WARNINGS=-Wall +OPTIMIZED=-O3 #-Ofast +# OPENMP=-fopenmp + +## Formatter +STYLE_BLUEPRINT=webkit +FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) + +## make build +build: $(SRC) + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT) + +format: $(SRC) + $(FORMATTER) $(SRC) + +run: $(SRC) $(OUTPUT) + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + +time-linux: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + + @echo "Running 100x and taking avg time $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo + +## Profiling + +profile-linux: + echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar" + echo "Must be run as sudo" + $(CC) $(SRC) $(MATH) -o $(OUTPUT) + sudo perf record ./$(OUTPUT) + sudo perf report + rm perf.data diff --git a/makefile b/makefile index 1a81168..5664c73 100644 --- a/makefile +++ b/makefile @@ -11,6 +11,7 @@ all: cd examples/04_sample_from_cdf_simple && make && echo cd examples/05_sample_from_cdf_beta && make && echo cd examples/06_gamma_beta && make && echo + cd examples/07_ci_beta && make && echo format: squiggle.c squiggle.h $(FORMATTER) squiggle.c diff --git a/squiggle.c b/squiggle.c index 96288d6..3ddb416 100644 --- a/squiggle.c +++ b/squiggle.c @@ -94,6 +94,12 @@ double sample_gamma(double alpha, uint64_t* seed) // A Simple Method for Generating Gamma Variables, Marsaglia and Wan Tsang, 2001 // https://dl.acm.org/doi/pdf/10.1145/358407.358414 // see also the references/ folder + // Note that the Wikipedia page for the gamma distribution includes a scaling parameter + // k or beta + // https://en.wikipedia.org/wiki/Gamma_distribution + // such that gamma_k(alpha, k) = k * gamma(alpha) + // or gamma_beta(alpha, beta) = gamma(alpha) / beta + // So far I have not needed to use this, and thus the second parameter is by default 1. if (alpha >= 1) { double d, c, x, v, u; d = alpha - 1.0 / 3.0; @@ -377,6 +383,43 @@ struct box sampler_cdf_double(double cdf(double), uint64_t* seed) return result; } +// Get confidence intervals, given a sampler + +struct c_i { + float low; + float high; +}; +int compare_doubles(const void *p, const void *q) { + // https://wikiless.esmailelbob.xyz/wiki/Qsort?lang=en + double x = *(const double *)p; + double y = *(const double *)q; + + /* Avoid return x - y, which can cause undefined behaviour + because of signed integer overflow. */ + if (x < y) + return -1; // Return -1 if you want ascending, 1 if you want descending order. + else if (x > y) + return 1; // Return 1 if you want ascending, -1 if you want descending order. + + return 0; +} +struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* seed){ + int n = 100 * 1000; + double* samples_array = malloc(n * sizeof(double)); + for(int i=0; i