Compare commits
No commits in common. "4113c87e4d264c76d5b55a55bf41340e99b9c4ea" and "8f6919fa2a845ae5709277829204423765b83e3b" have entirely different histories.
4113c87e4d
...
8f6919fa2a
263
FOLK_WISDOM.md
263
FOLK_WISDOM.md
|
@ -1,263 +0,0 @@
|
|||
# Folk Wisdom
|
||||
|
||||
The README.md file was getting too long and messy, so I moved some of the grumpier comments here.
|
||||
|
||||
### Nested functions and compilation with tcc.
|
||||
|
||||
GCC has an extension which allows a program to define a function inside another function. This makes squiggle.c code more linear and nicer to read, at the cost of becoming dependent on GCC and hence sacrificing portability and increasing compilation times. Conversely, compiling with tcc (tiny c compiler) is almost instantaneous, but leads to longer execution times and doesn't allow for nested functions.
|
||||
|
||||
| GCC | tcc |
|
||||
| --- | --- |
|
||||
| slower compilation | faster compilation |
|
||||
| allows nested functions | doesn't allow nested functions |
|
||||
| faster execution | slower execution |
|
||||
|
||||
~~My recommendation would be to use tcc while drawing a small number of samples for fast iteration, and then using gcc for the final version with lots of samples, and possibly with nested functions for ease of reading by others.~~
|
||||
|
||||
My previous recommendation was to use tcc for marginally faster iteration, but nested functions are just really nice. So my current recommendation is to use gcc throughout, though keep in mind that modifying code to not use nested functions is relatively easy, so keep in mind that you can do that if you run in other environments.
|
||||
|
||||
### Correlated samples
|
||||
|
||||
In the original [squiggle](https://www.squiggle-language.com/) language, there is some ambiguity about what this code means:
|
||||
|
||||
```js
|
||||
a = 1 to 10
|
||||
b = 2 * a
|
||||
c = b/a
|
||||
c
|
||||
```
|
||||
|
||||
Likewise in [squigglepy](https://github.com/rethinkpriorities/squigglepy):
|
||||
|
||||
```python
|
||||
import squigglepy as sq
|
||||
import numpy as np
|
||||
|
||||
a = sq.to(1, 3)
|
||||
b = 2 * a
|
||||
c = b / a
|
||||
|
||||
c_samples = sq.sample(c, 10)
|
||||
|
||||
print(c_samples)
|
||||
```
|
||||
|
||||
Should `c` be equal to `2`? or should it be equal to 2 times the expected distribution of the ratio of two independent draws from a (`2 * a/a`, as it were)? You don't know, because you are not operating on samples, you are operating on magical objects whose internals are hidden from you.
|
||||
|
||||
In squiggle.c, this ambiguity doesn't exist, at the cost of much greater overhead & verbosity:
|
||||
|
||||
```c
|
||||
// correlated samples
|
||||
// gcc -O3 correlated.c squiggle.c -lm -o correlated
|
||||
|
||||
#include "squiggle.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(){
|
||||
// set randomness seed
|
||||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with a seed of 0
|
||||
|
||||
double a = sample_to(1, 10, seed);
|
||||
double b = 2 * a;
|
||||
double c = b / a;
|
||||
|
||||
printf("a: %f, b: %f, c: %f\n", a, b, c);
|
||||
// a: 0.607162, b: 1.214325, c: 0.500000
|
||||
|
||||
free(seed);
|
||||
}
|
||||
```
|
||||
|
||||
vs
|
||||
|
||||
```c
|
||||
// uncorrelated samples
|
||||
// gcc -O3 uncorrelated.c ../../squiggle.c -lm -o uncorrelated
|
||||
|
||||
#include "squiggle.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
double draw_xyz(uint64_t* seed){
|
||||
// function could also be placed inside main with gcc nested functions extension.
|
||||
return sample_to(1, 20, seed);
|
||||
}
|
||||
|
||||
|
||||
int main(){
|
||||
// set randomness seed
|
||||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with a seed of 0
|
||||
|
||||
double a = draw_xyz(seed);
|
||||
double b = 2 * draw_xyz(seed);
|
||||
double c = b / a;
|
||||
|
||||
printf("a: %f, b: %f, c: %f\n", a, b, c);
|
||||
// a: 0.522484, b: 10.283501, c: 19.681936
|
||||
|
||||
free(seed)
|
||||
}
|
||||
```
|
||||
|
||||
Exercise for the reader: What possible meanings could the following represent in [squiggle](https://www.squiggle-language.com/playground?v=0.8.6#code=eNqrVkpJTUsszSlxzk9JVbJSys3M08jLL8pNzNEw0FEw1NRUUKoFAOYsC1c%3D)? How would you implement each of those meanings in squiggle.c?
|
||||
|
||||
```
|
||||
min(normal(0, 1))
|
||||
```
|
||||
|
||||
Hint: See examples/more/13_parallelize_min
|
||||
|
||||
### Note on sampling strategies
|
||||
|
||||
|
||||
Right now, I am drawing samples using a random number generator. It requires some finesse, particularly when using parallelism. But it works fine.
|
||||
|
||||
But..., what if we could do something more elegant, more ingenious? In particular, what if instead of drawing samples, we had a mesh of equally spaced points in the range of floats? Then we could, for a given number of samples, better estimate the, say, mean of the distribution we are trying to model...
|
||||
|
||||
The problem with that is that if we have some code like:
|
||||
|
||||
```C
|
||||
double model(...){
|
||||
double a = sample_to(1, 10, i_mesh++);
|
||||
double b = sample_to(1, 2, i_mesh);
|
||||
return a * b;
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
Then this doesn't work, because the values of a and b will be correlated: when a is high, b will also be high. What might work would be something like this:
|
||||
|
||||
|
||||
```C
|
||||
double* model(int n_samples){
|
||||
double* xs = malloc((size_t)n_samples * sizeof(double));
|
||||
for(int i_mesh=0; i_mesh < sqrt(n_samples); i_mesh++){
|
||||
for(int j_mesh=0; j_mesh < sqrt(n_samples); j_mesh++){
|
||||
double a = sample_to(1, 10, i_mesh);
|
||||
double b = sample_to(1, 2, j_mesh);
|
||||
}
|
||||
}
|
||||
return xs;
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
But that requires us to encode the shape of the model into the sampling function. It leads to an ugly nesting of for loops. It is a more complex approach. It is not [grug-brained](https://grugbrain.dev/). So every now and then I have to remember that this is not the way.
|
||||
|
||||
### Tests and the long tail of the lognormal
|
||||
|
||||
Distribution functions can be tested with:
|
||||
|
||||
```sh
|
||||
cd tests
|
||||
make && make run
|
||||
```
|
||||
|
||||
`make verify` is an alias that runs all the tests and just displays the ones that are failing.
|
||||
|
||||
These tests are somewhat rudimentary: they get between 1M and 10M samples from a given sampling function, and check that their mean and standard deviations correspond to what they should theoretically should be.
|
||||
|
||||
If you run `make run` (or `make verify`), you will see errors such as these:
|
||||
|
||||
```
|
||||
[-] Mean test for normal(47211.047473, 682197.019012) NOT passed.
|
||||
Mean of normal(47211.047473, 682197.019012): 46933.673278, vs expected mean: 47211.047473
|
||||
delta: -277.374195, relative delta: -0.005910
|
||||
|
||||
[-] Std test for lognormal(4.584666, 2.180816) NOT passed.
|
||||
Std of lognormal(4.584666, 2.180816): 11443.588861, vs expected std: 11342.434900
|
||||
delta: 101.153961, relative delta: 0.008839
|
||||
|
||||
[-] Std test for to(13839.861856, 897828.354318) NOT passed.
|
||||
Std of to(13839.861856, 897828.354318): 495123.630575, vs expected std: 498075.002499
|
||||
delta: -2951.371925, relative delta: -0.005961
|
||||
```
|
||||
|
||||
These tests I wouldn't worry about. Due to luck of the draw, their relative error is a bit over 0.005, or 0.5%, and so the test fails. But it would surprise me if that had some meaningful practical implication.
|
||||
|
||||
The errors that should raise some worry are:
|
||||
|
||||
```
|
||||
[-] Mean test for lognormal(1.210013, 4.766882) NOT passed.
|
||||
Mean of lognormal(1.210013, 4.766882): 342337.257677, vs expected mean: 288253.061628
|
||||
delta: 54084.196049, relative delta: 0.157985
|
||||
[-] Std test for lognormal(1.210013, 4.766882) NOT passed.
|
||||
Std of lognormal(1.210013, 4.766882): 208107782.972184, vs expected std: 24776840217.604111
|
||||
delta: -24568732434.631927, relative delta: -118.057730
|
||||
|
||||
[-] Mean test for lognormal(-0.195240, 4.883106) NOT passed.
|
||||
Mean of lognormal(-0.195240, 4.883106): 87151.733198, vs expected mean: 123886.818303
|
||||
delta: -36735.085104, relative delta: -0.421507
|
||||
[-] Std test for lognormal(-0.195240, 4.883106) NOT passed.
|
||||
Std of lognormal(-0.195240, 4.883106): 33837426.331671, vs expected std: 18657000192.914921
|
||||
delta: -18623162766.583248, relative delta: -550.371727
|
||||
|
||||
[-] Mean test for lognormal(0.644931, 4.795860) NOT passed.
|
||||
Mean of lognormal(0.644931, 4.795860): 125053.904456, vs expected mean: 188163.894101
|
||||
delta: -63109.989645, relative delta: -0.504662
|
||||
[-] Std test for lognormal(0.644931, 4.795860) NOT passed.
|
||||
Std of lognormal(0.644931, 4.795860): 39976300.711166, vs expected std: 18577298706.170452
|
||||
delta: -18537322405.459286, relative delta: -463.707799
|
||||
```
|
||||
|
||||
What is happening in this case is that you are taking a normal, like `normal(-0.195240, 4.883106)`, and you are exponentiating it to arrive at a lognormal. But `normal(-0.195240, 4.883106)` is going to have some non-insignificant weight on, say, 18. But `exp(18) = 39976300`, and points like it are going to end up a nontrivial amount to the analytical mean and standard deviation, even though they have little probability mass.
|
||||
|
||||
The reader can also check that for more plausible real-world values, like those fitting a lognormal to a really wide 90% confidence interval from 10 to 10k, errors aren't egregious:
|
||||
|
||||
```
|
||||
[x] Mean test for to(10.000000, 10000.000000) PASSED
|
||||
[-] Std test for to(10.000000, 10000.000000) NOT passed.
|
||||
Std of to(10.000000, 10000.000000): 23578.091775, vs expected std: 25836.381819
|
||||
delta: -2258.290043, relative delta: -0.095779
|
||||
```
|
||||
|
||||
Overall, I would caution that if you really care about the very far tails of distributions, you might want to instead use tools which can do some of the analytical manipulations for you, like the original Squiggle, Simple Squiggle (both linked below), or even doing lognormal multiplication by hand, relying on the fact that two lognormals multiplied together result in another lognormal with known shape.
|
||||
|
||||
In fact, squiggle.c does have a few functions for algebraic manipulations of simple distributions at the end of squiggle.c. But these are pretty rudimentary, and I don't know whether I'll end up expanding or deleting them.
|
||||
|
||||
### Compiler warnings
|
||||
|
||||
#### Harsh compilation
|
||||
|
||||
By default, I've enabled -Wall -Wextra -Wdouble-promotion -Wconversion. However, these produce some false positive warnings, which I've dealt with through:
|
||||
|
||||
- For conversion: Explicit casts, particularly from int to size_t when calling malloc.
|
||||
- For dealing with unused variables: Using an UNUSED macro. If you don't like that approach, you could add -Wno-unused-parameter to your makefile and remove the macro and its usage.
|
||||
|
||||
Some resources on compiler flags: [1](https://nullprogram.com/blog/2023/04/29/), [2](https://news.ycombinator.com/item?id=7371806)
|
||||
|
||||
#### Results of running clang-tidy
|
||||
|
||||
clang-tidy is a utility to detect common errors in C/C++. You can run it with:
|
||||
|
||||
```
|
||||
make tidy
|
||||
```
|
||||
|
||||
So far in the history of this program it has emitted:
|
||||
- One false-positive warning about an issue I'd already taken care of (so I've suppressed the warning)
|
||||
- a warning about an unused variable
|
||||
|
||||
I think this is good news in terms of making me more confident that this simple library is correct :).
|
||||
|
||||
### Boundaries between sampling functions and arrays of samples
|
||||
|
||||
In squiggle.c, the boundary between working with sampler functions and arrays of samples is clear. Not so in the original squiggle, which hides this distinction from the user in the interest of accessibility.
|
||||
|
||||
### Other gotchas
|
||||
|
||||
- Even though the C standard is ambiguous about this, this code assumes that doubles are 64 bit precision (otherwise the xorshift code should be different).
|
||||
|
||||
## Related projects
|
||||
|
||||
- [Squiggle](https://www.squiggle-language.com/)
|
||||
- [SquigglePy](https://github.com/rethinkpriorities/squigglepy)
|
||||
- [Simple Squiggle](https://nunosempere.com/blog/2022/04/17/simple-squiggle/)
|
||||
- [time to BOTEC](https://github.com/NunoSempere/time-to-botec)
|
||||
- [Find a beta distribution that fits your desired confidence interval](https://nunosempere.com/blog/2023/03/15/fit-beta/)
|
||||
|
561
README.md
561
README.md
|
@ -1,188 +1,56 @@
|
|||
# squiggle.c
|
||||
|
||||
squiggle.c is a self-contained C99 library that provides functions for simple Monte Carlo estimation, inspired by [Squiggle](https://www.squiggle-language.com/).
|
||||
squiggle.c is a [grug-brained](https://grugbrain.dev/) self-contained C99 library that provides functions for simple Monte Carlo estimation, inspired by [Squiggle](https://www.squiggle-language.com/).
|
||||
|
||||
## Motivation
|
||||
|
||||
### What am I trying to do here?
|
||||
|
||||
- I am trying to build a reliable alternative to the original squiggle, that works for me and addresses my frustrations with it.
|
||||
- Some adjectives: [grug brain](https://grugbrain.dev/), [lindy](https://en.wikipedia.org/wiki/Lindy_effect), [suckless](https://suckless.org/)
|
||||
- I am trying to make something that is simple enough that I and others can fully understand. squiggle.c is less than 700 lines of C, with a core of <230 lines. You, a somewhat technically sophisticated reader, could just read it and grasp its contents, and is encouraged to do so.
|
||||
|
||||
### Why C?
|
||||
## Why C?
|
||||
|
||||
- Because it is fast
|
||||
- Because I enjoy it
|
||||
- Because C is honest
|
||||
- Because it will last long
|
||||
- Because it can be made faster if need be, e.g., with a multi-threading library like OpenMP, by implementing faster but more complex algorithms, or more simply, by inlining the sampling functions (adding an `inline` directive before their function declaration)
|
||||
- **Because there are few abstractions between it and machine code** (C => assembly => machine code with gcc, or C => machine code, with tcc), leading to fewer errors beyond the programmer's control.
|
||||
- Because it will last long (because C is more [lindy](https://en.wikipedia.org/wiki/Lindy_effect))
|
||||
- Because it can fit in my head
|
||||
- Because if you can implement something in C, you can implement it anywhere else
|
||||
|
||||
### Design choices
|
||||
|
||||
This code should aim to be correct, then simple, then fast.
|
||||
|
||||
- It should be correct. The user should be able to rely on it and not think about whether errors come from the library.
|
||||
- Nonetheless, the user should understand the limitations of sampling-based methods. See the section on [Tests and the long tail of the lognormal](https://git.nunosempere.com/personal/squiggle.c/FOLK_WISDOM.md#tests-and-the-long-tail-of-the-lognormal) for a discussion of how sampling is bad at capturing some aspects of distributions with long tails.
|
||||
- It should be clear, conceptually simple. Simple for me to implement, simple for others to understand.
|
||||
- It should be fast. But when speed conflicts with simplicity, choose simplicity. For example, there might be several possible algorithms to sample a distribution, each of which is faster over part of the domain. In that case, it's conceptually simpler to just pick one algorithm, and pay the—normally small—performance penalty.
|
||||
- In any case, though, the code should still be *way faster* than, say, Python.
|
||||
|
||||
Note that being terse, or avoiding verbosity, is a non-goal. This is in part because of the constraints that C imposes. But it also aids with clarity and conceptual simplicity, as the issue of correlated samples illustrates in the next section.
|
||||
|
||||
Caveats: Parallelism might hide monsters. The histogram function is pretty but was created with the aid of GPT, and so might have errors.
|
||||
- Because it can be made faster if need be, e.g., with a multi-threading library like OpenMP, by implementing faster but more complex algorithms, or more simply, by inlining the sampling functions (adding an `inline` directive before their function declaration)
|
||||
- **Because there are few abstractions between it and machine code** (C => assembly => machine code with gcc, or C => machine code, with tcc), leading to fewer errors beyond the programmer's control.
|
||||
|
||||
## Getting started
|
||||
|
||||
### Example upfront
|
||||
You can follow some example usage in the [examples/](examples]) folder. In [examples/core](examples/core/), we build up some functionality, starting from drawing one sample and finishing with a replication of [Dissolving the Fermi paradox](https://arxiv.org/abs/1806.02404). In [examples/more](examples/more), we present a few more complicated examples, like finding confidence intervals, a model of nuclear war, an estimate of how much exercise to do to lose 10kg, or an example using parallelism.
|
||||
|
||||
Download squiggle.c, for instance:
|
||||
## Commentary
|
||||
|
||||
```sh
|
||||
$ rm -r -f squiggle_c
|
||||
$ wget https://git.nunosempere.com/personal/squiggle.c/raw/branch/master/squiggle.c
|
||||
$ wget https://git.nunosempere.com/personal/squiggle.c/raw/branch/master/squiggle.h
|
||||
$ wget https://git.nunosempere.com/personal/squiggle.c/raw/branch/master/squiggle_more.c
|
||||
$ wget https://git.nunosempere.com/personal/squiggle.c/raw/branch/master/squiggle_more.h
|
||||
$ mkdir temp
|
||||
$ mv squiggle* temp
|
||||
$ mv temp squiggle_c
|
||||
```
|
||||
### squiggle.c is short
|
||||
|
||||
Write your model. The below is an overcomplicated example, because it has a long digression in the middle, which I think is funny. May change:
|
||||
|
||||
```C
|
||||
#include "squiggle_c/squiggle.h"
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
double sample_fermi_logspace(uint64_t * seed)
|
||||
{
|
||||
// Replicate <https://arxiv.org/pdf/1806.02404.pdf>, and in particular the red line in page 11.
|
||||
// You can see a simple version of this function in naive.c in this same folder
|
||||
double log_rate_of_star_formation = sample_uniform(log(1), log(100), seed);
|
||||
double log_fraction_of_stars_with_planets = sample_uniform(log(0.1), log(1), seed);
|
||||
double log_number_of_habitable_planets_per_star_system = sample_uniform(log(0.1), log(1), seed);
|
||||
|
||||
double log_rate_of_life_formation_in_habitable_planets = sample_normal(1, 50, seed);
|
||||
double log_fraction_of_habitable_planets_in_which_any_life_appears;
|
||||
/*
|
||||
Consider:
|
||||
a = underlying normal
|
||||
b = rate_of_life_formation_in_habitable_planets = exp(underlying normal) = exp(a)
|
||||
c = 1 - exp(-b) = fraction_of_habitable_planets_in_which_any_life_appears
|
||||
d = log(c)
|
||||
|
||||
Looking at the Taylor expansion for c = 1 - exp(-b), it's
|
||||
b - b^2/2 + b^3/6 - x^b/24, etc.
|
||||
<https://www.wolframalpha.com/input?i=1-exp%28-x%29>
|
||||
When b ~ 0 (as is often the case), this is close to b.
|
||||
|
||||
But now, if b ~ 0, c ~ b
|
||||
and d = log(c) ~ log(b) = log(exp(a)) = a
|
||||
|
||||
Now, we could play around with estimating errors,
|
||||
and indeed if we want b^2/2 = exp(a)^2/2 < 10^(-n), i.e., to have n decimal digits of precision,
|
||||
we could compute this as e.g., a < (nlog(10) + log(2))/2
|
||||
so for example if we want ten digits of precision, that's a < -11
|
||||
|
||||
Empirically, the two numbers as calculated in C do become really close around 11 or so,
|
||||
and at 38 that calculation results in a -inf (so probably a floating point error or similar.)
|
||||
So we should be using that formula for somewhere between -38 << a < -11
|
||||
|
||||
I chose -16 as a happy medium after playing around with
|
||||
double invert(double x){
|
||||
return log(1-exp(-exp(-x)));
|
||||
}
|
||||
for(int i=0; i<64; i++){
|
||||
double j = i;
|
||||
printf("for %lf, log(1-exp(-exp(-x))) is calculated as... %lf\n", j, invert(j));
|
||||
}
|
||||
and <https://www.wolframalpha.com/input?i=log%281-exp%28-exp%28-16%29%29%29>
|
||||
*/
|
||||
if (log_rate_of_life_formation_in_habitable_planets < -16) {
|
||||
log_fraction_of_habitable_planets_in_which_any_life_appears = log_rate_of_life_formation_in_habitable_planets;
|
||||
} else {
|
||||
double rate_of_life_formation_in_habitable_planets = exp(log_rate_of_life_formation_in_habitable_planets);
|
||||
double fraction_of_habitable_planets_in_which_any_life_appears = -expm1(-rate_of_life_formation_in_habitable_planets);
|
||||
log_fraction_of_habitable_planets_in_which_any_life_appears = log(fraction_of_habitable_planets_in_which_any_life_appears);
|
||||
}
|
||||
|
||||
double log_fraction_of_planets_with_life_in_which_intelligent_life_appears = sample_uniform(log(0.001), log(1), seed);
|
||||
double log_fraction_of_intelligent_planets_which_are_detectable_as_such = sample_uniform(log(0.01), log(1), seed);
|
||||
double log_longevity_of_detectable_civilizations = sample_uniform(log(100), log(10000000000), seed);
|
||||
|
||||
double log_n =
|
||||
log_rate_of_star_formation +
|
||||
log_fraction_of_stars_with_planets +
|
||||
log_number_of_habitable_planets_per_star_system +
|
||||
log_fraction_of_habitable_planets_in_which_any_life_appears +
|
||||
log_fraction_of_planets_with_life_in_which_intelligent_life_appears +
|
||||
log_fraction_of_intelligent_planets_which_are_detectable_as_such +
|
||||
log_longevity_of_detectable_civilizations;
|
||||
return log_n;
|
||||
}
|
||||
|
||||
double sample_are_we_alone_logspace(uint64_t * seed)
|
||||
{
|
||||
double log_n = sample_fermi_logspace(seed);
|
||||
return ((log_n > 0) ? 1 : 0);
|
||||
// log_n > 0 => n > 1
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
// set randomness seed
|
||||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1010101; // xorshift can't start with a seed of 0
|
||||
// Note: come back to choice of seed.
|
||||
|
||||
double logspace_fermi_proportion = 0;
|
||||
int n_samples = 1000 * 1000;
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
double result = sample_are_we_alone_logspace(seed);
|
||||
logspace_fermi_proportion += result;
|
||||
}
|
||||
double p_not_alone = logspace_fermi_proportion / n_samples;
|
||||
printf("Probability that we are not alone: %lf (%.lf%%)\n", p_not_alone, p_not_alone * 100);
|
||||
|
||||
free(seed);
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
Compile and run:
|
||||
|
||||
```
|
||||
$ gcc -O3 samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o samples
|
||||
$ ./samples
|
||||
```
|
||||
[squiggle.c](squiggle.c) is less than 700 lines of C, with a core of <230 lines. The reader could just read it and grasp its contents, and is encouraged to do so.
|
||||
|
||||
### Core strategy
|
||||
|
||||
The recommended strategy is to:
|
||||
This library provides some basic building blocks. The recommended strategy is to:
|
||||
|
||||
1. Define sampler functions, which take a seed, and return 1 sample
|
||||
2. Compose those sampler functions to define your estimation model
|
||||
3. Produce an array of samples from a sampler function
|
||||
4. Get summary statistics for that array of samples.
|
||||
|
||||
### Examples
|
||||
### Nested functions and compilation with tcc.
|
||||
|
||||
You can follow some example usage in the [examples/](examples]) folder. In [examples/core](examples/core/), we build up some functionality, starting from drawing one sample and finishing with a replication of [Dissolving the Fermi paradox](https://arxiv.org/abs/1806.02404). In [examples/more](examples/more), we present a few more complicated examples, like finding confidence intervals, a model of nuclear war, an estimate of how much exercise to do to lose 10kg, or an example using parallelism.
|
||||
GCC has an extension which allows a program to define a function inside another function. This makes squiggle.c code more linear and nicer to read, at the cost of becoming dependent on GCC and hence sacrificing portability and increasing compilation times. Conversely, compiling with tcc (tiny c compiler) is almost instantaneous, but leads to longer execution times and doesn't allow for nested functions.
|
||||
|
||||
## Guarantees
|
||||
| GCC | tcc |
|
||||
| --- | --- |
|
||||
| slower compilation | faster compilation |
|
||||
| allows nested functions | doesn't allow nested functions |
|
||||
| faster execution | slower execution |
|
||||
|
||||
~~My recommendation would be to use tcc while drawing a small number of samples for fast iteration, and then using gcc for the final version with lots of samples, and possibly with nested functions for ease of reading by others.~~
|
||||
|
||||
My previous recommendation was to use tcc for marginally faster iteration, but nested functions are just really nice. So my current recommendation is to use gcc throughout, though keep in mind that modifying code to not use nested functions is relatively easy, so keep in mind that you can do that if you run in other environments.
|
||||
|
||||
### Guarantees and licensing
|
||||
|
||||
The motte:
|
||||
|
||||
- I offer no guarantees about stability, correctness, performance, etc. I might, for instance, abandon the version in C and rewrite it in Zig, Nim, Rust, Go.
|
||||
- I offer no guarantees about stability, correctness, performance, etc. I might, for instance, abandon the version in C and rewrite it in Zig, Nim or Rust.
|
||||
- This project mostly exists for my own usage & for my own amusement.
|
||||
- Caution! Think carefully before using this project for anything important.
|
||||
- If you wanted to pay me to provide some stability or correctness, guarantees, or to tweak this library for your own usage, or to teach you how to use it, you could do so [here](https://nunosempere.com/consulting).
|
||||
|
@ -190,12 +58,254 @@ The motte:
|
|||
|
||||
The bailey:
|
||||
|
||||
- You can vendor the code, i.e., save it as a dependency together with your other files. This way, this renders you immune to any changes I may make.
|
||||
- I've been hacking at this project for a while now, and I think I have a good grasp of its correctness and limitations. I've tried Nim and Zig, and I prefer C so far.
|
||||
- I think the core interface is not likely to change much, though I've recently changed the interface for parallelism and for getting confidence intervals.
|
||||
- I am using this code for a few important consulting projects, and I trust myself to operate it correctly.
|
||||
|
||||
## squiggle extra functions
|
||||
This project is released under the MIT license, a permissive open-source license. You can see it in the LICENSE.txt file.
|
||||
|
||||
### Design choices
|
||||
|
||||
This code should aim to be correct, then simple, then fast.
|
||||
|
||||
- It should be correct. The user should be able to rely on it and not think about whether errors come from the library.
|
||||
- Nonetheless, the user should understand the limitations of sampling-based methods. See the section on [Tests and the long tail of the lognormal](https://git.nunosempere.com/personal/squiggle.c#tests-and-the-long-tail-of-the-lognormal) for a discussion of how sampling is bad at capturing some aspects of distributions with long tails.
|
||||
- It should be clear, conceptually simple. Simple for me to implement, simple for others to understand.
|
||||
- It should be fast. But when speed conflicts with simplicity, choose simplicity. For example, there might be several possible algorithms to sample a distribution, each of which is faster over part of the domain. In that case, it's conceptually simpler to just pick one algorithm, and pay the—normally small—performance penalty. In any case, though, the code should still be *way faster* than, say, Python.
|
||||
|
||||
Note that being terse, or avoiding verbosity, is a non-goal. This is in part because of the constraints that C imposes. But it also aids with clarity and conceptual simplicity, as the issue of correlated samples illustrates in the next section.
|
||||
|
||||
### Correlated samples
|
||||
|
||||
In the original [squiggle](https://www.squiggle-language.com/) language, there is some ambiguity about what this code means:
|
||||
|
||||
```js
|
||||
a = 1 to 10
|
||||
b = 2 * a
|
||||
c = b/a
|
||||
c
|
||||
```
|
||||
|
||||
Likewise in [squigglepy](https://github.com/rethinkpriorities/squigglepy):
|
||||
|
||||
```python
|
||||
import squigglepy as sq
|
||||
import numpy as np
|
||||
|
||||
a = sq.to(1, 3)
|
||||
b = 2 * a
|
||||
c = b / a
|
||||
|
||||
c_samples = sq.sample(c, 10)
|
||||
|
||||
print(c_samples)
|
||||
```
|
||||
|
||||
Should `c` be equal to `2`? or should it be equal to 2 times the expected distribution of the ratio of two independent draws from a (`2 * a/a`, as it were)? You don't know, because you are not operating on samples, you are operating on magical objects whose internals are hidden from you.
|
||||
|
||||
In squiggle.c, this ambiguity doesn't exist, at the cost of much greater overhead & verbosity:
|
||||
|
||||
```c
|
||||
// correlated samples
|
||||
// gcc -O3 correlated.c squiggle.c -lm -o correlated
|
||||
|
||||
#include "squiggle.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(){
|
||||
// set randomness seed
|
||||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with a seed of 0
|
||||
|
||||
double a = sample_to(1, 10, seed);
|
||||
double b = 2 * a;
|
||||
double c = b / a;
|
||||
|
||||
printf("a: %f, b: %f, c: %f\n", a, b, c);
|
||||
// a: 0.607162, b: 1.214325, c: 0.500000
|
||||
|
||||
free(seed);
|
||||
}
|
||||
```
|
||||
|
||||
vs
|
||||
|
||||
```c
|
||||
// uncorrelated samples
|
||||
// gcc -O3 uncorrelated.c ../../squiggle.c -lm -o uncorrelated
|
||||
|
||||
#include "squiggle.h"
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
double draw_xyz(uint64_t* seed){
|
||||
// function could also be placed inside main with gcc nested functions extension.
|
||||
return sample_to(1, 20, seed);
|
||||
}
|
||||
|
||||
|
||||
int main(){
|
||||
// set randomness seed
|
||||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with a seed of 0
|
||||
|
||||
double a = draw_xyz(seed);
|
||||
double b = 2 * draw_xyz(seed);
|
||||
double c = b / a;
|
||||
|
||||
printf("a: %f, b: %f, c: %f\n", a, b, c);
|
||||
// a: 0.522484, b: 10.283501, c: 19.681936
|
||||
|
||||
free(seed)
|
||||
}
|
||||
```
|
||||
|
||||
Exercise for the reader: What possible meanings could the following represent in [squiggle](https://www.squiggle-language.com/playground?v=0.8.6#code=eNqrVkpJTUsszSlxzk9JVbJSys3M08jLL8pNzNEw0FEw1NRUUKoFAOYsC1c%3D)? How would you implement each of those meanings in squiggle.c?
|
||||
|
||||
```
|
||||
min(normal(0, 1))
|
||||
```
|
||||
|
||||
Hint: See examples/more/13_parallelize_min
|
||||
|
||||
### Note on sampling strategies
|
||||
|
||||
Right now, I am drawing samples using a random number generator. It requires some finesse, particularly when using parallelism. But it works fine.
|
||||
|
||||
But..., what if we could do something more elegant, more ingenious? In particular, what if instead of drawing samples, we had a mesh of equally spaced points in the range of floats? Then we could, for a given number of samples, better estimate the, say, mean of the distribution we are trying to model...
|
||||
|
||||
The problem with that is that if we have some code like:
|
||||
|
||||
```C
|
||||
double model(...){
|
||||
double a = sample_to(1, 10, i_mesh++);
|
||||
double b = sample_to(1, 2, i_mesh);
|
||||
return a * b;
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
Then this doesn't work, because the values of a and b will be correlated: when a is high, b will also be high. What might work would be something like this:
|
||||
|
||||
|
||||
```C
|
||||
double* model(int n_samples){
|
||||
double* xs = malloc((size_t)n_samples * sizeof(double));
|
||||
for(int i_mesh=0; i_mesh < sqrt(n_samples); i_mesh++){
|
||||
for(int j_mesh=0; j_mesh < sqrt(n_samples); j_mesh++){
|
||||
double a = sample_to(1, 10, i_mesh);
|
||||
double b = sample_to(1, 2, j_mesh);
|
||||
}
|
||||
}
|
||||
return xs;
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
But that requires us to encode the shape of the model into the sampling function. It leads to an ugly nesting of for loops. It is a more complex approach. It is not [grug-brained](https://grugbrain.dev/). So every now and then I have to remember that this is not the way.
|
||||
|
||||
### Boundaries between sampling functions and arrays of samples
|
||||
|
||||
In squiggle.c, the boundary between working with sampler functions and arrays of samples is clear. Not so in the original squiggle, which hides this distinction from the user in the interest of accessibility.
|
||||
|
||||
### Tests and the long tail of the lognormal
|
||||
|
||||
Distribution functions can be tested with:
|
||||
|
||||
```sh
|
||||
cd tests
|
||||
make && make run
|
||||
```
|
||||
|
||||
`make verify` is an alias that runs all the tests and just displays the ones that are failing.
|
||||
|
||||
These tests are somewhat rudimentary: they get between 1M and 10M samples from a given sampling function, and check that their mean and standard deviations correspond to what they should theoretically should be.
|
||||
|
||||
If you run `make run` (or `make verify`), you will see errors such as these:
|
||||
|
||||
```
|
||||
[-] Mean test for normal(47211.047473, 682197.019012) NOT passed.
|
||||
Mean of normal(47211.047473, 682197.019012): 46933.673278, vs expected mean: 47211.047473
|
||||
delta: -277.374195, relative delta: -0.005910
|
||||
|
||||
[-] Std test for lognormal(4.584666, 2.180816) NOT passed.
|
||||
Std of lognormal(4.584666, 2.180816): 11443.588861, vs expected std: 11342.434900
|
||||
delta: 101.153961, relative delta: 0.008839
|
||||
|
||||
[-] Std test for to(13839.861856, 897828.354318) NOT passed.
|
||||
Std of to(13839.861856, 897828.354318): 495123.630575, vs expected std: 498075.002499
|
||||
delta: -2951.371925, relative delta: -0.005961
|
||||
```
|
||||
|
||||
These tests I wouldn't worry about. Due to luck of the draw, their relative error is a bit over 0.005, or 0.5%, and so the test fails. But it would surprise me if that had some meaningful practical implication.
|
||||
|
||||
The errors that should raise some worry are:
|
||||
|
||||
```
|
||||
[-] Mean test for lognormal(1.210013, 4.766882) NOT passed.
|
||||
Mean of lognormal(1.210013, 4.766882): 342337.257677, vs expected mean: 288253.061628
|
||||
delta: 54084.196049, relative delta: 0.157985
|
||||
[-] Std test for lognormal(1.210013, 4.766882) NOT passed.
|
||||
Std of lognormal(1.210013, 4.766882): 208107782.972184, vs expected std: 24776840217.604111
|
||||
delta: -24568732434.631927, relative delta: -118.057730
|
||||
|
||||
[-] Mean test for lognormal(-0.195240, 4.883106) NOT passed.
|
||||
Mean of lognormal(-0.195240, 4.883106): 87151.733198, vs expected mean: 123886.818303
|
||||
delta: -36735.085104, relative delta: -0.421507
|
||||
[-] Std test for lognormal(-0.195240, 4.883106) NOT passed.
|
||||
Std of lognormal(-0.195240, 4.883106): 33837426.331671, vs expected std: 18657000192.914921
|
||||
delta: -18623162766.583248, relative delta: -550.371727
|
||||
|
||||
[-] Mean test for lognormal(0.644931, 4.795860) NOT passed.
|
||||
Mean of lognormal(0.644931, 4.795860): 125053.904456, vs expected mean: 188163.894101
|
||||
delta: -63109.989645, relative delta: -0.504662
|
||||
[-] Std test for lognormal(0.644931, 4.795860) NOT passed.
|
||||
Std of lognormal(0.644931, 4.795860): 39976300.711166, vs expected std: 18577298706.170452
|
||||
delta: -18537322405.459286, relative delta: -463.707799
|
||||
```
|
||||
|
||||
What is happening in this case is that you are taking a normal, like `normal(-0.195240, 4.883106)`, and you are exponentiating it to arrive at a lognormal. But `normal(-0.195240, 4.883106)` is going to have some non-insignificant weight on, say, 18. But `exp(18) = 39976300`, and points like it are going to end up a nontrivial amount to the analytical mean and standard deviation, even though they have little probability mass.
|
||||
|
||||
The reader can also check that for more plausible real-world values, like those fitting a lognormal to a really wide 90% confidence interval from 10 to 10k, errors aren't egregious:
|
||||
|
||||
```
|
||||
[x] Mean test for to(10.000000, 10000.000000) PASSED
|
||||
[-] Std test for to(10.000000, 10000.000000) NOT passed.
|
||||
Std of to(10.000000, 10000.000000): 23578.091775, vs expected std: 25836.381819
|
||||
delta: -2258.290043, relative delta: -0.095779
|
||||
```
|
||||
|
||||
Overall, I would caution that if you really care about the very far tails of distributions, you might want to instead use tools which can do some of the analytical manipulations for you, like the original Squiggle, Simple Squiggle (both linked below), or even doing lognormal multiplication by hand, relying on the fact that two lognormals multiplied together result in another lognormal with known shape.
|
||||
|
||||
In fact, squiggle.c does have a few functions for algebraic manipulations of simple distributions at the end of squiggle.c. But these are pretty rudimentary, and I don't know whether I'll end up expanding or deleting them.
|
||||
|
||||
### Compiler warnings
|
||||
|
||||
#### Harsh compilation
|
||||
|
||||
By default, I've enabled -Wall -Wextra -Wdouble-promotion -Wconversion. However, these produce some false positive warnings, which I've dealt with through:
|
||||
|
||||
- For conversion: Explicit casts, particularly from int to size_t when calling malloc.
|
||||
- For dealing with unused variables: Using an UNUSED macro. If you don't like that approach, you could add -Wno-unused-parameter to your makefile and remove the macro and its usage.
|
||||
|
||||
Some resources on compiler flags: [1](https://nullprogram.com/blog/2023/04/29/), [2](https://news.ycombinator.com/item?id=7371806)
|
||||
|
||||
#### Results of running clang-tidy
|
||||
|
||||
clang-tidy is a utility to detect common errors in C/C++. You can run it with:
|
||||
|
||||
```
|
||||
make tidy
|
||||
```
|
||||
|
||||
So far in the history of this program it has emitted:
|
||||
- One false-positive warning about an issue I'd already taken care of (so I've suppressed the warning)
|
||||
- a warning about an unused variable
|
||||
|
||||
I think this is good news in terms of making me more confident that this simple library is correct :).
|
||||
|
||||
### Division between core functions and squiggle_more expansions
|
||||
|
||||
|
@ -246,7 +356,138 @@ That said, I found adding parallelism to be an interesting an engaging task. Mos
|
|||
|
||||
`squiggle_more.c` has some functions to do some simple algebra manipulations: sums of normals and products of lognormals. You can see some example usage [here](examples/more/07_algebra/example.c) and [here](examples/more/08_algebra_and_conversion/example.c).
|
||||
|
||||
## Licensing
|
||||
#### Extra: cdf auxiliary functions
|
||||
|
||||
This project is released under the MIT license, a permissive open-source license. You can see it in the LICENSE.txt file.
|
||||
I provide some auxiliary functions that take a cdf, and return a sample from the distribution produced by that cdf. This might make it easier to program models, at the cost of a 20x to 60x slowdown in the parts of the code that use it.
|
||||
|
||||
I don't have any immediate plans or reason to delete these functions, but I probably will, since they are too slow/inefficient for my taste.
|
||||
|
||||
#### Extra: Error propagation vs exiting on error
|
||||
|
||||
The process of taking a cdf and returning a sample might fail, e.g., it's a Newton method which might fail to converge because of cdf artifacts. The cdf itself might also fail, e.g., if a distribution only accepts a range of parameters, but is fed parameters outside that range.
|
||||
|
||||
This library provides two approaches:
|
||||
|
||||
1. Print the line and function in which the error occurred, then exit on error
|
||||
2. In situations where there might be an error, return a struct containing either the correct value or an error message:
|
||||
|
||||
```C
|
||||
struct box {
|
||||
int empty;
|
||||
double content;
|
||||
char* error_msg;
|
||||
};
|
||||
```
|
||||
|
||||
The first approach produces terser programs but might not scale. The second approach seems like it could lead to more robust programs, but is more verbose.
|
||||
|
||||
Behaviour on error can be toggled by the `EXIT_ON_ERROR` variable. This library also provides a convenient macro, `PROCESS_ERROR`, to make error handling in either case much terser—see the usage in example 4 in the examples/ folder.
|
||||
|
||||
Overall, I'd describe the error handling capabilities of this library as pretty rudimentary. For example, this program might fail in surprising ways if you ask for a lognormal with negative standard deviation, because I haven't added error checking for that case yet.
|
||||
|
||||
### Other gotchas
|
||||
|
||||
- Even though the C standard is ambiguous about this, this code assumes that doubles are 64 bit precision (otherwise the xorshift code should be different).
|
||||
|
||||
## Related projects
|
||||
|
||||
- [Squiggle](https://www.squiggle-language.com/)
|
||||
- [SquigglePy](https://github.com/rethinkpriorities/squigglepy)
|
||||
- [Simple Squiggle](https://nunosempere.com/blog/2022/04/17/simple-squiggle/)
|
||||
- [time to BOTEC](https://github.com/NunoSempere/time-to-botec)
|
||||
- [Find a beta distribution that fits your desired confidence interval](https://nunosempere.com/blog/2023/03/15/fit-beta/)
|
||||
|
||||
## Roadmap
|
||||
|
||||
### To do
|
||||
|
||||
- [ ] Come up with a better headline example; fermi paradox paper is too complicated
|
||||
- [ ] Post on suckless subreddit
|
||||
- [ ] Drive in a few more real-life applications
|
||||
- [ ] US election modelling?
|
||||
- [ ] Look into using size_t instead of int for sample numbers
|
||||
- [ ] Reorganize code a little bit to reduce usage of gcc's nested functions
|
||||
|
||||
### Done
|
||||
|
||||
- [x] Document print stats
|
||||
- [x] Document rudimentary algebra manipulations for normal/lognormal
|
||||
- [x] Think through whether to delete cdf => samples function => not for now
|
||||
- [x] Think through whether to:
|
||||
- simplify and just abort on error
|
||||
- complexify and use boxes for everything
|
||||
- leave as is
|
||||
- [x] Offer both options
|
||||
- [x] Add more functions to do algebra and get the 90% c.i. of normals, lognormals, betas, etc.
|
||||
- Think through which of these make sense.
|
||||
- [x] Systematize references
|
||||
- [x] Think through seed initialization
|
||||
- [x] Document parallelism
|
||||
- [x] Document confidence intervals
|
||||
- [x] Add example for only one sample
|
||||
- [x] Add example for many samples
|
||||
- [x] Use gcc extension to define functions nested inside main.
|
||||
- [x] Chain various `sample_mixture` functions
|
||||
- [x] Add beta distribution
|
||||
- See <https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution> for a faster method.
|
||||
- [x] Use OpenMP for acceleration
|
||||
- [x] Add function to get sample when given a cdf
|
||||
- [x] Don't have a single header file.
|
||||
- [x] Structure project a bit better
|
||||
- [x] Simplify `PROCESS_ERROR` macro
|
||||
- [x] Add README
|
||||
- [x] Schema: a function which takes a sample and manipulates it,
|
||||
- [x] and at the end, an array of samples.
|
||||
- [x] Explain boxes
|
||||
- [x] Explain nested functions
|
||||
- [x] Explain exit on error
|
||||
- [x] Explain individual examples
|
||||
- [x] Rename functions to something more self-explanatory, e.g,. `sample_unit_normal`.
|
||||
- [x] Add summarization functions: mean, std
|
||||
- [x] Add sampling from a gamma distribution
|
||||
- https://dl.acm.org/doi/pdf/10.1145/358407.358414
|
||||
- [x] Explain correlated samples
|
||||
- [x] Test summary statistics for each of the distributions.
|
||||
- [x] For uniform
|
||||
- [x] For normal
|
||||
- [x] For lognormal
|
||||
- [x] For lognormal (to syntax)
|
||||
- [x] For beta distribution
|
||||
- [x] Clarify gamma/standard gamma
|
||||
- [x] Add efficient sampling from a beta distribution
|
||||
- https://dl.acm.org/doi/10.1145/358407.358414
|
||||
- https://link.springer.com/article/10.1007/bf02293108
|
||||
- https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution
|
||||
- https://github.com/numpy/numpy/blob/5cae51e794d69dd553104099305e9f92db237c53/numpy/random/src/distributions/distributions.c
|
||||
- [x] Pontificate about lognormal tests
|
||||
- [x] Give warning about sampling-based methods.
|
||||
- [x] Have some more complicated & realistic example
|
||||
- [x] Add summarization functions: 90% ci (or all c.i.?)
|
||||
- [x] Link to the examples in the examples section.
|
||||
- [x] Add a few functions for doing simple algebra on normals, and lognormals
|
||||
- [x] Add prototypes
|
||||
- [x] Use named structs
|
||||
- [x] Add to header file
|
||||
- [x] Provide example algebra
|
||||
- [x] Add conversion between 90% ci and parameters.
|
||||
- [x] Use that conversion in conjunction with small algebra.
|
||||
- [x] Consider ergonomics of using ci instead of c_i
|
||||
- [x] use named struct instead
|
||||
- [x] demonstrate and document feeding a struct directly to a function; my_function((struct c_i){.low = 1, .high = 2});
|
||||
- [x] Move to own file? Or signpost in file? => signposted in file.
|
||||
- [x] Write twitter thread: now [here](https://twitter.com/NunoSempere/status/1707041153210564959); retweets appreciated.
|
||||
- [x] Write better confidence interval code that:
|
||||
- Gets number of samples as an input
|
||||
- Gets either a sampler function or a list of samples
|
||||
- is O(n), not O(nlog(n))
|
||||
- Parallelizes stuff
|
||||
|
||||
### Discarded
|
||||
|
||||
- [ ] ~~Disambiguate sample_laplace--successes vs failures || successes vs total trials as two distinct and differently named functions~~
|
||||
- [ ] ~~Support all distribution functions in <https://www.squiggle-language.com/docs/Api/Dist>~~
|
||||
- [ ] ~~Add a custom preprocessor to allow simple nested functions that don't rely on local scope?~~
|
||||
- [ ] ~~Add tests in Stan?~~
|
||||
- [ ] ~~Test results for lognormal manipulations~~
|
||||
- [ ] ~~Consider desirability of defining shortcuts for algebra functions. Adds a level of magic, though.~~
|
||||
- [ ] ~~Think about whether to write a simple version of this for [uxn](https://100r.co/site/uxn.html), a minimalist portable programming stack which, sadly, doesn't have doubles (64 bit floats)~~
|
||||
|
|
99
ROADMAP.md
99
ROADMAP.md
|
@ -1,99 +0,0 @@
|
|||
# Roadmap
|
||||
|
||||
## To do
|
||||
|
||||
- [x] Big refactor
|
||||
- [ ] Come up with a better headline example; fermi paradox paper is too complicated
|
||||
- [x] Make README.md less messy
|
||||
- [ ] Give examples of new functions
|
||||
- [ ] Reference commit with cdf functions, even though deleted
|
||||
- [ ] Post on suckless subreddit
|
||||
- [ ] Drive in a few more real-life applications
|
||||
- [ ] US election modelling?
|
||||
- [ ] Look into using size_t instead of int for sample numbers
|
||||
- [ ] Reorganize code a little bit to reduce usage of gcc's nested functions
|
||||
- [ ] Rename examples
|
||||
|
||||
## Done
|
||||
|
||||
- [x] Document print stats
|
||||
- [x] Document rudimentary algebra manipulations for normal/lognormal
|
||||
- [x] Think through whether to delete cdf => samples function => not for now
|
||||
- [x] Think through whether to:
|
||||
- simplify and just abort on error
|
||||
- complexify and use boxes for everything
|
||||
- leave as is
|
||||
- [x] Offer both options
|
||||
- [x] Add more functions to do algebra and get the 90% c.i. of normals, lognormals, betas, etc.
|
||||
- Think through which of these make sense.
|
||||
- [x] Systematize references
|
||||
- [x] Think through seed initialization
|
||||
- [x] Document parallelism
|
||||
- [x] Document confidence intervals
|
||||
- [x] Add example for only one sample
|
||||
- [x] Add example for many samples
|
||||
- [x] Use gcc extension to define functions nested inside main.
|
||||
- [x] Chain various `sample_mixture` functions
|
||||
- [x] Add beta distribution
|
||||
- See <https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution> for a faster method.
|
||||
- [x] Use OpenMP for acceleration
|
||||
- [x] Add function to get sample when given a cdf
|
||||
- [x] Don't have a single header file.
|
||||
- [x] Structure project a bit better
|
||||
- [x] Simplify `PROCESS_ERROR` macro
|
||||
- [x] Add README
|
||||
- [x] Schema: a function which takes a sample and manipulates it,
|
||||
- [x] and at the end, an array of samples.
|
||||
- [x] Explain boxes
|
||||
- [x] Explain nested functions
|
||||
- [x] Explain exit on error
|
||||
- [x] Explain individual examples
|
||||
- [x] Rename functions to something more self-explanatory, e.g,. `sample_unit_normal`.
|
||||
- [x] Add summarization functions: mean, std
|
||||
- [x] Add sampling from a gamma distribution
|
||||
- https://dl.acm.org/doi/pdf/10.1145/358407.358414
|
||||
- [x] Explain correlated samples
|
||||
- [x] Test summary statistics for each of the distributions.
|
||||
- [x] For uniform
|
||||
- [x] For normal
|
||||
- [x] For lognormal
|
||||
- [x] For lognormal (to syntax)
|
||||
- [x] For beta distribution
|
||||
- [x] Clarify gamma/standard gamma
|
||||
- [x] Add efficient sampling from a beta distribution
|
||||
- https://dl.acm.org/doi/10.1145/358407.358414
|
||||
- https://link.springer.com/article/10.1007/bf02293108
|
||||
- https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution
|
||||
- https://github.com/numpy/numpy/blob/5cae51e794d69dd553104099305e9f92db237c53/numpy/random/src/distributions/distributions.c
|
||||
- [x] Pontificate about lognormal tests
|
||||
- [x] Give warning about sampling-based methods.
|
||||
- [x] Have some more complicated & realistic example
|
||||
- [x] Add summarization functions: 90% ci (or all c.i.?)
|
||||
- [x] Link to the examples in the examples section.
|
||||
- [x] Add a few functions for doing simple algebra on normals, and lognormals
|
||||
- [x] Add prototypes
|
||||
- [x] Use named structs
|
||||
- [x] Add to header file
|
||||
- [x] Provide example algebra
|
||||
- [x] Add conversion between 90% ci and parameters.
|
||||
- [x] Use that conversion in conjunction with small algebra.
|
||||
- [x] Consider ergonomics of using ci instead of c_i
|
||||
- [x] use named struct instead
|
||||
- [x] demonstrate and document feeding a struct directly to a function; my_function((struct c_i){.low = 1, .high = 2});
|
||||
- [x] Move to own file? Or signpost in file? => signposted in file.
|
||||
- [x] Write twitter thread: now [here](https://twitter.com/NunoSempere/status/1707041153210564959); retweets appreciated.
|
||||
- [x] Write better confidence interval code that:
|
||||
- Gets number of samples as an input
|
||||
- Gets either a sampler function or a list of samples
|
||||
- is O(n), not O(nlog(n))
|
||||
- Parallelizes stuff
|
||||
|
||||
## Discarded
|
||||
|
||||
- [ ] ~~Disambiguate sample_laplace--successes vs failures || successes vs total trials as two distinct and differently named functions~~
|
||||
- [ ] ~~Support all distribution functions in <https://www.squiggle-language.com/docs/Api/Dist>~~
|
||||
- [ ] ~~Add a custom preprocessor to allow simple nested functions that don't rely on local scope?~~
|
||||
- [ ] ~~Add tests in Stan?~~
|
||||
- [ ] ~~Test results for lognormal manipulations~~
|
||||
- [ ] ~~Consider desirability of defining shortcuts for algebra functions. Adds a level of magic, though.~~
|
||||
- [ ] ~~Think about whether to write a simple version of this for [uxn](https://100r.co/site/uxn.html), a minimalist portable programming stack which, sadly, doesn't have doubles (64 bit floats)~~
|
Binary file not shown.
BIN
examples/more/01_sample_from_cdf/example
Executable file
BIN
examples/more/01_sample_from_cdf/example
Executable file
Binary file not shown.
102
examples/more/01_sample_from_cdf/example.c
Normal file
102
examples/more/01_sample_from_cdf/example.c
Normal file
|
@ -0,0 +1,102 @@
|
|||
#include "../../../squiggle.h"
|
||||
#include "../../../squiggle_more.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
#define NUM_SAMPLES 1000000
|
||||
|
||||
// Example cdf
|
||||
double cdf_uniform_0_1(double x)
|
||||
{
|
||||
if (x < 0) {
|
||||
return 0;
|
||||
} else if (x > 1) {
|
||||
return 1;
|
||||
} else {
|
||||
return x;
|
||||
}
|
||||
}
|
||||
|
||||
double cdf_squared_0_1(double x)
|
||||
{
|
||||
if (x < 0) {
|
||||
return 0;
|
||||
} else if (x > 1) {
|
||||
return 1;
|
||||
} else {
|
||||
return x * x;
|
||||
}
|
||||
}
|
||||
|
||||
double cdf_normal_0_1(double x)
|
||||
{
|
||||
double mean = 0;
|
||||
double std = 1;
|
||||
return 0.5 * (1 + erf((x - mean) / (std * sqrt(2)))); // erf from math.h
|
||||
}
|
||||
|
||||
// Some testers
|
||||
void test_inverse_cdf_double(char* cdf_name, double cdf_double(double))
|
||||
{
|
||||
box result = inverse_cdf_double(cdf_double, 0.5);
|
||||
if (result.empty) {
|
||||
printf("Inverse for %s not calculated\n", cdf_name);
|
||||
exit(1);
|
||||
} else {
|
||||
printf("Inverse of %s at %f is: %f\n", cdf_name, 0.5, result.content);
|
||||
}
|
||||
}
|
||||
|
||||
void test_and_time_sampler_double(char* cdf_name, double cdf_double(double), uint64_t* seed)
|
||||
{
|
||||
printf("\nGetting some samples from %s:\n", cdf_name);
|
||||
clock_t begin = clock();
|
||||
for (int i = 0; i < NUM_SAMPLES; i++) {
|
||||
box sample = sampler_cdf_double(cdf_double, seed);
|
||||
if (sample.empty) {
|
||||
printf("Error in sampler function for %s", cdf_name);
|
||||
} else {
|
||||
// printf("%f\n", sample.content);
|
||||
}
|
||||
}
|
||||
clock_t end = clock();
|
||||
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
|
||||
printf("Time spent: %f\n", time_spent);
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
// Test inverse cdf double
|
||||
test_inverse_cdf_double("cdf_uniform_0_1", cdf_uniform_0_1);
|
||||
test_inverse_cdf_double("cdf_squared_0_1", cdf_squared_0_1);
|
||||
test_inverse_cdf_double("cdf_normal_0_1", cdf_normal_0_1);
|
||||
|
||||
// Testing samplers
|
||||
// set randomness seed
|
||||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with 0
|
||||
|
||||
// Test double sampler
|
||||
test_and_time_sampler_double("cdf_uniform_0_1", cdf_uniform_0_1, seed);
|
||||
test_and_time_sampler_double("cdf_squared_0_1", cdf_squared_0_1, seed);
|
||||
test_and_time_sampler_double("cdf_normal_0_1", cdf_normal_0_1, seed);
|
||||
|
||||
// Get some normal samples using a previous approach
|
||||
printf("\nGetting some samples from sample_unit_normal\n");
|
||||
|
||||
clock_t begin_2 = clock();
|
||||
double* normal_samples = malloc(NUM_SAMPLES * sizeof(double));
|
||||
for (int i = 0; i < NUM_SAMPLES; i++) {
|
||||
normal_samples[i] = sample_unit_normal(seed);
|
||||
// printf("%f\n", normal_sample);
|
||||
}
|
||||
|
||||
clock_t end_2 = clock();
|
||||
double time_spent_2 = (double)(end_2 - begin_2) / CLOCKS_PER_SEC;
|
||||
printf("Time spent: %f\n", time_spent_2);
|
||||
|
||||
free(seed);
|
||||
return 0;
|
||||
}
|
BIN
examples/more/02_sample_from_cdf_beta/example
Executable file
BIN
examples/more/02_sample_from_cdf_beta/example
Executable file
Binary file not shown.
168
examples/more/02_sample_from_cdf_beta/example.c
Normal file
168
examples/more/02_sample_from_cdf_beta/example.c
Normal file
|
@ -0,0 +1,168 @@
|
|||
#include "../../../squiggle.h"
|
||||
#include "../../../squiggle_more.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
#define NUM_SAMPLES 10000
|
||||
#define STOP_BETA 1.0e-8
|
||||
#define TINY_BETA 1.0e-30
|
||||
|
||||
// Incomplete beta function
|
||||
box incbeta(double a, double b, double x)
|
||||
{
|
||||
// Descended from <https://github.com/codeplea/incbeta/blob/master/incbeta.c>,
|
||||
// <https://codeplea.com/incomplete-beta-function-c>
|
||||
// but modified to return a box struct and doubles instead of doubles.
|
||||
// [ ] to do: add attribution in README
|
||||
// Original code under this license:
|
||||
/*
|
||||
* zlib License
|
||||
*
|
||||
* Regularized Incomplete Beta Function
|
||||
*
|
||||
* Copyright (c) 2016, 2017 Lewis Van Winkle
|
||||
* http://CodePlea.com
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgement in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
if (x < 0.0 || x > 1.0) {
|
||||
return PROCESS_ERROR("x out of bounds [0, 1], in function incbeta");
|
||||
}
|
||||
|
||||
/*The continued fraction converges nicely for x < (a+1)/(a+b+2)*/
|
||||
if (x > (a + 1.0) / (a + b + 2.0)) {
|
||||
box symmetric_incbeta = incbeta(b, a, 1.0 - x);
|
||||
if (symmetric_incbeta.empty) {
|
||||
return symmetric_incbeta; // propagate error
|
||||
} else {
|
||||
box result = {
|
||||
.empty = 0,
|
||||
.content = 1 - symmetric_incbeta.content
|
||||
};
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/*Find the first part before the continued fraction.*/
|
||||
const double lbeta_ab = lgamma(a) + lgamma(b) - lgamma(a + b);
|
||||
const double front = exp(log(x) * a + log(1.0 - x) * b - lbeta_ab) / a;
|
||||
|
||||
/*Use Lentz's algorithm to evaluate the continued fraction.*/
|
||||
double f = 1.0, c = 1.0, d = 0.0;
|
||||
|
||||
int i, m;
|
||||
for (i = 0; i <= 200; ++i) {
|
||||
m = i / 2;
|
||||
|
||||
double numerator;
|
||||
if (i == 0) {
|
||||
numerator = 1.0; /*First numerator is 1.0.*/
|
||||
} else if (i % 2 == 0) {
|
||||
numerator = (m * (b - m) * x) / ((a + 2.0 * m - 1.0) * (a + 2.0 * m)); /*Even term.*/
|
||||
} else {
|
||||
numerator = -((a + m) * (a + b + m) * x) / ((a + 2.0 * m) * (a + 2.0 * m + 1)); /*Odd term.*/
|
||||
}
|
||||
|
||||
/*Do an iteration of Lentz's algorithm.*/
|
||||
d = 1.0 + numerator * d;
|
||||
if (fabs(d) < TINY_BETA)
|
||||
d = TINY_BETA;
|
||||
d = 1.0 / d;
|
||||
|
||||
c = 1.0 + numerator / c;
|
||||
if (fabs(c) < TINY_BETA)
|
||||
c = TINY_BETA;
|
||||
|
||||
const double cd = c * d;
|
||||
f *= cd;
|
||||
|
||||
/*Check for stop.*/
|
||||
if (fabs(1.0 - cd) < STOP_BETA) {
|
||||
box result = {
|
||||
.empty = 0,
|
||||
.content = front * (f - 1.0)
|
||||
};
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
return PROCESS_ERROR("More loops needed, did not converge, in function incbeta");
|
||||
}
|
||||
|
||||
box cdf_beta(double x)
|
||||
{
|
||||
if (x < 0) {
|
||||
box result = { .empty = 0, .content = 0 };
|
||||
return result;
|
||||
} else if (x > 1) {
|
||||
box result = { .empty = 0, .content = 1 };
|
||||
return result;
|
||||
} else {
|
||||
double successes = 1, failures = (2023 - 1945);
|
||||
return incbeta(successes, failures, x);
|
||||
}
|
||||
}
|
||||
|
||||
// Some testers
|
||||
void test_inverse_cdf_box(char* cdf_name, box cdf_box(double))
|
||||
{
|
||||
box result = inverse_cdf_box(cdf_box, 0.5);
|
||||
if (result.empty) {
|
||||
printf("Inverse for %s not calculated\n", cdf_name);
|
||||
exit(1);
|
||||
} else {
|
||||
printf("Inverse of %s at %f is: %f\n", cdf_name, 0.5, result.content);
|
||||
}
|
||||
}
|
||||
|
||||
void test_and_time_sampler_box(char* cdf_name, box cdf_box(double), uint64_t* seed)
|
||||
{
|
||||
printf("\nGetting some samples from %s:\n", cdf_name);
|
||||
clock_t begin = clock();
|
||||
for (int i = 0; i < NUM_SAMPLES; i++) {
|
||||
box sample = sampler_cdf_box(cdf_box, seed);
|
||||
if (sample.empty) {
|
||||
printf("Error in sampler function for %s", cdf_name);
|
||||
} else {
|
||||
// printf("%f\n", sample.content);
|
||||
}
|
||||
}
|
||||
clock_t end = clock();
|
||||
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
|
||||
printf("Time spent: %f\n", time_spent);
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
// Test inverse cdf box
|
||||
test_inverse_cdf_box("cdf_beta", cdf_beta);
|
||||
|
||||
// Test box sampler
|
||||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with 0
|
||||
test_and_time_sampler_box("cdf_beta", cdf_beta, seed);
|
||||
// Ok, this is slower than python!!
|
||||
// Partly this is because I am using a more general algorithm,
|
||||
// which applies to any cdf
|
||||
// But I am also using absurdly precise convergence conditions.
|
||||
// This could be optimized.
|
||||
|
||||
free(seed);
|
||||
return 0;
|
||||
}
|
Binary file not shown.
|
@ -4,9 +4,9 @@
|
|||
#include <stdlib.h>
|
||||
|
||||
// Estimate functions
|
||||
double sample_beta_3_2(uint64_t* seed)
|
||||
double beta_1_2_sampler(uint64_t* seed)
|
||||
{
|
||||
return sample_beta(3.0, 2.0, seed);
|
||||
return sample_beta(1, 2.0, seed);
|
||||
}
|
||||
|
||||
int main()
|
||||
|
@ -15,16 +15,9 @@ int main()
|
|||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with 0
|
||||
|
||||
int n_samples = 1 * MILLION;
|
||||
double* xs = malloc(sizeof(double) * (size_t)n_samples);
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
xs[i] = sample_beta_3_2(seed);
|
||||
}
|
||||
|
||||
printf("\n# Stats\n");
|
||||
array_print_stats(xs, n_samples);
|
||||
printf("\n# Histogram\n");
|
||||
array_print_histogram(xs, n_samples, 23);
|
||||
ci beta_1_2_ci_90 = sampler_get_90_ci(beta_1_2_sampler, 1000000, seed);
|
||||
printf("90%% confidence interval of beta(1,2) is [%f, %f]\n", beta_1_2_ci_90.low, beta_1_2_ci_90.high);
|
||||
printf("You can check this in <https://nunosempere.com/blog/2023/03/15/fit-beta/>\n");
|
||||
|
||||
free(seed);
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -34,7 +34,7 @@ double probability_of_dying_eli(uint64_t* seed)
|
|||
return probability_of_dying;
|
||||
}
|
||||
|
||||
double sample_nuclear_model(uint64_t* seed)
|
||||
double mixture(uint64_t* seed)
|
||||
{
|
||||
double (*samplers[])(uint64_t*) = { probability_of_dying_nuno, probability_of_dying_eli };
|
||||
double weights[] = { 0.5, 0.5 };
|
||||
|
@ -47,17 +47,23 @@ int main()
|
|||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with 0
|
||||
|
||||
int n = 1 * MILLION;
|
||||
double* xs = malloc(sizeof(double) * (size_t)n);
|
||||
int n = 1000 * 1000;
|
||||
|
||||
double* mixture_result = malloc(sizeof(double) * (size_t)n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
xs[i] = sample_nuclear_model(seed);
|
||||
mixture_result[i] = mixture(seed);
|
||||
}
|
||||
|
||||
printf("\n# Stats\n");
|
||||
array_print_stats(xs, n);
|
||||
printf("\n# Histogram\n");
|
||||
array_print_90_ci_histogram(xs, n, 20);
|
||||
printf("mixture_result: [ ");
|
||||
for (int i = 0; i < 9; i++) {
|
||||
printf("%.6f, ", mixture_result[i]);
|
||||
}
|
||||
printf("... ]\n");
|
||||
|
||||
free(xs);
|
||||
ci ci_90 = sampler_get_90_ci(mixture, 1000000, seed);
|
||||
printf("mean: %f\n", array_mean(mixture_result, n));
|
||||
printf("90%% confidence interval: [%f, %f]\n", ci_90.low, ci_90.high);
|
||||
|
||||
free(mixture_result);
|
||||
free(seed);
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -27,17 +27,22 @@ int main()
|
|||
*seed = 1000; // xorshift can't start with 0
|
||||
|
||||
int n = 1000 * 1000;
|
||||
double* xs = malloc(sizeof(double) * (size_t)n);
|
||||
|
||||
double* result = malloc(sizeof(double) * (size_t)n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
xs[i] = sample_minutes_per_day_jumping_rope_needed_to_burn_10kg(seed);
|
||||
result[i] = sample_minutes_per_day_jumping_rope_needed_to_burn_10kg(seed);
|
||||
}
|
||||
|
||||
printf("## How many minutes per day do I have to jump rope to lose 10kg of fat by the end of the year?\n");
|
||||
printf("Mean: %f\n", array_mean(result, n));
|
||||
printf("A few samples: [ ");
|
||||
for (int i = 0; i < 9; i++) {
|
||||
printf("%.6f, ", result[i]);
|
||||
}
|
||||
printf("... ]\n");
|
||||
|
||||
printf("\n# Stats\n");
|
||||
array_print_stats(xs, n);
|
||||
printf("\n# Histogram\n");
|
||||
array_print_histogram(xs, n, 23);
|
||||
ci ci_90 = sampler_get_90_ci(sample_minutes_per_day_jumping_rope_needed_to_burn_10kg, 1000000, seed);
|
||||
printf("90%% confidence interval: [%f, %f]\n", ci_90.low, ci_90.high);
|
||||
|
||||
free(seed);
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -46,37 +46,41 @@ int main()
|
|||
uint64_t* seed = malloc(sizeof(uint64_t));
|
||||
*seed = 1000; // xorshift can't start with 0
|
||||
|
||||
int n_samples = 1000000;
|
||||
int num_samples = 1000000;
|
||||
|
||||
// Before a first nuclear collapse
|
||||
printf("## Before the first nuclear collapse\n");
|
||||
double* yearly_probability_nuclear_collapse_2023_samples = malloc(sizeof(double) * (size_t)n_samples);
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
ci ci_90_2023 = sampler_get_90_ci(yearly_probability_nuclear_collapse_2023, 1000000, seed);
|
||||
printf("90%% confidence interval: [%f, %f]\n", ci_90_2023.low, ci_90_2023.high);
|
||||
|
||||
double* yearly_probability_nuclear_collapse_2023_samples = malloc(sizeof(double) * (size_t)num_samples);
|
||||
for (int i = 0; i < num_samples; i++) {
|
||||
yearly_probability_nuclear_collapse_2023_samples[i] = yearly_probability_nuclear_collapse_2023(seed);
|
||||
}
|
||||
ci ci_90_2023 = array_get_90_ci(yearly_probability_nuclear_collapse_2023_samples, n_samples);
|
||||
printf("90%% confidence interval: [%f, %f]\n", ci_90_2023.low, ci_90_2023.high);
|
||||
printf("mean: %f\n", array_mean(yearly_probability_nuclear_collapse_2023_samples, num_samples));
|
||||
|
||||
// After the first nuclear collapse
|
||||
printf("\n## After the first nuclear collapse\n");
|
||||
ci ci_90_2070 = sampler_get_90_ci(yearly_probability_nuclear_collapse_after_recovery_example, 1000000, seed);
|
||||
printf("90%% confidence interval: [%f, %f]\n", ci_90_2070.low, ci_90_2070.high);
|
||||
|
||||
double* yearly_probability_nuclear_collapse_after_recovery_samples = malloc(sizeof(double) * (size_t)n_samples);
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
double* yearly_probability_nuclear_collapse_after_recovery_samples = malloc(sizeof(double) * (size_t)num_samples);
|
||||
for (int i = 0; i < num_samples; i++) {
|
||||
yearly_probability_nuclear_collapse_after_recovery_samples[i] = yearly_probability_nuclear_collapse_after_recovery_example(seed);
|
||||
}
|
||||
ci ci_90_2070 = array_get_90_ci(yearly_probability_nuclear_collapse_after_recovery_samples, 1000000);
|
||||
printf("90%% confidence interval: [%f, %f]\n", ci_90_2070.low, ci_90_2070.high);
|
||||
printf("mean: %f\n", array_mean(yearly_probability_nuclear_collapse_after_recovery_samples, num_samples));
|
||||
|
||||
// After the first nuclear collapse (antiinductive)
|
||||
printf("\n## After the first nuclear collapse (antiinductive)\n");
|
||||
double* yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples = malloc(sizeof(double) * (size_t)n_samples);
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples[i] = yearly_probability_nuclear_collapse_after_recovery_antiinductive(seed);
|
||||
}
|
||||
ci ci_90_antiinductive = array_get_90_ci(yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples, 1000000);
|
||||
ci ci_90_antiinductive = sampler_get_90_ci(yearly_probability_nuclear_collapse_after_recovery_antiinductive, 1000000, seed);
|
||||
printf("90%% confidence interval: [%f, %f]\n", ci_90_antiinductive.low, ci_90_antiinductive.high);
|
||||
|
||||
// free seeds
|
||||
double* yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples = malloc(sizeof(double) * (size_t)num_samples);
|
||||
for (int i = 0; i < num_samples; i++) {
|
||||
yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples[i] = yearly_probability_nuclear_collapse_after_recovery_antiinductive(seed);
|
||||
}
|
||||
printf("mean: %f\n", array_mean(yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples, num_samples));
|
||||
|
||||
free(yearly_probability_nuclear_collapse_2023_samples);
|
||||
free(yearly_probability_nuclear_collapse_after_recovery_samples);
|
||||
free(yearly_probability_nuclear_collapse_after_recovery_antiinductive_samples);
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -39,6 +39,8 @@ FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
|
|||
## make all
|
||||
all:
|
||||
$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 00_example_template/$(SRC) $(DEPS) -o 00_example_template/$(OUTPUT)
|
||||
$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 01_sample_from_cdf/$(SRC) $(DEPS) -o 01_sample_from_cdf/$(OUTPUT)
|
||||
$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 02_sample_from_cdf_beta/$(SRC) $(DEPS) -o 02_sample_from_cdf_beta/$(OUTPUT)
|
||||
$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 03_ci_beta/$(SRC) $(DEPS) -o 03_ci_beta/$(OUTPUT)
|
||||
$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 04_nuclear_war/$(SRC) $(DEPS) -o 04_nuclear_war/$(OUTPUT)
|
||||
$(CC) $(OPTIMIZED) $(DEBUG) $(WARN) 05_burn_10kg_fat/$(SRC) $(DEPS) -o 05_burn_10kg_fat/$(OUTPUT)
|
||||
|
|
265
squiggle_more.c
265
squiggle_more.c
|
@ -194,17 +194,6 @@ double array_get_median(double xs[], int n){
|
|||
return quickselect(median_k, xs, n);
|
||||
}
|
||||
|
||||
/* array print: potentially useful for debugging */
|
||||
void array_print(double xs[], int n)
|
||||
{
|
||||
printf("[");
|
||||
for (int i = 0; i < n - 1; i++) {
|
||||
printf("%f, ", xs[i]);
|
||||
}
|
||||
printf("%f", xs[n - 1]);
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
void array_print_stats(double xs[], int n){
|
||||
ci ci_90 = array_get_ci((ci) { .low = 0.05, .high = 0.95 }, xs, n);
|
||||
ci ci_80 = array_get_ci((ci) { .low = 0.1, .high = 0.9 }, xs, n);
|
||||
|
@ -212,15 +201,15 @@ void array_print_stats(double xs[], int n){
|
|||
double median = array_get_median(xs, n);
|
||||
double mean = array_mean(xs, n);
|
||||
double std = array_std(xs, n);
|
||||
printf("Avg: %lf\n"
|
||||
"Std: %lf\n"
|
||||
printf("Mean: %lf\n"
|
||||
" Std: %lf\n"
|
||||
" 5%%: %lf\n"
|
||||
"10%%: %lf\n"
|
||||
"25%%: %lf\n"
|
||||
"50%%: %lf\n"
|
||||
"75%%: %lf\n"
|
||||
"90%%: %lf\n"
|
||||
"95%%: %lf\n",
|
||||
" 10%%: %lf\n"
|
||||
" 25%%: %lf\n"
|
||||
" 50%%: %lf\n"
|
||||
" 75%%: %lf\n"
|
||||
" 90%%: %lf\n"
|
||||
" 95%%: %lf\n",
|
||||
mean, std, ci_90.low, ci_80.low, ci_50.low, median, ci_50.high, ci_80.high, ci_90.high);
|
||||
}
|
||||
|
||||
|
@ -293,7 +282,7 @@ void array_print_histogram(double* xs, int n_samples, int n_bins) {
|
|||
decimalPlaces = -magnitude;
|
||||
decimalPlaces = decimalPlaces > 10 ? 10 : decimalPlaces;
|
||||
}
|
||||
printf("[%*.*f, %*.*f", 4+decimalPlaces, decimalPlaces, bin_start, 4+decimalPlaces, decimalPlaces, bin_end);
|
||||
printf(" [%*.*f, %*.*f", 4+decimalPlaces, decimalPlaces, bin_start, 4+decimalPlaces, decimalPlaces, bin_end);
|
||||
char interval_delimiter = ')';
|
||||
if(i == (n_bins-1)){
|
||||
interval_delimiter = ']'; // last bucket is inclusive
|
||||
|
@ -377,12 +366,12 @@ void array_print_90_ci_histogram(double* xs, int n_samples, int n_bins){
|
|||
decimalPlaces = -magnitude;
|
||||
decimalPlaces = decimalPlaces > 10 ? 10 : decimalPlaces;
|
||||
}
|
||||
printf( "(-∞, %*.*f): %d\n", 4+decimalPlaces, decimalPlaces, min_value, below_min);
|
||||
printf( " (-∞, %*.*f): %d\n", 4+decimalPlaces, decimalPlaces, min_value, below_min);
|
||||
for (int i = 0; i < n_bins; i++) {
|
||||
double bin_start = min_value + i * bin_width;
|
||||
double bin_end = bin_start + bin_width;
|
||||
|
||||
printf("[%*.*f, %*.*f", 4+decimalPlaces, decimalPlaces, bin_start, 4+decimalPlaces, decimalPlaces, bin_end);
|
||||
printf(" [%*.*f, %*.*f", 4+decimalPlaces, decimalPlaces, bin_start, 4+decimalPlaces, decimalPlaces, bin_end);
|
||||
char interval_delimiter = ')';
|
||||
if(i == (n_bins-1)){
|
||||
interval_delimiter = ']'; // last bucket is inclusive
|
||||
|
@ -395,13 +384,30 @@ void array_print_90_ci_histogram(double* xs, int n_samples, int n_bins){
|
|||
}
|
||||
printf(" %d\n", bins[i]);
|
||||
}
|
||||
printf( "(%*.*f, +∞): %d\n", 4+decimalPlaces, decimalPlaces, max_value, above_max);
|
||||
printf( " (%*.*f, +∞): %d\n", 4+decimalPlaces, decimalPlaces, max_value, above_max);
|
||||
|
||||
// Free the allocated memory for bins
|
||||
free(bins);
|
||||
|
||||
}
|
||||
|
||||
// Replicate some of the above functions over samplers
|
||||
// However, in the future I'll delete this
|
||||
// There should be a clear boundary between working with samplers and working with an array of samples
|
||||
ci sampler_get_ci(ci interval, double (*sampler)(uint64_t*), int n, uint64_t* seed)
|
||||
{
|
||||
UNUSED(seed); // don't want to use it right now, but want to preserve ability to do so (e.g., remove parallelism from internals). Also nicer for consistency.
|
||||
double* xs = malloc((size_t)n * sizeof(double));
|
||||
sampler_parallel(sampler, xs, 16, n);
|
||||
ci result = array_get_ci(interval, xs, n);
|
||||
free(xs);
|
||||
return result;
|
||||
}
|
||||
ci sampler_get_90_ci(double (*sampler)(uint64_t*), int n, uint64_t* seed)
|
||||
{
|
||||
return sampler_get_ci((ci) { .low = 0.05, .high = 0.95 }, sampler, n, seed);
|
||||
}
|
||||
|
||||
/* Algebra manipulations */
|
||||
|
||||
#define NORMAL90CONFIDENCE 1.6448536269514727
|
||||
|
@ -452,3 +458,216 @@ ci convert_lognormal_params_to_ci(lognormal_params y)
|
|||
ci result = { .low = exp(loglow), .high = exp(loghigh) };
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Scaffolding to handle errors */
|
||||
// We will sample from an arbitrary cdf
|
||||
// and that operation might fail
|
||||
// so we build some scaffolding here
|
||||
|
||||
#define MAX_ERROR_LENGTH 500
|
||||
#define EXIT_ON_ERROR 0
|
||||
#define PROCESS_ERROR(error_msg) process_error(error_msg, EXIT_ON_ERROR, __FILE__, __LINE__)
|
||||
|
||||
typedef struct box_t {
|
||||
int empty;
|
||||
double content;
|
||||
char* error_msg;
|
||||
} box;
|
||||
|
||||
box process_error(const char* error_msg, int should_exit, char* file, int line)
|
||||
{
|
||||
if (should_exit) {
|
||||
printf("%s, @, in %s (%d)", error_msg, file, line);
|
||||
exit(1);
|
||||
} else {
|
||||
char error_msg[MAX_ERROR_LENGTH];
|
||||
snprintf(error_msg, MAX_ERROR_LENGTH, "@, in %s (%d)", file, line); // NOLINT: We are being carefull here by considering MAX_ERROR_LENGTH explicitly.
|
||||
box error = { .empty = 1, .error_msg = error_msg };
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
/* Invert an arbitrary cdf at a point */
|
||||
// Version #1:
|
||||
// - input: (cdf: double => double, p)
|
||||
// - output: Box(number|error)
|
||||
box inverse_cdf_double(double cdf(double), double p)
|
||||
{
|
||||
// given a cdf: [-Inf, Inf] => [0,1]
|
||||
// returns a box with either
|
||||
// x such that cdf(x) = p
|
||||
// or an error
|
||||
// if EXIT_ON_ERROR is set to 1, it exits instead of providing an error
|
||||
|
||||
double low = -1.0;
|
||||
double high = 1.0;
|
||||
|
||||
// 1. Make sure that cdf(low) < p < cdf(high)
|
||||
int interval_found = 0;
|
||||
while ((!interval_found) && (low > -DBL_MAX / 4) && (high < DBL_MAX / 4)) {
|
||||
// for floats, use FLT_MAX instead
|
||||
// Note that this approach is overkill
|
||||
// but it's also the *correct* thing to do.
|
||||
|
||||
int low_condition = (cdf(low) < p);
|
||||
int high_condition = (p < cdf(high));
|
||||
if (low_condition && high_condition) {
|
||||
interval_found = 1;
|
||||
} else if (!low_condition) {
|
||||
low = low * 2;
|
||||
} else if (!high_condition) {
|
||||
high = high * 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (!interval_found) {
|
||||
return PROCESS_ERROR("Interval containing the target value not found, in function inverse_cdf");
|
||||
} else {
|
||||
|
||||
int convergence_condition = 0;
|
||||
int count = 0;
|
||||
while (!convergence_condition && (count < (INT_MAX / 2))) {
|
||||
double mid = (high + low) / 2;
|
||||
int mid_not_new = (mid == low) || (mid == high);
|
||||
// double width = high - low;
|
||||
// if ((width < 1e-8) || mid_not_new){
|
||||
if (mid_not_new) {
|
||||
convergence_condition = 1;
|
||||
} else {
|
||||
double mid_sign = cdf(mid) - p;
|
||||
if (mid_sign < 0) {
|
||||
low = mid;
|
||||
} else if (mid_sign > 0) {
|
||||
high = mid;
|
||||
} else if (mid_sign == 0) {
|
||||
low = mid;
|
||||
high = mid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (convergence_condition) {
|
||||
box result = { .empty = 0, .content = low };
|
||||
return result;
|
||||
} else {
|
||||
return PROCESS_ERROR("Search process did not converge, in function inverse_cdf");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Version #2:
|
||||
// - input: (cdf: double => Box(number|error), p)
|
||||
// - output: Box(number|error)
|
||||
box inverse_cdf_box(box cdf_box(double), double p)
|
||||
{
|
||||
// given a cdf: [-Inf, Inf] => Box([0,1])
|
||||
// returns a box with either
|
||||
// x such that cdf(x) = p
|
||||
// or an error
|
||||
// if EXIT_ON_ERROR is set to 1, it exits instead of providing an error
|
||||
|
||||
double low = -1.0;
|
||||
double high = 1.0;
|
||||
|
||||
// 1. Make sure that cdf(low) < p < cdf(high)
|
||||
int interval_found = 0;
|
||||
while ((!interval_found) && (low > -DBL_MAX / 4) && (high < DBL_MAX / 4)) {
|
||||
// for floats, use FLT_MAX instead
|
||||
// Note that this approach is overkill
|
||||
// but it's also the *correct* thing to do.
|
||||
box cdf_low = cdf_box(low);
|
||||
if (cdf_low.empty) {
|
||||
return PROCESS_ERROR(cdf_low.error_msg);
|
||||
}
|
||||
|
||||
box cdf_high = cdf_box(high);
|
||||
if (cdf_high.empty) {
|
||||
return PROCESS_ERROR(cdf_low.error_msg);
|
||||
}
|
||||
|
||||
int low_condition = (cdf_low.content < p);
|
||||
int high_condition = (p < cdf_high.content);
|
||||
if (low_condition && high_condition) {
|
||||
interval_found = 1;
|
||||
} else if (!low_condition) {
|
||||
low = low * 2;
|
||||
} else if (!high_condition) {
|
||||
high = high * 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (!interval_found) {
|
||||
return PROCESS_ERROR("Interval containing the target value not found, in function inverse_cdf");
|
||||
} else {
|
||||
|
||||
int convergence_condition = 0;
|
||||
int count = 0;
|
||||
while (!convergence_condition && (count < (INT_MAX / 2))) {
|
||||
double mid = (high + low) / 2;
|
||||
int mid_not_new = (mid == low) || (mid == high);
|
||||
// double width = high - low;
|
||||
if (mid_not_new) {
|
||||
// if ((width < 1e-8) || mid_not_new){
|
||||
convergence_condition = 1;
|
||||
} else {
|
||||
box cdf_mid = cdf_box(mid);
|
||||
if (cdf_mid.empty) {
|
||||
return PROCESS_ERROR(cdf_mid.error_msg);
|
||||
}
|
||||
double mid_sign = cdf_mid.content - p;
|
||||
if (mid_sign < 0) {
|
||||
low = mid;
|
||||
} else if (mid_sign > 0) {
|
||||
high = mid;
|
||||
} else if (mid_sign == 0) {
|
||||
low = mid;
|
||||
high = mid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (convergence_condition) {
|
||||
box result = { .empty = 0, .content = low };
|
||||
return result;
|
||||
} else {
|
||||
return PROCESS_ERROR("Search process did not converge, in function inverse_cdf");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Sample from an arbitrary cdf */
|
||||
// Before: invert an arbitrary cdf at a point
|
||||
// Now: from an arbitrary cdf, get a sample
|
||||
box sampler_cdf_box(box cdf(double), uint64_t* seed)
|
||||
{
|
||||
double p = sample_unit_uniform(seed);
|
||||
box result = inverse_cdf_box(cdf, p);
|
||||
return result;
|
||||
}
|
||||
box sampler_cdf_double(double cdf(double), uint64_t* seed)
|
||||
{
|
||||
double p = sample_unit_uniform(seed);
|
||||
box result = inverse_cdf_double(cdf, p);
|
||||
return result;
|
||||
}
|
||||
double sampler_cdf_danger(box cdf(double), uint64_t* seed)
|
||||
{
|
||||
double p = sample_unit_uniform(seed);
|
||||
box result = inverse_cdf_box(cdf, p);
|
||||
if (result.empty) {
|
||||
exit(1);
|
||||
} else {
|
||||
return result.content;
|
||||
}
|
||||
}
|
||||
|
||||
/* array print: potentially useful for debugging */
|
||||
void array_print(double xs[], int n)
|
||||
{
|
||||
printf("[");
|
||||
for (int i = 0; i < n - 1; i++) {
|
||||
printf("%f, ", xs[i]);
|
||||
}
|
||||
printf("%f", xs[n - 1]);
|
||||
printf("]\n");
|
||||
}
|
||||
|
|
|
@ -17,6 +17,10 @@ void array_print_stats(double xs[], int n);
|
|||
void array_print_histogram(double* xs, int n_samples, int n_bins);
|
||||
void array_print_90_ci_histogram(double* xs, int n, int n_bins);
|
||||
|
||||
// Deprecated: get confidence intervals directly from samplers
|
||||
ci sampler_get_ci(ci interval, double (*sampler)(uint64_t*), int n, uint64_t* seed);
|
||||
ci sampler_get_90_ci(double (*sampler)(uint64_t*), int n, uint64_t* seed);
|
||||
|
||||
/* Algebra manipulations */
|
||||
|
||||
typedef struct normal_params_t {
|
||||
|
@ -34,9 +38,24 @@ lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params
|
|||
lognormal_params convert_ci_to_lognormal_params(ci x);
|
||||
ci convert_lognormal_params_to_ci(lognormal_params y);
|
||||
|
||||
/* Utilities */
|
||||
/* Error handling */
|
||||
typedef struct box_t {
|
||||
int empty;
|
||||
double content;
|
||||
char* error_msg;
|
||||
} box;
|
||||
#define MAX_ERROR_LENGTH 500
|
||||
#define EXIT_ON_ERROR 0
|
||||
#define PROCESS_ERROR(error_msg) process_error(error_msg, EXIT_ON_ERROR, __FILE__, __LINE__)
|
||||
box process_error(const char* error_msg, int should_exit, char* file, int line);
|
||||
void array_print(double* array, int length);
|
||||
|
||||
#define THOUSAND 1000
|
||||
#define MILLION 1000000
|
||||
/* Inverse cdf */
|
||||
box inverse_cdf_double(double cdf(double), double p);
|
||||
box inverse_cdf_box(box cdf_box(double), double p);
|
||||
|
||||
/* Samplers from cdf */
|
||||
box sampler_cdf_double(double cdf(double), uint64_t* seed);
|
||||
box sampler_cdf_box(box cdf(double), uint64_t* seed);
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue
Block a user