diff --git a/README.md b/README.md
index 73041d6..7b46a40 100644
--- a/README.md
+++ b/README.md
@@ -344,7 +344,9 @@ It emits one warning about something I already took care of, so by default I've
 - [x] Add a few functions for doing simple algebra on normals, and lognormals
 - [x] Add prototypes
 - [x] Use named structs
+  - [x] Add to header file
 - [ ] Test results
 - [ ] Provide example
-  - [ ] Add to headers
-  - [ ] Move to own file
+  - [ ] Add conversion between 90% c.i. and parameters.
+  - [ ] Move to own file? Or signpost in file?
+- [ ] Disambiguate sample_laplace (successes vs failures || successes vs total trials) into two distinct and differently named functions
diff --git a/squiggle.c b/squiggle.c
index a5b11bf..1e03bb2 100644
--- a/squiggle.c
+++ b/squiggle.c
@@ -7,15 +7,22 @@
 #include 
 #include 
 
+// Some error niceties; these won't be used until later
 #define MAX_ERROR_LENGTH 500
 #define EXIT_ON_ERROR 0
 #define PROCESS_ERROR(error_msg) process_error(error_msg, EXIT_ON_ERROR, __FILE__, __LINE__)
 
 const double PI = 3.14159265358979323846; // M_PI in gcc gnu99
 
+// # Key functionality
+// Define the minimum number of functions needed to do simple estimation
+// Starts here, ends at the end of the mixture function
+
 // Pseudo Random number generator
 uint64_t xorshift32(uint32_t* seed)
 {
+    // The reader isn't expected to understand this code immediately;
+    // read the linked Wikipedia page!
     // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
     // See
     // https://en.wikipedia.org/wiki/Xorshift
@@ -71,17 +78,40 @@ double sample_lognormal(double logmean, double logstd, uint64_t* seed)
     return exp(sample_normal(logmean, logstd, seed));
 }
 
+inline double sample_normal_from_95_confidence_interval(double low, double high, uint64_t* seed){
+    // Explanation of key idea:
+    // 1. We know that the 90% confidence interval of the unit normal is
+    // [-1.6448536269514722, 1.6448536269514722]
+    // see e.g.: https://stackoverflow.com/questions/20626994/how-to-calculate-the-inverse-of-the-normal-cumulative-distribution-function-in-p
+    // 2. So if we take a unit normal and multiply it by
+    // L / 1.6448536269514722, its new 90% confidence interval will be
+    // [-L, L], i.e., length 2 * L
+    // 3. Instead, if we want to get a confidence interval of length L,
+    // we should multiply the unit normal by
+    // L / (2 * 1.6448536269514722)
+    // Meaning that its standard deviation should be multiplied by that amount
+    // see: https://en.wikipedia.org/wiki/Normal_distribution?lang=en#Operations_on_a_single_normal_variable
+    // 4. So we have learnt that Normal(0, L / (2 * 1.6448536269514722))
+    // has a 90% confidence interval of length L
+    // 5. If we want a 90% confidence interval from low to high,
+    // we can set mean = (high + low)/2, the midpoint, and L = high - low,
+    // Normal([high + low]/2, [high - low]/(2 * 1.6448536269514722))
+    const double NORMAL95CONFIDENCE = 1.6448536269514722;
+    double mean = (high + low) / 2.0;
+    double std = (high - low) / (2.0 * NORMAL95CONFIDENCE);
+    return sample_normal(mean, std, seed);
+}
+
 double sample_to(double low, double high, uint64_t* seed)
 {
     // Given a (positive) 90% confidence interval,
-    // returns a sample from a lognormal
-    // with a matching 90% c.i.
-    const double NORMAL95CONFIDENCE = 1.6448536269514722;
+    // returns a sample from a lognormal with a matching 90% c.i.
+    // Key idea: If we want a lognormal with 90% confidence interval [a, b],
+    // we need only get a normal with 90% confidence interval [log(a), log(b)].
+    // Then see code for sample_normal_from_95_confidence_interval
     double loglow = logf(low);
     double loghigh = logf(high);
-    double logmean = (loglow + loghigh) / 2;
-    double logstd = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
-    return sample_lognormal(logmean, logstd, seed);
+    return exp(sample_normal_from_95_confidence_interval(loglow, loghigh, seed));
 }
 
 double sample_gamma(double alpha, uint64_t* seed)
@@ -129,13 +159,14 @@ double sample_gamma(double alpha, uint64_t* seed)
 
 double sample_beta(double a, double b, uint64_t* seed)
 {
+    // See: https://en.wikipedia.org/wiki/Gamma_distribution#Related_distributions
     double gamma_a = sample_gamma(a, seed);
     double gamma_b = sample_gamma(b, seed);
     return gamma_a / (gamma_a + gamma_b);
 }
 
 double sample_laplace(double successes, double failures, uint64_t* seed){
-    // see
+    // see
     return sample_beta(successes + 1, failures + 1, seed);
 }
 
@@ -177,8 +208,7 @@ double array_std(double* array, int length)
 // Mixture function
 double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_dists, uint64_t* seed)
 {
-    // You can see a simpler version of this function in the git history
-    // or in C-02-better-algorithm-one-thread/
+    // Sample from samplers with frequency proportional to their weights.
     double sum_weights = array_sum(weights, n_dists);
     double* cumsummed_normalized_weights = (double*)malloc(n_dists * sizeof(double));
     cumsummed_normalized_weights[0] = weights[0] / sum_weights;
@@ -203,7 +233,11 @@ double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_di
     return result;
 }
 
-// Sample from an arbitrary cdf
+// # More cool stuff
+// This is no longer necessary to do basic estimation,
+// but is still cool
+
+// ## Sample from an arbitrary cdf
 struct box {
     int empty;
     double content;
@@ -436,45 +470,52 @@ struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* se
     return result;
 }
 
+// # Small algebra system
+
 // Do algebra over lognormals and normals
-typedef struct {
+// here I discover named structs,
+// which means that I don't have to keep typing
+// struct blah all the time.
+typedef struct normal_params_t {
     double mean;
     double std;
-} normal_parameters;
+} normal_params;
-struct {
+typedef struct lognormal_params_t {
     double logmean;
     double logstd;
-} lognormal_parameters;
+} lognormal_params;
-normal_parameters algebra_sum_normals(normal_parameters a, normal_parameters b)
+normal_params algebra_sum_normals(normal_params a, normal_params b)
 {
-    normal_parameters result = {
+    normal_params result = {
         .mean = a.mean + b.mean,
         .std = sqrt((a.std * a.std) + (b.std * b.std)),
     };
     return result;
 }
 
-normal_parameters algebra_shift_normal(normal_parameters a, double shift)
+normal_params algebra_shift_normal(normal_params a, double shift)
 {
-    normal_parameters result = {
+    normal_params result = {
         .mean = a.mean + shift,
         .std = a.std,
     };
     return result;
 }
 
-lognormal_parameters algebra_product_lognormals(lognormal_parameters a, lognormal_parameters b)
+// Also add stretching
+
+lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params b)
 {
-    lognormal_parameters result = {
+    lognormal_params result = {
         .logmean = a.logmean + b.logmean,
         .logstd = sqrt((a.logstd * a.logstd) + (b.logstd * b.logstd)),
     };
     return result;
 }
 
-lognormal_parameters algebra_scale_lognormal(lognormal_parameters a, double k)
+lognormal_params algebra_scale_lognormal(lognormal_params a, double k)
 {
-    lognormal_parameters result = {
+    lognormal_params result = {
         .logmean = a.logmean + k,
         .logstd = a.logstd,
     };
diff --git a/squiggle.h b/squiggle.h
index cd1eaad..75a9fca 100644
--- a/squiggle.h
+++ b/squiggle.h
@@ -58,4 +58,24 @@ struct c_i {
 };
 struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* seed);
 
+// small algebra system
+
+typedef struct normal_params_t {
+    double mean;
+    double std;
+} normal_params;
+
+typedef struct lognormal_params_t {
+    double logmean;
+    double logstd;
+} lognormal_params;
+
+
+normal_params algebra_sum_normals(normal_params a, normal_params b);
+normal_params algebra_shift_normal(normal_params a, double shift);
+
+
+lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params b);
+lognormal_params algebra_scale_lognormal(lognormal_params a, double k);
+
 #endif
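
Aside, not part of the patch: a standalone sanity check of the c.i.-to-parameters conversion that sample_normal_from_95_confidence_interval above relies on. It is plain C with no dependency on squiggle.c; the interval [1, 10] is an arbitrary illustrative choice.

```c
// Check that mean = (high + low)/2 and std = (high - low)/(2 * 1.6448536269514722)
// place the 5th and 95th percentiles of Normal(mean, std) at low and high.
#include <stdio.h>

int main(void)
{
    const double NORMAL95CONFIDENCE = 1.6448536269514722; // 95th percentile of the unit normal
    double low = 1.0, high = 10.0;                         // target 90% confidence interval

    double mean = (high + low) / 2.0;
    double std = (high - low) / (2.0 * NORMAL95CONFIDENCE);

    printf("p5  = %f (should be %f)\n", mean - NORMAL95CONFIDENCE * std, low);
    printf("p95 = %f (should be %f)\n", mean + NORMAL95CONFIDENCE * std, high);
    return 0;
}
```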
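Also not part of the patch: a hypothetical usage sketch of the prototypes this diff adds to squiggle.h. The struct and function names come from the header above; the parameter values and the compile step (something like cc example.c squiggle.c -lm) are assumptions.

```c
// Hypothetical example using the new named structs and algebra prototypes.
#include <stdint.h>
#include <stdio.h>
#include "squiggle.h"

int main(void)
{
    // Sum of two independent normals: means add, variances add.
    normal_params a = { .mean = 1.0, .std = 2.0 };
    normal_params b = { .mean = 3.0, .std = 4.0 };
    normal_params sum = algebra_sum_normals(a, b);
    printf("sum: Normal(%f, %f)\n", sum.mean, sum.std); // Normal(4, sqrt(20))

    // Product of two independent lognormals: logmeans add, log-variances add.
    lognormal_params x = { .logmean = 0.0, .logstd = 0.5 };
    lognormal_params y = { .logmean = 1.0, .logstd = 0.5 };
    lognormal_params prod = algebra_product_lognormals(x, y);
    printf("product: Lognormal(%f, %f)\n", prod.logmean, prod.logstd);

    return 0;
}
```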