// squiggle.c/scratchpad/scratchpad.c
//
// 472 lines
// 15 KiB
// C

#include <float.h> // FLT_MAX, FLT_MIN
#include <limits.h> // INT_MAX
#include <math.h> // erf, sqrt
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <time.h>
// Set to 1 to print the error and exit(1) instead of returning an error box.
#define EXIT_ON_ERROR 0
// Size, in bytes, of the buffer that holds a formatted error message.
#define MAX_ERROR_LENGTH 500
// Number of draws taken by each timing loop.
#define NUM_SAMPLES 1000000

// Result type used throughout this file: either a value or an error.
//   empty     - 0 when `content` holds a result, 1 when the box is an error
//   content   - the computed value; only meaningful when empty == 0
//   error_msg - description of the failure when empty == 1; when produced by
//               PROCESS_ERROR it points at static storage that is overwritten
//               by the next error, so copy it if it must outlive that
struct box {
    int empty;
    float content;
    char* error_msg;
};

// Builds the error box returned by PROCESS_ERROR:
// "<msg>, in <file> (<line>)", truncated to MAX_ERROR_LENGTH - 1 characters.
//
// The text lives in a static buffer so that the pointer stays valid after the
// failing function has returned (a block-scope array would dangle). The
// message is first formatted into a scratch buffer because `msg` may itself
// point into the static buffer, e.g. when an outer function re-reports the
// error_msg of a box it received. A NULL `msg` is replaced by a placeholder.
// Not thread-safe: all errors share the one static buffer.
static struct box box_from_error(const char* msg, const char* file, int line)
{
    static char stored_msg[MAX_ERROR_LENGTH];
    char scratch[MAX_ERROR_LENGTH];

    snprintf(scratch, sizeof scratch, "%s, in %s (%d)",
        msg ? msg : "(no error message)", file, line);
    snprintf(stored_msg, sizeof stored_msg, "%s", scratch);

    struct box error = { .empty = 1, .error_msg = stored_msg };
    return error;
}

// PROCESS_ERROR(msg): report a failure from a function returning struct box.
// Expects a single string argument. Depending on EXIT_ON_ERROR it either
// prints the message and exits, or returns an error box from the *calling*
// function (the macro contains a `return`).
#define PROCESS_ERROR(...) \
    do { \
        struct box process_error_box_ = box_from_error((__VA_ARGS__), __FILE__, __LINE__); \
        if (EXIT_ON_ERROR) { \
            printf("%s", process_error_box_.error_msg); \
            exit(1); \
        } \
        return process_error_box_; \
    } while (0)
// Example cdf
// CDF of the uniform distribution on [0, 1]:
// 0 below the interval, 1 above it, and x itself inside it.
float cdf_uniform_0_1(float x)
{
    if (x < 0) {
        return 0;
    }
    return (x > 1) ? 1 : x;
}
// CDF that grows as x^2 on [0, 1]: 0 for x < 0, 1 for x > 1, x * x otherwise.
float cdf_squared_0_1(float x)
{
    float value;
    if (x > 1) {
        value = 1;
    } else if (x < 0) {
        value = 0;
    } else {
        value = x * x;
    }
    return value;
}
float cdf_normal_0_1(float x)
{
float mean = 0;
float std = 1;
return 0.5 * (1 + erf((x - mean) / (std * sqrt(2)))); // erf from math.h
}
// [x] to do: add beta.
// [x] for the cdf, use this incomplete beta function implementation, based on continuous fractions:
// <https://codeplea.com/incomplete-beta-function-c>
// <https://github.com/codeplea/incbeta>
// Convergence threshold on |1 - c*d| for the continued fraction.
#define STOP_BETA 1.0e-8
// Floor applied to |c| and |d| in Lentz's algorithm to avoid dividing by ~0.
#define TINY_BETA 1.0e-30

// Descended from <https://github.com/codeplea/incbeta/blob/master/incbeta.c>,
// but modified to return a box struct and to take/return floats instead of doubles.
// [ ] to do: add attribution in README
// Original code under this license:
/*
 * zlib License
 *
 * Regularized Incomplete Beta Function
 *
 * Copyright (c) 2016, 2017 Lewis Van Winkle
 * http://CodePlea.com
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 * claim that you wrote the original software. If you use this software
 * in a product, an acknowledgement in the product documentation would be
 * appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 * misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

// Evaluates the continued-fraction form of the regularized incomplete beta
// function with Lentz's algorithm, entirely in double precision.
// Stores the value in *out and returns 0 on success; returns 1 if the
// fraction has not met STOP_BETA after the fixed number of iterations.
static int incbeta_continued_fraction(double a, double b, double x, double* out)
{
    /*Find the first part before the continued fraction.*/
    const double lbeta_ab = lgamma(a) + lgamma(b) - lgamma(a + b);
    const double front = exp(log(x) * a + log(1.0 - x) * b - lbeta_ab) / a;

    /*Use Lentz's algorithm to evaluate the continued fraction.*/
    double f = 1.0, c = 1.0, d = 0.0;
    for (int i = 0; i <= 200; ++i) {
        const int m = i / 2;
        double numerator;
        if (i == 0) {
            numerator = 1.0; /*First numerator is 1.0.*/
        } else if (i % 2 == 0) {
            numerator = (m * (b - m) * x) / ((a + 2.0 * m - 1.0) * (a + 2.0 * m)); /*Even term.*/
        } else {
            numerator = -((a + m) * (a + b + m) * x) / ((a + 2.0 * m) * (a + 2.0 * m + 1)); /*Odd term.*/
        }

        /*Do an iteration of Lentz's algorithm.*/
        d = 1.0 + numerator * d;
        if (fabs(d) < TINY_BETA) {
            d = TINY_BETA;
        }
        d = 1.0 / d;

        c = 1.0 + numerator / c;
        if (fabs(c) < TINY_BETA) {
            c = TINY_BETA;
        }

        const double cd = c * d;
        f *= cd;

        /*Check for stop.*/
        if (fabs(1.0 - cd) < STOP_BETA) {
            *out = front * (f - 1.0);
            return 0;
        }
    }
    return 1;
}

// Regularized incomplete beta function I_x(a, b).
//   a, b - shape parameters (not validated here; the formula assumes they are positive)
//   x    - evaluation point, must lie in [0, 1]
// Returns a box holding the value, or an error box (via PROCESS_ERROR) when x
// is outside [0, 1] (NaN included) or the continued fraction fails to converge.
//
// The arithmetic is carried out in double and only the final value is narrowed
// to float: STOP_BETA is smaller than the spacing of floats around 1, so a
// float-only iteration can satisfy the stop test only when c*d rounds to
// exactly 1.
struct box incbeta(float a, float b, float x)
{
    // Written as a negated range test so that NaN is rejected as well.
    if (!(x >= 0.0 && x <= 1.0)) {
        PROCESS_ERROR("x out of bounds [0, 1], in function incbeta");
    }

    const double da = a, db = b, dx = x;

    /*The continued fraction converges nicely for x < (a+1)/(a+b+2)*/
    // Beyond that point use the symmetry I_x(a, b) = 1 - I_{1-x}(b, a).
    // It is applied once here rather than by recursion, so rounding of 1 - x
    // cannot bounce the call back and forth across the threshold.
    const int use_symmetry = dx > (da + 1.0) / (da + db + 2.0);

    double value = 0.0;
    const int failed = use_symmetry
        ? incbeta_continued_fraction(db, da, 1.0 - dx, &value)
        : incbeta_continued_fraction(da, db, dx, &value);
    if (failed) {
        PROCESS_ERROR("More loops needed, did not converge, in function incbeta");
    }

    struct box result = {
        .empty = 0,
        .content = (float)(use_symmetry ? 1.0 - value : value)
    };
    return result;
}
struct box cdf_beta(float x)
{
if (x < 0) {
struct box result = { .empty = 0, .content = 0 };
return result;
} else if (x > 1) {
struct box result = { .empty = 0, .content = 1 };
return result;
} else {
float successes = 1, failures = (2023 - 1945);
return incbeta(successes, failures, x);
}
}
// Inverse cdf at point
// Two versions of this function:
// - raw, dealing with cdfs that return floats
// - box, dealing with cdfs that return a box.
// Inverse cdf
// Inverts a cdf that returns plain floats.
//   cdf - function [-Inf, Inf] => [0, 1]
//   p   - target probability
// Returns a box with either
//   x such that cdf(x) = p, as closely as float bisection can resolve it,
//   or an error (no bracketing interval, NaN from the cdf, no convergence).
// If EXIT_ON_ERROR is set to 1, it exits instead of providing an error.
struct box inverse_cdf_float(float cdf(float), float p)
{
    // Safety net for the bisection below. The bracket roughly halves on every
    // step, so a float search should finish in far fewer steps than this.
    enum { MAX_BISECTION_STEPS = 4096 };

    float low = -1.0;
    float high = 1.0;

    // 1. Make sure that cdf(low) < p < cdf(high)
    int interval_found = 0;
    while ((!interval_found) && (low > -FLT_MAX / 4) && (high < FLT_MAX / 4)) {
        // ^ Using FLT_MIN and FLT_MAX is overkill
        // but it's also the *correct* thing to do.
        int low_condition = (cdf(low) < p);
        int high_condition = (p < cdf(high));
        if (low_condition && high_condition) {
            interval_found = 1;
        } else if (!low_condition) {
            low = low * 2;
        } else {
            // low_condition holds here, so it is high_condition that failed.
            high = high * 2;
        }
    }
    if (!interval_found) {
        PROCESS_ERROR("Interval containing the target value not found, in function inverse_cdf");
    }

    // 2. Bisect until the midpoint can no longer be told apart from an endpoint.
    for (int step = 0; step < MAX_BISECTION_STEPS; step++) {
        float mid = (high + low) / 2;
        if ((mid == low) || (mid == high)) {
            struct box result = { .empty = 0, .content = low };
            return result;
        }

        float mid_sign = cdf(mid) - p;
        if (mid_sign < 0) {
            low = mid;
        } else if (mid_sign > 0) {
            high = mid;
        } else if (mid_sign == 0) {
            // Exact hit.
            struct box result = { .empty = 0, .content = mid };
            return result;
        } else {
            // None of <, >, == holds: the cdf produced NaN. Without this
            // branch the bracket would never move and the search would hang.
            PROCESS_ERROR("cdf returned NaN, in function inverse_cdf");
        }
    }
    PROCESS_ERROR("Search process did not converge, in function inverse_cdf");
}
// Inverts a cdf that returns boxes (i.e. a cdf that can itself fail).
//   cdf_box - function [-Inf, Inf] => Box([0, 1])
//   p       - target probability
// Returns a box with either
//   x such that cdf(x) = p, as closely as float bisection can resolve it,
//   or an error (the cdf failed, no bracketing interval, NaN from the cdf,
//   or no convergence).
// If EXIT_ON_ERROR is set to 1, it exits instead of providing an error.
struct box inverse_cdf_box(struct box cdf_box(float), float p)
{
    // Safety net for the bisection below. The bracket roughly halves on every
    // step, so a float search should finish in far fewer steps than this.
    enum { MAX_BISECTION_STEPS = 4096 };

    float low = -1.0;
    float high = 1.0;

    // 1. Make sure that cdf(low) < p < cdf(high)
    int interval_found = 0;
    while ((!interval_found) && (low > -FLT_MAX / 4) && (high < FLT_MAX / 4)) {
        // ^ Using FLT_MIN and FLT_MAX is overkill
        // but it's also the *correct* thing to do.
        struct box cdf_low = cdf_box(low);
        if (cdf_low.empty) {
            PROCESS_ERROR(cdf_low.error_msg);
        }
        struct box cdf_high = cdf_box(high);
        if (cdf_high.empty) {
            // Report the failure of the call that actually failed.
            PROCESS_ERROR(cdf_high.error_msg);
        }

        int low_condition = (cdf_low.content < p);
        int high_condition = (p < cdf_high.content);
        if (low_condition && high_condition) {
            interval_found = 1;
        } else if (!low_condition) {
            low = low * 2;
        } else {
            // low_condition holds here, so it is high_condition that failed.
            high = high * 2;
        }
    }
    if (!interval_found) {
        PROCESS_ERROR("Interval containing the target value not found, in function inverse_cdf");
    }

    // 2. Bisect until the midpoint can no longer be told apart from an endpoint.
    for (int step = 0; step < MAX_BISECTION_STEPS; step++) {
        float mid = (high + low) / 2;
        if ((mid == low) || (mid == high)) {
            struct box result = { .empty = 0, .content = low };
            return result;
        }

        struct box cdf_mid = cdf_box(mid);
        if (cdf_mid.empty) {
            PROCESS_ERROR(cdf_mid.error_msg);
        }

        float mid_sign = cdf_mid.content - p;
        if (mid_sign < 0) {
            low = mid;
        } else if (mid_sign > 0) {
            high = mid;
        } else if (mid_sign == 0) {
            // Exact hit.
            struct box result = { .empty = 0, .content = mid };
            return result;
        } else {
            // None of <, >, == holds: the cdf produced NaN. Without this
            // branch the bracket would never move and the search would hang.
            PROCESS_ERROR("cdf returned NaN, in function inverse_cdf");
        }
    }
    PROCESS_ERROR("Search process did not converge, in function inverse_cdf");
}
// Some randomness functions for:
// - Sampling from a cdf
// - Benchmarking against a previous approach, which will be faster, but less general
// Pseudo-random 32-bit generator; rand_0_to_1 below scales its output to [0, 1]
// Advances the 32-bit xorshift generator whose state is stored in *seed.
// The new state is written back to *seed and also returned.
// A state of 0 maps to 0, so the seed must be non-zero to be useful.
//
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift
// Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
uint32_t xorshift32(uint32_t* seed)
{
    uint32_t state = *seed;
    state = state ^ (state << 13);
    state = state ^ (state >> 17);
    state = state ^ (state << 5);
    *seed = state;
    return state;
}
// Distribution & sampling functions
// Draws the next xorshift32 value (advancing *seed) and scales it by
// UINT32_MAX, with both operands converted to float first.
// NOTE(review): assuming IEEE-754 single precision, draws close to UINT32_MAX
// round to the same float as the denominator, so exactly 1.0 can be returned.
float rand_0_to_1(uint32_t* seed)
{
    const float draw = (float)xorshift32(seed);
    const float scale = (float)UINT32_MAX;
    return draw / scale;
}
// Sampler based on inverse cdf and randomness function
// Inverse-transform sampling for a box-returning cdf: draws p with
// rand_0_to_1(seed) and returns inverse_cdf_box(cdf, p), which is either
// the sample or an error box.
struct box sampler_box_cdf(struct box cdf(float), uint32_t* seed)
{
    const float probability = rand_0_to_1(seed);
    return inverse_cdf_box(cdf, probability);
}
// Inverse-transform sampling for a float-returning cdf: draws p with
// rand_0_to_1(seed) and returns inverse_cdf_float(cdf, p), which is either
// the sample or an error box.
struct box sampler_float_cdf(float cdf(float), uint32_t* seed)
{
    const float probability = rand_0_to_1(seed);
    return inverse_cdf_float(cdf, probability);
}
// Comparison point with raw normal sampler
const float PI = 3.14159265358979323846;

// Direct normal sampler used as a comparison point for the inverse-cdf
// samplers. Consumes two draws from rand_0_to_1 (so *seed advances twice)
// and combines them as sqrt(-2 * log(u1)) * sin(2 * PI * u2).
float sampler_normal_0_1(uint32_t* seed)
{
    const float first_draw = rand_0_to_1(seed);
    const float second_draw = rand_0_to_1(seed);
    const float radius = sqrtf(-2.0 * log(first_draw));
    return radius * sin(2 * PI * second_draw);
}
// Some testers
void test_inverse_cdf_float(char* cdf_name, float cdf_float(float))
{
struct box result = inverse_cdf_float(cdf_float, 0.5);
if (result.empty) {
printf("Inverse for %s not calculated\n", cdf_name);
exit(1);
} else {
printf("Inverse of %s at %f is: %f\n", cdf_name, 0.5, result.content);
}
}
void test_inverse_cdf_box(char* cdf_name, struct box cdf_box(float))
{
struct box result = inverse_cdf_box(cdf_box, 0.5);
if (result.empty) {
printf("Inverse for %s not calculated\n", cdf_name);
exit(1);
} else {
printf("Inverse of %s at %f is: %f\n", cdf_name, 0.5, result.content);
}
}
void test_and_time_sampler_float(char* cdf_name, float cdf_float(float), uint32_t* seed)
{
printf("\nGetting some samples from %s:\n", cdf_name);
clock_t begin = clock();
for (int i = 0; i < NUM_SAMPLES; i++) {
struct box sample = sampler_float_cdf(cdf_float, seed);
if (sample.empty) {
printf("Error in sampler function for %s", cdf_name);
} else {
// printf("%f\n", sample.content);
}
}
clock_t end = clock();
float time_spent = (float)(end - begin) / CLOCKS_PER_SEC;
printf("Time spent: %f\n", time_spent);
}
void test_and_time_sampler_box(char* cdf_name, struct box cdf_box(float), uint32_t* seed)
{
printf("\nGetting some samples from %s:\n", cdf_name);
clock_t begin = clock();
for (int i = 0; i < NUM_SAMPLES; i++) {
struct box sample = sampler_box_cdf(cdf_box, seed);
if (sample.empty) {
printf("Error in sampler function for %s", cdf_name);
} else {
// printf("%f\n", sample.content);
}
}
clock_t end = clock();
float time_spent = (float)(end - begin) / CLOCKS_PER_SEC;
printf("Time spent: %f\n", time_spent);
}
// Driver: checks the inverse-cdf routines at p = 0.5, then times the
// inverse-cdf samplers against the direct normal sampler.
int main()
{
    // Test inverse cdf float
    test_inverse_cdf_float("cdf_uniform_0_1", cdf_uniform_0_1);
    test_inverse_cdf_float("cdf_squared_0_1", cdf_squared_0_1);
    test_inverse_cdf_float("cdf_normal_0_1", cdf_normal_0_1);

    // Test inverse cdf box
    test_inverse_cdf_box("cdf_beta", cdf_beta);

    // Testing samplers
    // set randomness seed
    // The state is a single word, so it lives on the stack: no allocation
    // that could fail and nothing to free.
    uint32_t seed_state = 1000; // xorshift can't start with 0
    uint32_t* seed = &seed_state;

    // Test float sampler
    test_and_time_sampler_float("cdf_uniform_0_1", cdf_uniform_0_1, seed);
    test_and_time_sampler_float("cdf_squared_0_1", cdf_squared_0_1, seed);
    test_and_time_sampler_float("cdf_normal_0_1", cdf_normal_0_1, seed);

    // Get some normal samples using a previous approach
    printf("\nGetting some samples from sampler_normal_0_1\n");
    clock_t begin_2 = clock();
    for (int i = 0; i < NUM_SAMPLES; i++) {
        float normal_sample = sampler_normal_0_1(seed);
        (void)normal_sample; // only the time taken matters here
        // printf("%f\n", normal_sample);
    }
    clock_t end_2 = clock();
    float time_spent_2 = (float)(end_2 - begin_2) / CLOCKS_PER_SEC;
    printf("Time spent: %f\n", time_spent_2);

    // Test box sampler
    test_and_time_sampler_box("cdf_beta", cdf_beta, seed);

    // Ok, this is slower than python!!
    // Partly this is because I am using a more general algorithm,
    // which applies to any cdf
    // But I am also using very strict convergence conditions.
    // This could be optimized.
    return 0;
}