Compare commits


No commits in common. "3b29ad7e4534fa397136204d41c2823bd9f96df4" and "b624ab46d790ba3833186e4878959dedd252a2cb" have entirely different histories.

60 changed files with 211 additions and 3494 deletions

View File

@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=samples.c
OUTPUT=samples
## Dependencies
DEPS='gsl'
## Flags
INCS=`pkg-config --cflags ${DEPS}`
LIBS=`pkg-config --libs ${DEPS}`
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
FAST=-Ofast
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS)
fast: $(SRC)
$(CC) $(FAST) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
echo "Increasing stack size limit, because we are dealing with 1M samples"
# ulimit: increase stack size limit
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.
# Then run the program
ulimit -Ss 256000 && ./$(OUTPUT)
# Old:
# Link libraries, for good measure
# LD_LIBRARY_PATH=/usr/local/lib
# export LD_LIBRARY_PATH

Binary file not shown.

View File

@ -1,157 +0,0 @@
#include <gsl/gsl_randist.h>
#include <gsl/gsl_rng.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 1000000
/*
* For very high values of N, you will want to increase the maximum stack size, otherwise you will suffer a segmentation fault
* In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs
* And confirm it with $ ulimit -a
*/
/* Helpers */
void print(double* ys)
{
for (int i = 0; i < N; i++) {
printf("%f\n", ys[i]);
}
printf("\n");
}
void fill(double* ys, float f)
{
for (int i = 0; i < N; i++) {
ys[i] = f;
}
}
double sum(double* ps, int n)
{
double result = 0;
for (int i = 0; i < n; i++) {
result += ps[i];
}
return (result);
}
void cumsum(double* ps, double* rs, int n)
{
double counter = 0;
for (int i = 0; i < n; i++) {
counter += ps[i];
rs[i] = counter;
}
}
/* Distributions*/
void normal(gsl_rng* r, double* ys, double mean, double std)
{
for (int i = 0; i < N; i++) {
ys[i] = mean + gsl_ran_gaussian(r, std);
}
}
void lognormal(gsl_rng* r, double* ys, double zeta, double sigma)
{
for (int i = 0; i < N; i++) {
ys[i] = gsl_ran_lognormal(r, zeta, sigma);
}
}
void to(gsl_rng* r, double* ys, double low, double high)
{
double normal95confidencePoint = 1.6448536269514722;
double log_low = log(low);
double log_high = log(high);
double zeta = (log_low + log_high) / 2;
double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint);
lognormal(r, ys, zeta, sigma);
}
/* Mixture of distributions */
void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* results)
{
/* Get cummulative, normalized weights */
double sum_weights = sum(weights, n);
double normalized_weights[n];
for (int i = 0; i < n; i++) {
normalized_weights[i] = weights[i] / sum_weights;
}
double cummulative_weights[n];
cumsum(normalized_weights, cummulative_weights, n);
/* Get N samples, drawn from the different distributions in proportion to their weights. */
for (int i = 0; i < N; i++) {
double p_1 = gsl_rng_uniform(r);
double p_2 = gsl_rng_uniform(r);
int index_found = 0;
int index_counter = 0;
while ((index_found == 0) && (index_counter < n)) {
if (p_1 < cummulative_weights[index_counter]) {
index_found = 1;
} else {
index_counter++;
}
}
if (index_found == 0) {
printf("\nThis shouldn't be able to happen");
// gsl_rng_free (r);
// abort(); // this shouldn't have happened.
} else {
int sample_index = (int)floor(p_2 * N);
results[i] = dists[index_counter][sample_index];
}
}
}
/* Main */
int main(void)
{
// Start clock
clock_t start, end;
start = clock();
/* Initialize GNU Statistical Library (GSL) stuff */
const gsl_rng_type* T;
gsl_rng* r;
// gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc(T);
/* Toy example */
/* Declare variables in play */
double p_a, p_b, p_c;
double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N];
/* Initialize variables */
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
fill(dist_none, 0);
fill(dist_one, 1);
to(r, dist_few, 1, 3);
to(r, dist_many, 2, 10);
/* Generate mixture */
int n = 4;
double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
double* dists[] = { dist_none, dist_one, dist_few, dist_many };
mixture(r, dists, weights, n, dist_mixture);
printf("%f\n", sum(dist_mixture, N) / N);
/* Clean up GSL */
gsl_rng_free(r);
// End clock
end = clock();
printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000);
/* Return success*/
return EXIT_SUCCESS;
}

View File

@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC_ONE_THREAD=./samples-one-thread.c
OUTPUT_ONE_THREAD=out/samples-one-thread
## Dependencies
# Has no dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC_ONE_THREAD)
mkdir -p out
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD)
format: $(SRC_ONE_THREAD)
$(FORMATTER) $(SRC_ONE_THREAD)
run: $(SRC_ONE_THREAD) $(OUTPUT_ONE_THREAD)
./$(OUTPUT_ONE_THREAD)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time: $(OUTPUT_ONE_THREAD)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT_ONE_THREAD); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
/bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo
debian-install-dependencies:
sudo apt-get install libomp-dev

View File

@ -1,183 +0,0 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
const float PI = 3.14159265358979323846;
#define N 1000000
//Array helpers
void array_print(float* array, int length)
{
for (int i = 0; i < length; i++) {
printf("item[%d] = %f\n", i, array[i]);
}
printf("\n");
}
void array_fill(float* array, int length, float item)
{
int i;
{
for (i = 0; i < length; i++) {
array[i] = item;
}
}
}
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
float rand_float(float to)
{
return ((float)rand() / (float)RAND_MAX) * to;
}
float ur_normal()
{
float u1 = rand_float(1.0);
float u2 = rand_float(1.0);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
inline float random_uniform(float from, float to)
{
return ((float)rand() / (float)RAND_MAX) * (to - from) + from;
}
inline float random_normal(float mean, float sigma)
{
return (mean + sigma * ur_normal());
}
inline float random_lognormal(float logmean, float logsigma)
{
return expf(random_normal(logmean, logsigma));
}
inline float random_to(float low, float high)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma);
}
void array_random_to(float* array, int length, float low, float high)
{
int i;
#pragma omp private(i)
{
#pragma omp for
for (i = 0; i < length; i++) {
array[i] = random_to(low, high);
}
}
}
void mixture(float (*samplers[])(void), float* weights, int n_dists, float* results, int results_length)
{
float sum_weights = array_sum(weights, n_dists);
float* normalized_weights = malloc(n_dists * sizeof(float));
for (int i = 0; i < n_dists; i++) {
normalized_weights[i] = weights[i] / sum_weights;
}
float* cummulative_weights = malloc(n_dists * sizeof(float));
array_cumsum(normalized_weights, cummulative_weights, n_dists);
//create var holders
float p1;
int sample_index, i, own_length;
{
for (int i = 0; i < results_length; i++) {
p1 = random_uniform(0, 1);
for (int j = 0; j < n_dists; j++) {
if (p1 < cummulative_weights[j]) {
results[i] = samplers[j]();
break;
}
}
}
}
free(normalized_weights);
free(cummulative_weights);
}
float sample_0()
{
return 0;
}
float sample_1()
{
return 1;
}
float sample_few()
{
return random_to(1, 3);
}
float sample_many()
{
return random_to(2, 10);
}
int main()
{
//initialize randomness
srand(1);
// clock_t start, end;
// start = clock();
// Toy example
// Declare variables in play
float p_a, p_b, p_c;
// printf("Max threads: %d\n", n_threads);
// omp_set_num_threads(n_threads);
// Initialize variables
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
// Generate mixture
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(void) = { sample_0, sample_1, sample_few, sample_many };
float* results = malloc(N * sizeof(float));
mixture(samplers, weights, n_dists, results, N);
printf("Sum(dist_mixture, N)/N = %f\n", array_sum(results, N) / N);
// array_print(dist_mixture[0], N);
// end = clock();
// printf("Time (ms): %f\n", ((double)(end - start)) / (CLOCKS_PER_SEC * 1000));
// ^ Will only measure how long it takes the inner main to run, not the whole program,
// including e.g., loading the program into memory or smth.
// Also CLOCKS_PER_SEC in POSIX is a constant equal to 1000000.
// See: https://stackoverflow.com/questions/10455905/why-is-clocks-per-sec-not-the-actual-number-of-clocks-per-second
return 0;
}

View File

@ -1,18 +0,0 @@
# Time to BOTEC in C
This repository contains a few implementations of a simple BOTEC (back-of-the-envelope calculation) in C:
- In the folder C-01-simple/, you can see a simple implementation, which passes large arrays around
- In the folder C-02-better-algorithm-one-thread/, you can see a better implementation, which passes around pointers to functions and is therefore more efficient (see the sketch below)
- The top level samples.c uses the algorithm improvements in C-02..., and also implements multithreading using OpenMP
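For illustration, here is a minimal sketch of the function-pointer idea; it is not the code in C-02... itself, and the names (`sampler`, `mixture_once`) are made up for this sketch. Each distribution is just a function that returns one sample, and the mixture picks which function to call, so no million-element arrays have to be passed around:
```
// Sketch of the "pass sampler functions around" approach; names are illustrative.
#include <stdio.h>
#include <stdlib.h>

typedef float (*sampler)(void); // a distribution is just a function returning one sample

static float sample_zero(void) { return 0; }
static float sample_one(void) { return 1; }

// Draw one mixture sample: pick a sampler according to cumulative, normalized weights.
static float mixture_once(sampler samplers[], float cum_weights[], int n_dists)
{
    float p = (float)rand() / (float)RAND_MAX;
    for (int i = 0; i < n_dists; i++) {
        if (p < cum_weights[i])
            return samplers[i]();
    }
    return samplers[n_dists - 1](); // p landed exactly on 1
}

int main(void)
{
    srand(1);
    sampler samplers[] = { sample_zero, sample_one };
    float cum_weights[] = { 0.6f, 1.0f }; // already cumulative and normalized
    double sum = 0;
    for (int i = 0; i < 1000000; i++)
        sum += mixture_once(samplers, cum_weights, 2);
    printf("mean: %f\n", sum / 1000000); // ~0.4 for these weights
    return EXIT_SUCCESS;
}
```
The actual implementations also normalize the weights and handle the random seed differently, but the core idea is the same.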
## To do
- [ ] Add Windows/Powershell time-measuring commands
- [ ] Add CUDA?
- [x] Added results of perf. `rand_r` seems like a big chunk of it, but I'm hesitant to use lower-quality random numbers
- [x] used xorshift instead
- [-] Use xorshift with a struct instead of a pointer? idk, could be faster for some reason? => Tested, it takes the same time.
- [x] Update repository with correct timing
- [x] Use better profiling approach to capture timing with 1M samples.
- [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used

C/hello-world/build.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/bash
gcc -std=c99 -Wall -lm hello-world.c -o hello-world

C/hello-world/hello-world.c Executable file
View File

@ -0,0 +1,7 @@
#include <stdlib.h>
#include <stdio.h>
int main(void){
printf("Hello world!\n");
return EXIT_SUCCESS;
}

View File

@ -1,106 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=samples.c
OUTPUT=out/samples
SRC_ONE_THREAD=./samples-one-thread.c
OUTPUT_ONE_THREAD=out/samples-one-thread
## Dependencies
# Has no dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
static:
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
multi:
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
OMP_NUM_THREADS=2 ./$(OUTPUT) && echo
OMP_NUM_THREADS=4 ./$(OUTPUT) && echo
OMP_NUM_THREADS=8 ./$(OUTPUT) && echo
OMP_NUM_THREADS=16 ./$(OUTPUT) && echo
## Timing
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
time-linux-fastest:
@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
# ./$(OUTPUT)
# gprof:
# gprof $(OUTPUT) gmon.out > analysis.txt
# rm gmon.out
# vim analysis.txt
# rm analysis.txt
# perf:
OMP_NUM_THREADS=16 sudo perf record $(OUTPUT)
sudo perf report
rm perf.data
## Install
debian-install-dependencies:
sudo apt-get install libomp-dev

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large

View File

@ -1,26 +0,0 @@
Samples: 884 of event 'cycles', Event count (approx.): 551167949
Overhead Command Shared Object Symbol
35.32% samples samples [.] xorshift32
14.09% samples libgomp.so.1.0.0 [.] 0x000000000001d2ea
12.04% samples libgomp.so.1.0.0 [.] 0x000000000001d132
11.53% samples samples [.] mixture._omp_fn.0
4.55% samples libm-2.31.so [.] __sin_fma
4.24% samples samples [.] rand_0_to_1
3.77% samples samples [.] random_to
3.03% samples libm-2.31.so [.] __logf_fma
1.61% samples libm-2.31.so [.] __expf_fma
1.54% samples samples [.] split_array_sum._omp_fn.0
1.38% samples samples [.] random_uniform
0.94% samples samples [.] ur_normal
0.91% samples libm-2.31.so [.] __ieee754_log_fma
0.74% samples libgomp.so.1.0.0 [.] 0x000000000001d13d
0.52% samples samples [.] sample_0
0.41% samples libm-2.31.so [.] __sqrtf_finite@GLIBC_2.15
0.38% samples samples [.] sample_1
0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d139
0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d2f5
0.22% samples [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
0.18% samples [kernel.kallsyms] [k] _raw_spin_lock_irq
0.18% samples samples [.] random_lognormal
0.17% samples libgomp.so.1.0.0 [.] 0x000000000001d2f1

View File

@ -1,251 +0,0 @@
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
const float PI = 3.14159265358979323846;
#define N 1000000
//Array helpers
void array_print(float* array, int length)
{
for (int i = 0; i < length; i++) {
printf("item[%d] = %f\n", i, array[i]);
}
printf("\n");
}
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
// Split array helpers
int split_array_get_length(int index, int total_length, int n_threads)
{
return (total_length % n_threads > index ? total_length / n_threads + 1 : total_length / n_threads);
}
void split_array_allocate(float** meta_array, int length, int divide_into)
{
int split_array_length;
for (int i = 0; i < divide_into; i++) {
split_array_length = split_array_get_length(i, length, divide_into);
meta_array[i] = malloc(split_array_length * sizeof(float));
}
}
void split_array_free(float** meta_array, int divided_into)
{
for (int i = 0; i < divided_into; i++) {
free(meta_array[i]);
}
free(meta_array);
}
float split_array_sum(float** meta_array, int length, int divided_into)
{
int i;
float output = 0;
#pragma omp parallel for reduction(+:output)
for (int i = 0; i < divided_into; i++) {
float own_partial_sum = 0;
int split_array_length = split_array_get_length(i, length, divided_into);
for (int j = 0; j < split_array_length; j++) {
own_partial_sum += meta_array[i][j];
}
output += own_partial_sum;
}
return output;
}
// Pseudo Random number generator
uint32_t xorshift32(uint32_t* seed)
{
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift
// Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *seed = x;
}
// Distribution & sampling functions
float rand_0_to_1(uint32_t* seed){
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
/*
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return ((float)(*seed = x))/((float) UINT32_MAX);
*/
// previously:
// ((float)rand_r(seed) / (float)RAND_MAX)
// and before that: rand, but it wasn't thread-safe.
// See: <https://stackoverflow.com/questions/43151361/how-to-create-thread-safe-random-number-generator-in-c-using-rand-r> for why to use rand_r:
// rand() is not thread-safe, as it relies on (shared) hidden seed.
}
float rand_float(float max, uint32_t* seed)
{
return rand_0_to_1(seed) * max;
}
float ur_normal(uint32_t* seed)
{
float u1 = rand_0_to_1(seed);
float u2 = rand_0_to_1(seed);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
float random_uniform(float from, float to, uint32_t* seed)
{
return rand_0_to_1(seed) * (to - from) + from;
}
float random_normal(float mean, float sigma, uint32_t* seed)
{
return (mean + sigma * ur_normal(seed));
}
float random_lognormal(float logmean, float logsigma, uint32_t* seed)
{
return expf(random_normal(logmean, logsigma, seed));
}
float random_to(float low, float high, uint32_t* seed)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma, seed);
}
// Mixture function
void mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, float** results, int n_threads)
{
// You can see a simpler version of this function in the git history
// or in C-02-better-algorithm-one-thread/
float sum_weights = array_sum(weights, n_dists);
float* cumsummed_normalized_weights = malloc(n_dists * sizeof(float));
cumsummed_normalized_weights[0] = weights[0]/sum_weights;
for (int i = 1; i < n_dists; i++) {
cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i]/sum_weights;
}
//create var holders
float p1;
int sample_index, i, split_array_length;
// uint32_t* seeds[n_threads];
uint32_t** seeds = malloc(n_threads * sizeof(uint32_t*));
for (uint32_t i = 0; i < n_threads; i++) {
seeds[i] = malloc(sizeof(uint32_t));
*seeds[i] = i + 1; // xorshift can't start with 0
}
#pragma omp parallel private(i, p1, sample_index, split_array_length)
{
#pragma omp for
for (i = 0; i < n_threads; i++) {
split_array_length = split_array_get_length(i, N, n_threads);
for (int j = 0; j < split_array_length; j++) {
p1 = random_uniform(0, 1, seeds[i]);
for (int k = 0; k < n_dists; k++) {
if (p1 < cumsummed_normalized_weights[k]) {
results[i][j] = samplers[k](seeds[i]);
break;
}
}
}
}
}
// free(normalized_weights);
// free(cummulative_weights);
free(cumsummed_normalized_weights);
for (uint32_t i = 0; i < n_threads; i++) {
free(seeds[i]);
}
free(seeds);
}
// Functions used for the BOTEC.
// Their type has to be the same, as we will be passing them around.
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main()
{
// Toy example
// Declare variables in play
float p_a, p_b, p_c;
int n_threads = omp_get_max_threads();
// printf("Max threads: %d\n", n_threads);
// omp_set_num_threads(n_threads);
float** dist_mixture = malloc(n_threads * sizeof(float*));
split_array_allocate(dist_mixture, N, n_threads);
// Initialize variables
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
// Generate mixture
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
mixture(samplers, weights, n_dists, dist_mixture, n_threads);
printf("Sum(dist_mixture, N)/N = %f\n", split_array_sum(dist_mixture, N, n_threads) / N);
// array_print(dist_mixture[0], N);
split_array_free(dist_mixture, n_threads);
return 0;
}

C/samples/build-and-run.sh Executable file
View File

@ -0,0 +1,15 @@
#!/bin/bash
root="$1"
gcc -std=c99 -Wall -lm "$root".c -o "$root" -I/usr/local/include -L/usr/local/lib -lgsl -lgslcblas -lm -g
# Link libraries, for good measure
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH
# Increase stack size limit
ulimit -Ss 256000
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.
# run
./samples

C/samples/build-gsl.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash
root="$1"
gcc -std=c99 -Wall -lm "$root".c -o "$root" -I/usr/local/include -L/usr/local/lib -lgsl -lgslcblas -lm -g
# Link libraries, for good measure
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH
# Increase stack size limit
ulimit -Ss 256000
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.

C/samples/samples Executable file

Binary file not shown.

C/samples/samples.c Executable file
View File

@ -0,0 +1,140 @@
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#define N 1000000
/*
* For very high values of N, you will want to increase the maximum stack size, otherwise you will suffer a segmentation fault
* In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs
* And confirm it with $ ulimit -a
*/
/* Helpers */
void print(double *ys){
for(int i=0; i<N; i++){
printf("%f\n", ys[i]);
}
printf("\n");
}
void fill(double *ys, float f){
for(int i=0; i<N; i++){
ys[i] = f;
}
}
double sum(double *ps, int n){
double result = 0;
for(int i=0; i<n; i++){
result += ps[i];
}
return(result);
}
void cumsum(double *ps, double *rs, int n){
double counter = 0;
for(int i=0; i<n; i++){
counter += ps[i];
rs[i] = counter;
}
}
/* Distributions*/
void normal(gsl_rng * r, double *ys, double mean, double std){
for(int i=0; i<N; i++){
ys[i] = mean + gsl_ran_gaussian(r, std);
}
}
void lognormal(gsl_rng * r, double *ys, double zeta, double sigma){
for(int i=0; i<N; i++){
ys[i] = gsl_ran_lognormal(r, zeta, sigma);
}
}
void to(gsl_rng * r, double *ys, double low, double high){
double normal95confidencePoint = 1.6448536269514722;
double log_low = log(low);
double log_high = log(high);
double zeta = (log_low + log_high)/2;
double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint);
lognormal(r, ys, zeta, sigma);
}
/* Mixture of distributions */
void mixture(gsl_rng * r, double *dists[], double *weights, int n, double *results){
/* Get cummulative, normalized weights */
double sum_weights = sum(weights, n);
double normalized_weights[n];
for(int i=0; i<n; i++){
normalized_weights[i] = weights[i]/sum_weights;
}
double cummulative_weights[n];
cumsum(normalized_weights, cummulative_weights, n);
/* Get N samples, drawn from the different distributions in proportion to their weights. */
for(int i=0; i<N; i++){
double p_1 = gsl_rng_uniform(r);
double p_2 = gsl_rng_uniform(r);
int index_found = 0;
int index_counter = 0;
while ((index_found == 0) && (index_counter < n) ){
if(p_1 < cummulative_weights[index_counter]){
index_found = 1;
}else{
index_counter++;
}
}
if(index_found == 0) {
printf("\nThis shouldn't be able to happen");
// gsl_rng_free (r);
// abort(); // this shouldn't have happened.
}else{
int sample_index = (int) floor(p_2 * N);
results[i] = dists[index_counter][sample_index];
}
}
}
/* Main */
int main(void){
/* Initialize GNU Statistical Library (GSL) stuff */
const gsl_rng_type * T;
gsl_rng * r;
// gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc (T);
/* Toy example */
/* Declare variables in play */
double p_a, p_b, p_c;
double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N];
/* Initialize variables */
p_a = 0.8;
p_b = 0.5;
p_c = p_a * p_b;
fill(dist_none, 0);
fill(dist_one, 1);
to(r, dist_few, 1, 3);
to(r, dist_many, 2, 10);
/* Generate mixture */
int n = 4;
double weights[] = { 1 - p_c, p_c/2, p_c/4, p_c/4 };
double *dists[] = { dist_none, dist_one, dist_few, dist_many };
mixture(r, dists, weights, n, dist_mixture);
printf("%f\n", sum(dist_mixture, N)/N);
/* Clean up GSL */
gsl_rng_free (r);
/* Return success*/
return EXIT_SUCCESS;
}

View File

@ -1,50 +0,0 @@
#include "../../squiggle.h"
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
// Estimate functions
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main(){
// set randomness seed
uint32_t* seed = malloc(sizeof(uint32_t));
*seed = 1000; // xorshift can't start with 0
float p_a = 0.8;
float p_b = 0.5;
float p_c = p_a * p_b;
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
float result_one = mixture(samplers, weights, n_dists, seed);
printf("result_one: %f\n", result_one);
}
/*
Aggregation mechanisms:
- Quantiles (requires a sort)
- Sum
- Average
- Std
*/

View File

@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=example.c
OUTPUT=example
## Dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-Ofast
# OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(MATH) -o $(OUTPUT)
sudo perf record $(OUTPUT)
sudo perf report
rm perf.data

View File

@ -1,59 +0,0 @@
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include "../../squiggle.h"
// Estimate functions
float sample_0(uint32_t* seed)
{
return 0;
}
float sample_1(uint32_t* seed)
{
return 1;
}
float sample_few(uint32_t* seed)
{
return random_to(1, 3, seed);
}
float sample_many(uint32_t* seed)
{
return random_to(2, 10, seed);
}
int main(){
// set randomness seed
uint32_t* seed = malloc(sizeof(uint32_t));
*seed = 1000; // xorshift can't start with 0
float p_a = 0.8;
float p_b = 0.5;
float p_c = p_a * p_b;
int n_dists = 4;
float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };
int n_samples = 1000000;
float* result_many = (float *) malloc(n_samples * sizeof(float));
for(int i=0; i<n_samples; i++){
result_many[i] = mixture(samplers, weights, n_dists, seed);
}
printf("result_many: [");
for(int i=0; i<100; i++){
printf("%.2f, ", result_many[i]);
}
printf("]\n");
}
/*
Aggregation mechanisms:
- Quantiles (requires a sort)
- Sum
- Average
- Std
*/

View File

@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run
# Compiler
CC=gcc
# CC=tcc # <= faster compilation
# Main file
SRC=example.c
OUTPUT=example
## Dependencies
MATH=-lm
## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-Ofast
# OPENMP=-fopenmp
## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
## make build
build: $(SRC)
$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(MATH) -o $(OUTPUT)
sudo perf record $(OUTPUT)
sudo perf report
rm perf.data

View File

@ -1,109 +0,0 @@
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
const float PI = 3.14159265358979323846;
// Pseudo Random number generator
uint32_t xorshift32(uint32_t* seed)
{
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift
// Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *seed = x;
}
// Distribution & sampling functions
float rand_0_to_1(uint32_t* seed){
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
}
float rand_float(float max, uint32_t* seed)
{
return rand_0_to_1(seed) * max;
}
float ur_normal(uint32_t* seed)
{
float u1 = rand_0_to_1(seed);
float u2 = rand_0_to_1(seed);
float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
return z;
}
float random_uniform(float from, float to, uint32_t* seed)
{
return rand_0_to_1(seed) * (to - from) + from;
}
float random_normal(float mean, float sigma, uint32_t* seed)
{
return (mean + sigma * ur_normal(seed));
}
float random_lognormal(float logmean, float logsigma, uint32_t* seed)
{
return expf(random_normal(logmean, logsigma, seed));
}
float random_to(float low, float high, uint32_t* seed)
{
const float NORMAL95CONFIDENCE = 1.6448536269514722;
float loglow = logf(low);
float loghigh = logf(high);
float logmean = (loglow + loghigh) / 2;
float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return random_lognormal(logmean, logsigma, seed);
}
// Array helpers
float array_sum(float* array, int length)
{
float output = 0.0;
for (int i = 0; i < length; i++) {
output += array[i];
}
return output;
}
void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
array_cumsummed[0] = array_to_sum[0];
for (int i = 1; i < length; i++) {
array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
}
}
// Mixture function
float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint32_t* seed)
{
// You can see a simpler version of this function in the git history
// or in C-02-better-algorithm-one-thread/
float sum_weights = array_sum(weights, n_dists);
float* cumsummed_normalized_weights = (float*) malloc(n_dists * sizeof(float));
cumsummed_normalized_weights[0] = weights[0]/sum_weights;
for (int i = 1; i < n_dists; i++) {
cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i]/sum_weights;
}
float p = random_uniform(0, 1, seed);
float result;
for (int k = 0; k < n_dists; k++) {
if (p < cumsummed_normalized_weights[k]) {
result = samplers[k](seed);
break;
}
}
free(cumsummed_normalized_weights);
return result;
}

View File

@ -1,9 +0,0 @@
- [ ] Add example for only one sample
- [ ] Add example for many samples
- [ ] Use gcc extension to define functions nested inside main.
- [ ] Use OpenMP for acceleration
- [ ] Chain various mixture functions
- [ ] Have some more complicated & realistic example
- [ ] Add summarization functions, like mean, std, 90% ci (or all c.i.?)
- [ ] Add beta distribution

View File

@ -1,15 +0,0 @@
build:
gcc xorshift-pointer.c -o out/xorshift-pointer
gcc xorshift-struct.c -o out/xorshift-struct
run-pointer:
./out/xorshift-pointer
run-struct:
./out/xorshift-struct
time-pointer:
/bin/time -f "Time: %es" ./out/xorshift-pointer && echo
time-struct:
/bin/time -f "Time: %es" ./out/xorshift-struct && echo

View File

@ -1 +0,0 @@
Using a pointer or a struct turns out to be pretty equivalent in terms of speed. I don't think there are gains to be found here, but I find the pointer implementation clearer.

View File

@ -1,41 +0,0 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
uint32_t xorshift32(uint32_t* state)
{
/* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
uint32_t x = *state;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return *state = x;
}
float rand_xorshift32(uint32_t* state){
return (float) xorshift32(state) / (float) UINT32_MAX;
}
int main(){
uint32_t** states = malloc(4 * sizeof(uint32_t*));
for(int i=0; i<4;i++){
states[i] = malloc(sizeof(uint32_t));
*states[i] = (uint32_t) i + 1;
}
for(int i=0; i<1000000000;i++){
uint32_t x = xorshift32(states[0]);
float y = rand_xorshift32(states[1]);
// printf("%u\n", x);
// printf("%f\n", y);
}
for(int i=0; i<4;i++){
free(states[i]);
}
free(states);
return 0;
}
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift

View File

@ -1,46 +0,0 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
struct xorshift32_state {
uint32_t a;
};
uint32_t xorshift32(struct xorshift32_state *state)
{
/* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
uint32_t x = state->a;
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return state->a = x;
}
float rand_xorshift32(struct xorshift32_state *state){
return (float) xorshift32(state) / (float) UINT32_MAX;
}
int main(){
struct xorshift32_state** states = malloc(sizeof(struct xorshift32_state*) * 4);
for(int i=0; i<4;i++){
states[i] = malloc(sizeof(struct xorshift32_state));
states[i]->a = (uint32_t) i + 1;
}
for(int i=0; i<1000000000; i++){
uint32_t x = xorshift32(states[0]);
float y = rand_xorshift32(states[1]);
// printf("%u\n", x);
// printf("%f\n", y);
}
for(int i=0; i<4;i++){
free(states[i]);
}
free(states);
return 0;
}

R/hello-world.R Normal file
View File

@ -0,0 +1 @@
print("Hello world")

View File

@ -48,3 +48,6 @@ weights = c((1 - p_c), p_c/2, p_c/4, p_c/4)
result = mixture(dists, weights)
mean_result = mean(result)
print(mean_result)

README.md
View File

@ -2,161 +2,49 @@
## About
This repository contains examples of very simple code to manipulate samples in various programming languages. As of now, it may be useful for checking the validity of simple estimations. The title of this repository is a pun on two meanings of "time to": "how much time does it take to do x", and "let's do x".
This repository contains examples of very simple code to manipulate samples in various programming languages, and tallies their speed and the complexity of their code. It implements this platonic estimate:
```
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b
dists = [0, 1, 1 to 3, 2 to 10]
weights = [(1 - p_c), p_c/2, p_c/4, p_c/4 ]
result = mixture(dists, weights) # should be 1M samples
mean(result)
```
I don't particularly care about the speed of this particular example, but rather think that the speed of this simple example is indicative of the speed when considering 100x or 1000x more complicated models. As of now, it may also be useful for checking the validity of simple estimations.
The title of this repository is a pun on two meanings of "time to": "how much time does it take to do x", and "let's do x". "BOTEC" stands for "back of the envelope calculation".
## Current languages
- [x] C
- [x] Javascript (NodeJS)
- [x] Squiggle
- [x] R
- [x] Python
- [x] Nim
## Comparison table
| Language | Time | Lines of code |
|-----------------------------|-----------|---------------|
| C (optimized, 16 threads) | 5ms | 249 |
| Nim | 38ms | 84 |
| Lua (LuaJIT) | 68ms | 82 |
| Lua | 278ms | 82 |
| C (naïve implementation) | 292ms | 149 |
| Javascript (NodeJS) | 732ms | 69 |
| Squiggle | 1,536s | 14* |
| SquigglePy | 1.602s | 18* |
| R | 7,000s | 49 |
| Python (CPython) | 16,641s | 56 |
Time measurements were taken with the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples. But different implementations use different algorithms and, occasionally, different time-measuring methodologies (for the C, Nim and Lua implementations, I run the program 100 times and take the mean). Their speed was also measured under different loads on my machine. So I think that these time estimates are accurate to within maybe ~2x or so.
## Performance table
With the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples:
| Language | Time |
|----------|-----------|
| C | 0m0,442s |
| Node | 0m0,732s |
| Squiggle | 0m1,536s |
| R | 0m7,000s |
| Python (CPython) | 0m16,641s |
Note that the number of lines is much shorter for Squiggle and SquigglePy because I'm just counting the lines needed to get Squiggle and SquigglePy to output a model, not the lines inside them.
## Notes
I was very surprised that Node/Squiggle code was almost as fast as the raw C code. For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org).
### Nim
I was really happy trying [Nim](https://nim-lang.org/), and as a result the Nim code is a bit more optimized and engineered:
1. It is using the fastest "danger" compilation mode.
2. It has some optimizations: I don't compute 1M samples for each dist, but instead pass functions around and compute the 1M samples at the end
3. I define the normal function from scratch, using the [Box-Muller transform](https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form) (see the sketch below).
4. I also have a version in which I define the logarithm and sine functions themselves in nim to feed into the Box-Muller method. But it is much slower.
Without 1. and 2., the nim code takes 0m0.183s instead. But I don't think that these are unfair advantages: I liked trying out nim and therefore put more love into the code, and this seems like it could be a recurring factor.
Ultimately, these optimizations were also incorporated into the C code as well.
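For reference, a minimal C sketch of that Box-Muller sampler (point 3 above); the function names here are made up for the sketch, not the ones used in the repository:
```
// Minimal Box-Muller sketch; compile with something like: gcc sketch.c -lm
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define PI 3.14159265358979323846

// Standard normal via the basic Box-Muller transform.
static double unit_normal(void)
{
    // Shift the uniforms away from 0 to avoid log(0).
    double u1 = ((double)rand() + 1.0) / ((double)RAND_MAX + 1.0);
    double u2 = ((double)rand() + 1.0) / ((double)RAND_MAX + 1.0);
    return sqrt(-2.0 * log(u1)) * sin(2.0 * PI * u2);
}

// Shift and scale to get normal(mean, sigma); exponentiate the result for a lognormal.
static double normal_sample(double mean, double sigma)
{
    return mean + sigma * unit_normal();
}

int main(void)
{
    srand(1);
    for (int i = 0; i < 3; i++)
        printf("%f\n", normal_sample(0, 1));
    return 0;
}
```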
Nim also has multithreading support, but I haven't bothered looking into it yet.
### C
The optimizations which make the final C code significantly faster than the naïve implementation are:
- To pass around pointers to functions, instead of large arrays. This is the same as in the nim implementation, but imho leads to more complex code
- To use the Box-Muller transform instead of using libraries, like in nim.
- To use multithreading support
The C code uses the `-Ofast` or `-O3` compilation flags. Initially, without using those flags and without the algorithmic improvements, the code took ~0.4 seconds to run. So I was surprised that the Node and Squiggle code were comparable to the C code. It ended up being the case that the C code could be pushed to be ~100x faster, though :)
In fact, the C code ended up being so fast that I had to measure its time by running the code 100 times in a row and dividing that amount by 100, rather than by just running it once, because running it once was too fast for /bin/time. More sophisticated profiling tools exist that could, e.g., account for how idle a machine is when running the code, but I didn't think that was worth it at this point.
And still, there are some missing optimizations, like tweaking the code to take into account cache misses. I'm not exactly sure how that would go, though.
Once the code was at 6.6ms, there was a 0.6ms gain possible by using OMP better, and a 1ms gain by using a xorshift algorithm instead of rand_r from stdlib. I think there might be further gains to be made by using OpenCL or CUDA, but I haven't gotten into how to do that yet.
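Roughly, the pattern behind those two gains (a per-thread xorshift state, so there is no shared hidden state as with `rand()`, plus an OpenMP parallel loop) looks like the sketch below. This is an illustration under the assumption that OpenMP is available, not the exact code of samples.c:
```
// Sketch: one xorshift32 state per thread, samples filled in parallel with OpenMP.
// Compile with something like: gcc -O3 -fopenmp sketch.c
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define N 1000000

static uint32_t xorshift32(uint32_t* seed)
{
    uint32_t x = *seed;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return *seed = x;
}

int main(void)
{
    float* samples = malloc(N * sizeof(float));
    #pragma omp parallel
    {
        // Each thread owns its seed; xorshift can't start at 0.
        uint32_t seed = (uint32_t)omp_get_thread_num() + 1;
        #pragma omp for
        for (int i = 0; i < N; i++)
            samples[i] = (float)xorshift32(&seed) / (float)UINT32_MAX;
    }
    double sum = 0;
    for (int i = 0; i < N; i++)
        sum += samples[i];
    printf("mean: %f\n", sum / N); // ~0.5 for uniform samples
    free(samples);
    return 0;
}
```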
Although the above paragraphs were written in the first person, the C code was written together with Jorge Sierra, who translated the algorithmic improvements from nim to it and added the initial multithreading support.
### NodeJS and Squiggle
Using [bun](https://bun.sh/) instead of node is actually a bit slower. Also, both the NodeJS and the Squiggle code use [stdlib](https://stdlib.io/) in their innards, which has a bunch of interleaved functions that make the code slower. It's possible that not using that external library could make the code faster, but at the same time, the js approach does seem to be to use external libraries whenever possible.
I am not particularly sure that the Squiggle code is actually producing 1M samples, but also have no particular plan to debug this.
### Python
For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org).
### R
R has a warm place in my heart from back in the day, and it has predefined functions to do everything. It was particularly fast to write for me, though not particularly fast to run :) However, I do recall that R does have some multithreading support; it wasn't used.
### Lua
I was also really pleased with Lua. I liked the simplicity. And coming from javascript, I appreciated that the program did not fail silently, but rather gave me useful errors, like:
```
lua samples.lua
lua: samples.lua:42: table index is nil
stack traceback:
samples.lua:42: in function 'mixture'
samples.lua:49: in main chunk
[C]: in ?
make: *** [makefile:14: run] Error 1
```
I also appreciated the speedup when using the LuaJIT interpreter. You can install it with commands similar to
```
git clone https://luajit.org/git/luajit.git
make
sudo make install
sudo ln -sf luajit-2.1.0-beta3 /usr/local/bin/luajit
```
Overall I'm thinking that a combination of lua at least for scripting and ¿nim/C/tbd? for more serious programs could be quite powerful.
### Overall thoughts
Overall I don't think that this is a fair comparison of the languages intrinsically, because I'm just differentially good at them, having chosen to put more effort into some than into others. But it is still useful to me personally, and perhaps mildly informative to others.
## Languages I may add later
- [ ] Julia (TuringML)
- [ ] Rust
- [ ] Lisp
- [ ] Go
- [ ] Zig
- [ ] Forth
- [ ] sh/bash, lol?
- [ ] OCaml
- [ ] Haskell
- [ ] OpenCL/CUDA (e.g., as in <https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/>). Seems like it would be overkill, and also the code would be way more complex. But maybe worth trying?
- [-] Stan => As far as I can tell, Stan is designed to generate samples from the posterior distribution given some data, not to create data by drawing samples from an arbitrary distribution.
- [ ] Maybe still worth reversing the process?
- ... and suggestions welcome
- Stan
## Languages added so far
- [x] Squiggle
- [x] Javascript (NodeJS)
- [x] Python (CPython)
- [x] R
- [x] C
- [x] Nim
- [x] SquigglePy
- [x] Lua
## Roadmap
The future of this project is uncertain. In most worlds, I simply forget about this repository.
To do:
- [ ] Check whether the Squiggle code is producing 1M samples. Still not too sure.
- Differentiate between initial startup time (e.g., compiling, loading environment) and runtime. This matters because startup time could be ~constant, so for larger projects only the runtime matters.
## Other similar projects
- Squigglepy: <https://github.com/rethinkpriorities/squigglepy>

js/hello-world.js Normal file
View File

@ -0,0 +1 @@
console.log("Hello world")

View File

@ -20,7 +20,7 @@
// MAIN //
var obj = ( typeof window === 'object' ) ? window : {};
var obj = ( typeof window === 'object' ) ? window : null;
// EXPORTS //

View File

@ -1 +0,0 @@
print("Hello world")

View File

@ -1,32 +0,0 @@
# Interface:
# make
# make build
# make run
# make time-linux
# make install
# make format
# make time-linux-simple
# make profile-linux
SRC=samples.lua
# INTERPETER=lua# < original
INTERPETER=luajit# < faster
run: $(SRC)
$(INTERPETER) $(SRC)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time of: $(INTERPETER) $(SRC)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(INTERPETER) $(SRC); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
/bin/time -f "Time: %es" $(INTERPETER) $(SRC) && echo
profile-linux:
@echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
@echo "Must be run as sudo"
sudo perf record $(INTERPETER) $(SRC)
sudo perf report
rm perf.data

View File

@ -1,82 +0,0 @@
-- Consts and prep
PI = 3.14159265358979323846;
NORMAL95CONFIDENCE = 1.6448536269514722;
math.randomseed(1234)
-- Random distribution functions
function sample_normal_0_1()
local u1 = math.random()
local u2 = math.random()
local result = math.sqrt(-2 * math.log(u1)) * math.sin(2 * PI * u2)
return result
end
function sample_normal(mean, sigma)
return mean + (sigma * sample_normal_0_1())
end
function sample_uniform(min, max)
return math.random() * (max - min) + min
end
function sample_lognormal(logmean, logsigma)
return math.exp(sample_normal(logmean, logsigma))
end
function sample_to(low, high)
local loglow = math.log(low);
local loghigh = math.log(high);
local logmean = (loglow + loghigh) / 2;
local logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
return sample_lognormal(logmean, logsigma, seed);
end
-- Mixture
function mixture(samplers, weights, n)
assert(#samplers == #weights)
local l = #weights
local sum_weights = 0
for i = 1, l, 1 do
-- ^ arrays start at 1
sum_weights = sum_weights + weights[i]
end
local cumsummed_normalized_weights = {}
table.insert(cumsummed_normalized_weights, weights[1]/sum_weights)
for i = 2, l, 1 do
table.insert(cumsummed_normalized_weights, cumsummed_normalized_weights[i-1] + weights[i]/sum_weights)
end
local result = {}
for i = 1, n, 1 do
r = math.random()
local i = 1
while r > cumsummed_normalized_weights[i] do
i = i+1
end
table.insert(result, samplers[i]())
end
return result
end
-- Main
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b
function sample_0() return 0 end
function sample_1() return 1 end
function sample_few() return sample_to(1, 3) end
function sample_many() return sample_to(2, 10) end
samplers = {sample_0, sample_1, sample_few, sample_many}
weights = { (1 - p_c), p_c/2, p_c/4, p_c/4 }
n = 1000000
result = mixture(samplers, weights, n)
sum = 0
for i = 1, n, 1 do
sum = sum + result[i]
-- print(result[i])
end
print(sum/n)

View File

@ -1,18 +0,0 @@
SHELL := /bin/bash ## <= required to use time
VERBOSE=--verbosity:0
build: samples.nim
nim c $(VERBOSE) samples.nim
run: samples
./samples $(VERBOSE)
examine: samples
nim c $(VERBOSE) samples.nim && time ./samples $(VERBOSE) && echo
nim c $(VERBOSE) -d:release samples.nim && time ./samples $(VERBOSE) && echo
nim c $(VERBOSE) -d:danger samples.nim && time ./samples $(VERBOSE)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time of: $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do ./samples; done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo

Binary file not shown.

View File

@ -1,4 +0,0 @@
This is a version of time-to-botec in Nim for which I define logarithms, normals & lognormals pretty much
from scratch, without relying on libraries except for random numbers, square roots, and raising a to the power of b.
This is interesting because it leads to better understanding. But it doesn't help compare the efficiency of the various languages. So I'm setting it aside for now, even though I think it was really interesting.

View File

@ -1,11 +0,0 @@
SHELL := /bin/bash
build: samples.nim
nim c --verbosity:0 samples.nim
run: samples
./samples --verbosity:0
examine: samples
# nim c --verbosity:0 --opt:speed -d:release -d:danger --checks:off samples.nim && time ./samples --verbosity:0 --checks:off
nim c -d:release samples.nim && time ./samples

Binary file not shown.

View File

@ -1,145 +0,0 @@
import std/math
import std/sugar
import std/random
import std/sequtils
randomize()
## Basic math functions
proc factorial(n: int): int =
if n == 0 or n < 0:
return 1
else:
return n * factorial(n - 1)
proc sine(x: float): float =
let n = 8
# ^ Taylor will converge really quickly
# notice that the factorial of 17 is
# already pretty gigantic
var acc = 0.0
for i in 0..n:
var k = 2*i + 1
var taylor = pow(-1, i.float) * pow(x, k.float) / factorial(k).float
acc = acc + taylor
return acc
## Log function
## <https://en.wikipedia.org/wiki/Natural_logarithm#High_precision>
## Arithmetic-geomtric mean
proc ag(x: float, y: float): float =
let n = 32 # just some high number
var a = (x + y)/2.0
var b = sqrt(x * y)
for i in 0..n:
let temp = a
a = (a+b)/2.0
b = sqrt(b*temp)
return a
## Find m such that x * 2^m > 2^precision/2
proc find_m(x:float): float =
var m = 0.0;
let precision = 64 # bits
let c = pow(2.0, precision.float / 2.0)
while x * pow(2.0, m) < c:
m = m + 1
return m
proc log(x: float): float =
let m = find_m(x)
let s = x * pow(2.0, m)
let ln2 = 0.6931471805599453
return ( PI / (2.0 * ag(1, 4.0/s)) ) - m * ln2
## Test these functions
## echo factorial(5)
## echo sine(1.0)
## echo log(0.1)
## echo log(2.0)
## echo log(3.0)
## echo pow(2.0, 32.float)
## Distribution functions
## Normal
## <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form>
proc ur_normal(): float =
let u1 = rand(1.0)
let u2 = rand(1.0)
let z = sqrt(-2.0 * log(u1)) * sine(2 * PI * u2)
return z
proc normal(mean: float, sigma: float): float =
return (mean + sigma * ur_normal())
proc lognormal(logmean: float, logsigma: float): float =
let answer = pow(E, normal(logmean, logsigma))
return answer
proc to(low: float, high: float): float =
let normal95confidencePoint = 1.6448536269514722
let loglow = log(low)
let loghigh = log(high)
let logmean = (loglow + loghigh)/2
let logsigma = (loghigh - loglow) / (2.0 * normal95confidencePoint);
return lognormal(logmean, logsigma)
## echo ur_normal()
## echo normal(10, 20)
## echo lognormal(2, 4)
## echo to(10, 90)
## Manipulate samples
proc make_samples(f: () -> float, n: int): seq[float] =
result = toSeq(1..n).map(_ => f())
return result
proc mixture(sxs: seq[seq[float]], ps: seq[float], n: int): seq[float] =
assert sxs.len == ps.len
var ws: seq[float]
var sum = 0.0
for i, p in ps:
sum = sum + p
ws.add(sum)
ws = ws.map(w => w/sum)
proc get_mixture_sample(): float =
let r = rand(1.0)
var i = ws.len - 1
for j, w in ws:
if r < w:
i = j
break
## only occasion when ^ doesn't assign i
## is when r is exactly 1
## which would correspond to choosing the last item in ws
## which is why i is initialized to ws.len - 1
let xs = sxs[i]
let l = xs.len-1
let k = rand(0..l)
return xs[k]
return toSeq(1..n).map(_ => get_mixture_sample())
## Actual model
let n = 1000000
let p_a = 0.8
let p_b = 0.5
let p_c = p_a * p_b
let weights = @[ 1.0 - p_c, p_c/2.0, p_c/4.0, p_c/4.0 ]
let fs = [ () => 0.0, () => 1.0, () => to(1.0, 3.0), () => to(2.0, 10.0) ]
let dists = fs.map(f => make_samples(f, n))
let result = mixture(dists, weights, n)
let mean_result = foldl(result, a + b, 0.0) / float(result.len)
# echo result
echo mean_result

View File

@ -1,2 +0,0 @@
build: sums.nim
nim c sums.nim

View File

@ -1,22 +0,0 @@
import std/math
proc factorial(n: int): int =
if n == 0 or n < 0:
return 1
else:
return n * factorial(n - 1)
proc sine(x: float): float =
let n = 8
# ^ Taylor will converge really quickly
# notice that the factorial of 17 is
# already pretty gigantic
var acc = 0.0
for i in 0..n:
var k = 2*i + 1
var taylor = pow(-1, i.float) * pow(x, k.float) / factorial(k).float
acc = acc + taylor
return acc
echo factorial(17)
echo sine(1.0)

View File

@ -1,84 +0,0 @@
import std/math
import std/sugar
import std/random
import std/sequtils
randomize()
## Distribution functions
## Normal
## <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form>
proc ur_normal(): float =
let u1 = rand(1.0)
let u2 = rand(1.0)
let z = sqrt(-2.0 * ln(u1)) * sin(2 * PI * u2)
return z
proc normal(mean: float, sigma: float): float =
return (mean + sigma * ur_normal())
proc lognormal(logmean: float, logsigma: float): float =
let answer = pow(E, normal(logmean, logsigma))
return answer
proc to(low: float, high: float): float =
let normal95confidencePoint = 1.6448536269514722
let loglow = ln(low)
let loghigh = ln(high)
let logmean = (loglow + loghigh)/2
let logsigma = (loghigh - loglow) / (2.0 * normal95confidencePoint)
return lognormal(logmean, logsigma)
## echo ur_normal()
## echo normal(10, 20)
## echo lognormal(2, 4)
## echo to(10, 90)
## Manipulate samples
proc mixture(fs: seq[proc (): float{.nimcall.}], ps: seq[float], n: int): seq[float] =
assert fs.len == ps.len
var ws: seq[float]
var sum = 0.0
for i, p in ps:
sum = sum + p
ws.add(sum)
ws = ws.map(w => w/sum)
var samples: seq[float]
let rs = toSeq(1..n).map(_=>rand(1.0))
for i in 0..(n-1):
let r = rs[i]
var j = ws.len - 1
for k, w in ws:
if r < w:
j = k
break
## only occasion when ^ doesn't assign j
## is when r is exactly 1
## which would correspond to choosing the last item in ws
## which is why j is initialized to ws.len - 1
let f = fs[j]
samples.add(f())
return samples
## Actual model
let n = 1000000
let p_a = 0.8
let p_b = 0.5
let p_c = p_a * p_b
let weights = @[ 1.0 - p_c, p_c/2.0, p_c/4.0, p_c/4.0 ]
let fs = @[ proc (): float = 0.0, proc (): float = 1.0, proc (): float = to(1.0, 3.0), proc (): float = to(2.0, 10.0)]
let result = mixture(fs, weights, n)
let mean_result = foldl(result, a + b, 0.0) / float(result.len)
# echo result
echo mean_result

python/hello-world.py Normal file
View File

@ -0,0 +1 @@
print("Hello world!")

View File

@ -1,10 +0,0 @@
install:
pip3 install "squigglepy>=0.26"
pip3 install numpy
run:
python3 samples.py
time:
time python3 samples.py

View File

@ -1,18 +0,0 @@
import squigglepy as sq
import numpy as np
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b
dist_0 = 0
dist_1 = 1
dist_some = sq.to(1, 3)
dist_many = sq.to(2, 10)
dists = [dist_0, dist_1, dist_some, dist_many]
weights = [(1 - p_c), p_c/2, p_c/4, p_c/4 ]
result = sq.mixture(dists, weights)
result_samples = sq.sample(result, 1000000)
print(np.mean(result_samples))

View File

@ -1,41 +1,11 @@
# Optimized C
$ make && make time-linux
gcc -O3 samples.c -fopenmp -lm -o out/samples
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples
Time using 1 thread: 20.20ms
Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples
Time using 2 threads: 17.50ms
Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples
Time for 4 threads: 17.00ms
Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples
Time using 8 threads: 8.40ms
Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples
Time using 16 threads: 5.00ms
# C
## normal compilation
0.888458
real 0m0,442s
user 0m0,378s
sys 0m0,064s
## -Ofast
0.888458
real 0m0.292s
user 0m0.266s
sys 0m0.026s
# Squiggle
real 0m1,536s
@ -66,65 +36,3 @@ real 0m7,000s
user 0m6,944s
sys 0m0,052s
## Nim
nim c --verbosity:0 samples.nim && time ./samples --verbosity:0 && echo
0.8860780498240779
real 0m0.234s
user 0m0.214s
sys 0m0.020s
nim c --verbosity:0 -d:release samples.nim && time ./samples --verbosity:0 && echo
0.884035098700204
real 0m0.074s
user 0m0.043s
sys 0m0.031s
nim c --verbosity:0 -d:danger samples.nim && time ./samples --verbosity:0
0.8892827195895541
real 0m0.068s
user 0m0.048s
sys 0m0.020s
— make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of:
Time: 38.90ms
## Squigglepy
— time make run
python3 samples.py
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.68it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:00<00:00, 1298266.78it/s]
0.8854926010232937
real 0m1.602s
user 0m1.582s
sys 0m0.331s
## Lua
— make time-linux-simple
Requires /bin/time, found on GNU/Linux systems
/bin/time -f "Time: %es" lua samples.lua && echo
0.88932198462254
Time: 0.28s
— make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of: lua samples.lua
Time: 275.80ms
— make time-linux
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time of: luajit samples.lua
Time: 68.60ms

View File

@ -1,49 +0,0 @@
# Interface:
# make
# make build
# make run
# make time-linux
# make install
# make format
# make time-linux-simple
# make profile-linux
# Main file
SRC=samples.lang
OUTPUT=out/samples
# Compiler
COMPILER=lang-compiler
FLAGS=
## Formatter
FORMATTER=lang-formatter --options
## make build
build: $(SRC)
$(COMPILER) $(FLAGS) -o $(OUTPUT)
install:
lang-package-manager install dependencies
format: $(SRC)
$(FORMATTER) $(SRC)
run: $(SRC) $(OUTPUT)
./$(OUTPUT)
time-linux:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
@echo "Running 100x and taking avg time of: $(OUTPUT)"
@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo
time-linux-simple:
@echo "Requires /bin/time, found on GNU/Linux systems" && echo
/bin/time -f "Time: %es" ./$(OUTPUT) && echo
profile-linux:
@echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
@echo "Must be run as sudo"
sudo perf record $(OUTPUT)
sudo perf report
rm perf.data

View File

@ -1,5 +0,0 @@
const std = @import("std");
pub fn main() void {
std.debug.print("Hello, {s}!\n", .{"World"});
}

View File

@ -1 +0,0 @@
See <https://ziglang.org/learn/getting-started/#tagged-release-or-nightly-build>

View File

@ -1 +0,0 @@
https://ziglearn.org/chapter-1/