Compare commits
No commits in common. "3b29ad7e4534fa397136204d41c2823bd9f96df4" and "b624ab46d790ba3833186e4878959dedd252a2cb" have entirely different histories.
3b29ad7e45 ... b624ab46d7
@@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run

# Compiler
CC=gcc
# CC=tcc # <= faster compilation

# Main file
SRC=samples.c
OUTPUT=samples

## Dependencies
DEPS='gsl'

## Flags
INCS=`pkg-config --cflags ${DEPS}`
LIBS=`pkg-config --libs ${DEPS}`
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
FAST=-Ofast

## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)

## make build
build: $(SRC)
	$(CC) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS)

fast: $(SRC)
	$(CC) $(FAST) $(DEBUG) $(INCS) $(PLUGS) $(SRC) -o samples $(LIBS)

format: $(SRC)
	$(FORMATTER) $(SRC)

run: $(SRC) $(OUTPUT)
	echo "Increasing stack size limit, because we are dealing with 1M samples"
	# ulimit: increase stack size limit
	# -Ss: the soft limit. If you set the hard limit, you then can't raise it
	# 256000: around 250Mbs, if I'm reading it correctly.
	# Then run the program
	ulimit -Ss 256000 && ./$(OUTPUT)

# Old:
# Link libraries, for good measure
# LD_LIBRARY_PATH=/usr/local/lib
# export LD_LIBRARY_PATH
Binary file not shown.
@@ -1,157 +0,0 @@
#include <gsl/gsl_randist.h>
#include <gsl/gsl_rng.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 1000000
/*
 * For very high values of N, you will want to increase the maximum stack size, otherwise you will suffer a segmentation fault
 * In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs
 * And confirm it with $ ulimit -a
 */

/* Helpers */
void print(double* ys)
{
    for (int i = 0; i < N; i++) {
        printf("%f\n", ys[i]);
    }
    printf("\n");
}

void fill(double* ys, float f)
{
    for (int i = 0; i < N; i++) {
        ys[i] = f;
    }
}

double sum(double* ps, int n)
{
    double result = 0;
    for (int i = 0; i < n; i++) {
        result += ps[i];
    }
    return (result);
}

void cumsum(double* ps, double* rs, int n)
{
    double counter = 0;
    for (int i = 0; i < n; i++) {
        counter += ps[i];
        rs[i] = counter;
    }
}

/* Distributions*/
void normal(gsl_rng* r, double* ys, double mean, double std)
{
    for (int i = 0; i < N; i++) {
        ys[i] = mean + gsl_ran_gaussian(r, std);
    }
}

void lognormal(gsl_rng* r, double* ys, double zeta, double sigma)
{
    for (int i = 0; i < N; i++) {
        ys[i] = gsl_ran_lognormal(r, zeta, sigma);
    }
}

void to(gsl_rng* r, double* ys, double low, double high)
{
    double normal95confidencePoint = 1.6448536269514722;
    double log_low = log(low);
    double log_high = log(high);
    double zeta = (log_low + log_high) / 2;
    double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint);
    lognormal(r, ys, zeta, sigma);
}

/* Mixture of distributions */
void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* results)
{
    /* Get cummulative, normalized weights */
    double sum_weights = sum(weights, n);
    double normalized_weights[n];
    for (int i = 0; i < n; i++) {
        normalized_weights[i] = weights[i] / sum_weights;
    }
    double cummulative_weights[n];
    cumsum(normalized_weights, cummulative_weights, n);

    /* Get N samples, drawn from the different distributions in proportion to their weights. */
    for (int i = 0; i < N; i++) {
        double p_1 = gsl_rng_uniform(r);
        double p_2 = gsl_rng_uniform(r);

        int index_found = 0;
        int index_counter = 0;
        while ((index_found == 0) && (index_counter < n)) {
            if (p_1 < cummulative_weights[index_counter]) {
                index_found = 1;
            } else {
                index_counter++;
            }
        }
        if (index_found == 0) {
            printf("\nThis shouldn't be able to happen");
            // gsl_rng_free (r);
            // abort(); // this shouldn't have happened.

        } else {
            int sample_index = (int)floor(p_2 * N);
            results[i] = dists[index_counter][sample_index];
        }
    }
}

/* Main */
int main(void)
{
    // Start clock
    clock_t start, end;
    start = clock();

    /* Initialize GNU Statistical Library (GSL) stuff */
    const gsl_rng_type* T;
    gsl_rng* r;
    // gsl_rng_env_setup();
    T = gsl_rng_default;
    r = gsl_rng_alloc(T);

    /* Toy example */
    /* Declare variables in play */
    double p_a, p_b, p_c;
    double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N];

    /* Initialize variables */
    p_a = 0.8;
    p_b = 0.5;
    p_c = p_a * p_b;

    fill(dist_none, 0);
    fill(dist_one, 1);
    to(r, dist_few, 1, 3);
    to(r, dist_many, 2, 10);

    /* Generate mixture */
    int n = 4;
    double weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
    double* dists[] = { dist_none, dist_one, dist_few, dist_many };

    mixture(r, dists, weights, n, dist_mixture);
    printf("%f\n", sum(dist_mixture, N) / N);

    /* Clean up GSL */
    gsl_rng_free(r);

    // End clock
    end = clock();
    printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000);
    /* Return success*/
    return EXIT_SUCCESS;
}
@@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run

# Compiler
CC=gcc
# CC=tcc # <= faster compilation

# Main file

SRC_ONE_THREAD=./samples-one-thread.c
OUTPUT_ONE_THREAD=out/samples-one-thread

## Dependencies
# Has no dependencies
MATH=-lm

## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
OPENMP=-fopenmp

## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)

## make build
build: $(SRC_ONE_THREAD)
	mkdir -p out
	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC_ONE_THREAD) $(OPENMP) $(MATH) -o $(OUTPUT_ONE_THREAD)

format: $(SRC_ONE_THREAD)
	$(FORMATTER) $(SRC_ONE_THREAD)

run: $(SRC_ONE_THREAD) $(OUTPUT_ONE_THREAD)
	./$(OUTPUT_ONE_THREAD)

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	@echo "Running 100x and taking avg time: $(OUTPUT_ONE_THREAD)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT_ONE_THREAD); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo

time-linux-simple:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	/bin/time -f "Time: %es" ./$(OUTPUT_ONE_THREAD) && echo

debian-install-dependencies:
	sudo apt-get install libomp-dev
Binary file not shown.
@@ -1,183 +0,0 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

const float PI = 3.14159265358979323846;

#define N 1000000

//Array helpers

void array_print(float* array, int length)
{
    for (int i = 0; i < length; i++) {
        printf("item[%d] = %f\n", i, array[i]);
    }
    printf("\n");
}

void array_fill(float* array, int length, float item)
{
    int i;
    {
        for (i = 0; i < length; i++) {
            array[i] = item;
        }
    }
}

float array_sum(float* array, int length)
{
    float output = 0.0;
    for (int i = 0; i < length; i++) {
        output += array[i];
    }
    return output;
}

void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
    array_cumsummed[0] = array_to_sum[0];
    for (int i = 1; i < length; i++) {
        array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
    }
}

float rand_float(float to)
{
    return ((float)rand() / (float)RAND_MAX) * to;
}

float ur_normal()
{
    float u1 = rand_float(1.0);
    float u2 = rand_float(1.0);
    float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
    return z;
}

inline float random_uniform(float from, float to)
{
    return ((float)rand() / (float)RAND_MAX) * (to - from) + from;
}

inline float random_normal(float mean, float sigma)
{
    return (mean + sigma * ur_normal());
}

inline float random_lognormal(float logmean, float logsigma)
{
    return expf(random_normal(logmean, logsigma));
}

inline float random_to(float low, float high)
{
    const float NORMAL95CONFIDENCE = 1.6448536269514722;
    float loglow = logf(low);
    float loghigh = logf(high);
    float logmean = (loglow + loghigh) / 2;
    float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
    return random_lognormal(logmean, logsigma);
}

void array_random_to(float* array, int length, float low, float high)
{
    int i;
#pragma omp private(i)
    {
#pragma omp for
        for (i = 0; i < length; i++) {
            array[i] = random_to(low, high);
        }
    }
}

void mixture(float (*samplers[])(void), float* weights, int n_dists, float* results, int results_length)
{
    float sum_weights = array_sum(weights, n_dists);
    float* normalized_weights = malloc(n_dists * sizeof(float));
    for (int i = 0; i < n_dists; i++) {
        normalized_weights[i] = weights[i] / sum_weights;
    }

    float* cummulative_weights = malloc(n_dists * sizeof(float));
    array_cumsum(normalized_weights, cummulative_weights, n_dists);

    //create var holders
    float p1;
    int sample_index, i, own_length;

    {
        for (int i = 0; i < results_length; i++) {
            p1 = random_uniform(0, 1);
            for (int j = 0; j < n_dists; j++) {
                if (p1 < cummulative_weights[j]) {
                    results[i] = samplers[j]();
                    break;
                }
            }
        }
    }
    free(normalized_weights);
    free(cummulative_weights);
}

float sample_0()
{
    return 0;
}

float sample_1()
{
    return 1;
}

float sample_few()
{
    return random_to(1, 3);
}

float sample_many()
{
    return random_to(2, 10);
}

int main()
{
    //initialize randomness
    srand(1);

    // clock_t start, end;
    // start = clock();

    // Toy example
    // Declare variables in play
    float p_a, p_b, p_c;
    // printf("Max threads: %d\n", n_threads);
    // omp_set_num_threads(n_threads);

    // Initialize variables
    p_a = 0.8;
    p_b = 0.5;
    p_c = p_a * p_b;

    // Generate mixture
    int n_dists = 4;
    float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
    float (*samplers[])(void) = { sample_0, sample_1, sample_few, sample_many };

    float* results = malloc(N * sizeof(float));
    mixture(samplers, weights, n_dists, results, N);
    printf("Sum(dist_mixture, N)/N = %f\n", array_sum(results, N) / N);
    // array_print(dist_mixture[0], N);

    // end = clock();
    // printf("Time (ms): %f\n", ((double)(end - start)) / (CLOCKS_PER_SEC * 1000));
    // ^ Will only measure how long it takes the inner main to run, not the whole program,
    // including e.g., loading the program into memory or smth.
    // Also CLOCKS_PER_SEC in POSIX is a constant equal to 1000000.
    // See: https://stackoverflow.com/questions/10455905/why-is-clocks-per-sec-not-the-actual-number-of-clocks-per-second
    return 0;
}
C/README.md
@@ -1,18 +0,0 @@
# Time to BOTEC in C

This repository contains a few implementations of a simple BOTEC (back-of-the-envelope calculation) in C:

- In the folder C-01-simple/, you can see a simple implementation, which passes large arrays around
- In the folder C-02-better-algorithm-one-thread/ you can see a better implementation, which passes around pointers to functions, making it more efficient
- The top-level samples.c uses the algorithmic improvements in C-02..., and also implements multithreading using OpenMP

## To do

- [ ] Add Windows/Powershell time-measuring commands
- [ ] Add CUDA?
- [x] Added results of perf. `rand_r` seems like a big chunk of it, but I'm hesitant to use lower-quality random numbers
- [x] used xorshift instead
- [-] Use xorshift with a struct instead of a pointer? idk, could be faster for some reason? => Tested, it takes the same time.
- [x] Update repository with correct timing
- [x] Use better profiling approach to capture timing with 1M samples.
- [x] See if the program can be reworked so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used
C/hello-world/build.sh (new executable file)
@@ -0,0 +1,3 @@
#!/bin/bash

gcc -std=c99 -Wall -lm hello-world.c -o hello-world
Binary file not shown.
C/hello-world/hello-world.c (new executable file)
@@ -0,0 +1,7 @@
#include <stdlib.h>
#include <stdio.h>

int main(void){
    printf("Hello world!\n");
    return EXIT_SUCCESS;
}
C/makefile
@@ -1,106 +0,0 @@
# Interface:
# make
# make build
# make format
# make run

# Compiler
CC=gcc
# CC=tcc # <= faster compilation

# Main file
SRC=samples.c
OUTPUT=out/samples

SRC_ONE_THREAD=./samples-one-thread.c
OUTPUT_ONE_THREAD=out/samples-one-thread

## Dependencies
# Has no dependencies
MATH=-lm

## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
OPENMP=-fopenmp

## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)

## make build
build: $(SRC)
	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)

static:
	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)

format: $(SRC)
	$(FORMATTER) $(SRC)

run: $(SRC) $(OUTPUT)
	OMP_NUM_THREADS=1 ./$(OUTPUT) && echo

multi:
	OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
	OMP_NUM_THREADS=2 ./$(OUTPUT) && echo
	OMP_NUM_THREADS=4 ./$(OUTPUT) && echo
	OMP_NUM_THREADS=8 ./$(OUTPUT) && echo
	OMP_NUM_THREADS=16 ./$(OUTPUT) && echo

## Timing

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo

	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=1 $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=1 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo

	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=2 $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=2 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 2 threads: |" | sed 's|$$|ms|' && echo

	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=4 $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=4 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time for 4 threads: |" | sed 's|$$|ms|' && echo

	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=8 $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=8 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 8 threads: |" | sed 's|$$|ms|' && echo

	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo

time-linux-fastest:
	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo

time-linux-simple:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
	OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
	OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
	OMP_NUM_THREADS=8 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
	OMP_NUM_THREADS=16 /bin/time -f "Time: %es" ./$(OUTPUT) && echo

## Profiling

profile-linux:
	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
	echo "Must be run as sudo"
	$(CC) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
	# ./$(OUTPUT)
	# gprof:
	# gprof $(OUTPUT) gmon.out > analysis.txt
	# rm gmon.out
	# vim analysis.txt
	# rm analysis.txt
	# perf:
	OMP_NUM_THREADS=16 sudo perf record $(OUTPUT)
	sudo perf report
	rm perf.data

## Install
debian-install-dependencies:
	sudo apt-get install libomp-dev
C/out/samples (binary file not shown)
Binary file not shown.
Binary file not shown.
C/out/samples.txt (1495 lines; file diff suppressed because it is too large)
C/perf.txt
@@ -1,26 +0,0 @@
Samples: 884 of event 'cycles', Event count (approx.): 551167949
Overhead  Command  Shared Object      Symbol
  35.32%  samples  samples            [.] xorshift32
  14.09%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2ea
  12.04%  samples  libgomp.so.1.0.0   [.] 0x000000000001d132
  11.53%  samples  samples            [.] mixture._omp_fn.0
   4.55%  samples  libm-2.31.so       [.] __sin_fma
   4.24%  samples  samples            [.] rand_0_to_1
   3.77%  samples  samples            [.] random_to
   3.03%  samples  libm-2.31.so       [.] __logf_fma
   1.61%  samples  libm-2.31.so       [.] __expf_fma
   1.54%  samples  samples            [.] split_array_sum._omp_fn.0
   1.38%  samples  samples            [.] random_uniform
   0.94%  samples  samples            [.] ur_normal
   0.91%  samples  libm-2.31.so       [.] __ieee754_log_fma
   0.74%  samples  libgomp.so.1.0.0   [.] 0x000000000001d13d
   0.52%  samples  samples            [.] sample_0
   0.41%  samples  libm-2.31.so       [.] __sqrtf_finite@GLIBC_2.15
   0.38%  samples  samples            [.] sample_1
   0.36%  samples  libgomp.so.1.0.0   [.] 0x000000000001d139
   0.36%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2f5
   0.22%  samples  [kernel.kallsyms]  [k] native_queued_spin_lock_slowpath
   0.18%  samples  [kernel.kallsyms]  [k] _raw_spin_lock_irq
   0.18%  samples  samples            [.] random_lognormal
   0.17%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2f1
C/samples.c
@@ -1,251 +0,0 @@
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

const float PI = 3.14159265358979323846;

#define N 1000000

//Array helpers

void array_print(float* array, int length)
{
    for (int i = 0; i < length; i++) {
        printf("item[%d] = %f\n", i, array[i]);
    }
    printf("\n");
}

float array_sum(float* array, int length)
{
    float output = 0.0;
    for (int i = 0; i < length; i++) {
        output += array[i];
    }
    return output;
}

void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
    array_cumsummed[0] = array_to_sum[0];
    for (int i = 1; i < length; i++) {
        array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
    }
}

// Split array helpers
int split_array_get_length(int index, int total_length, int n_threads)
{
    return (total_length % n_threads > index ? total_length / n_threads + 1 : total_length / n_threads);
}

void split_array_allocate(float** meta_array, int length, int divide_into)
{
    int split_array_length;

    for (int i = 0; i < divide_into; i++) {
        split_array_length = split_array_get_length(i, length, divide_into);
        meta_array[i] = malloc(split_array_length * sizeof(float));
    }
}

void split_array_free(float** meta_array, int divided_into)
{
    for (int i = 0; i < divided_into; i++) {
        free(meta_array[i]);
    }
    free(meta_array);
}

float split_array_sum(float** meta_array, int length, int divided_into)
{
    int i;
    float output = 0;

#pragma omp parallel for reduction(+:output)
    for (int i = 0; i < divided_into; i++) {
        float own_partial_sum = 0;
        int split_array_length = split_array_get_length(i, length, divided_into);
        for (int j = 0; j < split_array_length; j++) {
            own_partial_sum += meta_array[i][j];
        }
        output += own_partial_sum;
    }
    return output;
}

// Pseudo Random number generator

uint32_t xorshift32(uint32_t* seed)
{
    // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
    // See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
    // https://en.wikipedia.org/wiki/Xorshift
    // Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>

    uint32_t x = *seed;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return *seed = x;
}

// Distribution & sampling functions

float rand_0_to_1(uint32_t* seed){
    return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
    /*
    uint32_t x = *seed;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return ((float)(*seed = x))/((float) UINT32_MAX);
    */
    // previously:
    // ((float)rand_r(seed) / (float)RAND_MAX)
    // and before that: rand, but it wasn't thread-safe.
    // See: <https://stackoverflow.com/questions/43151361/how-to-create-thread-safe-random-number-generator-in-c-using-rand-r> for why to use rand_r:
    // rand() is not thread-safe, as it relies on (shared) hidden seed.
}

float rand_float(float max, uint32_t* seed)
{
    return rand_0_to_1(seed) * max;
}

float ur_normal(uint32_t* seed)
{
    float u1 = rand_0_to_1(seed);
    float u2 = rand_0_to_1(seed);
    float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
    return z;
}

float random_uniform(float from, float to, uint32_t* seed)
{
    return rand_0_to_1(seed) * (to - from) + from;
}

float random_normal(float mean, float sigma, uint32_t* seed)
{
    return (mean + sigma * ur_normal(seed));
}

float random_lognormal(float logmean, float logsigma, uint32_t* seed)
{
    return expf(random_normal(logmean, logsigma, seed));
}

float random_to(float low, float high, uint32_t* seed)
{
    const float NORMAL95CONFIDENCE = 1.6448536269514722;
    float loglow = logf(low);
    float loghigh = logf(high);
    float logmean = (loglow + loghigh) / 2;
    float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
    return random_lognormal(logmean, logsigma, seed);
}

// Mixture function
void mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, float** results, int n_threads)
{
    // You can see a simpler version of this function in the git history
    // or in C-02-better-algorithm-one-thread/
    float sum_weights = array_sum(weights, n_dists);
    float* cumsummed_normalized_weights = malloc(n_dists * sizeof(float));
    cumsummed_normalized_weights[0] = weights[0]/sum_weights;
    for (int i = 1; i < n_dists; i++) {
        cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i]/sum_weights;
    }

    //create var holders
    float p1;
    int sample_index, i, split_array_length;

    // uint32_t* seeds[n_threads];
    uint32_t** seeds = malloc(n_threads * sizeof(uint32_t*));
    for (uint32_t i = 0; i < n_threads; i++) {
        seeds[i] = malloc(sizeof(uint32_t));
        *seeds[i] = i + 1; // xorshift can't start with 0
    }

#pragma omp parallel private(i, p1, sample_index, split_array_length)
    {
#pragma omp for
        for (i = 0; i < n_threads; i++) {
            split_array_length = split_array_get_length(i, N, n_threads);
            for (int j = 0; j < split_array_length; j++) {
                p1 = random_uniform(0, 1, seeds[i]);
                for (int k = 0; k < n_dists; k++) {
                    if (p1 < cumsummed_normalized_weights[k]) {
                        results[i][j] = samplers[k](seeds[i]);
                        break;
                    }
                }
            }
        }
    }
    // free(normalized_weights);
    // free(cummulative_weights);
    free(cumsummed_normalized_weights);
    for (uint32_t i = 0; i < n_threads; i++) {
        free(seeds[i]);
    }
    free(seeds);
}

// Functions used for the BOTEC.
// Their type has to be the same, as we will be passing them around.

float sample_0(uint32_t* seed)
{
    return 0;
}

float sample_1(uint32_t* seed)
{
    return 1;
}

float sample_few(uint32_t* seed)
{
    return random_to(1, 3, seed);
}

float sample_many(uint32_t* seed)
{
    return random_to(2, 10, seed);
}

int main()
{

    // Toy example
    // Declare variables in play
    float p_a, p_b, p_c;
    int n_threads = omp_get_max_threads();
    // printf("Max threads: %d\n", n_threads);
    // omp_set_num_threads(n_threads);
    float** dist_mixture = malloc(n_threads * sizeof(float*));
    split_array_allocate(dist_mixture, N, n_threads);

    // Initialize variables
    p_a = 0.8;
    p_b = 0.5;
    p_c = p_a * p_b;

    // Generate mixture
    int n_dists = 4;
    float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
    float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };

    mixture(samplers, weights, n_dists, dist_mixture, n_threads);
    printf("Sum(dist_mixture, N)/N = %f\n", split_array_sum(dist_mixture, N, n_threads) / N);
    // array_print(dist_mixture[0], N);
    split_array_free(dist_mixture, n_threads);

    return 0;
}
C/samples/build-and-run.sh (new executable file)
@@ -0,0 +1,15 @@
#!/bin/bash
root="$1"
gcc -std=c99 -Wall -lm "$root".c -o "$root" -I/usr/local/include -L/usr/local/lib -lgsl -lgslcblas -lm -g

# Link libraries, for good measure
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH

# Increase stack size limit
ulimit -Ss 256000
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.

# run
./samples
C/samples/build-gsl.sh (new executable file)
@@ -0,0 +1,13 @@
#!/bin/bash
root="$1"
gcc -std=c99 -Wall -lm "$root".c -o "$root" -I/usr/local/include -L/usr/local/lib -lgsl -lgslcblas -lm -g

# Link libraries, for good measure
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH

# Increase stack size limit
ulimit -Ss 256000
# -Ss: the soft limit. If you set the hard limit, you then can't raise it
# 256000: around 250Mbs, if I'm reading it correctly.
C/samples/samples (new executable file; binary file not shown)
C/samples/samples.c (new executable file)
@@ -0,0 +1,140 @@
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>

#define N 1000000
/*
 * For very high values of N, you will want to increase the maximum stack size, otherwise you will suffer a segmentation fault
 * In Ubuntu/bash you can do this with $ ulimit -Ss 256000 ## ~256Mbs
 * And confirm it with $ ulimit -a
 */

/* Helpers */
void print(double *ys){
    for(int i=0; i<N; i++){
        printf("%f\n", ys[i]);
    }
    printf("\n");
}

void fill(double *ys, float f){
    for(int i=0; i<N; i++){
        ys[i] = f;
    }
}

double sum(double *ps, int n){
    double result = 0;
    for(int i=0; i<n; i++){
        result += ps[i];
    }
    return(result);
}

void cumsum(double *ps, double *rs, int n){
    double counter = 0;
    for(int i=0; i<n; i++){
        counter += ps[i];
        rs[i] = counter;
    }
}

/* Distributions*/
void normal(gsl_rng * r, double *ys, double mean, double std){
    for(int i=0; i<N; i++){
        ys[i] = mean + gsl_ran_gaussian(r, std);
    }
}

void lognormal(gsl_rng * r, double *ys, double zeta, double sigma){
    for(int i=0; i<N; i++){
        ys[i] = gsl_ran_lognormal(r, zeta, sigma);
    }
}

void to(gsl_rng * r, double *ys, double low, double high){
    double normal95confidencePoint = 1.6448536269514722;
    double log_low = log(low);
    double log_high = log(high);
    double zeta = (log_low + log_high)/2;
    double sigma = (log_high - log_low) / (2.0 * normal95confidencePoint);
    lognormal(r, ys, zeta, sigma);
}

/* Mixture of distributions */
void mixture(gsl_rng * r, double *dists[], double *weights, int n, double *results){
    /* Get cummulative, normalized weights */
    double sum_weights = sum(weights, n);
    double normalized_weights[n];
    for(int i=0; i<n; i++){
        normalized_weights[i] = weights[i]/sum_weights;
    }
    double cummulative_weights[n];
    cumsum(normalized_weights, cummulative_weights, n);

    /* Get N samples, drawn from the different distributions in proportion to their weights. */
    for(int i=0; i<N; i++){
        double p_1 = gsl_rng_uniform(r);
        double p_2 = gsl_rng_uniform(r);

        int index_found = 0;
        int index_counter = 0;
        while ((index_found == 0) && (index_counter < n) ){
            if(p_1 < cummulative_weights[index_counter]){
                index_found = 1;
            }else{
                index_counter++;
            }
        }
        if(index_found == 0) {
            printf("\nThis shouldn't be able to happen");
            // gsl_rng_free (r);
            // abort(); // this shouldn't have happened.

        }else{
            int sample_index = (int) floor(p_2 * N);
            results[i] = dists[index_counter][sample_index];
        }
    }
}

/* Main */
int main(void){
    /* Initialize GNU Statistical Library (GSL) stuff */
    const gsl_rng_type * T;
    gsl_rng * r;
    // gsl_rng_env_setup();
    T = gsl_rng_default;
    r = gsl_rng_alloc (T);

    /* Toy example */
    /* Declare variables in play */
    double p_a, p_b, p_c;
    double dist_none[N], dist_one[N], dist_few[N], dist_many[N], dist_mixture[N];

    /* Initialize variables */
    p_a = 0.8;
    p_b = 0.5;
    p_c = p_a * p_b;

    fill(dist_none, 0);
    fill(dist_one, 1);
    to(r, dist_few, 1, 3);
    to(r, dist_many, 2, 10);

    /* Generate mixture */
    int n = 4;
    double weights[] = { 1 - p_c, p_c/2, p_c/4, p_c/4 };
    double *dists[] = { dist_none, dist_one, dist_few, dist_many };

    mixture(r, dists, weights, n, dist_mixture);
    printf("%f\n", sum(dist_mixture, N)/N);

    /* Clean up GSL */
    gsl_rng_free (r);

    /* Return success*/
    return EXIT_SUCCESS;
}
Binary file not shown.
@@ -1,50 +0,0 @@
#include "../../squiggle.h"
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

// Estimate functions
float sample_0(uint32_t* seed)
{
    return 0;
}

float sample_1(uint32_t* seed)
{
    return 1;
}

float sample_few(uint32_t* seed)
{
    return random_to(1, 3, seed);
}

float sample_many(uint32_t* seed)
{
    return random_to(2, 10, seed);
}

int main(){
    // set randomness seed
    uint32_t* seed = malloc(sizeof(uint32_t));
    *seed = 1000; // xorshift can't start with 0

    float p_a = 0.8;
    float p_b = 0.5;
    float p_c = p_a * p_b;

    int n_dists = 4;
    float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
    float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };

    float result_one = mixture(samplers, weights, n_dists, seed);
    printf("result_one: %f\n", result_one);
}

/*
Aggregation mechanisms:
- Quantiles (requires a sort)
- Sum
- Average
- Std
*/
@@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run

# Compiler
CC=gcc
# CC=tcc # <= faster compilation

# Main file
SRC=example.c
OUTPUT=example

## Dependencies
MATH=-lm

## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-Ofast
# OPENMP=-fopenmp

## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)

## make build
build: $(SRC)
	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)

format: $(SRC)
	$(FORMATTER) $(SRC)

run: $(SRC) $(OUTPUT)
	OMP_NUM_THREADS=1 ./$(OUTPUT) && echo

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo

	@echo "Running 100x and taking avg time $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo

## Profiling

profile-linux:
	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
	echo "Must be run as sudo"
	$(CC) $(SRC) $(MATH) -o $(OUTPUT)
	sudo perf record $(OUTPUT)
	sudo perf report
	rm perf.data
Binary file not shown.
@@ -1,59 +0,0 @@
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include "../../squiggle.h"

// Estimate functions
float sample_0(uint32_t* seed)
{
    return 0;
}

float sample_1(uint32_t* seed)
{
    return 1;
}

float sample_few(uint32_t* seed)
{
    return random_to(1, 3, seed);
}

float sample_many(uint32_t* seed)
{
    return random_to(2, 10, seed);
}

int main(){
    // set randomness seed
    uint32_t* seed = malloc(sizeof(uint32_t));
    *seed = 1000; // xorshift can't start with 0

    float p_a = 0.8;
    float p_b = 0.5;
    float p_c = p_a * p_b;

    int n_dists = 4;
    float weights[] = { 1 - p_c, p_c / 2, p_c / 4, p_c / 4 };
    float (*samplers[])(uint32_t*) = { sample_0, sample_1, sample_few, sample_many };

    int n_samples = 1000000;
    float* result_many = (float *) malloc(n_samples * sizeof(float));
    for(int i=0; i<n_samples; i++){
        result_many[i] = mixture(samplers, weights, n_dists, seed);
    }

    printf("result_many: [");
    for(int i=0; i<100; i++){
        printf("%.2f, ", result_many[i]);
    }
    printf("]\n");
}

/*
Aggregation mechanisms:
- Quantiles (requires a sort)
- Sum
- Average
- Std
*/
@@ -1,53 +0,0 @@
# Interface:
# make
# make build
# make format
# make run

# Compiler
CC=gcc
# CC=tcc # <= faster compilation

# Main file
SRC=example.c
OUTPUT=example

## Dependencies
MATH=-lm

## Flags
DEBUG= #'-g'
STANDARD=-std=c99
WARNINGS=-Wall
OPTIMIZED=-O3 #-Ofast
# OPENMP=-fopenmp

## Formatter
STYLE_BLUEPRINT=webkit
FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)

## make build
build: $(SRC)
	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)

format: $(SRC)
	$(FORMATTER) $(SRC)

run: $(SRC) $(OUTPUT)
	OMP_NUM_THREADS=1 ./$(OUTPUT) && echo

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo

	@echo "Running 100x and taking avg time $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo

## Profiling

profile-linux:
	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
	echo "Must be run as sudo"
	$(CC) $(SRC) $(MATH) -o $(OUTPUT)
	sudo perf record $(OUTPUT)
	sudo perf report
	rm perf.data
@@ -1,109 +0,0 @@
#include <math.h>
#include <stdint.h>
#include <stdlib.h>

const float PI = 3.14159265358979323846;

// Pseudo Random number generator

uint32_t xorshift32(uint32_t* seed)
{
    // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
    // See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
    // https://en.wikipedia.org/wiki/Xorshift
    // Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>

    uint32_t x = *seed;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return *seed = x;
}

// Distribution & sampling functions

float rand_0_to_1(uint32_t* seed){
    return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
}

float rand_float(float max, uint32_t* seed)
{
    return rand_0_to_1(seed) * max;
}

float ur_normal(uint32_t* seed)
{
    float u1 = rand_0_to_1(seed);
    float u2 = rand_0_to_1(seed);
    float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
    return z;
}

float random_uniform(float from, float to, uint32_t* seed)
{
    return rand_0_to_1(seed) * (to - from) + from;
}

float random_normal(float mean, float sigma, uint32_t* seed)
{
    return (mean + sigma * ur_normal(seed));
}

float random_lognormal(float logmean, float logsigma, uint32_t* seed)
{
    return expf(random_normal(logmean, logsigma, seed));
}

float random_to(float low, float high, uint32_t* seed)
{
    const float NORMAL95CONFIDENCE = 1.6448536269514722;
    float loglow = logf(low);
    float loghigh = logf(high);
    float logmean = (loglow + loghigh) / 2;
    float logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
    return random_lognormal(logmean, logsigma, seed);
}

// Array helpers
float array_sum(float* array, int length)
{
    float output = 0.0;
    for (int i = 0; i < length; i++) {
        output += array[i];
    }
    return output;
}

void array_cumsum(float* array_to_sum, float* array_cumsummed, int length)
{
    array_cumsummed[0] = array_to_sum[0];
    for (int i = 1; i < length; i++) {
        array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i];
    }
}

// Mixture function
float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint32_t* seed)
{
    // You can see a simpler version of this function in the git history
    // or in C-02-better-algorithm-one-thread/
    float sum_weights = array_sum(weights, n_dists);
    float* cumsummed_normalized_weights = (float*) malloc(n_dists * sizeof(float));
    cumsummed_normalized_weights[0] = weights[0]/sum_weights;
    for (int i = 1; i < n_dists; i++) {
        cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i]/sum_weights;
    }

    float p = random_uniform(0, 1, seed);
    float result;
    for (int k = 0; k < n_dists; k++) {
        if (p < cumsummed_normalized_weights[k]) {
            result = samplers[k](seed);
            break;
        }
    }

    free(cumsummed_normalized_weights);
    return result;
}
@@ -1,9 +0,0 @@

- [ ] Add example for only one sample
- [ ] Add example for many samples
- [ ] Use gcc extension to define functions nested inside main.
- [ ] Use OpenMP for acceleration
- [ ] Chain various mixture functions
- [ ] Have some more complicated & realistic example
- [ ] Add summarization functions, like mean, std, 90% ci (or all c.i.?)
- [ ] Add beta distribution
@@ -1,15 +0,0 @@
build:
	gcc xorshift-pointer.c -o out/xorshift-pointer
	gcc xorshift-struct.c -o out/xorshift-struct

run-pointer:
	./out/xorshift-pointer

run-struct:
	./out/xorshift-struct

time-pointer:
	/bin/time -f "Time: %es" ./out/xorshift-pointer && echo

time-struct:
	/bin/time -f "Time: %es" ./out/xorshift-struct && echo
@@ -1 +0,0 @@
Using a pointer or a struct turns out to be pretty equivalent in terms of speed. I don't think there are gains to be found here, but I find the pointer implementation clearer.
Binary file not shown.
@@ -1,41 +0,0 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

uint32_t xorshift32(uint32_t* state)
{
    /* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
    uint32_t x = *state;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return *state = x;
}

float rand_xorshift32(uint32_t* state){
    return (float) xorshift32(state) / (float) UINT32_MAX;
}

int main(){
    uint32_t** states = malloc(4 * sizeof(uint32_t*));
    for(int i=0; i<4;i++){
        states[i] = malloc(sizeof(uint32_t));
        *states[i] = (uint32_t) i + 1;
    }
    for(int i=0; i<1000000000;i++){
        uint32_t x = xorshift32(states[0]);
        float y = rand_xorshift32(states[1]);
        // printf("%u\n", x);
        // printf("%f\n", y);
    }

    for(int i=0; i<4;i++){
        free(states[i]);
    }
    free(states);

    return 0;
}

// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
// https://en.wikipedia.org/wiki/Xorshift
@@ -1,46 +0,0 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct xorshift32_state {
    uint32_t a;
};

uint32_t xorshift32(struct xorshift32_state *state)
{
    /* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
    uint32_t x = state->a;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return state->a = x;
}

float rand_xorshift32(struct xorshift32_state *state){
    return (float) xorshift32(state) / (float) UINT32_MAX;
}

int main(){
    struct xorshift32_state** states = malloc(sizeof(struct xorshift32_state*) * 4);
    for(int i=0; i<4;i++){
        states[i] = malloc(sizeof(struct xorshift32_state));
        states[i]->a = (uint32_t) i + 1;
    }

    for(int i=0; i<1000000000; i++){
        uint32_t x = xorshift32(states[0]);
        float y = rand_xorshift32(states[1]);
        // printf("%u\n", x);
        // printf("%f\n", y);
    }

    for(int i=0; i<4;i++){
        free(states[i]);
    }
    free(states);

    return 0;
}
R/hello-world.R (new file)
@@ -0,0 +1 @@
print("Hello world")
@@ -47,4 +47,7 @@ weights = c((1 - p_c), p_c/2, p_c/4, p_c/4)
# print(weights)
result = mixture(dists, weights)
mean_result = mean(result)
print(mean_result)
print(mean_result)
README.md
@@ -2,161 +2,49 @@

## About

This repository contains examples of very simple code to manipulate samples in various programming languages, and tallies their speed and the complexity of their code. It implements this platonic estimate:
This repository contains examples of very simple code to manipulate samples in various programming languages. As of now, it may be useful for checking the validity of simple estimations.

```
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b

dists = [0, 1, 1 to 3, 2 to 10]
weights = [(1 - p_c), p_c/2, p_c/4, p_c/4]

result = mixture(dists, weights) # should be 1M samples
mean(result)
```
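As a sanity check: p_c = 0.4, so the weights are [0.6, 0.2, 0.1, 0.1]. Since `low to high` is implemented as a lognormal with a 90% confidence interval of [low, high], the component means are exp(ζ + σ²/2): roughly 1.83 for `1 to 3` and roughly 5.04 for `2 to 10`. The mixture mean should therefore converge to approximately 0.6·0 + 0.2·1 + 0.1·1.83 + 0.1·5.04 ≈ 0.89.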

I don't particularly care about the speed of this particular example, but rather think that the speed in this simple example would be indicative of the speed when considering 100x or 1000x more complicated models. As of now, it may also be useful for checking the validity of simple estimations.

The title of this repository is a pun on two meanings of "time to": "how much time does it take to do x", and "let's do x". "BOTEC" stands for "back of the envelope calculation".
The title of this repository is a pun on two meanings of "time to": "how much time does it take to do x", and "let's do x".

## Current languages

- [x] C
- [x] Javascript (NodeJS)
- [x] Squiggle
- [x] R
- [x] Python
- [x] Nim
- [x] R
- [x] Squiggle
- [x] Javascript (NodeJS)
- [x] C

## Comparison table
## Performance table

| Language                  | Time    | Lines of code |
|---------------------------|---------|---------------|
| C (optimized, 16 threads) | 5ms     | 249           |
| Nim                       | 38ms    | 84            |
| Lua (LuaJIT)              | 68ms    | 82            |
| Lua                       | 278ms   | 82            |
| C (naïve implementation)  | 292ms   | 149           |
| Javascript (NodeJS)       | 732ms   | 69            |
| Squiggle                  | 1.536s  | 14*           |
| SquigglePy                | 1.602s  | 18*           |
| R                         | 7.000s  | 49            |
| Python (CPython)          | 16.641s | 56            |

With the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples:

Time measurements taken with the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples. But different implementations use different algorithms and, occasionally, different time-measuring methodologies (for the C, Nim and Lua implementations, I run the program 100 times and take the mean). Their speed was also measured under different loads on my machine. So I think that these time estimates are accurate within maybe ~2x or so.

| Language         | Time      |
|------------------|-----------|
| C                | 0m0,442s  |
| Node             | 0m0,732s  |
| Squiggle         | 0m1,536s  |
| R                | 0m7,000s  |
| Python (CPython) | 0m16,641s |

Note that the number of lines is much lower for Squiggle and SquigglePy because I'm just counting the lines needed to get Squiggle and SquigglePy to output a model, not the lines inside them.

## Notes

### Nim

I was really happy trying [Nim](https://nim-lang.org/), and as a result the Nim code is a bit more optimized and engineered:

1. It is using the fastest "danger" compilation mode.
2. It has some optimizations: I don't compute 1M samples for each dist, but instead pass functions around and compute the 1M samples at the end
3. I define the normal function from scratch, using the [Box–Muller transform](https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form) (see the sketch after this list).
4. I also have a version in which I define the logarithm and sine functions themselves in nim to feed into the Box-Muller method. But it is much slower.
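For reference, a sketch of the Box–Muller basic form (this is also what the `ur_normal` functions in the C sources implement): two independent uniform draws are turned into one standard normal sample.

```
u1, u2 ~ Uniform(0, 1)
z = sqrt(-2 * ln(u1)) * sin(2 * pi * u2)  # z ~ Normal(0, 1)
```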
Without 1. and 2., the nim code takes 0m0.183s instead. But I don't think that these are unfair advantages: I liked trying out nim and therefore put more love into the code, and this seems like it could be a recurring factor.

Ultimately, these optimizations were also incorporated into the C code as well.

Nim also has multithreading support, but I haven't bothered looking into it yet.

### C

The optimizations which make the final C code significantly faster than the naïve implementation are:

- To pass around pointers to functions, instead of large arrays. This is the same as in the nim implementation, but imho leads to more complex code (see the sketch after this list)
- To use the Box-Muller transform instead of using libraries, like in nim.
- To use multithreading support
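A minimal sketch of that function-passing pattern, simplified from the `mixture` function in the C sources (the `mixture_once` name is hypothetical; `random_uniform` and `random_to` are as defined there):

```c
// All samplers share one signature, so they can be stored in an array
// and passed around as data.
float sample_few(uint32_t* seed) { return random_to(1, 3, seed); }

// Draw a single mixture sample: pick a component by its cumulative,
// normalized weight, then call only that component's sampler.
float mixture_once(float (*samplers[])(uint32_t*), float* cum_weights,
    int n_dists, uint32_t* seed)
{
    float p = random_uniform(0, 1, seed);
    for (int k = 0; k < n_dists; k++) {
        if (p < cum_weights[k])
            return samplers[k](seed);
    }
    return samplers[n_dists - 1](seed); // guard against float rounding
}
```

The eager alternative (as in C-01-simple/) fills one 1M-element array per distribution and then indexes into them, which costs far more memory traffic.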
The C code uses the `-Ofast` or `-O3` compilation flags. Initially, without using those flags and without the algorithmic improvements, the code took ~0.4 seconds to run. So I was surprised that the Node and Squiggle code were comparable to the C code. It ended up being the case that the C code could be pushed to be ~100x faster, though :)
|
||||
|
||||
In fact, the C code ended up being so fast that I had to measure its time by running the code 100 times in a row and dividing that amount by 100, rather than by just running it once, because running it once was too fast for /bin/time. More sophisticated profiling tools exist that could e.g., account for how iddle a machine is when running the code, but I didn't think that was worth it at this point.
|
||||
|
||||
And still, there are some missing optimizations, like tweaking the code to take into account cache misses. I'm not exactly sure how that would go, though.
|
||||
|
||||
Once the code was at 6.6ms, there was a 0.6ms gain possible by using OMP better, and a 1ms gain by using a xorshift algorithm instead of rand_r from stdlib. I think there might be faster gains to be made by using OpenCL or CUDA, but I haven't gotten into how to do that instead.
|
||||
|
||||
Although the above paragraphs were written in the first person, the C code was written together with Jorge Sierra, who translated the algorithmic improvements from nim to it and added the initial multithreading support.
|
||||
|
||||
### NodeJS and Squiggle
|
||||
|
||||
Using [bun](https://bun.sh/) instead of node is actually a bit slower. Also, both the NodeJS and the Squiggle code use [stdlib](https://stdlib.io/) in their innards, which has a bunch of interleaved functions that make the code slower. It's possible that not using that external library could make the code faster, but at the same time, the js approach does seem to be to use external libraries whenever possible.
|
||||
|
||||
I am not particularly sure that the Squiggle code is actually producing 1M samples, but also have no particular plan to debug this.
|
||||
|
||||
### Python
|
||||
|
||||
For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org).
|
||||
|
||||
### R
|
||||
|
||||
R has a warm place in my heart from back in the day, and it has predefined functions to do everything. It was particularly fast to write for me, though not particularly fast to run :) However, I do recall that R does have some multithreading support; it wasn't used.
|
||||
|
||||
### Lua
|
||||
|
||||
I was also really pleased with Lua. I liked the simplicity. And coming from javascript, I appreciated that the program did not fail silently, but rather gave me useful errors, like:
|
||||
|
||||
```
|
||||
lua samples.lua
|
||||
lua: samples.lua:42: table index is nil
|
||||
stack traceback:
|
||||
samples.lua:42: in function 'mixture'
|
||||
samples.lua:49: in main chunk
|
||||
[C]: in ?
|
||||
make: *** [makefile:14: run] Error 1
|
||||
```
|
||||
|
||||
I also appreciated the speedup when using the LuaJIT interpreter. You can install it with commands similar to
|
||||
|
||||
```
|
||||
git clone https://luajit.org/git/luajit.git
|
||||
make
|
||||
sudo make install
|
||||
sudo ln -sf luajit-2.1.0-beta3 /usr/local/bin/luajit
|
||||
```
|
||||
|
||||
Overall I'm thinking that a combination of lua at least for scripting and ¿nim/C/tbd? for more serious programs could be quite powerful.
|
||||
|
||||
### Overall thoughts
|
||||
|
||||
Overall I don't think that this is a fair comparison of the languages intrinsically, because I'm just differentially good at them, because I've chosen to put more effort in ones than in others. But it is still useful to me personally, and perhaps mildly informative to others.
|
||||
|
||||
I was very surprised that Node/Squiggle code was almost as fast as the raw C code. For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org)
|
||||
|
||||
## Languages I may add later
|
||||
|
||||
- [ ] Julia (TuringML)
- [ ] Rust
- [ ] Lisp
- [ ] Go
- [ ] Zig
- [ ] Forth
- [ ] sh/bash, lol?
- [ ] OCaml
- [ ] Haskell
- [ ] OpenCL/CUDA (e.g., as in <https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/>). Seems like it would be overkill, and the code would also be way more complex. But maybe worth trying?
- [-] Stan => As far as I can tell, Stan is designed to generate samples from the posterior distribution given some data, not to create data by drawing samples from an arbitrary distribution.
  - [ ] Maybe still worth reversing the process?
- ... and suggestions welcome

## Languages added so far

- [x] Squiggle
- [x] Javascript (NodeJS)
- [x] Python (CPython)
- [x] R
- [x] C
- [x] Nim
- [x] SquigglePy
- [x] Lua
- Stan

## Roadmap

The future of this project is uncertain. In most worlds, I simply forget about this repository.

To do:

- [ ] Check whether the Squiggle code is producing 1M samples. Still not too sure.
- [ ] Differentiate between initial startup time (e.g., compiling, loading the environment) and runtime; see the sketch after this list. This matters because startup time could be ~constant, so for larger projects only the runtime matters.

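For the compiled languages, one way to do this would be to time the two phases inside the program itself. The sketch below is hypothetical (the setup/sampling boundaries are placeholder comments, not this repository's actual code); interpreter and compiler startup would still have to be measured from the outside, e.g. by timing a hello-world program, which is presumably what the hello-world files further down in this diff are for.

```
#include <stdio.h>
#include <time.h>

// Elapsed milliseconds between two CLOCK_MONOTONIC readings.
static double ms_between(struct timespec a, struct timespec b)
{
    return (b.tv_sec - a.tv_sec) * 1000.0 + (b.tv_nsec - a.tv_nsec) / 1e6;
}

int main(void)
{
    struct timespec t0, t1, t2;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    // ... setup: allocate sample arrays, seed RNGs, spawn threads ...
    clock_gettime(CLOCK_MONOTONIC, &t1);
    // ... runtime proper: draw the 1M samples and take their mean ...
    clock_gettime(CLOCK_MONOTONIC, &t2);

    printf("setup: %.2fms, sampling: %.2fms\n",
        ms_between(t0, t1), ms_between(t1, t2));
    return 0;
}
```
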
## Other similar projects

- Squigglepy: <https://github.com/rethinkpriorities/squigglepy>

js/hello-world.js (1 line, new file)
@ -0,0 +1 @@
console.log("Hello world")

js/node_modules/@stdlib/utils/global/lib/window.js (2 lines, generated, vendored)
@ -20,7 +20,7 @@
// MAIN //

var obj = ( typeof window === 'object' ) ? window : {};
var obj = ( typeof window === 'object' ) ? window : null;

// EXPORTS //

@ -1 +0,0 @@
print("Hello world")

lua/makefile (32 lines)
@ -1,32 +0,0 @@
# Interface:
# make
# make build
# make run
# make time-linux
# make install
# make format
# make time-linux-simple
# make profile-linux

SRC=samples.lua
# INTERPRETER=lua# < original
INTERPRETER=luajit# < faster

run: $(SRC)
	$(INTERPRETER) $(SRC)

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	@echo "Running 100x and taking avg time of: $(INTERPRETER) $(SRC)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(INTERPRETER) $(SRC); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo

time-linux-simple:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	/bin/time -f "Time: %es" $(INTERPRETER) $(SRC) && echo

profile-linux:
	@echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
	@echo "Must be run as sudo"
	sudo perf record $(INTERPRETER) $(SRC)
	sudo perf report
	rm perf.data

@ -1,82 +0,0 @@
-- Consts and prep
PI = 3.14159265358979323846;
NORMAL95CONFIDENCE = 1.6448536269514722;
math.randomseed(1234)

-- Random distribution functions
function sample_normal_0_1()
    local u1 = math.random()
    local u2 = math.random()
    local result = math.sqrt(-2 * math.log(u1)) * math.sin(2 * PI * u2)
    return result
end

function sample_normal(mean, sigma)
    return mean + (sigma * sample_normal_0_1())
end

function sample_uniform(min, max)
    return math.random() * (max - min) + min
end

function sample_lognormal(logmean, logsigma)
    return math.exp(sample_normal(logmean, logsigma))
end

function sample_to(low, high)
    local loglow = math.log(low);
    local loghigh = math.log(high);
    local logmean = (loglow + loghigh) / 2;
    local logsigma = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
    return sample_lognormal(logmean, logsigma);
end

-- Mixture
function mixture(samplers, weights, n)
    assert(#samplers == #weights)
    local l = #weights
    local sum_weights = 0
    for i = 1, l, 1 do
        -- ^ arrays start at 1
        sum_weights = sum_weights + weights[i]
    end
    local cumsummed_normalized_weights = {}
    table.insert(cumsummed_normalized_weights, weights[1]/sum_weights)
    for i = 2, l, 1 do
        table.insert(cumsummed_normalized_weights, cumsummed_normalized_weights[i-1] + weights[i]/sum_weights)
    end

    local result = {}
    for i = 1, n, 1 do
        local r = math.random()
        local j = 1
        while r > cumsummed_normalized_weights[j] do
            j = j + 1
        end
        table.insert(result, samplers[j]())
    end
    return result
end

-- Main
p_a = 0.8
p_b = 0.5
p_c = p_a * p_b

function sample_0() return 0 end
function sample_1() return 1 end
function sample_few() return sample_to(1, 3) end
function sample_many() return sample_to(2, 10) end

samplers = {sample_0, sample_1, sample_few, sample_many}
weights = { (1 - p_c), p_c/2, p_c/4, p_c/4 }

n = 1000000
result = mixture(samplers, weights, n)
sum = 0
for i = 1, n, 1 do
    sum = sum + result[i]
    -- print(result[i])
end
print(sum/n)

nim/makefile (18 lines)
@ -1,18 +0,0 @@
SHELL := /bin/bash ## <= required to use time
VERBOSE=--verbosity:0

build: samples.nim
	nim c $(VERBOSE) samples.nim

run: samples
	./samples $(VERBOSE)

examine: samples
	nim c $(VERBOSE) samples.nim && time ./samples $(VERBOSE) && echo
	nim c $(VERBOSE) -d:release samples.nim && time ./samples $(VERBOSE) && echo
	nim c $(VERBOSE) -d:danger samples.nim && time ./samples $(VERBOSE)

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	@echo "Running 100x and taking avg time of: ./samples"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do ./samples; done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo

nim/samples (binary file not shown)

@ -1,4 +0,0 @@
This is a version of time-to-botec for Nim for which I define logarithms, normals & lognormals pretty much
from scratch, without relying on libraries except for random numbers, sqrts, or taking a to the power of b.

This is interesting because it leads to better understanding. But it doesn't help compare the efficiency of various languages. So I'm setting it aside for now, even though I think it was really interesting.

@ -1,11 +0,0 @@
SHELL := /bin/bash

build: samples.nim
	nim c --verbosity:0 samples.nim

run: samples
	./samples --verbosity:0

examine: samples
	# nim c --verbosity:0 --opt:speed -d:release -d:danger --checks:off samples.nim && time ./samples --verbosity:0 --checks:off
	nim c -d:release samples.nim && time ./samples

Binary file not shown.

@ -1,145 +0,0 @@
import std/math
import std/sugar
import std/random
import std/sequtils

randomize()

## Basic math functions
proc factorial(n: int): int =
  if n == 0 or n < 0:
    return 1
  else:
    return n * factorial(n - 1)

proc sine(x: float): float =
  let n = 8
  # ^ Taylor will converge really quickly
  # notice that the factorial of 17 is
  # already pretty gigantic
  var acc = 0.0
  for i in 0..n:
    var k = 2*i + 1
    var taylor = pow(-1, i.float) * pow(x, k.float) / factorial(k).float
    acc = acc + taylor
  return acc

## Log function
## <https://en.wikipedia.org/wiki/Natural_logarithm#High_precision>

## Arithmetic-geometric mean
proc ag(x: float, y: float): float =
  let n = 32 # just some high number
  var a = (x + y)/2.0
  var b = sqrt(x * y)
  for i in 0..n:
    let temp = a
    a = (a+b)/2.0
    b = sqrt(b*temp)
  return a

## Find m such that x * 2^m > 2^precision/2
proc find_m(x: float): float =
  var m = 0.0
  let precision = 64 # bits
  let c = pow(2.0, precision.float / 2.0)
  while x * pow(2.0, m) < c:
    m = m + 1
  return m

proc log(x: float): float =
  let m = find_m(x)
  let s = x * pow(2.0, m)
  let ln2 = 0.6931471805599453
  return (PI / (2.0 * ag(1, 4.0/s))) - m * ln2

## Test these functions
## echo factorial(5)
## echo sine(1.0)
## echo log(0.1)
## echo log(2.0)
## echo log(3.0)
## echo pow(2.0, 32.float)

## Distribution functions

## Normal
## <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form>
proc ur_normal(): float =
  let u1 = rand(1.0)
  let u2 = rand(1.0)
  let z = sqrt(-2.0 * log(u1)) * sine(2 * PI * u2)
  return z

proc normal(mean: float, sigma: float): float =
  return (mean + sigma * ur_normal())

proc lognormal(logmean: float, logsigma: float): float =
  let answer = pow(E, normal(logmean, logsigma))
  return answer

proc to(low: float, high: float): float =
  let normal95confidencePoint = 1.6448536269514722
  let loglow = log(low)
  let loghigh = log(high)
  let logmean = (loglow + loghigh)/2
  let logsigma = (loghigh - loglow) / (2.0 * normal95confidencePoint)
  return lognormal(logmean, logsigma)

## echo ur_normal()
## echo normal(10, 20)
## echo lognormal(2, 4)
## echo to(10, 90)

## Manipulate samples

proc make_samples(f: () -> float, n: int): seq[float] =
  result = toSeq(1..n).map(_ => f())
  return result

proc mixture(sxs: seq[seq[float]], ps: seq[float], n: int): seq[float] =

  assert sxs.len == ps.len

  var ws: seq[float]
  var sum = 0.0
  for i, p in ps:
    sum = sum + p
    ws.add(sum)
  ws = ws.map(w => w/sum)

  proc get_mixture_sample(): float =
    let r = rand(1.0)
    var i = ws.len - 1
    for j, w in ws:
      if r < w:
        i = j
        break
    ## only occasion when ^ doesn't assign i
    ## is when r is exactly 1
    ## which would correspond to choosing the last item in ws
    ## which is why i is initialized to ws.len - 1
    let xs = sxs[i]
    let l = xs.len-1
    let k = rand(0..l)
    return xs[k]

  return toSeq(1..n).map(_ => get_mixture_sample())

## Actual model

let n = 1000000

let p_a = 0.8
let p_b = 0.5
let p_c = p_a * p_b

let weights = @[ 1.0 - p_c, p_c/2.0, p_c/4.0, p_c/4.0 ]

let fs = [ () => 0.0, () => 1.0, () => to(1.0, 3.0), () => to(2.0, 10.0) ]
let dists = fs.map(f => make_samples(f, n))
let result = mixture(dists, weights, n)
let mean_result = foldl(result, a + b, 0.0) / float(result.len)

# echo result
echo mean_result

@ -1,2 +0,0 @@
build: sums.nim
	nim c sums.nim

Binary file not shown.

@ -1,22 +0,0 @@
import std/math

proc factorial(n: int): int =
  if n == 0 or n < 0:
    return 1
  else:
    return n * factorial(n - 1)

proc sine(x: float): float =
  let n = 8
  # ^ Taylor will converge really quickly
  # notice that the factorial of 17 is
  # already pretty gigantic
  var acc = 0.0
  for i in 0..n:
    var k = 2*i + 1
    var taylor = pow(-1, i.float) * pow(x, k.float) / factorial(k).float
    acc = acc + taylor
  return acc

echo factorial(17)
echo sine(1.0)

@ -1,84 +0,0 @@
import std/math
import std/sugar
import std/random
import std/sequtils

randomize()

## Distribution functions

## Normal
## <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form>
proc ur_normal(): float =
  let u1 = rand(1.0)
  let u2 = rand(1.0)
  let z = sqrt(-2.0 * ln(u1)) * sin(2 * PI * u2)
  return z

proc normal(mean: float, sigma: float): float =
  return (mean + sigma * ur_normal())

proc lognormal(logmean: float, logsigma: float): float =
  let answer = pow(E, normal(logmean, logsigma))
  return answer

proc to(low: float, high: float): float =
  let normal95confidencePoint = 1.6448536269514722
  let loglow = ln(low)
  let loghigh = ln(high)
  let logmean = (loglow + loghigh)/2
  let logsigma = (loghigh - loglow) / (2.0 * normal95confidencePoint)
  return lognormal(logmean, logsigma)

## echo ur_normal()
## echo normal(10, 20)
## echo lognormal(2, 4)
## echo to(10, 90)

## Manipulate samples

proc mixture(fs: seq[proc (): float {.nimcall.}], ps: seq[float], n: int): seq[float] =

  assert fs.len == ps.len

  var ws: seq[float]
  var sum = 0.0
  for i, p in ps:
    sum = sum + p
    ws.add(sum)
  ws = ws.map(w => w/sum)

  var samples: seq[float]
  let rs = toSeq(1..n).map(_ => rand(1.0))
  for i in 0..(n-1):
    let r = rs[i]
    var j = ws.len - 1
    for k, w in ws:
      if r < w:
        j = k
        break
    ## only occasion when ^ doesn't assign j
    ## is when r is exactly 1
    ## which would correspond to choosing the last item in ws
    ## which is why j is initialized to ws.len - 1
    let f = fs[j]
    samples.add(f())
  return samples

## Actual model

let n = 1000000

let p_a = 0.8
let p_b = 0.5
let p_c = p_a * p_b

let weights = @[ 1.0 - p_c, p_c/2.0, p_c/4.0, p_c/4.0 ]

let fs = @[ proc (): float = 0.0, proc (): float = 1.0, proc (): float = to(1.0, 3.0), proc (): float = to(2.0, 10.0)]
let result = mixture(fs, weights, n)
let mean_result = foldl(result, a + b, 0.0) / float(result.len)

# echo result
echo mean_result

python/hello-world.py (1 line, new file)
@ -0,0 +1 @@
print("Hello world!")

@ -1,10 +0,0 @@
install:
	pip3 install "squigglepy>=0.26"
	pip3 install numpy

run:
	python3 samples.py

time:
	time python3 samples.py

@ -1,18 +0,0 @@
import squigglepy as sq
import numpy as np

p_a = 0.8
p_b = 0.5
p_c = p_a * p_b

dist_0 = 0
dist_1 = 1
dist_some = sq.to(1, 3)
dist_many = sq.to(2, 10)

dists = [dist_0, dist_1, dist_some, dist_many]
weights = [(1 - p_c), p_c/2, p_c/4, p_c/4]

result = sq.mixture(dists, weights)
result_samples = sq.sample(result, 1000000)
print(np.mean(result_samples))

time.txt (92 lines)
@ -1,41 +1,11 @@
# Optimized C

$ make && make time-linux
gcc -O3 samples.c -fopenmp -lm -o out/samples
Requires /bin/time, found on GNU/Linux systems

Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples
Time using 1 thread: 20.20ms

Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples
Time using 2 threads: 17.50ms

Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples
Time for 4 threads: 17.00ms

Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples
Time using 8 threads: 8.40ms

Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples
Time using 16 threads: 5.00ms

# C

## normal compilation
0.888458

real	0m0,442s
user	0m0,378s
sys	0m0,064s

## -Ofast
0.888458

real	0m0.292s
user	0m0.266s
sys	0m0.026s

# Squiggle

real	0m1,536s

@ -66,65 +36,3 @@ real 0m7,000s
user	0m6,944s
sys	0m0,052s

## Nim

nim c --verbosity:0 samples.nim && time ./samples --verbosity:0 && echo
0.8860780498240779

real	0m0.234s
user	0m0.214s
sys	0m0.020s

nim c --verbosity:0 -d:release samples.nim && time ./samples --verbosity:0 && echo
0.884035098700204

real	0m0.074s
user	0m0.043s
sys	0m0.031s

nim c --verbosity:0 -d:danger samples.nim && time ./samples --verbosity:0
0.8892827195895541

real	0m0.068s
user	0m0.048s
sys	0m0.020s

— make time-linux
Requires /bin/time, found on GNU/Linux systems

Running 100x and taking avg time of:
Time: 38.90ms

## Squigglepy

— time make run
python3 samples.py
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.68it/s]
100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:00<00:00, 1298266.78it/s]
0.8854926010232937

real	0m1.602s
user	0m1.582s
sys	0m0.331s

## Lua

— make time-linux-simple
Requires /bin/time, found on GNU/Linux systems

/bin/time -f "Time: %es" lua samples.lua && echo
0.88932198462254
Time: 0.28s

— make time-linux
Requires /bin/time, found on GNU/Linux systems

Running 100x and taking avg time of: lua samples.lua
Time: 275.80ms

— make time-linux
Requires /bin/time, found on GNU/Linux systems

Running 100x and taking avg time of: luajit samples.lua
Time: 68.60ms

@ -1,49 +0,0 @@
# Interface:
# make
# make build
# make run
# make time-linux
# make install
# make format
# make time-linux-simple
# make profile-linux

# Main file
SRC=samples.lang
OUTPUT=out/samples

# Compiler
COMPILER=lang-compiler
FLAGS=

## Formatter
FORMATTER=lang-formatter --options

## make build
build: $(SRC)
	$(COMPILER) $(FLAGS) $(SRC) -o $(OUTPUT)

install:
	lang-package-manager install dependencies

format: $(SRC)
	$(FORMATTER) $(SRC)

run: $(SRC) $(OUTPUT)
	./$(OUTPUT)

time-linux:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	@echo "Running 100x and taking avg time of: $(OUTPUT)"
	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time: |" | sed 's|$$|ms|' && echo

time-linux-simple:
	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
	/bin/time -f "Time: %es" ./$(OUTPUT) && echo

profile-linux:
	@echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
	@echo "Must be run as sudo"
	sudo perf record $(OUTPUT)
	sudo perf report
	rm perf.data

@ -1,5 +0,0 @@
const std = @import("std");

pub fn main() void {
    std.debug.print("Hello, {s}!\n", .{"World"});
}

@ -1 +0,0 @@
See <https://ziglang.org/learn/getting-started/#tagged-release-or-nightly-build>

@ -1 +0,0 @@
https://ziglearn.org/chapter-1/