time measuring tweaks.
This commit is contained in:
parent
c35ddcc358
commit
160e824108
|
@ -7,13 +7,18 @@ The main changes are:
|
||||||
- an optimization of the mixture function (it passes the functions instead of the whole arrays, greatly reducing memory usage and computation time) and
|
- an optimization of the mixture function (it passes the functions instead of the whole arrays, greatly reducing memory usage and computation time) and
|
||||||
- the implementation of multi-threading with OpenMP.
|
- the implementation of multi-threading with OpenMP.
|
||||||
|
|
||||||
|
## Performance
|
||||||
|
|
||||||
The mean time of execution is 6 ms. With the following distribution:
|
The mean time of execution is 6 ms. With the following distribution:
|
||||||
|
|
||||||
![Time histogram](https://i.imgur.com/6iT2PkF.png)
|
![Time histogram](https://i.imgur.com/6iT2PkF.png)
|
||||||
|
|
||||||
The hardware used was an AMD 5800X3D with 16 GB of DDR4-3200 memory.
|
The hardware used was an AMD 5800X3D with 16 GB of DDR4-3200 memory.
|
||||||
|
|
||||||
Take into account that the multi-threading introduces a bit of dispersion in the execution time due to the creation and destruction of threads.
|
|
||||||
|
|
||||||
Also, the time data was collected by executing the body of the main() function 1000 times in a for loop, not by executing the program itself 1000 times.
|
Also, the time data was collected by executing the body of the main() function 1000 times in a for loop, not by executing the program itself 1000 times.
|
||||||
|
|
||||||
|
## Multithreading
|
||||||
|
|
||||||
|
Take into account that the multi-threading introduces a bit of dispersion in the execution time due to the creation and destruction of threads.
|
||||||
|
|
||||||
|
On Nuño's machine, multithreading actually introduces a noticeable slowdown factor.
|
||||||
|
|
|
@ -37,27 +37,17 @@ format: $(SRC)
|
||||||
$(FORMATTER) $(SRC)
|
$(FORMATTER) $(SRC)
|
||||||
|
|
||||||
run: $(SRC) $(OUTPUT)
|
run: $(SRC) $(OUTPUT)
|
||||||
OMP_NUM_THREADS=4 ./$(OUTPUT)
|
|
||||||
|
|
||||||
test: $(SRC) $(OUTPUT)
|
|
||||||
OMP_NUM_THREADS=1 ./$(OUTPUT)
|
OMP_NUM_THREADS=1 ./$(OUTPUT)
|
||||||
echo ""
|
|
||||||
OMP_NUM_THREADS=2 ./$(OUTPUT)
|
multi: $(SRC) $(OUTPUT)
|
||||||
echo ""
|
OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
|
||||||
|
OMP_NUM_THREADS=2 ./$(OUTPUT) && echo
|
||||||
OMP_NUM_THREADS=4 ./$(OUTPUT)
|
OMP_NUM_THREADS=4 ./$(OUTPUT)
|
||||||
|
|
||||||
# echo "Increasing stack size limit, because we are dealing with 1M samples"
|
time:
|
||||||
# # ulimit: increase stack size limit
|
OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
|
||||||
# # -Ss: the soft limit. If you set the hard limit, you then can't raise it
|
OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
|
||||||
# # 256000: around 250Mbs, if I'm reading it correctly.
|
OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo
|
||||||
# # Then run the program
|
|
||||||
# ulimit -Ss 256000 && ./$(OUTPUT)
|
|
||||||
|
|
||||||
linux-install:
|
linux-install:
|
||||||
sudo apt-get install libomp-dev
|
sudo apt-get install libomp-dev
|
||||||
|
|
||||||
# Old:
|
|
||||||
# Link libraries, for good measure
|
|
||||||
# LD_LIBRARY_PATH=/usr/local/lib
|
|
||||||
# export LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
|
|
Binary file not shown.
|
@ -245,8 +245,8 @@ int main()
|
||||||
//initialize randomness
|
//initialize randomness
|
||||||
srand(time(NULL));
|
srand(time(NULL));
|
||||||
|
|
||||||
clock_t start, end;
|
// clock_t start, end;
|
||||||
start = clock();
|
// start = clock();
|
||||||
|
|
||||||
// Toy example
|
// Toy example
|
||||||
// Declare variables in play
|
// Declare variables in play
|
||||||
|
@ -269,10 +269,14 @@ int main()
|
||||||
|
|
||||||
mixture_f(samplers, weights, n_dists, dist_mixture, n_threads);
|
mixture_f(samplers, weights, n_dists, dist_mixture, n_threads);
|
||||||
printf("Sum(dist_mixture, N)/N = %f\n", split_array_sum(dist_mixture, N, n_threads) / N);
|
printf("Sum(dist_mixture, N)/N = %f\n", split_array_sum(dist_mixture, N, n_threads) / N);
|
||||||
|
// array_print(dist_mixture[0], N);
|
||||||
split_array_free(dist_mixture, n_threads);
|
split_array_free(dist_mixture, n_threads);
|
||||||
|
|
||||||
end = clock();
|
// end = clock();
|
||||||
printf("Time (ms): %f\n", ((double)(end - start)) / (CLOCKS_PER_SEC * 10) * 1000);
|
// printf("Time (ms): %f\n", ((double)(end - start)) / (CLOCKS_PER_SEC * 10) * 1000);
|
||||||
|
// ^ This only measures how long the body of main() takes to run, not the whole program;
|
||||||
|
// e.g., it excludes the time spent loading the program into memory.
|
||||||
|
// Also CLOCKS_PER_SEC in POSIX is a constant equal to 1000000.
|
||||||
|
// See: https://stackoverflow.com/questions/10455905/why-is-clocks-per-sec-not-the-actual-number-of-clocks-per-second
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
10
C/samples.c
10
C/samples.c
|
@ -3,6 +3,7 @@
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
#define N 1000000
|
#define N 1000000
|
||||||
/*
|
/*
|
||||||
|
@ -111,6 +112,10 @@ void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* result
|
||||||
/* Main */
|
/* Main */
|
||||||
int main(void)
|
int main(void)
|
||||||
{
|
{
|
||||||
|
// Start clock
|
||||||
|
clock_t start, end;
|
||||||
|
start = clock();
|
||||||
|
|
||||||
/* Initialize GNU Statistical Library (GSL) stuff */
|
/* Initialize GNU Statistical Library (GSL) stuff */
|
||||||
const gsl_rng_type* T;
|
const gsl_rng_type* T;
|
||||||
gsl_rng* r;
|
gsl_rng* r;
|
||||||
|
@ -143,7 +148,10 @@ int main(void)
|
||||||
|
|
||||||
/* Clean up GSL */
|
/* Clean up GSL */
|
||||||
gsl_rng_free(r);
|
gsl_rng_free(r);
|
||||||
|
|
||||||
|
// End clock
|
||||||
|
end = clock();
|
||||||
|
printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000);
|
||||||
/* Return success*/
|
/* Return success*/
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
21
README.md
21
README.md
|
@ -29,14 +29,15 @@ As of now, it may be useful for checking the validity of simple estimations. The
|
||||||
|
|
||||||
## Comparison table
|
## Comparison table
|
||||||
|
|
||||||
| Language | Time | Lines of code |
|
| Language | Time | Lines of code |
|
||||||
|----------------------|-----------|---------------|
|
|--------------------------|-----------|---------------|
|
||||||
| Nim | 0m0.068s | 84 |
|
| C (optimized, 1 thread) | ~30ms | 282 |
|
||||||
| C | 0m0.292s | 149 |
|
| Nim | 68ms | 84 |
|
||||||
| Javascript (NodeJS) | 0m0,732s | 69 |
|
| C | 292ms | 149 |
|
||||||
| Squiggle | 0m1,536s | 14 |
|
| Javascript (NodeJS) | 732ms | 69 |
|
||||||
| R | 0m7,000s | 49 |
|
| Squiggle | 1.536s | 14 |
|
||||||
| Python (CPython) | 0m16,641s | 56 |
|
| R | 7.000s | 49 |
|
||||||
|
| Python (CPython) | 16.641s | 56 |
|
||||||
|
|
||||||
Time measurements taken with the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples:
|
Time measurements taken with the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples:
|
||||||
|
|
||||||
|
@ -51,7 +52,9 @@ I was really happy trying [Nim](https://nim-lang.org/), and as a result the Nim
|
||||||
|
|
||||||
Without 1. and 2., the nim code takes 0m0.183s instead. But I don't think that these are unfair advantages: I liked trying out nim and therefore put more love into the code, and this seems like it could be a recurring factor.
|
Without 1. and 2., the nim code takes 0m0.183s instead. But I don't think that these are unfair advantages: I liked trying out nim and therefore put more love into the code, and this seems like it could be a recurring factor.
|
||||||
|
|
||||||
For C, I enabled the `-Ofast` compilation flag. Without it, it instead takes ~0.4 seconds. Initially, before I enabled the `-Ofast` flag, I was surprised that the Node and Squiggle code were comparable to the C code. Using [bun](https://bun.sh/) instead of node is actually a bit slower.
|
For the initial C code, I enabled the `-Ofast` compilation flag. Without it, it instead takes ~0.4 seconds. Initially, before I enabled the `-Ofast` flag, I was surprised that the Node and Squiggle code were comparable to the C code. Using [bun](https://bun.sh/) instead of node is actually a bit slower.
|
||||||
|
|
||||||
|
For the optimized C code, see [that folder's README](./C-optimized/README.md).
|
||||||
|
|
||||||
For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org).
|
For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org).
|
||||||
|
|
||||||
|
|
14
time.txt
14
time.txt
|
@ -1,3 +1,17 @@
|
||||||
|
# Optimized C
|
||||||
|
|
||||||
|
OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./out/samples && echo
|
||||||
|
Sum(dist_mixture, N)/N = 0.885837
|
||||||
|
Time: 0.02s
|
||||||
|
|
||||||
|
OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./out/samples && echo
|
||||||
|
Sum(dist_mixture, N)/N = 0.885123
|
||||||
|
Time: 0.14s
|
||||||
|
|
||||||
|
OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./out/samples && echo
|
||||||
|
Sum(dist_mixture, N)/N = 0.886255
|
||||||
|
Time: 0.11s
|
||||||
|
|
||||||
# C
|
# C
|
||||||
|
|
||||||
## normal compilation
|
## normal compilation
|
||||||
|
|
Loading…
Reference in New Issue
Block a user