diff --git a/C-optimized/README.md b/C-optimized/README.md index afaefdc7..9e58f7a4 100644 --- a/C-optimized/README.md +++ b/C-optimized/README.md @@ -7,13 +7,18 @@ The main changes are: - an optimization of the mixture function (it passes the functions instead of the whole arrays, reducing in great measure the memory usage and the computation time) and - the implementation of multi-threading with OpenMP. +## Performance + The mean time of execution is 6 ms. With the following distribution: ![Time histogram](https://i.imgur.com/6iT2PkF.png) The hardware used has been an AMD 5800x3D and 16GB of DDR4-3200 MHz. -Take into account that the multi-threading introduces a bit of dispersion in the execution time due to the creation and destruction of threads. - Also, the time data has been collected by executing the interior of the main() function 1000 times in a for loop, not executing the program itself 1000 times. +## Multithreading + +Take into account that the multi-threading introduces a bit of dispersion in the execution time due to the creation and destruction of threads. + +In Nuño's machine, multithreading actually introduces a noticeable slowdown factor. diff --git a/C-optimized/makefile b/C-optimized/makefile index 83174017..38136a5b 100644 --- a/C-optimized/makefile +++ b/C-optimized/makefile @@ -37,27 +37,17 @@ format: $(SRC) $(FORMATTER) $(SRC) run: $(SRC) $(OUTPUT) - OMP_NUM_THREADS=4 ./$(OUTPUT) - -test: $(SRC) $(OUTPUT) OMP_NUM_THREADS=1 ./$(OUTPUT) - echo "" - OMP_NUM_THREADS=2 ./$(OUTPUT) - echo "" + +multi: $(SRC) $(OUTPUT) + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + OMP_NUM_THREADS=2 ./$(OUTPUT) && echo OMP_NUM_THREADS=4 ./$(OUTPUT) -# echo "Increasing stack size limit, because we are dealing with 1M samples" -# # ulimit: increase stack size limit -# # -Ss: the soft limit. If you set the hard limit, you then can't raise it -# # 256000: around 250Mbs, if I'm reading it correctly. -# # Then run the program -# ulimit -Ss 256000 && ./$(OUTPUT) +time: + OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./$(OUTPUT) && echo + OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./$(OUTPUT) && echo linux-install: sudo apt-get install libomp-dev - -# Old: -# Link libraries, for good measure -# LD_LIBRARY_PATH=/usr/local/lib -# export LD_LIBRARY_PATH - diff --git a/C-optimized/out/samples b/C-optimized/out/samples index 015ac6cc..0486ecc6 100755 Binary files a/C-optimized/out/samples and b/C-optimized/out/samples differ diff --git a/C-optimized/samples.c b/C-optimized/samples.c index 744b00ab..9942d7ef 100644 --- a/C-optimized/samples.c +++ b/C-optimized/samples.c @@ -245,8 +245,8 @@ int main() //initialize randomness srand(time(NULL)); - clock_t start, end; - start = clock(); + // clock_t start, end; + // start = clock(); // Toy example // Declare variables in play @@ -269,10 +269,14 @@ int main() mixture_f(samplers, weights, n_dists, dist_mixture, n_threads); printf("Sum(dist_mixture, N)/N = %f\n", split_array_sum(dist_mixture, N, n_threads) / N); - + // array_print(dist_mixture[0], N); split_array_free(dist_mixture, n_threads); - end = clock(); - printf("Time (ms): %f\n", ((double)(end - start)) / (CLOCKS_PER_SEC * 10) * 1000); + // end = clock(); + // printf("Time (ms): %f\n", ((double)(end - start)) / (CLOCKS_PER_SEC * 10) * 1000); + // ^ Will only measure how long it takes the inner main to run, not the whole program, + // including e.g., loading the program into memory or smth. + // Also CLOCKS_PER_SEC in POSIX is a constant equal to 1000000. + // See: https://stackoverflow.com/questions/10455905/why-is-clocks-per-sec-not-the-actual-number-of-clocks-per-second return 0; } diff --git a/C/samples b/C/samples index bf5c87a3..b16b392a 100755 Binary files a/C/samples and b/C/samples differ diff --git a/C/samples.c b/C/samples.c index 6437c2b6..b18ec29b 100644 --- a/C/samples.c +++ b/C/samples.c @@ -3,6 +3,7 @@ #include #include #include +#include #define N 1000000 /* @@ -111,6 +112,10 @@ void mixture(gsl_rng* r, double* dists[], double* weights, int n, double* result /* Main */ int main(void) { + // Start clock + clock_t start, end; + start = clock(); + /* Initialize GNU Statistical Library (GSL) stuff */ const gsl_rng_type* T; gsl_rng* r; @@ -143,7 +148,10 @@ int main(void) /* Clean up GSL */ gsl_rng_free(r); - + + // End clock + end = clock(); + printf("Total time (ms): %f\n", ((double)(end - start)) / CLOCKS_PER_SEC * 1000); /* Return success*/ return EXIT_SUCCESS; } diff --git a/README.md b/README.md index 23f2d43b..79ce1129 100644 --- a/README.md +++ b/README.md @@ -29,14 +29,15 @@ As of now, it may be useful for checking the validity of simple estimations. The ## Comparison table -| Language | Time | Lines of code | -|----------------------|-----------|---------------| -| Nim | 0m0.068s | 84 | -| C | 0m0.292s | 149 | -| Javascript (NodeJS) | 0m0,732s | 69 | -| Squiggle | 0m1,536s | 14 | -| R | 0m7,000s | 49 | -| Python (CPython) | 0m16,641s | 56 | +| Language | Time | Lines of code | +|--------------------------|-----------|---------------| +| C (optimized, 1 thread) | ~30ms | 282 | +| Nim | 68ms | 84 | +| C | 292ms | 149 | +| Javascript (NodeJS) | 732ms | 69 | +| Squiggle | 1,536s | 14 | +| R | 7,000s | 49 | +| Python (CPython) | 16,641s | 56 | Time measurements taken with the [time](https://man7.org/linux/man-pages/man1/time.1.html) tool, using 1M samples: @@ -51,7 +52,9 @@ I was really happy trying [Nim](https://nim-lang.org/), and as a result the Nim Without 1. and 2., the nim code takes 0m0.183s instead. But I don't think that these are unfair advantages: I liked trying out nim and therefore put in more love into the code, and this seems like it could be a recurring factor. -For C, I enabled the `-Ofast` compilation flag. Without it, it instead takes ~0.4 seconds. Initially, before I enabled the `-Ofast` flag, I was surprised that the Node and Squiggle code were comparable to the C code. Using [bun](https://bun.sh/) instead of node is actually a bit slower. +For the initial C code, I enabled the `-Ofast` compilation flag. Without it, it instead takes ~0.4 seconds. Initially, before I enabled the `-Ofast` flag, I was surprised that the Node and Squiggle code were comparable to the C code. Using [bun](https://bun.sh/) instead of node is actually a bit slower. + +For the optimized C code, see [that folder's README](./C-optimized/README.md). For the Python code, it's possible that the lack of speed is more a function of me not being as familiar with Python. It's also very possible that the code would run faster with [PyPy](https://doc.pypy.org). diff --git a/time.txt b/time.txt index 01fb21f0..26814ee5 100644 --- a/time.txt +++ b/time.txt @@ -1,3 +1,17 @@ +# Optimized C + +OMP_NUM_THREADS=1 /bin/time -f "Time: %es" ./out/samples && echo +Sum(dist_mixture, N)/N = 0.885837 +Time: 0.02s + +OMP_NUM_THREADS=2 /bin/time -f "Time: %es" ./out/samples && echo +Sum(dist_mixture, N)/N = 0.885123 +Time: 0.14s + +OMP_NUM_THREADS=4 /bin/time -f "Time: %es" ./out/samples && echo +Sum(dist_mixture, N)/N = 0.886255 +Time: 0.11s + # C ## normal compilation