finish xorshift updating.
This commit is contained in:
parent
eaee16f205
commit
c73476e5aa
|
@ -11,6 +11,8 @@ This repository contains a few implementations of a simple botec (back-of-the-en
|
||||||
- [ ] Add Windows/Powershell time-measuring commands
|
- [ ] Add Windows/Powershell time-measuring commands
|
||||||
- [ ] Add CUDA?
|
- [ ] Add CUDA?
|
||||||
- [x] Added results of perf. `rand_r` seems like a big chunk of it, but I'm hesitant to use lower-quality random numbers
|
- [x] Added results of perf. `rand_r` seems like a big chunk of it, but I'm hesitant to use lower-quality random numbers
|
||||||
|
- [x] used xorshift instead
|
||||||
|
- [ ] Use xorshift with a struct instead of a pointer? idk, could be faster for some reason?
|
||||||
- [x] Update repository with correct timing
|
- [x] Update repository with correct timing
|
||||||
- [x] Use better profiling approach to capture timing with 1M samples.
|
- [x] Use better profiling approach to capture timing with 1M samples.
|
||||||
- [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used
|
- [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used
|
||||||
|
|
BIN
C/out/samples
BIN
C/out/samples
Binary file not shown.
47
C/perf.txt
47
C/perf.txt
|
@ -1,25 +1,26 @@
|
||||||
|
Samples: 884 of event 'cycles', Event count (approx.): 551167949
|
||||||
Overhead Command Shared Object Symbol
|
Overhead Command Shared Object Symbol
|
||||||
23.94% samples libc-2.31.so [.] rand_r
|
35.32% samples samples [.] xorshift32
|
||||||
18.14% samples libgomp.so.1.0.0 [.] 0x000000000001d132
|
14.09% samples libgomp.so.1.0.0 [.] 0x000000000001d2ea
|
||||||
15.43% samples libgomp.so.1.0.0 [.] 0x000000000001d2ea
|
12.04% samples libgomp.so.1.0.0 [.] 0x000000000001d132
|
||||||
12.16% samples samples [.] mixture._omp_fn.0
|
11.53% samples samples [.] mixture._omp_fn.0
|
||||||
4.36% samples libm-2.31.so [.] __sin_fma
|
4.55% samples libm-2.31.so [.] __sin_fma
|
||||||
3.49% samples libm-2.31.so [.] __ieee754_log_fma
|
4.24% samples samples [.] rand_0_to_1
|
||||||
3.34% samples samples [.] random_to
|
3.77% samples samples [.] random_to
|
||||||
3.13% samples samples [.] random_uniform
|
3.03% samples libm-2.31.so [.] __logf_fma
|
||||||
2.77% samples samples [.] split_array_sum._omp_fn.0
|
1.61% samples libm-2.31.so [.] __expf_fma
|
||||||
2.01% samples samples [.] rand_float
|
1.54% samples samples [.] split_array_sum._omp_fn.0
|
||||||
1.65% samples libm-2.31.so [.] __logf_fma
|
1.38% samples samples [.] random_uniform
|
||||||
0.88% samples libgomp.so.1.0.0 [.] 0x000000000001d2f5
|
0.94% samples samples [.] ur_normal
|
||||||
0.86% samples samples [.] ur_normal
|
0.91% samples libm-2.31.so [.] __ieee754_log_fma
|
||||||
0.75% samples libm-2.31.so [.] __expf_fma
|
0.74% samples libgomp.so.1.0.0 [.] 0x000000000001d13d
|
||||||
0.70% samples libgomp.so.1.0.0 [.] 0x000000000001d13d
|
0.52% samples samples [.] sample_0
|
||||||
0.69% samples libgomp.so.1.0.0 [.] 0x000000000001d139
|
0.41% samples libm-2.31.so [.] __sqrtf_finite@GLIBC_2.15
|
||||||
0.57% samples libgomp.so.1.0.0 [.] 0x000000000001d2f1
|
0.38% samples samples [.] sample_1
|
||||||
0.57% samples samples [.] sample_1
|
0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d139
|
||||||
0.55% samples samples [.] random_lognormal
|
0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d2f5
|
||||||
0.50% samples [kernel.kallsyms] [k] asm_exc_page_fault
|
0.22% samples [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
|
||||||
0.49% samples [kernel.kallsyms] [k] clear_page_rep
|
0.18% samples [kernel.kallsyms] [k] _raw_spin_lock_irq
|
||||||
0.47% samples samples [.] random_normal
|
0.18% samples samples [.] random_lognormal
|
||||||
0.38% samples [kernel.kallsyms] [k] default_send_IPI_single_phys
|
0.17% samples libgomp.so.1.0.0 [.] 0x000000000001d2f1
|
||||||
|
|
||||||
|
|
11
C/samples.c
11
C/samples.c
|
@ -81,7 +81,9 @@ float split_array_sum(float** meta_array, int length, int divided_into)
|
||||||
|
|
||||||
uint32_t xorshift32(uint32_t* seed)
|
uint32_t xorshift32(uint32_t* seed)
|
||||||
{
|
{
|
||||||
/* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
|
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
|
||||||
|
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
|
||||||
|
// https://en.wikipedia.org/wiki/Xorshift
|
||||||
uint32_t x = *seed;
|
uint32_t x = *seed;
|
||||||
x ^= x << 13;
|
x ^= x << 13;
|
||||||
x ^= x >> 17;
|
x ^= x >> 17;
|
||||||
|
@ -93,6 +95,13 @@ uint32_t xorshift32(uint32_t* seed)
|
||||||
|
|
||||||
float rand_0_to_1(uint32_t* seed){
|
float rand_0_to_1(uint32_t* seed){
|
||||||
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
|
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
|
||||||
|
/*
|
||||||
|
uint32_t x = *seed;
|
||||||
|
x ^= x << 13;
|
||||||
|
x ^= x >> 17;
|
||||||
|
x ^= x << 5;
|
||||||
|
return ((float)(*seed = x))/((float) UINT32_MAX);
|
||||||
|
*/
|
||||||
// previously:
|
// previously:
|
||||||
// ((float)rand_r(seed) / (float)RAND_MAX)
|
// ((float)rand_r(seed) / (float)RAND_MAX)
|
||||||
// and before that: rand, but it wasn't thread-safe.
|
// and before that: rand, but it wasn't thread-safe.
|
||||||
|
|
|
@ -33,7 +33,7 @@ The title of this repository is a pun on two meanings of "time to": "how much ti
|
||||||
|
|
||||||
| Language | Time | Lines of code |
|
| Language | Time | Lines of code |
|
||||||
|-----------------------------|-----------|---------------|
|
|-----------------------------|-----------|---------------|
|
||||||
| C (optimized, 16 threads) | 6ms | 222 |
|
| C (optimized, 16 threads) | 5ms | 249 |
|
||||||
| Nim | 68ms | 84 |
|
| Nim | 68ms | 84 |
|
||||||
| C (naïve implementation) | 292ms | 149 |
|
| C (naïve implementation) | 292ms | 149 |
|
||||||
| Javascript (NodeJS) | 732ms | 69 |
|
| Javascript (NodeJS) | 732ms | 69 |
|
||||||
|
@ -74,6 +74,8 @@ In fact, the C code ended up being so fast that I had to measure its time by run
|
||||||
|
|
||||||
And still, there are some missing optimizations, like tweaking the code to take into account cache misses. I'm not exactly sure how that would go, though.
|
And still, there are some missing optimizations, like tweaking the code to take into account cache misses. I'm not exactly sure how that would go, though.
|
||||||
|
|
||||||
|
Once the code was at 6.6ms, there was a 0.6ms gain possible by using OMP better, and a 1ms gain by using a xorshift algorithm instead of rand_r from stdlib.
|
||||||
|
|
||||||
Although the above paragraphs were written in the first person, the C code was written together with Jorge Sierra, who translated the algorithmic improvements from nim to it and added the initial multithreading support.
|
Although the above paragraphs were written in the first person, the C code was written together with Jorge Sierra, who translated the algorithmic improvements from nim to it and added the initial multithreading support.
|
||||||
|
|
||||||
### NodeJS and Squiggle
|
### NodeJS and Squiggle
|
||||||
|
|
14
time.txt
14
time.txt
|
@ -1,22 +1,24 @@
|
||||||
# Optimized C
|
# Optimized C
|
||||||
|
|
||||||
$ make time-linux
|
|
||||||
|
$ make && make time-linux
|
||||||
|
gcc -O3 samples.c -fopenmp -lm -o out/samples
|
||||||
Requires /bin/time, found on GNU/Linux systems
|
Requires /bin/time, found on GNU/Linux systems
|
||||||
|
|
||||||
Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples
|
Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples
|
||||||
Time using 1 thread: 24.00ms
|
Time using 1 thread: 20.20ms
|
||||||
|
|
||||||
Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples
|
Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples
|
||||||
Time using 2 threads: 21.80ms
|
Time using 2 threads: 17.50ms
|
||||||
|
|
||||||
Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples
|
Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples
|
||||||
Time for 4 threads: 24.40ms
|
Time for 4 threads: 17.00ms
|
||||||
|
|
||||||
Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples
|
Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples
|
||||||
Time using 8 threads: 10.40ms
|
Time using 8 threads: 8.40ms
|
||||||
|
|
||||||
Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples
|
Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples
|
||||||
Time using 16 threads: 6.60ms
|
Time using 16 threads: 5.00ms
|
||||||
|
|
||||||
# C
|
# C
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user