perf tweaks

This commit is contained in:
NunoSempere 2023-06-03 01:42:48 -06:00
parent d229021625
commit 58cfe378e5
3 changed files with 27 additions and 1 deletions

View File

@ -10,6 +10,7 @@ This repository contains a few implementations of a simple botec (back-of-the-en
- [ ] Add Windows/Powershell time-measuring commands
- [ ] Add CUDA?
- [x] Add results of perf. `rand_r` seems to account for a big chunk of the runtime, but I'm hesitant to use lower-quality random numbers
- [x] Update repository with correct timing
- [x] Use better profiling approach to capture timing with 1M samples.
- [x] See if the program can be reworked so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used

View File

@ -82,7 +82,7 @@ time-linux-simple:
## Profiling
profile-linux:
echo "Requires perf, which depends on the kernel, and might be in linux-tools package or similar"
echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
echo "Must be run as sudo"
$(CC) $(SRC) $(OPENMP) $(MATH) -o $(OUTPUT)
# ./$(OUTPUT)

25
C/perf.txt Normal file
View File

@ -0,0 +1,25 @@
Overhead Command Shared Object Symbol
23.94% samples libc-2.31.so [.] rand_r
18.14% samples libgomp.so.1.0.0 [.] 0x000000000001d132
15.43% samples libgomp.so.1.0.0 [.] 0x000000000001d2ea
12.16% samples samples [.] mixture._omp_fn.0
4.36% samples libm-2.31.so [.] __sin_fma
3.49% samples libm-2.31.so [.] __ieee754_log_fma
3.34% samples samples [.] random_to
3.13% samples samples [.] random_uniform
2.77% samples samples [.] split_array_sum._omp_fn.0
2.01% samples samples [.] rand_float
1.65% samples libm-2.31.so [.] __logf_fma
0.88% samples libgomp.so.1.0.0 [.] 0x000000000001d2f5
0.86% samples samples [.] ur_normal
0.75% samples libm-2.31.so [.] __expf_fma
0.70% samples libgomp.so.1.0.0 [.] 0x000000000001d13d
0.69% samples libgomp.so.1.0.0 [.] 0x000000000001d139
0.57% samples libgomp.so.1.0.0 [.] 0x000000000001d2f1
0.57% samples samples [.] sample_1
0.55% samples samples [.] random_lognormal
0.50% samples [kernel.kallsyms] [k] asm_exc_page_fault
0.49% samples [kernel.kallsyms] [k] clear_page_rep
0.47% samples samples [.] random_normal
0.38% samples [kernel.kallsyms] [k] default_send_IPI_single_phys