readme tweaks; add 90% histogram function

2024-01-31 15:15:56 +01:00 · 2024-01-31 15:15:56 +01:00 · c676a22ba8
commit c676a22ba8
parent e62a840625
2 changed files with 87 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -401,6 +401,8 @@ Overall, I'd describe the error handling capabilities of this library as pretty

 ### To do

+- [ ] Come up with a better headline example; fermi paradox paper is too complicated
+- [ ] Post on suckless subreddit
 - [ ] Drive in a few more real-life applications
  - [ ] US election modelling?
 - [ ] Look into using size_t instead of int for sample numbers
--- a/squiggle_more.c
+++ b/squiggle_more.c
@ -215,8 +215,8 @@ void array_print_stats(double xs[], int n){


 void array_print_histogram(double* xs, int n_samples, int n_bins) {
+    // Interface inspired by <https://github.com/red-data-tools/YouPlot>
    // Generated with the help of an llm; there might be subtle off-by-one errors
-    // interface inspired by <https://github.com/red-data-tools/YouPlot>
    if (n_bins <= 1) {
        fprintf(stderr, "Number of bins must be greater than 1.\n");
        return;
@ -305,6 +305,90 @@ void array_print_histogram(double* xs, int n_samples, int n_bins) {
    free(bins);
 }

+// Print a text histogram, on stdout, of the samples in xs that lie within
+// the interval [low, high] returned by array_get_90_ci(xs, n).
+//
+// xs: array of samples
+// n:  number of samples in xs; must be greater than 10
+//
+// On invalid input or allocation failure a message is written to stderr and
+// nothing else is printed. Samples outside the interval are not counted.
+void array_print_90_ci_histogram(double* xs, int n){
+    // Code duplicated from previous function
+    // I'll consider simplifying it at some future point
+    // Possible ideas:
+    // - having only one function that takes any confidence interval?
+    // - having a utility function that is called by both functions?
+
+    // NOTE(review): this signature carries no bin count, unlike
+    // array_print_histogram, so a fixed default is used here. Confirm the
+    // value, or pass it in if the signature is ever extended.
+    const int n_bins = 20;
+
+    // Validate before doing any work on the samples
+    if (n <= 10) {
+        fprintf(stderr, "Number of samples must be higher than 10.\n");
+        return;
+    }
+
+    ci ci_90 = array_get_90_ci(xs, n);
+
+    int *bins = (int*) calloc((size_t)n_bins, sizeof(int));
+    if (bins == NULL) {
+        fprintf(stderr, "Memory allocation for bins failed.\n");
+        return;
+    }
+
+    double min_value = ci_90.low, max_value = ci_90.high;
+
+    // Avoid division by zero for a single unique value
+    if (min_value == max_value) {
+        max_value++;
+    }
+
+    // Calculate bin width
+    double range = max_value - min_value;
+    double bin_width = range / n_bins;
+
+    // Fill the bins with sample counts.
+    // Both endpoints are inclusive, so that samples equal to the interval
+    // bounds are counted and the last bucket really is closed, as printed.
+    for (int i = 0; i < n; i++) {
+        if ((xs[i] >= min_value) && (xs[i] <= max_value)) {
+            int bin_index = (int)((xs[i] - min_value) / bin_width);
+            if (bin_index >= n_bins) {
+                bin_index = n_bins - 1; // Last bin includes max_value
+            }
+            bins[bin_index]++;
+        }
+    }
+
+    // Calculate the scaling factor based on the maximum bin count
+    int max_bin_count = 0;
+    for (int i = 0; i < n_bins; i++) {
+        if (bins[i] > max_bin_count) {
+            max_bin_count = bins[i];
+        }
+    }
+    const int MAX_WIDTH = 50; // Adjust this to your terminal width
+    double scale = max_bin_count > MAX_WIDTH ? (double)MAX_WIDTH / max_bin_count : 1.0;
+
+    // Number of decimals shown for bin edges: one by default, more when the
+    // bins are narrower than 1 so that neighbouring edges stay distinguishable
+    // (capped at 10). Depends only on bin_width, so computed once.
+    int decimalPlaces = 1;
+    if ((0 < bin_width) && (bin_width < 1)) {
+        int magnitude = (int) floor(log10(bin_width));
+        decimalPlaces = -magnitude;
+        decimalPlaces = decimalPlaces > 10 ? 10 : decimalPlaces;
+    }
+
+    // Print the histogram
+    for (int i = 0; i < n_bins; i++) {
+        double bin_start = min_value + i * bin_width;
+        double bin_end = bin_start + bin_width;
+
+        printf("  [%*.*f, %*.*f", 4+decimalPlaces, decimalPlaces, bin_start, 4+decimalPlaces, decimalPlaces, bin_end);
+        char interval_delimiter = ')';
+        if (i == (n_bins-1)) {
+            interval_delimiter = ']'; // last bucket is inclusive
+        }
+        printf("%c: ", interval_delimiter);
+
+        // Bar length is the bin count, scaled down if the tallest bar
+        // would exceed MAX_WIDTH
+        int marks = (int)(bins[i] * scale);
+        for (int j = 0; j < marks; j++) {
+            printf("█");
+        }
+        printf(" %d\n", bins[i]);
+    }
+
+    // Free the allocated memory for bins
+    free(bins);
+
+}
+
 // Replicate some of the above functions over samplers
 // However, in the future I'll delete this
 // There should be a clear boundary between working with samplers and working with an array of samples