Merge pull request #384 from quantified-uncertainty/normalize-improvements

Minor Math improvements for pointSet distributions
2022-04-27 13:11:47 -04:00 · 2022-04-27 13:11:47 -04:00 · 079e8f6c8d
commit 079e8f6c8d
parent d81e68273b a22fbb1afd
8 changed files with 65 additions and 34 deletions
--- a/packages/squiggle-lang/tests/Distributions/Invariants/AlgebraicCombination_test.res
+++ b/packages/squiggle-lang/tests/Distributions/Invariants/AlgebraicCombination_test.res
@ -65,7 +65,7 @@ describe("(Algebraic) addition of distributions", () => {
      | None => "algebraicAdd has"->expect->toBe("failed")
      // This is nondeterministic, we could be in a situation where ci fails but you click rerun and it passes, which is bad.
      // sometimes it works with ~digits=2.
-      | Some(x) => x->expect->toBeSoCloseTo(9.78655777150074, ~digits=1) // (uniformMean +. betaMean)
+      | Some(x) => x->expect->toBeSoCloseTo(9.786831807237022, ~digits=1) // (uniformMean +. betaMean)
      }
    })
    test("beta(alpha=2, beta=5) + uniform(low=9, high=10)", () => {
@ -82,7 +82,7 @@ describe("(Algebraic) addition of distributions", () => {
      | None => "algebraicAdd has"->expect->toBe("failed")
      // This is nondeterministic, we could be in a situation where ci fails but you click rerun and it passes, which is bad.
      // sometimes it works with ~digits=2.
-      | Some(x) => x->expect->toBeSoCloseTo(9.786753454457116, ~digits=1) // (uniformMean +. betaMean)
+      | Some(x) => x->expect->toBeSoCloseTo(9.784290207736126, ~digits=1) // (uniformMean +. betaMean)
      }
    })
  })
@ -162,6 +162,7 @@ describe("(Algebraic) addition of distributions", () => {
      switch received {
      | None => "algebraicAdd has"->expect->toBe("failed")
      // This is nondeterministic, we could be in a situation where ci fails but you click rerun and it passes, which is bad.
+      // sometimes it works with ~digits=4.
      // This value was calculated by a python script
      | Some(x) => x->expect->toBeSoCloseTo(0.979023, ~digits=0)
      }
@ -360,7 +361,7 @@ describe("(Algebraic) addition of distributions", () => {
      | None => "algebraicAdd has"->expect->toBe("failed")
      // This is nondeterministic, we could be in a situation where ci fails but you click rerun and it passes, which is bad.
      // sometimes it works with ~digits=2.
-      | Some(x) => x->expect->toBeSoCloseTo(9.174267267465632, ~digits=0)
+      | Some(x) => x->expect->toBeSoCloseTo(9.190872365862756, ~digits=0)
      }
    })
  })
--- a/packages/squiggle-lang/tests/Distributions/SampleSetDist_test.res
+++ b/packages/squiggle-lang/tests/Distributions/SampleSetDist_test.res
@ -37,4 +37,5 @@ describe("Continuous and discrete splits", () => {
  )
  let toArr2 = discrete2 |> E.FloatFloatMap.toArray
  makeTest("splitMedium at count=500", toArr2 |> Belt.Array.length, 500)
+  // makeTest("foo", [] |> Belt.Array.length, 500)
 })
--- a/packages/squiggle-lang/tests/TS/JS_test.ts
+++ b/packages/squiggle-lang/tests/TS/JS_test.ts
@ -58,14 +58,15 @@ describe("Distribution", () => {
  );

  test("mean", () => {
-    expect(dist.mean().value).toBeCloseTo(8.704375514292865);
+    expect(dist.mean().value).toBeCloseTo(9.5555555);
  });
  test("pdf", () => {
-    expect(dist.pdf(5.0).value).toBeCloseTo(0.052007455285386944, 1);
+    expect(dist.pdf(5.0).value).toBeCloseTo(0.10499097598222966, 1);
  });
  test("cdf", () => {
    expect(dist.cdf(5.0).value).toBeCloseTo(
-      dist1Samples.filter((x) => x <= 5).length / dist1SampleCount
+      dist1Samples.filter((x) => x <= 5).length / dist1SampleCount,
+      1
    );
  });
  test("inv", () => {
@ -77,7 +78,7 @@ describe("Distribution", () => {
    ).toEqual(Ok("Point Set Distribution"));
  });
  test("toSparkline", () => {
-    expect(dist.toSparkline(20).value).toEqual("▁▁▃▅███▆▄▃▂▁▁▂▂▃▂▁▁▁");
+    expect(dist.toSparkline(20).value).toEqual("▁▁▃▇█▇▄▂▂▂▁▁▁▁▁▂▂▁▁▁");
  });
  test("algebraicAdd", () => {
    expect(
@ -91,6 +92,6 @@ describe("Distribution", () => {
      resultMap(dist.pointwiseAdd(dist2), (r: Distribution) =>
        r.toSparkline(20)
      ).value
-    ).toEqual(Ok("▁▂▅██▅▅▅▆▆▇▅▄▃▃▂▂▁▁▁"));
+    ).toEqual(Ok("▁▂██▃▃▃▃▄▅▄▃▃▂▂▂▁▁▁▁"));
  });
 });
--- a/packages/squiggle-lang/tests/TS/SampleSet_test.ts
+++ b/packages/squiggle-lang/tests/TS/SampleSet_test.ts
@ -59,13 +59,7 @@ describe("cumulative density function", () => {
          { sampleCount: n, xyPointLength: 100 }
        );
        let cdfValue = dist.cdf(max).value;
-        let min = Math.min(...xs);
-        let epsilon = 5e-3;
-        if (max - min < epsilon) {
-          expect(cdfValue).toBeLessThan(1 - epsilon);
-        } else {
-          expect(dist.cdf(max).value).toBeGreaterThan(1 - epsilon);
-        }
+        expect(cdfValue).toBeCloseTo(1.0, 2);
      })
    );
  });
--- a/packages/squiggle-lang/src/rescript/Distributions/GenericDist/GenericDist.res
+++ b/packages/squiggle-lang/src/rescript/Distributions/GenericDist/GenericDist.res
@ -46,18 +46,25 @@ let toFloatOperation = (
  ~toPointSetFn: toPointSetFn,
  ~distToFloatOperation: Operation.distToFloatOperation,
 ) => {
-  let symbolicSolution = switch (t: t) {
-  | Symbolic(r) =>
-    switch SymbolicDist.T.operate(distToFloatOperation, r) {
-    | Ok(f) => Some(f)
-    | _ => None
-    }
+  let trySymbolicSolution = switch (t: t) {
+  | Symbolic(r) => SymbolicDist.T.operate(distToFloatOperation, r)->E.R.toOption
  | _ => None
  }

-  switch symbolicSolution {
+  let trySampleSetSolution = switch ((t: t), distToFloatOperation) {
+  | (SampleSet(sampleSet), #Mean) => SampleSetDist.mean(sampleSet)->Some
+  | (SampleSet(sampleSet), #Sample) => SampleSetDist.sample(sampleSet)->Some
+  | (SampleSet(sampleSet), #Inv(r)) => SampleSetDist.percentile(sampleSet, r)->Some
+  | _ => None
+  }
+
+  switch trySymbolicSolution {
  | Some(r) => Ok(r)
-  | None => toPointSetFn(t)->E.R2.fmap(PointSetDist.operate(distToFloatOperation))
+  | None =>
+    switch trySampleSetSolution {
+    | Some(r) => Ok(r)
+    | None => toPointSetFn(t)->E.R2.fmap(PointSetDist.operate(distToFloatOperation))
+    }
  }
 }

--- a/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist.res
+++ b/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist.res
@ -98,3 +98,13 @@ let map2 = (~fn: (float, float) => result<float, Operation.Error.t>, ~t1: t, ~t2
    E.R.toExn("Input of samples should be larger than 5", make(x))
  )
 }
+
+let mean = t => T.get(t)->E.A.Floats.mean
+let geomean = t => T.get(t)->E.A.Floats.geomean
+let mode = t => T.get(t)->E.A.Floats.mode
+let sum = t => T.get(t)->E.A.Floats.sum
+let min = t => T.get(t)->E.A.Floats.min
+let max = t => T.get(t)->E.A.Floats.max
+let stdev = t => T.get(t)->E.A.Floats.stdev
+let variance = t => T.get(t)->E.A.Floats.variance
+let percentile = (t, f) => T.get(t)->E.A.Floats.percentile(f)
--- a/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res
+++ b/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res
@ -133,9 +133,17 @@ let toPointSetDist = (
    ~discrete=Some(discrete),
  )

+  /*
+   I'm surprised that pointSetDist doesn't come out normalized. My guess is that the KDE library
+   we're using is standardizing on something else. If we ever change that library, we should
+   check to see if we still need to normalize here.
+   */
+
+  let normalizedPointSet = pointSetDist->E.O2.fmap(PointSetDist.T.normalize)
+
  let samplesParse: Internals.Types.outputs = {
    continuousParseParams: pdf |> E.O.fmap(snd),
-    pointSetDist: pointSetDist,
+    pointSetDist: normalizedPointSet,
  }

  samplesParse
--- a/packages/squiggle-lang/src/rescript/Utility/E.res
+++ b/packages/squiggle-lang/src/rescript/Utility/E.res
@ -521,14 +521,15 @@ module A = {
      let split = (sortedArray: array<float>) => {
        let continuous = []
        let discrete = FloatFloatMap.empty()
-        Belt.Array.forEachWithIndex(sortedArray, (index, element) => {
-          let maxIndex = (sortedArray |> Array.length) - 1
-          let possiblySimilarElements = switch index {
-          | 0 => [index + 1]
-          | n if n == maxIndex => [index - 1]
-          | _ => [index - 1, index + 1]
-          } |> Belt.Array.map(_, r => sortedArray[r])
-          let hasSimilarElement = Belt.Array.some(possiblySimilarElements, r => r == element)
+        Belt.Array.forEachWithIndex(sortedArray, (_, element) => {
+          // let maxIndex = (sortedArray |> Array.length) - 1
+          // let possiblySimilarElements = switch index {
+          // | 0 => [index + 1]
+          // | n if n == maxIndex => [index - 1]
+          // | _ => [index - 1, index + 1]
+          // } |> Belt.Array.map(_, r => sortedArray[r])
+          // let hasSimilarElement = Belt.Array.some(possiblySimilarElements, r => r == element)
+          let hasSimilarElement = false
          hasSimilarElement
            ? FloatFloatMap.increment(element, discrete)
            : {
@ -544,10 +545,18 @@ module A = {
  }

  module Floats = {
-    let sum = Belt.Array.reduce(_, 0., (i, j) => i +. j)
-    let mean = a => sum(a) /. (Array.length(a) |> float_of_int)
+    let mean = Jstat.mean
+    let geomean = Jstat.geomean
+    let mode = Jstat.mode
+    let variance = Jstat.variance
+    let stdev = Jstat.stdev
+    let sum = Jstat.sum
    let random = Js.Math.random_int

+    // jStat: passing true for the exclusive parameter excludes both endpoints of the range; we pass false, so they are not excluded.
+    // See https://jstat.github.io/all.html
+    let percentile = (a, b) => Jstat.percentile(a, b, false)
+
    // Gives an array with all the differences between values
    // diff([1,5,3,7]) = [4,-2,4]
    let diff = (arr: array<float>): array<float> =>