diff --git a/packages/squiggle-lang/__tests__/Distributions/SampleSetDist_test.res b/packages/squiggle-lang/__tests__/E/splitContinuousAndDiscrete_test.res similarity index 90% rename from packages/squiggle-lang/__tests__/Distributions/SampleSetDist_test.res rename to packages/squiggle-lang/__tests__/E/splitContinuousAndDiscrete_test.res index 825c0095..449787dd 100644 --- a/packages/squiggle-lang/__tests__/Distributions/SampleSetDist_test.res +++ b/packages/squiggle-lang/__tests__/E/splitContinuousAndDiscrete_test.res @@ -2,7 +2,7 @@ open Jest open TestHelpers let prepareInputs = (ar, minWeight) => - E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight(ar, minWeight) |> ( + E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight(ar, ~minDiscreteWeight=minWeight) |> ( ((c, disc)) => (c, disc |> E.FloatFloatMap.toArray) ) @@ -33,14 +33,14 @@ describe("Continuous and discrete splits", () => { let (_, discrete1) = E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight( makeDuplicatedArray(10), - 2, + ~minDiscreteWeight=2, ) let toArr1 = discrete1 |> E.FloatFloatMap.toArray makeTest("splitMedium at count=10", toArr1 |> Belt.Array.length, 10) let (_c, discrete2) = E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight( makeDuplicatedArray(500), - 2, + ~minDiscreteWeight=2, ) let toArr2 = discrete2 |> E.FloatFloatMap.toArray makeTest("splitMedium at count=500", toArr2 |> Belt.Array.length, 500) diff --git a/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res b/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res index 10874f96..142f10b2 100644 --- a/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res +++ b/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res @@ -63,8 +63,11 @@ let toPointSetDist = ( (), ): Internals.Types.outputs => { Array.fast_sort(compare, samples) - let minDiscreteToKeep = max(2, E.A.length(samples) / 10); - let (continuousPart, discretePart) = E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight(samples, minDiscreteToKeep) + let minDiscreteToKeep = MagicNumbers.ToPointSet.minDiscreteToKeep(samples) + let (continuousPart, discretePart) = E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight( + samples, + ~minDiscreteWeight=minDiscreteToKeep, + ) let length = samples |> E.A.length |> float_of_int let discrete: PointSetTypes.discreteShape = discretePart diff --git a/packages/squiggle-lang/src/rescript/MagicNumbers.res b/packages/squiggle-lang/src/rescript/MagicNumbers.res index 291f05d6..124a44f4 100644 --- a/packages/squiggle-lang/src/rescript/MagicNumbers.res +++ b/packages/squiggle-lang/src/rescript/MagicNumbers.res @@ -22,3 +22,16 @@ module OpCost = { let wildcardCost = 1000 let monteCarloCost = Environment.defaultSampleCount } + +module ToPointSet = { + /* + This function chooses the minimum amount of duplicate samples that need + to exist in order for this to be considered discrete. The tricky thing + is that there are some operations that create duplicate continuous samples, + so we can't guarantee that these only will occur because the fundamental + structure is meant to be discrete. I chose this heuristic because I think + it would strike a reasonable trade-off, but I’m really unsure what’s + best right now. + */ + let minDiscreteToKeep = samples => max(20, E.A.length(samples) / 50) +} diff --git a/packages/squiggle-lang/src/rescript/Utility/E.res b/packages/squiggle-lang/src/rescript/Utility/E.res index b0d9ef19..1f218194 100644 --- a/packages/squiggle-lang/src/rescript/Utility/E.res +++ b/packages/squiggle-lang/src/rescript/Utility/E.res @@ -522,7 +522,12 @@ module A = { let makeIncrementalDown = (a, b) => Array.make(a - b + 1, a) |> Array.mapi((i, c) => c - i) |> Belt.Array.map(_, float_of_int) - let splitContinuousAndDiscreteForDuplicates = (sortedArray: array) => { + /* + This function goes through a sorted array and divides it into two different clusters: + continuous samples and discrete samples. The discrete samples are stored in a mutable map. + Samples are thought to be discrete if they have any duplicates. + */ + let _splitContinuousAndDiscreteForDuplicates = (sortedArray: array) => { let continuous: array = [] let discrete = FloatFloatMap.empty() Belt.Array.forEachWithIndex(sortedArray, (index, element) => { @@ -545,11 +550,18 @@ module A = { (continuous, discrete) } + /* + This function works very similarly to splitContinuousAndDiscreteForDuplicates. The one major difference + is that you can specify a minDiscreteWeight. If the min discreet weight is 4, that would mean that + at least four elements needed from a specific value for that to be kept as discrete. This is important + because in some cases, we can expect that some common elements will be generated by regular operations. + The final continous array will be sorted. + */ let splitContinuousAndDiscreteForMinWeight = ( sortedArray: array, - minDiscreteWeight: int, + ~minDiscreteWeight: int, ) => { - let (continuous, discrete) = splitContinuousAndDiscreteForDuplicates(sortedArray) + let (continuous, discrete) = _splitContinuousAndDiscreteForDuplicates(sortedArray) let keepFn = v => Belt.Float.toInt(v) >= minDiscreteWeight let (discreteToKeep, discreteToIntegrate) = FloatFloatMap.partition( ((_, v)) => keepFn(v), @@ -559,7 +571,6 @@ module A = { discreteToIntegrate->FloatFloatMap.toArray |> fmap(((k, v)) => Belt.Array.makeBy(Belt.Float.toInt(v), _ => k)) |> Belt.Array.concatMany - let newContinuous = concat(continuous, newContinousSamples) newContinuous |> Array.fast_sort(floatCompare) (newContinuous, discreteToKeep)