Merge pull request #409 from quantified-uncertainty/splidcontinuousDiscrete-refactor

Refactor of splitContinuousAndDiscrete to allow for customization
This commit is contained in:
Ozzie Gooen 2022-04-28 11:41:06 -04:00 committed by GitHub
commit e1551cb1d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 112 additions and 76 deletions

View File

@ -1,41 +0,0 @@
open Jest
open TestHelpers
describe("Continuous and discrete splits", () => {
makeTest(
"splits (1)",
SampleSetDist_ToPointSet.Internals.T.splitContinuousAndDiscrete([1.432, 1.33455, 2.0]),
([1.432, 1.33455, 2.0], E.FloatFloatMap.empty()),
)
makeTest(
"splits (2)",
SampleSetDist_ToPointSet.Internals.T.splitContinuousAndDiscrete([
1.432,
1.33455,
2.0,
2.0,
2.0,
2.0,
]) |> (((c, disc)) => (c, disc |> E.FloatFloatMap.toArray)),
([1.432, 1.33455], [(2.0, 4.0)]),
)
let makeDuplicatedArray = count => {
let arr = Belt.Array.range(1, count) |> E.A.fmap(float_of_int)
let sorted = arr |> Belt.SortArray.stableSortBy(_, compare)
E.A.concatMany([sorted, sorted, sorted, sorted]) |> Belt.SortArray.stableSortBy(_, compare)
}
let (_, discrete1) = SampleSetDist_ToPointSet.Internals.T.splitContinuousAndDiscrete(
makeDuplicatedArray(10),
)
let toArr1 = discrete1 |> E.FloatFloatMap.toArray
makeTest("splitMedium at count=10", toArr1 |> Belt.Array.length, 10)
let (_c, discrete2) = SampleSetDist_ToPointSet.Internals.T.splitContinuousAndDiscrete(
makeDuplicatedArray(500),
)
let toArr2 = discrete2 |> E.FloatFloatMap.toArray
makeTest("splitMedium at count=500", toArr2 |> Belt.Array.length, 500)
// makeTest("foo", [] |> Belt.Array.length, 500)
})

View File

@ -0,0 +1,48 @@
open Jest
open TestHelpers
let prepareInputs = (ar, minWeight) =>
E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight(ar, ~minDiscreteWeight=minWeight) |> (
((c, disc)) => (c, disc |> E.FloatFloatMap.toArray)
)
describe("Continuous and discrete splits", () => {
makeTest(
"is empty, with no common elements",
prepareInputs([1.432, 1.33455, 2.0], 2),
([1.33455, 1.432, 2.0], []),
)
makeTest(
"only stores 3.5 as discrete when minWeight is 3",
prepareInputs([1.432, 1.33455, 2.0, 2.0, 3.5, 3.5, 3.5], 3),
([1.33455, 1.432, 2.0, 2.0], [(3.5, 3.0)]),
)
makeTest(
"doesn't store 3.5 as discrete when minWeight is 5",
prepareInputs([1.432, 1.33455, 2.0, 2.0, 3.5, 3.5, 3.5], 5),
([1.33455, 1.432, 2.0, 2.0, 3.5, 3.5, 3.5], []),
)
let makeDuplicatedArray = count => {
let arr = Belt.Array.range(1, count) |> E.A.fmap(float_of_int)
let sorted = arr |> Belt.SortArray.stableSortBy(_, compare)
E.A.concatMany([sorted, sorted, sorted, sorted]) |> Belt.SortArray.stableSortBy(_, compare)
}
let (_, discrete1) = E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight(
makeDuplicatedArray(10),
~minDiscreteWeight=2,
)
let toArr1 = discrete1 |> E.FloatFloatMap.toArray
makeTest("splitMedium at count=10", toArr1 |> Belt.Array.length, 10)
let (_c, discrete2) = E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight(
makeDuplicatedArray(500),
~minDiscreteWeight=2,
)
let toArr2 = discrete2 |> E.FloatFloatMap.toArray
makeTest("splitMedium at count=500", toArr2 |> Belt.Array.length, 500)
// makeTest("foo", [] |> Belt.Array.length, 500)
})

View File

@ -39,28 +39,6 @@ module Internals = {
module T = { module T = {
type t = array<float> type t = array<float>
let splitContinuousAndDiscrete = (sortedArray: t) => {
let continuous = []
let discrete = E.FloatFloatMap.empty()
Belt.Array.forEachWithIndex(sortedArray, (index, element) => {
let maxIndex = (sortedArray |> Array.length) - 1
let possiblySimilarElements = switch index {
| 0 => [index + 1]
| n if n == maxIndex => [index - 1]
| _ => [index - 1, index + 1]
} |> Belt.Array.map(_, r => sortedArray[r])
let hasSimilarElement = Belt.Array.some(possiblySimilarElements, r => r == element)
hasSimilarElement
? E.FloatFloatMap.increment(element, discrete)
: {
let _ = Js.Array.push(element, continuous)
}
()
})
(continuous, discrete)
}
let xWidthToUnitWidth = (samples, outputXYPoints, xWidth) => { let xWidthToUnitWidth = (samples, outputXYPoints, xWidth) => {
let xyPointRange = E.A.Sorted.range(samples) |> E.O.default(0.0) let xyPointRange = E.A.Sorted.range(samples) |> E.O.default(0.0)
let xyPointWidth = xyPointRange /. float_of_int(outputXYPoints) let xyPointWidth = xyPointRange /. float_of_int(outputXYPoints)
@ -85,7 +63,11 @@ let toPointSetDist = (
(), (),
): Internals.Types.outputs => { ): Internals.Types.outputs => {
Array.fast_sort(compare, samples) Array.fast_sort(compare, samples)
let (continuousPart, discretePart) = E.A.Sorted.Floats.split(samples) let minDiscreteToKeep = MagicNumbers.ToPointSet.minDiscreteToKeep(samples)
let (continuousPart, discretePart) = E.A.Sorted.Floats.splitContinuousAndDiscreteForMinWeight(
samples,
~minDiscreteWeight=minDiscreteToKeep,
)
let length = samples |> E.A.length |> float_of_int let length = samples |> E.A.length |> float_of_int
let discrete: PointSetTypes.discreteShape = let discrete: PointSetTypes.discreteShape =
discretePart discretePart

View File

@ -22,3 +22,16 @@ module OpCost = {
let wildcardCost = 1000 let wildcardCost = 1000
let monteCarloCost = Environment.defaultSampleCount let monteCarloCost = Environment.defaultSampleCount
} }
module ToPointSet = {
/*
This function chooses the minimum amount of duplicate samples that need
to exist in order for this to be considered discrete. The tricky thing
is that there are some operations that create duplicate continuous samples,
so we can't guarantee that these only will occur because the fundamental
structure is meant to be discrete. I chose this heuristic because I think
it would strike a reasonable trade-off, but Im really unsure whats
best right now.
*/
let minDiscreteToKeep = samples => max(20, E.A.length(samples) / 50)
}

View File

@ -8,7 +8,7 @@ module FloatFloatMap = {
type t = Belt.MutableMap.t<Id.t, float, Id.identity> type t = Belt.MutableMap.t<Id.t, float, Id.identity>
let fromArray = (ar: array<(float, float)>) => Belt.MutableMap.fromArray(ar, ~id=module(Id)) let fromArray = (ar: array<(float, float)>) => Belt.MutableMap.fromArray(ar, ~id=module(Id))
let toArray = (t: t) => Belt.MutableMap.toArray(t) let toArray = (t: t): array<(float, float)> => Belt.MutableMap.toArray(t)
let empty = () => Belt.MutableMap.make(~id=module(Id)) let empty = () => Belt.MutableMap.make(~id=module(Id))
let increment = (el, t: t) => let increment = (el, t: t) =>
Belt.MutableMap.update(t, el, x => Belt.MutableMap.update(t, el, x =>
@ -20,6 +20,10 @@ module FloatFloatMap = {
let get = (el, t: t) => Belt.MutableMap.get(t, el) let get = (el, t: t) => Belt.MutableMap.get(t, el)
let fmap = (fn, t: t) => Belt.MutableMap.map(t, fn) let fmap = (fn, t: t) => Belt.MutableMap.map(t, fn)
let partition = (fn, t: t) => {
let (match, noMatch) = Belt.Array.partition(toArray(t), fn)
(fromArray(match), fromArray(noMatch))
}
} }
module Int = { module Int = {
@ -518,18 +522,22 @@ module A = {
let makeIncrementalDown = (a, b) => let makeIncrementalDown = (a, b) =>
Array.make(a - b + 1, a) |> Array.mapi((i, c) => c - i) |> Belt.Array.map(_, float_of_int) Array.make(a - b + 1, a) |> Array.mapi((i, c) => c - i) |> Belt.Array.map(_, float_of_int)
let split = (sortedArray: array<float>) => { /*
let continuous = [] This function goes through a sorted array and divides it into two different clusters:
continuous samples and discrete samples. The discrete samples are stored in a mutable map.
Samples are thought to be discrete if they have any duplicates.
*/
let _splitContinuousAndDiscreteForDuplicates = (sortedArray: array<float>) => {
let continuous: array<float> = []
let discrete = FloatFloatMap.empty() let discrete = FloatFloatMap.empty()
Belt.Array.forEachWithIndex(sortedArray, (_, element) => { Belt.Array.forEachWithIndex(sortedArray, (index, element) => {
// let maxIndex = (sortedArray |> Array.length) - 1 let maxIndex = (sortedArray |> Array.length) - 1
// let possiblySimilarElements = switch index { let possiblySimilarElements = switch index {
// | 0 => [index + 1] | 0 => [index + 1]
// | n if n == maxIndex => [index - 1] | n if n == maxIndex => [index - 1]
// | _ => [index - 1, index + 1] | _ => [index - 1, index + 1]
// } |> Belt.Array.map(_, r => sortedArray[r]) } |> Belt.Array.map(_, r => sortedArray[r])
// let hasSimilarElement = Belt.Array.some(possiblySimilarElements, r => r == element) let hasSimilarElement = Belt.Array.some(possiblySimilarElements, r => r == element)
let hasSimilarElement = false
hasSimilarElement hasSimilarElement
? FloatFloatMap.increment(element, discrete) ? FloatFloatMap.increment(element, discrete)
: { : {
@ -541,6 +549,32 @@ module A = {
(continuous, discrete) (continuous, discrete)
} }
/*
This function works very similarly to splitContinuousAndDiscreteForDuplicates. The one major difference
is that you can specify a minDiscreteWeight. If the min discreet weight is 4, that would mean that
at least four elements needed from a specific value for that to be kept as discrete. This is important
because in some cases, we can expect that some common elements will be generated by regular operations.
The final continous array will be sorted.
*/
let splitContinuousAndDiscreteForMinWeight = (
sortedArray: array<float>,
~minDiscreteWeight: int,
) => {
let (continuous, discrete) = _splitContinuousAndDiscreteForDuplicates(sortedArray)
let keepFn = v => Belt.Float.toInt(v) >= minDiscreteWeight
let (discreteToKeep, discreteToIntegrate) = FloatFloatMap.partition(
((_, v)) => keepFn(v),
discrete,
)
let newContinousSamples =
discreteToIntegrate->FloatFloatMap.toArray
|> fmap(((k, v)) => Belt.Array.makeBy(Belt.Float.toInt(v), _ => k))
|> Belt.Array.concatMany
let newContinuous = concat(continuous, newContinousSamples)
newContinuous |> Array.fast_sort(floatCompare)
(newContinuous, discreteToKeep)
}
} }
} }