From cb4e1199fc55860bb6942b633104e5010953575e Mon Sep 17 00:00:00 2001 From: Vyacheslav Matyukhin Date: Tue, 20 Sep 2022 00:56:46 +0400 Subject: [PATCH] reimplement splitContinuousAndDiscreteForMinWeight --- .../E/splitContinuousAndDiscrete_test.res | 6 +- .../scripts/bench-sampleset-to-pointset.mjs | 22 +++++ .../SampleSetDist_ToPointSet.res | 35 ++++---- .../src/rescript/Utility/E/E_A.res | 80 +++++++++---------- .../rescript/Utility/E/E_FloatFloatMap.res | 8 ++ 5 files changed, 88 insertions(+), 63 deletions(-) create mode 100755 packages/squiggle-lang/scripts/bench-sampleset-to-pointset.mjs diff --git a/packages/squiggle-lang/__tests__/E/splitContinuousAndDiscrete_test.res b/packages/squiggle-lang/__tests__/E/splitContinuousAndDiscrete_test.res index a52227ee..30f760dc 100644 --- a/packages/squiggle-lang/__tests__/E/splitContinuousAndDiscrete_test.res +++ b/packages/squiggle-lang/__tests__/E/splitContinuousAndDiscrete_test.res @@ -9,19 +9,19 @@ let prepareInputs = (ar, minWeight) => describe("Continuous and discrete splits", () => { makeTest( "is empty, with no common elements", - prepareInputs([1.432, 1.33455, 2.0], 2), + prepareInputs([1.33455, 1.432, 2.0], 2), ([1.33455, 1.432, 2.0], []), ) makeTest( "only stores 3.5 as discrete when minWeight is 3", - prepareInputs([1.432, 1.33455, 2.0, 2.0, 3.5, 3.5, 3.5], 3), + prepareInputs([1.33455, 1.432, 2.0, 2.0, 3.5, 3.5, 3.5], 3), ([1.33455, 1.432, 2.0, 2.0], [(3.5, 3.0)]), ) makeTest( "doesn't store 3.5 as discrete when minWeight is 5", - prepareInputs([1.432, 1.33455, 2.0, 2.0, 3.5, 3.5, 3.5], 5), + prepareInputs([1.33455, 1.432, 2.0, 2.0, 3.5, 3.5, 3.5], 5), ([1.33455, 1.432, 2.0, 2.0, 3.5, 3.5, 3.5], []), ) diff --git a/packages/squiggle-lang/scripts/bench-sampleset-to-pointset.mjs b/packages/squiggle-lang/scripts/bench-sampleset-to-pointset.mjs new file mode 100755 index 00000000..a10b17b0 --- /dev/null +++ b/packages/squiggle-lang/scripts/bench-sampleset-to-pointset.mjs @@ -0,0 +1,22 @@ +#!/usr/bin/env node 
+import { SqProject } from "@quri/squiggle-lang"; +import { measure } from "./lib.mjs"; + +const maxP = 7; + +for (let p = 0; p <= maxP; p++) { + const size = Math.pow(10, p); + const project = SqProject.create(); + project.setSource( + "main", + ` + List.upTo(1, ${size}) -> map({|x| + normal(x,2) -> SampleSet.fromDist -> PointSet.fromDist + })->List.last + ` + ); + const time = measure(() => { + project.run("main"); + }); + console.log(`1e${p}`, "\t", time); +} diff --git a/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res b/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res index 2836ca78..6157e83b 100644 --- a/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res +++ b/packages/squiggle-lang/src/rescript/Distributions/SampleSetDist/SampleSetDist_ToPointSet.res @@ -33,19 +33,19 @@ module Internals = { module KDE = { let normalSampling = (samples, outputXYPoints, kernelWidth) => - samples |> JS.samplesToContinuousPdf(_, outputXYPoints, kernelWidth) |> JS.jsToDist + samples -> JS.samplesToContinuousPdf(outputXYPoints, kernelWidth) -> JS.jsToDist } module T = { type t = array let xWidthToUnitWidth = (samples, outputXYPoints, xWidth) => { - let xyPointRange = E.A.Sorted.range(samples) |> E.O.default(0.0) + let xyPointRange = E.A.Sorted.range(samples) -> E.O2.default(0.0) let xyPointWidth = xyPointRange /. float_of_int(outputXYPoints) xWidth /. 
xyPointWidth } - let formatUnitWidth = w => Jstat.max([w, 1.0]) |> int_of_float + let formatUnitWidth = w => Jstat.max([w, 1.0]) -> int_of_float let suggestedUnitWidth = (samples, outputXYPoints) => { let suggestedXWidth = SampleSetDist_Bandwidth.nrd0(samples) @@ -62,23 +62,24 @@ let toPointSetDist = ( ~samplingInputs: SamplingInputs.samplingInputs, (), ): Internals.Types.outputs => { - let samples = Js.Array2.copy(samples) - Array.fast_sort(compare, samples) + let samples = samples->Js.Array2.copy->Js.Array2.sortInPlaceWith(compare) + let minDiscreteToKeep = MagicNumbers.ToPointSet.minDiscreteToKeep(samples) let (continuousPart, discretePart) = E.A.Floats.Sorted.splitContinuousAndDiscreteForMinWeight( samples, ~minDiscreteWeight=minDiscreteToKeep, ) - let length = samples |> E.A.length |> float_of_int + + let length = samples->E.A.length->float_of_int let discrete: PointSetTypes.discreteShape = discretePart - |> E.FloatFloatMap.fmap(r => r /. length) - |> E.FloatFloatMap.toArray - |> XYShape.T.fromZippedArray - |> Discrete.make + ->E.FloatFloatMap.fmap(r => r /. length, _) + ->E.FloatFloatMap.toArray + ->XYShape.T.fromZippedArray + ->Discrete.make let pdf = - continuousPart |> E.A.length > 5 + continuousPart->E.A.length > 5 ? { let _suggestedXWidth = SampleSetDist_Bandwidth.nrd0(continuousPart) // todo: This does some recalculating from the last step. 
@@ -86,7 +87,7 @@ let toPointSetDist = ( continuousPart, samplingInputs.outputXYPoints, ) - let usedWidth = samplingInputs.kernelWidth |> E.O.default(_suggestedXWidth) + let usedWidth = samplingInputs.kernelWidth -> E.O2.default(_suggestedXWidth) let usedUnitWidth = Internals.T.xWidthToUnitWidth( samples, samplingInputs.outputXYPoints, @@ -101,18 +102,18 @@ let toPointSetDist = ( bandwidthUnitImplemented: usedUnitWidth, } continuousPart - |> Internals.T.kde( + ->Internals.T.kde( ~samples=_, ~outputXYPoints=samplingInputs.outputXYPoints, Internals.T.formatUnitWidth(usedUnitWidth), ) - |> Continuous.make - |> (r => Some((r, samplingStats))) + ->Continuous.make + ->(r => Some((r, samplingStats))) } : None let pointSetDist = MixedShapeBuilder.buildSimple( - ~continuous=pdf |> E.O.fmap(fst), + ~continuous=pdf->E.O2.fmap(fst), ~discrete=Some(discrete), ) @@ -125,7 +126,7 @@ let toPointSetDist = ( let normalizedPointSet = pointSetDist->E.O2.fmap(PointSetDist.T.normalize) let samplesParse: Internals.Types.outputs = { - continuousParseParams: pdf |> E.O.fmap(snd), + continuousParseParams: pdf -> E.O2.fmap(snd), pointSetDist: normalizedPointSet, } diff --git a/packages/squiggle-lang/src/rescript/Utility/E/E_A.res b/packages/squiggle-lang/src/rescript/Utility/E/E_A.res index 032d2305..54a26933 100644 --- a/packages/squiggle-lang/src/rescript/Utility/E/E_A.res +++ b/packages/squiggle-lang/src/rescript/Utility/E/E_A.res @@ -305,55 +305,49 @@ module Floats = { /* This function goes through a sorted array and divides it into two different clusters: continuous samples and discrete samples. The discrete samples are stored in a mutable map. - Samples are thought to be discrete if they have any duplicates. 
- */ - let _splitContinuousAndDiscreteForDuplicates = (sortedArray: array) => { - let continuous: array = [] - let discrete = FloatFloatMap.empty() - Belt.Array.forEachWithIndex(sortedArray, (index, element) => { - let maxIndex = (sortedArray |> Array.length) - 1 - let possiblySimilarElements = switch index { - | 0 => [index + 1] - | n if n == maxIndex => [index - 1] - | _ => [index - 1, index + 1] - } |> Belt.Array.map(_, r => sortedArray[r]) - let hasSimilarElement = Belt.Array.some(possiblySimilarElements, r => r == element) - hasSimilarElement - ? FloatFloatMap.increment(element, discrete) - : { - let _ = Js.Array.push(element, continuous) - } + Samples are thought to be discrete if their value occurs at least `minDiscreteWeight` times. - () - }) - - (continuous, discrete) - } - - /* - This function works very similarly to splitContinuousAndDiscreteForDuplicates. The one major difference - is that you can specify a minDiscreteWeight. If the min discreet weight is 4, that would mean that - at least four elements needed from a specific value for that to be kept as discrete. This is important - because in some cases, we can expect that some common elements will be generated by regular operations. - The final continous array will be sorted. + If the min discrete weight is 4, that would mean that at least four elements are needed from a specific + value for that to be kept as discrete. This is important because in some cases, we can expect that + some common elements will be generated by regular operations. The final continuous array will be sorted. 
*/ let splitContinuousAndDiscreteForMinWeight = ( sortedArray: array, ~minDiscreteWeight: int, ) => { - let (continuous, discrete) = _splitContinuousAndDiscreteForDuplicates(sortedArray) - let keepFn = v => Belt.Float.toInt(v) >= minDiscreteWeight - let (discreteToKeep, discreteToIntegrate) = FloatFloatMap.partition( - ((_, v)) => keepFn(v), - discrete, - ) - let newContinousSamples = - discreteToIntegrate->FloatFloatMap.toArray - |> fmap(((k, v)) => Belt.Array.makeBy(Belt.Float.toInt(v), _ => k)) - |> Belt.Array.concatMany - let newContinuous = concat(continuous, newContinousSamples) - newContinuous |> Array.fast_sort(floatCompare) - (newContinuous, discreteToKeep) + let continuous: array = [] + let discrete = FloatFloatMap.empty() + + let flush = (cnt: int, value: float): unit => { + if cnt >= minDiscreteWeight { + FloatFloatMap.add(value, cnt->Belt.Int.toFloat, discrete) + } else { + for _ in 1 to cnt { + let _ = continuous->Js.Array2.push(value) + } + } + } + + if sortedArray->Js.Array2.length != 0 { + let (finalCnt, finalValue) = sortedArray->Belt.Array.reduce( + // initial prev value doesn't matter: the initial count is 0, so flushing it is a no-op as long as minDiscreteWeight >= 1; if prev equals the first element, counting simply continues from it + (0, 0.), + ((cnt, prev), element) => { + if element == prev { + (cnt + 1, prev) + } else { + // new value, process previous ones + flush(cnt, prev) + (1, element) + } + } + ) + + // flush final values + flush(finalCnt, finalValue) + } + + (continuous, discrete) } } } diff --git a/packages/squiggle-lang/src/rescript/Utility/E/E_FloatFloatMap.res b/packages/squiggle-lang/src/rescript/Utility/E/E_FloatFloatMap.res index aa89aac7..53849d19 100644 --- a/packages/squiggle-lang/src/rescript/Utility/E/E_FloatFloatMap.res +++ b/packages/squiggle-lang/src/rescript/Utility/E/E_FloatFloatMap.res @@ -16,6 +16,14 @@ let increment = (el, t: t) => } ) +let add = (el, amount: float, t: t) => + Belt.MutableMap.update(t, el, x => + switch x { + | Some(n) => Some(n +. 
amount) + | None => Some(amount) + } + ) + let get = (el, t: t) => Belt.MutableMap.get(t, el) let fmap = (fn, t: t) => Belt.MutableMap.map(t, fn) let partition = (fn, t: t) => {