From 92397d5ead38dde4154e70d00f24973bcf2a925a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 27 Mar 2024 09:04:32 -0500 Subject: Add statistics recipe for sampling from an estimated probability density distribution (#117221) --- Doc/library/statistics.rst | 58 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index fc7e0c1..197c123 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -1148,6 +1148,64 @@ The final prediction goes to the largest posterior. This is known as the 'female' +Sampling from kernel density estimation +*************************************** + +The :func:`kde()` function creates a continuous probability density +function from discrete samples. Some applications need a way to make +random selections from that distribution. + +The technique is to pick a sample from a bandwidth scaled kernel +function and recenter the result around a randomly chosen point from +the input data. This can be done with any kernel that has a known or +accurately approximated inverse cumulative distribution function. + +.. testcode:: + + from random import choice, random, seed + from math import sqrt, log, pi, tan, asin + from statistics import NormalDist + + kernel_invcdfs = { + 'normal': NormalDist().inv_cdf, + 'logistic': lambda p: log(p / (1 - p)), + 'sigmoid': lambda p: log(tan(p * pi/2)), + 'rectangular': lambda p: 2*p - 1, + 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), + 'cosine': lambda p: 2*asin(2*p - 1)/pi, + } + + def kde_random(data, h, kernel='normal'): + 'Return a function that samples from kde() smoothed data.' + kernel_invcdf = kernel_invcdfs[kernel] + def rand(): + return h * kernel_invcdf(random()) + choice(data) + return rand + +For example: + +.. doctest:: + + >>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> rand = kde_random(discrete_samples, h=1.5) + >>> seed(8675309) + >>> selections = [rand() for i in range(10)] + >>> [round(x, 1) for x in selections] + [4.7, 7.4, 1.2, 7.8, 6.9, -1.3, 5.8, 0.2, -1.4, 5.7] + +.. testcode:: + :hide: + + from statistics import kde + from math import isclose + + # Verify that cdf / invcdf will round trip + xarr = [i/100 for i in range(-100, 101)] + for kernel, invcdf in kernel_invcdfs.items(): + cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True) + for x in xarr: + assert isclose(invcdf(cdf(x)), x, abs_tol=1E-9) + .. # This modelines must appear within the last ten lines of the file. kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8; -- cgit v0.12