diff options
author | Raymond Hettinger <rhettinger@users.noreply.github.com> | 2024-02-25 23:46:47 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-25 23:46:47 (GMT) |
commit | 6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68 (patch) | |
tree | 0ac0ed92cdfe54d2f9ff6b0e387c424c8324dcf0 | |
parent | 6a3236fe2e61673cf9f819534afbf14a18678408 (diff) | |
download | cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.zip cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.tar.gz cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.tar.bz2 |
gh-115532: Add kernel density estimation to the statistics module (gh-115863)
-rw-r--r-- | Doc/library/statistics.rst | 89 | ||||
-rw-r--r-- | Doc/whatsnew/3.13.rst | 8 | ||||
-rw-r--r-- | Lib/statistics.py | 168 | ||||
-rw-r--r-- | Lib/test/test_statistics.py | 60 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst | 1 |
5 files changed, 285 insertions, 41 deletions
diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 0417b3f..1785c6b 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -76,6 +76,7 @@ or sample. :func:`fmean` Fast, floating point arithmetic mean, with optional weighting. :func:`geometric_mean` Geometric mean of data. :func:`harmonic_mean` Harmonic mean of data. +:func:`kde` Estimate the probability density distribution of the data. :func:`median` Median (middle value) of data. :func:`median_low` Low median of data. :func:`median_high` High median of data. @@ -259,6 +260,54 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionchanged:: 3.10 Added support for *weights*. + +.. function:: kde(data, h, kernel='normal') + + `Kernel Density Estimation (KDE) + <https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_: + Create a continuous probability density function from discrete samples. + + The basic idea is to smooth the data using `a kernel function + <https://en.wikipedia.org/wiki/Kernel_(statistics)>`_. + to help draw inferences about a population from a sample. + + The degree of smoothing is controlled by the scaling parameter *h* + which is called the bandwidth. Smaller values emphasize local + features while larger values give smoother results. + + The *kernel* determines the relative weights of the sample data + points. Generally, the choice of kernel shape does not matter + as much as the more influential bandwidth smoothing parameter. + + Kernels that give some weight to every sample point include + *normal* or *gauss*, *logistic*, and *sigmoid*. + + Kernels that only give weight to sample points within the bandwidth + include *rectangular* or *uniform*, *triangular*, *parabolic* or + *epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*. + + A :exc:`StatisticsError` will be raised if the *data* sequence is empty. + + `Wikipedia has an example + <https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_ + where we can use :func:`kde` to generate and plot a probability + density function estimated from a small sample: + + .. doctest:: + + >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> f_hat = kde(sample, h=1.5) + >>> xarr = [i/100 for i in range(-750, 1100)] + >>> yarr = [f_hat(x) for x in xarr] + + The points in ``xarr`` and ``yarr`` can be used to make a PDF plot: + + .. image:: kde_example.png + :alt: Scatter plot of the estimated probability density function. + + .. versionadded:: 3.13 + + .. function:: median(data) Return the median (middle value) of numeric data, using the common "mean of @@ -1095,46 +1144,6 @@ The final prediction goes to the largest posterior. This is known as the 'female' -Kernel density estimation -************************* - -It is possible to estimate a continuous probability density function -from a fixed number of discrete samples. - -The basic idea is to smooth the data using `a kernel function such as a -normal distribution, triangular distribution, or uniform distribution -<https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_. -The degree of smoothing is controlled by a scaling parameter, ``h``, -which is called the *bandwidth*. - -.. testcode:: - - def kde_normal(sample, h): - "Create a continuous probability density function from a sample." - # Smooth the sample with a normal distribution kernel scaled by h. - kernel_h = NormalDist(0.0, h).pdf - n = len(sample) - def pdf(x): - return sum(kernel_h(x - x_i) for x_i in sample) / n - return pdf - -`Wikipedia has an example -<https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_ -where we can use the ``kde_normal()`` recipe to generate and plot -a probability density function estimated from a small sample: - -.. doctest:: - - >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] - >>> f_hat = kde_normal(sample, h=1.5) - >>> xarr = [i/100 for i in range(-750, 1100)] - >>> yarr = [f_hat(x) for x in xarr] - -The points in ``xarr`` and ``yarr`` can be used to make a PDF plot: - -.. image:: kde_example.png - :alt: Scatter plot of the estimated probability density function. - .. # This modelines must appear within the last ten lines of the file. kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8; diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index a393a1d..ca5a0e7 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -476,6 +476,14 @@ sqlite3 for filtering database objects to dump. (Contributed by Mariusz Felisiak in :gh:`91602`.) +statistics +---------- + +* Add :func:`statistics.kde` for kernel density estimation. + This makes it possible to estimate a continuous probability density function + from a fixed number of discrete samples. + (Contributed by Raymond Hettinger in :gh:`115863`.) + subprocess ---------- diff --git a/Lib/statistics.py b/Lib/statistics.py index 83aaedb..7924123 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -112,6 +112,7 @@ __all__ = [ 'fmean', 'geometric_mean', 'harmonic_mean', + 'kde', 'linear_regression', 'mean', 'median', @@ -137,7 +138,7 @@ from decimal import Decimal from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod -from math import isfinite, isinf +from math import isfinite, isinf, pi, cos, cosh from functools import reduce from operator import itemgetter from collections import Counter, namedtuple, defaultdict @@ -802,6 +803,171 @@ def multimode(data): return [value for value, count in counts.items() if count == maxcount] +def kde(data, h, kernel='normal'): + """Kernel Density Estimation: Create a continuous probability + density function from discrete samples. + + The basic idea is to smooth the data using a kernel function + to help draw inferences about a population from a sample. + + The degree of smoothing is controlled by the scaling parameter h + which is called the bandwidth. Smaller values emphasize local + features while larger values give smoother results. + + The kernel determines the relative weights of the sample data + points. Generally, the choice of kernel shape does not matter + as much as the more influential bandwidth smoothing parameter. + + Kernels that give some weight to every sample point: + + normal or gauss + logistic + sigmoid + + Kernels that only give weight to sample points within + the bandwidth: + + rectangular or uniform + triangular + parabolic or epanechnikov + quartic or biweight + triweight + cosine + + A StatisticsError will be raised if the data sequence is empty. + + Example + ------- + + Given a sample of six data points, construct a continuous + function that estimates the underlying probability density: + + >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + >>> f_hat = kde(sample, h=1.5) + + Compute the area under the curve: + + >>> sum(f_hat(x) for x in range(-20, 20)) + 1.0 + + Plot the estimated probability density function at + evenly spaced points from -6 to 10: + + >>> for x in range(-6, 11): + ... density = f_hat(x) + ... plot = ' ' * int(density * 400) + 'x' + ... print(f'{x:2}: {density:.3f} {plot}') + ... + -6: 0.002 x + -5: 0.009 x + -4: 0.031 x + -3: 0.070 x + -2: 0.111 x + -1: 0.125 x + 0: 0.110 x + 1: 0.086 x + 2: 0.068 x + 3: 0.059 x + 4: 0.066 x + 5: 0.082 x + 6: 0.082 x + 7: 0.058 x + 8: 0.028 x + 9: 0.009 x + 10: 0.002 x + + References + ---------- + + Kernel density estimation and its application: + https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf + + Kernel functions in common use: + https://en.wikipedia.org/wiki/Kernel_(statistics)#kernel_functions_in_common_use + + Interactive graphical demonstration and exploration: + https://demonstrations.wolfram.com/KernelDensityEstimation/ + + """ + + n = len(data) + if not n: + raise StatisticsError('Empty data sequence') + + if not isinstance(data[0], (int, float)): + raise TypeError('Data sequence must contain ints or floats') + + if h <= 0.0: + raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}') + + match kernel: + + case 'normal' | 'gauss': + c = 1 / sqrt(2 * pi) + K = lambda t: c * exp(-1/2 * t * t) + support = None + + case 'logistic': + # 1.0 / (exp(t) + 2.0 + exp(-t)) + K = lambda t: 1/2 / (1.0 + cosh(t)) + support = None + + case 'sigmoid': + # (2/pi) / (exp(t) + exp(-t)) + c = 1 / pi + K = lambda t: c / cosh(t) + support = None + + case 'rectangular' | 'uniform': + K = lambda t: 1/2 + support = 1.0 + + case 'triangular': + K = lambda t: 1.0 - abs(t) + support = 1.0 + + case 'parabolic' | 'epanechnikov': + K = lambda t: 3/4 * (1.0 - t * t) + support = 1.0 + + case 'quartic' | 'biweight': + K = lambda t: 15/16 * (1.0 - t * t) ** 2 + support = 1.0 + + case 'triweight': + K = lambda t: 35/32 * (1.0 - t * t) ** 3 + support = 1.0 + + case 'cosine': + c1 = pi / 4 + c2 = pi / 2 + K = lambda t: c1 * cos(c2 * t) + support = 1.0 + + case _: + raise StatisticsError(f'Unknown kernel name: {kernel!r}') + + if support is None: + + def pdf(x): + return sum(K((x - x_i) / h) for x_i in data) / (n * h) + + else: + + sample = sorted(data) + bandwidth = h * support + + def pdf(x): + i = bisect_left(sample, x - bandwidth) + j = bisect_right(sample, x + bandwidth) + supported = sample[i : j] + return sum(K((x - x_i) / h) for x_i in supported) / (n * h) + + pdf.__doc__ = f'PDF estimate with {kernel=!r} and {h=!r}' + + return pdf + + # Notes on methods for computing quantiles # ---------------------------------------- # diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index bf2c254..1cf4163 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2353,6 +2353,66 @@ class TestGeometricMean(unittest.TestCase): self.assertAlmostEqual(actual_mean, expected_mean, places=5) +class TestKDE(unittest.TestCase): + + def test_kde(self): + kde = statistics.kde + StatisticsError = statistics.StatisticsError + + kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular', + 'uniform', 'triangular', 'parabolic', 'epanechnikov', + 'quartic', 'biweight', 'triweight', 'cosine'] + + sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2] + + # The approximate integral of a PDF should be close to 1.0 + + def integrate(func, low, high, steps=10_000): + "Numeric approximation of a definite function integral." + dx = (high - low) / steps + midpoints = (low + (i + 1/2) * dx for i in range(steps)) + return sum(map(func, midpoints)) * dx + + for kernel in kernels: + with self.subTest(kernel=kernel): + f_hat = kde(sample, h=1.5, kernel=kernel) + area = integrate(f_hat, -20, 20) + self.assertAlmostEqual(area, 1.0, places=4) + + # Check error cases + + with self.assertRaises(StatisticsError): + kde([], h=1.0) # Empty dataset + with self.assertRaises(TypeError): + kde(['abc', 'def'], 1.5) # Non-numeric data + with self.assertRaises(TypeError): + kde(iter(sample), 1.5) # Data is not a sequence + with self.assertRaises(StatisticsError): + kde(sample, h=0.0) # Zero bandwidth + with self.assertRaises(StatisticsError): + kde(sample, h=0.0) # Negative bandwidth + with self.assertRaises(TypeError): + kde(sample, h='str') # Wrong bandwidth type + with self.assertRaises(StatisticsError): + kde(sample, h=1.0, kernel='bogus') # Invalid kernel + + # Test name and docstring of the generated function + + h = 1.5 + kernel = 'cosine' + f_hat = kde(sample, h, kernel) + self.assertEqual(f_hat.__name__, 'pdf') + self.assertIn(kernel, f_hat.__doc__) + self.assertIn(str(h), f_hat.__doc__) + + # Test closed interval for the support boundaries. + # In particular, 'uniform' should non-zero at the boundaries. + + f_hat = kde([0], 1.0, 'uniform') + self.assertEqual(f_hat(-1.0), 1/2) + self.assertEqual(f_hat(1.0), 1/2) + + class TestQuantiles(unittest.TestCase): def test_specific_cases(self): diff --git a/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst b/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst new file mode 100644 index 0000000..fb36c0b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst @@ -0,0 +1 @@ +Add kernel density estimation to the statistics module. |