summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaymond Hettinger <rhettinger@users.noreply.github.com>2024-02-25 23:46:47 (GMT)
committerGitHub <noreply@github.com>2024-02-25 23:46:47 (GMT)
commit6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68 (patch)
tree0ac0ed92cdfe54d2f9ff6b0e387c424c8324dcf0
parent6a3236fe2e61673cf9f819534afbf14a18678408 (diff)
downloadcpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.zip
cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.tar.gz
cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.tar.bz2
gh-115532: Add kernel density estimation to the statistics module (gh-115863)
-rw-r--r--Doc/library/statistics.rst89
-rw-r--r--Doc/whatsnew/3.13.rst8
-rw-r--r--Lib/statistics.py168
-rw-r--r--Lib/test/test_statistics.py60
-rw-r--r--Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst1
5 files changed, 285 insertions, 41 deletions
diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index 0417b3f..1785c6b 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -76,6 +76,7 @@ or sample.
:func:`fmean` Fast, floating point arithmetic mean, with optional weighting.
:func:`geometric_mean` Geometric mean of data.
:func:`harmonic_mean` Harmonic mean of data.
+:func:`kde` Estimate the probability density distribution of the data.
:func:`median` Median (middle value) of data.
:func:`median_low` Low median of data.
:func:`median_high` High median of data.
@@ -259,6 +260,54 @@ However, for reading convenience, most of the examples show sorted sequences.
.. versionchanged:: 3.10
Added support for *weights*.
+
+.. function:: kde(data, h, kernel='normal')
+
+ `Kernel Density Estimation (KDE)
+ <https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_:
+ Create a continuous probability density function from discrete samples.
+
+ The basic idea is to smooth the data using `a kernel function
+ <https://en.wikipedia.org/wiki/Kernel_(statistics)>`_
+ to help draw inferences about a population from a sample.
+
+ The degree of smoothing is controlled by the scaling parameter *h*
+ which is called the bandwidth. Smaller values emphasize local
+ features while larger values give smoother results.
+
+ The *kernel* determines the relative weights of the sample data
+ points. Generally, the choice of kernel shape does not matter
+ as much as the more influential bandwidth smoothing parameter.
+
+ Kernels that give some weight to every sample point include
+ *normal* or *gauss*, *logistic*, and *sigmoid*.
+
+ Kernels that only give weight to sample points within the bandwidth
+ include *rectangular* or *uniform*, *triangular*, *parabolic* or
+ *epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*.
+
+ A :exc:`StatisticsError` will be raised if the *data* sequence is empty.
+
+ `Wikipedia has an example
+ <https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
+ where we can use :func:`kde` to generate and plot a probability
+ density function estimated from a small sample:
+
+ .. doctest::
+
+ >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+ >>> f_hat = kde(sample, h=1.5)
+ >>> xarr = [i/100 for i in range(-750, 1100)]
+ >>> yarr = [f_hat(x) for x in xarr]
+
+ The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:
+
+ .. image:: kde_example.png
+ :alt: Scatter plot of the estimated probability density function.
+
+ .. versionadded:: 3.13
+
+
.. function:: median(data)
Return the median (middle value) of numeric data, using the common "mean of
@@ -1095,46 +1144,6 @@ The final prediction goes to the largest posterior. This is known as the
'female'
-Kernel density estimation
-*************************
-
-It is possible to estimate a continuous probability density function
-from a fixed number of discrete samples.
-
-The basic idea is to smooth the data using `a kernel function such as a
-normal distribution, triangular distribution, or uniform distribution
-<https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use>`_.
-The degree of smoothing is controlled by a scaling parameter, ``h``,
-which is called the *bandwidth*.
-
-.. testcode::
-
- def kde_normal(sample, h):
- "Create a continuous probability density function from a sample."
- # Smooth the sample with a normal distribution kernel scaled by h.
- kernel_h = NormalDist(0.0, h).pdf
- n = len(sample)
- def pdf(x):
- return sum(kernel_h(x - x_i) for x_i in sample) / n
- return pdf
-
-`Wikipedia has an example
-<https://en.wikipedia.org/wiki/Kernel_density_estimation#Example>`_
-where we can use the ``kde_normal()`` recipe to generate and plot
-a probability density function estimated from a small sample:
-
-.. doctest::
-
- >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
- >>> f_hat = kde_normal(sample, h=1.5)
- >>> xarr = [i/100 for i in range(-750, 1100)]
- >>> yarr = [f_hat(x) for x in xarr]
-
-The points in ``xarr`` and ``yarr`` can be used to make a PDF plot:
-
-.. image:: kde_example.png
- :alt: Scatter plot of the estimated probability density function.
-
..
# This modelines must appear within the last ten lines of the file.
kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8;
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
index a393a1d..ca5a0e7 100644
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -476,6 +476,14 @@ sqlite3
for filtering database objects to dump.
(Contributed by Mariusz Felisiak in :gh:`91602`.)
+statistics
+----------
+
+* Add :func:`statistics.kde` for kernel density estimation.
+ This makes it possible to estimate a continuous probability density function
+ from a fixed number of discrete samples.
+ (Contributed by Raymond Hettinger in :gh:`115863`.)
+
subprocess
----------
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 83aaedb..7924123 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -112,6 +112,7 @@ __all__ = [
'fmean',
'geometric_mean',
'harmonic_mean',
+ 'kde',
'linear_regression',
'mean',
'median',
@@ -137,7 +138,7 @@ from decimal import Decimal
from itertools import count, groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
-from math import isfinite, isinf
+from math import isfinite, isinf, pi, cos, cosh
from functools import reduce
from operator import itemgetter
from collections import Counter, namedtuple, defaultdict
@@ -802,6 +803,171 @@ def multimode(data):
return [value for value, count in counts.items() if count == maxcount]
+def kde(data, h, kernel='normal'):
+ """Kernel Density Estimation: Create a continuous probability
+ density function from discrete samples.
+
+ The basic idea is to smooth the data using a kernel function
+ to help draw inferences about a population from a sample.
+
+ The degree of smoothing is controlled by the scaling parameter h
+ which is called the bandwidth. Smaller values emphasize local
+ features while larger values give smoother results.
+
+ The kernel determines the relative weights of the sample data
+ points. Generally, the choice of kernel shape does not matter
+ as much as the more influential bandwidth smoothing parameter.
+
+ Kernels that give some weight to every sample point:
+
+ normal or gauss
+ logistic
+ sigmoid
+
+ Kernels that only give weight to sample points within
+ the bandwidth:
+
+ rectangular or uniform
+ triangular
+ parabolic or epanechnikov
+ quartic or biweight
+ triweight
+ cosine
+
+ A StatisticsError will be raised if the data sequence is empty.
+
+ Example
+ -------
+
+ Given a sample of six data points, construct a continuous
+ function that estimates the underlying probability density:
+
+ >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+ >>> f_hat = kde(sample, h=1.5)
+
+ Compute the area under the curve:
+
+ >>> sum(f_hat(x) for x in range(-20, 20))
+ 1.0
+
+ Plot the estimated probability density function at
+ evenly spaced points from -6 to 10:
+
+ >>> for x in range(-6, 11):
+ ... density = f_hat(x)
+ ... plot = ' ' * int(density * 400) + 'x'
+ ... print(f'{x:2}: {density:.3f} {plot}')
+ ...
+ -6: 0.002 x
+ -5: 0.009 x
+ -4: 0.031 x
+ -3: 0.070 x
+ -2: 0.111 x
+ -1: 0.125 x
+ 0: 0.110 x
+ 1: 0.086 x
+ 2: 0.068 x
+ 3: 0.059 x
+ 4: 0.066 x
+ 5: 0.082 x
+ 6: 0.082 x
+ 7: 0.058 x
+ 8: 0.028 x
+ 9: 0.009 x
+ 10: 0.002 x
+
+ References
+ ----------
+
+ Kernel density estimation and its application:
+ https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf
+
+ Kernel functions in common use:
+ https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use
+
+ Interactive graphical demonstration and exploration:
+ https://demonstrations.wolfram.com/KernelDensityEstimation/
+
+ """
+
+ n = len(data)
+ if not n:
+ raise StatisticsError('Empty data sequence')
+
+ if not isinstance(data[0], (int, float)):
+ raise TypeError('Data sequence must contain ints or floats')
+
+ if h <= 0.0:
+ raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')
+
+ match kernel:
+
+ case 'normal' | 'gauss':
+ c = 1 / sqrt(2 * pi)
+ K = lambda t: c * exp(-1/2 * t * t)
+ support = None
+
+ case 'logistic':
+ # 1.0 / (exp(t) + 2.0 + exp(-t))
+ K = lambda t: 1/2 / (1.0 + cosh(t))
+ support = None
+
+ case 'sigmoid':
+ # (2/pi) / (exp(t) + exp(-t))
+ c = 1 / pi
+ K = lambda t: c / cosh(t)
+ support = None
+
+ case 'rectangular' | 'uniform':
+ K = lambda t: 1/2
+ support = 1.0
+
+ case 'triangular':
+ K = lambda t: 1.0 - abs(t)
+ support = 1.0
+
+ case 'parabolic' | 'epanechnikov':
+ K = lambda t: 3/4 * (1.0 - t * t)
+ support = 1.0
+
+ case 'quartic' | 'biweight':
+ K = lambda t: 15/16 * (1.0 - t * t) ** 2
+ support = 1.0
+
+ case 'triweight':
+ K = lambda t: 35/32 * (1.0 - t * t) ** 3
+ support = 1.0
+
+ case 'cosine':
+ c1 = pi / 4
+ c2 = pi / 2
+ K = lambda t: c1 * cos(c2 * t)
+ support = 1.0
+
+ case _:
+ raise StatisticsError(f'Unknown kernel name: {kernel!r}')
+
+ if support is None:
+
+ def pdf(x):
+ return sum(K((x - x_i) / h) for x_i in data) / (n * h)
+
+ else:
+
+ sample = sorted(data)
+ bandwidth = h * support
+
+ def pdf(x):
+ i = bisect_left(sample, x - bandwidth)
+ j = bisect_right(sample, x + bandwidth)
+ supported = sample[i : j]
+ return sum(K((x - x_i) / h) for x_i in supported) / (n * h)
+
+ pdf.__doc__ = f'PDF estimate with {kernel=!r} and {h=!r}'
+
+ return pdf
+
+
# Notes on methods for computing quantiles
# ----------------------------------------
#
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index bf2c254..1cf4163 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2353,6 +2353,66 @@ class TestGeometricMean(unittest.TestCase):
self.assertAlmostEqual(actual_mean, expected_mean, places=5)
+class TestKDE(unittest.TestCase):
+
+ def test_kde(self):
+ kde = statistics.kde
+ StatisticsError = statistics.StatisticsError
+
+ kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular',
+ 'uniform', 'triangular', 'parabolic', 'epanechnikov',
+ 'quartic', 'biweight', 'triweight', 'cosine']
+
+ sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+
+ # The approximate integral of a PDF should be close to 1.0
+
+ def integrate(func, low, high, steps=10_000):
+ "Numeric approximation of a definite function integral."
+ dx = (high - low) / steps
+ midpoints = (low + (i + 1/2) * dx for i in range(steps))
+ return sum(map(func, midpoints)) * dx
+
+ for kernel in kernels:
+ with self.subTest(kernel=kernel):
+ f_hat = kde(sample, h=1.5, kernel=kernel)
+ area = integrate(f_hat, -20, 20)
+ self.assertAlmostEqual(area, 1.0, places=4)
+
+ # Check error cases
+
+ with self.assertRaises(StatisticsError):
+ kde([], h=1.0) # Empty dataset
+ with self.assertRaises(TypeError):
+ kde(['abc', 'def'], 1.5) # Non-numeric data
+ with self.assertRaises(TypeError):
+ kde(iter(sample), 1.5) # Data is not a sequence
+ with self.assertRaises(StatisticsError):
+ kde(sample, h=0.0) # Zero bandwidth
+ with self.assertRaises(StatisticsError):
+ kde(sample, h=-1.0) # Negative bandwidth
+ with self.assertRaises(TypeError):
+ kde(sample, h='str') # Wrong bandwidth type
+ with self.assertRaises(StatisticsError):
+ kde(sample, h=1.0, kernel='bogus') # Invalid kernel
+
+ # Test name and docstring of the generated function
+
+ h = 1.5
+ kernel = 'cosine'
+ f_hat = kde(sample, h, kernel)
+ self.assertEqual(f_hat.__name__, 'pdf')
+ self.assertIn(kernel, f_hat.__doc__)
+ self.assertIn(str(h), f_hat.__doc__)
+
+ # Test closed interval for the support boundaries.
+ # In particular, 'uniform' should be non-zero at the boundaries.
+
+ f_hat = kde([0], 1.0, 'uniform')
+ self.assertEqual(f_hat(-1.0), 1/2)
+ self.assertEqual(f_hat(1.0), 1/2)
+
+
class TestQuantiles(unittest.TestCase):
def test_specific_cases(self):
diff --git a/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst b/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst
new file mode 100644
index 0000000..fb36c0b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-02-23-11-08-31.gh-issue-115532.zVd3gK.rst
@@ -0,0 +1 @@
+Add kernel density estimation to the statistics module.