gh-115532: Add kernel density estimation to the statistics module (gh-115863)

author: Raymond Hettinger <rhettinger@users.noreply.github.com> 2024-02-25 23:46:47 (GMT)
committer: GitHub <noreply@github.com> 2024-02-25 23:46:47 (GMT)
commit: 6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68 (patch)
tree: 0ac0ed92cdfe54d2f9ff6b0e387c424c8324dcf0 /Lib/test/test_statistics.py
parent: 6a3236fe2e61673cf9f819534afbf14a18678408 (diff)
download: cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.zip
cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.tar.gz
cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.tar.bz2
1 files changed, 60 insertions, 0 deletions
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index bf2c254..1cf4163 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2353,6 +2353,66 @@ class TestGeometricMean(unittest.TestCase):
                 self.assertAlmostEqual(actual_mean, expected_mean, places=5)
 
 
+class TestKDE(unittest.TestCase):
+
+    def test_kde(self):
+        kde = statistics.kde
+        StatisticsError = statistics.StatisticsError
+
+        kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular',
+                   'uniform', 'triangular', 'parabolic', 'epanechnikov',
+                   'quartic', 'biweight', 'triweight', 'cosine']
+
+        sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+
+        # The approximate integral of a PDF should be close to 1.0
+
+        def integrate(func, low, high, steps=10_000):
+            "Numeric approximation of a definite function integral."
+            dx = (high - low) / steps
+            midpoints = (low + (i + 1/2) * dx for i in range(steps))
+            return sum(map(func, midpoints)) * dx
+
+        for kernel in kernels:
+            with self.subTest(kernel=kernel):
+                f_hat = kde(sample, h=1.5, kernel=kernel)
+                area = integrate(f_hat, -20, 20)
+                self.assertAlmostEqual(area, 1.0, places=4)
+
+        # Check error cases
+
+        with self.assertRaises(StatisticsError):
+            kde([], h=1.0)                              # Empty dataset
+        with self.assertRaises(TypeError):
+            kde(['abc', 'def'], 1.5)                    # Non-numeric data
+        with self.assertRaises(TypeError):
+            kde(iter(sample), 1.5)                      # Data is not a sequence
+        with self.assertRaises(StatisticsError):
+            kde(sample, h=0.0)                          # Zero bandwidth
+        with self.assertRaises(StatisticsError):
+            kde(sample, h=-1.0)                         # Negative bandwidth
+        with self.assertRaises(TypeError):
+            kde(sample, h='str')                        # Wrong bandwidth type
+        with self.assertRaises(StatisticsError):
+            kde(sample, h=1.0, kernel='bogus')          # Invalid kernel
+
+        # Test name and docstring of the generated function
+
+        h = 1.5
+        kernel = 'cosine'
+        f_hat = kde(sample, h, kernel)
+        self.assertEqual(f_hat.__name__, 'pdf')
+        self.assertIn(kernel, f_hat.__doc__)
+        self.assertIn(str(h), f_hat.__doc__)
+
+        # Test closed interval for the support boundaries.
+        # In particular, 'uniform' should be non-zero at the boundaries.
+
+        f_hat = kde([0], 1.0, 'uniform')
+        self.assertEqual(f_hat(-1.0), 1/2)
+        self.assertEqual(f_hat(1.0), 1/2)
+
+
 class TestQuantiles(unittest.TestCase):
 
     def test_specific_cases(self):
author	Raymond Hettinger <rhettinger@users.noreply.github.com>	2024-02-25 23:46:47 (GMT)
committer	GitHub <noreply@github.com>	2024-02-25 23:46:47 (GMT)
commit	6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68 (patch)
tree	0ac0ed92cdfe54d2f9ff6b0e387c424c8324dcf0 /Lib/test/test_statistics.py
parent	6a3236fe2e61673cf9f819534afbf14a18678408 (diff)
download	cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.zip cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.tar.gz cpython-6d34eb0e36d3a7edd9e7629f21da39b6a74b8f68.tar.bz2