diff options
author | Raymond Hettinger <rhettinger@users.noreply.github.com> | 2019-03-07 06:59:40 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-03-07 06:59:40 (GMT) |
commit | 318d537daabf2bd5f781255c7e25bfce260cf227 (patch) | |
tree | 05255317e7fd489c1fc22bd4164285e9234d1a11 /Lib | |
parent | e942e7b5c91995ae1ad967ef2c0f116a5d8555de (diff) | |
download | cpython-318d537daabf2bd5f781255c7e25bfce260cf227.zip cpython-318d537daabf2bd5f781255c7e25bfce260cf227.tar.gz cpython-318d537daabf2bd5f781255c7e25bfce260cf227.tar.bz2 |
bpo-36169 : Add overlap() method to statistics.NormalDist (GH-12149)
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/statistics.py | 37 | ||||
-rw-r--r-- | Lib/test/test_statistics.py | 62 |
2 files changed, 98 insertions, 1 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py index e917a5d..e85aaa9 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -91,7 +91,7 @@ from fractions import Fraction from decimal import Decimal from itertools import groupby from bisect import bisect_left, bisect_right -from math import hypot, sqrt, fabs, exp, erf, tau +from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum @@ -740,6 +740,41 @@ class NormalDist: raise StatisticsError('cdf() not defined when sigma is zero') return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0)))) + def overlap(self, other): + '''Compute the overlapping coefficient (OVL) between two normal distributions. + + Measures the agreement between two normal probability distributions. + Returns a value between 0.0 and 1.0 giving the overlapping area in + the two underlying probability density functions. + + >>> N1 = NormalDist(2.4, 1.6) + >>> N2 = NormalDist(3.2, 2.0) + >>> N1.overlap(N2) + 0.8035050657330205 + + ''' + # See: "The overlapping coefficient as a measure of agreement between + # probability distributions and point estimation of the overlap of two + # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr + # http://dx.doi.org/10.1080/03610928908830127 + if not isinstance(other, NormalDist): + raise TypeError('Expected another NormalDist instance') + X, Y = self, other + if (Y.sigma, Y.mu) < (X.sigma, X.mu): # sort to assure commutativity + X, Y = Y, X + X_var, Y_var = X.variance, Y.variance + if not X_var or not Y_var: + raise StatisticsError('overlap() not defined when sigma is zero') + dv = Y_var - X_var + dm = fabs(Y.mu - X.mu) + if not dv: + return 2.0 * NormalDist(dm, 2.0 * X.sigma).cdf(0) + a = X.mu * Y_var - Y.mu * X_var + b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var)) + x1 = (a + b) / dv + x2 = (a - b) / dv + return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2))) + @property def mean(self): 'Arithmetic mean of the normal distribution' diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 3f14e63..132b982 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2162,6 +2162,68 @@ class TestNormalDist(unittest.TestCase): self.assertEqual(X.cdf(float('Inf')), 1.0) self.assertTrue(math.isnan(X.cdf(float('NaN')))) + def test_overlap(self): + NormalDist = statistics.NormalDist + + # Match examples from Imman and Bradley + for X1, X2, published_result in [ + (NormalDist(0.0, 2.0), NormalDist(1.0, 2.0), 0.80258), + (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0), 0.60993), + ]: + self.assertAlmostEqual(X1.overlap(X2), published_result, places=4) + self.assertAlmostEqual(X2.overlap(X1), published_result, places=4) + + # Check against integration of the PDF + def overlap_numeric(X, Y, *, steps=8_192, z=5): + 'Numerical integration cross-check for overlap() ' + fsum = math.fsum + center = (X.mu + Y.mu) / 2.0 + width = z * max(X.sigma, Y.sigma) + start = center - width + dx = 2.0 * width / steps + x_arr = [start + i*dx for i in range(steps)] + xp = list(map(X.pdf, x_arr)) + yp = list(map(Y.pdf, x_arr)) + total = max(fsum(xp), fsum(yp)) + return fsum(map(min, xp, yp)) / total + + for X1, X2 in [ + # Examples from Imman and Bradley + (NormalDist(0.0, 2.0), NormalDist(1.0, 2.0)), + (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)), + # Example from https://www.rasch.org/rmt/rmt101r.htm + (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)), + # Gender heights from http://www.usablestats.com/lessons/normal + (NormalDist(70, 4), NormalDist(65, 3.5)), + # Misc cases with equal standard deviations + (NormalDist(100, 15), NormalDist(110, 15)), + (NormalDist(-100, 15), NormalDist(110, 15)), + (NormalDist(-100, 15), NormalDist(-110, 15)), + # Misc cases with unequal standard deviations + (NormalDist(100, 12), NormalDist(110, 15)), + (NormalDist(100, 12), NormalDist(150, 15)), + (NormalDist(100, 12), NormalDist(150, 35)), + # Misc cases with small values + (NormalDist(1.000, 0.002), NormalDist(1.001, 0.003)), + (NormalDist(1.000, 0.002), NormalDist(1.006, 0.0003)), + (NormalDist(1.000, 0.002), NormalDist(1.001, 0.099)), + ]: + self.assertAlmostEqual(X1.overlap(X2), overlap_numeric(X1, X2), places=5) + self.assertAlmostEqual(X2.overlap(X1), overlap_numeric(X1, X2), places=5) + + # Error cases + X = NormalDist() + with self.assertRaises(TypeError): + X.overlap() # too few arguments + with self.assertRaises(TypeError): + X.overlap(X, X) # too may arguments + with self.assertRaises(TypeError): + X.overlap(None) # right operand not a NormalDist + with self.assertRaises(statistics.StatisticsError): + X.overlap(NormalDist(1, 0)) # right operand sigma is zero + with self.assertRaises(statistics.StatisticsError): + NormalDist(1, 0).overlap(X) # left operand sigma is zero + def test_properties(self): X = statistics.NormalDist(100, 15) self.assertEqual(X.mean, 100) |