diff options
author | Raymond Hettinger <rhettinger@users.noreply.github.com> | 2019-03-07 06:59:40 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-03-07 06:59:40 (GMT) |
commit | 318d537daabf2bd5f781255c7e25bfce260cf227 (patch) | |
tree | 05255317e7fd489c1fc22bd4164285e9234d1a11 | |
parent | e942e7b5c91995ae1ad967ef2c0f116a5d8555de (diff) | |
download | cpython-318d537daabf2bd5f781255c7e25bfce260cf227.zip cpython-318d537daabf2bd5f781255c7e25bfce260cf227.tar.gz cpython-318d537daabf2bd5f781255c7e25bfce260cf227.tar.bz2 |
bpo-36169 : Add overlap() method to statistics.NormalDist (GH-12149)
-rw-r--r-- | Doc/library/statistics.rst | 32 | ||||
-rw-r--r-- | Lib/statistics.py | 37 | ||||
-rw-r--r-- | Lib/test/test_statistics.py | 62 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2019-03-03-11-37-09.bpo-36169.8nWJy7.rst | 2 |
4 files changed, 132 insertions, 1 deletions
diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 8f8c009..be0215a 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -549,6 +549,28 @@ of applications in statistics, including simulations and hypothesis testing. compute the probability that a random variable *X* will be less than or equal to *x*. Mathematically, it is written ``P(X <= x)``. + .. method:: NormalDist.overlap(other) + + Compute the `overlapping coefficient (OVL) + <http://www.iceaaonline.com/ready/wp-content/uploads/2014/06/MM-9-Presentation-Meet-the-Overlapping-Coefficient-A-Measure-for-Elevator-Speeches.pdf>`_ + between two normal distributions. + + Measures the agreement between two normal probability distributions. + Returns a value between 0.0 and 1.0 giving the overlapping area for + two probability density functions. + + In this `example from John M. Linacre + <https://www.rasch.org/rmt/rmt101r.htm>`_ about 80% of each + distribution overlaps the other: + + .. doctest:: + + >>> N1 = NormalDist(2.4, 1.6) + >>> N2 = NormalDist(3.2, 2.0) + >>> ovl = N1.overlap(N2) + >>> f'{ovl * 100.0 :.1f}%' + '80.4%' + Instances of :class:`NormalDist` support addition, subtraction, multiplication and division by a constant. These operations are used for translation and scaling. For example: @@ -595,6 +617,16 @@ determine the percentage of students with scores between 1100 and 1200: >>> f'{fraction * 100 :.1f}% score between 1100 and 1200' '18.2% score between 1100 and 1200' +What percentage of men and women will have the same height in `two normally +distributed populations with known means and standard deviations +<http://www.usablestats.com/lessons/normal>`_? + + >>> men = NormalDist(70, 4) + >>> women = NormalDist(65, 3.5) + >>> ovl = men.overlap(women) + >>> round(ovl * 100.0, 1) + 50.3 + To estimate the distribution for a model than isn't easy to solve analytically, :class:`NormalDist` can generate input samples for a `Monte Carlo simulation <https://en.wikipedia.org/wiki/Monte_Carlo_method>`_ of the diff --git a/Lib/statistics.py b/Lib/statistics.py index e917a5d..e85aaa9 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -91,7 +91,7 @@ from fractions import Fraction from decimal import Decimal from itertools import groupby from bisect import bisect_left, bisect_right -from math import hypot, sqrt, fabs, exp, erf, tau +from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum @@ -740,6 +740,41 @@ class NormalDist: raise StatisticsError('cdf() not defined when sigma is zero') return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0)))) + def overlap(self, other): + '''Compute the overlapping coefficient (OVL) between two normal distributions. + + Measures the agreement between two normal probability distributions. + Returns a value between 0.0 and 1.0 giving the overlapping area in + the two underlying probability density functions. + + >>> N1 = NormalDist(2.4, 1.6) + >>> N2 = NormalDist(3.2, 2.0) + >>> N1.overlap(N2) + 0.8035050657330205 + + ''' + # See: "The overlapping coefficient as a measure of agreement between + # probability distributions and point estimation of the overlap of two + # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr + # http://dx.doi.org/10.1080/03610928908830127 + if not isinstance(other, NormalDist): + raise TypeError('Expected another NormalDist instance') + X, Y = self, other + if (Y.sigma, Y.mu) < (X.sigma, X.mu): # sort to assure commutativity + X, Y = Y, X + X_var, Y_var = X.variance, Y.variance + if not X_var or not Y_var: + raise StatisticsError('overlap() not defined when sigma is zero') + dv = Y_var - X_var + dm = fabs(Y.mu - X.mu) + if not dv: + return 2.0 * NormalDist(dm, 2.0 * X.sigma).cdf(0) + a = X.mu * Y_var - Y.mu * X_var + b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var)) + x1 = (a + b) / dv + x2 = (a - b) / dv + return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2))) + @property def mean(self): 'Arithmetic mean of the normal distribution' diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 3f14e63..132b982 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2162,6 +2162,68 @@ class TestNormalDist(unittest.TestCase): self.assertEqual(X.cdf(float('Inf')), 1.0) self.assertTrue(math.isnan(X.cdf(float('NaN')))) + def test_overlap(self): + NormalDist = statistics.NormalDist + + # Match examples from Imman and Bradley + for X1, X2, published_result in [ + (NormalDist(0.0, 2.0), NormalDist(1.0, 2.0), 0.80258), + (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0), 0.60993), + ]: + self.assertAlmostEqual(X1.overlap(X2), published_result, places=4) + self.assertAlmostEqual(X2.overlap(X1), published_result, places=4) + + # Check against integration of the PDF + def overlap_numeric(X, Y, *, steps=8_192, z=5): + 'Numerical integration cross-check for overlap() ' + fsum = math.fsum + center = (X.mu + Y.mu) / 2.0 + width = z * max(X.sigma, Y.sigma) + start = center - width + dx = 2.0 * width / steps + x_arr = [start + i*dx for i in range(steps)] + xp = list(map(X.pdf, x_arr)) + yp = list(map(Y.pdf, x_arr)) + total = max(fsum(xp), fsum(yp)) + return fsum(map(min, xp, yp)) / total + + for X1, X2 in [ + # Examples from Imman and Bradley + (NormalDist(0.0, 2.0), NormalDist(1.0, 2.0)), + (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)), + # Example from https://www.rasch.org/rmt/rmt101r.htm + (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)), + # Gender heights from http://www.usablestats.com/lessons/normal + (NormalDist(70, 4), NormalDist(65, 3.5)), + # Misc cases with equal standard deviations + (NormalDist(100, 15), NormalDist(110, 15)), + (NormalDist(-100, 15), NormalDist(110, 15)), + (NormalDist(-100, 15), NormalDist(-110, 15)), + # Misc cases with unequal standard deviations + (NormalDist(100, 12), NormalDist(110, 15)), + (NormalDist(100, 12), NormalDist(150, 15)), + (NormalDist(100, 12), NormalDist(150, 35)), + # Misc cases with small values + (NormalDist(1.000, 0.002), NormalDist(1.001, 0.003)), + (NormalDist(1.000, 0.002), NormalDist(1.006, 0.0003)), + (NormalDist(1.000, 0.002), NormalDist(1.001, 0.099)), + ]: + self.assertAlmostEqual(X1.overlap(X2), overlap_numeric(X1, X2), places=5) + self.assertAlmostEqual(X2.overlap(X1), overlap_numeric(X1, X2), places=5) + + # Error cases + X = NormalDist() + with self.assertRaises(TypeError): + X.overlap() # too few arguments + with self.assertRaises(TypeError): + X.overlap(X, X) # too may arguments + with self.assertRaises(TypeError): + X.overlap(None) # right operand not a NormalDist + with self.assertRaises(statistics.StatisticsError): + X.overlap(NormalDist(1, 0)) # right operand sigma is zero + with self.assertRaises(statistics.StatisticsError): + NormalDist(1, 0).overlap(X) # left operand sigma is zero + def test_properties(self): X = statistics.NormalDist(100, 15) self.assertEqual(X.mean, 100) diff --git a/Misc/NEWS.d/next/Library/2019-03-03-11-37-09.bpo-36169.8nWJy7.rst b/Misc/NEWS.d/next/Library/2019-03-03-11-37-09.bpo-36169.8nWJy7.rst new file mode 100644 index 0000000..49afa2e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-03-03-11-37-09.bpo-36169.8nWJy7.rst @@ -0,0 +1,2 @@ +Add overlap() method to statistics.NormalDist. Computes the overlapping +coefficient for two normal distributions. |