bpo-36169 : Add overlap() method to statistics.NormalDist (GH-12149)

author: Raymond Hettinger <rhettinger@users.noreply.github.com> 2019-03-07 06:59:40 (GMT)
committer: GitHub <noreply@github.com> 2019-03-07 06:59:40 (GMT)
commit: 318d537daabf2bd5f781255c7e25bfce260cf227 (patch)
tree: 05255317e7fd489c1fc22bd4164285e9234d1a11 /Lib
parent: e942e7b5c91995ae1ad967ef2c0f116a5d8555de (diff)
download: cpython-318d537daabf2bd5f781255c7e25bfce260cf227.zip
cpython-318d537daabf2bd5f781255c7e25bfce260cf227.tar.gz
cpython-318d537daabf2bd5f781255c7e25bfce260cf227.tar.bz2
2 files changed, 98 insertions, 1 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index e917a5d..e85aaa9 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -91,7 +91,7 @@ from fractions import Fraction
 from decimal import Decimal
 from itertools import groupby
 from bisect import bisect_left, bisect_right
-from math import hypot, sqrt, fabs, exp, erf, tau
+from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
 
 
 
@@ -740,6 +740,41 @@ class NormalDist:
             raise StatisticsError('cdf() not defined when sigma is zero')
         return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))
 
+    def overlap(self, other):
+        '''Compute the overlapping coefficient (OVL) between two normal distributions.
+
+        Measures the agreement between two normal probability distributions.
+        Returns a value between 0.0 and 1.0 giving the overlapping area in
+        the two underlying probability density functions.
+
+            >>> N1 = NormalDist(2.4, 1.6)
+            >>> N2 = NormalDist(3.2, 2.0)
+            >>> N1.overlap(N2)
+            0.8035050657330205
+
+        '''
+        # See: "The overlapping coefficient as a measure of agreement between
+        # probability distributions and point estimation of the overlap of two
+        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
+        # http://dx.doi.org/10.1080/03610928908830127
+        if not isinstance(other, NormalDist):
+            raise TypeError('Expected another NormalDist instance')
+        X, Y = self, other
+        if (Y.sigma, Y.mu) < (X.sigma, X.mu):   # sort to assure commutativity
+            X, Y = Y, X
+        X_var, Y_var = X.variance, Y.variance
+        if not X_var or not Y_var:
+            raise StatisticsError('overlap() not defined when sigma is zero')
+        dv = Y_var - X_var
+        dm = fabs(Y.mu - X.mu)
+        if not dv:
+            return 2.0 * NormalDist(dm, 2.0 * X.sigma).cdf(0)
+        a = X.mu * Y_var - Y.mu * X_var
+        b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
+        x1 = (a + b) / dv
+        x2 = (a - b) / dv
+        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
+
     @property
     def mean(self):
         'Arithmetic mean of the normal distribution'
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index 3f14e63..132b982 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2162,6 +2162,68 @@ class TestNormalDist(unittest.TestCase):
         self.assertEqual(X.cdf(float('Inf')), 1.0)
         self.assertTrue(math.isnan(X.cdf(float('NaN'))))
 
+    def test_overlap(self):
+        NormalDist = statistics.NormalDist
+
+        # Match examples from Imman and Bradley
+        for X1, X2, published_result in [
+                (NormalDist(0.0, 2.0), NormalDist(1.0, 2.0), 0.80258),
+                (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0), 0.60993),
+            ]:
+            self.assertAlmostEqual(X1.overlap(X2), published_result, places=4)
+            self.assertAlmostEqual(X2.overlap(X1), published_result, places=4)
+
+        # Check against integration of the PDF
+        def overlap_numeric(X, Y, *, steps=8_192, z=5):
+            'Numerical integration cross-check for overlap() '
+            fsum = math.fsum
+            center = (X.mu + Y.mu) / 2.0
+            width = z * max(X.sigma, Y.sigma)
+            start = center - width
+            dx = 2.0 * width / steps
+            x_arr = [start + i*dx for i in range(steps)]
+            xp = list(map(X.pdf, x_arr))
+            yp = list(map(Y.pdf, x_arr))
+            total = max(fsum(xp), fsum(yp))
+            return fsum(map(min, xp, yp)) / total
+
+        for X1, X2 in [
+                # Examples from Imman and Bradley
+                (NormalDist(0.0, 2.0), NormalDist(1.0, 2.0)),
+                (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)),
+                # Example from https://www.rasch.org/rmt/rmt101r.htm
+                (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)),
+                # Gender heights from http://www.usablestats.com/lessons/normal
+                (NormalDist(70, 4), NormalDist(65, 3.5)),
+                # Misc cases with equal standard deviations
+                (NormalDist(100, 15), NormalDist(110, 15)),
+                (NormalDist(-100, 15), NormalDist(110, 15)),
+                (NormalDist(-100, 15), NormalDist(-110, 15)),
+                # Misc cases with unequal standard deviations
+                (NormalDist(100, 12), NormalDist(110, 15)),
+                (NormalDist(100, 12), NormalDist(150, 15)),
+                (NormalDist(100, 12), NormalDist(150, 35)),
+                # Misc cases with small values
+                (NormalDist(1.000, 0.002), NormalDist(1.001, 0.003)),
+                (NormalDist(1.000, 0.002), NormalDist(1.006, 0.0003)),
+                (NormalDist(1.000, 0.002), NormalDist(1.001, 0.099)),
+            ]:
+            self.assertAlmostEqual(X1.overlap(X2), overlap_numeric(X1, X2), places=5)
+            self.assertAlmostEqual(X2.overlap(X1), overlap_numeric(X1, X2), places=5)
+
+        # Error cases
+        X = NormalDist()
+        with self.assertRaises(TypeError):
+            X.overlap()                             # too few arguments
+        with self.assertRaises(TypeError):
+            X.overlap(X, X)                         # too may arguments
+        with self.assertRaises(TypeError):
+            X.overlap(None)                         # right operand not a NormalDist
+        with self.assertRaises(statistics.StatisticsError):
+            X.overlap(NormalDist(1, 0))             # right operand sigma is zero
+        with self.assertRaises(statistics.StatisticsError):
+            NormalDist(1, 0).overlap(X)             # left operand sigma is zero
+
     def test_properties(self):
         X = statistics.NormalDist(100, 15)
         self.assertEqual(X.mean, 100)
author	Raymond Hettinger <rhettinger@users.noreply.github.com>	2019-03-07 06:59:40 (GMT)
committer	GitHub <noreply@github.com>	2019-03-07 06:59:40 (GMT)
commit	318d537daabf2bd5f781255c7e25bfce260cf227 (patch)
tree	05255317e7fd489c1fc22bd4164285e9234d1a11 /Lib
parent	e942e7b5c91995ae1ad967ef2c0f116a5d8555de (diff)
download	cpython-318d537daabf2bd5f781255c7e25bfce260cf227.zip cpython-318d537daabf2bd5f781255c7e25bfce260cf227.tar.gz cpython-318d537daabf2bd5f781255c7e25bfce260cf227.tar.bz2