summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
authorRaymond Hettinger <rhettinger@users.noreply.github.com>2019-02-23 22:44:07 (GMT)
committerGitHub <noreply@github.com>2019-02-23 22:44:07 (GMT)
commit11c79531655a4aa3f82c20ff562ac571f40040cc (patch)
tree6af6cf3108204156c7b66022044514d75fca134e /Lib
parent64d6cc826dacebc2493b1bb5e8cb97828eb76f81 (diff)
downloadcpython-11c79531655a4aa3f82c20ff562ac571f40040cc.zip
cpython-11c79531655a4aa3f82c20ff562ac571f40040cc.tar.gz
cpython-11c79531655a4aa3f82c20ff562ac571f40040cc.tar.bz2
bpo-36018: Add the NormalDist class to the statistics module (GH-11973)
Diffstat (limited to 'Lib')
-rw-r--r--Lib/statistics.py156
-rw-r--r--Lib/test/test_statistics.py177
2 files changed, 332 insertions, 1 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 8ecb906..a73001a 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -76,7 +76,7 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
"""
-__all__ = [ 'StatisticsError',
+__all__ = [ 'StatisticsError', 'NormalDist',
'pstdev', 'pvariance', 'stdev', 'variance',
'median', 'median_low', 'median_high', 'median_grouped',
'mean', 'mode', 'harmonic_mean', 'fmean',
@@ -85,11 +85,13 @@ __all__ = [ 'StatisticsError',
import collections
import math
import numbers
+import random
from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from bisect import bisect_left, bisect_right
+from math import hypot, sqrt, fabs, exp, erf, tau
@@ -694,3 +696,155 @@ def pstdev(data, mu=None):
return var.sqrt()
except AttributeError:
return math.sqrt(var)
+
+## Normal Distribution #####################################################
+
+class NormalDist:
+ 'Normal distribution of a random variable'
+ # https://en.wikipedia.org/wiki/Normal_distribution
+ # https://en.wikipedia.org/wiki/Variance#Properties
+
+ __slots__ = ('mu', 'sigma')
+
+ def __init__(self, mu=0.0, sigma=1.0):
+ 'NormalDist where mu is the mean and sigma is the standard deviation'
+ if sigma < 0.0:
+ raise StatisticsError('sigma must be non-negative')
+ self.mu = mu
+ self.sigma = sigma
+
+ @classmethod
+ def from_samples(cls, data):
+ 'Make a normal distribution instance from sample data'
+ if not isinstance(data, (list, tuple)):
+ data = list(data)
+ xbar = fmean(data)
+ return cls(xbar, stdev(data, xbar))
+
+ def samples(self, n, seed=None):
+ 'Generate *n* samples for a given mean and standard deviation'
+ gauss = random.gauss if seed is None else random.Random(seed).gauss
+ mu, sigma = self.mu, self.sigma
+ return [gauss(mu, sigma) for i in range(n)]
+
+ def pdf(self, x):
+ 'Probability density function: P(x <= X < x+dx) / dx'
+ variance = self.sigma ** 2.0
+ if not variance:
+ raise StatisticsError('pdf() not defined when sigma is zero')
+ return exp((x - self.mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)
+
+ def cdf(self, x):
+ 'Cumulative density function: P(X <= x)'
+ if not self.sigma:
+ raise StatisticsError('cdf() not defined when sigma is zero')
+ return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))
+
+ @property
+ def variance(self):
+ 'Square of the standard deviation'
+ return self.sigma ** 2.0
+
+ def __add__(x1, x2):
+ if isinstance(x2, NormalDist):
+ return NormalDist(x1.mu + x2.mu, hypot(x1.sigma, x2.sigma))
+ return NormalDist(x1.mu + x2, x1.sigma)
+
+ def __sub__(x1, x2):
+ if isinstance(x2, NormalDist):
+ return NormalDist(x1.mu - x2.mu, hypot(x1.sigma, x2.sigma))
+ return NormalDist(x1.mu - x2, x1.sigma)
+
+ def __mul__(x1, x2):
+ return NormalDist(x1.mu * x2, x1.sigma * fabs(x2))
+
+ def __truediv__(x1, x2):
+ return NormalDist(x1.mu / x2, x1.sigma / fabs(x2))
+
+ def __pos__(x1):
+ return x1
+
+ def __neg__(x1):
+ return NormalDist(-x1.mu, x1.sigma)
+
+ __radd__ = __add__
+
+ def __rsub__(x1, x2):
+ return -(x1 - x2)
+
+ __rmul__ = __mul__
+
+ def __eq__(x1, x2):
+ if not isinstance(x2, NormalDist):
+ return NotImplemented
+ return (x1.mu, x2.sigma) == (x2.mu, x2.sigma)
+
+ def __repr__(self):
+ return f'{type(self).__name__}(mu={self.mu!r}, sigma={self.sigma!r})'
+
+
+if __name__ == '__main__':
+
+ # Show math operations computed analytically in comparsion
+ # to a monte carlo simulation of the same operations
+
+ from math import isclose
+ from operator import add, sub, mul, truediv
+ from itertools import repeat
+
+ g1 = NormalDist(10, 20)
+ g2 = NormalDist(-5, 25)
+
+ # Test scaling by a constant
+ assert (g1 * 5 / 5).mu == g1.mu
+ assert (g1 * 5 / 5).sigma == g1.sigma
+
+ n = 100_000
+ G1 = g1.samples(n)
+ G2 = g2.samples(n)
+
+ for func in (add, sub):
+ print(f'\nTest {func.__name__} with another NormalDist:')
+ print(func(g1, g2))
+ print(NormalDist.from_samples(map(func, G1, G2)))
+
+ const = 11
+ for func in (add, sub, mul, truediv):
+ print(f'\nTest {func.__name__} with a constant:')
+ print(func(g1, const))
+ print(NormalDist.from_samples(map(func, G1, repeat(const))))
+
+ const = 19
+ for func in (add, sub, mul):
+ print(f'\nTest constant with {func.__name__}:')
+ print(func(const, g1))
+ print(NormalDist.from_samples(map(func, repeat(const), G1)))
+
+ def assert_close(G1, G2):
+ assert isclose(G1.mu, G1.mu, rel_tol=0.01), (G1, G2)
+ assert isclose(G1.sigma, G2.sigma, rel_tol=0.01), (G1, G2)
+
+ X = NormalDist(-105, 73)
+ Y = NormalDist(31, 47)
+ s = 32.75
+ n = 100_000
+
+ S = NormalDist.from_samples([x + s for x in X.samples(n)])
+ assert_close(X + s, S)
+
+ S = NormalDist.from_samples([x - s for x in X.samples(n)])
+ assert_close(X - s, S)
+
+ S = NormalDist.from_samples([x * s for x in X.samples(n)])
+ assert_close(X * s, S)
+
+ S = NormalDist.from_samples([x / s for x in X.samples(n)])
+ assert_close(X / s, S)
+
+ S = NormalDist.from_samples([x + y for x, y in zip(X.samples(n),
+ Y.samples(n))])
+ assert_close(X + Y, S)
+
+ S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
+ Y.samples(n))])
+ assert_close(X - Y, S)
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index e351446..a65fbe8 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -5,9 +5,11 @@ approx_equal function.
import collections
import collections.abc
+import copy
import decimal
import doctest
import math
+import pickle
import random
import sys
import unittest
@@ -2025,6 +2027,181 @@ class TestStdev(VarianceStdevMixin, NumericTestCase):
expected = math.sqrt(statistics.variance(data))
self.assertEqual(self.func(data), expected)
+class TestNormalDist(unittest.TestCase):
+
+ def test_slots(self):
+ nd = statistics.NormalDist(300, 23)
+ with self.assertRaises(TypeError):
+ vars(nd)
+ self.assertEqual(nd.__slots__, ('mu', 'sigma'))
+
+ def test_instantiation_and_attributes(self):
+ nd = statistics.NormalDist(500, 17)
+ self.assertEqual(nd.mu, 500)
+ self.assertEqual(nd.sigma, 17)
+ self.assertEqual(nd.variance, 17**2)
+
+ # default arguments
+ nd = statistics.NormalDist()
+ self.assertEqual(nd.mu, 0)
+ self.assertEqual(nd.sigma, 1)
+ self.assertEqual(nd.variance, 1**2)
+
+ # error case: negative sigma
+ with self.assertRaises(statistics.StatisticsError):
+ statistics.NormalDist(500, -10)
+
+ def test_alternative_constructor(self):
+ NormalDist = statistics.NormalDist
+ data = [96, 107, 90, 92, 110]
+ # list input
+ self.assertEqual(NormalDist.from_samples(data), NormalDist(99, 9))
+ # tuple input
+ self.assertEqual(NormalDist.from_samples(tuple(data)), NormalDist(99, 9))
+ # iterator input
+ self.assertEqual(NormalDist.from_samples(iter(data)), NormalDist(99, 9))
+ # error cases
+ with self.assertRaises(statistics.StatisticsError):
+ NormalDist.from_samples([]) # empty input
+ with self.assertRaises(statistics.StatisticsError):
+ NormalDist.from_samples([10]) # only one input
+
+ def test_sample_generation(self):
+ NormalDist = statistics.NormalDist
+ mu, sigma = 10_000, 3.0
+ X = NormalDist(mu, sigma)
+ n = 1_000
+ data = X.samples(n)
+ self.assertEqual(len(data), n)
+ self.assertEqual(set(map(type, data)), {float})
+ # mean(data) expected to fall within 8 standard deviations
+ xbar = statistics.mean(data)
+ self.assertTrue(mu - sigma*8 <= xbar <= mu + sigma*8)
+
+ # verify that seeding makes reproducible sequences
+ n = 100
+ data1 = X.samples(n, seed='happiness and joy')
+ data2 = X.samples(n, seed='trouble and despair')
+ data3 = X.samples(n, seed='happiness and joy')
+ data4 = X.samples(n, seed='trouble and despair')
+ self.assertEqual(data1, data3)
+ self.assertEqual(data2, data4)
+ self.assertNotEqual(data1, data2)
+
+ # verify that subclass type is honored
+ class NewNormalDist(NormalDist):
+ pass
+ nnd = NewNormalDist(200, 5)
+ self.assertEqual(type(nnd), NewNormalDist)
+
+ def test_pdf(self):
+ NormalDist = statistics.NormalDist
+ X = NormalDist(100, 15)
+ # Verify peak around center
+ self.assertLess(X.pdf(99), X.pdf(100))
+ self.assertLess(X.pdf(101), X.pdf(100))
+ # Test symmetry
+ self.assertAlmostEqual(X.pdf(99), X.pdf(101))
+ self.assertAlmostEqual(X.pdf(98), X.pdf(102))
+ self.assertAlmostEqual(X.pdf(97), X.pdf(103))
+ # Test vs CDF
+ dx = 2.0 ** -10
+ for x in range(90, 111):
+ est_pdf = (X.cdf(x + dx) - X.cdf(x)) / dx
+ self.assertAlmostEqual(X.pdf(x), est_pdf, places=4)
+ # Error case: variance is zero
+ Y = NormalDist(100, 0)
+ with self.assertRaises(statistics.StatisticsError):
+ Y.pdf(90)
+
+ def test_cdf(self):
+ NormalDist = statistics.NormalDist
+ X = NormalDist(100, 15)
+ cdfs = [X.cdf(x) for x in range(1, 200)]
+ self.assertEqual(set(map(type, cdfs)), {float})
+ # Verify montonic
+ self.assertEqual(cdfs, sorted(cdfs))
+ # Verify center
+ self.assertAlmostEqual(X.cdf(100), 0.50)
+ # Error case: variance is zero
+ Y = NormalDist(100, 0)
+ with self.assertRaises(statistics.StatisticsError):
+ Y.cdf(90)
+
+ def test_same_type_addition_and_subtraction(self):
+ NormalDist = statistics.NormalDist
+ X = NormalDist(100, 12)
+ Y = NormalDist(40, 5)
+ self.assertEqual(X + Y, NormalDist(140, 13)) # __add__
+ self.assertEqual(X - Y, NormalDist(60, 13)) # __sub__
+
+ def test_translation_and_scaling(self):
+ NormalDist = statistics.NormalDist
+ X = NormalDist(100, 15)
+ y = 10
+ self.assertEqual(+X, NormalDist(100, 15)) # __pos__
+ self.assertEqual(-X, NormalDist(-100, 15)) # __neg__
+ self.assertEqual(X + y, NormalDist(110, 15)) # __add__
+ self.assertEqual(y + X, NormalDist(110, 15)) # __radd__
+ self.assertEqual(X - y, NormalDist(90, 15)) # __sub__
+ self.assertEqual(y - X, NormalDist(-90, 15)) # __rsub__
+ self.assertEqual(X * y, NormalDist(1000, 150)) # __mul__
+ self.assertEqual(y * X, NormalDist(1000, 150)) # __rmul__
+ self.assertEqual(X / y, NormalDist(10, 1.5)) # __truediv__
+ with self.assertRaises(TypeError):
+ y / X
+
+ def test_equality(self):
+ NormalDist = statistics.NormalDist
+ nd1 = NormalDist()
+ nd2 = NormalDist(2, 4)
+ nd3 = NormalDist()
+ self.assertNotEqual(nd1, nd2)
+ self.assertEqual(nd1, nd3)
+
+ # Test NotImplemented when types are different
+ class A:
+ def __eq__(self, other):
+ return 10
+ a = A()
+ self.assertEqual(nd1.__eq__(a), NotImplemented)
+ self.assertEqual(nd1 == a, 10)
+ self.assertEqual(a == nd1, 10)
+
+ # All subclasses to compare equal giving the same behavior
+ # as list, tuple, int, float, complex, str, dict, set, etc.
+ class SizedNormalDist(NormalDist):
+ def __init__(self, mu, sigma, n):
+ super().__init__(mu, sigma)
+ self.n = n
+ s = SizedNormalDist(100, 15, 57)
+ nd4 = NormalDist(100, 15)
+ self.assertEqual(s, nd4)
+
+ # Don't allow duck type equality because we wouldn't
+ # want a lognormal distribution to compare equal
+ # to a normal distribution with the same parameters
+ class LognormalDist:
+ def __init__(self, mu, sigma):
+ self.mu = mu
+ self.sigma = sigma
+ lnd = LognormalDist(100, 15)
+ nd = NormalDist(100, 15)
+ self.assertNotEqual(nd, lnd)
+
+ def test_pickle_and_copy(self):
+ nd = statistics.NormalDist(37.5, 5.625)
+ nd1 = copy.copy(nd)
+ self.assertEqual(nd, nd1)
+ nd2 = copy.deepcopy(nd)
+ self.assertEqual(nd, nd2)
+ nd3 = pickle.loads(pickle.dumps(nd))
+ self.assertEqual(nd, nd3)
+
+ def test_repr(self):
+ nd = statistics.NormalDist(37.5, 5.625)
+ self.assertEqual(repr(nd), 'NormalDist(mu=37.5, sigma=5.625)')
+
# === Run tests ===