Compute from_sample() in a single pass over the data (#92284)

author: Raymond Hettinger <rhettinger@users.noreply.github.com> 2022-05-04 02:22:26 (GMT)
committer: GitHub <noreply@github.com> 2022-05-04 02:22:26 (GMT)
commit: 9badc86fb76b48fdd7e335eef850c7a508af5266 (patch)
tree: 5fc525fbc0306825716e3d3071ebf55a84d87aae /Lib/statistics.py
parent: 6dcfd6c5e3cb46543e82dc3f7234546adf4bb04a (diff)
download: cpython-9badc86fb76b48fdd7e335eef850c7a508af5266.zip
cpython-9badc86fb76b48fdd7e335eef850c7a508af5266.tar.gz
cpython-9badc86fb76b48fdd7e335eef850c7a508af5266.tar.bz2
1 files changed, 27 insertions, 18 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 6e6d62c..5a3de81 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -206,16 +206,17 @@ def _sum(data):
 
 
 def _ss(data, c=None):
-    """Return sum of square deviations of sequence data.
+    """Return the exact mean and sum of square deviations of sequence data.
+
+    Calculations are done in a single pass, allowing the input to be an iterator.
+
+    If given *c* is used the mean; otherwise, it is calculated from the data.
+    Use the *c* argument with care, as it can lead to garbage results.
 
-    If ``c`` is None, the mean is calculated in one pass, and the deviations
-    from the mean are calculated in a second pass. Otherwise, deviations are
-    calculated from ``c`` as given. Use the second case with care, as it can
-    lead to garbage results.
     """
     if c is not None:
-        T, total, count = _sum((d := x - c) * d for x in data)
-        return (T, total, count)
+        T, ssd, count = _sum((d := x - c) * d for x in data)
+        return (T, ssd, c, count)
     count = 0
     types = set()
     types_add = types.add
@@ -228,20 +229,21 @@ def _ss(data, c=None):
             sx_partials[d] += n
             sxx_partials[d] += n * n
     if not count:
-        total = Fraction(0)
+        ssd = c = Fraction(0)
     elif None in sx_partials:
         # The sum will be a NAN or INF. We can ignore all the finite
         # partials, and just look at this special one.
-        total = sx_partials[None]
+        ssd = c = sx_partials[None]
         assert not _isfinite(total)
     else:
         sx = sum(Fraction(n, d) for d, n in sx_partials.items())
         sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items())
         # This formula has poor numeric properties for floats,
         # but with fractions it is exact.
-        total = (count * sxx - sx * sx) / count
+        ssd = (count * sxx - sx * sx) / count
+        c = sx / count
     T = reduce(_coerce, types, int)  # or raise TypeError
-    return (T, total, count)
+    return (T, ssd, c, count)
 
 
 def _isfinite(x):
@@ -854,7 +856,7 @@ def variance(data, xbar=None):
     Fraction(67, 108)
 
     """
-    T, ss, n = _ss(data, xbar)
+    T, ss, c, n = _ss(data, xbar)
     if n < 2:
         raise StatisticsError('variance requires at least two data points')
     return _convert(ss / (n - 1), T)
@@ -895,7 +897,7 @@ def pvariance(data, mu=None):
     Fraction(13, 72)
 
     """
-    T, ss, n = _ss(data, mu)
+    T, ss, c, n = _ss(data, mu)
     if n < 1:
         raise StatisticsError('pvariance requires at least one data point')
     return _convert(ss / n, T)
@@ -910,7 +912,7 @@ def stdev(data, xbar=None):
     1.0810874155219827
 
     """
-    T, ss, n = _ss(data, xbar)
+    T, ss, c, n = _ss(data, xbar)
     if n < 2:
         raise StatisticsError('stdev requires at least two data points')
     mss = ss / (n - 1)
@@ -928,7 +930,7 @@ def pstdev(data, mu=None):
     0.986893273527251
 
     """
-    T, ss, n = _ss(data, mu)
+    T, ss, c, n = _ss(data, mu)
     if n < 1:
         raise StatisticsError('pstdev requires at least one data point')
     mss = ss / n
@@ -937,6 +939,15 @@ def pstdev(data, mu=None):
     return _float_sqrt_of_frac(mss.numerator, mss.denominator)
 
 
+def _mean_stdev(data):
+    """In one pass, compute the mean and sample standard deviation as floats."""
+    T, ss, xbar, n = _ss(data)
+    if n < 2:
+        raise StatisticsError('stdev requires at least two data points')
+    mss = ss / (n - 1)
+    return float(xbar), _float_sqrt_of_frac(mss.numerator, mss.denominator)
+
+
 # === Statistics for relations between two inputs ===
 
 # See https://en.wikipedia.org/wiki/Covariance
@@ -1171,9 +1182,7 @@ class NormalDist:
     @classmethod
     def from_samples(cls, data):
         "Make a normal distribution instance from sample data."
-        if not isinstance(data, (list, tuple)):
-            data = list(data)
-        return cls(mean(data), stdev(data))
+        return cls(*_mean_stdev(data))
 
     def samples(self, n, *, seed=None):
         "Generate *n* samples for a given mean and standard deviation."
author	Raymond Hettinger <rhettinger@users.noreply.github.com>	2022-05-04 02:22:26 (GMT)
committer	GitHub <noreply@github.com>	2022-05-04 02:22:26 (GMT)
commit	9badc86fb76b48fdd7e335eef850c7a508af5266 (patch)
tree	5fc525fbc0306825716e3d3071ebf55a84d87aae /Lib/statistics.py
parent	6dcfd6c5e3cb46543e82dc3f7234546adf4bb04a (diff)
download	cpython-9badc86fb76b48fdd7e335eef850c7a508af5266.zip cpython-9badc86fb76b48fdd7e335eef850c7a508af5266.tar.gz cpython-9badc86fb76b48fdd7e335eef850c7a508af5266.tar.bz2