diff options
author | Raymond Hettinger <rhettinger@users.noreply.github.com> | 2023-03-14 01:06:43 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-03-14 01:06:43 (GMT) |
commit | 457e4d1a516c2b83edeff2f255f4cd6e7b114feb (patch) | |
tree | 51ca85d76d69c13f8f88aef3bd5daa613c735ad7 /Lib/statistics.py | |
parent | 61479d46848bc7a7f9b571b0b09c4a4b4436d839 (diff) | |
download | cpython-457e4d1a516c2b83edeff2f255f4cd6e7b114feb.zip cpython-457e4d1a516c2b83edeff2f255f4cd6e7b114feb.tar.gz cpython-457e4d1a516c2b83edeff2f255f4cd6e7b114feb.tar.bz2 |
GH-102670: Use sumprod() to simplify, speed up, and improve accuracy of statistics functions (GH-102649)
Diffstat (limited to 'Lib/statistics.py')
-rw-r--r-- | Lib/statistics.py | 26 |
1 files changed, 14 insertions, 12 deletions
```diff
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 7d5d750..6bd214b 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -1036,7 +1036,7 @@ def covariance(x, y, /):
         raise StatisticsError('covariance requires at least two data points')
     xbar = fsum(x) / n
     ybar = fsum(y) / n
-    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+    sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y))
     return sxy / (n - 1)
 
 
@@ -1074,11 +1074,14 @@ def correlation(x, y, /, *, method='linear'):
         start = (n - 1) / -2  # Center rankings around zero
         x = _rank(x, start=start)
         y = _rank(y, start=start)
-    xbar = fsum(x) / n
-    ybar = fsum(y) / n
-    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
-    sxx = fsum((d := xi - xbar) * d for xi in x)
-    syy = fsum((d := yi - ybar) * d for yi in y)
+    else:
+        xbar = fsum(x) / n
+        ybar = fsum(y) / n
+        x = [xi - xbar for xi in x]
+        y = [yi - ybar for yi in y]
+    sxy = sumprod(x, y)
+    sxx = sumprod(x, x)
+    syy = sumprod(y, y)
     try:
         return sxy / sqrt(sxx * syy)
     except ZeroDivisionError:
@@ -1131,14 +1134,13 @@ def linear_regression(x, y, /, *, proportional=False):
         raise StatisticsError('linear regression requires that both inputs have same number of data points')
     if n < 2:
         raise StatisticsError('linear regression requires at least two data points')
-    if proportional:
-        sxy = fsum(xi * yi for xi, yi in zip(x, y))
-        sxx = fsum(xi * xi for xi in x)
-    else:
+    if not proportional:
         xbar = fsum(x) / n
         ybar = fsum(y) / n
-        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
-        sxx = fsum((d := xi - xbar) * d for xi in x)
+        x = [xi - xbar for xi in x]  # List because used three times below
+        y = (yi - ybar for yi in y)  # Generator because only used once below
+    sxy = sumprod(x, y) + 0.0  # Add zero to coerce result to a float
+    sxx = sumprod(x, x)
     try:
         slope = sxy / sxx  # equivalent to: covariance(x, y) / variance(x)
     except ZeroDivisionError:
```