summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Raymond Hettinger <rhettinger@users.noreply.github.com> 2021-05-15 18:00:51 (GMT)
committer GitHub <noreply@github.com> 2021-05-15 18:00:51 (GMT)
commit fdfea4ab16ff65234dc30f51ed8056138ab19005 (patch)
tree 1f05666323091cf852304e34bc5abaa6af7a6052
parent 80b089179fa798c8ceaab2ff699c82499b2fcacd (diff)
download cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.zip
cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.tar.gz
cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.tar.bz2
Improve speed and accuracy for correlation() (GH-26135)
-rw-r--r-- Lib/statistics.py 26
1 files changed, 14 insertions, 12 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index db8c581..507a5b2 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -107,9 +107,12 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
__all__ = [
'NormalDist',
'StatisticsError',
+ 'correlation',
+ 'covariance',
'fmean',
'geometric_mean',
'harmonic_mean',
+ 'linear_regression',
'mean',
'median',
'median_grouped',
@@ -122,9 +125,6 @@ __all__ = [
'quantiles',
'stdev',
'variance',
- 'correlation',
- 'covariance',
- 'linear_regression',
]
import math
@@ -882,10 +882,10 @@ def covariance(x, y, /):
raise StatisticsError('covariance requires that both inputs have same number of data points')
if n < 2:
raise StatisticsError('covariance requires at least two data points')
- xbar = fmean(x)
- ybar = fmean(y)
- total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
- return total / (n - 1)
+ xbar = fsum(x) / n
+ ybar = fsum(y) / n
+ sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+ return sxy / (n - 1)
def correlation(x, y, /):
@@ -910,11 +910,13 @@ def correlation(x, y, /):
raise StatisticsError('correlation requires that both inputs have same number of data points')
if n < 2:
raise StatisticsError('correlation requires at least two data points')
- cov = covariance(x, y)
- stdx = stdev(x)
- stdy = stdev(y)
+ xbar = fsum(x) / n
+ ybar = fsum(y) / n
+ sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+ s2x = fsum((xi - xbar) ** 2.0 for xi in x)
+ s2y = fsum((yi - ybar) ** 2.0 for yi in y)
try:
- return cov / (stdx * stdy)
+ return sxy / sqrt(s2x * s2y)
except ZeroDivisionError:
raise StatisticsError('at least one of the inputs is constant')
@@ -958,7 +960,7 @@ def linear_regression(regressor, dependent_variable, /):
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
s2x = fsum((xi - xbar) ** 2.0 for xi in x)
try:
- slope = sxy / s2x
+ slope = sxy / s2x # equivalent to: covariance(x, y) / variance(x)
except ZeroDivisionError:
raise StatisticsError('regressor is constant')
intercept = ybar - slope * xbar