Improve speed and accuracy for correlation() (GH-26135)

author: Raymond Hettinger <rhettinger@users.noreply.github.com> 2021-05-15 18:00:51 (GMT)
committer: GitHub <noreply@github.com> 2021-05-15 18:00:51 (GMT)
commit: fdfea4ab16ff65234dc30f51ed8056138ab19005 (patch)
tree: 1f05666323091cf852304e34bc5abaa6af7a6052 /Lib/statistics.py
parent: 80b089179fa798c8ceaab2ff699c82499b2fcacd (diff)
download: cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.zip cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.tar.gz cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.tar.bz2
1 file changed, 14 insertions, 12 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index db8c581..507a5b2 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -107,9 +107,12 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
 __all__ = [
     'NormalDist',
     'StatisticsError',
+    'correlation',
+    'covariance',
     'fmean',
     'geometric_mean',
     'harmonic_mean',
+    'linear_regression',
     'mean',
     'median',
     'median_grouped',
@@ -122,9 +125,6 @@ __all__ = [
     'quantiles',
     'stdev',
     'variance',
-    'correlation',
-    'covariance',
-    'linear_regression',
 ]
 
 import math
@@ -882,10 +882,10 @@ def covariance(x, y, /):
         raise StatisticsError('covariance requires that both inputs have same number of data points')
     if n < 2:
         raise StatisticsError('covariance requires at least two data points')
-    xbar = fmean(x)
-    ybar = fmean(y)
-    total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
-    return total / (n - 1)
+    xbar = fsum(x) / n
+    ybar = fsum(y) / n
+    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+    return sxy / (n - 1)
 
 
 def correlation(x, y, /):
@@ -910,11 +910,13 @@ def correlation(x, y, /):
         raise StatisticsError('correlation requires that both inputs have same number of data points')
     if n < 2:
         raise StatisticsError('correlation requires at least two data points')
-    cov = covariance(x, y)
-    stdx = stdev(x)
-    stdy = stdev(y)
+    xbar = fsum(x) / n
+    ybar = fsum(y) / n
+    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+    s2x = fsum((xi - xbar) ** 2.0 for xi in x)
+    s2y = fsum((yi - ybar) ** 2.0 for yi in y)
     try:
-        return cov / (stdx * stdy)
+        return sxy / sqrt(s2x * s2y)
     except ZeroDivisionError:
         raise StatisticsError('at least one of the inputs is constant')
 
@@ -958,7 +960,7 @@ def linear_regression(regressor, dependent_variable, /):
     sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
     s2x = fsum((xi - xbar) ** 2.0 for xi in x)
     try:
-        slope = sxy / s2x
+        slope = sxy / s2x   # equivalent to:  covariance(x, y) / variance(x)
     except ZeroDivisionError:
         raise StatisticsError('regressor is constant')
     intercept = ybar - slope * xbar
author	Raymond Hettinger <rhettinger@users.noreply.github.com>	2021-05-15 18:00:51 (GMT)
committer	GitHub <noreply@github.com>	2021-05-15 18:00:51 (GMT)
commit	fdfea4ab16ff65234dc30f51ed8056138ab19005 (patch)
tree	1f05666323091cf852304e34bc5abaa6af7a6052 /Lib/statistics.py
parent	80b089179fa798c8ceaab2ff699c82499b2fcacd (diff)
download	cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.zip cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.tar.gz cpython-fdfea4ab16ff65234dc30f51ed8056138ab19005.tar.bz2