bpo-38490: statistics: Add covariance, Pearson's correlation, and simple linear regression (#16813)

Co-authored-by: Tymoteusz Wołodźko <twolodzko+gitkraken@gmail.com>
author: Tymoteusz Wołodźko <twolodzko@users.noreply.github.com> 2021-04-25 11:45:09 (GMT)
committer: GitHub <noreply@github.com> 2021-04-25 11:45:09 (GMT)
commit: 09aa6f914dc313875ff18474770a0a7c13ea8dea (patch)
tree: 8f4ea916f3016fd3845b87705b1eb6f85c4fb190 /Lib/statistics.py
parent: 172c0f2752d8708b6dda7b42e6c5a3519420a4e8 (diff)
download: cpython-09aa6f914dc313875ff18474770a0a7c13ea8dea.zip
cpython-09aa6f914dc313875ff18474770a0a7c13ea8dea.tar.gz
cpython-09aa6f914dc313875ff18474770a0a7c13ea8dea.tar.bz2
1 files changed, 135 insertions, 1 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 2414869..673a162 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -73,6 +73,30 @@ second argument to the four "spread" functions to avoid recalculating it:
 2.5
 
 
+Statistics for relations between two inputs
+-------------------------------------------
+
+==================  ====================================================
+Function            Description
+==================  ====================================================
+covariance          Sample covariance for two variables.
+correlation         Pearson's correlation coefficient for two variables.
+linear_regression   Intercept and slope for simple linear regression.
+==================  ====================================================
+
+Calculate covariance, Pearson's correlation, and simple linear regression
+for two inputs:
+
+>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
+>>> covariance(x, y)
+0.75
+>>> correlation(x, y)  #doctest: +ELLIPSIS
+0.31622776601...
+>>> linear_regression(x, y)
+LinearRegression(intercept=1.5, slope=0.1)
+
+
 Exceptions
 ----------
 
@@ -98,6 +122,9 @@ __all__ = [
     'quantiles',
     'stdev',
     'variance',
+    'correlation',
+    'covariance',
+    'linear_regression',
 ]
 
 import math
@@ -110,7 +137,7 @@ from itertools import groupby, repeat
 from bisect import bisect_left, bisect_right
 from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
 from operator import itemgetter
-from collections import Counter
+from collections import Counter, namedtuple
 
 # === Exceptions ===
 
@@ -826,6 +853,113 @@ def pstdev(data, mu=None):
         return math.sqrt(var)
 
 
+# === Statistics for relations between two inputs ===
+
+# See https://en.wikipedia.org/wiki/Covariance
+#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
+#     https://en.wikipedia.org/wiki/Simple_linear_regression
+
+
+def covariance(x, y, /):
+    """Return the sample covariance of two inputs *x* and *y*.
+
+    Covariance is a measure of the joint variability of two inputs. Raises
+    StatisticsError if the lengths differ or there are fewer than two points.
+
+    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
+    >>> covariance(x, y)
+    0.75
+    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+    >>> covariance(x, z)
+    -7.5
+    >>> covariance(z, x)
+    -7.5
+
+    """
+    n = len(x)
+    if len(y) != n:
+        raise StatisticsError('covariance requires that both inputs have same number of data points')
+    if n < 2:
+        raise StatisticsError('covariance requires at least two data points')
+    xbar = mean(x)
+    ybar = mean(y)
+    total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+    return total / (n - 1)  # n - 1 divisor: this is the *sample* covariance
+
+
+def correlation(x, y, /):
+    """Return Pearson's correlation coefficient for two inputs.
+
+    Pearson's correlation coefficient *r* takes values between -1 and +1.
+    It measures the strength and direction of the linear relationship, where
+    +1 means a very strong positive linear relationship, -1 a very strong
+    negative linear relationship, and 0 no linear relationship. Raises
+    StatisticsError if the lengths differ, n < 2, or an input is constant.
+
+    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+    >>> correlation(x, x)
+    1.0
+    >>> correlation(x, y)
+    -1.0
+
+    """
+    n = len(x)
+    if len(y) != n:
+        raise StatisticsError('correlation requires that both inputs have same number of data points')
+    if n < 2:
+        raise StatisticsError('correlation requires at least two data points')
+    cov = covariance(x, y)
+    stdx = stdev(x)
+    stdy = stdev(y)
+    try:
+        return cov / (stdx * stdy)
+    except ZeroDivisionError:  # stdx * stdy == 0
+        raise StatisticsError('at least one of the inputs is constant')
+
+
+LinearRegression = namedtuple('LinearRegression', ['intercept', 'slope'])  # return type of linear_regression()
+
+
+def linear_regression(regressor, dependent_variable, /):
+    """Return the intercept and slope for simple linear regression.
+
+    Return the intercept and slope of a simple linear regression,
+    estimated using ordinary least squares. Simple linear regression
+    describes the relationship between *regressor* and
+    *dependent_variable* in terms of a linear function::
+
+        dependent_variable = intercept + slope * regressor + noise
+
+    where ``intercept`` and ``slope`` are the regression parameters that are
+    estimated, and the noise term is an unobserved random variable accounting
+    for the variability of the data that is not explained by the linear
+    regression (it is equal to the difference between the predicted and the
+    actual values of the dependent variable).
+
+    Returns a LinearRegression named tuple; raises StatisticsError on bad input.
+
+    >>> regressor = [1, 2, 3, 4, 5]
+    >>> noise = NormalDist().samples(5, seed=42)
+    >>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)]
+    >>> linear_regression(regressor, dependent_variable)  #doctest: +ELLIPSIS
+    LinearRegression(intercept=1.75684970486..., slope=3.09078914170...)
+
+    """
+    n = len(regressor)
+    if len(dependent_variable) != n:
+        raise StatisticsError('linear regression requires that both inputs have same number of data points')
+    if n < 2:
+        raise StatisticsError('linear regression requires at least two data points')
+    try:
+        slope = covariance(regressor, dependent_variable) / variance(regressor)
+    except ZeroDivisionError:  # variance(regressor) == 0
+        raise StatisticsError('regressor is constant')
+    intercept = mean(dependent_variable) - slope * mean(regressor)
+    return LinearRegression(intercept=intercept, slope=slope)
+
+
 ## Normal Distribution #####################################################
author	Tymoteusz Wołodźko <twolodzko@users.noreply.github.com>	2021-04-25 11:45:09 (GMT)
committer	GitHub <noreply@github.com>	2021-04-25 11:45:09 (GMT)
commit	09aa6f914dc313875ff18474770a0a7c13ea8dea (patch)
tree	8f4ea916f3016fd3845b87705b1eb6f85c4fb190 /Lib/statistics.py
parent	172c0f2752d8708b6dda7b42e6c5a3519420a4e8 (diff)
download	cpython-09aa6f914dc313875ff18474770a0a7c13ea8dea.zip cpython-09aa6f914dc313875ff18474770a0a7c13ea8dea.tar.gz cpython-09aa6f914dc313875ff18474770a0a7c13ea8dea.tar.bz2