summaryrefslogtreecommitdiffstats
path: root/Lib/statistics.py
diff options
context:
space:
mode:
authorRaymond Hettinger <rhettinger@users.noreply.github.com>2022-08-18 18:48:27 (GMT)
committerGitHub <noreply@github.com>2022-08-18 18:48:27 (GMT)
commit29c8f80760869018aa0d7b1d42ab737dc325cfa2 (patch)
tree400b4ceaa8e083a67ed2772c920174e2e304456f /Lib/statistics.py
parent91afe66707237558d808aeca4683d0822aa0511e (diff)
downloadcpython-29c8f80760869018aa0d7b1d42ab737dc325cfa2.zip
cpython-29c8f80760869018aa0d7b1d42ab737dc325cfa2.tar.gz
cpython-29c8f80760869018aa0d7b1d42ab737dc325cfa2.tar.bz2
GH-95861: Add support for Spearman's rank correlation coefficient (GH-95863)
Diffstat (limited to 'Lib/statistics.py')
-rw-r--r--Lib/statistics.py69
1 files changed, 62 insertions, 7 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index c78d645..a3f915c 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -134,11 +134,11 @@ import sys
from fractions import Fraction
from decimal import Decimal
-from itertools import groupby, repeat
+from itertools import count, groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from functools import reduce
-from operator import mul
+from operator import mul, itemgetter
from collections import Counter, namedtuple, defaultdict
_SQRT2 = sqrt(2.0)
@@ -355,6 +355,50 @@ def _fail_neg(values, errmsg='negative value'):
raise StatisticsError(errmsg)
yield x
+def _rank(data, /, *, key=None, reverse=False, ties='average') -> list[float]:
+ """Rank order a dataset. The lowest value has rank 1.
+
+ Ties are averaged so that equal values receive the same rank:
+
+ >>> data = [31, 56, 31, 25, 75, 18]
+ >>> _rank(data)
+ [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]
+
+ The operation is idempotent:
+
+ >>> _rank([3.5, 5.0, 3.5, 2.0, 6.0, 1.0])
+ [3.5, 5.0, 3.5, 2.0, 6.0, 1.0]
+
+ It is possible to rank the data in reverse order so that
+ the highest value has rank 1. Also, a key-function can
+ extract the field to be ranked:
+
+ >>> goals = [('eagles', 45), ('bears', 48), ('lions', 44)]
+ >>> _rank(goals, key=itemgetter(1), reverse=True)
+ [2.0, 1.0, 3.0]
+
+ """
+ # If this function becomes public at some point, more thought
+ # needs to be given to the signature. A list of ints is
+ # plausible when ties is "min" or "max". When ties is "average",
+ # either list[float] or list[Fraction] is plausible.
+
+ # Default handling of ties matches scipy.stats.mstats.spearmanr.
+ if ties != 'average':
+ raise ValueError(f'Unknown tie resolution method: {ties!r}')
+ if key is not None:
+ data = map(key, data)
+ val_pos = sorted(zip(data, count()), reverse=reverse)
+ i = 0 # To rank starting at 0 instead of 1, set i = -1.
+ result = [0] * len(val_pos)
+ for _, g in groupby(val_pos, key=itemgetter(0)):
+ group = list(g)
+ size = len(group)
+ rank = i + (size + 1) / 2
+ for value, orig_pos in group:
+ result[orig_pos] = rank
+ i += size
+ return result
def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
"""Square root of n/m, rounded to the nearest integer using round-to-odd."""
@@ -988,14 +1032,12 @@ def covariance(x, y, /):
return sxy / (n - 1)
-def correlation(x, y, /):
+def correlation(x, y, /, *, method='linear'):
"""Pearson's correlation coefficient
Return the Pearson's correlation coefficient for two inputs. Pearson's
- correlation coefficient *r* takes values between -1 and +1. It measures the
- strength and direction of the linear relationship, where +1 means very
- strong, positive linear relationship, -1 very strong, negative linear
- relationship, and 0 no linear relationship.
+ correlation coefficient *r* takes values between -1 and +1. It measures
+ the strength and direction of a linear relationship.
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
@@ -1004,12 +1046,25 @@ def correlation(x, y, /):
>>> correlation(x, y)
-1.0
+ If *method* is "ranked", computes Spearman's rank correlation coefficient
+ for two inputs. The data is replaced by ranks. Ties are averaged
+ so that equal values receive the same rank. The resulting coefficient
+ measures the strength of a monotonic relationship.
+
+ Spearman's rank correlation coefficient is appropriate for ordinal
+ data or for continuous data that doesn't meet the linear proportion
+ requirement for Pearson's correlation coefficient.
"""
n = len(x)
if len(y) != n:
raise StatisticsError('correlation requires that both inputs have same number of data points')
if n < 2:
raise StatisticsError('correlation requires at least two data points')
+ if method not in {'linear', 'ranked'}:
+ raise ValueError(f'Unknown method: {method!r}')
+ if method == 'ranked':
+ x = _rank(x)
+ y = _rank(y)
xbar = fsum(x) / n
ybar = fsum(y) / n
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))