bpo-36546: Add statistics.quantiles() (#12710)

author: Raymond Hettinger <rhettinger@users.noreply.github.com> 2019-04-23 07:06:35 (GMT)
committer: GitHub <noreply@github.com> 2019-04-23 07:06:35 (GMT)
commit: 9013ccf6d8037f6ae78145a42d194141cb10d332 (patch)
tree: 9a1bf5b8739569012d9d3ecbf50b739936b730e2 /Lib
parent: d437012cdd4a38b5b3d05f139d5f0a28196e4769 (diff)
download: cpython-9013ccf6d8037f6ae78145a42d194141cb10d332.zip
cpython-9013ccf6d8037f6ae78145a42d194141cb10d332.tar.gz
cpython-9013ccf6d8037f6ae78145a42d194141cb10d332.tar.bz2
2 files changed, 195 insertions, 4 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 262ad97..05edfdf 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -7,9 +7,9 @@ averages, variance, and standard deviation.
 Calculating averages
 --------------------
 
-==================  =============================================
+==================  ==================================================
 Function            Description
-==================  =============================================
+==================  ==================================================
 mean                Arithmetic mean (average) of data.
 geometric_mean      Geometric mean of data.
 harmonic_mean       Harmonic mean of data.
@@ -19,7 +19,8 @@ median_high         High median of data.
 median_grouped      Median, or 50th percentile, of grouped data.
 mode                Mode (most common value) of data.
 multimode           List of modes (most common values of data).
-==================  =============================================
+quantiles           Divide data into intervals with equal probability.
+==================  ==================================================
 
 Calculate the arithmetic mean ("the average") of data:
 
@@ -78,7 +79,7 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
 
 """
 
-__all__ = [ 'StatisticsError', 'NormalDist',
+__all__ = [ 'StatisticsError', 'NormalDist', 'quantiles',
             'pstdev', 'pvariance', 'stdev', 'variance',
             'median',  'median_low', 'median_high', 'median_grouped',
             'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
@@ -562,6 +563,54 @@ def multimode(data):
     maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
     return list(map(itemgetter(0), mode_items))
 
+def quantiles(dist, *, n=4, method='exclusive'):
+    '''Divide *dist* into *n* continuous intervals with equal probability.
+
+    Returns a list of (n - 1) cut points separating the intervals.
+
+    Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
+    Set *n* to 100 for percentiles which gives the 99 cut points that
+    separate *dist* into 100 equal sized groups.
+
+    The *dist* can be any iterable containing sample data or it can be
+    an instance of a class that defines an inv_cdf() method.  For sample
+    data, the cut points are linearly interpolated between data points.
+
+    If *method* is set to *inclusive*, *dist* is treated as population
+    data.  The minimum value is treated as the 0th percentile and the
+    maximum value is treated as the 100th percentile.
+    '''
+    # Possible future API extensions:
+    #     quantiles(data, already_sorted=True)
+    #     quantiles(data, cut_points=[0.02, 0.25, 0.50, 0.75, 0.98])
+    if n < 1:
+        raise StatisticsError('n must be at least 1')
+    if hasattr(dist, 'inv_cdf'):
+        return [dist.inv_cdf(i / n) for i in range(1, n)]
+    data = sorted(dist)
+    ld = len(data)
+    if ld < 2:
+        raise StatisticsError('must have at least two data points')
+    if method == 'inclusive':
+        m = ld - 1
+        result = []
+        for i in range(1, n):
+            j = i * m // n
+            delta = i*m - j*n
+            interpolated = (data[j] * (n - delta) + data[j+1] * delta) / n
+            result.append(interpolated)
+        return result
+    if method == 'exclusive':
+        m = ld + 1
+        result = []
+        for i in range(1, n):
+            j = i * m // n                               # rescale i to m/n
+            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
+            delta = i*m - j*n                            # exact integer math
+            interpolated = (data[j-1] * (n - delta) + data[j] * delta) / n
+            result.append(interpolated)
+        return result
+    raise ValueError(f'Unknown method: {method!r}')
 
 # === Measures of spread ===
 
diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index 4d397eb..c988d7f 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -3,6 +3,7 @@ approx_equal function.
 
 """
 
+import bisect
 import collections
 import collections.abc
 import copy
@@ -2038,6 +2039,7 @@ class TestStdev(VarianceStdevMixin, NumericTestCase):
         expected = math.sqrt(statistics.variance(data))
         self.assertEqual(self.func(data), expected)
 
+
 class TestGeometricMean(unittest.TestCase):
 
     def test_basics(self):
@@ -2126,6 +2128,146 @@ class TestGeometricMean(unittest.TestCase):
         with self.assertRaises(ValueError):
             geometric_mean([Inf, -Inf])
 
+
+class TestQuantiles(unittest.TestCase):
+
+    def test_specific_cases(self):
+        # Match results computed by hand and cross-checked
+        # against the PERCENTILE.EXC function in MS Excel.
+        quantiles = statistics.quantiles
+        data = [120, 200, 250, 320, 350]
+        random.shuffle(data)
+        for n, expected in [
+            (1, []),
+            (2, [250.0]),
+            (3, [200.0, 320.0]),
+            (4, [160.0, 250.0, 335.0]),
+            (5, [136.0, 220.0, 292.0, 344.0]),
+            (6, [120.0, 200.0, 250.0, 320.0, 350.0]),
+            (8, [100.0, 160.0, 212.5, 250.0, 302.5, 335.0, 357.5]),
+            (10, [88.0, 136.0, 184.0, 220.0, 250.0, 292.0, 326.0, 344.0, 362.0]),
+            (12, [80.0, 120.0, 160.0, 200.0, 225.0, 250.0, 285.0, 320.0, 335.0,
+                  350.0, 365.0]),
+            (15, [72.0, 104.0, 136.0, 168.0, 200.0, 220.0, 240.0, 264.0, 292.0,
+                  320.0, 332.0, 344.0, 356.0, 368.0]),
+                ]:
+            self.assertEqual(expected, quantiles(data, n=n))
+            self.assertEqual(len(quantiles(data, n=n)), n - 1)
+            self.assertEqual(list(map(float, expected)),
+                             quantiles(map(Decimal, data), n=n))
+            self.assertEqual(list(map(Decimal, expected)),
+                             quantiles(map(Decimal, data), n=n))
+            self.assertEqual(list(map(Fraction, expected)),
+                             quantiles(map(Fraction, data), n=n))
+            # Invariant under translation and scaling
+            def f(x):
+                return 3.5 * x - 1234.675
+            exp = list(map(f, expected))
+            act = quantiles(map(f, data), n=n)
+            self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act)))
+        # Quantiles of a standard normal distribution
+        for n, expected in [
+            (1, []),
+            (2, [0.0]),
+            (3, [-0.4307, 0.4307]),
+            (4 ,[-0.6745, 0.0, 0.6745]),
+                ]:
+            actual = quantiles(statistics.NormalDist(), n=n)
+            self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001)
+                            for e, a in zip(expected, actual)))
+
+    def test_specific_cases_inclusive(self):
+        # Match results computed by hand and cross-checked
+        # against the PERCENTILE.INC function in MS Excel
+        # and against the quantile() function in SciPy.
+        quantiles = statistics.quantiles
+        data = [100, 200, 400, 800]
+        random.shuffle(data)
+        for n, expected in [
+            (1, []),
+            (2, [300.0]),
+            (3, [200.0, 400.0]),
+            (4, [175.0, 300.0, 500.0]),
+            (5, [160.0, 240.0, 360.0, 560.0]),
+            (6, [150.0, 200.0, 300.0, 400.0, 600.0]),
+            (8, [137.5, 175, 225.0, 300.0, 375.0, 500.0,650.0]),
+            (10, [130.0, 160.0, 190.0, 240.0, 300.0, 360.0, 440.0, 560.0, 680.0]),
+            (12, [125.0, 150.0, 175.0, 200.0, 250.0, 300.0, 350.0, 400.0,
+                  500.0, 600.0, 700.0]),
+            (15, [120.0, 140.0, 160.0, 180.0, 200.0, 240.0, 280.0, 320.0, 360.0,
+                  400.0, 480.0, 560.0, 640.0, 720.0]),
+                ]:
+            self.assertEqual(expected, quantiles(data, n=n, method="inclusive"))
+            self.assertEqual(len(quantiles(data, n=n, method="inclusive")), n - 1)
+            self.assertEqual(list(map(float, expected)),
+                             quantiles(map(Decimal, data), n=n, method="inclusive"))
+            self.assertEqual(list(map(Decimal, expected)),
+                             quantiles(map(Decimal, data), n=n, method="inclusive"))
+            self.assertEqual(list(map(Fraction, expected)),
+                             quantiles(map(Fraction, data), n=n, method="inclusive"))
+            # Invariant under translation and scaling
+            def f(x):
+                return 3.5 * x - 1234.675
+            exp = list(map(f, expected))
+            act = quantiles(map(f, data), n=n, method="inclusive")
+            self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act)))
+        # Quantiles of a standard normal distribution
+        for n, expected in [
+            (1, []),
+            (2, [0.0]),
+            (3, [-0.4307, 0.4307]),
+            (4 ,[-0.6745, 0.0, 0.6745]),
+                ]:
+            actual = quantiles(statistics.NormalDist(), n=n, method="inclusive")
+            self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001)
+                            for e, a in zip(expected, actual)))
+
+    def test_equal_sized_groups(self):
+        quantiles = statistics.quantiles
+        total = 10_000
+        data = [random.expovariate(0.2) for i in range(total)]
+        while len(set(data)) != total:
+            data.append(random.expovariate(0.2))
+        data.sort()
+
+        # Cases where the group size exactly divides the total
+        for n in (1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000):
+            group_size = total // n
+            self.assertEqual(
+                [bisect.bisect(data, q) for q in quantiles(data, n=n)],
+                list(range(group_size, total, group_size)))
+
+        # When the group sizes can't be exactly equal, they should
+        # differ by no more than one
+        for n in (13, 19, 59, 109, 211, 571, 1019, 1907, 5261, 9769):
+            group_sizes = {total // n, total // n + 1}
+            pos = [bisect.bisect(data, q) for q in quantiles(data, n=n)]
+            sizes = {q - p for p, q in zip(pos, pos[1:])}
+            self.assertTrue(sizes <= group_sizes)
+
+    def test_error_cases(self):
+        quantiles = statistics.quantiles
+        StatisticsError = statistics.StatisticsError
+        with self.assertRaises(TypeError):
+            quantiles()                         # Missing arguments
+        with self.assertRaises(TypeError):
+            quantiles([10, 20, 30], 13, n=4)    # Too many arguments
+        with self.assertRaises(TypeError):
+            quantiles([10, 20, 30], 4)          # n is a positional argument
+        with self.assertRaises(StatisticsError):
+            quantiles([10, 20, 30], n=0)        # n is zero
+        with self.assertRaises(StatisticsError):
+            quantiles([10, 20, 30], n=-1)       # n is negative
+        with self.assertRaises(TypeError):
+            quantiles([10, 20, 30], n=1.5)      # n is not an integer
+        with self.assertRaises(ValueError):
+            quantiles([10, 20, 30], method='X') # method is unknown
+        with self.assertRaises(StatisticsError):
+            quantiles([10], n=4)                # not enough data points
+        with self.assertRaises(TypeError):
+            quantiles([10, None, 30], n=4)      # data is non-numeric
+
+
 class TestNormalDist(unittest.TestCase):
 
     # General note on precision: The pdf(), cdf(), and overlap() methods
author	Raymond Hettinger <rhettinger@users.noreply.github.com>	2019-04-23 07:06:35 (GMT)
committer	GitHub <noreply@github.com>	2019-04-23 07:06:35 (GMT)
commit	9013ccf6d8037f6ae78145a42d194141cb10d332 (patch)
tree	9a1bf5b8739569012d9d3ecbf50b739936b730e2 /Lib
parent	d437012cdd4a38b5b3d05f139d5f0a28196e4769 (diff)
download	cpython-9013ccf6d8037f6ae78145a42d194141cb10d332.zip cpython-9013ccf6d8037f6ae78145a42d194141cb10d332.tar.gz cpython-9013ccf6d8037f6ae78145a42d194141cb10d332.tar.bz2