summaryrefslogtreecommitdiffstats
path: root/Lib/statistics.py
diff options
context:
space:
mode:
authorSteven D'Aprano <steve@pearwood.info>2016-08-09 02:49:01 (GMT)
committerSteven D'Aprano <steve@pearwood.info>2016-08-09 02:49:01 (GMT)
commita474afdddc9282fedd63035b5973c88270c99ee8 (patch)
treec8f1f2dade8094ec3dfc3165dafcd0b247e7aaf9 /Lib/statistics.py
parent95e0df8389c8a44c0f6c6b6be8363e602e8e8914 (diff)
downloadcpython-a474afdddc9282fedd63035b5973c88270c99ee8.zip
cpython-a474afdddc9282fedd63035b5973c88270c99ee8.tar.gz
cpython-a474afdddc9282fedd63035b5973c88270c99ee8.tar.bz2
Add harmonic mean and tests.
Diffstat (limited to 'Lib/statistics.py')
-rw-r--r--Lib/statistics.py66
1 files changed, 62 insertions, 4 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index b081b5a..8c41dd3 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -28,6 +28,7 @@ Calculating averages
Function Description
================== =============================================
mean Arithmetic mean (average) of data.
+harmonic_mean Harmonic mean of data.
median Median (middle value) of data.
median_low Low median of data.
median_high High median of data.
@@ -95,16 +96,17 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
__all__ = [ 'StatisticsError',
'pstdev', 'pvariance', 'stdev', 'variance',
'median', 'median_low', 'median_high', 'median_grouped',
- 'mean', 'mode',
+ 'mean', 'mode', 'harmonic_mean',
]
-
import collections
+import decimal
import math
+import numbers
from fractions import Fraction
from decimal import Decimal
-from itertools import groupby
+from itertools import groupby, chain
from bisect import bisect_left, bisect_right
@@ -135,7 +137,8 @@ def _sum(data, start=0):
Some sources of round-off error will be avoided:
- >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
+ # Built-in sum returns zero.
+ >>> _sum([1e50, 1, -1e50] * 1000)
(<class 'float'>, Fraction(1000, 1), 3000)
Fractions and Decimals are also supported:
@@ -291,6 +294,15 @@ def _find_rteq(a, l, x):
return i-1
raise ValueError
+
+def _fail_neg(values, errmsg='negative value'):
+ """Iterate over values, failing if any are less than zero."""
+ for x in values:
+ if x < 0:
+ raise StatisticsError(errmsg)
+ yield x
+
+
# === Measures of central tendency (averages) ===
def mean(data):
@@ -319,6 +331,52 @@ def mean(data):
return _convert(total/n, T)
+def harmonic_mean(data):
+ """Return the harmonic mean of data.
+
+ The harmonic mean, sometimes called the subcontrary mean, is the
+ reciprocal of the arithmetic mean of the reciprocals of the data,
+ and is often appropriate when averaging quantities which are rates
+ or ratios, for example speeds. Example:
+
+ Suppose an investor purchases an equal value of shares in each of
+ three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
+ What is the average P/E ratio for the investor's portfolio?
+
+ >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio.
+ 3.6
+
+ Using the arithmetic mean would give an average of about 5.167, which
+ is too high.
+
+ If ``data`` is empty, or any element is less than zero,
+ ``harmonic_mean`` will raise ``StatisticsError``.
+ """
+ # For a justification for using harmonic mean for P/E ratios, see
+ # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
+ # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
+ if iter(data) is data:
+ data = list(data)
+ errmsg = 'harmonic mean does not support negative values'
+ n = len(data)
+ if n < 1:
+ raise StatisticsError('harmonic_mean requires at least one data point')
+ elif n == 1:
+ x = data[0]
+ if isinstance(x, (numbers.Real, Decimal)):
+ if x < 0:
+ raise StatisticsError(errmsg)
+ return x
+ else:
+ raise TypeError('unsupported type')
+ try:
+ T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
+ except ZeroDivisionError:
+ return 0
+ assert count == n
+ return _convert(n/total, T)
+
+
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
"""Return the median (middle value) of numeric data.