diff options
author | Steven D'Aprano <steve@pearwood.info> | 2016-05-04 17:54:29 (GMT) |
---|---|---|
committer | Steven D'Aprano <steve@pearwood.info> | 2016-05-04 17:54:29 (GMT) |
commit | 3b06e243527da5f4ca4dc2e3126dc9f5bbdc243c (patch) | |
tree | 2c00f35c03dc6ff24ae5c72f1363ef2da761ddc0 /Lib | |
parent | ad039f754805dc9c9d4cd95ed249984bc1405bd6 (diff) | |
download | cpython-3b06e243527da5f4ca4dc2e3126dc9f5bbdc243c.zip cpython-3b06e243527da5f4ca4dc2e3126dc9f5bbdc243c.tar.gz cpython-3b06e243527da5f4ca4dc2e3126dc9f5bbdc243c.tar.bz2 |
Issue 26002 and 25974
patches by Upendra Kumar and Stefan Krah
speed up median by using bisect, and general speedup for Decimals using as_integer_ratio
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/statistics.py | 68 | ||||
-rw-r--r-- | Lib/test/test_statistics.py | 31 |
2 files changed, 44 insertions, 55 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py index 518f546..af5d41e 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -105,6 +105,7 @@ import math from fractions import Fraction from decimal import Decimal from itertools import groupby +from bisect import bisect_left, bisect_right @@ -223,56 +224,26 @@ def _exact_ratio(x): # Optimise the common case of floats. We expect that the most often # used numeric type will be builtin floats, so try to make this as # fast as possible. - if type(x) is float: + if type(x) is float or type(x) is Decimal: return x.as_integer_ratio() try: # x may be an int, Fraction, or Integral ABC. return (x.numerator, x.denominator) except AttributeError: try: - # x may be a float subclass. + # x may be a float or Decimal subclass. return x.as_integer_ratio() except AttributeError: - try: - # x may be a Decimal. - return _decimal_to_ratio(x) - except AttributeError: - # Just give up? - pass + # Just give up? + pass except (OverflowError, ValueError): # float NAN or INF. - assert not math.isfinite(x) + assert not _isfinite(x) return (x, None) msg = "can't convert type '{}' to numerator/denominator" raise TypeError(msg.format(type(x).__name__)) -# FIXME This is faster than Fraction.from_decimal, but still too slow. -def _decimal_to_ratio(d): - """Convert Decimal d to exact integer ratio (numerator, denominator). - - >>> from decimal import Decimal - >>> _decimal_to_ratio(Decimal("2.6")) - (26, 10) - - """ - sign, digits, exp = d.as_tuple() - if exp in ('F', 'n', 'N'): # INF, NAN, sNAN - assert not d.is_finite() - return (d, None) - num = 0 - for digit in digits: - num = num*10 + digit - if exp < 0: - den = 10**-exp - else: - num *= 10**exp - den = 1 - if sign: - num = -num - return (num, den) - - def _convert(value, T): """Convert value to given numeric type T.""" if type(value) is T: @@ -305,6 +276,21 @@ def _counts(data): return table +def _find_lteq(a, x): + 'Locate the leftmost value exactly equal to x' + i = bisect_left(a, x) + if i != len(a) and a[i] == x: + return i + raise ValueError + + +def _find_rteq(a, l, x): + 'Locate the rightmost value exactly equal to x' + i = bisect_right(a, x, lo=l) + if i != (len(a)+1) and a[i-1] == x: + return i-1 + raise ValueError + # === Measures of central tendency (averages) === def mean(data): @@ -442,9 +428,15 @@ def median_grouped(data, interval=1): except TypeError: # Mixed type. For now we just coerce to float. L = float(x) - float(interval)/2 - cf = data.index(x) # Number of values below the median interval. - # FIXME The following line could be more efficient for big lists. - f = data.count(x) # Number of data points in the median interval. + + # Uses bisection search to search for x in data with log(n) time complexity + # Find the position of leftmost occurence of x in data + l1 = _find_lteq(data, x) + # Find the position of rightmost occurence of x in data[l1...len(data)] + # Assuming always l1 <= l2 + l2 = _find_rteq(data, l1, x) + cf = l1 + f = l2 - l1 + 1 return L + interval*(n/2 - cf)/f diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 9a54fe1..5275cb6 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -699,13 +699,12 @@ class ExactRatioTest(unittest.TestCase): num, den = statistics._exact_ratio(x) self.assertEqual(x, num/den) - @unittest.skipIf(True, "temporarily disabled: see #25928") def test_decimal(self): D = Decimal _exact_ratio = statistics._exact_ratio - self.assertEqual(_exact_ratio(D("0.125")), (125, 1000)) - self.assertEqual(_exact_ratio(D("12.345")), (12345, 1000)) - self.assertEqual(_exact_ratio(D("-1.98")), (-198, 100)) + self.assertEqual(_exact_ratio(D("0.125")), (1, 8)) + self.assertEqual(_exact_ratio(D("12.345")), (2469, 200)) + self.assertEqual(_exact_ratio(D("-1.98")), (-99, 50)) def test_inf(self): INF = float("INF") @@ -731,7 +730,6 @@ class ExactRatioTest(unittest.TestCase): self.assertIs(ratio[1], None) self.assertEqual(type(ratio[0]), type(nan)) - @unittest.skipIf(True, "temporarily disabled: see #25928") def test_decimal_nan(self): NAN = Decimal("NAN") sNAN = Decimal("sNAN") @@ -745,18 +743,18 @@ class ExactRatioTest(unittest.TestCase): class DecimalToRatioTest(unittest.TestCase): - # Test _decimal_to_ratio private function. + # Test _exact_ratio private function. def test_infinity(self): # Test that INFs are handled correctly. inf = Decimal('INF') - self.assertEqual(statistics._decimal_to_ratio(inf), (inf, None)) - self.assertEqual(statistics._decimal_to_ratio(-inf), (-inf, None)) + self.assertEqual(statistics._exact_ratio(inf), (inf, None)) + self.assertEqual(statistics._exact_ratio(-inf), (-inf, None)) def test_nan(self): # Test that NANs are handled correctly. for nan in (Decimal('NAN'), Decimal('sNAN')): - num, den = statistics._decimal_to_ratio(nan) + num, den = statistics._exact_ratio(nan) # Because NANs always compare non-equal, we cannot use assertEqual. # Nor can we use an identity test, as we don't guarantee anything # about the object identity. @@ -769,30 +767,30 @@ class DecimalToRatioTest(unittest.TestCase): for d in numbers: # First test positive decimals. assert d > 0 - num, den = statistics._decimal_to_ratio(d) + num, den = statistics._exact_ratio(d) self.assertGreaterEqual(num, 0) self.assertGreater(den, 0) # Then test negative decimals. - num, den = statistics._decimal_to_ratio(-d) + num, den = statistics._exact_ratio(-d) self.assertLessEqual(num, 0) self.assertGreater(den, 0) def test_negative_exponent(self): # Test result when the exponent is negative. - t = statistics._decimal_to_ratio(Decimal("0.1234")) - self.assertEqual(t, (1234, 10000)) + t = statistics._exact_ratio(Decimal("0.1234")) + self.assertEqual(t, (617, 5000)) def test_positive_exponent(self): # Test results when the exponent is positive. - t = statistics._decimal_to_ratio(Decimal("1.234e7")) + t = statistics._exact_ratio(Decimal("1.234e7")) self.assertEqual(t, (12340000, 1)) def test_regression_20536(self): # Regression test for issue 20536. # See http://bugs.python.org/issue20536 - t = statistics._decimal_to_ratio(Decimal("1e2")) + t = statistics._exact_ratio(Decimal("1e2")) self.assertEqual(t, (100, 1)) - t = statistics._decimal_to_ratio(Decimal("1.47e5")) + t = statistics._exact_ratio(Decimal("1.47e5")) self.assertEqual(t, (147000, 1)) @@ -1260,7 +1258,6 @@ class SumSpecialValues(NumericTestCase): with decimal.localcontext(decimal.BasicContext): self.assertRaises(decimal.InvalidOperation, statistics._sum, data) - @unittest.skipIf(True, "temporarily disabled: see #25928") def test_decimal_snan_raises(self): # Adding sNAN should raise InvalidOperation. sNAN = Decimal('sNAN') |