-rw-r--r--  Lib/_pylong.py                                                                       432
-rw-r--r--  Lib/test/test_int.py                                                                  79
-rw-r--r--  Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst       1
3 files changed, 479 insertions, 33 deletions
diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 4970eb3..f7aabde 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -45,10 +45,16 @@ except ImportError:
#
# and `mycache[lo]` replaces `base**lo` in the inner function.
#
-# While this does give minor speedups (a few percent at best), the primary
-# intent is to simplify the functions using this, by eliminating the need for
-# them to craft their own ad-hoc caching schemes.
-def compute_powers(w, base, more_than, show=False):
+# If an algorithm wants the powers of ceiling(w/2) instead of the floor,
+# pass keyword argument `need_hi=True`.
+#
+# While this does give minor speedups (a few percent at best), the
+# primary intent is to simplify the functions using this, by eliminating
+# the need for them to craft their own ad-hoc caching schemes.
+#
+# See code near end of file for a block of code that can be enabled to
+# run millions of tests.
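+#
+# For a quick feel, here is a small illustrative case (the keys depend
+# only on `w`, `more_than` and `need_hi`; each value is `base` raised to
+# that key):
+#
+#     >>> from _pylong import compute_powers
+#     >>> sorted(compute_powers(100, 10, 5))
+#     [3, 6, 12, 25, 50]
+#     >>> sorted(compute_powers(100, 10, 5, need_hi=True))
+#     [3, 4, 6, 7, 13, 25, 50]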
+def compute_powers(w, base, more_than, *, need_hi=False, show=False):
seen = set()
need = set()
ws = {w}
@@ -58,40 +64,70 @@ def compute_powers(w, base, more_than, show=False):
continue
seen.add(w)
lo = w >> 1
- # only _need_ lo here; some other path may, or may not, need hi
- need.add(lo)
- ws.add(lo)
- if w & 1:
- ws.add(lo + 1)
+ hi = w - lo
+ # only _need_ one here; the other may, or may not, be needed
+ which = hi if need_hi else lo
+ need.add(which)
+ ws.add(which)
+ if lo != hi:
+ ws.add(w - which)
+
+ # `need` is the set of exponents needed. To compute them all
+ # efficiently, possibly add other exponents to `extra`. The goal is
+ # to ensure that each exponent can be gotten from a smaller one via
+ # multiplying by the base, squaring it, or squaring and then
+ # multiplying by the base.
+ #
+ # If need_hi is False, this is already the case (w can always be
+ # gotten from w >> 1 via one of the squaring strategies). But we do
+ # the work anyway, just in case ;-)
+ #
+ # Note that speed is irrelevant. These loops are working on little
+ # ints (exponents) and go around O(log w) times. The total cost is
+ # insignificant compared to just one of the bigint multiplies.
+ cands = need.copy()
+ extra = set()
+ while cands:
+ w = max(cands)
+ cands.remove(w)
+ lo = w >> 1
+ if lo > more_than and w-1 not in cands and lo not in cands:
+ extra.add(lo)
+ cands.add(lo)
+ assert need_hi or not extra
d = {}
- if not need:
- return d
- it = iter(sorted(need))
- first = next(it)
- if show:
- print("pow at", first)
- d[first] = base ** first
- for this in it:
- if this - 1 in d:
+ for n in sorted(need | extra):
+ lo = n >> 1
+ hi = n - lo
+ if n-1 in d:
if show:
- print("* base at", this)
- d[this] = d[this - 1] * base # cheap
- else:
- lo = this >> 1
- hi = this - lo
- assert lo in d
+ print("* base", end="")
+ result = d[n-1] * base # cheap!
+ elif lo in d:
+ # Multiplying a bigint by itself is about twice as fast
+ # in CPython provided it's the same object.
if show:
- print("square at", this)
- # Multiplying a bigint by itself (same object!) is about twice
- # as fast in CPython.
- sq = d[lo] * d[lo]
+ print("square", end="")
+ result = d[lo] * d[lo] # same object
if hi != lo:
- assert hi == lo + 1
if show:
- print(" and * base")
- sq *= base
- d[this] = sq
+ print(" * base", end="")
+ assert 2 * lo + 1 == n
+ result *= base
+ else: # rare
+ if show:
+ print("pow", end='')
+ result = base ** n
+ if show:
+ print(" at", n, "needed" if n in need else "extra")
+ d[n] = result
+
+ assert need <= d.keys()
+ if excess := d.keys() - need:
+ assert need_hi
+ for n in excess:
+ del d[n]
return d
_unbounded_dec_context = decimal.getcontext().copy()
@@ -211,6 +247,145 @@ def _str_to_int_inner(s):
return inner(0, len(s))
+# Asymptotically faster version, using the C decimal module. See
+# comments at the end of the file. This uses decimal arithmetic to
+# convert from base 10 to base 256. The latter is just a string of
+# bytes, which CPython can convert very efficiently to a Python int.
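+#
+# A minimal usage sketch (illustrative; any string of decimal digits
+# works, although the speed advantage only appears for inputs with
+# millions of digits):
+#
+#     >>> import _pylong
+#     >>> s = "1234567890" * 400          # 4,000 digits
+#     >>> _pylong._dec_str_to_int_inner(s) == int(s)
+#     True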
+
+# log of 10 to base 256 with best-possible 53-bit precision. Obtained
+# via:
+# from mpmath import mp
+# mp.prec = 1000
+# print(float(mp.log(10, 256)).hex())
+_LOG_10_BASE_256 = float.fromhex('0x1.a934f0979a371p-2') # about 0.415
+
+# _spread is for internal testing. It maps a key to the number of times
+# that condition obtained in _dec_str_to_int_inner:
+# key 0 - quotient guess was right
+# key 1 - quotient had to be boosted by 1, one time
+# key 999 - one adjustment wasn't enough, so fell back to divmod
+from collections import defaultdict
+_spread = defaultdict(int)
+del defaultdict
+
+def _dec_str_to_int_inner(s, *, GUARD=8):
+ # Yes, BYTELIM is "large". Large enough that CPython will usually
+ # use the Karatsuba _str_to_int_inner to convert the string. This
+ # allowed reducing the cutoff for calling _this_ function from 3.5M
+ # to 2M digits. We could almost certainly do even better by
+ # fine-tuning this and/or using a larger output base than 256.
+ BYTELIM = 100_000
+ D = decimal.Decimal
+ result = bytearray()
+ # See notes at end of file for discussion of GUARD.
+ assert GUARD > 0 # if 0, `decimal` can blow up - .prec 0 not allowed
+
+ def inner(n, w):
+ #assert n < D256 ** w # required, but too expensive to check
+ if w <= BYTELIM:
+ # XXX Stefan Pochmann discovered that, for 1024-bit ints,
+ # `int(Decimal)` took 2.5x longer than `int(str(Decimal))`.
+            # Worse, `int(Decimal)` is still quadratic-time for much
+ # larger ints. So unless/until all that is repaired, the
+ # seemingly redundant `str(Decimal)` is crucial to speed.
+ result.extend(int(str(n)).to_bytes(w)) # big-endian default
+ return
+ w1 = w >> 1
+ w2 = w - w1
+ if 0:
+ # This is maximally clear, but "too slow". `decimal`
+ # division is asymptotically fast, but we have no way to
+ # tell it to reuse the high-precision reciprocal it computes
+ # for pow256[w2], so it has to recompute it over & over &
+ # over again :-(
+ hi, lo = divmod(n, pow256[w2][0])
+ else:
+ p256, recip = pow256[w2]
+ # The integer part will have a number of digits about equal
+ # to the difference between the log10s of `n` and `pow256`
+ # (which, since these are integers, is roughly approximated
+            # by `.adjusted()`). That's the working precision we need.
+ ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD
+ hi = +n * +recip # unary `+` chops back to ctx.prec digits
+ ctx.prec = decimal.MAX_PREC
+ hi = hi.to_integral_value() # lose the fractional digits
+ lo = n - hi * p256
+ # Because we've been uniformly rounding down, `hi` is a
+ # lower bound on the correct quotient.
+ assert lo >= 0
+ # Adjust quotient up if needed. It usually isn't. In random
+ # testing on inputs through 5 billion digit strings, the
+ # test triggered once in about 200 thousand tries.
+ count = 0
+ if lo >= p256:
+ count = 1
+ lo -= p256
+ hi += 1
+ if lo >= p256:
+ # Complete correction via an exact computation. I
+ # believe it's not possible to get here provided
+ # GUARD >= 3. It's tested by reducing GUARD below
+ # that.
+ count = 999
+ hi2, lo = divmod(lo, p256)
+ hi += hi2
+ _spread[count] += 1
+ # The assert should always succeed, but way too slow to keep
+ # enabled.
+        #assert (hi, lo) == divmod(n, pow256[w2][0])
+ inner(hi, w1)
+ del hi # at top levels, can free a lot of RAM "early"
+ inner(lo, w2)
+
+    # How many base 256 digits are needed? Mathematically, exactly
+ # floor(log256(int(s))) + 1. There is no cheap way to compute this.
+ # But we can get an upper bound, and that's necessary for our error
+ # analysis to make sense. int(s) < 10**len(s), so the log needed is
+ # < log256(10**len(s)) = len(s) * log256(10). However, using
+ # finite-precision floating point for this, it's possible that the
+ # computed value is a little less than the true value. If the true
+ # value is at - or a little higher than - an integer, we can get an
+ # off-by-1 error too low. So we add 2 instead of 1 if chopping lost
+ # a fraction > 0.9.
+
+    # The "WASI" test platform can complain about `len(s)` if it's too
+ # large to fit in its idea of "an index-sized integer".
+ lenS = s.__len__()
+ log_ub = lenS * _LOG_10_BASE_256
+ log_ub_as_int = int(log_ub)
+ w = log_ub_as_int + 1 + (log_ub - log_ub_as_int > 0.9)
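+    # For example (illustrative numbers): a 1_000_000-digit string gives
+    # log_ub ~= 415241.01, so w = 415242, and 256**415242 comfortably
+    # exceeds 10**1_000_000.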
+ # And what if we've plain exhausted the limits of HW floats? We
+ # could compute the log to any desired precision using `decimal`,
+ # but it's not plausible that anyone will pass a string requiring
+ # trillions of bytes (unless they're just trying to "break things").
+ if w.bit_length() >= 46:
+ # "Only" had < 53 - 46 = 7 bits to spare in IEEE-754 double.
+ raise ValueError(f"cannot convert string of len {lenS} to int")
+ with decimal.localcontext(_unbounded_dec_context) as ctx:
+ D256 = D(256)
+ pow256 = compute_powers(w, D256, BYTELIM, need_hi=True)
+ rpow256 = compute_powers(w, 1 / D256, BYTELIM, need_hi=True)
+ # We're going to do inexact, chopped arithmetic, multiplying by
+ # an approximation to the reciprocal of 256**i. We chop to get a
+ # lower bound on the true integer quotient. Our approximation is
+ # a lower bound, the multiplication is chopped too, and
+ # to_integral_value() is also chopped.
+ ctx.traps[decimal.Inexact] = 0
+ ctx.rounding = decimal.ROUND_DOWN
+ for k, v in pow256.items():
+ # No need to save much more precision in the reciprocal than
+ # the power of 256 has, plus some guard digits to absorb
+ # most relevant rounding errors. This is highly significant:
+ # 1/2**i has the same number of significant decimal digits
+            # as 5**i, generally over twice the number in 2**i.
+ ctx.prec = v.adjusted() + GUARD + 1
+ # The unary "+" chops the reciprocal back to that precision.
+ pow256[k] = v, +rpow256[k]
+ del rpow256 # exact reciprocals no longer needed
+ ctx.prec = decimal.MAX_PREC
+ inner(D(s), w)
+ return int.from_bytes(result)
+
def int_from_string(s):
"""Asymptotically fast version of PyLong_FromString(), conversion
of a string of decimal digits into an 'int'."""
@@ -219,7 +394,10 @@ def int_from_string(s):
# and underscores, and stripped leading whitespace. The input can still
# contain underscores and have trailing whitespace.
s = s.rstrip().replace('_', '')
- return _str_to_int_inner(s)
+ func = _str_to_int_inner
+ if len(s) >= 2_000_000 and _decimal is not None:
+ func = _dec_str_to_int_inner
+ return func(s)
def str_to_int(s):
"""Asymptotically fast version of decimal string to 'int' conversion."""
@@ -361,3 +539,191 @@ def int_divmod(a, b):
return ~q, b + ~r
else:
return _divmod_pos(a, b)
+
+
+# Notes on _dec_str_to_int_inner:
+#
+# Stefan Pochmann worked up a str->int function that used the decimal
+# module to, in effect, convert from base 10 to base 256. This is
+# "unnatural", in that it requires multiplying and dividing by large
+# powers of 2, which `decimal` isn't naturally suited to. But
+# `decimal`'s `*` and `/` are asymptotically superior to CPython's, so
+# at _some_ point it could be expected to win.
+#
+# Alas, the crossover point was too high to be of much real interest. I
+# (Tim) then worked on ways to replace its division with multiplication
+# by a cached reciprocal approximation instead, fixing up errors
+# afterwards. This reduced the crossover point significantly,
+#
+# I revisited the code, and found ways to improve and simplify it. The
+# crossover point is at about 2 million digits now (the cutoff used by
+# int_from_string() above).
+#
+# About .adjusted()
+# -----------------
+# Restrict to Decimal values x > 0. We don't use negative numbers in the
+# code, and I don't want to have to keep typing, e.g., "absolute value".
+#
+# For convenience, I'll use `x.a` to mean `x.adjusted()`. x.a doesn't
+# look at the digits of x, but instead returns an integer giving x's
+# order of magnitude. These are equivalent:
+#
+# - x.a is the power-of-10 exponent of x's most significant digit.
+# - x.a = the infinitely precise floor(log10(x))
+# - x can be written in this form, where f is a real with 1 <= f < 10:
+# x = f * 10**x.a
+#
+# Observation: if x is an integer, len(str(x)) = x.a + 1.
+#
+# Lemma 1: (x * y).a = x.a + y.a, or one larger
+#
+# Proof: Write x = f * 10**x.a and y = g * 10**y.a, where f and g are in
+# [1, 10). Then x*y = f*g * 10**(x.a + y.a), where 1 <= f*g < 100. If
+# f*g < 10, (x*y).a is x.a+y.a. Else divide f*g by 10 to bring it back
+# into [1, 10], and add 1 to the exponent to compensate. Then (x*y).a is
+# x.a+y.a+1.
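+#
+# A concrete illustration (values produced by the `decimal` module):
+#
+#     >>> from decimal import Decimal as D
+#     >>> D("12345").adjusted()          # 1.2345 * 10**4; len("12345") = 4 + 1
+#     4
+#     >>> D("0.005").adjusted()          # 5 * 10**-3
+#     -3
+#     >>> (D(700) * D(800)).adjusted()   # Lemma 1's "one larger" case: 2 + 2 + 1
+#     5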
+#
+# Lemma 2: ceiling(log10(x/y)) <= x.a - y.a + 1
+#
+# Proof: Express x and y as in Lemma 1. Then x/y = f/g * 10**(x.a -
+# y.a), where 1/10 < f/g < 10. If 1 <= f/g, (x/y).a is x.a-y.a. Else
+# multiply f/g by 10 to bring it back into [1, 10], and subtract 1 from
+# the exponent to compensate. Then (x/y).a is x.a-y.a-1. So the largest
+# (x/y).a can be is x.a-y.a. Since that's the floor of log10(x/y), the
+# ceiling is at most 1 larger (with equality iff f/g = 1 exactly).
+#
+# GUARD digits
+# ------------
+# We only want the integer part of divisions, so don't need to build
+# the full multiplication tree. But using _just_ the number of
+# digits expected in the integer part ignores too much. What's left
+# out can have a very significant effect on the quotient. So we use
+# GUARD additional digits.
+#
+# The default 8 is more than enough so no more than 1 correction step
+# was ever needed for all inputs tried through 2.5 billion digits. In
+# fact, I believe 3 guard digits are always enough - but the proof is
+# very involved, so better safe than sorry.
+#
+# Short course:
+#
+# If prec is the decimal precision in effect, and we're rounding down,
+# the result of an operation is exactly equal to the infinitely precise
+# result times 1-e for some real e with 0 <= e < 10**(1-prec). In
+#
+# ctx.prec = max(n.adjusted() - p256.adjusted(), 0) + GUARD
+# hi = +n * +recip # unary `+` chops to ctx.prec digits
+#
+# we have 3 visible chopped operations, but there's also a 4th:
+# precomputing a truncated `recip` as part of setup.
+#
+# So the computed product is exactly equal to the true product times
+# (1-e1)*(1-e2)*(1-e3)*(1-e4); since the e's are all very small, an
+# excellent approximation to the second factor is 1-(e1+e2+e3+e4) (the
+# 2nd and higher order terms in the expanded product are too tiny to
+# matter). If they're all as large as possible, that's
+#
+# 1 - 4*10**(1-prec). This, BTW, is all bog-standard FP error analysis.
+#
+# That implies the computed product is within 1 of the true product
+# provided prec >= log10(true_product) + 1.602.
+#
+# Here are telegraphic details, rephrasing the initial condition in
+# equivalent ways, step by step:
+#
+#    prod - prod * (1 - 4*10**(1-prec)) <= 1
+#    prod - prod + prod * 4*10**(1-prec) <= 1
+#    prod * 4*10**(1-prec) <= 1
+#    10**(log10(prod)) * 4*10**(1-prec) <= 1
+#    4*10**(1-prec+log10(prod)) <= 1
+#    10**(1-prec+log10(prod)) <= 1/4
+#    1-prec+log10(prod) <= log10(1/4) = -0.602
+#    -prec <= -1.602 - log10(prod)
+#    prec >= log10(prod) + 1.602
+#
+# The true product is the same as the true ratio n/p256. By Lemma 2
+# above, n.a - p256.a + 1 is an upper bound on the ceiling of
+# log10(prod). Then 2 is the ceiling of 1.602, so n.a - p256.a + 3 is an
+# upper bound on the right hand side of the inequality. Any prec >= that
+# will work.
+#
+# But since this is just a sketch of a proof ;-), the code uses the
+# empirically tested 8 instead of 3. Five digits more or less make no
+# practical difference to speed - these ints are huge. And while
+# increasing GUARD above 3 may not be necessary, every increase cuts the
+# percentage of cases that need a correction at all.
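+#
+# To make the "times 1-e" model concrete (an illustrative toy precision;
+# the real code uses a much larger `prec`):
+#
+#     >>> import decimal
+#     >>> ctx = decimal.getcontext().copy()
+#     >>> ctx.prec = 5
+#     >>> ctx.rounding = decimal.ROUND_DOWN
+#     >>> ctx.plus(decimal.Decimal("3.141592653"))   # chopped to 5 digits
+#     Decimal('3.1415')
+#
+# The relative error is (3.141592653 - 3.1415)/3.141592653 ~= 2.9e-5,
+# comfortably below 10**(1-5) = 1e-4, as the model promises.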
+#
+# On Computing Reciprocals
+# ------------------------
+# In general, the exact reciprocals we compute have over twice as many
+# significant digits as needed. 1/256**i has the same number of
+# significant decimal digits as 5**(8*i). It's a significant waste of RAM
+# to store all those unneeded digits.
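+#
+# For instance (illustrative): with i = 25, the exact reciprocal carries
+# len(str(5**200)) = 140 significant digits, while 256**25 itself has
+# only len(str(256**25)) = 61 digits.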
+#
+# So we cut exact reciprocals back to the least precision that can
+# be needed so that the error analysis above is valid.
+#
+# [Note: turns out it's very significantly faster to do it this way than
+# to compute 1 / 256**i directly to the desired precision, because the
+# power method doesn't require division. It's also faster than computing
+# (1/256)**i directly to the desired precision - no material division
+# there, but `compute_powers()` is much smarter about _how_ to compute
+# all the powers needed than repeated applications of `**` - that
+# function invokes `**` for at most the few smallest powers needed.]
+#
+# The hard part is that chopping back to a shorter width occurs
+# _outside_ of `inner`. We can't know then what `prec` `inner()` will
+# need. We have to pick, for each value of `w2`, the largest possible
+# value `prec` can become when `inner()` is working on `w2`.
+#
+# This is the `prec` inner() uses:
+# max(n.a - p256.a, 0) + GUARD
+# and what setup uses (renaming its `v` to `p256` - same thing):
+# p256.a + GUARD + 1
+#
+# We need that the second is always at least as large as the first,
+# which is the same as requiring
+#
+# n.a - 2 * p256.a <= 1
+#
+# What's the largest n can be? n < 256**w = 256**(w2 + (w - w2)). The
+# worst case in this context is when w is even, and then w = 2*w2, so
+# n < 256**(2*w2) = (256**w2)**2 = p256**2. By Lemma 1, then, n.a
+# is at most p256.a + p256.a + 1.
+#
+# So the most n.a - 2 * p256.a can be is
+# p256.a + p256.a + 1 - 2 * p256.a = 1. QED
+#
+# Note: an earlier version of the code split on floor(w/2) instead of on
+# the ceiling. The worst case then is odd `w`, and a more involved proof
+# was needed to show that adding 4 (instead of 1) may be necessary.
+# Basically because, in that case, n may be up to 256 times larger than
+# p256**2. Curiously enough, by splitting on the ceiling instead,
+# nothing in any proof here actually depends on the output base (256).
+
+# Enable for brute-force testing of compute_powers(). This takes about a
+# minute, because it tries millions of cases.
+if 0:
+    def consumer(w, limit, need_hi):
+ seen = set()
+ need = set()
+ def inner(w):
+ if w <= limit:
+ return
+ if w in seen:
+ return
+ seen.add(w)
+ lo = w >> 1
+ hi = w - lo
+ need.add(hi if need_hi else lo)
+ inner(lo)
+ inner(hi)
+ inner(w)
+        exp = compute_powers(w, 1, limit, need_hi=need_hi)
+ assert exp.keys() == need
+
+ from itertools import chain
+ for need_hi in (False, True):
+ for limit in (0, 1, 10, 100, 1_000, 10_000, 100_000):
+ for w in chain(range(1, 100_000),
+ (10**i for i in range(5, 30))):
+ consumer(w, limit, need_hi)
diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py
index 8959ffb..67c0801 100644
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@@ -919,5 +919,84 @@ class PyLongModuleTests(unittest.TestCase):
self.assertEqual(n, int(sn))
bits <<= 1
+ @support.requires_resource('cpu')
+ def test_pylong_roundtrip_huge(self):
+ # k blocks of 1234567890
+ k = 1_000_000 # so 10 million digits in all
+ tentoten = 10**10
+ n = 1234567890 * ((tentoten**k - 1) // (tentoten - 1))
+ sn = "1234567890" * k
+ self.assertEqual(n, int(sn))
+ self.assertEqual(sn, str(n))
+
+ @support.requires_resource('cpu')
+ @unittest.skipUnless(_pylong, "_pylong module required")
+ def test_whitebox_dec_str_to_int_inner_failsafe(self):
+ # While I believe the number of GUARD digits in this function is
+ # always enough so that no more than one correction step is ever
+ # needed, the code has a "failsafe" path that takes over if I'm
+ # wrong about that. We have no input that reaches that block.
+ # Here we test a contrived input that _does_ reach that block,
+ # provided the number of guard digits is reduced to 1.
+ sn = "9" * 2000156
+ n = 10**len(sn) - 1
+ orig_spread = _pylong._spread.copy()
+ _pylong._spread.clear()
+ try:
+ self.assertEqual(n, _pylong._dec_str_to_int_inner(sn, GUARD=1))
+ self.assertIn(999, _pylong._spread)
+ finally:
+ _pylong._spread.clear()
+ _pylong._spread.update(orig_spread)
+
+    @unittest.skipUnless(_pylong, "_pylong module required")
+ def test_whitebox_dec_str_to_int_inner_monster(self):
+ # I don't think anyone has enough RAM to build a string long enough
+ # for this function to complain. So lie about the string length.
+
+ class LyingStr(str):
+ def __len__(self):
+ return int((1 << 47) / _pylong._LOG_10_BASE_256)
+
+ liar = LyingStr("42")
+ # We have to pass the liar directly to the complaining function. If we
+ # just try `int(liar)`, earlier layers will replace it with plain old
+        # "42".
+ # Embedding `len(liar)` into the f-string failed on the WASI testbot
+ # (don't know what that is):
+ # OverflowError: cannot fit 'int' into an index-sized integer
+ # So a random stab at worming around that.
+ self.assertRaisesRegex(ValueError,
+ f"^cannot convert string of len {liar.__len__()} to int$",
+ _pylong._dec_str_to_int_inner,
+ liar)
+
+ @unittest.skipUnless(_pylong, "_pylong module required")
+ def test_pylong_compute_powers(self):
+ # Basic sanity tests. See end of _pylong.py for manual heavy tests.
+ def consumer(w, base, limit, need_hi):
+ seen = set()
+ need = set()
+ def inner(w):
+ if w <= limit or w in seen:
+ return
+ seen.add(w)
+ lo = w >> 1
+ hi = w - lo
+ need.add(hi if need_hi else lo)
+ inner(lo)
+ inner(hi)
+ inner(w)
+ d = _pylong.compute_powers(w, base, limit, need_hi=need_hi)
+ self.assertEqual(d.keys(), need)
+ for k, v in d.items():
+ self.assertEqual(v, base ** k)
+
+ for base in 2, 5:
+ for need_hi in False, True:
+ for limit in 1, 11:
+ for w in range(250, 550):
+ consumer(w, base, limit, need_hi)
+
if __name__ == "__main__":
unittest.main()
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst
new file mode 100644
index 0000000..727427d
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-09-02-37-25.gh-issue-118750.7aLfT-.rst
@@ -0,0 +1 @@
+If the C version of the ``decimal`` module is available, ``int(str)`` now uses it to supply an asymptotically much faster conversion. However, this only applies if the string contains over about 2 million digits.