diff options
author | Gregory P. Smith <greg@krypto.org> | 2022-01-23 10:00:41 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-23 10:00:41 (GMT) |
commit | c7f20f1cc8c20654e5d539552604362feb9b0512 (patch) | |
tree | 944c92342905e40464f231804208c4bc05d83404 /Objects/longobject.c | |
parent | 83a0ef2162aa379071e243f1b696aa6814edcd2a (diff) | |
download | cpython-c7f20f1cc8c20654e5d539552604362feb9b0512.zip cpython-c7f20f1cc8c20654e5d539552604362feb9b0512.tar.gz cpython-c7f20f1cc8c20654e5d539552604362feb9b0512.tar.bz2 |
bpo-46406: Faster single digit int division. (#30626)
* bpo-46406: Faster single digit int division.
This expresses the algorithm in a more basic manner resulting in better
instruction generation by todays compilers.
See https://mail.python.org/archives/list/python-dev@python.org/thread/ZICIMX5VFCX4IOFH5NUPVHCUJCQ4Q7QM/#NEUNFZU3TQU4CPTYZNF3WCN7DOJBBTK5
Diffstat (limited to 'Objects/longobject.c')
-rw-r--r-- | Objects/longobject.c | 36 |
1 files changed, 26 insertions, 10 deletions
diff --git a/Objects/longobject.c b/Objects/longobject.c index 7721f40..ee20e26 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -1617,25 +1617,41 @@ v_rshift(digit *z, digit *a, Py_ssize_t m, int d) in pout, and returning the remainder. pin and pout point at the LSD. It's OK for pin == pout on entry, which saves oodles of mallocs/frees in _PyLong_Format, but that should be done with great care since ints are - immutable. */ + immutable. + This version of the code can be 20% faster than the pre-2022 version + on todays compilers on architectures like amd64. It evolved from Mark + Dickinson observing that a 128:64 divide instruction was always being + generated by the compiler despite us working with 30-bit digit values. + See the thread for full context: + + https://mail.python.org/archives/list/python-dev@python.org/thread/ZICIMX5VFCX4IOFH5NUPVHCUJCQ4Q7QM/#NEUNFZU3TQU4CPTYZNF3WCN7DOJBBTK5 + + If you ever want to change this code, pay attention to performance using + different compilers, optimization levels, and cpu architectures. Beware of + PGO/FDO builds doing value specialization such as a fast path for //10. :) + + Verify that 17 isn't specialized and this works as a quick test: + python -m timeit -s 'x = 10**1000; r=x//10; assert r == 10**999, r' 'x//17' +*/ static digit inplace_divrem1(digit *pout, digit *pin, Py_ssize_t size, digit n) { - twodigits rem = 0; + digit remainder = 0; assert(n > 0 && n <= PyLong_MASK); - pin += size; - pout += size; while (--size >= 0) { - digit hi; - rem = (rem << PyLong_SHIFT) | *--pin; - *--pout = hi = (digit)(rem / n); - rem -= (twodigits)hi * n; - } - return (digit)rem; + twodigits dividend; + dividend = ((twodigits)remainder << PyLong_SHIFT) | pin[size]; + digit quotient; + quotient = (digit)(dividend / n); + remainder = dividend % n; + pout[size] = quotient; + } + return remainder; } + /* Divide an integer by a digit, returning both the quotient (as function result) and the remainder (through *prem). The sign of a is ignored; n should not be zero. */ |