summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorGregory P. Smith <greg@krypto.org>2022-01-23 10:00:41 (GMT)
committerGitHub <noreply@github.com>2022-01-23 10:00:41 (GMT)
commitc7f20f1cc8c20654e5d539552604362feb9b0512 (patch)
tree944c92342905e40464f231804208c4bc05d83404 /Objects
parent83a0ef2162aa379071e243f1b696aa6814edcd2a (diff)
downloadcpython-c7f20f1cc8c20654e5d539552604362feb9b0512.zip
cpython-c7f20f1cc8c20654e5d539552604362feb9b0512.tar.gz
cpython-c7f20f1cc8c20654e5d539552604362feb9b0512.tar.bz2
bpo-46406: Faster single digit int division. (#30626)
* bpo-46406: Faster single digit int division. This expresses the algorithm in a more basic manner resulting in better instruction generation by todays compilers. See https://mail.python.org/archives/list/python-dev@python.org/thread/ZICIMX5VFCX4IOFH5NUPVHCUJCQ4Q7QM/#NEUNFZU3TQU4CPTYZNF3WCN7DOJBBTK5
Diffstat (limited to 'Objects')
-rw-r--r--Objects/longobject.c36
1 files changed, 26 insertions, 10 deletions
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 7721f40..ee20e26 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -1617,25 +1617,41 @@ v_rshift(digit *z, digit *a, Py_ssize_t m, int d)
in pout, and returning the remainder. pin and pout point at the LSD.
It's OK for pin == pout on entry, which saves oodles of mallocs/frees in
_PyLong_Format, but that should be done with great care since ints are
- immutable. */
+ immutable.
+ This version of the code can be 20% faster than the pre-2022 version
+ on todays compilers on architectures like amd64. It evolved from Mark
+ Dickinson observing that a 128:64 divide instruction was always being
+ generated by the compiler despite us working with 30-bit digit values.
+ See the thread for full context:
+
+ https://mail.python.org/archives/list/python-dev@python.org/thread/ZICIMX5VFCX4IOFH5NUPVHCUJCQ4Q7QM/#NEUNFZU3TQU4CPTYZNF3WCN7DOJBBTK5
+
+ If you ever want to change this code, pay attention to performance using
+ different compilers, optimization levels, and cpu architectures. Beware of
+ PGO/FDO builds doing value specialization such as a fast path for //10. :)
+
+ Verify that 17 isn't specialized and this works as a quick test:
+ python -m timeit -s 'x = 10**1000; r=x//10; assert r == 10**999, r' 'x//17'
+*/
static digit
inplace_divrem1(digit *pout, digit *pin, Py_ssize_t size, digit n)
{
- twodigits rem = 0;
+ digit remainder = 0;
assert(n > 0 && n <= PyLong_MASK);
- pin += size;
- pout += size;
while (--size >= 0) {
- digit hi;
- rem = (rem << PyLong_SHIFT) | *--pin;
- *--pout = hi = (digit)(rem / n);
- rem -= (twodigits)hi * n;
- }
- return (digit)rem;
+ twodigits dividend;
+ dividend = ((twodigits)remainder << PyLong_SHIFT) | pin[size];
+ digit quotient;
+ quotient = (digit)(dividend / n);
+ remainder = dividend % n;
+ pout[size] = quotient;
+ }
+ return remainder;
}
+
/* Divide an integer by a digit, returning both the quotient
(as function result) and the remainder (through *prem).
The sign of a is ignored; n should not be zero. */