diff options
-rw-r--r-- | Misc/NEWS | 13 | ||||
-rw-r--r-- | Objects/longobject.c | 39 |
2 files changed, 40 insertions, 12 deletions
@@ -57,9 +57,16 @@ Type/class unification and new-style classes Core and builtins -- XXX Karatsuba multiplication. This is currently used if and only - if envar KARAT exists. It needs more correctness and speed testing, - the latter especially with unbalanced bit lengths. +- When multiplying very large integers, a version of the so-called + Karatsuba algorithm is now used. This is most effective if the + inputs have roughly the same size. If they both have about N digits, + Karatsuba multiplication has O(N**1.58) runtime (the exponent is + log_base_2(3)) instead of the previous O(N**2). Measured results may + be better or worse than that, depending on platform quirks. Note that + this is a simple implementation, and there's no intent here to compete + with, e.g., gmp. It simply gives a very nice speedup when it applies. + XXX Karatsuba multiplication can be slower when the inputs have very + XXX different sizes. - u'%c' will now raise a ValueError in case the argument is an integer outside the valid range of Unicode code point ordinals. diff --git a/Objects/longobject.c b/Objects/longobject.c index bf82d73..0eefc90 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -1645,7 +1645,23 @@ k_mul(PyLongObject *a, PyLongObject *b) if (kmul_split(a, shift, &ah, &al) < 0) goto fail; if (kmul_split(b, shift, &bh, &bl) < 0) goto fail; - /* Allocate result space. */ + /* The plan: + * 1. Allocate result space (asize + bsize digits: that's always + * enough). + * 2. Compute ah*bh, and copy into result at 2*shift. + * 3. Compute al*bl, and copy into result at 0. Note that this + * can't overlap with #2. + * 4. Subtract al*bl from the result, starting at shift. This may + * underflow (borrow out of the high digit), but we don't care: + * we're effectively doing unsigned arithmetic mod + * BASE**(sizea + sizeb), and so long as the *final* result fits, + * borrows and carries out of the high digit can be ignored. + * 5. Subtract ah*bh from the result, starting at shift. + * 6. Compute (ah+al)*(bh+bl), and add it into the result starting + * at shift. + */ + + /* 1. Allocate result space. */ ret = _PyLong_New(asize + bsize); if (ret == NULL) goto fail; #ifdef Py_DEBUG @@ -1653,7 +1669,7 @@ k_mul(PyLongObject *a, PyLongObject *b) memset(ret->ob_digit, 0xDF, ret->ob_size * sizeof(digit)); #endif - /* t1 <- ah*bh, and copy into high digits of result. */ + /* 2. t1 <- ah*bh, and copy into high digits of result. */ if ((t1 = k_mul(ah, bh)) == NULL) goto fail; assert(t1->ob_size >= 0); assert(2*shift + t1->ob_size <= ret->ob_size); @@ -1666,7 +1682,7 @@ k_mul(PyLongObject *a, PyLongObject *b) memset(ret->ob_digit + 2*shift + t1->ob_size, 0, i * sizeof(digit)); - /* t2 <- al*bl, and copy into the low digits. */ + /* 3. t2 <- al*bl, and copy into the low digits. */ if ((t2 = k_mul(al, bl)) == NULL) { Py_DECREF(t1); goto fail; @@ -1680,15 +1696,17 @@ k_mul(PyLongObject *a, PyLongObject *b) if (i) memset(ret->ob_digit + t2->ob_size, 0, i * sizeof(digit)); - /* Subtract ah*bh (t1) and al*bl (t2) from "the middle" digits. */ + /* 4 & 5. Subtract ah*bh (t1) and al*bl (t2). We do al*bl first + * because it's fresher in cache. + */ i = ret->ob_size - shift; /* # digits after shift */ - v_isub(ret->ob_digit + shift, i, t2->ob_digit, t2->ob_size); + (void)v_isub(ret->ob_digit + shift, i, t2->ob_digit, t2->ob_size); Py_DECREF(t2); - v_isub(ret->ob_digit + shift, i, t1->ob_digit, t1->ob_size); + (void)v_isub(ret->ob_digit + shift, i, t1->ob_digit, t1->ob_size); Py_DECREF(t1); - /* t3 <- (ah+al)(bh+bl) */ + /* 6. t3 <- (ah+al)(bh+bl), and add into result. */ if ((t1 = x_add(ah, al)) == NULL) goto fail; Py_DECREF(ah); Py_DECREF(al); @@ -1709,8 +1727,7 @@ k_mul(PyLongObject *a, PyLongObject *b) if (t3 == NULL) goto fail; /* Add t3. */ - v_iadd(ret->ob_digit + shift, ret->ob_size - shift, - t3->ob_digit, t3->ob_size); + (void)v_iadd(ret->ob_digit + shift, i, t3->ob_digit, t3->ob_size); Py_DECREF(t3); return long_normalize(ret); @@ -1743,10 +1760,14 @@ long_mul(PyLongObject *v, PyLongObject *w) return Py_NotImplemented; } +#if 0 if (Py_GETENV("KARAT") != NULL) z = k_mul(a, b); else z = x_mul(a, b); +#else + z = k_mul(a, b); +#endif if(z == NULL) { Py_DECREF(a); Py_DECREF(b); |