diff options
Diffstat (limited to 'Modules/_decimal/libmpdec/umodarith.h')
-rw-r--r-- | Modules/_decimal/libmpdec/umodarith.h | 650 |
1 files changed, 650 insertions, 0 deletions
diff --git a/Modules/_decimal/libmpdec/umodarith.h b/Modules/_decimal/libmpdec/umodarith.h new file mode 100644 index 0000000..a6aeceb --- /dev/null +++ b/Modules/_decimal/libmpdec/umodarith.h @@ -0,0 +1,650 @@ +/* + * Copyright (c) 2008-2012 Stefan Krah. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#ifndef UMODARITH_H +#define UMODARITH_H + + +#include "constants.h" +#include "mpdecimal.h" +#include "typearith.h" + + +/* Bignum: Low level routines for unsigned modular arithmetic. These are + used in the fast convolution functions for very large coefficients. */ + + +/**************************************************************************/ +/* ANSI modular arithmetic */ +/**************************************************************************/ + + +/* + * Restrictions: a < m and b < m + * ACL2 proof: umodarith.lisp: addmod-correct + */ +static inline mpd_uint_t +addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m) +{ + mpd_uint_t s; + + s = a + b; + s = (s < a) ? s - m : s; + s = (s >= m) ? s - m : s; + + return s; +} + +/* + * Restrictions: a < m and b < m + * ACL2 proof: umodarith.lisp: submod-2-correct + */ +static inline mpd_uint_t +submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m) +{ + mpd_uint_t d; + + d = a - b; + d = (a < b) ? d + m : d; + + return d; +} + +/* + * Restrictions: a < 2m and b < 2m + * ACL2 proof: umodarith.lisp: section ext-submod + */ +static inline mpd_uint_t +ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m) +{ + mpd_uint_t d; + + a = (a >= m) ? a - m : a; + b = (b >= m) ? b - m : b; + + d = a - b; + d = (a < b) ? d + m : d; + + return d; +} + +/* + * Reduce double word modulo m. + * Restrictions: m != 0 + * ACL2 proof: umodarith.lisp: section dw-reduce + */ +static inline mpd_uint_t +dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m) +{ + mpd_uint_t r1, r2, w; + + _mpd_div_word(&w, &r1, hi, m); + _mpd_div_words(&w, &r2, r1, lo, m); + + return r2; +} + +/* + * Subtract double word from a. + * Restrictions: a < m + * ACL2 proof: umodarith.lisp: section dw-submod + */ +static inline mpd_uint_t +dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m) +{ + mpd_uint_t d, r; + + r = dw_reduce(hi, lo, m); + d = a - r; + d = (a < r) ? d + m : d; + + return d; +} + +#ifdef CONFIG_64 + +/**************************************************************************/ +/* 64-bit modular arithmetic */ +/**************************************************************************/ + +/* + * A proof of the algorithm is in literature/mulmod-64.txt. An ACL2 + * proof is in umodarith.lisp: section "Fast modular reduction". + * + * Algorithm: calculate (a * b) % p: + * + * a) hi, lo <- a * b # Calculate a * b. + * + * b) hi, lo <- R(hi, lo) # Reduce modulo p. + * + * c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p. + * + * d) If the result is less than p, return lo. Otherwise return lo - p. + */ + +static inline mpd_uint_t +x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m) +{ + mpd_uint_t hi, lo, x, y; + + + _mpd_mul_words(&hi, &lo, a, b); + + if (m & (1ULL<<32)) { /* P1 */ + + /* first reduction */ + x = y = hi; + hi >>= 32; + + x = lo - x; + if (x > lo) hi--; + + y <<= 32; + lo = y + x; + if (lo < y) hi++; + + /* second reduction */ + x = y = hi; + hi >>= 32; + + x = lo - x; + if (x > lo) hi--; + + y <<= 32; + lo = y + x; + if (lo < y) hi++; + + return (hi || lo >= m ? lo - m : lo); + } + else if (m & (1ULL<<34)) { /* P2 */ + + /* first reduction */ + x = y = hi; + hi >>= 30; + + x = lo - x; + if (x > lo) hi--; + + y <<= 34; + lo = y + x; + if (lo < y) hi++; + + /* second reduction */ + x = y = hi; + hi >>= 30; + + x = lo - x; + if (x > lo) hi--; + + y <<= 34; + lo = y + x; + if (lo < y) hi++; + + /* third reduction */ + x = y = hi; + hi >>= 30; + + x = lo - x; + if (x > lo) hi--; + + y <<= 34; + lo = y + x; + if (lo < y) hi++; + + return (hi || lo >= m ? lo - m : lo); + } + else { /* P3 */ + + /* first reduction */ + x = y = hi; + hi >>= 24; + + x = lo - x; + if (x > lo) hi--; + + y <<= 40; + lo = y + x; + if (lo < y) hi++; + + /* second reduction */ + x = y = hi; + hi >>= 24; + + x = lo - x; + if (x > lo) hi--; + + y <<= 40; + lo = y + x; + if (lo < y) hi++; + + /* third reduction */ + x = y = hi; + hi >>= 24; + + x = lo - x; + if (x > lo) hi--; + + y <<= 40; + lo = y + x; + if (lo < y) hi++; + + return (hi || lo >= m ? lo - m : lo); + } +} + +static inline void +x64_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m) +{ + *a = x64_mulmod(*a, w, m); + *b = x64_mulmod(*b, w, m); +} + +static inline void +x64_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1, + mpd_uint_t m) +{ + *a0 = x64_mulmod(*a0, b0, m); + *a1 = x64_mulmod(*a1, b1, m); +} + +static inline mpd_uint_t +x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod) +{ + mpd_uint_t r = 1; + + while (exp > 0) { + if (exp & 1) + r = x64_mulmod(r, base, umod); + base = x64_mulmod(base, base, umod); + exp >>= 1; + } + + return r; +} + +/* END CONFIG_64 */ +#else /* CONFIG_32 */ + + +/**************************************************************************/ +/* 32-bit modular arithmetic */ +/**************************************************************************/ + +#if defined(ANSI) +#if !defined(LEGACY_COMPILER) +/* HAVE_UINT64_T */ +static inline mpd_uint_t +std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m) +{ + return ((mpd_uuint_t) a * b) % m; +} + +static inline void +std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m) +{ + *a = ((mpd_uuint_t) *a * w) % m; + *b = ((mpd_uuint_t) *b * w) % m; +} + +static inline void +std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1, + mpd_uint_t m) +{ + *a0 = ((mpd_uuint_t) *a0 * b0) % m; + *a1 = ((mpd_uuint_t) *a1 * b1) % m; +} +/* END HAVE_UINT64_T */ +#else +/* LEGACY_COMPILER */ +static inline mpd_uint_t +std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m) +{ + mpd_uint_t hi, lo, q, r; + _mpd_mul_words(&hi, &lo, a, b); + _mpd_div_words(&q, &r, hi, lo, m); + return r; +} + +static inline void +std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m) +{ + *a = std_mulmod(*a, w, m); + *b = std_mulmod(*b, w, m); +} + +static inline void +std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1, + mpd_uint_t m) +{ + *a0 = std_mulmod(*a0, b0, m); + *a1 = std_mulmod(*a1, b1, m); +} +/* END LEGACY_COMPILER */ +#endif + +static inline mpd_uint_t +std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod) +{ + mpd_uint_t r = 1; + + while (exp > 0) { + if (exp & 1) + r = std_mulmod(r, base, umod); + base = std_mulmod(base, base, umod); + exp >>= 1; + } + + return r; +} +#endif /* ANSI CONFIG_32 */ + + +/**************************************************************************/ +/* Pentium Pro modular arithmetic */ +/**************************************************************************/ + +/* + * A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU + * control word must be set to 64-bit precision and truncation mode + * prior to using these functions. + * + * Algorithm: calculate (a * b) % p: + * + * p := prime < 2**31 + * pinv := (long double)1.0 / p (precalculated) + * + * a) n = a * b # Calculate exact product. + * b) qest = n * pinv # Calculate estimate for q = n / p. + * c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient. + * d) r = n - q * p # Calculate remainder. + * + * Remarks: + * + * - p = dmod and pinv = dinvmod. + * - dinvmod points to an array of three uint32_t, which is interpreted + * as an 80 bit long double by fldt. + * - Intel compilers prior to version 11 do not seem to handle the + * __GNUC__ inline assembly correctly. + * - random tests are provided in tests/extended/ppro_mulmod.c + */ + +#if defined(PPRO) +#if defined(ASM) + +/* Return (a * b) % dmod */ +static inline mpd_uint_t +ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod) +{ + mpd_uint_t retval; + + __asm__ ( + "fildl %2\n\t" + "fildl %1\n\t" + "fmulp %%st, %%st(1)\n\t" + "fldt (%4)\n\t" + "fmul %%st(1), %%st\n\t" + "flds %5\n\t" + "fadd %%st, %%st(1)\n\t" + "fsubrp %%st, %%st(1)\n\t" + "fldl (%3)\n\t" + "fmulp %%st, %%st(1)\n\t" + "fsubrp %%st, %%st(1)\n\t" + "fistpl %0\n\t" + : "=m" (retval) + : "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63) + : "st", "memory" + ); + + return retval; +} + +/* + * Two modular multiplications in parallel: + * *a0 = (*a0 * w) % dmod + * *a1 = (*a1 * w) % dmod + */ +static inline void +ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w, + double *dmod, uint32_t *dinvmod) +{ + __asm__ ( + "fildl %2\n\t" + "fildl (%1)\n\t" + "fmul %%st(1), %%st\n\t" + "fxch %%st(1)\n\t" + "fildl (%0)\n\t" + "fmulp %%st, %%st(1) \n\t" + "fldt (%4)\n\t" + "flds %5\n\t" + "fld %%st(2)\n\t" + "fmul %%st(2)\n\t" + "fadd %%st(1)\n\t" + "fsub %%st(1)\n\t" + "fmull (%3)\n\t" + "fsubrp %%st, %%st(3)\n\t" + "fxch %%st(2)\n\t" + "fistpl (%0)\n\t" + "fmul %%st(2)\n\t" + "fadd %%st(1)\n\t" + "fsubp %%st, %%st(1)\n\t" + "fmull (%3)\n\t" + "fsubrp %%st, %%st(1)\n\t" + "fistpl (%1)\n\t" + : : "r" (a0), "r" (a1), "m" (w), + "r" (dmod), "r" (dinvmod), + "m" (MPD_TWO63) + : "st", "memory" + ); +} + +/* + * Two modular multiplications in parallel: + * *a0 = (*a0 * b0) % dmod + * *a1 = (*a1 * b1) % dmod + */ +static inline void +ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1, + double *dmod, uint32_t *dinvmod) +{ + __asm__ ( + "fildl %3\n\t" + "fildl (%2)\n\t" + "fmulp %%st, %%st(1)\n\t" + "fildl %1\n\t" + "fildl (%0)\n\t" + "fmulp %%st, %%st(1)\n\t" + "fldt (%5)\n\t" + "fld %%st(2)\n\t" + "fmul %%st(1), %%st\n\t" + "fxch %%st(1)\n\t" + "fmul %%st(2), %%st\n\t" + "flds %6\n\t" + "fldl (%4)\n\t" + "fxch %%st(3)\n\t" + "fadd %%st(1), %%st\n\t" + "fxch %%st(2)\n\t" + "fadd %%st(1), %%st\n\t" + "fxch %%st(2)\n\t" + "fsub %%st(1), %%st\n\t" + "fxch %%st(2)\n\t" + "fsubp %%st, %%st(1)\n\t" + "fxch %%st(1)\n\t" + "fmul %%st(2), %%st\n\t" + "fxch %%st(1)\n\t" + "fmulp %%st, %%st(2)\n\t" + "fsubrp %%st, %%st(3)\n\t" + "fsubrp %%st, %%st(1)\n\t" + "fxch %%st(1)\n\t" + "fistpl (%2)\n\t" + "fistpl (%0)\n\t" + : : "r" (a0), "m" (b0), "r" (a1), "m" (b1), + "r" (dmod), "r" (dinvmod), + "m" (MPD_TWO63) + : "st", "memory" + ); +} +/* END PPRO GCC ASM */ +#elif defined(MASM) + +/* Return (a * b) % dmod */ +static inline mpd_uint_t __cdecl +ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod) +{ + mpd_uint_t retval; + + __asm { + mov eax, dinvmod + mov edx, dmod + fild b + fild a + fmulp st(1), st + fld TBYTE PTR [eax] + fmul st, st(1) + fld MPD_TWO63 + fadd st(1), st + fsubp st(1), st + fld QWORD PTR [edx] + fmulp st(1), st + fsubp st(1), st + fistp retval + } + + return retval; +} + +/* + * Two modular multiplications in parallel: + * *a0 = (*a0 * w) % dmod + * *a1 = (*a1 * w) % dmod + */ +static inline mpd_uint_t __cdecl +ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w, + double *dmod, uint32_t *dinvmod) +{ + __asm { + mov ecx, dmod + mov edx, a1 + mov ebx, dinvmod + mov eax, a0 + fild w + fild DWORD PTR [edx] + fmul st, st(1) + fxch st(1) + fild DWORD PTR [eax] + fmulp st(1), st + fld TBYTE PTR [ebx] + fld MPD_TWO63 + fld st(2) + fmul st, st(2) + fadd st, st(1) + fsub st, st(1) + fmul QWORD PTR [ecx] + fsubp st(3), st + fxch st(2) + fistp DWORD PTR [eax] + fmul st, st(2) + fadd st, st(1) + fsubrp st(1), st + fmul QWORD PTR [ecx] + fsubp st(1), st + fistp DWORD PTR [edx] + } +} + +/* + * Two modular multiplications in parallel: + * *a0 = (*a0 * b0) % dmod + * *a1 = (*a1 * b1) % dmod + */ +static inline void __cdecl +ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1, + double *dmod, uint32_t *dinvmod) +{ + __asm { + mov ecx, dmod + mov edx, a1 + mov ebx, dinvmod + mov eax, a0 + fild b1 + fild DWORD PTR [edx] + fmulp st(1), st + fild b0 + fild DWORD PTR [eax] + fmulp st(1), st + fld TBYTE PTR [ebx] + fld st(2) + fmul st, st(1) + fxch st(1) + fmul st, st(2) + fld DWORD PTR MPD_TWO63 + fld QWORD PTR [ecx] + fxch st(3) + fadd st, st(1) + fxch st(2) + fadd st, st(1) + fxch st(2) + fsub st, st(1) + fxch st(2) + fsubrp st(1), st + fxch st(1) + fmul st, st(2) + fxch st(1) + fmulp st(2), st + fsubp st(3), st + fsubp st(1), st + fxch st(1) + fistp DWORD PTR [edx] + fistp DWORD PTR [eax] + } +} +#endif /* PPRO MASM (_MSC_VER) */ + + +/* Return (base ** exp) % dmod */ +static inline mpd_uint_t +ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod) +{ + mpd_uint_t r = 1; + + while (exp > 0) { + if (exp & 1) + r = ppro_mulmod(r, base, dmod, dinvmod); + base = ppro_mulmod(base, base, dmod, dinvmod); + exp >>= 1; + } + + return r; +} +#endif /* PPRO */ +#endif /* CONFIG_32 */ + + +#endif /* UMODARITH_H */ + + + |