From 72186aa637bc88cd5f5e234803af64acab25994c Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 17 Mar 2023 14:06:52 -0500 Subject: Simplify and improve accuracy for subnormals in hypot() (GH-102785) --- Modules/mathmodule.c | 63 +++++++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c index ae9e321..48cd9a6 100644 --- a/Modules/mathmodule.c +++ b/Modules/mathmodule.c @@ -2498,7 +2498,7 @@ References: static inline double vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) { - double x, h, scale, oldcsum, csum = 1.0, frac1 = 0.0, frac2 = 0.0; + double x, h, scale, csum = 1.0, frac1 = 0.0, frac2 = 0.0; DoubleLength pr, sm; int max_e; Py_ssize_t i; @@ -2513,49 +2513,42 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan) return max; } frexp(max, &max_e); - if (max_e >= -1023) { - scale = ldexp(1.0, -max_e); - assert(max * scale >= 0.5); - assert(max * scale < 1.0); + if (max_e < -1023) { + /* When max_e < -1023, ldexp(1.0, -max_e) would overflow. + So we first perform lossless scaling from subnormals back to normals, + then recurse back to vector_norm(), and then finally undo the scaling. + */ for (i=0 ; i < n ; i++) { - x = vec[i]; - assert(Py_IS_FINITE(x) && fabs(x) <= max); + vec[i] /= DBL_MIN; + } + return DBL_MIN * vector_norm(n, vec, max / DBL_MIN, found_nan); + } + scale = ldexp(1.0, -max_e); + assert(max * scale >= 0.5); + assert(max * scale < 1.0); + for (i=0 ; i < n ; i++) { + x = vec[i]; + assert(Py_IS_FINITE(x) && fabs(x) <= max); - x *= scale; - assert(fabs(x) < 1.0); + x *= scale; + assert(fabs(x) < 1.0); - pr = dl_mul(x, x); - assert(pr.hi <= 1.0); + pr = dl_mul(x, x); + assert(pr.hi <= 1.0); - sm = dl_fast_sum(csum, pr.hi); - csum = sm.hi; - frac1 += pr.lo; - frac2 += sm.lo; - } - h = sqrt(csum - 1.0 + (frac1 + frac2)); - pr = dl_mul(-h, h); sm = dl_fast_sum(csum, pr.hi); csum = sm.hi; frac1 += pr.lo; frac2 += sm.lo; - x = csum - 1.0 + (frac1 + frac2); - return (h + x / (2.0 * h)) / scale; } - /* When max_e < -1023, ldexp(1.0, -max_e) overflows. - So instead of multiplying by a scale, we just divide by *max*. - */ - for (i=0 ; i < n ; i++) { - x = vec[i]; - assert(Py_IS_FINITE(x) && fabs(x) <= max); - x /= max; - x = x*x; - assert(x <= 1.0); - assert(fabs(csum) >= fabs(x)); - oldcsum = csum; - csum += x; - frac1 += (oldcsum - csum) + x; - } - return max * sqrt(csum - 1.0 + frac1); + h = sqrt(csum - 1.0 + (frac1 + frac2)); + pr = dl_mul(-h, h); + sm = dl_fast_sum(csum, pr.hi); + csum = sm.hi; + frac1 += pr.lo; + frac2 += sm.lo; + x = csum - 1.0 + (frac1 + frac2); + return (h + x / (2.0 * h)) / scale; } #define NUM_STACK_ELEMS 16 -- cgit v0.12