From 0a18ee4be7ba215f414bef04598e0849504f9f1e Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Sat, 24 Aug 2019 07:20:30 +0900 Subject: bpo-37798: Add C fastpath for statistics.NormalDist.inv_cdf() (GH-15266) --- Lib/statistics.py | 155 +++++++++++---------- .../2019-08-14-13-51-24.bpo-37798.AmXrik.rst | 1 + Modules/Setup | 1 + Modules/_statisticsmodule.c | 122 ++++++++++++++++ Modules/clinic/_statisticsmodule.c.h | 50 +++++++ PC/config.c | 2 + PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 + setup.py | 2 + 9 files changed, 264 insertions(+), 73 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-08-14-13-51-24.bpo-37798.AmXrik.rst create mode 100644 Modules/_statisticsmodule.c create mode 100644 Modules/clinic/_statisticsmodule.c.h diff --git a/Lib/statistics.py b/Lib/statistics.py index 77291dd6..c7d6568 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -824,6 +824,81 @@ def pstdev(data, mu=None): ## Normal Distribution ##################################################### + +def _normal_dist_inv_cdf(p, mu, sigma): + # There is no closed-form solution to the inverse CDF for the normal + # distribution, so we use a rational approximation instead: + # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the + # Normal Distribution". Applied Statistics. Blackwell Publishing. 37 + # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330. + q = p - 0.5 + if fabs(q) <= 0.425: + r = 0.180625 - q * q + # Hash sum: 55.88319_28806_14901_4439 + num = (((((((2.50908_09287_30122_6727e+3 * r + + 3.34305_75583_58812_8105e+4) * r + + 6.72657_70927_00870_0853e+4) * r + + 4.59219_53931_54987_1457e+4) * r + + 1.37316_93765_50946_1125e+4) * r + + 1.97159_09503_06551_4427e+3) * r + + 1.33141_66789_17843_7745e+2) * r + + 3.38713_28727_96366_6080e+0) * q + den = (((((((5.22649_52788_52854_5610e+3 * r + + 2.87290_85735_72194_2674e+4) * r + + 3.93078_95800_09271_0610e+4) * r + + 2.12137_94301_58659_5867e+4) * r + + 5.39419_60214_24751_1077e+3) * r + + 6.87187_00749_20579_0830e+2) * r + + 4.23133_30701_60091_1252e+1) * r + + 1.0) + x = num / den + return mu + (x * sigma) + r = p if q <= 0.0 else 1.0 - p + r = sqrt(-log(r)) + if r <= 5.0: + r = r - 1.6 + # Hash sum: 49.33206_50330_16102_89036 + num = (((((((7.74545_01427_83414_07640e-4 * r + + 2.27238_44989_26918_45833e-2) * r + + 2.41780_72517_74506_11770e-1) * r + + 1.27045_82524_52368_38258e+0) * r + + 3.64784_83247_63204_60504e+0) * r + + 5.76949_72214_60691_40550e+0) * r + + 4.63033_78461_56545_29590e+0) * r + + 1.42343_71107_49683_57734e+0) + den = (((((((1.05075_00716_44416_84324e-9 * r + + 5.47593_80849_95344_94600e-4) * r + + 1.51986_66563_61645_71966e-2) * r + + 1.48103_97642_74800_74590e-1) * r + + 6.89767_33498_51000_04550e-1) * r + + 1.67638_48301_83803_84940e+0) * r + + 2.05319_16266_37758_82187e+0) * r + + 1.0) + else: + r = r - 5.0 + # Hash sum: 47.52583_31754_92896_71629 + num = (((((((2.01033_43992_92288_13265e-7 * r + + 2.71155_55687_43487_57815e-5) * r + + 1.24266_09473_88078_43860e-3) * r + + 2.65321_89526_57612_30930e-2) * r + + 2.96560_57182_85048_91230e-1) * r + + 1.78482_65399_17291_33580e+0) * r + + 5.46378_49111_64114_36990e+0) * r + + 6.65790_46435_01103_77720e+0) + den = (((((((2.04426_31033_89939_78564e-15 * r + + 1.42151_17583_16445_88870e-7) * r + + 1.84631_83175_10054_68180e-5) * r + + 7.86869_13114_56132_59100e-4) * r + + 1.48753_61290_85061_48525e-2) * r + + 1.36929_88092_27358_05310e-1) * r + + 5.99832_20655_58879_37690e-1) * r + + 1.0) + x = num / den + if q < 0.0: + x = -x + return mu + (x * sigma) + + class NormalDist: "Normal distribution of a random variable" # https://en.wikipedia.org/wiki/Normal_distribution @@ -882,79 +957,7 @@ class NormalDist: raise StatisticsError('p must be in the range 0.0 < p < 1.0') if self._sigma <= 0.0: raise StatisticsError('cdf() not defined when sigma at or below zero') - - # There is no closed-form solution to the inverse CDF for the normal - # distribution, so we use a rational approximation instead: - # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the - # Normal Distribution". Applied Statistics. Blackwell Publishing. 37 - # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330. - - q = p - 0.5 - if fabs(q) <= 0.425: - r = 0.180625 - q * q - # Hash sum: 55.88319_28806_14901_4439 - num = (((((((2.50908_09287_30122_6727e+3 * r + - 3.34305_75583_58812_8105e+4) * r + - 6.72657_70927_00870_0853e+4) * r + - 4.59219_53931_54987_1457e+4) * r + - 1.37316_93765_50946_1125e+4) * r + - 1.97159_09503_06551_4427e+3) * r + - 1.33141_66789_17843_7745e+2) * r + - 3.38713_28727_96366_6080e+0) * q - den = (((((((5.22649_52788_52854_5610e+3 * r + - 2.87290_85735_72194_2674e+4) * r + - 3.93078_95800_09271_0610e+4) * r + - 2.12137_94301_58659_5867e+4) * r + - 5.39419_60214_24751_1077e+3) * r + - 6.87187_00749_20579_0830e+2) * r + - 4.23133_30701_60091_1252e+1) * r + - 1.0) - x = num / den - return self._mu + (x * self._sigma) - r = p if q <= 0.0 else 1.0 - p - r = sqrt(-log(r)) - if r <= 5.0: - r = r - 1.6 - # Hash sum: 49.33206_50330_16102_89036 - num = (((((((7.74545_01427_83414_07640e-4 * r + - 2.27238_44989_26918_45833e-2) * r + - 2.41780_72517_74506_11770e-1) * r + - 1.27045_82524_52368_38258e+0) * r + - 3.64784_83247_63204_60504e+0) * r + - 5.76949_72214_60691_40550e+0) * r + - 4.63033_78461_56545_29590e+0) * r + - 1.42343_71107_49683_57734e+0) - den = (((((((1.05075_00716_44416_84324e-9 * r + - 5.47593_80849_95344_94600e-4) * r + - 1.51986_66563_61645_71966e-2) * r + - 1.48103_97642_74800_74590e-1) * r + - 6.89767_33498_51000_04550e-1) * r + - 1.67638_48301_83803_84940e+0) * r + - 2.05319_16266_37758_82187e+0) * r + - 1.0) - else: - r = r - 5.0 - # Hash sum: 47.52583_31754_92896_71629 - num = (((((((2.01033_43992_92288_13265e-7 * r + - 2.71155_55687_43487_57815e-5) * r + - 1.24266_09473_88078_43860e-3) * r + - 2.65321_89526_57612_30930e-2) * r + - 2.96560_57182_85048_91230e-1) * r + - 1.78482_65399_17291_33580e+0) * r + - 5.46378_49111_64114_36990e+0) * r + - 6.65790_46435_01103_77720e+0) - den = (((((((2.04426_31033_89939_78564e-15 * r + - 1.42151_17583_16445_88870e-7) * r + - 1.84631_83175_10054_68180e-5) * r + - 7.86869_13114_56132_59100e-4) * r + - 1.48753_61290_85061_48525e-2) * r + - 1.36929_88092_27358_05310e-1) * r + - 5.99832_20655_58879_37690e-1) * r + - 1.0) - x = num / den - if q < 0.0: - x = -x - return self._mu + (x * self._sigma) + return _normal_dist_inv_cdf(p, self._mu, self._sigma) def overlap(self, other): """Compute the overlapping coefficient (OVL) between two normal distributions. @@ -1078,6 +1081,12 @@ class NormalDist: def __repr__(self): return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})' +# If available, use C implementation +try: + from _statistics import _normal_dist_inv_cdf +except ImportError: + pass + if __name__ == '__main__': diff --git a/Misc/NEWS.d/next/Library/2019-08-14-13-51-24.bpo-37798.AmXrik.rst b/Misc/NEWS.d/next/Library/2019-08-14-13-51-24.bpo-37798.AmXrik.rst new file mode 100644 index 0000000..620f0ec --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-08-14-13-51-24.bpo-37798.AmXrik.rst @@ -0,0 +1 @@ +Add C fastpath for statistics.NormalDist.inv_cdf() Patch by Dong-hee Na diff --git a/Modules/Setup b/Modules/Setup index ed5ee6c..983fa01 100644 --- a/Modules/Setup +++ b/Modules/Setup @@ -182,6 +182,7 @@ _symtable symtablemodule.c #_heapq _heapqmodule.c # Heap queue algorithm #_asyncio _asynciomodule.c # Fast asyncio Future #_json -I$(srcdir)/Include/internal -DPy_BUILD_CORE_BUILTIN _json.c # _json speedups +#_statistics _statisticsmodule.c # statistics accelerator #unicodedata unicodedata.c # static Unicode character database diff --git a/Modules/_statisticsmodule.c b/Modules/_statisticsmodule.c new file mode 100644 index 0000000..78ec90a --- /dev/null +++ b/Modules/_statisticsmodule.c @@ -0,0 +1,122 @@ +/* statistics accelerator C extensor: _statistics module. */ + +#include "Python.h" +#include "structmember.h" +#include "clinic/_statisticsmodule.c.h" + +/*[clinic input] +module _statistics + +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=864a6f59b76123b2]*/ + + +static PyMethodDef speedups_methods[] = { + _STATISTICS__NORMAL_DIST_INV_CDF_METHODDEF + {NULL, NULL, 0, NULL} +}; + +/*[clinic input] +_statistics._normal_dist_inv_cdf -> double + p: double + mu: double + sigma: double + / +[clinic start generated code]*/ + +static double +_statistics__normal_dist_inv_cdf_impl(PyObject *module, double p, double mu, + double sigma) +/*[clinic end generated code: output=02fd19ddaab36602 input=24715a74be15296a]*/ +{ + double q, num, den, r, x; + q = p - 0.5; + // Algorithm AS 241: The Percentage Points of the Normal Distribution + if(fabs(q) <= 0.425) { + r = 0.180625 - q * q; + // Hash sum AB: 55.88319 28806 14901 4439 + num = (((((((2.5090809287301226727e+3 * r + + 3.3430575583588128105e+4) * r + + 6.7265770927008700853e+4) * r + + 4.5921953931549871457e+4) * r + + 1.3731693765509461125e+4) * r + + 1.9715909503065514427e+3) * r + + 1.3314166789178437745e+2) * r + + 3.3871328727963666080e+0) * q; + den = (((((((5.2264952788528545610e+3 * r + + 2.8729085735721942674e+4) * r + + 3.9307895800092710610e+4) * r + + 2.1213794301586595867e+4) * r + + 5.3941960214247511077e+3) * r + + 6.8718700749205790830e+2) * r + + 4.2313330701600911252e+1) * r + + 1.0); + x = num / den; + return mu + (x * sigma); + } + r = q <= 0.0? p : 1.0-p; + r = sqrt(-log(r)); + if (r <= 5.0) { + r = r - 1.6; + // Hash sum CD: 49.33206 50330 16102 89036 + num = (((((((7.74545014278341407640e-4 * r + + 2.27238449892691845833e-2) * r + + 2.41780725177450611770e-1) * r + + 1.27045825245236838258e+0) * r + + 3.64784832476320460504e+0) * r + + 5.76949722146069140550e+0) * r + + 4.63033784615654529590e+0) * r + + 1.42343711074968357734e+0); + den = (((((((1.05075007164441684324e-9 * r + + 5.47593808499534494600e-4) * r + + 1.51986665636164571966e-2) * r + + 1.48103976427480074590e-1) * r + + 6.89767334985100004550e-1) * r + + 1.67638483018380384940e+0) * r + + 2.05319162663775882187e+0) * r + + 1.0); + } else { + r -= 5.0; + // Hash sum EF: 47.52583 31754 92896 71629 + num = (((((((2.01033439929228813265e-7 * r + + 2.71155556874348757815e-5) * r + + 1.24266094738807843860e-3) * r + + 2.65321895265761230930e-2) * r + + 2.96560571828504891230e-1) * r + + 1.78482653991729133580e+0) * r + + 5.46378491116411436990e+0) * r + + 6.65790464350110377720e+0); + den = (((((((2.04426310338993978564e-15 * r + + 1.42151175831644588870e-7) * r + + 1.84631831751005468180e-5) * r + + 7.86869131145613259100e-4) * r + + 1.48753612908506148525e-2) * r + + 1.36929880922735805310e-1) * r + + 5.99832206555887937690e-1) * r + + 1.0); + } + x = num / den; + if (q < 0.0) x = -x; + return mu + (x * sigma); +} + +static struct PyModuleDef statisticsmodule = { + PyModuleDef_HEAD_INIT, + "_statistics", + _statistics__normal_dist_inv_cdf__doc__, + -1, + speedups_methods, + NULL, + NULL, + NULL, + NULL +}; + + +PyMODINIT_FUNC +PyInit__statistics(void) +{ + PyObject *m = PyModule_Create(&statisticsmodule); + if (!m) return NULL; + return m; +} diff --git a/Modules/clinic/_statisticsmodule.c.h b/Modules/clinic/_statisticsmodule.c.h new file mode 100644 index 0000000..f5a2e46 --- /dev/null +++ b/Modules/clinic/_statisticsmodule.c.h @@ -0,0 +1,50 @@ +/*[clinic input] +preserve +[clinic start generated code]*/ + +PyDoc_STRVAR(_statistics__normal_dist_inv_cdf__doc__, +"_normal_dist_inv_cdf($module, p, mu, sigma, /)\n" +"--\n" +"\n"); + +#define _STATISTICS__NORMAL_DIST_INV_CDF_METHODDEF \ + {"_normal_dist_inv_cdf", (PyCFunction)(void(*)(void))_statistics__normal_dist_inv_cdf, METH_FASTCALL, _statistics__normal_dist_inv_cdf__doc__}, + +static double +_statistics__normal_dist_inv_cdf_impl(PyObject *module, double p, double mu, + double sigma); + +static PyObject * +_statistics__normal_dist_inv_cdf(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + double p; + double mu; + double sigma; + double _return_value; + + if (!_PyArg_CheckPositional("_normal_dist_inv_cdf", nargs, 3, 3)) { + goto exit; + } + p = PyFloat_AsDouble(args[0]); + if (PyErr_Occurred()) { + goto exit; + } + mu = PyFloat_AsDouble(args[1]); + if (PyErr_Occurred()) { + goto exit; + } + sigma = PyFloat_AsDouble(args[2]); + if (PyErr_Occurred()) { + goto exit; + } + _return_value = _statistics__normal_dist_inv_cdf_impl(module, p, mu, sigma); + if ((_return_value == -1.0) && PyErr_Occurred()) { + goto exit; + } + return_value = PyFloat_FromDouble(_return_value); + +exit: + return return_value; +} +/*[clinic end generated code: output=ba6af124acd34732 input=a9049054013a1b77]*/ diff --git a/PC/config.c b/PC/config.c index 6f34962..8eaeb31 100644 --- a/PC/config.c +++ b/PC/config.c @@ -23,6 +23,7 @@ extern PyObject* PyInit__sha1(void); extern PyObject* PyInit__sha256(void); extern PyObject* PyInit__sha512(void); extern PyObject* PyInit__sha3(void); +extern PyObject* PyInit__statistics(void); extern PyObject* PyInit__blake2(void); extern PyObject* PyInit_time(void); extern PyObject* PyInit__thread(void); @@ -103,6 +104,7 @@ struct _inittab _PyImport_Inittab[] = { {"_blake2", PyInit__blake2}, {"time", PyInit_time}, {"_thread", PyInit__thread}, + {"_statistics", PyInit__statistics}, #ifdef WIN32 {"msvcrt", PyInit_msvcrt}, {"_locale", PyInit__locale}, diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 4fd2607..1c055b6 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -333,6 +333,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 2d09e9f..dbff89f 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -611,6 +611,9 @@ Modules + + Modules + Modules diff --git a/setup.py b/setup.py index 7cd7725..02f523c 100644 --- a/setup.py +++ b/setup.py @@ -785,6 +785,8 @@ class PyBuildExt(build_ext): self.add(Extension("_abc", ["_abc.c"])) # _queue module self.add(Extension("_queue", ["_queuemodule.c"])) + # _statistics module + self.add(Extension("_statistics", ["_statisticsmodule.c"])) # Modules with some UNIX dependencies -- on by default: # (If you have a really backward UNIX, select and socket may not be -- cgit v0.12