From 2f233fceae9a0c5e66e439bc0169b36547ba47c3 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 22 Apr 2022 07:46:26 -0700 Subject: gh-89279: In ceval.c, redefine some macros for speed (#32387) Macros Py_DECREF, Py_XDECREF, Py_IS_TYPE, _Py_atomic_load_32bit_impl and _Py_DECREF_SPECIALIZED are redefined as macros that completely replace the inline functions of the same name. These three came out in the top four of functions that (in MSVC) somehow weren't inlined. Co-authored-by: Victor Stinner Co-authored-by: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com> --- .../2022-04-13-22-03-04.gh-issue-89279.-jAVxZ.rst | 1 + Python/ceval.c | 73 ++++++++++++++++++---- 2 files changed, 61 insertions(+), 13 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-04-13-22-03-04.gh-issue-89279.-jAVxZ.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-04-13-22-03-04.gh-issue-89279.-jAVxZ.rst b/Misc/NEWS.d/next/Core and Builtins/2022-04-13-22-03-04.gh-issue-89279.-jAVxZ.rst new file mode 100644 index 0000000..66b1c88 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-04-13-22-03-04.gh-issue-89279.-jAVxZ.rst @@ -0,0 +1 @@ +Improve interpreter performance on Windows by inlining a few specific macros. diff --git a/Python/ceval.c b/Python/ceval.c index a80583f..90068bc 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -44,6 +44,55 @@ # error "ceval.c must be build with Py_BUILD_CORE define for best performance" #endif +#ifndef Py_DEBUG +// GH-89279: The MSVC compiler does not inline these static inline functions +// in PGO build in _PyEval_EvalFrameDefault(), because this function is over +// the limit of PGO, and that limit cannot be configured. +// Define them as macros to make sure that they are always inlined by the +// preprocessor. + +#undef Py_DECREF +#define Py_DECREF(arg) \ + do { \ + PyObject *op = _PyObject_CAST(arg); \ + if (--op->ob_refcnt == 0) { \ + destructor dealloc = Py_TYPE(op)->tp_dealloc; \ + (*dealloc)(op); \ + } \ + } while (0) + +#undef Py_XDECREF +#define Py_XDECREF(arg) \ + do { \ + PyObject *xop = _PyObject_CAST(arg); \ + if (xop != NULL) { \ + Py_DECREF(xop); \ + } \ + } while (0) + +#undef Py_IS_TYPE +#define Py_IS_TYPE(ob, type) \ + (_PyObject_CAST(ob)->ob_type == (type)) + +#undef _Py_DECREF_SPECIALIZED +#define _Py_DECREF_SPECIALIZED(arg, dealloc) \ + do { \ + PyObject *op = _PyObject_CAST(arg); \ + if (--op->ob_refcnt == 0) { \ + destructor d = (destructor)(dealloc); \ + d(op); \ + } \ + } while (0) +#endif + +// GH-89279: Similar to above, force inlining by using a macro. +#if defined(_MSC_VER) && SIZEOF_INT == 4 +#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) (assert(sizeof((ATOMIC_VAL)->_value) == 4), *((volatile int*)&((ATOMIC_VAL)->_value))) +#else +#define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL) +#endif + + /* Forward declarations */ static PyObject *trace_call_function( PyThreadState *tstate, PyObject *callable, PyObject **stack, @@ -192,10 +241,10 @@ COMPUTE_EVAL_BREAKER(PyInterpreterState *interp, struct _ceval_state *ceval2) { _Py_atomic_store_relaxed(&ceval2->eval_breaker, - _Py_atomic_load_relaxed(&ceval2->gil_drop_request) - | (_Py_atomic_load_relaxed(&ceval->signals_pending) + _Py_atomic_load_relaxed_int32(&ceval2->gil_drop_request) + | (_Py_atomic_load_relaxed_int32(&ceval->signals_pending) && _Py_ThreadCanHandleSignals(interp)) - | (_Py_atomic_load_relaxed(&ceval2->pending.calls_to_do) + | (_Py_atomic_load_relaxed_int32(&ceval2->pending.calls_to_do) && _Py_ThreadCanHandlePendingCalls()) | ceval2->pending.async_exc); } @@ -740,7 +789,7 @@ _Py_FinishPendingCalls(PyThreadState *tstate) struct _pending_calls *pending = &tstate->interp->ceval.pending; - if (!_Py_atomic_load_relaxed(&(pending->calls_to_do))) { + if (!_Py_atomic_load_relaxed_int32(&(pending->calls_to_do))) { return; } @@ -1187,7 +1236,7 @@ eval_frame_handle_pending(PyThreadState *tstate) struct _ceval_runtime_state *ceval = &runtime->ceval; /* Pending signals */ - if (_Py_atomic_load_relaxed(&ceval->signals_pending)) { + if (_Py_atomic_load_relaxed_int32(&ceval->signals_pending)) { if (handle_signals(tstate) != 0) { return -1; } @@ -1195,14 +1244,14 @@ eval_frame_handle_pending(PyThreadState *tstate) /* Pending calls */ struct _ceval_state *ceval2 = &tstate->interp->ceval; - if (_Py_atomic_load_relaxed(&ceval2->pending.calls_to_do)) { + if (_Py_atomic_load_relaxed_int32(&ceval2->pending.calls_to_do)) { if (make_pending_calls(tstate->interp) != 0) { return -1; } } /* GIL drop request */ - if (_Py_atomic_load_relaxed(&ceval2->gil_drop_request)) { + if (_Py_atomic_load_relaxed_int32(&ceval2->gil_drop_request)) { /* Give another thread a chance */ if (_PyThreadState_Swap(&runtime->gilstate, NULL) != tstate) { Py_FatalError("tstate mix-up"); @@ -1360,7 +1409,7 @@ eval_frame_handle_pending(PyThreadState *tstate) #define CHECK_EVAL_BREAKER() \ _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY(); \ - if (_Py_atomic_load_relaxed(eval_breaker)) { \ + if (_Py_atomic_load_relaxed_int32(eval_breaker)) { \ goto handle_eval_breaker; \ } @@ -1640,10 +1689,8 @@ typedef struct { PyObject *kwnames; } CallShape; -static inline bool -is_method(PyObject **stack_pointer, int args) { - return PEEK(args+2) != NULL; -} +// GH-89279: Must be a macro to be sure it's inlined by MSVC. +#define is_method(stack_pointer, args) (PEEK((args)+2) != NULL) #define KWNAMES_LEN() \ (call_shape.kwnames == NULL ? 0 : ((int)PyTuple_GET_SIZE(call_shape.kwnames))) @@ -1796,7 +1843,7 @@ handle_eval_breaker: PREDICTED(RESUME_QUICK); assert(tstate->cframe == &cframe); assert(frame == cframe.current_frame); - if (_Py_atomic_load_relaxed(eval_breaker) && oparg < 2) { + if (_Py_atomic_load_relaxed_int32(eval_breaker) && oparg < 2) { goto handle_eval_breaker; } DISPATCH(); -- cgit v0.12