From ed95e8cbd4cbc813666c7ce7760257cc0f169d03 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Sun, 30 Apr 2023 21:08:26 +0800 Subject: gh-98003: Inline call frames for CALL_FUNCTION_EX (GH-98004) --- Include/internal/pycore_call.h | 10 ++ .../2022-10-06-23-32-11.gh-issue-98003.xWE0Yu.rst | 3 + Objects/call.c | 20 ++-- Python/bytecodes.c | 19 ++++ Python/ceval.c | 46 +++++++++ Python/generated_cases.c.h | 105 ++++++++++++--------- 6 files changed, 148 insertions(+), 55 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-32-11.gh-issue-98003.xWE0Yu.rst diff --git a/Include/internal/pycore_call.h b/Include/internal/pycore_call.h index 55378e3..5d9342b 100644 --- a/Include/internal/pycore_call.h +++ b/Include/internal/pycore_call.h @@ -116,6 +116,16 @@ _PyObject_FastCallTstate(PyThreadState *tstate, PyObject *func, PyObject *const return _PyObject_VectorcallTstate(tstate, func, args, (size_t)nargs, NULL); } +PyObject *const * +_PyStack_UnpackDict(PyThreadState *tstate, + PyObject *const *args, Py_ssize_t nargs, + PyObject *kwargs, PyObject **p_kwnames); + +void +_PyStack_UnpackDict_Free(PyObject *const *stack, Py_ssize_t nargs, + PyObject *kwnames); + +void _PyStack_UnpackDict_FreeNoDecRef(PyObject *const *stack, PyObject *kwnames); #ifdef __cplusplus } diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-32-11.gh-issue-98003.xWE0Yu.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-32-11.gh-issue-98003.xWE0Yu.rst new file mode 100644 index 0000000..f9e71bc --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-32-11.gh-issue-98003.xWE0Yu.rst @@ -0,0 +1,3 @@ +Complex function calls are now faster and consume no C stack +space. + diff --git a/Objects/call.c b/Objects/call.c index bd027e4..cf6e357 100644 --- a/Objects/call.c +++ b/Objects/call.c @@ -8,16 +8,6 @@ #include "pycore_tuple.h" // _PyTuple_ITEMS() -static PyObject *const * -_PyStack_UnpackDict(PyThreadState *tstate, - PyObject *const *args, Py_ssize_t nargs, - PyObject *kwargs, PyObject **p_kwnames); - -static void -_PyStack_UnpackDict_Free(PyObject *const *stack, Py_ssize_t nargs, - PyObject *kwnames); - - static PyObject * null_error(PyThreadState *tstate) { @@ -965,7 +955,7 @@ _PyStack_AsDict(PyObject *const *values, PyObject *kwnames) The newly allocated argument vector supports PY_VECTORCALL_ARGUMENTS_OFFSET. When done, you must call _PyStack_UnpackDict_Free(stack, nargs, kwnames) */ -static PyObject *const * +PyObject *const * _PyStack_UnpackDict(PyThreadState *tstate, PyObject *const *args, Py_ssize_t nargs, PyObject *kwargs, PyObject **p_kwnames) @@ -1034,7 +1024,7 @@ _PyStack_UnpackDict(PyThreadState *tstate, return stack; } -static void +void _PyStack_UnpackDict_Free(PyObject *const *stack, Py_ssize_t nargs, PyObject *kwnames) { @@ -1042,6 +1032,12 @@ _PyStack_UnpackDict_Free(PyObject *const *stack, Py_ssize_t nargs, for (Py_ssize_t i = 0; i < n; i++) { Py_DECREF(stack[i]); } + _PyStack_UnpackDict_FreeNoDecRef(stack, kwnames); +} + +void +_PyStack_UnpackDict_FreeNoDecRef(PyObject *const *stack, PyObject *kwnames) +{ PyMem_Free((PyObject **)stack - 1); Py_DECREF(kwnames); } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 9de0d92..e83894e 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3103,6 +3103,25 @@ dummy_func( } } else { + if (Py_TYPE(func) == &PyFunction_Type && + tstate->interp->eval_frame == NULL && + ((PyFunctionObject *)func)->vectorcall == _PyFunction_Vectorcall) { + assert(PyTuple_CheckExact(callargs)); + Py_ssize_t nargs = PyTuple_GET_SIZE(callargs); + int code_flags = ((PyCodeObject *)PyFunction_GET_CODE(func))->co_flags; + PyObject *locals = code_flags & CO_OPTIMIZED ? NULL : Py_NewRef(PyFunction_GET_GLOBALS(func)); + + _PyInterpreterFrame *new_frame = _PyEvalFramePushAndInit_Ex(tstate, + (PyFunctionObject *)func, locals, + nargs, callargs, kwargs); + // Need to manually shrink the stack since we exit with DISPATCH_INLINED. + STACK_SHRINK(oparg + 3); + if (new_frame == NULL) { + goto error; + } + frame->return_offset = 0; + DISPATCH_INLINED(new_frame); + } result = PyObject_Call(func, callargs, kwargs); } DECREF_INPUTS(); diff --git a/Python/ceval.c b/Python/ceval.c index 5d5221b..958689d 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -212,6 +212,9 @@ static _PyInterpreterFrame * _PyEvalFramePushAndInit(PyThreadState *tstate, PyFunctionObject *func, PyObject *locals, PyObject* const* args, size_t argcount, PyObject *kwnames); +static _PyInterpreterFrame * +_PyEvalFramePushAndInit_Ex(PyThreadState *tstate, PyFunctionObject *func, + PyObject *locals, Py_ssize_t nargs, PyObject *callargs, PyObject *kwargs); static void _PyEvalFrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); @@ -1501,6 +1504,49 @@ fail: return NULL; } +/* Same as _PyEvalFramePushAndInit but takes an args tuple and kwargs dict. + Steals references to func, callargs and kwargs. +*/ +static _PyInterpreterFrame * +_PyEvalFramePushAndInit_Ex(PyThreadState *tstate, PyFunctionObject *func, + PyObject *locals, Py_ssize_t nargs, PyObject *callargs, PyObject *kwargs) +{ + bool has_dict = (kwargs != NULL && PyDict_GET_SIZE(kwargs) > 0); + PyObject *kwnames = NULL; + PyObject *const *newargs; + if (has_dict) { + newargs = _PyStack_UnpackDict(tstate, _PyTuple_ITEMS(callargs), nargs, kwargs, &kwnames); + if (newargs == NULL) { + Py_DECREF(func); + goto error; + } + } + else { + newargs = &PyTuple_GET_ITEM(callargs, 0); + /* We need to incref all our args since the new frame steals the references. */ + for (Py_ssize_t i = 0; i < nargs; ++i) { + Py_INCREF(PyTuple_GET_ITEM(callargs, i)); + } + } + _PyInterpreterFrame *new_frame = _PyEvalFramePushAndInit( + tstate, (PyFunctionObject *)func, locals, + newargs, nargs, kwnames + ); + if (has_dict) { + _PyStack_UnpackDict_FreeNoDecRef(newargs, kwnames); + } + /* No need to decref func here because the reference has been stolen by + _PyEvalFramePushAndInit. + */ + Py_DECREF(callargs); + Py_XDECREF(kwargs); + return new_frame; +error: + Py_DECREF(callargs); + Py_XDECREF(kwargs); + return NULL; +} + PyObject * _PyEval_Vector(PyThreadState *tstate, PyFunctionObject *func, PyObject *locals, diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 864a4f7..069a7ce 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4296,16 +4296,35 @@ } } else { + if (Py_TYPE(func) == &PyFunction_Type && + tstate->interp->eval_frame == NULL && + ((PyFunctionObject *)func)->vectorcall == _PyFunction_Vectorcall) { + assert(PyTuple_CheckExact(callargs)); + Py_ssize_t nargs = PyTuple_GET_SIZE(callargs); + int code_flags = ((PyCodeObject *)PyFunction_GET_CODE(func))->co_flags; + PyObject *locals = code_flags & CO_OPTIMIZED ? NULL : Py_NewRef(PyFunction_GET_GLOBALS(func)); + + _PyInterpreterFrame *new_frame = _PyEvalFramePushAndInit_Ex(tstate, + (PyFunctionObject *)func, locals, + nargs, callargs, kwargs); + // Need to manually shrink the stack since we exit with DISPATCH_INLINED. + STACK_SHRINK(oparg + 3); + if (new_frame == NULL) { + goto error; + } + frame->return_offset = 0; + DISPATCH_INLINED(new_frame); + } result = PyObject_Call(func, callargs, kwargs); } - #line 4302 "Python/generated_cases.c.h" + #line 4321 "Python/generated_cases.c.h" Py_DECREF(func); Py_DECREF(callargs); Py_XDECREF(kwargs); - #line 3109 "Python/bytecodes.c" + #line 3128 "Python/bytecodes.c" assert(PEEK(3 + (oparg & 1)) == NULL); if (result == NULL) { STACK_SHRINK(((oparg & 1) ? 1 : 0)); goto pop_3_error; } - #line 4309 "Python/generated_cases.c.h" + #line 4328 "Python/generated_cases.c.h" STACK_SHRINK(((oparg & 1) ? 1 : 0)); STACK_SHRINK(2); stack_pointer[-1] = result; @@ -4320,7 +4339,7 @@ PyObject *kwdefaults = (oparg & 0x02) ? stack_pointer[-(1 + ((oparg & 0x08) ? 1 : 0) + ((oparg & 0x04) ? 1 : 0) + ((oparg & 0x02) ? 1 : 0))] : NULL; PyObject *defaults = (oparg & 0x01) ? stack_pointer[-(1 + ((oparg & 0x08) ? 1 : 0) + ((oparg & 0x04) ? 1 : 0) + ((oparg & 0x02) ? 1 : 0) + ((oparg & 0x01) ? 1 : 0))] : NULL; PyObject *func; - #line 3119 "Python/bytecodes.c" + #line 3138 "Python/bytecodes.c" PyFunctionObject *func_obj = (PyFunctionObject *) PyFunction_New(codeobj, GLOBALS()); @@ -4349,14 +4368,14 @@ func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; func = (PyObject *)func_obj; - #line 4353 "Python/generated_cases.c.h" + #line 4372 "Python/generated_cases.c.h" STACK_SHRINK(((oparg & 0x01) ? 1 : 0) + ((oparg & 0x02) ? 1 : 0) + ((oparg & 0x04) ? 1 : 0) + ((oparg & 0x08) ? 1 : 0)); stack_pointer[-1] = func; DISPATCH(); } TARGET(RETURN_GENERATOR) { - #line 3150 "Python/bytecodes.c" + #line 3169 "Python/bytecodes.c" assert(PyFunction_Check(frame->f_funcobj)); PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; PyGenObject *gen = (PyGenObject *)_Py_MakeCoro(func); @@ -4377,7 +4396,7 @@ frame = cframe.current_frame = prev; _PyFrame_StackPush(frame, (PyObject *)gen); goto resume_frame; - #line 4381 "Python/generated_cases.c.h" + #line 4400 "Python/generated_cases.c.h" } TARGET(BUILD_SLICE) { @@ -4385,15 +4404,15 @@ PyObject *stop = stack_pointer[-(1 + ((oparg == 3) ? 1 : 0))]; PyObject *start = stack_pointer[-(2 + ((oparg == 3) ? 1 : 0))]; PyObject *slice; - #line 3173 "Python/bytecodes.c" + #line 3192 "Python/bytecodes.c" slice = PySlice_New(start, stop, step); - #line 4391 "Python/generated_cases.c.h" + #line 4410 "Python/generated_cases.c.h" Py_DECREF(start); Py_DECREF(stop); Py_XDECREF(step); - #line 3175 "Python/bytecodes.c" + #line 3194 "Python/bytecodes.c" if (slice == NULL) { STACK_SHRINK(((oparg == 3) ? 1 : 0)); goto pop_2_error; } - #line 4397 "Python/generated_cases.c.h" + #line 4416 "Python/generated_cases.c.h" STACK_SHRINK(((oparg == 3) ? 1 : 0)); STACK_SHRINK(1); stack_pointer[-1] = slice; @@ -4404,7 +4423,7 @@ PyObject *fmt_spec = ((oparg & FVS_MASK) == FVS_HAVE_SPEC) ? stack_pointer[-((((oparg & FVS_MASK) == FVS_HAVE_SPEC) ? 1 : 0))] : NULL; PyObject *value = stack_pointer[-(1 + (((oparg & FVS_MASK) == FVS_HAVE_SPEC) ? 1 : 0))]; PyObject *result; - #line 3179 "Python/bytecodes.c" + #line 3198 "Python/bytecodes.c" /* Handles f-string value formatting. */ PyObject *(*conv_fn)(PyObject *); int which_conversion = oparg & FVC_MASK; @@ -4439,7 +4458,7 @@ Py_DECREF(value); Py_XDECREF(fmt_spec); if (result == NULL) { STACK_SHRINK((((oparg & FVS_MASK) == FVS_HAVE_SPEC) ? 1 : 0)); goto pop_1_error; } - #line 4443 "Python/generated_cases.c.h" + #line 4462 "Python/generated_cases.c.h" STACK_SHRINK((((oparg & FVS_MASK) == FVS_HAVE_SPEC) ? 1 : 0)); stack_pointer[-1] = result; DISPATCH(); @@ -4448,10 +4467,10 @@ TARGET(COPY) { PyObject *bottom = stack_pointer[-(1 + (oparg-1))]; PyObject *top; - #line 3216 "Python/bytecodes.c" + #line 3235 "Python/bytecodes.c" assert(oparg > 0); top = Py_NewRef(bottom); - #line 4455 "Python/generated_cases.c.h" + #line 4474 "Python/generated_cases.c.h" STACK_GROW(1); stack_pointer[-1] = top; DISPATCH(); @@ -4463,7 +4482,7 @@ PyObject *rhs = stack_pointer[-1]; PyObject *lhs = stack_pointer[-2]; PyObject *res; - #line 3221 "Python/bytecodes.c" + #line 3240 "Python/bytecodes.c" #if ENABLE_SPECIALIZATION _PyBinaryOpCache *cache = (_PyBinaryOpCache *)next_instr; if (ADAPTIVE_COUNTER_IS_ZERO(cache->counter)) { @@ -4478,12 +4497,12 @@ assert((unsigned)oparg < Py_ARRAY_LENGTH(binary_ops)); assert(binary_ops[oparg]); res = binary_ops[oparg](lhs, rhs); - #line 4482 "Python/generated_cases.c.h" + #line 4501 "Python/generated_cases.c.h" Py_DECREF(lhs); Py_DECREF(rhs); - #line 3236 "Python/bytecodes.c" + #line 3255 "Python/bytecodes.c" if (res == NULL) goto pop_2_error; - #line 4487 "Python/generated_cases.c.h" + #line 4506 "Python/generated_cases.c.h" STACK_SHRINK(1); stack_pointer[-1] = res; next_instr += 1; @@ -4493,16 +4512,16 @@ TARGET(SWAP) { PyObject *top = stack_pointer[-1]; PyObject *bottom = stack_pointer[-(2 + (oparg-2))]; - #line 3241 "Python/bytecodes.c" + #line 3260 "Python/bytecodes.c" assert(oparg >= 2); - #line 4499 "Python/generated_cases.c.h" + #line 4518 "Python/generated_cases.c.h" stack_pointer[-1] = bottom; stack_pointer[-(2 + (oparg-2))] = top; DISPATCH(); } TARGET(INSTRUMENTED_LINE) { - #line 3245 "Python/bytecodes.c" + #line 3264 "Python/bytecodes.c" _Py_CODEUNIT *here = next_instr-1; _PyFrame_SetStackPointer(frame, stack_pointer); int original_opcode = _Py_call_instrumentation_line( @@ -4522,11 +4541,11 @@ } opcode = original_opcode; DISPATCH_GOTO(); - #line 4526 "Python/generated_cases.c.h" + #line 4545 "Python/generated_cases.c.h" } TARGET(INSTRUMENTED_INSTRUCTION) { - #line 3267 "Python/bytecodes.c" + #line 3286 "Python/bytecodes.c" int next_opcode = _Py_call_instrumentation_instruction( tstate, frame, next_instr-1); if (next_opcode < 0) goto error; @@ -4538,26 +4557,26 @@ assert(next_opcode > 0 && next_opcode < 256); opcode = next_opcode; DISPATCH_GOTO(); - #line 4542 "Python/generated_cases.c.h" + #line 4561 "Python/generated_cases.c.h" } TARGET(INSTRUMENTED_JUMP_FORWARD) { - #line 3281 "Python/bytecodes.c" + #line 3300 "Python/bytecodes.c" INSTRUMENTED_JUMP(next_instr-1, next_instr+oparg, PY_MONITORING_EVENT_JUMP); - #line 4548 "Python/generated_cases.c.h" + #line 4567 "Python/generated_cases.c.h" DISPATCH(); } TARGET(INSTRUMENTED_JUMP_BACKWARD) { - #line 3285 "Python/bytecodes.c" + #line 3304 "Python/bytecodes.c" INSTRUMENTED_JUMP(next_instr-1, next_instr-oparg, PY_MONITORING_EVENT_JUMP); - #line 4555 "Python/generated_cases.c.h" + #line 4574 "Python/generated_cases.c.h" CHECK_EVAL_BREAKER(); DISPATCH(); } TARGET(INSTRUMENTED_POP_JUMP_IF_TRUE) { - #line 3290 "Python/bytecodes.c" + #line 3309 "Python/bytecodes.c" PyObject *cond = POP(); int err = PyObject_IsTrue(cond); Py_DECREF(cond); @@ -4566,12 +4585,12 @@ assert(err == 0 || err == 1); int offset = err*oparg; INSTRUMENTED_JUMP(here, next_instr + offset, PY_MONITORING_EVENT_BRANCH); - #line 4570 "Python/generated_cases.c.h" + #line 4589 "Python/generated_cases.c.h" DISPATCH(); } TARGET(INSTRUMENTED_POP_JUMP_IF_FALSE) { - #line 3301 "Python/bytecodes.c" + #line 3320 "Python/bytecodes.c" PyObject *cond = POP(); int err = PyObject_IsTrue(cond); Py_DECREF(cond); @@ -4580,12 +4599,12 @@ assert(err == 0 || err == 1); int offset = (1-err)*oparg; INSTRUMENTED_JUMP(here, next_instr + offset, PY_MONITORING_EVENT_BRANCH); - #line 4584 "Python/generated_cases.c.h" + #line 4603 "Python/generated_cases.c.h" DISPATCH(); } TARGET(INSTRUMENTED_POP_JUMP_IF_NONE) { - #line 3312 "Python/bytecodes.c" + #line 3331 "Python/bytecodes.c" PyObject *value = POP(); _Py_CODEUNIT *here = next_instr-1; int offset; @@ -4598,12 +4617,12 @@ offset = 0; } INSTRUMENTED_JUMP(here, next_instr + offset, PY_MONITORING_EVENT_BRANCH); - #line 4602 "Python/generated_cases.c.h" + #line 4621 "Python/generated_cases.c.h" DISPATCH(); } TARGET(INSTRUMENTED_POP_JUMP_IF_NOT_NONE) { - #line 3327 "Python/bytecodes.c" + #line 3346 "Python/bytecodes.c" PyObject *value = POP(); _Py_CODEUNIT *here = next_instr-1; int offset; @@ -4616,30 +4635,30 @@ offset = oparg; } INSTRUMENTED_JUMP(here, next_instr + offset, PY_MONITORING_EVENT_BRANCH); - #line 4620 "Python/generated_cases.c.h" + #line 4639 "Python/generated_cases.c.h" DISPATCH(); } TARGET(EXTENDED_ARG) { - #line 3342 "Python/bytecodes.c" + #line 3361 "Python/bytecodes.c" assert(oparg); opcode = next_instr->op.code; oparg = oparg << 8 | next_instr->op.arg; PRE_DISPATCH_GOTO(); DISPATCH_GOTO(); - #line 4631 "Python/generated_cases.c.h" + #line 4650 "Python/generated_cases.c.h" } TARGET(CACHE) { - #line 3350 "Python/bytecodes.c" + #line 3369 "Python/bytecodes.c" assert(0 && "Executing a cache."); Py_UNREACHABLE(); - #line 4638 "Python/generated_cases.c.h" + #line 4657 "Python/generated_cases.c.h" } TARGET(RESERVED) { - #line 3355 "Python/bytecodes.c" + #line 3374 "Python/bytecodes.c" assert(0 && "Executing RESERVED instruction."); Py_UNREACHABLE(); - #line 4645 "Python/generated_cases.c.h" + #line 4664 "Python/generated_cases.c.h" } -- cgit v0.12