From 55824d01f866d1fa0f21996d897fba0e07d09ac8 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 11 Jan 2024 18:20:42 +0000 Subject: GH-113853: Guarantee forward progress in executors (GH-113854) --- Include/cpython/optimizer.h | 2 +- .../2024-01-08-05-36-59.gh-issue-113853.lm-6_a.rst | 2 + Python/bytecodes.c | 18 ++- Python/generated_cases.c.h | 18 ++- Python/optimizer.c | 155 ++++++++++++--------- 5 files changed, 121 insertions(+), 74 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-01-08-05-36-59.gh-issue-113853.lm-6_a.rst diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index f077da7..0e7bc9f 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -65,7 +65,7 @@ PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void); PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset); int -_PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer); +_PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer); extern _PyOptimizerObject _PyOptimizer_Default; diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-01-08-05-36-59.gh-issue-113853.lm-6_a.rst b/Misc/NEWS.d/next/Core and Builtins/2024-01-08-05-36-59.gh-issue-113853.lm-6_a.rst new file mode 100644 index 0000000..d4f0a76 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-01-08-05-36-59.gh-issue-113853.lm-6_a.rst @@ -0,0 +1,2 @@ +Guarantee that all executors make progress. This then guarantees that tier 2 +execution always makes progress. diff --git a/Python/bytecodes.c b/Python/bytecodes.c index f53ddae..2011d96 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2325,12 +2325,18 @@ dummy_func( // Double-check that the opcode isn't instrumented or something: if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) { OPT_STAT_INC(attempts); - int optimized = _PyOptimizer_BackEdge(frame, this_instr, next_instr, stack_pointer); + _Py_CODEUNIT *start = this_instr; + /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */ + while (oparg > 255) { + oparg >>= 8; + start--; + } + int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer); ERROR_IF(optimized < 0, error); if (optimized) { // Rewind and enter the executor: - assert(this_instr->op.code == ENTER_EXECUTOR); - next_instr = this_instr; + assert(start->op.code == ENTER_EXECUTOR); + next_instr = start; this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); } else { @@ -2370,10 +2376,12 @@ dummy_func( GOTO_TIER_TWO(); } else { - code->co_executors->executors[oparg & 255] = NULL; + /* ENTER_EXECUTOR will be the first code unit of the instruction */ + assert(oparg < 256); + code->co_executors->executors[oparg] = NULL; opcode = this_instr->op.code = executor->vm_data.opcode; this_instr->op.arg = executor->vm_data.oparg; - oparg = (oparg & (~255)) | executor->vm_data.oparg; + oparg = executor->vm_data.oparg; Py_DECREF(executor); next_instr = this_instr; DISPATCH_GOTO(); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index e693e3e..1e995b6 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -2384,10 +2384,12 @@ GOTO_TIER_TWO(); } else { - code->co_executors->executors[oparg & 255] = NULL; + /* ENTER_EXECUTOR will be the first code unit of the instruction */ + assert(oparg < 256); + code->co_executors->executors[oparg] = NULL; opcode = this_instr->op.code = executor->vm_data.opcode; this_instr->op.arg = executor->vm_data.oparg; - oparg = (oparg & (~255)) | executor->vm_data.oparg; + oparg = executor->vm_data.oparg; Py_DECREF(executor); next_instr = this_instr; DISPATCH_GOTO(); @@ -3289,12 +3291,18 @@ // Double-check that the opcode isn't instrumented or something: if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) { OPT_STAT_INC(attempts); - int optimized = _PyOptimizer_BackEdge(frame, this_instr, next_instr, stack_pointer); + _Py_CODEUNIT *start = this_instr; + /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */ + while (oparg > 255) { + oparg >>= 8; + start--; + } + int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer); if (optimized < 0) goto error; if (optimized) { // Rewind and enter the executor: - assert(this_instr->op.code == ENTER_EXECUTOR); - next_instr = this_instr; + assert(start->op.code == ENTER_EXECUTOR); + next_instr = start; this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); } else { diff --git a/Python/optimizer.c b/Python/optimizer.c index 28e12db..227d6be 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -162,24 +162,22 @@ PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer) } int -_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) +_PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer) { - assert(src->op.code == JUMP_BACKWARD); PyCodeObject *code = (PyCodeObject *)frame->f_executable; assert(PyCode_Check(code)); PyInterpreterState *interp = _PyInterpreterState_GET(); - if (!has_space_for_executor(code, src)) { + if (!has_space_for_executor(code, start)) { return 0; } _PyOptimizerObject *opt = interp->optimizer; _PyExecutorObject *executor = NULL; - /* Start optimizing at the destination to guarantee forward progress */ - int err = opt->optimize(opt, code, dest, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); + int err = opt->optimize(opt, code, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); if (err <= 0) { assert(executor == NULL); return err; } - int index = get_index_for_executor(code, src); + int index = get_index_for_executor(code, start); if (index < 0) { /* Out of memory. Don't raise and assume that the * error will show up elsewhere. @@ -190,7 +188,7 @@ _PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNI Py_DECREF(executor); return 0; } - insert_executor(code, src, index, executor); + insert_executor(code, start, index, executor); Py_DECREF(executor); return 1; } @@ -316,38 +314,6 @@ BRANCH_TO_GUARD[4][2] = { #define CONFIDENCE_RANGE 1000 #define CONFIDENCE_CUTOFF 333 -/* Returns 1 on success, - * 0 if it failed to produce a worthwhile trace, - * and -1 on an error. - */ -static int -translate_bytecode_to_trace( - PyCodeObject *code, - _Py_CODEUNIT *instr, - _PyUOpInstruction *trace, - int buffer_size, - _PyBloomFilter *dependencies) -{ - PyCodeObject *initial_code = code; - _Py_BloomFilter_Add(dependencies, initial_code); - _Py_CODEUNIT *initial_instr = instr; - int trace_length = 0; - int max_length = buffer_size; - struct { - PyCodeObject *code; - _Py_CODEUNIT *instr; - } trace_stack[TRACE_STACK_SIZE]; - int trace_stack_depth = 0; - int confidence = CONFIDENCE_RANGE; // Adjusted by branch instructions - -#ifdef Py_DEBUG - char *python_lltrace = Py_GETENV("PYTHON_LLTRACE"); - int lltrace = 0; - if (python_lltrace != NULL && *python_lltrace >= '0') { - lltrace = *python_lltrace - '0'; // TODO: Parse an int and all that - } -#endif - #ifdef Py_DEBUG #define DPRINTF(level, ...) \ if (lltrace >= (level)) { printf(__VA_ARGS__); } @@ -403,6 +369,39 @@ translate_bytecode_to_trace( code = trace_stack[trace_stack_depth].code; \ instr = trace_stack[trace_stack_depth].instr; +/* Returns 1 on success, + * 0 if it failed to produce a worthwhile trace, + * and -1 on an error. + */ +static int +translate_bytecode_to_trace( + PyCodeObject *code, + _Py_CODEUNIT *instr, + _PyUOpInstruction *trace, + int buffer_size, + _PyBloomFilter *dependencies) +{ + bool progress_needed = true; + PyCodeObject *initial_code = code; + _Py_BloomFilter_Add(dependencies, initial_code); + _Py_CODEUNIT *initial_instr = instr; + int trace_length = 0; + int max_length = buffer_size; + struct { + PyCodeObject *code; + _Py_CODEUNIT *instr; + } trace_stack[TRACE_STACK_SIZE]; + int trace_stack_depth = 0; + int confidence = CONFIDENCE_RANGE; // Adjusted by branch instructions + +#ifdef Py_DEBUG + char *python_lltrace = Py_GETENV("PYTHON_LLTRACE"); + int lltrace = 0; + if (python_lltrace != NULL && *python_lltrace >= '0') { + lltrace = *python_lltrace - '0'; // TODO: Parse an int and all that + } +#endif + DPRINTF(4, "Optimizing %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), @@ -410,6 +409,7 @@ translate_bytecode_to_trace( code->co_firstlineno, 2 * INSTR_IP(initial_instr, code)); uint32_t target = 0; + top: // Jump here after _PUSH_FRAME or likely branches for (;;) { target = INSTR_IP(instr, code); @@ -421,6 +421,15 @@ top: // Jump here after _PUSH_FRAME or likely branches uint32_t oparg = instr->op.arg; uint32_t extended = 0; + if (opcode == ENTER_EXECUTOR) { + assert(oparg < 256); + _PyExecutorObject *executor = + (_PyExecutorObject *)code->co_executors->executors[oparg]; + opcode = executor->vm_data.opcode; + DPRINTF(2, " * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]); + oparg = executor->vm_data.oparg; + } + if (opcode == EXTENDED_ARG) { instr++; extended = 1; @@ -431,13 +440,23 @@ top: // Jump here after _PUSH_FRAME or likely branches goto done; } } - - if (opcode == ENTER_EXECUTOR) { - _PyExecutorObject *executor = - (_PyExecutorObject *)code->co_executors->executors[oparg&255]; - opcode = executor->vm_data.opcode; - DPRINTF(2, " * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]); - oparg = (oparg & 0xffffff00) | executor->vm_data.oparg; + assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG); + + /* Special case the first instruction, + * so that we can guarantee forward progress */ + if (progress_needed) { + progress_needed = false; + if (opcode == JUMP_BACKWARD || opcode == JUMP_BACKWARD_NO_INTERRUPT) { + instr += 1 + _PyOpcode_Caches[opcode] - (int32_t)oparg; + initial_instr = instr; + continue; + } + else { + if (OPCODE_HAS_DEOPT(opcode)) { + opcode = _PyOpcode_Deopt[opcode]; + } + assert(!OPCODE_HAS_DEOPT(opcode)); + } } switch (opcode) { @@ -480,7 +499,9 @@ top: // Jump here after _PUSH_FRAME or likely branches case JUMP_BACKWARD: case JUMP_BACKWARD_NO_INTERRUPT: { - if (instr + 2 - oparg == initial_instr && code == initial_code) { + _Py_CODEUNIT *target = instr + 1 + _PyOpcode_Caches[opcode] - (int)oparg; + if (target == initial_instr) { + /* We have looped round to the start */ RESERVE(1); ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0); } @@ -641,19 +662,7 @@ done: } assert(code == initial_code); // Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE - if (trace_length > 4) { - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); - DPRINTF(1, - "Created a trace for %s (%s:%d) at byte offset %d -- length %d\n", - PyUnicode_AsUTF8(code->co_qualname), - PyUnicode_AsUTF8(code->co_filename), - code->co_firstlineno, - 2 * INSTR_IP(initial_instr, code), - trace_length); - OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist); - return 1; - } - else { + if (progress_needed || trace_length < 5) { OPT_STAT_INC(trace_too_short); DPRINTF(4, "No trace for %s (%s:%d) at byte offset %d\n", @@ -661,15 +670,25 @@ done: PyUnicode_AsUTF8(code->co_filename), code->co_firstlineno, 2 * INSTR_IP(initial_instr, code)); + return 0; } - return 0; + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); + DPRINTF(1, + "Created a trace for %s (%s:%d) at byte offset %d -- length %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * INSTR_IP(initial_instr, code), + trace_length); + OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist); + return 1; +} #undef RESERVE #undef RESERVE_RAW #undef INSTR_IP #undef ADD_TO_TRACE #undef DPRINTF -} #define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31))) #define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31))) @@ -854,10 +873,20 @@ counter_optimize( int Py_UNUSED(curr_stackentries) ) { + int oparg = instr->op.arg; + while (instr->op.code == EXTENDED_ARG) { + instr++; + oparg = (oparg << 8) | instr->op.arg; + } + if (instr->op.code != JUMP_BACKWARD) { + /* Counter optimizer can only handle backward edges */ + return 0; + } + _Py_CODEUNIT *target = instr + 1 + _PyOpcode_Caches[JUMP_BACKWARD] - oparg; _PyUOpInstruction buffer[3] = { { .opcode = _LOAD_CONST_INLINE_BORROW, .operand = (uintptr_t)self }, { .opcode = _INTERNAL_INCREMENT_OPT_COUNTER }, - { .opcode = _EXIT_TRACE, .target = (uint32_t)(instr - _PyCode_CODE(code)) } + { .opcode = _EXIT_TRACE, .target = (uint32_t)(target - _PyCode_CODE(code)) } }; _PyBloomFilter empty; _Py_BloomFilter_Init(&empty); -- cgit v0.12