From d78c872e0d680f6e63afa6661df5021775a03690 Mon Sep 17 00:00:00 2001
From: Mark Shannon
Date: Mon, 6 Nov 2023 11:28:52 +0000
Subject: GH-111646: Simplify optimizer, by compacting uops when making executor. (GH-111647)

---
 Python/optimizer.c          | 201 +++++++++++++++++++-------------------
 Python/optimizer_analysis.c |  35 +++++++-
 2 files changed, 119 insertions(+), 117 deletions(-)

diff --git a/Python/optimizer.c b/Python/optimizer.c
index 0e5b437..a332fd1 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -384,34 +384,12 @@ PyTypeObject _PyUOpExecutor_Type = {
     .tp_methods = executor_methods,
 };
 
-static int
-move_stubs(
-    _PyUOpInstruction *trace,
-    int trace_length,
-    int stubs_start,
-    int stubs_end
-)
-{
-    memmove(trace + trace_length,
-        trace + stubs_start,
-        (stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
-    // Patch up the jump targets
-    for (int i = 0; i < trace_length; i++) {
-        if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
-            trace[i].opcode == _POP_JUMP_IF_TRUE)
-        {
-            int target = trace[i].oparg;
-            if (target >= stubs_start) {
-                target += trace_length - stubs_start;
-                trace[i].oparg = target;
-            }
-        }
-    }
-    return trace_length + stubs_end - stubs_start;
-}
-
 #define TRACE_STACK_SIZE 5
 
+/* Returns 1 on success,
+ * 0 if it failed to produce a worthwhile trace,
+ * and -1 on an error.
+ */
 static int
 translate_bytecode_to_trace(
     PyCodeObject *code,
@@ -790,7 +768,7 @@ done:
     }
     assert(code == initial_code);
     // Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
-    if (trace_length > 3) {
+    if (trace_length > 4) {
         ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
         DPRINTF(1,
                 "Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,25 +778,8 @@ done:
                 2 * INSTR_IP(initial_instr, code),
                 trace_length,
                 buffer_size - max_length);
-        if (max_length < buffer_size) {
-            // There are stubs
-            if (trace_length < max_length) {
-                // There's a gap before the stubs
-                // Move the stubs back to be immediately after the main trace
-                // (which ends at trace_length)
-                DPRINTF(2,
-                        "Moving %d stub uops back by %d\n",
-                        buffer_size - max_length,
-                        max_length - trace_length);
-                trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
-            }
-            else {
-                assert(trace_length == max_length);
-                // There's no gap
-                trace_length = buffer_size;
-            }
-        }
-        return trace_length;
+        OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist);
+        return 1;
     }
     else {
         OPT_STAT_INC(trace_too_short);
@@ -838,70 +799,84 @@ done:
 #undef DPRINTF
 }
 
+#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
+#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
+#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
+
+/* Count the number of used uops, and mark them in the bit vector `used`.
+ * This can be done in a single pass using simple reachability analysis,
+ * as there are no backward jumps.
+ * NOPs are excluded from the count.
+*/
 static int
-remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
+compute_used(_PyUOpInstruction *buffer, uint32_t *used)
 {
-    // Stage 1: Replace unneeded _SET_IP uops with NOP.
-    // Note that we don't enter stubs, those SET_IPs are needed.
-    int last_set_ip = -1;
-    int last_instr = 0;
-    bool need_ip = true;
-    for (int pc = 0; pc < trace_length; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode == _SET_IP) {
-            if (!need_ip && last_set_ip >= 0) {
-                trace[last_set_ip].opcode = NOP;
-            }
-            need_ip = false;
-            last_set_ip = pc;
+    int count = 0;
+    SET_BIT(used, 0);
+    for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
         }
-        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
-            last_instr = pc + 1;
-            break;
+        count++;
+        int opcode = buffer[i].opcode;
+        if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+            continue;
        }
-        else {
-            // If opcode has ERROR or DEOPT, set need_ip to true
-            if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
-                need_ip = true;
-            }
+        /* All other micro-ops fall through, so i+1 is reachable */
+        SET_BIT(used, i+1);
+        switch(opcode) {
+            case NOP:
+                /* Don't count NOPs as used */
+                count--;
+                UNSET_BIT(used, i);
+                break;
+            case _POP_JUMP_IF_FALSE:
+            case _POP_JUMP_IF_TRUE:
+                /* Mark target as reachable */
+                SET_BIT(used, buffer[i].oparg);
         }
     }
-    // Stage 2: Squash NOP opcodes (pre-existing or set above).
-    int dest = 0;
-    for (int pc = 0; pc < last_instr; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode != NOP) {
-            if (pc != dest) {
-                trace[dest] = trace[pc];
-            }
-            dest++;
-        }
+    return count;
+}
+
+/* Makes an executor from a buffer of uops.
+ * Account for the buffer having gaps and NOPs by computing a "used"
+ * bit vector and only copying the used uops. Here "used" means reachable
+ * and not a NOP.
+ */
+static _PyExecutorObject *
+make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
+{
+    uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
+    int length = compute_used(buffer, used);
+    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
+    if (executor == NULL) {
+        return NULL;
     }
-    // Stage 3: Move the stubs back.
-    if (dest < last_instr) {
-        int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
-#ifdef Py_DEBUG
-        char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
-        int lltrace = 0;
-        if (python_lltrace != NULL && *python_lltrace >= '0') {
-            lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
+    int dest = length - 1;
+    /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
+    for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
        }
-        if (lltrace >= 2) {
-            printf("Optimized trace (length %d+%d = %d, saved %d):\n",
-                   dest, trace_length - last_instr, new_trace_length,
-                   trace_length - new_trace_length);
-            for (int pc = 0; pc < new_trace_length; pc++) {
-                printf("%4d: (%s, %d, %" PRIu64 ")\n",
-                       pc,
-                       uop_name(trace[pc].opcode),
-                       (trace[pc].oparg),
-                       (uint64_t)(trace[pc].operand));
-            }
+        executor->trace[dest] = buffer[i];
+        int opcode = buffer[i].opcode;
+        if (opcode == _POP_JUMP_IF_FALSE ||
+            opcode == _POP_JUMP_IF_TRUE)
+        {
+            /* The oparg of the target will already have been set to its new offset */
+            int oparg = executor->trace[dest].oparg;
+            executor->trace[dest].oparg = buffer[oparg].oparg;
        }
-#endif
-        trace_length = new_trace_length;
+        /* Set the oparg to be the destination offset,
+         * so that we can set the oparg of earlier jumps correctly. */
+        buffer[i].oparg = dest;
+        dest--;
     }
-    return trace_length;
+    assert(dest == -1);
+    executor->base.execute = _PyUopExecute;
+    _Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
+    return (_PyExecutorObject *)executor;
 }
 
 static int
@@ -914,28 +889,26 @@ uop_optimize(
 {
     _PyBloomFilter dependencies;
     _Py_BloomFilter_Init(&dependencies);
-    _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
-    int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
-    if (trace_length <= 0) {
+    _PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
+    int err = translate_bytecode_to_trace(code, instr, buffer, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
+    if (err <= 0) {
         // Error or nothing translated
-        return trace_length;
+        return err;
     }
-    OPT_HIST(trace_length, trace_length_hist);
     OPT_STAT_INC(traces_created);
     char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
-    if (uop_optimize != NULL && *uop_optimize > '0') {
-        trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
+    if (uop_optimize == NULL || *uop_optimize > '0') {
+        err = _Py_uop_analyze_and_optimize(code, buffer, _Py_UOP_MAX_TRACE_LENGTH, curr_stackentries);
+        if (err < 0) {
+            return -1;
+        }
     }
-    trace_length = remove_unneeded_uops(trace, trace_length);
-    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
+    _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
     if (executor == NULL) {
        return -1;
    }
-    OPT_HIST(trace_length, optimized_trace_length_hist);
-    executor->base.execute = _PyUopExecute;
-    memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
-    _Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
-    *exec_ptr = (_PyExecutorObject *)executor;
+    OPT_HIST(Py_SIZE(executor), optimized_trace_length_hist);
+    *exec_ptr = executor;
     return 1;
 }
 
diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c
index 2d177f1..61bda80 100644
--- a/Python/optimizer_analysis.c
+++ b/Python/optimizer_analysis.c
@@ -13,13 +13,42 @@
 #include "pycore_optimizer.h"
 
 
+static void
+remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size)
+{
+    // Note that we don't enter stubs, those SET_IPs are needed.
+    int last_set_ip = -1;
+    bool need_ip = true;
+    for (int pc = 0; pc < buffer_size; pc++) {
+        int opcode = buffer[pc].opcode;
+        if (opcode == _SET_IP) {
+            if (!need_ip && last_set_ip >= 0) {
+                buffer[last_set_ip].opcode = NOP;
+            }
+            need_ip = false;
+            last_set_ip = pc;
+        }
+        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+            break;
+        }
+        else {
+            // If opcode has ERROR or DEOPT, set need_ip to true
+            if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
+                need_ip = true;
+            }
+        }
+    }
+}
+
+
 int
 _Py_uop_analyze_and_optimize(
     PyCodeObject *co,
-    _PyUOpInstruction *trace,
-    int trace_len,
+    _PyUOpInstruction *buffer,
+    int buffer_size,
     int curr_stacklen
 )
 {
-    return trace_len;
+    remove_unneeded_uops(buffer, buffer_size);
+    return 0;
 }
--
cgit v0.12
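
The new code above packs a sparse uop buffer (full of NOPs and unreachable gaps) into a dense executor in two passes. The standalone sketch below is not part of the patch; it illustrates the same scheme using simplified, invented types. A forward pass marks reachable, non-NOP micro-ops in a bit vector, then a backward pass copies them into a dense array while rewriting forward-jump opargs to their targets' new offsets. The Inst struct, the OP_* opcodes, MAX_LEN and the compact() helper exist only for this example; only the control flow mirrors compute_used() and make_executor_from_uops().

/* Standalone sketch -- NOT part of the patch. The Inst struct, the OP_*
 * opcodes, MAX_LEN and compact() are invented for illustration; only the
 * algorithm mirrors compute_used() and make_executor_from_uops() above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_LEN 64   /* stand-in for _Py_UOP_MAX_TRACE_LENGTH */
#define SET_BIT(a, b)    ((a)[(b)>>5] |= (1u<<((b)&31)))
#define UNSET_BIT(a, b)  ((a)[(b)>>5] &= ~(1u<<((b)&31)))
#define BIT_IS_SET(a, b) ((a)[(b)>>5] & (1u<<((b)&31)))

/* Invented opcodes: OP_EXIT ends a trace, OP_JUMP_IF_FALSE jumps forward. */
enum { OP_NOP, OP_WORK, OP_JUMP_IF_FALSE, OP_EXIT };

typedef struct { int opcode; int oparg; } Inst;

/* Forward pass: mark reachable, non-NOP instructions and count them.
 * Every reachable path must end in OP_EXIT (as CPython traces end in
 * _EXIT_TRACE or _JUMP_TO_TOP), so i+1 never runs past the buffer. */
static int
compute_used(Inst *buf, uint32_t *used)
{
    int count = 0;
    SET_BIT(used, 0);
    for (int i = 0; i < MAX_LEN; i++) {
        if (!BIT_IS_SET(used, i)) {
            continue;
        }
        count++;
        if (buf[i].opcode == OP_EXIT) {
            continue;                        /* does not fall through */
        }
        SET_BIT(used, i + 1);                /* fall-through successor */
        if (buf[i].opcode == OP_NOP) {
            count--;                         /* reachable but not copied */
            UNSET_BIT(used, i);
        }
        else if (buf[i].opcode == OP_JUMP_IF_FALSE) {
            SET_BIT(used, buf[i].oparg);     /* forward jump target */
        }
    }
    return count;
}

/* Backward pass: copy used instructions into the dense array `out`,
 * rewriting jump opargs to the targets' new offsets. Returns the new
 * length. */
static int
compact(Inst *buf, Inst *out)
{
    uint32_t used[(MAX_LEN + 31)/32] = { 0 };
    int length = compute_used(buf, used);
    int dest = length - 1;
    for (int i = MAX_LEN - 1; i >= 0; i--) {
        if (!BIT_IS_SET(used, i)) {
            continue;
        }
        out[dest] = buf[i];
        if (out[dest].opcode == OP_JUMP_IF_FALSE) {
            /* The target was visited first, so buf[target].oparg
             * already holds the target's new offset. */
            out[dest].oparg = buf[out[dest].oparg].oparg;
        }
        buf[i].oparg = dest;                 /* record this uop's new offset */
        dest--;
    }
    assert(dest == -1);
    return length;
}

int
main(void)
{
    /* WORK, NOP, JUMP->5, WORK, NOP, EXIT; the NOPs disappear and the
     * jump is retargeted from old offset 5 to new offset 3. */
    Inst buf[MAX_LEN] = {
        {OP_WORK, 0}, {OP_NOP, 0}, {OP_JUMP_IF_FALSE, 5},
        {OP_WORK, 0}, {OP_NOP, 0}, {OP_EXIT, 0},
    };
    Inst out[MAX_LEN];
    int n = compact(buf, out);
    for (int i = 0; i < n; i++) {
        printf("%d: opcode=%d oparg=%d\n", i, out[i].opcode, out[i].oparg);
    }
    return 0;
}

The backward scan is what allows single-pass jump retargeting: every jump in a trace is forward, so a jump's target sits at a higher index and has already been assigned its new offset by the time the jump itself is copied. That is the invariant the comment in make_executor_from_uops() relies on.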