author     Mark Shannon <mark@hotpy.org>            2023-11-06 11:28:52 (GMT)
committer  GitHub <noreply@github.com>              2023-11-06 11:28:52 (GMT)
commit     d78c872e0d680f6e63afa6661df5021775a03690 (patch)
tree       2be2c8bd0caeedc4848e7919894cb0a26ece32a3 /Python/optimizer.c
parent     c8faa3568afd255708096f6aa8df0afa80cf7697 (diff)
GH-111646: Simplify optimizer by compacting uops when making executor. (GH-111647)
Diffstat (limited to 'Python/optimizer.c')
-rw-r--r--  Python/optimizer.c  201
1 file changed, 87 insertions, 114 deletions
diff --git a/Python/optimizer.c b/Python/optimizer.c
index 0e5b437..a332fd1 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -384,34 +384,12 @@ PyTypeObject _PyUOpExecutor_Type = {
.tp_methods = executor_methods,
};
-static int
-move_stubs(
- _PyUOpInstruction *trace,
- int trace_length,
- int stubs_start,
- int stubs_end
-)
-{
- memmove(trace + trace_length,
- trace + stubs_start,
- (stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
- // Patch up the jump targets
- for (int i = 0; i < trace_length; i++) {
- if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
- trace[i].opcode == _POP_JUMP_IF_TRUE)
- {
- int target = trace[i].oparg;
- if (target >= stubs_start) {
- target += trace_length - stubs_start;
- trace[i].oparg = target;
- }
- }
- }
- return trace_length + stubs_end - stubs_start;
-}
-
#define TRACE_STACK_SIZE 5
+/* Returns 1 on success,
+ * 0 if it failed to produce a worthwhile trace,
+ * and -1 on an error.
+ */
static int
translate_bytecode_to_trace(
PyCodeObject *code,
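
With move_stubs() gone, translate_bytecode_to_trace() no longer reports a
trace length; it reports only whether translation succeeded, and the final
length is recomputed later from the buffer itself. A minimal sketch of how a
caller treats the tri-state result (this mirrors the updated uop_optimize at
the bottom of this diff; the variable names are illustrative):

    _PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
    int err = translate_bytecode_to_trace(code, instr, buffer,
                                          _Py_UOP_MAX_TRACE_LENGTH,
                                          &dependencies);
    if (err <= 0) {
        return err;  /* -1: error; 0: no worthwhile trace was produced */
    }
    /* err == 1: buffer holds a trace, possibly with NOPs and unused gaps */
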
@@ -790,7 +768,7 @@ done:
}
assert(code == initial_code);
// Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
- if (trace_length > 3) {
+ if (trace_length > 4) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
DPRINTF(1,
"Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,25 +778,8 @@ done:
2 * INSTR_IP(initial_instr, code),
trace_length,
buffer_size - max_length);
- if (max_length < buffer_size) {
- // There are stubs
- if (trace_length < max_length) {
- // There's a gap before the stubs
- // Move the stubs back to be immediately after the main trace
- // (which ends at trace_length)
- DPRINTF(2,
- "Moving %d stub uops back by %d\n",
- buffer_size - max_length,
- max_length - trace_length);
- trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
- }
- else {
- assert(trace_length == max_length);
- // There's no gap
- trace_length = buffer_size;
- }
- }
- return trace_length;
+ OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist);
+ return 1;
}
else {
OPT_STAT_INC(trace_too_short);
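
The "%d+%d" in the DPRINTF above reflects the buffer's two-ended layout: the
main trace grows forward from index 0 (up to trace_length), while side-exit
stubs are allocated backward from the end of the buffer (max_length marks the
start of the stub region), so buffer_size - max_length is the stub count. A
sketch of the layout this code describes (my reading of the surrounding
translator, not part of the diff):

    /*
     *  index: 0 ......... trace_length ........ max_length ...... buffer_size
     *         [ main trace ][        gap       ][         stubs            ]
     *
     * Previously move_stubs() closed the gap eagerly; now the whole buffer,
     * gap included, is handed to make_executor_from_uops() for compaction.
     */
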
@@ -838,70 +799,84 @@ done:
#undef DPRINTF
}
+#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
+#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
+#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
+
+/* Count the number of used uops, and mark them in the bit vector `used`.
+ * This can be done in a single pass using simple reachability analysis,
+ * as there are no backward jumps.
+ * NOPs are excluded from the count.
+*/
static int
-remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
+compute_used(_PyUOpInstruction *buffer, uint32_t *used)
{
- // Stage 1: Replace unneeded _SET_IP uops with NOP.
- // Note that we don't enter stubs, those SET_IPs are needed.
- int last_set_ip = -1;
- int last_instr = 0;
- bool need_ip = true;
- for (int pc = 0; pc < trace_length; pc++) {
- int opcode = trace[pc].opcode;
- if (opcode == _SET_IP) {
- if (!need_ip && last_set_ip >= 0) {
- trace[last_set_ip].opcode = NOP;
- }
- need_ip = false;
- last_set_ip = pc;
+ int count = 0;
+ SET_BIT(used, 0);
+ for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
+ if (!BIT_IS_SET(used, i)) {
+ continue;
}
- else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
- last_instr = pc + 1;
- break;
+ count++;
+ int opcode = buffer[i].opcode;
+ if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+ continue;
}
- else {
- // If opcode has ERROR or DEOPT, set need_ip to true
- if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
- need_ip = true;
- }
+ /* All other micro-ops fall through, so i+1 is reachable */
+ SET_BIT(used, i+1);
+ switch(opcode) {
+ case NOP:
+ /* Don't count NOPs as used */
+ count--;
+ UNSET_BIT(used, i);
+ break;
+ case _POP_JUMP_IF_FALSE:
+ case _POP_JUMP_IF_TRUE:
+ /* Mark target as reachable */
+ SET_BIT(used, buffer[i].oparg);
}
}
- // Stage 2: Squash NOP opcodes (pre-existing or set above).
- int dest = 0;
- for (int pc = 0; pc < last_instr; pc++) {
- int opcode = trace[pc].opcode;
- if (opcode != NOP) {
- if (pc != dest) {
- trace[dest] = trace[pc];
- }
- dest++;
- }
+ return count;
+}
+
+/* Makes an executor from a buffer of uops.
+ * Account for the buffer having gaps and NOPs by computing a "used"
+ * bit vector and only copying the used uops. Here "used" means reachable
+ * and not a NOP.
+ */
+static _PyExecutorObject *
+make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
+{
+ uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
+ int length = compute_used(buffer, used);
+ _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
+ if (executor == NULL) {
+ return NULL;
}
- // Stage 3: Move the stubs back.
- if (dest < last_instr) {
- int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
-#ifdef Py_DEBUG
- char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
- int lltrace = 0;
- if (python_lltrace != NULL && *python_lltrace >= '0') {
- lltrace = *python_lltrace - '0'; // TODO: Parse an int and all that
+ int dest = length - 1;
+ /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
+ for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
+ if (!BIT_IS_SET(used, i)) {
+ continue;
}
- if (lltrace >= 2) {
- printf("Optimized trace (length %d+%d = %d, saved %d):\n",
- dest, trace_length - last_instr, new_trace_length,
- trace_length - new_trace_length);
- for (int pc = 0; pc < new_trace_length; pc++) {
- printf("%4d: (%s, %d, %" PRIu64 ")\n",
- pc,
- uop_name(trace[pc].opcode),
- (trace[pc].oparg),
- (uint64_t)(trace[pc].operand));
- }
+ executor->trace[dest] = buffer[i];
+ int opcode = buffer[i].opcode;
+ if (opcode == _POP_JUMP_IF_FALSE ||
+ opcode == _POP_JUMP_IF_TRUE)
+ {
+ /* The oparg of the target will already have been set to its new offset */
+ int oparg = executor->trace[dest].oparg;
+ executor->trace[dest].oparg = buffer[oparg].oparg;
}
-#endif
- trace_length = new_trace_length;
+ /* Set the oparg to be the destination offset,
+ * so that we can set the oparg of earlier jumps correctly. */
+ buffer[i].oparg = dest;
+ dest--;
}
- return trace_length;
+ assert(dest == -1);
+ executor->base.execute = _PyUopExecute;
+ _Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
+ return (_PyExecutorObject *)executor;
}
static int
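
The three bit macros above implement an ordinary uint32_t bitset: bit b lives
in word b>>5 (b/32) at position b&31 (b%32), which is why compute_used's
vector needs (_Py_UOP_MAX_TRACE_LENGTH + 31)/32 words. A self-contained
sketch of the same macros outside the interpreter:

    #include <assert.h>
    #include <stdint.h>

    #define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
    #define SET_BIT(array, bit)   (array[(bit)>>5] |=  (1<<((bit)&31)))
    #define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))

    int main(void)
    {
        /* Enough words for 100 bits: (100 + 31) / 32 == 4 words. */
        uint32_t used[(100 + 31) / 32] = { 0 };
        SET_BIT(used, 0);    /* word 0 (0>>5), bit 0 (0&31) */
        SET_BIT(used, 37);   /* word 1 (37>>5), bit 5 (37&31) */
        assert(BIT_IS_SET(used, 0));
        assert(BIT_IS_SET(used, 37));
        assert(!BIT_IS_SET(used, 36));
        UNSET_BIT(used, 37);
        assert(!BIT_IS_SET(used, 37));
        return 0;
    }

The backward scan in make_executor_from_uops is what makes the one-pass jump
retargeting work: when a used uop is copied into slot dest, dest is stored
back into buffer[i].oparg, so by the time the scan reaches a forward jump
(which sits at a smaller index than its target), buffer[target].oparg already
holds the target's post-compaction offset. A worked example on a hypothetical
six-uop buffer (indices and opcodes invented for illustration):

    /*
     *   i : opcode                oparg        used?
     *   0 : _SET_IP               ...          yes
     *   1 : _POP_JUMP_IF_FALSE    4 (target)   yes
     *   2 : NOP                   -            no (bit unset, not counted)
     *   3 : _EXIT_TRACE           -            yes
     *   4 : _SET_IP               ...          yes (stub, reached via jump)
     *   5 : _EXIT_TRACE           -            yes
     *
     * compute_used returns length == 5, so dest starts at 4 and i runs 5..0:
     *   i=5: copy to trace[4]; buffer[5].oparg = 4
     *   i=4: copy to trace[3]; buffer[4].oparg = 3
     *   i=3: copy to trace[2]; buffer[3].oparg = 2
     *   i=2: bit not set, skipped
     *   i=1: copy to trace[1]; it is a jump, so its oparg (4) is rewritten
     *        to buffer[4].oparg == 3, the target's new offset
     *   i=0: copy to trace[0]; dest is now -1 and the assert holds
     */
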
@@ -914,28 +889,26 @@ uop_optimize(
{
_PyBloomFilter dependencies;
_Py_BloomFilter_Init(&dependencies);
- _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
- int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
- if (trace_length <= 0) {
+ _PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
+ int err = translate_bytecode_to_trace(code, instr, buffer, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
+ if (err <= 0) {
// Error or nothing translated
- return trace_length;
+ return err;
}
- OPT_HIST(trace_length, trace_length_hist);
OPT_STAT_INC(traces_created);
char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
- if (uop_optimize != NULL && *uop_optimize > '0') {
- trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
+ if (uop_optimize == NULL || *uop_optimize > '0') {
+ err = _Py_uop_analyze_and_optimize(code, buffer, _Py_UOP_MAX_TRACE_LENGTH, curr_stackentries);
+ if (err < 0) {
+ return -1;
+ }
}
- trace_length = remove_unneeded_uops(trace, trace_length);
- _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
+ _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
if (executor == NULL) {
return -1;
}
- OPT_HIST(trace_length, optimized_trace_length_hist);
- executor->base.execute = _PyUopExecute;
- memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
- _Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
- *exec_ptr = (_PyExecutorObject *)executor;
+ OPT_HIST(Py_SIZE(executor), optimized_trace_length_hist);
+ *exec_ptr = executor;
return 1;
}
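
Note also that the PYTHONUOPSOPTIMIZE check is inverted: the
analyze-and-optimize pass used to be opt-in (run only when the variable was
set to something above '0') and is now opt-out (run unless it is set to '0'
or lower). The two predicates on sample values:

    /*
     *  Py_GETENV result   old: s != NULL && *s > '0'   new: s == NULL || *s > '0'
     *  NULL (unset)       pass skipped                 pass runs
     *  "0"                pass skipped                 pass skipped
     *  "1"                pass runs                    pass runs
     */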