summaryrefslogtreecommitdiffstats
path: root/Python/optimizer.c
diff options
context:
space:
mode:
authorMark Shannon <mark@hotpy.org>2024-02-20 09:39:55 (GMT)
committerGitHub <noreply@github.com>2024-02-20 09:39:55 (GMT)
commit7b21403ccd16c480812a1e857c0ee2deca592be0 (patch)
treeae54fc68fe298bea063502adff747e3ac1dd734d /Python/optimizer.c
parentacda1757bc682922292215906459c2735ee99c04 (diff)
downloadcpython-7b21403ccd16c480812a1e857c0ee2deca592be0.zip
cpython-7b21403ccd16c480812a1e857c0ee2deca592be0.tar.gz
cpython-7b21403ccd16c480812a1e857c0ee2deca592be0.tar.bz2
GH-112354: Initial implementation of warm up on exits and trace-stitching (GH-114142)
Diffstat (limited to 'Python/optimizer.c')
-rw-r--r--Python/optimizer.c205
1 files changed, 165 insertions, 40 deletions
diff --git a/Python/optimizer.c b/Python/optimizer.c
index efa1968..acc1d54 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -2,6 +2,7 @@
#include "opcode.h"
#include "pycore_interp.h"
#include "pycore_bitutils.h" // _Py_popcount32()
+#include "pycore_object.h" // _PyObject_GC_UNTRACK()
#include "pycore_opcode_metadata.h" // _PyOpcode_OpName[]
#include "pycore_opcode_utils.h" // MAX_REAL_OPCODE
#include "pycore_optimizer.h" // _Py_uop_analyze_and_optimize()
@@ -128,10 +129,11 @@ static _PyOptimizerObject _PyOptimizer_Default = {
.optimize = never_optimize,
.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
.backedge_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
+ .side_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
};
static uint32_t
-shift_and_offset_threshold(uint16_t threshold)
+shift_and_offset_threshold(uint32_t threshold)
{
return (threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
}
@@ -140,41 +142,74 @@ _PyOptimizerObject *
PyUnstable_GetOptimizer(void)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
- if (interp->optimizer == &_PyOptimizer_Default) {
- return NULL;
- }
assert(interp->optimizer_backedge_threshold ==
shift_and_offset_threshold(interp->optimizer->backedge_threshold));
assert(interp->optimizer_resume_threshold ==
shift_and_offset_threshold(interp->optimizer->resume_threshold));
+ if (interp->optimizer == &_PyOptimizer_Default) {
+ return NULL;
+ }
Py_INCREF(interp->optimizer);
return interp->optimizer;
}
+static _PyExecutorObject *
+make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *dependencies);
+
+static int
+init_cold_exit_executor(_PyExecutorObject *executor, int oparg);
+
+static int cold_exits_initialized = 0;
+static _PyExecutorObject COLD_EXITS[UOP_MAX_TRACE_LENGTH] = { 0 };
+
+static const _PyBloomFilter EMPTY_FILTER = { 0 };
+
_PyOptimizerObject *
_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer)
{
if (optimizer == NULL) {
optimizer = &_PyOptimizer_Default;
}
+ else if (cold_exits_initialized == 0) {
+ cold_exits_initialized = 1;
+ for (int i = 0; i < UOP_MAX_TRACE_LENGTH; i++) {
+ if (init_cold_exit_executor(&COLD_EXITS[i], i)) {
+ return NULL;
+ }
+ }
+ }
_PyOptimizerObject *old = interp->optimizer;
+ if (old == NULL) {
+ old = &_PyOptimizer_Default;
+ }
Py_INCREF(optimizer);
interp->optimizer = optimizer;
interp->optimizer_backedge_threshold = shift_and_offset_threshold(optimizer->backedge_threshold);
interp->optimizer_resume_threshold = shift_and_offset_threshold(optimizer->resume_threshold);
+ interp->optimizer_side_threshold = optimizer->side_threshold;
+ if (optimizer == &_PyOptimizer_Default) {
+ assert(interp->optimizer_backedge_threshold > (1 << 16));
+ assert(interp->optimizer_resume_threshold > (1 << 16));
+ }
return old;
}
-void
+int
PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
_PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer);
- Py_DECREF(old);
+ Py_XDECREF(old);
+ return old == NULL ? -1 : 0;
}
+/* Returns 1 if optimized, 0 if not optimized, and -1 for an error.
+ * If optimized, *executor_ptr contains a new reference to the executor
+ */
int
-_PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer)
+_PyOptimizer_Optimize(
+ _PyInterpreterFrame *frame, _Py_CODEUNIT *start,
+ PyObject **stack_pointer, _PyExecutorObject **executor_ptr)
{
PyCodeObject *code = (PyCodeObject *)frame->f_executable;
assert(PyCode_Check(code));
@@ -183,12 +218,11 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject
return 0;
}
_PyOptimizerObject *opt = interp->optimizer;
- _PyExecutorObject *executor = NULL;
- int err = opt->optimize(opt, frame, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
+ int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
if (err <= 0) {
- assert(executor == NULL);
return err;
}
+ assert(*executor_ptr != NULL);
int index = get_index_for_executor(code, start);
if (index < 0) {
/* Out of memory. Don't raise and assume that the
@@ -197,11 +231,11 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject
* If an optimizer has already produced an executor,
* it might get confused by the executor disappearing,
* but there is not much we can do about that here. */
- Py_DECREF(executor);
+ Py_DECREF(*executor_ptr);
return 0;
}
- insert_executor(code, start, index, executor);
- Py_DECREF(executor);
+ insert_executor(code, start, index, *executor_ptr);
+ assert((*executor_ptr)->vm_data.valid);
return 1;
}
@@ -237,11 +271,12 @@ static PyMethodDef executor_methods[] = {
static void
uop_dealloc(_PyExecutorObject *self) {
+ _PyObject_GC_UNTRACK(self);
_Py_ExecutorClear(self);
#ifdef _Py_JIT
_PyJIT_Free(self);
#endif
- PyObject_Free(self);
+ PyObject_GC_Del(self);
}
const char *
@@ -253,7 +288,7 @@ _PyUOpName(int index)
static Py_ssize_t
uop_len(_PyExecutorObject *self)
{
- return Py_SIZE(self);
+ return self->code_size;
}
static PyObject *
@@ -292,15 +327,34 @@ PySequenceMethods uop_as_sequence = {
.sq_item = (ssizeargfunc)uop_item,
};
+static int
+executor_clear(PyObject *o)
+{
+ _Py_ExecutorClear((_PyExecutorObject *)o);
+ return 0;
+}
+
+static int
+executor_traverse(PyObject *o, visitproc visit, void *arg)
+{
+ _PyExecutorObject *executor = (_PyExecutorObject *)o;
+ for (uint32_t i = 0; i < executor->exit_count; i++) {
+ Py_VISIT(executor->exits[i].executor);
+ }
+ return 0;
+}
+
PyTypeObject _PyUOpExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_executor",
- .tp_basicsize = offsetof(_PyExecutorObject, trace),
- .tp_itemsize = sizeof(_PyUOpInstruction),
- .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
+ .tp_basicsize = offsetof(_PyExecutorObject, exits),
+ .tp_itemsize = 1,
+ .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
.tp_dealloc = (destructor)uop_dealloc,
.tp_as_sequence = &uop_as_sequence,
.tp_methods = executor_methods,
+ .tp_traverse = executor_traverse,
+ .tp_clear = executor_clear,
};
/* TO DO -- Generate these tables */
@@ -324,6 +378,7 @@ BRANCH_TO_GUARD[4][2] = {
[POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NOT_NONE_POP,
};
+
#define CONFIDENCE_RANGE 1000
#define CONFIDENCE_CUTOFF 333
@@ -726,9 +781,10 @@ done:
* NOPs are excluded from the count.
*/
static int
-compute_used(_PyUOpInstruction *buffer, uint32_t *used)
+compute_used(_PyUOpInstruction *buffer, uint32_t *used, int *exit_count_ptr)
{
int count = 0;
+ int exit_count = 0;
SET_BIT(used, 0);
for (int i = 0; i < UOP_MAX_TRACE_LENGTH; i++) {
if (!BIT_IS_SET(used, i)) {
@@ -736,6 +792,9 @@ compute_used(_PyUOpInstruction *buffer, uint32_t *used)
}
count++;
int opcode = buffer[i].opcode;
+ if (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) {
+ exit_count++;
+ }
if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
continue;
}
@@ -751,44 +810,76 @@ compute_used(_PyUOpInstruction *buffer, uint32_t *used)
UNSET_BIT(used, i);
}
}
+ *exit_count_ptr = exit_count;
return count;
}
+/* Executor side exits */
+
+static _PyExecutorObject *
+allocate_executor(int exit_count, int length)
+{
+ int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction);
+ _PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size);
+ if (res == NULL) {
+ return NULL;
+ }
+ res->trace = (_PyUOpInstruction *)(res->exits + exit_count);
+ res->code_size = length;
+ res->exit_count = exit_count;
+ return res;
+}
+
/* Makes an executor from a buffer of uops.
* Account for the buffer having gaps and NOPs by computing a "used"
* bit vector and only copying the used uops. Here "used" means reachable
* and not a NOP.
*/
static _PyExecutorObject *
-make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
+make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *dependencies)
{
uint32_t used[(UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
- int length = compute_used(buffer, used);
- _PyExecutorObject *executor = PyObject_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, length);
+ int exit_count;
+ int length = compute_used(buffer, used, &exit_count);
+ _PyExecutorObject *executor = allocate_executor(exit_count, length+1);
if (executor == NULL) {
return NULL;
}
- int dest = length - 1;
+ /* Initialize exits */
+ for (int i = 0; i < exit_count; i++) {
+ executor->exits[i].executor = &COLD_EXITS[i];
+ executor->exits[i].temperature = 0;
+ }
+ int next_exit = exit_count-1;
+ _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
/* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
for (int i = UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
if (!BIT_IS_SET(used, i)) {
continue;
}
- executor->trace[dest] = buffer[i];
+ *dest = buffer[i];
int opcode = buffer[i].opcode;
if (opcode == _POP_JUMP_IF_FALSE ||
opcode == _POP_JUMP_IF_TRUE)
{
/* The oparg of the target will already have been set to its new offset */
- int oparg = executor->trace[dest].oparg;
- executor->trace[dest].oparg = buffer[oparg].oparg;
+ int oparg = dest->oparg;
+ dest->oparg = buffer[oparg].oparg;
+ }
+ if (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) {
+ executor->exits[next_exit].target = buffer[i].target;
+ dest->exit_index = next_exit;
+ next_exit--;
}
/* Set the oparg to be the destination offset,
* so that we can set the oparg of earlier jumps correctly. */
- buffer[i].oparg = dest;
+ buffer[i].oparg = (uint16_t)(dest - executor->trace);
dest--;
}
- assert(dest == -1);
+ assert(next_exit == -1);
+ assert(dest == executor->trace);
+ dest->opcode = _START_EXECUTOR;
+ dest->operand = (uintptr_t)executor;
_Py_ExecutorInit(executor, dependencies);
#ifdef Py_DEBUG
char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
@@ -811,15 +902,41 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
#ifdef _Py_JIT
executor->jit_code = NULL;
executor->jit_size = 0;
- if (_PyJIT_Compile(executor, executor->trace, Py_SIZE(executor))) {
+ if (_PyJIT_Compile(executor, executor->trace, length+1)) {
Py_DECREF(executor);
return NULL;
}
#endif
+ _PyObject_GC_TRACK(executor);
return executor;
}
static int
+init_cold_exit_executor(_PyExecutorObject *executor, int oparg)
+{
+ _Py_SetImmortal(executor);
+ Py_SET_TYPE(executor, &_PyUOpExecutor_Type);
+ executor->trace = (_PyUOpInstruction *)executor->exits;
+ executor->code_size = 1;
+ executor->exit_count = 0;
+ _PyUOpInstruction *inst = (_PyUOpInstruction *)&executor->trace[0];
+ inst->opcode = _COLD_EXIT;
+ inst->oparg = oparg;
+ executor->vm_data.valid = true;
+ for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
+ assert(executor->vm_data.bloom.bits[i] == 0);
+ }
+#ifdef _Py_JIT
+ executor->jit_code = NULL;
+ executor->jit_size = 0;
+ if (_PyJIT_Compile(executor, executor->trace, 1)) {
+ return -1;
+ }
+#endif
+ return 0;
+}
+
+static int
uop_optimize(
_PyOptimizerObject *self,
_PyInterpreterFrame *frame,
@@ -880,13 +997,15 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
opt->resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
// Need a few iterations to settle specializations,
// and to ammortize the cost of optimization.
+ opt->side_threshold = 16;
opt->backedge_threshold = 16;
return (PyObject *)opt;
}
static void
counter_dealloc(_PyExecutorObject *self) {
- PyObject *opt = (PyObject *)self->trace[0].operand;
+ /* The optimizer is the operand of the second uop. */
+ PyObject *opt = (PyObject *)self->trace[1].operand;
Py_DECREF(opt);
uop_dealloc(self);
}
@@ -894,11 +1013,13 @@ counter_dealloc(_PyExecutorObject *self) {
PyTypeObject _PyCounterExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "counting_executor",
- .tp_basicsize = offsetof(_PyExecutorObject, trace),
- .tp_itemsize = sizeof(_PyUOpInstruction),
- .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
+ .tp_basicsize = offsetof(_PyExecutorObject, exits),
+ .tp_itemsize = 1,
+ .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
.tp_dealloc = (destructor)counter_dealloc,
.tp_methods = executor_methods,
+ .tp_traverse = executor_traverse,
+ .tp_clear = executor_clear,
};
static int
@@ -926,9 +1047,7 @@ counter_optimize(
{ .opcode = _INTERNAL_INCREMENT_OPT_COUNTER },
{ .opcode = _EXIT_TRACE, .target = (uint32_t)(target - _PyCode_CODE(code)) }
};
- _PyBloomFilter empty;
- _Py_BloomFilter_Init(&empty);
- _PyExecutorObject *executor = make_executor_from_uops(buffer, &empty);
+ _PyExecutorObject *executor = make_executor_from_uops(buffer, &EMPTY_FILTER);
if (executor == NULL) {
return -1;
}
@@ -968,6 +1087,7 @@ PyUnstable_Optimizer_NewCounter(void)
}
opt->base.optimize = counter_optimize;
opt->base.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
+ opt->base.side_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
opt->base.backedge_threshold = 0;
opt->count = 0;
return (PyObject *)opt;
@@ -1091,9 +1211,6 @@ link_executor(_PyExecutorObject *executor)
static void
unlink_executor(_PyExecutorObject *executor)
{
- if (!executor->vm_data.valid) {
- return;
- }
_PyExecutorLinkListNode *links = &executor->vm_data.links;
_PyExecutorObject *next = links->next;
_PyExecutorObject *prev = links->previous;
@@ -1114,7 +1231,7 @@ unlink_executor(_PyExecutorObject *executor)
/* This must be called by optimizers before using the executor */
void
-_Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set)
+_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set)
{
executor->vm_data.valid = true;
for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
@@ -1127,11 +1244,19 @@ _Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set)
void
_Py_ExecutorClear(_PyExecutorObject *executor)
{
+ if (!executor->vm_data.valid) {
+ return;
+ }
unlink_executor(executor);
PyCodeObject *code = executor->vm_data.code;
if (code == NULL) {
return;
}
+ for (uint32_t i = 0; i < executor->exit_count; i++) {
+ Py_DECREF(executor->exits[i].executor);
+ executor->exits[i].executor = &COLD_EXITS[i];
+ executor->exits[i].temperature = INT16_MIN;
+ }
_Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index];
assert(instruction->op.code == ENTER_EXECUTOR);
int index = instruction->op.arg;