Diffstat (limited to 'Python')
 Python/bytecodes.c                           | 122
 Python/ceval.c                               |  47
 Python/ceval_macros.h                        |  31
 Python/executor_cases.c.h                    |  96
 Python/generated_cases.c.h                   |  31
 Python/jit.c                                 |  13
 Python/optimizer.c                           | 205
 Python/pylifecycle.c                         |   4
 Python/pystate.c                             |   2
 Python/tier2_engine.md                       | 150
 Python/tier2_redundancy_eliminator_cases.c.h |  14
 11 files changed, 580 insertions, 135 deletions
diff --git a/Python/bytecodes.c b/Python/bytecodes.c
index 6822e77..2e0008e 100644
--- a/Python/bytecodes.c
+++ b/Python/bytecodes.c
@@ -340,12 +340,12 @@ dummy_func(
         macro(TO_BOOL) = _SPECIALIZE_TO_BOOL + unused/2 + _TO_BOOL;
 
         inst(TO_BOOL_BOOL, (unused/1, unused/2, value -- value)) {
-            DEOPT_IF(!PyBool_Check(value));
+            EXIT_IF(!PyBool_Check(value));
             STAT_INC(TO_BOOL, hit);
         }
 
         inst(TO_BOOL_INT, (unused/1, unused/2, value -- res)) {
-            DEOPT_IF(!PyLong_CheckExact(value));
+            EXIT_IF(!PyLong_CheckExact(value));
             STAT_INC(TO_BOOL, hit);
             if (_PyLong_IsZero((PyLongObject *)value)) {
                 assert(_Py_IsImmortal(value));
@@ -358,7 +358,7 @@ dummy_func(
         }
 
         inst(TO_BOOL_LIST, (unused/1, unused/2, value -- res)) {
-            DEOPT_IF(!PyList_CheckExact(value));
+            EXIT_IF(!PyList_CheckExact(value));
             STAT_INC(TO_BOOL, hit);
             res = Py_SIZE(value) ? Py_True : Py_False;
             DECREF_INPUTS();
@@ -366,13 +366,13 @@ dummy_func(
 
         inst(TO_BOOL_NONE, (unused/1, unused/2, value -- res)) {
             // This one is a bit weird, because we expect *some* failures:
-            DEOPT_IF(!Py_IsNone(value));
+            EXIT_IF(!Py_IsNone(value));
             STAT_INC(TO_BOOL, hit);
             res = Py_False;
         }
 
         inst(TO_BOOL_STR, (unused/1, unused/2, value -- res)) {
-            DEOPT_IF(!PyUnicode_CheckExact(value));
+            EXIT_IF(!PyUnicode_CheckExact(value));
             STAT_INC(TO_BOOL, hit);
             if (value == &_Py_STR(empty)) {
                 assert(_Py_IsImmortal(value));
@@ -388,7 +388,7 @@ dummy_func(
         inst(TO_BOOL_ALWAYS_TRUE, (unused/1, version/2, value -- res)) {
             // This one is a bit weird, because we expect *some* failures:
             assert(version);
-            DEOPT_IF(Py_TYPE(value)->tp_version_tag != version);
+            EXIT_IF(Py_TYPE(value)->tp_version_tag != version);
             STAT_INC(TO_BOOL, hit);
             DECREF_INPUTS();
             res = Py_True;
@@ -412,8 +412,8 @@ dummy_func(
         };
 
         op(_GUARD_BOTH_INT, (left, right -- left, right)) {
-            DEOPT_IF(!PyLong_CheckExact(left));
-            DEOPT_IF(!PyLong_CheckExact(right));
+            EXIT_IF(!PyLong_CheckExact(left));
+            EXIT_IF(!PyLong_CheckExact(right));
         }
 
         pure op(_BINARY_OP_MULTIPLY_INT, (left, right -- res)) {
@@ -448,8 +448,8 @@ dummy_func(
             _GUARD_BOTH_INT + unused/1 + _BINARY_OP_SUBTRACT_INT;
 
         op(_GUARD_BOTH_FLOAT, (left, right -- left, right)) {
-            DEOPT_IF(!PyFloat_CheckExact(left));
-            DEOPT_IF(!PyFloat_CheckExact(right));
+            EXIT_IF(!PyFloat_CheckExact(left));
+            EXIT_IF(!PyFloat_CheckExact(right));
         }
 
         pure op(_BINARY_OP_MULTIPLY_FLOAT, (left, right -- res)) {
@@ -484,8 +484,8 @@ dummy_func(
             _GUARD_BOTH_FLOAT + unused/1 + _BINARY_OP_SUBTRACT_FLOAT;
 
         op(_GUARD_BOTH_UNICODE, (left, right -- left, right)) {
-            DEOPT_IF(!PyUnicode_CheckExact(left));
-            DEOPT_IF(!PyUnicode_CheckExact(right));
+            EXIT_IF(!PyUnicode_CheckExact(left));
+            EXIT_IF(!PyUnicode_CheckExact(right));
         }
 
         pure op(_BINARY_OP_ADD_UNICODE, (left, right -- res)) {
@@ -1904,7 +1904,7 @@ dummy_func(
         op(_GUARD_TYPE_VERSION, (type_version/2, owner -- owner)) {
             PyTypeObject *tp = Py_TYPE(owner);
             assert(type_version != 0);
-            DEOPT_IF(tp->tp_version_tag != type_version);
+            EXIT_IF(tp->tp_version_tag != type_version);
         }
 
         op(_CHECK_MANAGED_OBJECT_HAS_VALUES, (owner -- owner)) {
@@ -2314,6 +2314,7 @@ dummy_func(
         }
 
         inst(JUMP_BACKWARD, (unused/1 --)) {
+            TIER_ONE_ONLY
             CHECK_EVAL_BREAKER();
             assert(oparg <= INSTR_OFFSET());
             JUMPBY(-oparg);
@@ -2335,13 +2336,13 @@ dummy_func(
                     oparg >>= 8;
                     start--;
                 }
-                int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer);
+                _PyExecutorObject *executor;
+                int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor);
                 ERROR_IF(optimized < 0, error);
                 if (optimized) {
-                    // Rewind and enter the executor:
-                    assert(start->op.code == ENTER_EXECUTOR);
-                    next_instr = start;
-                    this_instr[1].cache &= OPTIMIZER_BITS_MASK;
+                    assert(tstate->previous_executor == NULL);
+                    tstate->previous_executor = Py_None;
+                    GOTO_TIER_TWO(executor);
                 }
                 else {
                     int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK;
@@ -2371,14 +2372,15 @@ dummy_func(
         inst(ENTER_EXECUTOR, (--)) {
             TIER_ONE_ONLY
             CHECK_EVAL_BREAKER();
-
             PyCodeObject *code = _PyFrame_GetCode(frame);
-            current_executor = code->co_executors->executors[oparg & 255];
-            assert(current_executor->vm_data.index == INSTR_OFFSET() - 1);
-            assert(current_executor->vm_data.code == code);
-            assert(current_executor->vm_data.valid);
-            Py_INCREF(current_executor);
-            GOTO_TIER_TWO();
+            _PyExecutorObject *executor = code->co_executors->executors[oparg & 255];
+            assert(executor->vm_data.index == INSTR_OFFSET() - 1);
+            assert(executor->vm_data.code == code);
+            assert(executor->vm_data.valid);
+            assert(tstate->previous_executor == NULL);
+            tstate->previous_executor = Py_None;
+            Py_INCREF(executor);
+            GOTO_TIER_TWO(executor);
         }
 
         replaced op(_POP_JUMP_IF_FALSE, (cond -- )) {
@@ -3997,26 +3999,26 @@ dummy_func(
         inst(CACHE, (--)) {
             TIER_ONE_ONLY
             assert(0 && "Executing a cache.");
-            Py_UNREACHABLE();
+            Py_FatalError("Executing a cache.");
         }
 
         inst(RESERVED, (--)) {
             TIER_ONE_ONLY
             assert(0 && "Executing RESERVED instruction.");
-            Py_UNREACHABLE();
+            Py_FatalError("Executing RESERVED instruction.");
         }
 
         ///////// Tier-2 only opcodes /////////
 
         op (_GUARD_IS_TRUE_POP, (flag -- )) {
             SYNC_SP();
-            DEOPT_IF(!Py_IsTrue(flag));
+            EXIT_IF(!Py_IsTrue(flag));
             assert(Py_IsTrue(flag));
         }
 
         op (_GUARD_IS_FALSE_POP, (flag -- )) {
             SYNC_SP();
-            DEOPT_IF(!Py_IsFalse(flag));
+            EXIT_IF(!Py_IsFalse(flag));
             assert(Py_IsFalse(flag));
         }
 
@@ -4024,18 +4026,20 @@ dummy_func(
             SYNC_SP();
             if (!Py_IsNone(val)) {
                 Py_DECREF(val);
-                DEOPT_IF(1);
+                EXIT_IF(1);
             }
         }
 
         op (_GUARD_IS_NOT_NONE_POP, (val -- )) {
             SYNC_SP();
-            DEOPT_IF(Py_IsNone(val));
+            EXIT_IF(Py_IsNone(val));
             Py_DECREF(val);
         }
 
         op(_JUMP_TO_TOP, (--)) {
-            next_uop = current_executor->trace;
+#ifndef _Py_JIT
+            next_uop = &current_executor->trace[1];
+#endif
             CHECK_EVAL_BREAKER();
         }
 
@@ -4055,7 +4059,7 @@ dummy_func(
 
         op(_EXIT_TRACE, (--)) {
             TIER_TWO_ONLY
-            DEOPT_IF(1);
+            EXIT_IF(1);
         }
 
         op(_CHECK_VALIDITY, (--)) {
@@ -4101,6 +4105,58 @@ dummy_func(
             exe->count++;
         }
 
+        /* Only used for handling cold side exits, should never appear in
+         * a normal trace or as part of an instruction.
+         */
+        op(_COLD_EXIT, (--)) {
+            TIER_TWO_ONLY
+            _PyExecutorObject *previous = (_PyExecutorObject *)tstate->previous_executor;
+            _PyExitData *exit = &previous->exits[oparg];
+            exit->temperature++;
+            PyCodeObject *code = _PyFrame_GetCode(frame);
+            _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target;
+            if (exit->temperature < (int32_t)tstate->interp->optimizer_side_threshold) {
+                GOTO_TIER_ONE(target);
+            }
+            _PyExecutorObject *executor;
+            if (target->op.code == ENTER_EXECUTOR) {
+                executor = code->co_executors->executors[target->op.arg];
+                Py_INCREF(executor);
+            } else {
+                int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
+                if (optimized <= 0) {
+                    int32_t new_temp = -1 * tstate->interp->optimizer_side_threshold;
+                    exit->temperature = (new_temp < INT16_MIN) ? INT16_MIN : new_temp;
+                    if (optimized < 0) {
+                        Py_DECREF(previous);
+                        tstate->previous_executor = Py_None;
+                        ERROR_IF(1, error);
+                    }
+                    GOTO_TIER_ONE(target);
+                }
+            }
+            /* We need two references. One to store in exit->executor and
+             * one to keep the executor alive when executing. */
+            Py_INCREF(executor);
+            exit->executor = executor;
+            GOTO_TIER_TWO(executor);
+        }
+
+        op(_START_EXECUTOR, (executor/4 --)) {
+            TIER_TWO_ONLY
+            Py_DECREF(tstate->previous_executor);
+            tstate->previous_executor = NULL;
+#ifndef _Py_JIT
+            current_executor = (_PyExecutorObject*)executor;
+#endif
+        }
+
+        op(_FATAL_ERROR, (--)) {
+            TIER_TWO_ONLY
+            assert(0);
+            Py_FatalError("Fatal error uop executed.");
+        }
+
         op(_CHECK_VALIDITY_AND_SET_IP, (instr_ptr/4 --)) {
             TIER_TWO_ONLY
             DEOPT_IF(!current_executor->vm_data.valid);
diff --git a/Python/ceval.c b/Python/ceval.c
index 4f20800..adccf8f 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -16,6 +16,7 @@
 #include "pycore_moduleobject.h"  // PyModuleObject
 #include "pycore_object.h"        // _PyObject_GC_TRACK()
 #include "pycore_opcode_metadata.h" // EXTRA_CASES
+#include "pycore_optimizer.h"     // _PyUOpExecutor_Type
 #include "pycore_opcode_utils.h"  // MAKE_FUNCTION_*
 #include "pycore_pyerrors.h"      // _PyErr_GetRaisedException()
 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
@@ -738,15 +739,16 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         goto resume_with_error;
     }
 
-    /* State shared between Tier 1 and Tier 2 interpreter */
-    _PyExecutorObject *current_executor = NULL;
-
     /* Local "register" variables.
      * These are cached values from the frame and code object. */
-
     _Py_CODEUNIT *next_instr;
     PyObject **stack_pointer;
 
+#ifndef _Py_JIT
+    /* Tier 2 interpreter state */
+    _PyExecutorObject *current_executor = NULL;
+    const _PyUOpInstruction *next_uop = NULL;
+#endif
+
 start_frame:
     if (_Py_EnterRecursivePy(tstate)) {
@@ -960,18 +962,7 @@ resume_with_error:
 enter_tier_two:
 
 #ifdef _Py_JIT
-
-    ;  // ;)
-    jit_func jitted = current_executor->jit_code;
-    next_instr = jitted(frame, stack_pointer, tstate);
-    frame = tstate->current_frame;
-    Py_DECREF(current_executor);
-    if (next_instr == NULL) {
-        goto resume_with_error;
-    }
-    stack_pointer = _PyFrame_GetStackPointer(frame);
-    DISPATCH();
-
+    assert(0);
 #else
 
 #undef LOAD_IP
@@ -1007,12 +998,12 @@ enter_tier_two:
 #endif
 
     OPT_STAT_INC(traces_executed);
-    _PyUOpInstruction *next_uop = current_executor->trace;
     uint16_t uopcode;
 #ifdef Py_STATS
     uint64_t trace_uop_execution_counter = 0;
 #endif
 
+    assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT);
     for (;;) {
         uopcode = next_uop->opcode;
         DPRINTF(3,
@@ -1075,23 +1066,39 @@ error_tier_two:
     frame->return_offset = 0;  // Don't leave this random
     _PyFrame_SetStackPointer(frame, stack_pointer);
     Py_DECREF(current_executor);
+    tstate->previous_executor = NULL;
     goto resume_with_error;
 
 // Jump here from DEOPT_IF()
 deoptimize:
     next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame));
-    DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d @ %d -> %s]\n",
+    DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d -> %s]\n",
            uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, next_uop[-1].target,
-           (int)(next_uop - current_executor->trace - 1),
-           _PyOpcode_OpName[frame->instr_ptr->op.code]);
+           _PyOpcode_OpName[next_instr->op.code]);
     OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
     UOP_STAT_INC(uopcode, miss);
     Py_DECREF(current_executor);
+    tstate->previous_executor = NULL;
     DISPATCH();
 
+// Jump here from EXIT_IF()
+side_exit:
+    OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
+    UOP_STAT_INC(uopcode, miss);
+    uint32_t exit_index = next_uop[-1].exit_index;
+    assert(exit_index < current_executor->exit_count);
+    _PyExitData *exit = &current_executor->exits[exit_index];
+    DPRINTF(2, "SIDE EXIT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", exit %u, temp %d, target %d -> %s]\n",
+           uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, exit_index, exit->temperature,
+           exit->target, _PyOpcode_OpName[_PyCode_CODE(_PyFrame_GetCode(frame))[exit->target].op.code]);
+    Py_INCREF(exit->executor);
+    tstate->previous_executor = (PyObject *)current_executor;
+    GOTO_TIER_TWO(exit->executor);
+
 #endif  // _Py_JIT
 }
+
 #if defined(__GNUC__)
 #  pragma GCC diagnostic pop
 #elif defined(_MSC_VER) /* MS_WINDOWS */
diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h
index 1043966..f796b60 100644
--- a/Python/ceval_macros.h
+++ b/Python/ceval_macros.h
@@ -394,7 +394,36 @@ stack_pointer = _PyFrame_GetStackPointer(frame);
 
 /* Tier-switching macros. */
 
-#define GOTO_TIER_TWO() goto enter_tier_two;
+#ifdef _Py_JIT
+#define GOTO_TIER_TWO(EXECUTOR) \
+do { \
+    jit_func jitted = (EXECUTOR)->jit_code; \
+    next_instr = jitted(frame, stack_pointer, tstate); \
+    Py_DECREF(tstate->previous_executor); \
+    tstate->previous_executor = NULL; \
+    frame = tstate->current_frame; \
+    if (next_instr == NULL) { \
+        goto resume_with_error; \
+    } \
+    stack_pointer = _PyFrame_GetStackPointer(frame); \
+    DISPATCH(); \
+} while (0)
+#else
+#define GOTO_TIER_TWO(EXECUTOR) \
+do { \
+    next_uop = (EXECUTOR)->trace; \
+    assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \
+    goto enter_tier_two; \
+} while (0)
+#endif
+
+#define GOTO_TIER_ONE(TARGET) \
+do { \
+    Py_DECREF(tstate->previous_executor); \
+    tstate->previous_executor = NULL; \
+    next_instr = target; \
+    DISPATCH(); \
+} while (0)
 
 #define CURRENT_OPARG() (next_uop[-1].oparg)
 
diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h
index 11e2a1f..a18284d 100644
--- a/Python/executor_cases.c.h
+++ b/Python/executor_cases.c.h
@@ -141,7 +141,7 @@
         case _TO_BOOL_BOOL: {
             PyObject *value;
             value = stack_pointer[-1];
-            if (!PyBool_Check(value)) goto deoptimize;
+            if (!PyBool_Check(value)) goto side_exit;
             STAT_INC(TO_BOOL, hit);
             break;
         }
@@ -150,7 +150,7 @@
             PyObject *value;
             PyObject *res;
             value = stack_pointer[-1];
-            if (!PyLong_CheckExact(value)) goto deoptimize;
+            if (!PyLong_CheckExact(value)) goto side_exit;
             STAT_INC(TO_BOOL, hit);
             if (_PyLong_IsZero((PyLongObject *)value)) {
                 assert(_Py_IsImmortal(value));
@@ -168,7 +168,7 @@
             PyObject *value;
             PyObject *res;
             value = stack_pointer[-1];
-            if (!PyList_CheckExact(value)) goto deoptimize;
+            if (!PyList_CheckExact(value)) goto side_exit;
             STAT_INC(TO_BOOL, hit);
             res = Py_SIZE(value) ? Py_True : Py_False;
             Py_DECREF(value);
@@ -181,7 +181,7 @@
             PyObject *res;
             value = stack_pointer[-1];
             // This one is a bit weird, because we expect *some* failures:
-            if (!Py_IsNone(value)) goto deoptimize;
+            if (!Py_IsNone(value)) goto side_exit;
             STAT_INC(TO_BOOL, hit);
             res = Py_False;
             stack_pointer[-1] = res;
@@ -192,7 +192,7 @@
             PyObject *value;
             PyObject *res;
             value = stack_pointer[-1];
-            if (!PyUnicode_CheckExact(value)) goto deoptimize;
+            if (!PyUnicode_CheckExact(value)) goto side_exit;
             STAT_INC(TO_BOOL, hit);
             if (value == &_Py_STR(empty)) {
                 assert(_Py_IsImmortal(value));
@@ -214,7 +214,7 @@
             uint32_t version = (uint32_t)CURRENT_OPERAND();
             // This one is a bit weird, because we expect *some* failures:
             assert(version);
-            if (Py_TYPE(value)->tp_version_tag != version) goto deoptimize;
+            if (Py_TYPE(value)->tp_version_tag != version) goto side_exit;
             STAT_INC(TO_BOOL, hit);
             Py_DECREF(value);
             res = Py_True;
@@ -238,8 +238,8 @@
             PyObject *left;
             right = stack_pointer[-1];
             left = stack_pointer[-2];
-            if (!PyLong_CheckExact(left)) goto deoptimize;
-            if (!PyLong_CheckExact(right)) goto deoptimize;
+            if (!PyLong_CheckExact(left)) goto side_exit;
+            if (!PyLong_CheckExact(right)) goto side_exit;
             break;
         }
 
@@ -296,8 +296,8 @@
             PyObject *left;
             right = stack_pointer[-1];
             left = stack_pointer[-2];
-            if (!PyFloat_CheckExact(left)) goto deoptimize;
-            if (!PyFloat_CheckExact(right)) goto deoptimize;
+            if (!PyFloat_CheckExact(left)) goto side_exit;
+            if (!PyFloat_CheckExact(right)) goto side_exit;
             break;
         }
 
@@ -354,8 +354,8 @@
             PyObject *left;
             right = stack_pointer[-1];
             left = stack_pointer[-2];
-            if (!PyUnicode_CheckExact(left)) goto deoptimize;
-            if (!PyUnicode_CheckExact(right)) goto deoptimize;
+            if (!PyUnicode_CheckExact(left)) goto side_exit;
+            if (!PyUnicode_CheckExact(right)) goto side_exit;
             break;
         }
 
@@ -1623,7 +1623,7 @@
             uint32_t type_version = (uint32_t)CURRENT_OPERAND();
             PyTypeObject *tp = Py_TYPE(owner);
             assert(type_version != 0);
-            if (tp->tp_version_tag != type_version) goto deoptimize;
+            if (tp->tp_version_tag != type_version) goto side_exit;
             break;
         }
 
@@ -2013,8 +2013,6 @@
             break;
         }
 
-        /* _JUMP_BACKWARD is not a viable micro-op for tier 2 */
-
         /* _POP_JUMP_IF_FALSE is not a viable micro-op for tier 2 */
 
         /* _POP_JUMP_IF_TRUE is not a viable micro-op for tier 2 */
@@ -3318,7 +3316,7 @@
             PyObject *flag;
             flag = stack_pointer[-1];
             stack_pointer += -1;
-            if (!Py_IsTrue(flag)) goto deoptimize;
+            if (!Py_IsTrue(flag)) goto side_exit;
             assert(Py_IsTrue(flag));
             break;
         }
@@ -3327,7 +3325,7 @@
             PyObject *flag;
             flag = stack_pointer[-1];
             stack_pointer += -1;
-            if (!Py_IsFalse(flag)) goto deoptimize;
+            if (!Py_IsFalse(flag)) goto side_exit;
             assert(Py_IsFalse(flag));
             break;
         }
@@ -3338,7 +3336,7 @@
             stack_pointer += -1;
             if (!Py_IsNone(val)) {
                 Py_DECREF(val);
-                if (1) goto deoptimize;
+                if (1) goto side_exit;
             }
             break;
         }
@@ -3347,13 +3345,15 @@
             PyObject *val;
             val = stack_pointer[-1];
             stack_pointer += -1;
-            if (Py_IsNone(val)) goto deoptimize;
+            if (Py_IsNone(val)) goto side_exit;
             Py_DECREF(val);
             break;
         }
 
         case _JUMP_TO_TOP: {
-            next_uop = current_executor->trace;
+            #ifndef _Py_JIT
+            next_uop = &current_executor->trace[1];
+            #endif
             CHECK_EVAL_BREAKER();
             break;
         }
@@ -3378,7 +3378,7 @@
 
         case _EXIT_TRACE: {
             TIER_TWO_ONLY
-            if (1) goto deoptimize;
+            if (1) goto side_exit;
             break;
         }
 
@@ -3457,6 +3457,60 @@
             break;
         }
 
+        case _COLD_EXIT: {
+            oparg = CURRENT_OPARG();
+            TIER_TWO_ONLY
+            _PyExecutorObject *previous = (_PyExecutorObject *)tstate->previous_executor;
+            _PyExitData *exit = &previous->exits[oparg];
+            exit->temperature++;
+            PyCodeObject *code = _PyFrame_GetCode(frame);
+            _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target;
+            if (exit->temperature < (int32_t)tstate->interp->optimizer_side_threshold) {
+                GOTO_TIER_ONE(target);
+            }
+            _PyExecutorObject *executor;
+            if (target->op.code == ENTER_EXECUTOR) {
+                executor = code->co_executors->executors[target->op.arg];
+                Py_INCREF(executor);
+            } else {
+                int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
+                if (optimized <= 0) {
+                    int32_t new_temp = -1 * tstate->interp->optimizer_side_threshold;
+                    exit->temperature = (new_temp < INT16_MIN) ? INT16_MIN : new_temp;
+                    if (optimized < 0) {
+                        Py_DECREF(previous);
+                        tstate->previous_executor = Py_None;
+                        if (1) goto error_tier_two;
+                    }
+                    GOTO_TIER_ONE(target);
+                }
+            }
+            /* We need two references. One to store in exit->executor and
+             * one to keep the executor alive when executing. */
+            Py_INCREF(executor);
+            exit->executor = executor;
+            GOTO_TIER_TWO(executor);
+            break;
+        }
+
+        case _START_EXECUTOR: {
+            PyObject *executor = (PyObject *)CURRENT_OPERAND();
+            TIER_TWO_ONLY
+            Py_DECREF(tstate->previous_executor);
+            tstate->previous_executor = NULL;
+            #ifndef _Py_JIT
+            current_executor = (_PyExecutorObject*)executor;
+            #endif
+            break;
+        }
+
+        case _FATAL_ERROR: {
+            TIER_TWO_ONLY
+            assert(0);
+            Py_FatalError("Fatal error uop executed.");
+            break;
+        }
+
         case _CHECK_VALIDITY_AND_SET_IP: {
             PyObject *instr_ptr = (PyObject *)CURRENT_OPERAND();
             TIER_TWO_ONLY
diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h
index 6c19adc..a520d04 100644
--- a/Python/generated_cases.c.h
+++ b/Python/generated_cases.c.h
@@ -741,7 +741,8 @@
             INSTRUCTION_STATS(CACHE);
             TIER_ONE_ONLY
             assert(0 && "Executing a cache.");
-            Py_UNREACHABLE();
+            Py_FatalError("Executing a cache.");
+            DISPATCH();
         }
 
         TARGET(CALL) {
@@ -2369,12 +2370,14 @@
             TIER_ONE_ONLY
             CHECK_EVAL_BREAKER();
             PyCodeObject *code = _PyFrame_GetCode(frame);
-            current_executor = code->co_executors->executors[oparg & 255];
-            assert(current_executor->vm_data.index == INSTR_OFFSET() - 1);
-            assert(current_executor->vm_data.code == code);
-            assert(current_executor->vm_data.valid);
-            Py_INCREF(current_executor);
-            GOTO_TIER_TWO();
+            _PyExecutorObject *executor = code->co_executors->executors[oparg & 255];
+            assert(executor->vm_data.index == INSTR_OFFSET() - 1);
+            assert(executor->vm_data.code == code);
+            assert(executor->vm_data.valid);
+            assert(tstate->previous_executor == NULL);
+            tstate->previous_executor = Py_None;
+            Py_INCREF(executor);
+            GOTO_TIER_TWO(executor);
             DISPATCH();
         }
 
@@ -3262,6 +3265,7 @@
             next_instr += 2;
             INSTRUCTION_STATS(JUMP_BACKWARD);
             /* Skip 1 cache entry */
+            TIER_ONE_ONLY
             CHECK_EVAL_BREAKER();
             assert(oparg <= INSTR_OFFSET());
             JUMPBY(-oparg);
@@ -3283,13 +3287,13 @@
                     oparg >>= 8;
                     start--;
                 }
-                int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer);
+                _PyExecutorObject *executor;
+                int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor);
                 if (optimized < 0) goto error;
                 if (optimized) {
-                    // Rewind and enter the executor:
-                    assert(start->op.code == ENTER_EXECUTOR);
-                    next_instr = start;
-                    this_instr[1].cache &= OPTIMIZER_BITS_MASK;
+                    assert(tstate->previous_executor == NULL);
+                    tstate->previous_executor = Py_None;
+                    GOTO_TIER_TWO(executor);
                 }
                 else {
                     int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK;
@@ -4778,7 +4782,8 @@
             INSTRUCTION_STATS(RESERVED);
             TIER_ONE_ONLY
             assert(0 && "Executing RESERVED instruction.");
-            Py_UNREACHABLE();
+            Py_FatalError("Executing RESERVED instruction.");
+            DISPATCH();
         }
         TARGET(RESUME) {
diff --git a/Python/jit.c b/Python/jit.c
index 22949c0..839414b 100644
--- a/Python/jit.c
+++ b/Python/jit.c
@@ -300,13 +300,13 @@ emit(const StencilGroup *group, uint64_t patches[])
 
 // Compiles executor in-place. Don't forget to call _PyJIT_Free later!
 int
-_PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t length)
+_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length)
 {
     // Loop once to find the total compiled size:
     size_t code_size = 0;
     size_t data_size = 0;
     for (size_t i = 0; i < length; i++) {
-        _PyUOpInstruction *instruction = &trace[i];
+        _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i];
         const StencilGroup *group = &stencil_groups[instruction->opcode];
         code_size += group->code.body_size;
         data_size += group->data.body_size;
@@ -323,8 +323,13 @@ _PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t len
     // Loop again to emit the code:
     char *code = memory;
     char *data = memory + code_size;
+    char *top = code;
+    if (trace[0].opcode == _START_EXECUTOR) {
+        // Don't want to execute this more than once:
+        top += stencil_groups[_START_EXECUTOR].code.body_size;
+    }
     for (size_t i = 0; i < length; i++) {
-        _PyUOpInstruction *instruction = &trace[i];
+        _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i];
         const StencilGroup *group = &stencil_groups[instruction->opcode];
         // Think of patches as a dictionary mapping HoleValue to uint64_t:
         uint64_t patches[] = GET_PATCHES();
@@ -335,7 +340,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t len
         patches[HoleValue_OPARG] = instruction->oparg;
         patches[HoleValue_OPERAND] = instruction->operand;
         patches[HoleValue_TARGET] = instruction->target;
-        patches[HoleValue_TOP] = (uint64_t)memory;
+        patches[HoleValue_TOP] = (uint64_t)top;
         patches[HoleValue_ZERO] = 0;
         emit(group, patches);
         code += group->code.body_size;
diff --git a/Python/optimizer.c b/Python/optimizer.c
index efa1968..acc1d54 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -2,6 +2,7 @@
 #include "opcode.h"
 #include "pycore_interp.h"
 #include "pycore_bitutils.h"        // _Py_popcount32()
+#include "pycore_object.h"          // _PyObject_GC_UNTRACK()
 #include "pycore_opcode_metadata.h" // _PyOpcode_OpName[]
 #include "pycore_opcode_utils.h"    // MAX_REAL_OPCODE
 #include "pycore_optimizer.h"       // _Py_uop_analyze_and_optimize()
@@ -128,10 +129,11 @@ static _PyOptimizerObject _PyOptimizer_Default = {
     .optimize = never_optimize,
     .resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
     .backedge_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
+    .side_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD,
 };
 
 static uint32_t
-shift_and_offset_threshold(uint16_t threshold)
+shift_and_offset_threshold(uint32_t threshold)
 {
     return (threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15);
 }
@@ -140,41 +142,74 @@ _PyOptimizerObject *
 PyUnstable_GetOptimizer(void)
 {
     PyInterpreterState *interp = _PyInterpreterState_GET();
-    if (interp->optimizer == &_PyOptimizer_Default) {
-        return NULL;
-    }
     assert(interp->optimizer_backedge_threshold ==
            shift_and_offset_threshold(interp->optimizer->backedge_threshold));
     assert(interp->optimizer_resume_threshold ==
            shift_and_offset_threshold(interp->optimizer->resume_threshold));
+    if (interp->optimizer == &_PyOptimizer_Default) {
+        return NULL;
+    }
     Py_INCREF(interp->optimizer);
     return interp->optimizer;
 }
 
+static _PyExecutorObject *
+make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *dependencies);
+
+static int
+init_cold_exit_executor(_PyExecutorObject *executor, int oparg);
+
+static int cold_exits_initialized = 0;
+static _PyExecutorObject COLD_EXITS[UOP_MAX_TRACE_LENGTH] = { 0 };
+
+static const _PyBloomFilter EMPTY_FILTER = { 0 };
+
 _PyOptimizerObject *
 _Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer)
 {
     if (optimizer == NULL) {
         optimizer = &_PyOptimizer_Default;
     }
+    else if (cold_exits_initialized == 0) {
+        cold_exits_initialized = 1;
+        for (int i = 0; i < UOP_MAX_TRACE_LENGTH; i++) {
+            if (init_cold_exit_executor(&COLD_EXITS[i], i)) {
+                return NULL;
+            }
+        }
+    }
     _PyOptimizerObject *old = interp->optimizer;
+    if (old == NULL) {
+        old = &_PyOptimizer_Default;
+    }
     Py_INCREF(optimizer);
     interp->optimizer = optimizer;
     interp->optimizer_backedge_threshold = shift_and_offset_threshold(optimizer->backedge_threshold);
     interp->optimizer_resume_threshold = shift_and_offset_threshold(optimizer->resume_threshold);
+    interp->optimizer_side_threshold = optimizer->side_threshold;
+    if (optimizer == &_PyOptimizer_Default) {
+        assert(interp->optimizer_backedge_threshold > (1 << 16));
+        assert(interp->optimizer_resume_threshold > (1 << 16));
+    }
     return old;
 }
 
-void
+int
 PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
 {
     PyInterpreterState *interp = _PyInterpreterState_GET();
     _PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer);
-    Py_DECREF(old);
+    Py_XDECREF(old);
+    return old == NULL ? -1 : 0;
 }
 
+/* Returns 1 if optimized, 0 if not optimized, and -1 for an error.
+ * If optimized, *executor_ptr contains a new reference to the executor
+ */
 int
-_PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer)
+_PyOptimizer_Optimize(
+    _PyInterpreterFrame *frame, _Py_CODEUNIT *start,
+    PyObject **stack_pointer, _PyExecutorObject **executor_ptr)
 {
     PyCodeObject *code = (PyCodeObject *)frame->f_executable;
     assert(PyCode_Check(code));
@@ -183,12 +218,11 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject
         return 0;
     }
     _PyOptimizerObject *opt = interp->optimizer;
-    _PyExecutorObject *executor = NULL;
-    int err = opt->optimize(opt, frame, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
+    int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
     if (err <= 0) {
-        assert(executor == NULL);
         return err;
     }
+    assert(*executor_ptr != NULL);
     int index = get_index_for_executor(code, start);
     if (index < 0) {
         /* Out of memory. Don't raise and assume that the
@@ -197,11 +231,11 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject
          * If an optimizer has already produced an executor,
          * it might get confused by the executor disappearing,
          * but there is not much we can do about that here. */
-        Py_DECREF(executor);
+        Py_DECREF(*executor_ptr);
         return 0;
     }
-    insert_executor(code, start, index, executor);
-    Py_DECREF(executor);
+    insert_executor(code, start, index, *executor_ptr);
+    assert((*executor_ptr)->vm_data.valid);
     return 1;
 }
 
@@ -237,11 +271,12 @@ static PyMethodDef executor_methods[] = {
 
 static void
 uop_dealloc(_PyExecutorObject *self) {
+    _PyObject_GC_UNTRACK(self);
     _Py_ExecutorClear(self);
 #ifdef _Py_JIT
     _PyJIT_Free(self);
 #endif
-    PyObject_Free(self);
+    PyObject_GC_Del(self);
 }
 
 const char *
@@ -253,7 +288,7 @@ _PyUOpName(int index)
 static Py_ssize_t
 uop_len(_PyExecutorObject *self)
 {
-    return Py_SIZE(self);
+    return self->code_size;
 }
 
 static PyObject *
@@ -292,15 +327,34 @@ PySequenceMethods uop_as_sequence = {
     .sq_item = (ssizeargfunc)uop_item,
 };
 
+static int
+executor_clear(PyObject *o)
+{
+    _Py_ExecutorClear((_PyExecutorObject *)o);
+    return 0;
+}
+
+static int
+executor_traverse(PyObject *o, visitproc visit, void *arg)
+{
+    _PyExecutorObject *executor = (_PyExecutorObject *)o;
+    for (uint32_t i = 0; i < executor->exit_count; i++) {
+        Py_VISIT(executor->exits[i].executor);
+    }
+    return 0;
+}
+
 PyTypeObject _PyUOpExecutor_Type = {
     PyVarObject_HEAD_INIT(&PyType_Type, 0)
     .tp_name = "uop_executor",
-    .tp_basicsize = offsetof(_PyExecutorObject, trace),
-    .tp_itemsize = sizeof(_PyUOpInstruction),
-    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
+    .tp_basicsize = offsetof(_PyExecutorObject, exits),
+    .tp_itemsize = 1,
+    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
     .tp_dealloc = (destructor)uop_dealloc,
     .tp_as_sequence = &uop_as_sequence,
     .tp_methods = executor_methods,
+    .tp_traverse = executor_traverse,
+    .tp_clear = executor_clear,
 };
 
 /* TO DO -- Generate these tables */
@@ -324,6 +378,7 @@ BRANCH_TO_GUARD[4][2] = {
     [POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NOT_NONE_POP,
 };
 
+
 #define CONFIDENCE_RANGE 1000
 #define CONFIDENCE_CUTOFF 333
 
@@ -726,9 +781,10 @@ done:
  * NOPs are excluded from the count.
 */
 static int
-compute_used(_PyUOpInstruction *buffer, uint32_t *used)
+compute_used(_PyUOpInstruction *buffer, uint32_t *used, int *exit_count_ptr)
 {
     int count = 0;
+    int exit_count = 0;
     SET_BIT(used, 0);
     for (int i = 0; i < UOP_MAX_TRACE_LENGTH; i++) {
         if (!BIT_IS_SET(used, i)) {
@@ -736,6 +792,9 @@ compute_used(_PyUOpInstruction *buffer, uint32_t *used)
         }
         count++;
         int opcode = buffer[i].opcode;
+        if (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) {
+            exit_count++;
+        }
         if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
             continue;
         }
@@ -751,44 +810,76 @@ compute_used(_PyUOpInstruction *buffer, uint32_t *used)
             UNSET_BIT(used, i);
         }
     }
+    *exit_count_ptr = exit_count;
     return count;
 }
 
+/* Executor side exits */
+
+static _PyExecutorObject *
+allocate_executor(int exit_count, int length)
+{
+    int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction);
+    _PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size);
+    if (res == NULL) {
+        return NULL;
+    }
+    res->trace = (_PyUOpInstruction *)(res->exits + exit_count);
+    res->code_size = length;
+    res->exit_count = exit_count;
+    return res;
+}
+
 /* Makes an executor from a buffer of uops.
  * Account for the buffer having gaps and NOPs by computing a "used"
  * bit vector and only copying the used uops. Here "used" means reachable
  * and not a NOP.
 */
 static _PyExecutorObject *
-make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
+make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *dependencies)
 {
     uint32_t used[(UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
-    int length = compute_used(buffer, used);
-    _PyExecutorObject *executor = PyObject_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, length);
+    int exit_count;
+    int length = compute_used(buffer, used, &exit_count);
+    _PyExecutorObject *executor = allocate_executor(exit_count, length+1);
     if (executor == NULL) {
         return NULL;
     }
-    int dest = length - 1;
+    /* Initialize exits */
+    for (int i = 0; i < exit_count; i++) {
+        executor->exits[i].executor = &COLD_EXITS[i];
+        executor->exits[i].temperature = 0;
+    }
+    int next_exit = exit_count-1;
+    _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
     /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
     for (int i = UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
         if (!BIT_IS_SET(used, i)) {
             continue;
         }
-        executor->trace[dest] = buffer[i];
+        *dest = buffer[i];
         int opcode = buffer[i].opcode;
         if (opcode == _POP_JUMP_IF_FALSE ||
             opcode == _POP_JUMP_IF_TRUE)
         {
             /* The oparg of the target will already have been set to its new offset */
-            int oparg = executor->trace[dest].oparg;
-            executor->trace[dest].oparg = buffer[oparg].oparg;
+            int oparg = dest->oparg;
+            dest->oparg = buffer[oparg].oparg;
         }
+        if (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) {
+            executor->exits[next_exit].target = buffer[i].target;
+            dest->exit_index = next_exit;
+            next_exit--;
+        }
         /* Set the oparg to be the destination offset,
          * so that we can set the oparg of earlier jumps correctly. */
-        buffer[i].oparg = dest;
+        buffer[i].oparg = (uint16_t)(dest - executor->trace);
         dest--;
     }
-    assert(dest == -1);
+    assert(next_exit == -1);
+    assert(dest == executor->trace);
+    dest->opcode = _START_EXECUTOR;
+    dest->operand = (uintptr_t)executor;
     _Py_ExecutorInit(executor, dependencies);
 #ifdef Py_DEBUG
     char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
@@ -811,15 +902,41 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
 #ifdef _Py_JIT
     executor->jit_code = NULL;
     executor->jit_size = 0;
-    if (_PyJIT_Compile(executor, executor->trace, Py_SIZE(executor))) {
+    if (_PyJIT_Compile(executor, executor->trace, length+1)) {
         Py_DECREF(executor);
         return NULL;
     }
 #endif
+    _PyObject_GC_TRACK(executor);
     return executor;
 }
 
 static int
+init_cold_exit_executor(_PyExecutorObject *executor, int oparg)
+{
+    _Py_SetImmortal(executor);
+    Py_SET_TYPE(executor, &_PyUOpExecutor_Type);
+    executor->trace = (_PyUOpInstruction *)executor->exits;
+    executor->code_size = 1;
+    executor->exit_count = 0;
+    _PyUOpInstruction *inst = (_PyUOpInstruction *)&executor->trace[0];
+    inst->opcode = _COLD_EXIT;
+    inst->oparg = oparg;
+    executor->vm_data.valid = true;
+    for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
+        assert(executor->vm_data.bloom.bits[i] == 0);
+    }
+#ifdef _Py_JIT
+    executor->jit_code = NULL;
+    executor->jit_size = 0;
+    if (_PyJIT_Compile(executor, executor->trace, 1)) {
+        return -1;
+    }
+#endif
+    return 0;
+}
+
+static int
 uop_optimize(
     _PyOptimizerObject *self,
     _PyInterpreterFrame *frame,
@@ -880,13 +997,15 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
     opt->resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
     // Need a few iterations to settle specializations,
    // and to amortize the cost of optimization.
+    opt->side_threshold = 16;
     opt->backedge_threshold = 16;
     return (PyObject *)opt;
 }
 
 static void
 counter_dealloc(_PyExecutorObject *self) {
-    PyObject *opt = (PyObject *)self->trace[0].operand;
+    /* The optimizer is the operand of the second uop. */
+    PyObject *opt = (PyObject *)self->trace[1].operand;
     Py_DECREF(opt);
     uop_dealloc(self);
 }
@@ -894,11 +1013,13 @@ counter_dealloc(_PyExecutorObject *self) {
 PyTypeObject _PyCounterExecutor_Type = {
     PyVarObject_HEAD_INIT(&PyType_Type, 0)
     .tp_name = "counting_executor",
-    .tp_basicsize = offsetof(_PyExecutorObject, trace),
-    .tp_itemsize = sizeof(_PyUOpInstruction),
-    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
+    .tp_basicsize = offsetof(_PyExecutorObject, exits),
+    .tp_itemsize = 1,
+    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
     .tp_dealloc = (destructor)counter_dealloc,
     .tp_methods = executor_methods,
+    .tp_traverse = executor_traverse,
+    .tp_clear = executor_clear,
 };
 
 static int
@@ -926,9 +1047,7 @@ counter_optimize(
         { .opcode = _INTERNAL_INCREMENT_OPT_COUNTER },
         { .opcode = _EXIT_TRACE, .target = (uint32_t)(target - _PyCode_CODE(code)) }
     };
-    _PyBloomFilter empty;
-    _Py_BloomFilter_Init(&empty);
-    _PyExecutorObject *executor = make_executor_from_uops(buffer, &empty);
+    _PyExecutorObject *executor = make_executor_from_uops(buffer, &EMPTY_FILTER);
     if (executor == NULL) {
         return -1;
     }
@@ -968,6 +1087,7 @@ PyUnstable_Optimizer_NewCounter(void)
     }
     opt->base.optimize = counter_optimize;
     opt->base.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
+    opt->base.side_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD;
     opt->base.backedge_threshold = 0;
     opt->count = 0;
     return (PyObject *)opt;
@@ -1091,9 +1211,6 @@ link_executor(_PyExecutorObject *executor)
 static void
 unlink_executor(_PyExecutorObject *executor)
 {
-    if (!executor->vm_data.valid) {
-        return;
-    }
     _PyExecutorLinkListNode *links = &executor->vm_data.links;
     _PyExecutorObject *next = links->next;
     _PyExecutorObject *prev = links->previous;
@@ -1114,7 +1231,7 @@ unlink_executor(_PyExecutorObject *executor)
 
 /* This must be called by optimizers before using the executor */
 void
-_Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set)
+_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set)
 {
     executor->vm_data.valid = true;
     for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
@@ -1127,11 +1244,19 @@ _Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set)
 void
 _Py_ExecutorClear(_PyExecutorObject *executor)
 {
+    if (!executor->vm_data.valid) {
+        return;
+    }
     unlink_executor(executor);
     PyCodeObject *code = executor->vm_data.code;
     if (code == NULL) {
         return;
     }
+    for (uint32_t i = 0; i < executor->exit_count; i++) {
+        Py_DECREF(executor->exits[i].executor);
+        executor->exits[i].executor = &COLD_EXITS[i];
+        executor->exits[i].temperature = INT16_MIN;
+    }
     _Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index];
     assert(instruction->op.code == ENTER_EXECUTOR);
     int index = instruction->op.arg;
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index b354c03..7b537af 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1261,7 +1261,9 @@ init_interp_main(PyThreadState *tstate)
         if (opt == NULL) {
             return _PyStatus_ERR("can't initialize optimizer");
         }
-        PyUnstable_SetOptimizer((_PyOptimizerObject *)opt);
+        if (PyUnstable_SetOptimizer((_PyOptimizerObject *)opt)) {
+            return _PyStatus_ERR("can't initialize optimizer");
+        }
         Py_DECREF(opt);
     }
 }
diff --git a/Python/pystate.c b/Python/pystate.c
index c2ccc27..3484bea 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -782,6 +782,7 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
     }
 
     _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL);
+    assert(old != NULL);
     Py_DECREF(old);
 
     /* It is possible that any of the objects below have a finalizer
@@ -1346,6 +1347,7 @@ init_threadstate(_PyThreadStateImpl *_tstate,
     tstate->datastack_top = NULL;
     tstate->datastack_limit = NULL;
     tstate->what_event = -1;
+    tstate->previous_executor = NULL;
 
 #ifdef Py_GIL_DISABLED
     // Initialize biased reference counting inter-thread queue
diff --git a/Python/tier2_engine.md b/Python/tier2_engine.md
new file mode 100644
index 0000000..df9f6c1
--- /dev/null
+++ b/Python/tier2_engine.md
@@ -0,0 +1,150 @@
+# The tier 2 execution engine
+
+## General idea
+
+When execution in tier 1 becomes "hot", that is, the counter for that point in
+the code reaches some threshold, we create an executor and execute that
+instead of the tier 1 bytecode.
+
+Since each executor must exit, we also track the "hotness" of those
+exits and attach new executors to those exits.
+
+As the program executes, and the hot parts of the program get optimized,
+a graph of executors forms.
+
+## Superblocks and Executors
+
+Once a point in the code has become hot enough, we want to optimize it.
+Starting from that point we project the likely path of execution,
+using information gathered by tier 1 to guide that projection to
+form a "superblock", a mostly linear sequence of micro-ops.
+Although mostly linear, it may include a single loop.
+
+We then optimize this superblock to form an optimized superblock,
+which is equivalent but more efficient.
+
+A superblock is a representation of the code we want to execute,
+but it is not in executable form.
+The executable form is known as an executor.
+
+Executors are semantically equivalent to the superblock they are
+created from, but are in a form that can be executed efficiently.
+
+There are two execution engines for executors, and two types of executors:
+* The hardware, which runs machine code executors created by the JIT compiler.
+* The tier 2 interpreter, which runs bytecode executors.
+
+It would be very wasteful to support both a tier 2 interpreter and
+JIT compiler in the same process.
+For now, we will make the choice of engine a configuration option,
+but we could make it a command line option in the future if that proves useful.
+
+### Tier 2 Interpreter
+
+For platforms without a JIT and for testing, we need an interpreter
+for executors. It is similar in design to the tier 1 interpreter, but has a
+different instruction set, and does not adapt.
+
+### JIT compiler
+
+The JIT compiler converts superblocks into machine code executors.
+These have identical behavior to interpreted executors, except that
+they consume more memory for the generated machine code and are a lot faster.
+
+## Transferring control
+
+There are three types of control transfer that we need to consider:
+* Tier 1 to tier 2
+* Tier 2 to tier 1
+* One executor to another within tier 2
+
+Since we expect the graph of executors to span most of the hot
+part of the program, transfers from one executor to another should
+be the most common.
+Therefore, we want to make those transfers fast.
+
+### Tier 2 to tier 2
+
+#### Cold exits
+
+All side exits start cold and most stay cold, but a few become
+hot. We want to keep the memory consumption small for the many
+cold exits, but those that become hot need to be fast.
+However, we cannot know in advance which will be which.
+
+So that tier 2 to tier 2 transfers are fast for hot exits,
+exits must be implemented as executors. In order to patch
+executor exits when they get hot, a pointer to the current
+executor must be passed to the exit executor.
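To make the cold-exit policy concrete, here is a small, self-contained model of one exit warming up and being patched. This is an illustrative sketch only: the `Exit` struct and function names are invented here, `SIDE_THRESHOLD` stands in for the interpreter's `optimizer_side_threshold`, and the real `_COLD_EXIT` uop above must also handle optimization failure and `ENTER_EXECUTOR` targets.

```c
#include <stdint.h>
#include <stdio.h>

#define SIDE_THRESHOLD 16            /* stand-in for interp->optimizer_side_threshold */

typedef struct {
    int16_t temperature;             /* per-exit hit counter, starts cold at 0 */
    int has_executor;                /* 0 while the exit still points at the shared cold stub */
} Exit;

/* Models what happens each time execution leaves a trace through this exit. */
static const char *take_side_exit(Exit *exit)
{
    if (exit->has_executor) {
        return "tier 2 -> tier 2 (stitched)";   /* hot path: jump straight to the next trace */
    }
    if (++exit->temperature < SIDE_THRESHOLD) {
        return "tier 2 -> tier 1 (still cold)"; /* cold path: stay small, resume bytecode */
    }
    exit->has_executor = 1;                     /* pretend trace projection succeeded */
    return "executor attached to exit";
}

int main(void)
{
    Exit e = {0, 0};
    for (int i = 1; i <= 18; i++) {
        printf("hit %2d: %s\n", i, take_side_exit(&e));
    }
    return 0;
}
```

Running the model shows the intended life cycle: sixteen cheap falls back to tier 1, then a one-time patch, then fast trace-to-trace transfers from that point on.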
+#### Handling reference counts
+
+There must be an implicit reference to the currently executing
+executor, otherwise it might be freed.
+Consequently, we must increment the reference count of an
+executor just before executing it, and decrement it just after
+executing it.
+
+We want to minimize the amount of data that is passed from
+one executor to the next. In the JIT, this reduces the number
+of arguments in the tailcall, freeing up registers for other uses.
+It is less important in the interpreter, but following the same
+design as the JIT simplifies debugging and is good for performance.
+
+Provided that we incref the new executor before executing it, we
+can jump directly to the code of the executor, without needing
+to pass a reference to that executor object.
+However, we do need a reference to the previous executor,
+so that it can be decref'd and so that cold exits can be handled.
+To avoid messing up the JIT's register allocation, we pass a
+reference to the previous executor in the thread state's
+`previous_executor` field.
+
+#### The interpreter
+
+The tier 2 interpreter has a variable `current_executor` which
+points to the currently live executor. When transferring from executor
+`A` to executor `B` we do the following:
+(Initially `current_executor` points to `A`, and the refcount of
+`A` is elevated by one)
+
+1. Set the instruction pointer to start at the beginning of `B`
+2. Increment the reference count of `B`
+3. Start executing `B`
+
+We also make the first instruction in `B` do the following:
+1. Set `current_executor` to point to `B`
+2. Decrement the reference count of `A` (`A` is referenced by `tstate->previous_executor`)
+
+The net effect of the above is to safely decrement the refcount of `A`,
+increment the refcount of `B` and set `current_executor` to point to `B`.
+
+#### In the JIT
+
+Transferring control from one executor to another is done via tailcalls.
+
+The compiled executor should do the same as the interpreter, except that
+there is no local variable `current_executor`.
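The refcount choreography described above can be modelled in isolation. In this sketch, `Executor`, `incref`/`decref`, and the `previous_executor` slot are simplified stand-ins for `PyObject` refcounting and `tstate->previous_executor`, not the CPython API:

```c
#include <stdio.h>

typedef struct {
    const char *name;
    int refcnt;
} Executor;

typedef struct {
    Executor *previous_executor;    /* stand-in for tstate->previous_executor */
} Thread;

static void incref(Executor *e) { e->refcnt++; }
static void decref(Executor *e) { e->refcnt--; }

/* Steps 1-3 of the transfer, performed while A is still running. */
static Executor *transfer(Thread *t, Executor *a, Executor *b)
{
    incref(b);                      /* keep B alive while it runs */
    t->previous_executor = a;       /* hand A's reference to the thread state */
    return b;                       /* "jump" to B's code */
}

/* What B's first instruction (_START_EXECUTOR) does on entry. */
static void start_executor(Thread *t, Executor **current, Executor *b)
{
    *current = b;
    decref(t->previous_executor);   /* safely drop the reference to A */
    t->previous_executor = NULL;
}

int main(void)
{
    Executor a = {"A", 1};          /* A is running, so its refcount is elevated */
    Executor b = {"B", 0};
    Thread t = {NULL};
    Executor *current = &a;

    Executor *next = transfer(&t, current, &b);
    start_executor(&t, &current, next);

    /* Net effect: A's count dropped by one, B holds the "running" reference. */
    printf("A refcnt=%d, B refcnt=%d, current=%s\n", a.refcnt, b.refcnt, current->name);
    return 0;
}
```

The point of the split between `transfer` and `start_executor` is that B, not A, performs the decref, so A's reference stays valid for exactly as long as A's code might still be executing.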
+### Tier 1 to tier 2
+
+Since the executor doesn't know if the previous code was tier 1 or tier 2,
+we need to make a transfer from tier 1 to tier 2 look like a tier 2 to tier 2
+transfer to the executor.
+
+We can then perform a tier 1 to tier 2 transfer by setting
+`tstate->previous_executor` to `None` (there is no previous executor), and
+then performing a tier 2 to tier 2 transfer as above.
+
+### Tier 2 to tier 1
+
+Each micro-op that might exit to tier 1 contains a `target` value,
+which is the offset of the tier 1 instruction to exit to in the
+current code object.
+
+## Counters
+
+TO DO.
+The implementation will change soon, so there is no point in
+documenting it until then.
+
diff --git a/Python/tier2_redundancy_eliminator_cases.c.h b/Python/tier2_redundancy_eliminator_cases.c.h
index be2fbb9..98f0bdc 100644
--- a/Python/tier2_redundancy_eliminator_cases.c.h
+++ b/Python/tier2_redundancy_eliminator_cases.c.h
@@ -1053,8 +1053,6 @@
             break;
         }
 
-        /* _JUMP_BACKWARD is not a viable micro-op for tier 2 */
-
         /* _POP_JUMP_IF_FALSE is not a viable micro-op for tier 2 */
 
         /* _POP_JUMP_IF_TRUE is not a viable micro-op for tier 2 */
@@ -1739,6 +1737,18 @@
             break;
         }
 
+        case _COLD_EXIT: {
+            break;
+        }
+
+        case _START_EXECUTOR: {
+            break;
+        }
+
+        case _FATAL_ERROR: {
+            break;
+        }
+
         case _CHECK_VALIDITY_AND_SET_IP: {
             break;
         }
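A closing note on the `allocate_executor()` change in the optimizer.c hunks above: executors are now single variable-size allocations in which the `_PyExitData` array precedes the uop trace, and `trace` simply points past the last exit. The following standalone sketch mirrors that layout arithmetic with simplified stand-in types, not the real `_PyExecutorObject`:

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins for _PyUOpInstruction and _PyExitData; the real
 * definitions live in pycore_optimizer.h and differ in detail. */
typedef struct { uint16_t opcode; uint16_t oparg; uint64_t operand; } UOpInst;
typedef struct Executor Executor;
typedef struct { Executor *executor; int16_t temperature; uint16_t target; } ExitData;

struct Executor {
    UOpInst *trace;      /* points just past the exits array */
    uint32_t exit_count;
    uint32_t code_size;
    ExitData exits[];    /* variable-length tail: exits first, then uops */
};

/* Mirrors the shape of allocate_executor(): one allocation holds both
 * the exit array and the uop trace, back to back. */
static Executor *allocate_executor(uint32_t exit_count, uint32_t length)
{
    size_t tail = exit_count * sizeof(ExitData) + length * sizeof(UOpInst);
    Executor *res = malloc(offsetof(struct Executor, exits) + tail);
    if (res == NULL) {
        return NULL;
    }
    res->trace = (UOpInst *)(res->exits + exit_count);
    res->exit_count = exit_count;
    res->code_size = length;
    return res;
}

int main(void)
{
    Executor *e = allocate_executor(3, 8);
    if (e == NULL) {
        return 1;
    }
    printf("header %zu bytes, exits at +%zu, trace at +%zu\n",
           offsetof(struct Executor, exits),
           (size_t)((char *)e->exits - (char *)e),
           (size_t)((char *)e->trace - (char *)e));
    free(e);
    return 0;
}
```

Placing the exits before the trace keeps `trace[0]` (the `_START_EXECUTOR` instruction) at a fixed, computable offset while still allowing both arrays to vary in length per executor.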