diff options
Diffstat (limited to 'Python')
| -rw-r--r-- | Python/bytecodes.c | 24 | ||||
| -rw-r--r-- | Python/executor_cases.c.h | 42 | ||||
| -rw-r--r-- | Python/optimizer.c | 52 | ||||
| -rw-r--r-- | Python/optimizer_analysis.c | 230 | ||||
| -rw-r--r-- | Python/pylifecycle.c | 22 |
5 files changed, 333 insertions, 37 deletions
diff --git a/Python/bytecodes.c b/Python/bytecodes.c index ebd5b06..6fb4d71 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4071,11 +4071,35 @@ dummy_func( } op(_LOAD_CONST_INLINE, (ptr/4 -- value)) { + TIER_TWO_ONLY value = Py_NewRef(ptr); } op(_LOAD_CONST_INLINE_BORROW, (ptr/4 -- value)) { + TIER_TWO_ONLY + value = ptr; + } + + op(_LOAD_CONST_INLINE_WITH_NULL, (ptr/4 -- value, null)) { + TIER_TWO_ONLY + value = Py_NewRef(ptr); + null = NULL; + } + + op(_LOAD_CONST_INLINE_BORROW_WITH_NULL, (ptr/4 -- value, null)) { + TIER_TWO_ONLY value = ptr; + null = NULL; + } + + op(_CHECK_GLOBALS, (dict/4 -- )) { + TIER_TWO_ONLY + DEOPT_IF(GLOBALS() != dict); + } + + op(_CHECK_BUILTINS, (dict/4 -- )) { + TIER_TWO_ONLY + DEOPT_IF(BUILTINS() != dict); } /* Internal -- for testing executors */ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 241b905..2d914b8 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3393,6 +3393,7 @@ case _LOAD_CONST_INLINE: { PyObject *value; PyObject *ptr = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY value = Py_NewRef(ptr); stack_pointer[0] = value; stack_pointer += 1; @@ -3402,12 +3403,53 @@ case _LOAD_CONST_INLINE_BORROW: { PyObject *value; PyObject *ptr = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY value = ptr; stack_pointer[0] = value; stack_pointer += 1; break; } + case _LOAD_CONST_INLINE_WITH_NULL: { + PyObject *value; + PyObject *null; + PyObject *ptr = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY + value = Py_NewRef(ptr); + null = NULL; + stack_pointer[0] = value; + stack_pointer[1] = null; + stack_pointer += 2; + break; + } + + case _LOAD_CONST_INLINE_BORROW_WITH_NULL: { + PyObject *value; + PyObject *null; + PyObject *ptr = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY + value = ptr; + null = NULL; + stack_pointer[0] = value; + stack_pointer[1] = null; + stack_pointer += 2; + break; + } + + case _CHECK_GLOBALS: { + PyObject *dict = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY + if (GLOBALS() != dict) goto deoptimize; + break; + } + + case _CHECK_BUILTINS: { + PyObject *dict = (PyObject *)CURRENT_OPERAND(); + TIER_TWO_ONLY + if (BUILTINS() != dict) goto deoptimize; + break; + } + case _INTERNAL_INCREMENT_OPT_COUNTER: { PyObject *opt; opt = stack_pointer[-1]; diff --git a/Python/optimizer.c b/Python/optimizer.c index 0d04b09..d71ca0a 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -108,16 +108,14 @@ PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutor } static int -error_optimize( +never_optimize( _PyOptimizerObject* self, - PyCodeObject *code, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec, int Py_UNUSED(stack_entries)) { - assert(0); - PyErr_Format(PyExc_SystemError, "Should never call error_optimize"); - return -1; + return 0; } PyTypeObject _PyDefaultOptimizer_Type = { @@ -130,7 +128,7 @@ PyTypeObject _PyDefaultOptimizer_Type = { _PyOptimizerObject _PyOptimizer_Default = { PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type) - .optimize = error_optimize, + .optimize = never_optimize, .resume_threshold = INT16_MAX, .backedge_threshold = INT16_MAX, }; @@ -174,7 +172,7 @@ _PyOptimizer_Optimize(_PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject } _PyOptimizerObject *opt = interp->optimizer; _PyExecutorObject *executor = NULL; - int err = opt->optimize(opt, code, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); + int err = opt->optimize(opt, frame, start, &executor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); if (err <= 0) { assert(executor == NULL); return err; @@ -363,7 +361,8 @@ BRANCH_TO_GUARD[4][2] = { ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); \ goto done; \ } \ - trace_stack[trace_stack_depth].code = code; \ + assert(func->func_code == (PyObject *)code); \ + trace_stack[trace_stack_depth].func = func; \ trace_stack[trace_stack_depth].instr = instr; \ trace_stack_depth++; #define TRACE_STACK_POP() \ @@ -371,7 +370,8 @@ BRANCH_TO_GUARD[4][2] = { Py_FatalError("Trace stack underflow\n"); \ } \ trace_stack_depth--; \ - code = trace_stack[trace_stack_depth].code; \ + func = trace_stack[trace_stack_depth].func; \ + code = (PyCodeObject *)trace_stack[trace_stack_depth].func->func_code; \ instr = trace_stack[trace_stack_depth].instr; /* Returns 1 on success, @@ -380,20 +380,23 @@ BRANCH_TO_GUARD[4][2] = { */ static int translate_bytecode_to_trace( - PyCodeObject *code, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyUOpInstruction *trace, int buffer_size, _PyBloomFilter *dependencies) { bool progress_needed = true; + PyCodeObject *code = (PyCodeObject *)frame->f_executable; + PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; + assert(PyFunction_Check(func)); PyCodeObject *initial_code = code; _Py_BloomFilter_Add(dependencies, initial_code); _Py_CODEUNIT *initial_instr = instr; int trace_length = 0; int max_length = buffer_size; struct { - PyCodeObject *code; + PyFunctionObject *func; _Py_CODEUNIT *instr; } trace_stack[TRACE_STACK_SIZE]; int trace_stack_depth = 0; @@ -593,9 +596,9 @@ top: // Jump here after _PUSH_FRAME or likely branches ADD_TO_TRACE(uop, oparg, operand, target); if (uop == _POP_FRAME) { TRACE_STACK_POP(); - /* Set the operand to the code object returned to, + /* Set the operand to the function object returned to, * to assist optimization passes */ - trace[trace_length-1].operand = (uintptr_t)code; + trace[trace_length-1].operand = (uintptr_t)func; DPRINTF(2, "Returning to %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), @@ -611,10 +614,10 @@ top: // Jump here after _PUSH_FRAME or likely branches // Add one to account for the actual opcode/oparg pair: + 1; uint32_t func_version = read_u32(&instr[func_version_offset].cache); - PyFunctionObject *func = _PyFunction_LookupByVersion(func_version); + PyFunctionObject *new_func = _PyFunction_LookupByVersion(func_version); DPRINTF(3, "Function object: %p\n", func); - if (func != NULL) { - PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(func); + if (new_func != NULL) { + PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(new_func); if (new_code == code) { // Recursive call, bail (we could be here forever). DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n", @@ -639,8 +642,9 @@ top: // Jump here after _PUSH_FRAME or likely branches _Py_BloomFilter_Add(dependencies, new_code); /* Set the operand to the callee's code object, * to assist optimization passes */ - trace[trace_length-1].operand = (uintptr_t)new_code; + trace[trace_length-1].operand = (uintptr_t)new_func; code = new_code; + func = new_func; instr = _PyCode_CODE(code); DPRINTF(2, "Continuing in %s (%s:%d) at byte offset %d\n", @@ -808,7 +812,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies) static int uop_optimize( _PyOptimizerObject *self, - PyCodeObject *code, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, int curr_stackentries) @@ -816,7 +820,7 @@ uop_optimize( _PyBloomFilter dependencies; _Py_BloomFilter_Init(&dependencies); _PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH]; - int err = translate_bytecode_to_trace(code, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies); + int err = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies); if (err <= 0) { // Error or nothing translated return err; @@ -824,9 +828,10 @@ uop_optimize( OPT_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize == NULL || *uop_optimize > '0') { - err = _Py_uop_analyze_and_optimize(code, buffer, UOP_MAX_TRACE_LENGTH, curr_stackentries); - if (err < 0) { - return -1; + err = _Py_uop_analyze_and_optimize(frame, buffer, + UOP_MAX_TRACE_LENGTH, curr_stackentries, &dependencies); + if (err <= 0) { + return err; } } _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies); @@ -887,12 +892,13 @@ PyTypeObject _PyCounterExecutor_Type = { static int counter_optimize( _PyOptimizerObject* self, - PyCodeObject *code, + _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, int Py_UNUSED(curr_stackentries) ) { + PyCodeObject *code = (PyCodeObject *)frame->f_executable; int oparg = instr->op.arg; while (instr->op.code == EXTENDED_ARG) { instr++; diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index d122599..2cfbf4b 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -1,10 +1,12 @@ #include "Python.h" #include "opcode.h" +#include "pycore_dict.h" #include "pycore_interp.h" #include "pycore_opcode_metadata.h" #include "pycore_opcode_utils.h" #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_uop_metadata.h" +#include "pycore_dict.h" #include "pycore_long.h" #include "cpython/optimizer.h" #include <stdbool.h> @@ -12,9 +14,210 @@ #include <stddef.h> #include "pycore_optimizer.h" +static int +get_mutations(PyObject* dict) { + assert(PyDict_CheckExact(dict)); + PyDictObject *d = (PyDictObject *)dict; + return (d->ma_version_tag >> DICT_MAX_WATCHERS) & ((1 << DICT_WATCHED_MUTATION_BITS)-1); +} + static void -peephole_opt(PyCodeObject *co, _PyUOpInstruction *buffer, int buffer_size) +increment_mutations(PyObject* dict) { + assert(PyDict_CheckExact(dict)); + PyDictObject *d = (PyDictObject *)dict; + d->ma_version_tag += (1 << DICT_MAX_WATCHERS); +} + +static int +globals_watcher_callback(PyDict_WatchEvent event, PyObject* dict, + PyObject* key, PyObject* new_value) +{ + if (event == PyDict_EVENT_CLONED) { + return 0; + } + uint64_t watched_mutations = get_mutations(dict); + if (watched_mutations < _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS) { + _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), dict); + increment_mutations(dict); + } + else { + PyDict_Unwatch(1, dict); + } + return 0; +} + + +static void +global_to_const(_PyUOpInstruction *inst, PyObject *obj) +{ + assert(inst->opcode == _LOAD_GLOBAL_MODULE || inst->opcode == _LOAD_GLOBAL_BUILTINS); + assert(PyDict_CheckExact(obj)); + PyDictObject *dict = (PyDictObject *)obj; + assert(dict->ma_keys->dk_kind == DICT_KEYS_UNICODE); + PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys); + assert(inst->operand <= UINT16_MAX); + PyObject *res = entries[inst->operand].me_value; + if (res == NULL) { + return; + } + if (_Py_IsImmortal(res)) { + inst->opcode = (inst->oparg & 1) ? _LOAD_CONST_INLINE_BORROW_WITH_NULL : _LOAD_CONST_INLINE_BORROW; + } + else { + inst->opcode = (inst->oparg & 1) ? _LOAD_CONST_INLINE_WITH_NULL : _LOAD_CONST_INLINE; + } + inst->operand = (uint64_t)res; +} + +static int +incorrect_keys(_PyUOpInstruction *inst, PyObject *obj) { + if (!PyDict_CheckExact(obj)) { + return 1; + } + PyDictObject *dict = (PyDictObject *)obj; + if (dict->ma_keys->dk_version != inst->operand) { + return 1; + } + return 0; +} + +/* The first two dict watcher IDs are reserved for CPython, + * so we don't need to check that they haven't been used */ +#define BUILTINS_WATCHER_ID 0 +#define GLOBALS_WATCHER_ID 1 + +/* Returns 1 if successfully optimized + * 0 if the trace is not suitable for optimization (yet) + * -1 if there was an error. */ +static int +remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, + int buffer_size, _PyBloomFilter *dependencies) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyObject *builtins = frame->f_builtins; + if (builtins != interp->builtins) { + return 1; + } + PyObject *globals = frame->f_globals; + assert(PyFunction_Check(((PyFunctionObject *)frame->f_funcobj))); + assert(((PyFunctionObject *)frame->f_funcobj)->func_builtins == builtins); + assert(((PyFunctionObject *)frame->f_funcobj)->func_globals == globals); + /* In order to treat globals as constants, we need to + * know that the globals dict is the one we expected, and + * that it hasn't changed + * In order to treat builtins as constants, we need to + * know that the builtins dict is the one we expected, and + * that it hasn't changed and that the global dictionary's + * keys have not changed */ + + /* These values represent stacks of booleans (one bool per bit). + * Pushing a frame shifts left, popping a frame shifts right. */ + uint32_t builtins_checked = 0; + uint32_t builtins_watched = 0; + uint32_t globals_checked = 0; + uint32_t globals_watched = 0; + if (interp->dict_state.watchers[1] == NULL) { + interp->dict_state.watchers[1] = globals_watcher_callback; + } + for (int pc = 0; pc < buffer_size; pc++) { + _PyUOpInstruction *inst = &buffer[pc]; + int opcode = inst->opcode; + switch(opcode) { + case _GUARD_BUILTINS_VERSION: + if (incorrect_keys(inst, builtins)) { + return 0; + } + if (interp->rare_events.builtin_dict >= _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) { + continue; + } + if ((builtins_watched & 1) == 0) { + PyDict_Watch(BUILTINS_WATCHER_ID, builtins); + builtins_watched |= 1; + } + if (builtins_checked & 1) { + buffer[pc].opcode = NOP; + } + else { + buffer[pc].opcode = _CHECK_BUILTINS; + buffer[pc].operand = (uintptr_t)builtins; + builtins_checked |= 1; + } + break; + case _GUARD_GLOBALS_VERSION: + if (incorrect_keys(inst, globals)) { + return 0; + } + uint64_t watched_mutations = get_mutations(globals); + if (watched_mutations >= _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS) { + continue; + } + if ((globals_watched & 1) == 0) { + PyDict_Watch(GLOBALS_WATCHER_ID, globals); + _Py_BloomFilter_Add(dependencies, globals); + globals_watched |= 1; + } + if (globals_checked & 1) { + buffer[pc].opcode = NOP; + } + else { + buffer[pc].opcode = _CHECK_GLOBALS; + buffer[pc].operand = (uintptr_t)globals; + globals_checked |= 1; + } + break; + case _LOAD_GLOBAL_BUILTINS: + if (globals_checked & builtins_checked & globals_watched & builtins_watched & 1) { + global_to_const(inst, builtins); + } + break; + case _LOAD_GLOBAL_MODULE: + if (globals_checked & globals_watched & 1) { + global_to_const(inst, globals); + } + break; + case _PUSH_FRAME: + { + globals_checked <<= 1; + globals_watched <<= 1; + builtins_checked <<= 1; + builtins_watched <<= 1; + PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + if (func == NULL) { + return 1; + } + assert(PyFunction_Check(func)); + globals = func->func_globals; + builtins = func->func_builtins; + if (builtins != interp->builtins) { + return 1; + } + break; + } + case _POP_FRAME: + { + globals_checked >>= 1; + globals_watched >>= 1; + builtins_checked >>= 1; + builtins_watched >>= 1; + PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + assert(PyFunction_Check(func)); + globals = func->func_globals; + builtins = func->func_builtins; + break; + } + case _JUMP_TO_TOP: + case _EXIT_TRACE: + return 1; + } + } + return 0; +} + +static void +peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size) +{ + PyCodeObject *co = (PyCodeObject *)frame->f_executable; for (int pc = 0; pc < buffer_size; pc++) { int opcode = buffer[pc].opcode; switch(opcode) { @@ -36,8 +239,17 @@ peephole_opt(PyCodeObject *co, _PyUOpInstruction *buffer, int buffer_size) } case _PUSH_FRAME: case _POP_FRAME: - co = (PyCodeObject *)buffer[pc].operand; + { + PyFunctionObject *func = (PyFunctionObject *)buffer[pc].operand; + if (func == NULL) { + co = NULL; + } + else { + assert(PyFunction_Check(func)); + co = (PyCodeObject *)func->func_code; + } break; + } case _JUMP_TO_TOP: case _EXIT_TRACE: return; @@ -83,16 +295,20 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) } } - int _Py_uop_analyze_and_optimize( - PyCodeObject *co, + _PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size, - int curr_stacklen + int curr_stacklen, + _PyBloomFilter *dependencies ) { - peephole_opt(co, buffer, buffer_size); + int err = remove_globals(frame, buffer, buffer_size, dependencies); + if (err <= 0) { + return err; + } + peephole_opt(frame, buffer, buffer_size); remove_unneeded_uops(buffer, buffer_size); - return 0; + return 1; } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 372f606..0cac710 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -32,6 +32,7 @@ #include "pycore_typevarobject.h" // _Py_clear_generic_types() #include "pycore_unicodeobject.h" // _PyUnicode_InitTypes() #include "pycore_weakref.h" // _PyWeakref_GET_REF() +#include "cpython/optimizer.h" // _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS #include "pycore_obmalloc.h" // _PyMem_init_obmalloc() #include "opcode.h" @@ -609,7 +610,11 @@ init_interp_create_gil(PyThreadState *tstate, int gil) static int builtins_dict_watcher(PyDict_WatchEvent event, PyObject *dict, PyObject *key, PyObject *new_value) { - RARE_EVENT_INC(builtin_dict); + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (event != PyDict_EVENT_CLONED && interp->rare_events.builtin_dict < _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) { + _Py_Executors_InvalidateAll(interp); + } + RARE_EVENT_INTERP_INC(interp, builtin_dict); return 0; } @@ -1287,11 +1292,9 @@ init_interp_main(PyThreadState *tstate) } } - if ((interp->rare_events.builtins_dict_watcher_id = PyDict_AddWatcher(&builtins_dict_watcher)) == -1) { - return _PyStatus_ERR("failed to add builtin dict watcher"); - } - if (PyDict_Watch(interp->rare_events.builtins_dict_watcher_id, interp->builtins) != 0) { + interp->dict_state.watchers[0] = &builtins_dict_watcher; + if (PyDict_Watch(0, interp->builtins) != 0) { return _PyStatus_ERR("failed to set builtin dict watcher"); } @@ -1622,8 +1625,13 @@ finalize_modules(PyThreadState *tstate) { PyInterpreterState *interp = tstate->interp; - // Stop collecting stats on __builtin__ modifications during teardown - PyDict_Unwatch(interp->rare_events.builtins_dict_watcher_id, interp->builtins); + // Invalidate all executors and turn off tier 2 optimizer + _Py_Executors_InvalidateAll(interp); + Py_XDECREF(interp->optimizer); + interp->optimizer = &_PyOptimizer_Default; + + // Stop watching __builtin__ modifications + PyDict_Unwatch(0, interp->builtins); PyObject *modules = _PyImport_GetModules(interp); if (modules == NULL) { |
