Diffstat (limited to 'Objects/codeobject.c')
-rw-r--r-- | Objects/codeobject.c | 313 |
1 file changed, 309 insertions, 4 deletions
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 775ea7a..1cf9740 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -6,17 +6,22 @@
 #include "pycore_code.h"          // _PyCodeConstructor
 #include "pycore_frame.h"         // FRAME_SPECIALS_SIZE
 #include "pycore_hashtable.h"     // _Py_hashtable_t
+#include "pycore_index_pool.h"    // _PyIndexPool
 #include "pycore_initconfig.h"    // _PyStatus_OK()
 #include "pycore_interp.h"        // PyInterpreterState.co_extra_freefuncs
 #include "pycore_object.h"        // _PyObject_SetDeferredRefcount
+#include "pycore_object_stack.h"
 #include "pycore_opcode_metadata.h" // _PyOpcode_Deopt, _PyOpcode_Caches
 #include "pycore_opcode_utils.h"  // RESUME_AT_FUNC_START
+#include "pycore_pymem.h"         // _PyMem_FreeDelayed
 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
 #include "pycore_setobject.h"     // _PySet_NextEntry()
 #include "pycore_tuple.h"         // _PyTuple_ITEMS()
 #include "pycore_uniqueid.h"      // _PyObject_AssignUniqueId()
 #include "clinic/codeobject.c.h"
 
+#define INITIAL_SPECIALIZED_CODE_SIZE 16
+
 static const char *
 code_event_name(PyCodeEvent event) {
     switch (event) {
@@ -440,9 +445,15 @@ _PyCode_Validate(struct _PyCodeConstructor *con)
     return 0;
 }
 
-extern void _PyCode_Quicken(PyCodeObject *code);
+extern void
+_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts,
+                int enable_counters);
 
-static void
+#ifdef Py_GIL_DISABLED
+static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size);
+#endif
+
+static int
 init_code(PyCodeObject *co, struct _PyCodeConstructor *con)
 {
     int nlocalsplus = (int)PyTuple_GET_SIZE(con->localsplusnames);
@@ -505,14 +516,27 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con)
     memcpy(_PyCode_CODE(co), PyBytes_AS_STRING(con->code),
            PyBytes_GET_SIZE(con->code));
+#ifdef Py_GIL_DISABLED
+    co->co_tlbc = _PyCodeArray_New(INITIAL_SPECIALIZED_CODE_SIZE);
+    if (co->co_tlbc == NULL) {
+        return -1;
+    }
+    co->co_tlbc->entries[0] = co->co_code_adaptive;
+#endif
     int entry_point = 0;
     while (entry_point < Py_SIZE(co) &&
           _PyCode_CODE(co)[entry_point].op.code != RESUME) {
        entry_point++;
     }
     co->_co_firsttraceable = entry_point;
-    _PyCode_Quicken(co);
+#ifdef Py_GIL_DISABLED
+    _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts,
+                    interp->config.tlbc_enabled);
+#else
+    _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts, 1);
+#endif
     notify_code_watchers(PY_CODE_EVENT_CREATE, co);
+    return 0;
 }
 
 static int
@@ -667,7 +691,12 @@ _PyCode_New(struct _PyCodeConstructor *con)
         PyErr_NoMemory();
         return NULL;
     }
-    init_code(co, con);
+
+    if (init_code(co, con) < 0) {
+        Py_DECREF(co);
+        return NULL;
+    }
+
 #ifdef Py_GIL_DISABLED
     co->_co_unique_id = _PyObject_AssignUniqueId((PyObject *)co);
     _PyObject_GC_TRACK(co);
@@ -1871,6 +1900,17 @@ code_dealloc(PyCodeObject *co)
         PyObject_ClearWeakRefs((PyObject*)co);
     }
     free_monitoring_data(co->_co_monitoring);
+#ifdef Py_GIL_DISABLED
+    // The first element always points to the mutable bytecode at the end of
+    // the code object, which will be freed when the code object is freed.
+    for (Py_ssize_t i = 1; i < co->co_tlbc->size; i++) {
+        char *entry = co->co_tlbc->entries[i];
+        if (entry != NULL) {
+            PyMem_Free(entry);
+        }
+    }
+    PyMem_Free(co->co_tlbc);
+#endif
     PyObject_Free(co);
 }
 
@@ -2646,5 +2686,270 @@ _PyCode_Fini(PyInterpreterState *interp)
         _Py_hashtable_destroy(state->constants);
         state->constants = NULL;
     }
+    _PyIndexPool_Fini(&interp->tlbc_indices);
 #endif
 }
+
+#ifdef Py_GIL_DISABLED
+
+// Thread-local bytecode (TLBC)
+//
+// Each thread specializes a thread-local copy of the bytecode, created on the
+// first RESUME, in free-threaded builds. All copies of the bytecode for a code
+// object are stored in the `co_tlbc` array. Each thread reserves a globally
+// unique index identifying its copy of the bytecode in all `co_tlbc` arrays at
+// thread creation and releases the index at thread destruction. The first
+// entry in every `co_tlbc` array always points to the "main" copy of the
+// bytecode that is stored at the end of the code object. This ensures that no
+// bytecode is copied for programs that do not use threads.
+//
+// Thread-local bytecode can be disabled at runtime by providing either `-X
+// tlbc=0` or `PYTHON_TLBC=0`. Disabling thread-local bytecode also disables
+// specialization. All threads share the main copy of the bytecode when
+// thread-local bytecode is disabled.
+//
+// Concurrent modifications to the bytecode made by the specializing
+// interpreter and instrumentation use atomics, with specialization taking care
+// not to overwrite an instruction that was instrumented concurrently.
+
+int32_t
+_Py_ReserveTLBCIndex(PyInterpreterState *interp)
+{
+    if (interp->config.tlbc_enabled) {
+        return _PyIndexPool_AllocIndex(&interp->tlbc_indices);
+    }
+    // All threads share the main copy of the bytecode when TLBC is disabled
+    return 0;
+}
+
+void
+_Py_ClearTLBCIndex(_PyThreadStateImpl *tstate)
+{
+    PyInterpreterState *interp = ((PyThreadState *)tstate)->interp;
+    if (interp->config.tlbc_enabled) {
+        _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index);
+    }
+}
+
+static _PyCodeArray *
+_PyCodeArray_New(Py_ssize_t size)
+{
+    _PyCodeArray *arr = PyMem_Calloc(
+        1, offsetof(_PyCodeArray, entries) + sizeof(void *) * size);
+    if (arr == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    arr->size = size;
+    return arr;
+}
+
+static void
+copy_code(_Py_CODEUNIT *dst, PyCodeObject *co)
+{
+    int code_len = (int) Py_SIZE(co);
+    for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) {
+        dst[i] = _Py_GetBaseCodeUnit(co, i);
+    }
+    _PyCode_Quicken(dst, code_len, co->co_consts, 1);
+}
+
+static Py_ssize_t
+get_pow2_greater(Py_ssize_t initial, Py_ssize_t limit)
+{
+    // initial must be a power of two
+    assert(!(initial & (initial - 1)));
+    Py_ssize_t res = initial;
+    while (res && res < limit) {
+        res <<= 1;
+    }
+    return res;
+}
+
+static _Py_CODEUNIT *
+create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
+{
+    _PyCodeArray *tlbc = co->co_tlbc;
+    if (idx >= tlbc->size) {
+        Py_ssize_t new_size = get_pow2_greater(tlbc->size, idx + 1);
+        if (!new_size) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+        _PyCodeArray *new_tlbc = _PyCodeArray_New(new_size);
+        if (new_tlbc == NULL) {
+            return NULL;
+        }
+        memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
+        _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
+        _PyMem_FreeDelayed(tlbc);
+        tlbc = new_tlbc;
+    }
+    char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));
+    if (bc == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    copy_code((_Py_CODEUNIT *) bc, co);
+    assert(tlbc->entries[idx] == NULL);
+    tlbc->entries[idx] = bc;
+    return (_Py_CODEUNIT *) bc;
+}
+
+static _Py_CODEUNIT *
+get_tlbc_lock_held(PyCodeObject *co)
+{
+    _PyCodeArray *tlbc = co->co_tlbc;
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET();
+    int32_t idx = tstate->tlbc_index;
+    if (idx < tlbc->size && tlbc->entries[idx] != NULL) {
+        return (_Py_CODEUNIT *)tlbc->entries[idx];
+    }
+    return create_tlbc_lock_held(co, idx);
+}
+
+_Py_CODEUNIT *
+_PyCode_GetTLBC(PyCodeObject *co)
+{
+    _Py_CODEUNIT *result;
+    Py_BEGIN_CRITICAL_SECTION(co);
+    result = get_tlbc_lock_held(co);
+    Py_END_CRITICAL_SECTION();
+    return result;
+}
+
+// My kingdom for a bitset
+struct flag_set {
+    uint8_t *flags;
+    Py_ssize_t size;
+};
+
+static inline int
+flag_is_set(struct flag_set *flags, Py_ssize_t idx)
+{
+    assert(idx >= 0);
+    return (idx < flags->size) && flags->flags[idx];
+}
+
+// Set the flag for each tlbc index in use
+static int
+get_indices_in_use(PyInterpreterState *interp, struct flag_set *in_use)
+{
+    assert(interp->stoptheworld.world_stopped);
+    assert(in_use->flags == NULL);
+    int32_t max_index = 0;
+    for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) {
+        int32_t idx = ((_PyThreadStateImpl *) p)->tlbc_index;
+        if (idx > max_index) {
+            max_index = idx;
+        }
+    }
+    in_use->size = (size_t) max_index + 1;
+    in_use->flags = PyMem_Calloc(in_use->size, sizeof(*in_use->flags));
+    if (in_use->flags == NULL) {
+        return -1;
+    }
+    for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) {
+        in_use->flags[((_PyThreadStateImpl *) p)->tlbc_index] = 1;
+    }
+    return 0;
+}
+
+struct get_code_args {
+    _PyObjectStack code_objs;
+    struct flag_set indices_in_use;
+    int err;
+};
+
+static void
+clear_get_code_args(struct get_code_args *args)
+{
+    if (args->indices_in_use.flags != NULL) {
+        PyMem_Free(args->indices_in_use.flags);
+        args->indices_in_use.flags = NULL;
+    }
+    _PyObjectStack_Clear(&args->code_objs);
+}
+
+static inline int
+is_bytecode_unused(_PyCodeArray *tlbc, Py_ssize_t idx,
+                   struct flag_set *indices_in_use)
+{
+    assert(idx > 0 && idx < tlbc->size);
+    return tlbc->entries[idx] != NULL && !flag_is_set(indices_in_use, idx);
+}
+
+static int
+get_code_with_unused_tlbc(PyObject *obj, struct get_code_args *args)
+{
+    if (!PyCode_Check(obj)) {
+        return 1;
+    }
+    PyCodeObject *co = (PyCodeObject *) obj;
+    _PyCodeArray *tlbc = co->co_tlbc;
+    // The first index always points at the main copy of the bytecode embedded
+    // in the code object.
+    for (Py_ssize_t i = 1; i < tlbc->size; i++) {
+        if (is_bytecode_unused(tlbc, i, &args->indices_in_use)) {
+            if (_PyObjectStack_Push(&args->code_objs, obj) < 0) {
+                args->err = -1;
+                return 0;
+            }
+            return 1;
+        }
+    }
+    return 1;
+}
+
+static void
+free_unused_bytecode(PyCodeObject *co, struct flag_set *indices_in_use)
+{
+    _PyCodeArray *tlbc = co->co_tlbc;
+    // The first index always points at the main copy of the bytecode embedded
+    // in the code object.
+    for (Py_ssize_t i = 1; i < tlbc->size; i++) {
+        if (is_bytecode_unused(tlbc, i, indices_in_use)) {
+            PyMem_Free(tlbc->entries[i]);
+            tlbc->entries[i] = NULL;
+        }
+    }
+}
+
+int
+_Py_ClearUnusedTLBC(PyInterpreterState *interp)
+{
+    struct get_code_args args = {
+        .code_objs = {NULL},
+        .indices_in_use = {NULL, 0},
+        .err = 0,
+    };
+    _PyEval_StopTheWorld(interp);
+    // Collect in-use tlbc indices
+    if (get_indices_in_use(interp, &args.indices_in_use) < 0) {
+        goto err;
+    }
+    // Collect code objects that have bytecode not in use by any thread
+    _PyGC_VisitObjectsWorldStopped(
+        interp, (gcvisitobjects_t)get_code_with_unused_tlbc, &args);
+    if (args.err < 0) {
+        goto err;
+    }
+    // Free unused bytecode. This must happen outside of gc_visit_heaps; it is
+    // unsafe to allocate or free any mimalloc managed memory when it's
+    // running.
+    PyObject *obj;
+    while ((obj = _PyObjectStack_Pop(&args.code_objs)) != NULL) {
+        free_unused_bytecode((PyCodeObject*) obj, &args.indices_in_use);
+    }
+    _PyEval_StartTheWorld(interp);
+    clear_get_code_args(&args);
+    return 0;
+
+err:
+    _PyEval_StartTheWorld(interp);
+    clear_get_code_args(&args);
+    PyErr_NoMemory();
+    return -1;
+}
+
+#endif
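A note on the growth arithmetic in create_tlbc_lock_held above: get_pow2_greater keeps the co_tlbc array sized at a power of two, doubling from the current size until the requested index fits (the caller passes idx + 1 as the limit), and returning 0 if the doubling overflows, which the caller turns into a memory error. The following standalone sketch mirrors that helper under invented names, with ptrdiff_t standing in for Py_ssize_t and the same reliance on two's-complement wraparound to terminate on overflow:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Sketch of get_pow2_greater: the smallest power of two >= limit, doubling
 * up from `initial`, which must itself be a power of two. Returns 0 when
 * the repeated doubling overflows, signalling that no valid size exists. */
static ptrdiff_t
pow2_greater(ptrdiff_t initial, ptrdiff_t limit)
{
    assert(initial > 0 && !(initial & (initial - 1)));
    ptrdiff_t res = initial;
    while (res && res < limit) {
        res <<= 1;
    }
    return res;
}

int
main(void)
{
    assert(pow2_greater(16, 10) == 16);   /* index already fits: no growth */
    assert(pow2_greater(16, 17) == 32);   /* one doubling */
    assert(pow2_greater(16, 100) == 128); /* several doublings */
    printf("ok\n");
    return 0;
}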
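The delicate step in create_tlbc_lock_held is replacing co_tlbc while other threads may be reading it without a lock: the grown array is published with _Py_atomic_store_ptr_release and the old array goes to _PyMem_FreeDelayed rather than PyMem_Free, so no concurrent reader is left holding freed memory. Below is a minimal sketch of that pattern in portable C11 atomics; code_array, grow_and_publish, and read_entry are invented for illustration, and this free_delayed frees immediately only because the demo is single-threaded, whereas CPython's deferred free waits until no thread can still hold the old pointer.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    ptrdiff_t size;
    void *entries[];          /* flexible array, like _PyCodeArray */
} code_array;

/* Stand-in for _PyMem_FreeDelayed: a real implementation queues the pointer
 * until every reader has moved past a quiescent point. Immediate free is
 * safe here only because this demo has a single thread. */
static void
free_delayed(code_array *arr)
{
    free(arr);
}

static code_array *
array_new(ptrdiff_t size)
{
    code_array *arr = calloc(1, sizeof(code_array) + size * sizeof(void *));
    if (arr != NULL) {
        arr->size = size;
    }
    return arr;
}

/* Writer side, run with the owning object's lock held: copy the entries
 * into a larger array, then publish it with a release store so a reader
 * that observes the new array also observes the copied contents. */
static code_array *
grow_and_publish(_Atomic(code_array *) *slot, ptrdiff_t new_size)
{
    code_array *old = atomic_load_explicit(slot, memory_order_relaxed);
    code_array *grown = array_new(new_size);
    if (grown == NULL) {
        return NULL;
    }
    memcpy(grown->entries, old->entries, old->size * sizeof(void *));
    atomic_store_explicit(slot, grown, memory_order_release);
    free_delayed(old);        /* concurrent readers may still hold `old` */
    return grown;
}

/* Reader side, lock-free: the acquire load pairs with the release store. */
static void *
read_entry(_Atomic(code_array *) *slot, ptrdiff_t idx)
{
    code_array *arr = atomic_load_explicit(slot, memory_order_acquire);
    return idx < arr->size ? arr->entries[idx] : NULL;
}

int
main(void)
{
    _Atomic(code_array *) slot;
    atomic_init(&slot, array_new(4));
    atomic_load(&slot)->entries[0] = "main copy";
    if (grow_and_publish(&slot, 8) == NULL) {
        return 1;
    }
    printf("entry 0 survives growth: %s\n", (char *)read_entry(&slot, 0));
    free(atomic_load(&slot));
    return 0;
}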
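_Py_ClearUnusedTLBC above is a stop-the-world mark-and-sweep over bytecode copies: with every thread paused, it marks each tlbc index owned by a live thread, then sweeps any entry whose index is unmarked, always skipping entry 0 because that is the main copy embedded in the code object itself. The toy model below compresses the flag set and the heap visit into plain arrays; all names are invented and nothing here is CPython API:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 8

/* slots[] models one code object's co_tlbc entries; in_use[] models the
 * flag set built from the tlbc_index of every live thread while the world
 * is stopped. Slot 0 stands for the embedded main copy and is never swept. */
static void
sweep_unused(char *slots[NSLOTS], const unsigned char in_use[NSLOTS])
{
    for (int i = 1; i < NSLOTS; i++) {
        if (slots[i] != NULL && !in_use[i]) {
            free(slots[i]);
            slots[i] = NULL;
        }
    }
}

int
main(void)
{
    char *slots[NSLOTS] = {0};
    unsigned char in_use[NSLOTS] = {0};

    slots[2] = malloc(1);   /* copy owned by a live thread (index 2) */
    slots[5] = malloc(1);   /* copy left behind by an exited thread */
    in_use[2] = 1;          /* index 2 is marked: its thread is alive */
                            /* index 5 stays unmarked: nothing owns it */

    sweep_unused(slots, in_use);
    assert(slots[2] != NULL);   /* kept */
    assert(slots[5] == NULL);   /* freed */
    printf("sweep ok\n");
    free(slots[2]);
    return 0;
}

In the real function the freeing happens only after the heap visit completes, for the reason given in its comment: it is unsafe to allocate or free mimalloc-managed memory while gc_visit_heaps is running.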