/* Perf trampoline instrumentation =============================== This file contains instrumentation that allows associating calls to the CPython eval loop back to the names of the Python functions and filename being executed. Many native performance profilers like the Linux perf tools are only able to 'see' the C stack when sampling from the profiled process. This means that if we have the following python code: import time def foo(n): # Some CPU intensive code def bar(n): foo(n) def baz(n): bar(n) baz(10000000) A performance profiler that is only able to see native frames will produce the following backtrace when sampling from foo(): _PyEval_EvalFrameDefault -----> Evaluation frame of foo() _PyEval_Vector _PyFunction_Vectorcall PyObject_Vectorcall call_function _PyEval_EvalFrameDefault ------> Evaluation frame of bar() _PyEval_EvalFrame _PyEval_Vector _PyFunction_Vectorcall PyObject_Vectorcall call_function _PyEval_EvalFrameDefault -------> Evaluation frame of baz() _PyEval_EvalFrame _PyEval_Vector _PyFunction_Vectorcall PyObject_Vectorcall call_function ... Py_RunMain Because the profiler is only able to see the native frames and the native function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault) then the profiler and any reporter generated by it will not be able to associate the names of the Python functions and the filenames associated with those calls, rendering the results useless in the Python world. To fix this problem, we introduce the concept of a trampoline frame. A trampoline frame is a piece of code that is unique per Python code object that is executed before entering the CPython eval loop. This piece of code just calls the original Python evaluation function (_PyEval_EvalFrameDefault) and forwards all the arguments received. 
In this way, when a profiler samples frames from the previous example it will see: _PyEval_EvalFrameDefault -----> Evaluation frame of foo() [Jit compiled code 3] _PyEval_Vector _PyFunction_Vectorcall PyObject_Vectorcall call_function _PyEval_EvalFrameDefault ------> Evaluation frame of bar() [Jit compiled code 2] _PyEval_EvalFrame _PyEval_Vector _PyFunction_Vectorcall PyObject_Vectorcall call_function _PyEval_EvalFrameDefault -------> Evaluation frame of baz() [Jit compiled code 1] _PyEval_EvalFrame _PyEval_Vector _PyFunction_Vectorcall PyObject_Vectorcall call_function ... Py_RunMain When we generate every unique copy of the trampoline (what here we called "[Jit compiled code N]") we write the relationship between the compiled code and the Python function that is associated with it. Every profiler requires this information in a different format. For example, the Linux "perf" profiler requires a file in "/tmp/perf-PID.map" (name and location not configurable) with the following format: <address> <size> <name> If this file is available when "perf" generates reports, it will automatically associate every trampoline with the Python function that it is associated with allowing it to generate reports that include Python information. These reports then can also be filtered in a way that *only* Python information appears. Notice that for this to work, there must be a unique copy of the trampoline per Python code object even if the code in the trampoline is the same. To achieve this we have an assembly template in Objects/asm_trampoline.S that is compiled into the Python executable/shared library. This template generates a symbol that maps the start of the assembly code and another that marks the end of the assembly code for the trampoline. Then, every time we need a unique trampoline for a Python code object, we copy the assembly code into a mmaped area that has executable permissions and we return the start of that area as our trampoline function. 
Asking for a mmap-ed memory area for trampoline is very wasteful so we allocate big arenas of memory in a single mmap call, we populate the entire arena with copies of the trampoline (this allows us to now have to invalidate the icache for the instructions in the page) and then we return the next available chunk every time someone asks for a new trampoline. We keep a linked list of arenas in case the current memory arena is exhausted and another one is needed. For the best results, Python should be compiled with CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows profilers to unwind using only the frame pointer and not on DWARF debug information (note that as trampilines are dynamically generated there won't be any DWARF information available for them). */ #include "Python.h" #include "pycore_ceval.h" #include "pycore_frame.h" #include "pycore_interp.h" #ifdef PY_HAVE_PERF_TRAMPOLINE #include #include #include #include #include #include #if defined(__arm__) || defined(__arm64__) || defined(__aarch64__) #define PY_HAVE_INVALIDATE_ICACHE #if defined(__clang__) || defined(__GNUC__) extern void __clear_cache(void *, void*); #endif static void invalidate_icache(char* begin, char*end) { #if defined(__clang__) || defined(__GNUC__) return __clear_cache(begin, end); #else return; #endif } #endif /* The function pointer is passed as last argument. The other three arguments * are passed in the same order as the function requires. This results in * shorter, more efficient ASM code for trampoline. 
*/ typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *, int throwflag); typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int, py_evaluator); extern void *_Py_trampoline_func_start; // Start of the template of the // assembly trampoline extern void * _Py_trampoline_func_end; // End of the template of the assembly trampoline struct code_arena_st { char *start_addr; // Start of the memory arena char *current_addr; // Address of the current trampoline within the arena size_t size; // Size of the memory arena size_t size_left; // Remaining size of the memory arena size_t code_size; // Size of the code of every trampoline in the arena struct code_arena_st *prev; // Pointer to the arena or NULL if this is the first arena. }; typedef struct code_arena_st code_arena_t; typedef struct trampoline_api_st trampoline_api_t; #define perf_status _PyRuntime.ceval.perf.status #define extra_code_index _PyRuntime.ceval.perf.extra_code_index #define perf_code_arena _PyRuntime.ceval.perf.code_arena #define trampoline_api _PyRuntime.ceval.perf.trampoline_api #define perf_map_file _PyRuntime.ceval.perf.map_file static void perf_map_write_entry(void *state, const void *code_addr, unsigned int code_size, PyCodeObject *co) { const char *entry = ""; if (co->co_qualname != NULL) { entry = PyUnicode_AsUTF8(co->co_qualname); } const char *filename = ""; if (co->co_filename != NULL) { filename = PyUnicode_AsUTF8(co->co_filename); } size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); if (perf_map_entry == NULL) { return; } snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry); PyMem_RawFree(perf_map_entry); } _PyPerf_Callbacks _Py_perfmap_callbacks = { NULL, &perf_map_write_entry, NULL, }; static int new_code_arena(void) { // non-trivial programs typically need 64 
to 256 kiB. size_t mem_size = 4096 * 16; assert(mem_size % sysconf(_SC_PAGESIZE) == 0); char *memory = mmap(NULL, // address mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, // fd (not used here) 0); // offset (not used here) if (!memory) { PyErr_SetFromErrno(PyExc_OSError); _PyErr_WriteUnraisableMsg( "Failed to create new mmap for perf trampoline", NULL); perf_status = PERF_STATUS_FAILED; return -1; } void *start = &_Py_trampoline_func_start; void *end = &_Py_trampoline_func_end; size_t code_size = end - start; // TODO: Check the effect of alignment of the code chunks. Initial investigation // showed that this has no effect on performance in x86-64 or aarch64 and the current // version has the advantage that the unwinder in GDB can unwind across JIT-ed code. // // We should check the values in the future and see if there is a // measurable performance improvement by rounding trampolines up to 32-bit // or 64-bit alignment. size_t n_copies = mem_size / code_size; for (size_t i = 0; i < n_copies; i++) { memcpy(memory + i * code_size, start, code_size * sizeof(char)); } // Some systems may prevent us from creating executable code on the fly. int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC); if (res == -1) { PyErr_SetFromErrno(PyExc_OSError); munmap(memory, mem_size); _PyErr_WriteUnraisableMsg( "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", NULL); return -1; } #ifdef PY_HAVE_INVALIDATE_ICACHE // Before the JIT can run a block of code that has been emitted it must invalidate // the instruction cache on some platforms like arm and aarch64. 
invalidate_icache(memory, memory + mem_size); #endif code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t)); if (new_arena == NULL) { PyErr_NoMemory(); munmap(memory, mem_size); _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct", NULL); return -1; } new_arena->start_addr = memory; new_arena->current_addr = memory; new_arena->size = mem_size; new_arena->size_left = mem_size; new_arena->code_size = code_size; new_arena->prev = perf_code_arena; perf_code_arena = new_arena; return 0; } static void free_code_arenas(void) { code_arena_t *cur = perf_code_arena; code_arena_t *prev; perf_code_arena = NULL; // invalid static pointer while (cur) { munmap(cur->start_addr, cur->size); prev = cur->prev; PyMem_RawFree(cur); cur = prev; } } static inline py_trampoline code_arena_new_code(code_arena_t *code_arena) { py_trampoline trampoline = (py_trampoline)code_arena->current_addr; code_arena->size_left -= code_arena->code_size; code_arena->current_addr += code_arena->code_size; return trampoline; } static inline py_trampoline compile_trampoline(void) { if ((perf_code_arena == NULL) || (perf_code_arena->size_left <= perf_code_arena->code_size)) { if (new_code_arena() < 0) { return NULL; } } assert(perf_code_arena->size_left <= perf_code_arena->size); return code_arena_new_code(perf_code_arena); } static PyObject * py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame, int throw) { if (perf_status == PERF_STATUS_FAILED || perf_status == PERF_STATUS_NO_INIT) { goto default_eval; } PyCodeObject *co = _PyFrame_GetCode(frame); py_trampoline f = NULL; assert(extra_code_index != -1); int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); if (ret != 0 || f == NULL) { // This is the first time we see this code object so we need // to compile a trampoline for it. 
py_trampoline new_trampoline = compile_trampoline(); if (new_trampoline == NULL) { goto default_eval; } trampoline_api.write_state(trampoline_api.state, new_trampoline, perf_code_arena->code_size, co); _PyCode_SetExtra((PyObject *)co, extra_code_index, (void *)new_trampoline); f = new_trampoline; } assert(f != NULL); return f(ts, frame, throw, _PyEval_EvalFrameDefault); default_eval: // Something failed, fall back to the default evaluator. return _PyEval_EvalFrameDefault(ts, frame, throw); } #endif // PY_HAVE_PERF_TRAMPOLINE int _PyIsPerfTrampolineActive(void) { #ifdef PY_HAVE_PERF_TRAMPOLINE PyThreadState *tstate = _PyThreadState_GET(); return tstate->interp->eval_frame == py_trampoline_evaluator; #endif return 0; } void _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks) { if (callbacks == NULL) { return; } #ifdef PY_HAVE_PERF_TRAMPOLINE callbacks->init_state = trampoline_api.init_state; callbacks->write_state = trampoline_api.write_state; callbacks->free_state = trampoline_api.free_state; #endif return; } int _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks) { if (callbacks == NULL) { return -1; } #ifdef PY_HAVE_PERF_TRAMPOLINE if (trampoline_api.state) { _PyPerfTrampoline_Fini(); } trampoline_api.init_state = callbacks->init_state; trampoline_api.write_state = callbacks->write_state; trampoline_api.free_state = callbacks->free_state; trampoline_api.state = NULL; perf_status = PERF_STATUS_OK; #endif return 0; } int _PyPerfTrampoline_Init(int activate) { #ifdef PY_HAVE_PERF_TRAMPOLINE PyThreadState *tstate = _PyThreadState_GET(); if (tstate->interp->eval_frame && tstate->interp->eval_frame != py_trampoline_evaluator) { PyErr_SetString(PyExc_RuntimeError, "Trampoline cannot be initialized as a custom eval " "frame is already present"); return -1; } if (!activate) { tstate->interp->eval_frame = NULL; } else { tstate->interp->eval_frame = py_trampoline_evaluator; if (new_code_arena() < 0) { return -1; } extra_code_index = 
_PyEval_RequestCodeExtraIndex(NULL); if (extra_code_index == -1) { return -1; } perf_status = PERF_STATUS_OK; } #endif return 0; } int _PyPerfTrampoline_Fini(void) { #ifdef PY_HAVE_PERF_TRAMPOLINE PyThreadState *tstate = _PyThreadState_GET(); if (tstate->interp->eval_frame == py_trampoline_evaluator) { tstate->interp->eval_frame = NULL; } free_code_arenas(); extra_code_index = -1; #endif return 0; } PyStatus _PyPerfTrampoline_AfterFork_Child(void) { #ifdef PY_HAVE_PERF_TRAMPOLINE // Restart trampoline in file in child. int was_active = _PyIsPerfTrampolineActive(); _PyPerfTrampoline_Fini(); PyUnstable_PerfMapState_Fini(); if (was_active) { _PyPerfTrampoline_Init(1); } #endif return PyStatus_Ok(); }