From 1f737edb67e702095feb97118a911afb569f5705 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Tue, 25 Oct 2022 23:34:22 +0100 Subject: gh-96143: Move the perf trampoline files to the Python directory (#98675) --- Makefile.pre.in | 4 +- Objects/asm_trampoline.S | 28 -- Objects/perf_trampoline.c | 531 ---------------------------- PCbuild/_freeze_module.vcxproj | 2 +- PCbuild/_freeze_module.vcxproj.filters | 2 +- PCbuild/pythoncore.vcxproj | 2 +- PCbuild/pythoncore.vcxproj.filters | 6 +- Python/asm_trampoline.S | 28 ++ Python/perf_trampoline.c | 531 ++++++++++++++++++++++++++++ Tools/c-analyzer/cpython/globals-to-fix.tsv | 10 +- Tools/c-analyzer/cpython/ignored.tsv | 6 +- configure | 2 +- configure.ac | 2 +- 13 files changed, 577 insertions(+), 577 deletions(-) delete mode 100644 Objects/asm_trampoline.S delete mode 100644 Objects/perf_trampoline.c create mode 100644 Python/asm_trampoline.S create mode 100644 Python/perf_trampoline.c diff --git a/Makefile.pre.in b/Makefile.pre.in index 5b4bf15..6ab1422 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -426,6 +426,7 @@ PYTHON_OBJS= \ Python/formatter_unicode.o \ Python/fileutils.o \ Python/suggestions.o \ + Python/perf_trampoline.o \ Python/$(DYNLOADFILE) \ $(LIBOBJS) \ $(MACHDEP_OBJS) \ @@ -479,7 +480,6 @@ OBJECT_OBJS= \ Objects/unicodectype.o \ Objects/unionobject.o \ Objects/weakrefobject.o \ - Objects/perf_trampoline.o \ @PERF_TRAMPOLINE_OBJ@ DEEPFREEZE_OBJS = Python/deepfreeze/deepfreeze.o @@ -2370,7 +2370,7 @@ config.status: $(srcdir)/configure .PRECIOUS: config.status $(BUILDPYTHON) Makefile Makefile.pre -Objects/asm_trampoline.o: $(srcdir)/Objects/asm_trampoline.S +Python/asm_trampoline.o: $(srcdir)/Python/asm_trampoline.S $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< # Some make's put the object file in the current directory diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S deleted file mode 100644 index 4607077..0000000 --- a/Objects/asm_trampoline.S +++ /dev/null @@ -1,28 +0,0 @@ - .text 
- .globl _Py_trampoline_func_start -# The following assembly is equivalent to: -# PyObject * -# trampoline(PyThreadState *ts, _PyInterpreterFrame *f, -# int throwflag, py_evaluator evaluator) -# { -# return evaluator(ts, f, throwflag); -# } -_Py_trampoline_func_start: -#ifdef __x86_64__ - sub $8, %rsp - call *%rcx - add $8, %rsp - ret -#endif // __x86_64__ -#if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - // ARM64 little endian, 64bit ABI - // generate with aarch64-linux-gnu-gcc 12.1 - stp x29, x30, [sp, -16]! - mov x29, sp - blr x3 - ldp x29, x30, [sp], 16 - ret -#endif - .globl _Py_trampoline_func_end -_Py_trampoline_func_end: - .section .note.GNU-stack,"",@progbits diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c deleted file mode 100644 index 161e0ef..0000000 --- a/Objects/perf_trampoline.c +++ /dev/null @@ -1,531 +0,0 @@ -/* - -Perf trampoline instrumentation -=============================== - -This file contains instrumentation to allow to associate -calls to the CPython eval loop back to the names of the Python -functions and filename being executed. - -Many native performance profilers like the Linux perf tools are -only available to 'see' the C stack when sampling from the profiled -process. 
This means that if we have the following python code: - - import time - def foo(n): - # Some CPU intensive code - - def bar(n): - foo(n) - - def baz(n): - bar(n) - - baz(10000000) - -A performance profiler that is only able to see native frames will -produce the following backtrace when sampling from foo(): - - _PyEval_EvalFrameDefault -----> Evaluation frame of foo() - _PyEval_Vector - _PyFunction_Vectorcall - PyObject_Vectorcall - call_function - - _PyEval_EvalFrameDefault ------> Evaluation frame of bar() - _PyEval_EvalFrame - _PyEval_Vector - _PyFunction_Vectorcall - PyObject_Vectorcall - call_function - - _PyEval_EvalFrameDefault -------> Evaluation frame of baz() - _PyEval_EvalFrame - _PyEval_Vector - _PyFunction_Vectorcall - PyObject_Vectorcall - call_function - - ... - - Py_RunMain - -Because the profiler is only able to see the native frames and the native -function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault) -then the profiler and any reporter generated by it will not be able to -associate the names of the Python functions and the filenames associated with -those calls, rendering the results useless in the Python world. - -To fix this problem, we introduce the concept of a trampoline frame. A -trampoline frame is a piece of code that is unique per Python code object that -is executed before entering the CPython eval loop. This piece of code just -calls the original Python evaluation function (_PyEval_EvalFrameDefault) and -forwards all the arguments received. 
In this way, when a profiler samples -frames from the previous example it will see; - - _PyEval_EvalFrameDefault -----> Evaluation frame of foo() - [Jit compiled code 3] - _PyEval_Vector - _PyFunction_Vectorcall - PyObject_Vectorcall - call_function - - _PyEval_EvalFrameDefault ------> Evaluation frame of bar() - [Jit compiled code 2] - _PyEval_EvalFrame - _PyEval_Vector - _PyFunction_Vectorcall - PyObject_Vectorcall - call_function - - _PyEval_EvalFrameDefault -------> Evaluation frame of baz() - [Jit compiled code 1] - _PyEval_EvalFrame - _PyEval_Vector - _PyFunction_Vectorcall - PyObject_Vectorcall - call_function - - ... - - Py_RunMain - -When we generate every unique copy of the trampoline (what here we called "[Jit -compiled code N]") we write the relationship between the compiled code and the -Python function that is associated with it. Every profiler requires this -information in a different format. For example, the Linux "perf" profiler -requires a file in "/tmp/perf-PID.map" (name and location not configurable) -with the following format: - - - -If this file is available when "perf" generates reports, it will automatically -associate every trampoline with the Python function that it is associated with -allowing it to generate reports that include Python information. These reports -then can also be filtered in a way that *only* Python information appears. - -Notice that for this to work, there must be a unique copied of the trampoline -per Python code object even if the code in the trampoline is the same. To -achieve this we have a assembly template in Objects/asm_trampiline.S that is -compiled into the Python executable/shared library. This template generates a -symbol that maps the start of the assembly code and another that marks the end -of the assembly code for the trampoline. 
Then, every time we need a unique -trampoline for a Python code object, we copy the assembly code into a mmaped -area that has executable permissions and we return the start of that area as -our trampoline function. - -Asking for a mmap-ed memory area for trampoline is very wasteful so we -allocate big arenas of memory in a single mmap call, we populate the entire -arena with copies of the trampoline (this allows us to now have to invalidate -the icache for the instructions in the page) and then we return the next -available chunk every time someone asks for a new trampoline. We keep a linked -list of arenas in case the current memory arena is exhausted and another one is -needed. - -For the best results, Python should be compiled with -CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows -profilers to unwind using only the frame pointer and not on DWARF debug -information (note that as trampilines are dynamically generated there won't be -any DWARF information available for them). -*/ - -#include "Python.h" -#include "pycore_ceval.h" -#include "pycore_frame.h" -#include "pycore_interp.h" - -typedef enum { - PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state - PERF_STATUS_NO_INIT = 0, // Perf trampoline is not initialized - PERF_STATUS_OK = 1, // Perf trampoline is ready to be executed -} perf_status_t; - -#ifdef PY_HAVE_PERF_TRAMPOLINE - -#include -#include -#include -#include -#include -#include - -#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__) -#define PY_HAVE_INVALIDATE_ICACHE - -#if defined(__clang__) || defined(__GNUC__) -extern void __clear_cache(void *, void*); -#endif - -static void invalidate_icache(char* begin, char*end) { -#if defined(__clang__) || defined(__GNUC__) - return __clear_cache(begin, end); -#else - return; -#endif -} -#endif - -/* The function pointer is passed as last argument. The other three arguments - * are passed in the same order as the function requires. 
This results in - * shorter, more efficient ASM code for trampoline. - */ -typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *, - int throwflag); -typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int, - py_evaluator); - -extern void *_Py_trampoline_func_start; // Start of the template of the - // assembly trampoline -extern void * - _Py_trampoline_func_end; // End of the template of the assembly trampoline - -struct code_arena_st { - char *start_addr; // Start of the memory arena - char *current_addr; // Address of the current trampoline within the arena - size_t size; // Size of the memory arena - size_t size_left; // Remaining size of the memory arena - size_t code_size; // Size of the code of every trampoline in the arena - struct code_arena_st - *prev; // Pointer to the arena or NULL if this is the first arena. -}; - -typedef struct code_arena_st code_arena_t; - -struct trampoline_api_st { - void* (*init_state)(void); - void (*write_state)(void* state, const void *code_addr, - unsigned int code_size, PyCodeObject* code); - int (*free_state)(void* state); - void *state; -}; - -typedef struct trampoline_api_st trampoline_api_t; - - -static perf_status_t perf_status = PERF_STATUS_NO_INIT; -static Py_ssize_t extra_code_index = -1; -static code_arena_t *code_arena; -static trampoline_api_t trampoline_api; - -static FILE *perf_map_file; - -static void * -perf_map_get_file(void) -{ - if (perf_map_file) { - return perf_map_file; - } - char filename[100]; - pid_t pid = getpid(); - // Location and file name of perf map is hard-coded in perf tool. - // Use exclusive create flag wit nofollow to prevent symlink attacks. 
- int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC; - snprintf(filename, sizeof(filename) - 1, "/tmp/perf-%jd.map", - (intmax_t)pid); - int fd = open(filename, flags, 0600); - if (fd == -1) { - perf_status = PERF_STATUS_FAILED; - PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename); - return NULL; - } - perf_map_file = fdopen(fd, "w"); - if (!perf_map_file) { - perf_status = PERF_STATUS_FAILED; - PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename); - close(fd); - return NULL; - } - return perf_map_file; -} - -static int -perf_map_close(void *state) -{ - FILE *fp = (FILE *)state; - int ret = 0; - if (fp) { - ret = fclose(fp); - } - perf_map_file = NULL; - perf_status = PERF_STATUS_NO_INIT; - return ret; -} - -static void -perf_map_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) -{ - assert(state != NULL); - FILE *method_file = (FILE *)state; - const char *entry = PyUnicode_AsUTF8(co->co_qualname); - if (entry == NULL) { - _PyErr_WriteUnraisableMsg("Failed to get qualname from code object", - NULL); - return; - } - const char *filename = PyUnicode_AsUTF8(co->co_filename); - if (filename == NULL) { - _PyErr_WriteUnraisableMsg("Failed to get filename from code object", - NULL); - return; - } - fprintf(method_file, "%p %x py::%s:%s\n", code_addr, code_size, entry, - filename); - fflush(method_file); -} - -_PyPerf_Callbacks _Py_perfmap_callbacks = { - &perf_map_get_file, - &perf_map_write_entry, - &perf_map_close -}; - -static int -new_code_arena(void) -{ - // non-trivial programs typically need 64 to 256 kiB. 
- size_t mem_size = 4096 * 16; - assert(mem_size % sysconf(_SC_PAGESIZE) == 0); - char *memory = - mmap(NULL, // address - mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, - -1, // fd (not used here) - 0); // offset (not used here) - if (!memory) { - PyErr_SetFromErrno(PyExc_OSError); - _PyErr_WriteUnraisableMsg( - "Failed to create new mmap for perf trampoline", NULL); - perf_status = PERF_STATUS_FAILED; - return -1; - } - void *start = &_Py_trampoline_func_start; - void *end = &_Py_trampoline_func_end; - size_t code_size = end - start; - // TODO: Check the effect of alignment of the code chunks. Initial investigation - // showed that this has no effect on performance in x86-64 or aarch64 and the current - // version has the advantage that the unwinder in GDB can unwind across JIT-ed code. - // - // We should check the values in the future and see if there is a - // measurable performance improvement by rounding trampolines up to 32-bit - // or 64-bit alignment. - - size_t n_copies = mem_size / code_size; - for (size_t i = 0; i < n_copies; i++) { - memcpy(memory + i * code_size, start, code_size * sizeof(char)); - } - // Some systems may prevent us from creating executable code on the fly. - int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC); - if (res == -1) { - PyErr_SetFromErrno(PyExc_OSError); - munmap(memory, mem_size); - _PyErr_WriteUnraisableMsg( - "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", - NULL); - return -1; - } - -#ifdef PY_HAVE_INVALIDATE_ICACHE - // Before the JIT can run a block of code that has been emitted it must invalidate - // the instruction cache on some platforms like arm and aarch64. 
- invalidate_icache(memory, memory + mem_size); -#endif - - code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t)); - if (new_arena == NULL) { - PyErr_NoMemory(); - munmap(memory, mem_size); - _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct", - NULL); - return -1; - } - - new_arena->start_addr = memory; - new_arena->current_addr = memory; - new_arena->size = mem_size; - new_arena->size_left = mem_size; - new_arena->code_size = code_size; - new_arena->prev = code_arena; - code_arena = new_arena; - return 0; -} - -static void -free_code_arenas(void) -{ - code_arena_t *cur = code_arena; - code_arena_t *prev; - code_arena = NULL; // invalid static pointer - while (cur) { - munmap(cur->start_addr, cur->size); - prev = cur->prev; - PyMem_RawFree(cur); - cur = prev; - } -} - -static inline py_trampoline -code_arena_new_code(code_arena_t *code_arena) -{ - py_trampoline trampoline = (py_trampoline)code_arena->current_addr; - code_arena->size_left -= code_arena->code_size; - code_arena->current_addr += code_arena->code_size; - return trampoline; -} - -static inline py_trampoline -compile_trampoline(void) -{ - if ((code_arena == NULL) || - (code_arena->size_left <= code_arena->code_size)) { - if (new_code_arena() < 0) { - return NULL; - } - } - assert(code_arena->size_left <= code_arena->size); - return code_arena_new_code(code_arena); -} - -static PyObject * -py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame, - int throw) -{ - if (perf_status == PERF_STATUS_FAILED || - perf_status == PERF_STATUS_NO_INIT) { - goto default_eval; - } - PyCodeObject *co = frame->f_code; - py_trampoline f = NULL; - assert(extra_code_index != -1); - int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); - if (ret != 0 || f == NULL) { - // This is the first time we see this code object so we need - // to compile a trampoline for it. 
- py_trampoline new_trampoline = compile_trampoline(); - if (new_trampoline == NULL) { - goto default_eval; - } - trampoline_api.write_state(trampoline_api.state, new_trampoline, - code_arena->code_size, co); - _PyCode_SetExtra((PyObject *)co, extra_code_index, - (void *)new_trampoline); - f = new_trampoline; - } - assert(f != NULL); - return f(ts, frame, throw, _PyEval_EvalFrameDefault); -default_eval: - // Something failed, fall back to the default evaluator. - return _PyEval_EvalFrameDefault(ts, frame, throw); -} -#endif // PY_HAVE_PERF_TRAMPOLINE - -int -_PyIsPerfTrampolineActive(void) -{ -#ifdef PY_HAVE_PERF_TRAMPOLINE - PyThreadState *tstate = _PyThreadState_GET(); - return tstate->interp->eval_frame == py_trampoline_evaluator; -#endif - return 0; -} - -void -_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks) -{ - if (callbacks == NULL) { - return; - } -#ifdef PY_HAVE_PERF_TRAMPOLINE - callbacks->init_state = trampoline_api.init_state; - callbacks->write_state = trampoline_api.write_state; - callbacks->free_state = trampoline_api.free_state; -#endif - return; -} - -int -_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks) -{ - if (callbacks == NULL) { - return -1; - } -#ifdef PY_HAVE_PERF_TRAMPOLINE - if (trampoline_api.state) { - _PyPerfTrampoline_Fini(); - } - trampoline_api.init_state = callbacks->init_state; - trampoline_api.write_state = callbacks->write_state; - trampoline_api.free_state = callbacks->free_state; - trampoline_api.state = NULL; - perf_status = PERF_STATUS_OK; -#endif - return 0; -} - -int -_PyPerfTrampoline_Init(int activate) -{ -#ifdef PY_HAVE_PERF_TRAMPOLINE - PyThreadState *tstate = _PyThreadState_GET(); - if (tstate->interp->eval_frame && - tstate->interp->eval_frame != py_trampoline_evaluator) { - PyErr_SetString(PyExc_RuntimeError, - "Trampoline cannot be initialized as a custom eval " - "frame is already present"); - return -1; - } - if (!activate) { - tstate->interp->eval_frame = NULL; - } - else { - 
tstate->interp->eval_frame = py_trampoline_evaluator; - if (new_code_arena() < 0) { - return -1; - } - if (trampoline_api.state == NULL) { - void *state = trampoline_api.init_state(); - if (state == NULL) { - return -1; - } - trampoline_api.state = state; - } - extra_code_index = _PyEval_RequestCodeExtraIndex(NULL); - if (extra_code_index == -1) { - return -1; - } - perf_status = PERF_STATUS_OK; - } -#endif - return 0; -} - -int -_PyPerfTrampoline_Fini(void) -{ -#ifdef PY_HAVE_PERF_TRAMPOLINE - PyThreadState *tstate = _PyThreadState_GET(); - if (tstate->interp->eval_frame == py_trampoline_evaluator) { - tstate->interp->eval_frame = NULL; - } - free_code_arenas(); - if (trampoline_api.state != NULL) { - trampoline_api.free_state(trampoline_api.state); - trampoline_api.state = NULL; - } - extra_code_index = -1; -#endif - return 0; -} - -PyStatus -_PyPerfTrampoline_AfterFork_Child(void) -{ -#ifdef PY_HAVE_PERF_TRAMPOLINE - // Restart trampoline in file in child. - int was_active = _PyIsPerfTrampolineActive(); - _PyPerfTrampoline_Fini(); - if (was_active) { - _PyPerfTrampoline_Init(1); - } -#endif - return PyStatus_Ok(); -} diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index 49e5cc8..8454bd6 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -129,7 +129,6 @@ - @@ -211,6 +210,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 96ab2f2..6e8498d 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -85,7 +85,7 @@ Source Files - + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index ff17304..111ad67 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -429,7 +429,6 @@ - @@ -513,6 +512,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 7d7fe72..ab7d019 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ 
b/PCbuild/pythoncore.vcxproj.filters @@ -923,9 +923,6 @@ Objects - - Objects - Objects @@ -1127,6 +1124,9 @@ Python + + Python + Python diff --git a/Python/asm_trampoline.S b/Python/asm_trampoline.S new file mode 100644 index 0000000..4607077 --- /dev/null +++ b/Python/asm_trampoline.S @@ -0,0 +1,28 @@ + .text + .globl _Py_trampoline_func_start +# The following assembly is equivalent to: +# PyObject * +# trampoline(PyThreadState *ts, _PyInterpreterFrame *f, +# int throwflag, py_evaluator evaluator) +# { +# return evaluator(ts, f, throwflag); +# } +_Py_trampoline_func_start: +#ifdef __x86_64__ + sub $8, %rsp + call *%rcx + add $8, %rsp + ret +#endif // __x86_64__ +#if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + // ARM64 little endian, 64bit ABI + // generate with aarch64-linux-gnu-gcc 12.1 + stp x29, x30, [sp, -16]! + mov x29, sp + blr x3 + ldp x29, x30, [sp], 16 + ret +#endif + .globl _Py_trampoline_func_end +_Py_trampoline_func_end: + .section .note.GNU-stack,"",@progbits diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c new file mode 100644 index 0000000..161e0ef --- /dev/null +++ b/Python/perf_trampoline.c @@ -0,0 +1,531 @@ +/* + +Perf trampoline instrumentation +=============================== + +This file contains instrumentation to allow to associate +calls to the CPython eval loop back to the names of the Python +functions and filename being executed. + +Many native performance profilers like the Linux perf tools are +only available to 'see' the C stack when sampling from the profiled +process. 
This means that if we have the following python code: + + import time + def foo(n): + # Some CPU intensive code + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + +A performance profiler that is only able to see native frames will +produce the following backtrace when sampling from foo(): + + _PyEval_EvalFrameDefault -----> Evaluation frame of foo() + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault ------> Evaluation frame of bar() + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault -------> Evaluation frame of baz() + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + ... + + Py_RunMain + +Because the profiler is only able to see the native frames and the native +function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault) +then the profiler and any reporter generated by it will not be able to +associate the names of the Python functions and the filenames associated with +those calls, rendering the results useless in the Python world. + +To fix this problem, we introduce the concept of a trampoline frame. A +trampoline frame is a piece of code that is unique per Python code object that +is executed before entering the CPython eval loop. This piece of code just +calls the original Python evaluation function (_PyEval_EvalFrameDefault) and +forwards all the arguments received. 
In this way, when a profiler samples +frames from the previous example it will see: + + _PyEval_EvalFrameDefault -----> Evaluation frame of foo() + [Jit compiled code 3] + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault ------> Evaluation frame of bar() + [Jit compiled code 2] + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault -------> Evaluation frame of baz() + [Jit compiled code 1] + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + ... + + Py_RunMain + +When we generate every unique copy of the trampoline (what here we called "[Jit +compiled code N]") we write the relationship between the compiled code and the +Python function that is associated with it. Every profiler requires this +information in a different format. For example, the Linux "perf" profiler +requires a file in "/tmp/perf-PID.map" (name and location not configurable) +with the following format: + + + +If this file is available when "perf" generates reports, it will automatically +associate every trampoline with the Python function that it is associated with +allowing it to generate reports that include Python information. These reports +then can also be filtered in a way that *only* Python information appears. + +Notice that for this to work, there must be a unique copy of the trampoline +per Python code object even if the code in the trampoline is the same. To +achieve this we have an assembly template in Python/asm_trampoline.S that is +compiled into the Python executable/shared library. This template generates a +symbol that maps the start of the assembly code and another that marks the end +of the assembly code for the trampoline. 
Then, every time we need a unique +trampoline for a Python code object, we copy the assembly code into a mmaped +area that has executable permissions and we return the start of that area as +our trampoline function. + +Asking for a mmap-ed memory area for trampoline is very wasteful so we +allocate big arenas of memory in a single mmap call, we populate the entire +arena with copies of the trampoline (this allows us to not have to invalidate +the icache for the instructions in the page) and then we return the next +available chunk every time someone asks for a new trampoline. We keep a linked +list of arenas in case the current memory arena is exhausted and another one is +needed. + +For the best results, Python should be compiled with +CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows +profilers to unwind using only the frame pointer and not on DWARF debug +information (note that as trampolines are dynamically generated there won't be +any DWARF information available for them). +*/ + +#include "Python.h" +#include "pycore_ceval.h" +#include "pycore_frame.h" +#include "pycore_interp.h" + +typedef enum { + PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state + PERF_STATUS_NO_INIT = 0, // Perf trampoline is not initialized + PERF_STATUS_OK = 1, // Perf trampoline is ready to be executed +} perf_status_t; + +#ifdef PY_HAVE_PERF_TRAMPOLINE + +#include +#include +#include +#include +#include +#include + +#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__) +#define PY_HAVE_INVALIDATE_ICACHE + +#if defined(__clang__) || defined(__GNUC__) +extern void __clear_cache(void *, void*); +#endif + +static void invalidate_icache(char* begin, char*end) { +#if defined(__clang__) || defined(__GNUC__) + return __clear_cache(begin, end); +#else + return; +#endif +} +#endif + +/* The function pointer is passed as last argument. The other three arguments + * are passed in the same order as the function requires. 
This results in + * shorter, more efficient ASM code for trampoline. + */ +typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *, + int throwflag); +typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int, + py_evaluator); + +extern void *_Py_trampoline_func_start; // Start of the template of the + // assembly trampoline +extern void * + _Py_trampoline_func_end; // End of the template of the assembly trampoline + +struct code_arena_st { + char *start_addr; // Start of the memory arena + char *current_addr; // Address of the current trampoline within the arena + size_t size; // Size of the memory arena + size_t size_left; // Remaining size of the memory arena + size_t code_size; // Size of the code of every trampoline in the arena + struct code_arena_st + *prev; // Pointer to the arena or NULL if this is the first arena. +}; + +typedef struct code_arena_st code_arena_t; + +struct trampoline_api_st { + void* (*init_state)(void); + void (*write_state)(void* state, const void *code_addr, + unsigned int code_size, PyCodeObject* code); + int (*free_state)(void* state); + void *state; +}; + +typedef struct trampoline_api_st trampoline_api_t; + + +static perf_status_t perf_status = PERF_STATUS_NO_INIT; +static Py_ssize_t extra_code_index = -1; +static code_arena_t *code_arena; +static trampoline_api_t trampoline_api; + +static FILE *perf_map_file; + +static void * +perf_map_get_file(void) +{ + if (perf_map_file) { + return perf_map_file; + } + char filename[100]; + pid_t pid = getpid(); + // Location and file name of perf map is hard-coded in perf tool. + // Use exclusive create flag with nofollow to prevent symlink attacks. 
+ int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC; + snprintf(filename, sizeof(filename) - 1, "/tmp/perf-%jd.map", + (intmax_t)pid); + int fd = open(filename, flags, 0600); + if (fd == -1) { + perf_status = PERF_STATUS_FAILED; + PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename); + return NULL; + } + perf_map_file = fdopen(fd, "w"); + if (!perf_map_file) { + perf_status = PERF_STATUS_FAILED; + PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename); + close(fd); + return NULL; + } + return perf_map_file; +} + +static int +perf_map_close(void *state) +{ + FILE *fp = (FILE *)state; + int ret = 0; + if (fp) { + ret = fclose(fp); + } + perf_map_file = NULL; + perf_status = PERF_STATUS_NO_INIT; + return ret; +} + +static void +perf_map_write_entry(void *state, const void *code_addr, + unsigned int code_size, PyCodeObject *co) +{ + assert(state != NULL); + FILE *method_file = (FILE *)state; + const char *entry = PyUnicode_AsUTF8(co->co_qualname); + if (entry == NULL) { + _PyErr_WriteUnraisableMsg("Failed to get qualname from code object", + NULL); + return; + } + const char *filename = PyUnicode_AsUTF8(co->co_filename); + if (filename == NULL) { + _PyErr_WriteUnraisableMsg("Failed to get filename from code object", + NULL); + return; + } + fprintf(method_file, "%p %x py::%s:%s\n", code_addr, code_size, entry, + filename); + fflush(method_file); +} + +_PyPerf_Callbacks _Py_perfmap_callbacks = { + &perf_map_get_file, + &perf_map_write_entry, + &perf_map_close +}; + +static int +new_code_arena(void) +{ + // non-trivial programs typically need 64 to 256 kiB. 
+ size_t mem_size = 4096 * 16; + assert(mem_size % sysconf(_SC_PAGESIZE) == 0); + char *memory = + mmap(NULL, // address + mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, // fd (not used here) + 0); // offset (not used here) + if (!memory) { + PyErr_SetFromErrno(PyExc_OSError); + _PyErr_WriteUnraisableMsg( + "Failed to create new mmap for perf trampoline", NULL); + perf_status = PERF_STATUS_FAILED; + return -1; + } + void *start = &_Py_trampoline_func_start; + void *end = &_Py_trampoline_func_end; + size_t code_size = end - start; + // TODO: Check the effect of alignment of the code chunks. Initial investigation + // showed that this has no effect on performance in x86-64 or aarch64 and the current + // version has the advantage that the unwinder in GDB can unwind across JIT-ed code. + // + // We should check the values in the future and see if there is a + // measurable performance improvement by rounding trampolines up to 32-bit + // or 64-bit alignment. + + size_t n_copies = mem_size / code_size; + for (size_t i = 0; i < n_copies; i++) { + memcpy(memory + i * code_size, start, code_size * sizeof(char)); + } + // Some systems may prevent us from creating executable code on the fly. + int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC); + if (res == -1) { + PyErr_SetFromErrno(PyExc_OSError); + munmap(memory, mem_size); + _PyErr_WriteUnraisableMsg( + "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", + NULL); + return -1; + } + +#ifdef PY_HAVE_INVALIDATE_ICACHE + // Before the JIT can run a block of code that has been emitted it must invalidate + // the instruction cache on some platforms like arm and aarch64. 
+ invalidate_icache(memory, memory + mem_size); +#endif + + code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t)); + if (new_arena == NULL) { + PyErr_NoMemory(); + munmap(memory, mem_size); + _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct", + NULL); + return -1; + } + + new_arena->start_addr = memory; + new_arena->current_addr = memory; + new_arena->size = mem_size; + new_arena->size_left = mem_size; + new_arena->code_size = code_size; + new_arena->prev = code_arena; + code_arena = new_arena; + return 0; +} + /* free_code_arenas: detaches the global code_arena list (sets it to NULL first), then walks the prev links, munmap-ing each arena region and releasing its code_arena_t with PyMem_RawFree. */ +static void +free_code_arenas(void) +{ + code_arena_t *cur = code_arena; + code_arena_t *prev; + code_arena = NULL; // invalid static pointer + while (cur) { + munmap(cur->start_addr, cur->size); + prev = cur->prev; + PyMem_RawFree(cur); + cur = prev; + } +} + /* code_arena_new_code: returns the address at current_addr as the next trampoline, then advances current_addr and reduces size_left by code_size. It performs no capacity check itself; the visible caller compile_trampoline checks size_left beforehand. */ +static inline py_trampoline +code_arena_new_code(code_arena_t *code_arena) +{ + py_trampoline trampoline = (py_trampoline)code_arena->current_addr; + code_arena->size_left -= code_arena->code_size; + code_arena->current_addr += code_arena->code_size; + return trampoline; +} + /* compile_trampoline: returns the next trampoline slot. A new arena is requested via new_code_arena() when no arena exists or when size_left <= code_size; returns NULL if that allocation fails. */ +static inline py_trampoline +compile_trampoline(void) +{ + if ((code_arena == NULL) || + (code_arena->size_left <= code_arena->code_size)) { + if (new_code_arena() < 0) { + return NULL; + } + } + assert(code_arena->size_left <= code_arena->size); + return code_arena_new_code(code_arena); +} + /* py_trampoline_evaluator: frame evaluator that routes the frame through a per-code-object trampoline. The trampoline is looked up with _PyCode_GetExtra at extra_code_index; on a miss one is compiled, reported through trampoline_api.write_state and cached with _PyCode_SetExtra (whose return value is not checked). The trampoline is then called with _PyEval_EvalFrameDefault as its last argument. When perf_status is PERF_STATUS_FAILED or PERF_STATUS_NO_INIT, or when no trampoline can be compiled, the frame is evaluated by _PyEval_EvalFrameDefault directly. */ +static PyObject * +py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame, + int throw) +{ + if (perf_status == PERF_STATUS_FAILED || + perf_status == PERF_STATUS_NO_INIT) { + goto default_eval; + } + PyCodeObject *co = frame->f_code; + py_trampoline f = NULL; + assert(extra_code_index != -1); + int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); + if (ret != 0 || f == NULL) { + // This is the first time we see this code object so we need + // to compile a trampoline for it. 
+ py_trampoline new_trampoline = compile_trampoline(); + if (new_trampoline == NULL) { + goto default_eval; + } + trampoline_api.write_state(trampoline_api.state, new_trampoline, + code_arena->code_size, co); + _PyCode_SetExtra((PyObject *)co, extra_code_index, + (void *)new_trampoline); + f = new_trampoline; + } + assert(f != NULL); + return f(ts, frame, throw, _PyEval_EvalFrameDefault); +default_eval: + // Something failed, fall back to the default evaluator. + return _PyEval_EvalFrameDefault(ts, frame, throw); +} +#endif // PY_HAVE_PERF_TRAMPOLINE + /* _PyIsPerfTrampolineActive: returns nonzero when eval_frame of the interpreter of the current thread state is py_trampoline_evaluator. Builds without PY_HAVE_PERF_TRAMPOLINE always return 0. */ +int +_PyIsPerfTrampolineActive(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + PyThreadState *tstate = _PyThreadState_GET(); + return tstate->interp->eval_frame == py_trampoline_evaluator; +#endif + return 0; +} + /* _PyPerfTrampoline_GetCallbacks: copies the init_state, write_state and free_state pointers from trampoline_api into the struct pointed to by callbacks. Does nothing when callbacks is NULL; without PY_HAVE_PERF_TRAMPOLINE the struct is left unmodified. */ +void +_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks) +{ + if (callbacks == NULL) { + return; + } +#ifdef PY_HAVE_PERF_TRAMPOLINE + callbacks->init_state = trampoline_api.init_state; + callbacks->write_state = trampoline_api.write_state; + callbacks->free_state = trampoline_api.free_state; +#endif + return; +} + /* _PyPerfTrampoline_SetCallbacks: installs the three callbacks from the given struct into trampoline_api. If trampoline_api.state is already set, _PyPerfTrampoline_Fini() runs first, so the existing state is released through the previously installed free_state. Afterwards state is NULL and perf_status is PERF_STATUS_OK. Returns -1 when callbacks is NULL, 0 otherwise. */ +int +_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks) +{ + if (callbacks == NULL) { + return -1; + } +#ifdef PY_HAVE_PERF_TRAMPOLINE + if (trampoline_api.state) { + _PyPerfTrampoline_Fini(); + } + trampoline_api.init_state = callbacks->init_state; + trampoline_api.write_state = callbacks->write_state; + trampoline_api.free_state = callbacks->free_state; + trampoline_api.state = NULL; + perf_status = PERF_STATUS_OK; +#endif + return 0; +} + /* _PyPerfTrampoline_Init: with activate == 0 it only resets eval_frame to NULL. Otherwise it installs py_trampoline_evaluator, allocates a code arena, creates trampoline_api.state through init_state when it is still NULL, requests a code extra index and sets perf_status to PERF_STATUS_OK. Returns -1 with RuntimeError set when a different custom eval_frame is already installed, and -1 when any later step fails; returns 0 otherwise. NOTE(review): eval_frame is assigned before the steps that can fail and is not reset on those error paths - confirm this is intended. */ +int +_PyPerfTrampoline_Init(int activate) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate->interp->eval_frame && + tstate->interp->eval_frame != py_trampoline_evaluator) { + PyErr_SetString(PyExc_RuntimeError, + "Trampoline cannot be initialized as a custom eval " + "frame is already present"); + return -1; + } + if (!activate) { + tstate->interp->eval_frame = NULL; + } + else { + 
tstate->interp->eval_frame = py_trampoline_evaluator; + if (new_code_arena() < 0) { + return -1; + } + if (trampoline_api.state == NULL) { + void *state = trampoline_api.init_state(); + if (state == NULL) { + return -1; + } + trampoline_api.state = state; + } + extra_code_index = _PyEval_RequestCodeExtraIndex(NULL); + if (extra_code_index == -1) { + return -1; + } + perf_status = PERF_STATUS_OK; + } +#endif + return 0; +} + /* _PyPerfTrampoline_Fini: when PY_HAVE_PERF_TRAMPOLINE is defined, resets eval_frame to NULL if it is py_trampoline_evaluator, frees all code arenas, releases trampoline_api.state through free_state when it is non-NULL, and resets extra_code_index to -1. Always returns 0. */ +int +_PyPerfTrampoline_Fini(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate->interp->eval_frame == py_trampoline_evaluator) { + tstate->interp->eval_frame = NULL; + } + free_code_arenas(); + if (trampoline_api.state != NULL) { + trampoline_api.free_state(trampoline_api.state); + trampoline_api.state = NULL; + } + extra_code_index = -1; +#endif + return 0; +} + /* _PyPerfTrampoline_AfterFork_Child: records whether the trampoline was active, tears it down with _PyPerfTrampoline_Fini() and, if it was active, calls _PyPerfTrampoline_Init(1) again. The result of that call is ignored and PyStatus_Ok() is always returned. */ +PyStatus +_PyPerfTrampoline_AfterFork_Child(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + // Restart trampoline in file in child. + int was_active = _PyIsPerfTrampolineActive(); + _PyPerfTrampoline_Fini(); + if (was_active) { + _PyPerfTrampoline_Init(1); + } +#endif + return PyStatus_Ok(); +} diff --git a/Tools/c-analyzer/cpython/globals-to-fix.tsv b/Tools/c-analyzer/cpython/globals-to-fix.tsv index 196d62d..e327f0a 100644 --- a/Tools/c-analyzer/cpython/globals-to-fix.tsv +++ b/Tools/c-analyzer/cpython/globals-to-fix.tsv @@ -380,7 +380,7 @@ Objects/floatobject.c - float_format - Objects/longobject.c long_from_non_binary_base log_base_BASE - Objects/longobject.c long_from_non_binary_base convwidth_base - Objects/longobject.c long_from_non_binary_base convmultmax_base - -Objects/perf_trampoline.c - perf_map_file - +Python/perf_trampoline.c - perf_map_file - Objects/unicodeobject.c - ucnhash_capi - Parser/action_helpers.c _PyPegen_dummy_name cache - Python/dtoa.c - p5s - @@ -456,10 +456,10 @@ Objects/dictobject.c - next_dict_keys_version - Objects/funcobject.c - next_func_version - Objects/moduleobject.c - max_module_number - 
Objects/object.c - _Py_RefTotal - -Objects/perf_trampoline.c - perf_status - -Objects/perf_trampoline.c - extra_code_index - -Objects/perf_trampoline.c - code_arena - -Objects/perf_trampoline.c - trampoline_api - +Python/perf_trampoline.c - perf_status - +Python/perf_trampoline.c - extra_code_index - +Python/perf_trampoline.c - code_arena - +Python/perf_trampoline.c - trampoline_api - Objects/typeobject.c - next_version_tag - Objects/typeobject.c resolve_slotdups ptrs - Parser/pegen.c - memo_statistics - diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index 28c2325..dbfb0e0 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -77,8 +77,8 @@ Objects/object.c - _Py_GenericAliasIterType - Objects/object.c - _PyMemoryIter_Type - Objects/object.c - _PyLineIterator - Objects/object.c - _PyPositionsIterator - -Objects/perf_trampoline.c - _Py_trampoline_func_start - -Objects/perf_trampoline.c - _Py_trampoline_func_end - +Python/perf_trampoline.c - _Py_trampoline_func_start - +Python/perf_trampoline.c - _Py_trampoline_func_end - Python/importdl.h - _PyImport_DynLoadFiletab - Modules/expat/xmlrole.c - prolog0 - @@ -465,7 +465,7 @@ Objects/obmalloc.c - _PyMem_Debug - Objects/obmalloc.c - _PyMem_Raw - Objects/obmalloc.c - _PyObject - Objects/obmalloc.c - usedpools - -Objects/perf_trampoline.c - _Py_perfmap_callbacks - +Python/perf_trampoline.c - _Py_perfmap_callbacks - Objects/typeobject.c - name_op - Objects/unicodeobject.c - stripfuncnames - Objects/unicodeobject.c - utf7_category - diff --git a/configure b/configure index 15d9796..953c558 100755 --- a/configure +++ b/configure @@ -11629,7 +11629,7 @@ if test "x$perf_trampoline" = xyes; then : $as_echo "#define PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h - PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o + PERF_TRAMPOLINE_OBJ=Python/asm_trampoline.o if test "x$Py_DEBUG" = xtrue; then : diff --git a/configure.ac b/configure.ac index c7945aa..210ce32 
100644 --- a/configure.ac +++ b/configure.ac @@ -3474,7 +3474,7 @@ AC_MSG_RESULT([$perf_trampoline]) AS_VAR_IF([perf_trampoline], [yes], [ AC_DEFINE([PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.]) - PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o + PERF_TRAMPOLINE_OBJ=Python/asm_trampoline.o dnl perf needs frame pointers for unwinding, include compiler option in debug builds AS_VAR_IF([Py_DEBUG], [true], [ -- cgit v0.12