summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorPablo Galindo Salgado <Pablogsal@gmail.com>2022-08-30 17:11:18 (GMT)
committerGitHub <noreply@github.com>2022-08-30 17:11:18 (GMT)
commit6d791a97364b68d5f9c3514a0470aac487fc538d (patch)
tree745205d7e8698ea7398eb353311f55dc973507bf /Objects
parent0f733fffe8f4caaac3ce1b5306af86b42fb0c7fa (diff)
downloadcpython-6d791a97364b68d5f9c3514a0470aac487fc538d.zip
cpython-6d791a97364b68d5f9c3514a0470aac487fc538d.tar.gz
cpython-6d791a97364b68d5f9c3514a0470aac487fc538d.tar.bz2
gh-96143: Allow Linux perf profiler to see Python calls (GH-96123)
:warning: :warning: Note for reviewers, hackers and fellow systems/low-level/compiler engineers :warning: :warning: If you have a lot of experience with this kind of shenanigans and want to improve the **first** version, **please make a PR against my branch** or **reach out by email** or **suggest code changes directly on GitHub**. If you have any **refinements or optimizations** please, wait until the first version is merged before starting hacking or proposing those so we can keep this PR productive.
Diffstat (limited to 'Objects')
-rw-r--r--Objects/asm_trampoline.S28
-rw-r--r--Objects/perf_trampoline.c501
2 files changed, 529 insertions, 0 deletions
diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S
new file mode 100644
index 0000000..4607077
--- /dev/null
+++ b/Objects/asm_trampoline.S
@@ -0,0 +1,28 @@
+ .text
+ .globl _Py_trampoline_func_start
+# Template for the perf trampoline. The bytes between the labels
+# _Py_trampoline_func_start and _Py_trampoline_func_end are copied at runtime
+# by Objects/perf_trampoline.c, once per trampoline slot, so the code must not
+# depend on the address it is placed at.
+#
+# The following assembly is equivalent to:
+# PyObject *
+# trampoline(PyThreadState *ts, _PyInterpreterFrame *f,
+# int throwflag, py_evaluator evaluator)
+# {
+# return evaluator(ts, f, throwflag);
+# }
+_Py_trampoline_func_start:
+#ifdef __x86_64__
+ // Move the stack pointer down by 8 bytes around the call (presumably to
+ // keep the 16-byte stack alignment the ABI expects at a call; TODO confirm).
+ sub $8, %rsp
+ // Indirect call through rcx; going by the C equivalent above this is the
+ // evaluator, i.e. the fourth argument. The first three arguments are left
+ // untouched in their registers.
+ call *%rcx
+ add $8, %rsp
+ ret
+#endif // __x86_64__
+#if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+ // ARM64 little endian, 64bit ABI
+ // generate with aarch64-linux-gnu-gcc 12.1
+ // Push the frame pointer (x29) and link register (x30), then make x29
+ // point at the new frame record.
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ // Indirect call through x3; going by the C equivalent above this is the
+ // evaluator, i.e. the fourth argument.
+ blr x3
+ // Pop the saved frame pointer and link register and return to the caller.
+ ldp x29, x30, [sp], 16
+ ret
+#endif
+ .globl _Py_trampoline_func_end
+// End marker: the C side computes the template size as the distance between
+// the start and end labels.
+_Py_trampoline_func_end:
+ // Empty .note.GNU-stack section (conventionally marks the object as not
+ // needing an executable stack).
+ .section .note.GNU-stack,"",@progbits
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
new file mode 100644
index 0000000..02206b2
--- /dev/null
+++ b/Objects/perf_trampoline.c
@@ -0,0 +1,501 @@
+/*
+
+Perf trampoline instrumentation
+===============================
+
+This file contains instrumentation to allow to associate
+calls to the CPython eval loop back to the names of the Python
+functions and filename being executed.
+
+Many native performance profilers like the Linux perf tools are
+only able to 'see' the C stack when sampling from the profiled
+process. This means that if we have the following python code:
+
+ import time
+ def foo(n):
+ # Some CPU intensive code
+
+ def bar(n):
+ foo(n)
+
+ def baz(n):
+ bar(n)
+
+ baz(10000000)
+
+A performance profiler that is only able to see native frames will
+produce the following backtrace when sampling from foo():
+
+ _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
+ _PyEval_EvalFrame
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
+ _PyEval_EvalFrame
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ ...
+
+ Py_RunMain
+
+Because the profiler is only able to see the native frames and the native
+function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
+then the profiler and any reporter generated by it will not be able to
+associate the names of the Python functions and the filenames associated with
+those calls, rendering the results useless in the Python world.
+
+To fix this problem, we introduce the concept of a trampoline frame. A
+trampoline frame is a piece of code that is unique per Python code object that
+is executed before entering the CPython eval loop. This piece of code just
+calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
+forwards all the arguments received. In this way, when a profiler samples
+frames from the previous example it will see:
+
+ _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
+ [Jit compiled code 3]
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
+ [Jit compiled code 2]
+ _PyEval_EvalFrame
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
+ [Jit compiled code 1]
+ _PyEval_EvalFrame
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ ...
+
+ Py_RunMain
+
+When we generate every unique copy of the trampoline (what here we called "[Jit
+compiled code N]") we write the relationship between the compiled code and the
+Python function that is associated with it. Every profiler requires this
+information in a different format. For example, the Linux "perf" profiler
+requires a file in "/tmp/perf-PID.map" (name and location not configurable)
+with the following format:
+
+ <compiled code address> <compiled code size> <name of the compiled code>
+
+If this file is available when "perf" generates reports, it will automatically
+associate every trampoline with the Python function that it is associated with
+allowing it to generate reports that include Python information. These reports
+then can also be filtered in a way that *only* Python information appears.
+
+Notice that for this to work, there must be a unique copy of the trampoline
+per Python code object even if the code in the trampoline is the same. To
+achieve this we have an assembly template in Objects/asm_trampoline.S that is
+compiled into the Python executable/shared library. This template generates a
+symbol that maps the start of the assembly code and another that marks the end
+of the assembly code for the trampoline. Then, every time we need a unique
+trampoline for a Python code object, we copy the assembly code into a mmaped
+area that has executable permissions and we return the start of that area as
+our trampoline function.
+
+Asking for a mmap-ed memory area for each trampoline is very wasteful so we
+allocate big arenas of memory in a single mmap call, we populate the entire
+arena with copies of the trampoline (this allows us to not have to invalidate
+the icache for the instructions in the page) and then we return the next
+available chunk every time someone asks for a new trampoline. We keep a linked
+list of arenas in case the current memory arena is exhausted and another one is
+needed.
+
+For the best results, Python should be compiled with
+CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
+profilers to unwind using only the frame pointer, without relying on DWARF
+debug information (note that as trampolines are dynamically generated there
+won't be any DWARF information available for them).
+*/
+
+#include "Python.h"
+#include "pycore_ceval.h"
+#include "pycore_frame.h"
+#include "pycore_interp.h"
+
+/* State of the perf trampoline machinery in this file.
+ * py_trampoline_evaluator only dispatches through a trampoline when the
+ * state is PERF_STATUS_OK; in the other two states it falls back to
+ * _PyEval_EvalFrameDefault.
+ */
+typedef enum {
+ PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state
+ PERF_STATUS_NO_INIT = 0, // Perf trampoline is not initialized
+ PERF_STATUS_OK = 1, // Perf trampoline is ready to be executed
+} perf_status_t;
+
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/* The function pointer is passed as last argument. The other three arguments
+ * are passed in the same order as the function requires. This results in
+ * shorter, more efficient ASM code for trampoline.
+ */
+// Signature of a frame evaluator such as _PyEval_EvalFrameDefault.
+typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
+ int throwflag);
+// Signature of a trampoline copy: the evaluator arguments followed by the
+// evaluator the trampoline should call.
+typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
+ py_evaluator);
+
+// These two symbols are labels defined in Objects/asm_trampoline.S. Only
+// their addresses are used (see new_code_arena); the void* type is just a
+// placeholder and the objects themselves are never read.
+extern void *_Py_trampoline_func_start; // Start of the template of the
+ // assembly trampoline
+extern void *
+ _Py_trampoline_func_end; // End of the template of the assembly trampoline
+
+/* One mmap-ed arena pre-filled with copies of the trampoline template.
+ * Trampolines are handed out sequentially from start_addr; arenas form a
+ * singly linked list through prev, with the newest arena at the head.
+ */
+struct code_arena_st {
+ char *start_addr; // Start of the memory arena
+ char *current_addr; // Address of the current trampoline within the arena
+ size_t size; // Size of the memory arena
+ size_t size_left; // Remaining size of the memory arena
+ size_t code_size; // Size of the code of every trampoline in the arena
+ struct code_arena_st
+ *prev; // Previously allocated arena, or NULL if this is the first arena.
+};
+
+typedef struct code_arena_st code_arena_t;
+
+/* Backend hooks used to record trampolines, plus the backend's opaque state.
+ *
+ * init_state:  called by _PyPerfTrampoline_Init when no state exists yet;
+ *              a NULL return is treated as failure.
+ * write_state: called by py_trampoline_evaluator each time a new trampoline
+ *              is assigned to a code object.
+ * free_state:  called by _PyPerfTrampoline_Fini to release the state.
+ * state:       value returned by init_state, passed back to the other hooks.
+ */
+struct trampoline_api_st {
+ void* (*init_state)(void);
+ void (*write_state)(void* state, const void *code_addr,
+ unsigned int code_size, PyCodeObject* code);
+ int (*free_state)(void* state);
+ void *state;
+};
+
+typedef struct trampoline_api_st trampoline_api_t;
+
+// Current state of the trampoline machinery (see perf_status_t).
+static perf_status_t perf_status = PERF_STATUS_NO_INIT;
+// Code-object "extra" slot in which each code object's trampoline is cached;
+// -1 while no slot has been requested.
+static Py_ssize_t extra_code_index = -1;
+// Head of the linked list of arenas (most recently allocated first).
+static code_arena_t *code_arena;
+// Currently installed backend callbacks and their state.
+static trampoline_api_t trampoline_api;
+
+// Stream for the perf map file, opened lazily by perf_map_get_file.
+static FILE *perf_map_file;
+
+/*
+ * Return the stream of the perf map file of this process, creating the file
+ * on the first call and caching the stream in perf_map_file.
+ *
+ * On failure, perf_status is set to PERF_STATUS_FAILED, an OSError is set
+ * and NULL is returned.
+ */
+static void *
+perf_map_get_file(void)
+{
+    if (perf_map_file != NULL) {
+        return perf_map_file;
+    }
+
+    // Location and file name of perf map is hard-coded in perf tool.
+    char map_path[100];
+    snprintf(map_path, sizeof(map_path) - 1, "/tmp/perf-%jd.map",
+             (intmax_t)getpid());
+
+    // Use exclusive create flag with nofollow to prevent symlink attacks.
+    const int open_flags =
+        O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
+    int map_fd = open(map_path, open_flags, 0600);
+    if (map_fd == -1) {
+        perf_status = PERF_STATUS_FAILED;
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, map_path);
+        return NULL;
+    }
+
+    perf_map_file = fdopen(map_fd, "w");
+    if (perf_map_file == NULL) {
+        perf_status = PERF_STATUS_FAILED;
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, map_path);
+        // fdopen() did not take ownership of the descriptor.
+        close(map_fd);
+        return NULL;
+    }
+    return perf_map_file;
+}
+
+/*
+ * Close the perf map stream passed as `state` (may be NULL) and reset the
+ * cached stream and perf_status. Returns the result of fclose(), or 0 when
+ * there was nothing to close.
+ */
+static int
+perf_map_close(void *state)
+{
+    FILE *stream = (FILE *)state;
+    int rc = (stream != NULL) ? fclose(stream) : 0;
+
+    perf_map_file = NULL;
+    perf_status = PERF_STATUS_NO_INIT;
+    return rc;
+}
+
+/*
+ * Append one "<address> <size> py::<qualname>:<filename>" line to the perf
+ * map stream given as `state` and flush it.
+ *
+ * If the qualified name or the filename of `co` cannot be converted to
+ * UTF-8, the problem is reported through _PyErr_WriteUnraisableMsg and
+ * nothing is written.
+ */
+static void
+perf_map_write_entry(void *state, const void *code_addr,
+                     unsigned int code_size, PyCodeObject *co)
+{
+    assert(state != NULL);
+    FILE *map_stream = (FILE *)state;
+
+    const char *qualname = PyUnicode_AsUTF8(co->co_qualname);
+    if (!qualname) {
+        _PyErr_WriteUnraisableMsg("Failed to get qualname from code object",
+                                  NULL);
+        return;
+    }
+
+    const char *source_path = PyUnicode_AsUTF8(co->co_filename);
+    if (!source_path) {
+        _PyErr_WriteUnraisableMsg("Failed to get filename from code object",
+                                  NULL);
+        return;
+    }
+
+    fprintf(map_stream, "%p %x py::%s:%s\n", code_addr, code_size, qualname,
+            source_path);
+    fflush(map_stream);
+}
+
+// Callback set that records trampolines in the /tmp/perf-PID.map file.
+_PyPerf_Callbacks _Py_perfmap_callbacks = {
+    .init_state = perf_map_get_file,
+    .write_state = perf_map_write_entry,
+    .free_state = perf_map_close,
+};
+
+/*
+ * Allocate a new arena, fill it with copies of the trampoline template,
+ * make it executable and push it at the head of the arena list.
+ *
+ * Returns 0 on success. On failure returns -1, sets perf_status to
+ * PERF_STATUS_FAILED, releases anything that was mapped and hands the error
+ * to _PyErr_WriteUnraisableMsg.
+ */
+static int
+new_code_arena(void)
+{
+    // non-trivial programs typically need 64 to 256 kiB.
+    size_t mem_size = 4096 * 16;
+    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
+    char *memory =
+        mmap(NULL,  // address
+             mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
+             -1,    // fd (not used here)
+             0);    // offset (not used here)
+    // mmap() reports failure with MAP_FAILED, never with a NULL pointer.
+    if (memory == MAP_FAILED) {
+        PyErr_SetFromErrno(PyExc_OSError);
+        _PyErr_WriteUnraisableMsg(
+            "Failed to create new mmap for perf trampoline", NULL);
+        perf_status = PERF_STATUS_FAILED;
+        return -1;
+    }
+
+    // The template is delimited by two labels from asm_trampoline.S; only
+    // their addresses matter. Use char pointers so that the subtraction is
+    // standard C rather than GNU void-pointer arithmetic.
+    const char *start = (const char *)&_Py_trampoline_func_start;
+    const char *end = (const char *)&_Py_trampoline_func_end;
+    size_t code_size = (size_t)(end - start);
+
+    // Fill the whole arena up front while it is still writable.
+    size_t n_copies = mem_size / code_size;
+    for (size_t i = 0; i < n_copies; i++) {
+        memcpy(memory + i * code_size, start, code_size);
+    }
+    // Some systems may prevent us from creating executable code on the fly.
+    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
+    if (res == -1) {
+        PyErr_SetFromErrno(PyExc_OSError);
+        munmap(memory, mem_size);
+        _PyErr_WriteUnraisableMsg(
+            "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
+            NULL);
+        perf_status = PERF_STATUS_FAILED;
+        return -1;
+    }
+
+    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
+    if (new_arena == NULL) {
+        PyErr_NoMemory();
+        munmap(memory, mem_size);
+        _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
+                                  NULL);
+        perf_status = PERF_STATUS_FAILED;
+        return -1;
+    }
+
+    new_arena->start_addr = memory;
+    new_arena->current_addr = memory;
+    new_arena->size = mem_size;
+    new_arena->size_left = mem_size;
+    new_arena->code_size = code_size;
+    new_arena->prev = code_arena;
+    code_arena = new_arena;
+    return 0;
+}
+
+/*
+ * Unmap every arena and free its bookkeeping struct. The global list head
+ * is cleared before anything is released.
+ */
+static void
+free_code_arenas(void)
+{
+    code_arena_t *arena = code_arena;
+    code_arena = NULL;  // invalid static pointer
+
+    while (arena != NULL) {
+        code_arena_t *older = arena->prev;
+        munmap(arena->start_addr, arena->size);
+        PyMem_RawFree(arena);
+        arena = older;
+    }
+}
+
+/*
+ * Hand out the next trampoline slot of `code_arena` and advance the arena
+ * bookkeeping by one slot. The caller must make sure a slot is available.
+ */
+static inline py_trampoline
+code_arena_new_code(code_arena_t *code_arena)
+{
+    char *slot = code_arena->current_addr;
+
+    code_arena->current_addr = slot + code_arena->code_size;
+    code_arena->size_left -= code_arena->code_size;
+    return (py_trampoline)slot;
+}
+
+/*
+ * Return a fresh trampoline, allocating a new arena first when there is no
+ * arena yet or the current one is (nearly) used up. Returns NULL if a new
+ * arena was needed and could not be created.
+ */
+static inline py_trampoline
+compile_trampoline(void)
+{
+    int need_arena = (code_arena == NULL) ||
+                     (code_arena->size_left <= code_arena->code_size);
+    if (need_arena && new_code_arena() < 0) {
+        return NULL;
+    }
+    assert(code_arena->size_left <= code_arena->size);
+    return code_arena_new_code(code_arena);
+}
+
+static PyObject *
+py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
+ int throw)
+{
+ if (perf_status == PERF_STATUS_FAILED ||
+ perf_status == PERF_STATUS_NO_INIT) {
+ goto default_eval;
+ }
+ PyCodeObject *co = frame->f_code;
+ py_trampoline f = NULL;
+ assert(extra_code_index != -1);
+ int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
+ if (ret != 0 || f == NULL) {
+ // This is the first time we see this code object so we need
+ // to compile a trampoline for it.
+ py_trampoline new_trampoline = compile_trampoline();
+ if (new_trampoline == NULL) {
+ goto default_eval;
+ }
+ trampoline_api.write_state(trampoline_api.state, new_trampoline,
+ code_arena->code_size, co);
+ _PyCode_SetExtra((PyObject *)co, extra_code_index,
+ (void *)new_trampoline);
+ f = new_trampoline;
+ }
+ assert(f != NULL);
+ return f(ts, frame, throw, _PyEval_EvalFrameDefault);
+default_eval:
+ // Something failed, fall back to the default evaluator.
+ return _PyEval_EvalFrameDefault(ts, frame, throw);
+}
+#endif // PY_HAVE_PERF_TRAMPOLINE
+
+/*
+ * Return 1 if the current interpreter's frame evaluator is the perf
+ * trampoline evaluator, 0 otherwise (always 0 when trampolines are not
+ * compiled in).
+ */
+int
+_PyIsPerfTrampolineActive(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    PyInterpreterState *interp = _PyThreadState_GET()->interp;
+    return interp->eval_frame == py_trampoline_evaluator;
+#else
+    return 0;
+#endif
+}
+
+/*
+ * Copy the currently installed callbacks into `callbacks`. Does nothing
+ * when `callbacks` is NULL or trampolines are not compiled in.
+ */
+void
+_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    if (callbacks != NULL) {
+        callbacks->init_state = trampoline_api.init_state;
+        callbacks->write_state = trampoline_api.write_state;
+        callbacks->free_state = trampoline_api.free_state;
+    }
+#else
+    (void)callbacks;
+#endif
+}
+
+/*
+ * Install a new set of callbacks. If a backend state currently exists, the
+ * trampoline is finalized first. Returns -1 when `callbacks` is NULL and 0
+ * otherwise.
+ */
+int
+_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
+{
+    if (!callbacks) {
+        return -1;
+    }
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    // Tear down whatever the previous callbacks created.
+    if (trampoline_api.state != NULL) {
+        _PyPerfTrampoline_Fini();
+    }
+    trampoline_api.state = NULL;
+    trampoline_api.init_state = callbacks->init_state;
+    trampoline_api.write_state = callbacks->write_state;
+    trampoline_api.free_state = callbacks->free_state;
+    perf_status = PERF_STATUS_OK;
+#endif
+    return 0;
+}
+
+/*
+ * Activate (activate != 0) or deactivate (activate == 0) the perf
+ * trampoline for the current interpreter.
+ *
+ * Activation allocates a code arena, creates the backend state if there is
+ * none yet, requests a code-object extra slot and only then installs
+ * py_trampoline_evaluator as the frame evaluator, so a failed activation
+ * never leaves a half-initialized evaluator installed.
+ *
+ * Returns 0 on success. Returns -1 with an exception set when a foreign
+ * frame evaluator is already installed or when activation fails; in the
+ * latter case perf_status is set to PERF_STATUS_FAILED.
+ */
+int
+_PyPerfTrampoline_Init(int activate)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    PyThreadState *tstate = _PyThreadState_GET();
+    if (tstate->interp->eval_frame &&
+        tstate->interp->eval_frame != py_trampoline_evaluator) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Trampoline cannot be initialized as a custom eval "
+                        "frame is already present");
+        return -1;
+    }
+    if (!activate) {
+        tstate->interp->eval_frame = NULL;
+        return 0;
+    }
+
+    if (new_code_arena() < 0) {
+        goto error;
+    }
+    if (trampoline_api.state == NULL) {
+        // Calling through a NULL hook would crash; report it instead.
+        if (trampoline_api.init_state == NULL) {
+            PyErr_SetString(PyExc_RuntimeError,
+                            "Trampoline cannot be initialized as no "
+                            "callbacks have been set");
+            goto error;
+        }
+        void *state = trampoline_api.init_state();
+        if (state == NULL) {
+            goto error;
+        }
+        trampoline_api.state = state;
+    }
+    extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
+    if (extra_code_index == -1) {
+        goto error;
+    }
+    perf_status = PERF_STATUS_OK;
+    // Install the evaluator last, once everything it relies on is in place.
+    tstate->interp->eval_frame = py_trampoline_evaluator;
+    return 0;
+
+error:
+    // An evaluator installed by an earlier successful call keeps working:
+    // with this status it falls back to the default evaluator.
+    perf_status = PERF_STATUS_FAILED;
+    // Some failure paths above may leave no exception pending; make sure
+    // the -1 return always comes with one.
+    if (!PyErr_Occurred()) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Failed to initialize the perf trampoline");
+    }
+    return -1;
+#else
+    return 0;
+#endif
+}
+
+/*
+ * Shut the perf trampoline down: uninstall the trampoline evaluator from
+ * the current interpreter if it is installed, release all arenas, free the
+ * backend state and forget the code-object extra slot. Always returns 0.
+ */
+int
+_PyPerfTrampoline_Fini(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    PyInterpreterState *interp = _PyThreadState_GET()->interp;
+    if (interp->eval_frame == py_trampoline_evaluator) {
+        interp->eval_frame = NULL;
+    }
+
+    free_code_arenas();
+
+    if (trampoline_api.state != NULL) {
+        trampoline_api.free_state(trampoline_api.state);
+        trampoline_api.state = NULL;
+    }
+    extra_code_index = -1;
+#endif
+    return 0;
+}
+
+/*
+ * Fork hook for the child process: finalize the trampoline state inherited
+ * from the parent and, if the trampoline was active, initialize it again.
+ * The result of the re-initialization is not checked; PyStatus_Ok() is
+ * always returned.
+ */
+PyStatus
+_PyPerfTrampoline_AfterFork_Child(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    // Must be sampled before Fini, which uninstalls the evaluator.
+    const int reactivate = _PyIsPerfTrampolineActive();
+
+    // Restart trampoline in file in child.
+    _PyPerfTrampoline_Fini();
+    if (reactivate) {
+        _PyPerfTrampoline_Init(1);
+    }
+#endif
+    return PyStatus_Ok();
+}