From 1f737edb67e702095feb97118a911afb569f5705 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <Pablogsal@gmail.com>
Date: Tue, 25 Oct 2022 23:34:22 +0100
Subject: gh-96143: Move the perf trampoline files to the Python directory
 (#98675)

---
 Makefile.pre.in                             |   4 +-
 Objects/asm_trampoline.S                    |  28 --
 Objects/perf_trampoline.c                   | 531 ----------------------------
 PCbuild/_freeze_module.vcxproj              |   2 +-
 PCbuild/_freeze_module.vcxproj.filters      |   2 +-
 PCbuild/pythoncore.vcxproj                  |   2 +-
 PCbuild/pythoncore.vcxproj.filters          |   6 +-
 Python/asm_trampoline.S                     |  28 ++
 Python/perf_trampoline.c                    | 531 ++++++++++++++++++++++++++++
 Tools/c-analyzer/cpython/globals-to-fix.tsv |  10 +-
 Tools/c-analyzer/cpython/ignored.tsv        |   6 +-
 configure                                   |   2 +-
 configure.ac                                |   2 +-
 13 files changed, 577 insertions(+), 577 deletions(-)
 delete mode 100644 Objects/asm_trampoline.S
 delete mode 100644 Objects/perf_trampoline.c
 create mode 100644 Python/asm_trampoline.S
 create mode 100644 Python/perf_trampoline.c

diff --git a/Makefile.pre.in b/Makefile.pre.in
index 5b4bf15..6ab1422 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -426,6 +426,7 @@ PYTHON_OBJS=	\
 		Python/formatter_unicode.o \
 		Python/fileutils.o \
 		Python/suggestions.o \
+		Python/perf_trampoline.o \
 		Python/$(DYNLOADFILE) \
 		$(LIBOBJS) \
 		$(MACHDEP_OBJS) \
@@ -479,7 +480,6 @@ OBJECT_OBJS=	\
 		Objects/unicodectype.o \
 		Objects/unionobject.o \
 		Objects/weakrefobject.o \
-		Objects/perf_trampoline.o \
 		@PERF_TRAMPOLINE_OBJ@
 
 DEEPFREEZE_OBJS = Python/deepfreeze/deepfreeze.o
@@ -2370,7 +2370,7 @@ config.status:	$(srcdir)/configure
 
 .PRECIOUS: config.status $(BUILDPYTHON) Makefile Makefile.pre
 
-Objects/asm_trampoline.o: $(srcdir)/Objects/asm_trampoline.S
+Python/asm_trampoline.o: $(srcdir)/Python/asm_trampoline.S
 	$(CC) -c $(PY_CORE_CFLAGS) -o $@ $<
 
 # Some make's put the object file in the current directory
diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S
deleted file mode 100644
index 4607077..0000000
--- a/Objects/asm_trampoline.S
+++ /dev/null
@@ -1,28 +0,0 @@
-    .text
-    .globl	_Py_trampoline_func_start
-# The following assembly is equivalent to:
-# PyObject *
-# trampoline(PyThreadState *ts, _PyInterpreterFrame *f,
-#            int throwflag, py_evaluator evaluator)
-# {
-#     return evaluator(ts, f, throwflag);
-# }
-_Py_trampoline_func_start:
-#ifdef __x86_64__
-    sub    $8, %rsp
-    call    *%rcx
-    add    $8, %rsp
-    ret
-#endif // __x86_64__
-#if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
-    // ARM64 little endian, 64bit ABI
-    // generate with aarch64-linux-gnu-gcc 12.1
-    stp     x29, x30, [sp, -16]!
-    mov     x29, sp
-    blr     x3
-    ldp     x29, x30, [sp], 16
-    ret
-#endif
-    .globl	_Py_trampoline_func_end
-_Py_trampoline_func_end:
-    .section        .note.GNU-stack,"",@progbits
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
deleted file mode 100644
index 161e0ef..0000000
--- a/Objects/perf_trampoline.c
+++ /dev/null
@@ -1,531 +0,0 @@
-/*
-
-Perf trampoline instrumentation
-===============================
-
-This file contains instrumentation to allow to associate
-calls to the CPython eval loop back to the names of the Python
-functions and filename being executed.
-
-Many native performance profilers like the Linux perf tools are
-only available to 'see' the C stack when sampling from the profiled
-process. This means that if we have the following python code:
-
-    import time
-    def foo(n):
-        # Some CPU intensive code
-
-    def bar(n):
-        foo(n)
-
-    def baz(n):
-        bar(n)
-
-    baz(10000000)
-
-A performance profiler that is only able to see native frames will
-produce the following backtrace when sampling from foo():
-
-    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
-    _PyEval_Vector
-    _PyFunction_Vectorcall
-    PyObject_Vectorcall
-    call_function
-
-    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
-    _PyEval_EvalFrame
-    _PyEval_Vector
-    _PyFunction_Vectorcall
-    PyObject_Vectorcall
-    call_function
-
-    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
-    _PyEval_EvalFrame
-    _PyEval_Vector
-    _PyFunction_Vectorcall
-    PyObject_Vectorcall
-    call_function
-
-    ...
-
-    Py_RunMain
-
-Because the profiler is only able to see the native frames and the native
-function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
-then the profiler and any reporter generated by it will not be able to
-associate the names of the Python functions and the filenames associated with
-those calls, rendering the results useless in the Python world.
-
-To fix this problem, we introduce the concept of a trampoline frame. A
-trampoline frame is a piece of code that is unique per Python code object that
-is executed before entering the CPython eval loop. This piece of code just
-calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
-forwards all the arguments received. In this way, when a profiler samples
-frames from the previous example it will see;
-
-    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
-    [Jit compiled code 3]
-    _PyEval_Vector
-    _PyFunction_Vectorcall
-    PyObject_Vectorcall
-    call_function
-
-    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
-    [Jit compiled code 2]
-    _PyEval_EvalFrame
-    _PyEval_Vector
-    _PyFunction_Vectorcall
-    PyObject_Vectorcall
-    call_function
-
-    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
-    [Jit compiled code 1]
-    _PyEval_EvalFrame
-    _PyEval_Vector
-    _PyFunction_Vectorcall
-    PyObject_Vectorcall
-    call_function
-
-    ...
-
-    Py_RunMain
-
-When we generate every unique copy of the trampoline (what here we called "[Jit
-compiled code N]") we write the relationship between the compiled code and the
-Python function that is associated with it. Every profiler requires this
-information in a different format. For example, the Linux "perf" profiler
-requires a file in "/tmp/perf-PID.map" (name and location not configurable)
-with the following format:
-
-    <compiled code address> <compiled code size> <name of the compiled code>
-
-If this file is available when "perf" generates reports, it will automatically
-associate every trampoline with the Python function that it is associated with
-allowing it to generate reports that include Python information. These reports
-then can also be filtered in a way that *only* Python information appears.
-
-Notice that for this to work, there must be a unique copied of the trampoline
-per Python code object even if the code in the trampoline is the same. To
-achieve this we have a assembly template in Objects/asm_trampiline.S that is
-compiled into the Python executable/shared library. This template generates a
-symbol that maps the start of the assembly code and another that marks the end
-of the assembly code for the trampoline.  Then, every time we need a unique
-trampoline for a Python code object, we copy the assembly code into a mmaped
-area that has executable permissions and we return the start of that area as
-our trampoline function.
-
-Asking for a mmap-ed memory area for trampoline is very wasteful so we
-allocate big arenas of memory in a single mmap call, we populate the entire
-arena with copies of the trampoline (this allows us to now have to invalidate
-the icache for the instructions in the page) and then we return the next
-available chunk every time someone asks for a new trampoline. We keep a linked
-list of arenas in case the current memory arena is exhausted and another one is
-needed.
-
-For the best results, Python should be compiled with
-CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
-profilers to unwind using only the frame pointer and not on DWARF debug
-information (note that as trampilines are dynamically generated there won't be
-any DWARF information available for them).
-*/
-
-#include "Python.h"
-#include "pycore_ceval.h"
-#include "pycore_frame.h"
-#include "pycore_interp.h"
-
-typedef enum {
-    PERF_STATUS_FAILED = -1,  // Perf trampoline is in an invalid state
-    PERF_STATUS_NO_INIT = 0,  // Perf trampoline is not initialized
-    PERF_STATUS_OK = 1,       // Perf trampoline is ready to be executed
-} perf_status_t;
-
-#ifdef PY_HAVE_PERF_TRAMPOLINE
-
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
-#define PY_HAVE_INVALIDATE_ICACHE
-
-#if defined(__clang__) || defined(__GNUC__)
-extern void __clear_cache(void *, void*);
-#endif
-
-static void invalidate_icache(char* begin, char*end) {
-#if defined(__clang__) || defined(__GNUC__)
-    return __clear_cache(begin, end);
-#else
-    return;
-#endif
-}
-#endif
-
-/* The function pointer is passed as last argument. The other three arguments
- * are passed in the same order as the function requires. This results in
- * shorter, more efficient ASM code for trampoline.
- */
-typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
-                                  int throwflag);
-typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
-                                   py_evaluator);
-
-extern void *_Py_trampoline_func_start;  // Start of the template of the
-                                         // assembly trampoline
-extern void *
-    _Py_trampoline_func_end;  // End of the template of the assembly trampoline
-
-struct code_arena_st {
-    char *start_addr;    // Start of the memory arena
-    char *current_addr;  // Address of the current trampoline within the arena
-    size_t size;         // Size of the memory arena
-    size_t size_left;    // Remaining size of the memory arena
-    size_t code_size;    // Size of the code of every trampoline in the arena
-    struct code_arena_st
-        *prev;  // Pointer to the arena  or NULL if this is the first arena.
-};
-
-typedef struct code_arena_st code_arena_t;
-
-struct trampoline_api_st {
-    void* (*init_state)(void);
-    void (*write_state)(void* state, const void *code_addr,
-                        unsigned int code_size, PyCodeObject* code);
-    int (*free_state)(void* state);
-    void *state;
-};
-
-typedef struct trampoline_api_st trampoline_api_t;
-
-
-static perf_status_t perf_status = PERF_STATUS_NO_INIT;
-static Py_ssize_t extra_code_index = -1;
-static code_arena_t *code_arena;
-static trampoline_api_t trampoline_api;
-
-static FILE *perf_map_file;
-
-static void *
-perf_map_get_file(void)
-{
-    if (perf_map_file) {
-        return perf_map_file;
-    }
-    char filename[100];
-    pid_t pid = getpid();
-    // Location and file name of perf map is hard-coded in perf tool.
-    // Use exclusive create flag wit nofollow to prevent symlink attacks.
-    int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
-    snprintf(filename, sizeof(filename) - 1, "/tmp/perf-%jd.map",
-             (intmax_t)pid);
-    int fd = open(filename, flags, 0600);
-    if (fd == -1) {
-        perf_status = PERF_STATUS_FAILED;
-        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
-        return NULL;
-    }
-    perf_map_file = fdopen(fd, "w");
-    if (!perf_map_file) {
-        perf_status = PERF_STATUS_FAILED;
-        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
-        close(fd);
-        return NULL;
-    }
-    return perf_map_file;
-}
-
-static int
-perf_map_close(void *state)
-{
-    FILE *fp = (FILE *)state;
-    int ret = 0;
-    if (fp) {
-        ret = fclose(fp);
-    }
-    perf_map_file = NULL;
-    perf_status = PERF_STATUS_NO_INIT;
-    return ret;
-}
-
-static void
-perf_map_write_entry(void *state, const void *code_addr,
-                         unsigned int code_size, PyCodeObject *co)
-{
-    assert(state != NULL);
-    FILE *method_file = (FILE *)state;
-    const char *entry = PyUnicode_AsUTF8(co->co_qualname);
-    if (entry == NULL) {
-        _PyErr_WriteUnraisableMsg("Failed to get qualname from code object",
-                                  NULL);
-        return;
-    }
-    const char *filename = PyUnicode_AsUTF8(co->co_filename);
-    if (filename == NULL) {
-        _PyErr_WriteUnraisableMsg("Failed to get filename from code object",
-                                  NULL);
-        return;
-    }
-    fprintf(method_file, "%p %x py::%s:%s\n", code_addr, code_size, entry,
-            filename);
-    fflush(method_file);
-}
-
-_PyPerf_Callbacks _Py_perfmap_callbacks = {
-    &perf_map_get_file,
-    &perf_map_write_entry,
-    &perf_map_close
-};
-
-static int
-new_code_arena(void)
-{
-    // non-trivial programs typically need 64 to 256 kiB.
-    size_t mem_size = 4096 * 16;
-    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
-    char *memory =
-        mmap(NULL,  // address
-             mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
-             -1,  // fd (not used here)
-             0);  // offset (not used here)
-    if (!memory) {
-        PyErr_SetFromErrno(PyExc_OSError);
-        _PyErr_WriteUnraisableMsg(
-            "Failed to create new mmap for perf trampoline", NULL);
-        perf_status = PERF_STATUS_FAILED;
-        return -1;
-    }
-    void *start = &_Py_trampoline_func_start;
-    void *end = &_Py_trampoline_func_end;
-    size_t code_size = end - start;
-    // TODO: Check the effect of alignment of the code chunks. Initial investigation
-    // showed that this has no effect on performance in x86-64 or aarch64 and the current
-    // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
-    //
-    // We should check the values in the future and see if there is a
-    // measurable performance improvement by rounding trampolines up to 32-bit
-    // or 64-bit alignment.
-
-    size_t n_copies = mem_size / code_size;
-    for (size_t i = 0; i < n_copies; i++) {
-        memcpy(memory + i * code_size, start, code_size * sizeof(char));
-    }
-    // Some systems may prevent us from creating executable code on the fly.
-    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
-    if (res == -1) {
-        PyErr_SetFromErrno(PyExc_OSError);
-        munmap(memory, mem_size);
-        _PyErr_WriteUnraisableMsg(
-            "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
-            NULL);
-        return -1;
-    }
-
-#ifdef PY_HAVE_INVALIDATE_ICACHE
-    // Before the JIT can run a block of code that has been emitted it must invalidate
-    // the instruction cache on some platforms like arm and aarch64.
-    invalidate_icache(memory, memory + mem_size);
-#endif
-
-    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
-    if (new_arena == NULL) {
-        PyErr_NoMemory();
-        munmap(memory, mem_size);
-        _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
-                                  NULL);
-        return -1;
-    }
-
-    new_arena->start_addr = memory;
-    new_arena->current_addr = memory;
-    new_arena->size = mem_size;
-    new_arena->size_left = mem_size;
-    new_arena->code_size = code_size;
-    new_arena->prev = code_arena;
-    code_arena = new_arena;
-    return 0;
-}
-
-static void
-free_code_arenas(void)
-{
-    code_arena_t *cur = code_arena;
-    code_arena_t *prev;
-    code_arena = NULL;  // invalid static pointer
-    while (cur) {
-        munmap(cur->start_addr, cur->size);
-        prev = cur->prev;
-        PyMem_RawFree(cur);
-        cur = prev;
-    }
-}
-
-static inline py_trampoline
-code_arena_new_code(code_arena_t *code_arena)
-{
-    py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
-    code_arena->size_left -= code_arena->code_size;
-    code_arena->current_addr += code_arena->code_size;
-    return trampoline;
-}
-
-static inline py_trampoline
-compile_trampoline(void)
-{
-    if ((code_arena == NULL) ||
-        (code_arena->size_left <= code_arena->code_size)) {
-        if (new_code_arena() < 0) {
-            return NULL;
-        }
-    }
-    assert(code_arena->size_left <= code_arena->size);
-    return code_arena_new_code(code_arena);
-}
-
-static PyObject *
-py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
-                        int throw)
-{
-    if (perf_status == PERF_STATUS_FAILED ||
-        perf_status == PERF_STATUS_NO_INIT) {
-        goto default_eval;
-    }
-    PyCodeObject *co = frame->f_code;
-    py_trampoline f = NULL;
-    assert(extra_code_index != -1);
-    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
-    if (ret != 0 || f == NULL) {
-        // This is the first time we see this code object so we need
-        // to compile a trampoline for it.
-        py_trampoline new_trampoline = compile_trampoline();
-        if (new_trampoline == NULL) {
-            goto default_eval;
-        }
-        trampoline_api.write_state(trampoline_api.state, new_trampoline,
-                                   code_arena->code_size, co);
-        _PyCode_SetExtra((PyObject *)co, extra_code_index,
-                         (void *)new_trampoline);
-        f = new_trampoline;
-    }
-    assert(f != NULL);
-    return f(ts, frame, throw, _PyEval_EvalFrameDefault);
-default_eval:
-    // Something failed, fall back to the default evaluator.
-    return _PyEval_EvalFrameDefault(ts, frame, throw);
-}
-#endif  // PY_HAVE_PERF_TRAMPOLINE
-
-int
-_PyIsPerfTrampolineActive(void)
-{
-#ifdef PY_HAVE_PERF_TRAMPOLINE
-    PyThreadState *tstate = _PyThreadState_GET();
-    return tstate->interp->eval_frame == py_trampoline_evaluator;
-#endif
-    return 0;
-}
-
-void
-_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
-{
-    if (callbacks == NULL) {
-        return;
-    }
-#ifdef PY_HAVE_PERF_TRAMPOLINE
-    callbacks->init_state = trampoline_api.init_state;
-    callbacks->write_state = trampoline_api.write_state;
-    callbacks->free_state = trampoline_api.free_state;
-#endif
-    return;
-}
-
-int
-_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
-{
-    if (callbacks == NULL) {
-        return -1;
-    }
-#ifdef PY_HAVE_PERF_TRAMPOLINE
-    if (trampoline_api.state) {
-        _PyPerfTrampoline_Fini();
-    }
-    trampoline_api.init_state = callbacks->init_state;
-    trampoline_api.write_state = callbacks->write_state;
-    trampoline_api.free_state = callbacks->free_state;
-    trampoline_api.state = NULL;
-    perf_status = PERF_STATUS_OK;
-#endif
-    return 0;
-}
-
-int
-_PyPerfTrampoline_Init(int activate)
-{
-#ifdef PY_HAVE_PERF_TRAMPOLINE
-    PyThreadState *tstate = _PyThreadState_GET();
-    if (tstate->interp->eval_frame &&
-        tstate->interp->eval_frame != py_trampoline_evaluator) {
-        PyErr_SetString(PyExc_RuntimeError,
-                        "Trampoline cannot be initialized as a custom eval "
-                        "frame is already present");
-        return -1;
-    }
-    if (!activate) {
-        tstate->interp->eval_frame = NULL;
-    }
-    else {
-        tstate->interp->eval_frame = py_trampoline_evaluator;
-        if (new_code_arena() < 0) {
-            return -1;
-        }
-        if (trampoline_api.state == NULL) {
-            void *state = trampoline_api.init_state();
-            if (state == NULL) {
-                return -1;
-            }
-            trampoline_api.state = state;
-        }
-        extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
-        if (extra_code_index == -1) {
-            return -1;
-        }
-        perf_status = PERF_STATUS_OK;
-    }
-#endif
-    return 0;
-}
-
-int
-_PyPerfTrampoline_Fini(void)
-{
-#ifdef PY_HAVE_PERF_TRAMPOLINE
-    PyThreadState *tstate = _PyThreadState_GET();
-    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
-        tstate->interp->eval_frame = NULL;
-    }
-    free_code_arenas();
-    if (trampoline_api.state != NULL) {
-        trampoline_api.free_state(trampoline_api.state);
-        trampoline_api.state = NULL;
-    }
-    extra_code_index = -1;
-#endif
-    return 0;
-}
-
-PyStatus
-_PyPerfTrampoline_AfterFork_Child(void)
-{
-#ifdef PY_HAVE_PERF_TRAMPOLINE
-    // Restart trampoline in file in child.
-    int was_active = _PyIsPerfTrampolineActive();
-    _PyPerfTrampoline_Fini();
-    if (was_active) {
-        _PyPerfTrampoline_Init(1);
-    }
-#endif
-    return PyStatus_Ok();
-}
diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj
index 49e5cc8..8454bd6 100644
--- a/PCbuild/_freeze_module.vcxproj
+++ b/PCbuild/_freeze_module.vcxproj
@@ -129,7 +129,6 @@
     <ClCompile Include="..\Objects\cellobject.c" />
     <ClCompile Include="..\Objects\classobject.c" />
     <ClCompile Include="..\Objects\codeobject.c" />
-    <ClCompile Include="..\Objects\perf_trampoline.c" />
     <ClCompile Include="..\Objects\complexobject.c" />
     <ClCompile Include="..\Objects\descrobject.c" />
     <ClCompile Include="..\Objects\dictobject.c" />
@@ -211,6 +210,7 @@
     <ClCompile Include="..\Python\mysnprintf.c" />
     <ClCompile Include="..\Python\mystrtoul.c" />
     <ClCompile Include="..\Python\pathconfig.c" />
+    <ClCompile Include="..\Python\perf_trampoline.c" />
     <ClCompile Include="..\Python\preconfig.c" />
     <ClCompile Include="..\Python\pyarena.c" />
     <ClCompile Include="..\Python\pyctype.c" />
diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters
index 96ab2f2..6e8498d 100644
--- a/PCbuild/_freeze_module.vcxproj.filters
+++ b/PCbuild/_freeze_module.vcxproj.filters
@@ -85,7 +85,7 @@
     <ClCompile Include="..\Objects\codeobject.c">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\Objects\perf_trampoline.c">
+    <ClCompile Include="..\Python\perf_trampoline.c">
       <Filter>Source Files</Filter>
     </ClCompile>
     <ClCompile Include="..\Python\compile.c">
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index ff17304..111ad67 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -429,7 +429,6 @@
     <ClCompile Include="..\Objects\cellobject.c" />
     <ClCompile Include="..\Objects\classobject.c" />
     <ClCompile Include="..\Objects\codeobject.c" />
-    <ClCompile Include="..\Objects\perf_trampoline.c" />
     <ClCompile Include="..\Objects\complexobject.c" />
     <ClCompile Include="..\Objects\descrobject.c" />
     <ClCompile Include="..\Objects\dictobject.c" />
@@ -513,6 +512,7 @@
     <ClCompile Include="..\Python\mysnprintf.c" />
     <ClCompile Include="..\Python\mystrtoul.c" />
     <ClCompile Include="..\Python\pathconfig.c" />
+    <ClCompile Include="..\Python\perf_trampoline.c" />
     <ClCompile Include="..\Python\preconfig.c" />
     <ClCompile Include="..\Python\pyarena.c" />
     <ClCompile Include="..\Python\pyctype.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 7d7fe72..ab7d019 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -923,9 +923,6 @@
     <ClCompile Include="..\Objects\codeobject.c">
       <Filter>Objects</Filter>
     </ClCompile>
-    <ClCompile Include="..\Objects\perf_trampoline.c">
-      <Filter>Objects</Filter>
-    </ClCompile>
     <ClCompile Include="..\Objects\complexobject.c">
       <Filter>Objects</Filter>
     </ClCompile>
@@ -1127,6 +1124,9 @@
     <ClCompile Include="..\Python\pathconfig.c">
       <Filter>Python</Filter>
     </ClCompile>
+    <ClCompile Include="..\Python\perf_trampoline.c">
+      <Filter>Python</Filter>
+    </ClCompile>
     <ClCompile Include="..\Python\preconfig.c">
       <Filter>Python</Filter>
     </ClCompile>
diff --git a/Python/asm_trampoline.S b/Python/asm_trampoline.S
new file mode 100644
index 0000000..4607077
--- /dev/null
+++ b/Python/asm_trampoline.S
@@ -0,0 +1,28 @@
+    .text
+    .globl	_Py_trampoline_func_start
+# The following assembly is equivalent to:
+# PyObject *
+# trampoline(PyThreadState *ts, _PyInterpreterFrame *f,
+#            int throwflag, py_evaluator evaluator)
+# {
+#     return evaluator(ts, f, throwflag);
+# }
+_Py_trampoline_func_start:
+#ifdef __x86_64__
+    sub    $8, %rsp
+    call    *%rcx
+    add    $8, %rsp
+    ret
+#endif // __x86_64__
+#if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+    // ARM64 little endian, 64bit ABI
+    // generated with aarch64-linux-gnu-gcc 12.1
+    stp     x29, x30, [sp, -16]!
+    mov     x29, sp
+    blr     x3
+    ldp     x29, x30, [sp], 16
+    ret
+#endif
+    .globl	_Py_trampoline_func_end
+_Py_trampoline_func_end:
+    .section        .note.GNU-stack,"",@progbits
diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c
new file mode 100644
index 0000000..161e0ef
--- /dev/null
+++ b/Python/perf_trampoline.c
@@ -0,0 +1,531 @@
+/*
+
+Perf trampoline instrumentation
+===============================
+
+This file contains instrumentation that makes it possible to associate
+calls to the CPython eval loop back to the names of the Python
+functions and filename being executed.
+
+Many native performance profilers like the Linux perf tools are
+only able to 'see' the C stack when sampling from the profiled
+process. This means that if we have the following python code:
+
+    import time
+    def foo(n):
+        # Some CPU intensive code
+
+    def bar(n):
+        foo(n)
+
+    def baz(n):
+        bar(n)
+
+    baz(10000000)
+
+A performance profiler that is only able to see native frames will
+produce the following backtrace when sampling from foo():
+
+    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
+    _PyEval_EvalFrame
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
+    _PyEval_EvalFrame
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    ...
+
+    Py_RunMain
+
+Because the profiler is only able to see the native frames and the native
+function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
+then the profiler and any reporter generated by it will not be able to
+associate the names of the Python functions and the filenames associated with
+those calls, rendering the results useless in the Python world.
+
+To fix this problem, we introduce the concept of a trampoline frame. A
+trampoline frame is a piece of code that is unique per Python code object that
+is executed before entering the CPython eval loop. This piece of code just
+calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
+forwards all the arguments received. In this way, when a profiler samples
+frames from the previous example it will see:
+
+    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
+    [Jit compiled code 3]
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
+    [Jit compiled code 2]
+    _PyEval_EvalFrame
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
+    [Jit compiled code 1]
+    _PyEval_EvalFrame
+    _PyEval_Vector
+    _PyFunction_Vectorcall
+    PyObject_Vectorcall
+    call_function
+
+    ...
+
+    Py_RunMain
+
+When we generate every unique copy of the trampoline (what here we called "[Jit
+compiled code N]") we write the relationship between the compiled code and the
+Python function that is associated with it. Every profiler requires this
+information in a different format. For example, the Linux "perf" profiler
+requires a file in "/tmp/perf-PID.map" (name and location not configurable)
+with the following format:
+
+    <compiled code address> <compiled code size> <name of the compiled code>
+
+If this file is available when "perf" generates reports, it will automatically
+associate every trampoline with the Python function that it is associated with
+allowing it to generate reports that include Python information. These reports
+then can also be filtered in a way that *only* Python information appears.
+
+Notice that for this to work, there must be a unique copy of the trampoline
+per Python code object even if the code in the trampoline is the same. To
+achieve this we have an assembly template in Python/asm_trampoline.S that is
+compiled into the Python executable/shared library. This template generates a
+symbol that maps the start of the assembly code and another that marks the end
+of the assembly code for the trampoline.  Then, every time we need a unique
+trampoline for a Python code object, we copy the assembly code into a mmaped
+area that has executable permissions and we return the start of that area as
+our trampoline function.
+
+Asking for a mmap-ed memory area for each trampoline is very wasteful so we
+allocate big arenas of memory in a single mmap call, we populate the entire
+arena with copies of the trampoline (this allows us to not have to invalidate
+the icache for the instructions in the page) and then we return the next
+available chunk every time someone asks for a new trampoline. We keep a linked
+list of arenas in case the current memory arena is exhausted and another one is
+needed.
+
+For the best results, Python should be compiled with
+CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
+profilers to unwind using only the frame pointer and not rely on DWARF debug
+information (note that as trampolines are dynamically generated there won't be
+any DWARF information available for them).
+*/
+
+#include "Python.h"
+#include "pycore_ceval.h"
+#include "pycore_frame.h"
+#include "pycore_interp.h"
+
+typedef enum {
+    PERF_STATUS_FAILED = -1,  // Perf trampoline is in an invalid state
+    PERF_STATUS_NO_INIT = 0,  // Perf trampoline is not initialized
+    PERF_STATUS_OK = 1,       // Perf trampoline is ready to be executed
+} perf_status_t;
+
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
+#define PY_HAVE_INVALIDATE_ICACHE
+
+#if defined(__clang__) || defined(__GNUC__)
+extern void __clear_cache(void *, void*);
+#endif
+
+static void invalidate_icache(char* begin, char*end) {
+#if defined(__clang__) || defined(__GNUC__)
+    return __clear_cache(begin, end);
+#else
+    return;
+#endif
+}
+#endif
+
+/* The function pointer is passed as last argument. The other three arguments
+ * are passed in the same order as the function requires. This results in
+ * shorter, more efficient ASM code for trampoline.
+ */
+typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
+                                  int throwflag);
+typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
+                                   py_evaluator);
+
+extern void *_Py_trampoline_func_start;  // Start of the template of the
+                                         // assembly trampoline
+extern void *
+    _Py_trampoline_func_end;  // End of the template of the assembly trampoline
+
+struct code_arena_st {
+    char *start_addr;    // Start of the memory arena
+    char *current_addr;  // Address of the current trampoline within the arena
+    size_t size;         // Size of the memory arena
+    size_t size_left;    // Remaining size of the memory arena
+    size_t code_size;    // Size of the code of every trampoline in the arena
+    struct code_arena_st
+        *prev;  // Pointer to the previous arena or NULL if this is the first arena.
+};
+
+typedef struct code_arena_st code_arena_t;
+
+struct trampoline_api_st {
+    void* (*init_state)(void);
+    void (*write_state)(void* state, const void *code_addr,
+                        unsigned int code_size, PyCodeObject* code);
+    int (*free_state)(void* state);
+    void *state;
+};
+
+typedef struct trampoline_api_st trampoline_api_t;
+
+
+static perf_status_t perf_status = PERF_STATUS_NO_INIT;
+static Py_ssize_t extra_code_index = -1;
+static code_arena_t *code_arena;
+static trampoline_api_t trampoline_api;
+
+static FILE *perf_map_file;
+
+static void *
+perf_map_get_file(void)
+{
+    if (perf_map_file) {
+        return perf_map_file;
+    }
+    char filename[100];
+    pid_t pid = getpid();
+    // Location and file name of perf map is hard-coded in perf tool.
+    // Use exclusive create flag with nofollow to prevent symlink attacks.
+    int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
+    snprintf(filename, sizeof(filename) - 1, "/tmp/perf-%jd.map",
+             (intmax_t)pid);
+    int fd = open(filename, flags, 0600);
+    if (fd == -1) {
+        perf_status = PERF_STATUS_FAILED;
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
+        return NULL;
+    }
+    perf_map_file = fdopen(fd, "w");
+    if (!perf_map_file) {
+        perf_status = PERF_STATUS_FAILED;
+        PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
+        close(fd);
+        return NULL;
+    }
+    return perf_map_file;
+}
+
+static int
+perf_map_close(void *state)
+{
+    FILE *fp = (FILE *)state;
+    int ret = 0;
+    if (fp) {
+        ret = fclose(fp);
+    }
+    perf_map_file = NULL;
+    perf_status = PERF_STATUS_NO_INIT;
+    return ret;
+}
+
+static void
+perf_map_write_entry(void *state, const void *code_addr,
+                         unsigned int code_size, PyCodeObject *co)
+{
+    assert(state != NULL);
+    FILE *method_file = (FILE *)state;
+    const char *entry = PyUnicode_AsUTF8(co->co_qualname);
+    if (entry == NULL) {
+        _PyErr_WriteUnraisableMsg("Failed to get qualname from code object",
+                                  NULL);
+        return;
+    }
+    const char *filename = PyUnicode_AsUTF8(co->co_filename);
+    if (filename == NULL) {
+        _PyErr_WriteUnraisableMsg("Failed to get filename from code object",
+                                  NULL);
+        return;
+    }
+    fprintf(method_file, "%p %x py::%s:%s\n", code_addr, code_size, entry,
+            filename);
+    fflush(method_file);
+}
+
+_PyPerf_Callbacks _Py_perfmap_callbacks = {
+    &perf_map_get_file,
+    &perf_map_write_entry,
+    &perf_map_close
+};
+
+// Map a new 64 KiB arena, fill it with copies of the trampoline template,
+// make it executable and push it onto the code_arena list.
+// Returns 0 on success, -1 on failure (the error is reported as unraisable).
+static int
+new_code_arena(void)
+{
+    // non-trivial programs typically need 64 to 256 kiB.
+    size_t mem_size = 4096 * 16;
+    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
+    char *memory = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (memory == MAP_FAILED) {  // mmap() never returns NULL on failure
+        PyErr_SetFromErrno(PyExc_OSError);
+        _PyErr_WriteUnraisableMsg(
+            "Failed to create new mmap for perf trampoline", NULL);
+        perf_status = PERF_STATUS_FAILED;
+        return -1;
+    }
+    void *start = &_Py_trampoline_func_start;
+    void *end = &_Py_trampoline_func_end;
+    size_t code_size = end - start;
+    // TODO: Check the effect of alignment of the code chunks. Initial investigation
+    // showed that this has no effect on performance in x86-64 or aarch64 and the current
+    // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
+    //
+    // We should check the values in the future and see if there is a
+    // measurable performance improvement by rounding trampolines up to 32-bit
+    // or 64-bit alignment.
+
+    size_t n_copies = mem_size / code_size;
+    for (size_t i = 0; i < n_copies; i++) {
+        memcpy(memory + i * code_size, start, code_size * sizeof(char));
+    }
+    // Some systems may prevent us from creating executable code on the fly.
+    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
+    if (res == -1) {
+        PyErr_SetFromErrno(PyExc_OSError);
+        munmap(memory, mem_size);
+        _PyErr_WriteUnraisableMsg(
+            "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
+            NULL);
+        return -1;
+    }
+
+#ifdef PY_HAVE_INVALIDATE_ICACHE
+    // Before the JIT can run a block of code that has been emitted it must invalidate
+    // the instruction cache on some platforms like arm and aarch64.
+    invalidate_icache(memory, memory + mem_size);
+#endif
+
+    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
+    if (new_arena == NULL) {
+        PyErr_NoMemory();
+        munmap(memory, mem_size);
+        _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
+                                  NULL);
+        return -1;
+    }
+
+    new_arena->start_addr = memory;
+    new_arena->current_addr = memory;
+    new_arena->size = mem_size;
+    new_arena->size_left = mem_size;
+    new_arena->code_size = code_size;
+    new_arena->prev = code_arena;
+    code_arena = new_arena;
+    return 0;
+}
+
+static void
+free_code_arenas(void)
+{
+    code_arena_t *cur = code_arena;
+    code_arena_t *prev;
+    code_arena = NULL;  // invalidate the static pointer before freeing the list
+    while (cur) {
+        munmap(cur->start_addr, cur->size);
+        prev = cur->prev;  // read the link before cur itself is freed
+        PyMem_RawFree(cur);
+        cur = prev;
+    }
+}
+
+static inline py_trampoline
+code_arena_new_code(code_arena_t *code_arena)
+{
+    py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
+    code_arena->size_left -= code_arena->code_size;
+    code_arena->current_addr += code_arena->code_size;
+    return trampoline;
+}
+
+static inline py_trampoline
+compile_trampoline(void)
+{
+    if ((code_arena == NULL) ||
+        (code_arena->size_left <= code_arena->code_size)) {
+        if (new_code_arena() < 0) {
+            return NULL;
+        }
+    }
+    assert(code_arena->size_left <= code_arena->size);
+    return code_arena_new_code(code_arena);
+}
+
+static PyObject *
+py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
+                        int throw)
+{
+    if (perf_status == PERF_STATUS_FAILED ||
+        perf_status == PERF_STATUS_NO_INIT) {
+        goto default_eval;
+    }
+    PyCodeObject *co = frame->f_code;
+    py_trampoline f = NULL;
+    assert(extra_code_index != -1);
+    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
+    if (ret != 0 || f == NULL) {
+        // This is the first time we see this code object so we need
+        // to compile a trampoline for it.
+        py_trampoline new_trampoline = compile_trampoline();
+        if (new_trampoline == NULL) {
+            goto default_eval;
+        }
+        trampoline_api.write_state(trampoline_api.state, new_trampoline,
+                                   code_arena->code_size, co);
+        _PyCode_SetExtra((PyObject *)co, extra_code_index,
+                         (void *)new_trampoline);
+        f = new_trampoline;
+    }
+    assert(f != NULL);
+    return f(ts, frame, throw, _PyEval_EvalFrameDefault);
+default_eval:
+    // Something failed, fall back to the default evaluator.
+    return _PyEval_EvalFrameDefault(ts, frame, throw);
+}
+#endif  // PY_HAVE_PERF_TRAMPOLINE
+
+int
+_PyIsPerfTrampolineActive(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    PyThreadState *tstate = _PyThreadState_GET();
+    return tstate->interp->eval_frame == py_trampoline_evaluator;
+#endif
+    return 0;
+}
+
+void
+_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
+{
+    if (callbacks == NULL) {
+        return;
+    }
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    callbacks->init_state = trampoline_api.init_state;
+    callbacks->write_state = trampoline_api.write_state;
+    callbacks->free_state = trampoline_api.free_state;
+#endif
+    return;
+}
+
+int
+_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
+{
+    if (callbacks == NULL) {
+        return -1;
+    }
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    if (trampoline_api.state) {
+        _PyPerfTrampoline_Fini();
+    }
+    trampoline_api.init_state = callbacks->init_state;
+    trampoline_api.write_state = callbacks->write_state;
+    trampoline_api.free_state = callbacks->free_state;
+    trampoline_api.state = NULL;
+    perf_status = PERF_STATUS_OK;
+#endif
+    return 0;
+}
+
+int
+_PyPerfTrampoline_Init(int activate)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    PyThreadState *tstate = _PyThreadState_GET();
+    if (tstate->interp->eval_frame &&
+        tstate->interp->eval_frame != py_trampoline_evaluator) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Trampoline cannot be initialized as a custom eval "
+                        "frame is already present");
+        return -1;
+    }
+    if (!activate) {
+        tstate->interp->eval_frame = NULL;
+    }
+    else {
+        tstate->interp->eval_frame = py_trampoline_evaluator;
+        if (new_code_arena() < 0) {
+            return -1;
+        }
+        if (trampoline_api.state == NULL) {
+            void *state = trampoline_api.init_state();
+            if (state == NULL) {
+                return -1;
+            }
+            trampoline_api.state = state;
+        }
+        extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
+        if (extra_code_index == -1) {
+            return -1;
+        }
+        perf_status = PERF_STATUS_OK;
+    }
+#endif
+    return 0;
+}
+
+int
+_PyPerfTrampoline_Fini(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    PyThreadState *tstate = _PyThreadState_GET();
+    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
+        tstate->interp->eval_frame = NULL;
+    }
+    free_code_arenas();
+    if (trampoline_api.state != NULL) {
+        trampoline_api.free_state(trampoline_api.state);
+        trampoline_api.state = NULL;
+    }
+    extra_code_index = -1;
+#endif
+    return 0;
+}
+
+PyStatus
+_PyPerfTrampoline_AfterFork_Child(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    // Restart trampoline in file in child.
+    int was_active = _PyIsPerfTrampolineActive();
+    _PyPerfTrampoline_Fini();
+    if (was_active) {
+        _PyPerfTrampoline_Init(1);
+    }
+#endif
+    return PyStatus_Ok();
+}
diff --git a/Tools/c-analyzer/cpython/globals-to-fix.tsv b/Tools/c-analyzer/cpython/globals-to-fix.tsv
index 196d62d..e327f0a 100644
--- a/Tools/c-analyzer/cpython/globals-to-fix.tsv
+++ b/Tools/c-analyzer/cpython/globals-to-fix.tsv
@@ -380,7 +380,7 @@ Objects/floatobject.c	-	float_format	-
 Objects/longobject.c	long_from_non_binary_base	log_base_BASE	-
 Objects/longobject.c	long_from_non_binary_base	convwidth_base	-
 Objects/longobject.c	long_from_non_binary_base	convmultmax_base	-
-Objects/perf_trampoline.c	-	perf_map_file	-
+Python/perf_trampoline.c	-	perf_map_file	-
 Objects/unicodeobject.c	-	ucnhash_capi	-
 Parser/action_helpers.c	_PyPegen_dummy_name	cache	-
 Python/dtoa.c	-	p5s	-
@@ -456,10 +456,10 @@ Objects/dictobject.c	-	next_dict_keys_version	-
 Objects/funcobject.c	-	next_func_version	-
 Objects/moduleobject.c	-	max_module_number	-
 Objects/object.c	-	_Py_RefTotal	-
-Objects/perf_trampoline.c	-	perf_status	-
-Objects/perf_trampoline.c	-	extra_code_index	-
-Objects/perf_trampoline.c	-	code_arena	-
-Objects/perf_trampoline.c	-	trampoline_api	-
+Python/perf_trampoline.c	-	perf_status	-
+Python/perf_trampoline.c	-	extra_code_index	-
+Python/perf_trampoline.c	-	code_arena	-
+Python/perf_trampoline.c	-	trampoline_api	-
 Objects/typeobject.c	-	next_version_tag	-
 Objects/typeobject.c	resolve_slotdups	ptrs	-
 Parser/pegen.c	-	memo_statistics	-
diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv
index 28c2325..dbfb0e0 100644
--- a/Tools/c-analyzer/cpython/ignored.tsv
+++ b/Tools/c-analyzer/cpython/ignored.tsv
@@ -77,8 +77,8 @@ Objects/object.c	-	_Py_GenericAliasIterType	-
 Objects/object.c	-	_PyMemoryIter_Type	-
 Objects/object.c	-	_PyLineIterator	-
 Objects/object.c	-	_PyPositionsIterator	-
-Objects/perf_trampoline.c	-	_Py_trampoline_func_start	-
-Objects/perf_trampoline.c	-	_Py_trampoline_func_end	-
+Python/perf_trampoline.c	-	_Py_trampoline_func_start	-
+Python/perf_trampoline.c	-	_Py_trampoline_func_end	-
 Python/importdl.h	-	_PyImport_DynLoadFiletab	-
 
 Modules/expat/xmlrole.c	-	prolog0	-
@@ -465,7 +465,7 @@ Objects/obmalloc.c	-	_PyMem_Debug	-
 Objects/obmalloc.c	-	_PyMem_Raw	-
 Objects/obmalloc.c	-	_PyObject	-
 Objects/obmalloc.c	-	usedpools	-
-Objects/perf_trampoline.c	-	_Py_perfmap_callbacks	-
+Python/perf_trampoline.c	-	_Py_perfmap_callbacks	-
 Objects/typeobject.c	-	name_op	-
 Objects/unicodeobject.c	-	stripfuncnames	-
 Objects/unicodeobject.c	-	utf7_category	-
diff --git a/configure b/configure
index 15d9796..953c558 100755
--- a/configure
+++ b/configure
@@ -11629,7 +11629,7 @@ if test "x$perf_trampoline" = xyes; then :
 
 $as_echo "#define PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
 
-  PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+  PERF_TRAMPOLINE_OBJ=Python/asm_trampoline.o
 
     if test "x$Py_DEBUG" = xtrue; then :
 
diff --git a/configure.ac b/configure.ac
index c7945aa..210ce32 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3474,7 +3474,7 @@ AC_MSG_RESULT([$perf_trampoline])
 
 AS_VAR_IF([perf_trampoline], [yes], [
   AC_DEFINE([PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
-  PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+  PERF_TRAMPOLINE_OBJ=Python/asm_trampoline.o
 
   dnl perf needs frame pointers for unwinding, include compiler option in debug builds
   AS_VAR_IF([Py_DEBUG], [true], [
-- 
cgit v0.12