5 files changed, 694 insertions, 6 deletions
diff --git a/Python/initconfig.c b/Python/initconfig.c
index d91a819..1880a28 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -1703,6 +1703,20 @@ config_init_perf_profiling(PyConfig *config)
     if (xoption) {
         config->perf_profiling = 1;
     }
+    env = config_get_env(config, "PYTHONPERFJITSUPPORT");
+    if (env) {
+        if (_Py_str_to_int(env, &active) != 0) {
+            active = 0;
+        }
+        if (active) {
+            config->perf_profiling = 2;
+        }
+    }
+    xoption = config_get_xoption(config, L"perfjit");
+    if (xoption) {
+        config->perf_profiling = 2;
+    }
+
     return _PyStatus_OK();
 
 }
diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c
new file mode 100644
index 0000000..fdce0da
--- /dev/null
+++ b/Python/perf_jit_trampoline.c
@@ -0,0 +1,615 @@
+#include "Python.h"
+#include "pycore_ceval.h"         // _PyPerf_Callbacks
+#include "pycore_frame.h"
+#include "pycore_interp.h"
+
+
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>             // mmap()
+#include <sys/types.h>
+#include <unistd.h>               // sysconf()
+#include <sys/time.h>           // gettimeofday()
+
+// ----------------------------------
+//         Perf jitdump API
+// ----------------------------------
+
+typedef struct {  // State of the jitdump writer; one static instance (perf_jit_map_state)
+    FILE* perf_map;  // dump-file stream; NULL until perf_map_jit_init() opens it
+    PyThread_type_lock map_lock;  // allocated at init, taken around fclose() at fini
+    void* mapped_buffer;  // result of mmap()ing the first page of the dump file
+    size_t mapped_size;  // length of that mapping (the system page size)
+    int code_id;  // bumped once per code-load event written
+} PerfMapJitState;
+
+static PerfMapJitState perf_jit_map_state;
+
+/*
+Usually the binary and libraries are mapped in separate region like below:
+
+  address ->
+   --+---------------------+--//--+---------------------+--
+     | .text | .data | ... |      | .text | .data | ... |
+   --+---------------------+--//--+---------------------+--
+         myprog                      libc.so
+
+So it'd be easy and straight-forward to find a mapped binary or library from an
+address.
+
+But for JIT code, the code arena only cares about the code section. But the
+resulting DSOs (which is generated by perf inject -j) contain ELF headers and
+unwind info too. Then it'd generate following address space with synthesized
+MMAP events. Let's say it has a sample between address B and C.
+
+                                               sample
+                                                 |
+  address ->                         A       B   v   C
+  ---------------------------------------------------------------------------------------------------
+  /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
+  /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
+  /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
+    ...
+  ---------------------------------------------------------------------------------------------------
+
+If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
+the unwind info. If it maps both .text section and unwind sections, the sample
+could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
+which one is right. So to make perf happy we have non-overlapping ranges for each
+DSO:
+
+  address ->
+  -------------------------------------------------------------------------------------------------------
+  /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
+  /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
+  /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
+    ...
+  -------------------------------------------------------------------------------------------------------
+
+As the trampolines are constant, we add a constant padding, but in general the padding needs to be the
+size of the unwind info rounded up to 16 bytes. For our trampolines this is 0x50.
+ */
+
+#define PERF_JIT_CODE_PADDING 0x100
+#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
+
+typedef uint64_t uword;
+typedef const char* CodeComments;
+
+#define Pd "d"
+#define MB (1024 * 1024)
+
+#define EM_386      3
+#define EM_X86_64   62
+#define EM_ARM      40
+#define EM_AARCH64  183
+#define EM_RISCV    243
+
+#define TARGET_ARCH_IA32   0
+#define TARGET_ARCH_X64    0
+#define TARGET_ARCH_ARM    0
+#define TARGET_ARCH_ARM64  0
+#define TARGET_ARCH_RISCV32 0
+#define TARGET_ARCH_RISCV64 0
+
+#define FLAG_generate_perf_jitdump 0
+#define FLAG_write_protect_code 0
+#define FLAG_write_protect_vm_isolate 0
+#define FLAG_code_comments 0
+
+#define UNREACHABLE()
+
+/* Return the ELF e_machine value of the build target, or 0 if it is unknown. */
+static uword GetElfMachineArchitecture(void) {
+#if defined(__i386__) || defined(_M_IX86)  /* the TARGET_ARCH_* macros are all 0 */
+    return EM_386;
+#elif defined(__x86_64__) || defined(_M_X64)
+    return EM_X86_64;
+#elif defined(__arm__) || defined(_M_ARM)
+    return EM_ARM;
+#elif defined(__aarch64__)
+    return EM_AARCH64;
+#elif defined(__riscv)
+    return EM_RISCV;
+#else
+    return 0;  /* unrecognised architecture */
+#endif
+}
+
+typedef struct {  // File header, filled in and written by perf_map_jit_write_header()
+    uint32_t magic;  // 0x4A695444
+    uint32_t version;  // 1
+    uint32_t size;  // sizeof(Header)
+    uint32_t elf_mach_target;  // value of GetElfMachineArchitecture()
+    uint32_t reserved;
+    uint32_t process_id;  // pid passed to perf_map_jit_write_header()
+    uint64_t time_stamp;  // get_current_time_microseconds() when the header is written
+    uint64_t flags;  // 0
+} Header;
+
+ enum PerfEvent {  // Record type stored in BaseEvent.event
+    PerfLoad = 0,  // used for CodeLoadEvent records
+    PerfMove = 1,
+    PerfDebugInfo = 2,
+    PerfClose = 3,
+    PerfUnwindingInfo = 4  // used for CodeUnwindingInfoEvent records
+};
+
+struct BaseEvent {  // Leading member of CodeLoadEvent and CodeUnwindingInfoEvent
+    uint32_t event;  // an enum PerfEvent value
+    uint32_t size;  // byte size of the whole record, trailing payload included
+    uint64_t time_stamp;  // get_current_monotonic_ticks() when the record is built
+  };
+
+typedef struct {  // PerfLoad record; the entry name and the code bytes are written right after it
+    struct BaseEvent base;
+    uint32_t process_id;  // getpid()
+    uint32_t thread_id;  // gettid()
+    uint64_t vma;  // code start address (same value as code_address)
+    uint64_t code_address;
+    uint64_t code_size;  // number of code bytes written after the name
+    uint64_t code_id;  // perf_jit_map_state.code_id after it has been incremented
+} CodeLoadEvent;
+
+typedef struct {  // PerfUnwindingInfo record; the .eh_frame bytes and an EhFrameHeader follow it
+    struct BaseEvent base;
+    uint64_t unwind_data_size;  // sizeof(EhFrameHeader) + size of the .eh_frame bytes
+    uint64_t eh_frame_hdr_size;  // sizeof(EhFrameHeader)
+    uint64_t mapped_size;  // unwind_data_size rounded up to a multiple of 16
+} CodeUnwindingInfoEvent;
+
+static const intptr_t nanoseconds_per_second = 1000000000;
+
+// Dwarf encoding constants
+
+static const uint8_t DwarfUData4 = 0x03;
+static const uint8_t DwarfSData4 = 0x0b;
+static const uint8_t DwarfPcRel = 0x10;
+static const uint8_t DwarfDataRel = 0x30;
+// static uint8_t DwarfOmit = 0xff;
+typedef struct {  // Written right after the .eh_frame bytes of each unwinding record
+    unsigned char version;  // 1
+    unsigned char eh_frame_ptr_enc;  // DwarfSData4 | DwarfPcRel
+    unsigned char	fde_count_enc;  // DwarfUData4
+    unsigned char	table_enc;  // DwarfSData4 | DwarfDataRel
+    int32_t eh_frame_ptr;
+    int32_t eh_fde_count;  // 1
+    int32_t from;
+    int32_t to;
+} EhFrameHeader;
+
+/* Read CLOCK_MONOTONIC and return it as a nanosecond count.
+ * Returns 0 if clock_gettime() fails. */
+static int64_t get_current_monotonic_ticks(void) {
+    struct timespec now;
+    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
+        UNREACHABLE();
+        return 0;
+    }
+    // Whole seconds scaled to nanoseconds, plus the sub-second part.
+    int64_t seconds_as_ns = (int64_t)now.tv_sec * nanoseconds_per_second;
+    return seconds_as_ns + now.tv_nsec;
+}
+
+/* Wall-clock time from gettimeofday() in microseconds; 0 if the call fails. */
+static int64_t get_current_time_microseconds(void) {
+    struct timeval now;
+    if (gettimeofday(&now, NULL) < 0) {
+        UNREACHABLE();
+        return 0;
+    }
+    return (int64_t)now.tv_sec * 1000000 + now.tv_usec;
+}
+
+
+/*
+ * Round a non-negative `value` up to the next multiple of `multiple`.
+ *
+ * A `multiple` of zero is treated as "no alignment requested" and `value`
+ * is returned unchanged, as is a `value` that is already aligned.
+ */
+static size_t round_up(int64_t value, int64_t multiple) {
+    int64_t aligned = value;
+
+    if (multiple != 0) {
+        // Distance of `value` past the previous multiple.
+        int64_t excess = value % multiple;
+        if (excess != 0) {
+            // Step forward by whatever is missing to reach the next one.
+            aligned = value + (multiple - excess);
+        }
+    }
+
+    return aligned;
+}
+
+
+/* Write all `size` bytes of `buffer` to the jitdump stream, retrying short
+ * writes and giving up silently once fwrite() makes no progress. */
+static void perf_map_jit_write_fully(const void* buffer, size_t size) {
+    FILE* stream = perf_jit_map_state.perf_map;
+    const char* cursor = (const char*)buffer;
+    for (size_t n; size > 0; size -= n, cursor += n) {
+        n = fwrite(cursor, 1, size, stream);
+        if (n == 0) {
+            UNREACHABLE();
+            break;
+        }
+    }
+}
+
+/* Emit the jitdump header for `pid`; `out_file` is unused (writes go via perf_jit_map_state). */
+static void perf_map_jit_write_header(int pid, FILE* out_file) {
+    Header header = {0};  // also clears `reserved`, which used to be written uninitialised
+    header.magic = 0x4A695444;
+    header.version = 1;
+    header.size = sizeof(Header);
+    header.elf_mach_target = GetElfMachineArchitecture();
+    header.process_id = pid;
+    header.time_stamp = get_current_time_microseconds();
+    perf_map_jit_write_fully(&header, sizeof(header));
+}
+
+/* Open /tmp/jit-<pid>.dump, map its first page and write the jitdump header; NULL on failure. */
+static void* perf_map_jit_init(void) {
+    char filename[100];
+    int pid = getpid();
+    snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
+    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
+    if (fd == -1) {
+        return NULL;
+    }
+
+    const long page_size = sysconf(_SC_PAGESIZE);  // NOLINT(runtime/int)
+    if (page_size == -1) {
+        close(fd);
+        return NULL;
+    }
+
+    // The perf jit interface forces us to map the first page of the file
+    // to signal that we are using the interface.
+    void* mapping = mmap(NULL, page_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0);
+    if (mapping == MAP_FAILED) {  // mmap() reports failure as MAP_FAILED, not NULL
+        close(fd);
+        return NULL;
+    }
+    FILE* stream = fdopen(fd, "w+");
+    PyThread_type_lock lock = (stream != NULL) ? PyThread_allocate_lock() : NULL;
+    if (lock == NULL) {
+        // Undo everything so no stale stream or mapping is left in the state.
+        if (stream != NULL) { fclose(stream); } else { close(fd); }
+        munmap(mapping, page_size);
+        return NULL;
+    }
+    perf_jit_map_state.mapped_buffer = mapping;
+    perf_jit_map_state.mapped_size = page_size;
+    perf_jit_map_state.perf_map = stream;
+    perf_jit_map_state.map_lock = lock;
+    perf_jit_map_state.code_id = 0;
+    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
+    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
+    // trampoline_api.code_padding = PERF_JIT_CODE_PADDING;
+    return &perf_jit_map_state;
+}
+
+/* DWARF definitions. */
+
+#define DWRF_CIE_VERSION 1
+
+enum {
+    DWRF_CFA_nop = 0x0,
+    DWRF_CFA_offset_extended = 0x5,
+    DWRF_CFA_def_cfa = 0xc,
+    DWRF_CFA_def_cfa_offset = 0xe,
+    DWRF_CFA_offset_extended_sf = 0x11,
+    DWRF_CFA_advance_loc = 0x40,
+    DWRF_CFA_offset = 0x80
+};
+
+enum
+  {
+    DWRF_EH_PE_absptr = 0x00,
+    DWRF_EH_PE_omit = 0xff,
+
+    /* FDE data encoding.  */
+    DWRF_EH_PE_uleb128 = 0x01,
+    DWRF_EH_PE_udata2 = 0x02,
+    DWRF_EH_PE_udata4 = 0x03,
+    DWRF_EH_PE_udata8 = 0x04,
+    DWRF_EH_PE_sleb128 = 0x09,
+    DWRF_EH_PE_sdata2 = 0x0a,
+    DWRF_EH_PE_sdata4 = 0x0b,
+    DWRF_EH_PE_sdata8 = 0x0c,
+    DWRF_EH_PE_signed = 0x08,
+
+    /* FDE flags.  */
+    DWRF_EH_PE_pcrel = 0x10,
+    DWRF_EH_PE_textrel = 0x20,
+    DWRF_EH_PE_datarel = 0x30,
+    DWRF_EH_PE_funcrel = 0x40,
+    DWRF_EH_PE_aligned = 0x50,
+
+    DWRF_EH_PE_indirect = 0x80
+  };
+
+enum { DWRF_TAG_compile_unit = 0x11 };
+
+enum { DWRF_children_no = 0, DWRF_children_yes = 1 };
+
+enum { DWRF_AT_name = 0x03, DWRF_AT_stmt_list = 0x10, DWRF_AT_low_pc = 0x11, DWRF_AT_high_pc = 0x12 };
+
+enum { DWRF_FORM_addr = 0x01, DWRF_FORM_data4 = 0x06, DWRF_FORM_string = 0x08 };
+
+enum { DWRF_LNS_extended_op = 0, DWRF_LNS_copy = 1, DWRF_LNS_advance_pc = 2, DWRF_LNS_advance_line = 3 };
+
+enum { DWRF_LNE_end_sequence = 1, DWRF_LNE_set_address = 2 };
+
+enum {
+#ifdef __x86_64__
+    /* Yes, the order is strange, but correct. */
+    DWRF_REG_AX,
+    DWRF_REG_DX,
+    DWRF_REG_CX,
+    DWRF_REG_BX,
+    DWRF_REG_SI,
+    DWRF_REG_DI,
+    DWRF_REG_BP,
+    DWRF_REG_SP,
+    DWRF_REG_8,
+    DWRF_REG_9,
+    DWRF_REG_10,
+    DWRF_REG_11,
+    DWRF_REG_12,
+    DWRF_REG_13,
+    DWRF_REG_14,
+    DWRF_REG_15,
+    DWRF_REG_RA,
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+    DWRF_REG_SP = 31,
+    DWRF_REG_RA = 30,
+#else
+#    error "Unsupported target architecture"
+#endif
+};
+
+typedef struct ELFObjectContext
+{
+    uint8_t* p; /* Pointer to next address in obj.space. */
+    uint8_t* startp; /* Pointer to start address in obj.space. */
+    uint8_t* eh_frame_p; /* Position just after the CIE, where the FDE starts (set by elf_init_ehframe). */
+    uint32_t code_size; /* Size of machine code. */
+} ELFObjectContext;
+
+/* Copy `str`, terminator included, to ctx->p; return its offset from ctx->startp. */
+static uint32_t
+elfctx_append_string(ELFObjectContext* ctx, const char* str)
+{
+    uint8_t* out = ctx->p;
+    const uint32_t start_offset = (uint32_t)(out - ctx->startp);
+    while ((*out++ = (uint8_t)*str++) != 0) {
+        /* keep copying until the NUL itself has been stored */
+    }
+    ctx->p = out;
+    return start_offset;
+}
+
+/* Encode `v` as SLEB128 at ctx->p, seven bits per byte from the low end,
+ * and leave ctx->p just past the last byte written. */
+static void
+elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v)
+{
+    while ((uint32_t)(v + 0x40) >= 0x80) {  /* v is still outside [-64, 63] */
+        *ctx->p++ = (uint8_t)((v & 0x7f) | 0x80);
+        v >>= 7;
+    }
+    *ctx->p++ = (uint8_t)(v & 0x7f);
+}
+
+/* Encode `v` as ULEB128 at ctx->p, seven bits per byte from the low end,
+ * and leave ctx->p just past the last byte written. */
+static void
+elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v)
+{
+    while (v >= 0x80) {
+        *ctx->p++ = (uint8_t)((v & 0x7f) | 0x80);
+        v >>= 7;
+    }
+    *ctx->p++ = (uint8_t)v;
+}
+
+/* Shortcuts to generate DWARF structures. */
+#define DWRF_U8(x) (*p++ = (x))
+#define DWRF_I8(x) (*(int8_t*)p = (x), p++)
+#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)
+#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)
+#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t))
+#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p)
+#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p)
+#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p)
+#define DWRF_ALIGNNOP(s)                                                                                \
+    while ((uintptr_t)p & ((s)-1)) {                                                                    \
+        *p++ = DWRF_CFA_nop;                                                                            \
+    }
+#define DWRF_SECTION(name, stmt)                                                                        \
+    {                                                                                                   \
+        uint32_t* szp_##name = (uint32_t*)p;                                                            \
+        p += 4;                                                                                         \
+        stmt;                                                                                           \
+        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4);                                       \
+    }
+
+/* Emit a CIE and one FDE at ctx->p; sets ctx->eh_frame_p to the FDE start and advances ctx->p. */
+static void
+elf_init_ehframe(ELFObjectContext* ctx)
+{
+    uint8_t* p = ctx->p;
+    uint8_t* framep = p; /* Start of the CIE; the FDE's CIE offset is measured from here. */
+
+    /* Emit DWARF EH CIE. */
+    DWRF_SECTION(CIE, DWRF_U32(0); /* Offset to CIE itself. */
+                 DWRF_U8(DWRF_CIE_VERSION);
+                 DWRF_STR("zR"); /* Augmentation. */
+                 DWRF_UV(1); /* Code alignment factor. */
+                 DWRF_SV(-(int64_t)sizeof(uintptr_t)); /* Data alignment factor. */
+                 DWRF_U8(DWRF_REG_RA); /* Return address register. */
+                 DWRF_UV(1);
+                 DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); /* Augmentation data. */
+                 DWRF_U8(DWRF_CFA_def_cfa); DWRF_UV(DWRF_REG_SP); DWRF_UV(sizeof(uintptr_t));
+                 DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); DWRF_UV(1);
+                 DWRF_ALIGNNOP(sizeof(uintptr_t));
+    )
+
+    ctx->eh_frame_p = p; /* The caller derives the CIE size from this pointer. */
+
+    /* Emit DWARF EH FDE. */
+    DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); /* Offset to CIE. */
+                 DWRF_U32(-0x30); /* Machine code offset relative to .text. */
+                 DWRF_U32(ctx->code_size); /* Machine code length. */
+                 DWRF_U8(0); /* Augmentation data. */
+    /* Registers saved in CFRAME. */
+#ifdef __x86_64__
+                 DWRF_U8(DWRF_CFA_advance_loc | 4);
+                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
+                 DWRF_U8(DWRF_CFA_advance_loc | 6);
+                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(8);
+    /* Extra registers saved for JIT-compiled code. */
+#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+                 DWRF_U8(DWRF_CFA_advance_loc | 1);
+                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
+                 DWRF_U8(DWRF_CFA_offset | 29); DWRF_UV(2);
+                 DWRF_U8(DWRF_CFA_offset | 30); DWRF_UV(1);
+                 DWRF_U8(DWRF_CFA_advance_loc | 3);
+                 DWRF_U8(DWRF_CFA_offset | -(64 - 29));
+                 DWRF_U8(DWRF_CFA_offset | -(64 - 30));
+                 DWRF_U8(DWRF_CFA_def_cfa_offset);
+                 DWRF_UV(0);
+#else
+#    error "Unsupported target architecture"
+#endif
+                 DWRF_ALIGNNOP(sizeof(uintptr_t));)
+
+    ctx->p = p;
+}
+
+/* Emit the jitdump records for one trampoline: an unwinding-info event, then
+ * a code-load event named "py::<qualname>:<filename>" plus the code bytes. */
+static void perf_map_jit_write_entry(void *state, const void *code_addr,
+                         unsigned int code_size, PyCodeObject *co)
+{
+    // The dump file is opened lazily on the first entry.
+    if (perf_jit_map_state.perf_map == NULL) {
+        void* ret = perf_map_jit_init();
+        if(ret == NULL){
+            return;
+        }
+    }
+
+    const char *entry = "";
+    if (co->co_qualname != NULL) {
+        entry = PyUnicode_AsUTF8(co->co_qualname);
+    }
+    const char *filename = "";
+    if (co->co_filename != NULL) {
+        filename = PyUnicode_AsUTF8(co->co_filename);
+    }
+    // PyUnicode_AsUTF8() returns NULL on failure; "%s" must never see NULL.
+    entry = (entry != NULL) ? entry : "";
+    filename = (filename != NULL) ? filename : "";
+
+    size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
+    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
+    if (perf_map_entry == NULL) {
+        return;
+    }
+    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
+
+    const size_t name_length = strlen(perf_map_entry);
+    uword base = (uword)code_addr;
+    uword size = code_size;
+
+    // Write the code unwinding info event.
+    // Create unwinding information (eh frame)
+    ELFObjectContext ctx;
+    char buffer[1024];
+    ctx.code_size = code_size;
+    ctx.startp = ctx.p = (uint8_t*)buffer;
+    elf_init_ehframe(&ctx);
+    int eh_frame_size = ctx.p - ctx.startp;
+
+    // Populate the unwind info event for perf
+    CodeUnwindingInfoEvent ev2;
+    ev2.base.event = PerfUnwindingInfo;
+    ev2.base.time_stamp = get_current_monotonic_ticks();
+    ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
+    // Ensure we have enough space between DSOs when perf maps them
+    assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING);
+    ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
+    ev2.mapped_size = round_up(ev2.unwind_data_size, 16);
+    int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size;
+    int padding_size = round_up(content_size, 8) - content_size;
+    ev2.base.size = content_size + padding_size;
+    perf_map_jit_write_fully(&ev2, sizeof(ev2));
+
+    // Populate the eh Frame header
+    EhFrameHeader f;
+    f.version = 1;
+    f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel;
+    f.fde_count_enc = DwarfUData4;
+    f.table_enc = DwarfSData4 | DwarfDataRel;
+    f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char));
+    f.eh_fde_count = 1;
+    f.from = -(round_up(code_size, 8) + eh_frame_size);
+    int cie_size = ctx.eh_frame_p - ctx.startp;
+    f.to = -(eh_frame_size - cie_size);
+    perf_map_jit_write_fully(ctx.startp, eh_frame_size);
+    perf_map_jit_write_fully(&f, sizeof(f));
+    char padding_bytes[] = "\0\0\0\0\0\0\0\0";
+    perf_map_jit_write_fully(&padding_bytes, padding_size);
+
+    // Write the code load event.
+    CodeLoadEvent ev;
+    ev.base.event = PerfLoad;
+    ev.base.size = sizeof(ev) + (name_length+1) + size;
+    ev.base.time_stamp = get_current_monotonic_ticks();
+    ev.process_id = getpid();
+    ev.thread_id = gettid();
+    ev.vma = base;
+    ev.code_address = base;
+    ev.code_size = size;
+    perf_jit_map_state.code_id += 1;
+    ev.code_id = perf_jit_map_state.code_id;
+
+    perf_map_jit_write_fully(&ev, sizeof(ev));
+    perf_map_jit_write_fully(perf_map_entry, name_length+1);
+    perf_map_jit_write_fully((void*)(base), size);
+    PyMem_RawFree(perf_map_entry);  // the name buffer used to leak on every call
+}
+
+/* Close the dump stream, free its lock and unmap the marker page (safe to call twice). */
+static int perf_map_jit_fini(void* state) {
+    if (perf_jit_map_state.perf_map != NULL) {
+        // Close under the lock, then drop the lock and forget the stream.
+        PyThread_acquire_lock(perf_jit_map_state.map_lock, 1);
+        fclose(perf_jit_map_state.perf_map);
+        PyThread_release_lock(perf_jit_map_state.map_lock);
+        PyThread_free_lock(perf_jit_map_state.map_lock);
+        perf_jit_map_state.perf_map = NULL;
+    }
+    if (perf_jit_map_state.mapped_buffer != NULL && perf_jit_map_state.mapped_buffer != MAP_FAILED) {
+        munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
+    }
+    perf_jit_map_state.mapped_buffer = NULL;  // never unmap the same page twice
+    trampoline_api.state = NULL;
+    return 0;
+}
+
+_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {  // jitdump backend callbacks
+    &perf_map_jit_init,  // opens the dump file and returns the shared state
+    &perf_map_jit_write_entry,  // appends the records for one code entry
+    &perf_map_jit_fini,  // releases the file, lock and mapping
+};
+
+#endif
diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c
index 750ba18..f144f7d 100644
--- a/Python/perf_trampoline.c
+++ b/Python/perf_trampoline.c
@@ -143,6 +143,8 @@ any DWARF information available for them).
 #include <sys/mman.h>             // mmap()
 #include <sys/types.h>
 #include <unistd.h>               // sysconf()
+#include <sys/time.h>           // gettimeofday()
+
 
 #if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
 #define PY_HAVE_INVALIDATE_ICACHE
@@ -187,12 +189,19 @@ struct code_arena_st {
 typedef struct code_arena_st code_arena_t;
 typedef struct trampoline_api_st trampoline_api_t;
 
+enum perf_trampoline_type {  // values stored in _PyRuntime.ceval.perf.perf_trampoline_type
+    PERF_TRAMPOLINE_UNSET = 0,  // restored by _PyPerfTrampoline_Fini()
+    PERF_TRAMPOLINE_TYPE_MAP = 1,  // set by perf_map_init_state()
+    PERF_TRAMPOLINE_TYPE_JITDUMP = 2,  // NOTE(review): never assigned in the visible hunks
+};
+
 #define perf_status _PyRuntime.ceval.perf.status
 #define extra_code_index _PyRuntime.ceval.perf.extra_code_index
 #define perf_code_arena _PyRuntime.ceval.perf.code_arena
 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api
 #define perf_map_file _PyRuntime.ceval.perf.map_file
 #define persist_after_fork _PyRuntime.ceval.perf.persist_after_fork
+#define perf_trampoline_type _PyRuntime.ceval.perf.perf_trampoline_type
 
 static void
 perf_map_write_entry(void *state, const void *code_addr,
@@ -220,6 +229,8 @@ static void*
 perf_map_init_state(void)
 {
     PyUnstable_PerfMapState_Init();
+    trampoline_api.code_padding = 0;
+    perf_trampoline_type = PERF_TRAMPOLINE_TYPE_MAP;
     return NULL;
 }
 
@@ -236,6 +247,30 @@ _PyPerf_Callbacks _Py_perfmap_callbacks = {
     &perf_map_free_state,
 };
 
+
+/*
+ * Round a non-negative `value` up to the next multiple of `multiple`.
+ *
+ * A `multiple` of zero is treated as "no alignment requested" and `value`
+ * is returned unchanged, as is a `value` that is already aligned.
+ */
+static size_t round_up(int64_t value, int64_t multiple) {
+    int64_t aligned = value;
+
+    if (multiple != 0) {
+        // Distance of `value` past the previous multiple.
+        int64_t excess = value % multiple;
+        if (excess != 0) {
+            // Step forward by whatever is missing to reach the next one.
+            aligned = value + (multiple - excess);
+        }
+    }
+
+    return aligned;
+}
+
+// TRAMPOLINE MANAGEMENT API
+
 static int
 new_code_arena(void)
 {
@@ -256,6 +291,7 @@ new_code_arena(void)
     void *start = &_Py_trampoline_func_start;
     void *end = &_Py_trampoline_func_end;
     size_t code_size = end - start;
+    size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
     // TODO: Check the effect of alignment of the code chunks. Initial investigation
     // showed that this has no effect on performance in x86-64 or aarch64 and the current
     // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
@@ -264,9 +300,9 @@ new_code_arena(void)
     // measurable performance improvement by rounding trampolines up to 32-bit
     // or 64-bit alignment.
 
-    size_t n_copies = mem_size / code_size;
+    size_t n_copies = mem_size / chunk_size;
     for (size_t i = 0; i < n_copies; i++) {
-        memcpy(memory + i * code_size, start, code_size * sizeof(char));
+        memcpy(memory + i * chunk_size, start, code_size * sizeof(char));
     }
     // Some systems may prevent us from creating executable code on the fly.
     int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
@@ -320,16 +356,18 @@ static inline py_trampoline
 code_arena_new_code(code_arena_t *code_arena)
 {
     py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
-    code_arena->size_left -= code_arena->code_size;
-    code_arena->current_addr += code_arena->code_size;
+    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
+    code_arena->size_left -= total_code_size;
+    code_arena->current_addr += total_code_size;
     return trampoline;
 }
 
 static inline py_trampoline
 compile_trampoline(void)
 {
+    size_t total_code_size = round_up(perf_code_arena->code_size + trampoline_api.code_padding, 16);
     if ((perf_code_arena == NULL) ||
-        (perf_code_arena->size_left <= perf_code_arena->code_size)) {
+        (perf_code_arena->size_left <= total_code_size)) {
         if (new_code_arena() < 0) {
             return NULL;
         }
@@ -480,6 +518,7 @@ _PyPerfTrampoline_Fini(void)
     }
     if (perf_status == PERF_STATUS_OK) {
         trampoline_api.free_state(trampoline_api.state);
+        perf_trampoline_type = PERF_TRAMPOLINE_UNSET;
     }
     extra_code_index = -1;
     perf_status = PERF_STATUS_NO_INIT;
@@ -508,6 +547,9 @@ _PyPerfTrampoline_AfterFork_Child(void)
 {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     if (persist_after_fork) {
+        if (perf_trampoline_type != PERF_TRAMPOLINE_TYPE_MAP) {
+            return PyStatus_Error("Failed to copy perf map file as perf trampoline type is not type map.");
+        }
         _PyPerfTrampoline_Fini();
         char filename[256];
         pid_t parent_pid = getppid();
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 9dc6e3f..f24b048 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1210,7 +1210,14 @@ init_interp_main(PyThreadState *tstate)
 
 #ifdef PY_HAVE_PERF_TRAMPOLINE
         if (config->perf_profiling) {
-            if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 ||
+            _PyPerf_Callbacks *cur_cb;
+            if (config->perf_profiling == 1) {
+                cur_cb = &_Py_perfmap_callbacks;
+            }
+            else {
+                cur_cb = &_Py_perfmap_jit_callbacks;
+            }
+            if (_PyPerfTrampoline_SetCallbacks(cur_cb) < 0 ||
                     _PyPerfTrampoline_Init(config->perf_profiling) < 0) {
                 return _PyStatus_ERR("can't initialize the perf trampoline");
             }
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index bd7f821..17c4a5f 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2282,6 +2282,16 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
                 return NULL;
             }
         }
+        else if (strcmp(backend, "perfjit") == 0) {
+            _PyPerf_Callbacks cur_cb;
+            _PyPerfTrampoline_GetCallbacks(&cur_cb);
+            if (cur_cb.write_state != _Py_perfmap_jit_callbacks.write_state) {
+                if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_jit_callbacks) < 0 ) {
+                    PyErr_SetString(PyExc_ValueError, "can't activate perf jit trampoline");
+                    return NULL;
+                }
+            }
+        }
     }
     else {
         PyErr_Format(PyExc_ValueError, "invalid backend: %s", backend);