From 1b7e5e6e60e0d22b2a928cbbb36ebb989183450f Mon Sep 17 00:00:00 2001
From: Brandt Bucher
Date: Fri, 3 May 2024 16:41:07 -0700
Subject: GH-113464: Generate a more efficient JIT (GH-118512)

---
 Python/jit.c           | 587 +++++++++++++++++++++++--------------------
 Tools/jit/_stencils.py | 116 +++++++++-
 Tools/jit/_writer.py   | 107 +++------
 3 files changed, 418 insertions(+), 392 deletions(-)

diff --git a/Python/jit.c b/Python/jit.c
index df14e48..7c316a4 100644
--- a/Python/jit.c
+++ b/Python/jit.c
@@ -16,8 +16,6 @@
 #include "pycore_sliceobject.h"
 #include "pycore_jit.h"
 
-#include "jit_stencils.h"
-
 // Memory management stuff: ////////////////////////////////////////////////////
 
 #ifndef MS_WINDOWS
@@ -146,256 +144,275 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
 #define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000)
 #define IS_AARCH64_MOV(I)        (((I) & 0x9F800000) == 0x92800000)
 
-// Fill all of stencil's holes in the memory pointed to by base, using the
-// values in patches.
-static void
-patch(unsigned char *base, const Stencil *stencil, uintptr_t patches[])
+// LLD is a great reference for performing relocations... just keep in
+// mind that Tools/jit/build.py does filtering and preprocessing for us!
+// Here's a good place to start for each platform:
+// - aarch64-apple-darwin:
+//   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp
+//   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
+//   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
+// - aarch64-pc-windows-msvc:
+//   - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp
+// - aarch64-unknown-linux-gnu:
+//   - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/AArch64.cpp
+// - i686-pc-windows-msvc:
+//   - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp
+// - x86_64-apple-darwin:
+//   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/X86_64.cpp
+// - x86_64-pc-windows-msvc:
+//   - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp
+// - x86_64-unknown-linux-gnu:
+//   - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp
+
+// Many of these patches are "relaxing", meaning that they can rewrite the
+// code they're patching to be more efficient (like turning a 64-bit memory
+// load into a 32-bit immediate load). These patches have an "x" in their name.
+// Relative patches have an "r" in their name.
+
+// 32-bit absolute address.
+void
+patch_32(unsigned char *location, uint64_t value)
 {
-    for (size_t i = 0; i < stencil->holes_size; i++) {
-        const Hole *hole = &stencil->holes[i];
-        unsigned char *location = base + hole->offset;
-        uint64_t value = patches[hole->value] + (uintptr_t)hole->symbol + hole->addend;
-        uint8_t *loc8 = (uint8_t *)location;
-        uint32_t *loc32 = (uint32_t *)location;
-        uint64_t *loc64 = (uint64_t *)location;
-        // LLD is a great reference for performing relocations... just keep in
-        // mind that Tools/jit/build.py does filtering and preprocessing for us!
-        // Here's a good place to start for each platform:
-        // - aarch64-apple-darwin:
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
-        // - aarch64-pc-windows-msvc:
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp
-        // - aarch64-unknown-linux-gnu:
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/AArch64.cpp
-        // - i686-pc-windows-msvc:
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp
-        // - x86_64-apple-darwin:
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/X86_64.cpp
-        // - x86_64-pc-windows-msvc:
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp
-        // - x86_64-unknown-linux-gnu:
-        //   - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp
-        switch (hole->kind) {
-            case HoleKind_IMAGE_REL_I386_DIR32:
-                // 32-bit absolute address.
-                // Check that we're not out of range of 32 unsigned bits:
-                assert(value < (1ULL << 32));
-                *loc32 = (uint32_t)value;
-                continue;
-            case HoleKind_ARM64_RELOC_UNSIGNED:
-            case HoleKind_R_AARCH64_ABS64:
-            case HoleKind_X86_64_RELOC_UNSIGNED:
-            case HoleKind_R_X86_64_64:
-                // 64-bit absolute address.
-                *loc64 = value;
-                continue;
-            case HoleKind_IMAGE_REL_AMD64_REL32:
-            case HoleKind_IMAGE_REL_I386_REL32:
-            case HoleKind_R_X86_64_GOTPCRELX:
-            case HoleKind_R_X86_64_REX_GOTPCRELX:
-            case HoleKind_X86_64_RELOC_GOT:
-            case HoleKind_X86_64_RELOC_GOT_LOAD: {
-                // 32-bit relative address.
-                // Try to relax the GOT load into an immediate value:
-                uint64_t relaxed = *(uint64_t *)(value + 4) - 4;
-                if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
-                    (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
-                {
-                    if (loc8[-2] == 0x8B) {
-                        // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX]
-                        loc8[-2] = 0x8D;
-                        value = relaxed;
-                    }
-                    else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) {
-                        // call qword ptr [rip + AAA] -> nop; call XXX
-                        loc8[-2] = 0x90;
-                        loc8[-1] = 0xE8;
-                        value = relaxed;
-                    }
-                    else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) {
-                        // jmp qword ptr [rip + AAA] -> nop; jmp XXX
-                        loc8[-2] = 0x90;
-                        loc8[-1] = 0xE9;
-                        value = relaxed;
-                    }
-                }
-            }
-            // Fall through...
-            case HoleKind_R_X86_64_GOTPCREL:
-            case HoleKind_R_X86_64_PC32:
-            case HoleKind_X86_64_RELOC_SIGNED:
-            case HoleKind_X86_64_RELOC_BRANCH:
-                // 32-bit relative address.
-                value -= (uintptr_t)location;
-                // Check that we're not out of range of 32 signed bits:
-                assert((int64_t)value >= -(1LL << 31));
-                assert((int64_t)value < (1LL << 31));
-                *loc32 = (uint32_t)value;
-                continue;
-            case HoleKind_ARM64_RELOC_BRANCH26:
-            case HoleKind_IMAGE_REL_ARM64_BRANCH26:
-            case HoleKind_R_AARCH64_CALL26:
-            case HoleKind_R_AARCH64_JUMP26:
-                // 28-bit relative branch.
-                assert(IS_AARCH64_BRANCH(*loc32));
-                value -= (uintptr_t)location;
-                // Check that we're not out of range of 28 signed bits:
-                assert((int64_t)value >= -(1 << 27));
-                assert((int64_t)value < (1 << 27));
-                // Since instructions are 4-byte aligned, only use 26 bits:
-                assert(get_bits(value, 0, 2) == 0);
-                set_bits(loc32, 0, value, 2, 26);
-                continue;
-            case HoleKind_R_AARCH64_MOVW_UABS_G0_NC:
-                // 16-bit low part of an absolute address.
-                assert(IS_AARCH64_MOV(*loc32));
-                // Check the implicit shift (this is "part 0 of 3"):
-                assert(get_bits(*loc32, 21, 2) == 0);
-                set_bits(loc32, 5, value, 0, 16);
-                continue;
-            case HoleKind_R_AARCH64_MOVW_UABS_G1_NC:
-                // 16-bit middle-low part of an absolute address.
-                assert(IS_AARCH64_MOV(*loc32));
-                // Check the implicit shift (this is "part 1 of 3"):
-                assert(get_bits(*loc32, 21, 2) == 1);
-                set_bits(loc32, 5, value, 16, 16);
-                continue;
-            case HoleKind_R_AARCH64_MOVW_UABS_G2_NC:
-                // 16-bit middle-high part of an absolute address.
-                assert(IS_AARCH64_MOV(*loc32));
-                // Check the implicit shift (this is "part 2 of 3"):
-                assert(get_bits(*loc32, 21, 2) == 2);
-                set_bits(loc32, 5, value, 32, 16);
-                continue;
-            case HoleKind_R_AARCH64_MOVW_UABS_G3:
-                // 16-bit high part of an absolute address.
-                assert(IS_AARCH64_MOV(*loc32));
-                // Check the implicit shift (this is "part 3 of 3"):
-                assert(get_bits(*loc32, 21, 2) == 3);
-                set_bits(loc32, 5, value, 48, 16);
-                continue;
-            case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21:
-            case HoleKind_IMAGE_REL_ARM64_PAGEBASE_REL21:
-            case HoleKind_R_AARCH64_ADR_GOT_PAGE:
-            case HoleKind_R_AARCH64_ADR_PREL_PG_HI21:
-                // 21-bit count of pages between this page and an absolute address's
-                // page... I know, I know, it's weird. Pairs nicely with
-                // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below).
-                assert(IS_AARCH64_ADRP(*loc32));
-                // Try to relax the pair of GOT loads into an immediate value:
-                const Hole *next_hole = &stencil->holes[i + 1];
-                if (i + 1 < stencil->holes_size &&
-                    (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 ||
-                     next_hole->kind == HoleKind_IMAGE_REL_ARM64_PAGEOFFSET_12L ||
-                     next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) &&
-                    next_hole->offset == hole->offset + 4 &&
-                    next_hole->symbol == hole->symbol &&
-                    next_hole->addend == hole->addend &&
-                    next_hole->value == hole->value)
-                {
-                    unsigned char reg = get_bits(loc32[0], 0, 5);
-                    assert(IS_AARCH64_LDR_OR_STR(loc32[1]));
-                    // There should be only one register involved:
-                    assert(reg == get_bits(loc32[1], 0, 5));  // ldr's output register.
-                    assert(reg == get_bits(loc32[1], 5, 5));  // ldr's input register.
-                    uint64_t relaxed = *(uint64_t *)value;
-                    if (relaxed < (1UL << 16)) {
-                        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
-                        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg;
-                        loc32[1] = 0xD503201F;
-                        i++;
-                        continue;
-                    }
-                    if (relaxed < (1ULL << 32)) {
-                        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
-                        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg;
-                        loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg;
-                        i++;
-                        continue;
-                    }
-                    relaxed = value - (uintptr_t)location;
-                    if ((relaxed & 0x3) == 0 &&
-                        (int64_t)relaxed >= -(1L << 19) &&
-                        (int64_t)relaxed < (1L << 19))
-                    {
-                        // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop
-                        loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg;
-                        loc32[1] = 0xD503201F;
-                        i++;
-                        continue;
-                    }
-                }
-                // Fall through...
-            case HoleKind_ARM64_RELOC_PAGE21:
-                // Number of pages between this page and the value's page:
-                value = (value >> 12) - ((uintptr_t)location >> 12);
-                // Check that we're not out of range of 21 signed bits:
-                assert((int64_t)value >= -(1 << 20));
-                assert((int64_t)value < (1 << 20));
-                // value[0:2] goes in loc[29:31]:
-                set_bits(loc32, 29, value, 0, 2);
-                // value[2:21] goes in loc[5:26]:
-                set_bits(loc32, 5, value, 2, 19);
-                continue;
-            case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12:
-            case HoleKind_ARM64_RELOC_PAGEOFF12:
-            case HoleKind_IMAGE_REL_ARM64_PAGEOFFSET_12A:
-            case HoleKind_IMAGE_REL_ARM64_PAGEOFFSET_12L:
-            case HoleKind_R_AARCH64_ADD_ABS_LO12_NC:
-            case HoleKind_R_AARCH64_LD64_GOT_LO12_NC:
-                // 12-bit low part of an absolute address. Pairs nicely with
-                // ARM64_RELOC_GOT_LOAD_PAGE21 (above).
-                assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32));
-                // There might be an implicit shift encoded in the instruction:
-                uint8_t shift = 0;
-                if (IS_AARCH64_LDR_OR_STR(*loc32)) {
-                    shift = (uint8_t)get_bits(*loc32, 30, 2);
-                    // If both of these are set, the shift is supposed to be 4.
-                    // That's pretty weird, and it's never actually been observed...
-                    assert(get_bits(*loc32, 23, 1) == 0 || get_bits(*loc32, 26, 1) == 0);
-                }
-                value = get_bits(value, 0, 12);
-                assert(get_bits(value, 0, shift) == 0);
-                set_bits(loc32, 10, value, shift, 12);
-                continue;
-        }
-        Py_UNREACHABLE();
-    }
+    uint32_t *loc32 = (uint32_t *)location;
+    // Check that we're not out of range of 32 unsigned bits:
+    assert(value < (1ULL << 32));
+    *loc32 = (uint32_t)value;
+}
+
+// 32-bit relative address.
+void
+patch_32r(unsigned char *location, uint64_t value)
+{
+    uint32_t *loc32 = (uint32_t *)location;
+    value -= (uintptr_t)location;
+    // Check that we're not out of range of 32 signed bits:
+    assert((int64_t)value >= -(1LL << 31));
+    assert((int64_t)value < (1LL << 31));
+    *loc32 = (uint32_t)value;
+}
+
+// 64-bit absolute address.
+void
+patch_64(unsigned char *location, uint64_t value)
+{
+    uint64_t *loc64 = (uint64_t *)location;
+    *loc64 = value;
+}
+
+// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r
+// (below).
+void
+patch_aarch64_12(unsigned char *location, uint64_t value)
+{
+    uint32_t *loc32 = (uint32_t *)location;
+    assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32));
+    // There might be an implicit shift encoded in the instruction:
+    uint8_t shift = 0;
+    if (IS_AARCH64_LDR_OR_STR(*loc32)) {
+        shift = (uint8_t)get_bits(*loc32, 30, 2);
+        // If both of these are set, the shift is supposed to be 4.
+        // That's pretty weird, and it's never actually been observed...
+        assert(get_bits(*loc32, 23, 1) == 0 || get_bits(*loc32, 26, 1) == 0);
+    }
+    value = get_bits(value, 0, 12);
+    assert(get_bits(value, 0, shift) == 0);
+    set_bits(loc32, 10, value, shift, 12);
 }
 
-static void
-copy_and_patch(unsigned char *base, const Stencil *stencil, uintptr_t patches[])
+// Relaxable 12-bit low part of an absolute address. Pairs nicely with
+// patch_aarch64_21rx (below).
+void
+patch_aarch64_12x(unsigned char *location, uint64_t value)
 {
-    memcpy(base, stencil->body, stencil->body_size);
-    patch(base, stencil, patches);
+    // This can *only* be relaxed if it occurs immediately before a matching
+    // patch_aarch64_21rx. If that happens, the JIT build step will replace both
+    // calls with a single call to patch_aarch64_33rx. Otherwise, we end up
+    // here, and the instruction is patched normally:
+    patch_aarch64_12(location, value);
 }
 
-static void
-emit(const StencilGroup *group, uintptr_t patches[])
+// 16-bit low part of an absolute address.
+void
+patch_aarch64_16a(unsigned char *location, uint64_t value)
+{
+    uint32_t *loc32 = (uint32_t *)location;
+    assert(IS_AARCH64_MOV(*loc32));
+    // Check the implicit shift (this is "part 0 of 3"):
+    assert(get_bits(*loc32, 21, 2) == 0);
+    set_bits(loc32, 5, value, 0, 16);
+}
+
+// 16-bit middle-low part of an absolute address.
+void
+patch_aarch64_16b(unsigned char *location, uint64_t value)
+{
+    uint32_t *loc32 = (uint32_t *)location;
+    assert(IS_AARCH64_MOV(*loc32));
+    // Check the implicit shift (this is "part 1 of 3"):
+    assert(get_bits(*loc32, 21, 2) == 1);
+    set_bits(loc32, 5, value, 16, 16);
+}
+
+// 16-bit middle-high part of an absolute address.
+void
+patch_aarch64_16c(unsigned char *location, uint64_t value)
 {
-    copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches);
-    copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches);
+    uint32_t *loc32 = (uint32_t *)location;
+    assert(IS_AARCH64_MOV(*loc32));
+    // Check the implicit shift (this is "part 2 of 3"):
+    assert(get_bits(*loc32, 21, 2) == 2);
+    set_bits(loc32, 5, value, 32, 16);
 }
 
+// 16-bit high part of an absolute address.
+void
+patch_aarch64_16d(unsigned char *location, uint64_t value)
+{
+    uint32_t *loc32 = (uint32_t *)location;
+    assert(IS_AARCH64_MOV(*loc32));
+    // Check the implicit shift (this is "part 3 of 3"):
+    assert(get_bits(*loc32, 21, 2) == 3);
+    set_bits(loc32, 5, value, 48, 16);
+}
+
+// 21-bit count of pages between this page and an absolute address's page... I
+// know, I know, it's weird. Pairs nicely with patch_aarch64_12 (above).
+void
+patch_aarch64_21r(unsigned char *location, uint64_t value)
+{
+    uint32_t *loc32 = (uint32_t *)location;
+    value = (value >> 12) - ((uintptr_t)location >> 12);
+    // Check that we're not out of range of 21 signed bits:
+    assert((int64_t)value >= -(1 << 20));
+    assert((int64_t)value < (1 << 20));
+    // value[0:2] goes in loc[29:31]:
+    set_bits(loc32, 29, value, 0, 2);
+    // value[2:21] goes in loc[5:26]:
+    set_bits(loc32, 5, value, 2, 19);
+}
+
+// Relaxable 21-bit count of pages between this page and an absolute address's
+// page. Pairs nicely with patch_aarch64_12x (above).
+void
+patch_aarch64_21rx(unsigned char *location, uint64_t value)
+{
+    // This can *only* be relaxed if it occurs immediately before a matching
+    // patch_aarch64_12x. If that happens, the JIT build step will replace both
+    // calls with a single call to patch_aarch64_33rx. Otherwise, we end up
+    // here, and the instruction is patched normally:
+    patch_aarch64_21r(location, value);
+}
+
+// 28-bit relative branch.
+void
+patch_aarch64_26r(unsigned char *location, uint64_t value)
+{
+    uint32_t *loc32 = (uint32_t *)location;
+    assert(IS_AARCH64_BRANCH(*loc32));
+    value -= (uintptr_t)location;
+    // Check that we're not out of range of 28 signed bits:
+    assert((int64_t)value >= -(1 << 27));
+    assert((int64_t)value < (1 << 27));
+    // Since instructions are 4-byte aligned, only use 26 bits:
+    assert(get_bits(value, 0, 2) == 0);
+    set_bits(loc32, 0, value, 2, 26);
+}
+
+// A pair of patch_aarch64_21rx and patch_aarch64_12x.
+void
+patch_aarch64_33rx(unsigned char *location, uint64_t value)
+{
+    uint32_t *loc32 = (uint32_t *)location;
+    // Try to relax the pair of GOT loads into an immediate value:
+    assert(IS_AARCH64_ADRP(*loc32));
+    unsigned char reg = get_bits(loc32[0], 0, 5);
+    assert(IS_AARCH64_LDR_OR_STR(loc32[1]));
+    // There should be only one register involved:
+    assert(reg == get_bits(loc32[1], 0, 5));  // ldr's output register.
+    assert(reg == get_bits(loc32[1], 5, 5));  // ldr's input register.
+    uint64_t relaxed = *(uint64_t *)value;
+    if (relaxed < (1UL << 16)) {
+        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
+        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg;
+        loc32[1] = 0xD503201F;
+        return;
+    }
+    if (relaxed < (1ULL << 32)) {
+        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
+        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg;
+        loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg;
+        return;
+    }
+    relaxed = value - (uintptr_t)location;
+    if ((relaxed & 0x3) == 0 &&
+        (int64_t)relaxed >= -(1L << 19) &&
+        (int64_t)relaxed < (1L << 19))
+    {
+        // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop
+        loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg;
+        loc32[1] = 0xD503201F;
+        return;
+    }
+    // Couldn't do it. Just patch the two instructions normally:
+    patch_aarch64_21rx(location, value);
+    patch_aarch64_12x(location + 4, value);
+}
+
+// Relaxable 32-bit relative address.
+void
+patch_x86_64_32rx(unsigned char *location, uint64_t value)
+{
+    uint8_t *loc8 = (uint8_t *)location;
+    // Try to relax the GOT load into an immediate value:
+    uint64_t relaxed = *(uint64_t *)(value + 4) - 4;
+    if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
+        (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
+    {
+        if (loc8[-2] == 0x8B) {
+            // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX]
+            loc8[-2] = 0x8D;
+            value = relaxed;
+        }
+        else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) {
+            // call qword ptr [rip + AAA] -> nop; call XXX
+            loc8[-2] = 0x90;
+            loc8[-1] = 0xE8;
+            value = relaxed;
+        }
+        else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) {
+            // jmp qword ptr [rip + AAA] -> nop; jmp XXX
+            loc8[-2] = 0x90;
+            loc8[-1] = 0xE9;
+            value = relaxed;
+        }
+    }
+    patch_32r(location, value);
+}
+
+#include "jit_stencils.h"
+
 // Compiles executor in-place. Don't forget to call _PyJIT_Free later!
 int
-_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length)
+_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length)
 {
+    const StencilGroup *group;
     // Loop once to find the total compiled size:
-    size_t instruction_starts[UOP_MAX_TRACE_LENGTH];
-    size_t code_size = trampoline.code.body_size;
-    size_t data_size = trampoline.data.body_size;
+    uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH];
+    size_t code_size = 0;
+    size_t data_size = 0;
+    group = &trampoline;
+    code_size += group->code_size;
+    data_size += group->data_size;
     for (size_t i = 0; i < length; i++) {
-        _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i];
-        const StencilGroup *group = &stencil_groups[instruction->opcode];
+        const _PyUOpInstruction *instruction = &trace[i];
+        group = &stencil_groups[instruction->opcode];
         instruction_starts[i] = code_size;
-        code_size += group->code.body_size;
-        data_size += group->data.body_size;
+        code_size += group->code_size;
+        data_size += group->data_size;
     }
-    code_size += stencil_groups[_FATAL_ERROR].code.body_size;
-    data_size += stencil_groups[_FATAL_ERROR].data.body_size;
+    group = &stencil_groups[_FATAL_ERROR];
+    code_size += group->code_size;
+    data_size += group->data_size;
     // Round up to the nearest page:
     size_t page_size = get_page_size();
    assert((page_size & (page_size - 1)) == 0);
@@ -405,87 +422,35 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
     if (memory == NULL) {
         return -1;
     }
+    // Update the offsets of each instruction:
+    for (size_t i = 0; i < length; i++) {
+        instruction_starts[i] += (uintptr_t)memory;
+    }
     // Loop again to emit the code:
     unsigned char *code = memory;
     unsigned char *data = memory + code_size;
-    {
-        // Compile the trampoline, which handles converting between the native
-        // calling convention and the calling convention used by jitted code
-        // (which may be different for efficiency reasons). On platforms where
-        // we don't change calling conventions, the trampoline is empty and
-        // nothing is emitted here:
-        const StencilGroup *group = &trampoline;
-        // Think of patches as a dictionary mapping HoleValue to uintptr_t:
-        uintptr_t patches[] = GET_PATCHES();
-        patches[HoleValue_CODE] = (uintptr_t)code;
-        patches[HoleValue_CONTINUE] = (uintptr_t)code + group->code.body_size;
-        patches[HoleValue_DATA] = (uintptr_t)data;
-        patches[HoleValue_EXECUTOR] = (uintptr_t)executor;
-        patches[HoleValue_TOP] = (uintptr_t)memory + trampoline.code.body_size;
-        patches[HoleValue_ZERO] = 0;
-        emit(group, patches);
-        code += group->code.body_size;
-        data += group->data.body_size;
-    }
+    // Compile the trampoline, which handles converting between the native
+    // calling convention and the calling convention used by jitted code
+    // (which may be different for efficiency reasons). On platforms where
+    // we don't change calling conventions, the trampoline is empty and
+    // nothing is emitted here:
+    group = &trampoline;
+    group->emit(code, data, executor, NULL, instruction_starts);
+    code += group->code_size;
+    data += group->data_size;
     assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT);
     for (size_t i = 0; i < length; i++) {
-        _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i];
-        const StencilGroup *group = &stencil_groups[instruction->opcode];
-        uintptr_t patches[] = GET_PATCHES();
-        patches[HoleValue_CODE] = (uintptr_t)code;
-        patches[HoleValue_CONTINUE] = (uintptr_t)code + group->code.body_size;
-        patches[HoleValue_DATA] = (uintptr_t)data;
-        patches[HoleValue_EXECUTOR] = (uintptr_t)executor;
-        patches[HoleValue_OPARG] = instruction->oparg;
-        #if SIZEOF_VOID_P == 8
-        patches[HoleValue_OPERAND] = instruction->operand;
-        #else
-        assert(SIZEOF_VOID_P == 4);
-        patches[HoleValue_OPERAND_HI] = instruction->operand >> 32;
-        patches[HoleValue_OPERAND_LO] = instruction->operand & UINT32_MAX;
-        #endif
-        switch (instruction->format) {
-            case UOP_FORMAT_TARGET:
-                patches[HoleValue_TARGET] = instruction->target;
-                break;
-            case UOP_FORMAT_EXIT:
-                assert(instruction->exit_index < executor->exit_count);
-                patches[HoleValue_EXIT_INDEX] = instruction->exit_index;
-                if (instruction->error_target < length) {
-                    patches[HoleValue_ERROR_TARGET] = (uintptr_t)memory + instruction_starts[instruction->error_target];
-                }
-                break;
-            case UOP_FORMAT_JUMP:
-                assert(instruction->jump_target < length);
-                patches[HoleValue_JUMP_TARGET] = (uintptr_t)memory + instruction_starts[instruction->jump_target];
-                if (instruction->error_target < length) {
-                    patches[HoleValue_ERROR_TARGET] = (uintptr_t)memory + instruction_starts[instruction->error_target];
-                }
-                break;
-            default:
-                assert(0);
-                Py_FatalError("Illegal instruction format");
-        }
-        patches[HoleValue_TOP] = (uintptr_t)memory + instruction_starts[1];
-        patches[HoleValue_ZERO] = 0;
-        emit(group, patches);
-        code += group->code.body_size;
-        data += group->data.body_size;
-    }
-    {
-        // Protect against accidental buffer overrun into data:
-        const StencilGroup *group = &stencil_groups[_FATAL_ERROR];
-        uintptr_t patches[] = GET_PATCHES();
-        patches[HoleValue_CODE] = (uintptr_t)code;
-        patches[HoleValue_CONTINUE] = (uintptr_t)code;
-        patches[HoleValue_DATA] = (uintptr_t)data;
-        patches[HoleValue_EXECUTOR] = (uintptr_t)executor;
-        patches[HoleValue_TOP] = (uintptr_t)code;
-        patches[HoleValue_ZERO] = 0;
-        emit(group, patches);
-        code += group->code.body_size;
-        data += group->data.body_size;
+        const _PyUOpInstruction *instruction = &trace[i];
+        group = &stencil_groups[instruction->opcode];
+        group->emit(code, data, executor, instruction, instruction_starts);
+        code += group->code_size;
+        data += group->data_size;
     }
+    // Protect against accidental buffer overrun into data:
+    group = &stencil_groups[_FATAL_ERROR];
+    group->emit(code, data, executor, NULL, instruction_starts);
+    code += group->code_size;
+    data += group->data_size;
     assert(code == memory + code_size);
     assert(data == memory + code_size + data_size);
     if (mark_executable(memory, total_size)) {
@@ -493,7 +458,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
         return -1;
     }
     executor->jit_code = memory;
-    executor->jit_side_entry = memory + trampoline.code.body_size;
+    executor->jit_side_entry = memory + trampoline.code_size;
     executor->jit_size = total_size;
     return 0;
 }
diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py
index 9feceb4..6e046df 100644
--- a/Tools/jit/_stencils.py
+++ b/Tools/jit/_stencils.py
@@ -3,6 +3,7 @@
 import dataclasses
 import enum
 import sys
+import typing
 
 import _schema
 
@@ -47,6 +48,73 @@ class HoleValue(enum.Enum):
     ZERO = enum.auto()
 
 
+# Map relocation types to our JIT's patch functions. "r" suffixes indicate that
+# the patch function is relative. "x" suffixes indicate that they are "relaxing"
+# (see comments in jit.c for more info):
+_PATCH_FUNCS = {
+    # aarch64-apple-darwin:
+    "ARM64_RELOC_BRANCH26": "patch_aarch64_26r",
+    "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx",
+    "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12x",
+    "ARM64_RELOC_PAGE21": "patch_aarch64_21r",
+    "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12",
+    "ARM64_RELOC_UNSIGNED": "patch_64",
+    # x86_64-pc-windows-msvc:
+    "IMAGE_REL_AMD64_REL32": "patch_x86_64_32rx",
+    # aarch64-pc-windows-msvc:
+    "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26r",
+    "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx",
+    "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12",
+    "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12x",
+    # i686-pc-windows-msvc:
+    "IMAGE_REL_I386_DIR32": "patch_32",
+    "IMAGE_REL_I386_REL32": "patch_x86_64_32rx",
+    # aarch64-unknown-linux-gnu:
+    "R_AARCH64_ABS64": "patch_64",
+    "R_AARCH64_ADD_ABS_LO12_NC": "patch_aarch64_12",
+    "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx",
+    "R_AARCH64_ADR_PREL_PG_HI21": "patch_aarch64_21r",
+    "R_AARCH64_CALL26": "patch_aarch64_26r",
+    "R_AARCH64_JUMP26": "patch_aarch64_26r",
+    "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12x",
+    "R_AARCH64_MOVW_UABS_G0_NC": "patch_aarch64_16a",
+    "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b",
+    "R_AARCH64_MOVW_UABS_G2_NC": "patch_aarch64_16c",
+    "R_AARCH64_MOVW_UABS_G3": "patch_aarch64_16d",
+    # x86_64-unknown-linux-gnu:
+    "R_X86_64_64": "patch_64",
+    "R_X86_64_GOTPCREL": "patch_32r",
+    "R_X86_64_GOTPCRELX": "patch_x86_64_32rx",
+    "R_X86_64_PC32": "patch_32r",
+    "R_X86_64_REX_GOTPCRELX": "patch_x86_64_32rx",
+    # x86_64-apple-darwin:
+    "X86_64_RELOC_BRANCH": "patch_32r",
+    "X86_64_RELOC_GOT": "patch_x86_64_32rx",
+    "X86_64_RELOC_GOT_LOAD": "patch_x86_64_32rx",
+    "X86_64_RELOC_SIGNED": "patch_32r",
+    "X86_64_RELOC_UNSIGNED": "patch_64",
+}
+# Translate HoleValues to C expressions:
+_HOLE_EXPRS = {
+    HoleValue.CODE: "(uintptr_t)code",
+    HoleValue.CONTINUE: "(uintptr_t)code + sizeof(code_body)",
+    HoleValue.DATA: "(uintptr_t)data",
+    HoleValue.EXECUTOR: "(uintptr_t)executor",
+    # These should all have been turned into DATA values by process_relocations:
+    # HoleValue.GOT: "",
+    HoleValue.OPARG: "instruction->oparg",
+    HoleValue.OPERAND: "instruction->operand",
+    HoleValue.OPERAND_HI: "(instruction->operand >> 32)",
+    HoleValue.OPERAND_LO: "(instruction->operand & UINT32_MAX)",
+    HoleValue.TARGET: "instruction->target",
+    HoleValue.JUMP_TARGET: "instruction_starts[instruction->jump_target]",
+    HoleValue.ERROR_TARGET: "instruction_starts[instruction->error_target]",
+    HoleValue.EXIT_INDEX: "instruction->exit_index",
+    HoleValue.TOP: "instruction_starts[1]",
+    HoleValue.ZERO: "",
+}
+
+
 @dataclasses.dataclass
 class Hole:
     """
@@ -63,19 +131,43 @@ class Hole:
     symbol: str | None
     # ...plus this addend:
     addend: int
+    func: str = dataclasses.field(init=False)
     # Convenience method:
     replace = dataclasses.replace
 
-    def as_c(self) -> str:
-        """Dump this hole as an initialization of a C Hole struct."""
-        parts = [
-            f"{self.offset:#x}",
-            f"HoleKind_{self.kind}",
-            f"HoleValue_{self.value.name}",
-            f"&{self.symbol}" if self.symbol else "NULL",
-            f"{_signed(self.addend):#x}",
-        ]
-        return f"{{{', '.join(parts)}}}"
+    def __post_init__(self) -> None:
+        self.func = _PATCH_FUNCS[self.kind]
+
+    def fold(self, other: typing.Self) -> typing.Self | None:
+        """Combine two holes into a single hole, if possible."""
+        if (
+            self.offset + 4 == other.offset
+            and self.value == other.value
+            and self.symbol == other.symbol
+            and self.addend == other.addend
+            and self.func == "patch_aarch64_21rx"
+            and other.func == "patch_aarch64_12x"
+        ):
+            # These can *only* be properly relaxed when they appear together and
+            # patch the same value:
+            folded = self.replace()
+            folded.func = "patch_aarch64_33rx"
+            return folded
+        return None
+
+    def as_c(self, where: str) -> str:
+        """Dump this hole as a call to a patch_* function."""
+        location = f"{where} + {self.offset:#x}"
+        value = _HOLE_EXPRS[self.value]
+        if self.symbol:
+            if value:
+                value += " + "
+            value += f"(uintptr_t)&{self.symbol}"
+        if _signed(self.addend):
+            if value:
+                value += " + "
+            value += f"{_signed(self.addend):#x}"
+        return f"{self.func}({location}, {value});"
 
 
 @dataclasses.dataclass
@@ -265,6 +357,10 @@ class StencilGroup:
         )
         self.data.body.extend([0] * 8)
 
+    def as_c(self, opname: str) -> str:
+        """Dump this stencil group as a StencilGroup initializer."""
+        return f"{{emit_{opname}, {len(self.code.body)}, {len(self.data.body)}}}"
+
 
 def symbol_to_value(symbol: str) -> tuple[HoleValue, str | None]:
     """
diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py
index ccd6785..9d11094 100644
--- a/Tools/jit/_writer.py
+++ b/Tools/jit/_writer.py
@@ -1,100 +1,65 @@
 """Utilities for writing StencilGroups out to a C header file."""
 
+import itertools
 import typing
 
-import _schema
 import _stencils
 
 
-def _dump_header() -> typing.Iterator[str]:
-    yield "typedef enum {"
-    for kind in typing.get_args(_schema.HoleKind):
-        yield f"    HoleKind_{kind},"
-    yield "} HoleKind;"
-    yield ""
-    yield "typedef enum {"
-    for value in _stencils.HoleValue:
-        yield f"    HoleValue_{value.name},"
-    yield "} HoleValue;"
-    yield ""
-    yield "typedef struct {"
-    yield "    const size_t offset;"
-    yield "    const HoleKind kind;"
-    yield "    const HoleValue value;"
-    yield "    const void *symbol;"
-    yield "    const uint64_t addend;"
-    yield "} Hole;"
-    yield ""
+def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]:
     yield "typedef struct {"
-    yield "    const size_t body_size;"
-    yield "    const unsigned char * const body;"
-    yield "    const size_t holes_size;"
-    yield "    const Hole * const holes;"
-    yield "} Stencil;"
-    yield ""
-    yield "typedef struct {"
-    yield "    const Stencil code;"
-    yield "    const Stencil data;"
+    yield "    void (*emit)("
+    yield "        unsigned char *code, unsigned char *data, _PyExecutorObject *executor,"
+    yield "        const _PyUOpInstruction *instruction, uintptr_t instruction_starts[]);"
+    yield "    size_t code_size;"
+    yield "    size_t data_size;"
     yield "} StencilGroup;"
     yield ""
-
-
-def _dump_footer(opnames: typing.Iterable[str]) -> typing.Iterator[str]:
-    yield "#define INIT_STENCIL(STENCIL) { \\"
-    yield "    .body_size = Py_ARRAY_LENGTH(STENCIL##_body) - 1, \\"
-    yield "    .body = STENCIL##_body, \\"
-    yield "    .holes_size = Py_ARRAY_LENGTH(STENCIL##_holes) - 1, \\"
-    yield "    .holes = STENCIL##_holes, \\"
-    yield "}"
-    yield ""
-    yield "#define INIT_STENCIL_GROUP(OP) { \\"
-    yield "    .code = INIT_STENCIL(OP##_code), \\"
-    yield "    .data = INIT_STENCIL(OP##_data), \\"
-    yield "}"
+    yield f"static const StencilGroup trampoline = {groups['trampoline'].as_c('trampoline')};"
     yield ""
-    yield "static const StencilGroup stencil_groups[512] = {"
-    for opname in opnames:
+    yield "static const StencilGroup stencil_groups[MAX_UOP_ID + 1] = {"
+    for opname, group in sorted(groups.items()):
         if opname == "trampoline":
             continue
-        yield f"    [{opname}] = INIT_STENCIL_GROUP({opname}),"
+        yield f"    [{opname}] = {group.as_c(opname)},"
     yield "};"
-    yield ""
-    yield "static const StencilGroup trampoline = INIT_STENCIL_GROUP(trampoline);"
-    yield ""
-    yield "#define GET_PATCHES() { \\"
-    for value in _stencils.HoleValue:
-        yield f"    [HoleValue_{value.name}] = (uintptr_t)0xBADBADBADBADBADB, \\"
-    yield "}"
 
 
 def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator[str]:
-    yield f"// {opname}"
+    yield "void"
+    yield f"emit_{opname}("
+    yield "    unsigned char *code, unsigned char *data, _PyExecutorObject *executor,"
+    yield "    const _PyUOpInstruction *instruction, uintptr_t instruction_starts[])"
+    yield "{"
     for part, stencil in [("code", group.code), ("data", group.data)]:
         for line in stencil.disassembly:
-            yield f"// {line}"
+            yield f"    // {line}"
         if stencil.body:
-            size = len(stencil.body) + 1
-            yield f"static const unsigned char {opname}_{part}_body[{size}] = {{"
+            yield f"    const unsigned char {part}_body[{len(stencil.body)}] = {{"
             for i in range(0, len(stencil.body), 8):
                 row = " ".join(f"{byte:#04x}," for byte in stencil.body[i : i + 8])
-                yield f"    {row}"
-            yield "};"
-        else:
-            yield f"static const unsigned char {opname}_{part}_body[1];"
-        if stencil.holes:
-            size = len(stencil.holes) + 1
-            yield f"static const Hole {opname}_{part}_holes[{size}] = {{"
-            for hole in stencil.holes:
-                yield f"    {hole.as_c()},"
-            yield "};"
-        else:
-            yield f"static const Hole {opname}_{part}_holes[1];"
+                yield f"        {row}"
+            yield "    };"
+    # Data is written first (so relaxations in the code work properly):
+    for part, stencil in [("data", group.data), ("code", group.code)]:
+        if stencil.body:
+            yield f"    memcpy({part}, {part}_body, sizeof({part}_body));"
+        skip = False
+        stencil.holes.sort(key=lambda hole: hole.offset)
+        for hole, pair in itertools.zip_longest(stencil.holes, stencil.holes[1:]):
+            if skip:
+                skip = False
+                continue
+            if pair and (folded := hole.fold(pair)):
+                skip = True
+                hole = folded
+            yield f"    {hole.as_c(part)}"
+    yield "}"
     yield ""
 
 
 def dump(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]:
     """Yield a JIT compiler line-by-line as a C header file."""
-    yield from _dump_header()
-    for opname, group in groups.items():
+    for opname, group in sorted(groups.items()):
        yield from _dump_stencil(opname, group)
     yield from _dump_footer(groups)
-- 
cgit v0.12
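
To make the new output shape concrete, here is a hypothetical example (the uop
name _SPAM, the byte values, and the hole offset are invented, not taken from
any real stencil). After this change, jit_stencils.h no longer holds Hole and
Stencil tables for Python/jit.c to interpret at runtime; instead, _writer.py
generates one emit_* function per uop -- a memcpy of the stencil body followed
by direct patch_* calls -- and the StencilGroup table just records each
function pointer with its body sizes. For a uop with a 10-byte code body, no
data body, and one 64-bit OPERAND hole at offset 0x2, the generated header
would contain roughly:

    void
    emit__SPAM(
        unsigned char *code, unsigned char *data, _PyExecutorObject *executor,
        const _PyUOpInstruction *instruction, uintptr_t instruction_starts[])
    {
        // The raw stencil body, with a hole at bytes 2 through 9:
        const unsigned char code_body[10] = {
            0x48, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00,
        };
        // Copy the template into place, then fill the hole:
        memcpy(code, code_body, sizeof(code_body));
        patch_64(code + 0x2, instruction->operand);
    }

and, in the footer, an entry like:

    static const StencilGroup stencil_groups[MAX_UOP_ID + 1] = {
        [_SPAM] = {emit__SPAM, 10, 0},
    };

Since each hole is now an ordinary C call with a constant offset, the compiler
building CPython can inline the patch_* helpers, and _writer.py has already
folded matching aarch64 21rx/12x pairs into single patch_aarch64_33rx calls
(via Hole.fold) -- which is presumably where the title's efficiency gain
comes from.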