diff options
author | Brandt Bucher <brandtbucher@microsoft.com> | 2024-02-26 16:32:44 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-26 16:32:44 (GMT) |
commit | 7259480957e10359cc5ab8786f32f197c88e274c (patch) | |
tree | e8e92b37b5f302d3b1338657431811e655a8aa12 /Python/jit.c | |
parent | 5a832922130908994d313b56a3345ff410a0e11a (diff) | |
download | cpython-7259480957e10359cc5ab8786f32f197c88e274c.zip cpython-7259480957e10359cc5ab8786f32f197c88e274c.tar.gz cpython-7259480957e10359cc5ab8786f32f197c88e274c.tar.bz2 |
GH-115802: JIT "small" code for macOS and Linux (GH-115826)
Diffstat (limited to 'Python/jit.c')
-rw-r--r-- | Python/jit.c | 119 |
1 files changed, 103 insertions, 16 deletions
diff --git a/Python/jit.c b/Python/jit.c index 839414b..ac2c60e 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -47,18 +47,18 @@ jit_error(const char *message) PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); } -static char * +static unsigned char * jit_alloc(size_t size) { assert(size); assert(size % get_page_size() == 0); #ifdef MS_WINDOWS int flags = MEM_COMMIT | MEM_RESERVE; - char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE); + unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE); int failed = memory == NULL; #else int flags = MAP_ANONYMOUS | MAP_PRIVATE; - char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); int failed = memory == MAP_FAILED; #endif if (failed) { @@ -69,7 +69,7 @@ jit_alloc(size_t size) } static int -jit_free(char *memory, size_t size) +jit_free(unsigned char *memory, size_t size) { assert(size); assert(size % get_page_size() == 0); @@ -86,7 +86,7 @@ jit_free(char *memory, size_t size) } static int -mark_executable(char *memory, size_t size) +mark_executable(unsigned char *memory, size_t size) { if (size == 0) { return 0; @@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size) } static int -mark_readable(char *memory, size_t size) +mark_readable(unsigned char *memory, size_t size) { if (size == 0) { return 0; @@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, // Fill all of stencil's holes in the memory pointed to by base, using the // values in patches. static void -patch(char *base, const Stencil *stencil, uint64_t *patches) +patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) { for (uint64_t i = 0; i < stencil->holes_size; i++) { const Hole *hole = &stencil->holes[i]; - void *location = base + hole->offset; + unsigned char *location = base + hole->offset; uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend; + uint8_t *loc8 = (uint8_t *)location; uint32_t *loc32 = (uint32_t *)location; uint64_t *loc64 = (uint64_t *)location; // LLD is a great reference for performing relocations... just keep in // mind that Tools/jit/build.py does filtering and preprocessing for us! // Here's a good place to start for each platform: // - aarch64-apple-darwin: + // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h // - aarch64-unknown-linux-gnu: @@ -208,6 +210,47 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) // 64-bit absolute address. *loc64 = value; continue; + case HoleKind_R_X86_64_GOTPCRELX: + case HoleKind_R_X86_64_REX_GOTPCRELX: + case HoleKind_X86_64_RELOC_GOT: + case HoleKind_X86_64_RELOC_GOT_LOAD: { + // 32-bit relative address. + // Try to relax the GOT load into an immediate value: + uint64_t relaxed = *(uint64_t *)(value + 4) - 4; + if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && + (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) + { + if (loc8[-2] == 0x8B) { + // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX] + loc8[-2] = 0x8D; + value = relaxed; + } + else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { + // call qword ptr [rip + AAA] -> nop; call XXX + loc8[-2] = 0x90; + loc8[-1] = 0xE8; + value = relaxed; + } + else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { + // jmp qword ptr [rip + AAA] -> nop; jmp XXX + loc8[-2] = 0x90; + loc8[-1] = 0xE9; + value = relaxed; + } + } + } + // Fall through... + case HoleKind_R_X86_64_GOTPCREL: + case HoleKind_R_X86_64_PC32: + case HoleKind_X86_64_RELOC_SIGNED: + case HoleKind_X86_64_RELOC_BRANCH: + // 32-bit relative address. + value -= (uint64_t)location; + // Check that we're not out of range of 32 signed bits: + assert((int64_t)value >= -(1LL << 31)); + assert((int64_t)value < (1LL << 31)); + loc32[0] = (uint32_t)value; + continue; case HoleKind_R_AARCH64_CALL26: case HoleKind_R_AARCH64_JUMP26: // 28-bit relative branch. @@ -249,10 +292,53 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) set_bits(loc32, 5, value, 48, 16); continue; case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21: + case HoleKind_R_AARCH64_ADR_GOT_PAGE: // 21-bit count of pages between this page and an absolute address's // page... I know, I know, it's weird. Pairs nicely with // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below). assert(IS_AARCH64_ADRP(*loc32)); + // Try to relax the pair of GOT loads into an immediate value: + const Hole *next_hole = &stencil->holes[i + 1]; + if (i + 1 < stencil->holes_size && + (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || + next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) && + next_hole->offset == hole->offset + 4 && + next_hole->symbol == hole->symbol && + next_hole->addend == hole->addend && + next_hole->value == hole->value) + { + unsigned char rd = get_bits(loc32[0], 0, 5); + assert(IS_AARCH64_LDR_OR_STR(loc32[1])); + unsigned char rt = get_bits(loc32[1], 0, 5); + unsigned char rn = get_bits(loc32[1], 5, 5); + assert(rd == rn && rn == rt); + uint64_t relaxed = *(uint64_t *)value; + if (relaxed < (1UL << 16)) { + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; + loc32[1] = 0xD503201F; + i++; + continue; + } + if (relaxed < (1ULL << 32)) { + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; + loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd; + i++; + continue; + } + relaxed = (uint64_t)value - (uint64_t)location; + if ((relaxed & 0x3) == 0 && + (int64_t)relaxed >= -(1L << 19) && + (int64_t)relaxed < (1L << 19)) + { + // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr x0, XXX; nop + loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd; + loc32[1] = 0xD503201F; + i++; + continue; + } + } // Number of pages between this page and the value's page: value = (value >> 12) - ((uint64_t)location >> 12); // Check that we're not out of range of 21 signed bits: @@ -264,6 +350,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) set_bits(loc32, 5, value, 2, 19); continue; case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12: + case HoleKind_R_AARCH64_LD64_GOT_LO12_NC: // 12-bit low part of an absolute address. Pairs nicely with // ARM64_RELOC_GOT_LOAD_PAGE21 (above). assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); @@ -285,7 +372,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) } static void -copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches) +copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) { memcpy(base, stencil->body, stencil->body_size); patch(base, stencil, patches); @@ -294,8 +381,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches) static void emit(const StencilGroup *group, uint64_t patches[]) { - copy_and_patch((char *)patches[HoleValue_CODE], &group->code, patches); - copy_and_patch((char *)patches[HoleValue_DATA], &group->data, patches); + copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches); + copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches); } // Compiles executor in-place. Don't forget to call _PyJIT_Free later! @@ -316,14 +403,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size assert((page_size & (page_size - 1)) == 0); code_size += page_size - (code_size & (page_size - 1)); data_size += page_size - (data_size & (page_size - 1)); - char *memory = jit_alloc(code_size + data_size); + unsigned char *memory = jit_alloc(code_size + data_size); if (memory == NULL) { return -1; } // Loop again to emit the code: - char *code = memory; - char *data = memory + code_size; - char *top = code; + unsigned char *code = memory; + unsigned char *data = memory + code_size; + unsigned char *top = code; if (trace[0].opcode == _START_EXECUTOR) { // Don't want to execute this more than once: top += stencil_groups[_START_EXECUTOR].code.body_size; @@ -360,7 +447,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size void _PyJIT_Free(_PyExecutorObject *executor) { - char *memory = (char *)executor->jit_code; + unsigned char *memory = (unsigned char *)executor->jit_code; size_t size = executor->jit_size; if (memory) { executor->jit_code = NULL; |