author    | mpage <mpage@cs.stanford.edu> | 2024-11-04 19:13:32 (GMT)
committer | GitHub <noreply@github.com>   | 2024-11-04 19:13:32 (GMT)
commit    | 2e95c5ba3bf7e5004c7e2304afda4a8f8e2443a7 (patch)
tree      | de32ac52ed5ffcb9460dfc062effc6b4b662ee5d /Python/ceval_macros.h
parent    | e5a4b402ae55f5eeeb44d3e7bc3f3ec39b249846 (diff)
gh-115999: Implement thread-local bytecode and enable specialization for `BINARY_OP` (#123926)
In free-threaded builds, each thread specializes a thread-local copy of the bytecode, created on the first RESUME. All copies of the bytecode for a code object are stored in the co_tlbc array on the code object. At thread creation, each thread reserves a globally unique index that identifies its copy of the bytecode in every co_tlbc array, and it releases that index at thread destruction. The first entry in every co_tlbc array always points to the "main" copy of the bytecode, which is stored at the end of the code object, so no bytecode is copied for programs that do not use threads.
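As a rough illustration of that layout, the sketch below models a code object holding an array of bytecode copies whose entry 0 aliases the main bytecode, plus a lazily created per-thread copy selected by a reserved index. The names (codeunit, codeobj, get_tlbc) are invented for the example and error checking is omitted; only the co_tlbc idea and the overall scheme come from the commit message, and CPython's real structures and synchronization are more involved.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint16_t codeunit;                /* stand-in for a code unit */

typedef struct {
    size_t nunits;                        /* number of code units in the bytecode */
    codeunit *main_bytecode;              /* the "main" copy kept on the code object */
    size_t ncopies;                       /* slots currently in copies[] */
    codeunit **copies;                    /* copies[0] always aliases main_bytecode */
} codeobj;

/* Return the bytecode this thread should execute, creating a private copy on
 * first use (the real interpreter does this on the thread's first RESUME). */
static codeunit *
get_tlbc(codeobj *co, size_t thread_index)
{
    if (thread_index == 0) {
        return co->main_bytecode;         /* index 0 is the shared main copy */
    }
    if (thread_index >= co->ncopies) {    /* grow the array of copies */
        size_t n = thread_index + 1;
        co->copies = realloc(co->copies, n * sizeof(*co->copies));
        memset(co->copies + co->ncopies, 0,
               (n - co->ncopies) * sizeof(*co->copies));
        co->ncopies = n;
    }
    if (co->copies[thread_index] == NULL) {
        codeunit *copy = malloc(co->nunits * sizeof(codeunit));
        memcpy(copy, co->main_bytecode, co->nunits * sizeof(codeunit));
        co->copies[thread_index] = copy;  /* this is what the thread specializes */
    }
    return co->copies[thread_index];
}

int main(void)
{
    codeunit main_code[4] = {1, 2, 3, 4};
    codeobj co = {4, main_code, 1, calloc(1, sizeof(codeunit *))};
    co.copies[0] = main_code;

    codeunit *t0 = get_tlbc(&co, 0);      /* "main thread": no copy is made */
    codeunit *t1 = get_tlbc(&co, 1);      /* second thread: gets a private copy */
    printf("t0 aliases main: %d, t1 is a private copy: %d\n",
           t0 == main_code, t1 != main_code && t1[3] == 4);
    free(co.copies[1]);
    free(co.copies);
    return 0;
}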
Thread-local bytecode can be disabled at runtime by passing -X tlbc=0 on the command line or by setting PYTHON_TLBC=0 in the environment. Disabling thread-local bytecode also disables specialization.
Concurrent modifications to the bytecode made by the specializing interpreter and instrumentation use atomics, with specialization taking care not to overwrite an instruction that was instrumented concurrently.
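The sketch below shows the shape of such a race-free read-modify-write using standard C11 atomics with relaxed ordering. It is only an analogue of the FT_ATOMIC_* macros that appear in the diff further down, not CPython's actual code; record_branch_taken is a hypothetical stand-in for the RECORD_BRANCH_TAKEN macro added by this commit.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Branch-history bitset: each recorded branch shifts the history left by one
 * bit and ORs in whether the branch was taken. A relaxed load/store pair keeps
 * the read-modify-write free of data races even if two threads hit it at once;
 * the value is only meaningful when each thread runs its own bytecode copy. */
static _Atomic uint16_t branch_history;

static void
record_branch_taken(_Atomic uint16_t *bitset, int taken)
{
    uint16_t old = atomic_load_explicit(bitset, memory_order_relaxed);
    atomic_store_explicit(bitset, (uint16_t)((old << 1) | (taken != 0)),
                          memory_order_relaxed);
}

int main(void)
{
    record_branch_taken(&branch_history, 1);
    record_branch_taken(&branch_history, 0);
    record_branch_taken(&branch_history, 1);
    printf("history bits: 0x%04x\n",            /* prints 0x0005 (binary 101) */
           atomic_load_explicit(&branch_history, memory_order_relaxed));
    return 0;
}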
Diffstat (limited to 'Python/ceval_macros.h')
-rw-r--r-- | Python/ceval_macros.h | 22 |
1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h
index 6674c4c..5df5581 100644
--- a/Python/ceval_macros.h
+++ b/Python/ceval_macros.h
@@ -151,7 +151,7 @@ GETITEM(PyObject *v, Py_ssize_t i) {
 /* Code access macros */
 
 /* The integer overflow is checked by an assertion below. */
-#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame))))
+#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame)))
 #define NEXTOPARG() do { \
         _Py_CODEUNIT word = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \
         opcode = word.op.code; \
@@ -301,14 +301,6 @@ GETITEM(PyObject *v, Py_ssize_t i) {
 #define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \
     backoff_counter_triggers(forge_backoff_counter((COUNTER)))
 
-#ifdef Py_GIL_DISABLED
-#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
-    do { \
-        /* gh-115999 tracks progress on addressing this. */ \
-        static_assert(0, "The specializing interpreter is not yet thread-safe"); \
-    } while (0);
-#define PAUSE_ADAPTIVE_COUNTER(COUNTER) ((void)COUNTER)
-#else
 #define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
     do { \
         (COUNTER) = advance_backoff_counter((COUNTER)); \
@@ -318,6 +310,18 @@ GETITEM(PyObject *v, Py_ssize_t i) {
     do { \
         (COUNTER) = pause_backoff_counter((COUNTER)); \
     } while (0);
+
+#ifdef ENABLE_SPECIALIZATION_FT
+/* Multiple threads may execute these concurrently if thread-local bytecode is
+ * disabled and they all execute the main copy of the bytecode. Specialization
+ * is disabled in that case so the value is unused, but the RMW cycle should be
+ * free of data races.
+ */
+#define RECORD_BRANCH_TAKEN(bitset, flag) \
+    FT_ATOMIC_STORE_UINT16_RELAXED( \
+        bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag))
+#else
+#define RECORD_BRANCH_TAKEN(bitset, flag)
 #endif
 
 #define UNBOUNDLOCAL_ERROR_MSG \
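One consequence visible in the INSTR_OFFSET change above is that instruction offsets are measured from the base of whichever bytecode array the thread is executing, so an offset means the same thing in the main copy and in a thread-local copy. The toy program below, with invented names (instr_offset, main_copy, tlbc_copy), only illustrates that pointer arithmetic; it is not CPython code.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint16_t codeunit;   /* stand-in for a code unit */

/* Offset of the next instruction, measured from the base of whichever
 * bytecode array this thread is executing (main or thread-local copy). */
static int
instr_offset(const codeunit *next_instr, const codeunit *bytecode_base)
{
    return (int)(next_instr - bytecode_base);
}

int main(void)
{
    codeunit main_copy[8] = {0};
    codeunit *tlbc_copy = malloc(sizeof(main_copy));
    memcpy(tlbc_copy, main_copy, sizeof(main_copy));

    /* A thread three instructions into its private copy reports the same
     * offset as a thread three instructions into the main copy. */
    printf("%d %d\n",
           instr_offset(main_copy + 3, main_copy),
           instr_offset(tlbc_copy + 3, tlbc_copy));
    free(tlbc_copy);
    return 0;
}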