author    Sam Gross <colesbury@gmail.com>    2024-02-16 20:25:19 (GMT)
committer GitHub <noreply@github.com>        2024-02-16 20:25:19 (GMT)
commit    590319072773bd6cdcca655c420d3adb84838e96 (patch)
tree      b2e7ec5cb49ef21d0fe9f35b9f32d69e8578fb86 /Include
parent    711f42de2e3749208cfa7effa0d45b04e4e1fdd4 (diff)
gh-115103: Implement delayed memory reclamation (QSBR) (#115180)
This adds a safe memory reclamation scheme based on FreeBSD's "GUS" and
quiescent-state-based reclamation (QSBR). The API provides a mechanism
for callers to detect when it is safe to free memory that may be
concurrently accessed by readers.
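
To make the API concrete, here is a sketch (not code from this commit) of how
a writer in the free-threaded build might retire an object using the functions
declared in Include/internal/pycore_qsbr.h below. remove_entry(),
queue_for_later(), my_table_t, and entry_t are hypothetical names invented for
illustration; only the _Py_qsbr_* calls and the tstate->qsbr field come from
the commit.

// Sketch only: hypothetical writer-side use of the QSBR API.
static void
retire_entry(_PyThreadStateImpl *tstate, my_table_t *table, entry_t *entry)
{
    // 1. Unlink the entry so no new reader can reach it.
    remove_entry(table, entry);                     // hypothetical helper

    // 2. Record a goal; readers that started before this point may
    //    still hold a pointer to the unlinked memory.
    uint64_t goal = _Py_qsbr_deferred_advance(tstate->qsbr);

    // 3. Free only once every attached thread has reported a quiescent
    //    state at or past the goal; otherwise retry later.
    if (_Py_qsbr_poll(tstate->qsbr, goal)) {
        PyMem_Free(entry);
    }
    else {
        queue_for_later(entry, goal);               // hypothetical helper
    }
}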
Diffstat (limited to 'Include')
 Include/cpython/pyatomic.h             |   6
 Include/cpython/pyatomic_gcc.h         |   8
 Include/cpython/pyatomic_msc.h         |  28
 Include/cpython/pyatomic_std.h         |  16
 Include/internal/pycore_interp.h       |   2
 Include/internal/pycore_qsbr.h         | 139
 Include/internal/pycore_runtime_init.h |   5
 Include/internal/pycore_tstate.h       |   5
 8 files changed, 207 insertions, 2 deletions
diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h
index 9b57741..737eed8 100644
--- a/Include/cpython/pyatomic.h
+++ b/Include/cpython/pyatomic.h
@@ -475,6 +475,12 @@ _Py_atomic_store_int_release(int *obj, int value);
 static inline int
 _Py_atomic_load_int_acquire(const int *obj);
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value);
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj);
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj);
 
diff --git a/Include/cpython/pyatomic_gcc.h b/Include/cpython/pyatomic_gcc.h
index bc74149..de23edf 100644
--- a/Include/cpython/pyatomic_gcc.h
+++ b/Include/cpython/pyatomic_gcc.h
@@ -504,6 +504,14 @@ static inline int
 _Py_atomic_load_int_acquire(const int *obj)
 { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); }
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h
index 6ab6401..9809d98 100644
--- a/Include/cpython/pyatomic_msc.h
+++ b/Include/cpython/pyatomic_msc.h
@@ -952,13 +952,39 @@ _Py_atomic_load_int_acquire(const int *obj)
 #endif
 }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+    *(uint64_t volatile *)obj = value;
+#elif defined(_M_ARM64)
+    _Py_atomic_ASSERT_ARG_TYPE(unsigned __int64);
+    __stlr64((unsigned __int64 volatile *)obj, (unsigned __int64)value);
+#else
+# error "no implementation of _Py_atomic_store_uint64_release"
+#endif
+}
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+    return *(uint64_t volatile *)obj;
+#elif defined(_M_ARM64)
+    _Py_atomic_ASSERT_ARG_TYPE(__int64);
+    return (uint64_t)__ldar64((unsigned __int64 volatile *)obj);
+#else
+# error "no implementation of _Py_atomic_load_uint64_acquire"
+#endif
+}
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 {
 #if defined(_M_X64) || defined(_M_IX86)
     return *(uint32_t volatile *)obj;
 #elif defined(_M_ARM64)
-    return (int)__ldar32((uint32_t volatile *)obj);
+    return (uint32_t)__ldar32((uint32_t volatile *)obj);
 #else
 # error "no implementation of _Py_atomic_load_uint32_acquire"
 #endif
diff --git a/Include/cpython/pyatomic_std.h b/Include/cpython/pyatomic_std.h
index d3004db..f5bd73a 100644
--- a/Include/cpython/pyatomic_std.h
+++ b/Include/cpython/pyatomic_std.h
@@ -887,6 +887,22 @@ _Py_atomic_load_int_acquire(const int *obj)
                                 memory_order_acquire);
 }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{
+    _Py_USING_STD;
+    atomic_store_explicit((_Atomic(uint64_t)*)obj, value,
+                          memory_order_release);
+}
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{
+    _Py_USING_STD;
+    return atomic_load_explicit((const _Atomic(uint64_t)*)obj,
+                                memory_order_acquire);
+}
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 {
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index c074471..567d6a9 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -30,6 +30,7 @@ extern "C" {
 #include "pycore_mimalloc.h"      // struct _mimalloc_interp_state
 #include "pycore_object_state.h"  // struct _py_object_state
 #include "pycore_obmalloc.h"      // struct _obmalloc_state
+#include "pycore_qsbr.h"          // struct _qsbr_state
 #include "pycore_tstate.h"        // _PyThreadStateImpl
 #include "pycore_tuple.h"         // struct _Py_tuple_state
 #include "pycore_typeobject.h"    // struct types_state
@@ -197,6 +198,7 @@ struct _is {
     struct _warnings_runtime_state warnings;
     struct atexit_state atexit;
     struct _stoptheworld_state stoptheworld;
+    struct _qsbr_shared qsbr;
 
 #if defined(Py_GIL_DISABLED)
     struct _mimalloc_interp_state mimalloc;
diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
new file mode 100644
index 0000000..475f00d
--- /dev/null
+++ b/Include/internal/pycore_qsbr.h
@@ -0,0 +1,139 @@
+// The QSBR APIs (quiescent state-based reclamation) provide a mechanism for
+// the free-threaded build to safely reclaim memory when there may be
+// concurrent accesses.
+//
+// Many operations in the free-threaded build are protected by locks. However,
+// in some cases, we want to allow reads to happen concurrently with updates.
+// In this case, we need to delay freeing ("reclaiming") any memory that may
+// be concurrently accessed by a reader. The QSBR APIs provide a way to do
+// this.
+#ifndef Py_INTERNAL_QSBR_H
+#define Py_INTERNAL_QSBR_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "pycore_lock.h"   // PyMutex
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+// The shared write sequence is always odd and incremented by two. Detached
+// threads are indicated by a read sequence of zero. This avoids collisions
+// between the offline state and any valid sequence number even if the
+// sequence numbers wrap around.
+#define QSBR_OFFLINE 0
+#define QSBR_INITIAL 1
+#define QSBR_INCR    2
+
+struct _qsbr_shared;
+struct _PyThreadStateImpl;  // forward declare to avoid circular dependency
+
+// Per-thread state
+struct _qsbr_thread_state {
+    // Last observed write sequence (or 0 if detached)
+    uint64_t seq;
+
+    // Shared (per-interpreter) QSBR state
+    struct _qsbr_shared *shared;
+
+    // Thread state (or NULL)
+    PyThreadState *tstate;
+
+    // Used to defer advancing write sequence a fixed number of times
+    int deferrals;
+
+    // Is this thread state allocated?
+    bool allocated;
+    struct _qsbr_thread_state *freelist_next;
+};
+
+// Padding to avoid false sharing
+struct _qsbr_pad {
+    struct _qsbr_thread_state qsbr;
+    char __padding[64 - sizeof(struct _qsbr_thread_state)];
+};
+
+// Per-interpreter state
+struct _qsbr_shared {
+    // Write sequence: always odd, incremented by two
+    uint64_t wr_seq;
+
+    // Minimum observed read sequence of all QSBR thread states
+    uint64_t rd_seq;
+
+    // Array of QSBR thread states.
+    struct _qsbr_pad *array;
+    Py_ssize_t size;
+
+    // Freelist of unused _qsbr_thread_states (protected by mutex)
+    PyMutex mutex;
+    struct _qsbr_thread_state *freelist;
+};
+
+static inline uint64_t
+_Py_qsbr_shared_current(struct _qsbr_shared *shared)
+{
+    return _Py_atomic_load_uint64_acquire(&shared->wr_seq);
+}
+
+// Reports a quiescent state: the caller no longer holds any pointer to shared
+// data not protected by locks or reference counts.
+static inline void
+_Py_qsbr_quiescent_state(struct _qsbr_thread_state *qsbr)
+{
+    uint64_t seq = _Py_qsbr_shared_current(qsbr->shared);
+    _Py_atomic_store_uint64_release(&qsbr->seq, seq);
+}
+
+// Advance the write sequence and return the new goal. This should be called
+// after data is removed. The returned goal is used with `_Py_qsbr_poll()` to
+// determine when it is safe to reclaim (free) the memory.
+extern uint64_t
+_Py_qsbr_advance(struct _qsbr_shared *shared);
+
+// Batches requests to advance the write sequence. This advances the write
+// sequence every N calls, which reduces overhead but increases time to
+// reclamation. Returns the new goal.
+extern uint64_t
+_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
+
+// Have the read sequences advanced to the given goal? If this returns true,
+// it is safe to reclaim any memory tagged with the goal (or earlier goal).
+extern bool
+_Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal);
+
+// Called when thread attaches to interpreter
+extern void
+_Py_qsbr_attach(struct _qsbr_thread_state *qsbr);
+
+// Called when thread detaches from interpreter
+extern void
+_Py_qsbr_detach(struct _qsbr_thread_state *qsbr);
+
+// Reserves (allocates) a QSBR state and returns its index.
+extern Py_ssize_t
+_Py_qsbr_reserve(PyInterpreterState *interp);
+
+// Associates a PyThreadState with the QSBR state at the given index
+extern void
+_Py_qsbr_register(struct _PyThreadStateImpl *tstate,
+                  PyInterpreterState *interp, Py_ssize_t index);
+
+// Disassociates a PyThreadState from the QSBR state and frees the QSBR state.
+extern void
+_Py_qsbr_unregister(struct _PyThreadStateImpl *tstate);
+
+extern void
+_Py_qsbr_fini(PyInterpreterState *interp);
+
+extern void
+_Py_qsbr_after_fork(struct _PyThreadStateImpl *tstate);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_QSBR_H */
diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h
index 7a05c10..be81604 100644
--- a/Include/internal/pycore_runtime_init.h
+++ b/Include/internal/pycore_runtime_init.h
@@ -17,6 +17,7 @@ extern "C" {
 #include "pycore_pyhash.h"        // pyhash_state_INIT
 #include "pycore_pymem_init.h"    // _pymem_allocators_standard_INIT
 #include "pycore_pythread.h"      // _pythread_RUNTIME_INIT
+#include "pycore_qsbr.h"          // QSBR_INITIAL
 #include "pycore_runtime_init_generated.h"  // _Py_bytes_characters_INIT
 #include "pycore_signal.h"        // _signals_RUNTIME_INIT
 #include "pycore_tracemalloc.h"   // _tracemalloc_runtime_state_INIT
@@ -169,6 +170,10 @@ extern PyTypeObject _PyExc_MemoryError;
                 { .threshold = 10, }, \
             }, \
         }, \
+        .qsbr = { \
+            .wr_seq = QSBR_INITIAL, \
+            .rd_seq = QSBR_INITIAL, \
+        }, \
         .dtoa = _dtoa_state_INIT(&(INTERP)), \
         .dict_state = _dict_state_INIT, \
         .func_state = { \
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 7fb9ab2..d0f980e 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -8,9 +8,10 @@ extern "C" {
 # error "this header requires Py_BUILD_CORE define"
 #endif
 
+#include "pycore_brc.h"       // struct _brc_thread_state
 #include "pycore_freelist.h"  // struct _Py_freelist_state
 #include "pycore_mimalloc.h"  // struct _mimalloc_thread_state
-#include "pycore_brc.h"       // struct _brc_thread_state
+#include "pycore_qsbr.h"      // struct qsbr
 
 
 static inline void
@@ -27,6 +28,8 @@ typedef struct _PyThreadStateImpl {
     // semi-public fields are in PyThreadState.
     PyThreadState base;
 
+    struct _qsbr_thread_state *qsbr;  // only used by free-threaded build
+
 #ifdef Py_GIL_DISABLED
     struct _gc_thread_state gc;
     struct _mimalloc_thread_state mimalloc;
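
The acquire/release pairing added to the pyatomic headers is what makes the
scheme sound: each reader publishes its last observed sequence with a release
store, and the poller reads it back with an acquire load. The following
standalone C11 toy (an illustration under simplified assumptions, not CPython
code) models the sequence arithmetic with two fixed reader slots; note that it
compares sequences with a plain <, ignoring the wraparound case that a real
implementation must handle with modular comparison.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the QSBR sequence scheme: the write sequence starts odd
 * and is bumped by two, so it can never equal OFFLINE (0). */
enum { OFFLINE = 0, INITIAL = 1, INCR = 2 };

static _Atomic uint64_t wr_seq = INITIAL;   /* like _qsbr_shared.wr_seq */
static _Atomic uint64_t reader_seq[2];      /* like each thread's qsbr->seq */

/* Writer: bump the sequence and take a goal (in the spirit of
 * _Py_qsbr_advance). */
static uint64_t advance(void)
{
    return atomic_fetch_add_explicit(&wr_seq, INCR,
                                     memory_order_release) + INCR;
}

/* Reader: report a quiescent state (in the spirit of
 * _Py_qsbr_quiescent_state). */
static void quiescent(int tid)
{
    uint64_t seq = atomic_load_explicit(&wr_seq, memory_order_acquire);
    atomic_store_explicit(&reader_seq[tid], seq, memory_order_release);
}

/* Poll: reclamation is safe once every attached reader has seen the goal;
 * offline (detached) readers are ignored. */
static bool poll_goal(uint64_t goal)
{
    for (int i = 0; i < 2; i++) {
        uint64_t seq = atomic_load_explicit(&reader_seq[i],
                                            memory_order_acquire);
        if (seq != OFFLINE && seq < goal) {   /* toy: no wraparound handling */
            return false;
        }
    }
    return true;
}

int main(void)
{
    reader_seq[0] = INITIAL;
    reader_seq[1] = OFFLINE;            /* detached thread does not block */
    uint64_t goal = advance();          /* goal becomes 3 */
    printf("safe before quiescence: %d\n", poll_goal(goal));  /* 0 */
    quiescent(0);
    printf("safe after quiescence:  %d\n", poll_goal(goal));  /* 1 */
    return 0;
}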