author    Sam Gross <colesbury@gmail.com>    2024-02-16 20:25:19 (GMT)
committer GitHub <noreply@github.com>        2024-02-16 20:25:19 (GMT)
commit    590319072773bd6cdcca655c420d3adb84838e96 (patch)
tree      b2e7ec5cb49ef21d0fe9f35b9f32d69e8578fb86 /Include
parent    711f42de2e3749208cfa7effa0d45b04e4e1fdd4 (diff)
gh-115103: Implement delayed memory reclamation (QSBR) (#115180)
This adds a safe memory reclamation scheme based on FreeBSD's "GUS" and
quiescent-state-based reclamation (QSBR). The API provides a mechanism
for callers to detect when it is safe to free memory that may be
concurrently accessed by readers.
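
To make the API concrete, here is a sketch (not code from this commit) of how
a writer in the free-threaded build might retire an object using the functions
declared in Include/internal/pycore_qsbr.h below. remove_entry(),
queue_for_later(), my_table_t, and entry_t are hypothetical names invented for
illustration; only the _Py_qsbr_* calls and the tstate->qsbr field come from
the commit.

// Sketch only: hypothetical writer-side use of the QSBR API.
static void
retire_entry(_PyThreadStateImpl *tstate, my_table_t *table, entry_t *entry)
{
    // 1. Unlink the entry so no new reader can reach it.
    remove_entry(table, entry);                     // hypothetical helper

    // 2. Record a goal; readers that started before this point may
    //    still hold a pointer to the unlinked memory.
    uint64_t goal = _Py_qsbr_deferred_advance(tstate->qsbr);

    // 3. Free only once every attached thread has reported a quiescent
    //    state at or past the goal; otherwise retry later.
    if (_Py_qsbr_poll(tstate->qsbr, goal)) {
        PyMem_Free(entry);
    }
    else {
        queue_for_later(entry, goal);               // hypothetical helper
    }
}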
Diffstat (limited to 'Include')
 Include/cpython/pyatomic.h             |   6
 Include/cpython/pyatomic_gcc.h         |   8
 Include/cpython/pyatomic_msc.h         |  28
 Include/cpython/pyatomic_std.h         |  16
 Include/internal/pycore_interp.h       |   2
 Include/internal/pycore_qsbr.h         | 139
 Include/internal/pycore_runtime_init.h |   5
 Include/internal/pycore_tstate.h       |   5
 8 files changed, 207 insertions, 2 deletions
diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h
index 9b57741..737eed8 100644
--- a/Include/cpython/pyatomic.h
+++ b/Include/cpython/pyatomic.h
@@ -475,6 +475,12 @@ _Py_atomic_store_int_release(int *obj, int value);
 static inline int
 _Py_atomic_load_int_acquire(const int *obj);
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value);
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj);
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj);
 
diff --git a/Include/cpython/pyatomic_gcc.h b/Include/cpython/pyatomic_gcc.h
index bc74149..de23edf 100644
--- a/Include/cpython/pyatomic_gcc.h
+++ b/Include/cpython/pyatomic_gcc.h
@@ -504,6 +504,14 @@ static inline int
 _Py_atomic_load_int_acquire(const int *obj)
 { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); }
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h
index 6ab6401..9809d98 100644
--- a/Include/cpython/pyatomic_msc.h
+++ b/Include/cpython/pyatomic_msc.h
@@ -952,13 +952,39 @@ _Py_atomic_load_int_acquire(const int *obj)
 #endif
 }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+    *(uint64_t volatile *)obj = value;
+#elif defined(_M_ARM64)
+    _Py_atomic_ASSERT_ARG_TYPE(unsigned __int64);
+    __stlr64((unsigned __int64 volatile *)obj, (unsigned __int64)value);
+#else
+# error "no implementation of _Py_atomic_store_uint64_release"
+#endif
+}
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+    return *(uint64_t volatile *)obj;
+#elif defined(_M_ARM64)
+    _Py_atomic_ASSERT_ARG_TYPE(__int64);
+    return (uint64_t)__ldar64((unsigned __int64 volatile *)obj);
+#else
+# error "no implementation of _Py_atomic_load_uint64_acquire"
+#endif
+}
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 {
 #if defined(_M_X64) || defined(_M_IX86)
     return *(uint32_t volatile *)obj;
 #elif defined(_M_ARM64)
-    return (int)__ldar32((uint32_t volatile *)obj);
+    return (uint32_t)__ldar32((uint32_t volatile *)obj);
 #else
 # error "no implementation of _Py_atomic_load_uint32_acquire"
 #endif
diff --git a/Include/cpython/pyatomic_std.h b/Include/cpython/pyatomic_std.h
index d3004db..f5bd73a 100644
--- a/Include/cpython/pyatomic_std.h
+++ b/Include/cpython/pyatomic_std.h
@@ -887,6 +887,22 @@ _Py_atomic_load_int_acquire(const int *obj)
                                 memory_order_acquire);
 }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{
+    _Py_USING_STD;
+    atomic_store_explicit((_Atomic(uint64_t)*)obj, value,
+                          memory_order_release);
+}
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{
+    _Py_USING_STD;
+    return atomic_load_explicit((const _Atomic(uint64_t)*)obj,
+                                memory_order_acquire);
+}
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 {
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index c074471..567d6a9 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -30,6 +30,7 @@ extern "C" {
 #include "pycore_mimalloc.h"      // struct _mimalloc_interp_state
 #include "pycore_object_state.h"  // struct _py_object_state
 #include "pycore_obmalloc.h"      // struct _obmalloc_state
+#include "pycore_qsbr.h"          // struct _qsbr_state
 #include "pycore_tstate.h"        // _PyThreadStateImpl
 #include "pycore_tuple.h"         // struct _Py_tuple_state
 #include "pycore_typeobject.h"    // struct types_state
@@ -197,6 +198,7 @@ struct _is {
     struct _warnings_runtime_state warnings;
     struct atexit_state atexit;
     struct _stoptheworld_state stoptheworld;
+    struct _qsbr_shared qsbr;
 
 #if defined(Py_GIL_DISABLED)
     struct _mimalloc_interp_state mimalloc;
diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
new file mode 100644
index 0000000..475f00d
--- /dev/null
+++ b/Include/internal/pycore_qsbr.h
@@ -0,0 +1,139 @@
+// The QSBR APIs (quiescent state-based reclamation) provide a mechanism for
+// the free-threaded build to safely reclaim memory when there may be
+// concurrent accesses.
+//
+// Many operations in the free-threaded build are protected by locks. However,
+// in some cases, we want to allow reads to happen concurrently with updates.
+// In this case, we need to delay freeing ("reclaiming") any memory that may
+// be concurrently accessed by a reader. The QSBR APIs provide a way to do
+// this.
+#ifndef Py_INTERNAL_QSBR_H
+#define Py_INTERNAL_QSBR_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "pycore_lock.h"   // PyMutex
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+// The shared write sequence is always odd and incremented by two. Detached
+// threads are indicated by a read sequence of zero. This avoids collisions
+// between the offline state and any valid sequence number even if the
+// sequence numbers wrap around.
+#define QSBR_OFFLINE 0
+#define QSBR_INITIAL 1
+#define QSBR_INCR    2
+
+struct _qsbr_shared;
+struct _PyThreadStateImpl;  // forward declare to avoid circular dependency
+
+// Per-thread state
+struct _qsbr_thread_state {
+    // Last observed write sequence (or 0 if detached)
+    uint64_t seq;
+
+    // Shared (per-interpreter) QSBR state
+    struct _qsbr_shared *shared;
+
+    // Thread state (or NULL)
+    PyThreadState *tstate;
+
+    // Used to defer advancing write sequence a fixed number of times
+    int deferrals;
+
+    // Is this thread state allocated?
+    bool allocated;
+    struct _qsbr_thread_state *freelist_next;
+};
+
+// Padding to avoid false sharing
+struct _qsbr_pad {
+    struct _qsbr_thread_state qsbr;
+    char __padding[64 - sizeof(struct _qsbr_thread_state)];
+};
+
+// Per-interpreter state
+struct _qsbr_shared {
+    // Write sequence: always odd, incremented by two
+    uint64_t wr_seq;
+
+    // Minimum observed read sequence of all QSBR thread states
+    uint64_t rd_seq;
+
+    // Array of QSBR thread states.
+    struct _qsbr_pad *array;
+    Py_ssize_t size;
+
+    // Freelist of unused _qsbr_thread_states (protected by mutex)
+    PyMutex mutex;
+    struct _qsbr_thread_state *freelist;
+};
+
+static inline uint64_t
+_Py_qsbr_shared_current(struct _qsbr_shared *shared)
+{
+    return _Py_atomic_load_uint64_acquire(&shared->wr_seq);
+}
+
+// Reports a quiescent state: the caller no longer holds any pointer to shared
+// data not protected by locks or reference counts.
+static inline void
+_Py_qsbr_quiescent_state(struct _qsbr_thread_state *qsbr)
+{
+    uint64_t seq = _Py_qsbr_shared_current(qsbr->shared);
+    _Py_atomic_store_uint64_release(&qsbr->seq, seq);
+}
+
+// Advance the write sequence and return the new goal. This should be called
+// after data is removed. The returned goal is used with `_Py_qsbr_poll()` to
+// determine when it is safe to reclaim (free) the memory.
+extern uint64_t
+_Py_qsbr_advance(struct _qsbr_shared *shared);
+
+// Batches requests to advance the write sequence. This advances the write
+// sequence every N calls, which reduces overhead but increases time to
+// reclamation. Returns the new goal.
+extern uint64_t
+_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
+
+// Have the read sequences advanced to the given goal? If this returns true,
+// it is safe to reclaim any memory tagged with the goal (or earlier goal).
+extern bool
+_Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal);
+
+// Called when thread attaches to interpreter
+extern void
+_Py_qsbr_attach(struct _qsbr_thread_state *qsbr);
+
+// Called when thread detaches from interpreter
+extern void
+_Py_qsbr_detach(struct _qsbr_thread_state *qsbr);
+
+// Reserves (allocates) a QSBR state and returns its index.
+extern Py_ssize_t
+_Py_qsbr_reserve(PyInterpreterState *interp);
+
+// Associates a PyThreadState with the QSBR state at the given index
+extern void
+_Py_qsbr_register(struct _PyThreadStateImpl *tstate,
+                  PyInterpreterState *interp, Py_ssize_t index);
+
+// Disassociates a PyThreadState from the QSBR state and frees the QSBR state.
+extern void
+_Py_qsbr_unregister(struct _PyThreadStateImpl *tstate);
+
+extern void
+_Py_qsbr_fini(PyInterpreterState *interp);
+
+extern void
+_Py_qsbr_after_fork(struct _PyThreadStateImpl *tstate);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_QSBR_H */
diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h
index 7a05c10..be81604 100644
--- a/Include/internal/pycore_runtime_init.h
+++ b/Include/internal/pycore_runtime_init.h
@@ -17,6 +17,7 @@ extern "C" {
 #include "pycore_pyhash.h"        // pyhash_state_INIT
 #include "pycore_pymem_init.h"    // _pymem_allocators_standard_INIT
 #include "pycore_pythread.h"      // _pythread_RUNTIME_INIT
+#include "pycore_qsbr.h"          // QSBR_INITIAL
 #include "pycore_runtime_init_generated.h"  // _Py_bytes_characters_INIT
 #include "pycore_signal.h"        // _signals_RUNTIME_INIT
 #include "pycore_tracemalloc.h"   // _tracemalloc_runtime_state_INIT
@@ -169,6 +170,10 @@ extern PyTypeObject _PyExc_MemoryError;
                 { .threshold = 10, }, \
             }, \
         }, \
+        .qsbr = { \
+            .wr_seq = QSBR_INITIAL, \
+            .rd_seq = QSBR_INITIAL, \
+        }, \
         .dtoa = _dtoa_state_INIT(&(INTERP)), \
         .dict_state = _dict_state_INIT, \
         .func_state = { \
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 7fb9ab2..d0f980e 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -8,9 +8,10 @@ extern "C" {
 # error "this header requires Py_BUILD_CORE define"
 #endif
 
+#include "pycore_brc.h"       // struct _brc_thread_state
 #include "pycore_freelist.h"  // struct _Py_freelist_state
 #include "pycore_mimalloc.h"  // struct _mimalloc_thread_state
-#include "pycore_brc.h"       // struct _brc_thread_state
+#include "pycore_qsbr.h"      // struct qsbr
 
 
 static inline void
@@ -27,6 +28,8 @@ typedef struct _PyThreadStateImpl {
     // semi-public fields are in PyThreadState.
     PyThreadState base;
 
+    struct _qsbr_thread_state *qsbr;  // only used by free-threaded build
+
 #ifdef Py_GIL_DISABLED
     struct _gc_thread_state gc;
     struct _mimalloc_thread_state mimalloc;
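
The acquire/release pairing added to the pyatomic headers is what makes the
scheme sound: each reader publishes its last observed sequence with a release
store, and the poller reads it back with an acquire load. The following
standalone C11 toy (an illustration under simplified assumptions, not CPython
code) models the sequence arithmetic with two fixed reader slots; note that it
compares sequences with a plain <, ignoring the wraparound case that a real
implementation must handle with modular comparison.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the QSBR sequence scheme: the write sequence starts odd
 * and is bumped by two, so it can never equal OFFLINE (0). */
enum { OFFLINE = 0, INITIAL = 1, INCR = 2 };

static _Atomic uint64_t wr_seq = INITIAL;   /* like _qsbr_shared.wr_seq */
static _Atomic uint64_t reader_seq[2];      /* like each thread's qsbr->seq */

/* Writer: bump the sequence and take a goal (in the spirit of
 * _Py_qsbr_advance). */
static uint64_t advance(void)
{
    return atomic_fetch_add_explicit(&wr_seq, INCR,
                                     memory_order_release) + INCR;
}

/* Reader: report a quiescent state (in the spirit of
 * _Py_qsbr_quiescent_state). */
static void quiescent(int tid)
{
    uint64_t seq = atomic_load_explicit(&wr_seq, memory_order_acquire);
    atomic_store_explicit(&reader_seq[tid], seq, memory_order_release);
}

/* Poll: reclamation is safe once every attached reader has seen the goal;
 * offline (detached) readers are ignored. */
static bool poll_goal(uint64_t goal)
{
    for (int i = 0; i < 2; i++) {
        uint64_t seq = atomic_load_explicit(&reader_seq[i],
                                            memory_order_acquire);
        if (seq != OFFLINE && seq < goal) {   /* toy: no wraparound handling */
            return false;
        }
    }
    return true;
}

int main(void)
{
    reader_seq[0] = INITIAL;
    reader_seq[1] = OFFLINE;            /* detached thread does not block */
    uint64_t goal = advance();          /* goal becomes 3 */
    printf("safe before quiescence: %d\n", poll_goal(goal));  /* 0 */
    quiescent(0);
    printf("safe after quiescence:  %d\n", poll_goal(goal));  /* 1 */
    return 0;
}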