author    Sam Gross <colesbury@gmail.com>  2024-02-16 20:25:19 (GMT)
committer GitHub <noreply@github.com>      2024-02-16 20:25:19 (GMT)
commit    590319072773bd6cdcca655c420d3adb84838e96
tree      b2e7ec5cb49ef21d0fe9f35b9f32d69e8578fb86
parent    711f42de2e3749208cfa7effa0d45b04e4e1fdd4
gh-115103: Implement delayed memory reclamation (QSBR) (#115180)
This adds a safe memory reclamation scheme based on FreeBSD's "GUS" and quiescent state based reclamation (QSBR). The API provides a mechanism for callers to detect when it is safe to free memory that may be concurrently accessed by readers.
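For orientation, here is a minimal write-side sketch of the pattern the commit message describes, using the _Py_qsbr_advance()/_Py_qsbr_poll() API added in Include/internal/pycore_qsbr.h below. The pending-free list and the helper names (delayed_item, delayed_free, process_delayed) are illustrative, not part of this commit; the real consumer code lives outside the Include/ tree.

/* Illustrative sketch only -- not code from this commit. */
struct delayed_item {
    void *ptr;                        /* memory awaiting reclamation */
    uint64_t goal;                    /* goal from _Py_qsbr_advance() */
    struct delayed_item *next;
};

static struct delayed_item *pending;  /* hypothetical pending-free list */

static void
delayed_free(struct _qsbr_thread_state *qsbr, void *ptr)
{
    struct delayed_item *item = PyMem_RawMalloc(sizeof(*item));
    item->ptr = ptr;
    /* The pointer was already unlinked from shared data; record the
       sequence goal that readers must pass before it can be freed. */
    item->goal = _Py_qsbr_advance(qsbr->shared);
    item->next = pending;
    pending = item;
}

static void
process_delayed(struct _qsbr_thread_state *qsbr)
{
    /* LIFO list: the head carries the newest goal, so if the head is
       safe, everything behind it is safe too. This is conservative:
       older items wait until the newest goal has been reached. */
    while (pending != NULL && _Py_qsbr_poll(qsbr, pending->goal)) {
        struct delayed_item *item = pending;
        pending = item->next;
        PyMem_RawFree(item->ptr);     /* no reader can still see this */
        PyMem_RawFree(item);
    }
}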
Diffstat (limited to 'Include')
-rw-r--r--  Include/cpython/pyatomic.h               |   6
-rw-r--r--  Include/cpython/pyatomic_gcc.h           |   8
-rw-r--r--  Include/cpython/pyatomic_msc.h           |  28
-rw-r--r--  Include/cpython/pyatomic_std.h           |  16
-rw-r--r--  Include/internal/pycore_interp.h         |   2
-rw-r--r--  Include/internal/pycore_qsbr.h           | 139
-rw-r--r--  Include/internal/pycore_runtime_init.h   |   5
-rw-r--r--  Include/internal/pycore_tstate.h         |   5
8 files changed, 207 insertions(+), 2 deletions(-)
diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h
index 9b57741..737eed8 100644
--- a/Include/cpython/pyatomic.h
+++ b/Include/cpython/pyatomic.h
@@ -475,6 +475,12 @@ _Py_atomic_store_int_release(int *obj, int value);
static inline int
_Py_atomic_load_int_acquire(const int *obj);
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value);
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj);
+
static inline uint32_t
_Py_atomic_load_uint32_acquire(const uint32_t *obj);
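These two declarations are the primitives the QSBR sequence counters rely on. As a rough standalone illustration of what a release store paired with an acquire load guarantees (plain C11, assuming nothing from CPython):

#include <stdatomic.h>
#include <stdint.h>

static int payload;              /* plain data published by the writer */
static _Atomic uint64_t seq;     /* sequence counter, initially 0 */

void writer(void)
{
    payload = 42;                /* 1: write the data */
    /* 2: release store: the payload write above cannot be reordered
       past this store, so a reader that observes seq == 1 is
       guaranteed to observe payload == 42. */
    atomic_store_explicit(&seq, 1, memory_order_release);
}

int reader(void)
{
    /* Acquire load: reads after this load cannot be reordered
       before it. */
    if (atomic_load_explicit(&seq, memory_order_acquire) == 1) {
        return payload;          /* guaranteed to be 42 */
    }
    return -1;                   /* writer has not published yet */
}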
diff --git a/Include/cpython/pyatomic_gcc.h b/Include/cpython/pyatomic_gcc.h
index bc74149..de23edf 100644
--- a/Include/cpython/pyatomic_gcc.h
+++ b/Include/cpython/pyatomic_gcc.h
@@ -504,6 +504,14 @@ static inline int
_Py_atomic_load_int_acquire(const int *obj)
{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); }
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
static inline uint32_t
_Py_atomic_load_uint32_acquire(const uint32_t *obj)
{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h
index 6ab6401..9809d98 100644
--- a/Include/cpython/pyatomic_msc.h
+++ b/Include/cpython/pyatomic_msc.h
@@ -952,13 +952,39 @@ _Py_atomic_load_int_acquire(const int *obj)
#endif
}
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+ *(uint64_t volatile *)obj = value;
+#elif defined(_M_ARM64)
+ _Py_atomic_ASSERT_ARG_TYPE(unsigned __int64);
+ __stlr64((unsigned __int64 volatile *)obj, (unsigned __int64)value);
+#else
+# error "no implementation of _Py_atomic_store_uint64_release"
+#endif
+}
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+ return *(uint64_t volatile *)obj;
+#elif defined(_M_ARM64)
+ _Py_atomic_ASSERT_ARG_TYPE(__int64);
+ return (uint64_t)__ldar64((unsigned __int64 volatile *)obj);
+#else
+# error "no implementation of _Py_atomic_load_uint64_acquire"
+#endif
+}
+
static inline uint32_t
_Py_atomic_load_uint32_acquire(const uint32_t *obj)
{
#if defined(_M_X64) || defined(_M_IX86)
return *(uint32_t volatile *)obj;
#elif defined(_M_ARM64)
- return (int)__ldar32((uint32_t volatile *)obj);
+ return (uint32_t)__ldar32((uint32_t volatile *)obj);
#else
# error "no implementation of _Py_atomic_load_uint32_acquire"
#endif
diff --git a/Include/cpython/pyatomic_std.h b/Include/cpython/pyatomic_std.h
index d3004db..f5bd73a 100644
--- a/Include/cpython/pyatomic_std.h
+++ b/Include/cpython/pyatomic_std.h
@@ -887,6 +887,22 @@ _Py_atomic_load_int_acquire(const int *obj)
memory_order_acquire);
}
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{
+ _Py_USING_STD;
+ atomic_store_explicit((_Atomic(uint64_t)*)obj, value,
+ memory_order_release);
+}
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{
+ _Py_USING_STD;
+ return atomic_load_explicit((const _Atomic(uint64_t)*)obj,
+ memory_order_acquire);
+}
+
static inline uint32_t
_Py_atomic_load_uint32_acquire(const uint32_t *obj)
{
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index c074471..567d6a9 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -30,6 +30,7 @@ extern "C" {
#include "pycore_mimalloc.h" // struct _mimalloc_interp_state
#include "pycore_object_state.h" // struct _py_object_state
#include "pycore_obmalloc.h" // struct _obmalloc_state
+#include "pycore_qsbr.h" // struct _qsbr_state
#include "pycore_tstate.h" // _PyThreadStateImpl
#include "pycore_tuple.h" // struct _Py_tuple_state
#include "pycore_typeobject.h" // struct types_state
@@ -197,6 +198,7 @@ struct _is {
struct _warnings_runtime_state warnings;
struct atexit_state atexit;
struct _stoptheworld_state stoptheworld;
+ struct _qsbr_shared qsbr;
#if defined(Py_GIL_DISABLED)
struct _mimalloc_interp_state mimalloc;
diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
new file mode 100644
index 0000000..475f00d
--- /dev/null
+++ b/Include/internal/pycore_qsbr.h
@@ -0,0 +1,139 @@
+// The QSBR APIs (quiescent state-based reclamation) provide a mechanism for
+// the free-threaded build to safely reclaim memory when there may be
+// concurrent accesses.
+//
+// Many operations in the free-threaded build are protected by locks. However,
+// in some cases, we want to allow reads to happen concurrently with updates.
+// In this case, we need to delay freeing ("reclaiming") any memory that may be
+// concurrently accessed by a reader. The QSBR APIs provide a way to do this.
+#ifndef Py_INTERNAL_QSBR_H
+#define Py_INTERNAL_QSBR_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "pycore_lock.h" // PyMutex
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+// The shared write sequence is always odd and incremented by two. Detached
+// threads are indicated by a read sequence of zero. This avoids collisions
+// between the offline state and any valid sequence number even if the
+// sequence numbers wrap around.
+#define QSBR_OFFLINE 0
+#define QSBR_INITIAL 1
+#define QSBR_INCR 2
+
+struct _qsbr_shared;
+struct _PyThreadStateImpl; // forward declare to avoid circular dependency
+
+// Per-thread state
+struct _qsbr_thread_state {
+ // Last observed write sequence (or 0 if detached)
+ uint64_t seq;
+
+ // Shared (per-interpreter) QSBR state
+ struct _qsbr_shared *shared;
+
+ // Thread state (or NULL)
+ PyThreadState *tstate;
+
+ // Used to defer advancing the write sequence a fixed number of times
+ int deferrals;
+
+ // Is this thread state allocated?
+ bool allocated;
+ struct _qsbr_thread_state *freelist_next;
+};
+
+// Padding to avoid false sharing
+struct _qsbr_pad {
+ struct _qsbr_thread_state qsbr;
+ char __padding[64 - sizeof(struct _qsbr_thread_state)];
+};
+
+// Per-interpreter state
+struct _qsbr_shared {
+ // Write sequence: always odd, incremented by two
+ uint64_t wr_seq;
+
+ // Minimum observed read sequence of all QSBR thread states
+ uint64_t rd_seq;
+
+ // Array of QSBR thread states.
+ struct _qsbr_pad *array;
+ Py_ssize_t size;
+
+ // Freelist of unused _qsbr_thread_states (protected by mutex)
+ PyMutex mutex;
+ struct _qsbr_thread_state *freelist;
+};
+
+static inline uint64_t
+_Py_qsbr_shared_current(struct _qsbr_shared *shared)
+{
+ return _Py_atomic_load_uint64_acquire(&shared->wr_seq);
+}
+
+// Reports a quiescent state: the caller no longer holds any pointer to shared
+// data not protected by locks or reference counts.
+static inline void
+_Py_qsbr_quiescent_state(struct _qsbr_thread_state *qsbr)
+{
+ uint64_t seq = _Py_qsbr_shared_current(qsbr->shared);
+ _Py_atomic_store_uint64_release(&qsbr->seq, seq);
+}
+
+// Advance the write sequence and return the new goal. This should be called
+// after data is removed. The returned goal is used with `_Py_qsbr_poll()` to
+// determine when it is safe to reclaim (free) the memory.
+extern uint64_t
+_Py_qsbr_advance(struct _qsbr_shared *shared);
+
+// Batches requests to advance the write sequence. This advances the write
+// sequence every N calls, which reduces overhead but increases time to
+// reclamation. Returns the new goal.
+extern uint64_t
+_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
+
+// Have the read sequences advanced to the given goal? If this returns true,
+// it is safe to reclaim any memory tagged with the goal (or an earlier goal).
+extern bool
+_Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal);
+
+// Called when thread attaches to interpreter
+extern void
+_Py_qsbr_attach(struct _qsbr_thread_state *qsbr);
+
+// Called when thread detaches from interpreter
+extern void
+_Py_qsbr_detach(struct _qsbr_thread_state *qsbr);
+
+// Reserves (allocates) a QSBR state and returns its index.
+extern Py_ssize_t
+_Py_qsbr_reserve(PyInterpreterState *interp);
+
+// Associates a PyThreadState with the QSBR state at the given index
+extern void
+_Py_qsbr_register(struct _PyThreadStateImpl *tstate,
+ PyInterpreterState *interp, Py_ssize_t index);
+
+// Disassociates a PyThreadState from the QSBR state and frees the QSBR state.
+extern void
+_Py_qsbr_unregister(struct _PyThreadStateImpl *tstate);
+
+extern void
+_Py_qsbr_fini(PyInterpreterState *interp);
+
+extern void
+_Py_qsbr_after_fork(struct _PyThreadStateImpl *tstate);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_QSBR_H */
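To make the sequence arithmetic above concrete, here is a hedged sketch of how the poll side can be understood. The helper name qsbr_min_read_seq is hypothetical; the actual implementation lives in Python/qsbr.c, outside this diff, and must additionally compare sequence numbers modulo wraparound (plain < is used here for clarity).

/* Hypothetical sketch, not the code from Python/qsbr.c. */
static uint64_t
qsbr_min_read_seq(struct _qsbr_shared *shared)
{
    /* Start from the current write sequence; no reader can be ahead. */
    uint64_t min_seq = _Py_qsbr_shared_current(shared);
    for (Py_ssize_t i = 0; i < shared->size; i++) {
        struct _qsbr_thread_state *qsbr = &shared->array[i].qsbr;
        uint64_t seq = _Py_atomic_load_uint64_acquire(&qsbr->seq);
        /* Detached threads (QSBR_OFFLINE) hold no shared pointers. */
        if (seq != QSBR_OFFLINE && seq < min_seq) {
            min_seq = seq;
        }
    }
    return min_seq;
}

A goal returned by _Py_qsbr_advance() is the write sequence after the increment (1 -> 3 -> 5 -> ...); once the minimum read sequence reaches that goal, every attached thread has passed through a quiescent state after the data was unlinked, so _Py_qsbr_poll() can report that reclamation is safe.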
diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h
index 7a05c10..be81604 100644
--- a/Include/internal/pycore_runtime_init.h
+++ b/Include/internal/pycore_runtime_init.h
@@ -17,6 +17,7 @@ extern "C" {
#include "pycore_pyhash.h" // pyhash_state_INIT
#include "pycore_pymem_init.h" // _pymem_allocators_standard_INIT
#include "pycore_pythread.h" // _pythread_RUNTIME_INIT
+#include "pycore_qsbr.h" // QSBR_INITIAL
#include "pycore_runtime_init_generated.h" // _Py_bytes_characters_INIT
#include "pycore_signal.h" // _signals_RUNTIME_INIT
#include "pycore_tracemalloc.h" // _tracemalloc_runtime_state_INIT
@@ -169,6 +170,10 @@ extern PyTypeObject _PyExc_MemoryError;
{ .threshold = 10, }, \
}, \
}, \
+ .qsbr = { \
+ .wr_seq = QSBR_INITIAL, \
+ .rd_seq = QSBR_INITIAL, \
+ }, \
.dtoa = _dtoa_state_INIT(&(INTERP)), \
.dict_state = _dict_state_INIT, \
.func_state = { \
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 7fb9ab2..d0f980e 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -8,9 +8,10 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define"
#endif
+#include "pycore_brc.h" // struct _brc_thread_state
#include "pycore_freelist.h" // struct _Py_freelist_state
#include "pycore_mimalloc.h" // struct _mimalloc_thread_state
-#include "pycore_brc.h" // struct _brc_thread_state
+#include "pycore_qsbr.h" // struct qsbr
static inline void
@@ -27,6 +28,8 @@ typedef struct _PyThreadStateImpl {
// semi-public fields are in PyThreadState.
PyThreadState base;
+ struct _qsbr_thread_state *qsbr; // only used by free-threaded build
+
#ifdef Py_GIL_DISABLED
struct _gc_thread_state gc;
struct _mimalloc_thread_state mimalloc;