summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSam Gross <colesbury@gmail.com>2024-02-20 18:04:37 (GMT)
committerGitHub <noreply@github.com>2024-02-20 18:04:37 (GMT)
commite3ad6ca56f9b49db0694f432a870f907a8039f79 (patch)
treee04a42ed027f6595dec9efe051104d487cfb03a8
parentd207c7cd5a8c0d3e5f6c5eb947243e4afcd718b0 (diff)
downloadcpython-e3ad6ca56f9b49db0694f432a870f907a8039f79.zip
cpython-e3ad6ca56f9b49db0694f432a870f907a8039f79.tar.gz
cpython-e3ad6ca56f9b49db0694f432a870f907a8039f79.tar.bz2
gh-115103: Implement delayed free mechanism for free-threaded builds (#115367)
This adds `_PyMem_FreeDelayed()` and supporting functions. The `_PyMem_FreeDelayed()` function frees memory with the same allocator as `PyMem_Free()`, but after some delay to ensure that concurrent lock-free readers have finished.
-rw-r--r--Include/internal/pycore_interp.h1
-rw-r--r--Include/internal/pycore_pymem.h19
-rw-r--r--Include/internal/pycore_pymem_init.h5
-rw-r--r--Include/internal/pycore_runtime_init.h1
-rw-r--r--Include/internal/pycore_tstate.h1
-rw-r--r--Objects/obmalloc.c190
-rw-r--r--Python/pylifecycle.c3
-rw-r--r--Python/pystate.c6
8 files changed, 226 insertions, 0 deletions
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 06eba66..6a00aaf 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -231,6 +231,7 @@ struct _is {
struct _Py_dict_state dict_state;
struct _Py_exc_state exc_state;
+ struct _Py_mem_interp_free_queue mem_free_queue;
struct ast_state ast;
struct types_state types;
diff --git a/Include/internal/pycore_pymem.h b/Include/internal/pycore_pymem.h
index 1a72d07..1aea91a 100644
--- a/Include/internal/pycore_pymem.h
+++ b/Include/internal/pycore_pymem.h
@@ -1,6 +1,7 @@
#ifndef Py_INTERNAL_PYMEM_H
#define Py_INTERNAL_PYMEM_H
+#include "pycore_llist.h" // struct llist_node
#include "pycore_lock.h" // PyMutex
#ifdef __cplusplus
@@ -48,6 +49,11 @@ struct _pymem_allocators {
PyObjectArenaAllocator obj_arena;
};
+struct _Py_mem_interp_free_queue {
+ int has_work; // true if the queue is not empty
+ PyMutex mutex; // protects the queue
+ struct llist_node head; // queue of _mem_work_chunk items
+};
/* Set the memory allocator of the specified domain to the default.
Save the old allocator into *old_alloc if it's non-NULL.
@@ -110,6 +116,19 @@ extern int _PyMem_SetupAllocators(PyMemAllocatorName allocator);
/* Is the debug allocator enabled? */
extern int _PyMem_DebugEnabled(void);
+// Enqueue a pointer to be freed possibly after some delay.
+extern void _PyMem_FreeDelayed(void *ptr);
+
+// Periodically process delayed free requests.
+extern void _PyMem_ProcessDelayed(PyThreadState *tstate);
+
+// Abandon all thread-local delayed free requests and push them to the
+// interpreter's queue.
+extern void _PyMem_AbandonDelayed(PyThreadState *tstate);
+
+// On interpreter shutdown, frees all delayed free requests.
+extern void _PyMem_FiniDelayed(PyInterpreterState *interp);
+
#ifdef __cplusplus
}
#endif
diff --git a/Include/internal/pycore_pymem_init.h b/Include/internal/pycore_pymem_init.h
index 96c49ed..c593edc 100644
--- a/Include/internal/pycore_pymem_init.h
+++ b/Include/internal/pycore_pymem_init.h
@@ -92,6 +92,11 @@ extern void _PyMem_ArenaFree(void *, void *, size_t);
{ NULL, _PyMem_ArenaAlloc, _PyMem_ArenaFree }
+#define _Py_mem_free_queue_INIT(queue) \
+ { \
+ .head = LLIST_INIT(queue.head), \
+ }
+
#ifdef __cplusplus
}
#endif
diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h
index be81604..d093047 100644
--- a/Include/internal/pycore_runtime_init.h
+++ b/Include/internal/pycore_runtime_init.h
@@ -176,6 +176,7 @@ extern PyTypeObject _PyExc_MemoryError;
}, \
.dtoa = _dtoa_state_INIT(&(INTERP)), \
.dict_state = _dict_state_INIT, \
+ .mem_free_queue = _Py_mem_free_queue_INIT(INTERP.mem_free_queue), \
.func_state = { \
.next_version = 1, \
}, \
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index d0f980e..e268e6f 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -29,6 +29,7 @@ typedef struct _PyThreadStateImpl {
PyThreadState base;
struct _qsbr_thread_state *qsbr; // only used by free-threaded build
+ struct llist_node mem_free_queue; // delayed free queue
#ifdef Py_GIL_DISABLED
struct _gc_thread_state gc;
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 6a12c3d..9bf4eeb 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -948,6 +948,196 @@ _PyMem_Strdup(const char *str)
return copy;
}
+/***********************************************/
+/* Delayed freeing support for Py_GIL_DISABLED */
+/***********************************************/
+
+// So that sizeof(struct _mem_work_chunk) is 4096 bytes on 64-bit platforms.
+#define WORK_ITEMS_PER_CHUNK 254
+
+// A pointer to be freed once the QSBR read sequence reaches qsbr_goal.
+struct _mem_work_item {
+ void *ptr;
+ uint64_t qsbr_goal;
+};
+
+// A fixed-size buffer of pointers to be freed
+struct _mem_work_chunk {
+ // Linked list node of chunks in queue
+ struct llist_node node;
+
+ Py_ssize_t rd_idx; // index of next item to read
+ Py_ssize_t wr_idx; // index of next item to write
+ struct _mem_work_item array[WORK_ITEMS_PER_CHUNK];
+};
+
+void
+_PyMem_FreeDelayed(void *ptr)
+{
+#ifndef Py_GIL_DISABLED
+ PyMem_Free(ptr);
+#else
+ if (_PyRuntime.stoptheworld.world_stopped) {
+ // Free immediately if the world is stopped, including during
+ // interpreter shutdown.
+ PyMem_Free(ptr);
+ return;
+ }
+
+ _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+ struct llist_node *head = &tstate->mem_free_queue;
+
+ struct _mem_work_chunk *buf = NULL;
+ if (!llist_empty(head)) {
+ // Try to re-use the last buffer
+ buf = llist_data(head->prev, struct _mem_work_chunk, node);
+ if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
+ // already full
+ buf = NULL;
+ }
+ }
+
+ if (buf == NULL) {
+ buf = PyMem_Calloc(1, sizeof(*buf));
+ if (buf != NULL) {
+ llist_insert_tail(head, &buf->node);
+ }
+ }
+
+ if (buf == NULL) {
+ // failed to allocate a buffer, free immediately
+ _PyEval_StopTheWorld(tstate->base.interp);
+ PyMem_Free(ptr);
+ _PyEval_StartTheWorld(tstate->base.interp);
+ return;
+ }
+
+ assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
+ uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
+ buf->array[buf->wr_idx].ptr = ptr;
+ buf->array[buf->wr_idx].qsbr_goal = seq;
+ buf->wr_idx++;
+
+ if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
+ _PyMem_ProcessDelayed((PyThreadState *)tstate);
+ }
+#endif
+}
+
+static struct _mem_work_chunk *
+work_queue_first(struct llist_node *head)
+{
+ return llist_data(head->next, struct _mem_work_chunk, node);
+}
+
+static void
+process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr,
+ bool keep_empty)
+{
+ while (!llist_empty(head)) {
+ struct _mem_work_chunk *buf = work_queue_first(head);
+
+ while (buf->rd_idx < buf->wr_idx) {
+ struct _mem_work_item *item = &buf->array[buf->rd_idx];
+ if (!_Py_qsbr_poll(qsbr, item->qsbr_goal)) {
+ return;
+ }
+
+ PyMem_Free(item->ptr);
+ buf->rd_idx++;
+ }
+
+ assert(buf->rd_idx == buf->wr_idx);
+ if (keep_empty && buf->node.next == head) {
+ // Keep the last buffer in the queue to reduce re-allocations
+ buf->rd_idx = buf->wr_idx = 0;
+ return;
+ }
+
+ llist_remove(&buf->node);
+ PyMem_Free(buf);
+ }
+}
+
+static void
+process_interp_queue(struct _Py_mem_interp_free_queue *queue,
+ struct _qsbr_thread_state *qsbr)
+{
+ if (!_Py_atomic_load_int_relaxed(&queue->has_work)) {
+ return;
+ }
+
+ // Try to acquire the lock, but don't block if it's already held.
+ if (_PyMutex_LockTimed(&queue->mutex, 0, 0) == PY_LOCK_ACQUIRED) {
+ process_queue(&queue->head, qsbr, false);
+
+ int more_work = !llist_empty(&queue->head);
+ _Py_atomic_store_int_relaxed(&queue->has_work, more_work);
+
+ PyMutex_Unlock(&queue->mutex);
+ }
+}
+
+void
+_PyMem_ProcessDelayed(PyThreadState *tstate)
+{
+ PyInterpreterState *interp = tstate->interp;
+ _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
+
+ // Process thread-local work
+ process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true);
+
+ // Process shared interpreter work
+ process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr);
+}
+
+void
+_PyMem_AbandonDelayed(PyThreadState *tstate)
+{
+ PyInterpreterState *interp = tstate->interp;
+ struct llist_node *queue = &((_PyThreadStateImpl *)tstate)->mem_free_queue;
+
+ if (llist_empty(queue)) {
+ return;
+ }
+
+ // Check if the queue contains one empty buffer
+ struct _mem_work_chunk *buf = work_queue_first(queue);
+ if (buf->rd_idx == buf->wr_idx) {
+ llist_remove(&buf->node);
+ PyMem_Free(buf);
+ assert(llist_empty(queue));
+ return;
+ }
+
+ // Merge the thread's work queue into the interpreter's work queue.
+ PyMutex_Lock(&interp->mem_free_queue.mutex);
+ llist_concat(&interp->mem_free_queue.head, queue);
+ _Py_atomic_store_int_relaxed(&interp->mem_free_queue.has_work, 1);
+ PyMutex_Unlock(&interp->mem_free_queue.mutex);
+
+ assert(llist_empty(queue)); // the thread's queue is now empty
+}
+
+void
+_PyMem_FiniDelayed(PyInterpreterState *interp)
+{
+ struct llist_node *head = &interp->mem_free_queue.head;
+ while (!llist_empty(head)) {
+ struct _mem_work_chunk *buf = work_queue_first(head);
+
+ while (buf->rd_idx < buf->wr_idx) {
+ // Free the remaining items immediately. There should be no other
+ // threads accessing the memory at this point during shutdown.
+ struct _mem_work_item *item = &buf->array[buf->rd_idx];
+ PyMem_Free(item->ptr);
+ buf->rd_idx++;
+ }
+
+ llist_remove(&buf->node);
+ PyMem_Free(buf);
+ }
+}
/**************************/
/* the "object" allocator */
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 656d821..0448734 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1837,6 +1837,9 @@ finalize_interp_clear(PyThreadState *tstate)
finalize_interp_types(tstate->interp);
+ /* Free any delayed free requests immediately */
+ _PyMem_FiniDelayed(tstate->interp);
+
/* finalize_interp_types may allocate Python objects so we may need to
abandon mimalloc segments again */
_PyThreadState_ClearMimallocHeaps(tstate);
diff --git a/Python/pystate.c b/Python/pystate.c
index 1d8c096..bb8e24c 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -617,6 +617,7 @@ init_interpreter(PyInterpreterState *interp,
#ifdef Py_GIL_DISABLED
_Py_brc_init_state(interp);
#endif
+ llist_init(&interp->mem_free_queue.head);
for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) {
interp->monitors.tools[i] = 0;
}
@@ -1353,6 +1354,7 @@ init_threadstate(_PyThreadStateImpl *_tstate,
// Initialize biased reference counting inter-thread queue
_Py_brc_init_thread(tstate);
#endif
+ llist_init(&_tstate->mem_free_queue);
if (interp->stoptheworld.requested || _PyRuntime.stoptheworld.requested) {
// Start in the suspended state if there is an ongoing stop-the-world.
@@ -1574,6 +1576,7 @@ PyThreadState_Clear(PyThreadState *tstate)
// don't call _PyInterpreterState_SetNotRunningMain() yet.
tstate->on_delete(tstate->on_delete_data);
}
+
#ifdef Py_GIL_DISABLED
// Each thread should clear own freelists in free-threading builds.
struct _Py_object_freelists *freelists = _Py_object_freelists_GET();
@@ -1583,6 +1586,9 @@ PyThreadState_Clear(PyThreadState *tstate)
_Py_brc_remove_thread(tstate);
#endif
+ // Merge our queue of pointers to be freed into the interpreter queue.
+ _PyMem_AbandonDelayed(tstate);
+
_PyThreadState_ClearMimallocHeaps(tstate);
tstate->_status.cleared = 1;