author     Sam Gross <colesbury@gmail.com>    2024-02-16 16:22:27 (GMT)
committer  GitHub <noreply@github.com>        2024-02-16 16:22:27 (GMT)
commit     b24c9161a651f549ed48f4b4dba8996fe9cc4e09 (patch)
tree       549acacced5ca0b1923e0df94b40ef633c6cdf8f
parent     f92857a93016aa26ba93959d2bdb690ef52e7f07 (diff)
gh-112529: Make the GC scheduling thread-safe (#114880)
The GC keeps track of the number of allocations (less deallocations) since the last collection. This change buffers that count in thread-local state and uses atomic operations to update the per-interpreter count; the thread-local buffering avoids contention on shared state. A consequence is that GC scheduling is less precise, so "test_sneaky_frame_object" is skipped because it requires that the GC run exactly after a frame object is allocated.
-rw-r--r--  Include/internal/pycore_gc.h       7
-rw-r--r--  Include/internal/pycore_tstate.h   1
-rw-r--r--  Lib/test/test_frame.py             3
-rw-r--r--  Lib/test/test_gc.py                1
-rw-r--r--  Modules/gcmodule.c                10
-rw-r--r--  Objects/typeobject.c               2
-rw-r--r--  Python/gc_free_threading.c        63
7 files changed, 71 insertions, 16 deletions
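
Before the full diff, here is a minimal standalone sketch of the buffering scheme the commit message describes: each thread accumulates allocations (and deallocations) in a thread-local counter and only folds it into the shared count with an atomic add once the buffer reaches a threshold. This is an illustration only, written with C11 _Thread_local and <stdatomic.h> rather than CPython's internal _Py_atomic_* helpers; the function names (record_allocation_sketch, record_deallocation_sketch) and the global variable are invented for the example, while the 512 threshold mirrors LOCAL_ALLOC_COUNT_THRESHOLD from the diff below.

/* Sketch of thread-local buffering of an allocation count (not CPython code). */
#include <stdatomic.h>
#include <stdio.h>

#define LOCAL_COUNT_THRESHOLD 512   /* same idea as LOCAL_ALLOC_COUNT_THRESHOLD */

static atomic_long global_alloc_count;        /* shared, per-"interpreter" count */
static _Thread_local long local_alloc_count;  /* per-thread buffer */

/* Called on every allocation: a cheap thread-local increment, with an
 * occasional atomic flush once the buffer reaches the threshold. */
static void
record_allocation_sketch(void)
{
    if (++local_alloc_count >= LOCAL_COUNT_THRESHOLD) {
        atomic_fetch_add_explicit(&global_alloc_count, local_alloc_count,
                                  memory_order_relaxed);
        local_alloc_count = 0;
        /* The real code additionally checks gc_should_collect() here and
         * may schedule a collection. */
    }
}

/* Called on every deallocation: the mirror image, flushing at -threshold. */
static void
record_deallocation_sketch(void)
{
    if (--local_alloc_count <= -LOCAL_COUNT_THRESHOLD) {
        atomic_fetch_add_explicit(&global_alloc_count, local_alloc_count,
                                  memory_order_relaxed);
        local_alloc_count = 0;
    }
}

int
main(void)
{
    /* Single-threaded demo of the pattern; in CPython each thread has its
     * own local_alloc_count equivalent in _PyThreadStateImpl. */
    for (int i = 0; i < 10000; i++) {
        record_allocation_sketch();
    }
    for (int i = 0; i < 3000; i++) {
        record_deallocation_sketch();
    }
    /* The global count lags the true net count by up to the sum of the
     * threads' local buffers, which is why gc.get_count() is less precise. */
    printf("global (buffered) count: %ld, local remainder: %ld\n",
           (long)atomic_load(&global_alloc_count), local_alloc_count);
    return 0;
}

The trade-off is deliberate: a larger threshold means fewer atomic operations on the shared counter but a larger possible error in the global count, which is what makes the skipped tests below depend on "precise" GC scheduling.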
diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h
index 582a16b..a98864f 100644
--- a/Include/internal/pycore_gc.h
+++ b/Include/internal/pycore_gc.h
@@ -260,6 +260,13 @@ struct _gc_runtime_state {
Py_ssize_t long_lived_pending;
};
+#ifdef Py_GIL_DISABLED
+struct _gc_thread_state {
+ /* Thread-local allocation count. */
+ Py_ssize_t alloc_count;
+};
+#endif
+
extern void _PyGC_InitState(struct _gc_runtime_state *);
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 97aa85a..7fb9ab2 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -28,6 +28,7 @@ typedef struct _PyThreadStateImpl {
PyThreadState base;
#ifdef Py_GIL_DISABLED
+ struct _gc_thread_state gc;
struct _mimalloc_thread_state mimalloc;
struct _Py_object_freelists freelists;
struct _brc_thread_state brc;
diff --git a/Lib/test/test_frame.py b/Lib/test/test_frame.py
index baed03d..f88206d 100644
--- a/Lib/test/test_frame.py
+++ b/Lib/test/test_frame.py
@@ -13,7 +13,7 @@ except ImportError:
_testcapi = None
from test import support
-from test.support import threading_helper
+from test.support import threading_helper, Py_GIL_DISABLED
from test.support.script_helper import assert_python_ok
@@ -294,6 +294,7 @@ class TestIncompleteFrameAreInvisible(unittest.TestCase):
assert_python_ok("-c", code)
@support.cpython_only
+ @unittest.skipIf(Py_GIL_DISABLED, "test requires precise GC scheduling")
def test_sneaky_frame_object(self):
def trace(frame, event, arg):
diff --git a/Lib/test/test_gc.py b/Lib/test/test_gc.py
index b01f344..dd09643 100644
--- a/Lib/test/test_gc.py
+++ b/Lib/test/test_gc.py
@@ -363,6 +363,7 @@ class GCTests(unittest.TestCase):
# To minimize variations, though, we first store the get_count() results
# and check them at the end.
@refcount_test
+ @unittest.skipIf(Py_GIL_DISABLED, 'needs precise allocation counts')
def test_get_count(self):
gc.collect()
a, b, c = gc.get_count()
diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c
index a2b66b9..961165e 100644
--- a/Modules/gcmodule.c
+++ b/Modules/gcmodule.c
@@ -201,6 +201,16 @@ gc_get_count_impl(PyObject *module)
/*[clinic end generated code: output=354012e67b16398f input=a392794a08251751]*/
{
GCState *gcstate = get_gc_state();
+
+#ifdef Py_GIL_DISABLED
+ _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+ struct _gc_thread_state *gc = &tstate->gc;
+
+ // Flush the local allocation count to the global count
+ _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+ gc->alloc_count = 0;
+#endif
+
return Py_BuildValue("(iii)",
gcstate->generations[0].count,
gcstate->generations[1].count,
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 0118ee2..2e25c20 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -1835,6 +1835,8 @@ _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems)
if (presize) {
((PyObject **)alloc)[0] = NULL;
((PyObject **)alloc)[1] = NULL;
+ }
+ if (PyType_IS_GC(type)) {
_PyObject_GC_Link(obj);
}
memset(obj, '\0', size);
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 3dc1dc1..a758c99 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -23,6 +23,11 @@ typedef struct _gc_runtime_state GCState;
# define GC_DEBUG
#endif
+// Each thread buffers the count of allocated objects in a thread-local
+// variable up to +/- this amount to reduce the overhead of updating
+// the global count.
+#define LOCAL_ALLOC_COUNT_THRESHOLD 512
+
// Automatically choose the generation that needs collecting.
#define GENERATION_AUTO (-1)
@@ -960,6 +965,41 @@ gc_should_collect(GCState *gcstate)
}
static void
+record_allocation(PyThreadState *tstate)
+{
+ struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;
+
+ // We buffer the allocation count to avoid the overhead of atomic
+ // operations for every allocation.
+ gc->alloc_count++;
+ if (gc->alloc_count >= LOCAL_ALLOC_COUNT_THRESHOLD) {
+ // TODO: Use Py_ssize_t for the generation count.
+ GCState *gcstate = &tstate->interp->gc;
+ _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+ gc->alloc_count = 0;
+
+ if (gc_should_collect(gcstate) &&
+ !_Py_atomic_load_int_relaxed(&gcstate->collecting))
+ {
+ _Py_ScheduleGC(tstate->interp);
+ }
+ }
+}
+
+static void
+record_deallocation(PyThreadState *tstate)
+{
+ struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;
+
+ gc->alloc_count--;
+ if (gc->alloc_count <= -LOCAL_ALLOC_COUNT_THRESHOLD) {
+ GCState *gcstate = &tstate->interp->gc;
+ _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+ gc->alloc_count = 0;
+ }
+}
+
+static void
gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
{
_PyEval_StopTheWorld(interp);
@@ -981,6 +1021,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
}
}
+ // Record the number of live GC objects
+ interp->gc.long_lived_total = state->long_lived_total;
+
// Clear weakrefs and enqueue callbacks (but do not call them).
clear_weakrefs(state);
_PyEval_StartTheWorld(interp);
@@ -1090,7 +1133,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
m = state.collected;
n = state.uncollectable;
- gcstate->long_lived_total = state.long_lived_total;
if (gcstate->debug & _PyGC_DEBUG_STATS) {
double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
@@ -1530,15 +1572,7 @@ _Py_ScheduleGC(PyInterpreterState *interp)
void
_PyObject_GC_Link(PyObject *op)
{
- PyThreadState *tstate = _PyThreadState_GET();
- GCState *gcstate = &tstate->interp->gc;
- gcstate->generations[0].count++;
-
- if (gc_should_collect(gcstate) &&
- !_Py_atomic_load_int_relaxed(&gcstate->collecting))
- {
- _Py_ScheduleGC(tstate->interp);
- }
+ record_allocation(_PyThreadState_GET());
}
void
@@ -1564,7 +1598,7 @@ gc_alloc(PyTypeObject *tp, size_t basicsize, size_t presize)
((PyObject **)mem)[1] = NULL;
}
PyObject *op = (PyObject *)(mem + presize);
- _PyObject_GC_Link(op);
+ record_allocation(tstate);
return op;
}
@@ -1646,10 +1680,9 @@ PyObject_GC_Del(void *op)
PyErr_SetRaisedException(exc);
#endif
}
- GCState *gcstate = get_gc_state();
- if (gcstate->generations[0].count > 0) {
- gcstate->generations[0].count--;
- }
+
+ record_deallocation(_PyThreadState_GET());
+
PyObject_Free(((char *)op)-presize);
}