summaryrefslogtreecommitdiffstats
path: root/Lib/threading.py
diff options
context:
space:
mode:
authormpage <mpage@meta.com>2024-03-16 12:56:30 (GMT)
committerGitHub <noreply@github.com>2024-03-16 12:56:30 (GMT)
commit33da0e844c922b3dcded75fbb9b7be67cb013a17 (patch)
tree940ddc29bb00e450f9a984e01bff7674f929543e /Lib/threading.py
parent86bc40dd414bceb3f93382cc9f670936de9d68be (diff)
downloadcpython-33da0e844c922b3dcded75fbb9b7be67cb013a17.zip
cpython-33da0e844c922b3dcded75fbb9b7be67cb013a17.tar.gz
cpython-33da0e844c922b3dcded75fbb9b7be67cb013a17.tar.bz2
gh-114271: Fix race in `Thread.join()` (#114839)
There is a race between when `Thread._tstate_lock` is released[^1] in `Thread._wait_for_tstate_lock()` and when `Thread._stop()` asserts[^2] that it is unlocked. Consider the following execution involving threads A, B, and C: 1. A starts. 2. B joins A, blocking on its `_tstate_lock`. 3. C joins A, blocking on its `_tstate_lock`. 4. A finishes and releases its `_tstate_lock`. 5. B acquires A's `_tstate_lock` in `_wait_for_tstate_lock()`, releases it, but is swapped out before calling `_stop()`. 6. C is scheduled, acquires A's `_tstate_lock` in `_wait_for_tstate_lock()` but is swapped out before releasing it. 7. B is scheduled, calls `_stop()`, which asserts that A's `_tstate_lock` is not held. However, C holds it, so the assertion fails. The race can be reproduced[^3] by inserting sleeps at the appropriate points in the threading code. To do so, run the `repro_join_race.py` from the linked repo. There are two main parts to this PR: 1. `_tstate_lock` is replaced with an event that is attached to `PyThreadState`. The event is set by the runtime prior to the thread being cleared (in the same place that `_tstate_lock` was released). `Thread.join()` blocks waiting for the event to be set. 2. `_PyInterpreterState_WaitForThreads()` provides the ability to wait for all non-daemon threads to exit. To do so, an `is_daemon` predicate was added to `PyThreadState`. This field is set each time a thread is created. `threading._shutdown()` now calls into `_PyInterpreterState_WaitForThreads()` instead of waiting on `_tstate_lock`s. [^1]: https://github.com/python/cpython/blob/441affc9e7f419ef0b68f734505fa2f79fe653c7/Lib/threading.py#L1201 [^2]: https://github.com/python/cpython/blob/441affc9e7f419ef0b68f734505fa2f79fe653c7/Lib/threading.py#L1115 [^3]: https://github.com/mpage/cpython/commit/81946532792f938cd6f6ab4c4ff92a4edf61314f --------- Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> Co-authored-by: Antoine Pitrou <antoine@python.org>
Diffstat (limited to 'Lib/threading.py')
-rw-r--r--Lib/threading.py218
1 files changed, 32 insertions, 186 deletions
diff --git a/Lib/threading.py b/Lib/threading.py
index ec89550..31ab77c 100644
--- a/Lib/threading.py
+++ b/Lib/threading.py
@@ -36,8 +36,11 @@ _start_joinable_thread = _thread.start_joinable_thread
_daemon_threads_allowed = _thread.daemon_threads_allowed
_allocate_lock = _thread.allocate_lock
_LockType = _thread.LockType
-_set_sentinel = _thread._set_sentinel
+_thread_shutdown = _thread._shutdown
+_make_thread_handle = _thread._make_thread_handle
+_ThreadHandle = _thread._ThreadHandle
get_ident = _thread.get_ident
+_get_main_thread_ident = _thread._get_main_thread_ident
_is_main_interpreter = _thread._is_main_interpreter
try:
get_native_id = _thread.get_native_id
@@ -847,25 +850,6 @@ _active = {} # maps thread id to Thread object
_limbo = {}
_dangling = WeakSet()
-# Set of Thread._tstate_lock locks of non-daemon threads used by _shutdown()
-# to wait until all Python thread states get deleted:
-# see Thread._set_tstate_lock().
-_shutdown_locks_lock = _allocate_lock()
-_shutdown_locks = set()
-
-def _maintain_shutdown_locks():
- """
- Drop any shutdown locks that don't correspond to running threads anymore.
-
- Calling this from time to time avoids an ever-growing _shutdown_locks
- set when Thread objects are not joined explicitly. See bpo-37788.
-
- This must be called with _shutdown_locks_lock acquired.
- """
- # If a lock was released, the corresponding thread has exited
- to_remove = [lock for lock in _shutdown_locks if not lock.locked()]
- _shutdown_locks.difference_update(to_remove)
-
# Main class for threads
@@ -930,10 +914,8 @@ class Thread:
self._ident = None
if _HAVE_THREAD_NATIVE_ID:
self._native_id = None
- self._tstate_lock = None
- self._handle = None
+ self._handle = _ThreadHandle()
self._started = Event()
- self._is_stopped = False
self._initialized = True
# Copy of sys.stderr used by self._invoke_excepthook()
self._stderr = _sys.stderr
@@ -947,28 +929,18 @@ class Thread:
if new_ident is not None:
# This thread is alive.
self._ident = new_ident
- if self._handle is not None:
- assert self._handle.ident == new_ident
- # bpo-42350: If the fork happens when the thread is already stopped
- # (ex: after threading._shutdown() has been called), _tstate_lock
- # is None. Do nothing in this case.
- if self._tstate_lock is not None:
- self._tstate_lock._at_fork_reinit()
- self._tstate_lock.acquire()
+ assert self._handle.ident == new_ident
else:
- # This thread isn't alive after fork: it doesn't have a tstate
- # anymore.
- self._is_stopped = True
- self._tstate_lock = None
- self._handle = None
+ # Otherwise, the thread is dead, Jim. _PyThread_AfterFork()
+ # already marked our handle done.
+ pass
def __repr__(self):
assert self._initialized, "Thread.__init__() was not called"
status = "initial"
if self._started.is_set():
status = "started"
- self.is_alive() # easy way to get ._is_stopped set when appropriate
- if self._is_stopped:
+ if self._handle.is_done():
status = "stopped"
if self._daemonic:
status += " daemon"
@@ -996,7 +968,8 @@ class Thread:
_limbo[self] = self
try:
# Start joinable thread
- self._handle = _start_joinable_thread(self._bootstrap)
+ _start_joinable_thread(self._bootstrap, handle=self._handle,
+ daemon=self.daemon)
except Exception:
with _active_limbo_lock:
del _limbo[self]
@@ -1047,23 +1020,9 @@ class Thread:
def _set_native_id(self):
self._native_id = get_native_id()
- def _set_tstate_lock(self):
- """
- Set a lock object which will be released by the interpreter when
- the underlying thread state (see pystate.h) gets deleted.
- """
- self._tstate_lock = _set_sentinel()
- self._tstate_lock.acquire()
-
- if not self.daemon:
- with _shutdown_locks_lock:
- _maintain_shutdown_locks()
- _shutdown_locks.add(self._tstate_lock)
-
def _bootstrap_inner(self):
try:
self._set_ident()
- self._set_tstate_lock()
if _HAVE_THREAD_NATIVE_ID:
self._set_native_id()
self._started.set()
@@ -1083,33 +1042,6 @@ class Thread:
finally:
self._delete()
- def _stop(self):
- # After calling ._stop(), .is_alive() returns False and .join() returns
- # immediately. ._tstate_lock must be released before calling ._stop().
- #
- # Normal case: C code at the end of the thread's life
- # (release_sentinel in _threadmodule.c) releases ._tstate_lock, and
- # that's detected by our ._wait_for_tstate_lock(), called by .join()
- # and .is_alive(). Any number of threads _may_ call ._stop()
- # simultaneously (for example, if multiple threads are blocked in
- # .join() calls), and they're not serialized. That's harmless -
- # they'll just make redundant rebindings of ._is_stopped and
- # ._tstate_lock. Obscure: we rebind ._tstate_lock last so that the
- # "assert self._is_stopped" in ._wait_for_tstate_lock() always works
- # (the assert is executed only if ._tstate_lock is None).
- #
- # Special case: _main_thread releases ._tstate_lock via this
- # module's _shutdown() function.
- lock = self._tstate_lock
- if lock is not None:
- assert not lock.locked()
- self._is_stopped = True
- self._tstate_lock = None
- if not self.daemon:
- with _shutdown_locks_lock:
- # Remove our lock and other released locks from _shutdown_locks
- _maintain_shutdown_locks()
-
def _delete(self):
"Remove current thread from the dict of currently running threads."
with _active_limbo_lock:
@@ -1150,47 +1082,12 @@ class Thread:
if self is current_thread():
raise RuntimeError("cannot join current thread")
- if timeout is None:
- self._wait_for_tstate_lock()
- else:
- # the behavior of a negative timeout isn't documented, but
- # historically .join(timeout=x) for x<0 has acted as if timeout=0
- self._wait_for_tstate_lock(timeout=max(timeout, 0))
-
- if self._is_stopped:
- self._join_os_thread()
-
- def _join_os_thread(self):
- # self._handle may be cleared post-fork
- if self._handle is not None:
- self._handle.join()
-
- def _wait_for_tstate_lock(self, block=True, timeout=-1):
- # Issue #18808: wait for the thread state to be gone.
- # At the end of the thread's life, after all knowledge of the thread
- # is removed from C data structures, C code releases our _tstate_lock.
- # This method passes its arguments to _tstate_lock.acquire().
- # If the lock is acquired, the C code is done, and self._stop() is
- # called. That sets ._is_stopped to True, and ._tstate_lock to None.
- lock = self._tstate_lock
- if lock is None:
- # already determined that the C code is done
- assert self._is_stopped
- return
+ # the behavior of a negative timeout isn't documented, but
+ # historically .join(timeout=x) for x<0 has acted as if timeout=0
+ if timeout is not None:
+ timeout = max(timeout, 0)
- try:
- if lock.acquire(block, timeout):
- lock.release()
- self._stop()
- except:
- if lock.locked():
- # bpo-45274: lock.acquire() acquired the lock, but the function
- # was interrupted with an exception before reaching the
- # lock.release(). It can happen if a signal handler raises an
- # exception, like CTRL+C which raises KeyboardInterrupt.
- lock.release()
- self._stop()
- raise
+ self._handle.join(timeout)
@property
def name(self):
@@ -1241,13 +1138,7 @@ class Thread:
"""
assert self._initialized, "Thread.__init__() not called"
- if self._is_stopped or not self._started.is_set():
- return False
- self._wait_for_tstate_lock(False)
- if not self._is_stopped:
- return True
- self._join_os_thread()
- return False
+ return self._started.is_set() and not self._handle.is_done()
@property
def daemon(self):
@@ -1456,18 +1347,14 @@ class _MainThread(Thread):
def __init__(self):
Thread.__init__(self, name="MainThread", daemon=False)
- self._set_tstate_lock()
self._started.set()
- self._set_ident()
+ self._ident = _get_main_thread_ident()
+ self._handle = _make_thread_handle(self._ident)
if _HAVE_THREAD_NATIVE_ID:
self._set_native_id()
with _active_limbo_lock:
_active[self._ident] = self
- def _join_os_thread(self):
- # No ThreadHandle for main thread
- pass
-
# Helper thread-local instance to detect when a _DummyThread
# is collected. Not a part of the public API.
@@ -1510,17 +1397,15 @@ class _DummyThread(Thread):
daemon=_daemon_threads_allowed())
self._started.set()
self._set_ident()
+ self._handle = _make_thread_handle(self._ident)
if _HAVE_THREAD_NATIVE_ID:
self._set_native_id()
with _active_limbo_lock:
_active[self._ident] = self
_DeleteDummyThreadOnDel(self)
- def _stop(self):
- pass
-
def is_alive(self):
- if not self._is_stopped and self._started.is_set():
+ if not self._handle.is_done() and self._started.is_set():
return True
raise RuntimeError("thread is not alive")
@@ -1532,7 +1417,6 @@ class _DummyThread(Thread):
self.__class__ = _MainThread
self._name = 'MainThread'
self._daemonic = False
- self._set_tstate_lock()
Thread._after_fork(self, new_ident=new_ident)
@@ -1631,12 +1515,11 @@ def _shutdown():
"""
Wait until the Python thread state of all non-daemon threads get deleted.
"""
- # Obscure: other threads may be waiting to join _main_thread. That's
- # dubious, but some code does it. We can't wait for C code to release
- # the main thread's tstate_lock - that won't happen until the interpreter
- # is nearly dead. So we release it here. Note that just calling _stop()
- # isn't enough: other threads may already be waiting on _tstate_lock.
- if _main_thread._is_stopped and _is_main_interpreter():
+ # Obscure: other threads may be waiting to join _main_thread. That's
+ # dubious, but some code does it. We can't wait for it to be marked as done
+ # normally - that won't happen until the interpreter is nearly dead. So
+ # mark it done here.
+ if _main_thread._handle.is_done() and _is_main_interpreter():
# _shutdown() was already called
return
@@ -1648,42 +1531,11 @@ def _shutdown():
for atexit_call in reversed(_threading_atexits):
atexit_call()
- # Main thread
- if _main_thread.ident == get_ident():
- tlock = _main_thread._tstate_lock
- # The main thread isn't finished yet, so its thread state lock can't
- # have been released.
- assert tlock is not None
- if tlock.locked():
- # It should have been released already by
- # _PyInterpreterState_SetNotRunningMain(), but there may be
- # embedders that aren't calling that yet.
- tlock.release()
- _main_thread._stop()
- else:
- # bpo-1596321: _shutdown() must be called in the main thread.
- # If the threading module was not imported by the main thread,
- # _main_thread is the thread which imported the threading module.
- # In this case, ignore _main_thread, similar behavior than for threads
- # spawned by C libraries or using _thread.start_new_thread().
- pass
-
- # Join all non-deamon threads
- while True:
- with _shutdown_locks_lock:
- locks = list(_shutdown_locks)
- _shutdown_locks.clear()
-
- if not locks:
- break
-
- for lock in locks:
- # mimic Thread.join()
- lock.acquire()
- lock.release()
-
- # new threads can be spawned while we were waiting for the other
- # threads to complete
+ if _is_main_interpreter():
+ _main_thread._handle._set_done()
+
+ # Wait for all non-daemon threads to exit.
+ _thread_shutdown()
def main_thread():
@@ -1703,7 +1555,6 @@ def _after_fork():
# Reset _active_limbo_lock, in case we forked while the lock was held
# by another (non-forked) thread. http://bugs.python.org/issue874900
global _active_limbo_lock, _main_thread
- global _shutdown_locks_lock, _shutdown_locks
_active_limbo_lock = RLock()
# fork() only copied the current thread; clear references to others.
@@ -1719,10 +1570,6 @@ def _after_fork():
_main_thread = current
- # reset _shutdown() locks: threads re-register their _tstate_lock below
- _shutdown_locks_lock = _allocate_lock()
- _shutdown_locks = set()
-
with _active_limbo_lock:
# Dangling thread instances must still have their locks reset,
# because someone may join() them.
@@ -1739,7 +1586,6 @@ def _after_fork():
else:
# All the others are already stopped.
thread._after_fork()
- thread._stop()
_limbo.clear()
_active.clear()