diff options
author | mpage <mpage@meta.com> | 2024-03-16 12:56:30 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-16 12:56:30 (GMT) |
commit | 33da0e844c922b3dcded75fbb9b7be67cb013a17 (patch) | |
tree | 940ddc29bb00e450f9a984e01bff7674f929543e /Lib/threading.py | |
parent | 86bc40dd414bceb3f93382cc9f670936de9d68be (diff) | |
download | cpython-33da0e844c922b3dcded75fbb9b7be67cb013a17.zip cpython-33da0e844c922b3dcded75fbb9b7be67cb013a17.tar.gz cpython-33da0e844c922b3dcded75fbb9b7be67cb013a17.tar.bz2 |
gh-114271: Fix race in `Thread.join()` (#114839)
There is a race between when `Thread._tstate_lock` is released[^1] in `Thread._wait_for_tstate_lock()`
and when `Thread._stop()` asserts[^2] that it is unlocked. Consider the following execution
involving threads A, B, and C:
1. A starts.
2. B joins A, blocking on its `_tstate_lock`.
3. C joins A, blocking on its `_tstate_lock`.
4. A finishes and releases its `_tstate_lock`.
5. B acquires A's `_tstate_lock` in `_wait_for_tstate_lock()`, releases it, but is swapped
out before calling `_stop()`.
6. C is scheduled, acquires A's `_tstate_lock` in `_wait_for_tstate_lock()` but is swapped
out before releasing it.
7. B is scheduled, calls `_stop()`, which asserts that A's `_tstate_lock` is not held.
However, C holds it, so the assertion fails.
The race can be reproduced[^3] by inserting sleeps at the appropriate points in
the threading code. To do so, run the `repro_join_race.py` from the linked repo.
There are two main parts to this PR:
1. `_tstate_lock` is replaced with an event that is attached to `PyThreadState`.
The event is set by the runtime prior to the thread being cleared (in the same
place that `_tstate_lock` was released). `Thread.join()` blocks waiting for the
event to be set.
2. `_PyInterpreterState_WaitForThreads()` provides the ability to wait for all
non-daemon threads to exit. To do so, an `is_daemon` predicate was added to
`PyThreadState`. This field is set each time a thread is created. `threading._shutdown()`
now calls into `_PyInterpreterState_WaitForThreads()` instead of waiting on
`_tstate_lock`s.
[^1]: https://github.com/python/cpython/blob/441affc9e7f419ef0b68f734505fa2f79fe653c7/Lib/threading.py#L1201
[^2]: https://github.com/python/cpython/blob/441affc9e7f419ef0b68f734505fa2f79fe653c7/Lib/threading.py#L1115
[^3]: https://github.com/mpage/cpython/commit/81946532792f938cd6f6ab4c4ff92a4edf61314f
---------
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Diffstat (limited to 'Lib/threading.py')
-rw-r--r-- | Lib/threading.py | 218 |
1 files changed, 32 insertions, 186 deletions
diff --git a/Lib/threading.py b/Lib/threading.py index ec89550..31ab77c 100644 --- a/Lib/threading.py +++ b/Lib/threading.py @@ -36,8 +36,11 @@ _start_joinable_thread = _thread.start_joinable_thread _daemon_threads_allowed = _thread.daemon_threads_allowed _allocate_lock = _thread.allocate_lock _LockType = _thread.LockType -_set_sentinel = _thread._set_sentinel +_thread_shutdown = _thread._shutdown +_make_thread_handle = _thread._make_thread_handle +_ThreadHandle = _thread._ThreadHandle get_ident = _thread.get_ident +_get_main_thread_ident = _thread._get_main_thread_ident _is_main_interpreter = _thread._is_main_interpreter try: get_native_id = _thread.get_native_id @@ -847,25 +850,6 @@ _active = {} # maps thread id to Thread object _limbo = {} _dangling = WeakSet() -# Set of Thread._tstate_lock locks of non-daemon threads used by _shutdown() -# to wait until all Python thread states get deleted: -# see Thread._set_tstate_lock(). -_shutdown_locks_lock = _allocate_lock() -_shutdown_locks = set() - -def _maintain_shutdown_locks(): - """ - Drop any shutdown locks that don't correspond to running threads anymore. - - Calling this from time to time avoids an ever-growing _shutdown_locks - set when Thread objects are not joined explicitly. See bpo-37788. - - This must be called with _shutdown_locks_lock acquired. - """ - # If a lock was released, the corresponding thread has exited - to_remove = [lock for lock in _shutdown_locks if not lock.locked()] - _shutdown_locks.difference_update(to_remove) - # Main class for threads @@ -930,10 +914,8 @@ class Thread: self._ident = None if _HAVE_THREAD_NATIVE_ID: self._native_id = None - self._tstate_lock = None - self._handle = None + self._handle = _ThreadHandle() self._started = Event() - self._is_stopped = False self._initialized = True # Copy of sys.stderr used by self._invoke_excepthook() self._stderr = _sys.stderr @@ -947,28 +929,18 @@ class Thread: if new_ident is not None: # This thread is alive. self._ident = new_ident - if self._handle is not None: - assert self._handle.ident == new_ident - # bpo-42350: If the fork happens when the thread is already stopped - # (ex: after threading._shutdown() has been called), _tstate_lock - # is None. Do nothing in this case. - if self._tstate_lock is not None: - self._tstate_lock._at_fork_reinit() - self._tstate_lock.acquire() + assert self._handle.ident == new_ident else: - # This thread isn't alive after fork: it doesn't have a tstate - # anymore. - self._is_stopped = True - self._tstate_lock = None - self._handle = None + # Otherwise, the thread is dead, Jim. _PyThread_AfterFork() + # already marked our handle done. + pass def __repr__(self): assert self._initialized, "Thread.__init__() was not called" status = "initial" if self._started.is_set(): status = "started" - self.is_alive() # easy way to get ._is_stopped set when appropriate - if self._is_stopped: + if self._handle.is_done(): status = "stopped" if self._daemonic: status += " daemon" @@ -996,7 +968,8 @@ class Thread: _limbo[self] = self try: # Start joinable thread - self._handle = _start_joinable_thread(self._bootstrap) + _start_joinable_thread(self._bootstrap, handle=self._handle, + daemon=self.daemon) except Exception: with _active_limbo_lock: del _limbo[self] @@ -1047,23 +1020,9 @@ class Thread: def _set_native_id(self): self._native_id = get_native_id() - def _set_tstate_lock(self): - """ - Set a lock object which will be released by the interpreter when - the underlying thread state (see pystate.h) gets deleted. - """ - self._tstate_lock = _set_sentinel() - self._tstate_lock.acquire() - - if not self.daemon: - with _shutdown_locks_lock: - _maintain_shutdown_locks() - _shutdown_locks.add(self._tstate_lock) - def _bootstrap_inner(self): try: self._set_ident() - self._set_tstate_lock() if _HAVE_THREAD_NATIVE_ID: self._set_native_id() self._started.set() @@ -1083,33 +1042,6 @@ class Thread: finally: self._delete() - def _stop(self): - # After calling ._stop(), .is_alive() returns False and .join() returns - # immediately. ._tstate_lock must be released before calling ._stop(). - # - # Normal case: C code at the end of the thread's life - # (release_sentinel in _threadmodule.c) releases ._tstate_lock, and - # that's detected by our ._wait_for_tstate_lock(), called by .join() - # and .is_alive(). Any number of threads _may_ call ._stop() - # simultaneously (for example, if multiple threads are blocked in - # .join() calls), and they're not serialized. That's harmless - - # they'll just make redundant rebindings of ._is_stopped and - # ._tstate_lock. Obscure: we rebind ._tstate_lock last so that the - # "assert self._is_stopped" in ._wait_for_tstate_lock() always works - # (the assert is executed only if ._tstate_lock is None). - # - # Special case: _main_thread releases ._tstate_lock via this - # module's _shutdown() function. - lock = self._tstate_lock - if lock is not None: - assert not lock.locked() - self._is_stopped = True - self._tstate_lock = None - if not self.daemon: - with _shutdown_locks_lock: - # Remove our lock and other released locks from _shutdown_locks - _maintain_shutdown_locks() - def _delete(self): "Remove current thread from the dict of currently running threads." with _active_limbo_lock: @@ -1150,47 +1082,12 @@ class Thread: if self is current_thread(): raise RuntimeError("cannot join current thread") - if timeout is None: - self._wait_for_tstate_lock() - else: - # the behavior of a negative timeout isn't documented, but - # historically .join(timeout=x) for x<0 has acted as if timeout=0 - self._wait_for_tstate_lock(timeout=max(timeout, 0)) - - if self._is_stopped: - self._join_os_thread() - - def _join_os_thread(self): - # self._handle may be cleared post-fork - if self._handle is not None: - self._handle.join() - - def _wait_for_tstate_lock(self, block=True, timeout=-1): - # Issue #18808: wait for the thread state to be gone. - # At the end of the thread's life, after all knowledge of the thread - # is removed from C data structures, C code releases our _tstate_lock. - # This method passes its arguments to _tstate_lock.acquire(). - # If the lock is acquired, the C code is done, and self._stop() is - # called. That sets ._is_stopped to True, and ._tstate_lock to None. - lock = self._tstate_lock - if lock is None: - # already determined that the C code is done - assert self._is_stopped - return + # the behavior of a negative timeout isn't documented, but + # historically .join(timeout=x) for x<0 has acted as if timeout=0 + if timeout is not None: + timeout = max(timeout, 0) - try: - if lock.acquire(block, timeout): - lock.release() - self._stop() - except: - if lock.locked(): - # bpo-45274: lock.acquire() acquired the lock, but the function - # was interrupted with an exception before reaching the - # lock.release(). It can happen if a signal handler raises an - # exception, like CTRL+C which raises KeyboardInterrupt. - lock.release() - self._stop() - raise + self._handle.join(timeout) @property def name(self): @@ -1241,13 +1138,7 @@ class Thread: """ assert self._initialized, "Thread.__init__() not called" - if self._is_stopped or not self._started.is_set(): - return False - self._wait_for_tstate_lock(False) - if not self._is_stopped: - return True - self._join_os_thread() - return False + return self._started.is_set() and not self._handle.is_done() @property def daemon(self): @@ -1456,18 +1347,14 @@ class _MainThread(Thread): def __init__(self): Thread.__init__(self, name="MainThread", daemon=False) - self._set_tstate_lock() self._started.set() - self._set_ident() + self._ident = _get_main_thread_ident() + self._handle = _make_thread_handle(self._ident) if _HAVE_THREAD_NATIVE_ID: self._set_native_id() with _active_limbo_lock: _active[self._ident] = self - def _join_os_thread(self): - # No ThreadHandle for main thread - pass - # Helper thread-local instance to detect when a _DummyThread # is collected. Not a part of the public API. @@ -1510,17 +1397,15 @@ class _DummyThread(Thread): daemon=_daemon_threads_allowed()) self._started.set() self._set_ident() + self._handle = _make_thread_handle(self._ident) if _HAVE_THREAD_NATIVE_ID: self._set_native_id() with _active_limbo_lock: _active[self._ident] = self _DeleteDummyThreadOnDel(self) - def _stop(self): - pass - def is_alive(self): - if not self._is_stopped and self._started.is_set(): + if not self._handle.is_done() and self._started.is_set(): return True raise RuntimeError("thread is not alive") @@ -1532,7 +1417,6 @@ class _DummyThread(Thread): self.__class__ = _MainThread self._name = 'MainThread' self._daemonic = False - self._set_tstate_lock() Thread._after_fork(self, new_ident=new_ident) @@ -1631,12 +1515,11 @@ def _shutdown(): """ Wait until the Python thread state of all non-daemon threads get deleted. """ - # Obscure: other threads may be waiting to join _main_thread. That's - # dubious, but some code does it. We can't wait for C code to release - # the main thread's tstate_lock - that won't happen until the interpreter - # is nearly dead. So we release it here. Note that just calling _stop() - # isn't enough: other threads may already be waiting on _tstate_lock. - if _main_thread._is_stopped and _is_main_interpreter(): + # Obscure: other threads may be waiting to join _main_thread. That's + # dubious, but some code does it. We can't wait for it to be marked as done + # normally - that won't happen until the interpreter is nearly dead. So + # mark it done here. + if _main_thread._handle.is_done() and _is_main_interpreter(): # _shutdown() was already called return @@ -1648,42 +1531,11 @@ def _shutdown(): for atexit_call in reversed(_threading_atexits): atexit_call() - # Main thread - if _main_thread.ident == get_ident(): - tlock = _main_thread._tstate_lock - # The main thread isn't finished yet, so its thread state lock can't - # have been released. - assert tlock is not None - if tlock.locked(): - # It should have been released already by - # _PyInterpreterState_SetNotRunningMain(), but there may be - # embedders that aren't calling that yet. - tlock.release() - _main_thread._stop() - else: - # bpo-1596321: _shutdown() must be called in the main thread. - # If the threading module was not imported by the main thread, - # _main_thread is the thread which imported the threading module. - # In this case, ignore _main_thread, similar behavior than for threads - # spawned by C libraries or using _thread.start_new_thread(). - pass - - # Join all non-deamon threads - while True: - with _shutdown_locks_lock: - locks = list(_shutdown_locks) - _shutdown_locks.clear() - - if not locks: - break - - for lock in locks: - # mimic Thread.join() - lock.acquire() - lock.release() - - # new threads can be spawned while we were waiting for the other - # threads to complete + if _is_main_interpreter(): + _main_thread._handle._set_done() + + # Wait for all non-daemon threads to exit. + _thread_shutdown() def main_thread(): @@ -1703,7 +1555,6 @@ def _after_fork(): # Reset _active_limbo_lock, in case we forked while the lock was held # by another (non-forked) thread. http://bugs.python.org/issue874900 global _active_limbo_lock, _main_thread - global _shutdown_locks_lock, _shutdown_locks _active_limbo_lock = RLock() # fork() only copied the current thread; clear references to others. @@ -1719,10 +1570,6 @@ def _after_fork(): _main_thread = current - # reset _shutdown() locks: threads re-register their _tstate_lock below - _shutdown_locks_lock = _allocate_lock() - _shutdown_locks = set() - with _active_limbo_lock: # Dangling thread instances must still have their locks reset, # because someone may join() them. @@ -1739,7 +1586,6 @@ def _after_fork(): else: # All the others are already stopped. thread._after_fork() - thread._stop() _limbo.clear() _active.clear() |