author     Sam Gross <colesbury@gmail.com>    2023-09-19 15:54:29 (GMT)
committer  GitHub <noreply@github.com>        2023-09-19 15:54:29 (GMT)
commit     0c89056fe59ac42f09978582479d40e58a236856 (patch)
tree       06cd5a790da2a6dd3862567419c25572f96ae373 /Python/lock.c
parent     0a31ff0050eec5079fd4c9cafd33b4e3e9afd9ab (diff)
gh-108724: Add PyMutex and _PyParkingLot APIs (gh-109344)
PyMutex is a one-byte lock with fast, inlineable lock and unlock functions for the common uncontended case. The design is based on WebKit's WTF::Lock.
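For orientation, the uncontended fast path comes down to a single atomic compare-exchange on the one-byte state. Below is a minimal sketch of the inlineable lock/unlock pair, assuming the _Py_UNLOCKED/_Py_LOCKED states and atomic helpers used in Python/lock.c; the real inline definitions live in pycore_lock.h, which is not part of this diff, and may differ in detail:

    // Sketch only: the actual inline functions are in pycore_lock.h.
    static inline void
    PyMutex_Lock(PyMutex *m)
    {
        uint8_t expected = _Py_UNLOCKED;
        // One compare-exchange in the uncontended case; contention falls
        // through to the slow path in Python/lock.c.
        if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_LOCKED)) {
            _PyMutex_LockSlow(m);
        }
    }

    static inline void
    PyMutex_Unlock(PyMutex *m)
    {
        uint8_t expected = _Py_LOCKED;
        // Fails if a waiter is parked (_Py_HAS_PARKED is set); the slow
        // path then wakes one waiter, handing the lock off directly if the
        // waiter has been waiting longer than TIME_TO_BE_FAIR_NS.
        if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_UNLOCKED)) {
            _PyMutex_UnlockSlow(m);
        }
    }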
PyMutex is built using the _PyParkingLot APIs, which provide a cross-platform futex-like API (based on WebKit's WTF::ParkingLot). This internal API will be used to build other synchronization primitives needed for PEP 703, such as one-time initialization and events.
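The core contract of the parking lot is futex-like: a thread parks on an address only if that address still holds an expected value, which closes the race between checking a flag and going to sleep. As an illustration, here is a hypothetical one-shot flag (PyEvent in this PR is the real equivalent); the signatures follow the calls made in the diff below, and the full API lives in pycore_parking_lot.h:

    // Hypothetical one-shot flag, for illustration only.
    struct one_shot { uint8_t v; };     // 0 = not set, 1 = set

    static void
    one_shot_wait(struct one_shot *f)
    {
        for (;;) {
            if (_Py_atomic_load_uint8(&f->v) == 1) {
                return;                 // already signaled
            }
            uint8_t expected = 0;
            // Sleeps only while f->v is still 0; a concurrent signal makes
            // the park return immediately instead of missing the wakeup.
            _PyParkingLot_Park(&f->v, &expected, sizeof(expected),
                               /*timeout_ns=*/-1, /*arg=*/NULL, /*detach=*/1);
        }
    }

    static void
    one_shot_signal(struct one_shot *f)
    {
        _Py_atomic_store_uint8(&f->v, 1);
        _PyParkingLot_UnparkAll(&f->v); // wake every parked waiter
    }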
This also includes tests and a mini benchmark in Tools/lockbench/lockbench.py to compare with the existing PyThread_type_lock.
Uncontended acquisition + release (the measured loop is sketched after the numbers):
* Linux (x86-64): PyMutex: 11 ns, PyThread_type_lock: 44 ns
* macOS (arm64): PyMutex: 13 ns, PyThread_type_lock: 18 ns
* Windows (x86-64): PyMutex: 13 ns, PyThread_type_lock: 38 ns
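What those numbers measure is roughly the following single-threaded loop. This is a hypothetical C rendering: the real harness is Tools/lockbench/lockbench.py, and PyMutex_Lock/PyMutex_Unlock are the assumed inline wrappers sketched above:

    enum { N = 10 * 1000 * 1000 };      // illustrative iteration count

    PyMutex m = {0};                    // zero-initialized == unlocked
    _PyTime_t t0 = _PyTime_GetMonotonicClock();
    for (int i = 0; i < N; i++) {
        PyMutex_Lock(&m);               // uncontended: one compare-exchange
        PyMutex_Unlock(&m);             // uncontended: one compare-exchange
    }
    _PyTime_t t1 = _PyTime_GetMonotonicClock();
    // (t1 - t0) / N gives the cost per acquire+release pair in ns.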
PR Overview:
The primary purpose of this PR is to implement PyMutex, but there are a number of support pieces (described below).
* PyMutex: A 1-byte lock that doesn't require memory allocation to initialize and is generally faster than the existing PyThread_type_lock. The API is internal only for now.
* _PyParkingLot: A futex-like API based on the API of the same name in WebKit. Used to implement PyMutex.
* _PyRawMutex: A word-sized lock used to implement _PyParkingLot.
* PyEvent: A one-time event. This was used a bunch in the "nogil" fork and is useful for testing the PyMutex implementation, so I've included it as part of the PR.
* pycore_llist.h: Defines common operations on doubly-linked lists. Not strictly necessary (the list operations could be done manually), but they come up frequently in the "nogil" fork. (Similar to https://man.freebsd.org/cgi/man.cgi?queue; a sketch follows this list.)
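As a sketch of what such an intrusive doubly-linked list looks like (illustrative names; pycore_llist.h itself is not part of this diff), each waiter embeds the link node directly, so enqueueing allocates nothing:

    struct llist_node {
        struct llist_node *next;
        struct llist_node *prev;
    };

    // A waiter embeds its own link node; no separate allocation needed.
    struct waiter {
        struct llist_node node;
        int thread_id;
    };

    static void
    llist_init(struct llist_node *head)
    {
        head->next = head->prev = head;     // empty circular list
    }

    static void
    llist_insert_tail(struct llist_node *head, struct llist_node *n)
    {
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
    }

    static void
    llist_remove(struct llist_node *n)
    {
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = n->prev = NULL;
    }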
---------
Co-authored-by: Eric Snow <ericsnowcurrently@gmail.com>
Diffstat (limited to 'Python/lock.c')
-rw-r--r--   Python/lock.c   297
1 file changed, 297 insertions, 0 deletions
diff --git a/Python/lock.c b/Python/lock.c
new file mode 100644
index 0000000..3dad2aa
--- /dev/null
+++ b/Python/lock.c
@@ -0,0 +1,297 @@
+// Lock implementation
+
+#include "Python.h"
+
+#include "pycore_lock.h"
+#include "pycore_parking_lot.h"
+#include "pycore_semaphore.h"
+
+#ifdef MS_WINDOWS
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>        // SwitchToThread()
+#elif defined(HAVE_SCHED_H)
+#include <sched.h>          // sched_yield()
+#endif
+
+// If a thread waits on a lock for longer than TIME_TO_BE_FAIR_NS (1 ms), then
+// the unlocking thread directly hands off ownership of the lock. This avoids
+// starvation.
+static const _PyTime_t TIME_TO_BE_FAIR_NS = 1000*1000;
+
+// Spin for a bit before parking the thread. This is only enabled for
+// `--disable-gil` builds because it is unlikely to be helpful if the GIL is
+// enabled.
+#if Py_NOGIL
+static const int MAX_SPIN_COUNT = 40;
+#else
+static const int MAX_SPIN_COUNT = 0;
+#endif
+
+struct mutex_entry {
+    // The time after which the unlocking thread should hand off lock
+    // ownership directly to the waiting thread. Written by the waiting
+    // thread.
+    _PyTime_t time_to_be_fair;
+
+    // Set to 1 if the lock was handed off. Written by the unlocking thread.
+    int handed_off;
+};
+
+static void
+_Py_yield(void)
+{
+#ifdef MS_WINDOWS
+    SwitchToThread();
+#elif defined(HAVE_SCHED_H)
+    sched_yield();
+#endif
+}
+
+void
+_PyMutex_LockSlow(PyMutex *m)
+{
+    _PyMutex_LockTimed(m, -1, _PY_LOCK_DETACH);
+}
+
+PyLockStatus
+_PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags)
+{
+    uint8_t v = _Py_atomic_load_uint8_relaxed(&m->v);
+    if ((v & _Py_LOCKED) == 0) {
+        if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) {
+            return PY_LOCK_ACQUIRED;
+        }
+    }
+    else if (timeout == 0) {
+        return PY_LOCK_FAILURE;
+    }
+
+    _PyTime_t now = _PyTime_GetMonotonicClock();
+    _PyTime_t endtime = 0;
+    if (timeout > 0) {
+        endtime = _PyTime_Add(now, timeout);
+    }
+
+    struct mutex_entry entry = {
+        .time_to_be_fair = now + TIME_TO_BE_FAIR_NS,
+        .handed_off = 0,
+    };
+
+    Py_ssize_t spin_count = 0;
+    for (;;) {
+        if ((v & _Py_LOCKED) == 0) {
+            // The lock is unlocked. Try to grab it.
+            if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) {
+                return PY_LOCK_ACQUIRED;
+            }
+            continue;
+        }
+
+        if (!(v & _Py_HAS_PARKED) && spin_count < MAX_SPIN_COUNT) {
+            // Spin for a bit.
+            _Py_yield();
+            spin_count++;
+            continue;
+        }
+
+        if (timeout == 0) {
+            return PY_LOCK_FAILURE;
+        }
+
+        uint8_t newv = v;
+        if (!(v & _Py_HAS_PARKED)) {
+            // We are the first waiter. Set the _Py_HAS_PARKED flag.
+            newv = v | _Py_HAS_PARKED;
+            if (!_Py_atomic_compare_exchange_uint8(&m->v, &v, newv)) {
+                continue;
+            }
+        }
+
+        int ret = _PyParkingLot_Park(&m->v, &newv, sizeof(newv), timeout,
+                                     &entry, (flags & _PY_LOCK_DETACH) != 0);
+        if (ret == Py_PARK_OK) {
+            if (entry.handed_off) {
+                // We own the lock now.
+                assert(_Py_atomic_load_uint8_relaxed(&m->v) & _Py_LOCKED);
+                return PY_LOCK_ACQUIRED;
+            }
+        }
+        else if (ret == Py_PARK_INTR && (flags & _PY_LOCK_HANDLE_SIGNALS)) {
+            if (Py_MakePendingCalls() < 0) {
+                return PY_LOCK_INTR;
+            }
+        }
+        else if (ret == Py_PARK_TIMEOUT) {
+            assert(timeout >= 0);
+            return PY_LOCK_FAILURE;
+        }
+
+        if (timeout > 0) {
+            timeout = _PyDeadline_Get(endtime);
+            if (timeout <= 0) {
+                // Avoid negative values because those mean block forever.
+                timeout = 0;
+            }
+        }
+
+        v = _Py_atomic_load_uint8_relaxed(&m->v);
+    }
+}
+
+static void
+mutex_unpark(PyMutex *m, struct mutex_entry *entry, int has_more_waiters)
+{
+    uint8_t v = 0;
+    if (entry) {
+        _PyTime_t now = _PyTime_GetMonotonicClock();
+        int should_be_fair = now > entry->time_to_be_fair;
+
+        entry->handed_off = should_be_fair;
+        if (should_be_fair) {
+            v |= _Py_LOCKED;
+        }
+        if (has_more_waiters) {
+            v |= _Py_HAS_PARKED;
+        }
+    }
+    _Py_atomic_store_uint8(&m->v, v);
+}
+
+int
+_PyMutex_TryUnlock(PyMutex *m)
+{
+    uint8_t v = _Py_atomic_load_uint8(&m->v);
+    for (;;) {
+        if ((v & _Py_LOCKED) == 0) {
+            // error: the mutex is not locked
+            return -1;
+        }
+        else if ((v & _Py_HAS_PARKED)) {
+            // wake up a single thread
+            _PyParkingLot_Unpark(&m->v, (_Py_unpark_fn_t *)mutex_unpark, m);
+            return 0;
+        }
+        else if (_Py_atomic_compare_exchange_uint8(&m->v, &v, _Py_UNLOCKED)) {
+            // fast-path: no waiters
+            return 0;
+        }
+    }
+}
+
+void
+_PyMutex_UnlockSlow(PyMutex *m)
+{
+    if (_PyMutex_TryUnlock(m) < 0) {
+        Py_FatalError("unlocking mutex that is not locked");
+    }
+}
+
+// _PyRawMutex stores a linked list of `struct raw_mutex_entry`, one for each
+// thread waiting on the mutex, directly in the mutex itself.
+struct raw_mutex_entry {
+    struct raw_mutex_entry *next;
+    _PySemaphore sema;
+};
+
+void
+_PyRawMutex_LockSlow(_PyRawMutex *m)
+{
+    struct raw_mutex_entry waiter;
+    _PySemaphore_Init(&waiter.sema);
+
+    uintptr_t v = _Py_atomic_load_uintptr(&m->v);
+    for (;;) {
+        if ((v & _Py_LOCKED) == 0) {
+            // Unlocked: try to grab it (even if it has a waiter).
+            if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, v|_Py_LOCKED)) {
+                break;
+            }
+            continue;
+        }
+
+        // Locked: try to add ourselves as a waiter.
+        waiter.next = (struct raw_mutex_entry *)(v & ~1);
+        uintptr_t desired = ((uintptr_t)&waiter)|_Py_LOCKED;
+        if (!_Py_atomic_compare_exchange_uintptr(&m->v, &v, desired)) {
+            continue;
+        }
+
+        // Wait for us to be woken up. Note that we still have to lock the
+        // mutex ourselves: it is NOT handed off to us.
+        _PySemaphore_Wait(&waiter.sema, -1, /*detach=*/0);
+    }
+
+    _PySemaphore_Destroy(&waiter.sema);
+}
+
+void
+_PyRawMutex_UnlockSlow(_PyRawMutex *m)
+{
+    uintptr_t v = _Py_atomic_load_uintptr(&m->v);
+    for (;;) {
+        if ((v & _Py_LOCKED) == 0) {
+            Py_FatalError("unlocking mutex that is not locked");
+        }
+
+        struct raw_mutex_entry *waiter = (struct raw_mutex_entry *)(v & ~1);
+        if (waiter) {
+            uintptr_t next_waiter = (uintptr_t)waiter->next;
+            if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, next_waiter)) {
+                _PySemaphore_Wakeup(&waiter->sema);
+                return;
+            }
+        }
+        else {
+            if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, _Py_UNLOCKED)) {
+                return;
+            }
+        }
+    }
+}
+
+void
+_PyEvent_Notify(PyEvent *evt)
+{
+    uintptr_t v = _Py_atomic_exchange_uint8(&evt->v, _Py_LOCKED);
+    if (v == _Py_UNLOCKED) {
+        // no waiters
+        return;
+    }
+    else if (v == _Py_LOCKED) {
+        // event already set
+        return;
+    }
+    else {
+        assert(v == _Py_HAS_PARKED);
+        _PyParkingLot_UnparkAll(&evt->v);
+    }
+}
+
+void
+PyEvent_Wait(PyEvent *evt)
+{
+    while (!PyEvent_WaitTimed(evt, -1))
+        ;
+}
+
+int
+PyEvent_WaitTimed(PyEvent *evt, _PyTime_t timeout_ns)
+{
+    for (;;) {
+        uint8_t v = _Py_atomic_load_uint8(&evt->v);
+        if (v == _Py_LOCKED) {
+            // event already set
+            return 1;
+        }
+        if (v == _Py_UNLOCKED) {
+            if (!_Py_atomic_compare_exchange_uint8(&evt->v, &v, _Py_HAS_PARKED)) {
+                continue;
+            }
+        }
+
+        uint8_t expected = _Py_HAS_PARKED;
+        (void) _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v),
+                                  timeout_ns, NULL, 1);
+
+        return _Py_atomic_load_uint8(&evt->v) == _Py_LOCKED;
+    }
+}
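
To round out the picture, here is a hedged sketch of how these internal primitives are meant to be used inside CPython. Assumptions: PyMutex_Lock/PyMutex_Unlock are the inline wrappers from pycore_lock.h (not shown in this diff); everything else is defined in the file above, and zero-initialized statics start out unlocked/unset:

    static PyMutex table_lock;      // zero-initialized: unlocked
    static PyEvent init_done;       // zero-initialized: not set

    static void
    worker(void)
    {
        // Bounded wait: give up after 1 ms, detaching from the runtime
        // while blocked so other threads can run.
        if (_PyMutex_LockTimed(&table_lock, 1000*1000, _PY_LOCK_DETACH)
                == PY_LOCK_ACQUIRED) {
            // ... critical section ...
            PyMutex_Unlock(&table_lock);    // wakes a parked waiter, if any
        }

        PyEvent_Wait(&init_done);   // blocks until some thread notifies
    }

    static void
    initializer(void)
    {
        // ... one-time setup ...
        _PyEvent_Notify(&init_done);        // wakes all waiters; stays set
    }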