author     Sam Gross <colesbury@gmail.com>   2023-09-19 15:54:29 (GMT)
committer  GitHub <noreply@github.com>       2023-09-19 15:54:29 (GMT)
commit     0c89056fe59ac42f09978582479d40e58a236856
tree       06cd5a790da2a6dd3862567419c25572f96ae373 /Include/internal
parent     0a31ff0050eec5079fd4c9cafd33b4e3e9afd9ab
gh-108724: Add PyMutex and _PyParkingLot APIs (gh-109344)
PyMutex is a one-byte lock with fast, inlineable lock and unlock functions for the common uncontended case. The design is based on WebKit's WTF::Lock.
PyMutex is built using the _PyParkingLot APIs, which provide a cross-platform futex-like API (based on WebKit's WTF::ParkingLot). This internal API will be used to build other synchronization primitives needed to implement PEP 703, such as one-time initialization and events.
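For context, the intended usage is a zero-initialized, struct-embeddable one-byte mutex with inline lock and unlock calls. A minimal sketch based on the pycore_lock.h API added in this commit (the SharedCounter struct and function are illustrative, not part of the PR):

```c
#include "pycore_lock.h"   // internal-only API; requires Py_BUILD_CORE

// Hypothetical struct embedding the one-byte mutex; zero-initializing the
// containing struct is enough, no separate allocation or init call.
typedef struct {
    PyMutex mutex;
    Py_ssize_t counter;
} SharedCounter;

static void
shared_counter_incr(SharedCounter *sc)
{
    PyMutex_Lock(&sc->mutex);    // uncontended fast path: one compare-exchange
    sc->counter++;
    PyMutex_Unlock(&sc->mutex);  // contended case falls back to the parking lot
}
```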
This also includes tests and a mini benchmark in Tools/lockbench/lockbench.py to compare with the existing PyThread_type_lock.
Uncontended acquisition + release:
* Linux (x86-64): PyMutex: 11 ns, PyThread_type_lock: 44 ns
* macOS (arm64): PyMutex: 13 ns, PyThread_type_lock: 18 ns
* Windows (x86-64): PyMutex: 13 ns, PyThread_type_lock: 38 ns
PR Overview:
The primary purpose of this PR is to implement PyMutex, but there are a number of support pieces (described below).
* PyMutex: A 1-byte lock that doesn't require memory allocation to initialize and is generally faster than the existing PyThread_type_lock. The API is internal only for now.
* _PyParkingLot: A futex-like API based on the API of the same name in WebKit. Used to implement PyMutex.
* _PyRawMutex: A word-sized lock used to implement _PyParkingLot.
* PyEvent: A one-time event. This was used a bunch in the "nogil" fork and is useful for testing the PyMutex implementation, so I've included it as part of the PR.
* pycore_llist.h: Defines common operations on a doubly-linked list. Not strictly necessary (the list operations could be done manually), but they come up frequently in the "nogil" fork. (Similar to https://man.freebsd.org/cgi/man.cgi?queue; a short usage sketch follows this list.)
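As a usage sketch for the embedded list API, mirroring the comment at the top of pycore_llist.h below (MyObj and sum_list are made-up names for illustration):

```c
#include "pycore_llist.h"

// Hypothetical element type that embeds the list node.
typedef struct {
    int value;
    struct llist_node node;
} MyObj;

static int
sum_list(struct llist_node *head)
{
    int total = 0;
    struct llist_node *node;
    llist_for_each(node, head) {
        // Recover the containing MyObj from its embedded node.
        MyObj *obj = llist_data(node, MyObj, node);
        total += obj->value;
    }
    return total;
}
```

A list head is declared with `struct llist_node head = LLIST_INIT(head);` and elements are appended with `llist_insert_tail(&head, &obj->node);`, as documented in the header.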
---------
Co-authored-by: Eric Snow <ericsnowcurrently@gmail.com>
Diffstat (limited to 'Include/internal')
-rw-r--r--   Include/internal/pycore_llist.h         107
-rw-r--r--   Include/internal/pycore_lock.h          158
-rw-r--r--   Include/internal/pycore_parking_lot.h    99
-rw-r--r--   Include/internal/pycore_semaphore.h      63
4 files changed, 427 insertions, 0 deletions
diff --git a/Include/internal/pycore_llist.h b/Include/internal/pycore_llist.h
new file mode 100644
index 0000000..5fd261d
--- /dev/null
+++ b/Include/internal/pycore_llist.h
@@ -0,0 +1,107 @@
+// A doubly-linked list that can be embedded in a struct.
+//
+// Usage:
+//      struct llist_node head = LLIST_INIT(head);
+//      typedef struct {
+//          ...
+//          struct llist_node node;
+//          ...
+//      } MyObj;
+//
+//      llist_insert_tail(&head, &obj->node);
+//      llist_remove(&obj->node);
+//
+//      struct llist_node *node;
+//      llist_for_each(node, &head) {
+//          MyObj *obj = llist_data(node, MyObj, node);
+//          ...
+//      }
+//
+
+#ifndef Py_INTERNAL_LLIST_H
+#define Py_INTERNAL_LLIST_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "Py_BUILD_CORE must be defined to include this header"
+#endif
+
+struct llist_node {
+    struct llist_node *next;
+    struct llist_node *prev;
+};
+
+// Get the struct containing a node.
+#define llist_data(node, type, member) \
+    (type*)((char*)node - offsetof(type, member))
+
+// Iterate over a list.
+#define llist_for_each(node, head) \
+    for (node = (head)->next; node != (head); node = node->next)
+
+// Iterate over a list, but allow removal of the current node.
+#define llist_for_each_safe(node, head) \
+    for (struct llist_node *_next = (node = (head)->next, node->next); \
+         node != (head); node = _next, _next = node->next)
+
+#define LLIST_INIT(head) { &head, &head }
+
+static inline void
+llist_init(struct llist_node *head)
+{
+    head->next = head;
+    head->prev = head;
+}
+
+// Returns 1 if the list is empty, 0 otherwise.
+static inline int
+llist_empty(struct llist_node *head)
+{
+    return head->next == head;
+}
+
+// Appends to the tail of the list.
+static inline void
+llist_insert_tail(struct llist_node *head, struct llist_node *node)
+{
+    node->prev = head->prev;
+    node->next = head;
+    head->prev->next = node;
+    head->prev = node;
+}
+
+// Remove a node from the list.
+static inline void
+llist_remove(struct llist_node *node)
+{
+    struct llist_node *prev = node->prev;
+    struct llist_node *next = node->next;
+    prev->next = next;
+    next->prev = prev;
+    node->prev = NULL;
+    node->next = NULL;
+}
+
+// Append all nodes from head2 onto head1. head2 is left empty.
+static inline void
+llist_concat(struct llist_node *head1, struct llist_node *head2)
+{
+    if (!llist_empty(head2)) {
+        head1->prev->next = head2->next;
+        head2->next->prev = head1->prev;
+
+        head1->prev = head2->prev;
+        head2->prev->next = head1;
+        llist_init(head2);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LLIST_H */
diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h
new file mode 100644
index 0000000..c4bb76a
--- /dev/null
+++ b/Include/internal/pycore_lock.h
@@ -0,0 +1,158 @@
+// Lightweight locks and other synchronization mechanisms.
+//
+// These implementations are based on WebKit's WTF::Lock. See
+// https://webkit.org/blog/6161/locking-in-webkit/ for a description of the
+// design.
+#ifndef Py_INTERNAL_LOCK_H
+#define Py_INTERNAL_LOCK_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_time.h"          // _PyTime_t
+
+
+// A mutex that occupies one byte. The lock can be zero initialized.
+//
+// Only the two least significant bits are used. The remaining bits should be
+// zero:
+//     0b00: unlocked
+//     0b01: locked
+//     0b10: unlocked and has parked threads
+//     0b11: locked and has parked threads
+//
+// Typical initialization:
+//     PyMutex m = (PyMutex){0};
+//
+// Typical usage:
+//     PyMutex_Lock(&m);
+//     ...
+//     PyMutex_Unlock(&m);
+typedef struct _PyMutex {
+    uint8_t v;
+} PyMutex;
+
+#define _Py_UNLOCKED    0
+#define _Py_LOCKED      1
+#define _Py_HAS_PARKED  2
+
+// (private) slow path for locking the mutex
+PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m);
+
+// (private) slow path for unlocking the mutex
+PyAPI_FUNC(void) _PyMutex_UnlockSlow(PyMutex *m);
+
+// Locks the mutex.
+//
+// If the mutex is currently locked, the calling thread will be parked until
+// the mutex is unlocked. If the current thread holds the GIL, then the GIL
+// will be released while the thread is parked.
+static inline void
+PyMutex_Lock(PyMutex *m)
+{
+    uint8_t expected = _Py_UNLOCKED;
+    if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_LOCKED)) {
+        _PyMutex_LockSlow(m);
+    }
+}
+
+// Unlocks the mutex.
+static inline void
+PyMutex_Unlock(PyMutex *m)
+{
+    uint8_t expected = _Py_LOCKED;
+    if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_UNLOCKED)) {
+        _PyMutex_UnlockSlow(m);
+    }
+}
+
+// Checks if the mutex is currently locked.
+static inline int
+PyMutex_IsLocked(PyMutex *m)
+{
+    return (_Py_atomic_load_uint8(&m->v) & _Py_LOCKED) != 0;
+}
+
+typedef enum _PyLockFlags {
+    // Do not detach/release the GIL when waiting on the lock.
+    _Py_LOCK_DONT_DETACH = 0,
+
+    // Detach/release the GIL while waiting on the lock.
+    _PY_LOCK_DETACH = 1,
+
+    // Handle signals if interrupted while waiting on the lock.
+    _PY_LOCK_HANDLE_SIGNALS = 2,
+} _PyLockFlags;
+
+// Lock a mutex with an optional timeout and additional options. See
+// _PyLockFlags for details.
+extern PyLockStatus
+_PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags);
+
+// Unlock a mutex, returns 0 if the mutex is not locked (used for improved
+// error messages).
+extern int _PyMutex_TryUnlock(PyMutex *m);
+
+
+// PyEvent is a one-time event notification
+typedef struct {
+    uint8_t v;
+} PyEvent;
+
+// Set the event and notify any waiting threads.
+// Export for '_testinternalcapi' shared extension
+PyAPI_FUNC(void) _PyEvent_Notify(PyEvent *evt);
+
+// Wait for the event to be set. If the event is already set, then this returns
+// immediately.
+PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt);
+
+// Wait for the event to be set, or until the timeout expires. If the event is
+// already set, then this returns immediately. Returns 1 if the event was set,
+// and 0 if the timeout expired or thread was interrupted.
+PyAPI_FUNC(int) PyEvent_WaitTimed(PyEvent *evt, _PyTime_t timeout_ns);
+
+
+// _PyRawMutex implements a word-sized mutex that does not depend on the
+// parking lot API, and therefore can be used in the parking lot
+// implementation.
+//
+// The mutex uses a packed representation: the least significant bit is used to
+// indicate whether the mutex is locked or not. The remaining bits are either
+// zero or a pointer to a `struct raw_mutex_entry` (see lock.c).
+typedef struct {
+    uintptr_t v;
+} _PyRawMutex;
+
+// Slow paths for lock/unlock
+extern void _PyRawMutex_LockSlow(_PyRawMutex *m);
+extern void _PyRawMutex_UnlockSlow(_PyRawMutex *m);
+
+static inline void
+_PyRawMutex_Lock(_PyRawMutex *m)
+{
+    uintptr_t unlocked = _Py_UNLOCKED;
+    if (_Py_atomic_compare_exchange_uintptr(&m->v, &unlocked, _Py_LOCKED)) {
+        return;
+    }
+    _PyRawMutex_LockSlow(m);
+}
+
+static inline void
+_PyRawMutex_Unlock(_PyRawMutex *m)
+{
+    uintptr_t locked = _Py_LOCKED;
+    if (_Py_atomic_compare_exchange_uintptr(&m->v, &locked, _Py_UNLOCKED)) {
+        return;
+    }
+    _PyRawMutex_UnlockSlow(m);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LOCK_H */
diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h
new file mode 100644
index 0000000..f444da7
--- /dev/null
+++ b/Include/internal/pycore_parking_lot.h
@@ -0,0 +1,99 @@
+// ParkingLot is an internal API for building efficient synchronization
+// primitives like mutexes and events.
+//
+// The API and name is inspired by WebKit's WTF::ParkingLot, which in turn
+// is inspired by Linux's futex API.
+// See https://webkit.org/blog/6161/locking-in-webkit/.
+//
+// The core functionality is an atomic "compare-and-sleep" operation along with
+// an atomic "wake-up" operation.
+
+#ifndef Py_INTERNAL_PARKING_LOT_H
+#define Py_INTERNAL_PARKING_LOT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_time.h"          // _PyTime_t
+
+
+enum {
+    // The thread was unparked by another thread.
+    Py_PARK_OK = 0,
+
+    // The value of `address` did not match `expected`.
+    Py_PARK_AGAIN = -1,
+
+    // The thread was unparked due to a timeout.
+    Py_PARK_TIMEOUT = -2,
+
+    // The thread was interrupted by a signal.
+    Py_PARK_INTR = -3,
+};
+
+// Checks that `*address == *expected` and puts the thread to sleep until an
+// unpark operation is called on the same `address`. Otherwise, the function
+// returns `Py_PARK_AGAIN`. The comparison behaves like memcmp, but is
+// performed atomically with respect to unpark operations.
+//
+// The `address_size` argument is the size of the data pointed to by the
+// `address` and `expected` pointers (i.e., sizeof(*address)). It must be
+// 1, 2, 4, or 8.
+//
+// The `timeout_ns` argument specifies the maximum amount of time to wait, with
+// -1 indicating an infinite wait.
+//
+// `park_arg`, which can be NULL, is passed to the unpark operation.
+//
+// If `detach` is true, then the thread will detach/release the GIL while
+// waiting.
+//
+// Example usage:
+//
+//     if (_Py_atomic_compare_exchange_uint8(address, &expected, new_value)) {
+//         int res = _PyParkingLot_Park(address, &new_value, sizeof(*address),
+//                                      timeout_ns, NULL, 1);
+//         ...
+//     }
+PyAPI_FUNC(int)
+_PyParkingLot_Park(const void *address, const void *expected,
+                   size_t address_size, _PyTime_t timeout_ns,
+                   void *park_arg, int detach);
+
+// Callback for _PyParkingLot_Unpark:
+//
+// `arg` is the data of the same name provided to the _PyParkingLot_Unpark()
+// call.
+// `park_arg` is the data provided to the _PyParkingLot_Park() call, or NULL
+// if no waiting thread was found.
+// `has_more_waiters` is true if there are more threads waiting on the same
+// address. May be true in cases where threads are waiting on a different
+// address that maps to the same internal bucket.
+typedef void _Py_unpark_fn_t(void *arg, void *park_arg, int has_more_waiters);
+
+// Unparks a single thread waiting on `address`.
+//
+// Note that fn() is called regardless of whether a thread was unparked. If
+// no threads are waiting on `address` then the `park_arg` argument to fn()
+// will be NULL.
+//
+// Example usage:
+//     void callback(void *arg, void *park_arg, int has_more_waiters);
+//     _PyParkingLot_Unpark(address, &callback, arg);
+PyAPI_FUNC(void)
+_PyParkingLot_Unpark(const void *address, _Py_unpark_fn_t *fn, void *arg);
+
+// Unparks all threads waiting on `address`.
+PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address);
+
+// Resets the parking lot state after a fork. Forgets all parked threads.
PyAPI_FUNC(void) _PyParkingLot_AfterFork(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PARKING_LOT_H */
diff --git a/Include/internal/pycore_semaphore.h b/Include/internal/pycore_semaphore.h
new file mode 100644
index 0000000..c1df833
--- /dev/null
+++ b/Include/internal/pycore_semaphore.h
@@ -0,0 +1,63 @@
+// The _PySemaphore API is a simplified cross-platform semaphore used to
+// implement wakeup/sleep.
+#ifndef Py_INTERNAL_SEMAPHORE_H
+#define Py_INTERNAL_SEMAPHORE_H
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_time.h"          // _PyTime_t
+
+#ifdef MS_WINDOWS
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+#elif defined(HAVE_PTHREAD_H)
+# include <pthread.h>
+#elif defined(HAVE_PTHREAD_STUBS)
+# include "cpython/pthread_stubs.h"
+#else
+# error "Require native threads. See https://bugs.python.org/issue31370"
+#endif
+
+#if defined(_POSIX_SEMAPHORES) && (_POSIX_SEMAPHORES+0) != -1
+# define _Py_USE_SEMAPHORES
+# include <semaphore.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _PySemaphore {
+#if defined(MS_WINDOWS)
+    HANDLE platform_sem;
+#elif defined(_Py_USE_SEMAPHORES)
+    sem_t platform_sem;
+#else
+    pthread_mutex_t mutex;
+    pthread_cond_t cond;
+    int counter;
+#endif
+} _PySemaphore;
+
+// Puts the current thread to sleep until _PySemaphore_Wakeup() is called.
+// If `detach` is true, then the thread will detach/release the GIL while
+// sleeping.
+PyAPI_FUNC(int)
+_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach);
+
+// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup()
+// can be called before _PySemaphore_Wait().
+PyAPI_FUNC(void)
+_PySemaphore_Wakeup(_PySemaphore *sema);
+
+// Initializes/destroys a semaphore
+PyAPI_FUNC(void) _PySemaphore_Init(_PySemaphore *sema);
+PyAPI_FUNC(void) _PySemaphore_Destroy(_PySemaphore *sema);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_SEMAPHORE_H */
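To show how the parking lot's compare-and-sleep/wake-up pair is meant to be composed into higher-level primitives, here is a rough sketch of a one-time event in the style of PyEvent. This is illustrative only: the committed PyEvent implementation lives in the C sources of this PR (not shown in this diff) and may differ in detail, and the sketch assumes one-byte atomic load/store helpers from the internal atomics API.

```c
// Illustrative sketch, not the committed implementation.
static void
example_event_set(PyEvent *evt)
{
    _Py_atomic_store_uint8(&evt->v, 1);
    // Wake every thread parked on this address.
    _PyParkingLot_UnparkAll(&evt->v);
}

static void
example_event_wait(PyEvent *evt)
{
    while (_Py_atomic_load_uint8(&evt->v) == 0) {
        // Compare-and-sleep: park only while *address still equals `expected`;
        // otherwise _PyParkingLot_Park() returns Py_PARK_AGAIN immediately.
        uint8_t expected = 0;
        _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v),
                           /*timeout_ns=*/-1, /*park_arg=*/NULL, /*detach=*/1);
    }
}
```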