author     Sam Gross <colesbury@gmail.com>   2023-09-19 15:54:29 (GMT)
committer  GitHub <noreply@github.com>       2023-09-19 15:54:29 (GMT)
commit     0c89056fe59ac42f09978582479d40e58a236856
tree       06cd5a790da2a6dd3862567419c25572f96ae373 /Include/internal
parent     0a31ff0050eec5079fd4c9cafd33b4e3e9afd9ab
gh-108724: Add PyMutex and _PyParkingLot APIs (gh-109344)
PyMutex is a one-byte lock with fast, inlineable lock and unlock functions for the common uncontended case. The design is based on WebKit's WTF::Lock.
PyMutex is built using the _PyParkingLot APIs, which provide a cross-platform futex-like API (based on WebKit's WTF::ParkingLot). This internal API will be used to build other synchronization primitives needed to implement PEP 703, such as one-time initialization and events.
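For context, the intended usage is a zero-initialized, struct-embeddable one-byte mutex with inline lock and unlock calls. A minimal sketch based on the pycore_lock.h API added in this commit (the SharedCounter struct and function are illustrative, not part of the PR):

```c
#include "pycore_lock.h"   // internal-only API; requires Py_BUILD_CORE

// Hypothetical struct embedding the one-byte mutex; zero-initializing the
// containing struct is enough, no separate allocation or init call.
typedef struct {
    PyMutex mutex;
    Py_ssize_t counter;
} SharedCounter;

static void
shared_counter_incr(SharedCounter *sc)
{
    PyMutex_Lock(&sc->mutex);    // uncontended fast path: one compare-exchange
    sc->counter++;
    PyMutex_Unlock(&sc->mutex);  // contended case falls back to the parking lot
}
```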
This also includes tests and a mini benchmark in Tools/lockbench/lockbench.py to compare with the existing PyThread_type_lock.
Uncontended acquisition + release:
* Linux (x86-64): PyMutex: 11 ns, PyThread_type_lock: 44 ns
* macOS (arm64): PyMutex: 13 ns, PyThread_type_lock: 18 ns
* Windows (x86-64): PyMutex: 13 ns, PyThread_type_lock: 38 ns
PR Overview:
The primary purpose of this PR is to implement PyMutex, but there are a number of support pieces (described below).
* PyMutex: A 1-byte lock that doesn't require memory allocation to initialize and is generally faster than the existing PyThread_type_lock. The API is internal only for now.
* _PyParkingLot: A futex-like API based on the API of the same name in WebKit. Used to implement PyMutex.
* _PyRawMutex: A word-sized lock used to implement _PyParkingLot.
* PyEvent: A one-time event. This was used a bunch in the "nogil" fork and is useful for testing the PyMutex implementation, so I've included it as part of the PR.
* pycore_llist.h: Defines common operations on a doubly-linked list. Not strictly necessary (the list operations could be done manually), but they come up frequently in the "nogil" fork. (Similar to https://man.freebsd.org/cgi/man.cgi?queue; a short usage sketch follows this list.)
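As a usage sketch for the embedded list API, mirroring the comment at the top of pycore_llist.h below (MyObj and sum_list are made-up names for illustration):

```c
#include "pycore_llist.h"

// Hypothetical element type that embeds the list node.
typedef struct {
    int value;
    struct llist_node node;
} MyObj;

static int
sum_list(struct llist_node *head)
{
    int total = 0;
    struct llist_node *node;
    llist_for_each(node, head) {
        // Recover the containing MyObj from its embedded node.
        MyObj *obj = llist_data(node, MyObj, node);
        total += obj->value;
    }
    return total;
}
```

A list head is declared with `struct llist_node head = LLIST_INIT(head);` and elements are appended with `llist_insert_tail(&head, &obj->node);`, as documented in the header.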
---------
Co-authored-by: Eric Snow <ericsnowcurrently@gmail.com>
Diffstat (limited to 'Include/internal')
-rw-r--r--   Include/internal/pycore_llist.h         107
-rw-r--r--   Include/internal/pycore_lock.h          158
-rw-r--r--   Include/internal/pycore_parking_lot.h    99
-rw-r--r--   Include/internal/pycore_semaphore.h      63
4 files changed, 427 insertions, 0 deletions
diff --git a/Include/internal/pycore_llist.h b/Include/internal/pycore_llist.h
new file mode 100644
index 0000000..5fd261d
--- /dev/null
+++ b/Include/internal/pycore_llist.h
@@ -0,0 +1,107 @@
+// A doubly-linked list that can be embedded in a struct.
+//
+// Usage:
+//      struct llist_node head = LLIST_INIT(head);
+//      typedef struct {
+//          ...
+//          struct llist_node node;
+//          ...
+//      } MyObj;
+//
+//      llist_insert_tail(&head, &obj->node);
+//      llist_remove(&obj->node);
+//
+//      struct llist_node *node;
+//      llist_for_each(node, &head) {
+//          MyObj *obj = llist_data(node, MyObj, node);
+//          ...
+//      }
+//
+
+#ifndef Py_INTERNAL_LLIST_H
+#define Py_INTERNAL_LLIST_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "Py_BUILD_CORE must be defined to include this header"
+#endif
+
+struct llist_node {
+    struct llist_node *next;
+    struct llist_node *prev;
+};
+
+// Get the struct containing a node.
+#define llist_data(node, type, member) \
+    (type*)((char*)node - offsetof(type, member))
+
+// Iterate over a list.
+#define llist_for_each(node, head) \
+    for (node = (head)->next; node != (head); node = node->next)
+
+// Iterate over a list, but allow removal of the current node.
+#define llist_for_each_safe(node, head) \
+    for (struct llist_node *_next = (node = (head)->next, node->next); \
+         node != (head); node = _next, _next = node->next)
+
+#define LLIST_INIT(head) { &head, &head }
+
+static inline void
+llist_init(struct llist_node *head)
+{
+    head->next = head;
+    head->prev = head;
+}
+
+// Returns 1 if the list is empty, 0 otherwise.
+static inline int
+llist_empty(struct llist_node *head)
+{
+    return head->next == head;
+}
+
+// Appends to the tail of the list.
+static inline void
+llist_insert_tail(struct llist_node *head, struct llist_node *node)
+{
+    node->prev = head->prev;
+    node->next = head;
+    head->prev->next = node;
+    head->prev = node;
+}
+
+// Remove a node from the list.
+static inline void
+llist_remove(struct llist_node *node)
+{
+    struct llist_node *prev = node->prev;
+    struct llist_node *next = node->next;
+    prev->next = next;
+    next->prev = prev;
+    node->prev = NULL;
+    node->next = NULL;
+}
+
+// Append all nodes from head2 onto head1. head2 is left empty.
+static inline void
+llist_concat(struct llist_node *head1, struct llist_node *head2)
+{
+    if (!llist_empty(head2)) {
+        head1->prev->next = head2->next;
+        head2->next->prev = head1->prev;
+
+        head1->prev = head2->prev;
+        head2->prev->next = head1;
+        llist_init(head2);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LLIST_H */
diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h
new file mode 100644
index 0000000..c4bb76a
--- /dev/null
+++ b/Include/internal/pycore_lock.h
@@ -0,0 +1,158 @@
+// Lightweight locks and other synchronization mechanisms.
+//
+// These implementations are based on WebKit's WTF::Lock. See
+// https://webkit.org/blog/6161/locking-in-webkit/ for a description of the
+// design.
+#ifndef Py_INTERNAL_LOCK_H
+#define Py_INTERNAL_LOCK_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_time.h"          // _PyTime_t
+
+
+// A mutex that occupies one byte. The lock can be zero initialized.
+//
+// Only the two least significant bits are used. The remaining bits should be
+// zero:
+//     0b00: unlocked
+//     0b01: locked
+//     0b10: unlocked and has parked threads
+//     0b11: locked and has parked threads
+//
+// Typical initialization:
+//     PyMutex m = (PyMutex){0};
+//
+// Typical usage:
+//     PyMutex_Lock(&m);
+//     ...
+//     PyMutex_Unlock(&m);
+typedef struct _PyMutex {
+    uint8_t v;
+} PyMutex;
+
+#define _Py_UNLOCKED    0
+#define _Py_LOCKED      1
+#define _Py_HAS_PARKED  2
+
+// (private) slow path for locking the mutex
+PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m);
+
+// (private) slow path for unlocking the mutex
+PyAPI_FUNC(void) _PyMutex_UnlockSlow(PyMutex *m);
+
+// Locks the mutex.
+//
+// If the mutex is currently locked, the calling thread will be parked until
+// the mutex is unlocked. If the current thread holds the GIL, then the GIL
+// will be released while the thread is parked.
+static inline void
+PyMutex_Lock(PyMutex *m)
+{
+    uint8_t expected = _Py_UNLOCKED;
+    if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_LOCKED)) {
+        _PyMutex_LockSlow(m);
+    }
+}
+
+// Unlocks the mutex.
+static inline void
+PyMutex_Unlock(PyMutex *m)
+{
+    uint8_t expected = _Py_LOCKED;
+    if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_UNLOCKED)) {
+        _PyMutex_UnlockSlow(m);
+    }
+}
+
+// Checks if the mutex is currently locked.
+static inline int
+PyMutex_IsLocked(PyMutex *m)
+{
+    return (_Py_atomic_load_uint8(&m->v) & _Py_LOCKED) != 0;
+}
+
+typedef enum _PyLockFlags {
+    // Do not detach/release the GIL when waiting on the lock.
+    _Py_LOCK_DONT_DETACH = 0,
+
+    // Detach/release the GIL while waiting on the lock.
+    _PY_LOCK_DETACH = 1,
+
+    // Handle signals if interrupted while waiting on the lock.
+    _PY_LOCK_HANDLE_SIGNALS = 2,
+} _PyLockFlags;
+
+// Lock a mutex with an optional timeout and additional options. See
+// _PyLockFlags for details.
+extern PyLockStatus
+_PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags);
+
+// Unlock a mutex, returns 0 if the mutex is not locked (used for improved
+// error messages).
+extern int _PyMutex_TryUnlock(PyMutex *m);
+
+
+// PyEvent is a one-time event notification
+typedef struct {
+    uint8_t v;
+} PyEvent;
+
+// Set the event and notify any waiting threads.
+// Export for '_testinternalcapi' shared extension
+PyAPI_FUNC(void) _PyEvent_Notify(PyEvent *evt);
+
+// Wait for the event to be set. If the event is already set, then this returns
+// immediately.
+PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt);
+
+// Wait for the event to be set, or until the timeout expires. If the event is
+// already set, then this returns immediately. Returns 1 if the event was set,
+// and 0 if the timeout expired or thread was interrupted.
+PyAPI_FUNC(int) PyEvent_WaitTimed(PyEvent *evt, _PyTime_t timeout_ns);
+
+
+// _PyRawMutex implements a word-sized mutex that does not depend on the
+// parking lot API, and therefore can be used in the parking lot
+// implementation.
+//
+// The mutex uses a packed representation: the least significant bit is used to
+// indicate whether the mutex is locked or not. The remaining bits are either
+// zero or a pointer to a `struct raw_mutex_entry` (see lock.c).
+typedef struct {
+    uintptr_t v;
+} _PyRawMutex;
+
+// Slow paths for lock/unlock
+extern void _PyRawMutex_LockSlow(_PyRawMutex *m);
+extern void _PyRawMutex_UnlockSlow(_PyRawMutex *m);
+
+static inline void
+_PyRawMutex_Lock(_PyRawMutex *m)
+{
+    uintptr_t unlocked = _Py_UNLOCKED;
+    if (_Py_atomic_compare_exchange_uintptr(&m->v, &unlocked, _Py_LOCKED)) {
+        return;
+    }
+    _PyRawMutex_LockSlow(m);
+}
+
+static inline void
+_PyRawMutex_Unlock(_PyRawMutex *m)
+{
+    uintptr_t locked = _Py_LOCKED;
+    if (_Py_atomic_compare_exchange_uintptr(&m->v, &locked, _Py_UNLOCKED)) {
+        return;
+    }
+    _PyRawMutex_UnlockSlow(m);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LOCK_H */
diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h
new file mode 100644
index 0000000..f444da7
--- /dev/null
+++ b/Include/internal/pycore_parking_lot.h
@@ -0,0 +1,99 @@
+// ParkingLot is an internal API for building efficient synchronization
+// primitives like mutexes and events.
+//
+// The API and name is inspired by WebKit's WTF::ParkingLot, which in turn
+// is inspired by Linux's futex API.
+// See https://webkit.org/blog/6161/locking-in-webkit/.
+//
+// The core functionality is an atomic "compare-and-sleep" operation along with
+// an atomic "wake-up" operation.
+
+#ifndef Py_INTERNAL_PARKING_LOT_H
+#define Py_INTERNAL_PARKING_LOT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_time.h"          // _PyTime_t
+
+
+enum {
+    // The thread was unparked by another thread.
+    Py_PARK_OK = 0,
+
+    // The value of `address` did not match `expected`.
+    Py_PARK_AGAIN = -1,
+
+    // The thread was unparked due to a timeout.
+    Py_PARK_TIMEOUT = -2,
+
+    // The thread was interrupted by a signal.
+    Py_PARK_INTR = -3,
+};
+
+// Checks that `*address == *expected` and puts the thread to sleep until an
+// unpark operation is called on the same `address`. Otherwise, the function
+// returns `Py_PARK_AGAIN`. The comparison behaves like memcmp, but is
+// performed atomically with respect to unpark operations.
+//
+// The `address_size` argument is the size of the data pointed to by the
+// `address` and `expected` pointers (i.e., sizeof(*address)). It must be
+// 1, 2, 4, or 8.
+//
+// The `timeout_ns` argument specifies the maximum amount of time to wait, with
+// -1 indicating an infinite wait.
+//
+// `park_arg`, which can be NULL, is passed to the unpark operation.
+//
+// If `detach` is true, then the thread will detach/release the GIL while
+// waiting.
+//
+// Example usage:
+//
+//     if (_Py_atomic_compare_exchange_uint8(address, &expected, new_value)) {
+//         int res = _PyParkingLot_Park(address, &new_value, sizeof(*address),
+//                                      timeout_ns, NULL, 1);
+//         ...
+//     }
+PyAPI_FUNC(int)
+_PyParkingLot_Park(const void *address, const void *expected,
+                   size_t address_size, _PyTime_t timeout_ns,
+                   void *park_arg, int detach);
+
+// Callback for _PyParkingLot_Unpark:
+//
+// `arg` is the data of the same name provided to the _PyParkingLot_Unpark()
+// call.
+// `park_arg` is the data provided to the _PyParkingLot_Park() call, or NULL
+// if no waiting thread was found.
+// `has_more_waiters` is true if there are more threads waiting on the same
+// address. May be true in cases where threads are waiting on a different
+// address that maps to the same internal bucket.
+typedef void _Py_unpark_fn_t(void *arg, void *park_arg, int has_more_waiters);
+
+// Unparks a single thread waiting on `address`.
+//
+// Note that fn() is called regardless of whether a thread was unparked. If
+// no threads are waiting on `address` then the `park_arg` argument to fn()
+// will be NULL.
+//
+// Example usage:
+//     void callback(void *arg, void *park_arg, int has_more_waiters);
+//     _PyParkingLot_Unpark(address, &callback, arg);
+PyAPI_FUNC(void)
+_PyParkingLot_Unpark(const void *address, _Py_unpark_fn_t *fn, void *arg);
+
+// Unparks all threads waiting on `address`.
+PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address);
+
+// Resets the parking lot state after a fork. Forgets all parked threads.
PyAPI_FUNC(void) _PyParkingLot_AfterFork(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PARKING_LOT_H */
diff --git a/Include/internal/pycore_semaphore.h b/Include/internal/pycore_semaphore.h
new file mode 100644
index 0000000..c1df833
--- /dev/null
+++ b/Include/internal/pycore_semaphore.h
@@ -0,0 +1,63 @@
+// The _PySemaphore API is a simplified cross-platform semaphore used to
+// implement wakeup/sleep.
+#ifndef Py_INTERNAL_SEMAPHORE_H
+#define Py_INTERNAL_SEMAPHORE_H
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_time.h"          // _PyTime_t
+
+#ifdef MS_WINDOWS
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+#elif defined(HAVE_PTHREAD_H)
+# include <pthread.h>
+#elif defined(HAVE_PTHREAD_STUBS)
+# include "cpython/pthread_stubs.h"
+#else
+# error "Require native threads. See https://bugs.python.org/issue31370"
+#endif
+
+#if defined(_POSIX_SEMAPHORES) && (_POSIX_SEMAPHORES+0) != -1
+# define _Py_USE_SEMAPHORES
+# include <semaphore.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _PySemaphore {
+#if defined(MS_WINDOWS)
+    HANDLE platform_sem;
+#elif defined(_Py_USE_SEMAPHORES)
+    sem_t platform_sem;
+#else
+    pthread_mutex_t mutex;
+    pthread_cond_t cond;
+    int counter;
+#endif
+} _PySemaphore;
+
+// Puts the current thread to sleep until _PySemaphore_Wakeup() is called.
+// If `detach` is true, then the thread will detach/release the GIL while
+// sleeping.
+PyAPI_FUNC(int)
+_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach);
+
+// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup()
+// can be called before _PySemaphore_Wait().
+PyAPI_FUNC(void)
+_PySemaphore_Wakeup(_PySemaphore *sema);
+
+// Initializes/destroys a semaphore
+PyAPI_FUNC(void) _PySemaphore_Init(_PySemaphore *sema);
+PyAPI_FUNC(void) _PySemaphore_Destroy(_PySemaphore *sema);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_SEMAPHORE_H */
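To show how the parking lot's compare-and-sleep/wake-up pair is meant to be composed into higher-level primitives, here is a rough sketch of a one-time event in the style of PyEvent. This is illustrative only: the committed PyEvent implementation lives in the C sources of this PR (not shown in this diff) and may differ in detail, and the sketch assumes one-byte atomic load/store helpers from the internal atomics API.

```c
// Illustrative sketch, not the committed implementation.
static void
example_event_set(PyEvent *evt)
{
    _Py_atomic_store_uint8(&evt->v, 1);
    // Wake every thread parked on this address.
    _PyParkingLot_UnparkAll(&evt->v);
}

static void
example_event_wait(PyEvent *evt)
{
    while (_Py_atomic_load_uint8(&evt->v) == 0) {
        // Compare-and-sleep: park only while *address still equals `expected`;
        // otherwise _PyParkingLot_Park() returns Py_PARK_AGAIN immediately.
        uint8_t expected = 0;
        _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v),
                           /*timeout_ns=*/-1, /*park_arg=*/NULL, /*detach=*/1);
    }
}
```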