From e664d7f89d2b9960d9049237136396e824795cac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A4r=20Bj=C3=B6rklund?= <per.bjorklund@gmail.com>
Date: Sat, 12 Aug 2017 11:19:30 +0200
Subject: bpo-30747: Attempt to fix atomic load/store (#2383)

_Py_atomic_* are currently not implemented as atomic operations
when building with MSVC. This patch attempts to implement parts
of the functionality required.
---
 Include/pyatomic.h                                 | 305 ++++++++++++++++++++-
 .../2017-08-08-12-00-29.bpo-30747.g2kZRT.rst       |   2 +
 2 files changed, 297 insertions(+), 10 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2017-08-08-12-00-29.bpo-30747.g2kZRT.rst

diff --git a/Include/pyatomic.h b/Include/pyatomic.h
index 893d30d..832d951 100644
--- a/Include/pyatomic.h
+++ b/Include/pyatomic.h
@@ -10,6 +10,12 @@
 #include <stdatomic.h>
 #endif
 
+
+#if defined(_MSC_VER) 
+#include <intrin.h>
+#include <immintrin.h>
+#endif
+
 /* This is modeled after the atomics interface from C1x, according to
  * the draft at
  * http://www.open-std.org/JTC1/SC22/wg14/www/docs/n1425.pdf.
@@ -87,8 +93,9 @@ typedef struct _Py_atomic_int {
             || (ORDER) == __ATOMIC_CONSUME),                  \
      __atomic_load_n(&(ATOMIC_VAL)->_value, ORDER))
 
-#else
-
+/* Only support GCC (for expression statements) and x86 (for simple
+ * atomic semantics) and MSVC x86/x64/ARM */
+#elif defined(__GNUC__) && (defined(__i386__) || defined(__amd64))
 typedef enum _Py_memory_order {
     _Py_memory_order_relaxed,
     _Py_memory_order_acquire,
@@ -105,9 +112,6 @@ typedef struct _Py_atomic_int {
     int _value;
 } _Py_atomic_int;
 
-/* Only support GCC (for expression statements) and x86 (for simple
- * atomic semantics) for now */
-#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64))
 
 static __inline__ void
 _Py_atomic_signal_fence(_Py_memory_order order)
@@ -127,7 +131,7 @@ _Py_atomic_thread_fence(_Py_memory_order order)
 static __inline__ void
 _Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
 {
-    (void)address;		/* shut up -Wunused-parameter */
+    (void)address;              /* shut up -Wunused-parameter */
     switch(order) {
     case _Py_memory_order_release:
     case _Py_memory_order_acq_rel:
@@ -219,7 +223,291 @@ _Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
         result; \
     })
 
-#else  /* !gcc x86 */
+#elif defined(_MSC_VER) 
+/*  _Interlocked* functions provide a full memory barrier and are therefore
+    enough for acq_rel and seq_cst. If the HLE variants aren't available
+    in hardware they will fall back to a full memory barrier as well.
+
+    This might affect performance but likely only in some very specific and
+    hard to meassure scenario.
+*/
+#if defined(_M_IX86) || defined(_M_X64)
+typedef enum _Py_memory_order {
+    _Py_memory_order_relaxed,
+    _Py_memory_order_acquire,
+    _Py_memory_order_release,
+    _Py_memory_order_acq_rel,
+    _Py_memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {
+    volatile uintptr_t _value;
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {
+    volatile int _value;
+} _Py_atomic_int;
+
+
+#if defined(_M_X64) 
+#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) \
+    switch (ORDER) { \
+    case _Py_memory_order_acquire: \
+      _InterlockedExchange64_HLEAcquire((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      break; \
+    case _Py_memory_order_release: \
+      _InterlockedExchange64_HLERelease((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      break; \
+    default: \
+      _InterlockedExchange64((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      break; \
+  }
+#else
+#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) ((void)0);
+#endif
+
+#define _Py_atomic_store_32bit(ATOMIC_VAL, NEW_VAL, ORDER) \
+  switch (ORDER) { \
+  case _Py_memory_order_acquire: \
+    _InterlockedExchange_HLEAcquire((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    break; \
+  case _Py_memory_order_release: \
+    _InterlockedExchange_HLERelease((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    break; \
+  default: \
+    _InterlockedExchange((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    break; \
+  }
+
+#if defined(_M_X64)
+/*  This has to be an intptr_t for now.
+    gil_created() uses -1 as a sentinel value, if this returns
+    a uintptr_t it will do an unsigned compare and crash
+*/
+inline intptr_t _Py_atomic_load_64bit(volatile uintptr_t* value, int order) {
+    uintptr_t old;
+    switch (order) {
+    case _Py_memory_order_acquire:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange64_HLEAcquire(value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_release:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange64_HLERelease(value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_relaxed:
+      old = *value;
+      break;
+    default:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange64(value, old, old) != old);
+      break;
+    }
+    }
+    return old; 
+}
+
+#else
+#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) *ATOMIC_VAL
+#endif
+
+inline int _Py_atomic_load_32bit(volatile int* value, int order) {
+    int old;
+    switch (order) {
+    case _Py_memory_order_acquire:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange_HLEAcquire(value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_release:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange_HLERelease(value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_relaxed:
+      old = *value;
+      break;
+    default:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange(value, old, old) != old);
+      break;
+    }
+    }
+    return old; 
+}
+
+#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
+  if (sizeof(*ATOMIC_VAL._value) == 8) { \
+    _Py_atomic_store_64bit(ATOMIC_VAL._value, NEW_VAL, ORDER) } else { \
+    _Py_atomic_store_32bit(ATOMIC_VAL._value, NEW_VAL, ORDER) } 
+
+#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
+  ( \
+    sizeof(*(ATOMIC_VAL._value)) == 8 ? \
+    _Py_atomic_load_64bit(ATOMIC_VAL._value, ORDER) : \
+    _Py_atomic_load_32bit(ATOMIC_VAL._value, ORDER) \
+  )
+#elif defined(_M_ARM) || defined(_M_ARM64)
+typedef enum _Py_memory_order {
+    _Py_memory_order_relaxed,
+    _Py_memory_order_acquire,
+    _Py_memory_order_release,
+    _Py_memory_order_acq_rel,
+    _Py_memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {
+    volatile uintptr_t _value;
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {
+    volatile int _value;
+} _Py_atomic_int;
+
+
+#if defined(_M_ARM64) 
+#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) \
+    switch (ORDER) { \
+    case _Py_memory_order_acquire: \
+      _InterlockedExchange64_acq((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      break; \
+    case _Py_memory_order_release: \
+      _InterlockedExchange64_rel((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      break; \
+    default: \
+      _InterlockedExchange64((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
+      break; \
+  }
+#else
+#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) ((void)0);
+#endif
+
+#define _Py_atomic_store_32bit(ATOMIC_VAL, NEW_VAL, ORDER) \
+  switch (ORDER) { \
+  case _Py_memory_order_acquire: \
+    _InterlockedExchange_acq((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    break; \
+  case _Py_memory_order_release: \
+    _InterlockedExchange_rel((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    break; \
+  default: \
+    _InterlockedExchange((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
+    break; \
+  }
+
+#if defined(_M_ARM64)
+/*  This has to be an intptr_t for now.
+    gil_created() uses -1 as a sentinel value, if this returns
+    a uintptr_t it will do an unsigned compare and crash
+*/
+inline intptr_t _Py_atomic_load_64bit(volatile uintptr_t* value, int order) {
+    uintptr_t old;
+    switch (order) {
+    case _Py_memory_order_acquire:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange64_acq(value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_release:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange64_rel(value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_relaxed:
+      old = *value;
+      break;
+    default:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange64(value, old, old) != old);
+      break;
+    }
+    }
+    return old; 
+}
+
+#else
+#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) *ATOMIC_VAL
+#endif
+
+inline int _Py_atomic_load_32bit(volatile int* value, int order) {
+    int old;
+    switch (order) {
+    case _Py_memory_order_acquire:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange_acq(value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_release:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange_rel(value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_relaxed:
+      old = *value;
+      break;
+    default:
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange(value, old, old) != old);
+      break;
+    }
+    }
+    return old; 
+}
+
+#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
+  if (sizeof(*ATOMIC_VAL._value) == 8) { \
+    _Py_atomic_store_64bit(ATOMIC_VAL._value, NEW_VAL, ORDER) } else { \
+    _Py_atomic_store_32bit(ATOMIC_VAL._value, NEW_VAL, ORDER) } 
+
+#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
+  ( \
+    sizeof(*(ATOMIC_VAL._value)) == 8 ? \
+    _Py_atomic_load_64bit(ATOMIC_VAL._value, ORDER) : \
+    _Py_atomic_load_32bit(ATOMIC_VAL._value, ORDER) \
+  )
+#endif
+#else  /* !gcc x86  !_msc_ver */
+typedef enum _Py_memory_order {
+    _Py_memory_order_relaxed,
+    _Py_memory_order_acquire,
+    _Py_memory_order_release,
+    _Py_memory_order_acq_rel,
+    _Py_memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {
+    uintptr_t _value;
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {
+    int _value;
+} _Py_atomic_int;
 /* Fall back to other compilers and processors by assuming that simple
    volatile accesses are atomic.  This is false, so people should port
    this. */
@@ -229,8 +517,6 @@ _Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
     ((ATOMIC_VAL)->_value = NEW_VAL)
 #define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
     ((ATOMIC_VAL)->_value)
-
-#endif  /* !gcc x86 */
 #endif
 
 /* Standardized shortcuts. */
@@ -245,6 +531,5 @@ _Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
     _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, _Py_memory_order_relaxed)
 #define _Py_atomic_load_relaxed(ATOMIC_VAL) \
     _Py_atomic_load_explicit(ATOMIC_VAL, _Py_memory_order_relaxed)
-
 #endif  /* Py_BUILD_CORE */
 #endif  /* Py_ATOMIC_H */
diff --git a/Misc/NEWS.d/next/Core and Builtins/2017-08-08-12-00-29.bpo-30747.g2kZRT.rst b/Misc/NEWS.d/next/Core and Builtins/2017-08-08-12-00-29.bpo-30747.g2kZRT.rst
new file mode 100644
index 0000000..04a726a
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2017-08-08-12-00-29.bpo-30747.g2kZRT.rst	
@@ -0,0 +1,2 @@
+Add a non-dummy implementation of _Py_atomic_store and _Py_atomic_load on
+MSVC.
-- 
cgit v0.12