From 39370830a96068ecbad006fe38fdb13107d8cd6b Mon Sep 17 00:00:00 2001 From: Jeffrey Yasskin Date: Mon, 3 May 2010 19:29:34 +0000 Subject: Make (most of) Python's tests pass under Thread Sanitizer. http://code.google.com/p/data-race-test/wiki/ThreadSanitizer is a dynamic data race detector that runs on top of valgrind. With this patch, the binaries at http://code.google.com/p/data-race-test/wiki/ThreadSanitizer#Binaries pass many but not all of the Python tests. All of regrtest still passes outside of tsan. I've implemented part of the C1x atomic types so that we can explicitly mark variables that are used across threads, and get defined behavior as compilers advance. I've added tsan's client header and implementation to the codebase in dynamic_annotations.{h,c} (docs at http://code.google.com/p/data-race-test/wiki/DynamicAnnotations). Unfortunately, I haven't been able to get helgrind and drd to give sensible error messages, even when I use their client annotations, so I'm not supporting them. --- Include/Python.h | 2 + Include/dynamic_annotations.h | 499 ++++++++++++++++++++++++++++++++++++++++++ Include/pyatomic.h | 179 +++++++++++++++ Include/pystate.h | 7 +- Makefile.pre.in | 4 + Objects/dictobject.c | 3 +- PC/VS7.1/pythoncore.vcproj | 3 + PC/VS8.0/pythoncore.vcproj | 8 + PC/os2emx/Makefile | 1 + PCbuild/pythoncore.vcproj | 8 + Python/ceval.c | 63 ++++-- Python/ceval_gil.h | 48 ++-- Python/dynamic_annotations.c | 154 +++++++++++++ Python/pystate.c | 37 ++-- Python/thread_pthread.h | 6 + configure | 11 +- configure.in | 1 + 17 files changed, 971 insertions(+), 63 deletions(-) create mode 100644 Include/dynamic_annotations.h create mode 100644 Include/pyatomic.h create mode 100644 Python/dynamic_annotations.c diff --git a/Include/Python.h b/Include/Python.h index 8b038ac..c0e469e 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -49,6 +49,8 @@ #include "pyport.h" +#include "pyatomic.h" + /* Debug-mode build with pymalloc implies PYMALLOC_DEBUG. * PYMALLOC_DEBUG is in error if pymalloc is not in use. */ diff --git a/Include/dynamic_annotations.h b/Include/dynamic_annotations.h new file mode 100644 index 0000000..2f33294 --- /dev/null +++ b/Include/dynamic_annotations.h @@ -0,0 +1,499 @@ +/* Copyright (c) 2008-2009, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Kostya Serebryany
+ * Copied to CPython by Jeffrey Yasskin, with all macros renamed to
+ * start with _Py_ to avoid colliding with users embedding Python, and
+ * with deprecated macros removed.
+ */
+
+/* This file defines dynamic annotations for use with dynamic analysis
+   tools such as valgrind, PIN, etc.
+
+   A dynamic annotation is a source code annotation that affects
+   the generated code (that is, the annotation is not a comment).
+   Each such annotation is attached to a particular
+   instruction and/or to a particular object (address) in the program.
+
+   The annotations that should be used by users are macros in all upper-case
+   (e.g., _Py_ANNOTATE_NEW_MEMORY).
+
+   Actual implementation of these macros may differ depending on the
+   dynamic analysis tool being used.
+
+   See http://code.google.com/p/data-race-test/ for more information.
+
+   This file supports the following dynamic analysis tools:
+   - None (DYNAMIC_ANNOTATIONS_ENABLED is not defined or zero).
+     Macros are defined empty.
+   - ThreadSanitizer, Helgrind, DRD (DYNAMIC_ANNOTATIONS_ENABLED is 1).
+     Macros are defined as calls to non-inlinable empty functions
+     that are intercepted by Valgrind. */
+
+#ifndef __DYNAMIC_ANNOTATIONS_H__
+#define __DYNAMIC_ANNOTATIONS_H__
+
+#ifndef DYNAMIC_ANNOTATIONS_ENABLED
+# define DYNAMIC_ANNOTATIONS_ENABLED 0
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing condition variables such as CondVar,
+     using conditional critical sections (Await/LockWhen) and when constructing
+     user-defined synchronization mechanisms.
+
+     The annotations _Py_ANNOTATE_HAPPENS_BEFORE() and
+     _Py_ANNOTATE_HAPPENS_AFTER() can be used to define happens-before arcs in
+     user-defined synchronization mechanisms: the race detector will infer an
+     arc from the former to the latter when they share the same argument
+     pointer.
+
+     Example 1 (reference counting):
+
+     void Unref() {
+       _Py_ANNOTATE_HAPPENS_BEFORE(&refcount_);
+       if (AtomicDecrementByOne(&refcount_) == 0) {
+         _Py_ANNOTATE_HAPPENS_AFTER(&refcount_);
+         delete this;
+       }
+     }
+
+     Example 2 (message queue):
+
+     void MyQueue::Put(Type *e) {
+       MutexLock lock(&mu_);
+       _Py_ANNOTATE_HAPPENS_BEFORE(e);
+       PutElementIntoMyQueue(e);
+     }
+
+     Type *MyQueue::Get() {
+       MutexLock lock(&mu_);
+       Type *e = GetElementFromMyQueue();
+       _Py_ANNOTATE_HAPPENS_AFTER(e);
+       return e;
+     }
+
+     Note: when possible, please use the existing reference counting and message
+     queue implementations instead of inventing new ones. */
+
+  /* Report that wait on the condition variable at address "cv" has succeeded
+     and the lock at address "lock" is held. */
+  #define _Py_ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) \
+    AnnotateCondVarWait(__FILE__, __LINE__, cv, lock)
+
+  /* Report that wait on the condition variable at "cv" has succeeded. Variant
+     w/o lock. */
+  #define _Py_ANNOTATE_CONDVAR_WAIT(cv) \
+    AnnotateCondVarWait(__FILE__, __LINE__, cv, NULL)
+
+  /* Report that we are about to signal on the condition variable at address
+     "cv". */
+  #define _Py_ANNOTATE_CONDVAR_SIGNAL(cv) \
+    AnnotateCondVarSignal(__FILE__, __LINE__, cv)
+
+  /* Report that we are about to signal_all on the condition variable at "cv". */
+  #define _Py_ANNOTATE_CONDVAR_SIGNAL_ALL(cv) \
+    AnnotateCondVarSignalAll(__FILE__, __LINE__, cv)
+
+  /* Annotations for user-defined synchronization mechanisms. */
+  #define _Py_ANNOTATE_HAPPENS_BEFORE(obj) _Py_ANNOTATE_CONDVAR_SIGNAL(obj)
+  #define _Py_ANNOTATE_HAPPENS_AFTER(obj) _Py_ANNOTATE_CONDVAR_WAIT(obj)
+
+  /* Report that the bytes in the range [pointer, pointer+size) are about
+     to be published safely. The race checker will create a happens-before
+     arc from the call _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to
+     subsequent accesses to this memory.
+     Note: this annotation may not work properly if the race detector uses
+     sampling, i.e. does not observe all memory accesses.
+   */
+  #define _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) \
+    AnnotatePublishMemoryRange(__FILE__, __LINE__, pointer, size)
+
+  /* Instruct the tool to create a happens-before arc between mu->Unlock() and
+     mu->Lock(). This annotation may slow down the race detector and hide real
+     races. Normally it is used only when it would be difficult to annotate each
+     of the mutex's critical sections individually using the annotations above.
+     This annotation makes sense only for hybrid race detectors. For pure
+     happens-before detectors this is a no-op. For more details see
+     http://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid . */
+  #define _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) \
+    AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu)
+
+  /* -------------------------------------------------------------
+     Annotations useful when defining memory allocators, or when memory that
+     was protected in one way starts to be protected in another. */
+
+  /* Report that new memory at "address" of size "size" has been allocated.
+     This might be used when the memory has been retrieved from a free list and
+     is about to be reused, or when the locking discipline for a variable
+     changes. */
+  #define _Py_ANNOTATE_NEW_MEMORY(address, size) \
+    AnnotateNewMemory(__FILE__, __LINE__, address, size)
+
+  /* -------------------------------------------------------------
+     Annotations useful when defining FIFO queues that transfer data between
+     threads. */
+
+  /* Report that the producer-consumer queue (such as ProducerConsumerQueue) at
+     address "pcq" has been created. The _Py_ANNOTATE_PCQ_* annotations should
+     be used only for FIFO queues. For non-FIFO queues use
+     _Py_ANNOTATE_HAPPENS_BEFORE (for put) and _Py_ANNOTATE_HAPPENS_AFTER (for
+     get). */
+  #define _Py_ANNOTATE_PCQ_CREATE(pcq) \
+    AnnotatePCQCreate(__FILE__, __LINE__, pcq)
+
+  /* Report that the queue at address "pcq" is about to be destroyed. */
+  #define _Py_ANNOTATE_PCQ_DESTROY(pcq) \
+    AnnotatePCQDestroy(__FILE__, __LINE__, pcq)
+
+  /* Report that we are about to put an element into a FIFO queue at address
+     "pcq". */
+  #define _Py_ANNOTATE_PCQ_PUT(pcq) \
+    AnnotatePCQPut(__FILE__, __LINE__, pcq)
+
+  /* Report that we've just got an element from a FIFO queue at address "pcq". */
+  #define _Py_ANNOTATE_PCQ_GET(pcq) \
+    AnnotatePCQGet(__FILE__, __LINE__, pcq)
+
+  /* -------------------------------------------------------------
+     Annotations that suppress errors. It is usually better to express the
+     program's synchronization using the other annotations, but these can
+     be used when all else fails. */
+
+  /* Report that we may have a benign race at "pointer", with size
+     "sizeof(*(pointer))". "pointer" must be a non-void* pointer. Insert at the
+     point where "pointer" has been allocated, preferably close to the point
+     where the race happens. See also _Py_ANNOTATE_BENIGN_RACE_STATIC. */
+  #define _Py_ANNOTATE_BENIGN_RACE(pointer, description) \
+    AnnotateBenignRaceSized(__FILE__, __LINE__, pointer, \
+                            sizeof(*(pointer)), description)
+
+  /* Same as _Py_ANNOTATE_BENIGN_RACE(address, description), but applies to
+     the memory range [address, address+size). */
+  #define _Py_ANNOTATE_BENIGN_RACE_SIZED(address, size, description) \
+    AnnotateBenignRaceSized(__FILE__, __LINE__, address, size, description)
+
+  /* Request the analysis tool to ignore all reads in the current thread
+     until _Py_ANNOTATE_IGNORE_READS_END is called.
+     Useful to ignore intentional racy reads, while still checking
+     other reads and all writes.
+     See also _Py_ANNOTATE_UNPROTECTED_READ. */
+  #define _Py_ANNOTATE_IGNORE_READS_BEGIN() \
+    AnnotateIgnoreReadsBegin(__FILE__, __LINE__)
+
+  /* Stop ignoring reads. */
+  #define _Py_ANNOTATE_IGNORE_READS_END() \
+    AnnotateIgnoreReadsEnd(__FILE__, __LINE__)
+
+  /* Similar to _Py_ANNOTATE_IGNORE_READS_BEGIN, but ignore writes. */
+  #define _Py_ANNOTATE_IGNORE_WRITES_BEGIN() \
+    AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
+
+  /* Stop ignoring writes. */
+  #define _Py_ANNOTATE_IGNORE_WRITES_END() \
+    AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
+
+  /* Start ignoring all memory accesses (reads and writes). */
+  #define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \
+    do {\
+      _Py_ANNOTATE_IGNORE_READS_BEGIN();\
+      _Py_ANNOTATE_IGNORE_WRITES_BEGIN();\
+    }while(0)
+
+  /* Stop ignoring all memory accesses. */
+  #define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_END() \
+    do {\
+      _Py_ANNOTATE_IGNORE_WRITES_END();\
+      _Py_ANNOTATE_IGNORE_READS_END();\
+    }while(0)
+
+  /* Similar to _Py_ANNOTATE_IGNORE_READS_BEGIN, but ignore synchronization events:
+     RWLOCK* and CONDVAR*. */
+  #define _Py_ANNOTATE_IGNORE_SYNC_BEGIN() \
+    AnnotateIgnoreSyncBegin(__FILE__, __LINE__)
+
+  /* Stop ignoring sync events. */
+  #define _Py_ANNOTATE_IGNORE_SYNC_END() \
+    AnnotateIgnoreSyncEnd(__FILE__, __LINE__)
+
+
+  /* Enable (enable!=0) or disable (enable==0) race detection for all threads.
+     This annotation could be useful if you want to skip expensive race analysis
+     during some period of program execution, e.g. during initialization. */
+  #define _Py_ANNOTATE_ENABLE_RACE_DETECTION(enable) \
+    AnnotateEnableRaceDetection(__FILE__, __LINE__, enable)
+
+  /* -------------------------------------------------------------
+     Annotations useful for debugging. */
+
+  /* Request to trace every access to "address". */
+  #define _Py_ANNOTATE_TRACE_MEMORY(address) \
+    AnnotateTraceMemory(__FILE__, __LINE__, address)
+
+  /* Report the current thread name to a race detector. */
+  #define _Py_ANNOTATE_THREAD_NAME(name) \
+    AnnotateThreadName(__FILE__, __LINE__, name)
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing locks. They are not
+     normally needed by modules that merely use locks.
+     The "lock" argument is a pointer to the lock object. */
+
+  /* Report that a lock has been created at address "lock". 
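+
+     A sketch of the typical call sequence for a lock implementation
+     ("my_lock" is a hypothetical lock object, not part of this patch;
+     ceval_gil.h later in this patch applies the same pattern to
+     gil_locked):
+
+       _Py_ANNOTATE_RWLOCK_CREATE(&my_lock);       on creation
+       _Py_ANNOTATE_RWLOCK_ACQUIRED(&my_lock, 1);  just after acquiring
+       _Py_ANNOTATE_RWLOCK_RELEASED(&my_lock, 1);  just before releasing
+       _Py_ANNOTATE_RWLOCK_DESTROY(&my_lock);      on destruction
+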
*/ + #define _Py_ANNOTATE_RWLOCK_CREATE(lock) \ + AnnotateRWLockCreate(__FILE__, __LINE__, lock) + + /* Report that the lock at address "lock" is about to be destroyed. */ + #define _Py_ANNOTATE_RWLOCK_DESTROY(lock) \ + AnnotateRWLockDestroy(__FILE__, __LINE__, lock) + + /* Report that the lock at address "lock" has been acquired. + is_w=1 for writer lock, is_w=0 for reader lock. */ + #define _Py_ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) \ + AnnotateRWLockAcquired(__FILE__, __LINE__, lock, is_w) + + /* Report that the lock at address "lock" is about to be released. */ + #define _Py_ANNOTATE_RWLOCK_RELEASED(lock, is_w) \ + AnnotateRWLockReleased(__FILE__, __LINE__, lock, is_w) + + /* ------------------------------------------------------------- + Annotations useful when implementing barriers. They are not + normally needed by modules that merely use barriers. + The "barrier" argument is a pointer to the barrier object. */ + + /* Report that the "barrier" has been initialized with initial "count". + If 'reinitialization_allowed' is true, initialization is allowed to happen + multiple times w/o calling barrier_destroy() */ + #define _Py_ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) \ + AnnotateBarrierInit(__FILE__, __LINE__, barrier, count, \ + reinitialization_allowed) + + /* Report that we are about to enter barrier_wait("barrier"). */ + #define _Py_ANNOTATE_BARRIER_WAIT_BEFORE(barrier) \ + AnnotateBarrierWaitBefore(__FILE__, __LINE__, barrier) + + /* Report that we just exited barrier_wait("barrier"). */ + #define _Py_ANNOTATE_BARRIER_WAIT_AFTER(barrier) \ + AnnotateBarrierWaitAfter(__FILE__, __LINE__, barrier) + + /* Report that the "barrier" has been destroyed. */ + #define _Py_ANNOTATE_BARRIER_DESTROY(barrier) \ + AnnotateBarrierDestroy(__FILE__, __LINE__, barrier) + + /* ------------------------------------------------------------- + Annotations useful for testing race detectors. */ + + /* Report that we expect a race on the variable at "address". + Use only in unit tests for a race detector. */ + #define _Py_ANNOTATE_EXPECT_RACE(address, description) \ + AnnotateExpectRace(__FILE__, __LINE__, address, description) + + /* A no-op. Insert where you like to test the interceptors. */ + #define _Py_ANNOTATE_NO_OP(arg) \ + AnnotateNoOp(__FILE__, __LINE__, arg) + + /* Force the race detector to flush its state. The actual effect depends on + * the implementation of the detector. 
*/ + #define _Py_ANNOTATE_FLUSH_STATE() \ + AnnotateFlushState(__FILE__, __LINE__) + + +#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */ + + #define _Py_ANNOTATE_RWLOCK_CREATE(lock) /* empty */ + #define _Py_ANNOTATE_RWLOCK_DESTROY(lock) /* empty */ + #define _Py_ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) /* empty */ + #define _Py_ANNOTATE_RWLOCK_RELEASED(lock, is_w) /* empty */ + #define _Py_ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) /* */ + #define _Py_ANNOTATE_BARRIER_WAIT_BEFORE(barrier) /* empty */ + #define _Py_ANNOTATE_BARRIER_WAIT_AFTER(barrier) /* empty */ + #define _Py_ANNOTATE_BARRIER_DESTROY(barrier) /* empty */ + #define _Py_ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) /* empty */ + #define _Py_ANNOTATE_CONDVAR_WAIT(cv) /* empty */ + #define _Py_ANNOTATE_CONDVAR_SIGNAL(cv) /* empty */ + #define _Py_ANNOTATE_CONDVAR_SIGNAL_ALL(cv) /* empty */ + #define _Py_ANNOTATE_HAPPENS_BEFORE(obj) /* empty */ + #define _Py_ANNOTATE_HAPPENS_AFTER(obj) /* empty */ + #define _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) /* empty */ + #define _Py_ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size) /* empty */ + #define _Py_ANNOTATE_SWAP_MEMORY_RANGE(address, size) /* empty */ + #define _Py_ANNOTATE_PCQ_CREATE(pcq) /* empty */ + #define _Py_ANNOTATE_PCQ_DESTROY(pcq) /* empty */ + #define _Py_ANNOTATE_PCQ_PUT(pcq) /* empty */ + #define _Py_ANNOTATE_PCQ_GET(pcq) /* empty */ + #define _Py_ANNOTATE_NEW_MEMORY(address, size) /* empty */ + #define _Py_ANNOTATE_EXPECT_RACE(address, description) /* empty */ + #define _Py_ANNOTATE_BENIGN_RACE(address, description) /* empty */ + #define _Py_ANNOTATE_BENIGN_RACE_SIZED(address, size, description) /* empty */ + #define _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) /* empty */ + #define _Py_ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) /* empty */ + #define _Py_ANNOTATE_TRACE_MEMORY(arg) /* empty */ + #define _Py_ANNOTATE_THREAD_NAME(name) /* empty */ + #define _Py_ANNOTATE_IGNORE_READS_BEGIN() /* empty */ + #define _Py_ANNOTATE_IGNORE_READS_END() /* empty */ + #define _Py_ANNOTATE_IGNORE_WRITES_BEGIN() /* empty */ + #define _Py_ANNOTATE_IGNORE_WRITES_END() /* empty */ + #define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() /* empty */ + #define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_END() /* empty */ + #define _Py_ANNOTATE_IGNORE_SYNC_BEGIN() /* empty */ + #define _Py_ANNOTATE_IGNORE_SYNC_END() /* empty */ + #define _Py_ANNOTATE_ENABLE_RACE_DETECTION(enable) /* empty */ + #define _Py_ANNOTATE_NO_OP(arg) /* empty */ + #define _Py_ANNOTATE_FLUSH_STATE() /* empty */ + +#endif /* DYNAMIC_ANNOTATIONS_ENABLED */ + +/* Use the macros above rather than using these functions directly. 
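+   (These declarations stay visible even when DYNAMIC_ANNOTATIONS_ENABLED
+   is 0 so that analysis tools can intercept the functions by name; in
+   that configuration the macros above expand to nothing, so no calls
+   are actually emitted.)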
*/ +#ifdef __cplusplus +extern "C" { +#endif +void AnnotateRWLockCreate(const char *file, int line, + const volatile void *lock); +void AnnotateRWLockDestroy(const char *file, int line, + const volatile void *lock); +void AnnotateRWLockAcquired(const char *file, int line, + const volatile void *lock, long is_w); +void AnnotateRWLockReleased(const char *file, int line, + const volatile void *lock, long is_w); +void AnnotateBarrierInit(const char *file, int line, + const volatile void *barrier, long count, + long reinitialization_allowed); +void AnnotateBarrierWaitBefore(const char *file, int line, + const volatile void *barrier); +void AnnotateBarrierWaitAfter(const char *file, int line, + const volatile void *barrier); +void AnnotateBarrierDestroy(const char *file, int line, + const volatile void *barrier); +void AnnotateCondVarWait(const char *file, int line, + const volatile void *cv, + const volatile void *lock); +void AnnotateCondVarSignal(const char *file, int line, + const volatile void *cv); +void AnnotateCondVarSignalAll(const char *file, int line, + const volatile void *cv); +void AnnotatePublishMemoryRange(const char *file, int line, + const volatile void *address, + long size); +void AnnotateUnpublishMemoryRange(const char *file, int line, + const volatile void *address, + long size); +void AnnotatePCQCreate(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQDestroy(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQPut(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQGet(const char *file, int line, + const volatile void *pcq); +void AnnotateNewMemory(const char *file, int line, + const volatile void *address, + long size); +void AnnotateExpectRace(const char *file, int line, + const volatile void *address, + const char *description); +void AnnotateBenignRace(const char *file, int line, + const volatile void *address, + const char *description); +void AnnotateBenignRaceSized(const char *file, int line, + const volatile void *address, + long size, + const char *description); +void AnnotateMutexIsUsedAsCondVar(const char *file, int line, + const volatile void *mu); +void AnnotateTraceMemory(const char *file, int line, + const volatile void *arg); +void AnnotateThreadName(const char *file, int line, + const char *name); +void AnnotateIgnoreReadsBegin(const char *file, int line); +void AnnotateIgnoreReadsEnd(const char *file, int line); +void AnnotateIgnoreWritesBegin(const char *file, int line); +void AnnotateIgnoreWritesEnd(const char *file, int line); +void AnnotateEnableRaceDetection(const char *file, int line, int enable); +void AnnotateNoOp(const char *file, int line, + const volatile void *arg); +void AnnotateFlushState(const char *file, int line); + +/* Return non-zero value if running under valgrind. + + If "valgrind.h" is included into dynamic_annotations.c, + the regular valgrind mechanism will be used. + See http://valgrind.org/docs/manual/manual-core-adv.html about + RUNNING_ON_VALGRIND and other valgrind "client requests". + The file "valgrind.h" may be obtained by doing + svn co svn://svn.valgrind.org/valgrind/trunk/include + + If for some reason you can't use "valgrind.h" or want to fake valgrind, + there are two ways to make this function return non-zero: + - Use environment variable: export RUNNING_ON_VALGRIND=1 + - Make your tool intercept the function RunningOnValgrind() and + change its return value. 
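+
+   A minimal sketch of the intended use (illustrative only, not taken
+   from this patch): code that behaves badly under a race detector can
+   branch on the result, e.g.
+
+     if (RunningOnValgrind()) {
+       ... take a slower, tool-friendly path ...
+     }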
+ */
+int RunningOnValgrind(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0 && defined(__cplusplus)
+
+  /* _Py_ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racy reads.
+
+     Instead of doing
+        _Py_ANNOTATE_IGNORE_READS_BEGIN();
+        ... = x;
+        _Py_ANNOTATE_IGNORE_READS_END();
+     one can use
+        ... = _Py_ANNOTATE_UNPROTECTED_READ(x); */
+  template <class T>
+  inline T _Py_ANNOTATE_UNPROTECTED_READ(const volatile T &x) {
+    _Py_ANNOTATE_IGNORE_READS_BEGIN();
+    T res = x;
+    _Py_ANNOTATE_IGNORE_READS_END();
+    return res;
+  }
+  /* Apply _Py_ANNOTATE_BENIGN_RACE_SIZED to a static variable. */
+  #define _Py_ANNOTATE_BENIGN_RACE_STATIC(static_var, description) \
+    namespace { \
+      class static_var ## _annotator { \
+       public: \
+        static_var ## _annotator() { \
+          _Py_ANNOTATE_BENIGN_RACE_SIZED(&static_var, \
+                                         sizeof(static_var), \
+                                         # static_var ": " description); \
+        } \
+      }; \
+      static static_var ## _annotator the ## static_var ## _annotator;\
+    }
+#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
+
+  #define _Py_ANNOTATE_UNPROTECTED_READ(x) (x)
+  #define _Py_ANNOTATE_BENIGN_RACE_STATIC(static_var, description) /* empty */
+
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED */
+
+#endif /* __DYNAMIC_ANNOTATIONS_H__ */
diff --git a/Include/pyatomic.h b/Include/pyatomic.h
new file mode 100644
index 0000000..7d3449d
--- /dev/null
+++ b/Include/pyatomic.h
@@ -0,0 +1,179 @@
+#ifndef Py_ATOMIC_H
+#define Py_ATOMIC_H
+/* XXX: When compilers start offering a stdatomic.h with lock-free
+   atomic_int and atomic_address types, include that here and rewrite
+   the atomic operations in terms of it. */
+
+#include "dynamic_annotations.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This is modeled after the atomics interface from C1x, according to
+ * the draft at
+ * http://www.open-std.org/JTC1/SC22/wg14/www/docs/n1425.pdf.
+ * Operations and types are named the same except with a _Py_ prefix
+ * and have the same semantics.
+ *
+ * Beware, the implementations here are deep magic.
+ */
+
+typedef enum _Py_memory_order {
+    _Py_memory_order_relaxed,
+    _Py_memory_order_acquire,
+    _Py_memory_order_release,
+    _Py_memory_order_acq_rel,
+    _Py_memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {
+    void *_value;
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {
+    int _value;
+} _Py_atomic_int;
+
+/* Only support GCC (for expression statements) and x86 (for simple
+ * atomic semantics) for now */
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64))
+
+static __inline__ void
+_Py_atomic_signal_fence(_Py_memory_order order)
+{
+    if (order != _Py_memory_order_relaxed)
+        __asm__ volatile("":::"memory");
+}
+
+static __inline__ void
+_Py_atomic_thread_fence(_Py_memory_order order)
+{
+    if (order != _Py_memory_order_relaxed)
+        __asm__ volatile("mfence":::"memory");
+}
+
+/* Tell the race checker about this operation's effects. 
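+   Release-flavored orders announce a happens-before edge on the address
+   (via _Py_ANNOTATE_HAPPENS_BEFORE) and acquire-flavored orders consume
+   one (via _Py_ANNOTATE_HAPPENS_AFTER), so the checker models the same
+   ordering that the fences in the operations below enforce.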
*/ +static __inline__ void +_Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order) +{ + switch(order) { + case _Py_memory_order_release: + case _Py_memory_order_acq_rel: + case _Py_memory_order_seq_cst: + _Py_ANNOTATE_HAPPENS_BEFORE(address); + break; + default: + break; + } + switch(order) { + case _Py_memory_order_acquire: + case _Py_memory_order_acq_rel: + case _Py_memory_order_seq_cst: + _Py_ANNOTATE_HAPPENS_AFTER(address); + break; + default: + break; + } +} + +#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \ + __extension__ ({ \ + __typeof__(ATOMIC_VAL) atomic_val = ATOMIC_VAL; \ + __typeof__(atomic_val->_value) new_val = NEW_VAL;\ + volatile __typeof__(new_val) *volatile_data = &atomic_val->_value; \ + _Py_memory_order order = ORDER; \ + _Py_ANNOTATE_MEMORY_ORDER(atomic_val, order); \ + \ + /* Perform the operation. */ \ + _Py_ANNOTATE_IGNORE_WRITES_BEGIN(); \ + switch(order) { \ + case _Py_memory_order_release: \ + _Py_atomic_signal_fence(_Py_memory_order_release); \ + /* fallthrough */ \ + case _Py_memory_order_relaxed: \ + *volatile_data = new_val; \ + break; \ + \ + case _Py_memory_order_acquire: \ + case _Py_memory_order_acq_rel: \ + case _Py_memory_order_seq_cst: \ + __asm__ volatile("xchg %0, %1" \ + : "+r"(new_val) \ + : "m"(atomic_val->_value) \ + : "memory"); \ + break; \ + } \ + _Py_ANNOTATE_IGNORE_WRITES_END(); \ + }) + +#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \ + __extension__ ({ \ + __typeof__(ATOMIC_VAL) atomic_val = ATOMIC_VAL; \ + __typeof__(atomic_val->_value) result; \ + volatile __typeof__(result) *volatile_data = &atomic_val->_value; \ + _Py_memory_order order = ORDER; \ + _Py_ANNOTATE_MEMORY_ORDER(atomic_val, order); \ + \ + /* Perform the operation. */ \ + _Py_ANNOTATE_IGNORE_READS_BEGIN(); \ + switch(order) { \ + case _Py_memory_order_release: \ + case _Py_memory_order_acq_rel: \ + case _Py_memory_order_seq_cst: \ + /* Loads on x86 are not releases by default, so need a */ \ + /* thread fence. */ \ + _Py_atomic_thread_fence(_Py_memory_order_release); \ + break; \ + default: \ + /* No fence */ \ + break; \ + } \ + result = *volatile_data; \ + switch(order) { \ + case _Py_memory_order_acquire: \ + case _Py_memory_order_acq_rel: \ + case _Py_memory_order_seq_cst: \ + /* Loads on x86 are automatically acquire operations so */ \ + /* can get by with just a compiler fence. */ \ + _Py_atomic_signal_fence(_Py_memory_order_acquire); \ + break; \ + default: \ + /* No fence */ \ + break; \ + } \ + _Py_ANNOTATE_IGNORE_READS_END(); \ + result; \ + }) + +#else /* !gcc x86 */ +/* Fall back to other compilers and processors by assuming that simple + volatile accesses are atomic. This is false, so people should port + this. */ +#define _Py_atomic_signal_fence(/*memory_order*/ ORDER) ((void)0) +#define _Py_atomic_thread_fence(/*memory_order*/ ORDER) ((void)0) +#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \ + ((ATOMIC_VAL)->_value = NEW_VAL) +#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \ + ((ATOMIC_VAL)->_value) + +#endif /* !gcc x86 */ + +/* Standardized shortcuts. 
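+
+   A minimal usage sketch ("flag" is a hypothetical variable, not part
+   of this patch):
+
+     static _Py_atomic_int flag = {0};
+     _Py_atomic_store(&flag, 1);            sequentially consistent store
+     if (_Py_atomic_load(&flag))            sequentially consistent load
+         ...;
+     _Py_atomic_load_relaxed(&flag);        atomic, but no ordering fence
+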
*/ +#define _Py_atomic_store(ATOMIC_VAL, NEW_VAL) \ + _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, _Py_memory_order_seq_cst) +#define _Py_atomic_load(ATOMIC_VAL) \ + _Py_atomic_load_explicit(ATOMIC_VAL, _Py_memory_order_seq_cst) + +/* Python-local extensions */ + +#define _Py_atomic_store_relaxed(ATOMIC_VAL, NEW_VAL) \ + _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, _Py_memory_order_relaxed) +#define _Py_atomic_load_relaxed(ATOMIC_VAL) \ + _Py_atomic_load_explicit(ATOMIC_VAL, _Py_memory_order_relaxed) + +#ifdef __cplusplus +} +#endif + +#endif /* Py_ATOMIC_H */ diff --git a/Include/pystate.h b/Include/pystate.h index f85fa8c..7b6b602 100644 --- a/Include/pystate.h +++ b/Include/pystate.h @@ -131,12 +131,15 @@ PyAPI_FUNC(int) PyThreadState_SetAsyncExc(long, PyObject *); /* Variable and macro for in-line access to current thread state */ -PyAPI_DATA(PyThreadState *) _PyThreadState_Current; +/* Assuming the current thread holds the GIL, this is the + PyThreadState for the current thread. */ +PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current; #ifdef Py_DEBUG #define PyThreadState_GET() PyThreadState_Get() #else -#define PyThreadState_GET() (_PyThreadState_Current) +#define PyThreadState_GET() \ + ((PyThreadState*)_Py_atomic_load_relaxed(&_PyThreadState_Current)) #endif typedef diff --git a/Makefile.pre.in b/Makefile.pre.in index 4db7615..55cac90 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -238,6 +238,7 @@ PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/tokenizer.o PGOBJS= \ Objects/obmalloc.o \ + Python/dynamic_annotations.o \ Python/mysnprintf.o \ Python/pyctype.o \ Parser/tokenizer_pgen.o \ @@ -283,6 +284,7 @@ PYTHON_OBJS= \ Python/ceval.o \ Python/compile.o \ Python/codecs.o \ + Python/dynamic_annotations.o \ Python/errors.o \ Python/frozen.o \ Python/frozenmain.o \ @@ -644,6 +646,7 @@ PYTHON_HEADERS= \ Include/descrobject.h \ Include/dictobject.h \ Include/dtoa.h \ + Include/dynamic_annotations.h \ Include/enumobject.h \ Include/errcode.h \ Include/eval.h \ @@ -674,6 +677,7 @@ PYTHON_HEADERS= \ Include/pgen.h \ Include/pgenheaders.h \ Include/pyarena.h \ + Include/pyatomic.h \ Include/pycapsule.h \ Include/pyctype.h \ Include/pydebug.h \ diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 5433ff7..2eddc86 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -734,7 +734,8 @@ PyDict_GetItem(PyObject *op, PyObject *key) Let's just hope that no exception occurs then... This must be _PyThreadState_Current and not PyThreadState_GET() because in debug mode, the latter complains if tstate is NULL. 
*/ - tstate = _PyThreadState_Current; + tstate = (PyThreadState*)_Py_atomic_load_relaxed( + &_PyThreadState_Current); if (tstate != NULL && tstate->curexc_type != NULL) { /* preserve the existing exception */ PyObject *err_type, *err_value, *err_tb; diff --git a/PC/VS7.1/pythoncore.vcproj b/PC/VS7.1/pythoncore.vcproj index b5b4f58..a1a2501 100644 --- a/PC/VS7.1/pythoncore.vcproj +++ b/PC/VS7.1/pythoncore.vcproj @@ -508,6 +508,9 @@ RelativePath="..\..\PC\config.c"> + + + + @@ -1655,6 +1659,10 @@ > + + diff --git a/PC/os2emx/Makefile b/PC/os2emx/Makefile index 0db46a5..d9cded7 100644 --- a/PC/os2emx/Makefile +++ b/PC/os2emx/Makefile @@ -332,6 +332,7 @@ SRC.PYTHON= $(addprefix $(TOP), \ Python/ceval.c \ Python/compile.c \ Python/codecs.c \ + Python/dynamic_annotations.c \ Python/errors.c \ Python/frozen.c \ Python/frozenmain.c \ diff --git a/PCbuild/pythoncore.vcproj b/PCbuild/pythoncore.vcproj index fde5cd1..2388a71 100644 --- a/PCbuild/pythoncore.vcproj +++ b/PCbuild/pythoncore.vcproj @@ -703,6 +703,10 @@ > + + @@ -1660,6 +1664,10 @@ > + + diff --git a/Python/ceval.c b/Python/ceval.c index 0c14eb0..09f939e 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -216,23 +216,46 @@ PyEval_GetCallStats(PyObject *self) #endif +/* This can set eval_breaker to 0 even though gil_drop_request became + 1. We believe this is all right because the eval loop will release + the GIL eventually anyway. */ #define COMPUTE_EVAL_BREAKER() \ - (eval_breaker = gil_drop_request | pendingcalls_to_do | pending_async_exc) + _Py_atomic_store_relaxed( \ + &eval_breaker, \ + _Py_atomic_load_relaxed(&gil_drop_request) | \ + _Py_atomic_load_relaxed(&pendingcalls_to_do) | \ + pending_async_exc) #define SET_GIL_DROP_REQUEST() \ - do { gil_drop_request = 1; eval_breaker = 1; } while (0) + do { \ + _Py_atomic_store_relaxed(&gil_drop_request, 1); \ + _Py_atomic_store_relaxed(&eval_breaker, 1); \ + } while (0) #define RESET_GIL_DROP_REQUEST() \ - do { gil_drop_request = 0; COMPUTE_EVAL_BREAKER(); } while (0) + do { \ + _Py_atomic_store_relaxed(&gil_drop_request, 0); \ + COMPUTE_EVAL_BREAKER(); \ + } while (0) +/* Pending calls are only modified under pending_lock */ #define SIGNAL_PENDING_CALLS() \ - do { pendingcalls_to_do = 1; eval_breaker = 1; } while (0) + do { \ + _Py_atomic_store_relaxed(&pendingcalls_to_do, 1); \ + _Py_atomic_store_relaxed(&eval_breaker, 1); \ + } while (0) #define UNSIGNAL_PENDING_CALLS() \ - do { pendingcalls_to_do = 0; COMPUTE_EVAL_BREAKER(); } while (0) + do { \ + _Py_atomic_store_relaxed(&pendingcalls_to_do, 0); \ + COMPUTE_EVAL_BREAKER(); \ + } while (0) #define SIGNAL_ASYNC_EXC() \ - do { pending_async_exc = 1; eval_breaker = 1; } while (0) + do { \ + pending_async_exc = 1; \ + _Py_atomic_store_relaxed(&eval_breaker, 1); \ + } while (0) #define UNSIGNAL_ASYNC_EXC() \ do { pending_async_exc = 0; COMPUTE_EVAL_BREAKER(); } while (0) @@ -249,13 +272,14 @@ static PyThread_type_lock pending_lock = 0; /* for pending calls */ static long main_thread = 0; /* This single variable consolidates all requests to break out of the fast path in the eval loop. 
*/ -static volatile int eval_breaker = 0; -/* Request for droppping the GIL */ -static volatile int gil_drop_request = 0; -/* Request for running pending calls */ -static volatile int pendingcalls_to_do = 0; -/* Request for looking at the `async_exc` field of the current thread state */ -static volatile int pending_async_exc = 0; +static _Py_atomic_int eval_breaker = {0}; +/* Request for dropping the GIL */ +static _Py_atomic_int gil_drop_request = {0}; +/* Request for running pending calls. */ +static _Py_atomic_int pendingcalls_to_do = {0}; +/* Request for looking at the `async_exc` field of the current thread state. + Guarded by the GIL. */ +static int pending_async_exc = 0; #include "ceval_gil.h" @@ -293,7 +317,8 @@ PyEval_ReleaseLock(void) We therefore avoid PyThreadState_GET() which dumps a fatal error in debug mode. */ - drop_gil(_PyThreadState_Current); + drop_gil((PyThreadState*)_Py_atomic_load_relaxed( + &_PyThreadState_Current)); } void @@ -360,8 +385,8 @@ PyEval_ReInitThreads(void) } #else -static int eval_breaker = 0; -static int gil_drop_request = 0; +static _Py_atomic_int eval_breaker = {0}; +static _Py_atomic_int gil_drop_request = {0}; static int pending_async_exc = 0; #endif /* WITH_THREAD */ @@ -1217,7 +1242,7 @@ PyEval_EvalFrameEx(PyFrameObject *f, int throwflag) async I/O handler); see Py_AddPendingCall() and Py_MakePendingCalls() above. */ - if (eval_breaker) { + if (_Py_atomic_load_relaxed(&eval_breaker)) { if (*next_instr == SETUP_FINALLY) { /* Make the last opcode before a try: finally: block uninterruptable. */ @@ -1227,13 +1252,13 @@ PyEval_EvalFrameEx(PyFrameObject *f, int throwflag) #ifdef WITH_TSC ticked = 1; #endif - if (pendingcalls_to_do) { + if (_Py_atomic_load_relaxed(&pendingcalls_to_do)) { if (Py_MakePendingCalls() < 0) { why = WHY_EXCEPTION; goto on_error; } } - if (gil_drop_request) { + if (_Py_atomic_load_relaxed(&gil_drop_request)) { #ifdef WITH_THREAD /* Give another thread a chance */ if (PyThreadState_Swap(NULL) != tstate) diff --git a/Python/ceval_gil.h b/Python/ceval_gil.h index d4d6fdd..a284c5d 100644 --- a/Python/ceval_gil.h +++ b/Python/ceval_gil.h @@ -207,14 +207,14 @@ do { \ #endif /* _POSIX_THREADS, NT_THREADS */ -/* Whether the GIL is already taken (-1 if uninitialized). This is volatile +/* Whether the GIL is already taken (-1 if uninitialized). This is atomic because it can be read without any lock taken in ceval.c. */ -static volatile int gil_locked = -1; +static _Py_atomic_int gil_locked = {-1}; /* Number of GIL switches since the beginning. */ static unsigned long gil_switch_number = 0; -/* Last thread holding / having held the GIL. This helps us know whether - anyone else was scheduled after we dropped the GIL. */ -static PyThreadState *gil_last_holder = NULL; +/* Last PyThreadState holding / having held the GIL. This helps us know + whether anyone else was scheduled after we dropped the GIL. */ +static _Py_atomic_address gil_last_holder = {NULL}; /* This condition variable allows one or several threads to wait until the GIL is released. 
In addition, the mutex also protects the above @@ -232,7 +232,7 @@ static MUTEX_T switch_mutex; static int gil_created(void) { - return gil_locked >= 0; + return _Py_atomic_load_explicit(&gil_locked, _Py_memory_order_acquire) >= 0; } static void create_gil(void) @@ -245,33 +245,37 @@ static void create_gil(void) #ifdef FORCE_SWITCHING COND_INIT(switch_cond); #endif - gil_locked = 0; - gil_last_holder = NULL; + _Py_atomic_store_relaxed(&gil_last_holder, NULL); + _Py_ANNOTATE_RWLOCK_CREATE(&gil_locked); + _Py_atomic_store_explicit(&gil_locked, 0, _Py_memory_order_release); } static void recreate_gil(void) { + _Py_ANNOTATE_RWLOCK_DESTROY(&gil_locked); create_gil(); } static void drop_gil(PyThreadState *tstate) { /* NOTE: tstate is allowed to be NULL. */ - if (!gil_locked) + if (!_Py_atomic_load_relaxed(&gil_locked)) Py_FatalError("drop_gil: GIL is not locked"); - if (tstate != NULL && tstate != gil_last_holder) + if (tstate != NULL && + tstate != _Py_atomic_load_relaxed(&gil_last_holder)) Py_FatalError("drop_gil: wrong thread state"); MUTEX_LOCK(gil_mutex); - gil_locked = 0; + _Py_ANNOTATE_RWLOCK_RELEASED(&gil_locked, /*is_write=*/1); + _Py_atomic_store_relaxed(&gil_locked, 0); COND_SIGNAL(gil_cond); MUTEX_UNLOCK(gil_mutex); #ifdef FORCE_SWITCHING - if (gil_drop_request && tstate != NULL) { + if (_Py_atomic_load_relaxed(&gil_drop_request) && tstate != NULL) { MUTEX_LOCK(switch_mutex); /* Not switched yet => wait */ - if (gil_last_holder == tstate) { + if (_Py_atomic_load_relaxed(&gil_last_holder) == tstate) { RESET_GIL_DROP_REQUEST(); /* NOTE: if COND_WAIT does not atomically start waiting when releasing the mutex, another thread can run through, take @@ -294,11 +298,11 @@ static void take_gil(PyThreadState *tstate) err = errno; MUTEX_LOCK(gil_mutex); - if (!gil_locked) + if (!_Py_atomic_load_relaxed(&gil_locked)) goto _ready; COND_RESET(gil_cond); - while (gil_locked) { + while (_Py_atomic_load_relaxed(&gil_locked)) { int timed_out = 0; unsigned long saved_switchnum; @@ -306,7 +310,9 @@ static void take_gil(PyThreadState *tstate) COND_TIMED_WAIT(gil_cond, gil_mutex, INTERVAL, timed_out); /* If we timed out and no switch occurred in the meantime, it is time to ask the GIL-holding thread to drop it. */ - if (timed_out && gil_locked && gil_switch_number == saved_switchnum) { + if (timed_out && + _Py_atomic_load_relaxed(&gil_locked) && + gil_switch_number == saved_switchnum) { SET_GIL_DROP_REQUEST(); } } @@ -316,17 +322,19 @@ _ready: MUTEX_LOCK(switch_mutex); #endif /* We now hold the GIL */ - gil_locked = 1; + _Py_atomic_store_relaxed(&gil_locked, 1); + _Py_ANNOTATE_RWLOCK_ACQUIRED(&gil_locked, /*is_write=*/1); - if (tstate != gil_last_holder) { - gil_last_holder = tstate; + if (tstate != _Py_atomic_load_relaxed(&gil_last_holder)) { + _Py_atomic_store_relaxed(&gil_last_holder, tstate); ++gil_switch_number; } + #ifdef FORCE_SWITCHING COND_SIGNAL(switch_cond); MUTEX_UNLOCK(switch_mutex); #endif - if (gil_drop_request) { + if (_Py_atomic_load_relaxed(&gil_drop_request)) { RESET_GIL_DROP_REQUEST(); } if (tstate->async_exc != NULL) { diff --git a/Python/dynamic_annotations.c b/Python/dynamic_annotations.c new file mode 100644 index 0000000..10511da --- /dev/null +++ b/Python/dynamic_annotations.c @@ -0,0 +1,154 @@ +/* Copyright (c) 2008-2009, Google Inc. + * All rights reserved. 
*
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Neither the name of Google Inc. nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Kostya Serebryany
+ */
+
+#ifdef _MSC_VER
+# include <windows.h>
+#endif
+
+#ifdef __cplusplus
+# error "This file should be built as pure C to avoid name mangling"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "dynamic_annotations.h"
+
+/* Each function is empty and called (via a macro) only in debug mode.
+   The arguments are captured by dynamic tools at runtime. */
+
+#if DYNAMIC_ANNOTATIONS_ENABLED == 1
+
+void AnnotateRWLockCreate(const char *file, int line,
+                          const volatile void *lock){}
+void AnnotateRWLockDestroy(const char *file, int line,
+                           const volatile void *lock){}
+void AnnotateRWLockAcquired(const char *file, int line,
+                            const volatile void *lock, long is_w){}
+void AnnotateRWLockReleased(const char *file, int line,
+                            const volatile void *lock, long is_w){}
+void AnnotateBarrierInit(const char *file, int line,
+                         const volatile void *barrier, long count,
+                         long reinitialization_allowed) {}
+void AnnotateBarrierWaitBefore(const char *file, int line,
+                               const volatile void *barrier) {}
+void AnnotateBarrierWaitAfter(const char *file, int line,
+                              const volatile void *barrier) {}
+void AnnotateBarrierDestroy(const char *file, int line,
+                            const volatile void *barrier) {}
+
+void AnnotateCondVarWait(const char *file, int line,
+                         const volatile void *cv,
+                         const volatile void *lock){}
+void AnnotateCondVarSignal(const char *file, int line,
+                           const volatile void *cv){}
+void AnnotateCondVarSignalAll(const char *file, int line,
+                              const volatile void *cv){}
+void AnnotatePublishMemoryRange(const char *file, int line,
+                                const volatile void *address,
+                                long size){}
+void AnnotateUnpublishMemoryRange(const char *file, int line,
+                                  const volatile void *address,
+                                  long size){}
+void AnnotatePCQCreate(const char *file, int line,
+                       const volatile void *pcq){}
+void AnnotatePCQDestroy(const char *file, int line,
+                        const volatile void *pcq){}
+void AnnotatePCQPut(const char *file, int line,
+                    const volatile void *pcq){}
+void AnnotatePCQGet(const char *file, int line,
+                    const volatile void *pcq){}
+void AnnotateNewMemory(const char *file, int line,
+                       const volatile void *mem,
+                       long size){}
+void AnnotateExpectRace(const char *file, int line,
+                        const volatile void *mem,
+                        const char *description){}
+void AnnotateBenignRace(const char *file, int line,
+                        const volatile void *mem,
+                        const char *description){}
+void AnnotateBenignRaceSized(const char *file, int line,
+                             const volatile void *mem,
+                             long size,
+                             const char *description) {}
+void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
+                                  const volatile void *mu){}
+void AnnotateTraceMemory(const char *file, int line,
+                         const volatile void *arg){}
+void AnnotateThreadName(const char *file, int line,
+                        const char *name){}
+void AnnotateIgnoreReadsBegin(const char *file, int line){}
+void AnnotateIgnoreReadsEnd(const char *file, int line){}
+void AnnotateIgnoreWritesBegin(const char *file, int line){}
+void AnnotateIgnoreWritesEnd(const char *file, int line){}
+void AnnotateIgnoreSyncBegin(const char *file, int line){}
+void AnnotateIgnoreSyncEnd(const char *file, int line){}
+void AnnotateEnableRaceDetection(const char *file, int line, int enable){}
+void AnnotateNoOp(const char *file, int line,
+                  const volatile void *arg){}
+void AnnotateFlushState(const char *file, int line){}
+
+static int GetRunningOnValgrind(void) {
+#ifdef RUNNING_ON_VALGRIND
+  if (RUNNING_ON_VALGRIND) return 1;
+#endif
+
+#ifndef _MSC_VER
+  char *running_on_valgrind_str = getenv("RUNNING_ON_VALGRIND");
+  if (running_on_valgrind_str) {
+    return strcmp(running_on_valgrind_str, "0") != 0;
+  }
+#else
+  /* Visual Studio issues warnings if we use getenv,
+   * so we use GetEnvironmentVariableA instead.
+   */
+  char value[100] = "1";
+  int res = GetEnvironmentVariableA("RUNNING_ON_VALGRIND",
+                                    value, sizeof(value));
+  /* value will remain "1" if res == 0 or res >= sizeof(value). The latter
+   * can happen only if the given value is long, in this case it can't be "0".
+   */
+  if (res > 0 && strcmp(value, "0") != 0)
+    return 1;
+#endif
+  return 0;
+}
+
+/* See the comments in dynamic_annotations.h */
+int RunningOnValgrind(void) {
+  static volatile int running_on_valgrind = -1;
+  /* C doesn't have thread-safe initialization of statics, and we
+     don't want to depend on pthread_once here, so hack it. */
+  int local_running_on_valgrind = running_on_valgrind;
+  if (local_running_on_valgrind == -1)
+    running_on_valgrind = local_running_on_valgrind = GetRunningOnValgrind();
+  return local_running_on_valgrind;
+}
+
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED == 1 */
diff --git a/Python/pystate.c b/Python/pystate.c
index eb2dfa6..7154aea 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -47,7 +47,9 @@ static int autoTLSkey = 0;
 
 static PyInterpreterState *interp_head = NULL;
 
-PyThreadState *_PyThreadState_Current = NULL;
+/* Assuming the current thread holds the GIL, this is the
+   PyThreadState for the current thread. 
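+   It is an _Py_atomic_address rather than a plain pointer because a
+   few audited spots (e.g. the PyDict_GetItem error path earlier in
+   this patch) read it without holding the GIL, always through relaxed
+   atomic loads.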
*/ +_Py_atomic_address _PyThreadState_Current = {NULL}; PyThreadFrameGetter _PyThreadState_GetFrame = NULL; #ifdef WITH_THREAD @@ -334,7 +336,7 @@ tstate_delete_common(PyThreadState *tstate) void PyThreadState_Delete(PyThreadState *tstate) { - if (tstate == _PyThreadState_Current) + if (tstate == _Py_atomic_load_relaxed(&_PyThreadState_Current)) Py_FatalError("PyThreadState_Delete: tstate is still current"); tstate_delete_common(tstate); #ifdef WITH_THREAD @@ -348,11 +350,12 @@ PyThreadState_Delete(PyThreadState *tstate) void PyThreadState_DeleteCurrent() { - PyThreadState *tstate = _PyThreadState_Current; + PyThreadState *tstate = (PyThreadState*)_Py_atomic_load_relaxed( + &_PyThreadState_Current); if (tstate == NULL) Py_FatalError( "PyThreadState_DeleteCurrent: no current tstate"); - _PyThreadState_Current = NULL; + _Py_atomic_store_relaxed(&_PyThreadState_Current, NULL); tstate_delete_common(tstate); if (autoTLSkey && PyThread_get_key_value(autoTLSkey) == tstate) PyThread_delete_key_value(autoTLSkey); @@ -364,19 +367,22 @@ PyThreadState_DeleteCurrent() PyThreadState * PyThreadState_Get(void) { - if (_PyThreadState_Current == NULL) + PyThreadState *tstate = (PyThreadState*)_Py_atomic_load_relaxed( + &_PyThreadState_Current); + if (tstate == NULL) Py_FatalError("PyThreadState_Get: no current thread"); - return _PyThreadState_Current; + return tstate; } PyThreadState * PyThreadState_Swap(PyThreadState *newts) { - PyThreadState *oldts = _PyThreadState_Current; + PyThreadState *oldts = (PyThreadState*)_Py_atomic_load_relaxed( + &_PyThreadState_Current); - _PyThreadState_Current = newts; + _Py_atomic_store_relaxed(&_PyThreadState_Current, newts); /* It should not be possible for more than one thread state to be used for a thread. Check this the best we can in debug builds. @@ -405,16 +411,18 @@ PyThreadState_Swap(PyThreadState *newts) PyObject * PyThreadState_GetDict(void) { - if (_PyThreadState_Current == NULL) + PyThreadState *tstate = (PyThreadState*)_Py_atomic_load_relaxed( + &_PyThreadState_Current); + if (tstate == NULL) return NULL; - if (_PyThreadState_Current->dict == NULL) { + if (tstate->dict == NULL) { PyObject *d; - _PyThreadState_Current->dict = d = PyDict_New(); + tstate->dict = d = PyDict_New(); if (d == NULL) PyErr_Clear(); } - return _PyThreadState_Current->dict; + return tstate->dict; } @@ -550,10 +558,7 @@ PyThreadState_IsCurrent(PyThreadState *tstate) { /* Must be the tstate for this thread */ assert(PyGILState_GetThisThreadState()==tstate); - /* On Windows at least, simple reads and writes to 32 bit values - are atomic. - */ - return tstate == _PyThreadState_Current; + return tstate == _Py_atomic_load_relaxed(&_PyThreadState_Current); } /* Internal initialization/finalization functions called by diff --git a/Python/thread_pthread.h b/Python/thread_pthread.h index 6088c71..f60f36d 100644 --- a/Python/thread_pthread.h +++ b/Python/thread_pthread.h @@ -397,6 +397,12 @@ PyThread_allocate_lock(void) status = pthread_mutex_init(&lock->mut, pthread_mutexattr_default); CHECK_STATUS("pthread_mutex_init"); + /* Mark the pthread mutex underlying a Python mutex as + pure happens-before. We can't simply mark the + Python-level mutex as a mutex because it can be + acquired and released in different threads, which + will cause errors. */ + _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(&lock->mut); status = pthread_cond_init(&lock->lock_released, pthread_condattr_default); diff --git a/configure b/configure index 7afe2c8..f0eb99e 100755 --- a/configure +++ b/configure @@ -1,5 +1,5 @@ #! 
/bin/sh -# From configure.in Revision: 80648 . +# From configure.in Revision: 80666 . # Guess values for system-dependent variables and create Makefiles. # Generated by GNU Autoconf 2.65 for python 3.2. # @@ -1929,11 +1929,11 @@ else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ $ac_includes_default + enum { N = $2 / 2 - 1 }; int main () { -static int test_array [1 - 2 * !(enum { N = $2 / 2 - 1 }; - 0 < ($ac_type) ((((($ac_type) 1 << N) << N) - 1) * 2 + 1))]; +static int test_array [1 - 2 * !(0 < ($ac_type) ((((($ac_type) 1 << N) << N) - 1) * 2 + 1))]; test_array [0] = 0 ; @@ -1944,11 +1944,11 @@ if ac_fn_c_try_compile "$LINENO"; then : cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ $ac_includes_default + enum { N = $2 / 2 - 1 }; int main () { -static int test_array [1 - 2 * !(enum { N = $2 / 2 - 1 }; - ($ac_type) ((((($ac_type) 1 << N) << N) - 1) * 2 + 1) +static int test_array [1 - 2 * !(($ac_type) ((((($ac_type) 1 << N) << N) - 1) * 2 + 1) < ($ac_type) ((((($ac_type) 1 << N) << N) - 1) * 2 + 2))]; test_array [0] = 0 @@ -9210,6 +9210,7 @@ else fi + OPT="-DDYNAMIC_ANNOTATIONS_ENABLED=1 $OPT" fi # Check for --with-wctype-functions diff --git a/configure.in b/configure.in index 0cd867c..f991823 100644 --- a/configure.in +++ b/configure.in @@ -2515,6 +2515,7 @@ if test "$with_valgrind" != no; then [AC_DEFINE([WITH_VALGRIND], 1, [Define if you want pymalloc to be disabled when running under valgrind])], [AC_MSG_ERROR([Valgrind support requested but headers not available])] ) + OPT="-DDYNAMIC_ANNOTATIONS_ENABLED=1 $OPT" fi # Check for --with-wctype-functions -- cgit v0.12