From fa2d64c94b07ee21a0f6f44b9fe6e3bbefa51c6c Mon Sep 17 00:00:00 2001
From: Jason Evans
Date: Sun, 12 Feb 2017 17:03:46 -0800
Subject: Convert arena->prof_accumbytes synchronization to atomics.

---
 include/jemalloc/internal/arena_inlines_a.h      |  34 +---
 include/jemalloc/internal/arena_structs_b.h      |   3 +-
 include/jemalloc/internal/atomic_inlines.h       |   4 +-
 include/jemalloc/internal/atomic_types.h         |   8 +
 include/jemalloc/internal/jemalloc_internal.h.in |   7 +-
 include/jemalloc/internal/private_symbols.txt    |   5 +-
 include/jemalloc/internal/prof_externs.h         |   1 +
 include/jemalloc/internal/prof_inlines.h         | 240 -----------------------
 include/jemalloc/internal/prof_inlines_a.h       |  76 +++++++
 include/jemalloc/internal/prof_inlines_b.h       | 240 +++++++++++++++++++++++
 include/jemalloc/internal/prof_structs.h         |   7 +
 include/jemalloc/internal/prof_types.h           |   1 +
 include/jemalloc/internal/witness_types.h        |   1 +
 src/arena.c                                      |  18 +-
 src/prof.c                                       |  14 ++
 src/tcache.c                                     |   2 +-
 16 files changed, 365 insertions(+), 296 deletions(-)
 create mode 100644 include/jemalloc/internal/atomic_types.h
 delete mode 100644 include/jemalloc/internal/prof_inlines.h
 create mode 100644 include/jemalloc/internal/prof_inlines_a.h
 create mode 100644 include/jemalloc/internal/prof_inlines_b.h

diff --git a/include/jemalloc/internal/arena_inlines_a.h b/include/jemalloc/internal/arena_inlines_a.h
index a81aaf5..ea7e099 100644
--- a/include/jemalloc/internal/arena_inlines_a.h
+++ b/include/jemalloc/internal/arena_inlines_a.h
@@ -6,8 +6,6 @@ unsigned arena_ind_get(const arena_t *arena);
 void arena_internal_add(arena_t *arena, size_t size);
 void arena_internal_sub(arena_t *arena, size_t size);
 size_t arena_internal_get(arena_t *arena);
-bool arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes);
-bool arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes);
 bool arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes);
 #endif /* JEMALLOC_ENABLE_INLINE */
 
@@ -34,29 +32,6 @@ arena_internal_get(arena_t *arena) {
 }
 
 JEMALLOC_INLINE bool
-arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes) {
-	cassert(config_prof);
-	assert(prof_interval != 0);
-
-	arena->prof_accumbytes += accumbytes;
-	if (arena->prof_accumbytes >= prof_interval) {
-		arena->prof_accumbytes %= prof_interval;
-		return true;
-	}
-	return false;
-}
-
-JEMALLOC_INLINE bool
-arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes) {
-	cassert(config_prof);
-
-	if (likely(prof_interval == 0)) {
-		return false;
-	}
-	return arena_prof_accum_impl(arena, accumbytes);
-}
-
-JEMALLOC_INLINE bool
 arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) {
 	cassert(config_prof);
 
@@ -64,14 +39,7 @@ arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) {
 		return false;
 	}
 
-	{
-		bool ret;
-
-		malloc_mutex_lock(tsdn, &arena->lock);
-		ret = arena_prof_accum_impl(arena, accumbytes);
-		malloc_mutex_unlock(tsdn, &arena->lock);
-		return ret;
-	}
+	return prof_accum_add(tsdn, &arena->prof_accum, accumbytes);
 }
 
 #endif /* (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) */
diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h
index dde2689..2ee5690 100644
--- a/include/jemalloc/internal/arena_structs_b.h
+++ b/include/jemalloc/internal/arena_structs_b.h
@@ -138,7 +138,8 @@ struct arena_s {
 	 */
 	ql_head(tcache_t)	tcache_ql;
 
-	/* Synchronization: lock. */
+	/* Synchronization: internal. */
+	prof_accum_t		prof_accum;
 	uint64_t		prof_accumbytes;
 
 	/*
diff --git a/include/jemalloc/internal/atomic_inlines.h b/include/jemalloc/internal/atomic_inlines.h
index 7c1902f..de66d57 100644
--- a/include/jemalloc/internal/atomic_inlines.h
+++ b/include/jemalloc/internal/atomic_inlines.h
@@ -23,7 +23,7 @@
  */
 
 #ifndef JEMALLOC_ENABLE_INLINE
-# if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
+# ifdef JEMALLOC_ATOMIC_U64
 uint64_t atomic_add_u64(uint64_t *p, uint64_t x);
 uint64_t atomic_sub_u64(uint64_t *p, uint64_t x);
 bool atomic_cas_u64(uint64_t *p, uint64_t c, uint64_t s);
@@ -50,7 +50,7 @@ void atomic_write_u(unsigned *p, unsigned x);
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_))
 /******************************************************************************/
 /* 64-bit operations. */
-#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
+#ifdef JEMALLOC_ATOMIC_U64
 # if (defined(__amd64__) || defined(__x86_64__))
 JEMALLOC_INLINE uint64_t
 atomic_add_u64(uint64_t *p, uint64_t x) {
diff --git a/include/jemalloc/internal/atomic_types.h b/include/jemalloc/internal/atomic_types.h
new file mode 100644
index 0000000..0fd5e5b
--- /dev/null
+++ b/include/jemalloc/internal/atomic_types.h
@@ -0,0 +1,8 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_TYPES_H
+#define JEMALLOC_INTERNAL_ATOMIC_TYPES_H
+
+#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
+# define JEMALLOC_ATOMIC_U64
+#endif
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_TYPES_H */
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index bace9c4..7e9c24b 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -380,6 +380,7 @@ typedef unsigned szind_t;
 
 #include "jemalloc/internal/nstime_types.h"
 #include "jemalloc/internal/util_types.h"
+#include "jemalloc/internal/atomic_types.h"
 #include "jemalloc/internal/spin_types.h"
 #include "jemalloc/internal/prng_types.h"
 #include "jemalloc/internal/ticker_types.h"
@@ -419,10 +420,10 @@ typedef unsigned szind_t;
 #include "jemalloc/internal/extent_structs.h"
 #include "jemalloc/internal/extent_dss_structs.h"
 #include "jemalloc/internal/base_structs.h"
+#include "jemalloc/internal/prof_structs.h"
 #include "jemalloc/internal/arena_structs_b.h"
 #include "jemalloc/internal/rtree_structs.h"
 #include "jemalloc/internal/tcache_structs.h"
-#include "jemalloc/internal/prof_structs.h"
 #include "jemalloc/internal/tsd_structs.h"
 
 
@@ -902,6 +903,7 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) {
  * Include portions of arena code interleaved with tcache code in order to
  * resolve circular dependencies.
  */
+#include "jemalloc/internal/prof_inlines_a.h"
 #include "jemalloc/internal/arena_inlines_a.h"
 
 #ifndef JEMALLOC_ENABLE_INLINE
@@ -1163,8 +1165,7 @@ ixalloc(tsdn_t *tsdn, extent_t *extent, void *ptr, size_t oldsize, size_t size,
 }
 #endif
 
-#include "jemalloc/internal/prof_inlines.h"
-
+#include "jemalloc/internal/prof_inlines_b.h"
 
 #ifdef __cplusplus
 }
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index ab5a672..4e79991 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -54,8 +54,6 @@ arena_prefork1
 arena_prefork2
 arena_prefork3
 arena_prof_accum
-arena_prof_accum_impl
-arena_prof_accum_locked
 arena_prof_promote
 arena_prof_tctx_get
 arena_prof_tctx_reset
@@ -364,6 +362,9 @@ prng_range_zu
 prng_state_next_u32
 prng_state_next_u64
 prng_state_next_zu
+prof_accum_add
+prof_accum_cancel
+prof_accum_init
 prof_active
 prof_active_get
 prof_active_get_unlocked
diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h
index 76505f8..f3b6f8d 100644
--- a/include/jemalloc/internal/prof_externs.h
+++ b/include/jemalloc/internal/prof_externs.h
@@ -55,6 +55,7 @@ extern prof_dump_header_t *prof_dump_header;
 void prof_cnt_all(uint64_t *curobjs, uint64_t *curbytes, uint64_t *accumobjs,
     uint64_t *accumbytes);
 #endif
+bool prof_accum_init(tsdn_t *tsdn, prof_accum_t *prof_accum);
 void prof_idump(tsdn_t *tsdn);
 bool prof_mdump(tsd_t *tsd, const char *filename);
 void prof_gdump(tsdn_t *tsdn);
diff --git a/include/jemalloc/internal/prof_inlines.h b/include/jemalloc/internal/prof_inlines.h
deleted file mode 100644
index aba2936..0000000
--- a/include/jemalloc/internal/prof_inlines.h
+++ /dev/null
@@ -1,240 +0,0 @@
-#ifndef JEMALLOC_INTERNAL_PROF_INLINES_H
-#define JEMALLOC_INTERNAL_PROF_INLINES_H
-
-#ifndef JEMALLOC_ENABLE_INLINE
-bool prof_active_get_unlocked(void);
-bool prof_gdump_get_unlocked(void);
-prof_tdata_t *prof_tdata_get(tsd_t *tsd, bool create);
-prof_tctx_t *prof_tctx_get(tsdn_t *tsdn, const extent_t *extent,
-    const void *ptr);
-void prof_tctx_set(tsdn_t *tsdn, extent_t *extent, const void *ptr,
-    size_t usize, prof_tctx_t *tctx);
-void prof_tctx_reset(tsdn_t *tsdn, extent_t *extent, const void *ptr,
-    prof_tctx_t *tctx);
-bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
-    prof_tdata_t **tdata_out);
-prof_tctx_t *prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active,
-    bool update);
-void prof_malloc(tsdn_t *tsdn, extent_t *extent, const void *ptr,
-    size_t usize, prof_tctx_t *tctx);
-void prof_realloc(tsd_t *tsd, extent_t *extent, const void *ptr,
-    size_t usize, prof_tctx_t *tctx, bool prof_active, bool updated,
-    extent_t *old_extent, const void *old_ptr, size_t old_usize,
-    prof_tctx_t *old_tctx);
-void prof_free(tsd_t *tsd, const extent_t *extent, const void *ptr,
-    size_t usize);
-#endif
-
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
-JEMALLOC_ALWAYS_INLINE bool
-prof_active_get_unlocked(void) {
-	/*
-	 * Even if opt_prof is true, sampling can be temporarily disabled by
-	 * setting prof_active to false. No locking is used when reading
-	 * prof_active in the fast path, so there are no guarantees regarding
-	 * how long it will take for all threads to notice state changes.
-	 */
-	return prof_active;
-}
-
-JEMALLOC_ALWAYS_INLINE bool
-prof_gdump_get_unlocked(void) {
-	/*
-	 * No locking is used when reading prof_gdump_val in the fast path, so
-	 * there are no guarantees regarding how long it will take for all
-	 * threads to notice state changes.
-	 */
-	return prof_gdump_val;
-}
-
-JEMALLOC_ALWAYS_INLINE prof_tdata_t *
-prof_tdata_get(tsd_t *tsd, bool create) {
-	prof_tdata_t *tdata;
-
-	cassert(config_prof);
-
-	tdata = tsd_prof_tdata_get(tsd);
-	if (create) {
-		if (unlikely(tdata == NULL)) {
-			if (tsd_nominal(tsd)) {
-				tdata = prof_tdata_init(tsd);
-				tsd_prof_tdata_set(tsd, tdata);
-			}
-		} else if (unlikely(tdata->expired)) {
-			tdata = prof_tdata_reinit(tsd, tdata);
-			tsd_prof_tdata_set(tsd, tdata);
-		}
-		assert(tdata == NULL || tdata->attached);
-	}
-
-	return tdata;
-}
-
-JEMALLOC_ALWAYS_INLINE prof_tctx_t *
-prof_tctx_get(tsdn_t *tsdn, const extent_t *extent, const void *ptr) {
-	cassert(config_prof);
-	assert(ptr != NULL);
-
-	return arena_prof_tctx_get(tsdn, extent, ptr);
-}
-
-JEMALLOC_ALWAYS_INLINE void
-prof_tctx_set(tsdn_t *tsdn, extent_t *extent, const void *ptr, size_t usize,
-    prof_tctx_t *tctx) {
-	cassert(config_prof);
-	assert(ptr != NULL);
-
-	arena_prof_tctx_set(tsdn, extent, ptr, usize, tctx);
-}
-
-JEMALLOC_ALWAYS_INLINE void
-prof_tctx_reset(tsdn_t *tsdn, extent_t *extent, const void *ptr,
-    prof_tctx_t *tctx) {
-	cassert(config_prof);
-	assert(ptr != NULL);
-
-	arena_prof_tctx_reset(tsdn, extent, ptr, tctx);
-}
-
-JEMALLOC_ALWAYS_INLINE bool
-prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
-    prof_tdata_t **tdata_out) {
-	prof_tdata_t *tdata;
-
-	cassert(config_prof);
-
-	tdata = prof_tdata_get(tsd, true);
-	if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) {
-		tdata = NULL;
-	}
-
-	if (tdata_out != NULL) {
-		*tdata_out = tdata;
-	}
-
-	if (unlikely(tdata == NULL)) {
-		return true;
-	}
-
-	if (likely(tdata->bytes_until_sample >= usize)) {
-		if (update) {
-			tdata->bytes_until_sample -= usize;
-		}
-		return true;
-	} else {
-		/* Compute new sample threshold. */
-		if (update) {
-			prof_sample_threshold_update(tdata);
-		}
-		return !tdata->active;
-	}
-}
-
-JEMALLOC_ALWAYS_INLINE prof_tctx_t *
-prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active, bool update) {
-	prof_tctx_t *ret;
-	prof_tdata_t *tdata;
-	prof_bt_t bt;
-
-	assert(usize == s2u(usize));
-
-	if (!prof_active || likely(prof_sample_accum_update(tsd, usize, update,
-	    &tdata))) {
-		ret = (prof_tctx_t *)(uintptr_t)1U;
-	} else {
-		bt_init(&bt, tdata->vec);
-		prof_backtrace(&bt);
-		ret = prof_lookup(tsd, &bt);
-	}
-
-	return ret;
-}
-
-JEMALLOC_ALWAYS_INLINE void
-prof_malloc(tsdn_t *tsdn, extent_t *extent, const void *ptr, size_t usize,
-    prof_tctx_t *tctx) {
-	cassert(config_prof);
-	assert(ptr != NULL);
-	assert(usize == isalloc(tsdn, extent, ptr));
-
-	if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) {
-		prof_malloc_sample_object(tsdn, extent, ptr, usize, tctx);
-	} else {
-		prof_tctx_set(tsdn, extent, ptr, usize,
-		    (prof_tctx_t *)(uintptr_t)1U);
-	}
-}
-
-JEMALLOC_ALWAYS_INLINE void
-prof_realloc(tsd_t *tsd, extent_t *extent, const void *ptr, size_t usize,
-    prof_tctx_t *tctx, bool prof_active, bool updated, extent_t *old_extent,
-    const void *old_ptr, size_t old_usize, prof_tctx_t *old_tctx) {
-	bool sampled, old_sampled, moved;
-
-	cassert(config_prof);
-	assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U);
-
-	if (prof_active && !updated && ptr != NULL) {
-		assert(usize == isalloc(tsd_tsdn(tsd), extent, ptr));
-		if (prof_sample_accum_update(tsd, usize, true, NULL)) {
-			/*
-			 * Don't sample. The usize passed to prof_alloc_prep()
-			 * was larger than what actually got allocated, so a
-			 * backtrace was captured for this allocation, even
-			 * though its actual usize was insufficient to cross the
-			 * sample threshold.
-			 */
-			prof_alloc_rollback(tsd, tctx, true);
-			tctx = (prof_tctx_t *)(uintptr_t)1U;
-		}
-	}
-
-	sampled = ((uintptr_t)tctx > (uintptr_t)1U);
-	old_sampled = ((uintptr_t)old_tctx > (uintptr_t)1U);
-	moved = (ptr != old_ptr);
-
-	if (unlikely(sampled)) {
-		prof_malloc_sample_object(tsd_tsdn(tsd), extent, ptr, usize,
-		    tctx);
-	} else if (moved) {
-		prof_tctx_set(tsd_tsdn(tsd), extent, ptr, usize,
-		    (prof_tctx_t *)(uintptr_t)1U);
-	} else if (unlikely(old_sampled)) {
-		/*
-		 * prof_tctx_set() would work for the !moved case as well, but
-		 * prof_tctx_reset() is slightly cheaper, and the proper thing
-		 * to do here in the presence of explicit knowledge re: moved
-		 * state.
-		 */
-		prof_tctx_reset(tsd_tsdn(tsd), extent, ptr, tctx);
-	} else {
-		assert((uintptr_t)prof_tctx_get(tsd_tsdn(tsd), extent, ptr) ==
-		    (uintptr_t)1U);
-	}
-
-	/*
-	 * The prof_free_sampled_object() call must come after the
-	 * prof_malloc_sample_object() call, because tctx and old_tctx may be
-	 * the same, in which case reversing the call order could cause the tctx
-	 * to be prematurely destroyed as a side effect of momentarily zeroed
-	 * counters.
-	 */
-	if (unlikely(old_sampled)) {
-		prof_free_sampled_object(tsd, old_usize, old_tctx);
-	}
-}
-
-JEMALLOC_ALWAYS_INLINE void
-prof_free(tsd_t *tsd, const extent_t *extent, const void *ptr, size_t usize) {
-	prof_tctx_t *tctx = prof_tctx_get(tsd_tsdn(tsd), extent, ptr);
-
-	cassert(config_prof);
-	assert(usize == isalloc(tsd_tsdn(tsd), extent, ptr));
-
-	if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) {
-		prof_free_sampled_object(tsd, usize, tctx);
-	}
-}
-#endif
-
-#endif /* JEMALLOC_INTERNAL_PROF_INLINES_H */
diff --git a/include/jemalloc/internal/prof_inlines_a.h b/include/jemalloc/internal/prof_inlines_a.h
new file mode 100644
index 0000000..d77635a
--- /dev/null
+++ b/include/jemalloc/internal/prof_inlines_a.h
@@ -0,0 +1,76 @@
+#ifndef JEMALLOC_INTERNAL_PROF_INLINES_A_H
+#define JEMALLOC_INTERNAL_PROF_INLINES_A_H
+
+#ifndef JEMALLOC_ENABLE_INLINE
+bool prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum,
+    uint64_t accumbytes);
+void prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, size_t usize);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
+JEMALLOC_INLINE bool
+prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum, uint64_t accumbytes) {
+	cassert(config_prof);
+
+	bool overflow;
+	uint64_t a0, a1;
+
+	/*
+	 * If the application allocates fast enough (and/or if idump is slow
+	 * enough), extreme overflow here (a1 >= prof_interval * 2) can cause
+	 * idump trigger coalescing. This is an intentional mechanism that
+	 * avoids rate-limiting allocation.
+	 */
+#ifdef JEMALLOC_ATOMIC_U64
+	do {
+		a0 = atomic_read_u64(&prof_accum->accumbytes);
+		a1 = a0 + accumbytes;
+		assert(a1 >= a0);
+		overflow = (a1 >= prof_interval);
+		if (overflow) {
+			a1 %= prof_interval;
+		}
+	} while (atomic_cas_u64(&prof_accum->accumbytes, a0, a1));
+#else
+	malloc_mutex_lock(tsdn, &prof_accum->mtx);
+	a0 = prof_accum->accumbytes;
+	a1 = a0 + accumbytes;
+	overflow = (a1 >= prof_interval);
+	if (overflow) {
+		a1 %= prof_interval;
+	}
+	prof_accum->accumbytes = a1;
+	malloc_mutex_unlock(tsdn, &prof_accum->mtx);
+#endif
+	return overflow;
+}
+
+JEMALLOC_INLINE void
+prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, size_t usize) {
+	cassert(config_prof);
+
+	/*
+	 * Cancel out as much of the excessive prof_accumbytes increase as
+	 * possible without underflowing. Interval-triggered dumps occur
+	 * slightly more often than intended as a result of incomplete
+	 * canceling.
+	 */
+	uint64_t a0, a1;
+#ifdef JEMALLOC_ATOMIC_U64
+	do {
+		a0 = atomic_read_u64(&prof_accum->accumbytes);
+		a1 = (a0 >= LARGE_MINCLASS - usize) ? a0 - (LARGE_MINCLASS -
+		    usize) : 0;
+	} while (atomic_cas_u64(&prof_accum->accumbytes, a0, a1));
+#else
+	malloc_mutex_lock(tsdn, &prof_accum->mtx);
+	a0 = prof_accum->accumbytes;
+	a1 = (a0 >= LARGE_MINCLASS - usize) ? a0 - (LARGE_MINCLASS - usize) :
+	    0;
+	prof_accum->accumbytes = a1;
+	malloc_mutex_unlock(tsdn, &prof_accum->mtx);
+#endif
+}
+#endif
+
+#endif /* JEMALLOC_INTERNAL_PROF_INLINES_A_H */
diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h
new file mode 100644
index 0000000..9e969a0
--- /dev/null
+++ b/include/jemalloc/internal/prof_inlines_b.h
@@ -0,0 +1,240 @@
+#ifndef JEMALLOC_INTERNAL_PROF_INLINES_B_H
+#define JEMALLOC_INTERNAL_PROF_INLINES_B_H
+
+#ifndef JEMALLOC_ENABLE_INLINE
+bool prof_active_get_unlocked(void);
+bool prof_gdump_get_unlocked(void);
+prof_tdata_t *prof_tdata_get(tsd_t *tsd, bool create);
+prof_tctx_t *prof_tctx_get(tsdn_t *tsdn, const extent_t *extent,
+    const void *ptr);
+void prof_tctx_set(tsdn_t *tsdn, extent_t *extent, const void *ptr,
+    size_t usize, prof_tctx_t *tctx);
+void prof_tctx_reset(tsdn_t *tsdn, extent_t *extent, const void *ptr,
+    prof_tctx_t *tctx);
+bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
+    prof_tdata_t **tdata_out);
+prof_tctx_t *prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active,
+    bool update);
+void prof_malloc(tsdn_t *tsdn, extent_t *extent, const void *ptr,
+    size_t usize, prof_tctx_t *tctx);
+void prof_realloc(tsd_t *tsd, extent_t *extent, const void *ptr,
+    size_t usize, prof_tctx_t *tctx, bool prof_active, bool updated,
+    extent_t *old_extent, const void *old_ptr, size_t old_usize,
+    prof_tctx_t *old_tctx);
+void prof_free(tsd_t *tsd, const extent_t *extent, const void *ptr,
+    size_t usize);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
+JEMALLOC_ALWAYS_INLINE bool
+prof_active_get_unlocked(void) {
+	/*
+	 * Even if opt_prof is true, sampling can be temporarily disabled by
+	 * setting prof_active to false. No locking is used when reading
+	 * prof_active in the fast path, so there are no guarantees regarding
+	 * how long it will take for all threads to notice state changes.
+	 */
+	return prof_active;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_gdump_get_unlocked(void) {
+	/*
+	 * No locking is used when reading prof_gdump_val in the fast path, so
+	 * there are no guarantees regarding how long it will take for all
+	 * threads to notice state changes.
+	 */
+	return prof_gdump_val;
+}
+
+JEMALLOC_ALWAYS_INLINE prof_tdata_t *
+prof_tdata_get(tsd_t *tsd, bool create) {
+	prof_tdata_t *tdata;
+
+	cassert(config_prof);
+
+	tdata = tsd_prof_tdata_get(tsd);
+	if (create) {
+		if (unlikely(tdata == NULL)) {
+			if (tsd_nominal(tsd)) {
+				tdata = prof_tdata_init(tsd);
+				tsd_prof_tdata_set(tsd, tdata);
+			}
+		} else if (unlikely(tdata->expired)) {
+			tdata = prof_tdata_reinit(tsd, tdata);
+			tsd_prof_tdata_set(tsd, tdata);
+		}
+		assert(tdata == NULL || tdata->attached);
+	}
+
+	return tdata;
+}
+
+JEMALLOC_ALWAYS_INLINE prof_tctx_t *
+prof_tctx_get(tsdn_t *tsdn, const extent_t *extent, const void *ptr) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	return arena_prof_tctx_get(tsdn, extent, ptr);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_tctx_set(tsdn_t *tsdn, extent_t *extent, const void *ptr, size_t usize,
+    prof_tctx_t *tctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	arena_prof_tctx_set(tsdn, extent, ptr, usize, tctx);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_tctx_reset(tsdn_t *tsdn, extent_t *extent, const void *ptr,
+    prof_tctx_t *tctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	arena_prof_tctx_reset(tsdn, extent, ptr, tctx);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update,
+    prof_tdata_t **tdata_out) {
+	prof_tdata_t *tdata;
+
+	cassert(config_prof);
+
+	tdata = prof_tdata_get(tsd, true);
+	if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) {
+		tdata = NULL;
+	}
+
+	if (tdata_out != NULL) {
+		*tdata_out = tdata;
+	}
+
+	if (unlikely(tdata == NULL)) {
+		return true;
+	}
+
+	if (likely(tdata->bytes_until_sample >= usize)) {
+		if (update) {
+			tdata->bytes_until_sample -= usize;
+		}
+		return true;
+	} else {
+		/* Compute new sample threshold. */
+		if (update) {
+			prof_sample_threshold_update(tdata);
+		}
+		return !tdata->active;
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE prof_tctx_t *
+prof_alloc_prep(tsd_t *tsd, size_t usize, bool prof_active, bool update) {
+	prof_tctx_t *ret;
+	prof_tdata_t *tdata;
+	prof_bt_t bt;
+
+	assert(usize == s2u(usize));
+
+	if (!prof_active || likely(prof_sample_accum_update(tsd, usize, update,
+	    &tdata))) {
+		ret = (prof_tctx_t *)(uintptr_t)1U;
+	} else {
+		bt_init(&bt, tdata->vec);
+		prof_backtrace(&bt);
+		ret = prof_lookup(tsd, &bt);
+	}
+
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_malloc(tsdn_t *tsdn, extent_t *extent, const void *ptr, size_t usize,
+    prof_tctx_t *tctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+	assert(usize == isalloc(tsdn, extent, ptr));
+
+	if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) {
+		prof_malloc_sample_object(tsdn, extent, ptr, usize, tctx);
+	} else {
+		prof_tctx_set(tsdn, extent, ptr, usize,
+		    (prof_tctx_t *)(uintptr_t)1U);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_realloc(tsd_t *tsd, extent_t *extent, const void *ptr, size_t usize,
+    prof_tctx_t *tctx, bool prof_active, bool updated, extent_t *old_extent,
+    const void *old_ptr, size_t old_usize, prof_tctx_t *old_tctx) {
+	bool sampled, old_sampled, moved;
+
+	cassert(config_prof);
+	assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U);
+
+	if (prof_active && !updated && ptr != NULL) {
+		assert(usize == isalloc(tsd_tsdn(tsd), extent, ptr));
+		if (prof_sample_accum_update(tsd, usize, true, NULL)) {
+			/*
+			 * Don't sample. The usize passed to prof_alloc_prep()
+			 * was larger than what actually got allocated, so a
+			 * backtrace was captured for this allocation, even
+			 * though its actual usize was insufficient to cross the
+			 * sample threshold.
+			 */
+			prof_alloc_rollback(tsd, tctx, true);
+			tctx = (prof_tctx_t *)(uintptr_t)1U;
+		}
+	}
+
+	sampled = ((uintptr_t)tctx > (uintptr_t)1U);
+	old_sampled = ((uintptr_t)old_tctx > (uintptr_t)1U);
+	moved = (ptr != old_ptr);
+
+	if (unlikely(sampled)) {
+		prof_malloc_sample_object(tsd_tsdn(tsd), extent, ptr, usize,
+		    tctx);
+	} else if (moved) {
+		prof_tctx_set(tsd_tsdn(tsd), extent, ptr, usize,
+		    (prof_tctx_t *)(uintptr_t)1U);
+	} else if (unlikely(old_sampled)) {
+		/*
+		 * prof_tctx_set() would work for the !moved case as well, but
+		 * prof_tctx_reset() is slightly cheaper, and the proper thing
+		 * to do here in the presence of explicit knowledge re: moved
+		 * state.
+		 */
+		prof_tctx_reset(tsd_tsdn(tsd), extent, ptr, tctx);
+	} else {
+		assert((uintptr_t)prof_tctx_get(tsd_tsdn(tsd), extent, ptr) ==
+		    (uintptr_t)1U);
+	}
+
+	/*
+	 * The prof_free_sampled_object() call must come after the
+	 * prof_malloc_sample_object() call, because tctx and old_tctx may be
+	 * the same, in which case reversing the call order could cause the tctx
+	 * to be prematurely destroyed as a side effect of momentarily zeroed
+	 * counters.
+	 */
+	if (unlikely(old_sampled)) {
+		prof_free_sampled_object(tsd, old_usize, old_tctx);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_free(tsd_t *tsd, const extent_t *extent, const void *ptr, size_t usize) {
+	prof_tctx_t *tctx = prof_tctx_get(tsd_tsdn(tsd), extent, ptr);
+
+	cassert(config_prof);
+	assert(usize == isalloc(tsd_tsdn(tsd), extent, ptr));
+
+	if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) {
+		prof_free_sampled_object(tsd, usize, tctx);
+	}
+}
+#endif
+
+#endif /* JEMALLOC_INTERNAL_PROF_INLINES_B_H */
diff --git a/include/jemalloc/internal/prof_structs.h b/include/jemalloc/internal/prof_structs.h
index caae125..afff6aa 100644
--- a/include/jemalloc/internal/prof_structs.h
+++ b/include/jemalloc/internal/prof_structs.h
@@ -15,6 +15,13 @@ typedef struct {
 } prof_unwind_data_t;
 #endif
 
+struct prof_accum_s {
+#ifndef JEMALLOC_ATOMIC_U64
+	malloc_mutex_t	mtx;
+#endif
+	uint64_t	accumbytes;
+};
+
 struct prof_cnt_s {
 	/* Profiling counters. */
 	uint64_t	curobjs;
diff --git a/include/jemalloc/internal/prof_types.h b/include/jemalloc/internal/prof_types.h
index ff0db65..1eff995 100644
--- a/include/jemalloc/internal/prof_types.h
+++ b/include/jemalloc/internal/prof_types.h
@@ -2,6 +2,7 @@
 #define JEMALLOC_INTERNAL_PROF_TYPES_H
 
 typedef struct prof_bt_s prof_bt_t;
+typedef struct prof_accum_s prof_accum_t;
 typedef struct prof_cnt_s prof_cnt_t;
 typedef struct prof_tctx_s prof_tctx_t;
 typedef struct prof_gctx_s prof_gctx_t;
diff --git a/include/jemalloc/internal/witness_types.h b/include/jemalloc/internal/witness_types.h
index 2929916..f919cc5 100644
--- a/include/jemalloc/internal/witness_types.h
+++ b/include/jemalloc/internal/witness_types.h
@@ -47,6 +47,7 @@ typedef int witness_comp_t (const witness_t *, void *, const witness_t *,
 #define WITNESS_RANK_ARENA_LARGE	WITNESS_RANK_LEAF
 #define WITNESS_RANK_DSS		WITNESS_RANK_LEAF
 #define WITNESS_RANK_PROF_ACTIVE	WITNESS_RANK_LEAF
+#define WITNESS_RANK_PROF_ACCUM		WITNESS_RANK_LEAF
 #define WITNESS_RANK_PROF_DUMP_SEQ	WITNESS_RANK_LEAF
 #define WITNESS_RANK_PROF_GDUMP	WITNESS_RANK_LEAF
 #define WITNESS_RANK_PROF_NEXT_THR_UID	WITNESS_RANK_LEAF
diff --git a/src/arena.c b/src/arena.c
index 345c57d..40db9d1 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -1148,19 +1148,7 @@ arena_prof_promote(tsdn_t *tsdn, extent_t *extent, const void *ptr,
 
 	extent_usize_set(extent, usize);
 
-	/*
-	 * Cancel out as much of the excessive prof_accumbytes increase as
-	 * possible without underflowing.  Interval-triggered dumps occur
-	 * slightly more often than intended as a result of incomplete
-	 * canceling.
-	 */
-	malloc_mutex_lock(tsdn, &arena->lock);
-	if (arena->prof_accumbytes >= LARGE_MINCLASS - usize) {
-		arena->prof_accumbytes -= LARGE_MINCLASS - usize;
-	} else {
-		arena->prof_accumbytes = 0;
-	}
-	malloc_mutex_unlock(tsdn, &arena->lock);
+	prof_accum_cancel(tsdn, &arena->prof_accum, usize);
 
 	assert(isalloc(tsdn, extent, ptr) == usize);
 }
@@ -1574,7 +1562,9 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	}
 
 	if (config_prof) {
-		arena->prof_accumbytes = 0;
+		if (prof_accum_init(tsdn, &arena->prof_accum)) {
+			goto label_error;
+		}
 	}
 
 	if (config_cache_oblivious) {
diff --git a/src/prof.c b/src/prof.c
index 5aeefb2..13fa20d 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -1753,6 +1753,20 @@ prof_fdump(void) {
 	prof_dump(tsd, false, filename, opt_prof_leak);
 }
 
+bool
+prof_accum_init(tsdn_t *tsdn, prof_accum_t *prof_accum) {
+	cassert(config_prof);
+
+#ifndef JEMALLOC_ATOMIC_U64
+	if (malloc_mutex_init(&prof_accum->mtx, "prof_accum",
+	    WITNESS_RANK_PROF_ACCUM)) {
+		return true;
+	}
+#endif
+	prof_accum->accumbytes = 0;
+	return false;
+}
+
 void
 prof_idump(tsdn_t *tsdn) {
 	tsd_t *tsd;
diff --git a/src/tcache.c b/src/tcache.c
index 94c4570..f38c2d5 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -200,7 +200,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 		}
 		if ((config_prof || config_stats) && locked_arena == arena) {
 			if (config_prof) {
-				idump = arena_prof_accum_locked(arena,
+				idump = arena_prof_accum(tsd_tsdn(tsd), arena,
 				    tcache->prof_accumbytes);
 				tcache->prof_accumbytes = 0;
 			}
--
cgit v0.12
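
For readers skimming the patch, here is a minimal standalone sketch of the lock-free accumulator pattern that prof_accum_add() above implements. It is illustrative only and not part of the change: it uses C11 <stdatomic.h> rather than jemalloc's atomic_*_u64 wrappers, and PROF_INTERVAL is an arbitrary placeholder value, not jemalloc's configured profiling interval.

/* Illustrative sketch only; names and the interval value are made up. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PROF_INTERVAL ((uint64_t)1 << 20)	/* placeholder interval */

static _Atomic uint64_t accumbytes;

/*
 * Add n bytes to the shared accumulator; return true when the running total
 * crosses PROF_INTERVAL, wrapping the counter modulo the interval.
 */
static bool
accum_add(uint64_t n) {
	uint64_t a0, a1;
	bool overflow;

	a0 = atomic_load_explicit(&accumbytes, memory_order_relaxed);
	do {
		a1 = a0 + n;
		overflow = (a1 >= PROF_INTERVAL);
		if (overflow) {
			a1 %= PROF_INTERVAL;
		}
		/* On CAS failure, a0 is refreshed and the loop recomputes a1. */
	} while (!atomic_compare_exchange_weak_explicit(&accumbytes, &a0, a1,
	    memory_order_relaxed, memory_order_relaxed));
	return overflow;
}

int
main(void) {
	unsigned triggers = 0;

	for (int i = 0; i < 10000; i++) {
		if (accum_add(4096)) {
			triggers++;
		}
	}
	printf("interval triggers: %u\n", triggers);
	return 0;
}

The design point mirrored from the patch is that the overflow test and the modulo wrap happen inside the compare-and-swap retry loop, so exactly one racing thread observes each interval crossing, without taking the arena lock on the allocation fast path.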