From 602c8e0971160e4b85b08b16cf8a2375aa24bc04 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 18 Aug 2014 16:22:13 -0700 Subject: Implement per thread heap profiling. Rename data structures (prof_thr_cnt_t-->prof_tctx_t, prof_ctx_t-->prof_gctx_t), and convert to storing a prof_tctx_t for sampled objects. Convert PROF_ALLOC_PREP() to prof_alloc_prep(), since precise backtrace depth within jemalloc functions is no longer an issue (pprof prunes irrelevant frames). Implement mallctl's: - prof.reset implements full sample data reset, and optional change of sample interval. - prof.lg_sample reads the current sample interval (opt.lg_prof_sample was the permanent source of truth prior to prof.reset). - thread.prof.name provides naming capability for threads within heap profile dumps. - thread.prof.active makes it possible to activate/deactivate heap profiling for individual threads. Modify the heap dump files to contain per thread heap profile data. This change is incompatible with the existing pprof, which will require enhancements to read and process the enriched data. --- doc/jemalloc.xml.in | 56 +- include/jemalloc/internal/arena.h | 22 +- include/jemalloc/internal/extent.h | 2 +- include/jemalloc/internal/huge.h | 4 +- include/jemalloc/internal/private_symbols.txt | 21 +- include/jemalloc/internal/prof.h | 440 +++++----- src/ctl.c | 97 ++- src/huge.c | 12 +- src/jemalloc.c | 140 +-- src/prof.c | 1127 +++++++++++++++++-------- src/stats.c | 2 +- 11 files changed, 1217 insertions(+), 706 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 308d0c6..8f4327f 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1047,7 +1047,7 @@ malloc_conf = "xmalloc:true";]]> opt.lg_prof_sample - (ssize_t) + (size_t) r- [] @@ -1243,6 +1243,35 @@ malloc_conf = "xmalloc:true";]]> the developer may find manual flushing useful. + + + thread.prof.name + (const char *) + rw + [] + + Get/set the descriptive name associated with the calling + thread in memory profile dumps. An internal copy of the name string is + created, so the input string need not be maintained after this interface + completes execution. The output string of this interface should be + copied for non-ephemeral uses, because multiple implementation details + can cause asynchronous string deallocation. + + + + + thread.prof.active + (bool) + rw + [] + + Control whether sampling is currently active for the + calling thread. This is a deactivation mechanism in addition to prof.active; both must + be active for the calling thread to sample. This flag is enabled by + default. + + arena.<i>.purge @@ -1492,6 +1521,31 @@ malloc_conf = "xmalloc:true";]]> option. + + + prof.reset + (size_t) + -w + [] + + Reset all memory profile statistics, and optionally + update the sample rate (see opt.lg_prof_sample). + + + + + + prof.lg_sample + (size_t) + r- + [] + + Get the sample rate (see opt.lg_prof_sample). + + + prof.interval diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 9351e3b..f3f6426 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -58,7 +58,7 @@ typedef struct arena_s arena_t; struct arena_chunk_map_s { #ifndef JEMALLOC_PROF /* - * Overlay prof_ctx in order to allow it to be referenced by dead code. + * Overlay prof_tctx in order to allow it to be referenced by dead code. * Such antics aren't warranted for per arena data structures, but * chunk map overhead accounts for a percentage of memory, rather than * being just a fixed cost. 
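
The four new mallctls are easiest to understand from the application's side. A minimal sketch of driving them via mallctl() (not part of the patch; error handling elided, and the thread name is an arbitrary example):

    #include <stdbool.h>
    #include <stddef.h>
    #include <jemalloc/jemalloc.h>

    static void
    prof_mallctl_example(void)
    {
        const char *name = "worker:0";
        bool active = false;
        size_t lg_sample, sz = sizeof(size_t);

        /* Label the calling thread in subsequent heap profile dumps. */
        mallctl("thread.prof.name", NULL, NULL, &name, sizeof(name));

        /* Suspend sampling in this thread only; prof.active is unaffected. */
        mallctl("thread.prof.active", NULL, NULL, &active, sizeof(active));

        /* Read the current sample interval (mean of 2^lg_sample bytes). */
        mallctl("prof.lg_sample", &lg_sample, &sz, NULL, 0);

        /* Discard all sample data and re-sample every ~2^19 bytes. */
        lg_sample = 19;
        mallctl("prof.reset", NULL, NULL, &lg_sample, sizeof(lg_sample));
    }
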
@@ -75,7 +75,7 @@ struct arena_chunk_map_s { rb_node(arena_chunk_map_t) rb_link; /* Profile counters, used for large object runs. */ - prof_ctx_t *prof_ctx; + prof_tctx_t *prof_tctx; #ifndef JEMALLOC_PROF }; /* union { ... }; */ #endif @@ -472,8 +472,8 @@ size_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits); size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr); -prof_ctx_t *arena_prof_ctx_get(const void *ptr); -void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +prof_tctx_t *arena_prof_tctx_get(const void *ptr); +void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); @@ -987,10 +987,10 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) return (regind); } -JEMALLOC_INLINE prof_ctx_t * -arena_prof_ctx_get(const void *ptr) +JEMALLOC_INLINE prof_tctx_t * +arena_prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; arena_chunk_t *chunk; size_t pageind, mapbits; @@ -1003,15 +1003,15 @@ arena_prof_ctx_get(const void *ptr) mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if ((mapbits & CHUNK_MAP_LARGE) == 0) - ret = (prof_ctx_t *)(uintptr_t)1U; + ret = (prof_tctx_t *)(uintptr_t)1U; else - ret = arena_mapp_get(chunk, pageind)->prof_ctx; + ret = arena_mapp_get(chunk, pageind)->prof_tctx; return (ret); } JEMALLOC_INLINE void -arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { arena_chunk_t *chunk; size_t pageind; @@ -1025,7 +1025,7 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) assert(arena_mapbits_allocated_get(chunk, pageind) != 0); if (arena_mapbits_large_get(chunk, pageind) != 0) - arena_mapp_get(chunk, pageind)->prof_ctx = ctx; + arena_mapp_get(chunk, pageind)->prof_tctx = tctx; } JEMALLOC_ALWAYS_INLINE void * diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 000ef6d..5b00076 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -16,7 +16,7 @@ struct extent_node_s { rb_node(extent_node_t) link_ad; /* Profile counters, used for huge objects. */ - prof_ctx_t *prof_ctx; + prof_tctx_t *prof_tctx; /* Pointer to the extent that this tree node is responsible for. 
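
The (uintptr_t)1U comparisons above follow a pointer-encoding convention that the patch relies on throughout but never states in one place; this summary is inferred from the call sites:

    /*
     * tctx == NULL                 profiling unavailable (e.g. OOM while
     *                              initializing this thread's tdata)
     * tctx == (prof_tctx_t *)1U    object is not sampled
     * tctx >  (prof_tctx_t *)1U    pointer to the prof_tctx_t accounting
     *                              for this sampled object
     */

Small regions always read back as "not sampled" because only large runs (and huge extents, via extent_node_s below) carry a prof_tctx slot.
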
*/ void *addr; diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 1e54536..2ec7752 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -21,8 +21,8 @@ extern huge_dalloc_junk_t *huge_dalloc_junk; #endif void huge_dalloc(void *ptr); size_t huge_salloc(const void *ptr); -prof_ctx_t *huge_prof_ctx_get(const void *ptr); -void huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +prof_tctx_t *huge_prof_tctx_get(const void *ptr); +void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); bool huge_boot(void); void huge_prefork(void); void huge_postfork_parent(void); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 3401301..1350545 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -48,9 +48,9 @@ arena_prefork arena_prof_accum arena_prof_accum_impl arena_prof_accum_locked -arena_prof_ctx_get -arena_prof_ctx_set arena_prof_promoted +arena_prof_tctx_get +arena_prof_tctx_set arena_ptr_small_binind_get arena_purge_all arena_quarantine_junk_small @@ -208,8 +208,8 @@ huge_palloc huge_postfork_child huge_postfork_parent huge_prefork -huge_prof_ctx_get -huge_prof_ctx_set +huge_prof_tctx_get +huge_prof_tctx_set huge_ralloc huge_ralloc_no_move huge_salloc @@ -287,28 +287,31 @@ opt_zero p2rz pages_purge pow2_ceil +prof_alloc_prep prof_backtrace prof_boot0 prof_boot1 prof_boot2 prof_bt_count -prof_ctx_get -prof_ctx_set prof_dump_open prof_free +prof_free_sampled_object prof_gdump prof_idump prof_interval prof_lookup prof_malloc -prof_malloc_record_object +prof_malloc_sample_object prof_mdump prof_postfork_child prof_postfork_parent prof_prefork prof_realloc +prof_reset prof_sample_accum_update prof_sample_threshold_update +prof_tctx_get +prof_tctx_set prof_tdata_booted prof_tdata_cleanup prof_tdata_get @@ -322,6 +325,10 @@ prof_tdata_tsd_get prof_tdata_tsd_get_wrapper prof_tdata_tsd_init_head prof_tdata_tsd_set +prof_thread_active_get +prof_thread_active_set +prof_thread_name_get +prof_thread_name_set quarantine quarantine_alloc_hook quarantine_boot diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 9398ad9..104bfad 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -1,11 +1,10 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES -typedef uint64_t prof_thr_uid_t; typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; -typedef struct prof_thr_cnt_s prof_thr_cnt_t; -typedef struct prof_ctx_s prof_ctx_t; +typedef struct prof_tctx_s prof_tctx_t; +typedef struct prof_gctx_s prof_gctx_t; typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ @@ -34,12 +33,18 @@ typedef struct prof_tdata_s prof_tdata_t; #define PROF_PRINTF_BUFSIZE 128 /* - * Number of mutexes shared among all ctx's. No space is allocated for these + * Number of mutexes shared among all gctx's. No space is allocated for these * unless profiling is enabled, so it's okay to over-provision. */ #define PROF_NCTX_LOCKS 1024 /* + * Number of mutexes shared among all tdata's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. + */ +#define PROF_NTDATA_LOCKS 256 + +/* * prof_tdata pointers close to NULL are used to encode state information that * is used for cleaning up during thread shutdown. 
*/ @@ -66,87 +71,70 @@ typedef struct { #endif struct prof_cnt_s { - /* - * Profiling counters. An allocation/deallocation pair can operate on - * different prof_thr_cnt_t objects that are linked into the same - * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go - * negative. In principle it is possible for the *bytes counters to - * overflow/underflow, but a general solution would require something - * like 128-bit counters; this implementation doesn't bother to solve - * that problem. - */ - int64_t curobjs; - int64_t curbytes; + /* Profiling counters. */ + uint64_t curobjs; + uint64_t curbytes; uint64_t accumobjs; uint64_t accumbytes; }; -struct prof_thr_cnt_s { - prof_thr_uid_t thr_uid; +typedef enum { + prof_tctx_state_nominal, + prof_tctx_state_dumping, + prof_tctx_state_purgatory /* Dumper must finish destroying. */ +} prof_tctx_state_t; - /* Linkage into prof_ctx_t's thr_cnts. */ - rb_node(prof_thr_cnt_t) thr_cnt_link; +struct prof_tctx_s { + /* Thread data for thread that performed the allocation. */ + prof_tdata_t *tdata; - /* - * Associated context. If a thread frees an object that it did not - * allocate, it is possible that the context is not present in the - * thread's hash table, in which case it must be able to look up the - * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's thr_cnts. - */ - prof_ctx_t *ctx; + /* Profiling counters, protected by tdata->lock. */ + prof_cnt_t cnts; + + /* Associated global context. */ + prof_gctx_t *gctx; + + /* Linkage into gctx's tctxs. */ + rb_node(prof_tctx_t) tctx_link; + + /* Current dump-related state, protected by gctx->lock. */ + prof_tctx_state_t state; /* - * Threads use memory barriers to update the counters. Since there is - * only ever one writer, the only challenge is for the reader to get a - * consistent read of the counters. - * - * The writer uses this series of operations: - * - * 1) Increment epoch to an odd number. - * 2) Update counters. - * 3) Increment epoch to an even number. - * - * The reader must assure 1) that the epoch is even while it reads the - * counters, and 2) that the epoch doesn't change between the time it - * starts and finishes reading the counters. + * Copy of cnts snapshotted during early dump phase, protected by + * dump_mtx. */ - unsigned epoch; - - /* Profiling counters. */ - prof_cnt_t cnts; + prof_cnt_t dump_cnts; }; -typedef rb_tree(prof_thr_cnt_t) prof_thr_cnt_tree_t; +typedef rb_tree(prof_tctx_t) prof_tctx_tree_t; -struct prof_ctx_s { - /* Protects nlimbo, cnt_merged, and thr_cnts. */ +struct prof_gctx_s { + /* Protects nlimbo, cnt_summed, and tctxs. */ malloc_mutex_t *lock; /* - * Number of threads that currently cause this ctx to be in a state of + * Number of threads that currently cause this gctx to be in a state of * limbo due to one of: - * - Initializing per thread counters associated with this ctx. - * - Preparing to destroy this ctx. - * - Dumping a heap profile that includes this ctx. + * - Initializing this gctx. + * - Initializing per thread counters associated with this gctx. + * - Preparing to destroy this gctx. + * - Dumping a heap profile that includes this gctx. * nlimbo must be 1 (single destroyer) in order to safely destroy the - * ctx. + * gctx. */ unsigned nlimbo; - /* Temporary storage for summation during dump. */ - prof_cnt_t cnt_summed; - - /* When threads exit, they merge their stats into cnt_merged. 
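
Note the type change in prof_cnt_s: the old int64_t counters could legitimately go negative because an allocation and its matching deallocation could be recorded in different prof_thr_cnt_t objects. With this patch a free always reaches the same prof_tctx_t that accounted for the allocation (via prof_tctx_get(ptr)), updating it under the owner's tdata->lock, so the cur* counters become plain uint64_t and the free path can assert non-negativity instead (see prof_free_sampled_object() in src/prof.c below):

    assert(tctx->cnts.curobjs > 0);
    assert(tctx->cnts.curbytes >= usize);
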
*/ - prof_cnt_t cnt_merged; - /* * Tree of profile counters, one for each thread that has allocated in * this context. */ - prof_thr_cnt_tree_t thr_cnts; + prof_tctx_tree_t tctxs; /* Linkage for tree of contexts to be dumped. */ - rb_node(prof_ctx_t) dump_link; + rb_node(prof_gctx_t) dump_link; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; /* Associated backtrace. */ prof_bt_t bt; @@ -154,21 +142,34 @@ struct prof_ctx_s { /* Backtrace vector, variable size, referred to by bt. */ void *vec[1]; }; -typedef rb_tree(prof_ctx_t) prof_ctx_tree_t; +typedef rb_tree(prof_gctx_t) prof_gctx_tree_t; + +typedef enum { + prof_tdata_state_attached, /* Active thread attached, data valid. */ + prof_tdata_state_detached, /* Defunct thread, data remain valid. */ + prof_tdata_state_expired /* Predates reset, omit data from dump. */ +} prof_tdata_state_t; struct prof_tdata_s { + malloc_mutex_t *lock; + + /* Monotonically increasing unique thread identifier. */ + uint64_t thr_uid; + + /* Included in heap profile dumps if non-NULL. */ + char *thread_name; + + prof_tdata_state_t state; + + rb_node(prof_tdata_t) tdata_link; + /* - * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread tracks + * Hash of (prof_bt_t *)-->(prof_tctx_t *). Each thread tracks * backtraces for which it has non-zero allocation/deallocation counters - * associated with thread-specific prof_thr_cnt_t objects. Other - * threads may read the prof_thr_cnt_t contents, but no others will ever - * write them. - * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t - * counter data into the associated prof_ctx_t objects, and unlink/free - * the prof_thr_cnt_t objects. + * associated with thread-specific prof_tctx_t objects. Other threads + * may write to prof_tctx_t contents when freeing associated objects. */ - ckh_t bt2cnt; + ckh_t bt2tctx; /* Sampling state. */ uint64_t prng_state; @@ -179,9 +180,27 @@ struct prof_tdata_s { bool enq_idump; bool enq_gdump; + /* + * Set to true during an early dump phase for tdata's which are + * currently being dumped. New threads' tdata's have this initialized + * to false so that they aren't accidentally included in later dump + * phases. + */ + bool dumping; + + /* + * True if profiling is active for this tdata's thread + * (thread.prof.active mallctl). + */ + bool active; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + /* Backtrace vector, used for calls to prof_backtrace(). */ void *vec[PROF_BT_MAX]; }; +typedef rb_tree(prof_tdata_t) prof_tdata_tree_t; #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ @@ -217,9 +236,18 @@ extern char opt_prof_prefix[ */ extern uint64_t prof_interval; +/* + * Initialized as opt_lg_prof_sample, and potentially modified during profiling + * resets. 
+ */ +extern size_t lg_prof_sample; + +void prof_malloc_sample_object(const void *ptr, size_t usize, + prof_tctx_t *tctx); +void prof_free_sampled_object(size_t usize, prof_tctx_t *tctx); void bt_init(prof_bt_t *bt, void **vec); void prof_backtrace(prof_bt_t *bt); -prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); +prof_tctx_t *prof_lookup(prof_bt_t *bt); #ifdef JEMALLOC_JET size_t prof_bt_count(void); typedef int (prof_dump_open_t)(bool, const char *); @@ -229,53 +257,44 @@ void prof_idump(void); bool prof_mdump(const char *filename); void prof_gdump(void); prof_tdata_t *prof_tdata_init(void); +prof_tdata_t *prof_tdata_reinit(prof_tdata_t *tdata); +void prof_reset(size_t lg_sample); void prof_tdata_cleanup(void *arg); +const char *prof_thread_name_get(void); +bool prof_thread_name_set(const char *thread_name); +bool prof_thread_active_get(void); +bool prof_thread_active_set(bool active); void prof_boot0(void); void prof_boot1(void); bool prof_boot2(void); void prof_prefork(void); void prof_postfork_parent(void); void prof_postfork_child(void); -void prof_sample_threshold_update(prof_tdata_t *prof_tdata); +void prof_sample_threshold_update(prof_tdata_t *tdata); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES -#define PROF_ALLOC_PREP(size, ret) do { \ - prof_tdata_t *prof_tdata; \ - prof_bt_t bt; \ - \ - assert(size == s2u(size)); \ - \ - if (!opt_prof_active || \ - prof_sample_accum_update(size, false, &prof_tdata)) { \ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ - } else { \ - bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt); \ - ret = prof_lookup(&bt); \ - } \ -} while (0) - #ifndef JEMALLOC_ENABLE_INLINE malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) prof_tdata_t *prof_tdata_get(bool create); -bool prof_sample_accum_update(size_t size, bool commit, - prof_tdata_t **prof_tdata_out); -prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); -void prof_malloc_record_object(const void *ptr, size_t usize, - prof_thr_cnt_t *cnt); -void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, - size_t old_usize, prof_ctx_t *old_ctx); -void prof_free(const void *ptr, size_t size); +bool prof_sample_accum_update(size_t usize, bool commit, + prof_tdata_t **tdata_out); +prof_tctx_t *prof_alloc_prep(size_t usize); +prof_tctx_t *prof_tctx_get(const void *ptr); +void prof_tctx_set(const void *ptr, prof_tctx_t *tctx); +void prof_malloc_sample_object(const void *ptr, size_t usize, + prof_tctx_t *tctx); +void prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx); +void prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, + size_t old_usize, prof_tctx_t *old_tctx); +void prof_free(const void *ptr, size_t usize); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) -/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +/* Thread-specific backtrace cache, used to reduce bt2gctx contention. 
*/ malloc_tsd_externs(prof_tdata, prof_tdata_t *) malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, prof_tdata_cleanup) @@ -283,21 +302,27 @@ malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, JEMALLOC_INLINE prof_tdata_t * prof_tdata_get(bool create) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); - prof_tdata = *prof_tdata_tsd_get(); - if (create && prof_tdata == NULL) - prof_tdata = prof_tdata_init(); + tdata = *prof_tdata_tsd_get(); + if (create) { + if (tdata == NULL) + tdata = prof_tdata_init(); + else if (tdata->state == prof_tdata_state_expired) + tdata = prof_tdata_reinit(tdata); + assert(tdata == NULL || tdata->state == + prof_tdata_state_attached); + } - return (prof_tdata); + return (tdata); } -JEMALLOC_INLINE prof_ctx_t * -prof_ctx_get(const void *ptr) +JEMALLOC_INLINE prof_tctx_t * +prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; arena_chunk_t *chunk; cassert(config_prof); @@ -306,15 +331,15 @@ prof_ctx_get(const void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - ret = arena_prof_ctx_get(ptr); + ret = arena_prof_tctx_get(ptr); } else - ret = huge_prof_ctx_get(ptr); + ret = huge_prof_tctx_get(ptr); return (ret); } JEMALLOC_INLINE void -prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { arena_chunk_t *chunk; @@ -324,66 +349,62 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - arena_prof_ctx_set(ptr, ctx); + arena_prof_tctx_set(ptr, tctx); } else - huge_prof_ctx_set(ptr, ctx); + huge_prof_tctx_set(ptr, tctx); } JEMALLOC_INLINE bool -prof_sample_accum_update(size_t size, bool commit, - prof_tdata_t **prof_tdata_out) +prof_sample_accum_update(size_t usize, bool commit, prof_tdata_t **tdata_out) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); - prof_tdata = prof_tdata_get(true); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) - prof_tdata = NULL; + tdata = prof_tdata_get(true); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = NULL; - if (prof_tdata_out != NULL) - *prof_tdata_out = prof_tdata; + if (tdata_out != NULL) + *tdata_out = tdata; - if (prof_tdata == NULL) + if (tdata == NULL) return (true); - if (prof_tdata->bytes_until_sample >= size) { + if (tdata->bytes_until_sample >= usize) { if (commit) - prof_tdata->bytes_until_sample -= size; + tdata->bytes_until_sample -= usize; return (true); } else { /* Compute new sample threshold. 
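
prof_tdata_get(true) above also encodes the tdata lifecycle introduced by this change. A sketch of the assumed state transitions, compiled from the prof_tdata_state_t comments and the reinit path:

    /*
     * attached --(thread exit)--> detached    data remain valid
     * attached --(prof.reset)---> expired     data omitted from dumps
     * expired --(prof_tdata_get(true))------> replaced by a fresh
     *                                         attached tdata via
     *                                         prof_tdata_reinit()
     */
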
*/ if (commit) - prof_sample_threshold_update(prof_tdata); - return (false); + prof_sample_threshold_update(tdata); + return (tdata->active == false); } } -JEMALLOC_INLINE void -prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) { - prof_ctx_set(ptr, cnt->ctx); - - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; +JEMALLOC_INLINE prof_tctx_t * +prof_alloc_prep(size_t usize) +{ + prof_tctx_t *ret; + prof_tdata_t *tdata; + prof_bt_t bt; + + assert(usize == s2u(usize)); + + if (!opt_prof_active || prof_sample_accum_update(usize, false, &tdata)) + ret = (prof_tctx_t *)(uintptr_t)1U; + else { + bt_init(&bt, tdata->vec); + prof_backtrace(&bt); + ret = prof_lookup(&bt); } - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ + + return (ret); } JEMALLOC_INLINE void -prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) +prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) { cassert(config_prof); @@ -392,131 +413,60 @@ prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) if (prof_sample_accum_update(usize, true, NULL)) { /* - * Don't sample. For malloc()-like allocation, it is - * always possible to tell in advance how large an - * object's usable size will be, so there should never - * be a difference between the usize passed to - * PROF_ALLOC_PREP() and prof_malloc(). + * Don't sample. For malloc()-like allocation, it is always + * possible to tell in advance how large an object's usable size + * will be, so there should never be a difference between the + * usize passed to PROF_ALLOC_PREP() and prof_malloc(). */ - assert((uintptr_t)cnt == (uintptr_t)1U); + assert((uintptr_t)tctx == (uintptr_t)1U); } - if ((uintptr_t)cnt > (uintptr_t)1U) - prof_malloc_record_object(ptr, usize, cnt); + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_malloc_sample_object(ptr, usize, tctx); else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); + prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); } JEMALLOC_INLINE void -prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, - size_t old_usize, prof_ctx_t *old_ctx) +prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, size_t old_usize, + prof_tctx_t *old_tctx) { - prof_thr_cnt_t *told_cnt; cassert(config_prof); - assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); + assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U); if (ptr != NULL) { assert(usize == isalloc(ptr, true)); if (prof_sample_accum_update(usize, true, NULL)) { /* - * Don't sample. The usize passed to - * PROF_ALLOC_PREP() was larger than what - * actually got allocated, so a backtrace was - * captured for this allocation, even though - * its actual usize was insufficient to cross - * the sample threshold. + * Don't sample. The usize passed to PROF_ALLOC_PREP() + * was larger than what actually got allocated, so a + * backtrace was captured for this allocation, even + * though its actual usize was insufficient to cross the + * sample threshold. */ - cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + tctx = (prof_tctx_t *)(uintptr_t)1U; } } - if ((uintptr_t)old_ctx > (uintptr_t)1U) { - told_cnt = prof_lookup(&old_ctx->bt); - if (told_cnt == NULL) { - /* - * It's too late to propagate OOM for this realloc(), - * so operate directly on old_cnt->ctx->cnt_merged. 
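
The return value of prof_sample_accum_update() above deserves a second look: true means "do not sample". When the threshold is crossed but the thread has opted out via thread.prof.active, the threshold is still advanced, yet the allocation is reported as unsampled. The three cases:

    /*
     * bytes_until_sample >= usize         -> true   (below threshold)
     * threshold crossed, tdata->active    -> false  (capture backtrace)
     * threshold crossed, !tdata->active   -> true   (thread opted out)
     */
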
- */ - malloc_mutex_lock(old_ctx->lock); - old_ctx->cnt_merged.curobjs--; - old_ctx->cnt_merged.curbytes -= old_usize; - malloc_mutex_unlock(old_ctx->lock); - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - } - } else - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - cnt->epoch++; - } else if (ptr != NULL) - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) { - told_cnt->cnts.curobjs--; - told_cnt->cnts.curbytes -= old_usize; - } - if ((uintptr_t)cnt > (uintptr_t)1U) { - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; - } - } - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) - cnt->epoch++; - /*********/ - mb_write(); /* Not strictly necessary. */ + if ((uintptr_t)old_tctx > (uintptr_t)1U) + prof_free_sampled_object(old_usize, old_tctx); + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_malloc_sample_object(ptr, usize, tctx); + else + prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); } JEMALLOC_INLINE void -prof_free(const void *ptr, size_t size) +prof_free(const void *ptr, size_t usize) { - prof_ctx_t *ctx = prof_ctx_get(ptr); + prof_tctx_t *tctx = prof_tctx_get(ptr); cassert(config_prof); + assert(usize == isalloc(ptr, true)); - if ((uintptr_t)ctx > (uintptr_t)1) { - prof_thr_cnt_t *tcnt; - assert(size == isalloc(ptr, true)); - tcnt = prof_lookup(&ctx->bt); - - if (tcnt != NULL) { - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - tcnt->cnts.curobjs--; - tcnt->cnts.curbytes -= size; - /*********/ - mb_write(); - /*********/ - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else { - /* - * OOM during free() cannot be propagated, so operate - * directly on cnt->ctx->cnt_merged. 
- */ - malloc_mutex_lock(ctx->lock); - ctx->cnt_merged.curobjs--; - ctx->cnt_merged.curbytes -= size; - malloc_mutex_unlock(ctx->lock); - } - } + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_free_sampled_object(usize, tctx); } #endif diff --git a/src/ctl.c b/src/ctl.c index fa52a6c..b816c84 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -68,6 +68,8 @@ CTL_PROTO(version) CTL_PROTO(epoch) CTL_PROTO(thread_tcache_enabled) CTL_PROTO(thread_tcache_flush) +CTL_PROTO(thread_prof_name) +CTL_PROTO(thread_prof_active) CTL_PROTO(thread_arena) CTL_PROTO(thread_allocated) CTL_PROTO(thread_allocatedp) @@ -132,7 +134,9 @@ CTL_PROTO(arenas_nlruns) CTL_PROTO(arenas_extend) CTL_PROTO(prof_active) CTL_PROTO(prof_dump) +CTL_PROTO(prof_reset) CTL_PROTO(prof_interval) +CTL_PROTO(lg_prof_sample) CTL_PROTO(stats_chunks_current) CTL_PROTO(stats_chunks_total) CTL_PROTO(stats_chunks_high) @@ -196,18 +200,24 @@ CTL_PROTO(stats_mapped) */ #define INDEX(i) {false}, i##_index -static const ctl_named_node_t tcache_node[] = { +static const ctl_named_node_t thread_tcache_node[] = { {NAME("enabled"), CTL(thread_tcache_enabled)}, {NAME("flush"), CTL(thread_tcache_flush)} }; +static const ctl_named_node_t thread_prof_node[] = { + {NAME("name"), CTL(thread_prof_name)}, + {NAME("active"), CTL(thread_prof_active)} +}; + static const ctl_named_node_t thread_node[] = { {NAME("arena"), CTL(thread_arena)}, {NAME("allocated"), CTL(thread_allocated)}, {NAME("allocatedp"), CTL(thread_allocatedp)}, {NAME("deallocated"), CTL(thread_deallocated)}, {NAME("deallocatedp"), CTL(thread_deallocatedp)}, - {NAME("tcache"), CHILD(named, tcache)} + {NAME("tcache"), CHILD(named, thread_tcache)}, + {NAME("prof"), CHILD(named, thread_prof)} }; static const ctl_named_node_t config_node[] = { @@ -311,7 +321,9 @@ static const ctl_named_node_t arenas_node[] = { static const ctl_named_node_t prof_node[] = { {NAME("active"), CTL(prof_active)}, {NAME("dump"), CTL(prof_dump)}, - {NAME("interval"), CTL(prof_interval)} + {NAME("reset"), CTL(prof_reset)}, + {NAME("interval"), CTL(prof_interval)}, + {NAME("lg_sample"), CTL(lg_prof_sample)} }; static const ctl_named_node_t stats_chunks_node[] = { @@ -1281,6 +1293,62 @@ label_return: return (ret); } +static int +thread_prof_name_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + const char *oldname; + + if (config_prof == false) + return (ENOENT); + + oldname = prof_thread_name_get(); + if (newp != NULL) { + if (newlen != sizeof(const char *)) { + ret = EINVAL; + goto label_return; + } + if (prof_thread_name_set(*(const char **)newp)) { + ret = EAGAIN; + goto label_return; + } + } + READ(oldname, const char *); + + ret = 0; +label_return: + return (ret); +} + +static int +thread_prof_active_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + bool oldval; + + if (config_prof == false) + return (ENOENT); + + oldval = prof_thread_active_get(); + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + if (prof_thread_active_set(*(bool *)newp)) { + ret = EAGAIN; + goto label_return; + } + } + READ(oldval, bool); + + ret = 0; +label_return: + return (ret); +} + /******************************************************************************/ /* ctl_mutex must be held during execution of this function. 
*/ @@ -1601,7 +1669,30 @@ label_return: return (ret); } +static int +prof_reset_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + size_t lg_sample = lg_prof_sample; + + if (config_prof == false) + return (ENOENT); + + WRITEONLY(); + WRITE(lg_sample, size_t); + if (lg_sample >= (sizeof(uint64_t) << 3)) + lg_sample = (sizeof(uint64_t) << 3) - 1; + + prof_reset(lg_sample); + + ret = 0; +label_return: + return (ret); +} + CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t) +CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t) /******************************************************************************/ diff --git a/src/huge.c b/src/huge.c index d08ed4a..5f0c698 100644 --- a/src/huge.c +++ b/src/huge.c @@ -197,10 +197,10 @@ huge_salloc(const void *ptr) return (ret); } -prof_ctx_t * -huge_prof_ctx_get(const void *ptr) +prof_tctx_t * +huge_prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; extent_node_t *node, key; malloc_mutex_lock(&huge_mtx); @@ -210,7 +210,7 @@ huge_prof_ctx_get(const void *ptr) node = extent_tree_ad_search(&huge, &key); assert(node != NULL); - ret = node->prof_ctx; + ret = node->prof_tctx; malloc_mutex_unlock(&huge_mtx); @@ -218,7 +218,7 @@ huge_prof_ctx_get(const void *ptr) } void -huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { extent_node_t *node, key; @@ -229,7 +229,7 @@ huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) node = extent_tree_ad_search(&huge, &key); assert(node != NULL); - node->prof_ctx = ctx; + node->prof_tctx = tctx; malloc_mutex_unlock(&huge_mtx); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 0983c00..2d01272 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -636,9 +636,9 @@ malloc_conf_init(void) "prof_prefix", "jeprof") CONF_HANDLE_BOOL(opt_prof_active, "prof_active", true) - CONF_HANDLE_SSIZE_T(opt_lg_prof_sample, + CONF_HANDLE_SIZE_T(opt_lg_prof_sample, "lg_prof_sample", 0, - (sizeof(uint64_t) << 3) - 1) + (sizeof(uint64_t) << 3) - 1, true) CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum", true) CONF_HANDLE_SSIZE_T(opt_lg_prof_interval, @@ -863,11 +863,11 @@ malloc_init_hard(void) */ static void * -imalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) +imalloc_prof_sample(size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = imalloc(SMALL_MAXCLASS+1); @@ -884,16 +884,16 @@ JEMALLOC_ALWAYS_INLINE_C void * imalloc_prof(size_t usize) { void *p; - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); - if ((uintptr_t)cnt != (uintptr_t)1U) - p = imalloc_prof_sample(usize, cnt); + tctx = prof_alloc_prep(usize); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = imalloc_prof_sample(usize, tctx); else p = imalloc(usize); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -943,11 +943,11 @@ je_malloc(size_t size) } static void * -imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) +imemalign_prof_sample(size_t alignment, size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { assert(sa2u(SMALL_MAXCLASS+1, alignment) != 0); @@ -963,17 +963,17 @@ imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -imemalign_prof(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) 
+imemalign_prof(size_t alignment, size_t usize, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) - p = imemalign_prof_sample(alignment, usize, cnt); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = imemalign_prof_sample(alignment, usize, tctx); else p = ipalloc(usize, alignment, false); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1015,10 +1015,10 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) } if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); - result = imemalign_prof(alignment, usize, cnt); + tctx = prof_alloc_prep(usize); + result = imemalign_prof(alignment, usize, tctx); } else result = ipalloc(usize, alignment, false); if (result == NULL) @@ -1070,11 +1070,11 @@ je_aligned_alloc(size_t alignment, size_t size) } static void * -icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) +icalloc_prof_sample(size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = icalloc(SMALL_MAXCLASS+1); @@ -1088,17 +1088,17 @@ icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -icalloc_prof(size_t usize, prof_thr_cnt_t *cnt) +icalloc_prof(size_t usize, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) - p = icalloc_prof_sample(usize, cnt); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = icalloc_prof_sample(usize, tctx); else p = icalloc(usize); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1137,11 +1137,11 @@ je_calloc(size_t num, size_t size) } if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = s2u(num_size); - PROF_ALLOC_PREP(usize, cnt); - ret = icalloc_prof(usize, cnt); + tctx = prof_alloc_prep(usize); + ret = icalloc_prof(usize, tctx); } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(num_size); @@ -1167,11 +1167,11 @@ label_return: } static void * -irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt) +irealloc_prof_sample(void *oldptr, size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = iralloc(oldptr, SMALL_MAXCLASS+1, 0, 0, false); @@ -1185,19 +1185,19 @@ irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_thr_cnt_t *cnt) +irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_tctx_t *tctx) { void *p; - prof_ctx_t *old_ctx; + prof_tctx_t *old_tctx; - old_ctx = prof_ctx_get(oldptr); - if ((uintptr_t)cnt != (uintptr_t)1U) - p = irealloc_prof_sample(oldptr, usize, cnt); + old_tctx = prof_tctx_get(oldptr); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = irealloc_prof_sample(oldptr, usize, tctx); else p = iralloc(oldptr, usize, 0, 0, false); if (p == NULL) return (NULL); - prof_realloc(p, usize, cnt, old_usize, old_ctx); + prof_realloc(p, usize, tctx, old_usize, old_tctx); return (p); } @@ -1253,11 +1253,11 @@ je_realloc(void *ptr, size_t size) old_rzsize = config_prof ? 
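
All of the *_prof_sample() helpers above share one trick: a sampled request that would land in a small size class is allocated as SMALL_MAXCLASS+1, i.e. promoted to the smallest large size class, because only large runs have a prof_tctx slot in the chunk map (cf. arena_prof_tctx_set() earlier in this patch, which silently skips small regions). The recurring shape, with the lines elided by the hunk context filled in from the surrounding codebase (a reconstruction, not patch content):

    if (usize <= SMALL_MAXCLASS) {
        p = imalloc(SMALL_MAXCLASS+1);    /* Promote to large. */
        if (p == NULL)
            return (NULL);
        arena_prof_promoted(p, usize);    /* salloc() reports usize. */
    } else
        p = imalloc(usize);
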
p2rz(ptr) : u2rz(old_usize); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = s2u(size); - PROF_ALLOC_PREP(usize, cnt); - ret = irealloc_prof(ptr, old_usize, usize, cnt); + tctx = prof_alloc_prep(usize); + ret = irealloc_prof(ptr, old_usize, usize, tctx); } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(size); @@ -1379,11 +1379,11 @@ imallocx(size_t usize, size_t alignment, bool zero, bool try_tcache, static void * imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { size_t usize_promoted = (alignment == 0) ? @@ -1402,18 +1402,18 @@ imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, JEMALLOC_ALWAYS_INLINE_C void * imallocx_prof(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) { + if ((uintptr_t)tctx != (uintptr_t)1U) { p = imallocx_prof_sample(usize, alignment, zero, try_tcache, - arena, cnt); + arena, tctx); } else p = imallocx(usize, alignment, zero, try_tcache, arena); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1447,11 +1447,11 @@ je_mallocx(size_t size, int flags) assert(usize != 0); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); + tctx = prof_alloc_prep(usize); p = imallocx_prof(usize, alignment, zero, try_tcache, arena, - cnt); + tctx); } else p = imallocx(usize, alignment, zero, try_tcache, arena); if (p == NULL) @@ -1476,11 +1476,11 @@ label_oom: static void * irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena, - prof_thr_cnt_t *cnt) + prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = iralloct(oldptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= @@ -1500,15 +1500,15 @@ irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, JEMALLOC_ALWAYS_INLINE_C void * irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, size_t *usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - prof_ctx_t *old_ctx; + prof_tctx_t *old_tctx; - old_ctx = prof_ctx_get(oldptr); - if ((uintptr_t)cnt != (uintptr_t)1U) + old_tctx = prof_tctx_get(oldptr); + if ((uintptr_t)tctx != (uintptr_t)1U) p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, - try_tcache_alloc, try_tcache_dalloc, arena, cnt); + try_tcache_alloc, try_tcache_dalloc, arena, tctx); else { p = iralloct(oldptr, size, 0, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); @@ -1527,7 +1527,7 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, */ *usize = isalloc(p, config_prof); } - prof_realloc(p, *usize, cnt, old_usize, old_ctx); + prof_realloc(p, *usize, tctx, old_usize, old_tctx); return (p); } @@ -1570,13 +1570,13 @@ je_rallocx(void *ptr, size_t size, int flags) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = (alignment == 0) ? 
	    s2u(size) : sa2u(size, alignment);
 		assert(usize != 0);
-		PROF_ALLOC_PREP(usize, cnt);
+		tctx = prof_alloc_prep(usize);
 		p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero,
-		    try_tcache_alloc, try_tcache_dalloc, arena, cnt);
+		    try_tcache_alloc, try_tcache_dalloc, arena, tctx);
 		if (p == NULL)
 			goto label_oom;
 	} else {
@@ -1623,11 +1623,11 @@ ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra,
 static size_t
 ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra,
     size_t alignment, size_t max_usize, bool zero, arena_t *arena,
-    prof_thr_cnt_t *cnt)
+    prof_tctx_t *tctx)
 {
 	size_t usize;
 
-	if (cnt == NULL)
+	if (tctx == NULL)
 		return (old_usize);
 	/* Use minimum usize to determine whether promotion may happen. */
 	if (((alignment == 0) ? s2u(size) : sa2u(size, alignment)) <=
@@ -1650,22 +1650,22 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra,
 JEMALLOC_ALWAYS_INLINE_C size_t
 ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra,
     size_t alignment, size_t max_usize, bool zero, arena_t *arena,
-    prof_thr_cnt_t *cnt)
+    prof_tctx_t *tctx)
 {
 	size_t usize;
-	prof_ctx_t *old_ctx;
+	prof_tctx_t *old_tctx;
 
-	old_ctx = prof_ctx_get(ptr);
-	if ((uintptr_t)cnt != (uintptr_t)1U) {
+	old_tctx = prof_tctx_get(ptr);
+	if ((uintptr_t)tctx != (uintptr_t)1U) {
 		usize = ixallocx_prof_sample(ptr, old_usize, size, extra,
-		    alignment, zero, max_usize, arena, cnt);
+		    alignment, max_usize, zero, arena, tctx);
 	} else {
 		usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
 		    zero, arena);
 	}
 	if (usize == old_usize)
 		return (usize);
-	prof_realloc(ptr, usize, cnt, old_usize, old_ctx);
+	prof_realloc(ptr, usize, tctx, old_usize, old_tctx);
 	return (usize);
 }
@@ -1697,19 +1697,19 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags)
 		old_rzsize = u2rz(old_usize);
 
 	if (config_prof && opt_prof) {
-		prof_thr_cnt_t *cnt;
+		prof_tctx_t *tctx;
 
 		/*
 		 * usize isn't knowable before ixalloc() returns when extra is
 		 * non-zero.  Therefore, compute its maximum possible value and
-		 * use that in PROF_ALLOC_PREP() to decide whether to capture a
+		 * use that in prof_alloc_prep() to decide whether to capture a
 		 * backtrace.  prof_realloc() will use the actual usize to
 		 * decide whether to sample.
 		 */
 		size_t max_usize = (alignment == 0) ? s2u(size+extra) :
 		    sa2u(size+extra, alignment);
-		PROF_ALLOC_PREP(max_usize, cnt);
+		tctx = prof_alloc_prep(max_usize);
 		usize = ixallocx_prof(ptr, old_usize, size, extra, alignment,
-		    max_usize, zero, arena, cnt);
+		    max_usize, zero, arena, tctx);
 	} else {
 		usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
 		    zero, arena);
diff --git a/src/prof.c b/src/prof.c
index 497ccf4..044acd8 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -33,22 +33,41 @@ char opt_prof_prefix[
 
 uint64_t	prof_interval = 0;
 
+size_t		lg_prof_sample;
+
 /*
- * Table of mutexes that are shared among ctx's.  These are leaf locks, so
- * there is no problem with using them for more than one ctx at the same time.
- * The primary motivation for this sharing though is that ctx's are ephemeral,
+ * Table of mutexes that are shared among gctx's.  These are leaf locks, so
+ * there is no problem with using them for more than one gctx at the same time.
+ * The primary motivation for this sharing though is that gctx's are ephemeral,
 * and destroying mutexes causes complications for systems that allocate when
 * creating/destroying mutexes.
 */
-static malloc_mutex_t	*ctx_locks;
-static unsigned		cum_ctxs; /* Atomic counter.
*/ +static malloc_mutex_t *gctx_locks; +static unsigned cum_gctxs; /* Atomic counter. */ /* - * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data + * Table of mutexes that are shared among tdata's. No operations require + * holding multiple tdata locks, so there is no problem with using them for more + * than one tdata at the same time, even though a gctx lock may be acquired + * while holding a tdata lock. + */ +static malloc_mutex_t *tdata_locks; + +/* + * Global hash of (prof_bt_t *)-->(prof_gctx_t *). This is the master data * structure that knows about all backtraces currently captured. */ -static ckh_t bt2ctx; -static malloc_mutex_t bt2ctx_mtx; +static ckh_t bt2gctx; +static malloc_mutex_t bt2gctx_mtx; + +/* + * Tree of all extant prof_tdata_t structures, regardless of state, + * {attached,detached,expired}. + */ +static prof_tdata_tree_t tdatas; +static malloc_mutex_t tdatas_mtx; + +static uint64_t next_thr_uid; static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; @@ -76,21 +95,33 @@ static int prof_dump_fd; static bool prof_booted = false; /******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static bool prof_tctx_should_destroy(prof_tctx_t *tctx); +static void prof_tctx_destroy(prof_tctx_t *tctx); +static bool prof_tdata_should_destroy(prof_tdata_t *tdata); +static void prof_tdata_destroy(prof_tdata_t *tdata); + +/******************************************************************************/ +/* Red-black trees. */ JEMALLOC_INLINE_C int -prof_thr_cnt_comp(const prof_thr_cnt_t *a, const prof_thr_cnt_t *b) +prof_tctx_comp(const prof_tctx_t *a, const prof_tctx_t *b) { - prof_thr_uid_t a_uid = a->thr_uid; - prof_thr_uid_t b_uid = b->thr_uid; + uint64_t a_uid = a->tdata->thr_uid; + uint64_t b_uid = b->tdata->thr_uid; return ((a_uid > b_uid) - (a_uid < b_uid)); } -rb_gen(static UNUSED, thr_cnt_tree_, prof_thr_cnt_tree_t, prof_thr_cnt_t, - thr_cnt_link, prof_thr_cnt_comp) +rb_gen(static UNUSED, tctx_tree_, prof_tctx_tree_t, prof_tctx_t, + tctx_link, prof_tctx_comp) JEMALLOC_INLINE_C int -prof_ctx_comp(const prof_ctx_t *a, const prof_ctx_t *b) +prof_gctx_comp(const prof_gctx_t *a, const prof_gctx_t *b) { unsigned a_len = a->bt.len; unsigned b_len = b->bt.len; @@ -101,8 +132,52 @@ prof_ctx_comp(const prof_ctx_t *a, const prof_ctx_t *b) return (ret); } -rb_gen(static UNUSED, ctx_tree_, prof_ctx_tree_t, prof_ctx_t, dump_link, - prof_ctx_comp) +rb_gen(static UNUSED, gctx_tree_, prof_gctx_tree_t, prof_gctx_t, dump_link, + prof_gctx_comp) + +JEMALLOC_INLINE_C int +prof_tdata_comp(const prof_tdata_t *a, const prof_tdata_t *b) +{ + uint64_t a_uid = a->thr_uid; + uint64_t b_uid = b->thr_uid; + + return ((a_uid > b_uid) - (a_uid < b_uid)); +} + +rb_gen(static UNUSED, tdata_tree_, prof_tdata_tree_t, prof_tdata_t, tdata_link, + prof_tdata_comp) + +/******************************************************************************/ + +void +prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx) { + prof_tctx_set(ptr, tctx); + + malloc_mutex_lock(tctx->tdata->lock); + tctx->cnts.curobjs++; + tctx->cnts.curbytes += usize; + if (opt_prof_accum) { + tctx->cnts.accumobjs++; + tctx->cnts.accumbytes += usize; + } + malloc_mutex_unlock(tctx->tdata->lock); +} + +void +prof_free_sampled_object(size_t usize, prof_tctx_t *tctx) +{ + + malloc_mutex_lock(tctx->tdata->lock); + assert(tctx->cnts.curobjs > 0); + 
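
The two comments above pin down the lock hierarchy that replaces the old epoch/memory-barrier scheme. Collected in one place (an inferred summary, not an exhaustive audit):

    /*
     * bt2gctx_mtx -> gctx->lock    prof_lookup_global()
     * tdata->lock -> gctx->lock    permitted; never the reverse
     *
     * No path holds two tdata locks at once, so hashing thr_uid into
     * PROF_NTDATA_LOCKS shared mutexes cannot self-deadlock.
     */
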
assert(tctx->cnts.curbytes >= usize); + tctx->cnts.curobjs--; + tctx->cnts.curbytes -= usize; + + if (prof_tctx_should_destroy(tctx)) + prof_tctx_destroy(tctx); + else + malloc_mutex_unlock(tctx->tdata->lock); +} void bt_init(prof_bt_t *bt, void **vec) @@ -115,32 +190,32 @@ bt_init(prof_bt_t *bt, void **vec) } static inline void -prof_enter(prof_tdata_t *prof_tdata) +prof_enter(prof_tdata_t *tdata) { cassert(config_prof); - assert(prof_tdata->enq == false); - prof_tdata->enq = true; + assert(tdata->enq == false); + tdata->enq = true; - malloc_mutex_lock(&bt2ctx_mtx); + malloc_mutex_lock(&bt2gctx_mtx); } static inline void -prof_leave(prof_tdata_t *prof_tdata) +prof_leave(prof_tdata_t *tdata) { bool idump, gdump; cassert(config_prof); - malloc_mutex_unlock(&bt2ctx_mtx); + malloc_mutex_unlock(&bt2gctx_mtx); - assert(prof_tdata->enq); - prof_tdata->enq = false; - idump = prof_tdata->enq_idump; - prof_tdata->enq_idump = false; - gdump = prof_tdata->enq_gdump; - prof_tdata->enq_gdump = false; + assert(tdata->enq); + tdata->enq = false; + idump = tdata->enq_idump; + tdata->enq_idump = false; + gdump = tdata->enq_gdump; + tdata->enq_gdump = false; if (idump) prof_idump(); @@ -373,220 +448,268 @@ prof_backtrace(prof_bt_t *bt) #endif static malloc_mutex_t * -prof_ctx_mutex_choose(void) +prof_gctx_mutex_choose(void) +{ + unsigned ngctxs = atomic_add_u(&cum_gctxs, 1); + + return (&gctx_locks[(ngctxs - 1) % PROF_NCTX_LOCKS]); +} + +static malloc_mutex_t * +prof_tdata_mutex_choose(uint64_t thr_uid) { - unsigned nctxs = atomic_add_u(&cum_ctxs, 1); - return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]); + return (&tdata_locks[thr_uid % PROF_NTDATA_LOCKS]); } -static prof_ctx_t * -prof_ctx_create(prof_bt_t *bt) +static prof_gctx_t * +prof_gctx_create(prof_bt_t *bt) { /* * Create a single allocation that has space for vec of length bt->len. */ - prof_ctx_t *ctx = (prof_ctx_t *)imalloc(offsetof(prof_ctx_t, vec) + + prof_gctx_t *gctx = (prof_gctx_t *)imalloc(offsetof(prof_gctx_t, vec) + (bt->len * sizeof(void *))); - if (ctx == NULL) + if (gctx == NULL) return (NULL); - ctx->lock = prof_ctx_mutex_choose(); + gctx->lock = prof_gctx_mutex_choose(); /* * Set nlimbo to 1, in order to avoid a race condition with - * prof_ctx_merge()/prof_ctx_destroy(). + * prof_tctx_destroy()/prof_gctx_maybe_destroy(). */ - ctx->nlimbo = 1; - memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t)); - thr_cnt_tree_new(&ctx->thr_cnts); + gctx->nlimbo = 1; + tctx_tree_new(&gctx->tctxs); /* Duplicate bt. */ - memcpy(ctx->vec, bt->vec, bt->len * sizeof(void *)); - ctx->bt.vec = ctx->vec; - ctx->bt.len = bt->len; - return (ctx); + memcpy(gctx->vec, bt->vec, bt->len * sizeof(void *)); + gctx->bt.vec = gctx->vec; + gctx->bt.len = bt->len; + return (gctx); } static void -prof_ctx_destroy(prof_ctx_t *ctx) +prof_gctx_maybe_destroy(prof_gctx_t *gctx) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); /* - * Check that ctx is still unused by any thread cache before destroying - * it. prof_lookup() increments ctx->nlimbo in order to avoid a race - * condition with this function, as does prof_ctx_merge() in order to - * avoid a race between the main body of prof_ctx_merge() and entry + * Check that gctx is still unused by any thread cache before destroying + * it. prof_lookup() increments gctx->nlimbo in order to avoid a race + * condition with this function, as does prof_tctx_destroy() in order to + * avoid a race between the main body of prof_tctx_destroy() and entry * into this function. 
*/ - prof_tdata = prof_tdata_get(false); - assert((uintptr_t)prof_tdata > (uintptr_t)PROF_TDATA_STATE_MAX); - prof_enter(prof_tdata); - malloc_mutex_lock(ctx->lock); - if (thr_cnt_tree_first(&ctx->thr_cnts) == NULL && - ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 1) { - assert(ctx->cnt_merged.curbytes == 0); - assert(ctx->cnt_merged.accumobjs == 0); - assert(ctx->cnt_merged.accumbytes == 0); - /* Remove ctx from bt2ctx. */ - if (ckh_remove(&bt2ctx, &ctx->bt, NULL, NULL)) + tdata = prof_tdata_get(false); + assert((uintptr_t)tdata > (uintptr_t)PROF_TDATA_STATE_MAX); + prof_enter(tdata); + malloc_mutex_lock(gctx->lock); + if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { + /* Remove gctx from bt2gctx. */ + if (ckh_remove(&bt2gctx, &gctx->bt, NULL, NULL)) not_reached(); - prof_leave(prof_tdata); - /* Destroy ctx. */ - malloc_mutex_unlock(ctx->lock); - idalloc(ctx); + prof_leave(tdata); + /* Destroy gctx. */ + malloc_mutex_unlock(gctx->lock); + idalloc(gctx); } else { /* - * Compensate for increment in prof_ctx_merge() or + * Compensate for increment in prof_tctx_destroy() or * prof_lookup(). */ - ctx->nlimbo--; - malloc_mutex_unlock(ctx->lock); - prof_leave(prof_tdata); + gctx->nlimbo--; + malloc_mutex_unlock(gctx->lock); + prof_leave(tdata); } } -static void -prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) +/* tctx->tdata->lock must be held. */ +static bool +prof_tctx_should_destroy(prof_tctx_t *tctx) { - bool destroy; - cassert(config_prof); + if (opt_prof_accum) + return (false); + if (tctx->cnts.curobjs != 0) + return (false); + return (true); +} + +static bool +prof_gctx_should_destroy(prof_gctx_t *gctx) +{ - /* Merge cnt stats and detach from ctx. */ - malloc_mutex_lock(ctx->lock); - ctx->cnt_merged.curobjs += cnt->cnts.curobjs; - ctx->cnt_merged.curbytes += cnt->cnts.curbytes; - ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; - ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; - thr_cnt_tree_remove(&ctx->thr_cnts, cnt); - if (opt_prof_accum == false && thr_cnt_tree_first(&ctx->thr_cnts) == - NULL && ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) { + if (opt_prof_accum) + return (false); + if (tctx_tree_empty(&gctx->tctxs) == false) + return (false); + if (gctx->nlimbo != 0) + return (false); + return (true); +} + +/* tctx->tdata->lock is held upon entry, and released before return. */ +static void +prof_tctx_destroy(prof_tctx_t *tctx) +{ + prof_gctx_t *gctx = tctx->gctx; + bool destroy_gctx; + + assert(tctx->cnts.curobjs == 0); + assert(tctx->cnts.curbytes == 0); + assert(opt_prof_accum == false); + assert(tctx->cnts.accumobjs == 0); + assert(tctx->cnts.accumbytes == 0); + + { + prof_tdata_t *tdata = tctx->tdata; + bool tdata_destroy; + + ckh_remove(&tdata->bt2tctx, &gctx->bt, NULL, NULL); + tdata_destroy = prof_tdata_should_destroy(tdata); + malloc_mutex_unlock(tdata->lock); + if (tdata_destroy) + prof_tdata_destroy(tdata); + } + + malloc_mutex_lock(gctx->lock); + tctx_tree_remove(&gctx->tctxs, tctx); + if (prof_gctx_should_destroy(gctx)) { /* - * Increment ctx->nlimbo in order to keep another thread from - * winning the race to destroy ctx while this one has ctx->lock - * dropped. Without this, it would be possible for another - * thread to: + * Increment gctx->nlimbo in order to keep another thread from + * winning the race to destroy gctx while this one has + * gctx->lock dropped. Without this, it would be possible for + * another thread to: * - * 1) Sample an allocation associated with ctx. + * 1) Sample an allocation associated with gctx. 
* 2) Deallocate the sampled object. - * 3) Successfully prof_ctx_destroy(ctx). + * 3) Successfully prof_gctx_maybe_destroy(gctx). * - * The result would be that ctx no longer exists by the time - * this thread accesses it in prof_ctx_destroy(). + * The result would be that gctx no longer exists by the time + * this thread accesses it in prof_gctx_maybe_destroy(). */ - ctx->nlimbo++; - destroy = true; + gctx->nlimbo++; + destroy_gctx = true; } else - destroy = false; - malloc_mutex_unlock(ctx->lock); - if (destroy) - prof_ctx_destroy(ctx); + destroy_gctx = false; + malloc_mutex_unlock(gctx->lock); + if (destroy_gctx) + prof_gctx_maybe_destroy(gctx); + + idalloc(tctx); } static bool -prof_lookup_global(prof_bt_t *bt, prof_tdata_t *prof_tdata, void **p_btkey, - prof_ctx_t **p_ctx, bool *p_new_ctx) +prof_lookup_global(prof_bt_t *bt, prof_tdata_t *tdata, void **p_btkey, + prof_gctx_t **p_gctx, bool *p_new_gctx) { union { - prof_ctx_t *p; + prof_gctx_t *p; void *v; - } ctx; + } gctx; union { prof_bt_t *p; void *v; } btkey; - bool new_ctx; + bool new_gctx; - prof_enter(prof_tdata); - if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { + prof_enter(tdata); + if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { /* bt has never been seen before. Insert it. */ - ctx.p = prof_ctx_create(bt); - if (ctx.v == NULL) { - prof_leave(prof_tdata); + gctx.p = prof_gctx_create(bt); + if (gctx.v == NULL) { + prof_leave(tdata); return (true); } - btkey.p = &ctx.p->bt; - if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { + btkey.p = &gctx.p->bt; + if (ckh_insert(&bt2gctx, btkey.v, gctx.v)) { /* OOM. */ - prof_leave(prof_tdata); - idalloc(ctx.v); + prof_leave(tdata); + idalloc(gctx.v); return (true); } - new_ctx = true; + new_gctx = true; } else { /* * Increment nlimbo, in order to avoid a race condition with - * prof_ctx_merge()/prof_ctx_destroy(). + * prof_tctx_destroy()/prof_gctx_maybe_destroy(). */ - malloc_mutex_lock(ctx.p->lock); - ctx.p->nlimbo++; - malloc_mutex_unlock(ctx.p->lock); - new_ctx = false; + malloc_mutex_lock(gctx.p->lock); + gctx.p->nlimbo++; + malloc_mutex_unlock(gctx.p->lock); + new_gctx = false; } - prof_leave(prof_tdata); + prof_leave(tdata); *p_btkey = btkey.v; - *p_ctx = ctx.p; - *p_new_ctx = new_ctx; + *p_gctx = gctx.p; + *p_new_gctx = new_gctx; return (false); } -prof_thr_cnt_t * +prof_tctx_t * prof_lookup(prof_bt_t *bt) { union { - prof_thr_cnt_t *p; + prof_tctx_t *p; void *v; } ret; - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; + bool not_found; cassert(config_prof); - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (NULL); - if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) { + malloc_mutex_lock(tdata->lock); + not_found = ckh_search(&tdata->bt2tctx, bt, NULL, &ret.v); + malloc_mutex_unlock(tdata->lock); + if (not_found) { void *btkey; - prof_ctx_t *ctx; - bool new_ctx; + prof_gctx_t *gctx; + bool new_gctx, error; /* * This thread's cache lacks bt. Look for it in the global * cache. */ - if (prof_lookup_global(bt, prof_tdata, &btkey, &ctx, &new_ctx)) + if (prof_lookup_global(bt, tdata, &btkey, &gctx, + &new_gctx)) return (NULL); - /* Link a prof_thd_cnt_t into ctx for this thread. */ - ret.v = imalloc(sizeof(prof_thr_cnt_t)); + /* Link a prof_tctx_t into gctx for this thread. 
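
Pulling the pieces of prof_lookup() together (the function concludes just below): a miss in the thread-local bt2tctx hash falls through to the global table, and gctx->nlimbo serves as a transitional reference count so the gctx cannot be destroyed out from under a thread that is still linking into it; destruction only proceeds from nlimbo == 1 with an empty tctxs tree. The assumed fast/slow paths:

    /*
     * fast: bt found in tdata->bt2tctx       touches only tdata->lock
     * slow: prof_lookup_global()             bt2gctx_mtx; find or
     *       create gctx, gctx->nlimbo++      create gctx; pin it
     *       insert tctx into tdata->bt2tctx  tdata->lock
     *       insert tctx into gctx->tctxs,    gctx->lock
     *       gctx->nlimbo--
     */
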
*/ + ret.v = imalloc(sizeof(prof_tctx_t)); if (ret.p == NULL) { - if (new_ctx) - prof_ctx_destroy(ctx); + if (new_gctx) + prof_gctx_maybe_destroy(gctx); return (NULL); } - ret.p->ctx = ctx; - ret.p->epoch = 0; + ret.p->tdata = tdata; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(&prof_tdata->bt2cnt, btkey, ret.v)) { - if (new_ctx) - prof_ctx_destroy(ctx); + ret.p->gctx = gctx; + ret.p->state = prof_tctx_state_nominal; + malloc_mutex_lock(tdata->lock); + error = ckh_insert(&tdata->bt2tctx, btkey, ret.v); + malloc_mutex_unlock(tdata->lock); + if (error) { + if (new_gctx) + prof_gctx_maybe_destroy(gctx); idalloc(ret.v); return (NULL); } - malloc_mutex_lock(ctx->lock); - thr_cnt_tree_insert(&ctx->thr_cnts, ret.p); - ctx->nlimbo--; - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(gctx->lock); + tctx_tree_insert(&gctx->tctxs, ret.p); + gctx->nlimbo--; + malloc_mutex_unlock(gctx->lock); } return (ret.p); } - void -prof_sample_threshold_update(prof_tdata_t *prof_tdata) +prof_sample_threshold_update(prof_tdata_t *tdata) { /* * The body of this function is compiled out unless heap profiling is @@ -608,23 +731,20 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) if (!config_prof) return; - if (prof_tdata == NULL) - prof_tdata = prof_tdata_get(false); - - if (opt_lg_prof_sample == 0) { - prof_tdata->bytes_until_sample = 0; + if (lg_prof_sample == 0) { + tdata->bytes_until_sample = 0; return; } /* - * Compute sample threshold as a geometrically distributed random - * variable with mean (2^opt_lg_prof_sample). + * Compute sample interval as a geometrically distributed random + * variable with mean (2^lg_prof_sample). * - * __ __ - * | log(u) | 1 - * prof_tdata->threshold = | -------- |, where p = ------------------- - * | log(1-p) | opt_lg_prof_sample - * 2 + * __ __ + * | log(u) | 1 + * tdata->bytes_until_sample = | -------- |, where p = --------------- + * | log(1-p) | lg_prof_sample + * 2 * * For more information on the math, see: * @@ -634,30 +754,29 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) * pp 500 * (http://luc.devroye.org/rnbookindex.html) */ - prng64(r, 53, prof_tdata->prng_state, - UINT64_C(6364136223846793005), UINT64_C(1442695040888963407)); + prng64(r, 53, tdata->prng_state, UINT64_C(6364136223846793005), + UINT64_C(1442695040888963407)); u = (double)r * (1.0/9007199254740992.0L); - prof_tdata->bytes_until_sample = (uint64_t)(log(u) / - log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + tdata->bytes_until_sample = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + (uint64_t)1U; #endif } - #ifdef JEMALLOC_JET size_t prof_bt_count(void) { size_t bt_count; - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (0); - prof_enter(prof_tdata); - bt_count = ckh_count(&bt2ctx); - prof_leave(prof_tdata); + prof_enter(tdata); + bt_count = ckh_count(&bt2gctx); + prof_leave(tdata); return (bt_count); } @@ -770,146 +889,249 @@ prof_dump_printf(bool propagate_err, const char *format, ...) return (ret); } -static prof_thr_cnt_t * -ctx_sum_iter(prof_thr_cnt_tree_t *thr_cnts, prof_thr_cnt_t *thr_cnt, void *arg) +/* tctx->tdata->lock is held. 
*/ +static void +prof_tctx_merge_tdata(prof_tctx_t *tctx, prof_tdata_t *tdata) { - prof_ctx_t *ctx = (prof_ctx_t *)arg; - volatile unsigned *epoch = &thr_cnt->epoch; - prof_cnt_t tcnt; - while (true) { - unsigned epoch0 = *epoch; + assert(tctx->state == prof_tctx_state_nominal); + tctx->state = prof_tctx_state_dumping; + memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); - /* Make sure epoch is even. */ - if (epoch0 & 1U) - continue; + tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + if (opt_prof_accum) { + tdata->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + tdata->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + } +} - memcpy(&tcnt, &thr_cnt->cnts, sizeof(prof_cnt_t)); +/* gctx->lock is held. */ +static void +prof_tctx_merge_gctx(prof_tctx_t *tctx, prof_gctx_t *gctx) +{ - /* Terminate if epoch didn't change while reading. */ - if (*epoch == epoch0) - break; + gctx->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + gctx->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + if (opt_prof_accum) { + gctx->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + gctx->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; } +} - ctx->cnt_summed.curobjs += tcnt.curobjs; - ctx->cnt_summed.curbytes += tcnt.curbytes; - if (opt_prof_accum) { - ctx->cnt_summed.accumobjs += tcnt.accumobjs; - ctx->cnt_summed.accumbytes += tcnt.accumbytes; +/* tctx->gctx is held. */ +static prof_tctx_t * +prof_tctx_merge_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. */ + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + prof_tctx_merge_gctx(tctx, tctx->gctx); + break; + default: + not_reached(); } return (NULL); } +/* gctx->lock is held. */ +static prof_tctx_t * +prof_tctx_dump_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + bool propagate_err = *(bool *)arg; + + if (prof_dump_printf(propagate_err, + " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + tctx->tdata->thr_uid, tctx->dump_cnts.curobjs, + tctx->dump_cnts.curbytes, tctx->dump_cnts.accumobjs, + tctx->dump_cnts.accumbytes)) + return (tctx); + return (NULL); +} + +/* tctx->gctx is held. */ +static prof_tctx_t * +prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + prof_tctx_t *ret; + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. */ + break; + case prof_tctx_state_dumping: + tctx->state = prof_tctx_state_nominal; + break; + case prof_tctx_state_purgatory: + ret = tctx_tree_next(tctxs, tctx); + tctx_tree_remove(tctxs, tctx); + idalloc(tctx); + goto label_return; + default: + not_reached(); + } + + ret = NULL; +label_return: + return (ret); +} + static void -prof_dump_ctx_prep(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx, - prof_ctx_tree_t *ctxs) +prof_dump_gctx_prep(prof_gctx_t *gctx, prof_gctx_tree_t *gctxs) { cassert(config_prof); - malloc_mutex_lock(ctx->lock); + malloc_mutex_lock(gctx->lock); /* - * Increment nlimbo so that ctx won't go away before dump. - * Additionally, link ctx into the dump list so that it is included in + * Increment nlimbo so that gctx won't go away before dump. + * Additionally, link gctx into the dump list so that it is included in * prof_dump()'s second pass. 
*/ - ctx->nlimbo++; - ctx_tree_insert(ctxs, ctx); + gctx->nlimbo++; + gctx_tree_insert(gctxs, gctx); - memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t)); - thr_cnt_tree_iter(&ctx->thr_cnts, NULL, ctx_sum_iter, (void *)ctx); + memset(&gctx->cnt_summed, 0, sizeof(prof_cnt_t)); - if (ctx->cnt_summed.curobjs != 0) - (*leak_nctx)++; + malloc_mutex_unlock(gctx->lock); +} - /* Add to cnt_all. */ - cnt_all->curobjs += ctx->cnt_summed.curobjs; - cnt_all->curbytes += ctx->cnt_summed.curbytes; - if (opt_prof_accum) { - cnt_all->accumobjs += ctx->cnt_summed.accumobjs; - cnt_all->accumbytes += ctx->cnt_summed.accumbytes; - } +static prof_gctx_t * +prof_gctx_merge_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) +{ + size_t *leak_ngctx = (size_t *)arg; - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(gctx->lock); + tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_merge_iter, NULL); + if (gctx->cnt_summed.curobjs != 0) + (*leak_ngctx)++; + malloc_mutex_unlock(gctx->lock); + + return (NULL); } -static bool -prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) +static prof_gctx_t * +prof_gctx_finish_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) { + prof_tctx_t *next; + bool destroy_gctx; - if (opt_lg_prof_sample == 0) { - if (prof_dump_printf(propagate_err, - "heap profile: %"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @ heapprofile\n", - cnt_all->curobjs, cnt_all->curbytes, - cnt_all->accumobjs, cnt_all->accumbytes)) - return (true); - } else { - if (prof_dump_printf(propagate_err, - "heap profile: %"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @ heap_v2/%"PRIu64"\n", - cnt_all->curobjs, cnt_all->curbytes, - cnt_all->accumobjs, cnt_all->accumbytes, - ((uint64_t)1U << opt_lg_prof_sample))) - return (true); - } + malloc_mutex_lock(gctx->lock); + next = NULL; + do { + next = tctx_tree_iter(&gctx->tctxs, next, prof_tctx_finish_iter, + NULL); + } while (next != NULL); + gctx->nlimbo--; + destroy_gctx = prof_gctx_should_destroy(gctx); + malloc_mutex_unlock(gctx->lock); + if (destroy_gctx) + prof_gctx_maybe_destroy(gctx); - return (false); + return (NULL); } -static void -prof_dump_ctx_cleanup_locked(prof_ctx_t *ctx, prof_ctx_tree_t *ctxs) +static prof_tdata_t * +prof_tdata_merge_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) { + prof_cnt_t *cnt_all = (prof_cnt_t *)arg; + + malloc_mutex_lock(tdata->lock); + if (tdata->state != prof_tdata_state_expired) { + size_t tabind; + union { + prof_tctx_t *p; + void *v; + } tctx; + + tdata->dumping = true; + memset(&tdata->cnt_summed, 0, sizeof(prof_cnt_t)); + for (tabind = 0; ckh_iter(&tdata->bt2tctx, &tabind, NULL, + &tctx.v) == false;) + prof_tctx_merge_tdata(tctx.p, tdata); + + cnt_all->curobjs += tdata->cnt_summed.curobjs; + cnt_all->curbytes += tdata->cnt_summed.curbytes; + if (opt_prof_accum) { + cnt_all->accumobjs += tdata->cnt_summed.accumobjs; + cnt_all->accumbytes += tdata->cnt_summed.accumbytes; + } + } else + tdata->dumping = false; + malloc_mutex_unlock(tdata->lock); - ctx->nlimbo--; + return (NULL); } -static void -prof_dump_ctx_cleanup(prof_ctx_t *ctx, prof_ctx_tree_t *ctxs) +static prof_tdata_t * +prof_tdata_dump_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) +{ + bool propagate_err = *(bool *)arg; + + if (tdata->dumping == false) + return (NULL); + + if (prof_dump_printf(propagate_err, + " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]%s%s\n", + tdata->thr_uid, tdata->cnt_summed.curobjs, + tdata->cnt_summed.curbytes, tdata->cnt_summed.accumobjs, + 
tdata->cnt_summed.accumbytes, + (tdata->thread_name != NULL) ? " " : "", + (tdata->thread_name != NULL) ? tdata->thread_name : "")) + return (tdata); + return (NULL); +} + +static bool +prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) { + bool ret; + + if (prof_dump_printf(propagate_err, + "heap_v2/%"PRIu64"\n" + " t*: %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + ((uint64_t)1U << lg_prof_sample), cnt_all->curobjs, + cnt_all->curbytes, cnt_all->accumobjs, cnt_all->accumbytes)) + return (true); - malloc_mutex_lock(ctx->lock); - prof_dump_ctx_cleanup_locked(ctx, ctxs); - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(&tdatas_mtx); + ret = (tdata_tree_iter(&tdatas, NULL, prof_tdata_dump_iter, + (void *)&propagate_err) != NULL); + malloc_mutex_unlock(&tdatas_mtx); + return (ret); } +/* gctx->lock is held. */ static bool -prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt, - prof_ctx_tree_t *ctxs) +prof_dump_gctx(bool propagate_err, prof_gctx_t *gctx, const prof_bt_t *bt, + prof_gctx_tree_t *gctxs) { bool ret; unsigned i; cassert(config_prof); - /* - * Current statistics can sum to 0 as a result of unmerged per thread - * statistics. Additionally, interval- and growth-triggered dumps can - * occur between the time a ctx is created and when its statistics are - * filled in. Avoid dumping any ctx that is an artifact of either - * implementation detail. - */ - malloc_mutex_lock(ctx->lock); - if ((opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) || - (opt_prof_accum && ctx->cnt_summed.accumobjs == 0)) { - assert(ctx->cnt_summed.curobjs == 0); - assert(ctx->cnt_summed.curbytes == 0); - assert(ctx->cnt_summed.accumobjs == 0); - assert(ctx->cnt_summed.accumbytes == 0); + /* Avoid dumping such gctx's that have no useful data. 
*/
+	if ((opt_prof_accum == false && gctx->cnt_summed.curobjs == 0) ||
+	    (opt_prof_accum && gctx->cnt_summed.accumobjs == 0)) {
+		assert(gctx->cnt_summed.curobjs == 0);
+		assert(gctx->cnt_summed.curbytes == 0);
+		assert(gctx->cnt_summed.accumobjs == 0);
+		assert(gctx->cnt_summed.accumbytes == 0);
 		ret = false;
 		goto label_return;
 	}
 
-	if (prof_dump_printf(propagate_err, "%"PRId64": %"PRId64
-	    " [%"PRIu64": %"PRIu64"] @",
-	    ctx->cnt_summed.curobjs, ctx->cnt_summed.curbytes,
-	    ctx->cnt_summed.accumobjs, ctx->cnt_summed.accumbytes)) {
+	if (prof_dump_printf(propagate_err, "@")) {
 		ret = true;
 		goto label_return;
 	}
-
 	for (i = 0; i < bt->len; i++) {
 		if (prof_dump_printf(propagate_err, " %#"PRIxPTR,
 		    (uintptr_t)bt->vec[i])) {
@@ -918,15 +1140,23 @@ prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt,
 		}
 	}
 
-	if (prof_dump_write(propagate_err, "\n")) {
+	if (prof_dump_printf(propagate_err,
+	    "\n"
+	    " t*: %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n",
+	    gctx->cnt_summed.curobjs, gctx->cnt_summed.curbytes,
+	    gctx->cnt_summed.accumobjs, gctx->cnt_summed.accumbytes)) {
+		ret = true;
+		goto label_return;
+	}
+
+	if (tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_dump_iter,
+	    (void *)&propagate_err) != NULL) {
 		ret = true;
 		goto label_return;
 	}
 
 	ret = false;
label_return:
-	prof_dump_ctx_cleanup_locked(ctx, ctxs);
-	malloc_mutex_unlock(ctx->lock);
 	return (ret);
 }
 
@@ -980,72 +1210,85 @@ label_return:
 }
 
 static void
-prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_nctx,
+prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_ngctx,
     const char *filename)
 {
 
 	if (cnt_all->curbytes != 0) {
-		malloc_printf("<jemalloc>: Leak summary: %"PRId64" byte%s, %"
-		    PRId64" object%s, %zu context%s\n",
+		malloc_printf("<jemalloc>: Leak summary: %"PRIu64" byte%s, %"
+		    PRIu64" object%s, %zu context%s\n",
 		    cnt_all->curbytes, (cnt_all->curbytes != 1) ? "s" : "",
 		    cnt_all->curobjs, (cnt_all->curobjs != 1) ? "s" : "",
-		    leak_nctx, (leak_nctx != 1) ? "s" : "");
+		    leak_ngctx, (leak_ngctx != 1) ? "s" : "");
 		malloc_printf(
 		    "<jemalloc>: Run pprof on \"%s\" for leak detail\n",
 		    filename);
 	}
 }
 
-static prof_ctx_t *
-prof_ctx_dump_iter(prof_ctx_tree_t *ctxs, prof_ctx_t *ctx, void *arg)
+static prof_gctx_t *
+prof_gctx_dump_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg)
 {
+	prof_gctx_t *ret;
 	bool propagate_err = *(bool *)arg;
 
-	if (prof_dump_ctx(propagate_err, ctx, &ctx->bt, ctxs))
-		return (ctx_tree_next(ctxs, ctx));
+	malloc_mutex_lock(gctx->lock);
 
-	return (NULL);
-}
-
-static prof_ctx_t *
-prof_ctx_cleanup_iter(prof_ctx_tree_t *ctxs, prof_ctx_t *ctx, void *arg)
-{
-
-	prof_dump_ctx_cleanup(ctx, ctxs);
+	if (prof_dump_gctx(propagate_err, gctx, &gctx->bt, gctxs)) {
+		ret = gctx_tree_next(gctxs, gctx);
+		goto label_return;
+	}
 
-	return (NULL);
+	ret = NULL;
+label_return:
+	malloc_mutex_unlock(gctx->lock);
+	return (ret);
 }
 
 static bool
 prof_dump(bool propagate_err, const char *filename, bool leakcheck)
 {
-	prof_tdata_t *prof_tdata;
+	prof_tdata_t *tdata;
 	prof_cnt_t cnt_all;
 	size_t tabind;
 	union {
-		prof_ctx_t *p;
+		prof_gctx_t *p;
 		void *v;
-	} ctx;
-	size_t leak_nctx;
-	prof_ctx_tree_t ctxs;
-	prof_ctx_t *cleanup_start = NULL;
+	} gctx;
+	size_t leak_ngctx;
+	prof_gctx_tree_t gctxs;
 
 	cassert(config_prof);
 
-	prof_tdata = prof_tdata_get(false);
-	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
+	tdata = prof_tdata_get(false);
+	if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 		return (true);
 
 	malloc_mutex_lock(&prof_dump_mtx);
+	prof_enter(tdata);
 
-	/* Merge per thread profile stats, and sum them in cnt_all. */
+	/*
+	 * Put gctx's in limbo and clear their counters in preparation for
+	 * summing.
+	 */
+	gctx_tree_new(&gctxs);
+	for (tabind = 0; ckh_iter(&bt2gctx, &tabind, NULL, &gctx.v) == false;)
+		prof_dump_gctx_prep(gctx.p, &gctxs);
+
+	/*
+	 * Iterate over tdatas, and for the non-expired ones snapshot their tctx
+	 * stats and merge them into the associated gctx's.
+	 */
 	memset(&cnt_all, 0, sizeof(prof_cnt_t));
-	leak_nctx = 0;
-	ctx_tree_new(&ctxs);
-	prof_enter(prof_tdata);
-	for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;)
-		prof_dump_ctx_prep(ctx.p, &cnt_all, &leak_nctx, &ctxs);
-	prof_leave(prof_tdata);
+	malloc_mutex_lock(&tdatas_mtx);
+	tdata_tree_iter(&tdatas, NULL, prof_tdata_merge_iter, (void *)&cnt_all);
+	malloc_mutex_unlock(&tdatas_mtx);
+
+	/* Merge tctx stats into gctx's. */
+	leak_ngctx = 0;
+	gctx_tree_iter(&gctxs, NULL, prof_gctx_merge_iter, (void *)&leak_ngctx);
+
+	prof_leave(tdata);
 
 	/* Create dump file. */
 	if ((prof_dump_fd = prof_dump_open(propagate_err, filename)) == -1)
@@ -1055,10 +1298,9 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck)
 	if (prof_dump_header(propagate_err, &cnt_all))
 		goto label_write_error;
 
-	/* Dump per ctx profile stats. */
-	cleanup_start = ctx_tree_iter(&ctxs, NULL, prof_ctx_dump_iter,
-	    (void *)&propagate_err);
-	if (cleanup_start != NULL)
+	/* Dump per gctx profile stats. */
+	if (gctx_tree_iter(&gctxs, NULL, prof_gctx_dump_iter,
+	    (void *)&propagate_err) != NULL)
 		goto label_write_error;
 
 	/* Dump /proc/<pid>/maps if possible. */
 	if (prof_dump_maps(propagate_err))
 		goto label_write_error;
 
 	if (prof_dump_close(propagate_err))
 		goto label_open_close_error;
 
+	gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, NULL);
 	malloc_mutex_unlock(&prof_dump_mtx);
 
 	if (leakcheck)
-		prof_leakcheck(&cnt_all, leak_nctx, filename);
+		prof_leakcheck(&cnt_all, leak_ngctx, filename);
 
 	return (false);
label_write_error:
 	prof_dump_close(propagate_err);
label_open_close_error:
-	if (cleanup_start != NULL) {
-		ctx_tree_iter(&ctxs, cleanup_start, prof_ctx_cleanup_iter,
-		    NULL);
-	}
+	gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, NULL);
 	malloc_mutex_unlock(&prof_dump_mtx);
 	return (true);
 }
@@ -1128,18 +1368,18 @@ prof_fdump(void)
 void
 prof_idump(void)
 {
-	prof_tdata_t *prof_tdata;
+	prof_tdata_t *tdata;
 	char filename[PATH_MAX + 1];
 
 	cassert(config_prof);
 
 	if (prof_booted == false)
 		return;
-	prof_tdata = prof_tdata_get(false);
-	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
+	tdata = prof_tdata_get(false);
+	if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 		return;
-	if (prof_tdata->enq) {
-		prof_tdata->enq_idump = true;
+	if (tdata->enq) {
+		tdata->enq_idump = true;
 		return;
 	}
 
@@ -1178,18 +1418,18 @@ prof_mdump(const char *filename)
 void
 prof_gdump(void)
 {
-	prof_tdata_t *prof_tdata;
+	prof_tdata_t *tdata;
 	char filename[DUMP_FILENAME_BUFSIZE];
 
 	cassert(config_prof);
 
 	if (prof_booted == false)
 		return;
-	prof_tdata = prof_tdata_get(false);
-	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
+	tdata = prof_tdata_get(false);
+	if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
 		return;
-	if (prof_tdata->enq) {
-		prof_tdata->enq_gdump = true;
+	if (tdata->enq) {
+		tdata->enq_gdump = true;
 		return;
 	}
 
@@ -1225,81 +1465,233 @@ prof_bt_keycomp(const void *k1, const void *k2)
 	return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0);
 }
 
-prof_tdata_t *
-prof_tdata_init(void)
+JEMALLOC_INLINE_C uint64_t
+prof_thr_uid_alloc(void)
+{
+
+	return 
(atomic_add_uint64(&next_thr_uid, 1) - 1); +} + +static prof_tdata_t * +prof_tdata_init_impl(uint64_t thr_uid) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); /* Initialize an empty cache for this thread. */ - prof_tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); - if (prof_tdata == NULL) + tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); + if (tdata == NULL) return (NULL); - if (ckh_new(&prof_tdata->bt2cnt, PROF_CKH_MINITEMS, + tdata->lock = prof_tdata_mutex_choose(thr_uid); + tdata->thr_uid = thr_uid; + tdata->thread_name = NULL; + tdata->state = prof_tdata_state_attached; + + if (ckh_new(&tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { - idalloc(prof_tdata); + idalloc(tdata); return (NULL); } - prof_tdata->prng_state = (uint64_t)(uintptr_t)prof_tdata; - prof_sample_threshold_update(prof_tdata); + tdata->prng_state = (uint64_t)(uintptr_t)tdata; + prof_sample_threshold_update(tdata); + + tdata->enq = false; + tdata->enq_idump = false; + tdata->enq_gdump = false; + + tdata->dumping = false; + tdata->active = true; + + prof_tdata_tsd_set(&tdata); + + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_insert(&tdatas, tdata); + malloc_mutex_unlock(&tdatas_mtx); + + return (tdata); +} + +prof_tdata_t * +prof_tdata_init(void) +{ + + return (prof_tdata_init_impl(prof_thr_uid_alloc())); +} + +prof_tdata_t * +prof_tdata_reinit(prof_tdata_t *tdata) +{ + + return (prof_tdata_init_impl(tdata->thr_uid)); +} + +/* tdata->lock must be held. */ +static bool +prof_tdata_should_destroy(prof_tdata_t *tdata) +{ + + if (tdata->state == prof_tdata_state_attached) + return (false); + if (ckh_count(&tdata->bt2tctx) != 0) + return (false); + return (true); +} + +static void +prof_tdata_destroy(prof_tdata_t *tdata) +{ + + assert(prof_tdata_should_destroy(tdata)); + + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_remove(&tdatas, tdata); + malloc_mutex_unlock(&tdatas_mtx); + + if (tdata->thread_name != NULL) + idalloc(tdata->thread_name); + ckh_delete(&tdata->bt2tctx); + idalloc(tdata); +} + +static void +prof_tdata_state_transition(prof_tdata_t *tdata, prof_tdata_state_t state) +{ + bool destroy_tdata; + + malloc_mutex_lock(tdata->lock); + if (tdata->state != state) { + tdata->state = state; + destroy_tdata = prof_tdata_should_destroy(tdata); + } else + destroy_tdata = false; + malloc_mutex_unlock(tdata->lock); + if (destroy_tdata) + prof_tdata_destroy(tdata); +} - prof_tdata->enq = false; - prof_tdata->enq_idump = false; - prof_tdata->enq_gdump = false; +static void +prof_tdata_detach(prof_tdata_t *tdata) +{ - prof_tdata_tsd_set(&prof_tdata); + prof_tdata_state_transition(tdata, prof_tdata_state_detached); +} - return (prof_tdata); +static void +prof_tdata_expire(prof_tdata_t *tdata) +{ + + prof_tdata_state_transition(tdata, prof_tdata_state_expired); +} + +static prof_tdata_t * +prof_tdata_reset_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) +{ + + prof_tdata_expire(tdata); + return (NULL); +} + +void +prof_reset(size_t lg_sample) +{ + + assert(lg_sample < (sizeof(uint64_t) << 3)); + + malloc_mutex_lock(&prof_dump_mtx); + malloc_mutex_lock(&tdatas_mtx); + + lg_prof_sample = lg_sample; + tdata_tree_iter(&tdatas, NULL, prof_tdata_reset_iter, NULL); + + malloc_mutex_unlock(&tdatas_mtx); + malloc_mutex_unlock(&prof_dump_mtx); } void prof_tdata_cleanup(void *arg) { - prof_tdata_t *prof_tdata = *(prof_tdata_t **)arg; + prof_tdata_t *tdata = *(prof_tdata_t **)arg; cassert(config_prof); - if (prof_tdata == PROF_TDATA_STATE_REINCARNATED) { + if (tdata 
== PROF_TDATA_STATE_REINCARNATED) { /* * Another destructor deallocated memory after this destructor - * was called. Reset prof_tdata to PROF_TDATA_STATE_PURGATORY - * in order to receive another callback. + * was called. Reset tdata to PROF_TDATA_STATE_PURGATORY in + * order to receive another callback. */ - prof_tdata = PROF_TDATA_STATE_PURGATORY; - prof_tdata_tsd_set(&prof_tdata); - } else if (prof_tdata == PROF_TDATA_STATE_PURGATORY) { + tdata = PROF_TDATA_STATE_PURGATORY; + prof_tdata_tsd_set(&tdata); + } else if (tdata == PROF_TDATA_STATE_PURGATORY) { /* * The previous time this destructor was called, we set the key * to PROF_TDATA_STATE_PURGATORY so that other destructors - * wouldn't cause re-creation of the prof_tdata. This time, do + * wouldn't cause re-creation of the tdata. This time, do * nothing, so that the destructor will not be called again. */ - } else if (prof_tdata != NULL) { - union { - prof_thr_cnt_t *p; - void *v; - } cnt; - size_t tabind; - - /* - * Iteratively merge cnt's into the global stats and delete - * them. - */ - for (tabind = 0; ckh_iter(&prof_tdata->bt2cnt, &tabind, NULL, - &cnt.v);) { - prof_ctx_merge(cnt.p->ctx, cnt.p); - idalloc(cnt.v); - } - ckh_delete(&prof_tdata->bt2cnt); - idalloc(prof_tdata); - prof_tdata = PROF_TDATA_STATE_PURGATORY; - prof_tdata_tsd_set(&prof_tdata); + } else if (tdata != NULL) { + prof_tdata_detach(tdata); + tdata = PROF_TDATA_STATE_PURGATORY; + prof_tdata_tsd_set(&tdata); } } +const char * +prof_thread_name_get(void) +{ + prof_tdata_t *tdata = prof_tdata_get(true); + if (tdata == NULL) + return (NULL); + return (tdata->thread_name); +} + +bool +prof_thread_name_set(const char *thread_name) +{ + prof_tdata_t *tdata; + size_t size; + char *s; + + tdata = prof_tdata_get(true); + if (tdata == NULL) + return (true); + + size = strlen(thread_name) + 1; + s = imalloc(size); + if (s == NULL) + return (true); + + memcpy(s, thread_name, size); + if (tdata->thread_name != NULL) + idalloc(tdata->thread_name); + tdata->thread_name = s; + return (false); +} + +bool +prof_thread_active_get(void) +{ + prof_tdata_t *tdata = prof_tdata_get(true); + if (tdata == NULL) + return (false); + return (tdata->active); +} + +bool +prof_thread_active_set(bool active) +{ + prof_tdata_t *tdata; + + tdata = prof_tdata_get(true); + if (tdata == NULL) + return (true); + tdata->active = active; + return (false); +} + void prof_boot0(void) { @@ -1345,10 +1737,12 @@ prof_boot2(void) if (opt_prof) { unsigned i; - if (ckh_new(&bt2ctx, PROF_CKH_MINITEMS, prof_bt_hash, + lg_prof_sample = opt_lg_prof_sample; + + if (ckh_new(&bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) return (true); - if (malloc_mutex_init(&bt2ctx_mtx)) + if (malloc_mutex_init(&bt2gctx_mtx)) return (true); if (prof_tdata_tsd_boot()) { malloc_write( @@ -1356,6 +1750,12 @@ prof_boot2(void) abort(); } + tdata_tree_new(&tdatas); + if (malloc_mutex_init(&tdatas_mtx)) + return (true); + + next_thr_uid = 0; + if (malloc_mutex_init(&prof_dump_seq_mtx)) return (true); if (malloc_mutex_init(&prof_dump_mtx)) @@ -1367,12 +1767,21 @@ prof_boot2(void) abort(); } - ctx_locks = (malloc_mutex_t *)base_alloc(PROF_NCTX_LOCKS * + gctx_locks = (malloc_mutex_t *)base_alloc(PROF_NCTX_LOCKS * sizeof(malloc_mutex_t)); - if (ctx_locks == NULL) + if (gctx_locks == NULL) return (true); for (i = 0; i < PROF_NCTX_LOCKS; i++) { - if (malloc_mutex_init(&ctx_locks[i])) + if (malloc_mutex_init(&gctx_locks[i])) + return (true); + } + + tdata_locks = (malloc_mutex_t *)base_alloc(PROF_NTDATA_LOCKS * + 
sizeof(malloc_mutex_t)); + if (tdata_locks == NULL) + return (true); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + if (malloc_mutex_init(&tdata_locks[i])) return (true); } } @@ -1397,10 +1806,10 @@ prof_prefork(void) if (opt_prof) { unsigned i; - malloc_mutex_prefork(&bt2ctx_mtx); + malloc_mutex_prefork(&bt2gctx_mtx); malloc_mutex_prefork(&prof_dump_seq_mtx); for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_prefork(&ctx_locks[i]); + malloc_mutex_prefork(&gctx_locks[i]); } } @@ -1412,9 +1821,9 @@ prof_postfork_parent(void) unsigned i; for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_postfork_parent(&ctx_locks[i]); + malloc_mutex_postfork_parent(&gctx_locks[i]); malloc_mutex_postfork_parent(&prof_dump_seq_mtx); - malloc_mutex_postfork_parent(&bt2ctx_mtx); + malloc_mutex_postfork_parent(&bt2gctx_mtx); } } @@ -1426,9 +1835,9 @@ prof_postfork_child(void) unsigned i; for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_postfork_child(&ctx_locks[i]); + malloc_mutex_postfork_child(&gctx_locks[i]); malloc_mutex_postfork_child(&prof_dump_seq_mtx); - malloc_mutex_postfork_child(&bt2ctx_mtx); + malloc_mutex_postfork_child(&bt2gctx_mtx); } } diff --git a/src/stats.c b/src/stats.c index a0eb297..db34275 100644 --- a/src/stats.c +++ b/src/stats.c @@ -441,7 +441,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } if ((err = je_mallctl("opt.prof", &bv, &bsz, NULL, 0)) == 0 && bv) { - CTL_GET("opt.lg_prof_sample", &sv, size_t); + CTL_GET("prof.lg_sample", &sv, size_t); malloc_cprintf(write_cb, cbopaque, "Average profile sample interval: %"PRIu64 " (2^%zu)\n", (((uint64_t)1U) << sv), sv); -- cgit v0.12
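
A few of the patterns in this patch merit standalone illustration. The sketches that follow are illustrative C only, with hypothetical types and names and pthreads in place of malloc_mutex_t; none of them are part of the patch. First, the nlimbo counter threaded through prof_gctx_maybe_destroy(), prof_tctx_destroy(), and prof_lookup_global(): a thread that must drop an object's lock before destroying it records its intent first, and the destroyer re-checks under the lock, so a concurrent revival cannot be left holding freed memory. A minimal sketch, assuming that once nrefs reaches zero no lookup structure can hand out new references (the real code guarantees this by removing the gctx from bt2gctx under prof_enter() before committing to the free):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

/*
 * Shared lock that outlives any obj_t, mirroring the patch's
 * gctx_locks[]/tdata_locks[] arrays; freeing obj while another thread
 * is still blocked on the lock is therefore safe.
 */
static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;

typedef struct {
	unsigned nrefs;		/* Live references. */
	unsigned nlimbo;	/* Threads intending to destroy this. */
} obj_t;

static void
obj_maybe_destroy(obj_t *obj)
{

	pthread_mutex_lock(&obj_lock);
	if (obj->nrefs == 0 && obj->nlimbo == 1) {
		pthread_mutex_unlock(&obj_lock);
		free(obj);
	} else {
		/* Lost a race with a revival; back out. */
		obj->nlimbo--;
		pthread_mutex_unlock(&obj_lock);
	}
}

static void
obj_unref(obj_t *obj)
{
	bool destroy;

	pthread_mutex_lock(&obj_lock);
	obj->nrefs--;
	if (obj->nrefs == 0) {
		/* Record intent to destroy before the lock is dropped. */
		obj->nlimbo++;
		destroy = true;
	} else
		destroy = false;
	pthread_mutex_unlock(&obj_lock);
	if (destroy)
		obj_maybe_destroy(obj);
}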
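
Second, the sample interval computation in prof_sample_threshold_update() can be exercised in isolation. The sketch below applies the same inverse-CDF draw from a geometric distribution with mean 2^lg_sample; drand48() stands in for the patch's prng64()-based generator, and linking requires -lm:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Inverse-CDF draw from a geometric distribution with mean 2^lg_sample. */
static uint64_t
bytes_until_sample(unsigned lg_sample)
{
	double u;

	if (lg_sample == 0)
		return (0);	/* Sample every allocation. */
	do {
		u = drand48();	/* Uniform in [0, 1). */
	} while (u == 0.0);	/* Guard log(0); the patch's 53-bit draw
				 * makes u == 0 vanishingly rare. */
	return ((uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << lg_sample)))) +
	    (uint64_t)1U);
}

int
main(void)
{
	int i;

	srand48(42);
	for (i = 0; i < 4; i++) {
		/* Each draw clusters around 2^19 = 524288 bytes. */
		printf("%llu\n",
		    (unsigned long long)bytes_until_sample(19));
	}
	return (0);
}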
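
Third, prof_tctx_merge_tdata() and the prof_tctx_merge_iter()/prof_tctx_finish_iter() pair drive a per-tctx state machine across each dump. Reduced to its transitions (standalone sketch; in the patch the state lives in prof_tctx_t, and purgatory is entered elsewhere when a tctx is slated for destruction while a dump has it pinned):

#include <assert.h>
#include <stdbool.h>

typedef enum {
	STATE_NOMINAL,		/* Not involved in the dump in progress. */
	STATE_DUMPING,		/* Counts snapshotted into dump_cnts. */
	STATE_PURGATORY		/* Dead; reclaimed by the finish pass. */
} tctx_state_t;

/* Merge pass (prof_tctx_merge_tdata()): pin a live tctx into the dump. */
static tctx_state_t
dump_merge(tctx_state_t state)
{

	assert(state == STATE_NOMINAL);
	return (STATE_DUMPING);
}

/*
 * Finish pass (prof_tctx_finish_iter()): dumping tctx's return to
 * nominal; purgatory tctx's are unlinked and freed; tctx's created
 * after the merge pass are still nominal and pass through untouched.
 */
static tctx_state_t
dump_finish(tctx_state_t state, bool *destroy)
{

	*destroy = (state == STATE_PURGATORY);
	return ((state == STATE_DUMPING) ? STATE_NOMINAL : state);
}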
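
Putting prof_dump_header(), prof_tdata_dump_iter(), prof_dump_gctx(), and prof_tctx_dump_iter() together, a dump now has the following shape. All values here are hypothetical: lg_prof_sample is 19, thread 0 was named "worker", and a single sampled backtrace of depth two is shown; in a real file, the /proc/<pid>/maps section written by prof_dump_maps() follows:

heap_v2/524288
 t*: 5: 4096 [0: 0]
 t0: 3: 2048 [0: 0] worker
 t1: 2: 2048 [0: 0]
@ 0x7f3a1c2b3d40 0x7f3a1c2b4e80
 t*: 2: 2048 [0: 0]
 t0: 1: 1024 [0: 0]
 t1: 1: 1024 [0: 0]

The leading "t*" line under the header sums over all tdatas, while each per-backtrace "t*" line sums the per-thread rows beneath it.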
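
prof_thr_uid_alloc() hands out thread ids with a single lock-free fetch-add; its trailing "- 1" compensates for jemalloc's atomic_add_uint64() returning the post-add value. The equivalent in portable C11 atomics, where atomic_fetch_add_explicit() returns the pre-add value and needs no adjustment:

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t next_thr_uid;

/* Returns 0, 1, 2, ... across threads without a lock. */
static uint64_t
thr_uid_alloc(void)
{

	return (atomic_fetch_add_explicit(&next_thr_uid, 1,
	    memory_order_relaxed));
}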
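
Finally, the controls backed by prof_reset(), prof_thread_name_set(), and prof_thread_active_set() are reached through the usual mallctl interface. An illustrative sequence (error handling omitted; assumes a build configured with heap profiling, an unprefixed public API — je_mallctl() otherwise — and profiling enabled at startup):

#include <stdbool.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	const char *name = "worker-0";
	bool active = true;
	size_t lg_sample, sz = sizeof(lg_sample);
	size_t new_lg_sample = 17;

	/* Name the calling thread in subsequent profile dumps. */
	mallctl("thread.prof.name", NULL, NULL, &name, sizeof(name));

	/* Toggle sampling for the calling thread. */
	mallctl("thread.prof.active", NULL, NULL, &active, sizeof(active));

	/* Read the current sample interval, 2^lg_sample bytes. */
	mallctl("prof.lg_sample", &lg_sample, &sz, NULL, 0);
	printf("lg_sample: %zu\n", lg_sample);

	/* Discard all accumulated sample data and change the interval. */
	mallctl("prof.reset", NULL, NULL, &new_lg_sample,
	    sizeof(new_lg_sample));

	return (0);
}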