Diffstat
-rw-r--r--  jemalloc/Makefile.in                         |    2
-rw-r--r--  jemalloc/include/jemalloc/internal/arena.h   |   10
-rw-r--r--  jemalloc/include/jemalloc/internal/extent.h  |    2
-rw-r--r--  jemalloc/include/jemalloc/internal/huge.h    |    4
-rw-r--r--  jemalloc/include/jemalloc/internal/prof.h    |    7
-rw-r--r--  jemalloc/include/jemalloc/internal/tcache.h  |    2
-rw-r--r--  jemalloc/src/arena.c                         |  101
-rw-r--r--  jemalloc/src/huge.c                          |   12
-rw-r--r--  jemalloc/src/jemalloc.c                      |   13
-rw-r--r--  jemalloc/src/prof.c                          |  114
-rw-r--r--  jemalloc/src/tcache.c                        |   26
11 files changed, 167 insertions, 126 deletions
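
Editor's note: the rename that runs through this diff — prof_cnt / prof_thr_cnt_t * becomes prof_ctx / prof_ctx_t * — changes what each allocation records for heap profiling: a per-backtrace context shared across threads rather than one thread's counter set. Below is a minimal sketch of the relationship the later hunks rely on; the layouts are simplified stand-ins, and only the bt, lock, cnt_merged, ctx, and cnts member names come from the patch.

/* Editorial sketch (not part of the patch): simplified stand-ins for the two
 * profiling structures this diff rewires. Real layouts differ. */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

typedef struct prof_bt_s prof_bt_t;	/* opaque backtrace key */

typedef struct {
	size_t	curobjs;
	size_t	curbytes;
} toy_cnt_t;

/* One per unique backtrace, shared by all threads (cf. prof_ctx_t). */
typedef struct {
	prof_bt_t	*bt;		/* back-link added by this patch */
	pthread_mutex_t	lock;		/* protects cnt_merged */
	toy_cnt_t	cnt_merged;	/* fallback counters */
} toy_ctx_t;

/* One per (thread, backtrace) pair (cf. prof_thr_cnt_t). */
typedef struct {
	toy_ctx_t	*ctx;		/* the shared context it feeds into */
	toy_cnt_t	cnts;		/* this thread's counters */
} toy_thr_cnt_t;

int
main(void)
{
	static toy_ctx_t ctx = {NULL, PTHREAD_MUTEX_INITIALIZER, {0, 0}};
	toy_thr_cnt_t cnt = {&ctx, {0, 0}};

	/* An allocation now gets tagged with cnt.ctx, not with &cnt. */
	printf("ctx=%p thr_cnt=%p\n", (void *)cnt.ctx, (void *)&cnt);
	return (0);
}

Storing the context on the object is what lets the deallocation paths in prof.c re-derive counters for the current thread via prof_lookup(ctx->bt), with ctx->cnt_merged as the fallback.
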
diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in
index ce70588..ac9b782 100644
--- a/jemalloc/Makefile.in
+++ b/jemalloc/Makefile.in
@@ -112,6 +112,7 @@ clean:
rm -f $(DSOS)
distclean: clean
+ rm -rf @objroot@autom4te.cache
rm -f @objroot@config.log
rm -f @objroot@config.status
rm -f @objroot@cfghdrs.stamp
@@ -120,7 +121,6 @@ distclean: clean
rm -f @cfgoutputs_out@
relclean: distclean
- rm -rf @objroot@autom4te.cache
rm -f @objroot@configure
rm -f @srcroot@VERSION
diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h
index bb4ce2a..c1955f1 100644
--- a/jemalloc/include/jemalloc/internal/arena.h
+++ b/jemalloc/include/jemalloc/internal/arena.h
@@ -98,7 +98,7 @@ struct arena_chunk_map_s {
#ifdef JEMALLOC_PROF
/* Profile counters, used for large object runs. */
- prof_thr_cnt_t *prof_cnt;
+ prof_ctx_t *prof_ctx;
#endif
/*
@@ -246,10 +246,10 @@ struct arena_bin_s {
#ifdef JEMALLOC_PROF
/*
- * Offset of first (prof_cnt_t *) in a run header for this bin's size
+ * Offset of first (prof_ctx_t *) in a run header for this bin's size
* class, or 0 if (opt_prof == false).
*/
- uint32_t cnt0_offset;
+ uint32_t ctx0_offset;
#endif
/* Offset of first region in a run for this bin's size class. */
@@ -438,8 +438,8 @@ size_t arena_salloc(const void *ptr);
#ifdef JEMALLOC_PROF
void arena_prof_promoted(const void *ptr, size_t size);
size_t arena_salloc_demote(const void *ptr);
-prof_thr_cnt_t *arena_prof_cnt_get(const void *ptr);
-void arena_prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt);
+prof_ctx_t *arena_prof_ctx_get(const void *ptr);
+void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
#endif
void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
arena_chunk_map_t *mapelm);
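
Editor's note: for small allocations the run header still carries one profiling pointer per region; only the name (ctx0_offset) and the pointee type change. A minimal sketch of how that slot is addressed follows, mirroring the pointer arithmetic in arena_prof_ctx_get()/arena_prof_ctx_set() further down; the types and offsets here are toys, not jemalloc's real layouts.

/* Editorial sketch (not part of the patch): per-region prof_ctx_t * slots. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct prof_ctx_s prof_ctx_t;	/* opaque here */

typedef struct {
	uint32_t	ctx0_offset;	/* offset of first (prof_ctx_t *) slot */
	uint32_t	reg0_offset;	/* offset of first region */
} toy_bin_t;

static prof_ctx_t **
toy_prof_ctx_slot(void *run, const toy_bin_t *bin, unsigned regind)
{

	/* Same arithmetic as the patched arena_prof_ctx_get()/_set(). */
	return ((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset +
	    (regind * sizeof(prof_ctx_t *))));
}

int
main(void)
{
	toy_bin_t bin = {64, 256};	/* illustrative offsets only */
	void *run = calloc(1, 4096);
	prof_ctx_t **slot = toy_prof_ctx_slot(run, &bin, 3);

	*slot = (prof_ctx_t *)(uintptr_t)1U;	/* sentinel used by the patch */
	printf("region 3 slot at run+%zu\n",
	    (size_t)((uintptr_t)slot - (uintptr_t)run));
	free(run);
	return (0);
}
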
diff --git a/jemalloc/include/jemalloc/internal/extent.h b/jemalloc/include/jemalloc/internal/extent.h
index 33a4e9a..6fe9702 100644
--- a/jemalloc/include/jemalloc/internal/extent.h
+++ b/jemalloc/include/jemalloc/internal/extent.h
@@ -19,7 +19,7 @@ struct extent_node_s {
#ifdef JEMALLOC_PROF
/* Profile counters, used for huge objects. */
- prof_thr_cnt_t *prof_cnt;
+ prof_ctx_t *prof_ctx;
#endif
/* Pointer to the extent that this tree node is responsible for. */
diff --git a/jemalloc/include/jemalloc/internal/huge.h b/jemalloc/include/jemalloc/internal/huge.h
index 3cf32f7..0c0582f 100644
--- a/jemalloc/include/jemalloc/internal/huge.h
+++ b/jemalloc/include/jemalloc/internal/huge.h
@@ -25,8 +25,8 @@ void *huge_ralloc(void *ptr, size_t size, size_t oldsize);
void huge_dalloc(void *ptr);
size_t huge_salloc(const void *ptr);
#ifdef JEMALLOC_PROF
-prof_thr_cnt_t *huge_prof_cnt_get(const void *ptr);
-void huge_prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt);
+prof_ctx_t *huge_prof_ctx_get(const void *ptr);
+void huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
#endif
bool huge_boot(void);
diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h
index 6e71552..fb55fb9 100644
--- a/jemalloc/include/jemalloc/internal/prof.h
+++ b/jemalloc/include/jemalloc/internal/prof.h
@@ -98,6 +98,9 @@ struct prof_thr_cnt_s {
};
struct prof_ctx_s {
+ /* Associated backtrace. */
+ prof_bt_t *bt;
+
/* Protects cnt_merged and sets_ql. */
malloc_mutex_t lock;
@@ -151,10 +154,10 @@ bool prof_init(prof_t *prof, bool master);
void prof_destroy(prof_t *prof);
prof_thr_cnt_t *prof_alloc_prep(size_t size);
-prof_thr_cnt_t *prof_cnt_get(const void *ptr);
+prof_ctx_t *prof_ctx_get(const void *ptr);
void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt);
void prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
- size_t old_size, prof_thr_cnt_t *old_cnt);
+ size_t old_size, prof_ctx_t *old_ctx);
void prof_free(const void *ptr);
void prof_idump(void);
bool prof_mdump(const char *filename);
diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h
index c76597f..fa6c53f 100644
--- a/jemalloc/include/jemalloc/internal/tcache.h
+++ b/jemalloc/include/jemalloc/internal/tcache.h
@@ -353,7 +353,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
#ifdef JEMALLOC_FILL
if (opt_junk)
- memset(ptr, 0x5a, bin->reg_size);
+ memset(ptr, 0x5a, arena->bins[binind].reg_size);
#endif
tbin = &tcache->tbins[binind];
diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c
index e74b470..e414226 100644
--- a/jemalloc/src/arena.c
+++ b/jemalloc/src/arena.c
@@ -470,10 +470,22 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk)
{
arena_avail_tree_t *runs_avail;
- while (arena->spare != NULL) {
+ /*
+ * Remove run from the appropriate runs_avail_* tree, so that the arena
+ * does not use it.
+ */
+ if ((chunk->map[arena_chunk_header_npages].bits &
+ CHUNK_MAP_DIRTY) == 0)
+ runs_avail = &arena->runs_avail_clean;
+ else
+ runs_avail = &arena->runs_avail_dirty;
+ arena_avail_tree_remove(runs_avail,
+ &chunk->map[arena_chunk_header_npages]);
+
+ if (arena->spare != NULL) {
arena_chunk_t *spare = arena->spare;
- arena->spare = NULL;
+ arena->spare = chunk;
if (spare->dirtied) {
ql_remove(&chunk->arena->chunks_dirty, spare,
link_dirty);
@@ -485,21 +497,8 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk)
#ifdef JEMALLOC_STATS
arena->stats.mapped -= chunksize;
#endif
- }
-
- /*
- * Remove run from the appropriate runs_avail_* tree, so that the arena
- * does not use it.
- */
- if ((chunk->map[arena_chunk_header_npages].bits &
- CHUNK_MAP_DIRTY) == 0)
- runs_avail = &arena->runs_avail_clean;
- else
- runs_avail = &arena->runs_avail_dirty;
- arena_avail_tree_remove(runs_avail,
- &chunk->map[arena_chunk_header_npages]);
-
- arena->spare = chunk;
+ } else
+ arena->spare = chunk;
}
static arena_run_t *
@@ -925,6 +924,18 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty)
/* Insert into runs_avail, now that coalescing is complete. */
arena_avail_tree_insert(runs_avail, &chunk->map[run_ind]);
+ if (dirty) {
+ /*
+ * Insert into chunks_dirty before potentially calling
+ * arena_chunk_dealloc(), so that chunks_dirty and
+ * arena->ndirty are consistent.
+ */
+ if (chunk->dirtied == false) {
+ ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty);
+ chunk->dirtied = true;
+ }
+ }
+
/*
* Deallocate chunk if it is now completely unused. The bit
* manipulation checks whether the first run is unallocated and extends
@@ -935,19 +946,14 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty)
arena_chunk_dealloc(arena, chunk);
/*
- * It is okay to do dirty page processing even if the chunk was
+ * It is okay to do dirty page processing here even if the chunk was
* deallocated above, since in that case it is the spare. Waiting
* until after possible chunk deallocation to do dirty processing
* allows for an old spare to be fully deallocated, thus decreasing the
* chances of spuriously crossing the dirty page purging threshold.
*/
- if (dirty) {
- if (chunk->dirtied == false) {
- ql_tail_insert(&arena->chunks_dirty, chunk, link_dirty);
- chunk->dirtied = true;
- }
+ if (dirty)
arena_maybe_purge(arena);
- }
}
static void
@@ -1198,7 +1204,7 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
uint32_t try_nregs, good_nregs;
uint32_t try_hdr_size, good_hdr_size;
#ifdef JEMALLOC_PROF
- uint32_t try_cnt0_offset, good_cnt0_offset;
+ uint32_t try_ctx0_offset, good_ctx0_offset;
#endif
uint32_t try_reg0_offset, good_reg0_offset;
@@ -1225,11 +1231,11 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
if (opt_prof && prof_promote == false) {
/* Pad to a quantum boundary. */
try_hdr_size = QUANTUM_CEILING(try_hdr_size);
- try_cnt0_offset = try_hdr_size;
- /* Add space for one (prof_thr_cnt_t *) per region. */
- try_hdr_size += try_nregs * sizeof(prof_thr_cnt_t *);
+ try_ctx0_offset = try_hdr_size;
+ /* Add space for one (prof_ctx_t *) per region. */
+ try_hdr_size += try_nregs * sizeof(prof_ctx_t *);
} else
- try_cnt0_offset = 0;
+ try_ctx0_offset = 0;
#endif
try_reg0_offset = try_run_size - (try_nregs * bin->reg_size);
} while (try_hdr_size > try_reg0_offset);
@@ -1243,7 +1249,7 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
good_nregs = try_nregs;
good_hdr_size = try_hdr_size;
#ifdef JEMALLOC_PROF
- good_cnt0_offset = try_cnt0_offset;
+ good_ctx0_offset = try_ctx0_offset;
#endif
good_reg0_offset = try_reg0_offset;
@@ -1258,13 +1264,12 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
if (opt_prof && prof_promote == false) {
/* Pad to a quantum boundary. */
try_hdr_size = QUANTUM_CEILING(try_hdr_size);
- try_cnt0_offset = try_hdr_size;
+ try_ctx0_offset = try_hdr_size;
/*
- * Add space for one (prof_thr_cnt_t *) per
- * region.
+ * Add space for one (prof_ctx_t *) per region.
*/
try_hdr_size += try_nregs *
- sizeof(prof_thr_cnt_t *);
+ sizeof(prof_ctx_t *);
}
#endif
try_reg0_offset = try_run_size - (try_nregs *
@@ -1282,7 +1287,7 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
bin->run_size = good_run_size;
bin->nregs = good_nregs;
#ifdef JEMALLOC_PROF
- bin->cnt0_offset = good_cnt0_offset;
+ bin->ctx0_offset = good_ctx0_offset;
#endif
bin->reg0_offset = good_reg0_offset;
@@ -1639,10 +1644,10 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
return (regind);
}
-prof_thr_cnt_t *
-arena_prof_cnt_get(const void *ptr)
+prof_ctx_t *
+arena_prof_ctx_get(const void *ptr)
{
- prof_thr_cnt_t *ret;
+ prof_ctx_t *ret;
arena_chunk_t *chunk;
size_t pageind, mapbits;
@@ -1655,7 +1660,7 @@ arena_prof_cnt_get(const void *ptr)
assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
if ((mapbits & CHUNK_MAP_LARGE) == 0) {
if (prof_promote)
- ret = (prof_thr_cnt_t *)(uintptr_t)1U;
+ ret = (prof_ctx_t *)(uintptr_t)1U;
else {
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
(uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
@@ -1665,18 +1670,18 @@ arena_prof_cnt_get(const void *ptr)
assert(run->magic == ARENA_RUN_MAGIC);
regind = arena_run_regind(run, bin, ptr, bin->reg_size);
- ret = *(prof_thr_cnt_t **)((uintptr_t)run +
- bin->cnt0_offset + (regind *
- sizeof(prof_thr_cnt_t *)));
+ ret = *(prof_ctx_t **)((uintptr_t)run +
+ bin->ctx0_offset + (regind *
+ sizeof(prof_ctx_t *)));
}
} else
- ret = chunk->map[pageind].prof_cnt;
+ ret = chunk->map[pageind].prof_ctx;
return (ret);
}
void
-arena_prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
+arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
arena_chunk_t *chunk;
size_t pageind, mapbits;
@@ -1699,12 +1704,12 @@ arena_prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
assert(run->magic == ARENA_RUN_MAGIC);
regind = arena_run_regind(run, bin, ptr, bin->reg_size);
- *((prof_thr_cnt_t **)((uintptr_t)run + bin->cnt0_offset
- + (regind * sizeof(prof_thr_cnt_t *)))) = cnt;
+ *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset
+ + (regind * sizeof(prof_ctx_t *)))) = ctx;
} else
- assert((uintptr_t)cnt == (uintptr_t)1U);
+ assert((uintptr_t)ctx == (uintptr_t)1U);
} else
- chunk->map[pageind].prof_cnt = cnt;
+ chunk->map[pageind].prof_ctx = ctx;
}
#endif
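
Editor's note: the arena_chunk_dealloc() hunks above replace the old spare-handling loop with a single swap: the chunk's free run is first removed from the appropriate runs_avail_* tree, and the newly empty chunk then either replaces the existing spare (whose memory is fully released) or becomes the spare itself. A minimal sketch of that one-deep spare policy follows; the tree removal, dirty-list handling, and unmapping are reduced to stubs.

/* Editorial sketch (not part of the patch): one-deep spare-chunk cache. */
#include <stdio.h>
#include <stdlib.h>

typedef struct {
	void	*mem;	/* stands in for the chunk's pages */
} toy_chunk_t;

typedef struct {
	toy_chunk_t	*spare;	/* at most one retained empty chunk */
} toy_arena_t;

/* Stub for arena_avail_tree_remove(runs_avail, &chunk->map[...]). */
static void
toy_runs_avail_remove(toy_chunk_t *chunk)
{

	(void)chunk;
}

static void
toy_chunk_dealloc(toy_arena_t *arena, toy_chunk_t *chunk)
{

	/* First make the chunk's free run unavailable to the arena. */
	toy_runs_avail_remove(chunk);

	if (arena->spare != NULL) {
		toy_chunk_t *spare = arena->spare;

		arena->spare = chunk;	/* the new empty chunk becomes spare */
		free(spare->mem);	/* the old spare is fully released */
		free(spare);
	} else
		arena->spare = chunk;
}

int
main(void)
{
	toy_arena_t arena = {NULL};
	toy_chunk_t *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

	a->mem = malloc(16);
	b->mem = malloc(16);
	toy_chunk_dealloc(&arena, a);	/* a is retained as the spare */
	toy_chunk_dealloc(&arena, b);	/* b replaces a; a is released */
	printf("spare == b: %d\n", arena.spare == b);
	free(b->mem);
	free(b);
	return (0);
}
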
diff --git a/jemalloc/src/huge.c b/jemalloc/src/huge.c
index d35aa5c..49962ea 100644
--- a/jemalloc/src/huge.c
+++ b/jemalloc/src/huge.c
@@ -241,10 +241,10 @@ huge_salloc(const void *ptr)
}
#ifdef JEMALLOC_PROF
-prof_thr_cnt_t *
-huge_prof_cnt_get(const void *ptr)
+prof_ctx_t *
+huge_prof_ctx_get(const void *ptr)
{
- prof_thr_cnt_t *ret;
+ prof_ctx_t *ret;
extent_node_t *node, key;
malloc_mutex_lock(&huge_mtx);
@@ -254,7 +254,7 @@ huge_prof_cnt_get(const void *ptr)
node = extent_tree_ad_search(&huge, &key);
assert(node != NULL);
- ret = node->prof_cnt;
+ ret = node->prof_ctx;
malloc_mutex_unlock(&huge_mtx);
@@ -262,7 +262,7 @@ huge_prof_cnt_get(const void *ptr)
}
void
-huge_prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
+huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
extent_node_t *node, key;
@@ -273,7 +273,7 @@ huge_prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
node = extent_tree_ad_search(&huge, &key);
assert(node != NULL);
- node->prof_cnt = cnt;
+ node->prof_ctx = ctx;
malloc_mutex_unlock(&huge_mtx);
}
diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c
index e01de0d..aeab140 100644
--- a/jemalloc/src/jemalloc.c
+++ b/jemalloc/src/jemalloc.c
@@ -1060,7 +1060,8 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
void *ret;
#ifdef JEMALLOC_PROF
size_t old_size;
- prof_thr_cnt_t *cnt, *old_cnt;
+ prof_thr_cnt_t *cnt;
+ prof_ctx_t *old_ctx;
#endif
if (size == 0) {
@@ -1074,7 +1075,7 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
#ifdef JEMALLOC_PROF
if (opt_prof) {
old_size = isalloc(ptr);
- old_cnt = prof_cnt_get(ptr);
+ old_ctx = prof_ctx_get(ptr);
cnt = NULL;
}
#endif
@@ -1083,7 +1084,7 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
#ifdef JEMALLOC_PROF
else if (opt_prof) {
old_size = 0;
- old_cnt = NULL;
+ old_ctx = NULL;
cnt = NULL;
}
#endif
@@ -1100,7 +1101,7 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
#ifdef JEMALLOC_PROF
if (opt_prof) {
old_size = isalloc(ptr);
- old_cnt = prof_cnt_get(ptr);
+ old_ctx = prof_ctx_get(ptr);
if ((cnt = prof_alloc_prep(size)) == NULL) {
ret = NULL;
goto OOM;
@@ -1133,7 +1134,7 @@ OOM:
#ifdef JEMALLOC_PROF
if (opt_prof) {
old_size = 0;
- old_cnt = NULL;
+ old_ctx = NULL;
}
#endif
if (malloc_init()) {
@@ -1181,7 +1182,7 @@ RETURN:
#endif
#ifdef JEMALLOC_PROF
if (opt_prof)
- prof_realloc(ret, cnt, ptr, old_size, old_cnt);
+ prof_realloc(ret, cnt, ptr, old_size, old_ctx);
#endif
return (ret);
}
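
Editor's note: in the realloc() path, the old object's context and size must be captured before the reallocation (which may free or move the old object), while the new allocation's per-thread counter comes from prof_alloc_prep(); both are then handed to prof_realloc(). A minimal sketch of that ordering follows, with every profiling hook reduced to a stub (names mirror the patch, bodies are placeholders).

/* Editorial sketch (not part of the patch): ordering in the realloc path. */
#include <stddef.h>
#include <stdlib.h>

typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;

/* Stubs standing in for the real hooks. */
static size_t isalloc_stub(const void *ptr) { (void)ptr; return (0); }
static prof_ctx_t *prof_ctx_get_stub(const void *ptr) { (void)ptr; return (NULL); }
static prof_thr_cnt_t *prof_alloc_prep_stub(size_t size) { (void)size; return (NULL); }
static void prof_realloc_stub(const void *ptr, prof_thr_cnt_t *cnt,
    const void *old_ptr, size_t old_size, prof_ctx_t *old_ctx)
{ (void)ptr; (void)cnt; (void)old_ptr; (void)old_size; (void)old_ctx; }

static void *
toy_realloc(void *ptr, size_t size)
{
	void *ret;
	size_t old_size;
	prof_thr_cnt_t *cnt;
	prof_ctx_t *old_ctx;

	/* Capture the old object's metadata before it can be freed/moved. */
	old_size = isalloc_stub(ptr);
	old_ctx = prof_ctx_get_stub(ptr);
	cnt = prof_alloc_prep_stub(size);

	ret = realloc(ptr, size);
	if (ret != NULL)
		prof_realloc_stub(ret, cnt, ptr, old_size, old_ctx);
	return (ret);
}

int
main(void)
{
	void *p = toy_realloc(malloc(8), 16);

	free(p);
	return (0);
}
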
diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c
index 6326188..93904b8 100644
--- a/jemalloc/src/prof.c
+++ b/jemalloc/src/prof.c
@@ -48,7 +48,7 @@ static malloc_mutex_t bt2ctx_mtx;
static __thread ckh_t *bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec"));
/*
- * Same contents as b2cnt, but initialized such that the TSD destructor is
+ * Same contents as b2cnt_tls, but initialized such that the TSD destructor is
* called when a thread exits, so that bt2cnt_tls contents can be merged,
* unlinked, and deallocated.
*/
@@ -100,7 +100,7 @@ static _Unwind_Reason_Code prof_unwind_callback(
#endif
static void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
-static void prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt);
+static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
static bool prof_flush(bool propagate_err);
static bool prof_write(const char *s, bool propagate_err);
static void prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
@@ -450,6 +450,7 @@ prof_lookup(prof_bt_t *bt)
return (NULL);
}
bt2cnt_tls = bt2cnt;
+ pthread_setspecific(bt2cnt_tsd, bt2cnt);
}
if (ckh_search(bt2cnt, bt, NULL, (void **)&ret)) {
@@ -475,6 +476,7 @@ prof_lookup(prof_bt_t *bt)
idalloc(ctx);
return (NULL);
}
+ ctx->bt = btkey;
if (malloc_mutex_init(&ctx->lock)) {
prof_leave();
idalloc(btkey);
@@ -580,10 +582,10 @@ prof_alloc_prep(size_t size)
return (ret);
}
-prof_thr_cnt_t *
-prof_cnt_get(const void *ptr)
+prof_ctx_t *
+prof_ctx_get(const void *ptr)
{
- prof_thr_cnt_t *ret;
+ prof_ctx_t *ret;
arena_chunk_t *chunk;
assert(ptr != NULL);
@@ -593,15 +595,15 @@ prof_cnt_get(const void *ptr)
/* Region. */
assert(chunk->arena->magic == ARENA_MAGIC);
- ret = arena_prof_cnt_get(ptr);
+ ret = arena_prof_ctx_get(ptr);
} else
- ret = huge_prof_cnt_get(ptr);
+ ret = huge_prof_ctx_get(ptr);
return (ret);
}
static void
-prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
+prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
arena_chunk_t *chunk;
@@ -612,9 +614,9 @@ prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
/* Region. */
assert(chunk->arena->magic == ARENA_MAGIC);
- arena_prof_cnt_set(ptr, cnt);
+ arena_prof_ctx_set(ptr, ctx);
} else
- huge_prof_cnt_set(ptr, cnt);
+ huge_prof_ctx_set(ptr, ctx);
}
static inline void
@@ -649,10 +651,11 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
assert(ptr != NULL);
- prof_cnt_set(ptr, cnt);
prof_sample_accum_update(size);
if ((uintptr_t)cnt > (uintptr_t)1U) {
+ prof_ctx_set(ptr, cnt->ctx);
+
cnt->epoch++;
/*********/
mb_write();
@@ -668,30 +671,49 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt)
/*********/
mb_write();
/*********/
- }
+ } else
+ prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}
void
prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
- size_t old_size, prof_thr_cnt_t *old_cnt)
+ size_t old_size, prof_ctx_t *old_ctx)
{
size_t size = isalloc(ptr);
+ prof_thr_cnt_t *told_cnt;
- if (ptr != NULL) {
- prof_cnt_set(ptr, cnt);
+ if (ptr != NULL)
prof_sample_accum_update(size);
- }
- if ((uintptr_t)old_cnt > (uintptr_t)1U)
- old_cnt->epoch++;
- if ((uintptr_t)cnt > (uintptr_t)1U)
+ if ((uintptr_t)old_ctx > (uintptr_t)1U) {
+ told_cnt = prof_lookup(old_ctx->bt);
+ if (told_cnt == NULL) {
+ /*
+ * It's too late to propagate OOM for this realloc(),
+ * so operate directly on old_cnt->ctx->cnt_merged.
+ */
+ malloc_mutex_lock(&old_ctx->lock);
+ old_ctx->cnt_merged.curobjs--;
+ old_ctx->cnt_merged.curbytes -= old_size;
+ malloc_mutex_unlock(&old_ctx->lock);
+ told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
+ }
+ } else
+ told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
+
+ if ((uintptr_t)told_cnt > (uintptr_t)1U)
+ told_cnt->epoch++;
+ if ((uintptr_t)cnt > (uintptr_t)1U) {
+ prof_ctx_set(ptr, cnt->ctx);
cnt->epoch++;
+ } else
+ prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
/*********/
mb_write();
/*********/
- if ((uintptr_t)old_cnt > (uintptr_t)1U) {
- old_cnt->cnts.curobjs--;
- old_cnt->cnts.curbytes -= old_size;
+ if ((uintptr_t)told_cnt > (uintptr_t)1U) {
+ told_cnt->cnts.curobjs--;
+ told_cnt->cnts.curbytes -= old_size;
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++;
@@ -702,8 +724,8 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
/*********/
mb_write();
/*********/
- if ((uintptr_t)old_cnt > (uintptr_t)1U)
- old_cnt->epoch++;
+ if ((uintptr_t)told_cnt > (uintptr_t)1U)
+ told_cnt->epoch++;
if ((uintptr_t)cnt > (uintptr_t)1U)
cnt->epoch++;
/*********/
@@ -713,24 +735,36 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr,
void
prof_free(const void *ptr)
{
- prof_thr_cnt_t *cnt = prof_cnt_get(ptr);
+ prof_ctx_t *ctx = prof_ctx_get(ptr);
- if ((uintptr_t)cnt > (uintptr_t)1) {
+ if ((uintptr_t)ctx > (uintptr_t)1) {
size_t size = isalloc(ptr);
-
- cnt->epoch++;
- /*********/
- mb_write();
- /*********/
- cnt->cnts.curobjs--;
- cnt->cnts.curbytes -= size;
- /*********/
- mb_write();
- /*********/
- cnt->epoch++;
- /*********/
- mb_write();
- /*********/
+ prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);
+
+ if (tcnt != NULL) {
+ tcnt->epoch++;
+ /*********/
+ mb_write();
+ /*********/
+ tcnt->cnts.curobjs--;
+ tcnt->cnts.curbytes -= size;
+ /*********/
+ mb_write();
+ /*********/
+ tcnt->epoch++;
+ /*********/
+ mb_write();
+ /*********/
+ } else {
+ /*
+ * OOM during free() cannot be propagated, so operate
+ * directly on cnt->ctx->cnt_merged.
+ */
+ malloc_mutex_lock(&ctx->lock);
+ ctx->cnt_merged.curobjs--;
+ ctx->cnt_merged.curbytes -= size;
+ malloc_mutex_unlock(&ctx->lock);
+ }
}
}
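
Editor's note: prof_free() no longer reads a per-thread counter off the object; it reads the shared context and re-runs prof_lookup(ctx->bt) for the current thread, falling back to ctx->cnt_merged under the context lock when the lookup fails (OOM cannot be reported from free()). A minimal sketch of that fallback follows, with the lookup reduced to a stub that can be forced to fail and the epoch/mb_write() bracketing omitted.

/* Editorial sketch (not part of the patch): the free-path fallback. */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
	size_t	curobjs;
	size_t	curbytes;
} toy_cnt_t;

typedef struct {
	pthread_mutex_t	lock;		/* protects cnt_merged */
	toy_cnt_t	cnt_merged;	/* shared fallback counters */
} toy_ctx_t;

typedef struct {
	toy_cnt_t	cnts;		/* this thread's counters */
} toy_thr_cnt_t;

/* Stub for prof_lookup(ctx->bt); fail != 0 simulates the OOM case. */
static toy_thr_cnt_t *
toy_lookup(toy_ctx_t *ctx, int fail)
{
	static toy_thr_cnt_t tcnt;

	(void)ctx;
	return (fail ? NULL : &tcnt);
}

static void
toy_prof_free(toy_ctx_t *ctx, size_t size, int fail_lookup)
{
	toy_thr_cnt_t *tcnt = toy_lookup(ctx, fail_lookup);

	if (tcnt != NULL) {
		/* Fast path: adjust this thread's own counters. */
		tcnt->cnts.curobjs--;
		tcnt->cnts.curbytes -= size;
	} else {
		/* OOM cannot be reported from free(); use merged counters. */
		pthread_mutex_lock(&ctx->lock);
		ctx->cnt_merged.curobjs--;
		ctx->cnt_merged.curbytes -= size;
		pthread_mutex_unlock(&ctx->lock);
	}
}

int
main(void)
{
	static toy_ctx_t ctx = {PTHREAD_MUTEX_INITIALIZER, {1, 64}};

	toy_prof_free(&ctx, 64, 1);	/* force the fallback path */
	printf("merged: %zu objs, %zu bytes\n",
	    ctx.cnt_merged.curobjs, ctx.cnt_merged.curbytes);
	return (0);
}
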
diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c
index ce6ec99..ace24ce 100644
--- a/jemalloc/src/tcache.c
+++ b/jemalloc/src/tcache.c
@@ -55,12 +55,14 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem
{
void *flush, *deferred, *ptr;
unsigned i, nflush, ndeferred;
+ bool first_pass;
assert(binind < nbins);
assert(rem <= tbin->ncached);
+ assert(tbin->ncached > 0 || tbin->avail == NULL);
- for (flush = tbin->avail, nflush = tbin->ncached - rem; flush != NULL;
- flush = deferred, nflush = ndeferred) {
+ for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass =
+ true; flush != NULL; flush = deferred, nflush = ndeferred) {
/* Lock the arena bin associated with the first object. */
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush);
arena_t *arena = chunk->arena;
@@ -110,12 +112,9 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem
}
malloc_mutex_unlock(&bin->lock);
- if (flush != NULL) {
- /*
- * This was the first pass, and rem cached objects
- * remain.
- */
+ if (first_pass) {
tbin->avail = flush;
+ first_pass = false;
}
}
@@ -133,12 +132,14 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem
{
void *flush, *deferred, *ptr;
unsigned i, nflush, ndeferred;
+ bool first_pass;
assert(binind < nhbins);
assert(rem <= tbin->ncached);
+ assert(tbin->ncached > 0 || tbin->avail == NULL);
- for (flush = tbin->avail, nflush = tbin->ncached - rem; flush != NULL;
- flush = deferred, nflush = ndeferred) {
+ for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass =
+ true; flush != NULL; flush = deferred, nflush = ndeferred) {
/* Lock the arena associated with the first object. */
arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush);
arena_t *arena = chunk->arena;
@@ -183,12 +184,9 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem
}
malloc_mutex_unlock(&arena->lock);
- if (flush != NULL) {
- /*
- * This was the first pass, and rem cached objects
- * remain.
- */
+ if (first_pass) {
tbin->avail = flush;
+ first_pass = false;
}
}
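
Editor's note: both flush routines now track the first pass with an explicit bool rather than inferring it from the loop variable, so tbin->avail is recorded exactly once, on the first trip through the multi-pass flush loop. A trivial illustration of the idiom follows; nothing here models the real deferred-object bookkeeping.

/* Editorial sketch (not part of the patch): the explicit first-pass flag. */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	const char *passes[] = {"pass0", "pass1", "pass2", NULL};
	const char *recorded = NULL;
	bool first_pass;
	unsigned i;

	for (i = 0, first_pass = true; passes[i] != NULL; i++) {
		/* ...per-pass flushing work elided... */
		if (first_pass) {
			/* Record state exactly once, as tbin->avail now is. */
			recorded = passes[i];
			first_pass = false;
		}
	}
	printf("recorded on %s\n", recorded);
	return (0);
}
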