From be8e59f5a64ef775c9694aee0d6a87d92336d303 Mon Sep 17 00:00:00 2001 From: Ben Maurer Date: Sat, 5 Apr 2014 15:59:08 -0700 Subject: Don't dereference chunk->arena in free() hot path When you call free(), we load chunk->arena even though that data isn't used on the tcache hot path. In profiling some FB applications, I found that ~30% of the dTLB misses in the free() function come from this line. With 4 MB chunks, the arena_chunk_t->map is ~32 KB (1024 pages in the chunk, four 8-byte pointers in arena_chunk_map_t). This means there's only a 1/8 chance of the page containing chunk->arena also containing the map bits. --- include/jemalloc/internal/arena.h | 11 ++++------- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- src/jemalloc.c | 2 +- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 9d000c0..b899888 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -495,8 +495,7 @@ prof_ctx_t *arena_prof_ctx_get(const void *ptr); void arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx); void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); size_t arena_salloc(const void *ptr, bool demote); -void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr, - bool try_tcache); +void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) @@ -1022,13 +1021,11 @@ arena_salloc(const void *ptr, bool demote) } JEMALLOC_ALWAYS_INLINE void -arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr, bool try_tcache) +arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) { size_t pageind, mapbits; tcache_t *tcache; - assert(arena != NULL); - assert(chunk->arena == arena); assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); @@ -1043,7 +1040,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr, bool try_tcache) binind = arena_ptr_small_binind_get(ptr, mapbits); tcache_dalloc_small(tcache, ptr, binind); } else - arena_dalloc_small(arena, chunk, ptr, pageind); + arena_dalloc_small(chunk->arena, chunk, ptr, pageind); } else { size_t size = arena_mapbits_large_size_get(chunk, pageind); @@ -1053,7 +1050,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr, bool try_tcache) tcache_get(false)) != NULL) { tcache_dalloc_large(tcache, ptr, size); } else - arena_dalloc_large(arena, chunk, ptr); + arena_dalloc_large(chunk->arena, chunk, ptr); } } # endif /* JEMALLOC_ARENA_INLINE_B */ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 574bbb1..9c79ae0 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -895,7 +895,7 @@ idalloct(void *ptr, bool try_tcache) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) - arena_dalloc(chunk->arena, chunk, ptr, try_tcache); + arena_dalloc(chunk, ptr, try_tcache); else huge_dalloc(ptr, true); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 204778b..558dbb2 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2103,7 +2103,7 @@ a0free(void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) - arena_dalloc(chunk->arena, chunk, ptr, false); + arena_dalloc(chunk, ptr, false); else huge_dalloc(ptr, true); } -- cgit v0.12 From f9ff60346d7c25ad653ea062e496a5d0864233b2 Mon Sep 17 00:00:00 2001 From: Ben Maurer Date: Sun, 6 Apr 2014
13:24:16 -0700 Subject: refactoring for bits splitting --- src/arena.c | 76 ++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/src/arena.c b/src/arena.c index dad707b..3cb6260 100644 --- a/src/arena.c +++ b/src/arena.c @@ -53,6 +53,22 @@ static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, /******************************************************************************/ +JEMALLOC_INLINE_C size_t +arena_mapelm_to_pageind(arena_chunk_map_t *mapelm) +{ + uintptr_t map_offset = + CHUNK_ADDR2OFFSET(mapelm) - offsetof(arena_chunk_t, map); + + return ((map_offset / sizeof(arena_chunk_map_t)) + map_bias); +} + +JEMALLOC_INLINE_C size_t +arena_mapelm_to_bits(arena_chunk_map_t *mapelm) +{ + + return (mapelm->bits); +} + static inline int arena_run_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) { @@ -73,26 +89,19 @@ static inline int arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) { int ret; - size_t a_size = a->bits & ~PAGE_MASK; - size_t b_size = b->bits & ~PAGE_MASK; - - ret = (a_size > b_size) - (a_size < b_size); - if (ret == 0) { - uintptr_t a_mapelm, b_mapelm; + size_t a_size; + size_t b_size = arena_mapelm_to_bits(b) & ~PAGE_MASK; + uintptr_t a_mapelm = (uintptr_t)a; + uintptr_t b_mapelm = (uintptr_t)b; - if ((a->bits & CHUNK_MAP_KEY) != CHUNK_MAP_KEY) - a_mapelm = (uintptr_t)a; - else { - /* - * Treat keys as though they are lower than anything - * else. - */ - a_mapelm = 0; - } - b_mapelm = (uintptr_t)b; + if (a_mapelm & CHUNK_MAP_KEY) + a_size = a_mapelm & ~PAGE_MASK; + else + a_size = arena_mapelm_to_bits(a) & ~PAGE_MASK; + ret = (a_size > b_size) - (a_size < b_size); + if (ret == 0 && (!(a_mapelm & CHUNK_MAP_KEY))) ret = (a_mapelm > b_mapelm) - (a_mapelm < b_mapelm); - } return (ret); } @@ -663,15 +672,14 @@ static arena_run_t * arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) { arena_run_t *run; - arena_chunk_map_t *mapelm, key; + arena_chunk_map_t *mapelm; + arena_chunk_map_t *key; - key.bits = size | CHUNK_MAP_KEY; - mapelm = arena_avail_tree_nsearch(&arena->runs_avail, &key); + key = (arena_chunk_map_t *)(size | CHUNK_MAP_KEY); + mapelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = (((uintptr_t)mapelm - - (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) - + map_bias; + size_t pageind = arena_mapelm_to_pageind(mapelm); run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << LG_PAGE)); @@ -718,15 +726,14 @@ static arena_run_t * arena_run_alloc_small_helper(arena_t *arena, size_t size, size_t binind) { arena_run_t *run; - arena_chunk_map_t *mapelm, key; + arena_chunk_map_t *mapelm; + arena_chunk_map_t *key; - key.bits = size | CHUNK_MAP_KEY; - mapelm = arena_avail_tree_nsearch(&arena->runs_avail, &key); + key = (arena_chunk_map_t *)(size | CHUNK_MAP_KEY); + mapelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (mapelm != NULL) { arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = (((uintptr_t)mapelm - - (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) - + map_bias; + size_t pageind = arena_mapelm_to_pageind(mapelm); run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << LG_PAGE)); @@ -897,8 +904,7 @@ arena_chunk_purge_stashed(arena_t *arena, arena_chunk_t *chunk, bool unzeroed; size_t flag_unzeroed, i; - pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t)) + map_bias; + pageind = 
arena_mapelm_to_pageind(mapelm); npages = arena_mapbits_large_size_get(chunk, pageind) >> LG_PAGE; assert(pageind + npages <= chunk_npages); @@ -942,8 +948,7 @@ arena_chunk_unstash_purged(arena_t *arena, arena_chunk_t *chunk, mapelm = ql_first(mapelms)) { arena_run_t *run; - pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t)) + map_bias; + pageind = arena_mapelm_to_pageind(mapelm); run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << LG_PAGE)); ql_remove(mapelms, mapelm, u.ql_link); @@ -1307,8 +1312,7 @@ arena_bin_runs_first(arena_bin_t *bin) arena_run_t *run; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = ((((uintptr_t)mapelm - (uintptr_t)chunk->map) / - sizeof(arena_chunk_map_t))) + map_bias; + pageind = arena_mapelm_to_pageind(mapelm); run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - arena_mapbits_small_runind_get(chunk, pageind)) << LG_PAGE)); @@ -1882,7 +1886,7 @@ arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - arena_mapbits_small_runind_get(chunk, pageind)) << LG_PAGE)); bin = run->bin; - binind = arena_ptr_small_binind_get(ptr, mapelm->bits); + binind = arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, pageind)); bin_info = &arena_bin_info[binind]; if (config_fill || config_stats) size = bin_info->reg_size; -- cgit v0.12 From 9b0cbf0850b130a9b0a8c58bd10b2926b2083510 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 11 Apr 2014 14:24:51 -0700 Subject: Remove support for non-prof-promote heap profiling metadata. Make promotion of sampled small objects to large objects mandatory, so that profiling metadata can always be stored in the chunk map, rather than requiring one pointer per small region in each small-region page run. In practice the non-prof-promote code was only useful when using jemalloc to track all objects and report them as leaks at program exit. However, Valgrind is at least as good a tool for this particular use case. Furthermore, the non-prof-promote code is getting in the way of some optimizations that will make heap profiling much cheaper for the predominant use case (sampling a small representative proportion of all allocations). --- include/jemalloc/internal/arena.h | 66 ++++----------------------- include/jemalloc/internal/private_symbols.txt | 1 - include/jemalloc/internal/prof.h | 20 +++----- include/jemalloc/internal/size_classes.sh | 5 +- include/jemalloc/internal/tcache.h | 2 +- src/arena.c | 21 --------- src/jemalloc.c | 16 +++---- src/prof.c | 7 +-- 8 files changed, 28 insertions(+), 110 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index b899888..0e14c2c 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -110,7 +110,6 @@ struct arena_chunk_map_s { * p : run page offset * s : run size * n : binind for size class; large objects set these to BININD_INVALID - * except for promoted allocations (see prof_promote) * x : don't care * - : 0 * + : 1 @@ -216,8 +215,6 @@ struct arena_run_s { * | ... | * bitmap_offset | bitmap | * | ... | - * ctx0_offset | ctx map | - * | ... | * |--------------------| * | redzone | * reg0_offset | region 0 | @@ -270,12 +267,6 @@ struct arena_bin_info_s { */ bitmap_info_t bitmap_info; - /* - * Offset of first (prof_ctx_t *) in a run header for this bin's size - * class, or 0 if (config_prof == false || opt_prof == false). 
- */ - uint32_t ctx0_offset; - /* Offset of first region in a run for this bin's size class. */ uint32_t reg0_offset; }; @@ -492,7 +483,7 @@ size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr); prof_ctx_t *arena_prof_ctx_get(const void *ptr); -void arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx); +void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); @@ -879,31 +870,16 @@ arena_prof_ctx_get(const void *ptr) pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) { - if (prof_promote) - ret = (prof_ctx_t *)(uintptr_t)1U; - else { - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)((pageind - (mapbits >> LG_PAGE)) << - LG_PAGE)); - size_t binind = arena_ptr_small_binind_get(ptr, - mapbits); - arena_bin_info_t *bin_info = &arena_bin_info[binind]; - unsigned regind; - - regind = arena_run_regind(run, bin_info, ptr); - ret = *(prof_ctx_t **)((uintptr_t)run + - bin_info->ctx0_offset + (regind * - sizeof(prof_ctx_t *))); - } - } else + if ((mapbits & CHUNK_MAP_LARGE) == 0) + ret = (prof_ctx_t *)(uintptr_t)1U; + else ret = arena_mapp_get(chunk, pageind)->prof_ctx; return (ret); } JEMALLOC_INLINE void -arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx) +arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) { arena_chunk_t *chunk; size_t pageind; @@ -916,31 +892,8 @@ arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx) pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - if (usize > SMALL_MAXCLASS || (prof_promote && - ((uintptr_t)ctx != (uintptr_t)1U || arena_mapbits_large_get(chunk, - pageind) != 0))) { - assert(arena_mapbits_large_get(chunk, pageind) != 0); + if (arena_mapbits_large_get(chunk, pageind) != 0) arena_mapp_get(chunk, pageind)->prof_ctx = ctx; - } else { - assert(arena_mapbits_large_get(chunk, pageind) == 0); - if (prof_promote == false) { - size_t mapbits = arena_mapbits_get(chunk, pageind); - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)((pageind - (mapbits >> LG_PAGE)) << - LG_PAGE)); - size_t binind; - arena_bin_info_t *bin_info; - unsigned regind; - - binind = arena_ptr_small_binind_get(ptr, mapbits); - bin_info = &arena_bin_info[binind]; - regind = arena_run_regind(run, bin_info, ptr); - - *((prof_ctx_t **)((uintptr_t)run + - bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t - *)))) = ctx; - } - } } JEMALLOC_ALWAYS_INLINE void * @@ -989,7 +942,7 @@ arena_salloc(const void *ptr, bool demote) assert(arena_mapbits_allocated_get(chunk, pageind) != 0); binind = arena_mapbits_binind_get(chunk, pageind); if (binind == BININD_INVALID || (config_prof && demote == false && - prof_promote && arena_mapbits_large_get(chunk, pageind) != 0)) { + arena_mapbits_large_get(chunk, pageind) != 0)) { /* * Large allocation. 
In the common case (demote == true), and * as this is an inline function, most callers will only end up @@ -1007,10 +960,7 @@ arena_salloc(const void *ptr, bool demote) assert(arena_mapbits_dirty_get(chunk, pageind) == arena_mapbits_dirty_get(chunk, pageind+(ret>>LG_PAGE)-1)); } else { - /* - * Small allocation (possibly promoted to a large object due to - * prof_promote). - */ + /* Small allocation (possibly promoted to a large object). */ assert(arena_mapbits_large_get(chunk, pageind) != 0 || arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, pageind)) == binind); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 93516d2..f52d49f 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -303,7 +303,6 @@ prof_mdump prof_postfork_child prof_postfork_parent prof_prefork -prof_promote prof_realloc prof_sample_accum_update prof_sample_threshold_update diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 6f162d2..56014f1 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -220,12 +220,6 @@ extern char opt_prof_prefix[ */ extern uint64_t prof_interval; -/* - * If true, promote small sampled objects to large objects, since small run - * headers do not have embedded profile context pointers. - */ -extern bool prof_promote; - void bt_init(prof_bt_t *bt, void **vec); void prof_backtrace(prof_bt_t *bt, unsigned nignore); prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); @@ -308,7 +302,7 @@ malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) prof_tdata_t *prof_tdata_get(bool create); void prof_sample_threshold_update(prof_tdata_t *prof_tdata); prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx); +void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); bool prof_sample_accum_update(size_t size); void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt); void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, @@ -405,7 +399,7 @@ prof_ctx_get(const void *ptr) } JEMALLOC_INLINE void -prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx) +prof_ctx_set(const void *ptr, prof_ctx_t *ctx) { arena_chunk_t *chunk; @@ -415,7 +409,7 @@ prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. 
*/ - arena_prof_ctx_set(ptr, usize, ctx); + arena_prof_ctx_set(ptr, ctx); } else huge_prof_ctx_set(ptr, ctx); } @@ -471,7 +465,7 @@ prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) } if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, usize, cnt->ctx); + prof_ctx_set(ptr, cnt->ctx); cnt->epoch++; /*********/ @@ -491,7 +485,7 @@ prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) mb_write(); /*********/ } else - prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U); + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); } JEMALLOC_INLINE void @@ -539,10 +533,10 @@ prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, if ((uintptr_t)told_cnt > (uintptr_t)1U) told_cnt->epoch++; if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, usize, cnt->ctx); + prof_ctx_set(ptr, cnt->ctx); cnt->epoch++; } else if (ptr != NULL) - prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U); + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); /*********/ mb_write(); /*********/ diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 29c80c1..960674a 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -94,9 +94,8 @@ cat < 255) # error "Too many small size classes" diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index c3d4b58..5197413 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -354,7 +354,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) if (ret == NULL) return (NULL); } else { - if (config_prof && prof_promote && size == PAGE) { + if (config_prof && size == PAGE) { arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> diff --git a/src/arena.c b/src/arena.c index 3cb6260..d574100 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2373,7 +2373,6 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) uint32_t try_nregs, good_nregs; uint32_t try_hdr_size, good_hdr_size; uint32_t try_bitmap_offset, good_bitmap_offset; - uint32_t try_ctx0_offset, good_ctx0_offset; uint32_t try_redzone0_offset, good_redzone0_offset; assert(min_run_size >= PAGE); @@ -2428,14 +2427,6 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) try_bitmap_offset = try_hdr_size; /* Add space for bitmap. */ try_hdr_size += bitmap_size(try_nregs); - if (config_prof && opt_prof && prof_promote == false) { - /* Pad to a quantum boundary. */ - try_hdr_size = QUANTUM_CEILING(try_hdr_size); - try_ctx0_offset = try_hdr_size; - /* Add space for one (prof_ctx_t *) per region. */ - try_hdr_size += try_nregs * sizeof(prof_ctx_t *); - } else - try_ctx0_offset = 0; try_redzone0_offset = try_run_size - (try_nregs * bin_info->reg_interval) - pad_size; } while (try_hdr_size > try_redzone0_offset); @@ -2449,7 +2440,6 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) good_nregs = try_nregs; good_hdr_size = try_hdr_size; good_bitmap_offset = try_bitmap_offset; - good_ctx0_offset = try_ctx0_offset; good_redzone0_offset = try_redzone0_offset; /* Try more aggressive settings. */ @@ -2469,16 +2459,6 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) try_bitmap_offset = try_hdr_size; /* Add space for bitmap. */ try_hdr_size += bitmap_size(try_nregs); - if (config_prof && opt_prof && prof_promote == false) { - /* Pad to a quantum boundary. 
*/ - try_hdr_size = QUANTUM_CEILING(try_hdr_size); - try_ctx0_offset = try_hdr_size; - /* - * Add space for one (prof_ctx_t *) per region. - */ - try_hdr_size += try_nregs * - sizeof(prof_ctx_t *); - } try_redzone0_offset = try_run_size - (try_nregs * bin_info->reg_interval) - pad_size; } while (try_hdr_size > try_redzone0_offset); @@ -2494,7 +2474,6 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) bin_info->run_size = good_run_size; bin_info->nregs = good_nregs; bin_info->bitmap_offset = good_bitmap_offset; - bin_info->ctx0_offset = good_ctx0_offset; bin_info->reg0_offset = good_redzone0_offset + bin_info->redzone_size; assert(bin_info->reg0_offset - bin_info->redzone_size + (bin_info->nregs diff --git a/src/jemalloc.c b/src/jemalloc.c index 558dbb2..816a12e 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -853,7 +853,7 @@ imalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) if (cnt == NULL) return (NULL); - if (prof_promote && usize <= SMALL_MAXCLASS) { + if (usize <= SMALL_MAXCLASS) { p = imalloc(SMALL_MAXCLASS+1); if (p == NULL) return (NULL); @@ -952,7 +952,7 @@ imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) if (cnt == NULL) return (NULL); - if (prof_promote && usize <= SMALL_MAXCLASS) { + if (usize <= SMALL_MAXCLASS) { assert(sa2u(SMALL_MAXCLASS+1, alignment) != 0); p = ipalloc(sa2u(SMALL_MAXCLASS+1, alignment), alignment, false); @@ -1086,7 +1086,7 @@ icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) if (cnt == NULL) return (NULL); - if (prof_promote && usize <= SMALL_MAXCLASS) { + if (usize <= SMALL_MAXCLASS) { p = icalloc(SMALL_MAXCLASS+1); if (p == NULL) return (NULL); @@ -1183,7 +1183,7 @@ irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt) if (cnt == NULL) return (NULL); - if (prof_promote && usize <= SMALL_MAXCLASS) { + if (usize <= SMALL_MAXCLASS) { p = iralloc(oldptr, SMALL_MAXCLASS+1, 0, 0, false); if (p == NULL) return (NULL); @@ -1395,7 +1395,7 @@ imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, if (cnt == NULL) return (NULL); - if (prof_promote && usize <= SMALL_MAXCLASS) { + if (usize <= SMALL_MAXCLASS) { size_t usize_promoted = (alignment == 0) ? s2u(SMALL_MAXCLASS+1) : sa2u(SMALL_MAXCLASS+1, alignment); assert(usize_promoted != 0); @@ -1492,7 +1492,7 @@ irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, if (cnt == NULL) return (NULL); - if (prof_promote && usize <= SMALL_MAXCLASS) { + if (usize <= SMALL_MAXCLASS) { p = iralloct(oldptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= size) ? 0 : size - (SMALL_MAXCLASS+1), alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); @@ -1639,8 +1639,8 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, if (cnt == NULL) return (old_usize); /* Use minimum usize to determine whether promotion may happen. */ - if (prof_promote && ((alignment == 0) ? s2u(size) : sa2u(size, - alignment)) <= SMALL_MAXCLASS) { + if (((alignment == 0) ? s2u(size) : sa2u(size, alignment)) <= + SMALL_MAXCLASS) { if (ixalloc(ptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= size+extra) ? 0 : size+extra - (SMALL_MAXCLASS+1), alignment, zero)) diff --git a/src/prof.c b/src/prof.c index 7722b7b..1b1f7a8 100644 --- a/src/prof.c +++ b/src/prof.c @@ -32,7 +32,6 @@ char opt_prof_prefix[ 1]; uint64_t prof_interval = 0; -bool prof_promote; /* * Table of mutexes that are shared among ctx's. 
These are leaf locks, so @@ -1300,8 +1299,8 @@ prof_boot1(void) cassert(config_prof); /* - * opt_prof and prof_promote must be in their final state before any - * arenas are initialized, so this function must be executed early. + * opt_prof must be in its final state before any arenas are + * initialized, so this function must be executed early. */ if (opt_prof_leak && opt_prof == false) { @@ -1317,8 +1316,6 @@ prof_boot1(void) opt_lg_prof_interval); } } - - prof_promote = (opt_prof && opt_lg_prof_sample > LG_PAGE); } bool -- cgit v0.12 From 9790b9667fd975b1f9a4f108f9d0a20ab265c6b6 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 14 Apr 2014 22:32:31 -0700 Subject: Remove the *allocm() API, which is superceded by the *allocx() API. --- INSTALL | 3 - Makefile.in | 7 +- configure.ac | 21 +-- doc/jemalloc.xml.in | 191 +---------------------- include/jemalloc/internal/jemalloc_internal.h.in | 1 - include/jemalloc/internal/private_symbols.txt | 1 - include/jemalloc/jemalloc_defs.h.in | 3 - include/jemalloc/jemalloc_macros.h.in | 17 -- include/jemalloc/jemalloc_protos.h.in | 12 -- src/jemalloc.c | 85 ---------- test/integration/MALLOCX_ARENA.c | 4 +- test/integration/allocm.c | 107 ------------- test/integration/rallocm.c | 111 ------------- test/unit/mq.c | 2 +- 14 files changed, 7 insertions(+), 558 deletions(-) delete mode 100644 test/integration/allocm.c delete mode 100644 test/integration/rallocm.c diff --git a/INSTALL b/INSTALL index 841704d..8530643 100644 --- a/INSTALL +++ b/INSTALL @@ -157,9 +157,6 @@ any of the following arguments (not a definitive list) to 'configure': --disable-valgrind Disable support for Valgrind. ---disable-experimental - Disable support for the experimental API (*allocm()). - --disable-zone-allocator Disable zone allocator for Darwin. This means jemalloc won't be hooked as the default allocator on OSX/iOS. diff --git a/Makefile.in b/Makefile.in index d6b7d6e..f7aa7d8 100644 --- a/Makefile.in +++ b/Makefile.in @@ -48,7 +48,6 @@ cfgoutputs_in := @cfgoutputs_in@ cfgoutputs_out := @cfgoutputs_out@ enable_autogen := @enable_autogen@ enable_code_coverage := @enable_code_coverage@ -enable_experimental := @enable_experimental@ enable_zone_allocator := @enable_zone_allocator@ DSO_LDFLAGS = @DSO_LDFLAGS@ SOREV = @SOREV@ @@ -133,17 +132,13 @@ TESTS_UNIT_AUX := $(srcroot)test/unit/prof_accum_a.c \ TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/allocated.c \ $(srcroot)test/integration/mallocx.c \ + $(srcroot)test/integration/MALLOCX_ARENA.c \ $(srcroot)test/integration/mremap.c \ $(srcroot)test/integration/posix_memalign.c \ $(srcroot)test/integration/rallocx.c \ $(srcroot)test/integration/thread_arena.c \ $(srcroot)test/integration/thread_tcache_enabled.c \ $(srcroot)test/integration/xallocx.c -ifeq ($(enable_experimental), 1) -TESTS_INTEGRATION += $(srcroot)test/integration/allocm.c \ - $(srcroot)test/integration/MALLOCX_ARENA.c \ - $(srcroot)test/integration/rallocm.c -endif TESTS_STRESS := TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_STRESS) diff --git a/configure.ac b/configure.ac index 4de81dc..04cefe9 100644 --- a/configure.ac +++ b/configure.ac @@ -44,7 +44,7 @@ AC_CACHE_CHECK([whether $1 is compilable], dnl ============================================================================ dnl Library revision. 
-rev=1 +rev=2 AC_SUBST([rev]) srcroot=$srcdir @@ -438,24 +438,6 @@ AC_CHECK_FUNC([valloc], [AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC], [ ]) public_syms="${public_syms} valloc"]) -dnl Support the experimental API by default. -AC_ARG_ENABLE([experimental], - [AS_HELP_STRING([--disable-experimental], - [Disable support for the experimental API])], -[if test "x$enable_experimental" = "xno" ; then - enable_experimental="0" -else - enable_experimental="1" -fi -], -[enable_experimental="1"] -) -if test "x$enable_experimental" = "x1" ; then - AC_DEFINE([JEMALLOC_EXPERIMENTAL], [ ]) - public_syms="${public_syms} allocm dallocm nallocm rallocm sallocm" -fi -AC_SUBST([enable_experimental]) - dnl Do not compute test code coverage by default. GCOV_FLAGS= AC_ARG_ENABLE([code-coverage], @@ -1465,7 +1447,6 @@ AC_MSG_RESULT([JEMALLOC_PRIVATE_NAMESPACE]) AC_MSG_RESULT([ : ${JEMALLOC_PRIVATE_NAMESPACE}]) AC_MSG_RESULT([install_suffix : ${install_suffix}]) AC_MSG_RESULT([autogen : ${enable_autogen}]) -AC_MSG_RESULT([experimental : ${enable_experimental}]) AC_MSG_RESULT([cc-silence : ${enable_cc_silence}]) AC_MSG_RESULT([debug : ${enable_debug}]) AC_MSG_RESULT([code-coverage : ${enable_code_coverage}]) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index d8e2e71..a423240 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -44,11 +44,6 @@ mallctlbymib malloc_stats_print malloc_usable_size - allocm - rallocm - sallocm - dallocm - nallocm --> general purpose memory allocation functions @@ -172,41 +167,6 @@ const char *malloc_conf; - - Experimental API - - int allocm - void **ptr - size_t *rsize - size_t size - int flags - - - int rallocm - void **ptr - size_t *rsize - size_t size - size_t extra - int flags - - - int sallocm - const void *ptr - size_t *rsize - int flags - - - int dallocm - void *ptr - int flags - - - int nallocm - size_t *rsize - size_t size - int flags - - @@ -449,116 +409,6 @@ for (i = 0; i < nbins; i++) { depended on, since such behavior is entirely implementation-dependent. - - Experimental API - The experimental API is subject to change or removal without regard - for backward compatibility. If - is specified during configuration, the experimental API is - omitted. - - The allocm, - rallocm, - sallocm, - dallocm, and - nallocm functions all have a - flags argument that can be used to specify - options. The functions only check the options that are contextually - relevant. Use bitwise or (|) operations to - specify one or more of the following: - - - ALLOCM_LG_ALIGN(la) - - - Align the memory allocation to start at an address - that is a multiple of (1 << - la). This macro does not validate - that la is within the valid - range. - - - ALLOCM_ALIGN(a) - - - Align the memory allocation to start at an address - that is a multiple of a, where - a is a power of two. This macro does not - validate that a is a power of 2. - - - - ALLOCM_ZERO - - Initialize newly allocated memory to contain zero - bytes. In the growing reallocation case, the real size prior to - reallocation defines the boundary between untouched bytes and those - that are initialized to contain zero bytes. If this macro is - absent, newly allocated memory is uninitialized. - - - ALLOCM_NO_MOVE - - For reallocation, fail rather than moving the - object. This constraint can apply to both growth and - shrinkage. - - - ALLOCM_ARENA(a) - - - Use the arena specified by the index - a (and by necessity bypass the thread - cache). 
This macro has no effect for huge regions, nor for regions - that were allocated via an arena other than the one specified. - This macro does not validate that a - specifies an arena index in the valid range. - - - - - The allocm function allocates at - least size bytes of memory, sets - *ptr to the base address of the allocation, and - sets *rsize to the real size of the allocation if - rsize is not NULL. Behavior - is undefined if size is 0, or - if request size overflows due to size class and/or alignment - constraints. - - The rallocm function resizes the - allocation at *ptr to be at least - size bytes, sets *ptr to - the base address of the allocation if it moved, and sets - *rsize to the real size of the allocation if - rsize is not NULL. If - extra is non-zero, an attempt is made to resize - the allocation to be at least (size + - extra) bytes, though inability to allocate - the extra byte(s) will not by itself result in failure. Behavior is - undefined if size is 0, if - request size overflows due to size class and/or alignment constraints, or - if (size + - extra > - SIZE_T_MAX). - - The sallocm function sets - *rsize to the real size of the allocation. - - The dallocm function causes the - memory referenced by ptr to be made available for - future allocations. - - The nallocm function allocates no - memory, but it performs the same size computation as the - allocm function, and if - rsize is not NULL it sets - *rsize to the real size of the allocation that - would result from the equivalent allocm - function call. Behavior is undefined if size is - 0, or if request size overflows due to size class - and/or alignment constraints. - TUNING @@ -1076,9 +926,8 @@ for (i = 0; i < nbins; i++) { Zero filling enabled/disabled. If enabled, each byte of uninitialized allocated memory will be initialized to 0. Note that this initialization only happens once for each byte, so - realloc, - rallocx and - rallocm calls do not zero memory that + realloc and + rallocx calls do not zero memory that was previously allocated. This is intended for debugging and will impact performance negatively. This option is disabled by default. @@ -2253,42 +2102,6 @@ malloc_conf = "xmalloc:true";]]> returns the usable size of the allocation pointed to by ptr. - - Experimental API - The allocm, - rallocm, - sallocm, - dallocm, and - nallocm functions return - ALLOCM_SUCCESS on success; otherwise they return an - error value. The allocm, - rallocm, and - nallocm functions will fail if: - - - ALLOCM_ERR_OOM - - Out of memory. Insufficient contiguous memory was - available to service the allocation request. The - allocm function additionally sets - *ptr to NULL, whereas - the rallocm function leaves - *ptr unmodified. - - - The rallocm function will also - fail if: - - - ALLOCM_ERR_NOT_MOVED - - ALLOCM_NO_MOVE was specified, - but the reallocation request could not be serviced without moving - the object. - - - - ENVIRONMENT diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 9c79ae0..a374e2a 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -230,7 +230,6 @@ static const bool config_ivsalloc = #include "jemalloc/internal/jemalloc_internal_macros.h" #define MALLOCX_LG_ALIGN_MASK ((int)0x3f) -#define ALLOCM_LG_ALIGN_MASK ((int)0x3f) /* Smallest size class to support. 
*/ #define LG_TINY_MIN 3 diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index f52d49f..e1cb28f 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -211,7 +211,6 @@ huge_prof_ctx_set huge_ralloc huge_ralloc_no_move huge_salloc -iallocm icalloc icalloct idalloc diff --git a/include/jemalloc/jemalloc_defs.h.in b/include/jemalloc/jemalloc_defs.h.in index eb38d71..ce6c698 100644 --- a/include/jemalloc/jemalloc_defs.h.in +++ b/include/jemalloc/jemalloc_defs.h.in @@ -1,9 +1,6 @@ /* Defined if __attribute__((...)) syntax is supported. */ #undef JEMALLOC_HAVE_ATTR -/* Support the experimental API. */ -#undef JEMALLOC_EXPERIMENTAL - /* * Define overrides for non-standard allocator-related functions if they are * present on the system. diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 13dbdd9..1530f9c 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -19,23 +19,6 @@ /* Bias arena index bits so that 0 encodes "MALLOCX_ARENA() unspecified". */ # define MALLOCX_ARENA(a) ((int)(((a)+1) << 8)) -#ifdef JEMALLOC_EXPERIMENTAL -# define ALLOCM_LG_ALIGN(la) (la) -# if LG_SIZEOF_PTR == 2 -# define ALLOCM_ALIGN(a) (ffs(a)-1) -# else -# define ALLOCM_ALIGN(a) \ - ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31) -# endif -# define ALLOCM_ZERO ((int)0x40) -# define ALLOCM_NO_MOVE ((int)0x80) -/* Bias arena index bits so that 0 encodes "ALLOCM_ARENA() unspecified". */ -# define ALLOCM_ARENA(a) ((int)(((a)+1) << 8)) -# define ALLOCM_SUCCESS 0 -# define ALLOCM_ERR_OOM 1 -# define ALLOCM_ERR_NOT_MOVED 2 -#endif - #ifdef JEMALLOC_HAVE_ATTR # define JEMALLOC_ATTR(s) __attribute__((s)) # define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index 25446de..59aeee1 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -44,15 +44,3 @@ JEMALLOC_EXPORT void * @je_@memalign(size_t alignment, size_t size) #ifdef JEMALLOC_OVERRIDE_VALLOC JEMALLOC_EXPORT void * @je_@valloc(size_t size) JEMALLOC_ATTR(malloc); #endif - -#ifdef JEMALLOC_EXPERIMENTAL -JEMALLOC_EXPORT int @je_@allocm(void **ptr, size_t *rsize, size_t size, - int flags) JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT int @je_@rallocm(void **ptr, size_t *rsize, size_t size, - size_t extra, int flags) JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT int @je_@sallocm(const void *ptr, size_t *rsize, int flags) - JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT int @je_@dallocm(void *ptr, int flags) - JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT int @je_@nallocm(size_t *rsize, size_t size, int flags); -#endif diff --git a/src/jemalloc.c b/src/jemalloc.c index 816a12e..0de5940 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1868,91 +1868,6 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) */ /******************************************************************************/ /* - * Begin experimental functions. 
- */ -#ifdef JEMALLOC_EXPERIMENTAL - -int -je_allocm(void **ptr, size_t *rsize, size_t size, int flags) -{ - void *p; - - assert(ptr != NULL); - - p = je_mallocx(size, flags); - if (p == NULL) - return (ALLOCM_ERR_OOM); - if (rsize != NULL) - *rsize = isalloc(p, config_prof); - *ptr = p; - return (ALLOCM_SUCCESS); -} - -int -je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags) -{ - int ret; - bool no_move = flags & ALLOCM_NO_MOVE; - - assert(ptr != NULL); - assert(*ptr != NULL); - assert(size != 0); - assert(SIZE_T_MAX - size >= extra); - - if (no_move) { - size_t usize = je_xallocx(*ptr, size, extra, flags); - ret = (usize >= size) ? ALLOCM_SUCCESS : ALLOCM_ERR_NOT_MOVED; - if (rsize != NULL) - *rsize = usize; - } else { - void *p = je_rallocx(*ptr, size+extra, flags); - if (p != NULL) { - *ptr = p; - ret = ALLOCM_SUCCESS; - } else - ret = ALLOCM_ERR_OOM; - if (rsize != NULL) - *rsize = isalloc(*ptr, config_prof); - } - return (ret); -} - -int -je_sallocm(const void *ptr, size_t *rsize, int flags) -{ - - assert(rsize != NULL); - *rsize = je_sallocx(ptr, flags); - return (ALLOCM_SUCCESS); -} - -int -je_dallocm(void *ptr, int flags) -{ - - je_dallocx(ptr, flags); - return (ALLOCM_SUCCESS); -} - -int -je_nallocm(size_t *rsize, size_t size, int flags) -{ - size_t usize; - - usize = je_nallocx(size, flags); - if (usize == 0) - return (ALLOCM_ERR_OOM); - if (rsize != NULL) - *rsize = usize; - return (ALLOCM_SUCCESS); -} - -#endif -/* - * End experimental functions. - */ -/******************************************************************************/ -/* * The following functions are used by threading libraries for protection of * malloc during fork(). */ diff --git a/test/integration/MALLOCX_ARENA.c b/test/integration/MALLOCX_ARENA.c index 71cf6f2..695a5b6 100644 --- a/test/integration/MALLOCX_ARENA.c +++ b/test/integration/MALLOCX_ARENA.c @@ -34,7 +34,7 @@ thd_start(void *arg) return (NULL); } -TEST_BEGIN(test_ALLOCM_ARENA) +TEST_BEGIN(test_MALLOCX_ARENA) { thd_t thds[NTHREADS]; unsigned i; @@ -54,5 +54,5 @@ main(void) { return (test( - test_ALLOCM_ARENA)); + test_MALLOCX_ARENA)); } diff --git a/test/integration/allocm.c b/test/integration/allocm.c deleted file mode 100644 index 7b4ea0c..0000000 --- a/test/integration/allocm.c +++ /dev/null @@ -1,107 +0,0 @@ -#include "test/jemalloc_test.h" - -#define CHUNK 0x400000 -#define MAXALIGN (((size_t)1) << 25) -#define NITER 4 - -TEST_BEGIN(test_basic) -{ - size_t nsz, rsz, sz; - void *p; - - sz = 42; - nsz = 0; - assert_d_eq(nallocm(&nsz, sz, 0), ALLOCM_SUCCESS, - "Unexpected nallocm() error"); - rsz = 0; - assert_d_eq(allocm(&p, &rsz, sz, 0), ALLOCM_SUCCESS, - "Unexpected allocm() error"); - assert_zu_ge(rsz, sz, "Real size smaller than expected"); - assert_zu_eq(nsz, rsz, "nallocm()/allocm() rsize mismatch"); - assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS, - "Unexpected dallocm() error"); - - assert_d_eq(allocm(&p, NULL, sz, 0), ALLOCM_SUCCESS, - "Unexpected allocm() error"); - assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS, - "Unexpected dallocm() error"); - - nsz = 0; - assert_d_eq(nallocm(&nsz, sz, ALLOCM_ZERO), ALLOCM_SUCCESS, - "Unexpected nallocm() error"); - rsz = 0; - assert_d_eq(allocm(&p, &rsz, sz, ALLOCM_ZERO), ALLOCM_SUCCESS, - "Unexpected allocm() error"); - assert_zu_eq(nsz, rsz, "nallocm()/allocm() rsize mismatch"); - assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS, - "Unexpected dallocm() error"); -} -TEST_END - -TEST_BEGIN(test_alignment_and_size) -{ - int r; - size_t nsz, rsz, sz, alignment, total; - unsigned i; - void 
*ps[NITER]; - - for (i = 0; i < NITER; i++) - ps[i] = NULL; - - for (alignment = 8; - alignment <= MAXALIGN; - alignment <<= 1) { - total = 0; - for (sz = 1; - sz < 3 * alignment && sz < (1U << 31); - sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) { - for (i = 0; i < NITER; i++) { - nsz = 0; - r = nallocm(&nsz, sz, ALLOCM_ALIGN(alignment) | - ALLOCM_ZERO); - assert_d_eq(r, ALLOCM_SUCCESS, - "nallocm() error for alignment=%zu, " - "size=%zu (%#zx): %d", - alignment, sz, sz, r); - rsz = 0; - r = allocm(&ps[i], &rsz, sz, - ALLOCM_ALIGN(alignment) | ALLOCM_ZERO); - assert_d_eq(r, ALLOCM_SUCCESS, - "allocm() error for alignment=%zu, " - "size=%zu (%#zx): %d", - alignment, sz, sz, r); - assert_zu_ge(rsz, sz, - "Real size smaller than expected for " - "alignment=%zu, size=%zu", alignment, sz); - assert_zu_eq(nsz, rsz, - "nallocm()/allocm() rsize mismatch for " - "alignment=%zu, size=%zu", alignment, sz); - assert_ptr_null( - (void *)((uintptr_t)ps[i] & (alignment-1)), - "%p inadequately aligned for" - " alignment=%zu, size=%zu", ps[i], - alignment, sz); - sallocm(ps[i], &rsz, 0); - total += rsz; - if (total >= (MAXALIGN << 1)) - break; - } - for (i = 0; i < NITER; i++) { - if (ps[i] != NULL) { - dallocm(ps[i], 0); - ps[i] = NULL; - } - } - } - } -} -TEST_END - -int -main(void) -{ - - return (test( - test_basic, - test_alignment_and_size)); -} diff --git a/test/integration/rallocm.c b/test/integration/rallocm.c deleted file mode 100644 index 33c11bb..0000000 --- a/test/integration/rallocm.c +++ /dev/null @@ -1,111 +0,0 @@ -#include "test/jemalloc_test.h" - -TEST_BEGIN(test_same_size) -{ - void *p, *q; - size_t sz, tsz; - - assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS, - "Unexpected allocm() error"); - - q = p; - assert_d_eq(rallocm(&q, &tsz, sz, 0, ALLOCM_NO_MOVE), ALLOCM_SUCCESS, - "Unexpected rallocm() error"); - assert_ptr_eq(q, p, "Unexpected object move"); - assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz); - - assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS, - "Unexpected dallocm() error"); -} -TEST_END - -TEST_BEGIN(test_extra_no_move) -{ - void *p, *q; - size_t sz, tsz; - - assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS, - "Unexpected allocm() error"); - - q = p; - assert_d_eq(rallocm(&q, &tsz, sz, sz-42, ALLOCM_NO_MOVE), - ALLOCM_SUCCESS, "Unexpected rallocm() error"); - assert_ptr_eq(q, p, "Unexpected object move"); - assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz); - - assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS, - "Unexpected dallocm() error"); -} -TEST_END - -TEST_BEGIN(test_no_move_fail) -{ - void *p, *q; - size_t sz, tsz; - - assert_d_eq(allocm(&p, &sz, 42, 0), ALLOCM_SUCCESS, - "Unexpected allocm() error"); - - q = p; - assert_d_eq(rallocm(&q, &tsz, sz + 5, 0, ALLOCM_NO_MOVE), - ALLOCM_ERR_NOT_MOVED, "Unexpected rallocm() result"); - assert_ptr_eq(q, p, "Unexpected object move"); - assert_zu_eq(tsz, sz, "Unexpected size change: %zu --> %zu", sz, tsz); - - assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS, - "Unexpected dallocm() error"); -} -TEST_END - -TEST_BEGIN(test_grow_and_shrink) -{ - void *p, *q; - size_t tsz; -#define NCYCLES 3 - unsigned i, j; -#define NSZS 2500 - size_t szs[NSZS]; -#define MAXSZ ZU(12 * 1024 * 1024) - - assert_d_eq(allocm(&p, &szs[0], 1, 0), ALLOCM_SUCCESS, - "Unexpected allocm() error"); - - for (i = 0; i < NCYCLES; i++) { - for (j = 1; j < NSZS && szs[j-1] < MAXSZ; j++) { - q = p; - assert_d_eq(rallocm(&q, &szs[j], szs[j-1]+1, 0, 0), - ALLOCM_SUCCESS, - "Unexpected rallocm() error for size=%zu-->%zu", - szs[j-1], 
szs[j-1]+1); - assert_zu_ne(szs[j], szs[j-1]+1, - "Expected size to at least: %zu", szs[j-1]+1); - p = q; - } - - for (j--; j > 0; j--) { - q = p; - assert_d_eq(rallocm(&q, &tsz, szs[j-1], 0, 0), - ALLOCM_SUCCESS, - "Unexpected rallocm() error for size=%zu-->%zu", - szs[j], szs[j-1]); - assert_zu_eq(tsz, szs[j-1], - "Expected size=%zu, got size=%zu", szs[j-1], tsz); - p = q; - } - } - - assert_d_eq(dallocm(p, 0), ALLOCM_SUCCESS, - "Unexpected dallocm() error"); -} -TEST_END - -int -main(void) -{ - - return (test( - test_same_size, - test_extra_no_move, - test_no_move_fail, - test_grow_and_shrink)); -} diff --git a/test/unit/mq.c b/test/unit/mq.c index f57e96a..bd289c5 100644 --- a/test/unit/mq.c +++ b/test/unit/mq.c @@ -54,7 +54,7 @@ thd_sender_start(void *arg) mq_msg_t *msg; void *p; p = mallocx(sizeof(mq_msg_t), 0); - assert_ptr_not_null(p, "Unexpected allocm() failure"); + assert_ptr_not_null(p, "Unexpected mallocx() failure"); msg = (mq_msg_t *)p; mq_put(mq, msg); } -- cgit v0.12 From 24a4ba77e1e4c73488c6ac87db6db972232b392d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 14 Apr 2014 22:38:59 -0700 Subject: Update MALLOCX_ARENA() documentation. Update MALLOCX_ARENA() documentation to no longer claim that it has no effect for huge region allocations. --- doc/jemalloc.xml.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index a423240..4acb07f 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -270,10 +270,10 @@ Use the arena specified by the index a (and by necessity bypass the thread - cache). This macro has no effect for huge regions, nor for regions - that were allocated via an arena other than the one specified. - This macro does not validate that a - specifies an arena index in the valid range. + cache). This macro has no effect for regions that were allocated + via an arena other than the one specified. This macro does not + validate that a specifies an arena index in + the valid range. -- cgit v0.12 From 644d414bc9ab52efbbf7ebeb350170106ec1f937 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 14 Apr 2014 22:49:23 -0700 Subject: Reverse the cc-silence default. Replace --enable-cc-silence with --disable-cc-silence, so that by default people won't see spurious warnings when building jemalloc. --- INSTALL | 8 ++++---- configure.ac | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/INSTALL b/INSTALL index 8530643..55c919a 100644 --- a/INSTALL +++ b/INSTALL @@ -71,10 +71,10 @@ any of the following arguments (not a definitive list) to 'configure': versions of jemalloc can coexist in the same installation directory. For example, libjemalloc.so.0 becomes libjemalloc.so.0. ---enable-cc-silence - Enable code that silences non-useful compiler warnings. This is helpful - when trying to tell serious warnings from those due to compiler - limitations, but it potentially incurs a performance penalty. +--disable-cc-silence + Disable code that silences non-useful compiler warnings. This is mainly + useful during development when auditing the set of warnings that are being + silenced. --enable-debug Enable assertions and validation code. 
This incurs a substantial diff --git a/configure.ac b/configure.ac index 04cefe9..b47d572 100644 --- a/configure.ac +++ b/configure.ac @@ -577,18 +577,17 @@ cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.i cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in" cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:${srcroot}test/include/test/jemalloc_test_defs.h.in" -dnl Do not silence irrelevant compiler warnings by default, since enabling this -dnl option incurs a performance penalty. +dnl Silence irrelevant compiler warnings by default. AC_ARG_ENABLE([cc-silence], - [AS_HELP_STRING([--enable-cc-silence], - [Silence irrelevant compiler warnings])], + [AS_HELP_STRING([--disable-cc-silence], + [Do not silence irrelevant compiler warnings])], [if test "x$enable_cc_silence" = "xno" ; then enable_cc_silence="0" else enable_cc_silence="1" fi ], -[enable_cc_silence="0"] +[enable_cc_silence="1"] ) if test "x$enable_cc_silence" = "x1" ; then AC_DEFINE([JEMALLOC_CC_SILENCE], [ ]) -- cgit v0.12 From 4d434adb146375ad17f0d5e994ed5728d2942e3f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Apr 2014 12:09:48 -0700 Subject: Make dss non-optional, and fix an "arena.<i>.dss" mallctl bug. Make dss non-optional on all platforms which support sbrk(2). Fix the "arena.<i>.dss" mallctl to return an error if "primary" or "secondary" precedence is specified, but sbrk(2) is not supported. --- INSTALL | 4 --- configure.ac | 23 ++++------------- doc/jemalloc.xml.in | 29 ++++++++++------------ include/jemalloc/internal/arena.h | 2 +- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- .../jemalloc/internal/jemalloc_internal_defs.h.in | 3 --- src/arena.c | 5 +++- src/chunk.c | 8 +++--- src/chunk_dss.c | 20 +++++++-------- src/ctl.c | 6 +---- src/huge.c | 6 ++--- test/integration/MALLOCX_ARENA.c | 17 ++++++++++--- test/unit/mallctl.c | 20 ++++++++++++--- 13 files changed, 72 insertions(+), 73 deletions(-) diff --git a/INSTALL b/INSTALL index 55c919a..07f51d1 100644 --- a/INSTALL +++ b/INSTALL @@ -145,10 +145,6 @@ any of the following arguments (not a definitive list) to 'configure': memory allocation algorithm that causes semi-permanent VM map holes under normal jemalloc operation. ---enable-dss - Enable support for page allocation/deallocation via sbrk(2), in addition to - mmap(2). - --disable-fill Disable support for junk/zero filling of memory, quarantine, and redzones. See the "opt.junk", "opt.zero", "opt.quarantine", and "opt.redzone" option diff --git a/configure.ac b/configure.ac index b47d572..dc817e1 100644 --- a/configure.ac +++ b/configure.ac @@ -836,34 +836,22 @@ if test "x$enable_munmap" = "x1" ; then fi AC_SUBST([enable_munmap]) -dnl Do not enable allocation from DSS by default. -AC_ARG_ENABLE([dss], - [AS_HELP_STRING([--enable-dss], [Enable allocation from DSS])], -[if test "x$enable_dss" = "xno" ; then - enable_dss="0" -else - enable_dss="1" -fi -], -[enable_dss="0"] -) +dnl Enable allocation from DSS if supported by the OS. +have_dss="1" dnl Check whether the BSD/SUSv1 sbrk() exists. If not, disable DSS support.
AC_CHECK_FUNC([sbrk], [have_sbrk="1"], [have_sbrk="0"]) if test "x$have_sbrk" = "x1" ; then if test "x$sbrk_deprecated" == "x1" ; then AC_MSG_RESULT([Disabling dss allocation because sbrk is deprecated]) - enable_dss="0" - else - AC_DEFINE([JEMALLOC_HAVE_SBRK], [ ]) + have_dss="0" fi else - enable_dss="0" + have_dss="0" fi -if test "x$enable_dss" = "x1" ; then +if test "x$have_dss" = "x1" ; then AC_DEFINE([JEMALLOC_DSS], [ ]) fi -AC_SUBST([enable_dss]) dnl Support the junk/zero filling option by default. AC_ARG_ENABLE([fill], @@ -1461,7 +1449,6 @@ AC_MSG_RESULT([valgrind : ${enable_valgrind}]) AC_MSG_RESULT([xmalloc : ${enable_xmalloc}]) AC_MSG_RESULT([mremap : ${enable_mremap}]) AC_MSG_RESULT([munmap : ${enable_munmap}]) -AC_MSG_RESULT([dss : ${enable_dss}]) AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}]) AC_MSG_RESULT([tls : ${enable_tls}]) AC_MSG_RESULT([===============================================================================]) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 4acb07f..16dd0bb 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -448,8 +448,10 @@ for (i = 0; i < nbins; i++) { 2 to obtain memory, which is suboptimal for several reasons, including race conditions, increased fragmentation, and artificial limitations on maximum usable memory. If - is specified during configuration, this - allocator uses both mmap + sbrk + 2 is supported by the operating + system, this allocator uses both + mmap 2 and sbrk 2, in that order of preference; @@ -625,16 +627,6 @@ for (i = 0; i < nbins; i++) { build configuration. - - - config.dss - (bool) - r- - - was specified during - build configuration. - - config.fill @@ -790,10 +782,15 @@ for (i = 0; i < nbins; i++) { 2) allocation precedence as related to mmap 2 allocation. The following - settings are supported: “disabled”, “primary”, - and “secondary”. The default is “secondary” if - config.dss is - true, “disabled” otherwise. + settings are supported if + sbrk + 2 is supported by the operating + system: “disabled”, “primary”, and + “secondary”; otherwise only “disabled” is + supported. The default is “secondary” if + sbrk + 2 is supported by the operating + system; “disabled” otherwise. 
diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 0e14c2c..6de312e 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -434,7 +434,7 @@ void *arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc); dss_prec_t arena_dss_prec_get(arena_t *arena); -void arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); +bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index a374e2a..4821b9b 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -85,7 +85,7 @@ static const bool config_debug = false #endif ; -static const bool config_dss = +static const bool have_dss = #ifdef JEMALLOC_DSS true #else diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index c166fbd..fc95967 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -76,9 +76,6 @@ */ #undef JEMALLOC_MUTEX_INIT_CB -/* Defined if sbrk() is supported. */ -#undef JEMALLOC_HAVE_SBRK - /* Non-empty if the tls_model attribute is supported. */ #undef JEMALLOC_TLS_MODEL diff --git a/src/arena.c b/src/arena.c index d574100..8aa36fd 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2243,13 +2243,16 @@ arena_dss_prec_get(arena_t *arena) return (ret); } -void +bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec) { + if (have_dss == false) + return (dss_prec != dss_prec_disabled); malloc_mutex_lock(&arena->lock); arena->dss_prec = dss_prec; malloc_mutex_unlock(&arena->lock); + return (false); } void diff --git a/src/chunk.c b/src/chunk.c index 90ab116..fdd693e 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -153,7 +153,7 @@ chunk_alloc(size_t size, size_t alignment, bool base, bool *zero, assert((alignment & chunksize_mask) == 0); /* "primary" dss. */ - if (config_dss && dss_prec == dss_prec_primary) { + if (have_dss && dss_prec == dss_prec_primary) { if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size, alignment, base, zero)) != NULL) goto label_return; @@ -167,7 +167,7 @@ chunk_alloc(size_t size, size_t alignment, bool base, bool *zero, if ((ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) goto label_return; /* "secondary" dss. 
*/ - if (config_dss && dss_prec == dss_prec_secondary) { + if (have_dss && dss_prec == dss_prec_secondary) { if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size, alignment, base, zero)) != NULL) goto label_return; @@ -305,7 +305,7 @@ chunk_unmap(void *chunk, size_t size) assert(size != 0); assert((size & chunksize_mask) == 0); - if (config_dss && chunk_in_dss(chunk)) + if (have_dss && chunk_in_dss(chunk)) chunk_record(&chunks_szad_dss, &chunks_ad_dss, chunk, size); else if (chunk_dealloc_mmap(chunk, size)) chunk_record(&chunks_szad_mmap, &chunks_ad_mmap, chunk, size); @@ -348,7 +348,7 @@ chunk_boot(void) return (true); memset(&stats_chunks, 0, sizeof(chunk_stats_t)); } - if (config_dss && chunk_dss_boot()) + if (have_dss && chunk_dss_boot()) return (true); extent_tree_szad_new(&chunks_szad_mmap); extent_tree_ad_new(&chunks_ad_mmap); diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 510bb8b..36133f1 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -32,7 +32,7 @@ static void * chunk_dss_sbrk(intptr_t increment) { -#ifdef JEMALLOC_HAVE_SBRK +#ifdef JEMALLOC_DSS return (sbrk(increment)); #else not_implemented(); @@ -45,7 +45,7 @@ chunk_dss_prec_get(void) { dss_prec_t ret; - if (config_dss == false) + if (have_dss == false) return (dss_prec_disabled); malloc_mutex_lock(&dss_mtx); ret = dss_prec_default; @@ -57,8 +57,8 @@ bool chunk_dss_prec_set(dss_prec_t dss_prec) { - if (config_dss == false) - return (true); + if (have_dss == false) + return (dss_prec != dss_prec_disabled); malloc_mutex_lock(&dss_mtx); dss_prec_default = dss_prec; malloc_mutex_unlock(&dss_mtx); @@ -70,7 +70,7 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero) { void *ret; - cassert(config_dss); + cassert(have_dss); assert(size > 0 && (size & chunksize_mask) == 0); assert(alignment > 0 && (alignment & chunksize_mask) == 0); @@ -143,7 +143,7 @@ chunk_in_dss(void *chunk) { bool ret; - cassert(config_dss); + cassert(have_dss); malloc_mutex_lock(&dss_mtx); if ((uintptr_t)chunk >= (uintptr_t)dss_base @@ -160,7 +160,7 @@ bool chunk_dss_boot(void) { - cassert(config_dss); + cassert(have_dss); if (malloc_mutex_init(&dss_mtx)) return (true); @@ -175,7 +175,7 @@ void chunk_dss_prefork(void) { - if (config_dss) + if (have_dss) malloc_mutex_prefork(&dss_mtx); } @@ -183,7 +183,7 @@ void chunk_dss_postfork_parent(void) { - if (config_dss) + if (have_dss) malloc_mutex_postfork_parent(&dss_mtx); } @@ -191,7 +191,7 @@ void chunk_dss_postfork_child(void) { - if (config_dss) + if (have_dss) malloc_mutex_postfork_child(&dss_mtx); } diff --git a/src/ctl.c b/src/ctl.c index cc2c5ae..0340a27 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -74,7 +74,6 @@ CTL_PROTO(thread_allocatedp) CTL_PROTO(thread_deallocated) CTL_PROTO(thread_deallocatedp) CTL_PROTO(config_debug) -CTL_PROTO(config_dss) CTL_PROTO(config_fill) CTL_PROTO(config_lazy_lock) CTL_PROTO(config_mremap) @@ -213,7 +212,6 @@ static const ctl_named_node_t thread_node[] = { static const ctl_named_node_t config_node[] = { {NAME("debug"), CTL(config_debug)}, - {NAME("dss"), CTL(config_dss)}, {NAME("fill"), CTL(config_fill)}, {NAME("lazy_lock"), CTL(config_lazy_lock)}, {NAME("mremap"), CTL(config_mremap)}, @@ -1136,7 +1134,6 @@ label_return: /******************************************************************************/ CTL_RO_BOOL_CONFIG_GEN(config_debug) -CTL_RO_BOOL_CONFIG_GEN(config_dss) CTL_RO_BOOL_CONFIG_GEN(config_fill) CTL_RO_BOOL_CONFIG_GEN(config_lazy_lock) CTL_RO_BOOL_CONFIG_GEN(config_mremap) @@ -1356,8 +1353,7 @@ arena_i_dss_ctl(const size_t *mib, size_t 
miblen, void *oldp, size_t *oldlenp, arena_t *arena = arenas[arena_ind]; if (arena != NULL) { dss_prec_old = arena_dss_prec_get(arena); - arena_dss_prec_set(arena, dss_prec); - err = false; + err = arena_dss_prec_set(arena, dss_prec); } else err = true; } else { diff --git a/src/huge.c b/src/huge.c index d72f213..e725fd9 100644 --- a/src/huge.c +++ b/src/huge.c @@ -140,7 +140,7 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, * Use mremap(2) if this is a huge-->huge reallocation, and neither the * source nor the destination are in dss. */ - if (oldsize >= chunksize && (config_dss == false || (chunk_in_dss(ptr) + if (oldsize >= chunksize && (have_dss == false || (chunk_in_dss(ptr) == false && chunk_in_dss(ret) == false))) { size_t newsize = huge_salloc(ret); @@ -198,12 +198,12 @@ static void huge_dalloc_junk(void *ptr, size_t usize) { - if (config_fill && config_dss && opt_junk) { + if (config_fill && have_dss && opt_junk) { /* * Only bother junk filling if the chunk isn't about to be * unmapped. */ - if (config_munmap == false || (config_dss && chunk_in_dss(ptr))) + if (config_munmap == false || (have_dss && chunk_in_dss(ptr))) memset(ptr, 0x5a, usize); } } diff --git a/test/integration/MALLOCX_ARENA.c b/test/integration/MALLOCX_ARENA.c index 695a5b6..30c203a 100644 --- a/test/integration/MALLOCX_ARENA.c +++ b/test/integration/MALLOCX_ARENA.c @@ -2,6 +2,14 @@ #define NTHREADS 10 +static bool have_dss = +#ifdef JEMALLOC_DSS + true +#else + false +#endif + ; + void * thd_start(void *arg) { @@ -18,13 +26,16 @@ thd_start(void *arg) size_t mib[3]; size_t miblen = sizeof(mib) / sizeof(size_t); const char *dss_precs[] = {"disabled", "primary", "secondary"}; - const char *dss = dss_precs[thread_ind % - (sizeof(dss_precs)/sizeof(char*))]; + unsigned prec_ind = thread_ind % + (sizeof(dss_precs)/sizeof(char*)); + const char *dss = dss_precs[prec_ind]; + int expected_err = (have_dss || prec_ind == 0) ? 
0 : EFAULT; assert_d_eq(mallctlnametomib("arena.0.dss", mib, &miblen), 0, "Error in mallctlnametomib()"); mib[1] = arena_ind; assert_d_eq(mallctlbymib(mib, miblen, NULL, NULL, (void *)&dss, - sizeof(const char *)), 0, "Error in mallctlbymib()"); + sizeof(const char *)), expected_err, + "Error in mallctlbymib()"); } p = mallocx(1, MALLOCX_ARENA(arena_ind)); diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 31fb810..caf20f8 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -127,7 +127,6 @@ TEST_BEGIN(test_mallctl_config) } while (0) TEST_MALLCTL_CONFIG(debug); - TEST_MALLCTL_CONFIG(dss); TEST_MALLCTL_CONFIG(fill); TEST_MALLCTL_CONFIG(lazy_lock); TEST_MALLCTL_CONFIG(mremap); @@ -255,15 +254,28 @@ TEST_BEGIN(test_arena_i_dss) { const char *dss_prec_old, *dss_prec_new; size_t sz = sizeof(dss_prec_old); + size_t mib[3]; + size_t miblen; + + miblen = sizeof(mib)/sizeof(size_t); + assert_d_eq(mallctlnametomib("arena.0.dss", mib, &miblen), 0, + "Unexpected mallctlnametomib() error"); - dss_prec_new = "primary"; - assert_d_eq(mallctl("arena.0.dss", &dss_prec_old, &sz, &dss_prec_new, + dss_prec_new = "disabled"; + assert_d_eq(mallctlbymib(mib, miblen, &dss_prec_old, &sz, &dss_prec_new, sizeof(dss_prec_new)), 0, "Unexpected mallctl() failure"); assert_str_ne(dss_prec_old, "primary", "Unexpected default for dss precedence"); - assert_d_eq(mallctl("arena.0.dss", &dss_prec_new, &sz, &dss_prec_old, + assert_d_eq(mallctlbymib(mib, miblen, &dss_prec_new, &sz, &dss_prec_old, sizeof(dss_prec_old)), 0, "Unexpected mallctl() failure"); + + mib[1] = narenas_total_get(); + dss_prec_new = "disabled"; + assert_d_eq(mallctlbymib(mib, miblen, &dss_prec_old, &sz, &dss_prec_new, + sizeof(dss_prec_new)), 0, "Unexpected mallctl() failure"); + assert_str_ne(dss_prec_old, "primary", + "Unexpected default for dss precedence"); } TEST_END -- cgit v0.12 From a2c719b37445ce9083b6fc5084436dc37ceb7f75 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Apr 2014 12:46:28 -0700 Subject: Remove the "arenas.purge" mallctl. Remove the "arenas.purge" mallctl, which was obsoleted by the "arena..purge" mallctl in 3.1.0. --- doc/jemalloc.xml.in | 12 +----------- src/ctl.c | 27 --------------------------- test/unit/mallctl.c | 13 ------------- 3 files changed, 1 insertion(+), 51 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 16dd0bb..b0c77c2 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1267,7 +1267,7 @@ malloc_conf = "xmalloc:true";]]> arena.<i>.purge - (unsigned) + (void) -- Purge unused dirty pages for arena <i>, or for @@ -1410,16 +1410,6 @@ malloc_conf = "xmalloc:true";]]> class. - - - arenas.purge - (unsigned) - -w - - Purge unused dirty pages for the specified arena, or - for all arenas if none is specified. 
- - arenas.extend diff --git a/src/ctl.c b/src/ctl.c index 0340a27..3d44a95 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -129,7 +129,6 @@ CTL_PROTO(arenas_tcache_max) CTL_PROTO(arenas_nbins) CTL_PROTO(arenas_nhbins) CTL_PROTO(arenas_nlruns) -CTL_PROTO(arenas_purge) CTL_PROTO(arenas_extend) CTL_PROTO(prof_active) CTL_PROTO(prof_dump) @@ -301,7 +300,6 @@ static const ctl_named_node_t arenas_node[] = { {NAME("bin"), CHILD(indexed, arenas_bin)}, {NAME("nlruns"), CTL(arenas_nlruns)}, {NAME("lrun"), CHILD(indexed, arenas_lrun)}, - {NAME("purge"), CTL(arenas_purge)}, {NAME("extend"), CTL(arenas_extend)} }; @@ -1469,31 +1467,6 @@ arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i) } static int -arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, - void *newp, size_t newlen) -{ - int ret; - unsigned arena_ind; - - malloc_mutex_lock(&ctl_mtx); - WRITEONLY(); - arena_ind = UINT_MAX; - WRITE(arena_ind, unsigned); - if (newp != NULL && arena_ind >= ctl_stats.narenas) - ret = EFAULT; - else { - if (arena_ind == UINT_MAX) - arena_ind = ctl_stats.narenas; - arena_purge(arena_ind); - ret = 0; - } - -label_return: - malloc_mutex_unlock(&ctl_mtx); - return (ret); -} - -static int arenas_extend_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index caf20f8..73f42dd 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -279,18 +279,6 @@ TEST_BEGIN(test_arena_i_dss) } TEST_END -TEST_BEGIN(test_arenas_purge) -{ - unsigned arena = 0; - - assert_d_eq(mallctl("arenas.purge", NULL, NULL, &arena, sizeof(arena)), - 0, "Unexpected mallctl() failure"); - - assert_d_eq(mallctl("arenas.purge", NULL, NULL, NULL, 0), 0, - "Unexpected mallctl() failure"); -} -TEST_END - TEST_BEGIN(test_arenas_initialized) { unsigned narenas; @@ -417,7 +405,6 @@ main(void) test_thread_arena, test_arena_i_purge, test_arena_i_dss, - test_arenas_purge, test_arenas_initialized, test_arenas_constants, test_arenas_bin_constants, -- cgit v0.12 From ecd3e59ca351d7111ec72a327fe0c009f2aa69a0 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Apr 2014 14:33:50 -0700 Subject: Remove the "opt.valgrind" mallctl. Remove the "opt.valgrind" mallctl because it is unnecessary -- jemalloc automatically detects whether it is running inside valgrind. --- doc/jemalloc.xml.in | 13 --------- include/jemalloc/internal/jemalloc_internal.h.in | 9 +++--- include/jemalloc/internal/private_symbols.txt | 2 +- src/ctl.c | 3 -- src/jemalloc.c | 35 ++++++++++++------------ src/quarantine.c | 2 +- test/unit/mallctl.c | 1 - 7 files changed, 24 insertions(+), 41 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index b0c77c2..3e0b806 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -943,19 +943,6 @@ for (i = 0; i < nbins; i++) { is disabled by default. - - - opt.valgrind - (bool) - r- - [] - - Valgrind - support enabled/disabled. This option is vestigal because jemalloc - auto-detects whether it is running inside Valgrind. This option is - disabled by default, unless running inside Valgrind. - - opt.xmalloc diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 4821b9b..9b1a6c8 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -377,12 +377,12 @@ static const bool config_ivsalloc = * usable space. 
*/ #define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do { \ - if (config_valgrind && opt_valgrind && cond) \ + if (config_valgrind && in_valgrind && cond) \ VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, p2rz(ptr), zero); \ } while (0) #define JEMALLOC_VALGRIND_REALLOC(ptr, usize, old_ptr, old_usize, \ old_rzsize, zero) do { \ - if (config_valgrind && opt_valgrind) { \ + if (config_valgrind && in_valgrind) { \ size_t rzsize = p2rz(ptr); \ \ if (ptr == old_ptr) { \ @@ -418,7 +418,7 @@ static const bool config_ivsalloc = } \ } while (0) #define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do { \ - if (config_valgrind && opt_valgrind) \ + if (config_valgrind && in_valgrind) \ VALGRIND_FREELIKE_BLOCK(ptr, rzsize); \ } while (0) #else @@ -504,11 +504,12 @@ extern bool opt_junk; extern size_t opt_quarantine; extern bool opt_redzone; extern bool opt_utrace; -extern bool opt_valgrind; extern bool opt_xmalloc; extern bool opt_zero; extern size_t opt_narenas; +extern bool in_valgrind; + /* Number of CPUs. */ extern unsigned ncpus; diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index e1cb28f..c140377 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -217,6 +217,7 @@ idalloc idalloct imalloc imalloct +in_valgrind ipalloc ipalloct iqalloc @@ -278,7 +279,6 @@ opt_redzone opt_stats_print opt_tcache opt_utrace -opt_valgrind opt_xmalloc opt_zero p2rz diff --git a/src/ctl.c b/src/ctl.c index 3d44a95..9ee5de9 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -98,7 +98,6 @@ CTL_PROTO(opt_zero) CTL_PROTO(opt_quarantine) CTL_PROTO(opt_redzone) CTL_PROTO(opt_utrace) -CTL_PROTO(opt_valgrind) CTL_PROTO(opt_xmalloc) CTL_PROTO(opt_tcache) CTL_PROTO(opt_lg_tcache_max) @@ -238,7 +237,6 @@ static const ctl_named_node_t opt_node[] = { {NAME("quarantine"), CTL(opt_quarantine)}, {NAME("redzone"), CTL(opt_redzone)}, {NAME("utrace"), CTL(opt_utrace)}, - {NAME("valgrind"), CTL(opt_valgrind)}, {NAME("xmalloc"), CTL(opt_xmalloc)}, {NAME("tcache"), CTL(opt_tcache)}, {NAME("lg_tcache_max"), CTL(opt_lg_tcache_max)}, @@ -1159,7 +1157,6 @@ CTL_RO_NL_CGEN(config_fill, opt_quarantine, opt_quarantine, size_t) CTL_RO_NL_CGEN(config_fill, opt_redzone, opt_redzone, bool) CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool) CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool) -CTL_RO_NL_CGEN(config_valgrind, opt_valgrind, opt_valgrind, bool) CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool) CTL_RO_NL_CGEN(config_tcache, opt_tcache, opt_tcache, bool) CTL_RO_NL_CGEN(config_tcache, opt_lg_tcache_max, opt_lg_tcache_max, ssize_t) diff --git a/src/jemalloc.c b/src/jemalloc.c index 0de5940..11f1c45 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -27,11 +27,13 @@ bool opt_junk = size_t opt_quarantine = ZU(0); bool opt_redzone = false; bool opt_utrace = false; -bool opt_valgrind = false; bool opt_xmalloc = false; bool opt_zero = false; size_t opt_narenas = 0; +/* Initialized to true if the process is running inside Valgrind. */ +bool in_valgrind; + unsigned ncpus; malloc_mutex_t arenas_lock; @@ -394,14 +396,14 @@ malloc_conf_init(void) * valgrind option remains in jemalloc 3.x for compatibility reasons. */ if (config_valgrind) { - opt_valgrind = (RUNNING_ON_VALGRIND != 0) ? true : false; - if (config_fill && opt_valgrind) { + in_valgrind = (RUNNING_ON_VALGRIND != 0) ? 
true : false; + if (config_fill && in_valgrind) { opt_junk = false; assert(opt_zero == false); opt_quarantine = JEMALLOC_VALGRIND_QUARANTINE_DEFAULT; opt_redzone = true; } - if (config_tcache && opt_valgrind) + if (config_tcache && in_valgrind) opt_tcache = false; } @@ -608,9 +610,6 @@ malloc_conf_init(void) if (config_utrace) { CONF_HANDLE_BOOL(opt_utrace, "utrace") } - if (config_valgrind) { - CONF_HANDLE_BOOL(opt_valgrind, "valgrind") - } if (config_xmalloc) { CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc") } @@ -910,7 +909,7 @@ imalloc_prof(size_t usize, prof_thr_cnt_t *cnt) ret = imalloc_prof(usize, cnt); \ } else { \ if (config_stats || (config_valgrind && \ - opt_valgrind)) \ + in_valgrind)) \ usize = s2u(size); \ ret = imalloc(size); \ } \ @@ -1153,7 +1152,7 @@ je_calloc(size_t num, size_t size) PROF_ALLOC_PREP(1, usize, cnt); ret = icalloc_prof(usize, cnt); } else { - if (config_stats || (config_valgrind && opt_valgrind)) + if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(num_size); ret = icalloc(num_size); } @@ -1228,7 +1227,7 @@ ifree(void *ptr) usize = isalloc(ptr, config_prof); if (config_stats) thread_allocated_tsd_get()->deallocated += usize; - if (config_valgrind && opt_valgrind) + if (config_valgrind && in_valgrind) rzsize = p2rz(ptr); iqalloc(ptr); JEMALLOC_VALGRIND_FREE(ptr, rzsize); @@ -1257,9 +1256,9 @@ je_realloc(void *ptr, size_t size) malloc_thread_init(); if ((config_prof && opt_prof) || config_stats || - (config_valgrind && opt_valgrind)) + (config_valgrind && in_valgrind)) old_usize = isalloc(ptr, config_prof); - if (config_valgrind && opt_valgrind) + if (config_valgrind && in_valgrind) old_rzsize = config_prof ? p2rz(ptr) : u2rz(old_usize); if (config_prof && opt_prof) { @@ -1269,7 +1268,7 @@ je_realloc(void *ptr, size_t size) PROF_ALLOC_PREP(1, usize, cnt); ret = irealloc_prof(ptr, old_usize, usize, cnt); } else { - if (config_stats || (config_valgrind && opt_valgrind)) + if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(size); ret = iralloc(ptr, size, 0, 0, false); } @@ -1574,9 +1573,9 @@ je_rallocx(void *ptr, size_t size, int flags) } if ((config_prof && opt_prof) || config_stats || - (config_valgrind && opt_valgrind)) + (config_valgrind && in_valgrind)) old_usize = isalloc(ptr, config_prof); - if (config_valgrind && opt_valgrind) + if (config_valgrind && in_valgrind) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { @@ -1594,7 +1593,7 @@ je_rallocx(void *ptr, size_t size, int flags) try_tcache_dalloc, arena); if (p == NULL) goto label_oom; - if (config_stats || (config_valgrind && opt_valgrind)) + if (config_stats || (config_valgrind && in_valgrind)) usize = isalloc(p, config_prof); } @@ -1702,7 +1701,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) arena = NULL; old_usize = isalloc(ptr, config_prof); - if (config_valgrind && opt_valgrind) + if (config_valgrind && in_valgrind) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { @@ -1784,7 +1783,7 @@ je_dallocx(void *ptr, int flags) } if (config_stats) thread_allocated_tsd_get()->deallocated += usize; - if (config_valgrind && opt_valgrind) + if (config_valgrind && in_valgrind) rzsize = p2rz(ptr); iqalloct(ptr, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); diff --git a/src/quarantine.c b/src/quarantine.c index 5431511..3b87442 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -146,7 +146,7 @@ quarantine(void *ptr) * Only do redzone validation if Valgrind isn't in * operation. 
*/ - if ((config_valgrind == false || opt_valgrind == false) + if ((config_valgrind == false || in_valgrind == false) && usize <= SMALL_MAXCLASS) arena_quarantine_junk_small(ptr, usize); else diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 73f42dd..754834c 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -170,7 +170,6 @@ TEST_BEGIN(test_mallctl_opt) TEST_MALLCTL_OPT(bool, redzone, fill); TEST_MALLCTL_OPT(bool, zero, fill); TEST_MALLCTL_OPT(bool, utrace, utrace); - TEST_MALLCTL_OPT(bool, valgrind, valgrind); TEST_MALLCTL_OPT(bool, xmalloc, xmalloc); TEST_MALLCTL_OPT(bool, tcache, tcache); TEST_MALLCTL_OPT(size_t, lg_tcache_max, tcache); -- cgit v0.12 From bd87b01999416ec7418ff8bdb504d9b6c009ff68 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Apr 2014 16:35:08 -0700 Subject: Optimize Valgrind integration. Forcefully disable tcache if running inside Valgrind, and remove Valgrind calls in tcache-specific code. Restructure Valgrind-related code to move most Valgrind calls out of the fast path functions. Take advantage of static knowledge to elide some branches in JEMALLOC_VALGRIND_REALLOC(). --- Makefile.in | 4 + doc/jemalloc.xml.in | 3 +- include/jemalloc/internal/jemalloc_internal.h.in | 84 +---------------- include/jemalloc/internal/private_symbols.txt | 4 + include/jemalloc/internal/tcache.h | 7 +- include/jemalloc/internal/valgrind.h | 112 +++++++++++++++++++++++ src/arena.c | 28 +++--- src/base.c | 7 +- src/chunk.c | 6 +- src/chunk_dss.c | 3 +- src/jemalloc.c | 75 +++++++++------ src/valgrind.c | 34 +++++++ 12 files changed, 231 insertions(+), 136 deletions(-) create mode 100644 include/jemalloc/internal/valgrind.h create mode 100644 src/valgrind.c diff --git a/Makefile.in b/Makefile.in index f7aa7d8..e411804 100644 --- a/Makefile.in +++ b/Makefile.in @@ -48,6 +48,7 @@ cfgoutputs_in := @cfgoutputs_in@ cfgoutputs_out := @cfgoutputs_out@ enable_autogen := @enable_autogen@ enable_code_coverage := @enable_code_coverage@ +enable_valgrind := @enable_valgrind@ enable_zone_allocator := @enable_zone_allocator@ DSO_LDFLAGS = @DSO_LDFLAGS@ SOREV = @SOREV@ @@ -82,6 +83,9 @@ C_SRCS := $(srcroot)src/jemalloc.c $(srcroot)src/arena.c \ $(srcroot)src/mb.c $(srcroot)src/mutex.c $(srcroot)src/prof.c \ $(srcroot)src/quarantine.c $(srcroot)src/rtree.c $(srcroot)src/stats.c \ $(srcroot)src/tcache.c $(srcroot)src/util.c $(srcroot)src/tsd.c +ifeq ($(enable_valgrind), 1) +C_SRCS += $(srcroot)src/valgrind.c +endif ifeq ($(enable_zone_allocator), 1) C_SRCS += $(srcroot)src/zone.c endif diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 3e0b806..78e9b3c 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -979,7 +979,8 @@ malloc_conf = "xmalloc:true";]]> linkend="opt.lg_tcache_max">opt.lg_tcache_max option for related tuning information. This option is enabled by default unless running inside Valgrind. + url="http://valgrind.org/">Valgrind, in which case it is + forcefully disabled. 
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 9b1a6c8..50d44cc 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -60,11 +60,6 @@ typedef intptr_t ssize_t; #include #endif -#ifdef JEMALLOC_VALGRIND -#include -#include -#endif - #define JEMALLOC_NO_DEMANGLE #ifdef JEMALLOC_JET # define JEMALLOC_N(n) jet_##n @@ -362,81 +357,7 @@ static const bool config_ivsalloc = # define VARIABLE_ARRAY(type, name, count) type name[count] #endif -#ifdef JEMALLOC_VALGRIND -/* - * The JEMALLOC_VALGRIND_*() macros must be macros rather than functions - * so that when Valgrind reports errors, there are no extra stack frames - * in the backtraces. - * - * The size that is reported to valgrind must be consistent through a chain of - * malloc..realloc..realloc calls. Request size isn't recorded anywhere in - * jemalloc, so it is critical that all callers of these macros provide usize - * rather than request size. As a result, buffer overflow detection is - * technically weakened for the standard API, though it is generally accepted - * practice to consider any extra bytes reported by malloc_usable_size() as - * usable space. - */ -#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do { \ - if (config_valgrind && in_valgrind && cond) \ - VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, p2rz(ptr), zero); \ -} while (0) -#define JEMALLOC_VALGRIND_REALLOC(ptr, usize, old_ptr, old_usize, \ - old_rzsize, zero) do { \ - if (config_valgrind && in_valgrind) { \ - size_t rzsize = p2rz(ptr); \ - \ - if (ptr == old_ptr) { \ - VALGRIND_RESIZEINPLACE_BLOCK(ptr, old_usize, \ - usize, rzsize); \ - if (zero && old_usize < usize) { \ - VALGRIND_MAKE_MEM_DEFINED( \ - (void *)((uintptr_t)ptr + \ - old_usize), usize - old_usize); \ - } \ - } else { \ - if (old_ptr != NULL) { \ - VALGRIND_FREELIKE_BLOCK(old_ptr, \ - old_rzsize); \ - } \ - if (ptr != NULL) { \ - size_t copy_size = (old_usize < usize) \ - ? 
old_usize : usize; \ - size_t tail_size = usize - copy_size; \ - VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, \ - rzsize, false); \ - if (copy_size > 0) { \ - VALGRIND_MAKE_MEM_DEFINED(ptr, \ - copy_size); \ - } \ - if (zero && tail_size > 0) { \ - VALGRIND_MAKE_MEM_DEFINED( \ - (void *)((uintptr_t)ptr + \ - copy_size), tail_size); \ - } \ - } \ - } \ - } \ -} while (0) -#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do { \ - if (config_valgrind && in_valgrind) \ - VALGRIND_FREELIKE_BLOCK(ptr, rzsize); \ -} while (0) -#else -#define RUNNING_ON_VALGRIND ((unsigned)0) -#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \ - do {} while (0) -#define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB) \ - do {} while (0) -#define VALGRIND_FREELIKE_BLOCK(addr, rzB) do {} while (0) -#define VALGRIND_MAKE_MEM_NOACCESS(_qzz_addr, _qzz_len) do {} while (0) -#define VALGRIND_MAKE_MEM_UNDEFINED(_qzz_addr, _qzz_len) do {} while (0) -#define VALGRIND_MAKE_MEM_DEFINED(_qzz_addr, _qzz_len) do {} while (0) -#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do {} while (0) -#define JEMALLOC_VALGRIND_REALLOC(ptr, usize, old_ptr, old_usize, \ - old_rzsize, zero) do {} while (0) -#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do {} while (0) -#endif - +#include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prng.h" @@ -463,6 +384,7 @@ static const bool config_ivsalloc = /******************************************************************************/ #define JEMALLOC_H_STRUCTS +#include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prng.h" @@ -534,6 +456,7 @@ void jemalloc_prefork(void); void jemalloc_postfork_parent(void); void jemalloc_postfork_child(void); +#include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prng.h" @@ -560,6 +483,7 @@ void jemalloc_postfork_child(void); /******************************************************************************/ #define JEMALLOC_H_INLINES +#include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prng.h" diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index c140377..9d77cba 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -409,3 +409,7 @@ thread_allocated_tsd_set tsd_init_check_recursion tsd_init_finish u2rz +valgrind_freelike_block +valgrind_make_mem_defined +valgrind_make_mem_noaccess +valgrind_make_mem_undefined diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 5197413..af24845 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -314,13 +314,11 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) } else if (opt_zero) memset(ret, 0, size); } - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); } else { if (config_fill && opt_junk) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); memset(ret, 0, size); } @@ -369,11 +367,8 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) else if (opt_zero) memset(ret, 0, size); } - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); - } else { - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); + } else memset(ret, 0, size); - } if 
(config_stats) tbin->tstats.nrequests++; diff --git a/include/jemalloc/internal/valgrind.h b/include/jemalloc/internal/valgrind.h new file mode 100644 index 0000000..52c93f2 --- /dev/null +++ b/include/jemalloc/internal/valgrind.h @@ -0,0 +1,112 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#ifdef JEMALLOC_VALGRIND +#include + +/* + * The size that is reported to Valgrind must be consistent through a chain of + * malloc..realloc..realloc calls. Request size isn't recorded anywhere in + * jemalloc, so it is critical that all callers of these macros provide usize + * rather than request size. As a result, buffer overflow detection is + * technically weakened for the standard API, though it is generally accepted + * practice to consider any extra bytes reported by malloc_usable_size() as + * usable space. + */ +#define JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do { \ + if (in_valgrind) \ + valgrind_make_mem_noaccess(ptr, usize); \ +} while (0) +#define JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do { \ + if (in_valgrind) \ + valgrind_make_mem_undefined(ptr, usize); \ +} while (0) +#define JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do { \ + if (in_valgrind) \ + valgrind_make_mem_defined(ptr, usize); \ +} while (0) +/* + * The VALGRIND_MALLOCLIKE_BLOCK() and VALGRIND_RESIZEINPLACE_BLOCK() macro + * calls must be embedded in macros rather than in functions so that when + * Valgrind reports errors, there are no extra stack frames in the backtraces. + */ +#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do { \ + if (in_valgrind && cond) \ + VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, p2rz(ptr), zero); \ +} while (0) +#define JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize, \ + ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null, \ + zero) do { \ + if (in_valgrind) { \ + size_t rzsize = p2rz(ptr); \ + \ + if (!maybe_moved || ptr == old_ptr) { \ + VALGRIND_RESIZEINPLACE_BLOCK(ptr, old_usize, \ + usize, rzsize); \ + if (zero && old_usize < usize) { \ + valgrind_make_mem_defined( \ + (void *)((uintptr_t)ptr + \ + old_usize), usize - old_usize); \ + } \ + } else { \ + if (!old_ptr_maybe_null || old_ptr != NULL) { \ + valgrind_freelike_block(old_ptr, \ + old_rzsize); \ + } \ + if (!ptr_maybe_null || ptr != NULL) { \ + size_t copy_size = (old_usize < usize) \ + ? 
old_usize : usize; \ + size_t tail_size = usize - copy_size; \ + VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, \ + rzsize, false); \ + if (copy_size > 0) { \ + valgrind_make_mem_defined(ptr, \ + copy_size); \ + } \ + if (zero && tail_size > 0) { \ + valgrind_make_mem_defined( \ + (void *)((uintptr_t)ptr + \ + copy_size), tail_size); \ + } \ + } \ + } \ + } \ +} while (0) +#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do { \ + if (in_valgrind) \ + valgrind_freelike_block(ptr, rzsize); \ +} while (0) +#else +#define RUNNING_ON_VALGRIND ((unsigned)0) +#define JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do {} while (0) +#define JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do {} while (0) +#define JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do {} while (0) +#define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do {} while (0) +#define JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize, \ + ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null, \ + zero) do {} while (0) +#define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do {} while (0) +#endif + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +#ifdef JEMALLOC_VALGRIND +void valgrind_make_mem_noaccess(void *ptr, size_t usize); +void valgrind_make_mem_undefined(void *ptr, size_t usize); +void valgrind_make_mem_defined(void *ptr, size_t usize); +void valgrind_freelike_block(void *ptr, size_t usize); +#endif + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ + diff --git a/src/arena.c b/src/arena.c index 8aa36fd..3952e70 100644 --- a/src/arena.c +++ b/src/arena.c @@ -337,8 +337,8 @@ static inline void arena_run_zero(arena_chunk_t *chunk, size_t run_ind, size_t npages) { - VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + (run_ind << - LG_PAGE)), (npages << LG_PAGE)); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + + (run_ind << LG_PAGE)), (npages << LG_PAGE)); memset((void *)((uintptr_t)chunk + (run_ind << LG_PAGE)), 0, (npages << LG_PAGE)); } @@ -347,8 +347,8 @@ static inline void arena_run_page_mark_zeroed(arena_chunk_t *chunk, size_t run_ind) { - VALGRIND_MAKE_MEM_DEFINED((void *)((uintptr_t)chunk + (run_ind << - LG_PAGE)), PAGE); + JEMALLOC_VALGRIND_MAKE_MEM_DEFINED((void *)((uintptr_t)chunk + (run_ind + << LG_PAGE)), PAGE); } static inline void @@ -457,7 +457,7 @@ arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, arena_run_zero(chunk, run_ind, need_pages); } } else { - VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + (run_ind << LG_PAGE)), (need_pages << LG_PAGE)); } @@ -525,7 +525,7 @@ arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk, run_ind+need_pages-1) == 0) arena_run_page_validate_zeroed(chunk, run_ind+need_pages-1); - VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + (run_ind << LG_PAGE)), (need_pages << LG_PAGE)); } @@ -592,14 +592,14 @@ arena_chunk_init_hard(arena_t *arena) * the chunk is not zeroed. 
*/ if (zero == false) { - VALGRIND_MAKE_MEM_UNDEFINED((void *)arena_mapp_get(chunk, - map_bias+1), (size_t)((uintptr_t) arena_mapp_get(chunk, - chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk, - map_bias+1))); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( + (void *)arena_mapp_get(chunk, map_bias+1), + (size_t)((uintptr_t) arena_mapp_get(chunk, chunk_npages-1) - + (uintptr_t)arena_mapp_get(chunk, map_bias+1))); for (i = map_bias+1; i < chunk_npages-1; i++) arena_mapbits_unzeroed_set(chunk, i, unzeroed); } else { - VALGRIND_MAKE_MEM_DEFINED((void *)arena_mapp_get(chunk, + JEMALLOC_VALGRIND_MAKE_MEM_DEFINED((void *)arena_mapp_get(chunk, map_bias+1), (size_t)((uintptr_t) arena_mapp_get(chunk, chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk, map_bias+1))); @@ -1645,13 +1645,13 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) } else if (opt_zero) memset(ret, 0, size); } - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); } else { if (config_fill && opt_junk) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); memset(ret, 0, size); } @@ -2226,7 +2226,7 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, * expectation that the extra bytes will be reliably preserved. */ copysize = (size < oldsize) ? size : oldsize; - VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize); memcpy(ret, ptr, copysize); iqalloct(ptr, try_tcache_dalloc); return (ret); diff --git a/src/base.c b/src/base.c index 4e62e8f..03dcf8f 100644 --- a/src/base.c +++ b/src/base.c @@ -63,7 +63,7 @@ base_alloc(size_t size) ret = base_next_addr; base_next_addr = (void *)((uintptr_t)base_next_addr + csize); malloc_mutex_unlock(&base_mtx); - VALGRIND_MAKE_MEM_UNDEFINED(ret, csize); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, csize); return (ret); } @@ -89,7 +89,8 @@ base_node_alloc(void) ret = base_nodes; base_nodes = *(extent_node_t **)ret; malloc_mutex_unlock(&base_mtx); - VALGRIND_MAKE_MEM_UNDEFINED(ret, sizeof(extent_node_t)); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, + sizeof(extent_node_t)); } else { malloc_mutex_unlock(&base_mtx); ret = (extent_node_t *)base_alloc(sizeof(extent_node_t)); @@ -102,7 +103,7 @@ void base_node_dealloc(extent_node_t *node) { - VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t)); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t)); malloc_mutex_lock(&base_mtx); *(extent_node_t **)node = base_nodes; base_nodes = node; diff --git a/src/chunk.c b/src/chunk.c index fdd693e..246324a 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -127,7 +127,7 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, size_t i; size_t *p = (size_t *)(uintptr_t)ret; - VALGRIND_MAKE_MEM_DEFINED(ret, size); + JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ret, size); for (i = 0; i < size / sizeof(size_t); i++) assert(p[i] == 0); } @@ -203,7 +203,7 @@ label_return: prof_gdump(); } if (config_valgrind) - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); } assert(CHUNK_ADDR2BASE(ret) == ret); return (ret); @@ -217,7 +217,7 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk, extent_node_t *xnode, *node, *prev, *xprev, key; unzeroed = pages_purge(chunk, size); - VALGRIND_MAKE_MEM_NOACCESS(chunk, size); + JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); /* * Allocate a node before acquiring chunks_mtx even though 
it might not diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 36133f1..82faf91 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -126,7 +126,8 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero) if (cpad_size != 0) chunk_unmap(cpad, cpad_size); if (*zero) { - VALGRIND_MAKE_MEM_UNDEFINED(ret, size); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( + ret, size); memset(ret, 0, size); } return (ret); diff --git a/src/jemalloc.c b/src/jemalloc.c index 11f1c45..36eae72 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -479,9 +479,10 @@ malloc_conf_init(void) while (*opts != '\0' && malloc_conf_next(&opts, &k, &klen, &v, &vlen) == false) { -#define CONF_HANDLE_BOOL(o, n) \ - if (sizeof(n)-1 == klen && strncmp(n, k, \ - klen) == 0) { \ +#define CONF_MATCH(n) \ + (sizeof(n)-1 == klen && strncmp(n, k, klen) == 0) +#define CONF_HANDLE_BOOL(o, n, cont) \ + if (CONF_MATCH(n)) { \ if (strncmp("true", v, vlen) == 0 && \ vlen == sizeof("true")-1) \ o = true; \ @@ -493,11 +494,11 @@ malloc_conf_init(void) "Invalid conf value", \ k, klen, v, vlen); \ } \ - continue; \ + if (cont) \ + continue; \ } #define CONF_HANDLE_SIZE_T(o, n, min, max, clip) \ - if (sizeof(n)-1 == klen && strncmp(n, k, \ - klen) == 0) { \ + if (CONF_MATCH(n)) { \ uintmax_t um; \ char *end; \ \ @@ -528,8 +529,7 @@ malloc_conf_init(void) continue; \ } #define CONF_HANDLE_SSIZE_T(o, n, min, max) \ - if (sizeof(n)-1 == klen && strncmp(n, k, \ - klen) == 0) { \ + if (CONF_MATCH(n)) { \ long l; \ char *end; \ \ @@ -550,8 +550,7 @@ malloc_conf_init(void) continue; \ } #define CONF_HANDLE_CHAR_P(o, n, d) \ - if (sizeof(n)-1 == klen && strncmp(n, k, \ - klen) == 0) { \ + if (CONF_MATCH(n)) { \ size_t cpylen = (vlen <= \ sizeof(o)-1) ? vlen : \ sizeof(o)-1; \ @@ -560,7 +559,7 @@ malloc_conf_init(void) continue; \ } - CONF_HANDLE_BOOL(opt_abort, "abort") + CONF_HANDLE_BOOL(opt_abort, "abort", true) /* * Chunks always require at least one header page, plus * one data page in the absence of redzones, or three @@ -599,44 +598,62 @@ malloc_conf_init(void) SIZE_T_MAX, false) CONF_HANDLE_SSIZE_T(opt_lg_dirty_mult, "lg_dirty_mult", -1, (sizeof(size_t) << 3) - 1) - CONF_HANDLE_BOOL(opt_stats_print, "stats_print") + CONF_HANDLE_BOOL(opt_stats_print, "stats_print", true) if (config_fill) { - CONF_HANDLE_BOOL(opt_junk, "junk") + CONF_HANDLE_BOOL(opt_junk, "junk", true) CONF_HANDLE_SIZE_T(opt_quarantine, "quarantine", 0, SIZE_T_MAX, false) - CONF_HANDLE_BOOL(opt_redzone, "redzone") - CONF_HANDLE_BOOL(opt_zero, "zero") + CONF_HANDLE_BOOL(opt_redzone, "redzone", true) + CONF_HANDLE_BOOL(opt_zero, "zero", true) } if (config_utrace) { - CONF_HANDLE_BOOL(opt_utrace, "utrace") + CONF_HANDLE_BOOL(opt_utrace, "utrace", true) } if (config_xmalloc) { - CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc") + CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc", true) } if (config_tcache) { - CONF_HANDLE_BOOL(opt_tcache, "tcache") + CONF_HANDLE_BOOL(opt_tcache, "tcache", + !config_valgrind || !in_valgrind) + if (CONF_MATCH("tcache")) { + assert(config_valgrind && in_valgrind); + if (opt_tcache) { + opt_tcache = false; + malloc_conf_error( + "tcache cannot be enabled " + "while running inside Valgrind", + k, klen, v, vlen); + } + continue; + } CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max", -1, (sizeof(size_t) << 3) - 1) } if (config_prof) { - CONF_HANDLE_BOOL(opt_prof, "prof") + CONF_HANDLE_BOOL(opt_prof, "prof", true) CONF_HANDLE_CHAR_P(opt_prof_prefix, "prof_prefix", "jeprof") - CONF_HANDLE_BOOL(opt_prof_active, "prof_active") + CONF_HANDLE_BOOL(opt_prof_active, 
"prof_active", + true) CONF_HANDLE_SSIZE_T(opt_lg_prof_sample, "lg_prof_sample", 0, (sizeof(uint64_t) << 3) - 1) - CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum") + CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum", + true) CONF_HANDLE_SSIZE_T(opt_lg_prof_interval, "lg_prof_interval", -1, (sizeof(uint64_t) << 3) - 1) - CONF_HANDLE_BOOL(opt_prof_gdump, "prof_gdump") - CONF_HANDLE_BOOL(opt_prof_final, "prof_final") - CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak") + CONF_HANDLE_BOOL(opt_prof_gdump, "prof_gdump", + true) + CONF_HANDLE_BOOL(opt_prof_final, "prof_final", + true) + CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak", + true) } malloc_conf_error("Invalid conf pair", k, klen, v, vlen); +#undef CONF_MATCH #undef CONF_HANDLE_BOOL #undef CONF_HANDLE_SIZE_T #undef CONF_HANDLE_SSIZE_T @@ -1293,8 +1310,8 @@ je_realloc(void *ptr, size_t size) ta->deallocated += old_usize; } UTRACE(ptr, size, ret); - JEMALLOC_VALGRIND_REALLOC(ret, usize, ptr, old_usize, old_rzsize, - false); + JEMALLOC_VALGRIND_REALLOC(true, ret, usize, true, ptr, old_usize, + old_rzsize, true, false); return (ret); } @@ -1604,7 +1621,8 @@ je_rallocx(void *ptr, size_t size, int flags) ta->deallocated += old_usize; } UTRACE(ptr, size, p); - JEMALLOC_VALGRIND_REALLOC(p, usize, ptr, old_usize, old_rzsize, zero); + JEMALLOC_VALGRIND_REALLOC(true, p, usize, false, ptr, old_usize, + old_rzsize, false, zero); return (p); label_oom: if (config_xmalloc && opt_xmalloc) { @@ -1731,7 +1749,8 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) ta->allocated += usize; ta->deallocated += old_usize; } - JEMALLOC_VALGRIND_REALLOC(ptr, usize, ptr, old_usize, old_rzsize, zero); + JEMALLOC_VALGRIND_REALLOC(false, ptr, usize, false, ptr, old_usize, + old_rzsize, false, zero); label_not_resized: UTRACE(ptr, size, ptr); return (usize); diff --git a/src/valgrind.c b/src/valgrind.c new file mode 100644 index 0000000..8e7ef3a --- /dev/null +++ b/src/valgrind.c @@ -0,0 +1,34 @@ +#include "jemalloc/internal/jemalloc_internal.h" +#ifndef JEMALLOC_VALGRIND +# error "This source file is for Valgrind integration." 
+#endif + +#include + +void +valgrind_make_mem_noaccess(void *ptr, size_t usize) +{ + + VALGRIND_MAKE_MEM_NOACCESS(ptr, usize); +} + +void +valgrind_make_mem_undefined(void *ptr, size_t usize) +{ + + VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize); +} + +void +valgrind_make_mem_defined(void *ptr, size_t usize) +{ + + VALGRIND_MAKE_MEM_DEFINED(ptr, usize); +} + +void +valgrind_freelike_block(void *ptr, size_t usize) +{ + + VALGRIND_FREELIKE_BLOCK(ptr, usize); +} -- cgit v0.12 From a7619b7fa56f98d1ca99a23b458696dd37c12b77 Mon Sep 17 00:00:00 2001 From: Ben Maurer Date: Tue, 15 Apr 2014 13:28:37 -0700 Subject: outline rare tcache_get codepaths --- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/internal/tcache.h | 35 ++--------------------- src/tcache.c | 40 +++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 33 deletions(-) diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index f52d49f..376f95d 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -384,6 +384,7 @@ tcache_event tcache_event_hard tcache_flush tcache_get +tcache_get_hard tcache_initialized tcache_maxclass tcache_salloc diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 5197413..96447f4 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -110,6 +110,7 @@ void tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem, tcache_t *tcache); void tcache_arena_associate(tcache_t *tcache, arena_t *arena); void tcache_arena_dissociate(tcache_t *tcache); +tcache_t *tcache_get_hard(tcache_t *tcache, bool create); tcache_t *tcache_create(arena_t *arena); void tcache_destroy(tcache_t *tcache); void tcache_thread_cleanup(void *arg); @@ -220,39 +221,7 @@ tcache_get(bool create) if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) { if (tcache == TCACHE_STATE_DISABLED) return (NULL); - if (tcache == NULL) { - if (create == false) { - /* - * Creating a tcache here would cause - * allocation as a side effect of free(). - * Ordinarily that would be okay since - * tcache_create() failure is a soft failure - * that doesn't propagate. However, if TLS - * data are freed via free() as in glibc, - * subtle corruption could result from setting - * a TLS variable after its backing memory is - * freed. - */ - return (NULL); - } - if (tcache_enabled_get() == false) { - tcache_enabled_set(false); /* Memoize. */ - return (NULL); - } - return (tcache_create(choose_arena(NULL))); - } - if (tcache == TCACHE_STATE_PURGATORY) { - /* - * Make a note that an allocator function was called - * after tcache_thread_cleanup() was called. - */ - tcache = TCACHE_STATE_REINCARNATED; - tcache_tsd_set(&tcache); - return (NULL); - } - if (tcache == TCACHE_STATE_REINCARNATED) - return (NULL); - not_reached(); + tcache = tcache_get_hard(tcache, create); } return (tcache); diff --git a/src/tcache.c b/src/tcache.c index 6de9296..868f2d7 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -266,6 +266,46 @@ tcache_arena_dissociate(tcache_t *tcache) } tcache_t * +tcache_get_hard(tcache_t *tcache, bool create) +{ + + if (tcache == NULL) { + if (create == false) { + /* + * Creating a tcache here would cause + * allocation as a side effect of free(). + * Ordinarily that would be okay since + * tcache_create() failure is a soft failure + * that doesn't propagate. 
However, if TLS + * data are freed via free() as in glibc, + * subtle corruption could result from setting + * a TLS variable after its backing memory is + * freed. + */ + return (NULL); + } + if (tcache_enabled_get() == false) { + tcache_enabled_set(false); /* Memoize. */ + return (NULL); + } + return (tcache_create(choose_arena(NULL))); + } + if (tcache == TCACHE_STATE_PURGATORY) { + /* + * Make a note that an allocator function was called + * after tcache_thread_cleanup() was called. + */ + tcache = TCACHE_STATE_REINCARNATED; + tcache_tsd_set(&tcache); + return (NULL); + } + if (tcache == TCACHE_STATE_REINCARNATED) + return (NULL); + not_reached(); + return (NULL); +} + +tcache_t * tcache_create(arena_t *arena) { tcache_t *tcache; -- cgit v0.12 From 6c39f9e059d0825f4c29d8cec9f318b798912c3c Mon Sep 17 00:00:00 2001 From: Ben Maurer Date: Tue, 15 Apr 2014 13:47:13 -0700 Subject: refactor profiling. only use a bytes till next sample variable. --- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/internal/prof.h | 218 ++++++++------------------ src/prof.c | 65 +++++++- 3 files changed, 132 insertions(+), 152 deletions(-) diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 376f95d..032bed4 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -299,6 +299,7 @@ prof_idump prof_interval prof_lookup prof_malloc +prof_malloc_record_object prof_mdump prof_postfork_child prof_postfork_parent diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 56014f1..27be10c 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -177,8 +177,7 @@ struct prof_tdata_s { /* Sampling state. */ uint64_t prng_state; - uint64_t threshold; - uint64_t accum; + uint64_t bytes_until_sample; /* State used to avoid dumping while operating on prof internals. */ bool enq; @@ -239,6 +238,7 @@ bool prof_boot2(void); void prof_prefork(void); void prof_postfork_parent(void); void prof_postfork_child(void); +void prof_sample_threshold_update(prof_tdata_t *prof_tdata); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ @@ -250,49 +250,13 @@ void prof_postfork_child(void); \ assert(size == s2u(size)); \ \ - prof_tdata = prof_tdata_get(true); \ - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { \ - if (prof_tdata != NULL) \ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ - else \ - ret = NULL; \ - break; \ - } \ - \ - if (opt_prof_active == false) { \ - /* Sampling is currently inactive, so avoid sampling. */\ + if (!opt_prof_active || \ + prof_sample_accum_update(size, false, &prof_tdata)) { \ ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ - } else if (opt_lg_prof_sample == 0) { \ - /* Don't bother with sampling logic, since sampling */\ - /* interval is 1. */\ + } else { \ bt_init(&bt, prof_tdata->vec); \ prof_backtrace(&bt, nignore); \ ret = prof_lookup(&bt); \ - } else { \ - if (prof_tdata->threshold == 0) { \ - /* Initialize. Seed the prng differently for */\ - /* each thread. */\ - prof_tdata->prng_state = \ - (uint64_t)(uintptr_t)&size; \ - prof_sample_threshold_update(prof_tdata); \ - } \ - \ - /* Determine whether to capture a backtrace based on */\ - /* whether size is enough for prof_accum to reach */\ - /* prof_tdata->threshold. 
However, delay updating */\ - /* these variables until prof_{m,re}alloc(), because */\ - /* we don't know for sure that the allocation will */\ - /* succeed. */\ - /* */\ - /* Use subtraction rather than addition to avoid */\ - /* potential integer overflow. */\ - if (size >= prof_tdata->threshold - \ - prof_tdata->accum) { \ - bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt, nignore); \ - ret = prof_lookup(&bt); \ - } else \ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ } \ } while (0) @@ -300,10 +264,13 @@ void prof_postfork_child(void); malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) prof_tdata_t *prof_tdata_get(bool create); -void prof_sample_threshold_update(prof_tdata_t *prof_tdata); +void prof_sample_accum_update(size_t size, bool commit, + prof_tdata_t **prof_tdata_out); prof_ctx_t *prof_ctx_get(const void *ptr); void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); bool prof_sample_accum_update(size_t size); +void prof_malloc_record_object(const void *ptr, size_t usize, + prof_thr_cnt_t *cnt) void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt); void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, size_t old_usize, prof_ctx_t *old_ctx); @@ -330,55 +297,6 @@ prof_tdata_get(bool create) return (prof_tdata); } -JEMALLOC_INLINE void -prof_sample_threshold_update(prof_tdata_t *prof_tdata) -{ - /* - * The body of this function is compiled out unless heap profiling is - * enabled, so that it is possible to compile jemalloc with floating - * point support completely disabled. Avoiding floating point code is - * important on memory-constrained systems, but it also enables a - * workaround for versions of glibc that don't properly save/restore - * floating point registers during dynamic lazy symbol loading (which - * internally calls into whatever malloc implementation happens to be - * integrated into the application). Note that some compilers (e.g. - * gcc 4.8) may use floating point registers for fast memory moves, so - * jemalloc must be compiled with such optimizations disabled (e.g. - * -mno-sse) in order for the workaround to be complete. - */ -#ifdef JEMALLOC_PROF - uint64_t r; - double u; - - cassert(config_prof); - - /* - * Compute sample threshold as a geometrically distributed random - * variable with mean (2^opt_lg_prof_sample). - * - * __ __ - * | log(u) | 1 - * prof_tdata->threshold = | -------- |, where p = ------------------- - * | log(1-p) | opt_lg_prof_sample - * 2 - * - * For more information on the math, see: - * - * Non-Uniform Random Variate Generation - * Luc Devroye - * Springer-Verlag, New York, 1986 - * pp 500 - * (http://luc.devroye.org/rnbookindex.html) - */ - prng64(r, 53, prof_tdata->prng_state, - UINT64_C(6364136223846793005), UINT64_C(1442695040888963407)); - u = (double)r * (1.0/9007199254740992.0L); - prof_tdata->threshold = (uint64_t)(log(u) / - log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) - + (uint64_t)1U; -#endif -} - JEMALLOC_INLINE prof_ctx_t * prof_ctx_get(const void *ptr) { @@ -415,35 +333,59 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) } JEMALLOC_INLINE bool -prof_sample_accum_update(size_t size) +prof_sample_accum_update(size_t size, bool commit, + prof_tdata_t **prof_tdata_out) { prof_tdata_t *prof_tdata; cassert(config_prof); - /* Sampling logic is unnecessary if the interval is 1. 
*/ - assert(opt_lg_prof_sample != 0); - prof_tdata = prof_tdata_get(false); + prof_tdata = prof_tdata_get(true); if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + prof_tdata = NULL; + + if (prof_tdata_out != NULL) + *prof_tdata_out = prof_tdata; + + if (prof_tdata == NULL) return (true); - /* Take care to avoid integer overflow. */ - if (size >= prof_tdata->threshold - prof_tdata->accum) { - prof_tdata->accum -= (prof_tdata->threshold - size); + if (prof_tdata->bytes_until_sample >= size) { + if (commit) + prof_tdata->bytes_until_sample -= size; + return (true); + } else { /* Compute new sample threshold. */ - prof_sample_threshold_update(prof_tdata); - while (prof_tdata->accum >= prof_tdata->threshold) { - prof_tdata->accum -= prof_tdata->threshold; + if (commit) prof_sample_threshold_update(prof_tdata); - } return (false); - } else { - prof_tdata->accum += size; - return (true); } } JEMALLOC_INLINE void +prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) { + prof_ctx_set(ptr, cnt->ctx); + + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + cnt->cnts.curobjs++; + cnt->cnts.curbytes += usize; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += usize; + } + /*********/ + mb_write(); + /*********/ + cnt->epoch++; + /*********/ + mb_write(); + /*********/ +} + +JEMALLOC_INLINE void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) { @@ -451,40 +393,20 @@ prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) assert(ptr != NULL); assert(usize == isalloc(ptr, true)); - if (opt_lg_prof_sample != 0) { - if (prof_sample_accum_update(usize)) { - /* - * Don't sample. For malloc()-like allocation, it is - * always possible to tell in advance how large an - * object's usable size will be, so there should never - * be a difference between the usize passed to - * PROF_ALLOC_PREP() and prof_malloc(). - */ - assert((uintptr_t)cnt == (uintptr_t)1U); - } + if (prof_sample_accum_update(usize, true, NULL)) { + /* + * Don't sample. For malloc()-like allocation, it is + * always possible to tell in advance how large an + * object's usable size will be, so there should never + * be a difference between the usize passed to + * PROF_ALLOC_PREP() and prof_malloc(). + */ + assert((uintptr_t)cnt == (uintptr_t)1U); } - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; - } - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else + if ((uintptr_t)cnt > (uintptr_t)1U) + prof_malloc_record_object(ptr, usize, cnt); + else prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); } @@ -499,18 +421,16 @@ prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, if (ptr != NULL) { assert(usize == isalloc(ptr, true)); - if (opt_lg_prof_sample != 0) { - if (prof_sample_accum_update(usize)) { - /* - * Don't sample. The usize passed to - * PROF_ALLOC_PREP() was larger than what - * actually got allocated, so a backtrace was - * captured for this allocation, even though - * its actual usize was insufficient to cross - * the sample threshold. - */ - cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - } + if (prof_sample_accum_update(usize, true, NULL)) { + /* + * Don't sample. 
The usize passed to + * PROF_ALLOC_PREP() was larger than what + * actually got allocated, so a backtrace was + * captured for this allocation, even though + * its actual usize was insufficient to cross + * the sample threshold. + */ + cnt = (prof_thr_cnt_t *)(uintptr_t)1U; } } diff --git a/src/prof.c b/src/prof.c index 1b1f7a8..82c7f70 100644 --- a/src/prof.c +++ b/src/prof.c @@ -645,6 +645,66 @@ prof_lookup(prof_bt_t *bt) return (ret.p); } + +void +prof_sample_threshold_update(prof_tdata_t *prof_tdata) +{ + /* + * The body of this function is compiled out unless heap profiling is + * enabled, so that it is possible to compile jemalloc with floating + * point support completely disabled. Avoiding floating point code is + * important on memory-constrained systems, but it also enables a + * workaround for versions of glibc that don't properly save/restore + * floating point registers during dynamic lazy symbol loading (which + * internally calls into whatever malloc implementation happens to be + * integrated into the application). Note that some compilers (e.g. + * gcc 4.8) may use floating point registers for fast memory moves, so + * jemalloc must be compiled with such optimizations disabled (e.g. + * -mno-sse) in order for the workaround to be complete. + */ +#ifdef JEMALLOC_PROF + uint64_t r; + double u; + + if (!config_prof) + return; + + if (prof_tdata == NULL) + prof_tdata = prof_tdata_get(false); + + if (opt_lg_prof_sample == 0) { + prof_tdata->bytes_until_sample = 0; + return; + } + + /* + * Compute sample threshold as a geometrically distributed random + * variable with mean (2^opt_lg_prof_sample). + * + * __ __ + * | log(u) | 1 + * prof_tdata->threshold = | -------- |, where p = ------------------- + * | log(1-p) | opt_lg_prof_sample + * 2 + * + * For more information on the math, see: + * + * Non-Uniform Random Variate Generation + * Luc Devroye + * Springer-Verlag, New York, 1986 + * pp 500 + * (http://luc.devroye.org/rnbookindex.html) + */ + prng64(r, 53, prof_tdata->prng_state, + UINT64_C(6364136223846793005), UINT64_C(1442695040888963407)); + u = (double)r * (1.0/9007199254740992.0L); + prof_tdata->bytes_until_sample = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + + (uint64_t)1U; +#endif +} + + #ifdef JEMALLOC_JET size_t prof_bt_count(void) @@ -1224,9 +1284,8 @@ prof_tdata_init(void) return (NULL); } - prof_tdata->prng_state = 0; - prof_tdata->threshold = 0; - prof_tdata->accum = 0; + prof_tdata->prng_state = (uint64_t)(uintptr_t)prof_tdata; + prof_sample_threshold_update(prof_tdata); prof_tdata->enq = false; prof_tdata->enq_idump = false; -- cgit v0.12 From 021136ce4db79f50031a1fd5dd751891888fbc7b Mon Sep 17 00:00:00 2001 From: Ben Maurer Date: Wed, 16 Apr 2014 14:31:24 -0700 Subject: Create a const array with only a small bin to size map --- include/jemalloc/internal/arena.h | 3 ++- include/jemalloc/internal/jemalloc_internal.h.in | 4 ++-- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/internal/tcache.h | 6 +++--- src/arena.c | 10 +++++++++- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 0e14c2c..b435d0b 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -385,6 +385,7 @@ extern ssize_t opt_lg_dirty_mult; * and all accesses are via the SMALL_SIZE2BIN macro. 
*/ extern uint8_t const small_size2bin[]; +extern uint32_t const small_bin2size[]; #define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN]) extern arena_bin_info_t arena_bin_info[NBINS]; @@ -964,7 +965,7 @@ arena_salloc(const void *ptr, bool demote) assert(arena_mapbits_large_get(chunk, pageind) != 0 || arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, pageind)) == binind); - ret = arena_bin_info[binind].reg_size; + ret = small_bin2size[binind]; } return (ret); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 9c79ae0..17d7762 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -602,7 +602,7 @@ s2u(size_t size) { if (size <= SMALL_MAXCLASS) - return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size); + return (small_bin2size[SMALL_SIZE2BIN(size)]); if (size <= arena_maxclass) return (PAGE_CEILING(size)); return (CHUNK_CEILING(size)); @@ -645,7 +645,7 @@ sa2u(size_t size, size_t alignment) if (usize <= arena_maxclass && alignment <= PAGE) { if (usize <= SMALL_MAXCLASS) - return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size); + return (small_bin2size[SMALL_SIZE2BIN(usize)]); return (PAGE_CEILING(usize)); } else { size_t run_size; diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 032bed4..12d64dc 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -346,6 +346,7 @@ rtree_set s2u sa2u set_errno +small_bin2size small_size2bin stats_cactive stats_cactive_add diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 96447f4..098d19a 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -266,14 +266,14 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) binind = SMALL_SIZE2BIN(size); assert(binind < NBINS); tbin = &tcache->tbins[binind]; - size = arena_bin_info[binind].reg_size; + size = small_bin2size[binind]; ret = tcache_alloc_easy(tbin); if (ret == NULL) { ret = tcache_alloc_small_hard(tcache, tbin, binind); if (ret == NULL) return (NULL); } - assert(tcache_salloc(ret) == arena_bin_info[binind].reg_size); + assert(tcache_salloc(ret) == size); if (zero == false) { if (config_fill) { @@ -296,7 +296,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) if (config_stats) tbin->tstats.nrequests++; if (config_prof) - tcache->prof_accumbytes += arena_bin_info[binind].reg_size; + tcache->prof_accumbytes += size; tcache_event(tcache); return (ret); } diff --git a/src/arena.c b/src/arena.c index d574100..37487ff 100644 --- a/src/arena.c +++ b/src/arena.c @@ -8,6 +8,14 @@ ssize_t opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT; arena_bin_info_t arena_bin_info[NBINS]; JEMALLOC_ALIGNED(CACHELINE) +const uint32_t small_bin2size[NBINS] = { +#define SIZE_CLASS(bin, delta, size) \ + size, + SIZE_CLASSES +#undef SIZE_CLASS +}; + +JEMALLOC_ALIGNED(CACHELINE) const uint8_t small_size2bin[] = { #define S2B_8(i) i, #define S2B_16(i) S2B_8(i) S2B_8(i) @@ -1615,7 +1623,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) binind = SMALL_SIZE2BIN(size); assert(binind < NBINS); bin = &arena->bins[binind]; - size = arena_bin_info[binind].reg_size; + size = small_bin2size[binind]; malloc_mutex_lock(&bin->lock); if ((run = bin->runcur) != NULL && run->nfree > 0) -- cgit v0.12 From 0b49403958b68294eee0eca8a0b5195e761cf316 Mon Sep 17 00:00:00 2001 From: 
Jason Evans Date: Wed, 16 Apr 2014 16:38:22 -0700 Subject: Fix debug-only compilation failures. Fix debug-only compilation failures introduced by changes to prof_sample_accum_update() in: 6c39f9e059d0825f4c29d8cec9f318b798912c3c refactor profiling. only use a bytes till next sample variable. --- include/jemalloc/internal/prof.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 27be10c..d742253 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -264,13 +264,12 @@ void prof_sample_threshold_update(prof_tdata_t *prof_tdata); malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) prof_tdata_t *prof_tdata_get(bool create); -void prof_sample_accum_update(size_t size, bool commit, +bool prof_sample_accum_update(size_t size, bool commit, prof_tdata_t **prof_tdata_out); prof_ctx_t *prof_ctx_get(const void *ptr); void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); -bool prof_sample_accum_update(size_t size); void prof_malloc_record_object(const void *ptr, size_t usize, - prof_thr_cnt_t *cnt) + prof_thr_cnt_t *cnt); void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt); void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, size_t old_usize, prof_ctx_t *old_ctx); -- cgit v0.12 From 3541a904d6fb949f3f0aea05418ccce7cbd4b705 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 16 Apr 2014 17:14:33 -0700 Subject: Refactor small_size2bin and small_bin2size. Refactor small_size2bin and small_bin2size to be inline functions rather than directly accessed arrays. --- include/jemalloc/internal/arena.h | 40 ++++++++++++++++++------ include/jemalloc/internal/jemalloc_internal.h.in | 26 +++++++++------ include/jemalloc/internal/private_symbols.txt | 2 ++ include/jemalloc/internal/tcache.h | 4 +-- src/arena.c | 18 +++++------ 5 files changed, 61 insertions(+), 29 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index fbbbb91..605a87e 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -380,13 +380,17 @@ struct arena_s { extern ssize_t opt_lg_dirty_mult; /* - * small_size2bin is a compact lookup table that rounds request sizes up to + * small_size2bin_tab is a compact lookup table that rounds request sizes up to * size classes. In order to reduce cache footprint, the table is compressed, - * and all accesses are via the SMALL_SIZE2BIN macro. + * and all accesses are via small_size2bin(). */ -extern uint8_t const small_size2bin[]; -extern uint32_t const small_bin2size[]; -#define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN]) +extern uint8_t const small_size2bin_tab[]; +/* + * small_bin2size_tab duplicates information in arena_bin_info, but in a const + * array, for which it is easier for the compiler to optimize repeated + * dereferences. 
+ */ +extern uint32_t const small_bin2size_tab[NBINS]; extern arena_bin_info_t arena_bin_info[NBINS]; @@ -450,6 +454,8 @@ void arena_postfork_child(arena_t *arena); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +size_t small_size2bin(size_t size); +size_t small_bin2size(size_t binind); arena_chunk_map_t *arena_mapp_get(arena_chunk_t *chunk, size_t pageind); size_t *arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbitsp_read(size_t *mapbitsp); @@ -492,6 +498,22 @@ void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) # ifdef JEMALLOC_ARENA_INLINE_A +JEMALLOC_ALWAYS_INLINE size_t +small_size2bin(size_t size) +{ + + return ((size_t)(small_size2bin_tab[(size-1) >> LG_TINY_MIN])); +} + +JEMALLOC_ALWAYS_INLINE size_t +small_bin2size(size_t binind) +{ + + return ((size_t)(small_bin2size_tab[binind])); +} +# endif /* JEMALLOC_ARENA_INLINE_A */ + +# ifdef JEMALLOC_ARENA_INLINE_B JEMALLOC_ALWAYS_INLINE arena_chunk_map_t * arena_mapp_get(arena_chunk_t *chunk, size_t pageind) { @@ -773,9 +795,9 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) return (binind); } -# endif /* JEMALLOC_ARENA_INLINE_A */ +# endif /* JEMALLOC_ARENA_INLINE_B */ -# ifdef JEMALLOC_ARENA_INLINE_B +# ifdef JEMALLOC_ARENA_INLINE_C JEMALLOC_INLINE size_t arena_bin_index(arena_t *arena, arena_bin_t *bin) { @@ -965,7 +987,7 @@ arena_salloc(const void *ptr, bool demote) assert(arena_mapbits_large_get(chunk, pageind) != 0 || arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, pageind)) == binind); - ret = small_bin2size[binind]; + ret = small_bin2size(binind); } return (ret); @@ -1004,7 +1026,7 @@ arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) arena_dalloc_large(chunk->arena, chunk, ptr); } } -# endif /* JEMALLOC_ARENA_INLINE_B */ +# endif /* JEMALLOC_ARENA_INLINE_C */ #endif #endif /* JEMALLOC_H_INLINES */ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 6f11d4b..d530c3b 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -499,6 +499,14 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +/* + * Include arena.h the first time in order to provide inline functions for this + * header's inlines. + */ +#define JEMALLOC_ARENA_INLINE_A +#include "jemalloc/internal/arena.h" +#undef JEMALLOC_ARENA_INLINE_A + #ifndef JEMALLOC_ENABLE_INLINE malloc_tsd_protos(JEMALLOC_ATTR(unused), arenas, arena_t *) @@ -526,7 +534,7 @@ s2u(size_t size) { if (size <= SMALL_MAXCLASS) - return (small_bin2size[SMALL_SIZE2BIN(size)]); + return (small_bin2size(small_size2bin(size))); if (size <= arena_maxclass) return (PAGE_CEILING(size)); return (CHUNK_CEILING(size)); @@ -569,7 +577,7 @@ sa2u(size_t size, size_t alignment) if (usize <= arena_maxclass && alignment <= PAGE) { if (usize <= SMALL_MAXCLASS) - return (small_bin2size[SMALL_SIZE2BIN(usize)]); + return (small_bin2size(small_size2bin(usize))); return (PAGE_CEILING(usize)); } else { size_t run_size; @@ -643,16 +651,16 @@ choose_arena(arena_t *arena) #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/rtree.h" /* - * Include arena.h twice in order to resolve circular dependencies with - * tcache.h. + * Include arena.h the second and third times in order to resolve circular + * dependencies with tcache.h. 
*/ -#define JEMALLOC_ARENA_INLINE_A -#include "jemalloc/internal/arena.h" -#undef JEMALLOC_ARENA_INLINE_A -#include "jemalloc/internal/tcache.h" #define JEMALLOC_ARENA_INLINE_B #include "jemalloc/internal/arena.h" #undef JEMALLOC_ARENA_INLINE_B +#include "jemalloc/internal/tcache.h" +#define JEMALLOC_ARENA_INLINE_C +#include "jemalloc/internal/arena.h" +#undef JEMALLOC_ARENA_INLINE_C #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" @@ -794,7 +802,7 @@ u2rz(size_t usize) size_t ret; if (usize <= SMALL_MAXCLASS) { - size_t binind = SMALL_SIZE2BIN(usize); + size_t binind = small_size2bin(usize); ret = arena_bin_info[binind].redzone_size; } else ret = 0; diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index ff9ed47..ccbb3a9 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -346,7 +346,9 @@ s2u sa2u set_errno small_bin2size +small_bin2size_tab small_size2bin +small_size2bin_tab stats_cactive stats_cactive_add stats_cactive_get diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 06c7c8f..c0d48b9 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -263,10 +263,10 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) size_t binind; tcache_bin_t *tbin; - binind = SMALL_SIZE2BIN(size); + binind = small_size2bin(size); assert(binind < NBINS); tbin = &tcache->tbins[binind]; - size = small_bin2size[binind]; + size = small_bin2size(binind); ret = tcache_alloc_easy(tbin); if (ret == NULL) { ret = tcache_alloc_small_hard(tcache, tbin, binind); diff --git a/src/arena.c b/src/arena.c index 4256344..d956be3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -8,7 +8,7 @@ ssize_t opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT; arena_bin_info_t arena_bin_info[NBINS]; JEMALLOC_ALIGNED(CACHELINE) -const uint32_t small_bin2size[NBINS] = { +const uint32_t small_bin2size_tab[NBINS] = { #define SIZE_CLASS(bin, delta, size) \ size, SIZE_CLASSES @@ -16,7 +16,7 @@ const uint32_t small_bin2size[NBINS] = { }; JEMALLOC_ALIGNED(CACHELINE) -const uint8_t small_size2bin[] = { +const uint8_t small_size2bin_tab[] = { #define S2B_8(i) i, #define S2B_16(i) S2B_8(i) S2B_8(i) #define S2B_32(i) S2B_16(i) S2B_16(i) @@ -1607,7 +1607,7 @@ arena_quarantine_junk_small(void *ptr, size_t usize) assert(opt_quarantine); assert(usize <= SMALL_MAXCLASS); - binind = SMALL_SIZE2BIN(usize); + binind = small_size2bin(usize); bin_info = &arena_bin_info[binind]; arena_redzones_validate(ptr, bin_info, true); } @@ -1620,10 +1620,10 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) arena_run_t *run; size_t binind; - binind = SMALL_SIZE2BIN(size); + binind = small_size2bin(size); assert(binind < NBINS); bin = &arena->bins[binind]; - size = small_bin2size[binind]; + size = small_bin2size(binind); malloc_mutex_lock(&bin->lock); if ((run = bin->runcur) != NULL && run->nfree > 0) @@ -1777,7 +1777,7 @@ arena_prof_promoted(const void *ptr, size_t size) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - binind = SMALL_SIZE2BIN(size); + binind = small_size2bin(size); assert(binind < NBINS); arena_mapbits_large_binind_set(chunk, pageind, binind); @@ -2164,11 +2164,11 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, */ if (oldsize <= arena_maxclass) { if (oldsize <= SMALL_MAXCLASS) { - assert(arena_bin_info[SMALL_SIZE2BIN(oldsize)].reg_size 
+ assert(arena_bin_info[small_size2bin(oldsize)].reg_size == oldsize); if ((size + extra <= SMALL_MAXCLASS && - SMALL_SIZE2BIN(size + extra) == - SMALL_SIZE2BIN(oldsize)) || (size <= oldsize && + small_size2bin(size + extra) == + small_size2bin(oldsize)) || (size <= oldsize && size + extra >= oldsize)) return (false); } else { -- cgit v0.12 From 9d4e13f45a281a2eabe4d3528ab26e5f3903d5a5 Mon Sep 17 00:00:00 2001 From: Lucian Adrian Grijincu Date: Mon, 21 Apr 2014 20:52:35 -0700 Subject: prof_backtrace: use unw_backtrace unw_backtrace: - does internal per-thread caching - doesn't acquire an internal lock --- include/jemalloc/internal/jemalloc_internal.h.in | 4 +-- src/prof.c | 33 +++++++----------------- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index d530c3b..dc77b5a 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -352,9 +352,9 @@ static const bool config_ivsalloc = # endif # endif # define VARIABLE_ARRAY(type, name, count) \ - type *name = alloca(sizeof(type) * count) + type *name = alloca(sizeof(type) * (count)) #else -# define VARIABLE_ARRAY(type, name, count) type name[count] +# define VARIABLE_ARRAY(type, name, count) type name[(count)] #endif #include "jemalloc/internal/valgrind.h" diff --git a/src/prof.c b/src/prof.c index 82c7f70..11f1267 100644 --- a/src/prof.c +++ b/src/prof.c @@ -160,36 +160,21 @@ prof_leave(prof_tdata_t *prof_tdata) void prof_backtrace(prof_bt_t *bt, unsigned nignore) { - unw_context_t uc; - unw_cursor_t cursor; - unsigned i; - int err; - cassert(config_prof); assert(bt->len == 0); assert(bt->vec != NULL); - unw_getcontext(&uc); - unw_init_local(&cursor, &uc); + VARIABLE_ARRAY(void *, frames, nignore + PROF_BT_MAX); + int n = unw_backtrace(frames, nignore + PROF_BT_MAX); + if (n <= 0) + return; /* Throw away (nignore+1) stack frames, if that many exist. */ - for (i = 0; i < nignore + 1; i++) { - err = unw_step(&cursor); - if (err <= 0) - return; - } - - /* - * Iterate over stack frames until there are no more, or until no space - * remains in bt. - */ - for (i = 0; i < PROF_BT_MAX; i++) { - unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *)&bt->vec[i]); - bt->len++; - err = unw_step(&cursor); - if (err <= 0) - break; - } + nignore++; + if (nignore >= n) + return; + memcpy(bt->vec, &frames[nignore], sizeof(frames[0]) * (n - nignore)); + bt->len = n - nignore; } #elif (defined(JEMALLOC_PROF_LIBGCC)) static _Unwind_Reason_Code -- cgit v0.12 From 05125b83778a5695c29777acdc662d999d016d32 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 22 Apr 2014 20:48:07 -0700 Subject: Update libunwind configuration check to look for unw_backtrace(). Update libunwind configuration check to look for unw_backtrace(), which is a newer API not available in older versions of libunwind. 
--- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index dc817e1..eb9ca45 100644 --- a/configure.ac +++ b/configure.ac @@ -702,7 +702,7 @@ fi, if test "x$backtrace_method" = "x" -a "x$enable_prof_libunwind" = "x1" ; then AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) if test "x$LUNWIND" = "x-lunwind" ; then - AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], + AC_CHECK_LIB([unwind], [unw_backtrace], [LIBS="$LIBS $LUNWIND"], [enable_prof_libunwind="0"]) else LIBS="$LIBS $LUNWIND" -- cgit v0.12 From 6f001059aa33d77a3cb7799002044faf8dd08fc0 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 22 Apr 2014 18:41:15 -0700 Subject: Simplify backtracing. Simplify backtracing to not ignore any frames, and compensate for this in pprof in order to increase flexibility with respect to function-based refactoring even in the presence of non-deterministic inlining. Modify pprof to blacklist all jemalloc allocation entry points including non-standard ones like mallocx(), and ignore all allocator-internal frames. Prior to this change, pprof excluded the specifically blacklisted functions from backtraces, but it left allocator-internal frames intact. --- bin/pprof | 9 +++++ include/jemalloc/internal/prof.h | 7 ++-- src/jemalloc.c | 80 ++++++++++++++-------------------------- src/prof.c | 55 +++++++++++---------------- 4 files changed, 60 insertions(+), 91 deletions(-) diff --git a/bin/pprof b/bin/pprof index a309943..328138c 100755 --- a/bin/pprof +++ b/bin/pprof @@ -2811,9 +2811,14 @@ sub RemoveUninterestingFrames { 'free', 'memalign', 'posix_memalign', + 'aligned_alloc', 'pvalloc', 'valloc', 'realloc', + 'mallocx', # jemalloc + 'rallocx', # jemalloc + 'xallocx', # jemalloc + 'dallocx', # jemalloc 'tc_calloc', 'tc_cfree', 'tc_malloc', @@ -2923,6 +2928,10 @@ sub RemoveUninterestingFrames { if (exists($symbols->{$a})) { my $func = $symbols->{$a}->[0]; if ($skip{$func} || ($func =~ m/$skip_regexp/)) { + # Throw away the portion of the backtrace seen so far, under the + # assumption that previous frames were for functions internal to the + # allocator. + @path = (); next; } } diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index d742253..d82fbc4 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -63,7 +63,6 @@ struct prof_bt_s { /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. 
*/ typedef struct { prof_bt_t *bt; - unsigned nignore; unsigned max; } prof_unwind_data_t; #endif @@ -220,7 +219,7 @@ extern char opt_prof_prefix[ extern uint64_t prof_interval; void bt_init(prof_bt_t *bt, void **vec); -void prof_backtrace(prof_bt_t *bt, unsigned nignore); +void prof_backtrace(prof_bt_t *bt); prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); #ifdef JEMALLOC_JET size_t prof_bt_count(void); @@ -244,7 +243,7 @@ void prof_sample_threshold_update(prof_tdata_t *prof_tdata); /******************************************************************************/ #ifdef JEMALLOC_H_INLINES -#define PROF_ALLOC_PREP(nignore, size, ret) do { \ +#define PROF_ALLOC_PREP(size, ret) do { \ prof_tdata_t *prof_tdata; \ prof_bt_t bt; \ \ @@ -255,7 +254,7 @@ void prof_sample_threshold_update(prof_tdata_t *prof_tdata); ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ } else { \ bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt, nignore); \ + prof_backtrace(&bt); \ ret = prof_lookup(&bt); \ } \ } while (0) diff --git a/src/jemalloc.c b/src/jemalloc.c index 36eae72..f1dda75 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -881,10 +881,12 @@ imalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -imalloc_prof(size_t usize, prof_thr_cnt_t *cnt) +imalloc_prof(size_t usize) { void *p; + prof_thr_cnt_t *cnt; + PROF_ALLOC_PREP(usize, cnt); if ((uintptr_t)cnt != (uintptr_t)1U) p = imalloc_prof_sample(usize, cnt); else @@ -896,42 +898,22 @@ imalloc_prof(size_t usize, prof_thr_cnt_t *cnt) return (p); } -/* - * MALLOC_BODY() is a macro rather than a function because its contents are in - * the fast path, but inlining would cause reliability issues when determining - * how many frames to discard from heap profiling backtraces. - */ -#define MALLOC_BODY(ret, size, usize) do { \ - if (malloc_init()) \ - ret = NULL; \ - else { \ - if (config_prof && opt_prof) { \ - prof_thr_cnt_t *cnt; \ - \ - usize = s2u(size); \ - /* \ - * Call PROF_ALLOC_PREP() here rather than in \ - * imalloc_prof() so that imalloc_prof() can be \ - * inlined without introducing uncertainty \ - * about the number of backtrace frames to \ - * ignore. imalloc_prof() is in the fast path \ - * when heap profiling is enabled, so inlining \ - * is critical to performance. (For \ - * consistency all callers of PROF_ALLOC_PREP() \ - * are structured similarly, even though e.g. \ - * realloc() isn't called enough for inlining \ - * to be critical.) \ - */ \ - PROF_ALLOC_PREP(1, usize, cnt); \ - ret = imalloc_prof(usize, cnt); \ - } else { \ - if (config_stats || (config_valgrind && \ - in_valgrind)) \ - usize = s2u(size); \ - ret = imalloc(size); \ - } \ - } \ -} while (0) +JEMALLOC_ALWAYS_INLINE_C void * +imalloc_body(size_t size, size_t *usize) +{ + + if (malloc_init()) + return (NULL); + + if (config_prof && opt_prof) { + *usize = s2u(size); + return (imalloc_prof(*usize)); + } + + if (config_stats || (config_valgrind && in_valgrind)) + *usize = s2u(size); + return (imalloc(size)); +} void * je_malloc(size_t size) @@ -942,8 +924,7 @@ je_malloc(size_t size) if (size == 0) size = 1; - MALLOC_BODY(ret, size, usize); - + ret = imalloc_body(size, &usize); if (ret == NULL) { if (config_xmalloc && opt_xmalloc) { malloc_write(": Error in malloc(): " @@ -998,13 +979,6 @@ imemalign_prof(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ATTR(nonnull(1)) -#ifdef JEMALLOC_PROF -/* - * Avoid any uncertainty as to how many backtrace frames to ignore in - * PROF_ALLOC_PREP(). 
- */ -JEMALLOC_NOINLINE -#endif static int imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) { @@ -1043,7 +1017,7 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) if (config_prof && opt_prof) { prof_thr_cnt_t *cnt; - PROF_ALLOC_PREP(2, usize, cnt); + PROF_ALLOC_PREP(usize, cnt); result = imemalign_prof(alignment, usize, cnt); } else result = ipalloc(usize, alignment, false); @@ -1166,7 +1140,7 @@ je_calloc(size_t num, size_t size) prof_thr_cnt_t *cnt; usize = s2u(num_size); - PROF_ALLOC_PREP(1, usize, cnt); + PROF_ALLOC_PREP(usize, cnt); ret = icalloc_prof(usize, cnt); } else { if (config_stats || (config_valgrind && in_valgrind)) @@ -1282,7 +1256,7 @@ je_realloc(void *ptr, size_t size) prof_thr_cnt_t *cnt; usize = s2u(size); - PROF_ALLOC_PREP(1, usize, cnt); + PROF_ALLOC_PREP(usize, cnt); ret = irealloc_prof(ptr, old_usize, usize, cnt); } else { if (config_stats || (config_valgrind && in_valgrind)) @@ -1291,7 +1265,7 @@ je_realloc(void *ptr, size_t size) } } else { /* realloc(NULL, size) is equivalent to malloc(size). */ - MALLOC_BODY(ret, size, usize); + ret = imalloc_body(size, &usize); } if (ret == NULL) { @@ -1475,7 +1449,7 @@ je_mallocx(size_t size, int flags) if (config_prof && opt_prof) { prof_thr_cnt_t *cnt; - PROF_ALLOC_PREP(1, usize, cnt); + PROF_ALLOC_PREP(usize, cnt); p = imallocx_prof(usize, alignment, zero, try_tcache, arena, cnt); } else @@ -1600,7 +1574,7 @@ je_rallocx(void *ptr, size_t size, int flags) usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment); assert(usize != 0); - PROF_ALLOC_PREP(1, usize, cnt); + PROF_ALLOC_PREP(usize, cnt); p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero, try_tcache_alloc, try_tcache_dalloc, arena, cnt); if (p == NULL) @@ -1733,7 +1707,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) */ size_t max_usize = (alignment == 0) ? s2u(size+extra) : sa2u(size+extra, alignment); - PROF_ALLOC_PREP(1, max_usize, cnt); + PROF_ALLOC_PREP(max_usize, cnt); usize = ixallocx_prof(ptr, old_usize, size, extra, alignment, max_usize, zero, arena, cnt); } else { diff --git a/src/prof.c b/src/prof.c index 11f1267..b64386e 100644 --- a/src/prof.c +++ b/src/prof.c @@ -158,23 +158,18 @@ prof_leave(prof_tdata_t *prof_tdata) #ifdef JEMALLOC_PROF_LIBUNWIND void -prof_backtrace(prof_bt_t *bt, unsigned nignore) +prof_backtrace(prof_bt_t *bt) { + int nframes; + cassert(config_prof); assert(bt->len == 0); assert(bt->vec != NULL); - VARIABLE_ARRAY(void *, frames, nignore + PROF_BT_MAX); - int n = unw_backtrace(frames, nignore + PROF_BT_MAX); - if (n <= 0) - return; - - /* Throw away (nignore+1) stack frames, if that many exist. 
*/ - nignore++; - if (nignore >= n) + nframes = unw_backtrace(bt->vec, PROF_BT_MAX); + if (nframes <= 0) return; - memcpy(bt->vec, &frames[nignore], sizeof(frames[0]) * (n - nignore)); - bt->len = n - nignore; + bt->len = nframes; } #elif (defined(JEMALLOC_PROF_LIBGCC)) static _Unwind_Reason_Code @@ -190,25 +185,25 @@ static _Unwind_Reason_Code prof_unwind_callback(struct _Unwind_Context *context, void *arg) { prof_unwind_data_t *data = (prof_unwind_data_t *)arg; + void *ip; cassert(config_prof); - if (data->nignore > 0) - data->nignore--; - else { - data->bt->vec[data->bt->len] = (void *)_Unwind_GetIP(context); - data->bt->len++; - if (data->bt->len == data->max) - return (_URC_END_OF_STACK); - } + ip = (void *)_Unwind_GetIP(context); + if (ip == NULL) + return (_URC_END_OF_STACK); + data->bt->vec[data->bt->len] = ip; + data->bt->len++; + if (data->bt->len == data->max) + return (_URC_END_OF_STACK); return (_URC_NO_REASON); } void -prof_backtrace(prof_bt_t *bt, unsigned nignore) +prof_backtrace(prof_bt_t *bt) { - prof_unwind_data_t data = {bt, nignore, PROF_BT_MAX}; + prof_unwind_data_t data = {bt, PROF_BT_MAX}; cassert(config_prof); @@ -216,25 +211,22 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore) } #elif (defined(JEMALLOC_PROF_GCC)) void -prof_backtrace(prof_bt_t *bt, unsigned nignore) +prof_backtrace(prof_bt_t *bt) { #define BT_FRAME(i) \ - if ((i) < nignore + PROF_BT_MAX) { \ + if ((i) < PROF_BT_MAX) { \ void *p; \ if (__builtin_frame_address(i) == 0) \ return; \ p = __builtin_return_address(i); \ if (p == NULL) \ return; \ - if (i >= nignore) { \ - bt->vec[(i) - nignore] = p; \ - bt->len = (i) - nignore + 1; \ - } \ + bt->vec[(i)] = p; \ + bt->len = (i) + 1; \ } else \ return; cassert(config_prof); - assert(nignore <= 3); BT_FRAME(0) BT_FRAME(1) @@ -376,16 +368,11 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore) BT_FRAME(125) BT_FRAME(126) BT_FRAME(127) - - /* Extras to compensate for nignore. */ - BT_FRAME(128) - BT_FRAME(129) - BT_FRAME(130) #undef BT_FRAME } #else void -prof_backtrace(prof_bt_t *bt, unsigned nignore) +prof_backtrace(prof_bt_t *bt) { cassert(config_prof); -- cgit v0.12 From a344dd01c74a7e385087819046105f689931905d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 1 May 2014 15:51:30 -0700 Subject: Fix coding sytle nits. --- src/jemalloc.c | 8 ++++---- test/include/test/test.h | 2 +- test/src/test.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index f1dda75..289d7f7 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1346,10 +1346,10 @@ je_valloc(size_t size) * passed an extra argument for the caller return address, which will be * ignored. */ -JEMALLOC_EXPORT void (* __free_hook)(void *ptr) = je_free; -JEMALLOC_EXPORT void *(* __malloc_hook)(size_t size) = je_malloc; -JEMALLOC_EXPORT void *(* __realloc_hook)(void *ptr, size_t size) = je_realloc; -JEMALLOC_EXPORT void *(* __memalign_hook)(size_t alignment, size_t size) = +JEMALLOC_EXPORT void (*__free_hook)(void *ptr) = je_free; +JEMALLOC_EXPORT void *(*__malloc_hook)(size_t size) = je_malloc; +JEMALLOC_EXPORT void *(*__realloc_hook)(void *ptr, size_t size) = je_realloc; +JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = je_memalign; #endif diff --git a/test/include/test/test.h b/test/include/test/test.h index a32ec07..161fafd 100644 --- a/test/include/test/test.h +++ b/test/include/test/test.h @@ -323,7 +323,7 @@ void test_skip(const char *format, ...) 
JEMALLOC_ATTR(format(printf, 1, 2)); void test_fail(const char *format, ...) JEMALLOC_ATTR(format(printf, 1, 2)); /* For private use by macros. */ -test_status_t p_test(test_t* t, ...); +test_status_t p_test(test_t *t, ...); void p_test_init(const char *name); void p_test_fini(void); void p_test_fail(const char *prefix, const char *message); diff --git a/test/src/test.c b/test/src/test.c index 528d858..3acf845 100644 --- a/test/src/test.c +++ b/test/src/test.c @@ -61,13 +61,13 @@ p_test_fini(void) } test_status_t -p_test(test_t* t, ...) +p_test(test_t *t, ...) { test_status_t ret = test_status_pass; va_list ap; va_start(ap, t); - for (; t != NULL; t = va_arg(ap, test_t*)) { + for (; t != NULL; t = va_arg(ap, test_t *)) { t(); if (test_status > ret) ret = test_status; -- cgit v0.12 From 74b1ea5ce09c8455f35da0fbbd41f678708151d8 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Wed, 7 May 2014 17:58:54 -0400 Subject: fix git handling of newlines on windows By default, git will coerce LF to CRLF when files are checked out on Windows. This causes hard to diagnose errors when compiling with mingw-w64 from Windows rather than cross-compiling. --- .gitattributes | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..6313b56 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto eol=lf -- cgit v0.12 From fb7fe50a88ca9bde74e9a401ae17ad3b15bbae28 Mon Sep 17 00:00:00 2001 From: aravind Date: Mon, 5 May 2014 15:16:56 -0700 Subject: Add support for user-specified chunk allocators/deallocators. Add new mallctl endpoints "arena.chunk.alloc" and "arena.chunk.dealloc" to allow userspace to configure jemalloc's chunk allocator and deallocator on a per-arena basis. --- Makefile.in | 3 +- doc/jemalloc.xml.in | 63 ++++++++++++++++++++++++ include/jemalloc/internal/arena.h | 6 +++ include/jemalloc/internal/chunk.h | 8 +-- include/jemalloc/internal/extent.h | 3 ++ include/jemalloc/internal/huge.h | 10 ++-- include/jemalloc/internal/jemalloc_internal.h.in | 14 ++++-- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/jemalloc_protos.h.in | 3 ++ src/arena.c | 8 +-- src/base.c | 2 +- src/chunk.c | 58 ++++++++++++++++------ src/ctl.c | 61 ++++++++++++++++++++++- src/huge.c | 25 +++++----- src/jemalloc.c | 2 +- test/integration/chunk.c | 61 +++++++++++++++++++++++ 16 files changed, 283 insertions(+), 45 deletions(-) create mode 100644 test/integration/chunk.c diff --git a/Makefile.in b/Makefile.in index e411804..800dd08 100644 --- a/Makefile.in +++ b/Makefile.in @@ -142,7 +142,8 @@ TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/rallocx.c \ $(srcroot)test/integration/thread_arena.c \ $(srcroot)test/integration/thread_tcache_enabled.c \ - $(srcroot)test/integration/xallocx.c + $(srcroot)test/integration/xallocx.c \ + $(srcroot)test/integration/chunk.c TESTS_STRESS := TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_STRESS) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 78e9b3c..a7c38b5 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1283,6 +1283,69 @@ malloc_conf = "xmalloc:true";]]> + + + arena.<i>.chunk.alloc + (chunk_alloc_t *) + rw + + Get or set the chunk allocation function for arena + <i>. If setting, the chunk deallocation function should + also be set via + arena.<i>.chunk.dealloc to a companion + function that knows how to deallocate the chunks. 
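To make the new mallctl endpoints concrete, the following sketch (modeled on the integration test added later in this series; my_chunk_alloc and my_chunk_dealloc are hypothetical names) saves the default functions while installing pass-through replacements for arena 0. The dealloc hook returns bool, matching the typedef this patch adds to jemalloc_protos.h.in; error checking is omitted.

#include <stdbool.h>
#include <jemalloc/jemalloc.h>

static chunk_alloc_t *old_alloc;
static chunk_dealloc_t *old_dealloc;

/* Pass-through hooks conforming to chunk_alloc_t / chunk_dealloc_t. */
static void *
my_chunk_alloc(size_t size, size_t alignment, bool *zero, unsigned arena_ind)
{

	return (old_alloc(size, alignment, zero, arena_ind));
}

static bool
my_chunk_dealloc(void *chunk, size_t size, unsigned arena_ind)
{

	return (old_dealloc(chunk, size, arena_ind));
}

static void
install_chunk_hooks(void)
{
	chunk_alloc_t *new_alloc = my_chunk_alloc;
	chunk_dealloc_t *new_dealloc = my_chunk_dealloc;
	size_t sz = sizeof(chunk_alloc_t *);

	/* Read back the previous functions while writing the new ones. */
	mallctl("arena.0.chunk.alloc", &old_alloc, &sz, &new_alloc, sz);
	mallctl("arena.0.chunk.dealloc", &old_dealloc, &sz, &new_dealloc, sz);
}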
+ + typedef void *(chunk_alloc_t) + size_t size + size_t alignment + bool *zero + unsigned arena_ind + + A chunk allocation function conforms to the chunk_alloc_t + type and upon success returns a pointer to size + bytes of memory on behalf of arena arena_ind such + that the chunk's base address is a multiple of + alignment, as well as setting + *zero to indicate whether the chunk is zeroed. + Upon error the function returns NULL and leaves + *zero unmodified. The + size parameter is always a multiple of the chunk + size. The alignment parameter is always a power + of two at least as large as the chunk size. Zeroing is mandatory if + *zero is true upon function + entry. + + + + + arena.<i>.chunk.dealloc + (chunk_dealloc_t *) + rw + + Get or set the chunk deallocation function for arena + <i>. If setting, the chunk deallocation function must + be capable of deallocating all extant chunks associated with arena + <i>, usually by passing unknown chunks to the deallocation + function that was replaced. In practice, it is feasible to control + allocation for arenas created via arenas.extend such + that all chunks originate from an application-supplied chunk allocator + (by setting custom chunk allocation/deallocation functions just after + arena creation), but the automatically created arenas may have already + created chunks prior to the application having an opportunity to take + over chunk allocation. + + typedef void (chunk_dealloc_t) + void *chunk + size_t size + unsigned arena_ind + + A chunk deallocation function conforms to the + chunk_dealloc_t type and deallocates a + chunk of given size on + behalf of arena arena_ind. + + arenas.narenas diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 605a87e..d50159b 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -370,6 +370,12 @@ struct arena_s { */ arena_avail_tree_t runs_avail; + /* + * user-configureable chunk allocation and deallocation functions. + */ + chunk_alloc_t *chunk_alloc; + chunk_dealloc_t *chunk_dealloc; + /* bins is used to store trees of free regions. */ arena_bin_t bins[NBINS]; }; diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 87d8700..cea0e8a 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -43,10 +43,12 @@ extern size_t chunk_npages; extern size_t map_bias; /* Number of arena chunk header pages. */ extern size_t arena_maxclass; /* Max size class for arenas. */ -void *chunk_alloc(size_t size, size_t alignment, bool base, bool *zero, - dss_prec_t dss_prec); +void *chunk_alloc(arena_t *arena, size_t size, size_t alignment, bool base, + bool *zero, dss_prec_t dss_prec); +void *chunk_alloc_default(size_t size, size_t alignment, bool *zero, + unsigned arena_ind); void chunk_unmap(void *chunk, size_t size); -void chunk_dealloc(void *chunk, size_t size, bool unmap); +void chunk_dealloc(arena_t *arena, void *chunk, size_t size, bool unmap); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index ba95ca8..000ef6d 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -24,6 +24,9 @@ struct extent_node_s { /* Total region size. */ size_t size; + /* Arena from which this extent came, if any */ + arena_t *arena; + /* True if zero-filled; used by chunk recycling code. 
*/ bool zeroed; }; diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index a2b9c77..ab8d44a 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -17,13 +17,15 @@ extern size_t huge_allocated; /* Protects chunk-related data structures. */ extern malloc_mutex_t huge_mtx; -void *huge_malloc(size_t size, bool zero, dss_prec_t dss_prec); -void *huge_palloc(size_t size, size_t alignment, bool zero, +void *huge_malloc(arena_t *arena, size_t size, bool zero, + dss_prec_t dss_prec); +void *huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero, dss_prec_t dss_prec); bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra); -void *huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, - size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec); +void *huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, + size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc, + dss_prec_t dss_prec); #ifdef JEMALLOC_JET typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index dc77b5a..9e779c6 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -702,7 +702,8 @@ imalloct(size_t size, bool try_tcache, arena_t *arena) if (size <= arena_maxclass) return (arena_malloc(arena, size, false, try_tcache)); else - return (huge_malloc(size, false, huge_dss_prec_get(arena))); + return (huge_malloc(arena, size, false, + huge_dss_prec_get(arena))); } JEMALLOC_ALWAYS_INLINE void * @@ -719,7 +720,8 @@ icalloct(size_t size, bool try_tcache, arena_t *arena) if (size <= arena_maxclass) return (arena_malloc(arena, size, true, try_tcache)); else - return (huge_malloc(size, true, huge_dss_prec_get(arena))); + return (huge_malloc(arena, size, true, + huge_dss_prec_get(arena))); } JEMALLOC_ALWAYS_INLINE void * @@ -745,9 +747,11 @@ ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache, ret = arena_palloc(choose_arena(arena), usize, alignment, zero); } else if (alignment <= chunksize) - ret = huge_malloc(usize, zero, huge_dss_prec_get(arena)); + ret = huge_malloc(arena, usize, zero, + huge_dss_prec_get(arena)); else - ret = huge_palloc(usize, alignment, zero, huge_dss_prec_get(arena)); + ret = huge_palloc(arena, usize, alignment, zero, + huge_dss_prec_get(arena)); } assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); @@ -915,7 +919,7 @@ iralloct(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, alignment, zero, try_tcache_alloc, try_tcache_dalloc)); } else { - return (huge_ralloc(ptr, oldsize, size, extra, + return (huge_ralloc(arena, ptr, oldsize, size, extra, alignment, zero, try_tcache_dalloc, huge_dss_prec_get(arena))); } } diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index ccbb3a9..589b56a 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -104,6 +104,7 @@ buferror choose_arena choose_arena_hard chunk_alloc +chunk_alloc_default chunk_alloc_dss chunk_alloc_mmap chunk_boot diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index 59aeee1..8e945fa 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -44,3 +44,6 @@ JEMALLOC_EXPORT 
void * @je_@memalign(size_t alignment, size_t size) #ifdef JEMALLOC_OVERRIDE_VALLOC JEMALLOC_EXPORT void * @je_@valloc(size_t size) JEMALLOC_ATTR(malloc); #endif + +typedef void *(chunk_alloc_t)(size_t, size_t, bool *, unsigned); +typedef bool (chunk_dealloc_t)(void *, size_t, unsigned); diff --git a/src/arena.c b/src/arena.c index d956be3..6db2b63 100644 --- a/src/arena.c +++ b/src/arena.c @@ -570,8 +570,8 @@ arena_chunk_init_hard(arena_t *arena) zero = false; malloc_mutex_unlock(&arena->lock); - chunk = (arena_chunk_t *)chunk_alloc(chunksize, chunksize, false, - &zero, arena->dss_prec); + chunk = (arena_chunk_t *)chunk_alloc(arena, chunksize, chunksize, + false, &zero, arena->dss_prec); malloc_mutex_lock(&arena->lock); if (chunk == NULL) return (NULL); @@ -668,7 +668,7 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk) arena->spare = chunk; malloc_mutex_unlock(&arena->lock); - chunk_dealloc((void *)spare, chunksize, true); + chunk_dealloc(arena, (void *)spare, chunksize, true); malloc_mutex_lock(&arena->lock); if (config_stats) arena->stats.mapped -= chunksize; @@ -2319,6 +2319,8 @@ arena_new(arena_t *arena, unsigned ind) arena->ind = ind; arena->nthreads = 0; + arena->chunk_alloc = chunk_alloc_default; + arena->chunk_dealloc = (chunk_dealloc_t *)chunk_unmap; if (malloc_mutex_init(&arena->lock)) return (true); diff --git a/src/base.c b/src/base.c index 03dcf8f..e8b312e 100644 --- a/src/base.c +++ b/src/base.c @@ -32,7 +32,7 @@ base_pages_alloc(size_t minsize) assert(minsize != 0); csize = CHUNK_CEILING(minsize); zero = false; - base_pages = chunk_alloc(csize, chunksize, true, &zero, + base_pages = chunk_alloc(NULL, csize, chunksize, true, &zero, chunk_dss_prec_get()); if (base_pages == NULL) return (true); diff --git a/src/chunk.c b/src/chunk.c index 246324a..8bb0722 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -104,7 +104,7 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, malloc_mutex_unlock(&chunks_mtx); node = base_node_alloc(); if (node == NULL) { - chunk_dealloc(ret, size, true); + chunk_dealloc(NULL, ret, size, true); return (NULL); } malloc_mutex_lock(&chunks_mtx); @@ -141,8 +141,8 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, * takes advantage of this to avoid demanding zeroed chunks, but taking * advantage of them if they are returned. */ -void * -chunk_alloc(size_t size, size_t alignment, bool base, bool *zero, +static void * +chunk_alloc_core(size_t size, size_t alignment, bool base, bool *zero, dss_prec_t dss_prec) { void *ret; @@ -156,32 +156,56 @@ chunk_alloc(size_t size, size_t alignment, bool base, bool *zero, if (have_dss && dss_prec == dss_prec_primary) { if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size, alignment, base, zero)) != NULL) - goto label_return; + return (ret); if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL) - goto label_return; + return (ret); } /* mmap. */ if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, size, alignment, base, zero)) != NULL) - goto label_return; + return (ret); if ((ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) - goto label_return; + return (ret); /* "secondary" dss. */ if (have_dss && dss_prec == dss_prec_secondary) { if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size, alignment, base, zero)) != NULL) - goto label_return; + return (ret); if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL) - goto label_return; + return (ret); } /* All strategies for allocation failed. 
*/ - ret = NULL; -label_return: + return (NULL); +} + +/* + * Default arena chunk allocation routine in the absence of user-override. + */ +void * +chunk_alloc_default(size_t size, size_t alignment, bool *zero, + unsigned arena_ind) +{ + + return (chunk_alloc_core(size, alignment, false, zero, + arenas[arena_ind]->dss_prec)); +} + +void * +chunk_alloc(arena_t *arena, size_t size, size_t alignment, bool base, + bool *zero, dss_prec_t dss_prec) +{ + void *ret; + + if (arena) + ret = arena->chunk_alloc(size, alignment, zero, arena->ind); + else + ret = chunk_alloc_core(size, alignment, base, zero, dss_prec); + if (ret != NULL) { if (config_ivsalloc && base == false) { if (rtree_set(chunks_rtree, (uintptr_t)ret, 1)) { - chunk_dealloc(ret, size, true); + chunk_dealloc(arena, ret, size, true); return (NULL); } } @@ -312,7 +336,7 @@ chunk_unmap(void *chunk, size_t size) } void -chunk_dealloc(void *chunk, size_t size, bool unmap) +chunk_dealloc(arena_t *arena, void *chunk, size_t size, bool unmap) { assert(chunk != NULL); @@ -329,8 +353,12 @@ chunk_dealloc(void *chunk, size_t size, bool unmap) malloc_mutex_unlock(&chunks_mtx); } - if (unmap) - chunk_unmap(chunk, size); + if (unmap) { + if (arena) + arena->chunk_dealloc(chunk, size, arena->ind); + else + chunk_unmap(chunk, size); + } } bool diff --git a/src/ctl.c b/src/ctl.c index 9ee5de9..395c32a 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -113,6 +113,8 @@ CTL_PROTO(opt_prof_accum) CTL_PROTO(arena_i_purge) static void arena_purge(unsigned arena_ind); CTL_PROTO(arena_i_dss) +CTL_PROTO(arena_i_chunk_alloc) +CTL_PROTO(arena_i_chunk_dealloc) INDEX_PROTO(arena_i) CTL_PROTO(arenas_bin_i_size) CTL_PROTO(arenas_bin_i_nregs) @@ -251,9 +253,15 @@ static const ctl_named_node_t opt_node[] = { {NAME("prof_accum"), CTL(opt_prof_accum)} }; +static const ctl_named_node_t chunk_node[] = { + {NAME("alloc"), CTL(arena_i_chunk_alloc)}, + {NAME("dealloc"), CTL(arena_i_chunk_dealloc)} +}; + static const ctl_named_node_t arena_i_node[] = { {NAME("purge"), CTL(arena_i_purge)}, - {NAME("dss"), CTL(arena_i_dss)} + {NAME("dss"), CTL(arena_i_dss)}, + {NAME("chunk"), CHILD(named, chunk)}, }; static const ctl_named_node_t super_arena_i_node[] = { {NAME(""), CHILD(named, arena_i)} @@ -1368,6 +1376,57 @@ label_return: return (ret); } +static int +arena_i_chunk_alloc_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + unsigned arena_ind = mib[1]; + arena_t *arena; + + malloc_mutex_lock(&ctl_mtx); + if (arena_ind < narenas_total && (arena = arenas[arena_ind]) != NULL) { + malloc_mutex_lock(&arena->lock); + READ(arena->chunk_alloc, chunk_alloc_t *); + WRITE(arena->chunk_alloc, chunk_alloc_t *); + } else { + ret = EFAULT; + goto label_outer_return; + } + ret = 0; +label_return: + malloc_mutex_unlock(&arena->lock); +label_outer_return: + malloc_mutex_unlock(&ctl_mtx); + return (ret); +} + +static int +arena_i_chunk_dealloc_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + + int ret; + unsigned arena_ind = mib[1]; + arena_t *arena; + + malloc_mutex_lock(&ctl_mtx); + if (arena_ind < narenas_total && (arena = arenas[arena_ind]) != NULL) { + malloc_mutex_lock(&arena->lock); + READ(arena->chunk_dealloc, chunk_dealloc_t *); + WRITE(arena->chunk_dealloc, chunk_dealloc_t *); + } else { + ret = EFAULT; + goto label_outer_return; + } + ret = 0; +label_return: + malloc_mutex_unlock(&arena->lock); +label_outer_return: + malloc_mutex_unlock(&ctl_mtx); + return (ret); +} + static const 
ctl_named_node_t * arena_i_index(const size_t *mib, size_t miblen, size_t i) { diff --git a/src/huge.c b/src/huge.c index e725fd9..ab05c90 100644 --- a/src/huge.c +++ b/src/huge.c @@ -16,14 +16,15 @@ malloc_mutex_t huge_mtx; static extent_tree_t huge; void * -huge_malloc(size_t size, bool zero, dss_prec_t dss_prec) +huge_malloc(arena_t *arena, size_t size, bool zero, dss_prec_t dss_prec) { - return (huge_palloc(size, chunksize, zero, dss_prec)); + return (huge_palloc(arena, size, chunksize, zero, dss_prec)); } void * -huge_palloc(size_t size, size_t alignment, bool zero, dss_prec_t dss_prec) +huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero, + dss_prec_t dss_prec) { void *ret; size_t csize; @@ -48,7 +49,7 @@ huge_palloc(size_t size, size_t alignment, bool zero, dss_prec_t dss_prec) * it is possible to make correct junk/zero fill decisions below. */ is_zeroed = zero; - ret = chunk_alloc(csize, alignment, false, &is_zeroed, dss_prec); + ret = chunk_alloc(arena, csize, alignment, false, &is_zeroed, dss_prec); if (ret == NULL) { base_node_dealloc(node); return (NULL); @@ -57,6 +58,7 @@ huge_palloc(size_t size, size_t alignment, bool zero, dss_prec_t dss_prec) /* Insert node into huge. */ node->addr = ret; node->size = csize; + node->arena = arena; malloc_mutex_lock(&huge_mtx); extent_tree_ad_insert(&huge, node); @@ -96,8 +98,9 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) } void * -huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, - size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec) +huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, + size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc, + dss_prec_t dss_prec) { void *ret; size_t copysize; @@ -112,18 +115,18 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, * space and copying. */ if (alignment > chunksize) - ret = huge_palloc(size + extra, alignment, zero, dss_prec); + ret = huge_palloc(arena, size + extra, alignment, zero, dss_prec); else - ret = huge_malloc(size + extra, zero, dss_prec); + ret = huge_malloc(arena, size + extra, zero, dss_prec); if (ret == NULL) { if (extra == 0) return (NULL); /* Try again, this time without extra. 
*/ if (alignment > chunksize) - ret = huge_palloc(size, alignment, zero, dss_prec); + ret = huge_palloc(arena, size, alignment, zero, dss_prec); else - ret = huge_malloc(size, zero, dss_prec); + ret = huge_malloc(arena, size, zero, dss_prec); if (ret == NULL) return (NULL); @@ -238,7 +241,7 @@ huge_dalloc(void *ptr, bool unmap) if (unmap) huge_dalloc_junk(node->addr, node->size); - chunk_dealloc(node->addr, node->size, unmap); + chunk_dealloc(node->arena, node->addr, node->size, unmap); base_node_dealloc(node); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 289d7f7..e0f9275 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1983,7 +1983,7 @@ a0alloc(size_t size, bool zero) if (size <= arena_maxclass) return (arena_malloc(arenas[0], size, zero, false)); else - return (huge_malloc(size, zero, huge_dss_prec_get(arenas[0]))); + return (huge_malloc(NULL, size, zero, huge_dss_prec_get(arenas[0]))); } void * diff --git a/test/integration/chunk.c b/test/integration/chunk.c new file mode 100644 index 0000000..1365989 --- /dev/null +++ b/test/integration/chunk.c @@ -0,0 +1,61 @@ +#include "test/jemalloc_test.h" + +chunk_alloc_t *old_alloc; +chunk_dealloc_t *old_dealloc; + +bool +chunk_dealloc(void *chunk, size_t size, unsigned arena_ind) +{ + + return (old_dealloc(chunk, size, arena_ind)); +} + +void * +chunk_alloc(size_t size, size_t alignment, bool *zero, unsigned arena_ind) +{ + + return (old_alloc(size, alignment, zero, arena_ind)); +} + +TEST_BEGIN(test_chunk) +{ + void *p; + chunk_alloc_t *new_alloc; + chunk_dealloc_t *new_dealloc; + size_t old_size, new_size; + + new_alloc = chunk_alloc; + new_dealloc = chunk_dealloc; + old_size = sizeof(chunk_alloc_t *); + new_size = sizeof(chunk_alloc_t *); + + assert_d_eq(mallctl("arena.0.chunk.alloc", &old_alloc, + &old_size, &new_alloc, new_size), 0, + "Unexpected alloc error"); + assert_ptr_ne(old_alloc, new_alloc, + "Unexpected alloc error"); + assert_d_eq(mallctl("arena.0.chunk.dealloc", &old_dealloc, + &old_size, &new_dealloc, new_size), 0, + "Unexpected dealloc error"); + assert_ptr_ne(old_dealloc, new_dealloc, + "Unexpected dealloc error"); + + p = mallocx(42, 0); + assert_ptr_ne(p, NULL, "Unexpected alloc error"); + free(p); + + assert_d_eq(mallctl("arena.0.chunk.alloc", NULL, + NULL, &old_alloc, old_size), 0, + "Unexpected alloc error"); + assert_d_eq(mallctl("arena.0.chunk.dealloc", NULL, + NULL, &old_dealloc, old_size), 0, + "Unexpected dealloc error"); +} +TEST_END + +int +main(void) +{ + + return (test(test_chunk)); +} -- cgit v0.12 From e2deab7a751c8080c2b2cdcfd7b11887332be1bb Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 15 May 2014 22:22:27 -0700 Subject: Refactor huge allocation to be managed by arenas. Refactor huge allocation to be managed by arenas (though the global red-black tree of huge allocations remains for lookup during deallocation). This is the logical conclusion of recent changes that 1) made per arena dss precedence apply to huge allocation, and 2) made it possible to replace the per arena chunk allocation/deallocation functions. Remove the top level huge stats, and replace them with per arena huge stats. Normalize function names and types to *dalloc* (some were *dealloc*). Remove the --enable-mremap option. As jemalloc currently operates, this is a performace regression for some applications, but planned work to logarithmically space huge size classes should provide similar amortized performance. 
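One user-visible side of this change, sketched below under the assumption that statistics are enabled: the global stats.huge.* mallctls disappear, and the same counters are instead read per arena from stats.arenas.<i>.huge.* (documented in the diff that follows). Names and types are taken from that documentation; the helper function itself is illustrative.

#include <stdio.h>
#include <inttypes.h>
#include <jemalloc/jemalloc.h>

static void
print_arena0_huge_stats(void)
{
	uint64_t epoch = 1;
	uint64_t nmalloc;
	size_t allocated;
	size_t sz;

	/* Refresh jemalloc's cached statistics before reading them. */
	sz = sizeof(epoch);
	mallctl("epoch", NULL, NULL, &epoch, sz);

	sz = sizeof(allocated);
	mallctl("stats.arenas.0.huge.allocated", &allocated, &sz, NULL, 0);
	sz = sizeof(nmalloc);
	mallctl("stats.arenas.0.huge.nmalloc", &nmalloc, &sz, NULL, 0);

	printf("arena 0 huge: allocated=%zu nmalloc=%" PRIu64 "\n",
	    allocated, nmalloc);
}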
The motivation for this change was that mremap-based huge reallocation forced leaky abstractions that prevented refactoring. --- INSTALL | 6 - Makefile.in | 1 - configure.ac | 28 ---- doc/jemalloc.xml.in | 128 +++++++++--------- include/jemalloc/internal/arena.h | 7 +- include/jemalloc/internal/base.h | 2 +- include/jemalloc/internal/chunk.h | 8 +- include/jemalloc/internal/chunk_mmap.h | 2 +- include/jemalloc/internal/ctl.h | 5 - include/jemalloc/internal/huge.h | 20 +-- include/jemalloc/internal/jemalloc_internal.h.in | 23 +--- .../jemalloc/internal/jemalloc_internal_defs.h.in | 7 - include/jemalloc/internal/private_symbols.txt | 13 +- include/jemalloc/internal/stats.h | 5 + include/jemalloc/jemalloc_protos.h.in | 2 +- src/arena.c | 113 ++++++++++++++-- src/base.c | 12 +- src/chunk.c | 145 ++++++++++++--------- src/chunk_mmap.c | 2 +- src/ctl.c | 68 +++++----- src/huge.c | 120 +++-------------- src/jemalloc.c | 4 +- src/stats.c | 29 ++--- test/integration/chunk.c | 23 ++-- test/integration/mremap.c | 45 ------- test/unit/junk.c | 9 +- test/unit/mallctl.c | 1 - test/unit/stats.c | 18 ++- 28 files changed, 380 insertions(+), 466 deletions(-) delete mode 100644 test/integration/mremap.c diff --git a/INSTALL b/INSTALL index 07f51d1..2df667c 100644 --- a/INSTALL +++ b/INSTALL @@ -132,12 +132,6 @@ any of the following arguments (not a definitive list) to 'configure': released in bulk, thus reducing the total number of mutex operations. See the "opt.tcache" option for usage details. ---enable-mremap - Enable huge realloc() via mremap(2). mremap() is disabled by default - because the flavor used is specific to Linux, which has a quirk in its - virtual memory allocation algorithm that causes semi-permanent VM map holes - under normal jemalloc operation. - --disable-munmap Disable virtual memory deallocation via munmap(2); instead keep track of the virtual memory for later use. munmap() is disabled by default (i.e. diff --git a/Makefile.in b/Makefile.in index 800dd08..90869eb 100644 --- a/Makefile.in +++ b/Makefile.in @@ -137,7 +137,6 @@ TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/allocated.c \ $(srcroot)test/integration/mallocx.c \ $(srcroot)test/integration/MALLOCX_ARENA.c \ - $(srcroot)test/integration/mremap.c \ $(srcroot)test/integration/posix_memalign.c \ $(srcroot)test/integration/rallocx.c \ $(srcroot)test/integration/thread_arena.c \ diff --git a/configure.ac b/configure.ac index eb9ca45..57015d1 100644 --- a/configure.ac +++ b/configure.ac @@ -793,33 +793,6 @@ if test "x$enable_tcache" = "x1" ; then fi AC_SUBST([enable_tcache]) -dnl Disable mremap() for huge realloc() by default. -AC_ARG_ENABLE([mremap], - [AS_HELP_STRING([--enable-mremap], [Enable mremap(2) for huge realloc()])], -[if test "x$enable_mremap" = "xno" ; then - enable_mremap="0" -else - enable_mremap="1" -fi -], -[enable_mremap="0"] -) -if test "x$enable_mremap" = "x1" ; then - JE_COMPILABLE([mremap(...MREMAP_FIXED...)], [ -#define _GNU_SOURCE -#include -], [ -void *p = mremap((void *)0, 0, 0, MREMAP_MAYMOVE|MREMAP_FIXED, (void *)0); -], [je_cv_mremap_fixed]) - if test "x${je_cv_mremap_fixed}" = "xno" ; then - enable_mremap="0" - fi -fi -if test "x$enable_mremap" = "x1" ; then - AC_DEFINE([JEMALLOC_MREMAP], [ ]) -fi -AC_SUBST([enable_mremap]) - dnl Enable VM deallocation via munmap() by default. 
AC_ARG_ENABLE([munmap], [AS_HELP_STRING([--disable-munmap], [Disable VM deallocation via munmap(2)])], @@ -1447,7 +1420,6 @@ AC_MSG_RESULT([fill : ${enable_fill}]) AC_MSG_RESULT([utrace : ${enable_utrace}]) AC_MSG_RESULT([valgrind : ${enable_valgrind}]) AC_MSG_RESULT([xmalloc : ${enable_xmalloc}]) -AC_MSG_RESULT([mremap : ${enable_mremap}]) AC_MSG_RESULT([munmap : ${enable_munmap}]) AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}]) AC_MSG_RESULT([tls : ${enable_tls}]) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index a7c38b5..46e505f 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -486,10 +486,11 @@ for (i = 0; i < nbins; i++) { User objects are broken into three categories according to size: small, large, and huge. Small objects are smaller than one page. Large objects are smaller than the chunk size. Huge objects are a multiple of - the chunk size. Small and large objects are managed by arenas; huge - objects are managed separately in a single data structure that is shared by - all threads. Huge objects are used by applications infrequently enough - that this single data structure is not a scalability issue. + the chunk size. Small and large objects are managed entirely by arenas; + huge objects are additionally aggregated in a single data structure that is + shared by all threads. Huge objects are typically used by applications + infrequently enough that this single data structure is not a scalability + issue. Each chunk that is managed by an arena tracks its contents as runs of contiguous pages (unused, backing a set of small objects, or backing one @@ -647,16 +648,6 @@ for (i = 0; i < nbins; i++) { during build configuration. - - - config.mremap - (bool) - r- - - was specified during - build configuration. - - config.munmap @@ -1273,14 +1264,9 @@ malloc_conf = "xmalloc:true";]]> Set the precedence of dss allocation as related to mmap allocation for arena <i>, or for all arenas if <i> equals arenas.narenas. Note - that even during huge allocation this setting is read from the arena - that would be chosen for small or large allocation so that applications - can depend on consistent dss versus mmap allocation regardless of - allocation size. See opt.dss for supported - settings. - + linkend="arenas.narenas">arenas.narenas. See + opt.dss for supported + settings. @@ -1291,8 +1277,8 @@ malloc_conf = "xmalloc:true";]]> Get or set the chunk allocation function for arena <i>. If setting, the chunk deallocation function should - also be set via - arena.<i>.chunk.dealloc to a companion + also be set via + arena.<i>.chunk.dalloc to a companion function that knows how to deallocate the chunks. typedef void *(chunk_alloc_t) @@ -1313,13 +1299,18 @@ malloc_conf = "xmalloc:true";]]> size. The alignment parameter is always a power of two at least as large as the chunk size. Zeroing is mandatory if *zero is true upon function - entry. + entry. + + Note that replacing the default chunk allocation function makes + the arena's arena.<i>.dss + setting irrelevant. - + - arena.<i>.chunk.dealloc - (chunk_dealloc_t *) + arena.<i>.chunk.dalloc + (chunk_dalloc_t *) rw Get or set the chunk deallocation function for arena @@ -1335,13 +1326,13 @@ malloc_conf = "xmalloc:true";]]> created chunks prior to the application having an opportunity to take over chunk allocation. 
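The arenas.extend workflow mentioned above can be sketched as follows (assumed usage, error checking omitted; my_chunk_alloc and my_chunk_dalloc stand for application-supplied functions conforming to chunk_alloc_t and chunk_dalloc_t):

#include <stdio.h>
#include <jemalloc/jemalloc.h>

/* Application-supplied hooks (hypothetical), defined elsewhere. */
extern chunk_alloc_t my_chunk_alloc;
extern chunk_dalloc_t my_chunk_dalloc;

static void
use_arena_with_custom_chunks(void)
{
	unsigned arena_ind;
	size_t sz = sizeof(arena_ind);
	size_t hooksz = sizeof(chunk_alloc_t *);
	chunk_alloc_t *alloc_hook = my_chunk_alloc;
	chunk_dalloc_t *dalloc_hook = my_chunk_dalloc;
	char cmd[64];
	void *p;

	/* Create a fresh arena; it has not allocated any chunks yet. */
	mallctl("arenas.extend", &arena_ind, &sz, NULL, 0);

	/* Install both hooks before the arena is ever used. */
	snprintf(cmd, sizeof(cmd), "arena.%u.chunk.alloc", arena_ind);
	mallctl(cmd, NULL, NULL, &alloc_hook, hooksz);
	snprintf(cmd, sizeof(cmd), "arena.%u.chunk.dalloc", arena_ind);
	mallctl(cmd, NULL, NULL, &dalloc_hook, hooksz);

	/* Allocations bound to this arena are now backed by the hooks. */
	p = mallocx(4096, MALLOCX_ARENA(arena_ind));
	dallocx(p, 0);
}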
- typedef void (chunk_dealloc_t) + typedef void (chunk_dalloc_t) void *chunk size_t size unsigned arena_ind A chunk deallocation function conforms to the - chunk_dealloc_t type and deallocates a + chunk_dalloc_t type and deallocates a chunk of given size on behalf of arena arena_ind. @@ -1608,39 +1599,6 @@ malloc_conf = "xmalloc:true";]]> - - - stats.huge.allocated - (size_t) - r- - [] - - Number of bytes currently allocated by huge objects. - - - - - - stats.huge.nmalloc - (uint64_t) - r- - [] - - Cumulative number of huge allocation requests. - - - - - - stats.huge.ndalloc - (uint64_t) - r- - [] - - Cumulative number of huge deallocation requests. - - - stats.arenas.<i>.dss @@ -1817,6 +1775,50 @@ malloc_conf = "xmalloc:true";]]> + + + stats.arenas.<i>.huge.allocated + (size_t) + r- + [] + + Number of bytes currently allocated by huge objects. + + + + + + stats.arenas.<i>.huge.nmalloc + (uint64_t) + r- + [] + + Cumulative number of huge allocation requests served + directly by the arena. + + + + + stats.arenas.<i>.huge.ndalloc + (uint64_t) + r- + [] + + Cumulative number of huge deallocation requests served + directly by the arena. + + + + + stats.arenas.<i>.huge.nrequests + (uint64_t) + r- + [] + + Cumulative number of huge allocation requests. + + + stats.arenas.<i>.bins.<j>.allocated diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index d50159b..598a89b 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -345,7 +345,7 @@ struct arena_s { */ arena_chunk_t *spare; - /* Number of pages in active runs. */ + /* Number of pages in active runs and huge regions. */ size_t nactive; /* @@ -374,7 +374,7 @@ struct arena_s { * user-configureable chunk allocation and deallocation functions. */ chunk_alloc_t *chunk_alloc; - chunk_dealloc_t *chunk_dealloc; + chunk_dalloc_t *chunk_dalloc; /* bins is used to store trees of free regions. */ arena_bin_t bins[NBINS]; @@ -403,6 +403,9 @@ extern arena_bin_info_t arena_bin_info[NBINS]; /* Number of large size classes. */ #define nlclasses (chunk_npages - map_bias) +void *arena_chunk_alloc_huge(arena_t *arena, size_t size, size_t alignment, + bool *zero); +void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t size); void arena_purge_all(arena_t *arena); void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind, uint64_t prof_accumbytes); diff --git a/include/jemalloc/internal/base.h b/include/jemalloc/internal/base.h index 9cf75ff..3fb80b9 100644 --- a/include/jemalloc/internal/base.h +++ b/include/jemalloc/internal/base.h @@ -12,7 +12,7 @@ void *base_alloc(size_t size); void *base_calloc(size_t number, size_t size); extent_node_t *base_node_alloc(void); -void base_node_dealloc(extent_node_t *node); +void base_node_dalloc(extent_node_t *node); bool base_boot(void); void base_prefork(void); void base_postfork_parent(void); diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index cea0e8a..f3bfbe0 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -43,12 +43,14 @@ extern size_t chunk_npages; extern size_t map_bias; /* Number of arena chunk header pages. */ extern size_t arena_maxclass; /* Max size class for arenas. 
*/ -void *chunk_alloc(arena_t *arena, size_t size, size_t alignment, bool base, - bool *zero, dss_prec_t dss_prec); +void *chunk_alloc_base(size_t size); +void *chunk_alloc_arena(chunk_alloc_t *chunk_alloc, + chunk_dalloc_t *chunk_dalloc, unsigned arena_ind, size_t size, + size_t alignment, bool *zero); void *chunk_alloc_default(size_t size, size_t alignment, bool *zero, unsigned arena_ind); void chunk_unmap(void *chunk, size_t size); -void chunk_dealloc(arena_t *arena, void *chunk, size_t size, bool unmap); +bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); diff --git a/include/jemalloc/internal/chunk_mmap.h b/include/jemalloc/internal/chunk_mmap.h index f24abac..c5d5c6c 100644 --- a/include/jemalloc/internal/chunk_mmap.h +++ b/include/jemalloc/internal/chunk_mmap.h @@ -12,7 +12,7 @@ bool pages_purge(void *addr, size_t length); void *chunk_alloc_mmap(size_t size, size_t alignment, bool *zero); -bool chunk_dealloc_mmap(void *chunk, size_t size); +bool chunk_dalloc_mmap(void *chunk, size_t size); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index 0ffecc5..2d301bf 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -57,11 +57,6 @@ struct ctl_stats_s { uint64_t total; /* stats_chunks.nchunks */ size_t high; /* stats_chunks.highchunks */ } chunks; - struct { - size_t allocated; /* huge_allocated */ - uint64_t nmalloc; /* huge_nmalloc */ - uint64_t ndalloc; /* huge_ndalloc */ - } huge; unsigned narenas; ctl_arena_stats_t *arenas; /* (narenas + 1) elements. */ }; diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index ab8d44a..1e54536 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -9,30 +9,18 @@ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -/* Huge allocation statistics. */ -extern uint64_t huge_nmalloc; -extern uint64_t huge_ndalloc; -extern size_t huge_allocated; - -/* Protects chunk-related data structures. 
*/ -extern malloc_mutex_t huge_mtx; - -void *huge_malloc(arena_t *arena, size_t size, bool zero, - dss_prec_t dss_prec); -void *huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero, - dss_prec_t dss_prec); +void *huge_malloc(arena_t *arena, size_t size, bool zero); +void *huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero); bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra); void *huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc, - dss_prec_t dss_prec); + size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc); #ifdef JEMALLOC_JET typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif -void huge_dalloc(void *ptr, bool unmap); +void huge_dalloc(void *ptr); size_t huge_salloc(const void *ptr); -dss_prec_t huge_dss_prec_get(arena_t *arena); prof_ctx_t *huge_prof_ctx_get(const void *ptr); void huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); bool huge_boot(void); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 9e779c6..c9462e5 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -122,13 +122,6 @@ static const bool config_prof_libunwind = false #endif ; -static const bool config_mremap = -#ifdef JEMALLOC_MREMAP - true -#else - false -#endif - ; static const bool config_munmap = #ifdef JEMALLOC_MUNMAP true @@ -702,8 +695,7 @@ imalloct(size_t size, bool try_tcache, arena_t *arena) if (size <= arena_maxclass) return (arena_malloc(arena, size, false, try_tcache)); else - return (huge_malloc(arena, size, false, - huge_dss_prec_get(arena))); + return (huge_malloc(arena, size, false)); } JEMALLOC_ALWAYS_INLINE void * @@ -720,8 +712,7 @@ icalloct(size_t size, bool try_tcache, arena_t *arena) if (size <= arena_maxclass) return (arena_malloc(arena, size, true, try_tcache)); else - return (huge_malloc(arena, size, true, - huge_dss_prec_get(arena))); + return (huge_malloc(arena, size, true)); } JEMALLOC_ALWAYS_INLINE void * @@ -747,11 +738,9 @@ ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache, ret = arena_palloc(choose_arena(arena), usize, alignment, zero); } else if (alignment <= chunksize) - ret = huge_malloc(arena, usize, zero, - huge_dss_prec_get(arena)); + ret = huge_malloc(arena, usize, zero); else - ret = huge_palloc(arena, usize, alignment, zero, - huge_dss_prec_get(arena)); + ret = huge_palloc(arena, usize, alignment, zero); } assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); @@ -833,7 +822,7 @@ idalloct(void *ptr, bool try_tcache) if (chunk != ptr) arena_dalloc(chunk, ptr, try_tcache); else - huge_dalloc(ptr, true); + huge_dalloc(ptr); } JEMALLOC_ALWAYS_INLINE void @@ -920,7 +909,7 @@ iralloct(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, try_tcache_dalloc)); } else { return (huge_ralloc(arena, ptr, oldsize, size, extra, - alignment, zero, try_tcache_dalloc, huge_dss_prec_get(arena))); + alignment, zero, try_tcache_dalloc)); } } diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index fc95967..09ddd4f 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -144,13 +144,6 @@ */ #undef JEMALLOC_MUNMAP -/* - * If defined, use mremap(...MREMAP_FIXED...) for huge realloc(). 
This is - * disabled by default because it is Linux-specific and it will cause virtual - * memory map holes, much like munmap(2) does. - */ -#undef JEMALLOC_MREMAP - /* TLS is used to map arenas and magazine caches to threads. */ #undef JEMALLOC_TLS diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 589b56a..f6c4fbc 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -5,6 +5,8 @@ arena_alloc_junk_small arena_bin_index arena_bin_info arena_boot +arena_chunk_alloc_huge +arena_chunk_dalloc_huge arena_dalloc arena_dalloc_bin arena_dalloc_bin_locked @@ -86,7 +88,7 @@ base_alloc base_boot base_calloc base_node_alloc -base_node_dealloc +base_node_dalloc base_postfork_child base_postfork_parent base_prefork @@ -103,13 +105,14 @@ bt_init buferror choose_arena choose_arena_hard -chunk_alloc +chunk_alloc_arena +chunk_alloc_base chunk_alloc_default chunk_alloc_dss chunk_alloc_mmap chunk_boot -chunk_dealloc -chunk_dealloc_mmap +chunk_dalloc_default +chunk_dalloc_mmap chunk_dss_boot chunk_dss_postfork_child chunk_dss_postfork_parent @@ -198,9 +201,7 @@ huge_allocated huge_boot huge_dalloc huge_dalloc_junk -huge_dss_prec_get huge_malloc -huge_mtx huge_ndalloc huge_nmalloc huge_palloc diff --git a/include/jemalloc/internal/stats.h b/include/jemalloc/internal/stats.h index 27f68e3..ce96476 100644 --- a/include/jemalloc/internal/stats.h +++ b/include/jemalloc/internal/stats.h @@ -101,6 +101,11 @@ struct arena_stats_s { uint64_t ndalloc_large; uint64_t nrequests_large; + size_t allocated_huge; + uint64_t nmalloc_huge; + uint64_t ndalloc_huge; + uint64_t nrequests_huge; + /* * One element for each possible size class, including sizes that * overlap with bin size classes. This is necessary because ipalloc() diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index 8e945fa..67268c4 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -46,4 +46,4 @@ JEMALLOC_EXPORT void * @je_@valloc(size_t size) JEMALLOC_ATTR(malloc); #endif typedef void *(chunk_alloc_t)(size_t, size_t, bool *, unsigned); -typedef bool (chunk_dealloc_t)(void *, size_t, unsigned); +typedef bool (chunk_dalloc_t)(void *, size_t, unsigned); diff --git a/src/arena.c b/src/arena.c index 6db2b63..f5d7d06 100644 --- a/src/arena.c +++ b/src/arena.c @@ -560,6 +560,65 @@ arena_chunk_init_spare(arena_t *arena) } static arena_chunk_t * +arena_chunk_alloc_internal(arena_t *arena, size_t size, size_t alignment, + bool *zero) +{ + arena_chunk_t *chunk; + chunk_alloc_t *chunk_alloc; + chunk_dalloc_t *chunk_dalloc; + + chunk_alloc = arena->chunk_alloc; + chunk_dalloc = arena->chunk_dalloc; + malloc_mutex_unlock(&arena->lock); + chunk = (arena_chunk_t *)chunk_alloc_arena(chunk_alloc, chunk_dalloc, + arena->ind, size, alignment, zero); + malloc_mutex_lock(&arena->lock); + if (config_stats && chunk != NULL) + arena->stats.mapped += chunksize; + + return (chunk); +} + +void * +arena_chunk_alloc_huge(arena_t *arena, size_t size, size_t alignment, + bool *zero) +{ + void *ret; + chunk_alloc_t *chunk_alloc; + chunk_dalloc_t *chunk_dalloc; + + malloc_mutex_lock(&arena->lock); + chunk_alloc = arena->chunk_alloc; + chunk_dalloc = arena->chunk_dalloc; + if (config_stats) { + /* Optimistically update stats prior to unlocking. 
*/ + arena->stats.mapped += size; + arena->stats.allocated_huge += size; + arena->stats.nmalloc_huge++; + arena->stats.nrequests_huge++; + } + arena->nactive += (size >> LG_PAGE); + malloc_mutex_unlock(&arena->lock); + + ret = chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, + size, alignment, zero); + if (config_stats) { + if (ret != NULL) + stats_cactive_add(size); + else { + /* Revert optimistic stats updates. */ + malloc_mutex_lock(&arena->lock); + arena->stats.mapped -= size; + arena->stats.allocated_huge -= size; + arena->stats.nmalloc_huge--; + malloc_mutex_unlock(&arena->lock); + } + } + + return (ret); +} + +static arena_chunk_t * arena_chunk_init_hard(arena_t *arena) { arena_chunk_t *chunk; @@ -569,14 +628,9 @@ arena_chunk_init_hard(arena_t *arena) assert(arena->spare == NULL); zero = false; - malloc_mutex_unlock(&arena->lock); - chunk = (arena_chunk_t *)chunk_alloc(arena, chunksize, chunksize, - false, &zero, arena->dss_prec); - malloc_mutex_lock(&arena->lock); + chunk = arena_chunk_alloc_internal(arena, chunksize, chunksize, &zero); if (chunk == NULL) return (NULL); - if (config_stats) - arena->stats.mapped += chunksize; chunk->arena = arena; @@ -645,7 +699,38 @@ arena_chunk_alloc(arena_t *arena) } static void -arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk) +arena_chunk_dalloc_internal(arena_t *arena, arena_chunk_t *chunk) +{ + chunk_dalloc_t *chunk_dalloc; + + chunk_dalloc = arena->chunk_dalloc; + malloc_mutex_unlock(&arena->lock); + chunk_dalloc((void *)chunk, chunksize, arena->ind); + malloc_mutex_lock(&arena->lock); + if (config_stats) + arena->stats.mapped -= chunksize; +} + +void +arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t size) +{ + chunk_dalloc_t *chunk_dalloc; + + malloc_mutex_lock(&arena->lock); + chunk_dalloc = arena->chunk_dalloc; + if (config_stats) { + arena->stats.mapped -= size; + arena->stats.allocated_huge -= size; + arena->stats.ndalloc_huge++; + stats_cactive_sub(size); + } + arena->nactive -= (size >> LG_PAGE); + malloc_mutex_unlock(&arena->lock); + chunk_dalloc(chunk, size, arena->ind); +} + +static void +arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) { assert(arena_mapbits_allocated_get(chunk, map_bias) == 0); assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0); @@ -667,11 +752,7 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk) arena_chunk_t *spare = arena->spare; arena->spare = chunk; - malloc_mutex_unlock(&arena->lock); - chunk_dealloc(arena, (void *)spare, chunksize, true); - malloc_mutex_lock(&arena->lock); - if (config_stats) - arena->stats.mapped -= chunksize; + arena_chunk_dalloc_internal(arena, spare); } else arena->spare = chunk; } @@ -1231,7 +1312,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) if (size == arena_maxclass) { assert(run_ind == map_bias); assert(run_pages == (arena_maxclass >> LG_PAGE)); - arena_chunk_dealloc(arena, chunk); + arena_chunk_dalloc(arena, chunk); } /* @@ -2283,6 +2364,10 @@ arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, astats->nmalloc_large += arena->stats.nmalloc_large; astats->ndalloc_large += arena->stats.ndalloc_large; astats->nrequests_large += arena->stats.nrequests_large; + astats->allocated_huge += arena->stats.allocated_huge; + astats->nmalloc_huge += arena->stats.nmalloc_huge; + astats->ndalloc_huge += arena->stats.ndalloc_huge; + astats->nrequests_huge += arena->stats.nrequests_huge; for (i = 0; i < nlclasses; i++) { lstats[i].nmalloc += arena->stats.lstats[i].nmalloc; @@ -2320,7 
+2405,7 @@ arena_new(arena_t *arena, unsigned ind) arena->ind = ind; arena->nthreads = 0; arena->chunk_alloc = chunk_alloc_default; - arena->chunk_dealloc = (chunk_dealloc_t *)chunk_unmap; + arena->chunk_dalloc = chunk_dalloc_default; if (malloc_mutex_init(&arena->lock)) return (true); diff --git a/src/base.c b/src/base.c index e8b312e..409c7bb 100644 --- a/src/base.c +++ b/src/base.c @@ -17,23 +17,15 @@ static void *base_past_addr; /* Addr immediately past base_pages. */ static extent_node_t *base_nodes; /******************************************************************************/ -/* Function prototypes for non-inline static functions. */ - -static bool base_pages_alloc(size_t minsize); - -/******************************************************************************/ static bool base_pages_alloc(size_t minsize) { size_t csize; - bool zero; assert(minsize != 0); csize = CHUNK_CEILING(minsize); - zero = false; - base_pages = chunk_alloc(NULL, csize, chunksize, true, &zero, - chunk_dss_prec_get()); + base_pages = chunk_alloc_base(csize); if (base_pages == NULL) return (true); base_next_addr = base_pages; @@ -100,7 +92,7 @@ base_node_alloc(void) } void -base_node_dealloc(extent_node_t *node) +base_node_dalloc(extent_node_t *node) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t)); diff --git a/src/chunk.c b/src/chunk.c index 8bb0722..38d0286 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -31,13 +31,12 @@ size_t map_bias; size_t arena_maxclass; /* Max size class for arenas. */ /******************************************************************************/ -/* Function prototypes for non-inline static functions. */ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ -static void *chunk_recycle(extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, size_t size, size_t alignment, bool base, - bool *zero); -static void chunk_record(extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, void *chunk, size_t size); +static void chunk_dalloc_core(void *chunk, size_t size); /******************************************************************************/ @@ -104,7 +103,7 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, malloc_mutex_unlock(&chunks_mtx); node = base_node_alloc(); if (node == NULL) { - chunk_dealloc(NULL, ret, size, true); + chunk_dalloc_core(ret, size); return (NULL); } malloc_mutex_lock(&chunks_mtx); @@ -119,7 +118,7 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, malloc_mutex_unlock(&chunks_mtx); if (node != NULL) - base_node_dealloc(node); + base_node_dalloc(node); if (*zero) { if (zeroed == false) memset(ret, 0, size); @@ -179,60 +178,82 @@ chunk_alloc_core(size_t size, size_t alignment, bool base, bool *zero, return (NULL); } -/* - * Default arena chunk allocation routine in the absence of user-override. 
- */ -void * -chunk_alloc_default(size_t size, size_t alignment, bool *zero, - unsigned arena_ind) +static bool +chunk_register(void *chunk, size_t size, bool base) { - return (chunk_alloc_core(size, alignment, false, zero, - arenas[arena_ind]->dss_prec)); + assert(chunk != NULL); + assert(CHUNK_ADDR2BASE(chunk) == chunk); + + if (config_ivsalloc && base == false) { + if (rtree_set(chunks_rtree, (uintptr_t)chunk, 1)) + return (true); + } + if (config_stats || config_prof) { + bool gdump; + malloc_mutex_lock(&chunks_mtx); + if (config_stats) + stats_chunks.nchunks += (size / chunksize); + stats_chunks.curchunks += (size / chunksize); + if (stats_chunks.curchunks > stats_chunks.highchunks) { + stats_chunks.highchunks = + stats_chunks.curchunks; + if (config_prof) + gdump = true; + } else if (config_prof) + gdump = false; + malloc_mutex_unlock(&chunks_mtx); + if (config_prof && opt_prof && opt_prof_gdump && gdump) + prof_gdump(); + } + if (config_valgrind) + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(chunk, size); + return (false); } void * -chunk_alloc(arena_t *arena, size_t size, size_t alignment, bool base, - bool *zero, dss_prec_t dss_prec) +chunk_alloc_base(size_t size) { void *ret; + bool zero; - if (arena) - ret = arena->chunk_alloc(size, alignment, zero, arena->ind); - else - ret = chunk_alloc_core(size, alignment, base, zero, dss_prec); + zero = false; + ret = chunk_alloc_core(size, chunksize, true, &zero, + chunk_dss_prec_get()); + if (ret == NULL) + return (NULL); + if (chunk_register(ret, size, true)) { + chunk_dalloc_core(ret, size); + return (NULL); + } + return (ret); +} - if (ret != NULL) { - if (config_ivsalloc && base == false) { - if (rtree_set(chunks_rtree, (uintptr_t)ret, 1)) { - chunk_dealloc(arena, ret, size, true); - return (NULL); - } - } - if (config_stats || config_prof) { - bool gdump; - malloc_mutex_lock(&chunks_mtx); - if (config_stats) - stats_chunks.nchunks += (size / chunksize); - stats_chunks.curchunks += (size / chunksize); - if (stats_chunks.curchunks > stats_chunks.highchunks) { - stats_chunks.highchunks = - stats_chunks.curchunks; - if (config_prof) - gdump = true; - } else if (config_prof) - gdump = false; - malloc_mutex_unlock(&chunks_mtx); - if (config_prof && opt_prof && opt_prof_gdump && gdump) - prof_gdump(); - } - if (config_valgrind) - JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); +void * +chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc, + unsigned arena_ind, size_t size, size_t alignment, bool *zero) +{ + void *ret; + + ret = chunk_alloc(size, alignment, zero, arena_ind); + if (ret != NULL && chunk_register(ret, size, false)) { + chunk_dalloc(ret, size, arena_ind); + ret = NULL; } - assert(CHUNK_ADDR2BASE(ret) == ret); + return (ret); } +/* Default arena chunk allocation routine in the absence of user override. */ +void * +chunk_alloc_default(size_t size, size_t alignment, bool *zero, + unsigned arena_ind) +{ + + return (chunk_alloc_core(size, alignment, false, zero, + arenas[arena_ind]->dss_prec)); +} + static void chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk, size_t size) @@ -316,9 +337,9 @@ label_return: * avoid potential deadlock. 
*/ if (xnode != NULL) - base_node_dealloc(xnode); + base_node_dalloc(xnode); if (xprev != NULL) - base_node_dealloc(xprev); + base_node_dalloc(xprev); } void @@ -331,12 +352,12 @@ chunk_unmap(void *chunk, size_t size) if (have_dss && chunk_in_dss(chunk)) chunk_record(&chunks_szad_dss, &chunks_ad_dss, chunk, size); - else if (chunk_dealloc_mmap(chunk, size)) + else if (chunk_dalloc_mmap(chunk, size)) chunk_record(&chunks_szad_mmap, &chunks_ad_mmap, chunk, size); } -void -chunk_dealloc(arena_t *arena, void *chunk, size_t size, bool unmap) +static void +chunk_dalloc_core(void *chunk, size_t size) { assert(chunk != NULL); @@ -353,12 +374,16 @@ chunk_dealloc(arena_t *arena, void *chunk, size_t size, bool unmap) malloc_mutex_unlock(&chunks_mtx); } - if (unmap) { - if (arena) - arena->chunk_dealloc(chunk, size, arena->ind); - else - chunk_unmap(chunk, size); - } + chunk_unmap(chunk, size); +} + +/* Default arena chunk deallocation routine in the absence of user override. */ +bool +chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) +{ + + chunk_dalloc_core(chunk, size); + return (false); } bool diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index 2056d79..f960e06 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -200,7 +200,7 @@ chunk_alloc_mmap(size_t size, size_t alignment, bool *zero) } bool -chunk_dealloc_mmap(void *chunk, size_t size) +chunk_dalloc_mmap(void *chunk, size_t size) { if (config_munmap) diff --git a/src/ctl.c b/src/ctl.c index 395c32a..a193605 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -76,7 +76,6 @@ CTL_PROTO(thread_deallocatedp) CTL_PROTO(config_debug) CTL_PROTO(config_fill) CTL_PROTO(config_lazy_lock) -CTL_PROTO(config_mremap) CTL_PROTO(config_munmap) CTL_PROTO(config_prof) CTL_PROTO(config_prof_libgcc) @@ -114,7 +113,7 @@ CTL_PROTO(arena_i_purge) static void arena_purge(unsigned arena_ind); CTL_PROTO(arena_i_dss) CTL_PROTO(arena_i_chunk_alloc) -CTL_PROTO(arena_i_chunk_dealloc) +CTL_PROTO(arena_i_chunk_dalloc) INDEX_PROTO(arena_i) CTL_PROTO(arenas_bin_i_size) CTL_PROTO(arenas_bin_i_nregs) @@ -137,9 +136,6 @@ CTL_PROTO(prof_interval) CTL_PROTO(stats_chunks_current) CTL_PROTO(stats_chunks_total) CTL_PROTO(stats_chunks_high) -CTL_PROTO(stats_huge_allocated) -CTL_PROTO(stats_huge_nmalloc) -CTL_PROTO(stats_huge_ndalloc) CTL_PROTO(stats_arenas_i_small_allocated) CTL_PROTO(stats_arenas_i_small_nmalloc) CTL_PROTO(stats_arenas_i_small_ndalloc) @@ -148,6 +144,10 @@ CTL_PROTO(stats_arenas_i_large_allocated) CTL_PROTO(stats_arenas_i_large_nmalloc) CTL_PROTO(stats_arenas_i_large_ndalloc) CTL_PROTO(stats_arenas_i_large_nrequests) +CTL_PROTO(stats_arenas_i_huge_allocated) +CTL_PROTO(stats_arenas_i_huge_nmalloc) +CTL_PROTO(stats_arenas_i_huge_ndalloc) +CTL_PROTO(stats_arenas_i_huge_nrequests) CTL_PROTO(stats_arenas_i_bins_j_allocated) CTL_PROTO(stats_arenas_i_bins_j_nmalloc) CTL_PROTO(stats_arenas_i_bins_j_ndalloc) @@ -214,7 +214,6 @@ static const ctl_named_node_t config_node[] = { {NAME("debug"), CTL(config_debug)}, {NAME("fill"), CTL(config_fill)}, {NAME("lazy_lock"), CTL(config_lazy_lock)}, - {NAME("mremap"), CTL(config_mremap)}, {NAME("munmap"), CTL(config_munmap)}, {NAME("prof"), CTL(config_prof)}, {NAME("prof_libgcc"), CTL(config_prof_libgcc)}, @@ -255,7 +254,7 @@ static const ctl_named_node_t opt_node[] = { static const ctl_named_node_t chunk_node[] = { {NAME("alloc"), CTL(arena_i_chunk_alloc)}, - {NAME("dealloc"), CTL(arena_i_chunk_dealloc)} + {NAME("dalloc"), CTL(arena_i_chunk_dalloc)} }; static const ctl_named_node_t arena_i_node[] = { @@ -321,12 +320,6 @@ 
static const ctl_named_node_t stats_chunks_node[] = { {NAME("high"), CTL(stats_chunks_high)} }; -static const ctl_named_node_t stats_huge_node[] = { - {NAME("allocated"), CTL(stats_huge_allocated)}, - {NAME("nmalloc"), CTL(stats_huge_nmalloc)}, - {NAME("ndalloc"), CTL(stats_huge_ndalloc)} -}; - static const ctl_named_node_t stats_arenas_i_small_node[] = { {NAME("allocated"), CTL(stats_arenas_i_small_allocated)}, {NAME("nmalloc"), CTL(stats_arenas_i_small_nmalloc)}, @@ -341,6 +334,13 @@ static const ctl_named_node_t stats_arenas_i_large_node[] = { {NAME("nrequests"), CTL(stats_arenas_i_large_nrequests)} }; +static const ctl_named_node_t stats_arenas_i_huge_node[] = { + {NAME("allocated"), CTL(stats_arenas_i_huge_allocated)}, + {NAME("nmalloc"), CTL(stats_arenas_i_huge_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_huge_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_huge_nrequests)}, +}; + static const ctl_named_node_t stats_arenas_i_bins_j_node[] = { {NAME("allocated"), CTL(stats_arenas_i_bins_j_allocated)}, {NAME("nmalloc"), CTL(stats_arenas_i_bins_j_nmalloc)}, @@ -385,6 +385,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = { {NAME("purged"), CTL(stats_arenas_i_purged)}, {NAME("small"), CHILD(named, stats_arenas_i_small)}, {NAME("large"), CHILD(named, stats_arenas_i_large)}, + {NAME("huge"), CHILD(named, stats_arenas_i_huge)}, {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)}, {NAME("lruns"), CHILD(indexed, stats_arenas_i_lruns)} }; @@ -402,7 +403,6 @@ static const ctl_named_node_t stats_node[] = { {NAME("active"), CTL(stats_active)}, {NAME("mapped"), CTL(stats_mapped)}, {NAME("chunks"), CHILD(named, stats_chunks)}, - {NAME("huge"), CHILD(named, stats_huge)}, {NAME("arenas"), CHILD(indexed, stats_arenas)} }; @@ -500,6 +500,11 @@ ctl_arena_stats_smerge(ctl_arena_stats_t *sstats, ctl_arena_stats_t *astats) sstats->astats.ndalloc_large += astats->astats.ndalloc_large; sstats->astats.nrequests_large += astats->astats.nrequests_large; + sstats->astats.allocated_huge += astats->astats.allocated_huge; + sstats->astats.nmalloc_huge += astats->astats.nmalloc_huge; + sstats->astats.ndalloc_huge += astats->astats.ndalloc_huge; + sstats->astats.nrequests_huge += astats->astats.nrequests_huge; + for (i = 0; i < nlclasses; i++) { sstats->lstats[i].nmalloc += astats->lstats[i].nmalloc; sstats->lstats[i].ndalloc += astats->lstats[i].ndalloc; @@ -626,12 +631,6 @@ ctl_refresh(void) ctl_stats.chunks.total = stats_chunks.nchunks; ctl_stats.chunks.high = stats_chunks.highchunks; malloc_mutex_unlock(&chunks_mtx); - - malloc_mutex_lock(&huge_mtx); - ctl_stats.huge.allocated = huge_allocated; - ctl_stats.huge.nmalloc = huge_nmalloc; - ctl_stats.huge.ndalloc = huge_ndalloc; - malloc_mutex_unlock(&huge_mtx); } /* @@ -662,10 +661,9 @@ ctl_refresh(void) ctl_stats.allocated = ctl_stats.arenas[ctl_stats.narenas].allocated_small + ctl_stats.arenas[ctl_stats.narenas].astats.allocated_large - + ctl_stats.huge.allocated; + + ctl_stats.arenas[ctl_stats.narenas].astats.allocated_huge; ctl_stats.active = - (ctl_stats.arenas[ctl_stats.narenas].pactive << LG_PAGE) - + ctl_stats.huge.allocated; + (ctl_stats.arenas[ctl_stats.narenas].pactive << LG_PAGE); ctl_stats.mapped = (ctl_stats.chunks.current << opt_lg_chunk); } @@ -1140,7 +1138,6 @@ label_return: CTL_RO_BOOL_CONFIG_GEN(config_debug) CTL_RO_BOOL_CONFIG_GEN(config_fill) CTL_RO_BOOL_CONFIG_GEN(config_lazy_lock) -CTL_RO_BOOL_CONFIG_GEN(config_mremap) CTL_RO_BOOL_CONFIG_GEN(config_munmap) CTL_RO_BOOL_CONFIG_GEN(config_prof) 
CTL_RO_BOOL_CONFIG_GEN(config_prof_libgcc) @@ -1377,8 +1374,8 @@ label_return: } static int -arena_i_chunk_alloc_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, - void *newp, size_t newlen) +arena_i_chunk_alloc_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { int ret; unsigned arena_ind = mib[1]; @@ -1402,8 +1399,8 @@ label_outer_return: } static int -arena_i_chunk_dealloc_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, - void *newp, size_t newlen) +arena_i_chunk_dalloc_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { int ret; @@ -1413,8 +1410,8 @@ arena_i_chunk_dealloc_ctl(const size_t *mib, size_t miblen, void *oldp, size_t * malloc_mutex_lock(&ctl_mtx); if (arena_ind < narenas_total && (arena = arenas[arena_ind]) != NULL) { malloc_mutex_lock(&arena->lock); - READ(arena->chunk_dealloc, chunk_dealloc_t *); - WRITE(arena->chunk_dealloc, chunk_dealloc_t *); + READ(arena->chunk_dalloc, chunk_dalloc_t *); + WRITE(arena->chunk_dalloc, chunk_dalloc_t *); } else { ret = EFAULT; goto label_outer_return; @@ -1611,9 +1608,6 @@ CTL_RO_CGEN(config_stats, stats_chunks_current, ctl_stats.chunks.current, size_t) CTL_RO_CGEN(config_stats, stats_chunks_total, ctl_stats.chunks.total, uint64_t) CTL_RO_CGEN(config_stats, stats_chunks_high, ctl_stats.chunks.high, size_t) -CTL_RO_CGEN(config_stats, stats_huge_allocated, huge_allocated, size_t) -CTL_RO_CGEN(config_stats, stats_huge_nmalloc, huge_nmalloc, uint64_t) -CTL_RO_CGEN(config_stats, stats_huge_ndalloc, huge_ndalloc, uint64_t) CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *) CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned) @@ -1644,6 +1638,14 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_large_ndalloc, ctl_stats.arenas[mib[2]].astats.ndalloc_large, uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_large_nrequests, ctl_stats.arenas[mib[2]].astats.nrequests_large, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_huge_allocated, + ctl_stats.arenas[mib[2]].astats.allocated_huge, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_huge_nmalloc, + ctl_stats.arenas[mib[2]].astats.nmalloc_huge, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_huge_ndalloc, + ctl_stats.arenas[mib[2]].astats.ndalloc_huge, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_huge_nrequests, + ctl_stats.arenas[mib[2]].astats.nrequests_huge, uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_allocated, ctl_stats.arenas[mib[2]].bstats[mib[4]].allocated, size_t) diff --git a/src/huge.c b/src/huge.c index ab05c90..d08ed4a 100644 --- a/src/huge.c +++ b/src/huge.c @@ -4,11 +4,8 @@ /******************************************************************************/ /* Data. */ -uint64_t huge_nmalloc; -uint64_t huge_ndalloc; -size_t huge_allocated; - -malloc_mutex_t huge_mtx; +/* Protects chunk-related data structures. 
*/ +static malloc_mutex_t huge_mtx; /******************************************************************************/ @@ -16,15 +13,14 @@ malloc_mutex_t huge_mtx; static extent_tree_t huge; void * -huge_malloc(arena_t *arena, size_t size, bool zero, dss_prec_t dss_prec) +huge_malloc(arena_t *arena, size_t size, bool zero) { - return (huge_palloc(arena, size, chunksize, zero, dss_prec)); + return (huge_palloc(arena, size, chunksize, zero)); } void * -huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero, - dss_prec_t dss_prec) +huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) { void *ret; size_t csize; @@ -49,9 +45,10 @@ huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero, * it is possible to make correct junk/zero fill decisions below. */ is_zeroed = zero; - ret = chunk_alloc(arena, csize, alignment, false, &is_zeroed, dss_prec); + arena = choose_arena(arena); + ret = arena_chunk_alloc_huge(arena, csize, alignment, &is_zeroed); if (ret == NULL) { - base_node_dealloc(node); + base_node_dalloc(node); return (NULL); } @@ -62,11 +59,6 @@ huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero, malloc_mutex_lock(&huge_mtx); extent_tree_ad_insert(&huge, node); - if (config_stats) { - stats_cactive_add(csize); - huge_nmalloc++; - huge_allocated += csize; - } malloc_mutex_unlock(&huge_mtx); if (config_fill && zero == false) { @@ -99,8 +91,7 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) void * huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc, - dss_prec_t dss_prec) + size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc) { void *ret; size_t copysize; @@ -115,18 +106,18 @@ huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, * space and copying. */ if (alignment > chunksize) - ret = huge_palloc(arena, size + extra, alignment, zero, dss_prec); + ret = huge_palloc(arena, size + extra, alignment, zero); else - ret = huge_malloc(arena, size + extra, zero, dss_prec); + ret = huge_malloc(arena, size + extra, zero); if (ret == NULL) { if (extra == 0) return (NULL); /* Try again, this time without extra. */ if (alignment > chunksize) - ret = huge_palloc(arena, size, alignment, zero, dss_prec); + ret = huge_palloc(arena, size, alignment, zero); else - ret = huge_malloc(arena, size, zero, dss_prec); + ret = huge_malloc(arena, size, zero); if (ret == NULL) return (NULL); @@ -137,59 +128,8 @@ huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, * expectation that the extra bytes will be reliably preserved. */ copysize = (size < oldsize) ? size : oldsize; - -#ifdef JEMALLOC_MREMAP - /* - * Use mremap(2) if this is a huge-->huge reallocation, and neither the - * source nor the destination are in dss. - */ - if (oldsize >= chunksize && (have_dss == false || (chunk_in_dss(ptr) - == false && chunk_in_dss(ret) == false))) { - size_t newsize = huge_salloc(ret); - - /* - * Remove ptr from the tree of huge allocations before - * performing the remap operation, in order to avoid the - * possibility of another thread acquiring that mapping before - * this one removes it from the tree. - */ - huge_dalloc(ptr, false); - if (mremap(ptr, oldsize, newsize, MREMAP_MAYMOVE|MREMAP_FIXED, - ret) == MAP_FAILED) { - /* - * Assuming no chunk management bugs in the allocator, - * the only documented way an error can occur here is - * if the application changed the map type for a - * portion of the old allocation. 
This is firmly in - * undefined behavior territory, so write a diagnostic - * message, and optionally abort. - */ - char buf[BUFERROR_BUF]; - - buferror(get_errno(), buf, sizeof(buf)); - malloc_printf(": Error in mremap(): %s\n", - buf); - if (opt_abort) - abort(); - memcpy(ret, ptr, copysize); - chunk_dealloc_mmap(ptr, oldsize); - } else if (config_fill && zero == false && opt_junk && oldsize - < newsize) { - /* - * mremap(2) clobbers the original mapping, so - * junk/zero filling is not preserved. There is no - * need to zero fill here, since any trailing - * uninititialized memory is demand-zeroed by the - * kernel, but junk filling must be redone. - */ - memset(ret + oldsize, 0xa5, newsize - oldsize); - } - } else -#endif - { - memcpy(ret, ptr, copysize); - iqalloct(ptr, try_tcache_dalloc); - } + memcpy(ret, ptr, copysize); + iqalloct(ptr, try_tcache_dalloc); return (ret); } @@ -217,7 +157,7 @@ huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); #endif void -huge_dalloc(void *ptr, bool unmap) +huge_dalloc(void *ptr) { extent_node_t *node, key; @@ -230,20 +170,11 @@ huge_dalloc(void *ptr, bool unmap) assert(node->addr == ptr); extent_tree_ad_remove(&huge, node); - if (config_stats) { - stats_cactive_sub(node->size); - huge_ndalloc++; - huge_allocated -= node->size; - } - malloc_mutex_unlock(&huge_mtx); - if (unmap) - huge_dalloc_junk(node->addr, node->size); - - chunk_dealloc(node->arena, node->addr, node->size, unmap); - - base_node_dealloc(node); + huge_dalloc_junk(node->addr, node->size); + arena_chunk_dalloc_huge(node->arena, node->addr, node->size); + base_node_dalloc(node); } size_t @@ -266,13 +197,6 @@ huge_salloc(const void *ptr) return (ret); } -dss_prec_t -huge_dss_prec_get(arena_t *arena) -{ - - return (arena_dss_prec_get(choose_arena(arena))); -} - prof_ctx_t * huge_prof_ctx_get(const void *ptr) { @@ -319,12 +243,6 @@ huge_boot(void) return (true); extent_tree_ad_new(&huge); - if (config_stats) { - huge_nmalloc = 0; - huge_ndalloc = 0; - huge_allocated = 0; - } - return (false); } diff --git a/src/jemalloc.c b/src/jemalloc.c index e0f9275..43a494e 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1983,7 +1983,7 @@ a0alloc(size_t size, bool zero) if (size <= arena_maxclass) return (arena_malloc(arenas[0], size, zero, false)); else - return (huge_malloc(NULL, size, zero, huge_dss_prec_get(arenas[0]))); + return (huge_malloc(NULL, size, zero)); } void * @@ -2012,7 +2012,7 @@ a0free(void *ptr) if (chunk != ptr) arena_dalloc(chunk, ptr, false); else - huge_dalloc(ptr, true); + huge_dalloc(ptr); } /******************************************************************************/ diff --git a/src/stats.c b/src/stats.c index bef2ab3..a0eb297 100644 --- a/src/stats.c +++ b/src/stats.c @@ -213,6 +213,8 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, uint64_t small_nmalloc, small_ndalloc, small_nrequests; size_t large_allocated; uint64_t large_nmalloc, large_ndalloc, large_nrequests; + size_t huge_allocated; + uint64_t huge_nmalloc, huge_ndalloc, huge_nrequests; CTL_GET("arenas.page", &page, size_t); @@ -249,12 +251,19 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, "large: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", large_allocated, large_nmalloc, large_ndalloc, large_nrequests); + CTL_I_GET("stats.arenas.0.huge.allocated", &huge_allocated, size_t); + CTL_I_GET("stats.arenas.0.huge.nmalloc", &huge_nmalloc, uint64_t); + CTL_I_GET("stats.arenas.0.huge.ndalloc", 
&huge_ndalloc, uint64_t); + CTL_I_GET("stats.arenas.0.huge.nrequests", &huge_nrequests, uint64_t); + malloc_cprintf(write_cb, cbopaque, + "huge: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", + huge_allocated, huge_nmalloc, huge_ndalloc, huge_nrequests); malloc_cprintf(write_cb, cbopaque, "total: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", - small_allocated + large_allocated, - small_nmalloc + large_nmalloc, - small_ndalloc + large_ndalloc, - small_nrequests + large_nrequests); + small_allocated + large_allocated + huge_allocated, + small_nmalloc + large_nmalloc + huge_nmalloc, + small_ndalloc + large_ndalloc + huge_ndalloc, + small_nrequests + large_nrequests + huge_nrequests); malloc_cprintf(write_cb, cbopaque, "active: %12zu\n", pactive * page); CTL_I_GET("stats.arenas.0.mapped", &mapped, size_t); malloc_cprintf(write_cb, cbopaque, "mapped: %12zu\n", mapped); @@ -458,8 +467,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, size_t allocated, active, mapped; size_t chunks_current, chunks_high; uint64_t chunks_total; - size_t huge_allocated; - uint64_t huge_nmalloc, huge_ndalloc; CTL_GET("stats.cactive", &cactive, size_t *); CTL_GET("stats.allocated", &allocated, size_t); @@ -481,16 +488,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, " %13"PRIu64" %12zu %12zu\n", chunks_total, chunks_high, chunks_current); - /* Print huge stats. */ - CTL_GET("stats.huge.nmalloc", &huge_nmalloc, uint64_t); - CTL_GET("stats.huge.ndalloc", &huge_ndalloc, uint64_t); - CTL_GET("stats.huge.allocated", &huge_allocated, size_t); - malloc_cprintf(write_cb, cbopaque, - "huge: nmalloc ndalloc allocated\n"); - malloc_cprintf(write_cb, cbopaque, - " %12"PRIu64" %12"PRIu64" %12zu\n", - huge_nmalloc, huge_ndalloc, huge_allocated); - if (merged) { unsigned narenas; diff --git a/test/integration/chunk.c b/test/integration/chunk.c index 1365989..2853709 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -1,13 +1,13 @@ #include "test/jemalloc_test.h" chunk_alloc_t *old_alloc; -chunk_dealloc_t *old_dealloc; +chunk_dalloc_t *old_dalloc; bool -chunk_dealloc(void *chunk, size_t size, unsigned arena_ind) +chunk_dalloc(void *chunk, size_t size, unsigned arena_ind) { - return (old_dealloc(chunk, size, arena_ind)); + return (old_dalloc(chunk, size, arena_ind)); } void * @@ -21,11 +21,11 @@ TEST_BEGIN(test_chunk) { void *p; chunk_alloc_t *new_alloc; - chunk_dealloc_t *new_dealloc; + chunk_dalloc_t *new_dalloc; size_t old_size, new_size; new_alloc = chunk_alloc; - new_dealloc = chunk_dealloc; + new_dalloc = chunk_dalloc; old_size = sizeof(chunk_alloc_t *); new_size = sizeof(chunk_alloc_t *); @@ -34,11 +34,9 @@ TEST_BEGIN(test_chunk) "Unexpected alloc error"); assert_ptr_ne(old_alloc, new_alloc, "Unexpected alloc error"); - assert_d_eq(mallctl("arena.0.chunk.dealloc", &old_dealloc, - &old_size, &new_dealloc, new_size), 0, - "Unexpected dealloc error"); - assert_ptr_ne(old_dealloc, new_dealloc, - "Unexpected dealloc error"); + assert_d_eq(mallctl("arena.0.chunk.dalloc", &old_dalloc, &old_size, + &new_dalloc, new_size), 0, "Unexpected dalloc error"); + assert_ptr_ne(old_dalloc, new_dalloc, "Unexpected dalloc error"); p = mallocx(42, 0); assert_ptr_ne(p, NULL, "Unexpected alloc error"); @@ -47,9 +45,8 @@ TEST_BEGIN(test_chunk) assert_d_eq(mallctl("arena.0.chunk.alloc", NULL, NULL, &old_alloc, old_size), 0, "Unexpected alloc error"); - assert_d_eq(mallctl("arena.0.chunk.dealloc", NULL, - NULL, &old_dealloc, old_size), 0, - "Unexpected dealloc error"); + 
assert_d_eq(mallctl("arena.0.chunk.dalloc", NULL, NULL, &old_dalloc, + old_size), 0, "Unexpected dalloc error"); } TEST_END diff --git a/test/integration/mremap.c b/test/integration/mremap.c deleted file mode 100644 index a7fb7ef..0000000 --- a/test/integration/mremap.c +++ /dev/null @@ -1,45 +0,0 @@ -#include "test/jemalloc_test.h" - -TEST_BEGIN(test_mremap) -{ - int err; - size_t sz, lg_chunk, chunksize, i; - char *p, *q; - - sz = sizeof(lg_chunk); - err = mallctl("opt.lg_chunk", &lg_chunk, &sz, NULL, 0); - assert_d_eq(err, 0, "Error in mallctl(): %s", strerror(err)); - chunksize = ((size_t)1U) << lg_chunk; - - p = (char *)malloc(chunksize); - assert_ptr_not_null(p, "malloc(%zu) --> %p", chunksize, p); - memset(p, 'a', chunksize); - - q = (char *)realloc(p, chunksize * 2); - assert_ptr_not_null(q, "realloc(%p, %zu) --> %p", p, chunksize * 2, - q); - for (i = 0; i < chunksize; i++) { - assert_c_eq(q[i], 'a', - "realloc() should preserve existing bytes across copies"); - } - - p = q; - - q = (char *)realloc(p, chunksize); - assert_ptr_not_null(q, "realloc(%p, %zu) --> %p", p, chunksize, q); - for (i = 0; i < chunksize; i++) { - assert_c_eq(q[i], 'a', - "realloc() should preserve existing bytes across copies"); - } - - free(q); -} -TEST_END - -int -main(void) -{ - - return (test( - test_mremap)); -} diff --git a/test/unit/junk.c b/test/unit/junk.c index 85bbf9e..301428f 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -92,12 +92,9 @@ test_junk(size_t sz_min, size_t sz_max) s = (char *)rallocx(s, sz+1, 0); assert_ptr_not_null((void *)s, "Unexpected rallocx() failure"); - if (!config_mremap || sz+1 <= arena_maxclass) { - assert_ptr_eq(most_recently_junked, junked, - "Expected region of size %zu to be " - "junk-filled", - sz); - } + assert_ptr_eq(most_recently_junked, junked, + "Expected region of size %zu to be junk-filled", + sz); } } diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 754834c..cb12049 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -129,7 +129,6 @@ TEST_BEGIN(test_mallctl_config) TEST_MALLCTL_CONFIG(debug); TEST_MALLCTL_CONFIG(fill); TEST_MALLCTL_CONFIG(lazy_lock); - TEST_MALLCTL_CONFIG(mremap); TEST_MALLCTL_CONFIG(munmap); TEST_MALLCTL_CONFIG(prof); TEST_MALLCTL_CONFIG(prof_libgcc); diff --git a/test/unit/stats.c b/test/unit/stats.c index 03a55c7..ab87b29 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -60,7 +60,7 @@ TEST_BEGIN(test_stats_huge) void *p; uint64_t epoch; size_t allocated; - uint64_t nmalloc, ndalloc; + uint64_t nmalloc, ndalloc, nrequests; size_t sz; int expected = config_stats ? 
0 : ENOENT; @@ -71,19 +71,23 @@ TEST_BEGIN(test_stats_huge) "Unexpected mallctl() failure"); sz = sizeof(size_t); - assert_d_eq(mallctl("stats.huge.allocated", &allocated, &sz, NULL, 0), - expected, "Unexpected mallctl() result"); + assert_d_eq(mallctl("stats.arenas.0.huge.allocated", &allocated, &sz, + NULL, 0), expected, "Unexpected mallctl() result"); sz = sizeof(uint64_t); - assert_d_eq(mallctl("stats.huge.nmalloc", &nmalloc, &sz, NULL, 0), - expected, "Unexpected mallctl() result"); - assert_d_eq(mallctl("stats.huge.ndalloc", &ndalloc, &sz, NULL, 0), - expected, "Unexpected mallctl() result"); + assert_d_eq(mallctl("stats.arenas.0.huge.nmalloc", &nmalloc, &sz, NULL, + 0), expected, "Unexpected mallctl() result"); + assert_d_eq(mallctl("stats.arenas.0.huge.ndalloc", &ndalloc, &sz, NULL, + 0), expected, "Unexpected mallctl() result"); + assert_d_eq(mallctl("stats.arenas.0.huge.nrequests", &nrequests, &sz, + NULL, 0), expected, "Unexpected mallctl() result"); if (config_stats) { assert_zu_gt(allocated, 0, "allocated should be greater than zero"); assert_u64_ge(nmalloc, ndalloc, "nmalloc should be at least as large as ndalloc"); + assert_u64_le(nmalloc, nrequests, + "nmalloc should no larger than nrequests"); } dallocx(p, 0); -- cgit v0.12 From b4d62cd61b46130b7947c3a427a2b007e7fa0eb8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 15 May 2014 22:46:24 -0700 Subject: Minor doc edit. --- doc/jemalloc.xml.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 46e505f..308d0c6 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -189,15 +189,15 @@ The posix_memalign function allocates size bytes of memory such that the - allocation's base address is an even multiple of + allocation's base address is a multiple of alignment, and returns the allocation in the value pointed to by ptr. The requested - alignment must be a power of 2 at least as large - as sizeof(void *). + alignment must be a power of 2 at least as large as + sizeof(void *). The aligned_alloc function allocates size bytes of memory such that the - allocation's base address is an even multiple of + allocation's base address is a multiple of alignment. The requested alignment must be a power of 2. 
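For example, standard usage of these two entry points looks as follows (a sketch only: the 64-byte alignment and 1024-byte size are arbitrary values chosen to satisfy the stated constraints, and aligned_alloc() requires compiling as C11):

#include <stdlib.h>

int
main(void)
{
	void *p;

	/* Alignment is a power of 2 and at least sizeof(void *). */
	if (posix_memalign(&p, 64, 1024) != 0)
		return (1);
	free(p);

	/*
	 * For aligned_alloc(), the size is an integral multiple of the
	 * alignment.
	 */
	p = aligned_alloc(64, 1024);
	if (p == NULL)
		return (1);
	free(p);

	return (0);
}
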
Behavior is undefined if size is not an integral multiple of -- cgit v0.12 From ed0b0ec935a6df9ef429e56a08c0c9b63c3ba358 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 16:38:24 +0900 Subject: Fix manual dependency on jemalloc_test.h --- Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.in b/Makefile.in index 90869eb..4cb1a65 100644 --- a/Makefile.in +++ b/Makefile.in @@ -228,7 +228,7 @@ HEADER_DIRS = $(srcroot)include/jemalloc/internal \ $(objroot)include/jemalloc $(objroot)include/jemalloc/internal HEADERS = $(wildcard $(foreach dir,$(HEADER_DIRS),$(dir)/*.h)) $(C_OBJS) $(C_PIC_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): $(HEADERS) -$(TESTS_OBJS): $(objroot)test/unit/jemalloc_test.h +$(TESTS_OBJS): $(objroot)test/include/test/jemalloc_test.h endif $(C_OBJS) $(C_PIC_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O): -- cgit v0.12 From 47d58a01ff9d894f854412f3f6d3ba97a7aa2929 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 16:59:50 +0900 Subject: Define _CRT_SPINCOUNT in test/src/mtx.c like in src/mutex.c --- test/src/mtx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/src/mtx.c b/test/src/mtx.c index 41b95d5..73bd02f 100644 --- a/test/src/mtx.c +++ b/test/src/mtx.c @@ -1,5 +1,9 @@ #include "test/jemalloc_test.h" +#ifndef _CRT_SPINCOUNT +#define _CRT_SPINCOUNT 4000 +#endif + bool mtx_init(mtx_t *mtx) { -- cgit v0.12 From d6fd11413e1fe33a9bc947d794e880d7d10f7786 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 17:04:24 +0900 Subject: Define DLLEXPORT when building .jet objects --- Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.in b/Makefile.in index 4cb1a65..65d73db 100644 --- a/Makefile.in +++ b/Makefile.in @@ -219,7 +219,7 @@ $(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST $(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c $(TESTS_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include ifneq ($(IMPORTLIB),$(SO)) -$(C_OBJS): CPPFLAGS += -DDLLEXPORT +$(C_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT endif ifndef CC_MM -- cgit v0.12 From f41f14366877538b03109ecf346dbff2e21bbb16 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 17:24:08 +0900 Subject: Replace variable arrays in tests with VARIABLE_ARRAY --- test/unit/hash.c | 4 ++-- test/unit/mallctl.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/hash.c b/test/unit/hash.c index abb394a..77a8ced 100644 --- a/test/unit/hash.c +++ b/test/unit/hash.c @@ -64,8 +64,8 @@ hash_variant_verify(hash_variant_t variant) { const size_t hashbytes = hash_variant_bits(variant) / 8; uint8_t key[256]; - uint8_t hashes[hashbytes * 256]; - uint8_t final[hashbytes]; + VARIABLE_ARRAY(uint8_t, hashes, hashbytes * 256); + VARIABLE_ARRAY(uint8_t, final, hashbytes); unsigned i; uint32_t computed, expected; diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index cb12049..7a8b55f 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -285,7 +285,7 @@ TEST_BEGIN(test_arenas_initialized) assert_d_eq(mallctl("arenas.narenas", &narenas, &sz, NULL, 0), 0, "Unexpected mallctl() failure"); { - bool initialized[narenas]; + VARIABLE_ARRAY(bool, initialized, narenas); sz = narenas * sizeof(bool); assert_d_eq(mallctl("arenas.initialized", initialized, &sz, -- cgit v0.12 From 1ad4a6e9f9ba55c874d0ad63041e09b96b459b1f Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 17:44:42 +0900 Subject: Add missing $(EXE) to filter 
TESTS_UNIT_AUX_OBJS --- Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.in b/Makefile.in index 65d73db..839bb08 100644 --- a/Makefile.in +++ b/Makefile.in @@ -213,7 +213,7 @@ define make-unit-link-dep $(1): TESTS_UNIT_LINK_OBJS += $(2) $(1): $(2) endef -$(foreach test, $(TESTS_UNIT:$(srcroot)test/unit/%.c=$(objroot)test/unit/%$(EXE)), $(eval $(call make-unit-link-dep,$(test),$(filter $(test:%=%_a.$(O)) $(test:%=%_b.$(O)),$(TESTS_UNIT_AUX_OBJS))))) +$(foreach test, $(TESTS_UNIT:$(srcroot)test/unit/%.c=$(objroot)test/unit/%$(EXE)), $(eval $(call make-unit-link-dep,$(test),$(filter $(test:%$(EXE)=%_a.$(O)) $(test:%$(EXE)=%_b.$(O)),$(TESTS_UNIT_AUX_OBJS))))) $(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST $(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST $(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c -- cgit v0.12 From 7330c3770af0e5328d749635217387efbbe0ae3c Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 18:00:15 +0900 Subject: Use C99 varadic macros instead of GCC ones --- test/include/test/test.h | 384 +++++++++++++++++++++++------------------------ test/unit/util.c | 8 +- 2 files changed, 196 insertions(+), 196 deletions(-) diff --git a/test/include/test/test.h b/test/include/test/test.h index 161fafd..f55bafc 100644 --- a/test/include/test/test.h +++ b/test/include/test/test.h @@ -1,6 +1,6 @@ #define ASSERT_BUFSIZE 256 -#define assert_cmp(t, a, b, cmp, neg_cmp, pri, fmt...) do { \ +#define assert_cmp(t, a, b, cmp, neg_cmp, pri, ...) do { \ t a_ = (a); \ t b_ = (b); \ if (!(a_ cmp b_)) { \ @@ -12,205 +12,205 @@ "%"pri" "#neg_cmp" %"pri": ", \ __func__, __FILE__, __LINE__, \ #a, #b, a_, b_); \ - malloc_snprintf(message, sizeof(message), fmt); \ + malloc_snprintf(message, sizeof(message), __VA_ARGS__); \ p_test_fail(prefix, message); \ } \ } while (0) -#define assert_ptr_eq(a, b, fmt...) assert_cmp(void *, a, b, ==, \ - !=, "p", fmt) -#define assert_ptr_ne(a, b, fmt...) assert_cmp(void *, a, b, !=, \ - ==, "p", fmt) -#define assert_ptr_null(a, fmt...) assert_cmp(void *, a, NULL, ==, \ - !=, "p", fmt) -#define assert_ptr_not_null(a, fmt...) assert_cmp(void *, a, NULL, !=, \ - ==, "p", fmt) +#define assert_ptr_eq(a, b, ...) assert_cmp(void *, a, b, ==, \ + !=, "p", __VA_ARGS__) +#define assert_ptr_ne(a, b, ...) assert_cmp(void *, a, b, !=, \ + ==, "p", __VA_ARGS__) +#define assert_ptr_null(a, ...) assert_cmp(void *, a, NULL, ==, \ + !=, "p", __VA_ARGS__) +#define assert_ptr_not_null(a, ...) assert_cmp(void *, a, NULL, !=, \ + ==, "p", __VA_ARGS__) -#define assert_c_eq(a, b, fmt...) assert_cmp(char, a, b, ==, !=, "c", fmt) -#define assert_c_ne(a, b, fmt...) assert_cmp(char, a, b, !=, ==, "c", fmt) -#define assert_c_lt(a, b, fmt...) assert_cmp(char, a, b, <, >=, "c", fmt) -#define assert_c_le(a, b, fmt...) assert_cmp(char, a, b, <=, >, "c", fmt) -#define assert_c_ge(a, b, fmt...) assert_cmp(char, a, b, >=, <, "c", fmt) -#define assert_c_gt(a, b, fmt...) assert_cmp(char, a, b, >, <=, "c", fmt) +#define assert_c_eq(a, b, ...) assert_cmp(char, a, b, ==, !=, "c", __VA_ARGS__) +#define assert_c_ne(a, b, ...) assert_cmp(char, a, b, !=, ==, "c", __VA_ARGS__) +#define assert_c_lt(a, b, ...) assert_cmp(char, a, b, <, >=, "c", __VA_ARGS__) +#define assert_c_le(a, b, ...) assert_cmp(char, a, b, <=, >, "c", __VA_ARGS__) +#define assert_c_ge(a, b, ...) assert_cmp(char, a, b, >=, <, "c", __VA_ARGS__) +#define assert_c_gt(a, b, ...) assert_cmp(char, a, b, >, <=, "c", __VA_ARGS__) -#define assert_x_eq(a, b, fmt...) 
assert_cmp(int, a, b, ==, !=, "#x", fmt) -#define assert_x_ne(a, b, fmt...) assert_cmp(int, a, b, !=, ==, "#x", fmt) -#define assert_x_lt(a, b, fmt...) assert_cmp(int, a, b, <, >=, "#x", fmt) -#define assert_x_le(a, b, fmt...) assert_cmp(int, a, b, <=, >, "#x", fmt) -#define assert_x_ge(a, b, fmt...) assert_cmp(int, a, b, >=, <, "#x", fmt) -#define assert_x_gt(a, b, fmt...) assert_cmp(int, a, b, >, <=, "#x", fmt) +#define assert_x_eq(a, b, ...) assert_cmp(int, a, b, ==, !=, "#x", __VA_ARGS__) +#define assert_x_ne(a, b, ...) assert_cmp(int, a, b, !=, ==, "#x", __VA_ARGS__) +#define assert_x_lt(a, b, ...) assert_cmp(int, a, b, <, >=, "#x", __VA_ARGS__) +#define assert_x_le(a, b, ...) assert_cmp(int, a, b, <=, >, "#x", __VA_ARGS__) +#define assert_x_ge(a, b, ...) assert_cmp(int, a, b, >=, <, "#x", __VA_ARGS__) +#define assert_x_gt(a, b, ...) assert_cmp(int, a, b, >, <=, "#x", __VA_ARGS__) -#define assert_d_eq(a, b, fmt...) assert_cmp(int, a, b, ==, !=, "d", fmt) -#define assert_d_ne(a, b, fmt...) assert_cmp(int, a, b, !=, ==, "d", fmt) -#define assert_d_lt(a, b, fmt...) assert_cmp(int, a, b, <, >=, "d", fmt) -#define assert_d_le(a, b, fmt...) assert_cmp(int, a, b, <=, >, "d", fmt) -#define assert_d_ge(a, b, fmt...) assert_cmp(int, a, b, >=, <, "d", fmt) -#define assert_d_gt(a, b, fmt...) assert_cmp(int, a, b, >, <=, "d", fmt) +#define assert_d_eq(a, b, ...) assert_cmp(int, a, b, ==, !=, "d", __VA_ARGS__) +#define assert_d_ne(a, b, ...) assert_cmp(int, a, b, !=, ==, "d", __VA_ARGS__) +#define assert_d_lt(a, b, ...) assert_cmp(int, a, b, <, >=, "d", __VA_ARGS__) +#define assert_d_le(a, b, ...) assert_cmp(int, a, b, <=, >, "d", __VA_ARGS__) +#define assert_d_ge(a, b, ...) assert_cmp(int, a, b, >=, <, "d", __VA_ARGS__) +#define assert_d_gt(a, b, ...) assert_cmp(int, a, b, >, <=, "d", __VA_ARGS__) -#define assert_u_eq(a, b, fmt...) assert_cmp(int, a, b, ==, !=, "u", fmt) -#define assert_u_ne(a, b, fmt...) assert_cmp(int, a, b, !=, ==, "u", fmt) -#define assert_u_lt(a, b, fmt...) assert_cmp(int, a, b, <, >=, "u", fmt) -#define assert_u_le(a, b, fmt...) assert_cmp(int, a, b, <=, >, "u", fmt) -#define assert_u_ge(a, b, fmt...) assert_cmp(int, a, b, >=, <, "u", fmt) -#define assert_u_gt(a, b, fmt...) assert_cmp(int, a, b, >, <=, "u", fmt) +#define assert_u_eq(a, b, ...) assert_cmp(int, a, b, ==, !=, "u", __VA_ARGS__) +#define assert_u_ne(a, b, ...) assert_cmp(int, a, b, !=, ==, "u", __VA_ARGS__) +#define assert_u_lt(a, b, ...) assert_cmp(int, a, b, <, >=, "u", __VA_ARGS__) +#define assert_u_le(a, b, ...) assert_cmp(int, a, b, <=, >, "u", __VA_ARGS__) +#define assert_u_ge(a, b, ...) assert_cmp(int, a, b, >=, <, "u", __VA_ARGS__) +#define assert_u_gt(a, b, ...) assert_cmp(int, a, b, >, <=, "u", __VA_ARGS__) -#define assert_ld_eq(a, b, fmt...) assert_cmp(long, a, b, ==, \ - !=, "ld", fmt) -#define assert_ld_ne(a, b, fmt...) assert_cmp(long, a, b, !=, \ - ==, "ld", fmt) -#define assert_ld_lt(a, b, fmt...) assert_cmp(long, a, b, <, \ - >=, "ld", fmt) -#define assert_ld_le(a, b, fmt...) assert_cmp(long, a, b, <=, \ - >, "ld", fmt) -#define assert_ld_ge(a, b, fmt...) assert_cmp(long, a, b, >=, \ - <, "ld", fmt) -#define assert_ld_gt(a, b, fmt...) assert_cmp(long, a, b, >, \ - <=, "ld", fmt) +#define assert_ld_eq(a, b, ...) assert_cmp(long, a, b, ==, \ + !=, "ld", __VA_ARGS__) +#define assert_ld_ne(a, b, ...) assert_cmp(long, a, b, !=, \ + ==, "ld", __VA_ARGS__) +#define assert_ld_lt(a, b, ...) assert_cmp(long, a, b, <, \ + >=, "ld", __VA_ARGS__) +#define assert_ld_le(a, b, ...) 
assert_cmp(long, a, b, <=, \ + >, "ld", __VA_ARGS__) +#define assert_ld_ge(a, b, ...) assert_cmp(long, a, b, >=, \ + <, "ld", __VA_ARGS__) +#define assert_ld_gt(a, b, ...) assert_cmp(long, a, b, >, \ + <=, "ld", __VA_ARGS__) -#define assert_lu_eq(a, b, fmt...) assert_cmp(unsigned long, \ - a, b, ==, !=, "lu", fmt) -#define assert_lu_ne(a, b, fmt...) assert_cmp(unsigned long, \ - a, b, !=, ==, "lu", fmt) -#define assert_lu_lt(a, b, fmt...) assert_cmp(unsigned long, \ - a, b, <, >=, "lu", fmt) -#define assert_lu_le(a, b, fmt...) assert_cmp(unsigned long, \ - a, b, <=, >, "lu", fmt) -#define assert_lu_ge(a, b, fmt...) assert_cmp(unsigned long, \ - a, b, >=, <, "lu", fmt) -#define assert_lu_gt(a, b, fmt...) assert_cmp(unsigned long, \ - a, b, >, <=, "lu", fmt) +#define assert_lu_eq(a, b, ...) assert_cmp(unsigned long, \ + a, b, ==, !=, "lu", __VA_ARGS__) +#define assert_lu_ne(a, b, ...) assert_cmp(unsigned long, \ + a, b, !=, ==, "lu", __VA_ARGS__) +#define assert_lu_lt(a, b, ...) assert_cmp(unsigned long, \ + a, b, <, >=, "lu", __VA_ARGS__) +#define assert_lu_le(a, b, ...) assert_cmp(unsigned long, \ + a, b, <=, >, "lu", __VA_ARGS__) +#define assert_lu_ge(a, b, ...) assert_cmp(unsigned long, \ + a, b, >=, <, "lu", __VA_ARGS__) +#define assert_lu_gt(a, b, ...) assert_cmp(unsigned long, \ + a, b, >, <=, "lu", __VA_ARGS__) -#define assert_qd_eq(a, b, fmt...) assert_cmp(long long, a, b, ==, \ - !=, "qd", fmt) -#define assert_qd_ne(a, b, fmt...) assert_cmp(long long, a, b, !=, \ - ==, "qd", fmt) -#define assert_qd_lt(a, b, fmt...) assert_cmp(long long, a, b, <, \ - >=, "qd", fmt) -#define assert_qd_le(a, b, fmt...) assert_cmp(long long, a, b, <=, \ - >, "qd", fmt) -#define assert_qd_ge(a, b, fmt...) assert_cmp(long long, a, b, >=, \ - <, "qd", fmt) -#define assert_qd_gt(a, b, fmt...) assert_cmp(long long, a, b, >, \ - <=, "qd", fmt) +#define assert_qd_eq(a, b, ...) assert_cmp(long long, a, b, ==, \ + !=, "qd", __VA_ARGS__) +#define assert_qd_ne(a, b, ...) assert_cmp(long long, a, b, !=, \ + ==, "qd", __VA_ARGS__) +#define assert_qd_lt(a, b, ...) assert_cmp(long long, a, b, <, \ + >=, "qd", __VA_ARGS__) +#define assert_qd_le(a, b, ...) assert_cmp(long long, a, b, <=, \ + >, "qd", __VA_ARGS__) +#define assert_qd_ge(a, b, ...) assert_cmp(long long, a, b, >=, \ + <, "qd", __VA_ARGS__) +#define assert_qd_gt(a, b, ...) assert_cmp(long long, a, b, >, \ + <=, "qd", __VA_ARGS__) -#define assert_qu_eq(a, b, fmt...) assert_cmp(unsigned long long, \ - a, b, ==, !=, "qu", fmt) -#define assert_qu_ne(a, b, fmt...) assert_cmp(unsigned long long, \ - a, b, !=, ==, "qu", fmt) -#define assert_qu_lt(a, b, fmt...) assert_cmp(unsigned long long, \ - a, b, <, >=, "qu", fmt) -#define assert_qu_le(a, b, fmt...) assert_cmp(unsigned long long, \ - a, b, <=, >, "qu", fmt) -#define assert_qu_ge(a, b, fmt...) assert_cmp(unsigned long long, \ - a, b, >=, <, "qu", fmt) -#define assert_qu_gt(a, b, fmt...) assert_cmp(unsigned long long, \ - a, b, >, <=, "qu", fmt) +#define assert_qu_eq(a, b, ...) assert_cmp(unsigned long long, \ + a, b, ==, !=, "qu", __VA_ARGS__) +#define assert_qu_ne(a, b, ...) assert_cmp(unsigned long long, \ + a, b, !=, ==, "qu", __VA_ARGS__) +#define assert_qu_lt(a, b, ...) assert_cmp(unsigned long long, \ + a, b, <, >=, "qu", __VA_ARGS__) +#define assert_qu_le(a, b, ...) assert_cmp(unsigned long long, \ + a, b, <=, >, "qu", __VA_ARGS__) +#define assert_qu_ge(a, b, ...) assert_cmp(unsigned long long, \ + a, b, >=, <, "qu", __VA_ARGS__) +#define assert_qu_gt(a, b, ...) 
assert_cmp(unsigned long long, \ + a, b, >, <=, "qu", __VA_ARGS__) -#define assert_jd_eq(a, b, fmt...) assert_cmp(intmax_t, a, b, ==, \ - !=, "jd", fmt) -#define assert_jd_ne(a, b, fmt...) assert_cmp(intmax_t, a, b, !=, \ - ==, "jd", fmt) -#define assert_jd_lt(a, b, fmt...) assert_cmp(intmax_t, a, b, <, \ - >=, "jd", fmt) -#define assert_jd_le(a, b, fmt...) assert_cmp(intmax_t, a, b, <=, \ - >, "jd", fmt) -#define assert_jd_ge(a, b, fmt...) assert_cmp(intmax_t, a, b, >=, \ - <, "jd", fmt) -#define assert_jd_gt(a, b, fmt...) assert_cmp(intmax_t, a, b, >, \ - <=, "jd", fmt) +#define assert_jd_eq(a, b, ...) assert_cmp(intmax_t, a, b, ==, \ + !=, "jd", __VA_ARGS__) +#define assert_jd_ne(a, b, ...) assert_cmp(intmax_t, a, b, !=, \ + ==, "jd", __VA_ARGS__) +#define assert_jd_lt(a, b, ...) assert_cmp(intmax_t, a, b, <, \ + >=, "jd", __VA_ARGS__) +#define assert_jd_le(a, b, ...) assert_cmp(intmax_t, a, b, <=, \ + >, "jd", __VA_ARGS__) +#define assert_jd_ge(a, b, ...) assert_cmp(intmax_t, a, b, >=, \ + <, "jd", __VA_ARGS__) +#define assert_jd_gt(a, b, ...) assert_cmp(intmax_t, a, b, >, \ + <=, "jd", __VA_ARGS__) -#define assert_ju_eq(a, b, fmt...) assert_cmp(uintmax_t, a, b, ==, \ - !=, "ju", fmt) -#define assert_ju_ne(a, b, fmt...) assert_cmp(uintmax_t, a, b, !=, \ - ==, "ju", fmt) -#define assert_ju_lt(a, b, fmt...) assert_cmp(uintmax_t, a, b, <, \ - >=, "ju", fmt) -#define assert_ju_le(a, b, fmt...) assert_cmp(uintmax_t, a, b, <=, \ - >, "ju", fmt) -#define assert_ju_ge(a, b, fmt...) assert_cmp(uintmax_t, a, b, >=, \ - <, "ju", fmt) -#define assert_ju_gt(a, b, fmt...) assert_cmp(uintmax_t, a, b, >, \ - <=, "ju", fmt) +#define assert_ju_eq(a, b, ...) assert_cmp(uintmax_t, a, b, ==, \ + !=, "ju", __VA_ARGS__) +#define assert_ju_ne(a, b, ...) assert_cmp(uintmax_t, a, b, !=, \ + ==, "ju", __VA_ARGS__) +#define assert_ju_lt(a, b, ...) assert_cmp(uintmax_t, a, b, <, \ + >=, "ju", __VA_ARGS__) +#define assert_ju_le(a, b, ...) assert_cmp(uintmax_t, a, b, <=, \ + >, "ju", __VA_ARGS__) +#define assert_ju_ge(a, b, ...) assert_cmp(uintmax_t, a, b, >=, \ + <, "ju", __VA_ARGS__) +#define assert_ju_gt(a, b, ...) assert_cmp(uintmax_t, a, b, >, \ + <=, "ju", __VA_ARGS__) -#define assert_zd_eq(a, b, fmt...) assert_cmp(ssize_t, a, b, ==, \ - !=, "zd", fmt) -#define assert_zd_ne(a, b, fmt...) assert_cmp(ssize_t, a, b, !=, \ - ==, "zd", fmt) -#define assert_zd_lt(a, b, fmt...) assert_cmp(ssize_t, a, b, <, \ - >=, "zd", fmt) -#define assert_zd_le(a, b, fmt...) assert_cmp(ssize_t, a, b, <=, \ - >, "zd", fmt) -#define assert_zd_ge(a, b, fmt...) assert_cmp(ssize_t, a, b, >=, \ - <, "zd", fmt) -#define assert_zd_gt(a, b, fmt...) assert_cmp(ssize_t, a, b, >, \ - <=, "zd", fmt) +#define assert_zd_eq(a, b, ...) assert_cmp(ssize_t, a, b, ==, \ + !=, "zd", __VA_ARGS__) +#define assert_zd_ne(a, b, ...) assert_cmp(ssize_t, a, b, !=, \ + ==, "zd", __VA_ARGS__) +#define assert_zd_lt(a, b, ...) assert_cmp(ssize_t, a, b, <, \ + >=, "zd", __VA_ARGS__) +#define assert_zd_le(a, b, ...) assert_cmp(ssize_t, a, b, <=, \ + >, "zd", __VA_ARGS__) +#define assert_zd_ge(a, b, ...) assert_cmp(ssize_t, a, b, >=, \ + <, "zd", __VA_ARGS__) +#define assert_zd_gt(a, b, ...) assert_cmp(ssize_t, a, b, >, \ + <=, "zd", __VA_ARGS__) -#define assert_zu_eq(a, b, fmt...) assert_cmp(size_t, a, b, ==, \ - !=, "zu", fmt) -#define assert_zu_ne(a, b, fmt...) assert_cmp(size_t, a, b, !=, \ - ==, "zu", fmt) -#define assert_zu_lt(a, b, fmt...) assert_cmp(size_t, a, b, <, \ - >=, "zu", fmt) -#define assert_zu_le(a, b, fmt...) 
assert_cmp(size_t, a, b, <=, \ - >, "zu", fmt) -#define assert_zu_ge(a, b, fmt...) assert_cmp(size_t, a, b, >=, \ - <, "zu", fmt) -#define assert_zu_gt(a, b, fmt...) assert_cmp(size_t, a, b, >, \ - <=, "zu", fmt) +#define assert_zu_eq(a, b, ...) assert_cmp(size_t, a, b, ==, \ + !=, "zu", __VA_ARGS__) +#define assert_zu_ne(a, b, ...) assert_cmp(size_t, a, b, !=, \ + ==, "zu", __VA_ARGS__) +#define assert_zu_lt(a, b, ...) assert_cmp(size_t, a, b, <, \ + >=, "zu", __VA_ARGS__) +#define assert_zu_le(a, b, ...) assert_cmp(size_t, a, b, <=, \ + >, "zu", __VA_ARGS__) +#define assert_zu_ge(a, b, ...) assert_cmp(size_t, a, b, >=, \ + <, "zu", __VA_ARGS__) +#define assert_zu_gt(a, b, ...) assert_cmp(size_t, a, b, >, \ + <=, "zu", __VA_ARGS__) -#define assert_d32_eq(a, b, fmt...) assert_cmp(int32_t, a, b, ==, \ - !=, PRId32, fmt) -#define assert_d32_ne(a, b, fmt...) assert_cmp(int32_t, a, b, !=, \ - ==, PRId32, fmt) -#define assert_d32_lt(a, b, fmt...) assert_cmp(int32_t, a, b, <, \ - >=, PRId32, fmt) -#define assert_d32_le(a, b, fmt...) assert_cmp(int32_t, a, b, <=, \ - >, PRId32, fmt) -#define assert_d32_ge(a, b, fmt...) assert_cmp(int32_t, a, b, >=, \ - <, PRId32, fmt) -#define assert_d32_gt(a, b, fmt...) assert_cmp(int32_t, a, b, >, \ - <=, PRId32, fmt) +#define assert_d32_eq(a, b, ...) assert_cmp(int32_t, a, b, ==, \ + !=, PRId32, __VA_ARGS__) +#define assert_d32_ne(a, b, ...) assert_cmp(int32_t, a, b, !=, \ + ==, PRId32, __VA_ARGS__) +#define assert_d32_lt(a, b, ...) assert_cmp(int32_t, a, b, <, \ + >=, PRId32, __VA_ARGS__) +#define assert_d32_le(a, b, ...) assert_cmp(int32_t, a, b, <=, \ + >, PRId32, __VA_ARGS__) +#define assert_d32_ge(a, b, ...) assert_cmp(int32_t, a, b, >=, \ + <, PRId32, __VA_ARGS__) +#define assert_d32_gt(a, b, ...) assert_cmp(int32_t, a, b, >, \ + <=, PRId32, __VA_ARGS__) -#define assert_u32_eq(a, b, fmt...) assert_cmp(uint32_t, a, b, ==, \ - !=, PRIu32, fmt) -#define assert_u32_ne(a, b, fmt...) assert_cmp(uint32_t, a, b, !=, \ - ==, PRIu32, fmt) -#define assert_u32_lt(a, b, fmt...) assert_cmp(uint32_t, a, b, <, \ - >=, PRIu32, fmt) -#define assert_u32_le(a, b, fmt...) assert_cmp(uint32_t, a, b, <=, \ - >, PRIu32, fmt) -#define assert_u32_ge(a, b, fmt...) assert_cmp(uint32_t, a, b, >=, \ - <, PRIu32, fmt) -#define assert_u32_gt(a, b, fmt...) assert_cmp(uint32_t, a, b, >, \ - <=, PRIu32, fmt) +#define assert_u32_eq(a, b, ...) assert_cmp(uint32_t, a, b, ==, \ + !=, PRIu32, __VA_ARGS__) +#define assert_u32_ne(a, b, ...) assert_cmp(uint32_t, a, b, !=, \ + ==, PRIu32, __VA_ARGS__) +#define assert_u32_lt(a, b, ...) assert_cmp(uint32_t, a, b, <, \ + >=, PRIu32, __VA_ARGS__) +#define assert_u32_le(a, b, ...) assert_cmp(uint32_t, a, b, <=, \ + >, PRIu32, __VA_ARGS__) +#define assert_u32_ge(a, b, ...) assert_cmp(uint32_t, a, b, >=, \ + <, PRIu32, __VA_ARGS__) +#define assert_u32_gt(a, b, ...) assert_cmp(uint32_t, a, b, >, \ + <=, PRIu32, __VA_ARGS__) -#define assert_d64_eq(a, b, fmt...) assert_cmp(int64_t, a, b, ==, \ - !=, PRId64, fmt) -#define assert_d64_ne(a, b, fmt...) assert_cmp(int64_t, a, b, !=, \ - ==, PRId64, fmt) -#define assert_d64_lt(a, b, fmt...) assert_cmp(int64_t, a, b, <, \ - >=, PRId64, fmt) -#define assert_d64_le(a, b, fmt...) assert_cmp(int64_t, a, b, <=, \ - >, PRId64, fmt) -#define assert_d64_ge(a, b, fmt...) assert_cmp(int64_t, a, b, >=, \ - <, PRId64, fmt) -#define assert_d64_gt(a, b, fmt...) assert_cmp(int64_t, a, b, >, \ - <=, PRId64, fmt) +#define assert_d64_eq(a, b, ...) 
assert_cmp(int64_t, a, b, ==, \ + !=, PRId64, __VA_ARGS__) +#define assert_d64_ne(a, b, ...) assert_cmp(int64_t, a, b, !=, \ + ==, PRId64, __VA_ARGS__) +#define assert_d64_lt(a, b, ...) assert_cmp(int64_t, a, b, <, \ + >=, PRId64, __VA_ARGS__) +#define assert_d64_le(a, b, ...) assert_cmp(int64_t, a, b, <=, \ + >, PRId64, __VA_ARGS__) +#define assert_d64_ge(a, b, ...) assert_cmp(int64_t, a, b, >=, \ + <, PRId64, __VA_ARGS__) +#define assert_d64_gt(a, b, ...) assert_cmp(int64_t, a, b, >, \ + <=, PRId64, __VA_ARGS__) -#define assert_u64_eq(a, b, fmt...) assert_cmp(uint64_t, a, b, ==, \ - !=, PRIu64, fmt) -#define assert_u64_ne(a, b, fmt...) assert_cmp(uint64_t, a, b, !=, \ - ==, PRIu64, fmt) -#define assert_u64_lt(a, b, fmt...) assert_cmp(uint64_t, a, b, <, \ - >=, PRIu64, fmt) -#define assert_u64_le(a, b, fmt...) assert_cmp(uint64_t, a, b, <=, \ - >, PRIu64, fmt) -#define assert_u64_ge(a, b, fmt...) assert_cmp(uint64_t, a, b, >=, \ - <, PRIu64, fmt) -#define assert_u64_gt(a, b, fmt...) assert_cmp(uint64_t, a, b, >, \ - <=, PRIu64, fmt) +#define assert_u64_eq(a, b, ...) assert_cmp(uint64_t, a, b, ==, \ + !=, PRIu64, __VA_ARGS__) +#define assert_u64_ne(a, b, ...) assert_cmp(uint64_t, a, b, !=, \ + ==, PRIu64, __VA_ARGS__) +#define assert_u64_lt(a, b, ...) assert_cmp(uint64_t, a, b, <, \ + >=, PRIu64, __VA_ARGS__) +#define assert_u64_le(a, b, ...) assert_cmp(uint64_t, a, b, <=, \ + >, PRIu64, __VA_ARGS__) +#define assert_u64_ge(a, b, ...) assert_cmp(uint64_t, a, b, >=, \ + <, PRIu64, __VA_ARGS__) +#define assert_u64_gt(a, b, ...) assert_cmp(uint64_t, a, b, >, \ + <=, PRIu64, __VA_ARGS__) -#define assert_b_eq(a, b, fmt...) do { \ +#define assert_b_eq(a, b, ...) do { \ bool a_ = (a); \ bool b_ = (b); \ if (!(a_ == b_)) { \ @@ -222,11 +222,11 @@ __func__, __FILE__, __LINE__, \ #a, #b, a_ ? "true" : "false", \ b_ ? "true" : "false"); \ - malloc_snprintf(message, sizeof(message), fmt); \ + malloc_snprintf(message, sizeof(message), __VA_ARGS__); \ p_test_fail(prefix, message); \ } \ } while (0) -#define assert_b_ne(a, b, fmt...) do { \ +#define assert_b_ne(a, b, ...) do { \ bool a_ = (a); \ bool b_ = (b); \ if (!(a_ != b_)) { \ @@ -238,14 +238,14 @@ __func__, __FILE__, __LINE__, \ #a, #b, a_ ? "true" : "false", \ b_ ? "true" : "false"); \ - malloc_snprintf(message, sizeof(message), fmt); \ + malloc_snprintf(message, sizeof(message), __VA_ARGS__); \ p_test_fail(prefix, message); \ } \ } while (0) -#define assert_true(a, fmt...) assert_b_eq(a, true, fmt) -#define assert_false(a, fmt...) assert_b_eq(a, false, fmt) +#define assert_true(a, ...) assert_b_eq(a, true, __VA_ARGS__) +#define assert_false(a, ...) assert_b_eq(a, false, __VA_ARGS__) -#define assert_str_eq(a, b, fmt...) do { \ +#define assert_str_eq(a, b, ...) do { \ if (strcmp((a), (b))) { \ char prefix[ASSERT_BUFSIZE]; \ char message[ASSERT_BUFSIZE]; \ @@ -254,11 +254,11 @@ "(%s) same as (%s) --> " \ "\"%s\" differs from \"%s\": ", \ __func__, __FILE__, __LINE__, #a, #b, a, b); \ - malloc_snprintf(message, sizeof(message), fmt); \ + malloc_snprintf(message, sizeof(message), __VA_ARGS__); \ p_test_fail(prefix, message); \ } \ } while (0) -#define assert_str_ne(a, b, fmt...) do { \ +#define assert_str_ne(a, b, ...) 
do { \ if (!strcmp((a), (b))) { \ char prefix[ASSERT_BUFSIZE]; \ char message[ASSERT_BUFSIZE]; \ @@ -267,18 +267,18 @@ "(%s) differs from (%s) --> " \ "\"%s\" same as \"%s\": ", \ __func__, __FILE__, __LINE__, #a, #b, a, b); \ - malloc_snprintf(message, sizeof(message), fmt); \ + malloc_snprintf(message, sizeof(message), __VA_ARGS__); \ p_test_fail(prefix, message); \ } \ } while (0) -#define assert_not_reached(fmt...) do { \ +#define assert_not_reached(...) do { \ char prefix[ASSERT_BUFSIZE]; \ char message[ASSERT_BUFSIZE]; \ malloc_snprintf(prefix, sizeof(prefix), \ "%s:%s:%d: Unreachable code reached: ", \ __func__, __FILE__, __LINE__); \ - malloc_snprintf(message, sizeof(message), fmt); \ + malloc_snprintf(message, sizeof(message), __VA_ARGS__); \ p_test_fail(prefix, message); \ } while (0) @@ -308,8 +308,8 @@ label_test_end: \ p_test_fini(); \ } -#define test(tests...) \ - p_test(tests, NULL) +#define test(...) \ + p_test(__VA_ARGS__, NULL) #define test_skip_if(e) do { \ if (e) { \ diff --git a/test/unit/util.c b/test/unit/util.c index dc3cfe8..c11d598 100644 --- a/test/unit/util.c +++ b/test/unit/util.c @@ -141,8 +141,8 @@ TEST_BEGIN(test_malloc_snprintf_truncated) char buf[BUFLEN]; int result; size_t len; -#define TEST(expected_str_untruncated, fmt...) do { \ - result = malloc_snprintf(buf, len, fmt); \ +#define TEST(expected_str_untruncated, ...) do { \ + result = malloc_snprintf(buf, len, __VA_ARGS__); \ assert_d_eq(strncmp(buf, expected_str_untruncated, len-1), 0, \ "Unexpected string inequality (\"%s\" vs \"%s\")", \ buf, expected_str_untruncated); \ @@ -173,8 +173,8 @@ TEST_BEGIN(test_malloc_snprintf) #define BUFLEN 128 char buf[BUFLEN]; int result; -#define TEST(expected_str, fmt...) do { \ - result = malloc_snprintf(buf, sizeof(buf), fmt); \ +#define TEST(expected_str, ...) do { \ + result = malloc_snprintf(buf, sizeof(buf), __VA_ARGS__); \ assert_str_eq(buf, expected_str, "Unexpected output"); \ assert_d_eq(result, strlen(expected_str), "Unexpected result"); \ } while (0) -- cgit v0.12 From 86e2e703ffb3cc17e05af816df8895db62a9272e Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 18:01:21 +0900 Subject: Rename "small" local variable, because windows headers #define it --- test/unit/stats.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/unit/stats.c b/test/unit/stats.c index ab87b29..78c78cd 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -97,7 +97,7 @@ TEST_END TEST_BEGIN(test_stats_arenas_summary) { unsigned arena; - void *small, *large; + void *little, *large; uint64_t epoch; size_t sz; int expected = config_stats ? 
0 : ENOENT; @@ -108,8 +108,8 @@ TEST_BEGIN(test_stats_arenas_summary) assert_d_eq(mallctl("thread.arena", NULL, NULL, &arena, sizeof(arena)), 0, "Unexpected mallctl() failure"); - small = mallocx(SMALL_MAXCLASS, 0); - assert_ptr_not_null(small, "Unexpected mallocx() failure"); + little = mallocx(SMALL_MAXCLASS, 0); + assert_ptr_not_null(little, "Unexpected mallocx() failure"); large = mallocx(arena_maxclass, 0); assert_ptr_not_null(large, "Unexpected mallocx() failure"); @@ -137,7 +137,7 @@ TEST_BEGIN(test_stats_arenas_summary) "nmadvise should be no greater than purged"); } - dallocx(small, 0); + dallocx(little, 0); dallocx(large, 0); } TEST_END -- cgit v0.12 From 3a730dfd5062ecd6fc46b68f28342e14b461f560 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 18:13:21 +0900 Subject: Avoid pointer arithmetic on void* in test/integration/rallocx.c --- test/integration/rallocx.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/integration/rallocx.c b/test/integration/rallocx.c index ee21aed..e78e02f 100644 --- a/test/integration/rallocx.c +++ b/test/integration/rallocx.c @@ -95,7 +95,8 @@ TEST_BEGIN(test_zero) "Expected zeroed memory"); } if (psz != qsz) { - memset(q+psz, FILL_BYTE, qsz-psz); + memset((void *)(uintptr_t)q+psz, FILL_BYTE, + qsz-psz); psz = qsz; } p = q; @@ -159,8 +160,9 @@ TEST_BEGIN(test_lg_align_and_zero) } else { assert_false(validate_fill(q, 0, 0, MAX_VALIDATE), "Expected zeroed memory"); - assert_false(validate_fill(q+sz-MAX_VALIDATE, 0, 0, - MAX_VALIDATE), "Expected zeroed memory"); + assert_false(validate_fill( + (void *)(uintptr_t)q+sz-MAX_VALIDATE, + 0, 0, MAX_VALIDATE), "Expected zeroed memory"); } p = q; } -- cgit v0.12 From a9df1ae622d0eb91a26208c03c51d0c518cce146 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 16:34:02 +0900 Subject: Use ULL prefix instead of LLU for unsigned long longs MSVC only supports the former. 
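Both suffix spellings denote an unsigned long long constant in standard C99, so this is purely a portability change in how the constants are written. A minimal standalone sketch of the two spellings, assuming a C99 toolchain for the LLU form (the constant value is borrowed from the hash.h hunk below):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{

	/* Both constants have type unsigned long long and the same value;
	 * per the message above, MSVC accepts only the ULL spelling. */
	uint64_t ull_form = 0x87c37b91114253d5ULL;
	uint64_t llu_form = 0x87c37b91114253d5LLU;

	printf("%d\n", ull_form == llu_form);	/* prints 1 */
	return (0);
}
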
--- include/jemalloc/internal/hash.h | 8 +- test/src/SFMT.c | 2 +- test/unit/SFMT.c | 2000 +++++++++++++++++++------------------- 3 files changed, 1005 insertions(+), 1005 deletions(-) diff --git a/include/jemalloc/internal/hash.h b/include/jemalloc/internal/hash.h index c7183ed..f2b3a16 100644 --- a/include/jemalloc/internal/hash.h +++ b/include/jemalloc/internal/hash.h @@ -76,9 +76,9 @@ hash_fmix_64(uint64_t k) { k ^= k >> 33; - k *= QU(0xff51afd7ed558ccdLLU); + k *= QU(0xff51afd7ed558ccdULL); k ^= k >> 33; - k *= QU(0xc4ceb9fe1a85ec53LLU); + k *= QU(0xc4ceb9fe1a85ec53ULL); k ^= k >> 33; return (k); @@ -247,8 +247,8 @@ hash_x64_128(const void *key, const int len, const uint32_t seed, uint64_t h1 = seed; uint64_t h2 = seed; - const uint64_t c1 = QU(0x87c37b91114253d5LLU); - const uint64_t c2 = QU(0x4cf5ad432745937fLLU); + const uint64_t c1 = QU(0x87c37b91114253d5ULL); + const uint64_t c2 = QU(0x4cf5ad432745937fULL); /* body */ { diff --git a/test/src/SFMT.c b/test/src/SFMT.c index e6f8dee..d2cc9d1 100644 --- a/test/src/SFMT.c +++ b/test/src/SFMT.c @@ -511,7 +511,7 @@ uint64_t gen_rand64(sfmt_t *ctx) { uint64_t gen_rand64_range(sfmt_t *ctx, uint64_t limit) { uint64_t ret, above; - above = 0xffffffffffffffffLLU - (0xffffffffffffffffLLU % limit); + above = 0xffffffffffffffffULL - (0xffffffffffffffffULL % limit); while (1) { ret = gen_rand64(ctx); if (ret < above) { diff --git a/test/unit/SFMT.c b/test/unit/SFMT.c index c57bd68..0ad9c23 100644 --- a/test/unit/SFMT.c +++ b/test/unit/SFMT.c @@ -445,1008 +445,1008 @@ static const uint32_t init_by_array_32_expected[] = { 2750138839U, 3518055702U, 733072558U, 4169325400U, 788493625U }; static const uint64_t init_gen_rand_64_expected[] = { - QU(16924766246869039260LLU), QU( 8201438687333352714LLU), - QU( 2265290287015001750LLU), QU(18397264611805473832LLU), - QU( 3375255223302384358LLU), QU( 6345559975416828796LLU), - QU(18229739242790328073LLU), QU( 7596792742098800905LLU), - QU( 255338647169685981LLU), QU( 2052747240048610300LLU), - QU(18328151576097299343LLU), QU(12472905421133796567LLU), - QU(11315245349717600863LLU), QU(16594110197775871209LLU), - QU(15708751964632456450LLU), QU(10452031272054632535LLU), - QU(11097646720811454386LLU), QU( 4556090668445745441LLU), - QU(17116187693090663106LLU), QU(14931526836144510645LLU), - QU( 9190752218020552591LLU), QU( 9625800285771901401LLU), - QU(13995141077659972832LLU), QU( 5194209094927829625LLU), - QU( 4156788379151063303LLU), QU( 8523452593770139494LLU), - QU(14082382103049296727LLU), QU( 2462601863986088483LLU), - QU( 3030583461592840678LLU), QU( 5221622077872827681LLU), - QU( 3084210671228981236LLU), QU(13956758381389953823LLU), - QU(13503889856213423831LLU), QU(15696904024189836170LLU), - QU( 4612584152877036206LLU), QU( 6231135538447867881LLU), - QU(10172457294158869468LLU), QU( 6452258628466708150LLU), - QU(14044432824917330221LLU), QU( 370168364480044279LLU), - QU(10102144686427193359LLU), QU( 667870489994776076LLU), - QU( 2732271956925885858LLU), QU(18027788905977284151LLU), - QU(15009842788582923859LLU), QU( 7136357960180199542LLU), - QU(15901736243475578127LLU), QU(16951293785352615701LLU), - QU(10551492125243691632LLU), QU(17668869969146434804LLU), - QU(13646002971174390445LLU), QU( 9804471050759613248LLU), - QU( 5511670439655935493LLU), QU(18103342091070400926LLU), - QU(17224512747665137533LLU), QU(15534627482992618168LLU), - QU( 1423813266186582647LLU), QU(15821176807932930024LLU), - QU( 30323369733607156LLU), QU(11599382494723479403LLU), - QU( 653856076586810062LLU), QU( 
3176437395144899659LLU), - QU(14028076268147963917LLU), QU(16156398271809666195LLU), - QU( 3166955484848201676LLU), QU( 5746805620136919390LLU), - QU(17297845208891256593LLU), QU(11691653183226428483LLU), - QU(17900026146506981577LLU), QU(15387382115755971042LLU), - QU(16923567681040845943LLU), QU( 8039057517199388606LLU), - QU(11748409241468629263LLU), QU( 794358245539076095LLU), - QU(13438501964693401242LLU), QU(14036803236515618962LLU), - QU( 5252311215205424721LLU), QU(17806589612915509081LLU), - QU( 6802767092397596006LLU), QU(14212120431184557140LLU), - QU( 1072951366761385712LLU), QU(13098491780722836296LLU), - QU( 9466676828710797353LLU), QU(12673056849042830081LLU), - QU(12763726623645357580LLU), QU(16468961652999309493LLU), - QU(15305979875636438926LLU), QU(17444713151223449734LLU), - QU( 5692214267627883674LLU), QU(13049589139196151505LLU), - QU( 880115207831670745LLU), QU( 1776529075789695498LLU), - QU(16695225897801466485LLU), QU(10666901778795346845LLU), - QU( 6164389346722833869LLU), QU( 2863817793264300475LLU), - QU( 9464049921886304754LLU), QU( 3993566636740015468LLU), - QU( 9983749692528514136LLU), QU(16375286075057755211LLU), - QU(16042643417005440820LLU), QU(11445419662923489877LLU), - QU( 7999038846885158836LLU), QU( 6721913661721511535LLU), - QU( 5363052654139357320LLU), QU( 1817788761173584205LLU), - QU(13290974386445856444LLU), QU( 4650350818937984680LLU), - QU( 8219183528102484836LLU), QU( 1569862923500819899LLU), - QU( 4189359732136641860LLU), QU(14202822961683148583LLU), - QU( 4457498315309429058LLU), QU(13089067387019074834LLU), - QU(11075517153328927293LLU), QU(10277016248336668389LLU), - QU( 7070509725324401122LLU), QU(17808892017780289380LLU), - QU(13143367339909287349LLU), QU( 1377743745360085151LLU), - QU( 5749341807421286485LLU), QU(14832814616770931325LLU), - QU( 7688820635324359492LLU), QU(10960474011539770045LLU), - QU( 81970066653179790LLU), QU(12619476072607878022LLU), - QU( 4419566616271201744LLU), QU(15147917311750568503LLU), - QU( 5549739182852706345LLU), QU( 7308198397975204770LLU), - QU(13580425496671289278LLU), QU(17070764785210130301LLU), - QU( 8202832846285604405LLU), QU( 6873046287640887249LLU), - QU( 6927424434308206114LLU), QU( 6139014645937224874LLU), - QU(10290373645978487639LLU), QU(15904261291701523804LLU), - QU( 9628743442057826883LLU), QU(18383429096255546714LLU), - QU( 4977413265753686967LLU), QU( 7714317492425012869LLU), - QU( 9025232586309926193LLU), QU(14627338359776709107LLU), - QU(14759849896467790763LLU), QU(10931129435864423252LLU), - QU( 4588456988775014359LLU), QU(10699388531797056724LLU), - QU( 468652268869238792LLU), QU( 5755943035328078086LLU), - QU( 2102437379988580216LLU), QU( 9986312786506674028LLU), - QU( 2654207180040945604LLU), QU( 8726634790559960062LLU), - QU( 100497234871808137LLU), QU( 2800137176951425819LLU), - QU( 6076627612918553487LLU), QU( 5780186919186152796LLU), - QU( 8179183595769929098LLU), QU( 6009426283716221169LLU), - QU( 2796662551397449358LLU), QU( 1756961367041986764LLU), - QU( 6972897917355606205LLU), QU(14524774345368968243LLU), - QU( 2773529684745706940LLU), QU( 4853632376213075959LLU), - QU( 4198177923731358102LLU), QU( 8271224913084139776LLU), - QU( 2741753121611092226LLU), QU(16782366145996731181LLU), - QU(15426125238972640790LLU), QU(13595497100671260342LLU), - QU( 3173531022836259898LLU), QU( 6573264560319511662LLU), - QU(18041111951511157441LLU), QU( 2351433581833135952LLU), - QU( 3113255578908173487LLU), QU( 1739371330877858784LLU), - QU(16046126562789165480LLU), QU( 
8072101652214192925LLU), - QU(15267091584090664910LLU), QU( 9309579200403648940LLU), - QU( 5218892439752408722LLU), QU(14492477246004337115LLU), - QU(17431037586679770619LLU), QU( 7385248135963250480LLU), - QU( 9580144956565560660LLU), QU( 4919546228040008720LLU), - QU(15261542469145035584LLU), QU(18233297270822253102LLU), - QU( 5453248417992302857LLU), QU( 9309519155931460285LLU), - QU(10342813012345291756LLU), QU(15676085186784762381LLU), - QU(15912092950691300645LLU), QU( 9371053121499003195LLU), - QU( 9897186478226866746LLU), QU(14061858287188196327LLU), - QU( 122575971620788119LLU), QU(12146750969116317754LLU), - QU( 4438317272813245201LLU), QU( 8332576791009527119LLU), - QU(13907785691786542057LLU), QU(10374194887283287467LLU), - QU( 2098798755649059566LLU), QU( 3416235197748288894LLU), - QU( 8688269957320773484LLU), QU( 7503964602397371571LLU), - QU(16724977015147478236LLU), QU( 9461512855439858184LLU), - QU(13259049744534534727LLU), QU( 3583094952542899294LLU), - QU( 8764245731305528292LLU), QU(13240823595462088985LLU), - QU(13716141617617910448LLU), QU(18114969519935960955LLU), - QU( 2297553615798302206LLU), QU( 4585521442944663362LLU), - QU(17776858680630198686LLU), QU( 4685873229192163363LLU), - QU( 152558080671135627LLU), QU(15424900540842670088LLU), - QU(13229630297130024108LLU), QU(17530268788245718717LLU), - QU(16675633913065714144LLU), QU( 3158912717897568068LLU), - QU(15399132185380087288LLU), QU( 7401418744515677872LLU), - QU(13135412922344398535LLU), QU( 6385314346100509511LLU), - QU(13962867001134161139LLU), QU(10272780155442671999LLU), - QU(12894856086597769142LLU), QU(13340877795287554994LLU), - QU(12913630602094607396LLU), QU(12543167911119793857LLU), - QU(17343570372251873096LLU), QU(10959487764494150545LLU), - QU( 6966737953093821128LLU), QU(13780699135496988601LLU), - QU( 4405070719380142046LLU), QU(14923788365607284982LLU), - QU( 2869487678905148380LLU), QU( 6416272754197188403LLU), - QU(15017380475943612591LLU), QU( 1995636220918429487LLU), - QU( 3402016804620122716LLU), QU(15800188663407057080LLU), - QU(11362369990390932882LLU), QU(15262183501637986147LLU), - QU(10239175385387371494LLU), QU( 9352042420365748334LLU), - QU( 1682457034285119875LLU), QU( 1724710651376289644LLU), - QU( 2038157098893817966LLU), QU( 9897825558324608773LLU), - QU( 1477666236519164736LLU), QU(16835397314511233640LLU), - QU(10370866327005346508LLU), QU(10157504370660621982LLU), - QU(12113904045335882069LLU), QU(13326444439742783008LLU), - QU(11302769043000765804LLU), QU(13594979923955228484LLU), - QU(11779351762613475968LLU), QU( 3786101619539298383LLU), - QU( 8021122969180846063LLU), QU(15745904401162500495LLU), - QU(10762168465993897267LLU), QU(13552058957896319026LLU), - QU(11200228655252462013LLU), QU( 5035370357337441226LLU), - QU( 7593918984545500013LLU), QU( 5418554918361528700LLU), - QU( 4858270799405446371LLU), QU( 9974659566876282544LLU), - QU(18227595922273957859LLU), QU( 2772778443635656220LLU), - QU(14285143053182085385LLU), QU( 9939700992429600469LLU), - QU(12756185904545598068LLU), QU( 2020783375367345262LLU), - QU( 57026775058331227LLU), QU( 950827867930065454LLU), - QU( 6602279670145371217LLU), QU( 2291171535443566929LLU), - QU( 5832380724425010313LLU), QU( 1220343904715982285LLU), - QU(17045542598598037633LLU), QU(15460481779702820971LLU), - QU(13948388779949365130LLU), QU(13975040175430829518LLU), - QU(17477538238425541763LLU), QU(11104663041851745725LLU), - QU(15860992957141157587LLU), QU(14529434633012950138LLU), - QU( 2504838019075394203LLU), QU( 
7512113882611121886LLU), - QU( 4859973559980886617LLU), QU( 1258601555703250219LLU), - QU(15594548157514316394LLU), QU( 4516730171963773048LLU), - QU(11380103193905031983LLU), QU( 6809282239982353344LLU), - QU(18045256930420065002LLU), QU( 2453702683108791859LLU), - QU( 977214582986981460LLU), QU( 2006410402232713466LLU), - QU( 6192236267216378358LLU), QU( 3429468402195675253LLU), - QU(18146933153017348921LLU), QU(17369978576367231139LLU), - QU( 1246940717230386603LLU), QU(11335758870083327110LLU), - QU(14166488801730353682LLU), QU( 9008573127269635732LLU), - QU(10776025389820643815LLU), QU(15087605441903942962LLU), - QU( 1359542462712147922LLU), QU(13898874411226454206LLU), - QU(17911176066536804411LLU), QU( 9435590428600085274LLU), - QU( 294488509967864007LLU), QU( 8890111397567922046LLU), - QU( 7987823476034328778LLU), QU(13263827582440967651LLU), - QU( 7503774813106751573LLU), QU(14974747296185646837LLU), - QU( 8504765037032103375LLU), QU(17340303357444536213LLU), - QU( 7704610912964485743LLU), QU( 8107533670327205061LLU), - QU( 9062969835083315985LLU), QU(16968963142126734184LLU), - QU(12958041214190810180LLU), QU( 2720170147759570200LLU), - QU( 2986358963942189566LLU), QU(14884226322219356580LLU), - QU( 286224325144368520LLU), QU(11313800433154279797LLU), - QU(18366849528439673248LLU), QU(17899725929482368789LLU), - QU( 3730004284609106799LLU), QU( 1654474302052767205LLU), - QU( 5006698007047077032LLU), QU( 8196893913601182838LLU), - QU(15214541774425211640LLU), QU(17391346045606626073LLU), - QU( 8369003584076969089LLU), QU( 3939046733368550293LLU), - QU(10178639720308707785LLU), QU( 2180248669304388697LLU), - QU( 62894391300126322LLU), QU( 9205708961736223191LLU), - QU( 6837431058165360438LLU), QU( 3150743890848308214LLU), - QU(17849330658111464583LLU), QU(12214815643135450865LLU), - QU(13410713840519603402LLU), QU( 3200778126692046802LLU), - QU(13354780043041779313LLU), QU( 800850022756886036LLU), - QU(15660052933953067433LLU), QU( 6572823544154375676LLU), - QU(11030281857015819266LLU), QU(12682241941471433835LLU), - QU(11654136407300274693LLU), QU( 4517795492388641109LLU), - QU( 9757017371504524244LLU), QU(17833043400781889277LLU), - QU(12685085201747792227LLU), QU(10408057728835019573LLU), - QU( 98370418513455221LLU), QU( 6732663555696848598LLU), - QU(13248530959948529780LLU), QU( 3530441401230622826LLU), - QU(18188251992895660615LLU), QU( 1847918354186383756LLU), - QU( 1127392190402660921LLU), QU(11293734643143819463LLU), - QU( 3015506344578682982LLU), QU(13852645444071153329LLU), - QU( 2121359659091349142LLU), QU( 1294604376116677694LLU), - QU( 5616576231286352318LLU), QU( 7112502442954235625LLU), - QU(11676228199551561689LLU), QU(12925182803007305359LLU), - QU( 7852375518160493082LLU), QU( 1136513130539296154LLU), - QU( 5636923900916593195LLU), QU( 3221077517612607747LLU), - QU(17784790465798152513LLU), QU( 3554210049056995938LLU), - QU(17476839685878225874LLU), QU( 3206836372585575732LLU), - QU( 2765333945644823430LLU), QU(10080070903718799528LLU), - QU( 5412370818878286353LLU), QU( 9689685887726257728LLU), - QU( 8236117509123533998LLU), QU( 1951139137165040214LLU), - QU( 4492205209227980349LLU), QU(16541291230861602967LLU), - QU( 1424371548301437940LLU), QU( 9117562079669206794LLU), - QU(14374681563251691625LLU), QU(13873164030199921303LLU), - QU( 6680317946770936731LLU), QU(15586334026918276214LLU), - QU(10896213950976109802LLU), QU( 9506261949596413689LLU), - QU( 9903949574308040616LLU), QU( 6038397344557204470LLU), - QU( 174601465422373648LLU), 
QU(15946141191338238030LLU), - QU(17142225620992044937LLU), QU( 7552030283784477064LLU), - QU( 2947372384532947997LLU), QU( 510797021688197711LLU), - QU( 4962499439249363461LLU), QU( 23770320158385357LLU), - QU( 959774499105138124LLU), QU( 1468396011518788276LLU), - QU( 2015698006852312308LLU), QU( 4149400718489980136LLU), - QU( 5992916099522371188LLU), QU(10819182935265531076LLU), - QU(16189787999192351131LLU), QU( 342833961790261950LLU), - QU(12470830319550495336LLU), QU(18128495041912812501LLU), - QU( 1193600899723524337LLU), QU( 9056793666590079770LLU), - QU( 2154021227041669041LLU), QU( 4963570213951235735LLU), - QU( 4865075960209211409LLU), QU( 2097724599039942963LLU), - QU( 2024080278583179845LLU), QU(11527054549196576736LLU), - QU(10650256084182390252LLU), QU( 4808408648695766755LLU), - QU( 1642839215013788844LLU), QU(10607187948250398390LLU), - QU( 7076868166085913508LLU), QU( 730522571106887032LLU), - QU(12500579240208524895LLU), QU( 4484390097311355324LLU), - QU(15145801330700623870LLU), QU( 8055827661392944028LLU), - QU( 5865092976832712268LLU), QU(15159212508053625143LLU), - QU( 3560964582876483341LLU), QU( 4070052741344438280LLU), - QU( 6032585709886855634LLU), QU(15643262320904604873LLU), - QU( 2565119772293371111LLU), QU( 318314293065348260LLU), - QU(15047458749141511872LLU), QU( 7772788389811528730LLU), - QU( 7081187494343801976LLU), QU( 6465136009467253947LLU), - QU(10425940692543362069LLU), QU( 554608190318339115LLU), - QU(14796699860302125214LLU), QU( 1638153134431111443LLU), - QU(10336967447052276248LLU), QU( 8412308070396592958LLU), - QU( 4004557277152051226LLU), QU( 8143598997278774834LLU), - QU(16413323996508783221LLU), QU(13139418758033994949LLU), - QU( 9772709138335006667LLU), QU( 2818167159287157659LLU), - QU(17091740573832523669LLU), QU(14629199013130751608LLU), - QU(18268322711500338185LLU), QU( 8290963415675493063LLU), - QU( 8830864907452542588LLU), QU( 1614839084637494849LLU), - QU(14855358500870422231LLU), QU( 3472996748392519937LLU), - QU(15317151166268877716LLU), QU( 5825895018698400362LLU), - QU(16730208429367544129LLU), QU(10481156578141202800LLU), - QU( 4746166512382823750LLU), QU(12720876014472464998LLU), - QU( 8825177124486735972LLU), QU(13733447296837467838LLU), - QU( 6412293741681359625LLU), QU( 8313213138756135033LLU), - QU(11421481194803712517LLU), QU( 7997007691544174032LLU), - QU( 6812963847917605930LLU), QU( 9683091901227558641LLU), - QU(14703594165860324713LLU), QU( 1775476144519618309LLU), - QU( 2724283288516469519LLU), QU( 717642555185856868LLU), - QU( 8736402192215092346LLU), QU(11878800336431381021LLU), - QU( 4348816066017061293LLU), QU( 6115112756583631307LLU), - QU( 9176597239667142976LLU), QU(12615622714894259204LLU), - QU(10283406711301385987LLU), QU( 5111762509485379420LLU), - QU( 3118290051198688449LLU), QU( 7345123071632232145LLU), - QU( 9176423451688682359LLU), QU( 4843865456157868971LLU), - QU(12008036363752566088LLU), QU(12058837181919397720LLU), - QU( 2145073958457347366LLU), QU( 1526504881672818067LLU), - QU( 3488830105567134848LLU), QU(13208362960674805143LLU), - QU( 4077549672899572192LLU), QU( 7770995684693818365LLU), - QU( 1398532341546313593LLU), QU(12711859908703927840LLU), - QU( 1417561172594446813LLU), QU(17045191024194170604LLU), - QU( 4101933177604931713LLU), QU(14708428834203480320LLU), - QU(17447509264469407724LLU), QU(14314821973983434255LLU), - QU(17990472271061617265LLU), QU( 5087756685841673942LLU), - QU(12797820586893859939LLU), QU( 1778128952671092879LLU), - QU( 3535918530508665898LLU), QU( 
9035729701042481301LLU), - QU(14808661568277079962LLU), QU(14587345077537747914LLU), - QU(11920080002323122708LLU), QU( 6426515805197278753LLU), - QU( 3295612216725984831LLU), QU(11040722532100876120LLU), - QU(12305952936387598754LLU), QU(16097391899742004253LLU), - QU( 4908537335606182208LLU), QU(12446674552196795504LLU), - QU(16010497855816895177LLU), QU( 9194378874788615551LLU), - QU( 3382957529567613384LLU), QU( 5154647600754974077LLU), - QU( 9801822865328396141LLU), QU( 9023662173919288143LLU), - QU(17623115353825147868LLU), QU( 8238115767443015816LLU), - QU(15811444159859002560LLU), QU( 9085612528904059661LLU), - QU( 6888601089398614254LLU), QU( 258252992894160189LLU), - QU( 6704363880792428622LLU), QU( 6114966032147235763LLU), - QU(11075393882690261875LLU), QU( 8797664238933620407LLU), - QU( 5901892006476726920LLU), QU( 5309780159285518958LLU), - QU(14940808387240817367LLU), QU(14642032021449656698LLU), - QU( 9808256672068504139LLU), QU( 3670135111380607658LLU), - QU(11211211097845960152LLU), QU( 1474304506716695808LLU), - QU(15843166204506876239LLU), QU( 7661051252471780561LLU), - QU(10170905502249418476LLU), QU( 7801416045582028589LLU), - QU( 2763981484737053050LLU), QU( 9491377905499253054LLU), - QU(16201395896336915095LLU), QU( 9256513756442782198LLU), - QU( 5411283157972456034LLU), QU( 5059433122288321676LLU), - QU( 4327408006721123357LLU), QU( 9278544078834433377LLU), - QU( 7601527110882281612LLU), QU(11848295896975505251LLU), - QU(12096998801094735560LLU), QU(14773480339823506413LLU), - QU(15586227433895802149LLU), QU(12786541257830242872LLU), - QU( 6904692985140503067LLU), QU( 5309011515263103959LLU), - QU(12105257191179371066LLU), QU(14654380212442225037LLU), - QU( 2556774974190695009LLU), QU( 4461297399927600261LLU), - QU(14888225660915118646LLU), QU(14915459341148291824LLU), - QU( 2738802166252327631LLU), QU( 6047155789239131512LLU), - QU(12920545353217010338LLU), QU(10697617257007840205LLU), - QU( 2751585253158203504LLU), QU(13252729159780047496LLU), - QU(14700326134672815469LLU), QU(14082527904374600529LLU), - QU(16852962273496542070LLU), QU(17446675504235853907LLU), - QU(15019600398527572311LLU), QU(12312781346344081551LLU), - QU(14524667935039810450LLU), QU( 5634005663377195738LLU), - QU(11375574739525000569LLU), QU( 2423665396433260040LLU), - QU( 5222836914796015410LLU), QU( 4397666386492647387LLU), - QU( 4619294441691707638LLU), QU( 665088602354770716LLU), - QU(13246495665281593610LLU), QU( 6564144270549729409LLU), - QU(10223216188145661688LLU), QU( 3961556907299230585LLU), - QU(11543262515492439914LLU), QU(16118031437285993790LLU), - QU( 7143417964520166465LLU), QU(13295053515909486772LLU), - QU( 40434666004899675LLU), QU(17127804194038347164LLU), - QU( 8599165966560586269LLU), QU( 8214016749011284903LLU), - QU(13725130352140465239LLU), QU( 5467254474431726291LLU), - QU( 7748584297438219877LLU), QU(16933551114829772472LLU), - QU( 2169618439506799400LLU), QU( 2169787627665113463LLU), - QU(17314493571267943764LLU), QU(18053575102911354912LLU), - QU(11928303275378476973LLU), QU(11593850925061715550LLU), - QU(17782269923473589362LLU), QU( 3280235307704747039LLU), - QU( 6145343578598685149LLU), QU(17080117031114086090LLU), - QU(18066839902983594755LLU), QU( 6517508430331020706LLU), - QU( 8092908893950411541LLU), QU(12558378233386153732LLU), - QU( 4476532167973132976LLU), QU(16081642430367025016LLU), - QU( 4233154094369139361LLU), QU( 8693630486693161027LLU), - QU(11244959343027742285LLU), QU(12273503967768513508LLU), - QU(14108978636385284876LLU), QU( 
7242414665378826984LLU), - QU( 6561316938846562432LLU), QU( 8601038474994665795LLU), - QU(17532942353612365904LLU), QU(17940076637020912186LLU), - QU( 7340260368823171304LLU), QU( 7061807613916067905LLU), - QU(10561734935039519326LLU), QU(17990796503724650862LLU), - QU( 6208732943911827159LLU), QU( 359077562804090617LLU), - QU(14177751537784403113LLU), QU(10659599444915362902LLU), - QU(15081727220615085833LLU), QU(13417573895659757486LLU), - QU(15513842342017811524LLU), QU(11814141516204288231LLU), - QU( 1827312513875101814LLU), QU( 2804611699894603103LLU), - QU(17116500469975602763LLU), QU(12270191815211952087LLU), - QU(12256358467786024988LLU), QU(18435021722453971267LLU), - QU( 671330264390865618LLU), QU( 476504300460286050LLU), - QU(16465470901027093441LLU), QU( 4047724406247136402LLU), - QU( 1322305451411883346LLU), QU( 1388308688834322280LLU), - QU( 7303989085269758176LLU), QU( 9323792664765233642LLU), - QU( 4542762575316368936LLU), QU(17342696132794337618LLU), - QU( 4588025054768498379LLU), QU(13415475057390330804LLU), - QU(17880279491733405570LLU), QU(10610553400618620353LLU), - QU( 3180842072658960139LLU), QU(13002966655454270120LLU), - QU( 1665301181064982826LLU), QU( 7083673946791258979LLU), - QU( 190522247122496820LLU), QU(17388280237250677740LLU), - QU( 8430770379923642945LLU), QU(12987180971921668584LLU), - QU( 2311086108365390642LLU), QU( 2870984383579822345LLU), - QU(14014682609164653318LLU), QU(14467187293062251484LLU), - QU( 192186361147413298LLU), QU(15171951713531796524LLU), - QU( 9900305495015948728LLU), QU(17958004775615466344LLU), - QU(14346380954498606514LLU), QU(18040047357617407096LLU), - QU( 5035237584833424532LLU), QU(15089555460613972287LLU), - QU( 4131411873749729831LLU), QU( 1329013581168250330LLU), - QU(10095353333051193949LLU), QU(10749518561022462716LLU), - QU( 9050611429810755847LLU), QU(15022028840236655649LLU), - QU( 8775554279239748298LLU), QU(13105754025489230502LLU), - QU(15471300118574167585LLU), QU( 89864764002355628LLU), - QU( 8776416323420466637LLU), QU( 5280258630612040891LLU), - QU( 2719174488591862912LLU), QU( 7599309137399661994LLU), - QU(15012887256778039979LLU), QU(14062981725630928925LLU), - QU(12038536286991689603LLU), QU( 7089756544681775245LLU), - QU(10376661532744718039LLU), QU( 1265198725901533130LLU), - QU(13807996727081142408LLU), QU( 2935019626765036403LLU), - QU( 7651672460680700141LLU), QU( 3644093016200370795LLU), - QU( 2840982578090080674LLU), QU(17956262740157449201LLU), - QU(18267979450492880548LLU), QU(11799503659796848070LLU), - QU( 9942537025669672388LLU), QU(11886606816406990297LLU), - QU( 5488594946437447576LLU), QU( 7226714353282744302LLU), - QU( 3784851653123877043LLU), QU( 878018453244803041LLU), - QU(12110022586268616085LLU), QU( 734072179404675123LLU), - QU(11869573627998248542LLU), QU( 469150421297783998LLU), - QU( 260151124912803804LLU), QU(11639179410120968649LLU), - QU( 9318165193840846253LLU), QU(12795671722734758075LLU), - QU(15318410297267253933LLU), QU( 691524703570062620LLU), - QU( 5837129010576994601LLU), QU(15045963859726941052LLU), - QU( 5850056944932238169LLU), QU(12017434144750943807LLU), - QU( 7447139064928956574LLU), QU( 3101711812658245019LLU), - QU(16052940704474982954LLU), QU(18195745945986994042LLU), - QU( 8932252132785575659LLU), QU(13390817488106794834LLU), - QU(11582771836502517453LLU), QU( 4964411326683611686LLU), - QU( 2195093981702694011LLU), QU(14145229538389675669LLU), - QU(16459605532062271798LLU), QU( 866316924816482864LLU), - QU( 4593041209937286377LLU), QU( 8415491391910972138LLU), - 
QU( 4171236715600528969LLU), QU(16637569303336782889LLU), - QU( 2002011073439212680LLU), QU(17695124661097601411LLU), - QU( 4627687053598611702LLU), QU( 7895831936020190403LLU), - QU( 8455951300917267802LLU), QU( 2923861649108534854LLU), - QU( 8344557563927786255LLU), QU( 6408671940373352556LLU), - QU(12210227354536675772LLU), QU(14294804157294222295LLU), - QU(10103022425071085127LLU), QU(10092959489504123771LLU), - QU( 6554774405376736268LLU), QU(12629917718410641774LLU), - QU( 6260933257596067126LLU), QU( 2460827021439369673LLU), - QU( 2541962996717103668LLU), QU( 597377203127351475LLU), - QU( 5316984203117315309LLU), QU( 4811211393563241961LLU), - QU(13119698597255811641LLU), QU( 8048691512862388981LLU), - QU(10216818971194073842LLU), QU( 4612229970165291764LLU), - QU(10000980798419974770LLU), QU( 6877640812402540687LLU), - QU( 1488727563290436992LLU), QU( 2227774069895697318LLU), - QU(11237754507523316593LLU), QU(13478948605382290972LLU), - QU( 1963583846976858124LLU), QU( 5512309205269276457LLU), - QU( 3972770164717652347LLU), QU( 3841751276198975037LLU), - QU(10283343042181903117LLU), QU( 8564001259792872199LLU), - QU(16472187244722489221LLU), QU( 8953493499268945921LLU), - QU( 3518747340357279580LLU), QU( 4003157546223963073LLU), - QU( 3270305958289814590LLU), QU( 3966704458129482496LLU), - QU( 8122141865926661939LLU), QU(14627734748099506653LLU), - QU(13064426990862560568LLU), QU( 2414079187889870829LLU), - QU( 5378461209354225306LLU), QU(10841985740128255566LLU), - QU( 538582442885401738LLU), QU( 7535089183482905946LLU), - QU(16117559957598879095LLU), QU( 8477890721414539741LLU), - QU( 1459127491209533386LLU), QU(17035126360733620462LLU), - QU( 8517668552872379126LLU), QU(10292151468337355014LLU), - QU(17081267732745344157LLU), QU(13751455337946087178LLU), - QU(14026945459523832966LLU), QU( 6653278775061723516LLU), - QU(10619085543856390441LLU), QU( 2196343631481122885LLU), - QU(10045966074702826136LLU), QU(10082317330452718282LLU), - QU( 5920859259504831242LLU), QU( 9951879073426540617LLU), - QU( 7074696649151414158LLU), QU(15808193543879464318LLU), - QU( 7385247772746953374LLU), QU( 3192003544283864292LLU), - QU(18153684490917593847LLU), QU(12423498260668568905LLU), - QU(10957758099756378169LLU), QU(11488762179911016040LLU), - QU( 2099931186465333782LLU), QU(11180979581250294432LLU), - QU( 8098916250668367933LLU), QU( 3529200436790763465LLU), - QU(12988418908674681745LLU), QU( 6147567275954808580LLU), - QU( 3207503344604030989LLU), QU(10761592604898615360LLU), - QU( 229854861031893504LLU), QU( 8809853962667144291LLU), - QU(13957364469005693860LLU), QU( 7634287665224495886LLU), - QU(12353487366976556874LLU), QU( 1134423796317152034LLU), - QU( 2088992471334107068LLU), QU( 7393372127190799698LLU), - QU( 1845367839871058391LLU), QU( 207922563987322884LLU), - QU(11960870813159944976LLU), QU(12182120053317317363LLU), - QU(17307358132571709283LLU), QU(13871081155552824936LLU), - QU(18304446751741566262LLU), QU( 7178705220184302849LLU), - QU(10929605677758824425LLU), QU(16446976977835806844LLU), - QU(13723874412159769044LLU), QU( 6942854352100915216LLU), - QU( 1726308474365729390LLU), QU( 2150078766445323155LLU), - QU(15345558947919656626LLU), QU(12145453828874527201LLU), - QU( 2054448620739726849LLU), QU( 2740102003352628137LLU), - QU(11294462163577610655LLU), QU( 756164283387413743LLU), - QU(17841144758438810880LLU), QU(10802406021185415861LLU), - QU( 8716455530476737846LLU), QU( 6321788834517649606LLU), - QU(14681322910577468426LLU), QU(17330043563884336387LLU), - 
QU(12701802180050071614LLU), QU(14695105111079727151LLU), - QU( 5112098511654172830LLU), QU( 4957505496794139973LLU), - QU( 8270979451952045982LLU), QU(12307685939199120969LLU), - QU(12425799408953443032LLU), QU( 8376410143634796588LLU), - QU(16621778679680060464LLU), QU( 3580497854566660073LLU), - QU( 1122515747803382416LLU), QU( 857664980960597599LLU), - QU( 6343640119895925918LLU), QU(12878473260854462891LLU), - QU(10036813920765722626LLU), QU(14451335468363173812LLU), - QU( 5476809692401102807LLU), QU(16442255173514366342LLU), - QU(13060203194757167104LLU), QU(14354124071243177715LLU), - QU(15961249405696125227LLU), QU(13703893649690872584LLU), - QU( 363907326340340064LLU), QU( 6247455540491754842LLU), - QU(12242249332757832361LLU), QU( 156065475679796717LLU), - QU( 9351116235749732355LLU), QU( 4590350628677701405LLU), - QU( 1671195940982350389LLU), QU(13501398458898451905LLU), - QU( 6526341991225002255LLU), QU( 1689782913778157592LLU), - QU( 7439222350869010334LLU), QU(13975150263226478308LLU), - QU(11411961169932682710LLU), QU(17204271834833847277LLU), - QU( 541534742544435367LLU), QU( 6591191931218949684LLU), - QU( 2645454775478232486LLU), QU( 4322857481256485321LLU), - QU( 8477416487553065110LLU), QU(12902505428548435048LLU), - QU( 971445777981341415LLU), QU(14995104682744976712LLU), - QU( 4243341648807158063LLU), QU( 8695061252721927661LLU), - QU( 5028202003270177222LLU), QU( 2289257340915567840LLU), - QU(13870416345121866007LLU), QU(13994481698072092233LLU), - QU( 6912785400753196481LLU), QU( 2278309315841980139LLU), - QU( 4329765449648304839LLU), QU( 5963108095785485298LLU), - QU( 4880024847478722478LLU), QU(16015608779890240947LLU), - QU( 1866679034261393544LLU), QU( 914821179919731519LLU), - QU( 9643404035648760131LLU), QU( 2418114953615593915LLU), - QU( 944756836073702374LLU), QU(15186388048737296834LLU), - QU( 7723355336128442206LLU), QU( 7500747479679599691LLU), - QU(18013961306453293634LLU), QU( 2315274808095756456LLU), - QU(13655308255424029566LLU), QU(17203800273561677098LLU), - QU( 1382158694422087756LLU), QU( 5090390250309588976LLU), - QU( 517170818384213989LLU), QU( 1612709252627729621LLU), - QU( 1330118955572449606LLU), QU( 300922478056709885LLU), - QU(18115693291289091987LLU), QU(13491407109725238321LLU), - QU(15293714633593827320LLU), QU( 5151539373053314504LLU), - QU( 5951523243743139207LLU), QU(14459112015249527975LLU), - QU( 5456113959000700739LLU), QU( 3877918438464873016LLU), - QU(12534071654260163555LLU), QU(15871678376893555041LLU), - QU(11005484805712025549LLU), QU(16353066973143374252LLU), - QU( 4358331472063256685LLU), QU( 8268349332210859288LLU), - QU(12485161590939658075LLU), QU(13955993592854471343LLU), - QU( 5911446886848367039LLU), QU(14925834086813706974LLU), - QU( 6590362597857994805LLU), QU( 1280544923533661875LLU), - QU( 1637756018947988164LLU), QU( 4734090064512686329LLU), - QU(16693705263131485912LLU), QU( 6834882340494360958LLU), - QU( 8120732176159658505LLU), QU( 2244371958905329346LLU), - QU(10447499707729734021LLU), QU( 7318742361446942194LLU), - QU( 8032857516355555296LLU), QU(14023605983059313116LLU), - QU( 1032336061815461376LLU), QU( 9840995337876562612LLU), - QU( 9869256223029203587LLU), QU(12227975697177267636LLU), - QU(12728115115844186033LLU), QU( 7752058479783205470LLU), - QU( 729733219713393087LLU), QU(12954017801239007622LLU) + QU(16924766246869039260ULL), QU( 8201438687333352714ULL), + QU( 2265290287015001750ULL), QU(18397264611805473832ULL), + QU( 3375255223302384358ULL), QU( 6345559975416828796ULL), + 
QU(18229739242790328073ULL), QU( 7596792742098800905ULL), + QU( 255338647169685981ULL), QU( 2052747240048610300ULL), + QU(18328151576097299343ULL), QU(12472905421133796567ULL), + QU(11315245349717600863ULL), QU(16594110197775871209ULL), + QU(15708751964632456450ULL), QU(10452031272054632535ULL), + QU(11097646720811454386ULL), QU( 4556090668445745441ULL), + QU(17116187693090663106ULL), QU(14931526836144510645ULL), + QU( 9190752218020552591ULL), QU( 9625800285771901401ULL), + QU(13995141077659972832ULL), QU( 5194209094927829625ULL), + QU( 4156788379151063303ULL), QU( 8523452593770139494ULL), + QU(14082382103049296727ULL), QU( 2462601863986088483ULL), + QU( 3030583461592840678ULL), QU( 5221622077872827681ULL), + QU( 3084210671228981236ULL), QU(13956758381389953823ULL), + QU(13503889856213423831ULL), QU(15696904024189836170ULL), + QU( 4612584152877036206ULL), QU( 6231135538447867881ULL), + QU(10172457294158869468ULL), QU( 6452258628466708150ULL), + QU(14044432824917330221ULL), QU( 370168364480044279ULL), + QU(10102144686427193359ULL), QU( 667870489994776076ULL), + QU( 2732271956925885858ULL), QU(18027788905977284151ULL), + QU(15009842788582923859ULL), QU( 7136357960180199542ULL), + QU(15901736243475578127ULL), QU(16951293785352615701ULL), + QU(10551492125243691632ULL), QU(17668869969146434804ULL), + QU(13646002971174390445ULL), QU( 9804471050759613248ULL), + QU( 5511670439655935493ULL), QU(18103342091070400926ULL), + QU(17224512747665137533ULL), QU(15534627482992618168ULL), + QU( 1423813266186582647ULL), QU(15821176807932930024ULL), + QU( 30323369733607156ULL), QU(11599382494723479403ULL), + QU( 653856076586810062ULL), QU( 3176437395144899659ULL), + QU(14028076268147963917ULL), QU(16156398271809666195ULL), + QU( 3166955484848201676ULL), QU( 5746805620136919390ULL), + QU(17297845208891256593ULL), QU(11691653183226428483ULL), + QU(17900026146506981577ULL), QU(15387382115755971042ULL), + QU(16923567681040845943ULL), QU( 8039057517199388606ULL), + QU(11748409241468629263ULL), QU( 794358245539076095ULL), + QU(13438501964693401242ULL), QU(14036803236515618962ULL), + QU( 5252311215205424721ULL), QU(17806589612915509081ULL), + QU( 6802767092397596006ULL), QU(14212120431184557140ULL), + QU( 1072951366761385712ULL), QU(13098491780722836296ULL), + QU( 9466676828710797353ULL), QU(12673056849042830081ULL), + QU(12763726623645357580ULL), QU(16468961652999309493ULL), + QU(15305979875636438926ULL), QU(17444713151223449734ULL), + QU( 5692214267627883674ULL), QU(13049589139196151505ULL), + QU( 880115207831670745ULL), QU( 1776529075789695498ULL), + QU(16695225897801466485ULL), QU(10666901778795346845ULL), + QU( 6164389346722833869ULL), QU( 2863817793264300475ULL), + QU( 9464049921886304754ULL), QU( 3993566636740015468ULL), + QU( 9983749692528514136ULL), QU(16375286075057755211ULL), + QU(16042643417005440820ULL), QU(11445419662923489877ULL), + QU( 7999038846885158836ULL), QU( 6721913661721511535ULL), + QU( 5363052654139357320ULL), QU( 1817788761173584205ULL), + QU(13290974386445856444ULL), QU( 4650350818937984680ULL), + QU( 8219183528102484836ULL), QU( 1569862923500819899ULL), + QU( 4189359732136641860ULL), QU(14202822961683148583ULL), + QU( 4457498315309429058ULL), QU(13089067387019074834ULL), + QU(11075517153328927293ULL), QU(10277016248336668389ULL), + QU( 7070509725324401122ULL), QU(17808892017780289380ULL), + QU(13143367339909287349ULL), QU( 1377743745360085151ULL), + QU( 5749341807421286485ULL), QU(14832814616770931325ULL), + QU( 7688820635324359492ULL), QU(10960474011539770045ULL), + QU( 
81970066653179790ULL), QU(12619476072607878022ULL), + QU( 4419566616271201744ULL), QU(15147917311750568503ULL), + QU( 5549739182852706345ULL), QU( 7308198397975204770ULL), + QU(13580425496671289278ULL), QU(17070764785210130301ULL), + QU( 8202832846285604405ULL), QU( 6873046287640887249ULL), + QU( 6927424434308206114ULL), QU( 6139014645937224874ULL), + QU(10290373645978487639ULL), QU(15904261291701523804ULL), + QU( 9628743442057826883ULL), QU(18383429096255546714ULL), + QU( 4977413265753686967ULL), QU( 7714317492425012869ULL), + QU( 9025232586309926193ULL), QU(14627338359776709107ULL), + QU(14759849896467790763ULL), QU(10931129435864423252ULL), + QU( 4588456988775014359ULL), QU(10699388531797056724ULL), + QU( 468652268869238792ULL), QU( 5755943035328078086ULL), + QU( 2102437379988580216ULL), QU( 9986312786506674028ULL), + QU( 2654207180040945604ULL), QU( 8726634790559960062ULL), + QU( 100497234871808137ULL), QU( 2800137176951425819ULL), + QU( 6076627612918553487ULL), QU( 5780186919186152796ULL), + QU( 8179183595769929098ULL), QU( 6009426283716221169ULL), + QU( 2796662551397449358ULL), QU( 1756961367041986764ULL), + QU( 6972897917355606205ULL), QU(14524774345368968243ULL), + QU( 2773529684745706940ULL), QU( 4853632376213075959ULL), + QU( 4198177923731358102ULL), QU( 8271224913084139776ULL), + QU( 2741753121611092226ULL), QU(16782366145996731181ULL), + QU(15426125238972640790ULL), QU(13595497100671260342ULL), + QU( 3173531022836259898ULL), QU( 6573264560319511662ULL), + QU(18041111951511157441ULL), QU( 2351433581833135952ULL), + QU( 3113255578908173487ULL), QU( 1739371330877858784ULL), + QU(16046126562789165480ULL), QU( 8072101652214192925ULL), + QU(15267091584090664910ULL), QU( 9309579200403648940ULL), + QU( 5218892439752408722ULL), QU(14492477246004337115ULL), + QU(17431037586679770619ULL), QU( 7385248135963250480ULL), + QU( 9580144956565560660ULL), QU( 4919546228040008720ULL), + QU(15261542469145035584ULL), QU(18233297270822253102ULL), + QU( 5453248417992302857ULL), QU( 9309519155931460285ULL), + QU(10342813012345291756ULL), QU(15676085186784762381ULL), + QU(15912092950691300645ULL), QU( 9371053121499003195ULL), + QU( 9897186478226866746ULL), QU(14061858287188196327ULL), + QU( 122575971620788119ULL), QU(12146750969116317754ULL), + QU( 4438317272813245201ULL), QU( 8332576791009527119ULL), + QU(13907785691786542057ULL), QU(10374194887283287467ULL), + QU( 2098798755649059566ULL), QU( 3416235197748288894ULL), + QU( 8688269957320773484ULL), QU( 7503964602397371571ULL), + QU(16724977015147478236ULL), QU( 9461512855439858184ULL), + QU(13259049744534534727ULL), QU( 3583094952542899294ULL), + QU( 8764245731305528292ULL), QU(13240823595462088985ULL), + QU(13716141617617910448ULL), QU(18114969519935960955ULL), + QU( 2297553615798302206ULL), QU( 4585521442944663362ULL), + QU(17776858680630198686ULL), QU( 4685873229192163363ULL), + QU( 152558080671135627ULL), QU(15424900540842670088ULL), + QU(13229630297130024108ULL), QU(17530268788245718717ULL), + QU(16675633913065714144ULL), QU( 3158912717897568068ULL), + QU(15399132185380087288ULL), QU( 7401418744515677872ULL), + QU(13135412922344398535ULL), QU( 6385314346100509511ULL), + QU(13962867001134161139ULL), QU(10272780155442671999ULL), + QU(12894856086597769142ULL), QU(13340877795287554994ULL), + QU(12913630602094607396ULL), QU(12543167911119793857ULL), + QU(17343570372251873096ULL), QU(10959487764494150545ULL), + QU( 6966737953093821128ULL), QU(13780699135496988601ULL), + QU( 4405070719380142046ULL), QU(14923788365607284982ULL), + QU( 
2869487678905148380ULL), QU( 6416272754197188403ULL), + QU(15017380475943612591ULL), QU( 1995636220918429487ULL), + QU( 3402016804620122716ULL), QU(15800188663407057080ULL), + QU(11362369990390932882ULL), QU(15262183501637986147ULL), + QU(10239175385387371494ULL), QU( 9352042420365748334ULL), + QU( 1682457034285119875ULL), QU( 1724710651376289644ULL), + QU( 2038157098893817966ULL), QU( 9897825558324608773ULL), + QU( 1477666236519164736ULL), QU(16835397314511233640ULL), + QU(10370866327005346508ULL), QU(10157504370660621982ULL), + QU(12113904045335882069ULL), QU(13326444439742783008ULL), + QU(11302769043000765804ULL), QU(13594979923955228484ULL), + QU(11779351762613475968ULL), QU( 3786101619539298383ULL), + QU( 8021122969180846063ULL), QU(15745904401162500495ULL), + QU(10762168465993897267ULL), QU(13552058957896319026ULL), + QU(11200228655252462013ULL), QU( 5035370357337441226ULL), + QU( 7593918984545500013ULL), QU( 5418554918361528700ULL), + QU( 4858270799405446371ULL), QU( 9974659566876282544ULL), + QU(18227595922273957859ULL), QU( 2772778443635656220ULL), + QU(14285143053182085385ULL), QU( 9939700992429600469ULL), + QU(12756185904545598068ULL), QU( 2020783375367345262ULL), + QU( 57026775058331227ULL), QU( 950827867930065454ULL), + QU( 6602279670145371217ULL), QU( 2291171535443566929ULL), + QU( 5832380724425010313ULL), QU( 1220343904715982285ULL), + QU(17045542598598037633ULL), QU(15460481779702820971ULL), + QU(13948388779949365130ULL), QU(13975040175430829518ULL), + QU(17477538238425541763ULL), QU(11104663041851745725ULL), + QU(15860992957141157587ULL), QU(14529434633012950138ULL), + QU( 2504838019075394203ULL), QU( 7512113882611121886ULL), + QU( 4859973559980886617ULL), QU( 1258601555703250219ULL), + QU(15594548157514316394ULL), QU( 4516730171963773048ULL), + QU(11380103193905031983ULL), QU( 6809282239982353344ULL), + QU(18045256930420065002ULL), QU( 2453702683108791859ULL), + QU( 977214582986981460ULL), QU( 2006410402232713466ULL), + QU( 6192236267216378358ULL), QU( 3429468402195675253ULL), + QU(18146933153017348921ULL), QU(17369978576367231139ULL), + QU( 1246940717230386603ULL), QU(11335758870083327110ULL), + QU(14166488801730353682ULL), QU( 9008573127269635732ULL), + QU(10776025389820643815ULL), QU(15087605441903942962ULL), + QU( 1359542462712147922ULL), QU(13898874411226454206ULL), + QU(17911176066536804411ULL), QU( 9435590428600085274ULL), + QU( 294488509967864007ULL), QU( 8890111397567922046ULL), + QU( 7987823476034328778ULL), QU(13263827582440967651ULL), + QU( 7503774813106751573ULL), QU(14974747296185646837ULL), + QU( 8504765037032103375ULL), QU(17340303357444536213ULL), + QU( 7704610912964485743ULL), QU( 8107533670327205061ULL), + QU( 9062969835083315985ULL), QU(16968963142126734184ULL), + QU(12958041214190810180ULL), QU( 2720170147759570200ULL), + QU( 2986358963942189566ULL), QU(14884226322219356580ULL), + QU( 286224325144368520ULL), QU(11313800433154279797ULL), + QU(18366849528439673248ULL), QU(17899725929482368789ULL), + QU( 3730004284609106799ULL), QU( 1654474302052767205ULL), + QU( 5006698007047077032ULL), QU( 8196893913601182838ULL), + QU(15214541774425211640ULL), QU(17391346045606626073ULL), + QU( 8369003584076969089ULL), QU( 3939046733368550293ULL), + QU(10178639720308707785ULL), QU( 2180248669304388697ULL), + QU( 62894391300126322ULL), QU( 9205708961736223191ULL), + QU( 6837431058165360438ULL), QU( 3150743890848308214ULL), + QU(17849330658111464583ULL), QU(12214815643135450865ULL), + QU(13410713840519603402ULL), QU( 3200778126692046802ULL), + 
QU(13354780043041779313ULL), QU( 800850022756886036ULL), + QU(15660052933953067433ULL), QU( 6572823544154375676ULL), + QU(11030281857015819266ULL), QU(12682241941471433835ULL), + QU(11654136407300274693ULL), QU( 4517795492388641109ULL), + QU( 9757017371504524244ULL), QU(17833043400781889277ULL), + QU(12685085201747792227ULL), QU(10408057728835019573ULL), + QU( 98370418513455221ULL), QU( 6732663555696848598ULL), + QU(13248530959948529780ULL), QU( 3530441401230622826ULL), + QU(18188251992895660615ULL), QU( 1847918354186383756ULL), + QU( 1127392190402660921ULL), QU(11293734643143819463ULL), + QU( 3015506344578682982ULL), QU(13852645444071153329ULL), + QU( 2121359659091349142ULL), QU( 1294604376116677694ULL), + QU( 5616576231286352318ULL), QU( 7112502442954235625ULL), + QU(11676228199551561689ULL), QU(12925182803007305359ULL), + QU( 7852375518160493082ULL), QU( 1136513130539296154ULL), + QU( 5636923900916593195ULL), QU( 3221077517612607747ULL), + QU(17784790465798152513ULL), QU( 3554210049056995938ULL), + QU(17476839685878225874ULL), QU( 3206836372585575732ULL), + QU( 2765333945644823430ULL), QU(10080070903718799528ULL), + QU( 5412370818878286353ULL), QU( 9689685887726257728ULL), + QU( 8236117509123533998ULL), QU( 1951139137165040214ULL), + QU( 4492205209227980349ULL), QU(16541291230861602967ULL), + QU( 1424371548301437940ULL), QU( 9117562079669206794ULL), + QU(14374681563251691625ULL), QU(13873164030199921303ULL), + QU( 6680317946770936731ULL), QU(15586334026918276214ULL), + QU(10896213950976109802ULL), QU( 9506261949596413689ULL), + QU( 9903949574308040616ULL), QU( 6038397344557204470ULL), + QU( 174601465422373648ULL), QU(15946141191338238030ULL), + QU(17142225620992044937ULL), QU( 7552030283784477064ULL), + QU( 2947372384532947997ULL), QU( 510797021688197711ULL), + QU( 4962499439249363461ULL), QU( 23770320158385357ULL), + QU( 959774499105138124ULL), QU( 1468396011518788276ULL), + QU( 2015698006852312308ULL), QU( 4149400718489980136ULL), + QU( 5992916099522371188ULL), QU(10819182935265531076ULL), + QU(16189787999192351131ULL), QU( 342833961790261950ULL), + QU(12470830319550495336ULL), QU(18128495041912812501ULL), + QU( 1193600899723524337ULL), QU( 9056793666590079770ULL), + QU( 2154021227041669041ULL), QU( 4963570213951235735ULL), + QU( 4865075960209211409ULL), QU( 2097724599039942963ULL), + QU( 2024080278583179845ULL), QU(11527054549196576736ULL), + QU(10650256084182390252ULL), QU( 4808408648695766755ULL), + QU( 1642839215013788844ULL), QU(10607187948250398390ULL), + QU( 7076868166085913508ULL), QU( 730522571106887032ULL), + QU(12500579240208524895ULL), QU( 4484390097311355324ULL), + QU(15145801330700623870ULL), QU( 8055827661392944028ULL), + QU( 5865092976832712268ULL), QU(15159212508053625143ULL), + QU( 3560964582876483341ULL), QU( 4070052741344438280ULL), + QU( 6032585709886855634ULL), QU(15643262320904604873ULL), + QU( 2565119772293371111ULL), QU( 318314293065348260ULL), + QU(15047458749141511872ULL), QU( 7772788389811528730ULL), + QU( 7081187494343801976ULL), QU( 6465136009467253947ULL), + QU(10425940692543362069ULL), QU( 554608190318339115ULL), + QU(14796699860302125214ULL), QU( 1638153134431111443ULL), + QU(10336967447052276248ULL), QU( 8412308070396592958ULL), + QU( 4004557277152051226ULL), QU( 8143598997278774834ULL), + QU(16413323996508783221ULL), QU(13139418758033994949ULL), + QU( 9772709138335006667ULL), QU( 2818167159287157659ULL), + QU(17091740573832523669ULL), QU(14629199013130751608ULL), + QU(18268322711500338185ULL), QU( 8290963415675493063ULL), + QU( 
8830864907452542588ULL), QU( 1614839084637494849ULL), + QU(14855358500870422231ULL), QU( 3472996748392519937ULL), + QU(15317151166268877716ULL), QU( 5825895018698400362ULL), + QU(16730208429367544129ULL), QU(10481156578141202800ULL), + QU( 4746166512382823750ULL), QU(12720876014472464998ULL), + QU( 8825177124486735972ULL), QU(13733447296837467838ULL), + QU( 6412293741681359625ULL), QU( 8313213138756135033ULL), + QU(11421481194803712517ULL), QU( 7997007691544174032ULL), + QU( 6812963847917605930ULL), QU( 9683091901227558641ULL), + QU(14703594165860324713ULL), QU( 1775476144519618309ULL), + QU( 2724283288516469519ULL), QU( 717642555185856868ULL), + QU( 8736402192215092346ULL), QU(11878800336431381021ULL), + QU( 4348816066017061293ULL), QU( 6115112756583631307ULL), + QU( 9176597239667142976ULL), QU(12615622714894259204ULL), + QU(10283406711301385987ULL), QU( 5111762509485379420ULL), + QU( 3118290051198688449ULL), QU( 7345123071632232145ULL), + QU( 9176423451688682359ULL), QU( 4843865456157868971ULL), + QU(12008036363752566088ULL), QU(12058837181919397720ULL), + QU( 2145073958457347366ULL), QU( 1526504881672818067ULL), + QU( 3488830105567134848ULL), QU(13208362960674805143ULL), + QU( 4077549672899572192ULL), QU( 7770995684693818365ULL), + QU( 1398532341546313593ULL), QU(12711859908703927840ULL), + QU( 1417561172594446813ULL), QU(17045191024194170604ULL), + QU( 4101933177604931713ULL), QU(14708428834203480320ULL), + QU(17447509264469407724ULL), QU(14314821973983434255ULL), + QU(17990472271061617265ULL), QU( 5087756685841673942ULL), + QU(12797820586893859939ULL), QU( 1778128952671092879ULL), + QU( 3535918530508665898ULL), QU( 9035729701042481301ULL), + QU(14808661568277079962ULL), QU(14587345077537747914ULL), + QU(11920080002323122708ULL), QU( 6426515805197278753ULL), + QU( 3295612216725984831ULL), QU(11040722532100876120ULL), + QU(12305952936387598754ULL), QU(16097391899742004253ULL), + QU( 4908537335606182208ULL), QU(12446674552196795504ULL), + QU(16010497855816895177ULL), QU( 9194378874788615551ULL), + QU( 3382957529567613384ULL), QU( 5154647600754974077ULL), + QU( 9801822865328396141ULL), QU( 9023662173919288143ULL), + QU(17623115353825147868ULL), QU( 8238115767443015816ULL), + QU(15811444159859002560ULL), QU( 9085612528904059661ULL), + QU( 6888601089398614254ULL), QU( 258252992894160189ULL), + QU( 6704363880792428622ULL), QU( 6114966032147235763ULL), + QU(11075393882690261875ULL), QU( 8797664238933620407ULL), + QU( 5901892006476726920ULL), QU( 5309780159285518958ULL), + QU(14940808387240817367ULL), QU(14642032021449656698ULL), + QU( 9808256672068504139ULL), QU( 3670135111380607658ULL), + QU(11211211097845960152ULL), QU( 1474304506716695808ULL), + QU(15843166204506876239ULL), QU( 7661051252471780561ULL), + QU(10170905502249418476ULL), QU( 7801416045582028589ULL), + QU( 2763981484737053050ULL), QU( 9491377905499253054ULL), + QU(16201395896336915095ULL), QU( 9256513756442782198ULL), + QU( 5411283157972456034ULL), QU( 5059433122288321676ULL), + QU( 4327408006721123357ULL), QU( 9278544078834433377ULL), + QU( 7601527110882281612ULL), QU(11848295896975505251ULL), + QU(12096998801094735560ULL), QU(14773480339823506413ULL), + QU(15586227433895802149ULL), QU(12786541257830242872ULL), + QU( 6904692985140503067ULL), QU( 5309011515263103959ULL), + QU(12105257191179371066ULL), QU(14654380212442225037ULL), + QU( 2556774974190695009ULL), QU( 4461297399927600261ULL), + QU(14888225660915118646ULL), QU(14915459341148291824ULL), + QU( 2738802166252327631ULL), QU( 6047155789239131512ULL), + 
QU(12920545353217010338ULL), QU(10697617257007840205ULL), + QU( 2751585253158203504ULL), QU(13252729159780047496ULL), + QU(14700326134672815469ULL), QU(14082527904374600529ULL), + QU(16852962273496542070ULL), QU(17446675504235853907ULL), + QU(15019600398527572311ULL), QU(12312781346344081551ULL), + QU(14524667935039810450ULL), QU( 5634005663377195738ULL), + QU(11375574739525000569ULL), QU( 2423665396433260040ULL), + QU( 5222836914796015410ULL), QU( 4397666386492647387ULL), + QU( 4619294441691707638ULL), QU( 665088602354770716ULL), + QU(13246495665281593610ULL), QU( 6564144270549729409ULL), + QU(10223216188145661688ULL), QU( 3961556907299230585ULL), + QU(11543262515492439914ULL), QU(16118031437285993790ULL), + QU( 7143417964520166465ULL), QU(13295053515909486772ULL), + QU( 40434666004899675ULL), QU(17127804194038347164ULL), + QU( 8599165966560586269ULL), QU( 8214016749011284903ULL), + QU(13725130352140465239ULL), QU( 5467254474431726291ULL), + QU( 7748584297438219877ULL), QU(16933551114829772472ULL), + QU( 2169618439506799400ULL), QU( 2169787627665113463ULL), + QU(17314493571267943764ULL), QU(18053575102911354912ULL), + QU(11928303275378476973ULL), QU(11593850925061715550ULL), + QU(17782269923473589362ULL), QU( 3280235307704747039ULL), + QU( 6145343578598685149ULL), QU(17080117031114086090ULL), + QU(18066839902983594755ULL), QU( 6517508430331020706ULL), + QU( 8092908893950411541ULL), QU(12558378233386153732ULL), + QU( 4476532167973132976ULL), QU(16081642430367025016ULL), + QU( 4233154094369139361ULL), QU( 8693630486693161027ULL), + QU(11244959343027742285ULL), QU(12273503967768513508ULL), + QU(14108978636385284876ULL), QU( 7242414665378826984ULL), + QU( 6561316938846562432ULL), QU( 8601038474994665795ULL), + QU(17532942353612365904ULL), QU(17940076637020912186ULL), + QU( 7340260368823171304ULL), QU( 7061807613916067905ULL), + QU(10561734935039519326ULL), QU(17990796503724650862ULL), + QU( 6208732943911827159ULL), QU( 359077562804090617ULL), + QU(14177751537784403113ULL), QU(10659599444915362902ULL), + QU(15081727220615085833ULL), QU(13417573895659757486ULL), + QU(15513842342017811524ULL), QU(11814141516204288231ULL), + QU( 1827312513875101814ULL), QU( 2804611699894603103ULL), + QU(17116500469975602763ULL), QU(12270191815211952087ULL), + QU(12256358467786024988ULL), QU(18435021722453971267ULL), + QU( 671330264390865618ULL), QU( 476504300460286050ULL), + QU(16465470901027093441ULL), QU( 4047724406247136402ULL), + QU( 1322305451411883346ULL), QU( 1388308688834322280ULL), + QU( 7303989085269758176ULL), QU( 9323792664765233642ULL), + QU( 4542762575316368936ULL), QU(17342696132794337618ULL), + QU( 4588025054768498379ULL), QU(13415475057390330804ULL), + QU(17880279491733405570ULL), QU(10610553400618620353ULL), + QU( 3180842072658960139ULL), QU(13002966655454270120ULL), + QU( 1665301181064982826ULL), QU( 7083673946791258979ULL), + QU( 190522247122496820ULL), QU(17388280237250677740ULL), + QU( 8430770379923642945ULL), QU(12987180971921668584ULL), + QU( 2311086108365390642ULL), QU( 2870984383579822345ULL), + QU(14014682609164653318ULL), QU(14467187293062251484ULL), + QU( 192186361147413298ULL), QU(15171951713531796524ULL), + QU( 9900305495015948728ULL), QU(17958004775615466344ULL), + QU(14346380954498606514ULL), QU(18040047357617407096ULL), + QU( 5035237584833424532ULL), QU(15089555460613972287ULL), + QU( 4131411873749729831ULL), QU( 1329013581168250330ULL), + QU(10095353333051193949ULL), QU(10749518561022462716ULL), + QU( 9050611429810755847ULL), QU(15022028840236655649ULL), + QU( 
8775554279239748298ULL), QU(13105754025489230502ULL), + QU(15471300118574167585ULL), QU( 89864764002355628ULL), + QU( 8776416323420466637ULL), QU( 5280258630612040891ULL), + QU( 2719174488591862912ULL), QU( 7599309137399661994ULL), + QU(15012887256778039979ULL), QU(14062981725630928925ULL), + QU(12038536286991689603ULL), QU( 7089756544681775245ULL), + QU(10376661532744718039ULL), QU( 1265198725901533130ULL), + QU(13807996727081142408ULL), QU( 2935019626765036403ULL), + QU( 7651672460680700141ULL), QU( 3644093016200370795ULL), + QU( 2840982578090080674ULL), QU(17956262740157449201ULL), + QU(18267979450492880548ULL), QU(11799503659796848070ULL), + QU( 9942537025669672388ULL), QU(11886606816406990297ULL), + QU( 5488594946437447576ULL), QU( 7226714353282744302ULL), + QU( 3784851653123877043ULL), QU( 878018453244803041ULL), + QU(12110022586268616085ULL), QU( 734072179404675123ULL), + QU(11869573627998248542ULL), QU( 469150421297783998ULL), + QU( 260151124912803804ULL), QU(11639179410120968649ULL), + QU( 9318165193840846253ULL), QU(12795671722734758075ULL), + QU(15318410297267253933ULL), QU( 691524703570062620ULL), + QU( 5837129010576994601ULL), QU(15045963859726941052ULL), + QU( 5850056944932238169ULL), QU(12017434144750943807ULL), + QU( 7447139064928956574ULL), QU( 3101711812658245019ULL), + QU(16052940704474982954ULL), QU(18195745945986994042ULL), + QU( 8932252132785575659ULL), QU(13390817488106794834ULL), + QU(11582771836502517453ULL), QU( 4964411326683611686ULL), + QU( 2195093981702694011ULL), QU(14145229538389675669ULL), + QU(16459605532062271798ULL), QU( 866316924816482864ULL), + QU( 4593041209937286377ULL), QU( 8415491391910972138ULL), + QU( 4171236715600528969ULL), QU(16637569303336782889ULL), + QU( 2002011073439212680ULL), QU(17695124661097601411ULL), + QU( 4627687053598611702ULL), QU( 7895831936020190403ULL), + QU( 8455951300917267802ULL), QU( 2923861649108534854ULL), + QU( 8344557563927786255ULL), QU( 6408671940373352556ULL), + QU(12210227354536675772ULL), QU(14294804157294222295ULL), + QU(10103022425071085127ULL), QU(10092959489504123771ULL), + QU( 6554774405376736268ULL), QU(12629917718410641774ULL), + QU( 6260933257596067126ULL), QU( 2460827021439369673ULL), + QU( 2541962996717103668ULL), QU( 597377203127351475ULL), + QU( 5316984203117315309ULL), QU( 4811211393563241961ULL), + QU(13119698597255811641ULL), QU( 8048691512862388981ULL), + QU(10216818971194073842ULL), QU( 4612229970165291764ULL), + QU(10000980798419974770ULL), QU( 6877640812402540687ULL), + QU( 1488727563290436992ULL), QU( 2227774069895697318ULL), + QU(11237754507523316593ULL), QU(13478948605382290972ULL), + QU( 1963583846976858124ULL), QU( 5512309205269276457ULL), + QU( 3972770164717652347ULL), QU( 3841751276198975037ULL), + QU(10283343042181903117ULL), QU( 8564001259792872199ULL), + QU(16472187244722489221ULL), QU( 8953493499268945921ULL), + QU( 3518747340357279580ULL), QU( 4003157546223963073ULL), + QU( 3270305958289814590ULL), QU( 3966704458129482496ULL), + QU( 8122141865926661939ULL), QU(14627734748099506653ULL), + QU(13064426990862560568ULL), QU( 2414079187889870829ULL), + QU( 5378461209354225306ULL), QU(10841985740128255566ULL), + QU( 538582442885401738ULL), QU( 7535089183482905946ULL), + QU(16117559957598879095ULL), QU( 8477890721414539741ULL), + QU( 1459127491209533386ULL), QU(17035126360733620462ULL), + QU( 8517668552872379126ULL), QU(10292151468337355014ULL), + QU(17081267732745344157ULL), QU(13751455337946087178ULL), + QU(14026945459523832966ULL), QU( 6653278775061723516ULL), + QU(10619085543856390441ULL), 
QU( 2196343631481122885ULL), + QU(10045966074702826136ULL), QU(10082317330452718282ULL), + QU( 5920859259504831242ULL), QU( 9951879073426540617ULL), + QU( 7074696649151414158ULL), QU(15808193543879464318ULL), + QU( 7385247772746953374ULL), QU( 3192003544283864292ULL), + QU(18153684490917593847ULL), QU(12423498260668568905ULL), + QU(10957758099756378169ULL), QU(11488762179911016040ULL), + QU( 2099931186465333782ULL), QU(11180979581250294432ULL), + QU( 8098916250668367933ULL), QU( 3529200436790763465ULL), + QU(12988418908674681745ULL), QU( 6147567275954808580ULL), + QU( 3207503344604030989ULL), QU(10761592604898615360ULL), + QU( 229854861031893504ULL), QU( 8809853962667144291ULL), + QU(13957364469005693860ULL), QU( 7634287665224495886ULL), + QU(12353487366976556874ULL), QU( 1134423796317152034ULL), + QU( 2088992471334107068ULL), QU( 7393372127190799698ULL), + QU( 1845367839871058391ULL), QU( 207922563987322884ULL), + QU(11960870813159944976ULL), QU(12182120053317317363ULL), + QU(17307358132571709283ULL), QU(13871081155552824936ULL), + QU(18304446751741566262ULL), QU( 7178705220184302849ULL), + QU(10929605677758824425ULL), QU(16446976977835806844ULL), + QU(13723874412159769044ULL), QU( 6942854352100915216ULL), + QU( 1726308474365729390ULL), QU( 2150078766445323155ULL), + QU(15345558947919656626ULL), QU(12145453828874527201ULL), + QU( 2054448620739726849ULL), QU( 2740102003352628137ULL), + QU(11294462163577610655ULL), QU( 756164283387413743ULL), + QU(17841144758438810880ULL), QU(10802406021185415861ULL), + QU( 8716455530476737846ULL), QU( 6321788834517649606ULL), + QU(14681322910577468426ULL), QU(17330043563884336387ULL), + QU(12701802180050071614ULL), QU(14695105111079727151ULL), + QU( 5112098511654172830ULL), QU( 4957505496794139973ULL), + QU( 8270979451952045982ULL), QU(12307685939199120969ULL), + QU(12425799408953443032ULL), QU( 8376410143634796588ULL), + QU(16621778679680060464ULL), QU( 3580497854566660073ULL), + QU( 1122515747803382416ULL), QU( 857664980960597599ULL), + QU( 6343640119895925918ULL), QU(12878473260854462891ULL), + QU(10036813920765722626ULL), QU(14451335468363173812ULL), + QU( 5476809692401102807ULL), QU(16442255173514366342ULL), + QU(13060203194757167104ULL), QU(14354124071243177715ULL), + QU(15961249405696125227ULL), QU(13703893649690872584ULL), + QU( 363907326340340064ULL), QU( 6247455540491754842ULL), + QU(12242249332757832361ULL), QU( 156065475679796717ULL), + QU( 9351116235749732355ULL), QU( 4590350628677701405ULL), + QU( 1671195940982350389ULL), QU(13501398458898451905ULL), + QU( 6526341991225002255ULL), QU( 1689782913778157592ULL), + QU( 7439222350869010334ULL), QU(13975150263226478308ULL), + QU(11411961169932682710ULL), QU(17204271834833847277ULL), + QU( 541534742544435367ULL), QU( 6591191931218949684ULL), + QU( 2645454775478232486ULL), QU( 4322857481256485321ULL), + QU( 8477416487553065110ULL), QU(12902505428548435048ULL), + QU( 971445777981341415ULL), QU(14995104682744976712ULL), + QU( 4243341648807158063ULL), QU( 8695061252721927661ULL), + QU( 5028202003270177222ULL), QU( 2289257340915567840ULL), + QU(13870416345121866007ULL), QU(13994481698072092233ULL), + QU( 6912785400753196481ULL), QU( 2278309315841980139ULL), + QU( 4329765449648304839ULL), QU( 5963108095785485298ULL), + QU( 4880024847478722478ULL), QU(16015608779890240947ULL), + QU( 1866679034261393544ULL), QU( 914821179919731519ULL), + QU( 9643404035648760131ULL), QU( 2418114953615593915ULL), + QU( 944756836073702374ULL), QU(15186388048737296834ULL), + QU( 7723355336128442206ULL), QU( 
7500747479679599691ULL), + QU(18013961306453293634ULL), QU( 2315274808095756456ULL), + QU(13655308255424029566ULL), QU(17203800273561677098ULL), + QU( 1382158694422087756ULL), QU( 5090390250309588976ULL), + QU( 517170818384213989ULL), QU( 1612709252627729621ULL), + QU( 1330118955572449606ULL), QU( 300922478056709885ULL), + QU(18115693291289091987ULL), QU(13491407109725238321ULL), + QU(15293714633593827320ULL), QU( 5151539373053314504ULL), + QU( 5951523243743139207ULL), QU(14459112015249527975ULL), + QU( 5456113959000700739ULL), QU( 3877918438464873016ULL), + QU(12534071654260163555ULL), QU(15871678376893555041ULL), + QU(11005484805712025549ULL), QU(16353066973143374252ULL), + QU( 4358331472063256685ULL), QU( 8268349332210859288ULL), + QU(12485161590939658075ULL), QU(13955993592854471343ULL), + QU( 5911446886848367039ULL), QU(14925834086813706974ULL), + QU( 6590362597857994805ULL), QU( 1280544923533661875ULL), + QU( 1637756018947988164ULL), QU( 4734090064512686329ULL), + QU(16693705263131485912ULL), QU( 6834882340494360958ULL), + QU( 8120732176159658505ULL), QU( 2244371958905329346ULL), + QU(10447499707729734021ULL), QU( 7318742361446942194ULL), + QU( 8032857516355555296ULL), QU(14023605983059313116ULL), + QU( 1032336061815461376ULL), QU( 9840995337876562612ULL), + QU( 9869256223029203587ULL), QU(12227975697177267636ULL), + QU(12728115115844186033ULL), QU( 7752058479783205470ULL), + QU( 729733219713393087ULL), QU(12954017801239007622ULL) }; static const uint64_t init_by_array_64_expected[] = { - QU( 2100341266307895239LLU), QU( 8344256300489757943LLU), - QU(15687933285484243894LLU), QU( 8268620370277076319LLU), - QU(12371852309826545459LLU), QU( 8800491541730110238LLU), - QU(18113268950100835773LLU), QU( 2886823658884438119LLU), - QU( 3293667307248180724LLU), QU( 9307928143300172731LLU), - QU( 7688082017574293629LLU), QU( 900986224735166665LLU), - QU( 9977972710722265039LLU), QU( 6008205004994830552LLU), - QU( 546909104521689292LLU), QU( 7428471521869107594LLU), - QU(14777563419314721179LLU), QU(16116143076567350053LLU), - QU( 5322685342003142329LLU), QU( 4200427048445863473LLU), - QU( 4693092150132559146LLU), QU(13671425863759338582LLU), - QU( 6747117460737639916LLU), QU( 4732666080236551150LLU), - QU( 5912839950611941263LLU), QU( 3903717554504704909LLU), - QU( 2615667650256786818LLU), QU(10844129913887006352LLU), - QU(13786467861810997820LLU), QU(14267853002994021570LLU), - QU(13767807302847237439LLU), QU(16407963253707224617LLU), - QU( 4802498363698583497LLU), QU( 2523802839317209764LLU), - QU( 3822579397797475589LLU), QU( 8950320572212130610LLU), - QU( 3745623504978342534LLU), QU(16092609066068482806LLU), - QU( 9817016950274642398LLU), QU(10591660660323829098LLU), - QU(11751606650792815920LLU), QU( 5122873818577122211LLU), - QU(17209553764913936624LLU), QU( 6249057709284380343LLU), - QU(15088791264695071830LLU), QU(15344673071709851930LLU), - QU( 4345751415293646084LLU), QU( 2542865750703067928LLU), - QU(13520525127852368784LLU), QU(18294188662880997241LLU), - QU( 3871781938044881523LLU), QU( 2873487268122812184LLU), - QU(15099676759482679005LLU), QU(15442599127239350490LLU), - QU( 6311893274367710888LLU), QU( 3286118760484672933LLU), - QU( 4146067961333542189LLU), QU(13303942567897208770LLU), - QU( 8196013722255630418LLU), QU( 4437815439340979989LLU), - QU(15433791533450605135LLU), QU( 4254828956815687049LLU), - QU( 1310903207708286015LLU), QU(10529182764462398549LLU), - QU(14900231311660638810LLU), QU( 9727017277104609793LLU), - QU( 1821308310948199033LLU), 
QU(11628861435066772084LLU), - QU( 9469019138491546924LLU), QU( 3145812670532604988LLU), - QU( 9938468915045491919LLU), QU( 1562447430672662142LLU), - QU(13963995266697989134LLU), QU( 3356884357625028695LLU), - QU( 4499850304584309747LLU), QU( 8456825817023658122LLU), - QU(10859039922814285279LLU), QU( 8099512337972526555LLU), - QU( 348006375109672149LLU), QU(11919893998241688603LLU), - QU( 1104199577402948826LLU), QU(16689191854356060289LLU), - QU(10992552041730168078LLU), QU( 7243733172705465836LLU), - QU( 5668075606180319560LLU), QU(18182847037333286970LLU), - QU( 4290215357664631322LLU), QU( 4061414220791828613LLU), - QU(13006291061652989604LLU), QU( 7140491178917128798LLU), - QU(12703446217663283481LLU), QU( 5500220597564558267LLU), - QU(10330551509971296358LLU), QU(15958554768648714492LLU), - QU( 5174555954515360045LLU), QU( 1731318837687577735LLU), - QU( 3557700801048354857LLU), QU(13764012341928616198LLU), - QU(13115166194379119043LLU), QU( 7989321021560255519LLU), - QU( 2103584280905877040LLU), QU( 9230788662155228488LLU), - QU(16396629323325547654LLU), QU( 657926409811318051LLU), - QU(15046700264391400727LLU), QU( 5120132858771880830LLU), - QU( 7934160097989028561LLU), QU( 6963121488531976245LLU), - QU(17412329602621742089LLU), QU(15144843053931774092LLU), - QU(17204176651763054532LLU), QU(13166595387554065870LLU), - QU( 8590377810513960213LLU), QU( 5834365135373991938LLU), - QU( 7640913007182226243LLU), QU( 3479394703859418425LLU), - QU(16402784452644521040LLU), QU( 4993979809687083980LLU), - QU(13254522168097688865LLU), QU(15643659095244365219LLU), - QU( 5881437660538424982LLU), QU(11174892200618987379LLU), - QU( 254409966159711077LLU), QU(17158413043140549909LLU), - QU( 3638048789290376272LLU), QU( 1376816930299489190LLU), - QU( 4622462095217761923LLU), QU(15086407973010263515LLU), - QU(13253971772784692238LLU), QU( 5270549043541649236LLU), - QU(11182714186805411604LLU), QU(12283846437495577140LLU), - QU( 5297647149908953219LLU), QU(10047451738316836654LLU), - QU( 4938228100367874746LLU), QU(12328523025304077923LLU), - QU( 3601049438595312361LLU), QU( 9313624118352733770LLU), - QU(13322966086117661798LLU), QU(16660005705644029394LLU), - QU(11337677526988872373LLU), QU(13869299102574417795LLU), - QU(15642043183045645437LLU), QU( 3021755569085880019LLU), - QU( 4979741767761188161LLU), QU(13679979092079279587LLU), - QU( 3344685842861071743LLU), QU(13947960059899588104LLU), - QU( 305806934293368007LLU), QU( 5749173929201650029LLU), - QU(11123724852118844098LLU), QU(15128987688788879802LLU), - QU(15251651211024665009LLU), QU( 7689925933816577776LLU), - QU(16732804392695859449LLU), QU(17087345401014078468LLU), - QU(14315108589159048871LLU), QU( 4820700266619778917LLU), - QU(16709637539357958441LLU), QU( 4936227875177351374LLU), - QU( 2137907697912987247LLU), QU(11628565601408395420LLU), - QU( 2333250549241556786LLU), QU( 5711200379577778637LLU), - QU( 5170680131529031729LLU), QU(12620392043061335164LLU), - QU( 95363390101096078LLU), QU( 5487981914081709462LLU), - QU( 1763109823981838620LLU), QU( 3395861271473224396LLU), - QU( 1300496844282213595LLU), QU( 6894316212820232902LLU), - QU(10673859651135576674LLU), QU( 5911839658857903252LLU), - QU(17407110743387299102LLU), QU( 8257427154623140385LLU), - QU(11389003026741800267LLU), QU( 4070043211095013717LLU), - QU(11663806997145259025LLU), QU(15265598950648798210LLU), - QU( 630585789434030934LLU), QU( 3524446529213587334LLU), - QU( 7186424168495184211LLU), QU(10806585451386379021LLU), - QU(11120017753500499273LLU), QU( 
1586837651387701301LLU), - QU(17530454400954415544LLU), QU( 9991670045077880430LLU), - QU( 7550997268990730180LLU), QU( 8640249196597379304LLU), - QU( 3522203892786893823LLU), QU(10401116549878854788LLU), - QU(13690285544733124852LLU), QU( 8295785675455774586LLU), - QU(15535716172155117603LLU), QU( 3112108583723722511LLU), - QU(17633179955339271113LLU), QU(18154208056063759375LLU), - QU( 1866409236285815666LLU), QU(13326075895396412882LLU), - QU( 8756261842948020025LLU), QU( 6281852999868439131LLU), - QU(15087653361275292858LLU), QU(10333923911152949397LLU), - QU( 5265567645757408500LLU), QU(12728041843210352184LLU), - QU( 6347959327507828759LLU), QU( 154112802625564758LLU), - QU(18235228308679780218LLU), QU( 3253805274673352418LLU), - QU( 4849171610689031197LLU), QU(17948529398340432518LLU), - QU(13803510475637409167LLU), QU(13506570190409883095LLU), - QU(15870801273282960805LLU), QU( 8451286481299170773LLU), - QU( 9562190620034457541LLU), QU( 8518905387449138364LLU), - QU(12681306401363385655LLU), QU( 3788073690559762558LLU), - QU( 5256820289573487769LLU), QU( 2752021372314875467LLU), - QU( 6354035166862520716LLU), QU( 4328956378309739069LLU), - QU( 449087441228269600LLU), QU( 5533508742653090868LLU), - QU( 1260389420404746988LLU), QU(18175394473289055097LLU), - QU( 1535467109660399420LLU), QU( 8818894282874061442LLU), - QU(12140873243824811213LLU), QU(15031386653823014946LLU), - QU( 1286028221456149232LLU), QU( 6329608889367858784LLU), - QU( 9419654354945132725LLU), QU( 6094576547061672379LLU), - QU(17706217251847450255LLU), QU( 1733495073065878126LLU), - QU(16918923754607552663LLU), QU( 8881949849954945044LLU), - QU(12938977706896313891LLU), QU(14043628638299793407LLU), - QU(18393874581723718233LLU), QU( 6886318534846892044LLU), - QU(14577870878038334081LLU), QU(13541558383439414119LLU), - QU(13570472158807588273LLU), QU(18300760537910283361LLU), - QU( 818368572800609205LLU), QU( 1417000585112573219LLU), - QU(12337533143867683655LLU), QU(12433180994702314480LLU), - QU( 778190005829189083LLU), QU(13667356216206524711LLU), - QU( 9866149895295225230LLU), QU(11043240490417111999LLU), - QU( 1123933826541378598LLU), QU( 6469631933605123610LLU), - QU(14508554074431980040LLU), QU(13918931242962026714LLU), - QU( 2870785929342348285LLU), QU(14786362626740736974LLU), - QU(13176680060902695786LLU), QU( 9591778613541679456LLU), - QU( 9097662885117436706LLU), QU( 749262234240924947LLU), - QU( 1944844067793307093LLU), QU( 4339214904577487742LLU), - QU( 8009584152961946551LLU), QU(16073159501225501777LLU), - QU( 3335870590499306217LLU), QU(17088312653151202847LLU), - QU( 3108893142681931848LLU), QU(16636841767202792021LLU), - QU(10423316431118400637LLU), QU( 8008357368674443506LLU), - QU(11340015231914677875LLU), QU(17687896501594936090LLU), - QU(15173627921763199958LLU), QU( 542569482243721959LLU), - QU(15071714982769812975LLU), QU( 4466624872151386956LLU), - QU( 1901780715602332461LLU), QU( 9822227742154351098LLU), - QU( 1479332892928648780LLU), QU( 6981611948382474400LLU), - QU( 7620824924456077376LLU), QU(14095973329429406782LLU), - QU( 7902744005696185404LLU), QU(15830577219375036920LLU), - QU(10287076667317764416LLU), QU(12334872764071724025LLU), - QU( 4419302088133544331LLU), QU(14455842851266090520LLU), - QU(12488077416504654222LLU), QU( 7953892017701886766LLU), - QU( 6331484925529519007LLU), QU( 4902145853785030022LLU), - QU(17010159216096443073LLU), QU(11945354668653886087LLU), - QU(15112022728645230829LLU), QU(17363484484522986742LLU), - QU( 4423497825896692887LLU), QU( 
8155489510809067471LLU), - QU( 258966605622576285LLU), QU( 5462958075742020534LLU), - QU( 6763710214913276228LLU), QU( 2368935183451109054LLU), - QU(14209506165246453811LLU), QU( 2646257040978514881LLU), - QU( 3776001911922207672LLU), QU( 1419304601390147631LLU), - QU(14987366598022458284LLU), QU( 3977770701065815721LLU), - QU( 730820417451838898LLU), QU( 3982991703612885327LLU), - QU( 2803544519671388477LLU), QU(17067667221114424649LLU), - QU( 2922555119737867166LLU), QU( 1989477584121460932LLU), - QU(15020387605892337354LLU), QU( 9293277796427533547LLU), - QU(10722181424063557247LLU), QU(16704542332047511651LLU), - QU( 5008286236142089514LLU), QU(16174732308747382540LLU), - QU(17597019485798338402LLU), QU(13081745199110622093LLU), - QU( 8850305883842258115LLU), QU(12723629125624589005LLU), - QU( 8140566453402805978LLU), QU(15356684607680935061LLU), - QU(14222190387342648650LLU), QU(11134610460665975178LLU), - QU( 1259799058620984266LLU), QU(13281656268025610041LLU), - QU( 298262561068153992LLU), QU(12277871700239212922LLU), - QU(13911297774719779438LLU), QU(16556727962761474934LLU), - QU(17903010316654728010LLU), QU( 9682617699648434744LLU), - QU(14757681836838592850LLU), QU( 1327242446558524473LLU), - QU(11126645098780572792LLU), QU( 1883602329313221774LLU), - QU( 2543897783922776873LLU), QU(15029168513767772842LLU), - QU(12710270651039129878LLU), QU(16118202956069604504LLU), - QU(15010759372168680524LLU), QU( 2296827082251923948LLU), - QU(10793729742623518101LLU), QU(13829764151845413046LLU), - QU(17769301223184451213LLU), QU( 3118268169210783372LLU), - QU(17626204544105123127LLU), QU( 7416718488974352644LLU), - QU(10450751996212925994LLU), QU( 9352529519128770586LLU), - QU( 259347569641110140LLU), QU( 8048588892269692697LLU), - QU( 1774414152306494058LLU), QU(10669548347214355622LLU), - QU(13061992253816795081LLU), QU(18432677803063861659LLU), - QU( 8879191055593984333LLU), QU(12433753195199268041LLU), - QU(14919392415439730602LLU), QU( 6612848378595332963LLU), - QU( 6320986812036143628LLU), QU(10465592420226092859LLU), - QU( 4196009278962570808LLU), QU( 3747816564473572224LLU), - QU(17941203486133732898LLU), QU( 2350310037040505198LLU), - QU( 5811779859134370113LLU), QU(10492109599506195126LLU), - QU( 7699650690179541274LLU), QU( 1954338494306022961LLU), - QU(14095816969027231152LLU), QU( 5841346919964852061LLU), - QU(14945969510148214735LLU), QU( 3680200305887550992LLU), - QU( 6218047466131695792LLU), QU( 8242165745175775096LLU), - QU(11021371934053307357LLU), QU( 1265099502753169797LLU), - QU( 4644347436111321718LLU), QU( 3609296916782832859LLU), - QU( 8109807992218521571LLU), QU(18387884215648662020LLU), - QU(14656324896296392902LLU), QU(17386819091238216751LLU), - QU(17788300878582317152LLU), QU( 7919446259742399591LLU), - QU( 4466613134576358004LLU), QU(12928181023667938509LLU), - QU(13147446154454932030LLU), QU(16552129038252734620LLU), - QU( 8395299403738822450LLU), QU(11313817655275361164LLU), - QU( 434258809499511718LLU), QU( 2074882104954788676LLU), - QU( 7929892178759395518LLU), QU( 9006461629105745388LLU), - QU( 5176475650000323086LLU), QU(11128357033468341069LLU), - QU(12026158851559118955LLU), QU(14699716249471156500LLU), - QU( 448982497120206757LLU), QU( 4156475356685519900LLU), - QU( 6063816103417215727LLU), QU(10073289387954971479LLU), - QU( 8174466846138590962LLU), QU( 2675777452363449006LLU), - QU( 9090685420572474281LLU), QU( 6659652652765562060LLU), - QU(12923120304018106621LLU), QU(11117480560334526775LLU), - QU( 937910473424587511LLU), QU( 
1838692113502346645LLU), - QU(11133914074648726180LLU), QU( 7922600945143884053LLU), - QU(13435287702700959550LLU), QU( 5287964921251123332LLU), - QU(11354875374575318947LLU), QU(17955724760748238133LLU), - QU(13728617396297106512LLU), QU( 4107449660118101255LLU), - QU( 1210269794886589623LLU), QU(11408687205733456282LLU), - QU( 4538354710392677887LLU), QU(13566803319341319267LLU), - QU(17870798107734050771LLU), QU( 3354318982568089135LLU), - QU( 9034450839405133651LLU), QU(13087431795753424314LLU), - QU( 950333102820688239LLU), QU( 1968360654535604116LLU), - QU(16840551645563314995LLU), QU( 8867501803892924995LLU), - QU(11395388644490626845LLU), QU( 1529815836300732204LLU), - QU(13330848522996608842LLU), QU( 1813432878817504265LLU), - QU( 2336867432693429560LLU), QU(15192805445973385902LLU), - QU( 2528593071076407877LLU), QU( 128459777936689248LLU), - QU( 9976345382867214866LLU), QU( 6208885766767996043LLU), - QU(14982349522273141706LLU), QU( 3099654362410737822LLU), - QU(13776700761947297661LLU), QU( 8806185470684925550LLU), - QU( 8151717890410585321LLU), QU( 640860591588072925LLU), - QU(14592096303937307465LLU), QU( 9056472419613564846LLU), - QU(14861544647742266352LLU), QU(12703771500398470216LLU), - QU( 3142372800384138465LLU), QU( 6201105606917248196LLU), - QU(18337516409359270184LLU), QU(15042268695665115339LLU), - QU(15188246541383283846LLU), QU(12800028693090114519LLU), - QU( 5992859621101493472LLU), QU(18278043971816803521LLU), - QU( 9002773075219424560LLU), QU( 7325707116943598353LLU), - QU( 7930571931248040822LLU), QU( 5645275869617023448LLU), - QU( 7266107455295958487LLU), QU( 4363664528273524411LLU), - QU(14313875763787479809LLU), QU(17059695613553486802LLU), - QU( 9247761425889940932LLU), QU(13704726459237593128LLU), - QU( 2701312427328909832LLU), QU(17235532008287243115LLU), - QU(14093147761491729538LLU), QU( 6247352273768386516LLU), - QU( 8268710048153268415LLU), QU( 7985295214477182083LLU), - QU(15624495190888896807LLU), QU( 3772753430045262788LLU), - QU( 9133991620474991698LLU), QU( 5665791943316256028LLU), - QU( 7551996832462193473LLU), QU(13163729206798953877LLU), - QU( 9263532074153846374LLU), QU( 1015460703698618353LLU), - QU(17929874696989519390LLU), QU(18257884721466153847LLU), - QU(16271867543011222991LLU), QU( 3905971519021791941LLU), - QU(16814488397137052085LLU), QU( 1321197685504621613LLU), - QU( 2870359191894002181LLU), QU(14317282970323395450LLU), - QU(13663920845511074366LLU), QU( 2052463995796539594LLU), - QU(14126345686431444337LLU), QU( 1727572121947022534LLU), - QU(17793552254485594241LLU), QU( 6738857418849205750LLU), - QU( 1282987123157442952LLU), QU(16655480021581159251LLU), - QU( 6784587032080183866LLU), QU(14726758805359965162LLU), - QU( 7577995933961987349LLU), QU(12539609320311114036LLU), - QU(10789773033385439494LLU), QU( 8517001497411158227LLU), - QU(10075543932136339710LLU), QU(14838152340938811081LLU), - QU( 9560840631794044194LLU), QU(17445736541454117475LLU), - QU(10633026464336393186LLU), QU(15705729708242246293LLU), - QU( 1117517596891411098LLU), QU( 4305657943415886942LLU), - QU( 4948856840533979263LLU), QU(16071681989041789593LLU), - QU(13723031429272486527LLU), QU( 7639567622306509462LLU), - QU(12670424537483090390LLU), QU( 9715223453097197134LLU), - QU( 5457173389992686394LLU), QU( 289857129276135145LLU), - QU(17048610270521972512LLU), QU( 692768013309835485LLU), - QU(14823232360546632057LLU), QU(18218002361317895936LLU), - QU( 3281724260212650204LLU), QU(16453957266549513795LLU), - QU( 8592711109774511881LLU), QU( 
929825123473369579LLU), - QU(15966784769764367791LLU), QU( 9627344291450607588LLU), - QU(10849555504977813287LLU), QU( 9234566913936339275LLU), - QU( 6413807690366911210LLU), QU(10862389016184219267LLU), - QU(13842504799335374048LLU), QU( 1531994113376881174LLU), - QU( 2081314867544364459LLU), QU(16430628791616959932LLU), - QU( 8314714038654394368LLU), QU( 9155473892098431813LLU), - QU(12577843786670475704LLU), QU( 4399161106452401017LLU), - QU( 1668083091682623186LLU), QU( 1741383777203714216LLU), - QU( 2162597285417794374LLU), QU(15841980159165218736LLU), - QU( 1971354603551467079LLU), QU( 1206714764913205968LLU), - QU( 4790860439591272330LLU), QU(14699375615594055799LLU), - QU( 8374423871657449988LLU), QU(10950685736472937738LLU), - QU( 697344331343267176LLU), QU(10084998763118059810LLU), - QU(12897369539795983124LLU), QU(12351260292144383605LLU), - QU( 1268810970176811234LLU), QU( 7406287800414582768LLU), - QU( 516169557043807831LLU), QU( 5077568278710520380LLU), - QU( 3828791738309039304LLU), QU( 7721974069946943610LLU), - QU( 3534670260981096460LLU), QU( 4865792189600584891LLU), - QU(16892578493734337298LLU), QU( 9161499464278042590LLU), - QU(11976149624067055931LLU), QU(13219479887277343990LLU), - QU(14161556738111500680LLU), QU(14670715255011223056LLU), - QU( 4671205678403576558LLU), QU(12633022931454259781LLU), - QU(14821376219869187646LLU), QU( 751181776484317028LLU), - QU( 2192211308839047070LLU), QU(11787306362361245189LLU), - QU(10672375120744095707LLU), QU( 4601972328345244467LLU), - QU(15457217788831125879LLU), QU( 8464345256775460809LLU), - QU(10191938789487159478LLU), QU( 6184348739615197613LLU), - QU(11425436778806882100LLU), QU( 2739227089124319793LLU), - QU( 461464518456000551LLU), QU( 4689850170029177442LLU), - QU( 6120307814374078625LLU), QU(11153579230681708671LLU), - QU( 7891721473905347926LLU), QU(10281646937824872400LLU), - QU( 3026099648191332248LLU), QU( 8666750296953273818LLU), - QU(14978499698844363232LLU), QU(13303395102890132065LLU), - QU( 8182358205292864080LLU), QU(10560547713972971291LLU), - QU(11981635489418959093LLU), QU( 3134621354935288409LLU), - QU(11580681977404383968LLU), QU(14205530317404088650LLU), - QU( 5997789011854923157LLU), QU(13659151593432238041LLU), - QU(11664332114338865086LLU), QU( 7490351383220929386LLU), - QU( 7189290499881530378LLU), QU(15039262734271020220LLU), - QU( 2057217285976980055LLU), QU( 555570804905355739LLU), - QU(11235311968348555110LLU), QU(13824557146269603217LLU), - QU(16906788840653099693LLU), QU( 7222878245455661677LLU), - QU( 5245139444332423756LLU), QU( 4723748462805674292LLU), - QU(12216509815698568612LLU), QU(17402362976648951187LLU), - QU(17389614836810366768LLU), QU( 4880936484146667711LLU), - QU( 9085007839292639880LLU), QU(13837353458498535449LLU), - QU(11914419854360366677LLU), QU(16595890135313864103LLU), - QU( 6313969847197627222LLU), QU(18296909792163910431LLU), - QU(10041780113382084042LLU), QU( 2499478551172884794LLU), - QU(11057894246241189489LLU), QU( 9742243032389068555LLU), - QU(12838934582673196228LLU), QU(13437023235248490367LLU), - QU(13372420669446163240LLU), QU( 6752564244716909224LLU), - QU( 7157333073400313737LLU), QU(12230281516370654308LLU), - QU( 1182884552219419117LLU), QU( 2955125381312499218LLU), - QU(10308827097079443249LLU), QU( 1337648572986534958LLU), - QU(16378788590020343939LLU), QU( 108619126514420935LLU), - QU( 3990981009621629188LLU), QU( 5460953070230946410LLU), - QU( 9703328329366531883LLU), QU(13166631489188077236LLU), - QU( 1104768831213675170LLU), QU( 
3447930458553877908LLU), - QU( 8067172487769945676LLU), QU( 5445802098190775347LLU), - QU( 3244840981648973873LLU), QU(17314668322981950060LLU), - QU( 5006812527827763807LLU), QU(18158695070225526260LLU), - QU( 2824536478852417853LLU), QU(13974775809127519886LLU), - QU( 9814362769074067392LLU), QU(17276205156374862128LLU), - QU(11361680725379306967LLU), QU( 3422581970382012542LLU), - QU(11003189603753241266LLU), QU(11194292945277862261LLU), - QU( 6839623313908521348LLU), QU(11935326462707324634LLU), - QU( 1611456788685878444LLU), QU(13112620989475558907LLU), - QU( 517659108904450427LLU), QU(13558114318574407624LLU), - QU(15699089742731633077LLU), QU( 4988979278862685458LLU), - QU( 8111373583056521297LLU), QU( 3891258746615399627LLU), - QU( 8137298251469718086LLU), QU(12748663295624701649LLU), - QU( 4389835683495292062LLU), QU( 5775217872128831729LLU), - QU( 9462091896405534927LLU), QU( 8498124108820263989LLU), - QU( 8059131278842839525LLU), QU(10503167994254090892LLU), - QU(11613153541070396656LLU), QU(18069248738504647790LLU), - QU( 570657419109768508LLU), QU( 3950574167771159665LLU), - QU( 5514655599604313077LLU), QU( 2908460854428484165LLU), - QU(10777722615935663114LLU), QU(12007363304839279486LLU), - QU( 9800646187569484767LLU), QU( 8795423564889864287LLU), - QU(14257396680131028419LLU), QU( 6405465117315096498LLU), - QU( 7939411072208774878LLU), QU(17577572378528990006LLU), - QU(14785873806715994850LLU), QU(16770572680854747390LLU), - QU(18127549474419396481LLU), QU(11637013449455757750LLU), - QU(14371851933996761086LLU), QU( 3601181063650110280LLU), - QU( 4126442845019316144LLU), QU(10198287239244320669LLU), - QU(18000169628555379659LLU), QU(18392482400739978269LLU), - QU( 6219919037686919957LLU), QU( 3610085377719446052LLU), - QU( 2513925039981776336LLU), QU(16679413537926716955LLU), - QU(12903302131714909434LLU), QU( 5581145789762985009LLU), - QU(12325955044293303233LLU), QU(17216111180742141204LLU), - QU( 6321919595276545740LLU), QU( 3507521147216174501LLU), - QU( 9659194593319481840LLU), QU(11473976005975358326LLU), - QU(14742730101435987026LLU), QU( 492845897709954780LLU), - QU(16976371186162599676LLU), QU(17712703422837648655LLU), - QU( 9881254778587061697LLU), QU( 8413223156302299551LLU), - QU( 1563841828254089168LLU), QU( 9996032758786671975LLU), - QU( 138877700583772667LLU), QU(13003043368574995989LLU), - QU( 4390573668650456587LLU), QU( 8610287390568126755LLU), - QU(15126904974266642199LLU), QU( 6703637238986057662LLU), - QU( 2873075592956810157LLU), QU( 6035080933946049418LLU), - QU(13382846581202353014LLU), QU( 7303971031814642463LLU), - QU(18418024405307444267LLU), QU( 5847096731675404647LLU), - QU( 4035880699639842500LLU), QU(11525348625112218478LLU), - QU( 3041162365459574102LLU), QU( 2604734487727986558LLU), - QU(15526341771636983145LLU), QU(14556052310697370254LLU), - QU(12997787077930808155LLU), QU( 9601806501755554499LLU), - QU(11349677952521423389LLU), QU(14956777807644899350LLU), - QU(16559736957742852721LLU), QU(12360828274778140726LLU), - QU( 6685373272009662513LLU), QU(16932258748055324130LLU), - QU(15918051131954158508LLU), QU( 1692312913140790144LLU), - QU( 546653826801637367LLU), QU( 5341587076045986652LLU), - QU(14975057236342585662LLU), QU(12374976357340622412LLU), - QU(10328833995181940552LLU), QU(12831807101710443149LLU), - QU(10548514914382545716LLU), QU( 2217806727199715993LLU), - QU(12627067369242845138LLU), QU( 4598965364035438158LLU), - QU( 150923352751318171LLU), QU(14274109544442257283LLU), - QU( 4696661475093863031LLU), QU( 
1505764114384654516LLU), - QU(10699185831891495147LLU), QU( 2392353847713620519LLU), - QU( 3652870166711788383LLU), QU( 8640653276221911108LLU), - QU( 3894077592275889704LLU), QU( 4918592872135964845LLU), - QU(16379121273281400789LLU), QU(12058465483591683656LLU), - QU(11250106829302924945LLU), QU( 1147537556296983005LLU), - QU( 6376342756004613268LLU), QU(14967128191709280506LLU), - QU(18007449949790627628LLU), QU( 9497178279316537841LLU), - QU( 7920174844809394893LLU), QU(10037752595255719907LLU), - QU(15875342784985217697LLU), QU(15311615921712850696LLU), - QU( 9552902652110992950LLU), QU(14054979450099721140LLU), - QU( 5998709773566417349LLU), QU(18027910339276320187LLU), - QU( 8223099053868585554LLU), QU( 7842270354824999767LLU), - QU( 4896315688770080292LLU), QU(12969320296569787895LLU), - QU( 2674321489185759961LLU), QU( 4053615936864718439LLU), - QU(11349775270588617578LLU), QU( 4743019256284553975LLU), - QU( 5602100217469723769LLU), QU(14398995691411527813LLU), - QU( 7412170493796825470LLU), QU( 836262406131744846LLU), - QU( 8231086633845153022LLU), QU( 5161377920438552287LLU), - QU( 8828731196169924949LLU), QU(16211142246465502680LLU), - QU( 3307990879253687818LLU), QU( 5193405406899782022LLU), - QU( 8510842117467566693LLU), QU( 6070955181022405365LLU), - QU(14482950231361409799LLU), QU(12585159371331138077LLU), - QU( 3511537678933588148LLU), QU( 2041849474531116417LLU), - QU(10944936685095345792LLU), QU(18303116923079107729LLU), - QU( 2720566371239725320LLU), QU( 4958672473562397622LLU), - QU( 3032326668253243412LLU), QU(13689418691726908338LLU), - QU( 1895205511728843996LLU), QU( 8146303515271990527LLU), - QU(16507343500056113480LLU), QU( 473996939105902919LLU), - QU( 9897686885246881481LLU), QU(14606433762712790575LLU), - QU( 6732796251605566368LLU), QU( 1399778120855368916LLU), - QU( 935023885182833777LLU), QU(16066282816186753477LLU), - QU( 7291270991820612055LLU), QU(17530230393129853844LLU), - QU(10223493623477451366LLU), QU(15841725630495676683LLU), - QU(17379567246435515824LLU), QU( 8588251429375561971LLU), - QU(18339511210887206423LLU), QU(17349587430725976100LLU), - QU(12244876521394838088LLU), QU( 6382187714147161259LLU), - QU(12335807181848950831LLU), QU(16948885622305460665LLU), - QU(13755097796371520506LLU), QU(14806740373324947801LLU), - QU( 4828699633859287703LLU), QU( 8209879281452301604LLU), - QU(12435716669553736437LLU), QU(13970976859588452131LLU), - QU( 6233960842566773148LLU), QU(12507096267900505759LLU), - QU( 1198713114381279421LLU), QU(14989862731124149015LLU), - QU(15932189508707978949LLU), QU( 2526406641432708722LLU), - QU( 29187427817271982LLU), QU( 1499802773054556353LLU), - QU(10816638187021897173LLU), QU( 5436139270839738132LLU), - QU( 6659882287036010082LLU), QU( 2154048955317173697LLU), - QU(10887317019333757642LLU), QU(16281091802634424955LLU), - QU(10754549879915384901LLU), QU(10760611745769249815LLU), - QU( 2161505946972504002LLU), QU( 5243132808986265107LLU), - QU(10129852179873415416LLU), QU( 710339480008649081LLU), - QU( 7802129453068808528LLU), QU(17967213567178907213LLU), - QU(15730859124668605599LLU), QU(13058356168962376502LLU), - QU( 3701224985413645909LLU), QU(14464065869149109264LLU), - QU( 9959272418844311646LLU), QU(10157426099515958752LLU), - QU(14013736814538268528LLU), QU(17797456992065653951LLU), - QU(17418878140257344806LLU), QU(15457429073540561521LLU), - QU( 2184426881360949378LLU), QU( 2062193041154712416LLU), - QU( 8553463347406931661LLU), QU( 4913057625202871854LLU), - QU( 2668943682126618425LLU), 
QU(17064444737891172288LLU), - QU( 4997115903913298637LLU), QU(12019402608892327416LLU), - QU(17603584559765897352LLU), QU(11367529582073647975LLU), - QU( 8211476043518436050LLU), QU( 8676849804070323674LLU), - QU(18431829230394475730LLU), QU(10490177861361247904LLU), - QU( 9508720602025651349LLU), QU( 7409627448555722700LLU), - QU( 5804047018862729008LLU), QU(11943858176893142594LLU), - QU(11908095418933847092LLU), QU( 5415449345715887652LLU), - QU( 1554022699166156407LLU), QU( 9073322106406017161LLU), - QU( 7080630967969047082LLU), QU(18049736940860732943LLU), - QU(12748714242594196794LLU), QU( 1226992415735156741LLU), - QU(17900981019609531193LLU), QU(11720739744008710999LLU), - QU( 3006400683394775434LLU), QU(11347974011751996028LLU), - QU( 3316999628257954608LLU), QU( 8384484563557639101LLU), - QU(18117794685961729767LLU), QU( 1900145025596618194LLU), - QU(17459527840632892676LLU), QU( 5634784101865710994LLU), - QU( 7918619300292897158LLU), QU( 3146577625026301350LLU), - QU( 9955212856499068767LLU), QU( 1873995843681746975LLU), - QU( 1561487759967972194LLU), QU( 8322718804375878474LLU), - QU(11300284215327028366LLU), QU( 4667391032508998982LLU), - QU( 9820104494306625580LLU), QU(17922397968599970610LLU), - QU( 1784690461886786712LLU), QU(14940365084341346821LLU), - QU( 5348719575594186181LLU), QU(10720419084507855261LLU), - QU(14210394354145143274LLU), QU( 2426468692164000131LLU), - QU(16271062114607059202LLU), QU(14851904092357070247LLU), - QU( 6524493015693121897LLU), QU( 9825473835127138531LLU), - QU(14222500616268569578LLU), QU(15521484052007487468LLU), - QU(14462579404124614699LLU), QU(11012375590820665520LLU), - QU(11625327350536084927LLU), QU(14452017765243785417LLU), - QU( 9989342263518766305LLU), QU( 3640105471101803790LLU), - QU( 4749866455897513242LLU), QU(13963064946736312044LLU), - QU(10007416591973223791LLU), QU(18314132234717431115LLU), - QU( 3286596588617483450LLU), QU( 7726163455370818765LLU), - QU( 7575454721115379328LLU), QU( 5308331576437663422LLU), - QU(18288821894903530934LLU), QU( 8028405805410554106LLU), - QU(15744019832103296628LLU), QU( 149765559630932100LLU), - QU( 6137705557200071977LLU), QU(14513416315434803615LLU), - QU(11665702820128984473LLU), QU( 218926670505601386LLU), - QU( 6868675028717769519LLU), QU(15282016569441512302LLU), - QU( 5707000497782960236LLU), QU( 6671120586555079567LLU), - QU( 2194098052618985448LLU), QU(16849577895477330978LLU), - QU(12957148471017466283LLU), QU( 1997805535404859393LLU), - QU( 1180721060263860490LLU), QU(13206391310193756958LLU), - QU(12980208674461861797LLU), QU( 3825967775058875366LLU), - QU(17543433670782042631LLU), QU( 1518339070120322730LLU), - QU(16344584340890991669LLU), QU( 2611327165318529819LLU), - QU(11265022723283422529LLU), QU( 4001552800373196817LLU), - QU(14509595890079346161LLU), QU( 3528717165416234562LLU), - QU(18153222571501914072LLU), QU( 9387182977209744425LLU), - QU(10064342315985580021LLU), QU(11373678413215253977LLU), - QU( 2308457853228798099LLU), QU( 9729042942839545302LLU), - QU( 7833785471140127746LLU), QU( 6351049900319844436LLU), - QU(14454610627133496067LLU), QU(12533175683634819111LLU), - QU(15570163926716513029LLU), QU(13356980519185762498LLU) + QU( 2100341266307895239ULL), QU( 8344256300489757943ULL), + QU(15687933285484243894ULL), QU( 8268620370277076319ULL), + QU(12371852309826545459ULL), QU( 8800491541730110238ULL), + QU(18113268950100835773ULL), QU( 2886823658884438119ULL), + QU( 3293667307248180724ULL), QU( 9307928143300172731ULL), + QU( 7688082017574293629ULL), QU( 
900986224735166665ULL), + QU( 9977972710722265039ULL), QU( 6008205004994830552ULL), + QU( 546909104521689292ULL), QU( 7428471521869107594ULL), + QU(14777563419314721179ULL), QU(16116143076567350053ULL), + QU( 5322685342003142329ULL), QU( 4200427048445863473ULL), + QU( 4693092150132559146ULL), QU(13671425863759338582ULL), + QU( 6747117460737639916ULL), QU( 4732666080236551150ULL), + QU( 5912839950611941263ULL), QU( 3903717554504704909ULL), + QU( 2615667650256786818ULL), QU(10844129913887006352ULL), + QU(13786467861810997820ULL), QU(14267853002994021570ULL), + QU(13767807302847237439ULL), QU(16407963253707224617ULL), + QU( 4802498363698583497ULL), QU( 2523802839317209764ULL), + QU( 3822579397797475589ULL), QU( 8950320572212130610ULL), + QU( 3745623504978342534ULL), QU(16092609066068482806ULL), + QU( 9817016950274642398ULL), QU(10591660660323829098ULL), + QU(11751606650792815920ULL), QU( 5122873818577122211ULL), + QU(17209553764913936624ULL), QU( 6249057709284380343ULL), + QU(15088791264695071830ULL), QU(15344673071709851930ULL), + QU( 4345751415293646084ULL), QU( 2542865750703067928ULL), + QU(13520525127852368784ULL), QU(18294188662880997241ULL), + QU( 3871781938044881523ULL), QU( 2873487268122812184ULL), + QU(15099676759482679005ULL), QU(15442599127239350490ULL), + QU( 6311893274367710888ULL), QU( 3286118760484672933ULL), + QU( 4146067961333542189ULL), QU(13303942567897208770ULL), + QU( 8196013722255630418ULL), QU( 4437815439340979989ULL), + QU(15433791533450605135ULL), QU( 4254828956815687049ULL), + QU( 1310903207708286015ULL), QU(10529182764462398549ULL), + QU(14900231311660638810ULL), QU( 9727017277104609793ULL), + QU( 1821308310948199033ULL), QU(11628861435066772084ULL), + QU( 9469019138491546924ULL), QU( 3145812670532604988ULL), + QU( 9938468915045491919ULL), QU( 1562447430672662142ULL), + QU(13963995266697989134ULL), QU( 3356884357625028695ULL), + QU( 4499850304584309747ULL), QU( 8456825817023658122ULL), + QU(10859039922814285279ULL), QU( 8099512337972526555ULL), + QU( 348006375109672149ULL), QU(11919893998241688603ULL), + QU( 1104199577402948826ULL), QU(16689191854356060289ULL), + QU(10992552041730168078ULL), QU( 7243733172705465836ULL), + QU( 5668075606180319560ULL), QU(18182847037333286970ULL), + QU( 4290215357664631322ULL), QU( 4061414220791828613ULL), + QU(13006291061652989604ULL), QU( 7140491178917128798ULL), + QU(12703446217663283481ULL), QU( 5500220597564558267ULL), + QU(10330551509971296358ULL), QU(15958554768648714492ULL), + QU( 5174555954515360045ULL), QU( 1731318837687577735ULL), + QU( 3557700801048354857ULL), QU(13764012341928616198ULL), + QU(13115166194379119043ULL), QU( 7989321021560255519ULL), + QU( 2103584280905877040ULL), QU( 9230788662155228488ULL), + QU(16396629323325547654ULL), QU( 657926409811318051ULL), + QU(15046700264391400727ULL), QU( 5120132858771880830ULL), + QU( 7934160097989028561ULL), QU( 6963121488531976245ULL), + QU(17412329602621742089ULL), QU(15144843053931774092ULL), + QU(17204176651763054532ULL), QU(13166595387554065870ULL), + QU( 8590377810513960213ULL), QU( 5834365135373991938ULL), + QU( 7640913007182226243ULL), QU( 3479394703859418425ULL), + QU(16402784452644521040ULL), QU( 4993979809687083980ULL), + QU(13254522168097688865ULL), QU(15643659095244365219ULL), + QU( 5881437660538424982ULL), QU(11174892200618987379ULL), + QU( 254409966159711077ULL), QU(17158413043140549909ULL), + QU( 3638048789290376272ULL), QU( 1376816930299489190ULL), + QU( 4622462095217761923ULL), QU(15086407973010263515ULL), + QU(13253971772784692238ULL), QU( 
5270549043541649236ULL), + QU(11182714186805411604ULL), QU(12283846437495577140ULL), + QU( 5297647149908953219ULL), QU(10047451738316836654ULL), + QU( 4938228100367874746ULL), QU(12328523025304077923ULL), + QU( 3601049438595312361ULL), QU( 9313624118352733770ULL), + QU(13322966086117661798ULL), QU(16660005705644029394ULL), + QU(11337677526988872373ULL), QU(13869299102574417795ULL), + QU(15642043183045645437ULL), QU( 3021755569085880019ULL), + QU( 4979741767761188161ULL), QU(13679979092079279587ULL), + QU( 3344685842861071743ULL), QU(13947960059899588104ULL), + QU( 305806934293368007ULL), QU( 5749173929201650029ULL), + QU(11123724852118844098ULL), QU(15128987688788879802ULL), + QU(15251651211024665009ULL), QU( 7689925933816577776ULL), + QU(16732804392695859449ULL), QU(17087345401014078468ULL), + QU(14315108589159048871ULL), QU( 4820700266619778917ULL), + QU(16709637539357958441ULL), QU( 4936227875177351374ULL), + QU( 2137907697912987247ULL), QU(11628565601408395420ULL), + QU( 2333250549241556786ULL), QU( 5711200379577778637ULL), + QU( 5170680131529031729ULL), QU(12620392043061335164ULL), + QU( 95363390101096078ULL), QU( 5487981914081709462ULL), + QU( 1763109823981838620ULL), QU( 3395861271473224396ULL), + QU( 1300496844282213595ULL), QU( 6894316212820232902ULL), + QU(10673859651135576674ULL), QU( 5911839658857903252ULL), + QU(17407110743387299102ULL), QU( 8257427154623140385ULL), + QU(11389003026741800267ULL), QU( 4070043211095013717ULL), + QU(11663806997145259025ULL), QU(15265598950648798210ULL), + QU( 630585789434030934ULL), QU( 3524446529213587334ULL), + QU( 7186424168495184211ULL), QU(10806585451386379021ULL), + QU(11120017753500499273ULL), QU( 1586837651387701301ULL), + QU(17530454400954415544ULL), QU( 9991670045077880430ULL), + QU( 7550997268990730180ULL), QU( 8640249196597379304ULL), + QU( 3522203892786893823ULL), QU(10401116549878854788ULL), + QU(13690285544733124852ULL), QU( 8295785675455774586ULL), + QU(15535716172155117603ULL), QU( 3112108583723722511ULL), + QU(17633179955339271113ULL), QU(18154208056063759375ULL), + QU( 1866409236285815666ULL), QU(13326075895396412882ULL), + QU( 8756261842948020025ULL), QU( 6281852999868439131ULL), + QU(15087653361275292858ULL), QU(10333923911152949397ULL), + QU( 5265567645757408500ULL), QU(12728041843210352184ULL), + QU( 6347959327507828759ULL), QU( 154112802625564758ULL), + QU(18235228308679780218ULL), QU( 3253805274673352418ULL), + QU( 4849171610689031197ULL), QU(17948529398340432518ULL), + QU(13803510475637409167ULL), QU(13506570190409883095ULL), + QU(15870801273282960805ULL), QU( 8451286481299170773ULL), + QU( 9562190620034457541ULL), QU( 8518905387449138364ULL), + QU(12681306401363385655ULL), QU( 3788073690559762558ULL), + QU( 5256820289573487769ULL), QU( 2752021372314875467ULL), + QU( 6354035166862520716ULL), QU( 4328956378309739069ULL), + QU( 449087441228269600ULL), QU( 5533508742653090868ULL), + QU( 1260389420404746988ULL), QU(18175394473289055097ULL), + QU( 1535467109660399420ULL), QU( 8818894282874061442ULL), + QU(12140873243824811213ULL), QU(15031386653823014946ULL), + QU( 1286028221456149232ULL), QU( 6329608889367858784ULL), + QU( 9419654354945132725ULL), QU( 6094576547061672379ULL), + QU(17706217251847450255ULL), QU( 1733495073065878126ULL), + QU(16918923754607552663ULL), QU( 8881949849954945044ULL), + QU(12938977706896313891ULL), QU(14043628638299793407ULL), + QU(18393874581723718233ULL), QU( 6886318534846892044ULL), + QU(14577870878038334081ULL), QU(13541558383439414119ULL), + QU(13570472158807588273ULL), 
QU(18300760537910283361ULL), + QU( 818368572800609205ULL), QU( 1417000585112573219ULL), + QU(12337533143867683655ULL), QU(12433180994702314480ULL), + QU( 778190005829189083ULL), QU(13667356216206524711ULL), + QU( 9866149895295225230ULL), QU(11043240490417111999ULL), + QU( 1123933826541378598ULL), QU( 6469631933605123610ULL), + QU(14508554074431980040ULL), QU(13918931242962026714ULL), + QU( 2870785929342348285ULL), QU(14786362626740736974ULL), + QU(13176680060902695786ULL), QU( 9591778613541679456ULL), + QU( 9097662885117436706ULL), QU( 749262234240924947ULL), + QU( 1944844067793307093ULL), QU( 4339214904577487742ULL), + QU( 8009584152961946551ULL), QU(16073159501225501777ULL), + QU( 3335870590499306217ULL), QU(17088312653151202847ULL), + QU( 3108893142681931848ULL), QU(16636841767202792021ULL), + QU(10423316431118400637ULL), QU( 8008357368674443506ULL), + QU(11340015231914677875ULL), QU(17687896501594936090ULL), + QU(15173627921763199958ULL), QU( 542569482243721959ULL), + QU(15071714982769812975ULL), QU( 4466624872151386956ULL), + QU( 1901780715602332461ULL), QU( 9822227742154351098ULL), + QU( 1479332892928648780ULL), QU( 6981611948382474400ULL), + QU( 7620824924456077376ULL), QU(14095973329429406782ULL), + QU( 7902744005696185404ULL), QU(15830577219375036920ULL), + QU(10287076667317764416ULL), QU(12334872764071724025ULL), + QU( 4419302088133544331ULL), QU(14455842851266090520ULL), + QU(12488077416504654222ULL), QU( 7953892017701886766ULL), + QU( 6331484925529519007ULL), QU( 4902145853785030022ULL), + QU(17010159216096443073ULL), QU(11945354668653886087ULL), + QU(15112022728645230829ULL), QU(17363484484522986742ULL), + QU( 4423497825896692887ULL), QU( 8155489510809067471ULL), + QU( 258966605622576285ULL), QU( 5462958075742020534ULL), + QU( 6763710214913276228ULL), QU( 2368935183451109054ULL), + QU(14209506165246453811ULL), QU( 2646257040978514881ULL), + QU( 3776001911922207672ULL), QU( 1419304601390147631ULL), + QU(14987366598022458284ULL), QU( 3977770701065815721ULL), + QU( 730820417451838898ULL), QU( 3982991703612885327ULL), + QU( 2803544519671388477ULL), QU(17067667221114424649ULL), + QU( 2922555119737867166ULL), QU( 1989477584121460932ULL), + QU(15020387605892337354ULL), QU( 9293277796427533547ULL), + QU(10722181424063557247ULL), QU(16704542332047511651ULL), + QU( 5008286236142089514ULL), QU(16174732308747382540ULL), + QU(17597019485798338402ULL), QU(13081745199110622093ULL), + QU( 8850305883842258115ULL), QU(12723629125624589005ULL), + QU( 8140566453402805978ULL), QU(15356684607680935061ULL), + QU(14222190387342648650ULL), QU(11134610460665975178ULL), + QU( 1259799058620984266ULL), QU(13281656268025610041ULL), + QU( 298262561068153992ULL), QU(12277871700239212922ULL), + QU(13911297774719779438ULL), QU(16556727962761474934ULL), + QU(17903010316654728010ULL), QU( 9682617699648434744ULL), + QU(14757681836838592850ULL), QU( 1327242446558524473ULL), + QU(11126645098780572792ULL), QU( 1883602329313221774ULL), + QU( 2543897783922776873ULL), QU(15029168513767772842ULL), + QU(12710270651039129878ULL), QU(16118202956069604504ULL), + QU(15010759372168680524ULL), QU( 2296827082251923948ULL), + QU(10793729742623518101ULL), QU(13829764151845413046ULL), + QU(17769301223184451213ULL), QU( 3118268169210783372ULL), + QU(17626204544105123127ULL), QU( 7416718488974352644ULL), + QU(10450751996212925994ULL), QU( 9352529519128770586ULL), + QU( 259347569641110140ULL), QU( 8048588892269692697ULL), + QU( 1774414152306494058ULL), QU(10669548347214355622ULL), + QU(13061992253816795081ULL), 
QU(18432677803063861659ULL), + QU( 8879191055593984333ULL), QU(12433753195199268041ULL), + QU(14919392415439730602ULL), QU( 6612848378595332963ULL), + QU( 6320986812036143628ULL), QU(10465592420226092859ULL), + QU( 4196009278962570808ULL), QU( 3747816564473572224ULL), + QU(17941203486133732898ULL), QU( 2350310037040505198ULL), + QU( 5811779859134370113ULL), QU(10492109599506195126ULL), + QU( 7699650690179541274ULL), QU( 1954338494306022961ULL), + QU(14095816969027231152ULL), QU( 5841346919964852061ULL), + QU(14945969510148214735ULL), QU( 3680200305887550992ULL), + QU( 6218047466131695792ULL), QU( 8242165745175775096ULL), + QU(11021371934053307357ULL), QU( 1265099502753169797ULL), + QU( 4644347436111321718ULL), QU( 3609296916782832859ULL), + QU( 8109807992218521571ULL), QU(18387884215648662020ULL), + QU(14656324896296392902ULL), QU(17386819091238216751ULL), + QU(17788300878582317152ULL), QU( 7919446259742399591ULL), + QU( 4466613134576358004ULL), QU(12928181023667938509ULL), + QU(13147446154454932030ULL), QU(16552129038252734620ULL), + QU( 8395299403738822450ULL), QU(11313817655275361164ULL), + QU( 434258809499511718ULL), QU( 2074882104954788676ULL), + QU( 7929892178759395518ULL), QU( 9006461629105745388ULL), + QU( 5176475650000323086ULL), QU(11128357033468341069ULL), + QU(12026158851559118955ULL), QU(14699716249471156500ULL), + QU( 448982497120206757ULL), QU( 4156475356685519900ULL), + QU( 6063816103417215727ULL), QU(10073289387954971479ULL), + QU( 8174466846138590962ULL), QU( 2675777452363449006ULL), + QU( 9090685420572474281ULL), QU( 6659652652765562060ULL), + QU(12923120304018106621ULL), QU(11117480560334526775ULL), + QU( 937910473424587511ULL), QU( 1838692113502346645ULL), + QU(11133914074648726180ULL), QU( 7922600945143884053ULL), + QU(13435287702700959550ULL), QU( 5287964921251123332ULL), + QU(11354875374575318947ULL), QU(17955724760748238133ULL), + QU(13728617396297106512ULL), QU( 4107449660118101255ULL), + QU( 1210269794886589623ULL), QU(11408687205733456282ULL), + QU( 4538354710392677887ULL), QU(13566803319341319267ULL), + QU(17870798107734050771ULL), QU( 3354318982568089135ULL), + QU( 9034450839405133651ULL), QU(13087431795753424314ULL), + QU( 950333102820688239ULL), QU( 1968360654535604116ULL), + QU(16840551645563314995ULL), QU( 8867501803892924995ULL), + QU(11395388644490626845ULL), QU( 1529815836300732204ULL), + QU(13330848522996608842ULL), QU( 1813432878817504265ULL), + QU( 2336867432693429560ULL), QU(15192805445973385902ULL), + QU( 2528593071076407877ULL), QU( 128459777936689248ULL), + QU( 9976345382867214866ULL), QU( 6208885766767996043ULL), + QU(14982349522273141706ULL), QU( 3099654362410737822ULL), + QU(13776700761947297661ULL), QU( 8806185470684925550ULL), + QU( 8151717890410585321ULL), QU( 640860591588072925ULL), + QU(14592096303937307465ULL), QU( 9056472419613564846ULL), + QU(14861544647742266352ULL), QU(12703771500398470216ULL), + QU( 3142372800384138465ULL), QU( 6201105606917248196ULL), + QU(18337516409359270184ULL), QU(15042268695665115339ULL), + QU(15188246541383283846ULL), QU(12800028693090114519ULL), + QU( 5992859621101493472ULL), QU(18278043971816803521ULL), + QU( 9002773075219424560ULL), QU( 7325707116943598353ULL), + QU( 7930571931248040822ULL), QU( 5645275869617023448ULL), + QU( 7266107455295958487ULL), QU( 4363664528273524411ULL), + QU(14313875763787479809ULL), QU(17059695613553486802ULL), + QU( 9247761425889940932ULL), QU(13704726459237593128ULL), + QU( 2701312427328909832ULL), QU(17235532008287243115ULL), + QU(14093147761491729538ULL), QU( 
6247352273768386516ULL), + QU( 8268710048153268415ULL), QU( 7985295214477182083ULL), + QU(15624495190888896807ULL), QU( 3772753430045262788ULL), + QU( 9133991620474991698ULL), QU( 5665791943316256028ULL), + QU( 7551996832462193473ULL), QU(13163729206798953877ULL), + QU( 9263532074153846374ULL), QU( 1015460703698618353ULL), + QU(17929874696989519390ULL), QU(18257884721466153847ULL), + QU(16271867543011222991ULL), QU( 3905971519021791941ULL), + QU(16814488397137052085ULL), QU( 1321197685504621613ULL), + QU( 2870359191894002181ULL), QU(14317282970323395450ULL), + QU(13663920845511074366ULL), QU( 2052463995796539594ULL), + QU(14126345686431444337ULL), QU( 1727572121947022534ULL), + QU(17793552254485594241ULL), QU( 6738857418849205750ULL), + QU( 1282987123157442952ULL), QU(16655480021581159251ULL), + QU( 6784587032080183866ULL), QU(14726758805359965162ULL), + QU( 7577995933961987349ULL), QU(12539609320311114036ULL), + QU(10789773033385439494ULL), QU( 8517001497411158227ULL), + QU(10075543932136339710ULL), QU(14838152340938811081ULL), + QU( 9560840631794044194ULL), QU(17445736541454117475ULL), + QU(10633026464336393186ULL), QU(15705729708242246293ULL), + QU( 1117517596891411098ULL), QU( 4305657943415886942ULL), + QU( 4948856840533979263ULL), QU(16071681989041789593ULL), + QU(13723031429272486527ULL), QU( 7639567622306509462ULL), + QU(12670424537483090390ULL), QU( 9715223453097197134ULL), + QU( 5457173389992686394ULL), QU( 289857129276135145ULL), + QU(17048610270521972512ULL), QU( 692768013309835485ULL), + QU(14823232360546632057ULL), QU(18218002361317895936ULL), + QU( 3281724260212650204ULL), QU(16453957266549513795ULL), + QU( 8592711109774511881ULL), QU( 929825123473369579ULL), + QU(15966784769764367791ULL), QU( 9627344291450607588ULL), + QU(10849555504977813287ULL), QU( 9234566913936339275ULL), + QU( 6413807690366911210ULL), QU(10862389016184219267ULL), + QU(13842504799335374048ULL), QU( 1531994113376881174ULL), + QU( 2081314867544364459ULL), QU(16430628791616959932ULL), + QU( 8314714038654394368ULL), QU( 9155473892098431813ULL), + QU(12577843786670475704ULL), QU( 4399161106452401017ULL), + QU( 1668083091682623186ULL), QU( 1741383777203714216ULL), + QU( 2162597285417794374ULL), QU(15841980159165218736ULL), + QU( 1971354603551467079ULL), QU( 1206714764913205968ULL), + QU( 4790860439591272330ULL), QU(14699375615594055799ULL), + QU( 8374423871657449988ULL), QU(10950685736472937738ULL), + QU( 697344331343267176ULL), QU(10084998763118059810ULL), + QU(12897369539795983124ULL), QU(12351260292144383605ULL), + QU( 1268810970176811234ULL), QU( 7406287800414582768ULL), + QU( 516169557043807831ULL), QU( 5077568278710520380ULL), + QU( 3828791738309039304ULL), QU( 7721974069946943610ULL), + QU( 3534670260981096460ULL), QU( 4865792189600584891ULL), + QU(16892578493734337298ULL), QU( 9161499464278042590ULL), + QU(11976149624067055931ULL), QU(13219479887277343990ULL), + QU(14161556738111500680ULL), QU(14670715255011223056ULL), + QU( 4671205678403576558ULL), QU(12633022931454259781ULL), + QU(14821376219869187646ULL), QU( 751181776484317028ULL), + QU( 2192211308839047070ULL), QU(11787306362361245189ULL), + QU(10672375120744095707ULL), QU( 4601972328345244467ULL), + QU(15457217788831125879ULL), QU( 8464345256775460809ULL), + QU(10191938789487159478ULL), QU( 6184348739615197613ULL), + QU(11425436778806882100ULL), QU( 2739227089124319793ULL), + QU( 461464518456000551ULL), QU( 4689850170029177442ULL), + QU( 6120307814374078625ULL), QU(11153579230681708671ULL), + QU( 7891721473905347926ULL), 
QU(10281646937824872400ULL), + QU( 3026099648191332248ULL), QU( 8666750296953273818ULL), + QU(14978499698844363232ULL), QU(13303395102890132065ULL), + QU( 8182358205292864080ULL), QU(10560547713972971291ULL), + QU(11981635489418959093ULL), QU( 3134621354935288409ULL), + QU(11580681977404383968ULL), QU(14205530317404088650ULL), + QU( 5997789011854923157ULL), QU(13659151593432238041ULL), + QU(11664332114338865086ULL), QU( 7490351383220929386ULL), + QU( 7189290499881530378ULL), QU(15039262734271020220ULL), + QU( 2057217285976980055ULL), QU( 555570804905355739ULL), + QU(11235311968348555110ULL), QU(13824557146269603217ULL), + QU(16906788840653099693ULL), QU( 7222878245455661677ULL), + QU( 5245139444332423756ULL), QU( 4723748462805674292ULL), + QU(12216509815698568612ULL), QU(17402362976648951187ULL), + QU(17389614836810366768ULL), QU( 4880936484146667711ULL), + QU( 9085007839292639880ULL), QU(13837353458498535449ULL), + QU(11914419854360366677ULL), QU(16595890135313864103ULL), + QU( 6313969847197627222ULL), QU(18296909792163910431ULL), + QU(10041780113382084042ULL), QU( 2499478551172884794ULL), + QU(11057894246241189489ULL), QU( 9742243032389068555ULL), + QU(12838934582673196228ULL), QU(13437023235248490367ULL), + QU(13372420669446163240ULL), QU( 6752564244716909224ULL), + QU( 7157333073400313737ULL), QU(12230281516370654308ULL), + QU( 1182884552219419117ULL), QU( 2955125381312499218ULL), + QU(10308827097079443249ULL), QU( 1337648572986534958ULL), + QU(16378788590020343939ULL), QU( 108619126514420935ULL), + QU( 3990981009621629188ULL), QU( 5460953070230946410ULL), + QU( 9703328329366531883ULL), QU(13166631489188077236ULL), + QU( 1104768831213675170ULL), QU( 3447930458553877908ULL), + QU( 8067172487769945676ULL), QU( 5445802098190775347ULL), + QU( 3244840981648973873ULL), QU(17314668322981950060ULL), + QU( 5006812527827763807ULL), QU(18158695070225526260ULL), + QU( 2824536478852417853ULL), QU(13974775809127519886ULL), + QU( 9814362769074067392ULL), QU(17276205156374862128ULL), + QU(11361680725379306967ULL), QU( 3422581970382012542ULL), + QU(11003189603753241266ULL), QU(11194292945277862261ULL), + QU( 6839623313908521348ULL), QU(11935326462707324634ULL), + QU( 1611456788685878444ULL), QU(13112620989475558907ULL), + QU( 517659108904450427ULL), QU(13558114318574407624ULL), + QU(15699089742731633077ULL), QU( 4988979278862685458ULL), + QU( 8111373583056521297ULL), QU( 3891258746615399627ULL), + QU( 8137298251469718086ULL), QU(12748663295624701649ULL), + QU( 4389835683495292062ULL), QU( 5775217872128831729ULL), + QU( 9462091896405534927ULL), QU( 8498124108820263989ULL), + QU( 8059131278842839525ULL), QU(10503167994254090892ULL), + QU(11613153541070396656ULL), QU(18069248738504647790ULL), + QU( 570657419109768508ULL), QU( 3950574167771159665ULL), + QU( 5514655599604313077ULL), QU( 2908460854428484165ULL), + QU(10777722615935663114ULL), QU(12007363304839279486ULL), + QU( 9800646187569484767ULL), QU( 8795423564889864287ULL), + QU(14257396680131028419ULL), QU( 6405465117315096498ULL), + QU( 7939411072208774878ULL), QU(17577572378528990006ULL), + QU(14785873806715994850ULL), QU(16770572680854747390ULL), + QU(18127549474419396481ULL), QU(11637013449455757750ULL), + QU(14371851933996761086ULL), QU( 3601181063650110280ULL), + QU( 4126442845019316144ULL), QU(10198287239244320669ULL), + QU(18000169628555379659ULL), QU(18392482400739978269ULL), + QU( 6219919037686919957ULL), QU( 3610085377719446052ULL), + QU( 2513925039981776336ULL), QU(16679413537926716955ULL), + QU(12903302131714909434ULL), QU( 
5581145789762985009ULL), + QU(12325955044293303233ULL), QU(17216111180742141204ULL), + QU( 6321919595276545740ULL), QU( 3507521147216174501ULL), + QU( 9659194593319481840ULL), QU(11473976005975358326ULL), + QU(14742730101435987026ULL), QU( 492845897709954780ULL), + QU(16976371186162599676ULL), QU(17712703422837648655ULL), + QU( 9881254778587061697ULL), QU( 8413223156302299551ULL), + QU( 1563841828254089168ULL), QU( 9996032758786671975ULL), + QU( 138877700583772667ULL), QU(13003043368574995989ULL), + QU( 4390573668650456587ULL), QU( 8610287390568126755ULL), + QU(15126904974266642199ULL), QU( 6703637238986057662ULL), + QU( 2873075592956810157ULL), QU( 6035080933946049418ULL), + QU(13382846581202353014ULL), QU( 7303971031814642463ULL), + QU(18418024405307444267ULL), QU( 5847096731675404647ULL), + QU( 4035880699639842500ULL), QU(11525348625112218478ULL), + QU( 3041162365459574102ULL), QU( 2604734487727986558ULL), + QU(15526341771636983145ULL), QU(14556052310697370254ULL), + QU(12997787077930808155ULL), QU( 9601806501755554499ULL), + QU(11349677952521423389ULL), QU(14956777807644899350ULL), + QU(16559736957742852721ULL), QU(12360828274778140726ULL), + QU( 6685373272009662513ULL), QU(16932258748055324130ULL), + QU(15918051131954158508ULL), QU( 1692312913140790144ULL), + QU( 546653826801637367ULL), QU( 5341587076045986652ULL), + QU(14975057236342585662ULL), QU(12374976357340622412ULL), + QU(10328833995181940552ULL), QU(12831807101710443149ULL), + QU(10548514914382545716ULL), QU( 2217806727199715993ULL), + QU(12627067369242845138ULL), QU( 4598965364035438158ULL), + QU( 150923352751318171ULL), QU(14274109544442257283ULL), + QU( 4696661475093863031ULL), QU( 1505764114384654516ULL), + QU(10699185831891495147ULL), QU( 2392353847713620519ULL), + QU( 3652870166711788383ULL), QU( 8640653276221911108ULL), + QU( 3894077592275889704ULL), QU( 4918592872135964845ULL), + QU(16379121273281400789ULL), QU(12058465483591683656ULL), + QU(11250106829302924945ULL), QU( 1147537556296983005ULL), + QU( 6376342756004613268ULL), QU(14967128191709280506ULL), + QU(18007449949790627628ULL), QU( 9497178279316537841ULL), + QU( 7920174844809394893ULL), QU(10037752595255719907ULL), + QU(15875342784985217697ULL), QU(15311615921712850696ULL), + QU( 9552902652110992950ULL), QU(14054979450099721140ULL), + QU( 5998709773566417349ULL), QU(18027910339276320187ULL), + QU( 8223099053868585554ULL), QU( 7842270354824999767ULL), + QU( 4896315688770080292ULL), QU(12969320296569787895ULL), + QU( 2674321489185759961ULL), QU( 4053615936864718439ULL), + QU(11349775270588617578ULL), QU( 4743019256284553975ULL), + QU( 5602100217469723769ULL), QU(14398995691411527813ULL), + QU( 7412170493796825470ULL), QU( 836262406131744846ULL), + QU( 8231086633845153022ULL), QU( 5161377920438552287ULL), + QU( 8828731196169924949ULL), QU(16211142246465502680ULL), + QU( 3307990879253687818ULL), QU( 5193405406899782022ULL), + QU( 8510842117467566693ULL), QU( 6070955181022405365ULL), + QU(14482950231361409799ULL), QU(12585159371331138077ULL), + QU( 3511537678933588148ULL), QU( 2041849474531116417ULL), + QU(10944936685095345792ULL), QU(18303116923079107729ULL), + QU( 2720566371239725320ULL), QU( 4958672473562397622ULL), + QU( 3032326668253243412ULL), QU(13689418691726908338ULL), + QU( 1895205511728843996ULL), QU( 8146303515271990527ULL), + QU(16507343500056113480ULL), QU( 473996939105902919ULL), + QU( 9897686885246881481ULL), QU(14606433762712790575ULL), + QU( 6732796251605566368ULL), QU( 1399778120855368916ULL), + QU( 935023885182833777ULL), 
QU(16066282816186753477ULL), + QU( 7291270991820612055ULL), QU(17530230393129853844ULL), + QU(10223493623477451366ULL), QU(15841725630495676683ULL), + QU(17379567246435515824ULL), QU( 8588251429375561971ULL), + QU(18339511210887206423ULL), QU(17349587430725976100ULL), + QU(12244876521394838088ULL), QU( 6382187714147161259ULL), + QU(12335807181848950831ULL), QU(16948885622305460665ULL), + QU(13755097796371520506ULL), QU(14806740373324947801ULL), + QU( 4828699633859287703ULL), QU( 8209879281452301604ULL), + QU(12435716669553736437ULL), QU(13970976859588452131ULL), + QU( 6233960842566773148ULL), QU(12507096267900505759ULL), + QU( 1198713114381279421ULL), QU(14989862731124149015ULL), + QU(15932189508707978949ULL), QU( 2526406641432708722ULL), + QU( 29187427817271982ULL), QU( 1499802773054556353ULL), + QU(10816638187021897173ULL), QU( 5436139270839738132ULL), + QU( 6659882287036010082ULL), QU( 2154048955317173697ULL), + QU(10887317019333757642ULL), QU(16281091802634424955ULL), + QU(10754549879915384901ULL), QU(10760611745769249815ULL), + QU( 2161505946972504002ULL), QU( 5243132808986265107ULL), + QU(10129852179873415416ULL), QU( 710339480008649081ULL), + QU( 7802129453068808528ULL), QU(17967213567178907213ULL), + QU(15730859124668605599ULL), QU(13058356168962376502ULL), + QU( 3701224985413645909ULL), QU(14464065869149109264ULL), + QU( 9959272418844311646ULL), QU(10157426099515958752ULL), + QU(14013736814538268528ULL), QU(17797456992065653951ULL), + QU(17418878140257344806ULL), QU(15457429073540561521ULL), + QU( 2184426881360949378ULL), QU( 2062193041154712416ULL), + QU( 8553463347406931661ULL), QU( 4913057625202871854ULL), + QU( 2668943682126618425ULL), QU(17064444737891172288ULL), + QU( 4997115903913298637ULL), QU(12019402608892327416ULL), + QU(17603584559765897352ULL), QU(11367529582073647975ULL), + QU( 8211476043518436050ULL), QU( 8676849804070323674ULL), + QU(18431829230394475730ULL), QU(10490177861361247904ULL), + QU( 9508720602025651349ULL), QU( 7409627448555722700ULL), + QU( 5804047018862729008ULL), QU(11943858176893142594ULL), + QU(11908095418933847092ULL), QU( 5415449345715887652ULL), + QU( 1554022699166156407ULL), QU( 9073322106406017161ULL), + QU( 7080630967969047082ULL), QU(18049736940860732943ULL), + QU(12748714242594196794ULL), QU( 1226992415735156741ULL), + QU(17900981019609531193ULL), QU(11720739744008710999ULL), + QU( 3006400683394775434ULL), QU(11347974011751996028ULL), + QU( 3316999628257954608ULL), QU( 8384484563557639101ULL), + QU(18117794685961729767ULL), QU( 1900145025596618194ULL), + QU(17459527840632892676ULL), QU( 5634784101865710994ULL), + QU( 7918619300292897158ULL), QU( 3146577625026301350ULL), + QU( 9955212856499068767ULL), QU( 1873995843681746975ULL), + QU( 1561487759967972194ULL), QU( 8322718804375878474ULL), + QU(11300284215327028366ULL), QU( 4667391032508998982ULL), + QU( 9820104494306625580ULL), QU(17922397968599970610ULL), + QU( 1784690461886786712ULL), QU(14940365084341346821ULL), + QU( 5348719575594186181ULL), QU(10720419084507855261ULL), + QU(14210394354145143274ULL), QU( 2426468692164000131ULL), + QU(16271062114607059202ULL), QU(14851904092357070247ULL), + QU( 6524493015693121897ULL), QU( 9825473835127138531ULL), + QU(14222500616268569578ULL), QU(15521484052007487468ULL), + QU(14462579404124614699ULL), QU(11012375590820665520ULL), + QU(11625327350536084927ULL), QU(14452017765243785417ULL), + QU( 9989342263518766305ULL), QU( 3640105471101803790ULL), + QU( 4749866455897513242ULL), QU(13963064946736312044ULL), + QU(10007416591973223791ULL), 
QU(18314132234717431115ULL), + QU( 3286596588617483450ULL), QU( 7726163455370818765ULL), + QU( 7575454721115379328ULL), QU( 5308331576437663422ULL), + QU(18288821894903530934ULL), QU( 8028405805410554106ULL), + QU(15744019832103296628ULL), QU( 149765559630932100ULL), + QU( 6137705557200071977ULL), QU(14513416315434803615ULL), + QU(11665702820128984473ULL), QU( 218926670505601386ULL), + QU( 6868675028717769519ULL), QU(15282016569441512302ULL), + QU( 5707000497782960236ULL), QU( 6671120586555079567ULL), + QU( 2194098052618985448ULL), QU(16849577895477330978ULL), + QU(12957148471017466283ULL), QU( 1997805535404859393ULL), + QU( 1180721060263860490ULL), QU(13206391310193756958ULL), + QU(12980208674461861797ULL), QU( 3825967775058875366ULL), + QU(17543433670782042631ULL), QU( 1518339070120322730ULL), + QU(16344584340890991669ULL), QU( 2611327165318529819ULL), + QU(11265022723283422529ULL), QU( 4001552800373196817ULL), + QU(14509595890079346161ULL), QU( 3528717165416234562ULL), + QU(18153222571501914072ULL), QU( 9387182977209744425ULL), + QU(10064342315985580021ULL), QU(11373678413215253977ULL), + QU( 2308457853228798099ULL), QU( 9729042942839545302ULL), + QU( 7833785471140127746ULL), QU( 6351049900319844436ULL), + QU(14454610627133496067ULL), QU(12533175683634819111ULL), + QU(15570163926716513029ULL), QU(13356980519185762498ULL) }; TEST_BEGIN(test_gen_rand_32) -- cgit v0.12 From 22bc570fba00c4dd04cb4962e219d4230f137a4c Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 21 May 2014 18:06:14 +0900 Subject: Move __func__ to jemalloc_internal_macros.h test/integration/aligned_alloc.c needs it. --- include/jemalloc/internal/jemalloc_internal.h.in | 1 - include/jemalloc/internal/jemalloc_internal_macros.h | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index c9462e5..d9bfadf 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -46,7 +46,6 @@ typedef intptr_t ssize_t; # define PATH_MAX 1024 # define STDERR_FILENO 2 -# define __func__ __FUNCTION__ /* Disable warnings about deprecated system functions */ # pragma warning(disable: 4996) #else diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h index 4e23923..bb81e99 100644 --- a/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/include/jemalloc/internal/jemalloc_internal_macros.h @@ -49,3 +49,7 @@ #ifndef JEMALLOC_HAS_RESTRICT # define restrict #endif + +#ifdef _MSC_VER +# define __func__ __FUNCTION__ +#endif -- cgit v0.12 From affe009e3765384805a23d804152fbf04151b117 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 28 May 2014 08:10:12 +0900 Subject: Use a configure test to detect the form of malloc_usable_size in malloc.h --- configure.ac | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 57015d1..04bb294 100644 --- a/configure.ac +++ b/configure.ac @@ -258,7 +258,6 @@ dnl Define cpp macros in CPPFLAGS, rather than doing AC_DEFINE(macro), since the dnl definitions need to be seen before any headers are included, which is a pain dnl to make happen otherwise. 
default_munmap="1" -JEMALLOC_USABLE_SIZE_CONST="const" case "${host}" in *-*-darwin*) CFLAGS="$CFLAGS" @@ -286,7 +285,6 @@ case "${host}" in AC_DEFINE([JEMALLOC_HAS_ALLOCA_H]) AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED], [ ]) AC_DEFINE([JEMALLOC_THREADED_INIT], [ ]) - JEMALLOC_USABLE_SIZE_CONST="" default_munmap="0" ;; *-*-netbsd*) @@ -351,6 +349,22 @@ case "${host}" in abi="elf" ;; esac + +JEMALLOC_USABLE_SIZE_CONST=const +AC_CHECK_HEADERS([malloc.h], [ + AC_MSG_CHECKING([whether malloc_usable_size definition can use const argument]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM( + [#include + #include + size_t malloc_usable_size(const void *ptr); + ], + [])],[ + AC_MSG_RESULT([yes]) + ],[ + JEMALLOC_USABLE_SIZE_CONST= + AC_MSG_RESULT([no]) + ]) +]) AC_DEFINE_UNQUOTED([JEMALLOC_USABLE_SIZE_CONST], [$JEMALLOC_USABLE_SIZE_CONST]) AC_SUBST([abi]) AC_SUBST([RPATH]) -- cgit v0.12 From 12f74e680c1d53c8fe5323a4ff66877534dcadd3 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 28 May 2014 12:39:13 +0900 Subject: Move platform headers and tricks from jemalloc_internal.h.in to a new jemalloc_internal_decls.h header --- include/jemalloc/internal/jemalloc_internal.h.in | 53 +------------------- .../jemalloc/internal/jemalloc_internal_decls.h | 58 ++++++++++++++++++++++ .../jemalloc/internal/jemalloc_internal_macros.h | 4 -- test/include/test/jemalloc_test_defs.h.in | 1 + 4 files changed, 60 insertions(+), 56 deletions(-) create mode 100644 include/jemalloc/internal/jemalloc_internal_decls.h diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index d9bfadf..cf20f1f 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -1,59 +1,8 @@ #ifndef JEMALLOC_INTERNAL_H #define JEMALLOC_INTERNAL_H -#include -#ifdef _WIN32 -# include -# define ENOENT ERROR_PATH_NOT_FOUND -# define EINVAL ERROR_BAD_ARGUMENTS -# define EAGAIN ERROR_OUTOFMEMORY -# define EPERM ERROR_WRITE_FAULT -# define EFAULT ERROR_INVALID_ADDRESS -# define ENOMEM ERROR_NOT_ENOUGH_MEMORY -# undef ERANGE -# define ERANGE ERROR_INVALID_DATA -#else -# include -# include -# include -# if !defined(SYS_write) && defined(__NR_write) -# define SYS_write __NR_write -# endif -# include -# include -# include -#endif -#include - -#include -#ifndef SIZE_T_MAX -# define SIZE_T_MAX SIZE_MAX -#endif -#include -#include -#include -#include -#include -#include -#ifndef offsetof -# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) -#endif -#include -#include -#include -#include -#ifdef _MSC_VER -# include -typedef intptr_t ssize_t; -# define PATH_MAX 1024 -# define STDERR_FILENO 2 -/* Disable warnings about deprecated system functions */ -# pragma warning(disable: 4996) -#else -# include -#endif -#include #include "jemalloc_internal_defs.h" +#include "jemalloc/internal/jemalloc_internal_decls.h" #ifdef JEMALLOC_UTRACE #include diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h new file mode 100644 index 0000000..7775ab3 --- /dev/null +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -0,0 +1,58 @@ +#ifndef JEMALLOC_INTERNAL_DECLS_H +#define JEMALLOC_INTERNAL_DECLS_H + +#include +#ifdef _WIN32 +# include +# define ENOENT ERROR_PATH_NOT_FOUND +# define EINVAL ERROR_BAD_ARGUMENTS +# define EAGAIN ERROR_OUTOFMEMORY +# define EPERM ERROR_WRITE_FAULT +# define EFAULT ERROR_INVALID_ADDRESS +# define ENOMEM ERROR_NOT_ENOUGH_MEMORY +# undef ERANGE +# define 
ERANGE ERROR_INVALID_DATA +#else +# include +# include +# include +# if !defined(SYS_write) && defined(__NR_write) +# define SYS_write __NR_write +# endif +# include +# include +# include +#endif +#include + +#include +#ifndef SIZE_T_MAX +# define SIZE_T_MAX SIZE_MAX +#endif +#include +#include +#include +#include +#include +#include +#ifndef offsetof +# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) +#endif +#include +#include +#include +#include +#ifdef _MSC_VER +# include +typedef intptr_t ssize_t; +# define PATH_MAX 1024 +# define STDERR_FILENO 2 +# define __func__ __FUNCTION__ +/* Disable warnings about deprecated system functions */ +# pragma warning(disable: 4996) +#else +# include +#endif +#include + +#endif /* JEMALLOC_INTERNAL_H */ diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h index bb81e99..4e23923 100644 --- a/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/include/jemalloc/internal/jemalloc_internal_macros.h @@ -49,7 +49,3 @@ #ifndef JEMALLOC_HAS_RESTRICT # define restrict #endif - -#ifdef _MSC_VER -# define __func__ __FUNCTION__ -#endif diff --git a/test/include/test/jemalloc_test_defs.h.in b/test/include/test/jemalloc_test_defs.h.in index 18a9773..aaaaec1 100644 --- a/test/include/test/jemalloc_test_defs.h.in +++ b/test/include/test/jemalloc_test_defs.h.in @@ -1,4 +1,5 @@ #include "jemalloc/internal/jemalloc_internal_defs.h" +#include "jemalloc/internal/jemalloc_internal_decls.h" /* For use by SFMT. */ #undef HAVE_SSE2 -- cgit v0.12 From 26246af977250a520194a1ced89cbc73ce218ca7 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 28 May 2014 13:14:46 +0900 Subject: Define INFINITY when it's not defined --- test/unit/math.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/unit/math.c b/test/unit/math.c index a1b288e..ebec77a 100644 --- a/test/unit/math.c +++ b/test/unit/math.c @@ -3,6 +3,12 @@ #define MAX_REL_ERR 1.0e-9 #define MAX_ABS_ERR 1.0e-9 +#include + +#ifndef INFINITY +#define INFINITY (DBL_MAX + DBL_MAX) +#endif + static bool double_eq_rel(double a, double b, double max_rel_err, double max_abs_err) { -- cgit v0.12 From 17767b5f2b195076a8b57f8489addabb1ee68009 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 28 May 2014 14:06:30 +0900 Subject: Correctly return exit code from thd_join on Windows --- test/src/thd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/src/thd.c b/test/src/thd.c index 233242a..7e53625 100644 --- a/test/src/thd.c +++ b/test/src/thd.c @@ -14,7 +14,8 @@ void thd_join(thd_t thd, void **ret) { - WaitForSingleObject(thd, INFINITE); + if (WaitForSingleObject(thd, INFINITE) == WAIT_OBJECT_0 && ret) + GetExitCodeThread(thd, (LPDWORD) ret); } #else -- cgit v0.12 From b54aef1d8cc16f7b3f295cf857842aa6d5844d46 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 28 May 2014 14:17:01 +0900 Subject: Fixup after 3a730df (Avoid pointer arithmetic on void*[...]) --- test/integration/rallocx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integration/rallocx.c b/test/integration/rallocx.c index e78e02f..b698072 100644 --- a/test/integration/rallocx.c +++ b/test/integration/rallocx.c @@ -95,7 +95,7 @@ TEST_BEGIN(test_zero) "Expected zeroed memory"); } if (psz != qsz) { - memset((void *)(uintptr_t)q+psz, FILL_BYTE, + memset((void *)((uintptr_t)q+psz), FILL_BYTE, qsz-psz); psz = qsz; } @@ -161,7 +161,7 @@ TEST_BEGIN(test_lg_align_and_zero) assert_false(validate_fill(q, 0, 0, MAX_VALIDATE), 
"Expected zeroed memory"); assert_false(validate_fill( - (void *)(uintptr_t)q+sz-MAX_VALIDATE, + (void *)((uintptr_t)q+sz-MAX_VALIDATE), 0, 0, MAX_VALIDATE), "Expected zeroed memory"); } p = q; -- cgit v0.12 From ccf046659a7c83e4e1573a1df30415144b4efdb6 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Wed, 7 May 2014 01:17:05 -0400 Subject: STATIC_PAGE_SHIFT for cross-compiling jemalloc Sets `STATIC_PAGE_SHIFT` for cross-compiling jemalloc to 12. A shift of 12 represents a page size of 4k for practically all platforms. --- configure.ac | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 04bb294..58f6289 100644 --- a/configure.ac +++ b/configure.ac @@ -968,7 +968,8 @@ AC_CACHE_CHECK([STATIC_PAGE_SHIFT], return 0; ]])], [je_cv_static_page_shift=`cat conftest.out`], - [je_cv_static_page_shift=undefined])) + [je_cv_static_page_shift=undefined], + [je_cv_static_page_shift=12])) if test "x$je_cv_static_page_shift" != "xundefined"; then AC_DEFINE_UNQUOTED([STATIC_PAGE_SHIFT], [$je_cv_static_page_shift]) -- cgit v0.12 From 26f44df742893306a53a90328e15a62ed11b9e57 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 28 May 2014 11:08:17 -0700 Subject: Make sure initialization occurs prior to running tests. --- test/src/test.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test/src/test.c b/test/src/test.c index 3acf845..17728ca 100644 --- a/test/src/test.c +++ b/test/src/test.c @@ -63,9 +63,22 @@ p_test_fini(void) test_status_t p_test(test_t *t, ...) { - test_status_t ret = test_status_pass; + test_status_t ret; va_list ap; + /* + * Make sure initialization occurs prior to running tests. Tests are + * special because they may use internal facilities prior to triggering + * initialization as a side effect of calling into the public API. This + * is a final safety that works even if jemalloc_constructor() doesn't + * run, as for MSVC builds. + */ + if (mallctl("version", NULL, NULL, NULL, 0) != 0) { + malloc_printf("Initialization error"); + return (test_status_fail); + } + + ret = test_status_pass; va_start(ap, t); for (; t != NULL; t = va_arg(ap, test_t *)) { t(); -- cgit v0.12 From 99118622ff5204feaabd2ee4109a7847ab388282 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 28 May 2014 11:23:01 -0700 Subject: Use nallocx() rather than mallctl() to trigger initialization. Use nallocx() rather than mallctl() to trigger initialization, because nallocx() has no side effects other than initialization, whereas mallctl() does a bunch of internal memory allocation. --- test/src/test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/src/test.c b/test/src/test.c index 17728ca..0f8bd49 100644 --- a/test/src/test.c +++ b/test/src/test.c @@ -73,7 +73,7 @@ p_test(test_t *t, ...) * is a final safety that works even if jemalloc_constructor() doesn't * run, as for MSVC builds. */ - if (mallctl("version", NULL, NULL, NULL, 0) != 0) { + if (nallocx(1, 0) == 0) { malloc_printf("Initialization error"); return (test_status_fail); } -- cgit v0.12 From d04047cc29bbc9d1f87a9346d1601e3dd87b6ca0 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 28 May 2014 16:11:55 -0700 Subject: Add size class computation capability. Add size class computation capability, currently used only as validation of the size class lookup tables. Generalize the size class spacing used for bins, for eventual use throughout the full range of allocation sizes. 
--- configure.ac | 23 ++ include/jemalloc/internal/arena.h | 137 ++++++++++- include/jemalloc/internal/jemalloc_internal.h.in | 4 +- .../jemalloc/internal/jemalloc_internal_defs.h.in | 5 + .../jemalloc/internal/jemalloc_internal_macros.h | 6 + include/jemalloc/internal/private_symbols.txt | 8 + include/jemalloc/internal/size_classes.sh | 261 ++++++++++++++++----- include/jemalloc/internal/util.h | 47 ++++ src/arena.c | 62 ++--- 9 files changed, 462 insertions(+), 91 deletions(-) diff --git a/configure.ac b/configure.ac index 58f6289..5852249 100644 --- a/configure.ac +++ b/configure.ac @@ -1201,6 +1201,29 @@ if test "x${je_cv_atomic9}" != "xyes" -a "x${je_cv_osatomic}" != "xyes" ; then fi dnl ============================================================================ +dnl Check for __builtin_clz() and __builtin_clzl(). + +AC_CACHE_CHECK([for __builtin_clz], + [je_cv_builtin_clz], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([], + [ + { + unsigned x = 0; + int y = __builtin_clz(x); + } + { + unsigned long x = 0; + int y = __builtin_clzl(x); + } + ])], + [je_cv_builtin_clz=yes], + [je_cv_builtin_clz=no])]) + +if test "x${je_cv_builtin_clz}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_BUILTIN_CLZ], [ ]) +fi + +dnl ============================================================================ dnl Check for spinlock(3) operations as provided on Darwin. JE_COMPILABLE([Darwin OSSpin*()], [ diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 598a89b..2dc9501 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -463,8 +463,15 @@ void arena_postfork_child(arena_t *arena); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +size_t small_size2bin_compute(size_t size); +size_t small_size2bin_lookup(size_t size); size_t small_size2bin(size_t size); +size_t small_bin2size_compute(size_t binind); +size_t small_bin2size_lookup(size_t binind); size_t small_bin2size(size_t binind); +size_t small_s2u_compute(size_t size); +size_t small_s2u_lookup(size_t size); +size_t small_s2u(size_t size); arena_chunk_map_t *arena_mapp_get(arena_chunk_t *chunk, size_t pageind); size_t *arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbitsp_read(size_t *mapbitsp); @@ -507,18 +514,144 @@ void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) # ifdef JEMALLOC_ARENA_INLINE_A +JEMALLOC_INLINE size_t +small_size2bin_compute(size_t size) +{ +#if (NTBINS != 0) + if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { + size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; + size_t lg_ceil = lg_floor(pow2_ceil(size)); + return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin); + } else +#endif + { + size_t x = lg_floor((size<<1)-1); + size_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 0 : + x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM); + size_t grp = shift << LG_SIZE_CLASS_GROUP; + + size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) + ? 
LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; + + size_t delta_inverse_mask = ZI(-1) << lg_delta; + size_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & + ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); + + size_t bin = NTBINS + grp + mod; + return (bin); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +small_size2bin_lookup(size_t size) +{ + + assert(size <= LOOKUP_MAXCLASS); + { + size_t ret = ((size_t)(small_size2bin_tab[(size-1) >> + LG_TINY_MIN])); + assert(ret == small_size2bin_compute(size)); + return (ret); + } +} + JEMALLOC_ALWAYS_INLINE size_t small_size2bin(size_t size) { - return ((size_t)(small_size2bin_tab[(size-1) >> LG_TINY_MIN])); + assert(size > 0); + if (size <= LOOKUP_MAXCLASS) + return (small_size2bin_lookup(size)); + else + return (small_size2bin_compute(size)); +} + +JEMALLOC_INLINE size_t +small_bin2size_compute(size_t binind) +{ +#if (NTBINS > 0) + if (binind < NTBINS) + return (ZU(1) << (LG_TINY_MAXCLASS - NTBINS + 1 + binind)); + else +#endif + { + size_t reduced_binind = binind - NTBINS; + size_t grp = reduced_binind >> LG_SIZE_CLASS_GROUP; + size_t mod = reduced_binind & ((ZU(1) << LG_SIZE_CLASS_GROUP) - + 1); + + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_QUANTUM + + (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask; + + size_t shift = (grp == 0) ? 1 : grp; + size_t lg_delta = shift + (LG_QUANTUM-1); + size_t mod_size = (mod+1) << lg_delta; + + size_t usize = grp_size + mod_size; + return (usize); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +small_bin2size_lookup(size_t binind) +{ + + assert(binind < NBINS); + { + size_t ret = ((size_t)(small_bin2size_tab[binind])); + assert(ret == small_bin2size_compute(binind)); + return (ret); + } } JEMALLOC_ALWAYS_INLINE size_t small_bin2size(size_t binind) { - return ((size_t)(small_bin2size_tab[binind])); + return (small_bin2size_lookup(binind)); +} + +JEMALLOC_ALWAYS_INLINE size_t +small_s2u_compute(size_t size) +{ +#if (NTBINS > 0) + if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { + size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; + size_t lg_ceil = lg_floor(pow2_ceil(size)); + return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : + (ZU(1) << lg_ceil)); + } else +#endif + { + size_t x = lg_floor((size<<1)-1); + size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) + ? 
LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (size + delta_mask) & ~delta_mask; + return (usize); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +small_s2u_lookup(size_t size) +{ + size_t ret = (small_bin2size(small_size2bin(size))); + + assert(ret == small_s2u_compute(size)); + return (ret); +} + +JEMALLOC_ALWAYS_INLINE size_t +small_s2u(size_t size) +{ + + assert(size > 0); + if (size <= LOOKUP_MAXCLASS) + return (small_s2u_lookup(size)); + else + return (small_s2u_compute(size)); } # endif /* JEMALLOC_ARENA_INLINE_A */ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index cf20f1f..491345c 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -475,7 +475,7 @@ s2u(size_t size) { if (size <= SMALL_MAXCLASS) - return (small_bin2size(small_size2bin(size))); + return (small_s2u(size)); if (size <= arena_maxclass) return (PAGE_CEILING(size)); return (CHUNK_CEILING(size)); @@ -518,7 +518,7 @@ sa2u(size_t size, size_t alignment) if (usize <= arena_maxclass && alignment <= PAGE) { if (usize <= SMALL_MAXCLASS) - return (small_bin2size(small_size2bin(usize))); + return (small_s2u(usize)); return (PAGE_CEILING(usize)); } else { size_t run_size; diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 09ddd4f..a9a50f1 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -48,6 +48,11 @@ #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8 /* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#undef JEMALLOC_HAVE_BUILTIN_CLZ + +/* * Defined if OSSpin*() functions are available, as provided by Darwin, and * documented in the spinlock(3) manual page. */ diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h index 4e23923..38e2886 100644 --- a/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/include/jemalloc/internal/jemalloc_internal_macros.h @@ -39,9 +39,15 @@ #endif #define ZU(z) ((size_t)z) +#define ZI(z) ((ssize_t)z) #define QU(q) ((uint64_t)q) #define QI(q) ((int64_t)q) +#define KZU(z) ZU(z##ULL) +#define KZI(z) ZI(z##ULL) +#define KQU(q) QU(q##ULL) +#define KQI(q) QI(q##ULL) + #ifndef __DECONST # define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) #endif diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index f6c4fbc..3401301 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -234,6 +234,7 @@ ixalloc jemalloc_postfork_child jemalloc_postfork_parent jemalloc_prefork +lg_floor malloc_cprintf malloc_mutex_init malloc_mutex_lock @@ -348,8 +349,15 @@ s2u sa2u set_errno small_bin2size +small_bin2size_compute +small_bin2size_lookup small_bin2size_tab +small_s2u +small_s2u_compute +small_s2u_lookup small_size2bin +small_size2bin_compute +small_size2bin_lookup small_size2bin_tab stats_cactive stats_cactive_add diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 960674a..3edebf2 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -2,16 +2,23 @@ # The following limits are chosen such that they cover all supported platforms. 
-# Range of quanta. -lg_qmin=3 -lg_qmax=4 +# Pointer sizes. +lg_zarr="2 3" + +# Quanta. +lg_qarr="3 4" # The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)]. lg_tmin=3 -# Range of page sizes. -lg_pmin=12 -lg_pmax=16 +# Maximum lookup size. +lg_kmax=12 + +# Page sizes. +lg_parr="12 13 16" + +# Size class group size (number of size classes for each size doubling). +lg_g=2 pow2() { e=$1 @@ -22,68 +29,206 @@ pow2() { done } +lg() { + x=$1 + lg_result=0 + while [ ${x} -gt 1 ] ; do + lg_result=$((${lg_result} + 1)) + x=$((${x} / 2)) + done +} + +size_class() { + index=$1 + lg_grp=$2 + lg_delta=$3 + ndelta=$4 + lg_p=$5 + lg_kmax=$6 + + lg ${ndelta}; lg_ndelta=${lg_result}; pow2 ${lg_ndelta} + if [ ${pow2_result} -lt ${ndelta} ] ; then + rem="yes" + else + rem="no" + fi + + lg_size=${lg_grp} + if [ $((${lg_delta} + ${lg_ndelta})) -eq ${lg_grp} ] ; then + lg_size=$((${lg_grp} + 1)) + else + lg_size=${lg_grp} + rem="yes" + fi + + if [ ${lg_size} -lt ${lg_p} ] ; then + bin="yes" + else + bin="no" + fi + if [ ${lg_size} -lt ${lg_kmax} \ + -o ${lg_size} -eq ${lg_kmax} -a ${rem} = "no" ] ; then + lg_delta_lookup=${lg_delta} + else + lg_delta_lookup="no" + fi + printf ' SC(%3d, %6d, %8d, %6d, %3s, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${bin} ${lg_delta_lookup} + # Defined upon return: + # - lg_delta_lookup (${lg_delta} or "no") + # - bin ("yes" or "no") +} + +sep_line() { + echo " \\" +} + +size_classes() { + lg_z=$1 + lg_q=$2 + lg_t=$3 + lg_p=$4 + lg_g=$5 + + pow2 $((${lg_z} + 3)); ptr_bits=${pow2_result} + pow2 ${lg_g}; g=${pow2_result} + + echo "#define SIZE_CLASSES \\" + echo " /* index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup */ \\" + + ntbins=0 + nlbins=0 + lg_tiny_maxclass='"NA"' + nbins=0 + + # Tiny size classes. + ndelta=0 + index=0 + lg_grp=${lg_t} + lg_delta=${lg_grp} + while [ ${lg_grp} -lt ${lg_q} ] ; do + size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} + if [ ${lg_delta_lookup} != "no" ] ; then + nlbins=$((${index} + 1)) + fi + if [ ${bin} != "no" ] ; then + nbins=$((${index} + 1)) + fi + ntbins=$((${ntbins} + 1)) + lg_tiny_maxclass=${lg_grp} # Final written value is correct. + index=$((${index} + 1)) + lg_delta=${lg_grp} + lg_grp=$((${lg_grp} + 1)) + done + + # First non-tiny group. + if [ ${ntbins} -gt 0 ] ; then + sep_line + # The first size class has an unusual encoding, because the size has to be + # split between grp and delta*ndelta. + lg_grp=$((${lg_grp} - 1)) + ndelta=1 + size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} + index=$((${index} + 1)) + lg_grp=$((${lg_grp} + 1)) + lg_delta=$((${lg_delta} + 1)) + fi + while [ ${ndelta} -lt ${g} ] ; do + size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} + index=$((${index} + 1)) + ndelta=$((${ndelta} + 1)) + done + + # All remaining groups. 
+ lg_grp=$((${lg_grp} + ${lg_g})) + while [ ${lg_grp} -lt ${ptr_bits} ] ; do + sep_line + ndelta=1 + if [ ${lg_grp} -eq $((${ptr_bits} - 1)) ] ; then + ndelta_limit=$((${g} - 1)) + else + ndelta_limit=${g} + fi + while [ ${ndelta} -le ${ndelta_limit} ] ; do + size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} + if [ ${lg_delta_lookup} != "no" ] ; then + nlbins=$((${index} + 1)) + # Final written value is correct: + lookup_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" + fi + if [ ${bin} != "no" ] ; then + nbins=$((${index} + 1)) + # Final written value is correct: + small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" + fi + index=$((${index} + 1)) + ndelta=$((${ndelta} + 1)) + done + lg_grp=$((${lg_grp} + 1)) + lg_delta=$((${lg_delta} + 1)) + done + echo + + # Defined upon completion: + # - ntbins + # - nlbins + # - nbins + # - lg_tiny_maxclass + # - lookup_maxclass + # - small_maxclass +} + cat <> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); +#if (LG_SIZEOF_PTR == 3 && LG_SIZEOF_PTR == LG_SIZEOF_LONG) + x |= (x >> 32); + return (65 - ffsl(~x)); +#elif (LG_SIZEOF_PTR == 2) + return (33 - ffs(~x)); +#else +# error "Unsupported type sizes for lg_floor()" +#endif +} +#endif + /* Sets error code */ JEMALLOC_INLINE void set_errno(int errnum) diff --git a/src/arena.c b/src/arena.c index f5d7d06..c392419 100644 --- a/src/arena.c +++ b/src/arena.c @@ -9,40 +9,39 @@ arena_bin_info_t arena_bin_info[NBINS]; JEMALLOC_ALIGNED(CACHELINE) const uint32_t small_bin2size_tab[NBINS] = { -#define SIZE_CLASS(bin, delta, size) \ +#define B2S_bin_yes(size) \ size, +#define B2S_bin_no(size) +#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ + B2S_bin_##bin((ZU(1)<reg_size = size; \ prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size);\ bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); +#define BIN_INFO_INIT_bin_no(index, size) +#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ + BIN_INFO_INIT_bin_##bin(index, (ZU(1)< Date: Wed, 28 May 2014 21:14:16 -0700 Subject: Use KQU() rather than QU() where applicable. Fix KZI() and KQI() to append LL rather than ULL. 
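[Illustration, not part of the patch: a minimal sketch of the affected macros as defined in jemalloc_internal_macros.h. The K* variants paste an integer suffix onto the literal before applying the cast-only macros, so 64-bit constants are written without a trailing ULL/LL at every use site, and the signed KQI()/KZI() forms now paste LL rather than ULL. The constant c1 comes from the hash.h hunk below; lo is an arbitrary value chosen for the example.]

#include <stdint.h>
#include <stdio.h>

/* Cast-only helpers and their suffix-pasting wrappers, as in the patch. */
#define QU(q)	((uint64_t)q)
#define QI(q)	((int64_t)q)
#define KQU(q)	QU(q##ULL)
#define KQI(q)	QI(q##LL)	/* signed: pastes LL, not ULL (the fix) */

int
main(void)
{
	/* KQU(0x87c37b91114253d5) expands to ((uint64_t)0x87c37b91114253d5ULL). */
	uint64_t c1 = KQU(0x87c37b91114253d5);
	int64_t lo = KQI(0x7fffffffffffffff);

	printf("%llx %lld\n", (unsigned long long)c1, (long long)lo);
	return (0);
}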
--- include/jemalloc/internal/hash.h | 8 +- .../jemalloc/internal/jemalloc_internal_macros.h | 4 +- test/src/SFMT.c | 2 +- test/unit/SFMT.c | 2000 ++++++++++---------- test/unit/util.c | 90 +- 5 files changed, 1052 insertions(+), 1052 deletions(-) diff --git a/include/jemalloc/internal/hash.h b/include/jemalloc/internal/hash.h index f2b3a16..a43bbbe 100644 --- a/include/jemalloc/internal/hash.h +++ b/include/jemalloc/internal/hash.h @@ -76,9 +76,9 @@ hash_fmix_64(uint64_t k) { k ^= k >> 33; - k *= QU(0xff51afd7ed558ccdULL); + k *= KQU(0xff51afd7ed558ccd); k ^= k >> 33; - k *= QU(0xc4ceb9fe1a85ec53ULL); + k *= KQU(0xc4ceb9fe1a85ec53); k ^= k >> 33; return (k); @@ -247,8 +247,8 @@ hash_x64_128(const void *key, const int len, const uint32_t seed, uint64_t h1 = seed; uint64_t h2 = seed; - const uint64_t c1 = QU(0x87c37b91114253d5ULL); - const uint64_t c2 = QU(0x4cf5ad432745937fULL); + const uint64_t c1 = KQU(0x87c37b91114253d5); + const uint64_t c2 = KQU(0x4cf5ad432745937f); /* body */ { diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h index 38e2886..a08ba77 100644 --- a/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/include/jemalloc/internal/jemalloc_internal_macros.h @@ -44,9 +44,9 @@ #define QI(q) ((int64_t)q) #define KZU(z) ZU(z##ULL) -#define KZI(z) ZI(z##ULL) +#define KZI(z) ZI(z##LL) #define KQU(q) QU(q##ULL) -#define KQI(q) QI(q##ULL) +#define KQI(q) QI(q##LL) #ifndef __DECONST # define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) diff --git a/test/src/SFMT.c b/test/src/SFMT.c index d2cc9d1..22a5ac5 100644 --- a/test/src/SFMT.c +++ b/test/src/SFMT.c @@ -511,7 +511,7 @@ uint64_t gen_rand64(sfmt_t *ctx) { uint64_t gen_rand64_range(sfmt_t *ctx, uint64_t limit) { uint64_t ret, above; - above = 0xffffffffffffffffULL - (0xffffffffffffffffULL % limit); + above = KQU(0xffffffffffffffff) - (KQU(0xffffffffffffffff) % limit); while (1) { ret = gen_rand64(ctx); if (ret < above) { diff --git a/test/unit/SFMT.c b/test/unit/SFMT.c index 0ad9c23..88b31f6 100644 --- a/test/unit/SFMT.c +++ b/test/unit/SFMT.c @@ -445,1008 +445,1008 @@ static const uint32_t init_by_array_32_expected[] = { 2750138839U, 3518055702U, 733072558U, 4169325400U, 788493625U }; static const uint64_t init_gen_rand_64_expected[] = { - QU(16924766246869039260ULL), QU( 8201438687333352714ULL), - QU( 2265290287015001750ULL), QU(18397264611805473832ULL), - QU( 3375255223302384358ULL), QU( 6345559975416828796ULL), - QU(18229739242790328073ULL), QU( 7596792742098800905ULL), - QU( 255338647169685981ULL), QU( 2052747240048610300ULL), - QU(18328151576097299343ULL), QU(12472905421133796567ULL), - QU(11315245349717600863ULL), QU(16594110197775871209ULL), - QU(15708751964632456450ULL), QU(10452031272054632535ULL), - QU(11097646720811454386ULL), QU( 4556090668445745441ULL), - QU(17116187693090663106ULL), QU(14931526836144510645ULL), - QU( 9190752218020552591ULL), QU( 9625800285771901401ULL), - QU(13995141077659972832ULL), QU( 5194209094927829625ULL), - QU( 4156788379151063303ULL), QU( 8523452593770139494ULL), - QU(14082382103049296727ULL), QU( 2462601863986088483ULL), - QU( 3030583461592840678ULL), QU( 5221622077872827681ULL), - QU( 3084210671228981236ULL), QU(13956758381389953823ULL), - QU(13503889856213423831ULL), QU(15696904024189836170ULL), - QU( 4612584152877036206ULL), QU( 6231135538447867881ULL), - QU(10172457294158869468ULL), QU( 6452258628466708150ULL), - QU(14044432824917330221ULL), QU( 370168364480044279ULL), - 
QU(10102144686427193359ULL), QU( 667870489994776076ULL), - QU( 2732271956925885858ULL), QU(18027788905977284151ULL), - QU(15009842788582923859ULL), QU( 7136357960180199542ULL), - QU(15901736243475578127ULL), QU(16951293785352615701ULL), - QU(10551492125243691632ULL), QU(17668869969146434804ULL), - QU(13646002971174390445ULL), QU( 9804471050759613248ULL), - QU( 5511670439655935493ULL), QU(18103342091070400926ULL), - QU(17224512747665137533ULL), QU(15534627482992618168ULL), - QU( 1423813266186582647ULL), QU(15821176807932930024ULL), - QU( 30323369733607156ULL), QU(11599382494723479403ULL), - QU( 653856076586810062ULL), QU( 3176437395144899659ULL), - QU(14028076268147963917ULL), QU(16156398271809666195ULL), - QU( 3166955484848201676ULL), QU( 5746805620136919390ULL), - QU(17297845208891256593ULL), QU(11691653183226428483ULL), - QU(17900026146506981577ULL), QU(15387382115755971042ULL), - QU(16923567681040845943ULL), QU( 8039057517199388606ULL), - QU(11748409241468629263ULL), QU( 794358245539076095ULL), - QU(13438501964693401242ULL), QU(14036803236515618962ULL), - QU( 5252311215205424721ULL), QU(17806589612915509081ULL), - QU( 6802767092397596006ULL), QU(14212120431184557140ULL), - QU( 1072951366761385712ULL), QU(13098491780722836296ULL), - QU( 9466676828710797353ULL), QU(12673056849042830081ULL), - QU(12763726623645357580ULL), QU(16468961652999309493ULL), - QU(15305979875636438926ULL), QU(17444713151223449734ULL), - QU( 5692214267627883674ULL), QU(13049589139196151505ULL), - QU( 880115207831670745ULL), QU( 1776529075789695498ULL), - QU(16695225897801466485ULL), QU(10666901778795346845ULL), - QU( 6164389346722833869ULL), QU( 2863817793264300475ULL), - QU( 9464049921886304754ULL), QU( 3993566636740015468ULL), - QU( 9983749692528514136ULL), QU(16375286075057755211ULL), - QU(16042643417005440820ULL), QU(11445419662923489877ULL), - QU( 7999038846885158836ULL), QU( 6721913661721511535ULL), - QU( 5363052654139357320ULL), QU( 1817788761173584205ULL), - QU(13290974386445856444ULL), QU( 4650350818937984680ULL), - QU( 8219183528102484836ULL), QU( 1569862923500819899ULL), - QU( 4189359732136641860ULL), QU(14202822961683148583ULL), - QU( 4457498315309429058ULL), QU(13089067387019074834ULL), - QU(11075517153328927293ULL), QU(10277016248336668389ULL), - QU( 7070509725324401122ULL), QU(17808892017780289380ULL), - QU(13143367339909287349ULL), QU( 1377743745360085151ULL), - QU( 5749341807421286485ULL), QU(14832814616770931325ULL), - QU( 7688820635324359492ULL), QU(10960474011539770045ULL), - QU( 81970066653179790ULL), QU(12619476072607878022ULL), - QU( 4419566616271201744ULL), QU(15147917311750568503ULL), - QU( 5549739182852706345ULL), QU( 7308198397975204770ULL), - QU(13580425496671289278ULL), QU(17070764785210130301ULL), - QU( 8202832846285604405ULL), QU( 6873046287640887249ULL), - QU( 6927424434308206114ULL), QU( 6139014645937224874ULL), - QU(10290373645978487639ULL), QU(15904261291701523804ULL), - QU( 9628743442057826883ULL), QU(18383429096255546714ULL), - QU( 4977413265753686967ULL), QU( 7714317492425012869ULL), - QU( 9025232586309926193ULL), QU(14627338359776709107ULL), - QU(14759849896467790763ULL), QU(10931129435864423252ULL), - QU( 4588456988775014359ULL), QU(10699388531797056724ULL), - QU( 468652268869238792ULL), QU( 5755943035328078086ULL), - QU( 2102437379988580216ULL), QU( 9986312786506674028ULL), - QU( 2654207180040945604ULL), QU( 8726634790559960062ULL), - QU( 100497234871808137ULL), QU( 2800137176951425819ULL), - QU( 6076627612918553487ULL), QU( 5780186919186152796ULL), - QU( 
8179183595769929098ULL), QU( 6009426283716221169ULL), - QU( 2796662551397449358ULL), QU( 1756961367041986764ULL), - QU( 6972897917355606205ULL), QU(14524774345368968243ULL), - QU( 2773529684745706940ULL), QU( 4853632376213075959ULL), - QU( 4198177923731358102ULL), QU( 8271224913084139776ULL), - QU( 2741753121611092226ULL), QU(16782366145996731181ULL), - QU(15426125238972640790ULL), QU(13595497100671260342ULL), - QU( 3173531022836259898ULL), QU( 6573264560319511662ULL), - QU(18041111951511157441ULL), QU( 2351433581833135952ULL), - QU( 3113255578908173487ULL), QU( 1739371330877858784ULL), - QU(16046126562789165480ULL), QU( 8072101652214192925ULL), - QU(15267091584090664910ULL), QU( 9309579200403648940ULL), - QU( 5218892439752408722ULL), QU(14492477246004337115ULL), - QU(17431037586679770619ULL), QU( 7385248135963250480ULL), - QU( 9580144956565560660ULL), QU( 4919546228040008720ULL), - QU(15261542469145035584ULL), QU(18233297270822253102ULL), - QU( 5453248417992302857ULL), QU( 9309519155931460285ULL), - QU(10342813012345291756ULL), QU(15676085186784762381ULL), - QU(15912092950691300645ULL), QU( 9371053121499003195ULL), - QU( 9897186478226866746ULL), QU(14061858287188196327ULL), - QU( 122575971620788119ULL), QU(12146750969116317754ULL), - QU( 4438317272813245201ULL), QU( 8332576791009527119ULL), - QU(13907785691786542057ULL), QU(10374194887283287467ULL), - QU( 2098798755649059566ULL), QU( 3416235197748288894ULL), - QU( 8688269957320773484ULL), QU( 7503964602397371571ULL), - QU(16724977015147478236ULL), QU( 9461512855439858184ULL), - QU(13259049744534534727ULL), QU( 3583094952542899294ULL), - QU( 8764245731305528292ULL), QU(13240823595462088985ULL), - QU(13716141617617910448ULL), QU(18114969519935960955ULL), - QU( 2297553615798302206ULL), QU( 4585521442944663362ULL), - QU(17776858680630198686ULL), QU( 4685873229192163363ULL), - QU( 152558080671135627ULL), QU(15424900540842670088ULL), - QU(13229630297130024108ULL), QU(17530268788245718717ULL), - QU(16675633913065714144ULL), QU( 3158912717897568068ULL), - QU(15399132185380087288ULL), QU( 7401418744515677872ULL), - QU(13135412922344398535ULL), QU( 6385314346100509511ULL), - QU(13962867001134161139ULL), QU(10272780155442671999ULL), - QU(12894856086597769142ULL), QU(13340877795287554994ULL), - QU(12913630602094607396ULL), QU(12543167911119793857ULL), - QU(17343570372251873096ULL), QU(10959487764494150545ULL), - QU( 6966737953093821128ULL), QU(13780699135496988601ULL), - QU( 4405070719380142046ULL), QU(14923788365607284982ULL), - QU( 2869487678905148380ULL), QU( 6416272754197188403ULL), - QU(15017380475943612591ULL), QU( 1995636220918429487ULL), - QU( 3402016804620122716ULL), QU(15800188663407057080ULL), - QU(11362369990390932882ULL), QU(15262183501637986147ULL), - QU(10239175385387371494ULL), QU( 9352042420365748334ULL), - QU( 1682457034285119875ULL), QU( 1724710651376289644ULL), - QU( 2038157098893817966ULL), QU( 9897825558324608773ULL), - QU( 1477666236519164736ULL), QU(16835397314511233640ULL), - QU(10370866327005346508ULL), QU(10157504370660621982ULL), - QU(12113904045335882069ULL), QU(13326444439742783008ULL), - QU(11302769043000765804ULL), QU(13594979923955228484ULL), - QU(11779351762613475968ULL), QU( 3786101619539298383ULL), - QU( 8021122969180846063ULL), QU(15745904401162500495ULL), - QU(10762168465993897267ULL), QU(13552058957896319026ULL), - QU(11200228655252462013ULL), QU( 5035370357337441226ULL), - QU( 7593918984545500013ULL), QU( 5418554918361528700ULL), - QU( 4858270799405446371ULL), QU( 9974659566876282544ULL), - 
QU(18227595922273957859ULL), QU( 2772778443635656220ULL), - QU(14285143053182085385ULL), QU( 9939700992429600469ULL), - QU(12756185904545598068ULL), QU( 2020783375367345262ULL), - QU( 57026775058331227ULL), QU( 950827867930065454ULL), - QU( 6602279670145371217ULL), QU( 2291171535443566929ULL), - QU( 5832380724425010313ULL), QU( 1220343904715982285ULL), - QU(17045542598598037633ULL), QU(15460481779702820971ULL), - QU(13948388779949365130ULL), QU(13975040175430829518ULL), - QU(17477538238425541763ULL), QU(11104663041851745725ULL), - QU(15860992957141157587ULL), QU(14529434633012950138ULL), - QU( 2504838019075394203ULL), QU( 7512113882611121886ULL), - QU( 4859973559980886617ULL), QU( 1258601555703250219ULL), - QU(15594548157514316394ULL), QU( 4516730171963773048ULL), - QU(11380103193905031983ULL), QU( 6809282239982353344ULL), - QU(18045256930420065002ULL), QU( 2453702683108791859ULL), - QU( 977214582986981460ULL), QU( 2006410402232713466ULL), - QU( 6192236267216378358ULL), QU( 3429468402195675253ULL), - QU(18146933153017348921ULL), QU(17369978576367231139ULL), - QU( 1246940717230386603ULL), QU(11335758870083327110ULL), - QU(14166488801730353682ULL), QU( 9008573127269635732ULL), - QU(10776025389820643815ULL), QU(15087605441903942962ULL), - QU( 1359542462712147922ULL), QU(13898874411226454206ULL), - QU(17911176066536804411ULL), QU( 9435590428600085274ULL), - QU( 294488509967864007ULL), QU( 8890111397567922046ULL), - QU( 7987823476034328778ULL), QU(13263827582440967651ULL), - QU( 7503774813106751573ULL), QU(14974747296185646837ULL), - QU( 8504765037032103375ULL), QU(17340303357444536213ULL), - QU( 7704610912964485743ULL), QU( 8107533670327205061ULL), - QU( 9062969835083315985ULL), QU(16968963142126734184ULL), - QU(12958041214190810180ULL), QU( 2720170147759570200ULL), - QU( 2986358963942189566ULL), QU(14884226322219356580ULL), - QU( 286224325144368520ULL), QU(11313800433154279797ULL), - QU(18366849528439673248ULL), QU(17899725929482368789ULL), - QU( 3730004284609106799ULL), QU( 1654474302052767205ULL), - QU( 5006698007047077032ULL), QU( 8196893913601182838ULL), - QU(15214541774425211640ULL), QU(17391346045606626073ULL), - QU( 8369003584076969089ULL), QU( 3939046733368550293ULL), - QU(10178639720308707785ULL), QU( 2180248669304388697ULL), - QU( 62894391300126322ULL), QU( 9205708961736223191ULL), - QU( 6837431058165360438ULL), QU( 3150743890848308214ULL), - QU(17849330658111464583ULL), QU(12214815643135450865ULL), - QU(13410713840519603402ULL), QU( 3200778126692046802ULL), - QU(13354780043041779313ULL), QU( 800850022756886036ULL), - QU(15660052933953067433ULL), QU( 6572823544154375676ULL), - QU(11030281857015819266ULL), QU(12682241941471433835ULL), - QU(11654136407300274693ULL), QU( 4517795492388641109ULL), - QU( 9757017371504524244ULL), QU(17833043400781889277ULL), - QU(12685085201747792227ULL), QU(10408057728835019573ULL), - QU( 98370418513455221ULL), QU( 6732663555696848598ULL), - QU(13248530959948529780ULL), QU( 3530441401230622826ULL), - QU(18188251992895660615ULL), QU( 1847918354186383756ULL), - QU( 1127392190402660921ULL), QU(11293734643143819463ULL), - QU( 3015506344578682982ULL), QU(13852645444071153329ULL), - QU( 2121359659091349142ULL), QU( 1294604376116677694ULL), - QU( 5616576231286352318ULL), QU( 7112502442954235625ULL), - QU(11676228199551561689ULL), QU(12925182803007305359ULL), - QU( 7852375518160493082ULL), QU( 1136513130539296154ULL), - QU( 5636923900916593195ULL), QU( 3221077517612607747ULL), - QU(17784790465798152513ULL), QU( 3554210049056995938ULL), - 
QU(17476839685878225874ULL), QU( 3206836372585575732ULL), - QU( 2765333945644823430ULL), QU(10080070903718799528ULL), - QU( 5412370818878286353ULL), QU( 9689685887726257728ULL), - QU( 8236117509123533998ULL), QU( 1951139137165040214ULL), - QU( 4492205209227980349ULL), QU(16541291230861602967ULL), - QU( 1424371548301437940ULL), QU( 9117562079669206794ULL), - QU(14374681563251691625ULL), QU(13873164030199921303ULL), - QU( 6680317946770936731ULL), QU(15586334026918276214ULL), - QU(10896213950976109802ULL), QU( 9506261949596413689ULL), - QU( 9903949574308040616ULL), QU( 6038397344557204470ULL), - QU( 174601465422373648ULL), QU(15946141191338238030ULL), - QU(17142225620992044937ULL), QU( 7552030283784477064ULL), - QU( 2947372384532947997ULL), QU( 510797021688197711ULL), - QU( 4962499439249363461ULL), QU( 23770320158385357ULL), - QU( 959774499105138124ULL), QU( 1468396011518788276ULL), - QU( 2015698006852312308ULL), QU( 4149400718489980136ULL), - QU( 5992916099522371188ULL), QU(10819182935265531076ULL), - QU(16189787999192351131ULL), QU( 342833961790261950ULL), - QU(12470830319550495336ULL), QU(18128495041912812501ULL), - QU( 1193600899723524337ULL), QU( 9056793666590079770ULL), - QU( 2154021227041669041ULL), QU( 4963570213951235735ULL), - QU( 4865075960209211409ULL), QU( 2097724599039942963ULL), - QU( 2024080278583179845ULL), QU(11527054549196576736ULL), - QU(10650256084182390252ULL), QU( 4808408648695766755ULL), - QU( 1642839215013788844ULL), QU(10607187948250398390ULL), - QU( 7076868166085913508ULL), QU( 730522571106887032ULL), - QU(12500579240208524895ULL), QU( 4484390097311355324ULL), - QU(15145801330700623870ULL), QU( 8055827661392944028ULL), - QU( 5865092976832712268ULL), QU(15159212508053625143ULL), - QU( 3560964582876483341ULL), QU( 4070052741344438280ULL), - QU( 6032585709886855634ULL), QU(15643262320904604873ULL), - QU( 2565119772293371111ULL), QU( 318314293065348260ULL), - QU(15047458749141511872ULL), QU( 7772788389811528730ULL), - QU( 7081187494343801976ULL), QU( 6465136009467253947ULL), - QU(10425940692543362069ULL), QU( 554608190318339115ULL), - QU(14796699860302125214ULL), QU( 1638153134431111443ULL), - QU(10336967447052276248ULL), QU( 8412308070396592958ULL), - QU( 4004557277152051226ULL), QU( 8143598997278774834ULL), - QU(16413323996508783221ULL), QU(13139418758033994949ULL), - QU( 9772709138335006667ULL), QU( 2818167159287157659ULL), - QU(17091740573832523669ULL), QU(14629199013130751608ULL), - QU(18268322711500338185ULL), QU( 8290963415675493063ULL), - QU( 8830864907452542588ULL), QU( 1614839084637494849ULL), - QU(14855358500870422231ULL), QU( 3472996748392519937ULL), - QU(15317151166268877716ULL), QU( 5825895018698400362ULL), - QU(16730208429367544129ULL), QU(10481156578141202800ULL), - QU( 4746166512382823750ULL), QU(12720876014472464998ULL), - QU( 8825177124486735972ULL), QU(13733447296837467838ULL), - QU( 6412293741681359625ULL), QU( 8313213138756135033ULL), - QU(11421481194803712517ULL), QU( 7997007691544174032ULL), - QU( 6812963847917605930ULL), QU( 9683091901227558641ULL), - QU(14703594165860324713ULL), QU( 1775476144519618309ULL), - QU( 2724283288516469519ULL), QU( 717642555185856868ULL), - QU( 8736402192215092346ULL), QU(11878800336431381021ULL), - QU( 4348816066017061293ULL), QU( 6115112756583631307ULL), - QU( 9176597239667142976ULL), QU(12615622714894259204ULL), - QU(10283406711301385987ULL), QU( 5111762509485379420ULL), - QU( 3118290051198688449ULL), QU( 7345123071632232145ULL), - QU( 9176423451688682359ULL), QU( 4843865456157868971ULL), - 
QU(12008036363752566088ULL), QU(12058837181919397720ULL), - QU( 2145073958457347366ULL), QU( 1526504881672818067ULL), - QU( 3488830105567134848ULL), QU(13208362960674805143ULL), - QU( 4077549672899572192ULL), QU( 7770995684693818365ULL), - QU( 1398532341546313593ULL), QU(12711859908703927840ULL), - QU( 1417561172594446813ULL), QU(17045191024194170604ULL), - QU( 4101933177604931713ULL), QU(14708428834203480320ULL), - QU(17447509264469407724ULL), QU(14314821973983434255ULL), - QU(17990472271061617265ULL), QU( 5087756685841673942ULL), - QU(12797820586893859939ULL), QU( 1778128952671092879ULL), - QU( 3535918530508665898ULL), QU( 9035729701042481301ULL), - QU(14808661568277079962ULL), QU(14587345077537747914ULL), - QU(11920080002323122708ULL), QU( 6426515805197278753ULL), - QU( 3295612216725984831ULL), QU(11040722532100876120ULL), - QU(12305952936387598754ULL), QU(16097391899742004253ULL), - QU( 4908537335606182208ULL), QU(12446674552196795504ULL), - QU(16010497855816895177ULL), QU( 9194378874788615551ULL), - QU( 3382957529567613384ULL), QU( 5154647600754974077ULL), - QU( 9801822865328396141ULL), QU( 9023662173919288143ULL), - QU(17623115353825147868ULL), QU( 8238115767443015816ULL), - QU(15811444159859002560ULL), QU( 9085612528904059661ULL), - QU( 6888601089398614254ULL), QU( 258252992894160189ULL), - QU( 6704363880792428622ULL), QU( 6114966032147235763ULL), - QU(11075393882690261875ULL), QU( 8797664238933620407ULL), - QU( 5901892006476726920ULL), QU( 5309780159285518958ULL), - QU(14940808387240817367ULL), QU(14642032021449656698ULL), - QU( 9808256672068504139ULL), QU( 3670135111380607658ULL), - QU(11211211097845960152ULL), QU( 1474304506716695808ULL), - QU(15843166204506876239ULL), QU( 7661051252471780561ULL), - QU(10170905502249418476ULL), QU( 7801416045582028589ULL), - QU( 2763981484737053050ULL), QU( 9491377905499253054ULL), - QU(16201395896336915095ULL), QU( 9256513756442782198ULL), - QU( 5411283157972456034ULL), QU( 5059433122288321676ULL), - QU( 4327408006721123357ULL), QU( 9278544078834433377ULL), - QU( 7601527110882281612ULL), QU(11848295896975505251ULL), - QU(12096998801094735560ULL), QU(14773480339823506413ULL), - QU(15586227433895802149ULL), QU(12786541257830242872ULL), - QU( 6904692985140503067ULL), QU( 5309011515263103959ULL), - QU(12105257191179371066ULL), QU(14654380212442225037ULL), - QU( 2556774974190695009ULL), QU( 4461297399927600261ULL), - QU(14888225660915118646ULL), QU(14915459341148291824ULL), - QU( 2738802166252327631ULL), QU( 6047155789239131512ULL), - QU(12920545353217010338ULL), QU(10697617257007840205ULL), - QU( 2751585253158203504ULL), QU(13252729159780047496ULL), - QU(14700326134672815469ULL), QU(14082527904374600529ULL), - QU(16852962273496542070ULL), QU(17446675504235853907ULL), - QU(15019600398527572311ULL), QU(12312781346344081551ULL), - QU(14524667935039810450ULL), QU( 5634005663377195738ULL), - QU(11375574739525000569ULL), QU( 2423665396433260040ULL), - QU( 5222836914796015410ULL), QU( 4397666386492647387ULL), - QU( 4619294441691707638ULL), QU( 665088602354770716ULL), - QU(13246495665281593610ULL), QU( 6564144270549729409ULL), - QU(10223216188145661688ULL), QU( 3961556907299230585ULL), - QU(11543262515492439914ULL), QU(16118031437285993790ULL), - QU( 7143417964520166465ULL), QU(13295053515909486772ULL), - QU( 40434666004899675ULL), QU(17127804194038347164ULL), - QU( 8599165966560586269ULL), QU( 8214016749011284903ULL), - QU(13725130352140465239ULL), QU( 5467254474431726291ULL), - QU( 7748584297438219877ULL), QU(16933551114829772472ULL), - QU( 
2169618439506799400ULL), QU( 2169787627665113463ULL), - QU(17314493571267943764ULL), QU(18053575102911354912ULL), - QU(11928303275378476973ULL), QU(11593850925061715550ULL), - QU(17782269923473589362ULL), QU( 3280235307704747039ULL), - QU( 6145343578598685149ULL), QU(17080117031114086090ULL), - QU(18066839902983594755ULL), QU( 6517508430331020706ULL), - QU( 8092908893950411541ULL), QU(12558378233386153732ULL), - QU( 4476532167973132976ULL), QU(16081642430367025016ULL), - QU( 4233154094369139361ULL), QU( 8693630486693161027ULL), - QU(11244959343027742285ULL), QU(12273503967768513508ULL), - QU(14108978636385284876ULL), QU( 7242414665378826984ULL), - QU( 6561316938846562432ULL), QU( 8601038474994665795ULL), - QU(17532942353612365904ULL), QU(17940076637020912186ULL), - QU( 7340260368823171304ULL), QU( 7061807613916067905ULL), - QU(10561734935039519326ULL), QU(17990796503724650862ULL), - QU( 6208732943911827159ULL), QU( 359077562804090617ULL), - QU(14177751537784403113ULL), QU(10659599444915362902ULL), - QU(15081727220615085833ULL), QU(13417573895659757486ULL), - QU(15513842342017811524ULL), QU(11814141516204288231ULL), - QU( 1827312513875101814ULL), QU( 2804611699894603103ULL), - QU(17116500469975602763ULL), QU(12270191815211952087ULL), - QU(12256358467786024988ULL), QU(18435021722453971267ULL), - QU( 671330264390865618ULL), QU( 476504300460286050ULL), - QU(16465470901027093441ULL), QU( 4047724406247136402ULL), - QU( 1322305451411883346ULL), QU( 1388308688834322280ULL), - QU( 7303989085269758176ULL), QU( 9323792664765233642ULL), - QU( 4542762575316368936ULL), QU(17342696132794337618ULL), - QU( 4588025054768498379ULL), QU(13415475057390330804ULL), - QU(17880279491733405570ULL), QU(10610553400618620353ULL), - QU( 3180842072658960139ULL), QU(13002966655454270120ULL), - QU( 1665301181064982826ULL), QU( 7083673946791258979ULL), - QU( 190522247122496820ULL), QU(17388280237250677740ULL), - QU( 8430770379923642945ULL), QU(12987180971921668584ULL), - QU( 2311086108365390642ULL), QU( 2870984383579822345ULL), - QU(14014682609164653318ULL), QU(14467187293062251484ULL), - QU( 192186361147413298ULL), QU(15171951713531796524ULL), - QU( 9900305495015948728ULL), QU(17958004775615466344ULL), - QU(14346380954498606514ULL), QU(18040047357617407096ULL), - QU( 5035237584833424532ULL), QU(15089555460613972287ULL), - QU( 4131411873749729831ULL), QU( 1329013581168250330ULL), - QU(10095353333051193949ULL), QU(10749518561022462716ULL), - QU( 9050611429810755847ULL), QU(15022028840236655649ULL), - QU( 8775554279239748298ULL), QU(13105754025489230502ULL), - QU(15471300118574167585ULL), QU( 89864764002355628ULL), - QU( 8776416323420466637ULL), QU( 5280258630612040891ULL), - QU( 2719174488591862912ULL), QU( 7599309137399661994ULL), - QU(15012887256778039979ULL), QU(14062981725630928925ULL), - QU(12038536286991689603ULL), QU( 7089756544681775245ULL), - QU(10376661532744718039ULL), QU( 1265198725901533130ULL), - QU(13807996727081142408ULL), QU( 2935019626765036403ULL), - QU( 7651672460680700141ULL), QU( 3644093016200370795ULL), - QU( 2840982578090080674ULL), QU(17956262740157449201ULL), - QU(18267979450492880548ULL), QU(11799503659796848070ULL), - QU( 9942537025669672388ULL), QU(11886606816406990297ULL), - QU( 5488594946437447576ULL), QU( 7226714353282744302ULL), - QU( 3784851653123877043ULL), QU( 878018453244803041ULL), - QU(12110022586268616085ULL), QU( 734072179404675123ULL), - QU(11869573627998248542ULL), QU( 469150421297783998ULL), - QU( 260151124912803804ULL), QU(11639179410120968649ULL), - QU( 9318165193840846253ULL), 
QU(12795671722734758075ULL), - QU(15318410297267253933ULL), QU( 691524703570062620ULL), - QU( 5837129010576994601ULL), QU(15045963859726941052ULL), - QU( 5850056944932238169ULL), QU(12017434144750943807ULL), - QU( 7447139064928956574ULL), QU( 3101711812658245019ULL), - QU(16052940704474982954ULL), QU(18195745945986994042ULL), - QU( 8932252132785575659ULL), QU(13390817488106794834ULL), - QU(11582771836502517453ULL), QU( 4964411326683611686ULL), - QU( 2195093981702694011ULL), QU(14145229538389675669ULL), - QU(16459605532062271798ULL), QU( 866316924816482864ULL), - QU( 4593041209937286377ULL), QU( 8415491391910972138ULL), - QU( 4171236715600528969ULL), QU(16637569303336782889ULL), - QU( 2002011073439212680ULL), QU(17695124661097601411ULL), - QU( 4627687053598611702ULL), QU( 7895831936020190403ULL), - QU( 8455951300917267802ULL), QU( 2923861649108534854ULL), - QU( 8344557563927786255ULL), QU( 6408671940373352556ULL), - QU(12210227354536675772ULL), QU(14294804157294222295ULL), - QU(10103022425071085127ULL), QU(10092959489504123771ULL), - QU( 6554774405376736268ULL), QU(12629917718410641774ULL), - QU( 6260933257596067126ULL), QU( 2460827021439369673ULL), - QU( 2541962996717103668ULL), QU( 597377203127351475ULL), - QU( 5316984203117315309ULL), QU( 4811211393563241961ULL), - QU(13119698597255811641ULL), QU( 8048691512862388981ULL), - QU(10216818971194073842ULL), QU( 4612229970165291764ULL), - QU(10000980798419974770ULL), QU( 6877640812402540687ULL), - QU( 1488727563290436992ULL), QU( 2227774069895697318ULL), - QU(11237754507523316593ULL), QU(13478948605382290972ULL), - QU( 1963583846976858124ULL), QU( 5512309205269276457ULL), - QU( 3972770164717652347ULL), QU( 3841751276198975037ULL), - QU(10283343042181903117ULL), QU( 8564001259792872199ULL), - QU(16472187244722489221ULL), QU( 8953493499268945921ULL), - QU( 3518747340357279580ULL), QU( 4003157546223963073ULL), - QU( 3270305958289814590ULL), QU( 3966704458129482496ULL), - QU( 8122141865926661939ULL), QU(14627734748099506653ULL), - QU(13064426990862560568ULL), QU( 2414079187889870829ULL), - QU( 5378461209354225306ULL), QU(10841985740128255566ULL), - QU( 538582442885401738ULL), QU( 7535089183482905946ULL), - QU(16117559957598879095ULL), QU( 8477890721414539741ULL), - QU( 1459127491209533386ULL), QU(17035126360733620462ULL), - QU( 8517668552872379126ULL), QU(10292151468337355014ULL), - QU(17081267732745344157ULL), QU(13751455337946087178ULL), - QU(14026945459523832966ULL), QU( 6653278775061723516ULL), - QU(10619085543856390441ULL), QU( 2196343631481122885ULL), - QU(10045966074702826136ULL), QU(10082317330452718282ULL), - QU( 5920859259504831242ULL), QU( 9951879073426540617ULL), - QU( 7074696649151414158ULL), QU(15808193543879464318ULL), - QU( 7385247772746953374ULL), QU( 3192003544283864292ULL), - QU(18153684490917593847ULL), QU(12423498260668568905ULL), - QU(10957758099756378169ULL), QU(11488762179911016040ULL), - QU( 2099931186465333782ULL), QU(11180979581250294432ULL), - QU( 8098916250668367933ULL), QU( 3529200436790763465ULL), - QU(12988418908674681745ULL), QU( 6147567275954808580ULL), - QU( 3207503344604030989ULL), QU(10761592604898615360ULL), - QU( 229854861031893504ULL), QU( 8809853962667144291ULL), - QU(13957364469005693860ULL), QU( 7634287665224495886ULL), - QU(12353487366976556874ULL), QU( 1134423796317152034ULL), - QU( 2088992471334107068ULL), QU( 7393372127190799698ULL), - QU( 1845367839871058391ULL), QU( 207922563987322884ULL), - QU(11960870813159944976ULL), QU(12182120053317317363ULL), - QU(17307358132571709283ULL), 
QU(13871081155552824936ULL), - QU(18304446751741566262ULL), QU( 7178705220184302849ULL), - QU(10929605677758824425ULL), QU(16446976977835806844ULL), - QU(13723874412159769044ULL), QU( 6942854352100915216ULL), - QU( 1726308474365729390ULL), QU( 2150078766445323155ULL), - QU(15345558947919656626ULL), QU(12145453828874527201ULL), - QU( 2054448620739726849ULL), QU( 2740102003352628137ULL), - QU(11294462163577610655ULL), QU( 756164283387413743ULL), - QU(17841144758438810880ULL), QU(10802406021185415861ULL), - QU( 8716455530476737846ULL), QU( 6321788834517649606ULL), - QU(14681322910577468426ULL), QU(17330043563884336387ULL), - QU(12701802180050071614ULL), QU(14695105111079727151ULL), - QU( 5112098511654172830ULL), QU( 4957505496794139973ULL), - QU( 8270979451952045982ULL), QU(12307685939199120969ULL), - QU(12425799408953443032ULL), QU( 8376410143634796588ULL), - QU(16621778679680060464ULL), QU( 3580497854566660073ULL), - QU( 1122515747803382416ULL), QU( 857664980960597599ULL), - QU( 6343640119895925918ULL), QU(12878473260854462891ULL), - QU(10036813920765722626ULL), QU(14451335468363173812ULL), - QU( 5476809692401102807ULL), QU(16442255173514366342ULL), - QU(13060203194757167104ULL), QU(14354124071243177715ULL), - QU(15961249405696125227ULL), QU(13703893649690872584ULL), - QU( 363907326340340064ULL), QU( 6247455540491754842ULL), - QU(12242249332757832361ULL), QU( 156065475679796717ULL), - QU( 9351116235749732355ULL), QU( 4590350628677701405ULL), - QU( 1671195940982350389ULL), QU(13501398458898451905ULL), - QU( 6526341991225002255ULL), QU( 1689782913778157592ULL), - QU( 7439222350869010334ULL), QU(13975150263226478308ULL), - QU(11411961169932682710ULL), QU(17204271834833847277ULL), - QU( 541534742544435367ULL), QU( 6591191931218949684ULL), - QU( 2645454775478232486ULL), QU( 4322857481256485321ULL), - QU( 8477416487553065110ULL), QU(12902505428548435048ULL), - QU( 971445777981341415ULL), QU(14995104682744976712ULL), - QU( 4243341648807158063ULL), QU( 8695061252721927661ULL), - QU( 5028202003270177222ULL), QU( 2289257340915567840ULL), - QU(13870416345121866007ULL), QU(13994481698072092233ULL), - QU( 6912785400753196481ULL), QU( 2278309315841980139ULL), - QU( 4329765449648304839ULL), QU( 5963108095785485298ULL), - QU( 4880024847478722478ULL), QU(16015608779890240947ULL), - QU( 1866679034261393544ULL), QU( 914821179919731519ULL), - QU( 9643404035648760131ULL), QU( 2418114953615593915ULL), - QU( 944756836073702374ULL), QU(15186388048737296834ULL), - QU( 7723355336128442206ULL), QU( 7500747479679599691ULL), - QU(18013961306453293634ULL), QU( 2315274808095756456ULL), - QU(13655308255424029566ULL), QU(17203800273561677098ULL), - QU( 1382158694422087756ULL), QU( 5090390250309588976ULL), - QU( 517170818384213989ULL), QU( 1612709252627729621ULL), - QU( 1330118955572449606ULL), QU( 300922478056709885ULL), - QU(18115693291289091987ULL), QU(13491407109725238321ULL), - QU(15293714633593827320ULL), QU( 5151539373053314504ULL), - QU( 5951523243743139207ULL), QU(14459112015249527975ULL), - QU( 5456113959000700739ULL), QU( 3877918438464873016ULL), - QU(12534071654260163555ULL), QU(15871678376893555041ULL), - QU(11005484805712025549ULL), QU(16353066973143374252ULL), - QU( 4358331472063256685ULL), QU( 8268349332210859288ULL), - QU(12485161590939658075ULL), QU(13955993592854471343ULL), - QU( 5911446886848367039ULL), QU(14925834086813706974ULL), - QU( 6590362597857994805ULL), QU( 1280544923533661875ULL), - QU( 1637756018947988164ULL), QU( 4734090064512686329ULL), - QU(16693705263131485912ULL), QU( 
6834882340494360958ULL), - QU( 8120732176159658505ULL), QU( 2244371958905329346ULL), - QU(10447499707729734021ULL), QU( 7318742361446942194ULL), - QU( 8032857516355555296ULL), QU(14023605983059313116ULL), - QU( 1032336061815461376ULL), QU( 9840995337876562612ULL), - QU( 9869256223029203587ULL), QU(12227975697177267636ULL), - QU(12728115115844186033ULL), QU( 7752058479783205470ULL), - QU( 729733219713393087ULL), QU(12954017801239007622ULL) + KQU(16924766246869039260), KQU( 8201438687333352714), + KQU( 2265290287015001750), KQU(18397264611805473832), + KQU( 3375255223302384358), KQU( 6345559975416828796), + KQU(18229739242790328073), KQU( 7596792742098800905), + KQU( 255338647169685981), KQU( 2052747240048610300), + KQU(18328151576097299343), KQU(12472905421133796567), + KQU(11315245349717600863), KQU(16594110197775871209), + KQU(15708751964632456450), KQU(10452031272054632535), + KQU(11097646720811454386), KQU( 4556090668445745441), + KQU(17116187693090663106), KQU(14931526836144510645), + KQU( 9190752218020552591), KQU( 9625800285771901401), + KQU(13995141077659972832), KQU( 5194209094927829625), + KQU( 4156788379151063303), KQU( 8523452593770139494), + KQU(14082382103049296727), KQU( 2462601863986088483), + KQU( 3030583461592840678), KQU( 5221622077872827681), + KQU( 3084210671228981236), KQU(13956758381389953823), + KQU(13503889856213423831), KQU(15696904024189836170), + KQU( 4612584152877036206), KQU( 6231135538447867881), + KQU(10172457294158869468), KQU( 6452258628466708150), + KQU(14044432824917330221), KQU( 370168364480044279), + KQU(10102144686427193359), KQU( 667870489994776076), + KQU( 2732271956925885858), KQU(18027788905977284151), + KQU(15009842788582923859), KQU( 7136357960180199542), + KQU(15901736243475578127), KQU(16951293785352615701), + KQU(10551492125243691632), KQU(17668869969146434804), + KQU(13646002971174390445), KQU( 9804471050759613248), + KQU( 5511670439655935493), KQU(18103342091070400926), + KQU(17224512747665137533), KQU(15534627482992618168), + KQU( 1423813266186582647), KQU(15821176807932930024), + KQU( 30323369733607156), KQU(11599382494723479403), + KQU( 653856076586810062), KQU( 3176437395144899659), + KQU(14028076268147963917), KQU(16156398271809666195), + KQU( 3166955484848201676), KQU( 5746805620136919390), + KQU(17297845208891256593), KQU(11691653183226428483), + KQU(17900026146506981577), KQU(15387382115755971042), + KQU(16923567681040845943), KQU( 8039057517199388606), + KQU(11748409241468629263), KQU( 794358245539076095), + KQU(13438501964693401242), KQU(14036803236515618962), + KQU( 5252311215205424721), KQU(17806589612915509081), + KQU( 6802767092397596006), KQU(14212120431184557140), + KQU( 1072951366761385712), KQU(13098491780722836296), + KQU( 9466676828710797353), KQU(12673056849042830081), + KQU(12763726623645357580), KQU(16468961652999309493), + KQU(15305979875636438926), KQU(17444713151223449734), + KQU( 5692214267627883674), KQU(13049589139196151505), + KQU( 880115207831670745), KQU( 1776529075789695498), + KQU(16695225897801466485), KQU(10666901778795346845), + KQU( 6164389346722833869), KQU( 2863817793264300475), + KQU( 9464049921886304754), KQU( 3993566636740015468), + KQU( 9983749692528514136), KQU(16375286075057755211), + KQU(16042643417005440820), KQU(11445419662923489877), + KQU( 7999038846885158836), KQU( 6721913661721511535), + KQU( 5363052654139357320), KQU( 1817788761173584205), + KQU(13290974386445856444), KQU( 4650350818937984680), + KQU( 8219183528102484836), KQU( 1569862923500819899), + KQU( 4189359732136641860), 
KQU(14202822961683148583), + KQU( 4457498315309429058), KQU(13089067387019074834), + KQU(11075517153328927293), KQU(10277016248336668389), + KQU( 7070509725324401122), KQU(17808892017780289380), + KQU(13143367339909287349), KQU( 1377743745360085151), + KQU( 5749341807421286485), KQU(14832814616770931325), + KQU( 7688820635324359492), KQU(10960474011539770045), + KQU( 81970066653179790), KQU(12619476072607878022), + KQU( 4419566616271201744), KQU(15147917311750568503), + KQU( 5549739182852706345), KQU( 7308198397975204770), + KQU(13580425496671289278), KQU(17070764785210130301), + KQU( 8202832846285604405), KQU( 6873046287640887249), + KQU( 6927424434308206114), KQU( 6139014645937224874), + KQU(10290373645978487639), KQU(15904261291701523804), + KQU( 9628743442057826883), KQU(18383429096255546714), + KQU( 4977413265753686967), KQU( 7714317492425012869), + KQU( 9025232586309926193), KQU(14627338359776709107), + KQU(14759849896467790763), KQU(10931129435864423252), + KQU( 4588456988775014359), KQU(10699388531797056724), + KQU( 468652268869238792), KQU( 5755943035328078086), + KQU( 2102437379988580216), KQU( 9986312786506674028), + KQU( 2654207180040945604), KQU( 8726634790559960062), + KQU( 100497234871808137), KQU( 2800137176951425819), + KQU( 6076627612918553487), KQU( 5780186919186152796), + KQU( 8179183595769929098), KQU( 6009426283716221169), + KQU( 2796662551397449358), KQU( 1756961367041986764), + KQU( 6972897917355606205), KQU(14524774345368968243), + KQU( 2773529684745706940), KQU( 4853632376213075959), + KQU( 4198177923731358102), KQU( 8271224913084139776), + KQU( 2741753121611092226), KQU(16782366145996731181), + KQU(15426125238972640790), KQU(13595497100671260342), + KQU( 3173531022836259898), KQU( 6573264560319511662), + KQU(18041111951511157441), KQU( 2351433581833135952), + KQU( 3113255578908173487), KQU( 1739371330877858784), + KQU(16046126562789165480), KQU( 8072101652214192925), + KQU(15267091584090664910), KQU( 9309579200403648940), + KQU( 5218892439752408722), KQU(14492477246004337115), + KQU(17431037586679770619), KQU( 7385248135963250480), + KQU( 9580144956565560660), KQU( 4919546228040008720), + KQU(15261542469145035584), KQU(18233297270822253102), + KQU( 5453248417992302857), KQU( 9309519155931460285), + KQU(10342813012345291756), KQU(15676085186784762381), + KQU(15912092950691300645), KQU( 9371053121499003195), + KQU( 9897186478226866746), KQU(14061858287188196327), + KQU( 122575971620788119), KQU(12146750969116317754), + KQU( 4438317272813245201), KQU( 8332576791009527119), + KQU(13907785691786542057), KQU(10374194887283287467), + KQU( 2098798755649059566), KQU( 3416235197748288894), + KQU( 8688269957320773484), KQU( 7503964602397371571), + KQU(16724977015147478236), KQU( 9461512855439858184), + KQU(13259049744534534727), KQU( 3583094952542899294), + KQU( 8764245731305528292), KQU(13240823595462088985), + KQU(13716141617617910448), KQU(18114969519935960955), + KQU( 2297553615798302206), KQU( 4585521442944663362), + KQU(17776858680630198686), KQU( 4685873229192163363), + KQU( 152558080671135627), KQU(15424900540842670088), + KQU(13229630297130024108), KQU(17530268788245718717), + KQU(16675633913065714144), KQU( 3158912717897568068), + KQU(15399132185380087288), KQU( 7401418744515677872), + KQU(13135412922344398535), KQU( 6385314346100509511), + KQU(13962867001134161139), KQU(10272780155442671999), + KQU(12894856086597769142), KQU(13340877795287554994), + KQU(12913630602094607396), KQU(12543167911119793857), + KQU(17343570372251873096), KQU(10959487764494150545), + 
KQU( 6966737953093821128), KQU(13780699135496988601), + KQU( 4405070719380142046), KQU(14923788365607284982), + KQU( 2869487678905148380), KQU( 6416272754197188403), + KQU(15017380475943612591), KQU( 1995636220918429487), + KQU( 3402016804620122716), KQU(15800188663407057080), + KQU(11362369990390932882), KQU(15262183501637986147), + KQU(10239175385387371494), KQU( 9352042420365748334), + KQU( 1682457034285119875), KQU( 1724710651376289644), + KQU( 2038157098893817966), KQU( 9897825558324608773), + KQU( 1477666236519164736), KQU(16835397314511233640), + KQU(10370866327005346508), KQU(10157504370660621982), + KQU(12113904045335882069), KQU(13326444439742783008), + KQU(11302769043000765804), KQU(13594979923955228484), + KQU(11779351762613475968), KQU( 3786101619539298383), + KQU( 8021122969180846063), KQU(15745904401162500495), + KQU(10762168465993897267), KQU(13552058957896319026), + KQU(11200228655252462013), KQU( 5035370357337441226), + KQU( 7593918984545500013), KQU( 5418554918361528700), + KQU( 4858270799405446371), KQU( 9974659566876282544), + KQU(18227595922273957859), KQU( 2772778443635656220), + KQU(14285143053182085385), KQU( 9939700992429600469), + KQU(12756185904545598068), KQU( 2020783375367345262), + KQU( 57026775058331227), KQU( 950827867930065454), + KQU( 6602279670145371217), KQU( 2291171535443566929), + KQU( 5832380724425010313), KQU( 1220343904715982285), + KQU(17045542598598037633), KQU(15460481779702820971), + KQU(13948388779949365130), KQU(13975040175430829518), + KQU(17477538238425541763), KQU(11104663041851745725), + KQU(15860992957141157587), KQU(14529434633012950138), + KQU( 2504838019075394203), KQU( 7512113882611121886), + KQU( 4859973559980886617), KQU( 1258601555703250219), + KQU(15594548157514316394), KQU( 4516730171963773048), + KQU(11380103193905031983), KQU( 6809282239982353344), + KQU(18045256930420065002), KQU( 2453702683108791859), + KQU( 977214582986981460), KQU( 2006410402232713466), + KQU( 6192236267216378358), KQU( 3429468402195675253), + KQU(18146933153017348921), KQU(17369978576367231139), + KQU( 1246940717230386603), KQU(11335758870083327110), + KQU(14166488801730353682), KQU( 9008573127269635732), + KQU(10776025389820643815), KQU(15087605441903942962), + KQU( 1359542462712147922), KQU(13898874411226454206), + KQU(17911176066536804411), KQU( 9435590428600085274), + KQU( 294488509967864007), KQU( 8890111397567922046), + KQU( 7987823476034328778), KQU(13263827582440967651), + KQU( 7503774813106751573), KQU(14974747296185646837), + KQU( 8504765037032103375), KQU(17340303357444536213), + KQU( 7704610912964485743), KQU( 8107533670327205061), + KQU( 9062969835083315985), KQU(16968963142126734184), + KQU(12958041214190810180), KQU( 2720170147759570200), + KQU( 2986358963942189566), KQU(14884226322219356580), + KQU( 286224325144368520), KQU(11313800433154279797), + KQU(18366849528439673248), KQU(17899725929482368789), + KQU( 3730004284609106799), KQU( 1654474302052767205), + KQU( 5006698007047077032), KQU( 8196893913601182838), + KQU(15214541774425211640), KQU(17391346045606626073), + KQU( 8369003584076969089), KQU( 3939046733368550293), + KQU(10178639720308707785), KQU( 2180248669304388697), + KQU( 62894391300126322), KQU( 9205708961736223191), + KQU( 6837431058165360438), KQU( 3150743890848308214), + KQU(17849330658111464583), KQU(12214815643135450865), + KQU(13410713840519603402), KQU( 3200778126692046802), + KQU(13354780043041779313), KQU( 800850022756886036), + KQU(15660052933953067433), KQU( 6572823544154375676), + KQU(11030281857015819266), 
KQU(12682241941471433835), + KQU(11654136407300274693), KQU( 4517795492388641109), + KQU( 9757017371504524244), KQU(17833043400781889277), + KQU(12685085201747792227), KQU(10408057728835019573), + KQU( 98370418513455221), KQU( 6732663555696848598), + KQU(13248530959948529780), KQU( 3530441401230622826), + KQU(18188251992895660615), KQU( 1847918354186383756), + KQU( 1127392190402660921), KQU(11293734643143819463), + KQU( 3015506344578682982), KQU(13852645444071153329), + KQU( 2121359659091349142), KQU( 1294604376116677694), + KQU( 5616576231286352318), KQU( 7112502442954235625), + KQU(11676228199551561689), KQU(12925182803007305359), + KQU( 7852375518160493082), KQU( 1136513130539296154), + KQU( 5636923900916593195), KQU( 3221077517612607747), + KQU(17784790465798152513), KQU( 3554210049056995938), + KQU(17476839685878225874), KQU( 3206836372585575732), + KQU( 2765333945644823430), KQU(10080070903718799528), + KQU( 5412370818878286353), KQU( 9689685887726257728), + KQU( 8236117509123533998), KQU( 1951139137165040214), + KQU( 4492205209227980349), KQU(16541291230861602967), + KQU( 1424371548301437940), KQU( 9117562079669206794), + KQU(14374681563251691625), KQU(13873164030199921303), + KQU( 6680317946770936731), KQU(15586334026918276214), + KQU(10896213950976109802), KQU( 9506261949596413689), + KQU( 9903949574308040616), KQU( 6038397344557204470), + KQU( 174601465422373648), KQU(15946141191338238030), + KQU(17142225620992044937), KQU( 7552030283784477064), + KQU( 2947372384532947997), KQU( 510797021688197711), + KQU( 4962499439249363461), KQU( 23770320158385357), + KQU( 959774499105138124), KQU( 1468396011518788276), + KQU( 2015698006852312308), KQU( 4149400718489980136), + KQU( 5992916099522371188), KQU(10819182935265531076), + KQU(16189787999192351131), KQU( 342833961790261950), + KQU(12470830319550495336), KQU(18128495041912812501), + KQU( 1193600899723524337), KQU( 9056793666590079770), + KQU( 2154021227041669041), KQU( 4963570213951235735), + KQU( 4865075960209211409), KQU( 2097724599039942963), + KQU( 2024080278583179845), KQU(11527054549196576736), + KQU(10650256084182390252), KQU( 4808408648695766755), + KQU( 1642839215013788844), KQU(10607187948250398390), + KQU( 7076868166085913508), KQU( 730522571106887032), + KQU(12500579240208524895), KQU( 4484390097311355324), + KQU(15145801330700623870), KQU( 8055827661392944028), + KQU( 5865092976832712268), KQU(15159212508053625143), + KQU( 3560964582876483341), KQU( 4070052741344438280), + KQU( 6032585709886855634), KQU(15643262320904604873), + KQU( 2565119772293371111), KQU( 318314293065348260), + KQU(15047458749141511872), KQU( 7772788389811528730), + KQU( 7081187494343801976), KQU( 6465136009467253947), + KQU(10425940692543362069), KQU( 554608190318339115), + KQU(14796699860302125214), KQU( 1638153134431111443), + KQU(10336967447052276248), KQU( 8412308070396592958), + KQU( 4004557277152051226), KQU( 8143598997278774834), + KQU(16413323996508783221), KQU(13139418758033994949), + KQU( 9772709138335006667), KQU( 2818167159287157659), + KQU(17091740573832523669), KQU(14629199013130751608), + KQU(18268322711500338185), KQU( 8290963415675493063), + KQU( 8830864907452542588), KQU( 1614839084637494849), + KQU(14855358500870422231), KQU( 3472996748392519937), + KQU(15317151166268877716), KQU( 5825895018698400362), + KQU(16730208429367544129), KQU(10481156578141202800), + KQU( 4746166512382823750), KQU(12720876014472464998), + KQU( 8825177124486735972), KQU(13733447296837467838), + KQU( 6412293741681359625), KQU( 8313213138756135033), + 
KQU(11421481194803712517), KQU( 7997007691544174032), + KQU( 6812963847917605930), KQU( 9683091901227558641), + KQU(14703594165860324713), KQU( 1775476144519618309), + KQU( 2724283288516469519), KQU( 717642555185856868), + KQU( 8736402192215092346), KQU(11878800336431381021), + KQU( 4348816066017061293), KQU( 6115112756583631307), + KQU( 9176597239667142976), KQU(12615622714894259204), + KQU(10283406711301385987), KQU( 5111762509485379420), + KQU( 3118290051198688449), KQU( 7345123071632232145), + KQU( 9176423451688682359), KQU( 4843865456157868971), + KQU(12008036363752566088), KQU(12058837181919397720), + KQU( 2145073958457347366), KQU( 1526504881672818067), + KQU( 3488830105567134848), KQU(13208362960674805143), + KQU( 4077549672899572192), KQU( 7770995684693818365), + KQU( 1398532341546313593), KQU(12711859908703927840), + KQU( 1417561172594446813), KQU(17045191024194170604), + KQU( 4101933177604931713), KQU(14708428834203480320), + KQU(17447509264469407724), KQU(14314821973983434255), + KQU(17990472271061617265), KQU( 5087756685841673942), + KQU(12797820586893859939), KQU( 1778128952671092879), + KQU( 3535918530508665898), KQU( 9035729701042481301), + KQU(14808661568277079962), KQU(14587345077537747914), + KQU(11920080002323122708), KQU( 6426515805197278753), + KQU( 3295612216725984831), KQU(11040722532100876120), + KQU(12305952936387598754), KQU(16097391899742004253), + KQU( 4908537335606182208), KQU(12446674552196795504), + KQU(16010497855816895177), KQU( 9194378874788615551), + KQU( 3382957529567613384), KQU( 5154647600754974077), + KQU( 9801822865328396141), KQU( 9023662173919288143), + KQU(17623115353825147868), KQU( 8238115767443015816), + KQU(15811444159859002560), KQU( 9085612528904059661), + KQU( 6888601089398614254), KQU( 258252992894160189), + KQU( 6704363880792428622), KQU( 6114966032147235763), + KQU(11075393882690261875), KQU( 8797664238933620407), + KQU( 5901892006476726920), KQU( 5309780159285518958), + KQU(14940808387240817367), KQU(14642032021449656698), + KQU( 9808256672068504139), KQU( 3670135111380607658), + KQU(11211211097845960152), KQU( 1474304506716695808), + KQU(15843166204506876239), KQU( 7661051252471780561), + KQU(10170905502249418476), KQU( 7801416045582028589), + KQU( 2763981484737053050), KQU( 9491377905499253054), + KQU(16201395896336915095), KQU( 9256513756442782198), + KQU( 5411283157972456034), KQU( 5059433122288321676), + KQU( 4327408006721123357), KQU( 9278544078834433377), + KQU( 7601527110882281612), KQU(11848295896975505251), + KQU(12096998801094735560), KQU(14773480339823506413), + KQU(15586227433895802149), KQU(12786541257830242872), + KQU( 6904692985140503067), KQU( 5309011515263103959), + KQU(12105257191179371066), KQU(14654380212442225037), + KQU( 2556774974190695009), KQU( 4461297399927600261), + KQU(14888225660915118646), KQU(14915459341148291824), + KQU( 2738802166252327631), KQU( 6047155789239131512), + KQU(12920545353217010338), KQU(10697617257007840205), + KQU( 2751585253158203504), KQU(13252729159780047496), + KQU(14700326134672815469), KQU(14082527904374600529), + KQU(16852962273496542070), KQU(17446675504235853907), + KQU(15019600398527572311), KQU(12312781346344081551), + KQU(14524667935039810450), KQU( 5634005663377195738), + KQU(11375574739525000569), KQU( 2423665396433260040), + KQU( 5222836914796015410), KQU( 4397666386492647387), + KQU( 4619294441691707638), KQU( 665088602354770716), + KQU(13246495665281593610), KQU( 6564144270549729409), + KQU(10223216188145661688), KQU( 3961556907299230585), + KQU(11543262515492439914), 
KQU(16118031437285993790), + KQU( 7143417964520166465), KQU(13295053515909486772), + KQU( 40434666004899675), KQU(17127804194038347164), + KQU( 8599165966560586269), KQU( 8214016749011284903), + KQU(13725130352140465239), KQU( 5467254474431726291), + KQU( 7748584297438219877), KQU(16933551114829772472), + KQU( 2169618439506799400), KQU( 2169787627665113463), + KQU(17314493571267943764), KQU(18053575102911354912), + KQU(11928303275378476973), KQU(11593850925061715550), + KQU(17782269923473589362), KQU( 3280235307704747039), + KQU( 6145343578598685149), KQU(17080117031114086090), + KQU(18066839902983594755), KQU( 6517508430331020706), + KQU( 8092908893950411541), KQU(12558378233386153732), + KQU( 4476532167973132976), KQU(16081642430367025016), + KQU( 4233154094369139361), KQU( 8693630486693161027), + KQU(11244959343027742285), KQU(12273503967768513508), + KQU(14108978636385284876), KQU( 7242414665378826984), + KQU( 6561316938846562432), KQU( 8601038474994665795), + KQU(17532942353612365904), KQU(17940076637020912186), + KQU( 7340260368823171304), KQU( 7061807613916067905), + KQU(10561734935039519326), KQU(17990796503724650862), + KQU( 6208732943911827159), KQU( 359077562804090617), + KQU(14177751537784403113), KQU(10659599444915362902), + KQU(15081727220615085833), KQU(13417573895659757486), + KQU(15513842342017811524), KQU(11814141516204288231), + KQU( 1827312513875101814), KQU( 2804611699894603103), + KQU(17116500469975602763), KQU(12270191815211952087), + KQU(12256358467786024988), KQU(18435021722453971267), + KQU( 671330264390865618), KQU( 476504300460286050), + KQU(16465470901027093441), KQU( 4047724406247136402), + KQU( 1322305451411883346), KQU( 1388308688834322280), + KQU( 7303989085269758176), KQU( 9323792664765233642), + KQU( 4542762575316368936), KQU(17342696132794337618), + KQU( 4588025054768498379), KQU(13415475057390330804), + KQU(17880279491733405570), KQU(10610553400618620353), + KQU( 3180842072658960139), KQU(13002966655454270120), + KQU( 1665301181064982826), KQU( 7083673946791258979), + KQU( 190522247122496820), KQU(17388280237250677740), + KQU( 8430770379923642945), KQU(12987180971921668584), + KQU( 2311086108365390642), KQU( 2870984383579822345), + KQU(14014682609164653318), KQU(14467187293062251484), + KQU( 192186361147413298), KQU(15171951713531796524), + KQU( 9900305495015948728), KQU(17958004775615466344), + KQU(14346380954498606514), KQU(18040047357617407096), + KQU( 5035237584833424532), KQU(15089555460613972287), + KQU( 4131411873749729831), KQU( 1329013581168250330), + KQU(10095353333051193949), KQU(10749518561022462716), + KQU( 9050611429810755847), KQU(15022028840236655649), + KQU( 8775554279239748298), KQU(13105754025489230502), + KQU(15471300118574167585), KQU( 89864764002355628), + KQU( 8776416323420466637), KQU( 5280258630612040891), + KQU( 2719174488591862912), KQU( 7599309137399661994), + KQU(15012887256778039979), KQU(14062981725630928925), + KQU(12038536286991689603), KQU( 7089756544681775245), + KQU(10376661532744718039), KQU( 1265198725901533130), + KQU(13807996727081142408), KQU( 2935019626765036403), + KQU( 7651672460680700141), KQU( 3644093016200370795), + KQU( 2840982578090080674), KQU(17956262740157449201), + KQU(18267979450492880548), KQU(11799503659796848070), + KQU( 9942537025669672388), KQU(11886606816406990297), + KQU( 5488594946437447576), KQU( 7226714353282744302), + KQU( 3784851653123877043), KQU( 878018453244803041), + KQU(12110022586268616085), KQU( 734072179404675123), + KQU(11869573627998248542), KQU( 469150421297783998), + KQU( 
260151124912803804), KQU(11639179410120968649), + KQU( 9318165193840846253), KQU(12795671722734758075), + KQU(15318410297267253933), KQU( 691524703570062620), + KQU( 5837129010576994601), KQU(15045963859726941052), + KQU( 5850056944932238169), KQU(12017434144750943807), + KQU( 7447139064928956574), KQU( 3101711812658245019), + KQU(16052940704474982954), KQU(18195745945986994042), + KQU( 8932252132785575659), KQU(13390817488106794834), + KQU(11582771836502517453), KQU( 4964411326683611686), + KQU( 2195093981702694011), KQU(14145229538389675669), + KQU(16459605532062271798), KQU( 866316924816482864), + KQU( 4593041209937286377), KQU( 8415491391910972138), + KQU( 4171236715600528969), KQU(16637569303336782889), + KQU( 2002011073439212680), KQU(17695124661097601411), + KQU( 4627687053598611702), KQU( 7895831936020190403), + KQU( 8455951300917267802), KQU( 2923861649108534854), + KQU( 8344557563927786255), KQU( 6408671940373352556), + KQU(12210227354536675772), KQU(14294804157294222295), + KQU(10103022425071085127), KQU(10092959489504123771), + KQU( 6554774405376736268), KQU(12629917718410641774), + KQU( 6260933257596067126), KQU( 2460827021439369673), + KQU( 2541962996717103668), KQU( 597377203127351475), + KQU( 5316984203117315309), KQU( 4811211393563241961), + KQU(13119698597255811641), KQU( 8048691512862388981), + KQU(10216818971194073842), KQU( 4612229970165291764), + KQU(10000980798419974770), KQU( 6877640812402540687), + KQU( 1488727563290436992), KQU( 2227774069895697318), + KQU(11237754507523316593), KQU(13478948605382290972), + KQU( 1963583846976858124), KQU( 5512309205269276457), + KQU( 3972770164717652347), KQU( 3841751276198975037), + KQU(10283343042181903117), KQU( 8564001259792872199), + KQU(16472187244722489221), KQU( 8953493499268945921), + KQU( 3518747340357279580), KQU( 4003157546223963073), + KQU( 3270305958289814590), KQU( 3966704458129482496), + KQU( 8122141865926661939), KQU(14627734748099506653), + KQU(13064426990862560568), KQU( 2414079187889870829), + KQU( 5378461209354225306), KQU(10841985740128255566), + KQU( 538582442885401738), KQU( 7535089183482905946), + KQU(16117559957598879095), KQU( 8477890721414539741), + KQU( 1459127491209533386), KQU(17035126360733620462), + KQU( 8517668552872379126), KQU(10292151468337355014), + KQU(17081267732745344157), KQU(13751455337946087178), + KQU(14026945459523832966), KQU( 6653278775061723516), + KQU(10619085543856390441), KQU( 2196343631481122885), + KQU(10045966074702826136), KQU(10082317330452718282), + KQU( 5920859259504831242), KQU( 9951879073426540617), + KQU( 7074696649151414158), KQU(15808193543879464318), + KQU( 7385247772746953374), KQU( 3192003544283864292), + KQU(18153684490917593847), KQU(12423498260668568905), + KQU(10957758099756378169), KQU(11488762179911016040), + KQU( 2099931186465333782), KQU(11180979581250294432), + KQU( 8098916250668367933), KQU( 3529200436790763465), + KQU(12988418908674681745), KQU( 6147567275954808580), + KQU( 3207503344604030989), KQU(10761592604898615360), + KQU( 229854861031893504), KQU( 8809853962667144291), + KQU(13957364469005693860), KQU( 7634287665224495886), + KQU(12353487366976556874), KQU( 1134423796317152034), + KQU( 2088992471334107068), KQU( 7393372127190799698), + KQU( 1845367839871058391), KQU( 207922563987322884), + KQU(11960870813159944976), KQU(12182120053317317363), + KQU(17307358132571709283), KQU(13871081155552824936), + KQU(18304446751741566262), KQU( 7178705220184302849), + KQU(10929605677758824425), KQU(16446976977835806844), + KQU(13723874412159769044), KQU( 
6942854352100915216), + KQU( 1726308474365729390), KQU( 2150078766445323155), + KQU(15345558947919656626), KQU(12145453828874527201), + KQU( 2054448620739726849), KQU( 2740102003352628137), + KQU(11294462163577610655), KQU( 756164283387413743), + KQU(17841144758438810880), KQU(10802406021185415861), + KQU( 8716455530476737846), KQU( 6321788834517649606), + KQU(14681322910577468426), KQU(17330043563884336387), + KQU(12701802180050071614), KQU(14695105111079727151), + KQU( 5112098511654172830), KQU( 4957505496794139973), + KQU( 8270979451952045982), KQU(12307685939199120969), + KQU(12425799408953443032), KQU( 8376410143634796588), + KQU(16621778679680060464), KQU( 3580497854566660073), + KQU( 1122515747803382416), KQU( 857664980960597599), + KQU( 6343640119895925918), KQU(12878473260854462891), + KQU(10036813920765722626), KQU(14451335468363173812), + KQU( 5476809692401102807), KQU(16442255173514366342), + KQU(13060203194757167104), KQU(14354124071243177715), + KQU(15961249405696125227), KQU(13703893649690872584), + KQU( 363907326340340064), KQU( 6247455540491754842), + KQU(12242249332757832361), KQU( 156065475679796717), + KQU( 9351116235749732355), KQU( 4590350628677701405), + KQU( 1671195940982350389), KQU(13501398458898451905), + KQU( 6526341991225002255), KQU( 1689782913778157592), + KQU( 7439222350869010334), KQU(13975150263226478308), + KQU(11411961169932682710), KQU(17204271834833847277), + KQU( 541534742544435367), KQU( 6591191931218949684), + KQU( 2645454775478232486), KQU( 4322857481256485321), + KQU( 8477416487553065110), KQU(12902505428548435048), + KQU( 971445777981341415), KQU(14995104682744976712), + KQU( 4243341648807158063), KQU( 8695061252721927661), + KQU( 5028202003270177222), KQU( 2289257340915567840), + KQU(13870416345121866007), KQU(13994481698072092233), + KQU( 6912785400753196481), KQU( 2278309315841980139), + KQU( 4329765449648304839), KQU( 5963108095785485298), + KQU( 4880024847478722478), KQU(16015608779890240947), + KQU( 1866679034261393544), KQU( 914821179919731519), + KQU( 9643404035648760131), KQU( 2418114953615593915), + KQU( 944756836073702374), KQU(15186388048737296834), + KQU( 7723355336128442206), KQU( 7500747479679599691), + KQU(18013961306453293634), KQU( 2315274808095756456), + KQU(13655308255424029566), KQU(17203800273561677098), + KQU( 1382158694422087756), KQU( 5090390250309588976), + KQU( 517170818384213989), KQU( 1612709252627729621), + KQU( 1330118955572449606), KQU( 300922478056709885), + KQU(18115693291289091987), KQU(13491407109725238321), + KQU(15293714633593827320), KQU( 5151539373053314504), + KQU( 5951523243743139207), KQU(14459112015249527975), + KQU( 5456113959000700739), KQU( 3877918438464873016), + KQU(12534071654260163555), KQU(15871678376893555041), + KQU(11005484805712025549), KQU(16353066973143374252), + KQU( 4358331472063256685), KQU( 8268349332210859288), + KQU(12485161590939658075), KQU(13955993592854471343), + KQU( 5911446886848367039), KQU(14925834086813706974), + KQU( 6590362597857994805), KQU( 1280544923533661875), + KQU( 1637756018947988164), KQU( 4734090064512686329), + KQU(16693705263131485912), KQU( 6834882340494360958), + KQU( 8120732176159658505), KQU( 2244371958905329346), + KQU(10447499707729734021), KQU( 7318742361446942194), + KQU( 8032857516355555296), KQU(14023605983059313116), + KQU( 1032336061815461376), KQU( 9840995337876562612), + KQU( 9869256223029203587), KQU(12227975697177267636), + KQU(12728115115844186033), KQU( 7752058479783205470), + KQU( 729733219713393087), KQU(12954017801239007622) }; static const 
uint64_t init_by_array_64_expected[] = { - QU( 2100341266307895239ULL), QU( 8344256300489757943ULL), - QU(15687933285484243894ULL), QU( 8268620370277076319ULL), - QU(12371852309826545459ULL), QU( 8800491541730110238ULL), - QU(18113268950100835773ULL), QU( 2886823658884438119ULL), - QU( 3293667307248180724ULL), QU( 9307928143300172731ULL), - QU( 7688082017574293629ULL), QU( 900986224735166665ULL), - QU( 9977972710722265039ULL), QU( 6008205004994830552ULL), - QU( 546909104521689292ULL), QU( 7428471521869107594ULL), - QU(14777563419314721179ULL), QU(16116143076567350053ULL), - QU( 5322685342003142329ULL), QU( 4200427048445863473ULL), - QU( 4693092150132559146ULL), QU(13671425863759338582ULL), - QU( 6747117460737639916ULL), QU( 4732666080236551150ULL), - QU( 5912839950611941263ULL), QU( 3903717554504704909ULL), - QU( 2615667650256786818ULL), QU(10844129913887006352ULL), - QU(13786467861810997820ULL), QU(14267853002994021570ULL), - QU(13767807302847237439ULL), QU(16407963253707224617ULL), - QU( 4802498363698583497ULL), QU( 2523802839317209764ULL), - QU( 3822579397797475589ULL), QU( 8950320572212130610ULL), - QU( 3745623504978342534ULL), QU(16092609066068482806ULL), - QU( 9817016950274642398ULL), QU(10591660660323829098ULL), - QU(11751606650792815920ULL), QU( 5122873818577122211ULL), - QU(17209553764913936624ULL), QU( 6249057709284380343ULL), - QU(15088791264695071830ULL), QU(15344673071709851930ULL), - QU( 4345751415293646084ULL), QU( 2542865750703067928ULL), - QU(13520525127852368784ULL), QU(18294188662880997241ULL), - QU( 3871781938044881523ULL), QU( 2873487268122812184ULL), - QU(15099676759482679005ULL), QU(15442599127239350490ULL), - QU( 6311893274367710888ULL), QU( 3286118760484672933ULL), - QU( 4146067961333542189ULL), QU(13303942567897208770ULL), - QU( 8196013722255630418ULL), QU( 4437815439340979989ULL), - QU(15433791533450605135ULL), QU( 4254828956815687049ULL), - QU( 1310903207708286015ULL), QU(10529182764462398549ULL), - QU(14900231311660638810ULL), QU( 9727017277104609793ULL), - QU( 1821308310948199033ULL), QU(11628861435066772084ULL), - QU( 9469019138491546924ULL), QU( 3145812670532604988ULL), - QU( 9938468915045491919ULL), QU( 1562447430672662142ULL), - QU(13963995266697989134ULL), QU( 3356884357625028695ULL), - QU( 4499850304584309747ULL), QU( 8456825817023658122ULL), - QU(10859039922814285279ULL), QU( 8099512337972526555ULL), - QU( 348006375109672149ULL), QU(11919893998241688603ULL), - QU( 1104199577402948826ULL), QU(16689191854356060289ULL), - QU(10992552041730168078ULL), QU( 7243733172705465836ULL), - QU( 5668075606180319560ULL), QU(18182847037333286970ULL), - QU( 4290215357664631322ULL), QU( 4061414220791828613ULL), - QU(13006291061652989604ULL), QU( 7140491178917128798ULL), - QU(12703446217663283481ULL), QU( 5500220597564558267ULL), - QU(10330551509971296358ULL), QU(15958554768648714492ULL), - QU( 5174555954515360045ULL), QU( 1731318837687577735ULL), - QU( 3557700801048354857ULL), QU(13764012341928616198ULL), - QU(13115166194379119043ULL), QU( 7989321021560255519ULL), - QU( 2103584280905877040ULL), QU( 9230788662155228488ULL), - QU(16396629323325547654ULL), QU( 657926409811318051ULL), - QU(15046700264391400727ULL), QU( 5120132858771880830ULL), - QU( 7934160097989028561ULL), QU( 6963121488531976245ULL), - QU(17412329602621742089ULL), QU(15144843053931774092ULL), - QU(17204176651763054532ULL), QU(13166595387554065870ULL), - QU( 8590377810513960213ULL), QU( 5834365135373991938ULL), - QU( 7640913007182226243ULL), QU( 3479394703859418425ULL), - QU(16402784452644521040ULL), QU( 
4993979809687083980ULL), - QU(13254522168097688865ULL), QU(15643659095244365219ULL), - QU( 5881437660538424982ULL), QU(11174892200618987379ULL), - QU( 254409966159711077ULL), QU(17158413043140549909ULL), - QU( 3638048789290376272ULL), QU( 1376816930299489190ULL), - QU( 4622462095217761923ULL), QU(15086407973010263515ULL), - QU(13253971772784692238ULL), QU( 5270549043541649236ULL), - QU(11182714186805411604ULL), QU(12283846437495577140ULL), - QU( 5297647149908953219ULL), QU(10047451738316836654ULL), - QU( 4938228100367874746ULL), QU(12328523025304077923ULL), - QU( 3601049438595312361ULL), QU( 9313624118352733770ULL), - QU(13322966086117661798ULL), QU(16660005705644029394ULL), - QU(11337677526988872373ULL), QU(13869299102574417795ULL), - QU(15642043183045645437ULL), QU( 3021755569085880019ULL), - QU( 4979741767761188161ULL), QU(13679979092079279587ULL), - QU( 3344685842861071743ULL), QU(13947960059899588104ULL), - QU( 305806934293368007ULL), QU( 5749173929201650029ULL), - QU(11123724852118844098ULL), QU(15128987688788879802ULL), - QU(15251651211024665009ULL), QU( 7689925933816577776ULL), - QU(16732804392695859449ULL), QU(17087345401014078468ULL), - QU(14315108589159048871ULL), QU( 4820700266619778917ULL), - QU(16709637539357958441ULL), QU( 4936227875177351374ULL), - QU( 2137907697912987247ULL), QU(11628565601408395420ULL), - QU( 2333250549241556786ULL), QU( 5711200379577778637ULL), - QU( 5170680131529031729ULL), QU(12620392043061335164ULL), - QU( 95363390101096078ULL), QU( 5487981914081709462ULL), - QU( 1763109823981838620ULL), QU( 3395861271473224396ULL), - QU( 1300496844282213595ULL), QU( 6894316212820232902ULL), - QU(10673859651135576674ULL), QU( 5911839658857903252ULL), - QU(17407110743387299102ULL), QU( 8257427154623140385ULL), - QU(11389003026741800267ULL), QU( 4070043211095013717ULL), - QU(11663806997145259025ULL), QU(15265598950648798210ULL), - QU( 630585789434030934ULL), QU( 3524446529213587334ULL), - QU( 7186424168495184211ULL), QU(10806585451386379021ULL), - QU(11120017753500499273ULL), QU( 1586837651387701301ULL), - QU(17530454400954415544ULL), QU( 9991670045077880430ULL), - QU( 7550997268990730180ULL), QU( 8640249196597379304ULL), - QU( 3522203892786893823ULL), QU(10401116549878854788ULL), - QU(13690285544733124852ULL), QU( 8295785675455774586ULL), - QU(15535716172155117603ULL), QU( 3112108583723722511ULL), - QU(17633179955339271113ULL), QU(18154208056063759375ULL), - QU( 1866409236285815666ULL), QU(13326075895396412882ULL), - QU( 8756261842948020025ULL), QU( 6281852999868439131ULL), - QU(15087653361275292858ULL), QU(10333923911152949397ULL), - QU( 5265567645757408500ULL), QU(12728041843210352184ULL), - QU( 6347959327507828759ULL), QU( 154112802625564758ULL), - QU(18235228308679780218ULL), QU( 3253805274673352418ULL), - QU( 4849171610689031197ULL), QU(17948529398340432518ULL), - QU(13803510475637409167ULL), QU(13506570190409883095ULL), - QU(15870801273282960805ULL), QU( 8451286481299170773ULL), - QU( 9562190620034457541ULL), QU( 8518905387449138364ULL), - QU(12681306401363385655ULL), QU( 3788073690559762558ULL), - QU( 5256820289573487769ULL), QU( 2752021372314875467ULL), - QU( 6354035166862520716ULL), QU( 4328956378309739069ULL), - QU( 449087441228269600ULL), QU( 5533508742653090868ULL), - QU( 1260389420404746988ULL), QU(18175394473289055097ULL), - QU( 1535467109660399420ULL), QU( 8818894282874061442ULL), - QU(12140873243824811213ULL), QU(15031386653823014946ULL), - QU( 1286028221456149232ULL), QU( 6329608889367858784ULL), - QU( 9419654354945132725ULL), QU( 
6094576547061672379ULL), - QU(17706217251847450255ULL), QU( 1733495073065878126ULL), - QU(16918923754607552663ULL), QU( 8881949849954945044ULL), - QU(12938977706896313891ULL), QU(14043628638299793407ULL), - QU(18393874581723718233ULL), QU( 6886318534846892044ULL), - QU(14577870878038334081ULL), QU(13541558383439414119ULL), - QU(13570472158807588273ULL), QU(18300760537910283361ULL), - QU( 818368572800609205ULL), QU( 1417000585112573219ULL), - QU(12337533143867683655ULL), QU(12433180994702314480ULL), - QU( 778190005829189083ULL), QU(13667356216206524711ULL), - QU( 9866149895295225230ULL), QU(11043240490417111999ULL), - QU( 1123933826541378598ULL), QU( 6469631933605123610ULL), - QU(14508554074431980040ULL), QU(13918931242962026714ULL), - QU( 2870785929342348285ULL), QU(14786362626740736974ULL), - QU(13176680060902695786ULL), QU( 9591778613541679456ULL), - QU( 9097662885117436706ULL), QU( 749262234240924947ULL), - QU( 1944844067793307093ULL), QU( 4339214904577487742ULL), - QU( 8009584152961946551ULL), QU(16073159501225501777ULL), - QU( 3335870590499306217ULL), QU(17088312653151202847ULL), - QU( 3108893142681931848ULL), QU(16636841767202792021ULL), - QU(10423316431118400637ULL), QU( 8008357368674443506ULL), - QU(11340015231914677875ULL), QU(17687896501594936090ULL), - QU(15173627921763199958ULL), QU( 542569482243721959ULL), - QU(15071714982769812975ULL), QU( 4466624872151386956ULL), - QU( 1901780715602332461ULL), QU( 9822227742154351098ULL), - QU( 1479332892928648780ULL), QU( 6981611948382474400ULL), - QU( 7620824924456077376ULL), QU(14095973329429406782ULL), - QU( 7902744005696185404ULL), QU(15830577219375036920ULL), - QU(10287076667317764416ULL), QU(12334872764071724025ULL), - QU( 4419302088133544331ULL), QU(14455842851266090520ULL), - QU(12488077416504654222ULL), QU( 7953892017701886766ULL), - QU( 6331484925529519007ULL), QU( 4902145853785030022ULL), - QU(17010159216096443073ULL), QU(11945354668653886087ULL), - QU(15112022728645230829ULL), QU(17363484484522986742ULL), - QU( 4423497825896692887ULL), QU( 8155489510809067471ULL), - QU( 258966605622576285ULL), QU( 5462958075742020534ULL), - QU( 6763710214913276228ULL), QU( 2368935183451109054ULL), - QU(14209506165246453811ULL), QU( 2646257040978514881ULL), - QU( 3776001911922207672ULL), QU( 1419304601390147631ULL), - QU(14987366598022458284ULL), QU( 3977770701065815721ULL), - QU( 730820417451838898ULL), QU( 3982991703612885327ULL), - QU( 2803544519671388477ULL), QU(17067667221114424649ULL), - QU( 2922555119737867166ULL), QU( 1989477584121460932ULL), - QU(15020387605892337354ULL), QU( 9293277796427533547ULL), - QU(10722181424063557247ULL), QU(16704542332047511651ULL), - QU( 5008286236142089514ULL), QU(16174732308747382540ULL), - QU(17597019485798338402ULL), QU(13081745199110622093ULL), - QU( 8850305883842258115ULL), QU(12723629125624589005ULL), - QU( 8140566453402805978ULL), QU(15356684607680935061ULL), - QU(14222190387342648650ULL), QU(11134610460665975178ULL), - QU( 1259799058620984266ULL), QU(13281656268025610041ULL), - QU( 298262561068153992ULL), QU(12277871700239212922ULL), - QU(13911297774719779438ULL), QU(16556727962761474934ULL), - QU(17903010316654728010ULL), QU( 9682617699648434744ULL), - QU(14757681836838592850ULL), QU( 1327242446558524473ULL), - QU(11126645098780572792ULL), QU( 1883602329313221774ULL), - QU( 2543897783922776873ULL), QU(15029168513767772842ULL), - QU(12710270651039129878ULL), QU(16118202956069604504ULL), - QU(15010759372168680524ULL), QU( 2296827082251923948ULL), - QU(10793729742623518101ULL), 
QU(13829764151845413046ULL), - QU(17769301223184451213ULL), QU( 3118268169210783372ULL), - QU(17626204544105123127ULL), QU( 7416718488974352644ULL), - QU(10450751996212925994ULL), QU( 9352529519128770586ULL), - QU( 259347569641110140ULL), QU( 8048588892269692697ULL), - QU( 1774414152306494058ULL), QU(10669548347214355622ULL), - QU(13061992253816795081ULL), QU(18432677803063861659ULL), - QU( 8879191055593984333ULL), QU(12433753195199268041ULL), - QU(14919392415439730602ULL), QU( 6612848378595332963ULL), - QU( 6320986812036143628ULL), QU(10465592420226092859ULL), - QU( 4196009278962570808ULL), QU( 3747816564473572224ULL), - QU(17941203486133732898ULL), QU( 2350310037040505198ULL), - QU( 5811779859134370113ULL), QU(10492109599506195126ULL), - QU( 7699650690179541274ULL), QU( 1954338494306022961ULL), - QU(14095816969027231152ULL), QU( 5841346919964852061ULL), - QU(14945969510148214735ULL), QU( 3680200305887550992ULL), - QU( 6218047466131695792ULL), QU( 8242165745175775096ULL), - QU(11021371934053307357ULL), QU( 1265099502753169797ULL), - QU( 4644347436111321718ULL), QU( 3609296916782832859ULL), - QU( 8109807992218521571ULL), QU(18387884215648662020ULL), - QU(14656324896296392902ULL), QU(17386819091238216751ULL), - QU(17788300878582317152ULL), QU( 7919446259742399591ULL), - QU( 4466613134576358004ULL), QU(12928181023667938509ULL), - QU(13147446154454932030ULL), QU(16552129038252734620ULL), - QU( 8395299403738822450ULL), QU(11313817655275361164ULL), - QU( 434258809499511718ULL), QU( 2074882104954788676ULL), - QU( 7929892178759395518ULL), QU( 9006461629105745388ULL), - QU( 5176475650000323086ULL), QU(11128357033468341069ULL), - QU(12026158851559118955ULL), QU(14699716249471156500ULL), - QU( 448982497120206757ULL), QU( 4156475356685519900ULL), - QU( 6063816103417215727ULL), QU(10073289387954971479ULL), - QU( 8174466846138590962ULL), QU( 2675777452363449006ULL), - QU( 9090685420572474281ULL), QU( 6659652652765562060ULL), - QU(12923120304018106621ULL), QU(11117480560334526775ULL), - QU( 937910473424587511ULL), QU( 1838692113502346645ULL), - QU(11133914074648726180ULL), QU( 7922600945143884053ULL), - QU(13435287702700959550ULL), QU( 5287964921251123332ULL), - QU(11354875374575318947ULL), QU(17955724760748238133ULL), - QU(13728617396297106512ULL), QU( 4107449660118101255ULL), - QU( 1210269794886589623ULL), QU(11408687205733456282ULL), - QU( 4538354710392677887ULL), QU(13566803319341319267ULL), - QU(17870798107734050771ULL), QU( 3354318982568089135ULL), - QU( 9034450839405133651ULL), QU(13087431795753424314ULL), - QU( 950333102820688239ULL), QU( 1968360654535604116ULL), - QU(16840551645563314995ULL), QU( 8867501803892924995ULL), - QU(11395388644490626845ULL), QU( 1529815836300732204ULL), - QU(13330848522996608842ULL), QU( 1813432878817504265ULL), - QU( 2336867432693429560ULL), QU(15192805445973385902ULL), - QU( 2528593071076407877ULL), QU( 128459777936689248ULL), - QU( 9976345382867214866ULL), QU( 6208885766767996043ULL), - QU(14982349522273141706ULL), QU( 3099654362410737822ULL), - QU(13776700761947297661ULL), QU( 8806185470684925550ULL), - QU( 8151717890410585321ULL), QU( 640860591588072925ULL), - QU(14592096303937307465ULL), QU( 9056472419613564846ULL), - QU(14861544647742266352ULL), QU(12703771500398470216ULL), - QU( 3142372800384138465ULL), QU( 6201105606917248196ULL), - QU(18337516409359270184ULL), QU(15042268695665115339ULL), - QU(15188246541383283846ULL), QU(12800028693090114519ULL), - QU( 5992859621101493472ULL), QU(18278043971816803521ULL), - QU( 9002773075219424560ULL), QU( 
7325707116943598353ULL), - QU( 7930571931248040822ULL), QU( 5645275869617023448ULL), - QU( 7266107455295958487ULL), QU( 4363664528273524411ULL), - QU(14313875763787479809ULL), QU(17059695613553486802ULL), - QU( 9247761425889940932ULL), QU(13704726459237593128ULL), - QU( 2701312427328909832ULL), QU(17235532008287243115ULL), - QU(14093147761491729538ULL), QU( 6247352273768386516ULL), - QU( 8268710048153268415ULL), QU( 7985295214477182083ULL), - QU(15624495190888896807ULL), QU( 3772753430045262788ULL), - QU( 9133991620474991698ULL), QU( 5665791943316256028ULL), - QU( 7551996832462193473ULL), QU(13163729206798953877ULL), - QU( 9263532074153846374ULL), QU( 1015460703698618353ULL), - QU(17929874696989519390ULL), QU(18257884721466153847ULL), - QU(16271867543011222991ULL), QU( 3905971519021791941ULL), - QU(16814488397137052085ULL), QU( 1321197685504621613ULL), - QU( 2870359191894002181ULL), QU(14317282970323395450ULL), - QU(13663920845511074366ULL), QU( 2052463995796539594ULL), - QU(14126345686431444337ULL), QU( 1727572121947022534ULL), - QU(17793552254485594241ULL), QU( 6738857418849205750ULL), - QU( 1282987123157442952ULL), QU(16655480021581159251ULL), - QU( 6784587032080183866ULL), QU(14726758805359965162ULL), - QU( 7577995933961987349ULL), QU(12539609320311114036ULL), - QU(10789773033385439494ULL), QU( 8517001497411158227ULL), - QU(10075543932136339710ULL), QU(14838152340938811081ULL), - QU( 9560840631794044194ULL), QU(17445736541454117475ULL), - QU(10633026464336393186ULL), QU(15705729708242246293ULL), - QU( 1117517596891411098ULL), QU( 4305657943415886942ULL), - QU( 4948856840533979263ULL), QU(16071681989041789593ULL), - QU(13723031429272486527ULL), QU( 7639567622306509462ULL), - QU(12670424537483090390ULL), QU( 9715223453097197134ULL), - QU( 5457173389992686394ULL), QU( 289857129276135145ULL), - QU(17048610270521972512ULL), QU( 692768013309835485ULL), - QU(14823232360546632057ULL), QU(18218002361317895936ULL), - QU( 3281724260212650204ULL), QU(16453957266549513795ULL), - QU( 8592711109774511881ULL), QU( 929825123473369579ULL), - QU(15966784769764367791ULL), QU( 9627344291450607588ULL), - QU(10849555504977813287ULL), QU( 9234566913936339275ULL), - QU( 6413807690366911210ULL), QU(10862389016184219267ULL), - QU(13842504799335374048ULL), QU( 1531994113376881174ULL), - QU( 2081314867544364459ULL), QU(16430628791616959932ULL), - QU( 8314714038654394368ULL), QU( 9155473892098431813ULL), - QU(12577843786670475704ULL), QU( 4399161106452401017ULL), - QU( 1668083091682623186ULL), QU( 1741383777203714216ULL), - QU( 2162597285417794374ULL), QU(15841980159165218736ULL), - QU( 1971354603551467079ULL), QU( 1206714764913205968ULL), - QU( 4790860439591272330ULL), QU(14699375615594055799ULL), - QU( 8374423871657449988ULL), QU(10950685736472937738ULL), - QU( 697344331343267176ULL), QU(10084998763118059810ULL), - QU(12897369539795983124ULL), QU(12351260292144383605ULL), - QU( 1268810970176811234ULL), QU( 7406287800414582768ULL), - QU( 516169557043807831ULL), QU( 5077568278710520380ULL), - QU( 3828791738309039304ULL), QU( 7721974069946943610ULL), - QU( 3534670260981096460ULL), QU( 4865792189600584891ULL), - QU(16892578493734337298ULL), QU( 9161499464278042590ULL), - QU(11976149624067055931ULL), QU(13219479887277343990ULL), - QU(14161556738111500680ULL), QU(14670715255011223056ULL), - QU( 4671205678403576558ULL), QU(12633022931454259781ULL), - QU(14821376219869187646ULL), QU( 751181776484317028ULL), - QU( 2192211308839047070ULL), QU(11787306362361245189ULL), - QU(10672375120744095707ULL), QU( 
4601972328345244467ULL), - QU(15457217788831125879ULL), QU( 8464345256775460809ULL), - QU(10191938789487159478ULL), QU( 6184348739615197613ULL), - QU(11425436778806882100ULL), QU( 2739227089124319793ULL), - QU( 461464518456000551ULL), QU( 4689850170029177442ULL), - QU( 6120307814374078625ULL), QU(11153579230681708671ULL), - QU( 7891721473905347926ULL), QU(10281646937824872400ULL), - QU( 3026099648191332248ULL), QU( 8666750296953273818ULL), - QU(14978499698844363232ULL), QU(13303395102890132065ULL), - QU( 8182358205292864080ULL), QU(10560547713972971291ULL), - QU(11981635489418959093ULL), QU( 3134621354935288409ULL), - QU(11580681977404383968ULL), QU(14205530317404088650ULL), - QU( 5997789011854923157ULL), QU(13659151593432238041ULL), - QU(11664332114338865086ULL), QU( 7490351383220929386ULL), - QU( 7189290499881530378ULL), QU(15039262734271020220ULL), - QU( 2057217285976980055ULL), QU( 555570804905355739ULL), - QU(11235311968348555110ULL), QU(13824557146269603217ULL), - QU(16906788840653099693ULL), QU( 7222878245455661677ULL), - QU( 5245139444332423756ULL), QU( 4723748462805674292ULL), - QU(12216509815698568612ULL), QU(17402362976648951187ULL), - QU(17389614836810366768ULL), QU( 4880936484146667711ULL), - QU( 9085007839292639880ULL), QU(13837353458498535449ULL), - QU(11914419854360366677ULL), QU(16595890135313864103ULL), - QU( 6313969847197627222ULL), QU(18296909792163910431ULL), - QU(10041780113382084042ULL), QU( 2499478551172884794ULL), - QU(11057894246241189489ULL), QU( 9742243032389068555ULL), - QU(12838934582673196228ULL), QU(13437023235248490367ULL), - QU(13372420669446163240ULL), QU( 6752564244716909224ULL), - QU( 7157333073400313737ULL), QU(12230281516370654308ULL), - QU( 1182884552219419117ULL), QU( 2955125381312499218ULL), - QU(10308827097079443249ULL), QU( 1337648572986534958ULL), - QU(16378788590020343939ULL), QU( 108619126514420935ULL), - QU( 3990981009621629188ULL), QU( 5460953070230946410ULL), - QU( 9703328329366531883ULL), QU(13166631489188077236ULL), - QU( 1104768831213675170ULL), QU( 3447930458553877908ULL), - QU( 8067172487769945676ULL), QU( 5445802098190775347ULL), - QU( 3244840981648973873ULL), QU(17314668322981950060ULL), - QU( 5006812527827763807ULL), QU(18158695070225526260ULL), - QU( 2824536478852417853ULL), QU(13974775809127519886ULL), - QU( 9814362769074067392ULL), QU(17276205156374862128ULL), - QU(11361680725379306967ULL), QU( 3422581970382012542ULL), - QU(11003189603753241266ULL), QU(11194292945277862261ULL), - QU( 6839623313908521348ULL), QU(11935326462707324634ULL), - QU( 1611456788685878444ULL), QU(13112620989475558907ULL), - QU( 517659108904450427ULL), QU(13558114318574407624ULL), - QU(15699089742731633077ULL), QU( 4988979278862685458ULL), - QU( 8111373583056521297ULL), QU( 3891258746615399627ULL), - QU( 8137298251469718086ULL), QU(12748663295624701649ULL), - QU( 4389835683495292062ULL), QU( 5775217872128831729ULL), - QU( 9462091896405534927ULL), QU( 8498124108820263989ULL), - QU( 8059131278842839525ULL), QU(10503167994254090892ULL), - QU(11613153541070396656ULL), QU(18069248738504647790ULL), - QU( 570657419109768508ULL), QU( 3950574167771159665ULL), - QU( 5514655599604313077ULL), QU( 2908460854428484165ULL), - QU(10777722615935663114ULL), QU(12007363304839279486ULL), - QU( 9800646187569484767ULL), QU( 8795423564889864287ULL), - QU(14257396680131028419ULL), QU( 6405465117315096498ULL), - QU( 7939411072208774878ULL), QU(17577572378528990006ULL), - QU(14785873806715994850ULL), QU(16770572680854747390ULL), - QU(18127549474419396481ULL), 
QU(11637013449455757750ULL), - QU(14371851933996761086ULL), QU( 3601181063650110280ULL), - QU( 4126442845019316144ULL), QU(10198287239244320669ULL), - QU(18000169628555379659ULL), QU(18392482400739978269ULL), - QU( 6219919037686919957ULL), QU( 3610085377719446052ULL), - QU( 2513925039981776336ULL), QU(16679413537926716955ULL), - QU(12903302131714909434ULL), QU( 5581145789762985009ULL), - QU(12325955044293303233ULL), QU(17216111180742141204ULL), - QU( 6321919595276545740ULL), QU( 3507521147216174501ULL), - QU( 9659194593319481840ULL), QU(11473976005975358326ULL), - QU(14742730101435987026ULL), QU( 492845897709954780ULL), - QU(16976371186162599676ULL), QU(17712703422837648655ULL), - QU( 9881254778587061697ULL), QU( 8413223156302299551ULL), - QU( 1563841828254089168ULL), QU( 9996032758786671975ULL), - QU( 138877700583772667ULL), QU(13003043368574995989ULL), - QU( 4390573668650456587ULL), QU( 8610287390568126755ULL), - QU(15126904974266642199ULL), QU( 6703637238986057662ULL), - QU( 2873075592956810157ULL), QU( 6035080933946049418ULL), - QU(13382846581202353014ULL), QU( 7303971031814642463ULL), - QU(18418024405307444267ULL), QU( 5847096731675404647ULL), - QU( 4035880699639842500ULL), QU(11525348625112218478ULL), - QU( 3041162365459574102ULL), QU( 2604734487727986558ULL), - QU(15526341771636983145ULL), QU(14556052310697370254ULL), - QU(12997787077930808155ULL), QU( 9601806501755554499ULL), - QU(11349677952521423389ULL), QU(14956777807644899350ULL), - QU(16559736957742852721ULL), QU(12360828274778140726ULL), - QU( 6685373272009662513ULL), QU(16932258748055324130ULL), - QU(15918051131954158508ULL), QU( 1692312913140790144ULL), - QU( 546653826801637367ULL), QU( 5341587076045986652ULL), - QU(14975057236342585662ULL), QU(12374976357340622412ULL), - QU(10328833995181940552ULL), QU(12831807101710443149ULL), - QU(10548514914382545716ULL), QU( 2217806727199715993ULL), - QU(12627067369242845138ULL), QU( 4598965364035438158ULL), - QU( 150923352751318171ULL), QU(14274109544442257283ULL), - QU( 4696661475093863031ULL), QU( 1505764114384654516ULL), - QU(10699185831891495147ULL), QU( 2392353847713620519ULL), - QU( 3652870166711788383ULL), QU( 8640653276221911108ULL), - QU( 3894077592275889704ULL), QU( 4918592872135964845ULL), - QU(16379121273281400789ULL), QU(12058465483591683656ULL), - QU(11250106829302924945ULL), QU( 1147537556296983005ULL), - QU( 6376342756004613268ULL), QU(14967128191709280506ULL), - QU(18007449949790627628ULL), QU( 9497178279316537841ULL), - QU( 7920174844809394893ULL), QU(10037752595255719907ULL), - QU(15875342784985217697ULL), QU(15311615921712850696ULL), - QU( 9552902652110992950ULL), QU(14054979450099721140ULL), - QU( 5998709773566417349ULL), QU(18027910339276320187ULL), - QU( 8223099053868585554ULL), QU( 7842270354824999767ULL), - QU( 4896315688770080292ULL), QU(12969320296569787895ULL), - QU( 2674321489185759961ULL), QU( 4053615936864718439ULL), - QU(11349775270588617578ULL), QU( 4743019256284553975ULL), - QU( 5602100217469723769ULL), QU(14398995691411527813ULL), - QU( 7412170493796825470ULL), QU( 836262406131744846ULL), - QU( 8231086633845153022ULL), QU( 5161377920438552287ULL), - QU( 8828731196169924949ULL), QU(16211142246465502680ULL), - QU( 3307990879253687818ULL), QU( 5193405406899782022ULL), - QU( 8510842117467566693ULL), QU( 6070955181022405365ULL), - QU(14482950231361409799ULL), QU(12585159371331138077ULL), - QU( 3511537678933588148ULL), QU( 2041849474531116417ULL), - QU(10944936685095345792ULL), QU(18303116923079107729ULL), - QU( 2720566371239725320ULL), QU( 
4958672473562397622ULL), - QU( 3032326668253243412ULL), QU(13689418691726908338ULL), - QU( 1895205511728843996ULL), QU( 8146303515271990527ULL), - QU(16507343500056113480ULL), QU( 473996939105902919ULL), - QU( 9897686885246881481ULL), QU(14606433762712790575ULL), - QU( 6732796251605566368ULL), QU( 1399778120855368916ULL), - QU( 935023885182833777ULL), QU(16066282816186753477ULL), - QU( 7291270991820612055ULL), QU(17530230393129853844ULL), - QU(10223493623477451366ULL), QU(15841725630495676683ULL), - QU(17379567246435515824ULL), QU( 8588251429375561971ULL), - QU(18339511210887206423ULL), QU(17349587430725976100ULL), - QU(12244876521394838088ULL), QU( 6382187714147161259ULL), - QU(12335807181848950831ULL), QU(16948885622305460665ULL), - QU(13755097796371520506ULL), QU(14806740373324947801ULL), - QU( 4828699633859287703ULL), QU( 8209879281452301604ULL), - QU(12435716669553736437ULL), QU(13970976859588452131ULL), - QU( 6233960842566773148ULL), QU(12507096267900505759ULL), - QU( 1198713114381279421ULL), QU(14989862731124149015ULL), - QU(15932189508707978949ULL), QU( 2526406641432708722ULL), - QU( 29187427817271982ULL), QU( 1499802773054556353ULL), - QU(10816638187021897173ULL), QU( 5436139270839738132ULL), - QU( 6659882287036010082ULL), QU( 2154048955317173697ULL), - QU(10887317019333757642ULL), QU(16281091802634424955ULL), - QU(10754549879915384901ULL), QU(10760611745769249815ULL), - QU( 2161505946972504002ULL), QU( 5243132808986265107ULL), - QU(10129852179873415416ULL), QU( 710339480008649081ULL), - QU( 7802129453068808528ULL), QU(17967213567178907213ULL), - QU(15730859124668605599ULL), QU(13058356168962376502ULL), - QU( 3701224985413645909ULL), QU(14464065869149109264ULL), - QU( 9959272418844311646ULL), QU(10157426099515958752ULL), - QU(14013736814538268528ULL), QU(17797456992065653951ULL), - QU(17418878140257344806ULL), QU(15457429073540561521ULL), - QU( 2184426881360949378ULL), QU( 2062193041154712416ULL), - QU( 8553463347406931661ULL), QU( 4913057625202871854ULL), - QU( 2668943682126618425ULL), QU(17064444737891172288ULL), - QU( 4997115903913298637ULL), QU(12019402608892327416ULL), - QU(17603584559765897352ULL), QU(11367529582073647975ULL), - QU( 8211476043518436050ULL), QU( 8676849804070323674ULL), - QU(18431829230394475730ULL), QU(10490177861361247904ULL), - QU( 9508720602025651349ULL), QU( 7409627448555722700ULL), - QU( 5804047018862729008ULL), QU(11943858176893142594ULL), - QU(11908095418933847092ULL), QU( 5415449345715887652ULL), - QU( 1554022699166156407ULL), QU( 9073322106406017161ULL), - QU( 7080630967969047082ULL), QU(18049736940860732943ULL), - QU(12748714242594196794ULL), QU( 1226992415735156741ULL), - QU(17900981019609531193ULL), QU(11720739744008710999ULL), - QU( 3006400683394775434ULL), QU(11347974011751996028ULL), - QU( 3316999628257954608ULL), QU( 8384484563557639101ULL), - QU(18117794685961729767ULL), QU( 1900145025596618194ULL), - QU(17459527840632892676ULL), QU( 5634784101865710994ULL), - QU( 7918619300292897158ULL), QU( 3146577625026301350ULL), - QU( 9955212856499068767ULL), QU( 1873995843681746975ULL), - QU( 1561487759967972194ULL), QU( 8322718804375878474ULL), - QU(11300284215327028366ULL), QU( 4667391032508998982ULL), - QU( 9820104494306625580ULL), QU(17922397968599970610ULL), - QU( 1784690461886786712ULL), QU(14940365084341346821ULL), - QU( 5348719575594186181ULL), QU(10720419084507855261ULL), - QU(14210394354145143274ULL), QU( 2426468692164000131ULL), - QU(16271062114607059202ULL), QU(14851904092357070247ULL), - QU( 6524493015693121897ULL), QU( 
9825473835127138531ULL), - QU(14222500616268569578ULL), QU(15521484052007487468ULL), - QU(14462579404124614699ULL), QU(11012375590820665520ULL), - QU(11625327350536084927ULL), QU(14452017765243785417ULL), - QU( 9989342263518766305ULL), QU( 3640105471101803790ULL), - QU( 4749866455897513242ULL), QU(13963064946736312044ULL), - QU(10007416591973223791ULL), QU(18314132234717431115ULL), - QU( 3286596588617483450ULL), QU( 7726163455370818765ULL), - QU( 7575454721115379328ULL), QU( 5308331576437663422ULL), - QU(18288821894903530934ULL), QU( 8028405805410554106ULL), - QU(15744019832103296628ULL), QU( 149765559630932100ULL), - QU( 6137705557200071977ULL), QU(14513416315434803615ULL), - QU(11665702820128984473ULL), QU( 218926670505601386ULL), - QU( 6868675028717769519ULL), QU(15282016569441512302ULL), - QU( 5707000497782960236ULL), QU( 6671120586555079567ULL), - QU( 2194098052618985448ULL), QU(16849577895477330978ULL), - QU(12957148471017466283ULL), QU( 1997805535404859393ULL), - QU( 1180721060263860490ULL), QU(13206391310193756958ULL), - QU(12980208674461861797ULL), QU( 3825967775058875366ULL), - QU(17543433670782042631ULL), QU( 1518339070120322730ULL), - QU(16344584340890991669ULL), QU( 2611327165318529819ULL), - QU(11265022723283422529ULL), QU( 4001552800373196817ULL), - QU(14509595890079346161ULL), QU( 3528717165416234562ULL), - QU(18153222571501914072ULL), QU( 9387182977209744425ULL), - QU(10064342315985580021ULL), QU(11373678413215253977ULL), - QU( 2308457853228798099ULL), QU( 9729042942839545302ULL), - QU( 7833785471140127746ULL), QU( 6351049900319844436ULL), - QU(14454610627133496067ULL), QU(12533175683634819111ULL), - QU(15570163926716513029ULL), QU(13356980519185762498ULL) + KQU( 2100341266307895239), KQU( 8344256300489757943), + KQU(15687933285484243894), KQU( 8268620370277076319), + KQU(12371852309826545459), KQU( 8800491541730110238), + KQU(18113268950100835773), KQU( 2886823658884438119), + KQU( 3293667307248180724), KQU( 9307928143300172731), + KQU( 7688082017574293629), KQU( 900986224735166665), + KQU( 9977972710722265039), KQU( 6008205004994830552), + KQU( 546909104521689292), KQU( 7428471521869107594), + KQU(14777563419314721179), KQU(16116143076567350053), + KQU( 5322685342003142329), KQU( 4200427048445863473), + KQU( 4693092150132559146), KQU(13671425863759338582), + KQU( 6747117460737639916), KQU( 4732666080236551150), + KQU( 5912839950611941263), KQU( 3903717554504704909), + KQU( 2615667650256786818), KQU(10844129913887006352), + KQU(13786467861810997820), KQU(14267853002994021570), + KQU(13767807302847237439), KQU(16407963253707224617), + KQU( 4802498363698583497), KQU( 2523802839317209764), + KQU( 3822579397797475589), KQU( 8950320572212130610), + KQU( 3745623504978342534), KQU(16092609066068482806), + KQU( 9817016950274642398), KQU(10591660660323829098), + KQU(11751606650792815920), KQU( 5122873818577122211), + KQU(17209553764913936624), KQU( 6249057709284380343), + KQU(15088791264695071830), KQU(15344673071709851930), + KQU( 4345751415293646084), KQU( 2542865750703067928), + KQU(13520525127852368784), KQU(18294188662880997241), + KQU( 3871781938044881523), KQU( 2873487268122812184), + KQU(15099676759482679005), KQU(15442599127239350490), + KQU( 6311893274367710888), KQU( 3286118760484672933), + KQU( 4146067961333542189), KQU(13303942567897208770), + KQU( 8196013722255630418), KQU( 4437815439340979989), + KQU(15433791533450605135), KQU( 4254828956815687049), + KQU( 1310903207708286015), KQU(10529182764462398549), + KQU(14900231311660638810), KQU( 9727017277104609793), + KQU( 
1821308310948199033), KQU(11628861435066772084), + KQU( 9469019138491546924), KQU( 3145812670532604988), + KQU( 9938468915045491919), KQU( 1562447430672662142), + KQU(13963995266697989134), KQU( 3356884357625028695), + KQU( 4499850304584309747), KQU( 8456825817023658122), + KQU(10859039922814285279), KQU( 8099512337972526555), + KQU( 348006375109672149), KQU(11919893998241688603), + KQU( 1104199577402948826), KQU(16689191854356060289), + KQU(10992552041730168078), KQU( 7243733172705465836), + KQU( 5668075606180319560), KQU(18182847037333286970), + KQU( 4290215357664631322), KQU( 4061414220791828613), + KQU(13006291061652989604), KQU( 7140491178917128798), + KQU(12703446217663283481), KQU( 5500220597564558267), + KQU(10330551509971296358), KQU(15958554768648714492), + KQU( 5174555954515360045), KQU( 1731318837687577735), + KQU( 3557700801048354857), KQU(13764012341928616198), + KQU(13115166194379119043), KQU( 7989321021560255519), + KQU( 2103584280905877040), KQU( 9230788662155228488), + KQU(16396629323325547654), KQU( 657926409811318051), + KQU(15046700264391400727), KQU( 5120132858771880830), + KQU( 7934160097989028561), KQU( 6963121488531976245), + KQU(17412329602621742089), KQU(15144843053931774092), + KQU(17204176651763054532), KQU(13166595387554065870), + KQU( 8590377810513960213), KQU( 5834365135373991938), + KQU( 7640913007182226243), KQU( 3479394703859418425), + KQU(16402784452644521040), KQU( 4993979809687083980), + KQU(13254522168097688865), KQU(15643659095244365219), + KQU( 5881437660538424982), KQU(11174892200618987379), + KQU( 254409966159711077), KQU(17158413043140549909), + KQU( 3638048789290376272), KQU( 1376816930299489190), + KQU( 4622462095217761923), KQU(15086407973010263515), + KQU(13253971772784692238), KQU( 5270549043541649236), + KQU(11182714186805411604), KQU(12283846437495577140), + KQU( 5297647149908953219), KQU(10047451738316836654), + KQU( 4938228100367874746), KQU(12328523025304077923), + KQU( 3601049438595312361), KQU( 9313624118352733770), + KQU(13322966086117661798), KQU(16660005705644029394), + KQU(11337677526988872373), KQU(13869299102574417795), + KQU(15642043183045645437), KQU( 3021755569085880019), + KQU( 4979741767761188161), KQU(13679979092079279587), + KQU( 3344685842861071743), KQU(13947960059899588104), + KQU( 305806934293368007), KQU( 5749173929201650029), + KQU(11123724852118844098), KQU(15128987688788879802), + KQU(15251651211024665009), KQU( 7689925933816577776), + KQU(16732804392695859449), KQU(17087345401014078468), + KQU(14315108589159048871), KQU( 4820700266619778917), + KQU(16709637539357958441), KQU( 4936227875177351374), + KQU( 2137907697912987247), KQU(11628565601408395420), + KQU( 2333250549241556786), KQU( 5711200379577778637), + KQU( 5170680131529031729), KQU(12620392043061335164), + KQU( 95363390101096078), KQU( 5487981914081709462), + KQU( 1763109823981838620), KQU( 3395861271473224396), + KQU( 1300496844282213595), KQU( 6894316212820232902), + KQU(10673859651135576674), KQU( 5911839658857903252), + KQU(17407110743387299102), KQU( 8257427154623140385), + KQU(11389003026741800267), KQU( 4070043211095013717), + KQU(11663806997145259025), KQU(15265598950648798210), + KQU( 630585789434030934), KQU( 3524446529213587334), + KQU( 7186424168495184211), KQU(10806585451386379021), + KQU(11120017753500499273), KQU( 1586837651387701301), + KQU(17530454400954415544), KQU( 9991670045077880430), + KQU( 7550997268990730180), KQU( 8640249196597379304), + KQU( 3522203892786893823), KQU(10401116549878854788), + KQU(13690285544733124852), KQU( 
8295785675455774586), + KQU(15535716172155117603), KQU( 3112108583723722511), + KQU(17633179955339271113), KQU(18154208056063759375), + KQU( 1866409236285815666), KQU(13326075895396412882), + KQU( 8756261842948020025), KQU( 6281852999868439131), + KQU(15087653361275292858), KQU(10333923911152949397), + KQU( 5265567645757408500), KQU(12728041843210352184), + KQU( 6347959327507828759), KQU( 154112802625564758), + KQU(18235228308679780218), KQU( 3253805274673352418), + KQU( 4849171610689031197), KQU(17948529398340432518), + KQU(13803510475637409167), KQU(13506570190409883095), + KQU(15870801273282960805), KQU( 8451286481299170773), + KQU( 9562190620034457541), KQU( 8518905387449138364), + KQU(12681306401363385655), KQU( 3788073690559762558), + KQU( 5256820289573487769), KQU( 2752021372314875467), + KQU( 6354035166862520716), KQU( 4328956378309739069), + KQU( 449087441228269600), KQU( 5533508742653090868), + KQU( 1260389420404746988), KQU(18175394473289055097), + KQU( 1535467109660399420), KQU( 8818894282874061442), + KQU(12140873243824811213), KQU(15031386653823014946), + KQU( 1286028221456149232), KQU( 6329608889367858784), + KQU( 9419654354945132725), KQU( 6094576547061672379), + KQU(17706217251847450255), KQU( 1733495073065878126), + KQU(16918923754607552663), KQU( 8881949849954945044), + KQU(12938977706896313891), KQU(14043628638299793407), + KQU(18393874581723718233), KQU( 6886318534846892044), + KQU(14577870878038334081), KQU(13541558383439414119), + KQU(13570472158807588273), KQU(18300760537910283361), + KQU( 818368572800609205), KQU( 1417000585112573219), + KQU(12337533143867683655), KQU(12433180994702314480), + KQU( 778190005829189083), KQU(13667356216206524711), + KQU( 9866149895295225230), KQU(11043240490417111999), + KQU( 1123933826541378598), KQU( 6469631933605123610), + KQU(14508554074431980040), KQU(13918931242962026714), + KQU( 2870785929342348285), KQU(14786362626740736974), + KQU(13176680060902695786), KQU( 9591778613541679456), + KQU( 9097662885117436706), KQU( 749262234240924947), + KQU( 1944844067793307093), KQU( 4339214904577487742), + KQU( 8009584152961946551), KQU(16073159501225501777), + KQU( 3335870590499306217), KQU(17088312653151202847), + KQU( 3108893142681931848), KQU(16636841767202792021), + KQU(10423316431118400637), KQU( 8008357368674443506), + KQU(11340015231914677875), KQU(17687896501594936090), + KQU(15173627921763199958), KQU( 542569482243721959), + KQU(15071714982769812975), KQU( 4466624872151386956), + KQU( 1901780715602332461), KQU( 9822227742154351098), + KQU( 1479332892928648780), KQU( 6981611948382474400), + KQU( 7620824924456077376), KQU(14095973329429406782), + KQU( 7902744005696185404), KQU(15830577219375036920), + KQU(10287076667317764416), KQU(12334872764071724025), + KQU( 4419302088133544331), KQU(14455842851266090520), + KQU(12488077416504654222), KQU( 7953892017701886766), + KQU( 6331484925529519007), KQU( 4902145853785030022), + KQU(17010159216096443073), KQU(11945354668653886087), + KQU(15112022728645230829), KQU(17363484484522986742), + KQU( 4423497825896692887), KQU( 8155489510809067471), + KQU( 258966605622576285), KQU( 5462958075742020534), + KQU( 6763710214913276228), KQU( 2368935183451109054), + KQU(14209506165246453811), KQU( 2646257040978514881), + KQU( 3776001911922207672), KQU( 1419304601390147631), + KQU(14987366598022458284), KQU( 3977770701065815721), + KQU( 730820417451838898), KQU( 3982991703612885327), + KQU( 2803544519671388477), KQU(17067667221114424649), + KQU( 2922555119737867166), KQU( 1989477584121460932), + 
KQU(15020387605892337354), KQU( 9293277796427533547), + KQU(10722181424063557247), KQU(16704542332047511651), + KQU( 5008286236142089514), KQU(16174732308747382540), + KQU(17597019485798338402), KQU(13081745199110622093), + KQU( 8850305883842258115), KQU(12723629125624589005), + KQU( 8140566453402805978), KQU(15356684607680935061), + KQU(14222190387342648650), KQU(11134610460665975178), + KQU( 1259799058620984266), KQU(13281656268025610041), + KQU( 298262561068153992), KQU(12277871700239212922), + KQU(13911297774719779438), KQU(16556727962761474934), + KQU(17903010316654728010), KQU( 9682617699648434744), + KQU(14757681836838592850), KQU( 1327242446558524473), + KQU(11126645098780572792), KQU( 1883602329313221774), + KQU( 2543897783922776873), KQU(15029168513767772842), + KQU(12710270651039129878), KQU(16118202956069604504), + KQU(15010759372168680524), KQU( 2296827082251923948), + KQU(10793729742623518101), KQU(13829764151845413046), + KQU(17769301223184451213), KQU( 3118268169210783372), + KQU(17626204544105123127), KQU( 7416718488974352644), + KQU(10450751996212925994), KQU( 9352529519128770586), + KQU( 259347569641110140), KQU( 8048588892269692697), + KQU( 1774414152306494058), KQU(10669548347214355622), + KQU(13061992253816795081), KQU(18432677803063861659), + KQU( 8879191055593984333), KQU(12433753195199268041), + KQU(14919392415439730602), KQU( 6612848378595332963), + KQU( 6320986812036143628), KQU(10465592420226092859), + KQU( 4196009278962570808), KQU( 3747816564473572224), + KQU(17941203486133732898), KQU( 2350310037040505198), + KQU( 5811779859134370113), KQU(10492109599506195126), + KQU( 7699650690179541274), KQU( 1954338494306022961), + KQU(14095816969027231152), KQU( 5841346919964852061), + KQU(14945969510148214735), KQU( 3680200305887550992), + KQU( 6218047466131695792), KQU( 8242165745175775096), + KQU(11021371934053307357), KQU( 1265099502753169797), + KQU( 4644347436111321718), KQU( 3609296916782832859), + KQU( 8109807992218521571), KQU(18387884215648662020), + KQU(14656324896296392902), KQU(17386819091238216751), + KQU(17788300878582317152), KQU( 7919446259742399591), + KQU( 4466613134576358004), KQU(12928181023667938509), + KQU(13147446154454932030), KQU(16552129038252734620), + KQU( 8395299403738822450), KQU(11313817655275361164), + KQU( 434258809499511718), KQU( 2074882104954788676), + KQU( 7929892178759395518), KQU( 9006461629105745388), + KQU( 5176475650000323086), KQU(11128357033468341069), + KQU(12026158851559118955), KQU(14699716249471156500), + KQU( 448982497120206757), KQU( 4156475356685519900), + KQU( 6063816103417215727), KQU(10073289387954971479), + KQU( 8174466846138590962), KQU( 2675777452363449006), + KQU( 9090685420572474281), KQU( 6659652652765562060), + KQU(12923120304018106621), KQU(11117480560334526775), + KQU( 937910473424587511), KQU( 1838692113502346645), + KQU(11133914074648726180), KQU( 7922600945143884053), + KQU(13435287702700959550), KQU( 5287964921251123332), + KQU(11354875374575318947), KQU(17955724760748238133), + KQU(13728617396297106512), KQU( 4107449660118101255), + KQU( 1210269794886589623), KQU(11408687205733456282), + KQU( 4538354710392677887), KQU(13566803319341319267), + KQU(17870798107734050771), KQU( 3354318982568089135), + KQU( 9034450839405133651), KQU(13087431795753424314), + KQU( 950333102820688239), KQU( 1968360654535604116), + KQU(16840551645563314995), KQU( 8867501803892924995), + KQU(11395388644490626845), KQU( 1529815836300732204), + KQU(13330848522996608842), KQU( 1813432878817504265), + KQU( 2336867432693429560), 
KQU(15192805445973385902), + KQU( 2528593071076407877), KQU( 128459777936689248), + KQU( 9976345382867214866), KQU( 6208885766767996043), + KQU(14982349522273141706), KQU( 3099654362410737822), + KQU(13776700761947297661), KQU( 8806185470684925550), + KQU( 8151717890410585321), KQU( 640860591588072925), + KQU(14592096303937307465), KQU( 9056472419613564846), + KQU(14861544647742266352), KQU(12703771500398470216), + KQU( 3142372800384138465), KQU( 6201105606917248196), + KQU(18337516409359270184), KQU(15042268695665115339), + KQU(15188246541383283846), KQU(12800028693090114519), + KQU( 5992859621101493472), KQU(18278043971816803521), + KQU( 9002773075219424560), KQU( 7325707116943598353), + KQU( 7930571931248040822), KQU( 5645275869617023448), + KQU( 7266107455295958487), KQU( 4363664528273524411), + KQU(14313875763787479809), KQU(17059695613553486802), + KQU( 9247761425889940932), KQU(13704726459237593128), + KQU( 2701312427328909832), KQU(17235532008287243115), + KQU(14093147761491729538), KQU( 6247352273768386516), + KQU( 8268710048153268415), KQU( 7985295214477182083), + KQU(15624495190888896807), KQU( 3772753430045262788), + KQU( 9133991620474991698), KQU( 5665791943316256028), + KQU( 7551996832462193473), KQU(13163729206798953877), + KQU( 9263532074153846374), KQU( 1015460703698618353), + KQU(17929874696989519390), KQU(18257884721466153847), + KQU(16271867543011222991), KQU( 3905971519021791941), + KQU(16814488397137052085), KQU( 1321197685504621613), + KQU( 2870359191894002181), KQU(14317282970323395450), + KQU(13663920845511074366), KQU( 2052463995796539594), + KQU(14126345686431444337), KQU( 1727572121947022534), + KQU(17793552254485594241), KQU( 6738857418849205750), + KQU( 1282987123157442952), KQU(16655480021581159251), + KQU( 6784587032080183866), KQU(14726758805359965162), + KQU( 7577995933961987349), KQU(12539609320311114036), + KQU(10789773033385439494), KQU( 8517001497411158227), + KQU(10075543932136339710), KQU(14838152340938811081), + KQU( 9560840631794044194), KQU(17445736541454117475), + KQU(10633026464336393186), KQU(15705729708242246293), + KQU( 1117517596891411098), KQU( 4305657943415886942), + KQU( 4948856840533979263), KQU(16071681989041789593), + KQU(13723031429272486527), KQU( 7639567622306509462), + KQU(12670424537483090390), KQU( 9715223453097197134), + KQU( 5457173389992686394), KQU( 289857129276135145), + KQU(17048610270521972512), KQU( 692768013309835485), + KQU(14823232360546632057), KQU(18218002361317895936), + KQU( 3281724260212650204), KQU(16453957266549513795), + KQU( 8592711109774511881), KQU( 929825123473369579), + KQU(15966784769764367791), KQU( 9627344291450607588), + KQU(10849555504977813287), KQU( 9234566913936339275), + KQU( 6413807690366911210), KQU(10862389016184219267), + KQU(13842504799335374048), KQU( 1531994113376881174), + KQU( 2081314867544364459), KQU(16430628791616959932), + KQU( 8314714038654394368), KQU( 9155473892098431813), + KQU(12577843786670475704), KQU( 4399161106452401017), + KQU( 1668083091682623186), KQU( 1741383777203714216), + KQU( 2162597285417794374), KQU(15841980159165218736), + KQU( 1971354603551467079), KQU( 1206714764913205968), + KQU( 4790860439591272330), KQU(14699375615594055799), + KQU( 8374423871657449988), KQU(10950685736472937738), + KQU( 697344331343267176), KQU(10084998763118059810), + KQU(12897369539795983124), KQU(12351260292144383605), + KQU( 1268810970176811234), KQU( 7406287800414582768), + KQU( 516169557043807831), KQU( 5077568278710520380), + KQU( 3828791738309039304), KQU( 7721974069946943610), + KQU( 
3534670260981096460), KQU( 4865792189600584891), + KQU(16892578493734337298), KQU( 9161499464278042590), + KQU(11976149624067055931), KQU(13219479887277343990), + KQU(14161556738111500680), KQU(14670715255011223056), + KQU( 4671205678403576558), KQU(12633022931454259781), + KQU(14821376219869187646), KQU( 751181776484317028), + KQU( 2192211308839047070), KQU(11787306362361245189), + KQU(10672375120744095707), KQU( 4601972328345244467), + KQU(15457217788831125879), KQU( 8464345256775460809), + KQU(10191938789487159478), KQU( 6184348739615197613), + KQU(11425436778806882100), KQU( 2739227089124319793), + KQU( 461464518456000551), KQU( 4689850170029177442), + KQU( 6120307814374078625), KQU(11153579230681708671), + KQU( 7891721473905347926), KQU(10281646937824872400), + KQU( 3026099648191332248), KQU( 8666750296953273818), + KQU(14978499698844363232), KQU(13303395102890132065), + KQU( 8182358205292864080), KQU(10560547713972971291), + KQU(11981635489418959093), KQU( 3134621354935288409), + KQU(11580681977404383968), KQU(14205530317404088650), + KQU( 5997789011854923157), KQU(13659151593432238041), + KQU(11664332114338865086), KQU( 7490351383220929386), + KQU( 7189290499881530378), KQU(15039262734271020220), + KQU( 2057217285976980055), KQU( 555570804905355739), + KQU(11235311968348555110), KQU(13824557146269603217), + KQU(16906788840653099693), KQU( 7222878245455661677), + KQU( 5245139444332423756), KQU( 4723748462805674292), + KQU(12216509815698568612), KQU(17402362976648951187), + KQU(17389614836810366768), KQU( 4880936484146667711), + KQU( 9085007839292639880), KQU(13837353458498535449), + KQU(11914419854360366677), KQU(16595890135313864103), + KQU( 6313969847197627222), KQU(18296909792163910431), + KQU(10041780113382084042), KQU( 2499478551172884794), + KQU(11057894246241189489), KQU( 9742243032389068555), + KQU(12838934582673196228), KQU(13437023235248490367), + KQU(13372420669446163240), KQU( 6752564244716909224), + KQU( 7157333073400313737), KQU(12230281516370654308), + KQU( 1182884552219419117), KQU( 2955125381312499218), + KQU(10308827097079443249), KQU( 1337648572986534958), + KQU(16378788590020343939), KQU( 108619126514420935), + KQU( 3990981009621629188), KQU( 5460953070230946410), + KQU( 9703328329366531883), KQU(13166631489188077236), + KQU( 1104768831213675170), KQU( 3447930458553877908), + KQU( 8067172487769945676), KQU( 5445802098190775347), + KQU( 3244840981648973873), KQU(17314668322981950060), + KQU( 5006812527827763807), KQU(18158695070225526260), + KQU( 2824536478852417853), KQU(13974775809127519886), + KQU( 9814362769074067392), KQU(17276205156374862128), + KQU(11361680725379306967), KQU( 3422581970382012542), + KQU(11003189603753241266), KQU(11194292945277862261), + KQU( 6839623313908521348), KQU(11935326462707324634), + KQU( 1611456788685878444), KQU(13112620989475558907), + KQU( 517659108904450427), KQU(13558114318574407624), + KQU(15699089742731633077), KQU( 4988979278862685458), + KQU( 8111373583056521297), KQU( 3891258746615399627), + KQU( 8137298251469718086), KQU(12748663295624701649), + KQU( 4389835683495292062), KQU( 5775217872128831729), + KQU( 9462091896405534927), KQU( 8498124108820263989), + KQU( 8059131278842839525), KQU(10503167994254090892), + KQU(11613153541070396656), KQU(18069248738504647790), + KQU( 570657419109768508), KQU( 3950574167771159665), + KQU( 5514655599604313077), KQU( 2908460854428484165), + KQU(10777722615935663114), KQU(12007363304839279486), + KQU( 9800646187569484767), KQU( 8795423564889864287), + KQU(14257396680131028419), KQU( 
6405465117315096498), + KQU( 7939411072208774878), KQU(17577572378528990006), + KQU(14785873806715994850), KQU(16770572680854747390), + KQU(18127549474419396481), KQU(11637013449455757750), + KQU(14371851933996761086), KQU( 3601181063650110280), + KQU( 4126442845019316144), KQU(10198287239244320669), + KQU(18000169628555379659), KQU(18392482400739978269), + KQU( 6219919037686919957), KQU( 3610085377719446052), + KQU( 2513925039981776336), KQU(16679413537926716955), + KQU(12903302131714909434), KQU( 5581145789762985009), + KQU(12325955044293303233), KQU(17216111180742141204), + KQU( 6321919595276545740), KQU( 3507521147216174501), + KQU( 9659194593319481840), KQU(11473976005975358326), + KQU(14742730101435987026), KQU( 492845897709954780), + KQU(16976371186162599676), KQU(17712703422837648655), + KQU( 9881254778587061697), KQU( 8413223156302299551), + KQU( 1563841828254089168), KQU( 9996032758786671975), + KQU( 138877700583772667), KQU(13003043368574995989), + KQU( 4390573668650456587), KQU( 8610287390568126755), + KQU(15126904974266642199), KQU( 6703637238986057662), + KQU( 2873075592956810157), KQU( 6035080933946049418), + KQU(13382846581202353014), KQU( 7303971031814642463), + KQU(18418024405307444267), KQU( 5847096731675404647), + KQU( 4035880699639842500), KQU(11525348625112218478), + KQU( 3041162365459574102), KQU( 2604734487727986558), + KQU(15526341771636983145), KQU(14556052310697370254), + KQU(12997787077930808155), KQU( 9601806501755554499), + KQU(11349677952521423389), KQU(14956777807644899350), + KQU(16559736957742852721), KQU(12360828274778140726), + KQU( 6685373272009662513), KQU(16932258748055324130), + KQU(15918051131954158508), KQU( 1692312913140790144), + KQU( 546653826801637367), KQU( 5341587076045986652), + KQU(14975057236342585662), KQU(12374976357340622412), + KQU(10328833995181940552), KQU(12831807101710443149), + KQU(10548514914382545716), KQU( 2217806727199715993), + KQU(12627067369242845138), KQU( 4598965364035438158), + KQU( 150923352751318171), KQU(14274109544442257283), + KQU( 4696661475093863031), KQU( 1505764114384654516), + KQU(10699185831891495147), KQU( 2392353847713620519), + KQU( 3652870166711788383), KQU( 8640653276221911108), + KQU( 3894077592275889704), KQU( 4918592872135964845), + KQU(16379121273281400789), KQU(12058465483591683656), + KQU(11250106829302924945), KQU( 1147537556296983005), + KQU( 6376342756004613268), KQU(14967128191709280506), + KQU(18007449949790627628), KQU( 9497178279316537841), + KQU( 7920174844809394893), KQU(10037752595255719907), + KQU(15875342784985217697), KQU(15311615921712850696), + KQU( 9552902652110992950), KQU(14054979450099721140), + KQU( 5998709773566417349), KQU(18027910339276320187), + KQU( 8223099053868585554), KQU( 7842270354824999767), + KQU( 4896315688770080292), KQU(12969320296569787895), + KQU( 2674321489185759961), KQU( 4053615936864718439), + KQU(11349775270588617578), KQU( 4743019256284553975), + KQU( 5602100217469723769), KQU(14398995691411527813), + KQU( 7412170493796825470), KQU( 836262406131744846), + KQU( 8231086633845153022), KQU( 5161377920438552287), + KQU( 8828731196169924949), KQU(16211142246465502680), + KQU( 3307990879253687818), KQU( 5193405406899782022), + KQU( 8510842117467566693), KQU( 6070955181022405365), + KQU(14482950231361409799), KQU(12585159371331138077), + KQU( 3511537678933588148), KQU( 2041849474531116417), + KQU(10944936685095345792), KQU(18303116923079107729), + KQU( 2720566371239725320), KQU( 4958672473562397622), + KQU( 3032326668253243412), KQU(13689418691726908338), + KQU( 
1895205511728843996), KQU( 8146303515271990527), + KQU(16507343500056113480), KQU( 473996939105902919), + KQU( 9897686885246881481), KQU(14606433762712790575), + KQU( 6732796251605566368), KQU( 1399778120855368916), + KQU( 935023885182833777), KQU(16066282816186753477), + KQU( 7291270991820612055), KQU(17530230393129853844), + KQU(10223493623477451366), KQU(15841725630495676683), + KQU(17379567246435515824), KQU( 8588251429375561971), + KQU(18339511210887206423), KQU(17349587430725976100), + KQU(12244876521394838088), KQU( 6382187714147161259), + KQU(12335807181848950831), KQU(16948885622305460665), + KQU(13755097796371520506), KQU(14806740373324947801), + KQU( 4828699633859287703), KQU( 8209879281452301604), + KQU(12435716669553736437), KQU(13970976859588452131), + KQU( 6233960842566773148), KQU(12507096267900505759), + KQU( 1198713114381279421), KQU(14989862731124149015), + KQU(15932189508707978949), KQU( 2526406641432708722), + KQU( 29187427817271982), KQU( 1499802773054556353), + KQU(10816638187021897173), KQU( 5436139270839738132), + KQU( 6659882287036010082), KQU( 2154048955317173697), + KQU(10887317019333757642), KQU(16281091802634424955), + KQU(10754549879915384901), KQU(10760611745769249815), + KQU( 2161505946972504002), KQU( 5243132808986265107), + KQU(10129852179873415416), KQU( 710339480008649081), + KQU( 7802129453068808528), KQU(17967213567178907213), + KQU(15730859124668605599), KQU(13058356168962376502), + KQU( 3701224985413645909), KQU(14464065869149109264), + KQU( 9959272418844311646), KQU(10157426099515958752), + KQU(14013736814538268528), KQU(17797456992065653951), + KQU(17418878140257344806), KQU(15457429073540561521), + KQU( 2184426881360949378), KQU( 2062193041154712416), + KQU( 8553463347406931661), KQU( 4913057625202871854), + KQU( 2668943682126618425), KQU(17064444737891172288), + KQU( 4997115903913298637), KQU(12019402608892327416), + KQU(17603584559765897352), KQU(11367529582073647975), + KQU( 8211476043518436050), KQU( 8676849804070323674), + KQU(18431829230394475730), KQU(10490177861361247904), + KQU( 9508720602025651349), KQU( 7409627448555722700), + KQU( 5804047018862729008), KQU(11943858176893142594), + KQU(11908095418933847092), KQU( 5415449345715887652), + KQU( 1554022699166156407), KQU( 9073322106406017161), + KQU( 7080630967969047082), KQU(18049736940860732943), + KQU(12748714242594196794), KQU( 1226992415735156741), + KQU(17900981019609531193), KQU(11720739744008710999), + KQU( 3006400683394775434), KQU(11347974011751996028), + KQU( 3316999628257954608), KQU( 8384484563557639101), + KQU(18117794685961729767), KQU( 1900145025596618194), + KQU(17459527840632892676), KQU( 5634784101865710994), + KQU( 7918619300292897158), KQU( 3146577625026301350), + KQU( 9955212856499068767), KQU( 1873995843681746975), + KQU( 1561487759967972194), KQU( 8322718804375878474), + KQU(11300284215327028366), KQU( 4667391032508998982), + KQU( 9820104494306625580), KQU(17922397968599970610), + KQU( 1784690461886786712), KQU(14940365084341346821), + KQU( 5348719575594186181), KQU(10720419084507855261), + KQU(14210394354145143274), KQU( 2426468692164000131), + KQU(16271062114607059202), KQU(14851904092357070247), + KQU( 6524493015693121897), KQU( 9825473835127138531), + KQU(14222500616268569578), KQU(15521484052007487468), + KQU(14462579404124614699), KQU(11012375590820665520), + KQU(11625327350536084927), KQU(14452017765243785417), + KQU( 9989342263518766305), KQU( 3640105471101803790), + KQU( 4749866455897513242), KQU(13963064946736312044), + KQU(10007416591973223791), 
KQU(18314132234717431115), + KQU( 3286596588617483450), KQU( 7726163455370818765), + KQU( 7575454721115379328), KQU( 5308331576437663422), + KQU(18288821894903530934), KQU( 8028405805410554106), + KQU(15744019832103296628), KQU( 149765559630932100), + KQU( 6137705557200071977), KQU(14513416315434803615), + KQU(11665702820128984473), KQU( 218926670505601386), + KQU( 6868675028717769519), KQU(15282016569441512302), + KQU( 5707000497782960236), KQU( 6671120586555079567), + KQU( 2194098052618985448), KQU(16849577895477330978), + KQU(12957148471017466283), KQU( 1997805535404859393), + KQU( 1180721060263860490), KQU(13206391310193756958), + KQU(12980208674461861797), KQU( 3825967775058875366), + KQU(17543433670782042631), KQU( 1518339070120322730), + KQU(16344584340890991669), KQU( 2611327165318529819), + KQU(11265022723283422529), KQU( 4001552800373196817), + KQU(14509595890079346161), KQU( 3528717165416234562), + KQU(18153222571501914072), KQU( 9387182977209744425), + KQU(10064342315985580021), KQU(11373678413215253977), + KQU( 2308457853228798099), KQU( 9729042942839545302), + KQU( 7833785471140127746), KQU( 6351049900319844436), + KQU(14454610627133496067), KQU(12533175683634819111), + KQU(15570163926716513029), KQU(13356980519185762498) }; TEST_BEGIN(test_gen_rand_32) diff --git a/test/unit/util.c b/test/unit/util.c index c11d598..8ab39a4 100644 --- a/test/unit/util.c +++ b/test/unit/util.c @@ -52,8 +52,8 @@ TEST_BEGIN(test_malloc_strtoumax) const char *expected_errno_name; uintmax_t expected_x; }; -#define ERR(e) e, #e -#define UMAX(x) ((uintmax_t)x##ULL) +#define ERR(e) e, #e +#define KUMAX(x) ((uintmax_t)x##ULL) struct test_s tests[] = { {"0", "0", -1, ERR(EINVAL), UINTMAX_MAX}, {"0", "0", 1, ERR(EINVAL), UINTMAX_MAX}, @@ -64,51 +64,51 @@ TEST_BEGIN(test_malloc_strtoumax) {"++3", "++3", 0, ERR(EINVAL), UINTMAX_MAX}, {"-", "-", 0, ERR(EINVAL), UINTMAX_MAX}, - {"42", "", 0, ERR(0), UMAX(42)}, - {"+42", "", 0, ERR(0), UMAX(42)}, - {"-42", "", 0, ERR(0), UMAX(-42)}, - {"042", "", 0, ERR(0), UMAX(042)}, - {"+042", "", 0, ERR(0), UMAX(042)}, - {"-042", "", 0, ERR(0), UMAX(-042)}, - {"0x42", "", 0, ERR(0), UMAX(0x42)}, - {"+0x42", "", 0, ERR(0), UMAX(0x42)}, - {"-0x42", "", 0, ERR(0), UMAX(-0x42)}, - - {"0", "", 0, ERR(0), UMAX(0)}, - {"1", "", 0, ERR(0), UMAX(1)}, - - {"42", "", 0, ERR(0), UMAX(42)}, - {" 42", "", 0, ERR(0), UMAX(42)}, - {"42 ", " ", 0, ERR(0), UMAX(42)}, - {"0x", "x", 0, ERR(0), UMAX(0)}, - {"42x", "x", 0, ERR(0), UMAX(42)}, - - {"07", "", 0, ERR(0), UMAX(7)}, - {"010", "", 0, ERR(0), UMAX(8)}, - {"08", "8", 0, ERR(0), UMAX(0)}, - {"0_", "_", 0, ERR(0), UMAX(0)}, - - {"0x", "x", 0, ERR(0), UMAX(0)}, - {"0X", "X", 0, ERR(0), UMAX(0)}, - {"0xg", "xg", 0, ERR(0), UMAX(0)}, - {"0XA", "", 0, ERR(0), UMAX(10)}, - - {"010", "", 10, ERR(0), UMAX(10)}, - {"0x3", "x3", 10, ERR(0), UMAX(0)}, - - {"12", "2", 2, ERR(0), UMAX(1)}, - {"78", "8", 8, ERR(0), UMAX(7)}, - {"9a", "a", 10, ERR(0), UMAX(9)}, - {"9A", "A", 10, ERR(0), UMAX(9)}, - {"fg", "g", 16, ERR(0), UMAX(15)}, - {"FG", "G", 16, ERR(0), UMAX(15)}, - {"0xfg", "g", 16, ERR(0), UMAX(15)}, - {"0XFG", "G", 16, ERR(0), UMAX(15)}, - {"z_", "_", 36, ERR(0), UMAX(35)}, - {"Z_", "_", 36, ERR(0), UMAX(35)} + {"42", "", 0, ERR(0), KUMAX(42)}, + {"+42", "", 0, ERR(0), KUMAX(42)}, + {"-42", "", 0, ERR(0), KUMAX(-42)}, + {"042", "", 0, ERR(0), KUMAX(042)}, + {"+042", "", 0, ERR(0), KUMAX(042)}, + {"-042", "", 0, ERR(0), KUMAX(-042)}, + {"0x42", "", 0, ERR(0), KUMAX(0x42)}, + {"+0x42", "", 0, ERR(0), KUMAX(0x42)}, + {"-0x42", "", 0, ERR(0), 
KUMAX(-0x42)}, + + {"0", "", 0, ERR(0), KUMAX(0)}, + {"1", "", 0, ERR(0), KUMAX(1)}, + + {"42", "", 0, ERR(0), KUMAX(42)}, + {" 42", "", 0, ERR(0), KUMAX(42)}, + {"42 ", " ", 0, ERR(0), KUMAX(42)}, + {"0x", "x", 0, ERR(0), KUMAX(0)}, + {"42x", "x", 0, ERR(0), KUMAX(42)}, + + {"07", "", 0, ERR(0), KUMAX(7)}, + {"010", "", 0, ERR(0), KUMAX(8)}, + {"08", "8", 0, ERR(0), KUMAX(0)}, + {"0_", "_", 0, ERR(0), KUMAX(0)}, + + {"0x", "x", 0, ERR(0), KUMAX(0)}, + {"0X", "X", 0, ERR(0), KUMAX(0)}, + {"0xg", "xg", 0, ERR(0), KUMAX(0)}, + {"0XA", "", 0, ERR(0), KUMAX(10)}, + + {"010", "", 10, ERR(0), KUMAX(10)}, + {"0x3", "x3", 10, ERR(0), KUMAX(0)}, + + {"12", "2", 2, ERR(0), KUMAX(1)}, + {"78", "8", 8, ERR(0), KUMAX(7)}, + {"9a", "a", 10, ERR(0), KUMAX(9)}, + {"9A", "A", 10, ERR(0), KUMAX(9)}, + {"fg", "g", 16, ERR(0), KUMAX(15)}, + {"FG", "G", 16, ERR(0), KUMAX(15)}, + {"0xfg", "g", 16, ERR(0), KUMAX(15)}, + {"0XFG", "G", 16, ERR(0), KUMAX(15)}, + {"z_", "_", 36, ERR(0), KUMAX(35)}, + {"Z_", "_", 36, ERR(0), KUMAX(35)} }; #undef ERR -#undef UMAX +#undef KUMAX unsigned i; for (i = 0; i < sizeof(tests)/sizeof(struct test_s); i++) { -- cgit v0.12 From 999e1b5cc74e299a25cc718ddf9fae370cf45264 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Thu, 29 May 2014 09:03:00 +0900 Subject: Fix thd_join on win64 --- test/src/thd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/src/thd.c b/test/src/thd.c index 7e53625..c9d0065 100644 --- a/test/src/thd.c +++ b/test/src/thd.c @@ -14,8 +14,11 @@ void thd_join(thd_t thd, void **ret) { - if (WaitForSingleObject(thd, INFINITE) == WAIT_OBJECT_0 && ret) - GetExitCodeThread(thd, (LPDWORD) ret); + if (WaitForSingleObject(thd, INFINITE) == WAIT_OBJECT_0 && ret) { + DWORD exit_code; + GetExitCodeThread(thd, (LPDWORD) &exit_code); + *ret = (void *)(uintptr_t)exit_code; + } } #else -- cgit v0.12 From ff2e999667cbd06e5e80c243277c1f3c72d6d263 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Thu, 29 May 2014 16:33:02 +0900 Subject: Don't use msvc_compat's C99 headers with MSVC versions that have (some) C99 support --- configure.ac | 4 + include/msvc_compat/C99/inttypes.h | 313 +++++++++++++++++++++++++++++++++++++ include/msvc_compat/C99/stdbool.h | 16 ++ include/msvc_compat/C99/stdint.h | 247 +++++++++++++++++++++++++++++ include/msvc_compat/inttypes.h | 313 ------------------------------------- include/msvc_compat/stdbool.h | 16 -- include/msvc_compat/stdint.h | 247 ----------------------------- 7 files changed, 580 insertions(+), 576 deletions(-) create mode 100644 include/msvc_compat/C99/inttypes.h create mode 100644 include/msvc_compat/C99/stdbool.h create mode 100644 include/msvc_compat/C99/stdint.h delete mode 100644 include/msvc_compat/inttypes.h delete mode 100644 include/msvc_compat/stdbool.h delete mode 100644 include/msvc_compat/stdint.h diff --git a/configure.ac b/configure.ac index 5852249..5aeaa08 100644 --- a/configure.ac +++ b/configure.ac @@ -155,6 +155,10 @@ if test "x${ac_cv_big_endian}" = "x1" ; then AC_DEFINE_UNQUOTED([JEMALLOC_BIG_ENDIAN], [ ]) fi +if test "x${je_cv_msvc}" = "xyes" -a "x${ac_cv_header_inttypes_h}" = "xno"; then + CPPFLAGS="$CPPFLAGS -I${srcroot}/include/msvc_compat/C99" +fi + AC_CHECK_SIZEOF([void *]) if test "x${ac_cv_sizeof_void_p}" = "x8" ; then LG_SIZEOF_PTR=3 diff --git a/include/msvc_compat/C99/inttypes.h b/include/msvc_compat/C99/inttypes.h new file mode 100644 index 0000000..a4e6b75 --- /dev/null +++ b/include/msvc_compat/C99/inttypes.h @@ -0,0 +1,313 @@ +// ISO C9x compliant inttypes.h for Microsoft 
Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef _MSC_INTTYPES_H_ // [ +#define _MSC_INTTYPES_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include "stdint.h" + +// 7.8 Format conversion of integer types + +typedef struct { + intmax_t quot; + intmax_t rem; +} imaxdiv_t; + +// 7.8.1 Macros for format specifiers + +#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 + +#ifdef _WIN64 +# define __PRI64_PREFIX "l" +# define __PRIPTR_PREFIX "l" +#else +# define __PRI64_PREFIX "ll" +# define __PRIPTR_PREFIX +#endif + +// The fprintf macros for signed integers are: +#define PRId8 "d" +#define PRIi8 "i" +#define PRIdLEAST8 "d" +#define PRIiLEAST8 "i" +#define PRIdFAST8 "d" +#define PRIiFAST8 "i" + +#define PRId16 "hd" +#define PRIi16 "hi" +#define PRIdLEAST16 "hd" +#define PRIiLEAST16 "hi" +#define PRIdFAST16 "hd" +#define PRIiFAST16 "hi" + +#define PRId32 "d" +#define PRIi32 "i" +#define PRIdLEAST32 "d" +#define PRIiLEAST32 "i" +#define PRIdFAST32 "d" +#define PRIiFAST32 "i" + +#define PRId64 __PRI64_PREFIX "d" +#define PRIi64 __PRI64_PREFIX "i" +#define PRIdLEAST64 __PRI64_PREFIX "d" +#define PRIiLEAST64 __PRI64_PREFIX "i" +#define PRIdFAST64 __PRI64_PREFIX "d" +#define PRIiFAST64 __PRI64_PREFIX "i" + +#define PRIdMAX __PRI64_PREFIX "d" +#define PRIiMAX __PRI64_PREFIX "i" + +#define PRIdPTR __PRIPTR_PREFIX "d" +#define PRIiPTR __PRIPTR_PREFIX "i" + +// The fprintf macros for unsigned integers are: +#define PRIo8 "o" +#define PRIu8 "u" +#define PRIx8 "x" +#define PRIX8 "X" +#define PRIoLEAST8 "o" +#define PRIuLEAST8 "u" +#define PRIxLEAST8 "x" +#define PRIXLEAST8 "X" +#define PRIoFAST8 "o" +#define PRIuFAST8 "u" +#define PRIxFAST8 "x" +#define PRIXFAST8 "X" + +#define PRIo16 "ho" +#define PRIu16 "hu" +#define PRIx16 "hx" +#define PRIX16 "hX" +#define PRIoLEAST16 "ho" +#define 
PRIuLEAST16 "hu" +#define PRIxLEAST16 "hx" +#define PRIXLEAST16 "hX" +#define PRIoFAST16 "ho" +#define PRIuFAST16 "hu" +#define PRIxFAST16 "hx" +#define PRIXFAST16 "hX" + +#define PRIo32 "o" +#define PRIu32 "u" +#define PRIx32 "x" +#define PRIX32 "X" +#define PRIoLEAST32 "o" +#define PRIuLEAST32 "u" +#define PRIxLEAST32 "x" +#define PRIXLEAST32 "X" +#define PRIoFAST32 "o" +#define PRIuFAST32 "u" +#define PRIxFAST32 "x" +#define PRIXFAST32 "X" + +#define PRIo64 __PRI64_PREFIX "o" +#define PRIu64 __PRI64_PREFIX "u" +#define PRIx64 __PRI64_PREFIX "x" +#define PRIX64 __PRI64_PREFIX "X" +#define PRIoLEAST64 __PRI64_PREFIX "o" +#define PRIuLEAST64 __PRI64_PREFIX "u" +#define PRIxLEAST64 __PRI64_PREFIX "x" +#define PRIXLEAST64 __PRI64_PREFIX "X" +#define PRIoFAST64 __PRI64_PREFIX "o" +#define PRIuFAST64 __PRI64_PREFIX "u" +#define PRIxFAST64 __PRI64_PREFIX "x" +#define PRIXFAST64 __PRI64_PREFIX "X" + +#define PRIoMAX __PRI64_PREFIX "o" +#define PRIuMAX __PRI64_PREFIX "u" +#define PRIxMAX __PRI64_PREFIX "x" +#define PRIXMAX __PRI64_PREFIX "X" + +#define PRIoPTR __PRIPTR_PREFIX "o" +#define PRIuPTR __PRIPTR_PREFIX "u" +#define PRIxPTR __PRIPTR_PREFIX "x" +#define PRIXPTR __PRIPTR_PREFIX "X" + +// The fscanf macros for signed integers are: +#define SCNd8 "d" +#define SCNi8 "i" +#define SCNdLEAST8 "d" +#define SCNiLEAST8 "i" +#define SCNdFAST8 "d" +#define SCNiFAST8 "i" + +#define SCNd16 "hd" +#define SCNi16 "hi" +#define SCNdLEAST16 "hd" +#define SCNiLEAST16 "hi" +#define SCNdFAST16 "hd" +#define SCNiFAST16 "hi" + +#define SCNd32 "ld" +#define SCNi32 "li" +#define SCNdLEAST32 "ld" +#define SCNiLEAST32 "li" +#define SCNdFAST32 "ld" +#define SCNiFAST32 "li" + +#define SCNd64 "I64d" +#define SCNi64 "I64i" +#define SCNdLEAST64 "I64d" +#define SCNiLEAST64 "I64i" +#define SCNdFAST64 "I64d" +#define SCNiFAST64 "I64i" + +#define SCNdMAX "I64d" +#define SCNiMAX "I64i" + +#ifdef _WIN64 // [ +# define SCNdPTR "I64d" +# define SCNiPTR "I64i" +#else // _WIN64 ][ +# define SCNdPTR "ld" +# define SCNiPTR "li" +#endif // _WIN64 ] + +// The fscanf macros for unsigned integers are: +#define SCNo8 "o" +#define SCNu8 "u" +#define SCNx8 "x" +#define SCNX8 "X" +#define SCNoLEAST8 "o" +#define SCNuLEAST8 "u" +#define SCNxLEAST8 "x" +#define SCNXLEAST8 "X" +#define SCNoFAST8 "o" +#define SCNuFAST8 "u" +#define SCNxFAST8 "x" +#define SCNXFAST8 "X" + +#define SCNo16 "ho" +#define SCNu16 "hu" +#define SCNx16 "hx" +#define SCNX16 "hX" +#define SCNoLEAST16 "ho" +#define SCNuLEAST16 "hu" +#define SCNxLEAST16 "hx" +#define SCNXLEAST16 "hX" +#define SCNoFAST16 "ho" +#define SCNuFAST16 "hu" +#define SCNxFAST16 "hx" +#define SCNXFAST16 "hX" + +#define SCNo32 "lo" +#define SCNu32 "lu" +#define SCNx32 "lx" +#define SCNX32 "lX" +#define SCNoLEAST32 "lo" +#define SCNuLEAST32 "lu" +#define SCNxLEAST32 "lx" +#define SCNXLEAST32 "lX" +#define SCNoFAST32 "lo" +#define SCNuFAST32 "lu" +#define SCNxFAST32 "lx" +#define SCNXFAST32 "lX" + +#define SCNo64 "I64o" +#define SCNu64 "I64u" +#define SCNx64 "I64x" +#define SCNX64 "I64X" +#define SCNoLEAST64 "I64o" +#define SCNuLEAST64 "I64u" +#define SCNxLEAST64 "I64x" +#define SCNXLEAST64 "I64X" +#define SCNoFAST64 "I64o" +#define SCNuFAST64 "I64u" +#define SCNxFAST64 "I64x" +#define SCNXFAST64 "I64X" + +#define SCNoMAX "I64o" +#define SCNuMAX "I64u" +#define SCNxMAX "I64x" +#define SCNXMAX "I64X" + +#ifdef _WIN64 // [ +# define SCNoPTR "I64o" +# define SCNuPTR "I64u" +# define SCNxPTR "I64x" +# define SCNXPTR "I64X" +#else // _WIN64 ][ +# define SCNoPTR "lo" +# define SCNuPTR "lu" +# define SCNxPTR 
"lx" +# define SCNXPTR "lX" +#endif // _WIN64 ] + +#endif // __STDC_FORMAT_MACROS ] + +// 7.8.2 Functions for greatest-width integer types + +// 7.8.2.1 The imaxabs function +#define imaxabs _abs64 + +// 7.8.2.2 The imaxdiv function + +// This is modified version of div() function from Microsoft's div.c found +// in %MSVC.NET%\crt\src\div.c +#ifdef STATIC_IMAXDIV // [ +static +#else // STATIC_IMAXDIV ][ +_inline +#endif // STATIC_IMAXDIV ] +imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) +{ + imaxdiv_t result; + + result.quot = numer / denom; + result.rem = numer % denom; + + if (numer < 0 && result.rem > 0) { + // did division wrong; must fix up + ++result.quot; + result.rem -= denom; + } + + return result; +} + +// 7.8.2.3 The strtoimax and strtoumax functions +#define strtoimax _strtoi64 +#define strtoumax _strtoui64 + +// 7.8.2.4 The wcstoimax and wcstoumax functions +#define wcstoimax _wcstoi64 +#define wcstoumax _wcstoui64 + + +#endif // _MSC_INTTYPES_H_ ] diff --git a/include/msvc_compat/C99/stdbool.h b/include/msvc_compat/C99/stdbool.h new file mode 100644 index 0000000..da9ee8b --- /dev/null +++ b/include/msvc_compat/C99/stdbool.h @@ -0,0 +1,16 @@ +#ifndef stdbool_h +#define stdbool_h + +#include + +/* MSVC doesn't define _Bool or bool in C, but does have BOOL */ +/* Note this doesn't pass autoconf's test because (bool) 0.5 != true */ +typedef BOOL _Bool; + +#define bool _Bool +#define true 1 +#define false 0 + +#define __bool_true_false_are_defined 1 + +#endif /* stdbool_h */ diff --git a/include/msvc_compat/C99/stdint.h b/include/msvc_compat/C99/stdint.h new file mode 100644 index 0000000..d02608a --- /dev/null +++ b/include/msvc_compat/C99/stdint.h @@ -0,0 +1,247 @@ +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2008 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" 
+#endif // _MSC_VER ] + +#ifndef _MSC_STDINT_H_ // [ +#define _MSC_STDINT_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include + +// For Visual Studio 6 in C++ mode and for many Visual Studio versions when +// compiling for ARM we should wrap include with 'extern "C++" {}' +// or compiler give many errors like this: +// error C2733: second C linkage of overloaded function 'wmemchr' not allowed +#ifdef __cplusplus +extern "C" { +#endif +# include +#ifdef __cplusplus +} +#endif + +// Define _W64 macros to mark types changing their size, like intptr_t. +#ifndef _W64 +# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 +# define _W64 __w64 +# else +# define _W64 +# endif +#endif + + +// 7.18.1 Integer types + +// 7.18.1.1 Exact-width integer types + +// Visual Studio 6 and Embedded Visual C++ 4 doesn't +// realize that, e.g. char has the same size as __int8 +// so we give up on __intX for them. +#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest 
minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// WCHAR_MIN and WCHAR_MAX are also defined in +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +#define INTMAX_C INT64_C +#define UINTMAX_C UINT64_C + +#endif // __STDC_CONSTANT_MACROS ] + + +#endif // _MSC_STDINT_H_ ] diff --git a/include/msvc_compat/inttypes.h b/include/msvc_compat/inttypes.h deleted file mode 100644 index a4e6b75..0000000 --- a/include/msvc_compat/inttypes.h +++ /dev/null @@ -1,313 +0,0 @@ -// ISO C9x compliant inttypes.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_INTTYPES_H_ // [ -#define _MSC_INTTYPES_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include "stdint.h" - -// 7.8 Format conversion of integer types - -typedef struct { - intmax_t quot; - intmax_t rem; -} imaxdiv_t; - -// 7.8.1 Macros for format specifiers - -#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 - -#ifdef _WIN64 -# define __PRI64_PREFIX "l" -# define __PRIPTR_PREFIX "l" -#else -# define __PRI64_PREFIX "ll" -# define __PRIPTR_PREFIX -#endif - -// The fprintf macros for signed integers are: -#define PRId8 "d" -#define PRIi8 "i" -#define PRIdLEAST8 "d" -#define PRIiLEAST8 "i" -#define PRIdFAST8 "d" -#define PRIiFAST8 "i" - -#define PRId16 "hd" -#define PRIi16 "hi" -#define PRIdLEAST16 "hd" -#define PRIiLEAST16 "hi" -#define PRIdFAST16 "hd" -#define PRIiFAST16 "hi" - -#define PRId32 "d" -#define PRIi32 "i" -#define PRIdLEAST32 "d" -#define PRIiLEAST32 "i" -#define PRIdFAST32 "d" -#define PRIiFAST32 "i" - -#define PRId64 __PRI64_PREFIX "d" -#define PRIi64 __PRI64_PREFIX "i" -#define PRIdLEAST64 __PRI64_PREFIX "d" -#define PRIiLEAST64 __PRI64_PREFIX "i" -#define PRIdFAST64 __PRI64_PREFIX "d" -#define PRIiFAST64 __PRI64_PREFIX "i" - -#define PRIdMAX __PRI64_PREFIX "d" -#define PRIiMAX __PRI64_PREFIX "i" - -#define PRIdPTR __PRIPTR_PREFIX "d" -#define PRIiPTR __PRIPTR_PREFIX "i" - -// The fprintf macros for unsigned integers are: -#define PRIo8 "o" -#define PRIu8 "u" -#define PRIx8 "x" -#define PRIX8 "X" -#define PRIoLEAST8 "o" -#define PRIuLEAST8 "u" -#define PRIxLEAST8 "x" -#define PRIXLEAST8 "X" -#define PRIoFAST8 "o" -#define PRIuFAST8 "u" -#define PRIxFAST8 "x" -#define PRIXFAST8 "X" - -#define PRIo16 "ho" -#define PRIu16 "hu" -#define PRIx16 "hx" -#define PRIX16 "hX" -#define PRIoLEAST16 "ho" -#define PRIuLEAST16 "hu" -#define PRIxLEAST16 "hx" -#define PRIXLEAST16 "hX" -#define PRIoFAST16 "ho" -#define PRIuFAST16 "hu" -#define PRIxFAST16 "hx" -#define PRIXFAST16 "hX" - -#define PRIo32 "o" -#define PRIu32 "u" -#define PRIx32 "x" -#define PRIX32 "X" -#define PRIoLEAST32 "o" -#define PRIuLEAST32 "u" -#define PRIxLEAST32 "x" -#define PRIXLEAST32 "X" -#define PRIoFAST32 "o" -#define PRIuFAST32 "u" -#define PRIxFAST32 "x" -#define PRIXFAST32 "X" - -#define PRIo64 __PRI64_PREFIX "o" -#define PRIu64 __PRI64_PREFIX "u" -#define PRIx64 __PRI64_PREFIX "x" -#define PRIX64 __PRI64_PREFIX "X" -#define PRIoLEAST64 __PRI64_PREFIX "o" -#define PRIuLEAST64 __PRI64_PREFIX "u" -#define PRIxLEAST64 __PRI64_PREFIX "x" -#define PRIXLEAST64 __PRI64_PREFIX "X" -#define PRIoFAST64 __PRI64_PREFIX "o" -#define 
PRIuFAST64 __PRI64_PREFIX "u" -#define PRIxFAST64 __PRI64_PREFIX "x" -#define PRIXFAST64 __PRI64_PREFIX "X" - -#define PRIoMAX __PRI64_PREFIX "o" -#define PRIuMAX __PRI64_PREFIX "u" -#define PRIxMAX __PRI64_PREFIX "x" -#define PRIXMAX __PRI64_PREFIX "X" - -#define PRIoPTR __PRIPTR_PREFIX "o" -#define PRIuPTR __PRIPTR_PREFIX "u" -#define PRIxPTR __PRIPTR_PREFIX "x" -#define PRIXPTR __PRIPTR_PREFIX "X" - -// The fscanf macros for signed integers are: -#define SCNd8 "d" -#define SCNi8 "i" -#define SCNdLEAST8 "d" -#define SCNiLEAST8 "i" -#define SCNdFAST8 "d" -#define SCNiFAST8 "i" - -#define SCNd16 "hd" -#define SCNi16 "hi" -#define SCNdLEAST16 "hd" -#define SCNiLEAST16 "hi" -#define SCNdFAST16 "hd" -#define SCNiFAST16 "hi" - -#define SCNd32 "ld" -#define SCNi32 "li" -#define SCNdLEAST32 "ld" -#define SCNiLEAST32 "li" -#define SCNdFAST32 "ld" -#define SCNiFAST32 "li" - -#define SCNd64 "I64d" -#define SCNi64 "I64i" -#define SCNdLEAST64 "I64d" -#define SCNiLEAST64 "I64i" -#define SCNdFAST64 "I64d" -#define SCNiFAST64 "I64i" - -#define SCNdMAX "I64d" -#define SCNiMAX "I64i" - -#ifdef _WIN64 // [ -# define SCNdPTR "I64d" -# define SCNiPTR "I64i" -#else // _WIN64 ][ -# define SCNdPTR "ld" -# define SCNiPTR "li" -#endif // _WIN64 ] - -// The fscanf macros for unsigned integers are: -#define SCNo8 "o" -#define SCNu8 "u" -#define SCNx8 "x" -#define SCNX8 "X" -#define SCNoLEAST8 "o" -#define SCNuLEAST8 "u" -#define SCNxLEAST8 "x" -#define SCNXLEAST8 "X" -#define SCNoFAST8 "o" -#define SCNuFAST8 "u" -#define SCNxFAST8 "x" -#define SCNXFAST8 "X" - -#define SCNo16 "ho" -#define SCNu16 "hu" -#define SCNx16 "hx" -#define SCNX16 "hX" -#define SCNoLEAST16 "ho" -#define SCNuLEAST16 "hu" -#define SCNxLEAST16 "hx" -#define SCNXLEAST16 "hX" -#define SCNoFAST16 "ho" -#define SCNuFAST16 "hu" -#define SCNxFAST16 "hx" -#define SCNXFAST16 "hX" - -#define SCNo32 "lo" -#define SCNu32 "lu" -#define SCNx32 "lx" -#define SCNX32 "lX" -#define SCNoLEAST32 "lo" -#define SCNuLEAST32 "lu" -#define SCNxLEAST32 "lx" -#define SCNXLEAST32 "lX" -#define SCNoFAST32 "lo" -#define SCNuFAST32 "lu" -#define SCNxFAST32 "lx" -#define SCNXFAST32 "lX" - -#define SCNo64 "I64o" -#define SCNu64 "I64u" -#define SCNx64 "I64x" -#define SCNX64 "I64X" -#define SCNoLEAST64 "I64o" -#define SCNuLEAST64 "I64u" -#define SCNxLEAST64 "I64x" -#define SCNXLEAST64 "I64X" -#define SCNoFAST64 "I64o" -#define SCNuFAST64 "I64u" -#define SCNxFAST64 "I64x" -#define SCNXFAST64 "I64X" - -#define SCNoMAX "I64o" -#define SCNuMAX "I64u" -#define SCNxMAX "I64x" -#define SCNXMAX "I64X" - -#ifdef _WIN64 // [ -# define SCNoPTR "I64o" -# define SCNuPTR "I64u" -# define SCNxPTR "I64x" -# define SCNXPTR "I64X" -#else // _WIN64 ][ -# define SCNoPTR "lo" -# define SCNuPTR "lu" -# define SCNxPTR "lx" -# define SCNXPTR "lX" -#endif // _WIN64 ] - -#endif // __STDC_FORMAT_MACROS ] - -// 7.8.2 Functions for greatest-width integer types - -// 7.8.2.1 The imaxabs function -#define imaxabs _abs64 - -// 7.8.2.2 The imaxdiv function - -// This is modified version of div() function from Microsoft's div.c found -// in %MSVC.NET%\crt\src\div.c -#ifdef STATIC_IMAXDIV // [ -static -#else // STATIC_IMAXDIV ][ -_inline -#endif // STATIC_IMAXDIV ] -imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) -{ - imaxdiv_t result; - - result.quot = numer / denom; - result.rem = numer % denom; - - if (numer < 0 && result.rem > 0) { - // did division wrong; must fix up - ++result.quot; - result.rem -= denom; - } - - return result; -} - -// 7.8.2.3 The strtoimax and strtoumax functions -#define 
strtoimax _strtoi64 -#define strtoumax _strtoui64 - -// 7.8.2.4 The wcstoimax and wcstoumax functions -#define wcstoimax _wcstoi64 -#define wcstoumax _wcstoui64 - - -#endif // _MSC_INTTYPES_H_ ] diff --git a/include/msvc_compat/stdbool.h b/include/msvc_compat/stdbool.h deleted file mode 100644 index da9ee8b..0000000 --- a/include/msvc_compat/stdbool.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef stdbool_h -#define stdbool_h - -#include - -/* MSVC doesn't define _Bool or bool in C, but does have BOOL */ -/* Note this doesn't pass autoconf's test because (bool) 0.5 != true */ -typedef BOOL _Bool; - -#define bool _Bool -#define true 1 -#define false 0 - -#define __bool_true_false_are_defined 1 - -#endif /* stdbool_h */ diff --git a/include/msvc_compat/stdint.h b/include/msvc_compat/stdint.h deleted file mode 100644 index d02608a..0000000 --- a/include/msvc_compat/stdint.h +++ /dev/null @@ -1,247 +0,0 @@ -// ISO C9x compliant stdint.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006-2008 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_STDINT_H_ // [ -#define _MSC_STDINT_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include - -// For Visual Studio 6 in C++ mode and for many Visual Studio versions when -// compiling for ARM we should wrap include with 'extern "C++" {}' -// or compiler give many errors like this: -// error C2733: second C linkage of overloaded function 'wmemchr' not allowed -#ifdef __cplusplus -extern "C" { -#endif -# include -#ifdef __cplusplus -} -#endif - -// Define _W64 macros to mark types changing their size, like intptr_t. -#ifndef _W64 -# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 -# define _W64 __w64 -# else -# define _W64 -# endif -#endif - - -// 7.18.1 Integer types - -// 7.18.1.1 Exact-width integer types - -// Visual Studio 6 and Embedded Visual C++ 4 doesn't -// realize that, e.g. 
char has the same size as __int8 -// so we give up on __intX for them. -#if (_MSC_VER < 1300) - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; -#else - typedef signed __int8 int8_t; - typedef signed __int16 int16_t; - typedef signed __int32 int32_t; - typedef unsigned __int8 uint8_t; - typedef unsigned __int16 uint16_t; - typedef unsigned __int32 uint32_t; -#endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; - - -// 7.18.1.2 Minimum-width integer types -typedef int8_t int_least8_t; -typedef int16_t int_least16_t; -typedef int32_t int_least32_t; -typedef int64_t int_least64_t; -typedef uint8_t uint_least8_t; -typedef uint16_t uint_least16_t; -typedef uint32_t uint_least32_t; -typedef uint64_t uint_least64_t; - -// 7.18.1.3 Fastest minimum-width integer types -typedef int8_t int_fast8_t; -typedef int16_t int_fast16_t; -typedef int32_t int_fast32_t; -typedef int64_t int_fast64_t; -typedef uint8_t uint_fast8_t; -typedef uint16_t uint_fast16_t; -typedef uint32_t uint_fast32_t; -typedef uint64_t uint_fast64_t; - -// 7.18.1.4 Integer types capable of holding object pointers -#ifdef _WIN64 // [ - typedef signed __int64 intptr_t; - typedef unsigned __int64 uintptr_t; -#else // _WIN64 ][ - typedef _W64 signed int intptr_t; - typedef _W64 unsigned int uintptr_t; -#endif // _WIN64 ] - -// 7.18.1.5 Greatest-width integer types -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - - -// 7.18.2 Limits of specified-width integer types - -#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 - -// 7.18.2.1 Limits of exact-width integer types -#define INT8_MIN ((int8_t)_I8_MIN) -#define INT8_MAX _I8_MAX -#define INT16_MIN ((int16_t)_I16_MIN) -#define INT16_MAX _I16_MAX -#define INT32_MIN ((int32_t)_I32_MIN) -#define INT32_MAX _I32_MAX -#define INT64_MIN ((int64_t)_I64_MIN) -#define INT64_MAX _I64_MAX -#define UINT8_MAX _UI8_MAX -#define UINT16_MAX _UI16_MAX -#define UINT32_MAX _UI32_MAX -#define UINT64_MAX _UI64_MAX - -// 7.18.2.2 Limits of minimum-width integer types -#define INT_LEAST8_MIN INT8_MIN -#define INT_LEAST8_MAX INT8_MAX -#define INT_LEAST16_MIN INT16_MIN -#define INT_LEAST16_MAX INT16_MAX -#define INT_LEAST32_MIN INT32_MIN -#define INT_LEAST32_MAX INT32_MAX -#define INT_LEAST64_MIN INT64_MIN -#define INT_LEAST64_MAX INT64_MAX -#define UINT_LEAST8_MAX UINT8_MAX -#define UINT_LEAST16_MAX UINT16_MAX -#define UINT_LEAST32_MAX UINT32_MAX -#define UINT_LEAST64_MAX UINT64_MAX - -// 7.18.2.3 Limits of fastest minimum-width integer types -#define INT_FAST8_MIN INT8_MIN -#define INT_FAST8_MAX INT8_MAX -#define INT_FAST16_MIN INT16_MIN -#define INT_FAST16_MAX INT16_MAX -#define INT_FAST32_MIN INT32_MIN -#define INT_FAST32_MAX INT32_MAX -#define INT_FAST64_MIN INT64_MIN -#define INT_FAST64_MAX INT64_MAX -#define UINT_FAST8_MAX UINT8_MAX -#define UINT_FAST16_MAX UINT16_MAX -#define UINT_FAST32_MAX UINT32_MAX -#define UINT_FAST64_MAX UINT64_MAX - -// 7.18.2.4 Limits of integer types capable of holding object pointers -#ifdef _WIN64 // [ -# define INTPTR_MIN INT64_MIN -# define INTPTR_MAX INT64_MAX -# define UINTPTR_MAX UINT64_MAX -#else // _WIN64 ][ -# define INTPTR_MIN INT32_MIN -# define INTPTR_MAX INT32_MAX -# define UINTPTR_MAX UINT32_MAX -#endif // _WIN64 ] - -// 7.18.2.5 Limits of greatest-width integer types -#define INTMAX_MIN INT64_MIN -#define INTMAX_MAX INT64_MAX 
-#define UINTMAX_MAX UINT64_MAX - -// 7.18.3 Limits of other integer types - -#ifdef _WIN64 // [ -# define PTRDIFF_MIN _I64_MIN -# define PTRDIFF_MAX _I64_MAX -#else // _WIN64 ][ -# define PTRDIFF_MIN _I32_MIN -# define PTRDIFF_MAX _I32_MAX -#endif // _WIN64 ] - -#define SIG_ATOMIC_MIN INT_MIN -#define SIG_ATOMIC_MAX INT_MAX - -#ifndef SIZE_MAX // [ -# ifdef _WIN64 // [ -# define SIZE_MAX _UI64_MAX -# else // _WIN64 ][ -# define SIZE_MAX _UI32_MAX -# endif // _WIN64 ] -#endif // SIZE_MAX ] - -// WCHAR_MIN and WCHAR_MAX are also defined in -#ifndef WCHAR_MIN // [ -# define WCHAR_MIN 0 -#endif // WCHAR_MIN ] -#ifndef WCHAR_MAX // [ -# define WCHAR_MAX _UI16_MAX -#endif // WCHAR_MAX ] - -#define WINT_MIN 0 -#define WINT_MAX _UI16_MAX - -#endif // __STDC_LIMIT_MACROS ] - - -// 7.18.4 Limits of other integer types - -#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 - -// 7.18.4.1 Macros for minimum-width integer constants - -#define INT8_C(val) val##i8 -#define INT16_C(val) val##i16 -#define INT32_C(val) val##i32 -#define INT64_C(val) val##i64 - -#define UINT8_C(val) val##ui8 -#define UINT16_C(val) val##ui16 -#define UINT32_C(val) val##ui32 -#define UINT64_C(val) val##ui64 - -// 7.18.4.2 Macros for greatest-width integer constants -#define INTMAX_C INT64_C -#define UINTMAX_C UINT64_C - -#endif // __STDC_CONSTANT_MACROS ] - - -#endif // _MSC_STDINT_H_ ] -- cgit v0.12 From 8c6157558aca6cb764b4f312c3d4f285664ef3e7 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Thu, 29 May 2014 16:58:21 +0900 Subject: Add -FS flag to support parallel builds with MSVC 2013 --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index 5aeaa08..045f62e 100644 --- a/configure.ac +++ b/configure.ac @@ -141,6 +141,7 @@ if test "x$CFLAGS" = "x" ; then JE_CFLAGS_APPEND([-Zi]) JE_CFLAGS_APPEND([-MT]) JE_CFLAGS_APPEND([-W3]) + JE_CFLAGS_APPEND([-FS]) CPPFLAGS="$CPPFLAGS -I${srcroot}/include/msvc_compat" fi fi -- cgit v0.12 From 6f6704c35b28e919552a50e9e1d89a75a8b7c962 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Thu, 29 May 2014 17:01:10 +0900 Subject: Make in-tree MSVC builds work --- configure.ac | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 045f62e..4944c44 100644 --- a/configure.ac +++ b/configure.ac @@ -142,7 +142,7 @@ if test "x$CFLAGS" = "x" ; then JE_CFLAGS_APPEND([-MT]) JE_CFLAGS_APPEND([-W3]) JE_CFLAGS_APPEND([-FS]) - CPPFLAGS="$CPPFLAGS -I${srcroot}/include/msvc_compat" + CPPFLAGS="$CPPFLAGS -I${srcdir}/include/msvc_compat" fi fi dnl Append EXTRA_CFLAGS to CFLAGS, if defined. @@ -157,7 +157,7 @@ if test "x${ac_cv_big_endian}" = "x1" ; then fi if test "x${je_cv_msvc}" = "xyes" -a "x${ac_cv_header_inttypes_h}" = "xno"; then - CPPFLAGS="$CPPFLAGS -I${srcroot}/include/msvc_compat/C99" + CPPFLAGS="$CPPFLAGS -I${srcdir}/include/msvc_compat/C99" fi AC_CHECK_SIZEOF([void *]) -- cgit v0.12 From 0b5c92213fbafc52c5b5a5dc84e91eacc812ae0b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 1 Jun 2014 22:05:08 -0700 Subject: Fix fallback lg_floor() implementations. 
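For context, the fallback computes floor(log2(x)) without compiler intrinsics: OR-ing the most significant set bit into every lower position turns x into 2^(lg+1) - 1, the increment then produces the next power of two, and ffsl() reports that power's 1-based bit index, so subtracting 2 recovers the answer. The all-ones value must be special-cased because the increment would wrap to zero. Below is a minimal standalone sketch of the corrected logic, not the in-tree code (which is parameterized on LG_SIZEOF_PTR and uses jemalloc's KZU() macro):

#define _GNU_SOURCE	/* ffsl() is a GNU/BSD extension; headers vary by platform */
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <stdint.h>

/* floor(log2(x)) for x > 0; 64-bit variant of the generic fallback. */
static unsigned
lg_floor_sketch(uint64_t x)
{
	x |= (x >> 1);
	x |= (x >> 2);
	x |= (x >> 4);
	x |= (x >> 8);
	x |= (x >> 16);
	x |= (x >> 32);
	if (x == UINT64_C(0xffffffffffffffff))
		return (63);		/* the increment below would overflow to 0 */
	x++;				/* smallest power of two greater than the input */
	return ((unsigned)ffsl((long)x) - 2);
}

int
main(void)
{
	/* Prints: 0 1 2 4 12 */
	printf("%u %u %u %u %u\n", lg_floor_sketch(1), lg_floor_sketch(3),
	    lg_floor_sketch(4), lg_floor_sketch(31), lg_floor_sketch(4096));
	return (0);
}
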
--- include/jemalloc/internal/util.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index 7864823..54aed8e 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -152,9 +152,9 @@ lg_floor(size_t x) { #if (LG_SIZEOF_PTR == LG_SIZEOF_INT) - return ((8 << LG_SIZEOF_PTR - 1) - __builtin_clz(x)); + return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x)); #elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG) - return ((8 << LG_SIZEOF_PTR - 1) - __builtin_clzl(x)); + return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x)); #else # error "Unsupported type sizes for lg_floor()" #endif @@ -164,16 +164,22 @@ JEMALLOC_INLINE size_t lg_floor(size_t x) { - x |= (x >> 1); - x |= (x >> 2); - x |= (x >> 4); - x |= (x >> 8); - x |= (x >> 16); + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); #if (LG_SIZEOF_PTR == 3 && LG_SIZEOF_PTR == LG_SIZEOF_LONG) - x |= (x >> 32); - return (65 - ffsl(~x)); + x |= (x >> 32); + if (x == KZU(0xffffffffffffffff)) + return (63); + x++; + return (ffsl(x) - 2); #elif (LG_SIZEOF_PTR == 2) - return (33 - ffs(~x)); + if (x == KZU(0xffffffff)) + return (31); + x++; + return (ffs(x) - 2); #else # error "Unsupported type sizes for lg_floor()" #endif -- cgit v0.12 From 9c3a10fdf6baa5ddb042b6adbef1ff1b3c613ce3 Mon Sep 17 00:00:00 2001 From: Richard Diamond Date: Wed, 28 May 2014 21:37:02 -0500 Subject: Try to use __builtin_ffsl if ffsl is unavailable. Some platforms (like those using Newlib) don't have ffs/ffsl. This commit adds a check to configure.ac for __builtin_ffsl if ffsl isn't found. __builtin_ffsl performs the same function as ffsl, and has the added benefit of being available on any platform utilizing Gcc-compatible compiler. This change does not address the used of ffs in the MALLOCX_ARENA() macro. --- configure.ac | 30 ++++++++++++++++++---- include/jemalloc/internal/arena.h | 2 +- include/jemalloc/internal/bitmap.h | 4 +-- include/jemalloc/internal/jemalloc_internal.h.in | 3 +++ .../jemalloc/internal/jemalloc_internal_decls.h | 10 +++++--- .../jemalloc/internal/jemalloc_internal_defs.h.in | 7 +++++ include/jemalloc/internal/util.h | 26 +++++++++++++++++-- src/arena.c | 2 +- src/rtree.c | 4 +-- 9 files changed, 71 insertions(+), 17 deletions(-) diff --git a/configure.ac b/configure.ac index 4944c44..3d36b5f 100644 --- a/configure.ac +++ b/configure.ac @@ -1109,9 +1109,11 @@ elif test "x${force_tls}" = "x1" ; then fi dnl ============================================================================ -dnl Check for ffsl(3), and fail if not found. This function exists on all -dnl platforms that jemalloc currently has a chance of functioning on without -dnl modification. +dnl Check for ffsl(3), then __builtin_ffsl(), and fail if neither are found. +dnl One of those two functions should (theoretically) exist on all platforms +dnl that jemalloc currently has a chance of functioning on without modification. +dnl We additionally assume ffs() or __builtin_ffs() are defined if +dnl ffsl() or __builtin_ffsl() are defined, respectively. 
JE_COMPILABLE([a program using ffsl], [ #include #include @@ -1122,8 +1124,26 @@ JE_COMPILABLE([a program using ffsl], [ printf("%d\n", rv); } ], [je_cv_function_ffsl]) -if test "x${je_cv_function_ffsl}" != "xyes" ; then - AC_MSG_ERROR([Cannot build without ffsl(3)]) +if test "x${je_cv_function_ffsl}" == "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl]) + AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs]) +else + JE_COMPILABLE([a program using __builtin_ffsl], [ + #include + #include + #include + ], [ + { + int rv = __builtin_ffsl(0x08); + printf("%d\n", rv); + } + ], [je_cv_gcc_builtin_ffsl]) + if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl]) + AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs]) + else + AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()]) + fi fi dnl ============================================================================ diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 2dc9501..cb73283 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -970,7 +970,7 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) /* Rescale (factor powers of 2 out of the numerator and denominator). */ interval = bin_info->reg_interval; - shift = ffs(interval) - 1; + shift = jemalloc_ffs(interval) - 1; diff >>= shift; interval >>= shift; diff --git a/include/jemalloc/internal/bitmap.h b/include/jemalloc/internal/bitmap.h index 605ebac..6db4ab7 100644 --- a/include/jemalloc/internal/bitmap.h +++ b/include/jemalloc/internal/bitmap.h @@ -130,11 +130,11 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) i = binfo->nlevels - 1; g = bitmap[binfo->levels[i].group_offset]; - bit = ffsl(g) - 1; + bit = jemalloc_ffsl(g) - 1; while (i > 0) { i--; g = bitmap[binfo->levels[i].group_offset + bit]; - bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1); + bit = (bit << LG_BITMAP_GROUP_NBITS) + (jemalloc_ffsl(g) - 1); } bitmap_set(bitmap, binfo, bit); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 491345c..f2cd743 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -215,6 +215,9 @@ static const bool config_ivsalloc = # ifdef __tile__ # define LG_QUANTUM 4 # endif +# ifdef __le32__ +# define LG_QUANTUM 4 +# endif # ifndef LG_QUANTUM # error "No LG_QUANTUM definition for architecture; specify via CPPFLAGS" # endif diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h index 7775ab3..fa59040 100644 --- a/include/jemalloc/internal/jemalloc_internal_decls.h +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -15,11 +15,13 @@ #else # include # include -# include -# if !defined(SYS_write) && defined(__NR_write) -# define SYS_write __NR_write +# if !defined(__pnacl__) && !defined(__native_client__) +# include +# if !defined(SYS_write) && defined(__NR_write) +# define SYS_write __NR_write +# endif +# include # endif -# include # include # include #endif diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index a9a50f1..65ac76c 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -153,6 +153,13 @@ #undef JEMALLOC_TLS /* + * ffs()/ffsl() functions to use for bitmapping. 
Don't use these directly; + * instead, use jemalloc_ffs() or jemalloc_ffsl() from util.h. + */ +#undef JEMALLOC_INTERNAL_FFSL +#undef JEMALLOC_INTERNAL_FFS + +/* * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside * within jemalloc-owned chunks before dereferencing them. */ diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index 54aed8e..d2b7a96 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -109,6 +109,8 @@ void malloc_printf(const char *format, ...) #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +int jemalloc_ffsl(long bitmap); +int jemalloc_ffs(int bitmap); size_t pow2_ceil(size_t x); size_t lg_floor(size_t x); void set_errno(int errnum); @@ -116,6 +118,26 @@ int get_errno(void); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_UTIL_C_)) + +/* Sanity check: */ +#if !defined(JEMALLOC_INTERNAL_FFSL) || !defined(JEMALLOC_INTERNAL_FFS) +# error Both JEMALLOC_INTERNAL_FFSL && JEMALLOC_INTERNAL_FFS should have been defined by configure +#endif + +JEMALLOC_ALWAYS_INLINE int +jemalloc_ffsl(long bitmap) +{ + + return (JEMALLOC_INTERNAL_FFSL(bitmap)); +} + +JEMALLOC_ALWAYS_INLINE int +jemalloc_ffs(int bitmap) +{ + + return (JEMALLOC_INTERNAL_FFS(bitmap)); +} + /* Compute the smallest power of 2 that is >= x. */ JEMALLOC_INLINE size_t pow2_ceil(size_t x) @@ -174,12 +196,12 @@ lg_floor(size_t x) if (x == KZU(0xffffffffffffffff)) return (63); x++; - return (ffsl(x) - 2); + return (jemalloc_ffsl(x) - 2); #elif (LG_SIZEOF_PTR == 2) if (x == KZU(0xffffffff)) return (31); x++; - return (ffs(x) - 2); + return (jemalloc_ffs(x) - 2); #else # error "Unsupported type sizes for lg_floor()" #endif diff --git a/src/arena.c b/src/arena.c index c392419..d3fe0fb 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2483,7 +2483,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) * be twice as large in order to maintain alignment. */ if (config_fill && opt_redzone) { - size_t align_min = ZU(1) << (ffs(bin_info->reg_size) - 1); + size_t align_min = ZU(1) << (jemalloc_ffs(bin_info->reg_size) - 1); if (align_min <= REDZONE_MINSIZE) { bin_info->redzone_size = REDZONE_MINSIZE; pad_size = 0; diff --git a/src/rtree.c b/src/rtree.c index 205957a..87b0b15 100644 --- a/src/rtree.c +++ b/src/rtree.c @@ -9,8 +9,8 @@ rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc) assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3)); - bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1; - bits_in_leaf = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1; + bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1; + bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1; if (bits > bits_in_leaf) { height = 1 + (bits - bits_in_leaf) / bits_per_level; if ((height-1) * bits_per_level + bits_in_leaf != bits) -- cgit v0.12 From 94ed6812bc04a6171d1a801f2740355f458d5c9c Mon Sep 17 00:00:00 2001 From: Richard Diamond Date: Wed, 28 May 2014 21:47:15 -0500 Subject: Don't catch fork()ing events for Native Client. Native Client doesn't allow forking, thus there is no need to catch fork()ing events for Native Client. Additionally, without this commit, jemalloc will introduce an unresolved pthread_atfork() in PNaCl Rust bins. 
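For readers wondering what the handlers are for: an allocator registers them so that its mutexes stay consistent across fork(). The pre-fork handler acquires the internal locks (so no other thread holds one at the instant the address space is duplicated), and the post-fork handlers release them again in the parent and in the child. A minimal illustration of the mechanism with a single mutex, standing in for jemalloc's real jemalloc_prefork/jemalloc_postfork_* handlers (a sketch only, not jemalloc code; build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Pre-fork: take the lock so the child cannot inherit it in a locked,
 * ownerless state; post-fork: release it on both sides. */
static void prefork(void)         { pthread_mutex_lock(&lock); }
static void postfork_parent(void) { pthread_mutex_unlock(&lock); }
static void postfork_child(void)  { pthread_mutex_unlock(&lock); }

int
main(void)
{
	if (pthread_atfork(prefork, postfork_parent, postfork_child) != 0) {
		fprintf(stderr, "pthread_atfork() failed\n");
		return (1);
	}
	if (fork() == 0) {
		pthread_mutex_lock(&lock);	/* consistent in the child */
		pthread_mutex_unlock(&lock);
		_exit(0);
	}
	return (0);
}

On Native Client there is no fork(), so the whole registration path is compiled out, which is what the !defined(__native_client__) guard in the hunk below does.
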
--- src/jemalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 43a494e..0983c00 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -792,7 +792,7 @@ malloc_init_hard(void) ncpus = malloc_ncpus(); #if (!defined(JEMALLOC_MUTEX_INIT_CB) && !defined(JEMALLOC_ZONE) \ - && !defined(_WIN32)) + && !defined(_WIN32) && !defined(__native_client__)) /* LinuxThreads's pthread_atfork() allocates. */ if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent, jemalloc_postfork_child) != 0) { -- cgit v0.12 From 3e310b34eb53eb331981ecda2ea5f10cf6956747 Mon Sep 17 00:00:00 2001 From: Chris Peterson Date: Wed, 28 May 2014 19:04:06 -0700 Subject: Fix -Wsign-compare warnings --- src/prof.c | 4 ++-- src/util.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/prof.c b/src/prof.c index b64386e..0eb7dbd 100644 --- a/src/prof.c +++ b/src/prof.c @@ -1093,7 +1093,7 @@ label_open_close_error: #define DUMP_FILENAME_BUFSIZE (PATH_MAX + 1) #define VSEQ_INVALID UINT64_C(0xffffffffffffffff) static void -prof_dump_filename(char *filename, char v, int64_t vseq) +prof_dump_filename(char *filename, char v, uint64_t vseq) { cassert(config_prof); @@ -1101,7 +1101,7 @@ prof_dump_filename(char *filename, char v, int64_t vseq) if (vseq != VSEQ_INVALID) { /* "...v.heap" */ malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, - "%s.%d.%"PRIu64".%c%"PRId64".heap", + "%s.%d.%"PRIu64".%c%"PRIu64".heap", opt_prof_prefix, (int)getpid(), prof_dump_seq, v, vseq); } else { /* "....heap" */ diff --git a/src/util.c b/src/util.c index 93a19fd..9076be9 100644 --- a/src/util.c +++ b/src/util.c @@ -100,7 +100,7 @@ uintmax_t malloc_strtoumax(const char *restrict nptr, char **restrict endptr, int base) { uintmax_t ret, digit; - int b; + unsigned b; bool neg; const char *p, *ns; @@ -548,7 +548,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) assert(len == '?' || len == 'l'); assert_not_implemented(len != 'l'); s = va_arg(ap, char *); - slen = (prec < 0) ? strlen(s) : prec; + slen = (prec < 0) ? strlen(s) : (size_t)prec; APPEND_PADDED_S(s, slen, width, left_justify); f++; break; -- cgit v0.12 From 70807bc54b06bb259b6607541af44bc73a890bf6 Mon Sep 17 00:00:00 2001 From: Chris Peterson Date: Wed, 28 May 2014 19:04:33 -0700 Subject: Fix -Wsometimes-uninitialized warnings --- src/util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/util.c b/src/util.c index 9076be9..1717f08 100644 --- a/src/util.c +++ b/src/util.c @@ -381,7 +381,9 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) case 'p': /* Synthetic; used for %p. */ \ val = va_arg(ap, uintptr_t); \ break; \ - default: not_reached(); \ + default: \ + not_reached(); \ + val = 0; \ } \ } while (0) -- cgit v0.12 From 994fad9bdaaa18273f2089856c2637cfb0c307bd Mon Sep 17 00:00:00 2001 From: Richard Diamond Date: Tue, 3 Jun 2014 02:39:18 -0500 Subject: Add check for madvise(2) to configure.ac. Some platforms, such as Google's Portable Native Client, use Newlib and thus lack access to madvise(2). In those instances, pages_purge() is transformed into a no-op. 
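To make the fallback concrete, here is a simplified sketch of a purge routine guarded the same way. It is not the actual chunk_mmap.c code: JEMALLOC_HAVE_MADVISE and JEMALLOC_PURGE_MADVISE_DONTNEED/JEMALLOC_PURGE_MADVISE_FREE stand in for the macros configure defines (for a quick test on Linux, pass -DJEMALLOC_HAVE_MADVISE -DJEMALLOC_PURGE_MADVISE_DONTNEED), and the return value follows jemalloc's convention of reporting whether the pages may still hold non-zero data.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#ifdef JEMALLOC_HAVE_MADVISE
#include <sys/mman.h>
#endif

/* Tell the kernel a range of pages is no longer needed; return true if the
 * pages may still contain old (non-zero) data afterwards. */
static bool
purge_pages(void *addr, size_t length)
{
#if !defined(JEMALLOC_HAVE_MADVISE)
	/* No madvise(2) at all (e.g. Newlib/PNaCl): last-resort no-op. */
	(void)addr;
	(void)length;
	return (true);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED)
	/* Linux: anonymous pages read back as zero after a successful purge. */
	return (madvise(addr, length, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
	/* BSD / OS X: page contents are undefined after MADV_FREE. */
	madvise(addr, length, MADV_FREE);
	return (true);
#else
#error "No madvise(2) flag selected for purging unused dirty pages."
#endif
}

int
main(void)
{
#ifdef JEMALLOC_HAVE_MADVISE
	size_t len = 1 << 20;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	if (p != MAP_FAILED)
		printf("possibly unzeroed after purge: %d\n",
		    (int)purge_pages(p, len));
#endif
	return (0);
}
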
--- configure.ac | 14 ++++++++++++++ include/jemalloc/internal/jemalloc_internal_defs.h.in | 5 +++++ src/chunk_mmap.c | 7 +++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 3d36b5f..29edcb6 100644 --- a/configure.ac +++ b/configure.ac @@ -1192,6 +1192,20 @@ if test "x${je_cv_osatomic}" = "xyes" ; then fi dnl ============================================================================ +dnl Check for madvise(2). + +JE_COMPILABLE([madvise(2)], [ +#include +], [ + { + madvise((void *)0, 0, 0); + } +], [je_cv_madvise]) +if test "x${je_cv_madvise}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_MADVISE], [ ]) +fi + +dnl ============================================================================ dnl Check whether __sync_{add,sub}_and_fetch() are available despite dnl __GCC_HAVE_SYNC_COMPARE_AND_SWAP_n macros being undefined. diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 65ac76c..93716b0 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -53,6 +53,11 @@ #undef JEMALLOC_HAVE_BUILTIN_CLZ /* + * Defined if madvise(2) is available. + */ +#undef JEMALLOC_HAVE_MADVISE + +/* * Defined if OSSpin*() functions are available, as provided by Darwin, and * documented in the spinlock(3) manual page. */ diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index f960e06..65137b4 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -121,7 +121,7 @@ pages_purge(void *addr, size_t length) #ifdef _WIN32 VirtualAlloc(addr, length, MEM_RESET, PAGE_READWRITE); unzeroed = true; -#else +#elif defined(JEMALLOC_HAVE_MADVISE) # ifdef JEMALLOC_PURGE_MADVISE_DONTNEED # define JEMALLOC_MADV_PURGE MADV_DONTNEED # define JEMALLOC_MADV_ZEROS true @@ -129,12 +129,15 @@ pages_purge(void *addr, size_t length) # define JEMALLOC_MADV_PURGE MADV_FREE # define JEMALLOC_MADV_ZEROS false # else -# error "No method defined for purging unused dirty pages." +# error "No madvise(2) flag defined for purging unused dirty pages." # endif int err = madvise(addr, length, JEMALLOC_MADV_PURGE); unzeroed = (JEMALLOC_MADV_ZEROS == false || err != 0); # undef JEMALLOC_MADV_PURGE # undef JEMALLOC_MADV_ZEROS +#else + /* Last resort no-op. */ + unzeroed = true; #endif return (unzeroed); } -- cgit v0.12 From 1a3eafd1b045163f27e4a5acf01280edfe28c309 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 4 Jun 2014 12:09:08 +0900 Subject: Check for __builtin_ffsl before ffsl. When building with -O0, GCC doesn't use builtins for ffs and ffsl calls, and uses library function calls instead. But the Android NDK doesn't have those functions exported from any library, leading to build failure. However, using __builtin_ffs* uses the builtin inlines. --- configure.ac | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/configure.ac b/configure.ac index 29edcb6..f456bd2 100644 --- a/configure.ac +++ b/configure.ac @@ -1109,43 +1109,44 @@ elif test "x${force_tls}" = "x1" ; then fi dnl ============================================================================ -dnl Check for ffsl(3), then __builtin_ffsl(), and fail if neither are found. +dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found. dnl One of those two functions should (theoretically) exist on all platforms dnl that jemalloc currently has a chance of functioning on without modification. 
dnl We additionally assume ffs() or __builtin_ffs() are defined if dnl ffsl() or __builtin_ffsl() are defined, respectively. -JE_COMPILABLE([a program using ffsl], [ +JE_COMPILABLE([a program using __builtin_ffsl], [ #include #include #include ], [ { - int rv = ffsl(0x08); + int rv = __builtin_ffsl(0x08); printf("%d\n", rv); } -], [je_cv_function_ffsl]) -if test "x${je_cv_function_ffsl}" == "xyes" ; then - AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl]) - AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs]) +], [je_cv_gcc_builtin_ffsl]) +if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl]) + AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs]) else - JE_COMPILABLE([a program using __builtin_ffsl], [ + JE_COMPILABLE([a program using ffsl], [ #include #include #include ], [ { - int rv = __builtin_ffsl(0x08); + int rv = ffsl(0x08); printf("%d\n", rv); } - ], [je_cv_gcc_builtin_ffsl]) - if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then - AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl]) - AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs]) + ], [je_cv_function_ffsl]) + if test "x${je_cv_function_ffsl}" == "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl]) + AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs]) else AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()]) fi fi + dnl ============================================================================ dnl Check for atomic(9) operations as provided on FreeBSD. -- cgit v0.12 From 8f50ec8eda262e87ad547ec50b6ca928ea3e31c4 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 4 Jun 2014 12:12:55 +0900 Subject: Use JEMALLOC_INTERNAL_FFSL in STATIC_PAGE_SHIFT test --- configure.ac | 79 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/configure.ac b/configure.ac index f456bd2..e977534 100644 --- a/configure.ac +++ b/configure.ac @@ -935,6 +935,44 @@ if test "x$enable_xmalloc" = "x1" ; then fi AC_SUBST([enable_xmalloc]) +dnl ============================================================================ +dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found. +dnl One of those two functions should (theoretically) exist on all platforms +dnl that jemalloc currently has a chance of functioning on without modification. +dnl We additionally assume ffs() or __builtin_ffs() are defined if +dnl ffsl() or __builtin_ffsl() are defined, respectively. 
+JE_COMPILABLE([a program using __builtin_ffsl], [ +#include +#include +#include +], [ + { + int rv = __builtin_ffsl(0x08); + printf("%d\n", rv); + } +], [je_cv_gcc_builtin_ffsl]) +if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl]) + AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs]) +else + JE_COMPILABLE([a program using ffsl], [ + #include + #include + #include + ], [ + { + int rv = ffsl(0x08); + printf("%d\n", rv); + } + ], [je_cv_function_ffsl]) + if test "x${je_cv_function_ffsl}" == "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl]) + AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs]) + else + AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()]) + fi +fi + AC_CACHE_CHECK([STATIC_PAGE_SHIFT], [je_cv_static_page_shift], AC_RUN_IFELSE([AC_LANG_PROGRAM( @@ -961,7 +999,7 @@ AC_CACHE_CHECK([STATIC_PAGE_SHIFT], if (result == -1) { return 1; } - result = ffsl(result) - 1; + result = JEMALLOC_INTERNAL_FFSL(result) - 1; f = fopen("conftest.out", "w"); if (f == NULL) { @@ -1109,45 +1147,6 @@ elif test "x${force_tls}" = "x1" ; then fi dnl ============================================================================ -dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found. -dnl One of those two functions should (theoretically) exist on all platforms -dnl that jemalloc currently has a chance of functioning on without modification. -dnl We additionally assume ffs() or __builtin_ffs() are defined if -dnl ffsl() or __builtin_ffsl() are defined, respectively. -JE_COMPILABLE([a program using __builtin_ffsl], [ -#include -#include -#include -], [ - { - int rv = __builtin_ffsl(0x08); - printf("%d\n", rv); - } -], [je_cv_gcc_builtin_ffsl]) -if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then - AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl]) - AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs]) -else - JE_COMPILABLE([a program using ffsl], [ - #include - #include - #include - ], [ - { - int rv = ffsl(0x08); - printf("%d\n", rv); - } - ], [je_cv_function_ffsl]) - if test "x${je_cv_function_ffsl}" == "xyes" ; then - AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl]) - AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs]) - else - AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()]) - fi -fi - - -dnl ============================================================================ dnl Check for atomic(9) operations as provided on FreeBSD. JE_COMPILABLE([atomic(9)], [ -- cgit v0.12 From 5921ba7b0c3b3278c54d569dee37deab2768b70b Mon Sep 17 00:00:00 2001 From: Valerii Hiora Date: Fri, 16 May 2014 16:28:20 +0300 Subject: Support for iOS compilation --- config.sub | 2 ++ configure.ac | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/config.sub b/config.sub index 61cb4bc..c4cc983 100755 --- a/config.sub +++ b/config.sub @@ -1400,6 +1400,8 @@ case $os in -mac*) os=`echo $os | sed -e 's|mac|macos|'` ;; + -ios*) + ;; -linux-dietlibc) os=-linux-dietlibc ;; diff --git a/configure.ac b/configure.ac index e977534..48863a5 100644 --- a/configure.ac +++ b/configure.ac @@ -264,7 +264,7 @@ dnl definitions need to be seen before any headers are included, which is a pain dnl to make happen otherwise. 
default_munmap="1" case "${host}" in - *-*-darwin*) + *-*-darwin* | *-*-ios*) CFLAGS="$CFLAGS" abi="macho" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) -- cgit v0.12 From 6f533c1903a1d067dacfca2f06c6cc9754fdf67e Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Tue, 10 Jun 2014 18:18:22 +0900 Subject: Ensure the default purgeable zone is after the default zone on OS X --- src/zone.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/zone.c b/src/zone.c index e0302ef..a722287 100644 --- a/src/zone.c +++ b/src/zone.c @@ -176,6 +176,7 @@ register_zone(void) * register jemalloc's. */ malloc_zone_t *default_zone = malloc_default_zone(); + malloc_zone_t *purgeable_zone = NULL; if (!default_zone->zone_name || strcmp(default_zone->zone_name, "DefaultMallocZone") != 0) { return; @@ -237,22 +238,37 @@ register_zone(void) * run time. */ if (malloc_default_purgeable_zone != NULL) - malloc_default_purgeable_zone(); + purgeable_zone = malloc_default_purgeable_zone(); /* Register the custom zone. At this point it won't be the default. */ malloc_zone_register(&zone); - /* - * Unregister and reregister the default zone. On OSX >= 10.6, - * unregistering takes the last registered zone and places it at the - * location of the specified zone. Unregistering the default zone thus - * makes the last registered one the default. On OSX < 10.6, - * unregistering shifts all registered zones. The first registered zone - * then becomes the default. - */ do { default_zone = malloc_default_zone(); + /* + * Unregister and reregister the default zone. On OSX >= 10.6, + * unregistering takes the last registered zone and places it + * at the location of the specified zone. Unregistering the + * default zone thus makes the last registered one the default. + * On OSX < 10.6, unregistering shifts all registered zones. + * The first registered zone then becomes the default. + */ malloc_zone_unregister(default_zone); malloc_zone_register(default_zone); + /* + * On OSX 10.6, having the default purgeable zone appear before + * the default zone makes some things crash because it thinks it + * owns the default zone allocated pointers. We thus unregister/ + * re-register it in order to ensure it's always after the + * default zone. On OSX < 10.6, there is no purgeable zone, so + * this does nothing. On OSX >= 10.6, unregistering replaces the + * purgeable zone with the last registered zone above, i.e the + * default zone. Registering it again then puts it at the end, + * obviously after the default zone. + */ + if (purgeable_zone) { + malloc_zone_unregister(purgeable_zone); + malloc_zone_register(purgeable_zone); + } } while (malloc_default_zone() != &zone); } -- cgit v0.12 From c521df5dcf7410898cabdcb556f919535cf16d19 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Thu, 12 Jun 2014 13:07:31 +0900 Subject: Allow to build with clang-cl --- include/msvc_compat/C99/stdbool.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/msvc_compat/C99/stdbool.h b/include/msvc_compat/C99/stdbool.h index da9ee8b..d92160e 100644 --- a/include/msvc_compat/C99/stdbool.h +++ b/include/msvc_compat/C99/stdbool.h @@ -5,7 +5,11 @@ /* MSVC doesn't define _Bool or bool in C, but does have BOOL */ /* Note this doesn't pass autoconf's test because (bool) 0.5 != true */ +/* Clang-cl uses MSVC headers, so needs msvc_compat, but has _Bool as + * a built-in type. 
*/ +#ifndef __clang__ typedef BOOL _Bool; +#endif #define bool _Bool #define true 1 -- cgit v0.12 From 79230fef31428a133683c236bedcc1560f8fcfd8 Mon Sep 17 00:00:00 2001 From: Steven Stewart-Gallus Date: Thu, 19 Jun 2014 16:11:43 -0700 Subject: Fix unportable == operator in configure scripts Now this code is more portable and now people can use faster shells than Bash such as Dash. To use a faster shell with autoconf set the CONFIG_SHELL environment variable to the shell and run the configure script with the shell. --- configure.ac | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 48863a5..645bd46 100644 --- a/configure.ac +++ b/configure.ac @@ -833,7 +833,7 @@ have_dss="1" dnl Check whether the BSD/SUSv1 sbrk() exists. If not, disable DSS support. AC_CHECK_FUNC([sbrk], [have_sbrk="1"], [have_sbrk="0"]) if test "x$have_sbrk" = "x1" ; then - if test "x$sbrk_deprecated" == "x1" ; then + if test "x$sbrk_deprecated" = "x1" ; then AC_MSG_RESULT([Disabling dss allocation because sbrk is deprecated]) have_dss="0" fi @@ -951,7 +951,7 @@ JE_COMPILABLE([a program using __builtin_ffsl], [ printf("%d\n", rv); } ], [je_cv_gcc_builtin_ffsl]) -if test "x${je_cv_gcc_builtin_ffsl}" == "xyes" ; then +if test "x${je_cv_gcc_builtin_ffsl}" = "xyes" ; then AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl]) AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs]) else @@ -965,7 +965,7 @@ else printf("%d\n", rv); } ], [je_cv_function_ffsl]) - if test "x${je_cv_function_ffsl}" == "xyes" ; then + if test "x${je_cv_function_ffsl}" = "xyes" ; then AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl]) AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs]) else -- cgit v0.12 From ffa259841c6a4b6dae4ed74f02bb38703e190065 Mon Sep 17 00:00:00 2001 From: "Manuel A. Fernandez Montecelo" Date: Tue, 29 Jul 2014 23:11:26 +0100 Subject: Add OpenRISC/or1k LG_QUANTUM size definition --- include/jemalloc/internal/jemalloc_internal.h.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index f2cd743..1c2f3d4 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -203,6 +203,9 @@ static const bool config_ivsalloc = # ifdef __mips__ # define LG_QUANTUM 3 # endif +# ifdef __or1k__ +# define LG_QUANTUM 3 +# endif # ifdef __powerpc__ # define LG_QUANTUM 4 # endif -- cgit v0.12 From b433d7a87b27ff1e4ccea5103bc0a95afbf58ea4 Mon Sep 17 00:00:00 2001 From: "Manuel A. Fernandez Montecelo" Date: Tue, 29 Jul 2014 23:15:26 +0100 Subject: Update config.{guess,sub} to more recent versions, to add better support to OpenRISC/or1k (among others) --- config.guess | 192 +++++++++-------------------------------------------------- config.sub | 21 ++++--- 2 files changed, 37 insertions(+), 176 deletions(-) diff --git a/config.guess b/config.guess index b79252d..1f5c50c 100755 --- a/config.guess +++ b/config.guess @@ -1,8 +1,8 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright 1992-2013 Free Software Foundation, Inc. +# Copyright 1992-2014 Free Software Foundation, Inc. -timestamp='2013-06-10' +timestamp='2014-03-23' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -50,7 +50,7 @@ version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright 1992-2013 Free Software Foundation, Inc. 
+Copyright 1992-2014 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -149,7 +149,7 @@ Linux|GNU|GNU/*) LIBC=gnu #endif EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` ;; esac @@ -826,7 +826,7 @@ EOF *:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; - i*:MSYS*:*) + *:MSYS*:*) echo ${UNAME_MACHINE}-pc-msys exit ;; i*:windows32*:*) @@ -969,10 +969,10 @@ EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; } ;; - or1k:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + openrisc*:Linux:*:*) + echo or1k-unknown-linux-${LIBC} exit ;; - or32:Linux:*:*) + or32:Linux:*:* | or1k*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; padre:Linux:*:*) @@ -1260,16 +1260,26 @@ EOF if test "$UNAME_PROCESSOR" = unknown ; then UNAME_PROCESSOR=powerpc fi - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then - if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - case $UNAME_PROCESSOR in - i386) UNAME_PROCESSOR=x86_64 ;; - powerpc) UNAME_PROCESSOR=powerpc64 ;; - esac + if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + case $UNAME_PROCESSOR in + i386) UNAME_PROCESSOR=x86_64 ;; + powerpc) UNAME_PROCESSOR=powerpc64 ;; + esac + fi fi + elif test "$UNAME_PROCESSOR" = i386 ; then + # Avoid executing cc on OS X 10.9, as it ships with a stub + # that puts up a graphical alert prompting to install + # developer tools. Any system running Mac OS X 10.7 or + # later (Darwin 11 and later) is required to have a 64-bit + # processor. This is not true of the ARM version of Darwin + # that Apple uses in portable devices. + UNAME_PROCESSOR=x86_64 fi echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; @@ -1361,154 +1371,6 @@ EOF exit ;; esac -eval $set_cc_for_build -cat >$dummy.c < -# include -#endif -main () -{ -#if defined (sony) -#if defined (MIPSEB) - /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, - I don't know.... 
*/ - printf ("mips-sony-bsd\n"); exit (0); -#else -#include - printf ("m68k-sony-newsos%s\n", -#ifdef NEWSOS4 - "4" -#else - "" -#endif - ); exit (0); -#endif -#endif - -#if defined (__arm) && defined (__acorn) && defined (__unix) - printf ("arm-acorn-riscix\n"); exit (0); -#endif - -#if defined (hp300) && !defined (hpux) - printf ("m68k-hp-bsd\n"); exit (0); -#endif - -#if defined (NeXT) -#if !defined (__ARCHITECTURE__) -#define __ARCHITECTURE__ "m68k" -#endif - int version; - version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; - if (version < 4) - printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); - else - printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); - exit (0); -#endif - -#if defined (MULTIMAX) || defined (n16) -#if defined (UMAXV) - printf ("ns32k-encore-sysv\n"); exit (0); -#else -#if defined (CMU) - printf ("ns32k-encore-mach\n"); exit (0); -#else - printf ("ns32k-encore-bsd\n"); exit (0); -#endif -#endif -#endif - -#if defined (__386BSD__) - printf ("i386-pc-bsd\n"); exit (0); -#endif - -#if defined (sequent) -#if defined (i386) - printf ("i386-sequent-dynix\n"); exit (0); -#endif -#if defined (ns32000) - printf ("ns32k-sequent-dynix\n"); exit (0); -#endif -#endif - -#if defined (_SEQUENT_) - struct utsname un; - - uname(&un); - - if (strncmp(un.version, "V2", 2) == 0) { - printf ("i386-sequent-ptx2\n"); exit (0); - } - if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ - printf ("i386-sequent-ptx1\n"); exit (0); - } - printf ("i386-sequent-ptx\n"); exit (0); - -#endif - -#if defined (vax) -# if !defined (ultrix) -# include -# if defined (BSD) -# if BSD == 43 - printf ("vax-dec-bsd4.3\n"); exit (0); -# else -# if BSD == 199006 - printf ("vax-dec-bsd4.3reno\n"); exit (0); -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# endif -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# else - printf ("vax-dec-ultrix\n"); exit (0); -# endif -#endif - -#if defined (alliant) && defined (i860) - printf ("i860-alliant-bsd\n"); exit (0); -#endif - - exit (1); -} -EOF - -$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - -# Apollos put the system type in the environment. - -test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } - -# Convex versions that predate uname can use getsysinfo(1) - -if [ -x /usr/convex/getsysinfo ] -then - case `getsysinfo -f cpu_type` in - c1*) - echo c1-convex-bsd - exit ;; - c2*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - c34*) - echo c34-convex-bsd - exit ;; - c38*) - echo c38-convex-bsd - exit ;; - c4*) - echo c4-convex-bsd - exit ;; - esac -fi - cat >&2 <." version="\ GNU config.sub ($timestamp) -Copyright 1992-2013 Free Software Foundation, Inc. +Copyright 1992-2014 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." 
@@ -283,8 +283,10 @@ case $basic_machine in | mips64vr5900 | mips64vr5900el \ | mipsisa32 | mipsisa32el \ | mipsisa32r2 | mipsisa32r2el \ + | mipsisa32r6 | mipsisa32r6el \ | mipsisa64 | mipsisa64el \ | mipsisa64r2 | mipsisa64r2el \ + | mipsisa64r6 | mipsisa64r6el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ | mipsr5900 | mipsr5900el \ @@ -296,8 +298,7 @@ case $basic_machine in | nds32 | nds32le | nds32be \ | nios | nios2 | nios2eb | nios2el \ | ns16k | ns32k \ - | open8 \ - | or1k | or32 \ + | open8 | or1k | or1knd | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ | pyramid \ @@ -402,8 +403,10 @@ case $basic_machine in | mips64vr5900-* | mips64vr5900el-* \ | mipsisa32-* | mipsisa32el-* \ | mipsisa32r2-* | mipsisa32r2el-* \ + | mipsisa32r6-* | mipsisa32r6el-* \ | mipsisa64-* | mipsisa64el-* \ | mipsisa64r2-* | mipsisa64r2el-* \ + | mipsisa64r6-* | mipsisa64r6el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ | mipsr5900-* | mipsr5900el-* \ @@ -415,6 +418,7 @@ case $basic_machine in | nios-* | nios2-* | nios2eb-* | nios2el-* \ | none-* | np1-* | ns16k-* | ns32k-* \ | open8-* \ + | or1k*-* \ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ @@ -1376,7 +1380,7 @@ case $os in | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ - | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*) + | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -tirtos*) # Remember, each alternative MUST END IN *, to match a version number. ;; -qnx*) @@ -1400,8 +1404,6 @@ case $os in -mac*) os=`echo $os | sed -e 's|mac|macos|'` ;; - -ios*) - ;; -linux-dietlibc) os=-linux-dietlibc ;; @@ -1596,9 +1598,6 @@ case $basic_machine in mips*-*) os=-elf ;; - or1k-*) - os=-elf - ;; or32-*) os=-coff ;; -- cgit v0.12 From 1aa25a3ca28d8da347dc115636073493db791183 Mon Sep 17 00:00:00 2001 From: Michael Neumann Date: Tue, 5 Aug 2014 03:06:02 +0200 Subject: Support DragonFlyBSD Note that in contrast to FreeBSD, DragonFly does not work with force_lazy_lock enabled. --- configure.ac | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configure.ac b/configure.ac index 645bd46..83c60ec 100644 --- a/configure.ac +++ b/configure.ac @@ -283,6 +283,11 @@ case "${host}" in AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) force_lazy_lock="1" ;; + *-*-dragonfly*) + CFLAGS="$CFLAGS" + abi="elf" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) + ;; *-*-linux*) CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE" -- cgit v0.12 From cf6032d0efbc2e3e9f736a8cd69846cf7427640b Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 30 Jul 2014 18:16:13 +0900 Subject: Remove ${srcroot} from cfghdrs_in, cfgoutputs_in and cfghdrs_tup in configure On Windows, srcroot may start with "drive:", which confuses autoconf's AC_CONFIG_* macros. The macros works equally well without ${srcroot}, provided some adjustment to Makefile.in. 
--- Makefile.in | 4 ++-- configure.ac | 46 +++++++++++++++++++++++----------------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/Makefile.in b/Makefile.in index 839bb08..a21acd4 100644 --- a/Makefile.in +++ b/Makefile.in @@ -42,9 +42,9 @@ XSLTPROC := @XSLTPROC@ AUTOCONF := @AUTOCONF@ _RPATH = @RPATH@ RPATH = $(if $(1),$(call _RPATH,$(1))) -cfghdrs_in := @cfghdrs_in@ +cfghdrs_in := $(addprefix $(srcroot),@cfghdrs_in@) cfghdrs_out := @cfghdrs_out@ -cfgoutputs_in := @cfgoutputs_in@ +cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@) cfgoutputs_out := @cfgoutputs_out@ enable_autogen := @enable_autogen@ enable_code_coverage := @enable_code_coverage@ diff --git a/configure.ac b/configure.ac index 645bd46..bc3464f 100644 --- a/configure.ac +++ b/configure.ac @@ -534,15 +534,15 @@ dnl jemalloc_protos_jet.h easy. je_="je_" AC_SUBST([je_]) -cfgoutputs_in="${srcroot}Makefile.in" -cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/html.xsl.in" -cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/manpages.xsl.in" -cfgoutputs_in="${cfgoutputs_in} ${srcroot}doc/jemalloc.xml.in" -cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc_macros.h.in" -cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/jemalloc_protos.h.in" -cfgoutputs_in="${cfgoutputs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal.h.in" -cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/test.sh.in" -cfgoutputs_in="${cfgoutputs_in} ${srcroot}test/include/test/jemalloc_test.h.in" +cfgoutputs_in="Makefile.in" +cfgoutputs_in="${cfgoutputs_in} doc/html.xsl.in" +cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in" +cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in" +cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_macros.h.in" +cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_protos.h.in" +cfgoutputs_in="${cfgoutputs_in} include/jemalloc/internal/jemalloc_internal.h.in" +cfgoutputs_in="${cfgoutputs_in} test/test.sh.in" +cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in" cfgoutputs_out="Makefile" cfgoutputs_out="${cfgoutputs_out} doc/html.xsl" @@ -564,18 +564,18 @@ cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h" cfgoutputs_tup="${cfgoutputs_tup} test/test.sh:test/test.sh.in" cfgoutputs_tup="${cfgoutputs_tup} test/include/test/jemalloc_test.h:test/include/test/jemalloc_test.h.in" -cfghdrs_in="${srcroot}include/jemalloc/jemalloc_defs.h.in" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_namespace.sh" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_unnamespace.sh" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/private_symbols.txt" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/public_namespace.sh" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/public_unnamespace.sh" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/internal/size_classes.sh" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc_rename.sh" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc_mangle.sh" -cfghdrs_in="${cfghdrs_in} ${srcroot}include/jemalloc/jemalloc.sh" -cfghdrs_in="${cfghdrs_in} ${srcroot}test/include/test/jemalloc_test_defs.h.in" +cfghdrs_in="include/jemalloc/jemalloc_defs.h.in" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/jemalloc_internal_defs.h.in" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_namespace.sh" 
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_unnamespace.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_symbols.txt" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_namespace.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_unnamespace.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/size_classes.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_rename.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_mangle.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc.sh" +cfghdrs_in="${cfghdrs_in} test/include/test/jemalloc_test_defs.h.in" cfghdrs_out="include/jemalloc/jemalloc_defs.h" cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc${install_suffix}.h" @@ -593,8 +593,8 @@ cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/jemalloc_internal_defs.h" cfghdrs_out="${cfghdrs_out} test/include/test/jemalloc_test_defs.h" cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.in" -cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:${srcroot}include/jemalloc/internal/jemalloc_internal_defs.h.in" -cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:${srcroot}test/include/test/jemalloc_test_defs.h.in" +cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in" +cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in" dnl Silence irrelevant compiler warnings by default. AC_ARG_ENABLE([cc-silence], -- cgit v0.12 From 55c9aa10386b21af92f323d04bddc15691d48756 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Wed, 6 Aug 2014 16:10:08 -0700 Subject: Fix the bug that causes not allocating free run with lowest address. --- src/arena.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index d3fe0fb..db69916 100644 --- a/src/arena.c +++ b/src/arena.c @@ -101,14 +101,18 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) uintptr_t a_mapelm = (uintptr_t)a; uintptr_t b_mapelm = (uintptr_t)b; - if (a_mapelm & CHUNK_MAP_KEY) + if (a_mapelm & CHUNK_MAP_KEY) a_size = a_mapelm & ~PAGE_MASK; else a_size = arena_mapelm_to_bits(a) & ~PAGE_MASK; ret = (a_size > b_size) - (a_size < b_size); - if (ret == 0 && (!(a_mapelm & CHUNK_MAP_KEY))) - ret = (a_mapelm > b_mapelm) - (a_mapelm < b_mapelm); + if (ret == 0) { + if (!(a_mapelm & CHUNK_MAP_KEY)) + ret = (a_mapelm > b_mapelm) - (a_mapelm < b_mapelm); + else + ret = -1; + } return (ret); } -- cgit v0.12 From ea73eb8f3e029f0a5697e78c6771b49063cf4138 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Wed, 6 Aug 2014 16:43:01 -0700 Subject: Reintroduce the comment that was removed in f9ff603. --- src/arena.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index db69916..118700b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -110,8 +110,12 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) if (ret == 0) { if (!(a_mapelm & CHUNK_MAP_KEY)) ret = (a_mapelm > b_mapelm) - (a_mapelm < b_mapelm); - else + else { + /* + * Treat keys as if they are lower than anything else. + */ ret = -1; + } } return (ret); -- cgit v0.12 From a2ea54c98640eafc5bb256fa4369d5553499ac81 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 6 Aug 2014 23:36:19 -0700 Subject: Add atomic operations tests and fix latent bugs. 
--- Makefile.in | 3 +- include/jemalloc/internal/atomic.h | 41 +++++++++++----- test/unit/atomic.c | 97 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 13 deletions(-) create mode 100644 test/unit/atomic.c diff --git a/Makefile.in b/Makefile.in index a21acd4..dfafe45 100644 --- a/Makefile.in +++ b/Makefile.in @@ -110,7 +110,8 @@ C_TESTLIB_SRCS := $(srcroot)test/src/math.c $(srcroot)test/src/mtx.c \ $(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \ $(srcroot)test/src/thd.c C_UTIL_INTEGRATION_SRCS := $(srcroot)src/util.c -TESTS_UNIT := $(srcroot)test/unit/bitmap.c \ +TESTS_UNIT := $(srcroot)test/unit/atomic.c \ + $(srcroot)test/unit/bitmap.c \ $(srcroot)test/unit/ckh.c \ $(srcroot)test/unit/hash.c \ $(srcroot)test/unit/junk.c \ diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index 11a7b47..a048815 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -18,6 +18,17 @@ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +/* + * All functions return the arithmetic result of the atomic operation. Some + * atomic operation APIs return the value prior to mutation, in which case the + * following functions must redundantly compute the result so that it can be + * returned. These functions are normally inlined, so the extra operations can + * be optimized away if the return values aren't used by the callers. + * + * atomic_add_( *p, x) { return (*p + x); } + * atomic_sub_( *p, x) { return (*p - x); } + */ + #ifndef JEMALLOC_ENABLE_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x); uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x); @@ -52,14 +63,14 @@ JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { - return (InterlockedExchangeAdd64(p, x)); + return (InterlockedExchangeAdd64(p, x) + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { - return (InterlockedExchangeAdd64(p, -((int64_t)x))); + return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x); } #elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint64_t @@ -79,28 +90,31 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { + uint64_t t = x; asm volatile ( "lock; xaddq %0, %1;" - : "+r" (x), "=m" (*p) /* Outputs. */ + : "+r" (t), "=m" (*p) /* Outputs. */ : "m" (*p) /* Inputs. */ ); - return (x); + return (t + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { + uint64_t t; x = (uint64_t)(-(int64_t)x); + t = x; asm volatile ( "lock; xaddq %0, %1;" - : "+r" (x), "=m" (*p) /* Outputs. */ + : "+r" (t), "=m" (*p) /* Outputs. */ : "m" (*p) /* Inputs. */ ); - return (x); + return (t + x); } # elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint64_t @@ -164,14 +178,14 @@ JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { - return (InterlockedExchangeAdd(p, x)); + return (InterlockedExchangeAdd(p, x) + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { - return (InterlockedExchangeAdd(p, -((int32_t)x))); + return (InterlockedExchangeAdd(p, -((int32_t)x)) - x); } #elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint32_t @@ -191,28 +205,31 @@ atomic_sub_uint32(uint32_t *p, uint32_t x) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { + uint32_t t = x; asm volatile ( "lock; xaddl %0, %1;" - : "+r" (x), "=m" (*p) /* Outputs. */ + : "+r" (t), "=m" (*p) /* Outputs. */ : "m" (*p) /* Inputs. 
*/ ); - return (x); + return (t + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { + uint32_t t; x = (uint32_t)(-(int32_t)x); + t = x; asm volatile ( "lock; xaddl %0, %1;" - : "+r" (x), "=m" (*p) /* Outputs. */ + : "+r" (t), "=m" (*p) /* Outputs. */ : "m" (*p) /* Inputs. */ ); - return (x); + return (t + x); } #elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint32_t diff --git a/test/unit/atomic.c b/test/unit/atomic.c new file mode 100644 index 0000000..eb6136c --- /dev/null +++ b/test/unit/atomic.c @@ -0,0 +1,97 @@ +#include "test/jemalloc_test.h" + +#define TEST_STRUCT(p, t) \ +struct p##_test_s { \ + t accum0; \ + t x; \ +}; \ +typedef struct p##_test_s p##_test_t; + +#define TEST_BODY(p, t, PRI) do { \ + const p##_test_t tests[] = { \ + {-1, -1}, \ + {-1, 0}, \ + {-1, 1}, \ + \ + { 0, -1}, \ + { 0, 0}, \ + { 0, 1}, \ + \ + { 1, -1}, \ + { 1, 0}, \ + { 1, 1}, \ + \ + {0, -(1 << 22)}, \ + {0, (1 << 22)}, \ + {(1 << 22), -(1 << 22)}, \ + {(1 << 22), (1 << 22)} \ + }; \ + unsigned i; \ + \ + for (i = 0; i < sizeof(tests)/sizeof(p##_test_t); i++) { \ + t accum = tests[i].accum0; \ + assert_u64_eq(atomic_read_##p(&accum), tests[i].accum0, \ + "i=%u", i); \ + assert_u64_eq(atomic_add_##p(&accum, tests[i].x), \ + tests[i].accum0 + tests[i].x, \ + "i=%u, accum=%#"PRI", x=%#"PRI, \ + i, tests[i].accum0, tests[i].x); \ + assert_u64_eq(atomic_read_##p(&accum), accum, \ + "i=%u", i); \ + \ + accum = tests[i].accum0; \ + assert_u64_eq(atomic_sub_##p(&accum, tests[i].x), \ + tests[i].accum0 - tests[i].x, \ + "i=%u, accum=%#"PRI", x=%#"PRI, \ + i, tests[i].accum0, tests[i].x); \ + assert_u64_eq(atomic_read_##p(&accum), accum, \ + "i=%u", i); \ + } \ +} while (0) + +TEST_STRUCT(uint64, uint64_t) +TEST_BEGIN(test_atomic_uint64) +{ + +#if !(LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3) + test_skip("64-bit atomic operations not supported"); +#else + TEST_BODY(uint64, uint64_t, PRIx64); +#endif +} +TEST_END + +TEST_STRUCT(uint32, uint32_t) +TEST_BEGIN(test_atomic_uint32) +{ + + TEST_BODY(uint32, uint32_t, PRIx32); +} +TEST_END + +TEST_STRUCT(z, size_t) +TEST_BEGIN(test_atomic_z) +{ + + TEST_BODY(z, size_t, "zx"); +} +TEST_END + +TEST_STRUCT(u, unsigned) +TEST_BEGIN(test_atomic_u) +{ + + TEST_BODY(u, unsigned, "x"); +} +TEST_END + +int +main(void) +{ + + return (test( + test_atomic_uint64, + test_atomic_uint32, + test_atomic_z, + test_atomic_u)); +} -- cgit v0.12 From 1522937e9cbcfa24c881dc439cc454f9a34a7e88 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 6 Aug 2014 23:38:39 -0700 Subject: Fix the cactive statistic. Fix the cactive statistic to decrease (rather than increase) when active memory decreases. This regression was introduced by aa5113b1fdafd1129c22512837c6c3d66c295fc8 (Refactor overly large/complex functions) and first released in 3.5.0. 
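The regression is a sign error: the pre-fix code computed CHUNK_CEILING((nactive + add_pages) << LG_PAGE) - CHUNK_CEILING((nactive - sub_pages) << LG_PAGE), which can never be negative when add_pages is 0, so deallocations inflated cactive; the fix computes new total minus old total, CHUNK_CEILING((nactive + add_pages - sub_pages) << LG_PAGE) - CHUNK_CEILING(nactive << LG_PAGE). The following is a minimal standalone sketch of the two formulas, assuming 4 KiB pages and the 4 MiB default chunk size; the constants and variable values here are illustrative, not taken from the patch.

#include <assert.h>
#include <stddef.h>
#include <sys/types.h>	/* ssize_t */

#define	LG_PAGE		12			/* 4 KiB pages (assumed). */
#define	CHUNKSIZE	((size_t)1 << 22)	/* 4 MiB chunks (assumed). */
#define	CHUNK_CEILING(s) (((s) + CHUNKSIZE - 1) & ~(CHUNKSIZE - 1))

int
main(void)
{
	size_t nactive = 2048;			/* 8 MiB currently active. */
	size_t add_pages = 0, sub_pages = 1024;	/* Freeing 4 MiB. */

	/* Pre-fix: old count vs. (old - sub), so a free yields +4 MiB. */
	ssize_t buggy = (ssize_t)CHUNK_CEILING((nactive + add_pages) << LG_PAGE) -
	    (ssize_t)CHUNK_CEILING((nactive - sub_pages) << LG_PAGE);
	/* Post-fix: new count vs. old count, so a free yields -4 MiB. */
	ssize_t fixed = (ssize_t)CHUNK_CEILING((nactive + add_pages - sub_pages)
	    << LG_PAGE) - (ssize_t)CHUNK_CEILING(nactive << LG_PAGE);

	assert(buggy > 0);	/* cactive would (incorrectly) grow. */
	assert(fixed < 0);	/* cactive correctly shrinks by one chunk. */
	return (0);
}

With these example values the old expression reports +4 MiB while the new one reports -4 MiB, matching the commit's description of cactive now decreasing when active memory decreases.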
--- src/arena.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 118700b..c0ec98a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -382,9 +382,9 @@ arena_cactive_update(arena_t *arena, size_t add_pages, size_t sub_pages) { if (config_stats) { - ssize_t cactive_diff = CHUNK_CEILING((arena->nactive + - add_pages) << LG_PAGE) - CHUNK_CEILING((arena->nactive - - sub_pages) << LG_PAGE); + ssize_t cactive_diff = CHUNK_CEILING((arena->nactive + add_pages + - sub_pages) << LG_PAGE) - CHUNK_CEILING(arena->nactive << + LG_PAGE); if (cactive_diff != 0) stats_cactive_add(cactive_diff); } -- cgit v0.12 From 011dde96c52e37e897526e242e9e3018caafb751 Mon Sep 17 00:00:00 2001 From: Psi Mankoski Date: Mon, 11 Aug 2014 17:08:25 -0700 Subject: Set VERSION also when the source directory is a git submodule using a ".git" file pointing to the repo. directory. --- configure.ac | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 9c365ed..ede5f70 100644 --- a/configure.ac +++ b/configure.ac @@ -1029,8 +1029,8 @@ dnl ============================================================================ dnl jemalloc configuration. dnl -dnl Set VERSION if source directory has an embedded git repository. -if test -d "${srcroot}.git" ; then +dnl Set VERSION if source directory has an embedded git repository or is a git submodule. +if test -e "${srcroot}.git" ; then git describe --long --abbrev=40 > ${srcroot}VERSION fi jemalloc_version=`cat ${srcroot}VERSION` -- cgit v0.12 From 04d60a132beed9e8c33f73b94fb9251b919073c8 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Fri, 18 Jul 2014 14:21:17 -0700 Subject: Maintain all the dirty runs in a linked list for each arena --- include/jemalloc/internal/arena.h | 6 +++++ src/arena.c | 47 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index cb73283..3422f36 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -89,6 +89,9 @@ struct arena_chunk_map_s { }; /* union { ... }; */ #endif + /* Linkage for list of dirty runs. */ + ql_elm(arena_chunk_map_t) dr_link; + /* * Run address (or size) and various flags are stored together. The bit * layout looks like (assuming 32-bit system): @@ -333,6 +336,9 @@ struct arena_s { /* Tree of dirty-page-containing chunks this arena manages. */ arena_chunk_tree_t chunks_dirty; + /* List of dirty runs this arena manages. */ + arena_chunk_mapelms_t runs_dirty; + /* * In order to avoid rapid chunk allocation/deallocation when an arena * oscillates right on the cusp of needing a new chunk, cache the most diff --git a/src/arena.c b/src/arena.c index c0ec98a..3397731 100644 --- a/src/arena.c +++ b/src/arena.c @@ -394,6 +394,7 @@ static void arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, size_t flag_dirty, size_t need_pages) { + arena_chunk_map_t *mapelm; size_t total_pages, rem_pages; total_pages = arena_mapbits_unallocated_size_get(chunk, run_ind) >> @@ -404,6 +405,11 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, rem_pages = total_pages - need_pages; arena_avail_remove(arena, chunk, run_ind, total_pages, true, true); + if (flag_dirty != 0) { + /* If the run is dirty, it must be in the dirty list. 
*/ + mapelm = arena_mapp_get(chunk, run_ind); + ql_remove(&arena->runs_dirty, mapelm, dr_link); + } arena_cactive_update(arena, need_pages, 0); arena->nactive += need_pages; @@ -416,6 +422,14 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, arena_mapbits_unallocated_set(chunk, run_ind+total_pages-1, (rem_pages << LG_PAGE), flag_dirty); + mapelm = arena_mapp_get(chunk, run_ind+need_pages); + /* + * Append the trailing run at the end of the dirty list. + * We could also insert the run at the original place. + * Let us consider this later. + */ + ql_elm_new(mapelm, dr_link); + ql_tail_insert(&arena->runs_dirty, mapelm, dr_link); } else { arena_mapbits_unallocated_set(chunk, run_ind+need_pages, (rem_pages << LG_PAGE), @@ -701,6 +715,11 @@ arena_chunk_alloc(arena_t *arena) /* Insert the run into the runs_avail tree. */ arena_avail_insert(arena, chunk, map_bias, chunk_npages-map_bias, false, false); + if (arena_mapbits_dirty_get(chunk, map_bias) != 0) { + arena_chunk_map_t *mapelm = arena_mapp_get(chunk, map_bias); + ql_elm_new(mapelm, dr_link); + ql_tail_insert(&arena->runs_dirty, mapelm, dr_link); + } return (chunk); } @@ -739,6 +758,7 @@ arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t size) static void arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) { + assert(arena_mapbits_allocated_get(chunk, map_bias) == 0); assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0); assert(arena_mapbits_unallocated_size_get(chunk, map_bias) == @@ -754,6 +774,10 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) */ arena_avail_remove(arena, chunk, map_bias, chunk_npages-map_bias, false, false); + if (arena_mapbits_dirty_get(chunk, map_bias) != 0) { + arena_chunk_map_t *mapelm = arena_mapp_get(chunk, map_bias); + ql_remove(&arena->runs_dirty, mapelm, dr_link); + } if (arena->spare != NULL) { arena_chunk_t *spare = arena->spare; @@ -1216,6 +1240,13 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, arena_avail_remove(arena, chunk, run_ind+run_pages, nrun_pages, false, true); + /* If the successor is dirty, remove it from runs_dirty. */ + if (flag_dirty != 0) { + arena_chunk_map_t *mapelm = arena_mapp_get(chunk, + run_ind+run_pages); + ql_remove(&arena->runs_dirty, mapelm, dr_link); + } + size += nrun_size; run_pages += nrun_pages; @@ -1244,6 +1275,13 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, arena_avail_remove(arena, chunk, run_ind, prun_pages, true, false); + /* If the predecessor is dirty, remove it from runs_dirty. */ + if (flag_dirty != 0) { + arena_chunk_map_t *mapelm = arena_mapp_get(chunk, + run_ind); + ql_remove(&arena->runs_dirty, mapelm, dr_link); + } + size += prun_size; run_pages += prun_pages; @@ -1261,6 +1299,7 @@ static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) { arena_chunk_t *chunk; + arena_chunk_map_t *mapelm; size_t size, run_ind, run_pages, flag_dirty; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); @@ -1315,6 +1354,13 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_mapbits_dirty_get(chunk, run_ind+run_pages-1)); arena_avail_insert(arena, chunk, run_ind, run_pages, true, true); + if (dirty) { + /* Insert into runs_dirty list. */ + mapelm = arena_mapp_get(chunk, run_ind); + ql_elm_new(mapelm, dr_link); + ql_tail_insert(&arena->runs_dirty, mapelm, dr_link); + } + /* Deallocate chunk if it is now completely unused. 
*/ if (size == arena_maxclass) { assert(run_ind == map_bias); @@ -2437,6 +2483,7 @@ arena_new(arena_t *arena, unsigned ind) /* Initialize chunks. */ arena_chunk_dirty_new(&arena->chunks_dirty); + ql_new(&arena->runs_dirty); arena->spare = NULL; arena->nactive = 0; -- cgit v0.12 From a244e5078e8505978b5f63cfe6dcb3c9d63d2cb5 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Mon, 21 Jul 2014 10:23:36 -0700 Subject: Add dirty page counting for debug --- src/arena.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/arena.c b/src/arena.c index 3397731..3cf1abf 100644 --- a/src/arena.c +++ b/src/arena.c @@ -923,11 +923,33 @@ arena_maybe_purge(arena_t *arena) static arena_chunk_t * chunks_dirty_iter_cb(arena_chunk_tree_t *tree, arena_chunk_t *chunk, void *arg) { - size_t *ndirty = (size_t *)arg; + size_t *ndirty = (size_t *)arg; - assert(chunk->ndirty != 0); - *ndirty += chunk->ndirty; - return (NULL); + assert(chunk->ndirty != 0); + *ndirty += chunk->ndirty; + return (NULL); +} + +static size_t +arena_dirty_count(arena_t *arena) +{ + size_t ndirty = 0; + arena_chunk_map_t *mapelm; + arena_chunk_t *chunk; + size_t pageind, npages; + + ql_foreach(mapelm, &arena->runs_dirty, dr_link) { + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); + pageind = arena_mapelm_to_pageind(mapelm); + assert(arena_mapbits_allocated_get(chunk, pageind) == 0); + assert(arena_mapbits_large_get(chunk, pageind) == 0); + assert(arena_mapbits_dirty_get(chunk, pageind) != 0); + npages = arena_mapbits_unallocated_size_get(chunk, pageind) >> + LG_PAGE; + ndirty += npages; + } + + return (ndirty); } static size_t @@ -1134,6 +1156,9 @@ arena_purge(arena_t *arena, bool all) arena_chunk_dirty_iter(&arena->chunks_dirty, NULL, chunks_dirty_iter_cb, (void *)&ndirty); assert(ndirty == arena->ndirty); + + ndirty = arena_dirty_count(arena); + assert(ndirty == arena->ndirty); } assert(arena->ndirty > arena->npurgatory || all); assert((arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty - -- cgit v0.12 From e970800c780df918b80f8b914eeac475dd5f1ec4 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Mon, 21 Jul 2014 18:09:04 -0700 Subject: Purge dirty pages from the beginning of the dirty list. --- src/arena.c | 235 ++++++++++++++++++------------------------------------------ 1 file changed, 70 insertions(+), 165 deletions(-) diff --git a/src/arena.c b/src/arena.c index 3cf1abf..a78a66f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -973,86 +973,73 @@ arena_compute_npurgatory(arena_t *arena, bool all) return (npurgatory); } -static void -arena_chunk_stash_dirty(arena_t *arena, arena_chunk_t *chunk, bool all, +static size_t +arena_stash_dirty(arena_t *arena, bool all, size_t npurgatory, arena_chunk_mapelms_t *mapelms) { - size_t pageind, npages; + arena_chunk_map_t *mapelm; + size_t nstashed = 0; + arena_chunk_t *chunk; + size_t pageind, npages, run_size; + arena_run_t *run; - /* - * Temporarily allocate free dirty runs within chunk. If all is false, - * only operate on dirty runs that are fragments; otherwise operate on - * all dirty runs. 
- */ - for (pageind = map_bias; pageind < chunk_npages; pageind += npages) { - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); - if (arena_mapbits_allocated_get(chunk, pageind) == 0) { - size_t run_size = - arena_mapbits_unallocated_size_get(chunk, pageind); - - npages = run_size >> LG_PAGE; - assert(pageind + npages <= chunk_npages); - assert(arena_mapbits_dirty_get(chunk, pageind) == - arena_mapbits_dirty_get(chunk, pageind+npages-1)); - - if (arena_mapbits_dirty_get(chunk, pageind) != 0 && - (all || arena_avail_adjac(chunk, pageind, - npages))) { - arena_run_t *run = (arena_run_t *)((uintptr_t) - chunk + (uintptr_t)(pageind << LG_PAGE)); - - arena_run_split_large(arena, run, run_size, - false); - /* Append to list for later processing. */ - ql_elm_new(mapelm, u.ql_link); - ql_tail_insert(mapelms, mapelm, u.ql_link); - } - } else { - /* Skip run. */ - if (arena_mapbits_large_get(chunk, pageind) != 0) { - npages = arena_mapbits_large_size_get(chunk, - pageind) >> LG_PAGE; - } else { - size_t binind; - arena_bin_info_t *bin_info; - arena_run_t *run = (arena_run_t *)((uintptr_t) - chunk + (uintptr_t)(pageind << LG_PAGE)); - - assert(arena_mapbits_small_runind_get(chunk, - pageind) == 0); - binind = arena_bin_index(arena, run->bin); - bin_info = &arena_bin_info[binind]; - npages = bin_info->run_size >> LG_PAGE; - } - } + /* Add at least npurgatory pages to purge_list. */ + for (mapelm = ql_first(&arena->runs_dirty); mapelm != NULL; + mapelm = ql_first(&arena->runs_dirty)) { + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); + pageind = arena_mapelm_to_pageind(mapelm); + run_size = arena_mapbits_unallocated_size_get(chunk, pageind); + npages = run_size >> LG_PAGE; + run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << + LG_PAGE)); + + assert(pageind + npages <= chunk_npages); + assert(arena_mapbits_dirty_get(chunk, pageind) == + arena_mapbits_dirty_get(chunk, pageind+npages-1)); + + /* Temporarily allocate the free dirty run. */ + arena_run_split_large(arena, run, run_size, false); + /* Append to purge_list for later processing. */ + ql_elm_new(mapelm, dr_link); + ql_tail_insert(mapelms, mapelm, dr_link); + + nstashed += npages; + + if (all == false && nstashed >= npurgatory) + break; } - assert(pageind == chunk_npages); - assert(chunk->ndirty == 0 || all == false); - assert(chunk->nruns_adjac == 0); + + return (nstashed); } static size_t -arena_chunk_purge_stashed(arena_t *arena, arena_chunk_t *chunk, - arena_chunk_mapelms_t *mapelms) +arena_purge_stashed(arena_t *arena, arena_chunk_mapelms_t *mapelms) { - size_t npurged, pageind, npages, nmadvise; + size_t npurged, nmadvise; arena_chunk_map_t *mapelm; + arena_chunk_t *chunk; + size_t pageind, npages, run_size; - malloc_mutex_unlock(&arena->lock); if (config_stats) nmadvise = 0; npurged = 0; - ql_foreach(mapelm, mapelms, u.ql_link) { + + malloc_mutex_unlock(&arena->lock); + + ql_foreach(mapelm, mapelms, dr_link) { bool unzeroed; size_t flag_unzeroed, i; + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); pageind = arena_mapelm_to_pageind(mapelm); - npages = arena_mapbits_large_size_get(chunk, pageind) >> - LG_PAGE; + run_size = arena_mapbits_large_size_get(chunk, pageind); + npages = run_size >> LG_PAGE; + assert(pageind + npages <= chunk_npages); unzeroed = pages_purge((void *)((uintptr_t)chunk + (pageind << - LG_PAGE)), (npages << LG_PAGE)); + LG_PAGE)), run_size); flag_unzeroed = unzeroed ? 
CHUNK_MAP_UNZEROED : 0; + /* * Set the unzeroed flag for all pages, now that pages_purge() * has returned whether the pages were zeroed as a side effect @@ -1067,89 +1054,48 @@ arena_chunk_purge_stashed(arena_t *arena, arena_chunk_t *chunk, arena_mapbits_unzeroed_set(chunk, pageind+i, flag_unzeroed); } + npurged += npages; if (config_stats) nmadvise++; } + malloc_mutex_lock(&arena->lock); - if (config_stats) + + if (config_stats) { arena->stats.nmadvise += nmadvise; + arena->stats.purged += npurged; + } return (npurged); } static void -arena_chunk_unstash_purged(arena_t *arena, arena_chunk_t *chunk, - arena_chunk_mapelms_t *mapelms) +arena_unstash_purged(arena_t *arena, arena_chunk_mapelms_t *mapelms) { arena_chunk_map_t *mapelm; + arena_chunk_t *chunk; + arena_run_t *run; size_t pageind; /* Deallocate runs. */ for (mapelm = ql_first(mapelms); mapelm != NULL; mapelm = ql_first(mapelms)) { - arena_run_t *run; - + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); pageind = arena_mapelm_to_pageind(mapelm); run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << LG_PAGE)); - ql_remove(mapelms, mapelm, u.ql_link); + ql_remove(mapelms, mapelm, dr_link); arena_run_dalloc(arena, run, false, true); } } -static inline size_t -arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all) -{ - size_t npurged; - arena_chunk_mapelms_t mapelms; - - ql_new(&mapelms); - - /* - * If chunk is the spare, temporarily re-allocate it, 1) so that its - * run is reinserted into runs_avail, and 2) so that it cannot be - * completely discarded by another thread while arena->lock is dropped - * by this thread. Note that the arena_run_dalloc() call will - * implicitly deallocate the chunk, so no explicit action is required - * in this function to deallocate the chunk. - * - * Note that once a chunk contains dirty pages, it cannot again contain - * a single run unless 1) it is a dirty run, or 2) this function purges - * dirty pages and causes the transition to a single clean run. Thus - * (chunk == arena->spare) is possible, but it is not possible for - * this function to be called on the spare unless it contains a dirty - * run. - */ - if (chunk == arena->spare) { - assert(arena_mapbits_dirty_get(chunk, map_bias) != 0); - assert(arena_mapbits_dirty_get(chunk, chunk_npages-1) != 0); - - arena_chunk_alloc(arena); - } - - if (config_stats) - arena->stats.purged += chunk->ndirty; - - /* - * Operate on all dirty runs if there is no clean/dirty run - * fragmentation. - */ - if (chunk->nruns_adjac == 0) - all = true; - - arena_chunk_stash_dirty(arena, chunk, all, &mapelms); - npurged = arena_chunk_purge_stashed(arena, chunk, &mapelms); - arena_chunk_unstash_purged(arena, chunk, &mapelms); - - return (npurged); -} - -static void +void arena_purge(arena_t *arena, bool all) { - arena_chunk_t *chunk; - size_t npurgatory; + size_t npurgatory, npurgeable, npurged; + arena_chunk_mapelms_t purge_list; + if (config_debug) { size_t ndirty = 0; @@ -1175,58 +1121,17 @@ arena_purge(arena_t *arena, bool all) npurgatory = arena_compute_npurgatory(arena, all); arena->npurgatory += npurgatory; - while (npurgatory > 0) { - size_t npurgeable, npurged, nunpurged; + ql_new(&purge_list); - /* Get next chunk with dirty pages. */ - chunk = arena_chunk_dirty_first(&arena->chunks_dirty); - if (chunk == NULL) { - /* - * This thread was unable to purge as many pages as - * originally intended, due to races with other threads - * that either did some of the purging work, or re-used - * dirty pages. 
- */ - arena->npurgatory -= npurgatory; - return; - } - npurgeable = chunk->ndirty; - assert(npurgeable != 0); + npurgeable = arena_stash_dirty(arena, all, npurgatory, &purge_list); + assert(npurgeable >= npurgatory); + /* Actually we no longer need arena->npurgatory. */ + arena->npurgatory -= npurgatory; - if (npurgeable > npurgatory && chunk->nruns_adjac == 0) { - /* - * This thread will purge all the dirty pages in chunk, - * so set npurgatory to reflect this thread's intent to - * purge the pages. This tends to reduce the chances - * of the following scenario: - * - * 1) This thread sets arena->npurgatory such that - * (arena->ndirty - arena->npurgatory) is at the - * threshold. - * 2) This thread drops arena->lock. - * 3) Another thread causes one or more pages to be - * dirtied, and immediately determines that it must - * purge dirty pages. - * - * If this scenario *does* play out, that's okay, - * because all of the purging work being done really - * needs to happen. - */ - arena->npurgatory += npurgeable - npurgatory; - npurgatory = npurgeable; - } + npurged = arena_purge_stashed(arena, &purge_list); + assert(npurged == npurgeable); - /* - * Keep track of how many pages are purgeable, versus how many - * actually get purged, and adjust counters accordingly. - */ - arena->npurgatory -= npurgeable; - npurgatory -= npurgeable; - npurged = arena_chunk_purge(arena, chunk, all); - nunpurged = npurgeable - npurged; - arena->npurgatory += nunpurged; - npurgatory += nunpurged; - } + arena_unstash_purged(arena, &purge_list); } void -- cgit v0.12 From 90737fcda150a5da3f4db1c3144ea24eed8de55b Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Mon, 21 Jul 2014 19:39:20 -0700 Subject: Remove chunks_dirty tree, nruns_avail and nruns_adjac since we no longer need to maintain the tree for dirty page purging. --- include/jemalloc/internal/arena.h | 19 ---- src/arena.c | 187 ++------------------------------------ 2 files changed, 10 insertions(+), 196 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 3422f36..f87dfe4 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -167,24 +167,9 @@ struct arena_chunk_s { /* Arena that owns the chunk. */ arena_t *arena; - /* Linkage for tree of arena chunks that contain dirty runs. */ - rb_node(arena_chunk_t) dirty_link; - /* Number of dirty pages. */ size_t ndirty; - /* Number of available runs. */ - size_t nruns_avail; - - /* - * Number of available run adjacencies that purging could coalesce. - * Clean and dirty available runs are not coalesced, which causes - * virtual memory fragmentation. The ratio of - * (nruns_avail-nruns_adjac):nruns_adjac is used for tracking this - * fragmentation. - */ - size_t nruns_adjac; - /* * Map of pages within chunk that keeps track of free/large/small. The * first map_bias entries are omitted, since the chunk header does not @@ -193,7 +178,6 @@ struct arena_chunk_s { */ arena_chunk_map_t map[1]; /* Dynamically sized. */ }; -typedef rb_tree(arena_chunk_t) arena_chunk_tree_t; struct arena_run_s { /* Bin this run is associated with. */ @@ -333,9 +317,6 @@ struct arena_s { dss_prec_t dss_prec; - /* Tree of dirty-page-containing chunks this arena manages. */ - arena_chunk_tree_t chunks_dirty; - /* List of dirty runs this arena manages. 
*/ arena_chunk_mapelms_t runs_dirty; diff --git a/src/arena.c b/src/arena.c index a78a66f..24ed2ba 100644 --- a/src/arena.c +++ b/src/arena.c @@ -125,143 +125,18 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) rb_gen(static UNUSED, arena_avail_tree_, arena_avail_tree_t, arena_chunk_map_t, u.rb_link, arena_avail_comp) -static inline int -arena_chunk_dirty_comp(arena_chunk_t *a, arena_chunk_t *b) -{ - - assert(a != NULL); - assert(b != NULL); - - /* - * Short-circuit for self comparison. The following comparison code - * would come to the same result, but at the cost of executing the slow - * path. - */ - if (a == b) - return (0); - - /* - * Order such that chunks with higher fragmentation are "less than" - * those with lower fragmentation -- purging order is from "least" to - * "greatest". Fragmentation is measured as: - * - * mean current avail run size - * -------------------------------- - * mean defragmented avail run size - * - * navail - * ----------- - * nruns_avail nruns_avail-nruns_adjac - * = ========================= = ----------------------- - * navail nruns_avail - * ----------------------- - * nruns_avail-nruns_adjac - * - * The following code multiplies away the denominator prior to - * comparison, in order to avoid division. - * - */ - { - size_t a_val = (a->nruns_avail - a->nruns_adjac) * - b->nruns_avail; - size_t b_val = (b->nruns_avail - b->nruns_adjac) * - a->nruns_avail; - - if (a_val < b_val) - return (1); - if (a_val > b_val) - return (-1); - } - /* - * Break ties by chunk address. For fragmented chunks, report lower - * addresses as "lower", so that fragmentation reduction happens first - * at lower addresses. However, use the opposite ordering for - * unfragmented chunks, in order to increase the chances of - * re-allocating dirty runs. - */ - { - uintptr_t a_chunk = (uintptr_t)a; - uintptr_t b_chunk = (uintptr_t)b; - int ret = ((a_chunk > b_chunk) - (a_chunk < b_chunk)); - if (a->nruns_adjac == 0) { - assert(b->nruns_adjac == 0); - ret = -ret; - } - return (ret); - } -} - -/* Generate red-black tree functions. 
*/ -rb_gen(static UNUSED, arena_chunk_dirty_, arena_chunk_tree_t, arena_chunk_t, - dirty_link, arena_chunk_dirty_comp) - -static inline bool -arena_avail_adjac_pred(arena_chunk_t *chunk, size_t pageind) -{ - bool ret; - - if (pageind-1 < map_bias) - ret = false; - else { - ret = (arena_mapbits_allocated_get(chunk, pageind-1) == 0); - assert(ret == false || arena_mapbits_dirty_get(chunk, - pageind-1) != arena_mapbits_dirty_get(chunk, pageind)); - } - return (ret); -} - -static inline bool -arena_avail_adjac_succ(arena_chunk_t *chunk, size_t pageind, size_t npages) -{ - bool ret; - - if (pageind+npages == chunk_npages) - ret = false; - else { - assert(pageind+npages < chunk_npages); - ret = (arena_mapbits_allocated_get(chunk, pageind+npages) == 0); - assert(ret == false || arena_mapbits_dirty_get(chunk, pageind) - != arena_mapbits_dirty_get(chunk, pageind+npages)); - } - return (ret); -} - -static inline bool -arena_avail_adjac(arena_chunk_t *chunk, size_t pageind, size_t npages) -{ - - return (arena_avail_adjac_pred(chunk, pageind) || - arena_avail_adjac_succ(chunk, pageind, npages)); -} - static void arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, - size_t npages, bool maybe_adjac_pred, bool maybe_adjac_succ) + size_t npages) { assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); - /* - * chunks_dirty is keyed by nruns_{avail,adjac}, so the chunk must be - * removed and reinserted even if the run to be inserted is clean. - */ - if (chunk->ndirty != 0) - arena_chunk_dirty_remove(&arena->chunks_dirty, chunk); - - if (maybe_adjac_pred && arena_avail_adjac_pred(chunk, pageind)) - chunk->nruns_adjac++; - if (maybe_adjac_succ && arena_avail_adjac_succ(chunk, pageind, npages)) - chunk->nruns_adjac++; - chunk->nruns_avail++; - assert(chunk->nruns_avail > chunk->nruns_adjac); - if (arena_mapbits_dirty_get(chunk, pageind) != 0) { arena->ndirty += npages; chunk->ndirty += npages; } - if (chunk->ndirty != 0) - arena_chunk_dirty_insert(&arena->chunks_dirty, chunk); arena_avail_tree_insert(&arena->runs_avail, arena_mapp_get(chunk, pageind)); @@ -269,33 +144,16 @@ arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, static void arena_avail_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, - size_t npages, bool maybe_adjac_pred, bool maybe_adjac_succ) + size_t npages) { assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); - /* - * chunks_dirty is keyed by nruns_{avail,adjac}, so the chunk must be - * removed and reinserted even if the run to be removed is clean. 
- */ - if (chunk->ndirty != 0) - arena_chunk_dirty_remove(&arena->chunks_dirty, chunk); - - if (maybe_adjac_pred && arena_avail_adjac_pred(chunk, pageind)) - chunk->nruns_adjac--; - if (maybe_adjac_succ && arena_avail_adjac_succ(chunk, pageind, npages)) - chunk->nruns_adjac--; - chunk->nruns_avail--; - assert(chunk->nruns_avail > chunk->nruns_adjac || (chunk->nruns_avail - == 0 && chunk->nruns_adjac == 0)); - if (arena_mapbits_dirty_get(chunk, pageind) != 0) { arena->ndirty -= npages; chunk->ndirty -= npages; } - if (chunk->ndirty != 0) - arena_chunk_dirty_insert(&arena->chunks_dirty, chunk); arena_avail_tree_remove(&arena->runs_avail, arena_mapp_get(chunk, pageind)); @@ -404,7 +262,7 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, assert(need_pages <= total_pages); rem_pages = total_pages - need_pages; - arena_avail_remove(arena, chunk, run_ind, total_pages, true, true); + arena_avail_remove(arena, chunk, run_ind, total_pages); if (flag_dirty != 0) { /* If the run is dirty, it must be in the dirty list. */ mapelm = arena_mapp_get(chunk, run_ind); @@ -440,8 +298,7 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, arena_mapbits_unzeroed_get(chunk, run_ind+total_pages-1)); } - arena_avail_insert(arena, chunk, run_ind+need_pages, rem_pages, - false, true); + arena_avail_insert(arena, chunk, run_ind+need_pages, rem_pages); } } @@ -660,9 +517,6 @@ arena_chunk_init_hard(arena_t *arena) */ chunk->ndirty = 0; - chunk->nruns_avail = 0; - chunk->nruns_adjac = 0; - /* * Initialize the map to contain one maximal free untouched run. Mark * the pages as zeroed iff chunk_alloc() returned a zeroed chunk. @@ -713,8 +567,7 @@ arena_chunk_alloc(arena_t *arena) } /* Insert the run into the runs_avail tree. */ - arena_avail_insert(arena, chunk, map_bias, chunk_npages-map_bias, - false, false); + arena_avail_insert(arena, chunk, map_bias, chunk_npages-map_bias); if (arena_mapbits_dirty_get(chunk, map_bias) != 0) { arena_chunk_map_t *mapelm = arena_mapp_get(chunk, map_bias); ql_elm_new(mapelm, dr_link); @@ -772,8 +625,7 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) * Remove run from the runs_avail tree, so that the arena does not use * it. 
*/ - arena_avail_remove(arena, chunk, map_bias, chunk_npages-map_bias, - false, false); + arena_avail_remove(arena, chunk, map_bias, chunk_npages-map_bias); if (arena_mapbits_dirty_get(chunk, map_bias) != 0) { arena_chunk_map_t *mapelm = arena_mapp_get(chunk, map_bias); ql_remove(&arena->runs_dirty, mapelm, dr_link); @@ -920,16 +772,6 @@ arena_maybe_purge(arena_t *arena) arena_purge(arena, false); } -static arena_chunk_t * -chunks_dirty_iter_cb(arena_chunk_tree_t *tree, arena_chunk_t *chunk, void *arg) -{ - size_t *ndirty = (size_t *)arg; - - assert(chunk->ndirty != 0); - *ndirty += chunk->ndirty; - return (NULL); -} - static size_t arena_dirty_count(arena_t *arena) { @@ -1097,13 +939,7 @@ arena_purge(arena_t *arena, bool all) arena_chunk_mapelms_t purge_list; if (config_debug) { - size_t ndirty = 0; - - arena_chunk_dirty_iter(&arena->chunks_dirty, NULL, - chunks_dirty_iter_cb, (void *)&ndirty); - assert(ndirty == arena->ndirty); - - ndirty = arena_dirty_count(arena); + size_t ndirty = arena_dirty_count(arena); assert(ndirty == arena->ndirty); } assert(arena->ndirty > arena->npurgatory || all); @@ -1167,8 +1003,7 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, run_ind+run_pages+nrun_pages-1) == nrun_size); assert(arena_mapbits_dirty_get(chunk, run_ind+run_pages+nrun_pages-1) == flag_dirty); - arena_avail_remove(arena, chunk, run_ind+run_pages, nrun_pages, - false, true); + arena_avail_remove(arena, chunk, run_ind+run_pages, nrun_pages); /* If the successor is dirty, remove it from runs_dirty. */ if (flag_dirty != 0) { @@ -1202,8 +1037,7 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, assert(arena_mapbits_unallocated_size_get(chunk, run_ind) == prun_size); assert(arena_mapbits_dirty_get(chunk, run_ind) == flag_dirty); - arena_avail_remove(arena, chunk, run_ind, prun_pages, true, - false); + arena_avail_remove(arena, chunk, run_ind, prun_pages); /* If the predecessor is dirty, remove it from runs_dirty. */ if (flag_dirty != 0) { @@ -1282,7 +1116,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_mapbits_unallocated_size_get(chunk, run_ind+run_pages-1)); assert(arena_mapbits_dirty_get(chunk, run_ind) == arena_mapbits_dirty_get(chunk, run_ind+run_pages-1)); - arena_avail_insert(arena, chunk, run_ind, run_pages, true, true); + arena_avail_insert(arena, chunk, run_ind, run_pages); if (dirty) { /* Insert into runs_dirty list. */ @@ -2412,7 +2246,6 @@ arena_new(arena_t *arena, unsigned ind) arena->dss_prec = chunk_dss_prec_get(); /* Initialize chunks. */ - arena_chunk_dirty_new(&arena->chunks_dirty); ql_new(&arena->runs_dirty); arena->spare = NULL; -- cgit v0.12 From e8a2fd83a2ddc082fcd4e49373ea05bd79213c71 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Mon, 21 Jul 2014 20:00:14 -0700 Subject: arena->npurgatory is no longer needed since we drop arena's lock after stashing all the purgeable runs. --- include/jemalloc/internal/arena.h | 8 -------- src/arena.c | 15 +++------------ 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index f87dfe4..1e2e987 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -344,14 +344,6 @@ struct arena_s { size_t ndirty; /* - * Approximate number of pages being purged. It is possible for - * multiple threads to purge dirty pages concurrently, and they use - * npurgatory to indicate the total number of pages all threads are - * attempting to purge. 
- */ - size_t npurgatory; - - /* * Size/address-ordered trees of this arena's available runs. The trees * are used for first-best-fit run allocation. */ diff --git a/src/arena.c b/src/arena.c index 24ed2ba..68b156b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -757,10 +757,7 @@ arena_maybe_purge(arena_t *arena) /* Don't purge if the option is disabled. */ if (opt_lg_dirty_mult < 0) return; - /* Don't purge if all dirty pages are already being purged. */ - if (arena->ndirty <= arena->npurgatory) - return; - npurgeable = arena->ndirty - arena->npurgatory; + npurgeable = arena->ndirty; threshold = (arena->nactive >> opt_lg_dirty_mult); /* * Don't purge unless the number of purgeable pages exceeds the @@ -803,7 +800,7 @@ arena_compute_npurgatory(arena_t *arena, bool all) * Compute the minimum number of pages that this thread should try to * purge. */ - npurgeable = arena->ndirty - arena->npurgatory; + npurgeable = arena->ndirty; if (all == false) { size_t threshold = (arena->nactive >> opt_lg_dirty_mult); @@ -942,9 +939,7 @@ arena_purge(arena_t *arena, bool all) size_t ndirty = arena_dirty_count(arena); assert(ndirty == arena->ndirty); } - assert(arena->ndirty > arena->npurgatory || all); - assert((arena->nactive >> opt_lg_dirty_mult) < (arena->ndirty - - arena->npurgatory) || all); + assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty || all); if (config_stats) arena->stats.npurge++; @@ -955,14 +950,11 @@ arena_purge(arena_t *arena, bool all) * reduce ndirty below the threshold. */ npurgatory = arena_compute_npurgatory(arena, all); - arena->npurgatory += npurgatory; ql_new(&purge_list); npurgeable = arena_stash_dirty(arena, all, npurgatory, &purge_list); assert(npurgeable >= npurgatory); - /* Actually we no longer need arena->npurgatory. */ - arena->npurgatory -= npurgatory; npurged = arena_purge_stashed(arena, &purge_list); assert(npurged == npurgeable); @@ -2251,7 +2243,6 @@ arena_new(arena_t *arena, unsigned ind) arena->nactive = 0; arena->ndirty = 0; - arena->npurgatory = 0; arena_avail_tree_new(&arena->runs_avail); -- cgit v0.12 From 070b3c3fbd90296610005c111ec6060e8bb23d31 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 14 Aug 2014 14:45:58 -0700 Subject: Fix and refactor runs_dirty-based purging. Fix runs_dirty-based purging to also purge dirty pages in the spare chunk. Refactor runs_dirty manipulation into arena_dirty_{insert,remove}(), and move the arena->ndirty accounting into those functions. Remove the u.ql_link field from arena_chunk_map_t, and get rid of the enclosing union for u.rb_link, since only rb_link remains. Remove the ndirty field from arena_chunk_t. --- include/jemalloc/internal/arena.h | 34 +++---- src/arena.c | 184 +++++++++++++++++--------------------- 2 files changed, 91 insertions(+), 127 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 1e2e987..9351e3b 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -65,23 +65,14 @@ struct arena_chunk_map_s { */ union { #endif - union { - /* - * Linkage for run trees. There are two disjoint uses: - * - * 1) arena_t's runs_avail tree. - * 2) arena_run_t conceptually uses this linkage for in-use - * non-full runs, rather than directly embedding linkage. - */ - rb_node(arena_chunk_map_t) rb_link; - /* - * List of runs currently in purgatory. 
arena_chunk_purge() - * temporarily allocates runs that contain dirty pages while - * purging, so that other threads cannot use the runs while the - * purging thread is operating without the arena lock held. - */ - ql_elm(arena_chunk_map_t) ql_link; - } u; + /* + * Linkage for run trees. There are two disjoint uses: + * + * 1) arena_t's runs_avail tree. + * 2) arena_run_t conceptually uses this linkage for in-use non-full + * runs, rather than directly embedding linkage. + */ + rb_node(arena_chunk_map_t) rb_link; /* Profile counters, used for large object runs. */ prof_ctx_t *prof_ctx; @@ -167,9 +158,6 @@ struct arena_chunk_s { /* Arena that owns the chunk. */ arena_t *arena; - /* Number of dirty pages. */ - size_t ndirty; - /* * Map of pages within chunk that keeps track of free/large/small. The * first map_bias entries are omitted, since the chunk header does not @@ -317,9 +305,6 @@ struct arena_s { dss_prec_t dss_prec; - /* List of dirty runs this arena manages. */ - arena_chunk_mapelms_t runs_dirty; - /* * In order to avoid rapid chunk allocation/deallocation when an arena * oscillates right on the cusp of needing a new chunk, cache the most @@ -349,6 +334,9 @@ struct arena_s { */ arena_avail_tree_t runs_avail; + /* List of dirty runs this arena manages. */ + arena_chunk_mapelms_t runs_dirty; + /* * user-configureable chunk allocation and deallocation functions. */ diff --git a/src/arena.c b/src/arena.c index 68b156b..1263269 100644 --- a/src/arena.c +++ b/src/arena.c @@ -90,7 +90,7 @@ arena_run_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) /* Generate red-black tree functions. */ rb_gen(static UNUSED, arena_run_tree_, arena_run_tree_t, arena_chunk_map_t, - u.rb_link, arena_run_comp) + rb_link, arena_run_comp) static inline int arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) @@ -123,7 +123,7 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) /* Generate red-black tree functions. 
*/ rb_gen(static UNUSED, arena_avail_tree_, arena_avail_tree_t, arena_chunk_map_t, - u.rb_link, arena_avail_comp) + rb_link, arena_avail_comp) static void arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, @@ -132,12 +132,6 @@ arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); - - if (arena_mapbits_dirty_get(chunk, pageind) != 0) { - arena->ndirty += npages; - chunk->ndirty += npages; - } - arena_avail_tree_insert(&arena->runs_avail, arena_mapp_get(chunk, pageind)); } @@ -149,16 +143,39 @@ arena_avail_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); - - if (arena_mapbits_dirty_get(chunk, pageind) != 0) { - arena->ndirty -= npages; - chunk->ndirty -= npages; - } - arena_avail_tree_remove(&arena->runs_avail, arena_mapp_get(chunk, pageind)); } +static void +arena_dirty_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, + size_t npages) +{ + arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); + assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> + LG_PAGE)); + assert(arena_mapbits_dirty_get(chunk, pageind) == CHUNK_MAP_DIRTY); + assert(arena_mapbits_dirty_get(chunk, pageind+npages-1) == + CHUNK_MAP_DIRTY); + ql_elm_new(mapelm, dr_link); + ql_tail_insert(&arena->runs_dirty, mapelm, dr_link); + arena->ndirty += npages; +} + +static void +arena_dirty_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, + size_t npages) +{ + arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); + assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> + LG_PAGE)); + assert(arena_mapbits_dirty_get(chunk, pageind) == CHUNK_MAP_DIRTY); + assert(arena_mapbits_dirty_get(chunk, pageind+npages-1) == + CHUNK_MAP_DIRTY); + ql_remove(&arena->runs_dirty, mapelm, dr_link); + arena->ndirty -= npages; +} + static inline void * arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { @@ -252,7 +269,6 @@ static void arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, size_t flag_dirty, size_t need_pages) { - arena_chunk_map_t *mapelm; size_t total_pages, rem_pages; total_pages = arena_mapbits_unallocated_size_get(chunk, run_ind) >> @@ -263,11 +279,8 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, rem_pages = total_pages - need_pages; arena_avail_remove(arena, chunk, run_ind, total_pages); - if (flag_dirty != 0) { - /* If the run is dirty, it must be in the dirty list. */ - mapelm = arena_mapp_get(chunk, run_ind); - ql_remove(&arena->runs_dirty, mapelm, dr_link); - } + if (flag_dirty != 0) + arena_dirty_remove(arena, chunk, run_ind, total_pages); arena_cactive_update(arena, need_pages, 0); arena->nactive += need_pages; @@ -280,14 +293,8 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, arena_mapbits_unallocated_set(chunk, run_ind+total_pages-1, (rem_pages << LG_PAGE), flag_dirty); - mapelm = arena_mapp_get(chunk, run_ind+need_pages); - /* - * Append the trailing run at the end of the dirty list. - * We could also insert the run at the original place. - * Let us consider this later. 
- */ - ql_elm_new(mapelm, dr_link); - ql_tail_insert(&arena->runs_dirty, mapelm, dr_link); + arena_dirty_insert(arena, chunk, run_ind+need_pages, + rem_pages); } else { arena_mapbits_unallocated_set(chunk, run_ind+need_pages, (rem_pages << LG_PAGE), @@ -513,11 +520,6 @@ arena_chunk_init_hard(arena_t *arena) chunk->arena = arena; /* - * Claim that no pages are in use, since the header is merely overhead. - */ - chunk->ndirty = 0; - - /* * Initialize the map to contain one maximal free untouched run. Mark * the pages as zeroed iff chunk_alloc() returned a zeroed chunk. */ @@ -568,11 +570,6 @@ arena_chunk_alloc(arena_t *arena) /* Insert the run into the runs_avail tree. */ arena_avail_insert(arena, chunk, map_bias, chunk_npages-map_bias); - if (arena_mapbits_dirty_get(chunk, map_bias) != 0) { - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, map_bias); - ql_elm_new(mapelm, dr_link); - ql_tail_insert(&arena->runs_dirty, mapelm, dr_link); - } return (chunk); } @@ -626,15 +623,15 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) * it. */ arena_avail_remove(arena, chunk, map_bias, chunk_npages-map_bias); - if (arena_mapbits_dirty_get(chunk, map_bias) != 0) { - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, map_bias); - ql_remove(&arena->runs_dirty, mapelm, dr_link); - } if (arena->spare != NULL) { arena_chunk_t *spare = arena->spare; arena->spare = chunk; + if (arena_mapbits_dirty_get(spare, map_bias) != 0) { + arena_dirty_remove(arena, spare, map_bias, + chunk_npages-map_bias); + } arena_chunk_dalloc_internal(arena, spare); } else arena->spare = chunk; @@ -752,18 +749,17 @@ arena_run_alloc_small(arena_t *arena, size_t size, size_t binind) static inline void arena_maybe_purge(arena_t *arena) { - size_t npurgeable, threshold; + size_t threshold; /* Don't purge if the option is disabled. */ if (opt_lg_dirty_mult < 0) return; - npurgeable = arena->ndirty; threshold = (arena->nactive >> opt_lg_dirty_mult); /* * Don't purge unless the number of purgeable pages exceeds the * threshold. */ - if (npurgeable <= threshold) + if (arena->ndirty <= threshold) return; arena_purge(arena, false); @@ -792,50 +788,53 @@ arena_dirty_count(arena_t *arena) } static size_t -arena_compute_npurgatory(arena_t *arena, bool all) +arena_compute_npurge(arena_t *arena, bool all) { - size_t npurgatory, npurgeable; + size_t npurge; /* * Compute the minimum number of pages that this thread should try to * purge. */ - npurgeable = arena->ndirty; - if (all == false) { size_t threshold = (arena->nactive >> opt_lg_dirty_mult); - npurgatory = npurgeable - threshold; + npurge = arena->ndirty - threshold; } else - npurgatory = npurgeable; + npurge = arena->ndirty; - return (npurgatory); + return (npurge); } static size_t -arena_stash_dirty(arena_t *arena, bool all, size_t npurgatory, +arena_stash_dirty(arena_t *arena, bool all, size_t npurge, arena_chunk_mapelms_t *mapelms) { arena_chunk_map_t *mapelm; size_t nstashed = 0; - arena_chunk_t *chunk; - size_t pageind, npages, run_size; - arena_run_t *run; - /* Add at least npurgatory pages to purge_list. */ + /* Add at least npurge pages to purge_list. 
*/ for (mapelm = ql_first(&arena->runs_dirty); mapelm != NULL; mapelm = ql_first(&arena->runs_dirty)) { - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = arena_mapelm_to_pageind(mapelm); - run_size = arena_mapbits_unallocated_size_get(chunk, pageind); - npages = run_size >> LG_PAGE; - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << - LG_PAGE)); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); + size_t pageind = arena_mapelm_to_pageind(mapelm); + size_t run_size = arena_mapbits_unallocated_size_get(chunk, + pageind); + size_t npages = run_size >> LG_PAGE; + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)(pageind << LG_PAGE)); assert(pageind + npages <= chunk_npages); assert(arena_mapbits_dirty_get(chunk, pageind) == arena_mapbits_dirty_get(chunk, pageind+npages-1)); + /* + * If purging the spare chunk's run, make it available prior to + * allocation. + */ + if (chunk == arena->spare) + arena_chunk_alloc(arena); + /* Temporarily allocate the free dirty run. */ arena_run_split_large(arena, run, run_size, false); /* Append to purge_list for later processing. */ @@ -844,7 +843,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurgatory, nstashed += npages; - if (all == false && nstashed >= npurgatory) + if (all == false && nstashed >= npurge) break; } @@ -856,8 +855,6 @@ arena_purge_stashed(arena_t *arena, arena_chunk_mapelms_t *mapelms) { size_t npurged, nmadvise; arena_chunk_map_t *mapelm; - arena_chunk_t *chunk; - size_t pageind, npages, run_size; if (config_stats) nmadvise = 0; @@ -866,8 +863,9 @@ arena_purge_stashed(arena_t *arena, arena_chunk_mapelms_t *mapelms) malloc_mutex_unlock(&arena->lock); ql_foreach(mapelm, mapelms, dr_link) { + arena_chunk_t *chunk; + size_t pageind, run_size, npages, flag_unzeroed, i; bool unzeroed; - size_t flag_unzeroed, i; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); pageind = arena_mapelm_to_pageind(mapelm); @@ -913,17 +911,14 @@ static void arena_unstash_purged(arena_t *arena, arena_chunk_mapelms_t *mapelms) { arena_chunk_map_t *mapelm; - arena_chunk_t *chunk; - arena_run_t *run; - size_t pageind; /* Deallocate runs. */ for (mapelm = ql_first(mapelms); mapelm != NULL; mapelm = ql_first(mapelms)) { - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = arena_mapelm_to_pageind(mapelm); - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << - LG_PAGE)); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); + size_t pageind = arena_mapelm_to_pageind(mapelm); + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)(pageind << LG_PAGE)); ql_remove(mapelms, mapelm, dr_link); arena_run_dalloc(arena, run, false, true); } @@ -932,7 +927,7 @@ arena_unstash_purged(arena_t *arena, arena_chunk_mapelms_t *mapelms) void arena_purge(arena_t *arena, bool all) { - size_t npurgatory, npurgeable, npurged; + size_t npurge, npurgeable, npurged; arena_chunk_mapelms_t purge_list; if (config_debug) { @@ -944,21 +939,12 @@ arena_purge(arena_t *arena, bool all) if (config_stats) arena->stats.npurge++; - /* - * Add the minimum number of pages this thread should try to purge to - * arena->npurgatory. This will keep multiple threads from racing to - * reduce ndirty below the threshold. 
- */ - npurgatory = arena_compute_npurgatory(arena, all); - + npurge = arena_compute_npurge(arena, all); ql_new(&purge_list); - - npurgeable = arena_stash_dirty(arena, all, npurgatory, &purge_list); - assert(npurgeable >= npurgatory); - + npurgeable = arena_stash_dirty(arena, all, npurge, &purge_list); + assert(npurgeable >= npurge); npurged = arena_purge_stashed(arena, &purge_list); assert(npurged == npurgeable); - arena_unstash_purged(arena, &purge_list); } @@ -999,9 +985,8 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, /* If the successor is dirty, remove it from runs_dirty. */ if (flag_dirty != 0) { - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, - run_ind+run_pages); - ql_remove(&arena->runs_dirty, mapelm, dr_link); + arena_dirty_remove(arena, chunk, run_ind+run_pages, + nrun_pages); } size += nrun_size; @@ -1032,11 +1017,8 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, arena_avail_remove(arena, chunk, run_ind, prun_pages); /* If the predecessor is dirty, remove it from runs_dirty. */ - if (flag_dirty != 0) { - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, - run_ind); - ql_remove(&arena->runs_dirty, mapelm, dr_link); - } + if (flag_dirty != 0) + arena_dirty_remove(arena, chunk, run_ind, prun_pages); size += prun_size; run_pages += prun_pages; @@ -1055,7 +1037,6 @@ static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) { arena_chunk_t *chunk; - arena_chunk_map_t *mapelm; size_t size, run_ind, run_pages, flag_dirty; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); @@ -1110,12 +1091,8 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_mapbits_dirty_get(chunk, run_ind+run_pages-1)); arena_avail_insert(arena, chunk, run_ind, run_pages); - if (dirty) { - /* Insert into runs_dirty list. */ - mapelm = arena_mapp_get(chunk, run_ind); - ql_elm_new(mapelm, dr_link); - ql_tail_insert(&arena->runs_dirty, mapelm, dr_link); - } + if (dirty) + arena_dirty_insert(arena, chunk, run_ind, run_pages); /* Deallocate chunk if it is now completely unused. */ if (size == arena_maxclass) { @@ -2237,14 +2214,13 @@ arena_new(arena_t *arena, unsigned ind) arena->dss_prec = chunk_dss_prec_get(); - /* Initialize chunks. */ - ql_new(&arena->runs_dirty); arena->spare = NULL; arena->nactive = 0; arena->ndirty = 0; arena_avail_tree_new(&arena->runs_avail); + ql_new(&arena->runs_dirty); /* Initialize bins. */ for (i = 0; i < NBINS; i++) { -- cgit v0.12 From 586c8ede42d7d0545d36d9cbb0235fb39221ef3e Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 15 Aug 2014 12:20:20 -0700 Subject: Fix arena..dss mallctl to handle read-only calls. 
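
A minimal usage sketch of the read-only case this change handles, using only
the public mallctl API (not part of the patch; the arena index 0 and the
error handling are illustrative):

    #include <stddef.h>
    #include <jemalloc/jemalloc.h>

    /* Query arena 0's dss precedence without supplying a new value. */
    static const char *
    arena0_dss_get(void)
    {
        size_t mib[3];
        size_t miblen = sizeof(mib) / sizeof(size_t);
        const char *dss;
        size_t sz = sizeof(dss);

        if (mallctlnametomib("arena.0.dss", mib, &miblen) != 0)
            return (NULL);
        mib[1] = 0;        /* Arena index. */
        if (mallctlbymib(mib, miblen, &dss, &sz, NULL, 0) != 0)
            return (NULL);
        return (dss);      /* "disabled", "primary", or "secondary". */
    }

In the old code dss was left uninitialized on a read-only call (newp == NULL),
yet the name-matching loop still ran against it; the change initializes dss to
NULL and only validates and applies a precedence when a new value was actually
written.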
--- src/ctl.c | 52 +++++++++++++++++++++++++++++----------------------- test/unit/mallctl.c | 13 +++++++++++++ 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/src/ctl.c b/src/ctl.c index a193605..fa52a6c 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1327,45 +1327,51 @@ static int arena_i_dss_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { - int ret, i; - bool match, err; - const char *dss; + int ret; + const char *dss = NULL; unsigned arena_ind = mib[1]; dss_prec_t dss_prec_old = dss_prec_limit; dss_prec_t dss_prec = dss_prec_limit; malloc_mutex_lock(&ctl_mtx); WRITE(dss, const char *); - match = false; - for (i = 0; i < dss_prec_limit; i++) { - if (strcmp(dss_prec_names[i], dss) == 0) { - dss_prec = i; - match = true; - break; + if (dss != NULL) { + int i; + bool match = false; + + for (i = 0; i < dss_prec_limit; i++) { + if (strcmp(dss_prec_names[i], dss) == 0) { + dss_prec = i; + match = true; + break; + } + } + + if (match == false) { + ret = EINVAL; + goto label_return; } - } - if (match == false) { - ret = EINVAL; - goto label_return; } if (arena_ind < ctl_stats.narenas) { arena_t *arena = arenas[arena_ind]; - if (arena != NULL) { - dss_prec_old = arena_dss_prec_get(arena); - err = arena_dss_prec_set(arena, dss_prec); - } else - err = true; + if (arena == NULL || (dss_prec != dss_prec_limit && + arena_dss_prec_set(arena, dss_prec))) { + ret = EFAULT; + goto label_return; + } + dss_prec_old = arena_dss_prec_get(arena); } else { + if (dss_prec != dss_prec_limit && + chunk_dss_prec_set(dss_prec)) { + ret = EFAULT; + goto label_return; + } dss_prec_old = chunk_dss_prec_get(); - err = chunk_dss_prec_set(dss_prec); } + dss = dss_prec_names[dss_prec_old]; READ(dss, const char *); - if (err) { - ret = EFAULT; - goto label_return; - } ret = 0; label_return: diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 7a8b55f..c70473c 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -268,12 +268,25 @@ TEST_BEGIN(test_arena_i_dss) assert_d_eq(mallctlbymib(mib, miblen, &dss_prec_new, &sz, &dss_prec_old, sizeof(dss_prec_old)), 0, "Unexpected mallctl() failure"); + assert_d_eq(mallctlbymib(mib, miblen, &dss_prec_old, &sz, NULL, 0), 0, + "Unexpected mallctl() failure"); + assert_str_ne(dss_prec_old, "primary", + "Unexpected value for dss precedence"); + mib[1] = narenas_total_get(); dss_prec_new = "disabled"; assert_d_eq(mallctlbymib(mib, miblen, &dss_prec_old, &sz, &dss_prec_new, sizeof(dss_prec_new)), 0, "Unexpected mallctl() failure"); assert_str_ne(dss_prec_old, "primary", "Unexpected default for dss precedence"); + + assert_d_eq(mallctlbymib(mib, miblen, &dss_prec_new, &sz, &dss_prec_old, + sizeof(dss_prec_new)), 0, "Unexpected mallctl() failure"); + + assert_d_eq(mallctlbymib(mib, miblen, &dss_prec_old, &sz, NULL, 0), 0, + "Unexpected mallctl() failure"); + assert_str_ne(dss_prec_old, "primary", + "Unexpected value for dss precedence"); } TEST_END -- cgit v0.12 From b41ccdb125b312d4522da1a80091a0137773c964 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 15 Aug 2014 15:01:15 -0700 Subject: Convert prof_tdata_t's bt2cnt to a comprehensive map. Treat prof_tdata_t's bt2cnt as a comprehensive map of the thread's extant allocation samples (do not limit the total number of entries). This helps prepare the way for per thread heap profiling. 
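
Condensed, the thread-exit path this enables amounts to walking the (now
comprehensive) hash once, folding each counter into its global ctx, and then
discarding the hash.  The sketch below is illustrative only: the function name
is hypothetical, the calls are jemalloc-internal APIs as used by this patch,
and ckh_iter() is assumed to return false while entries remain, matching its
existing use in prof_dump():

    static void
    prof_bt2cnt_merge_all(prof_tdata_t *prof_tdata)
    {
        union {
            prof_thr_cnt_t *p;
            void           *v;
        } cnt;
        size_t tabind;

        /* Visit every extant counter; there is no LRU cap to walk anymore. */
        for (tabind = 0; ckh_iter(&prof_tdata->bt2cnt, &tabind, NULL,
            &cnt.v) == false;) {
            prof_ctx_merge(cnt.p->ctx, cnt.p); /* Merge into global stats. */
            idalloc(cnt.v);                    /* Free the per-thread counter. */
        }
        ckh_delete(&prof_tdata->bt2cnt);
    }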
--- include/jemalloc/internal/prof.h | 24 +++++--------- src/prof.c | 67 ++++++++++------------------------------ 2 files changed, 25 insertions(+), 66 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index d82fbc4..96db4c3 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -23,9 +23,6 @@ typedef struct prof_tdata_s prof_tdata_t; */ #define PROF_BT_MAX 128 -/* Maximum number of backtraces to store in each per thread LRU cache. */ -#define PROF_TCMAX 1024 - /* Initial hash table size. */ #define PROF_CKH_MINITEMS 64 @@ -87,9 +84,6 @@ struct prof_thr_cnt_s { /* Linkage into prof_ctx_t's cnts_ql. */ ql_elm(prof_thr_cnt_t) cnts_link; - /* Linkage into thread's LRU. */ - ql_elm(prof_thr_cnt_t) lru_link; - /* * Associated context. If a thread frees an object that it did not * allocate, it is possible that the context is not cached in the @@ -157,10 +151,11 @@ typedef ql_head(prof_ctx_t) prof_ctx_list_t; struct prof_tdata_s { /* - * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a - * cache of backtraces, with associated thread-specific prof_thr_cnt_t - * objects. Other threads may read the prof_thr_cnt_t contents, but no - * others will ever write them. + * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread tracks + * backtraces for which it has non-zero allocation/deallocation counters + * associated with thread-specific prof_thr_cnt_t objects. Other + * threads may read the prof_thr_cnt_t contents, but no others will ever + * write them. * * Upon thread exit, the thread must merge all the prof_thr_cnt_t * counter data into the associated prof_ctx_t objects, and unlink/free @@ -168,12 +163,6 @@ struct prof_tdata_s { */ ckh_t bt2cnt; - /* LRU for contents of bt2cnt. */ - ql_head(prof_thr_cnt_t) lru_ql; - - /* Backtrace vector, used for calls to prof_backtrace(). */ - void **vec; - /* Sampling state. */ uint64_t prng_state; uint64_t bytes_until_sample; @@ -182,6 +171,9 @@ struct prof_tdata_s { bool enq; bool enq_idump; bool enq_gdump; + + /* Backtrace vector, used for calls to prof_backtrace(). */ + void *vec[PROF_BT_MAX]; }; #endif /* JEMALLOC_H_STRUCTS */ diff --git a/src/prof.c b/src/prof.c index 0eb7dbd..4f95fdb 100644 --- a/src/prof.c +++ b/src/prof.c @@ -567,33 +567,13 @@ prof_lookup(prof_bt_t *bt) return (NULL); /* Link a prof_thd_cnt_t into ctx for this thread. */ - if (ckh_count(&prof_tdata->bt2cnt) == PROF_TCMAX) { - assert(ckh_count(&prof_tdata->bt2cnt) > 0); - /* - * Flush the least recently used cnt in order to keep - * bt2cnt from becoming too large. - */ - ret.p = ql_last(&prof_tdata->lru_ql, lru_link); - assert(ret.v != NULL); - if (ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt, - NULL, NULL)) - not_reached(); - ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); - prof_ctx_merge(ret.p->ctx, ret.p); - /* ret can now be re-used. */ - } else { - assert(ckh_count(&prof_tdata->bt2cnt) < PROF_TCMAX); - /* Allocate and partially initialize a new cnt. */ - ret.v = imalloc(sizeof(prof_thr_cnt_t)); - if (ret.p == NULL) { - if (new_ctx) - prof_ctx_destroy(ctx); - return (NULL); - } - ql_elm_new(ret.p, cnts_link); - ql_elm_new(ret.p, lru_link); + ret.v = imalloc(sizeof(prof_thr_cnt_t)); + if (ret.p == NULL) { + if (new_ctx) + prof_ctx_destroy(ctx); + return (NULL); } - /* Finish initializing ret. 
*/ + ql_elm_new(ret.p, cnts_link); ret.p->ctx = ctx; ret.p->epoch = 0; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); @@ -603,15 +583,10 @@ prof_lookup(prof_bt_t *bt) idalloc(ret.v); return (NULL); } - ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); malloc_mutex_lock(ctx->lock); ql_tail_insert(&ctx->cnts_ql, ret.p, cnts_link); ctx->nlimbo--; malloc_mutex_unlock(ctx->lock); - } else { - /* Move ret to the front of the LRU. */ - ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); - ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); } return (ret.p); @@ -1247,14 +1222,6 @@ prof_tdata_init(void) idalloc(prof_tdata); return (NULL); } - ql_new(&prof_tdata->lru_ql); - - prof_tdata->vec = imalloc(sizeof(void *) * PROF_BT_MAX); - if (prof_tdata->vec == NULL) { - ckh_delete(&prof_tdata->bt2cnt); - idalloc(prof_tdata); - return (NULL); - } prof_tdata->prng_state = (uint64_t)(uintptr_t)prof_tdata; prof_sample_threshold_update(prof_tdata); @@ -1271,7 +1238,6 @@ prof_tdata_init(void) void prof_tdata_cleanup(void *arg) { - prof_thr_cnt_t *cnt; prof_tdata_t *prof_tdata = *(prof_tdata_t **)arg; cassert(config_prof); @@ -1292,21 +1258,22 @@ prof_tdata_cleanup(void *arg) * nothing, so that the destructor will not be called again. */ } else if (prof_tdata != NULL) { - /* - * Delete the hash table. All of its contents can still be - * iterated over via the LRU. - */ - ckh_delete(&prof_tdata->bt2cnt); + union { + prof_thr_cnt_t *p; + void *v; + } cnt; + size_t tabind; + /* * Iteratively merge cnt's into the global stats and delete * them. */ - while ((cnt = ql_last(&prof_tdata->lru_ql, lru_link)) != NULL) { - ql_remove(&prof_tdata->lru_ql, cnt, lru_link); - prof_ctx_merge(cnt->ctx, cnt); - idalloc(cnt); + for (tabind = 0; ckh_iter(&prof_tdata->bt2cnt, &tabind, NULL, + &cnt.v);) { + prof_ctx_merge(cnt.p->ctx, cnt.p); + idalloc(cnt.v); } - idalloc(prof_tdata->vec); + ckh_delete(&prof_tdata->bt2cnt); idalloc(prof_tdata); prof_tdata = PROF_TDATA_STATE_PURGATORY; prof_tdata_tsd_set(&prof_tdata); -- cgit v0.12 From ab532e97991d190e9368781cf308c60c2319b933 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 15 Aug 2014 15:05:12 -0700 Subject: Directly embed prof_ctx_t's bt. --- include/jemalloc/internal/prof.h | 13 +++++--- src/prof.c | 69 +++++++++++----------------------------- 2 files changed, 26 insertions(+), 56 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 96db4c3..9be908d 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -115,9 +115,6 @@ struct prof_thr_cnt_s { }; struct prof_ctx_s { - /* Associated backtrace. */ - prof_bt_t *bt; - /* Protects nlimbo, cnt_merged, and cnts_ql. */ malloc_mutex_t *lock; @@ -146,6 +143,12 @@ struct prof_ctx_s { /* Linkage for list of contexts to be dumped. */ ql_elm(prof_ctx_t) dump_link; + + /* Associated backtrace. */ + prof_bt_t bt; + + /* Backtrace vector, variable size, referred to by bt. 
*/ + void *vec[1]; }; typedef ql_head(prof_ctx_t) prof_ctx_list_t; @@ -425,7 +428,7 @@ prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, } if ((uintptr_t)old_ctx > (uintptr_t)1U) { - told_cnt = prof_lookup(old_ctx->bt); + told_cnt = prof_lookup(&old_ctx->bt); if (told_cnt == NULL) { /* * It's too late to propagate OOM for this realloc(), @@ -483,7 +486,7 @@ prof_free(const void *ptr, size_t size) if ((uintptr_t)ctx > (uintptr_t)1) { prof_thr_cnt_t *tcnt; assert(size == isalloc(ptr, true)); - tcnt = prof_lookup(ctx->bt); + tcnt = prof_lookup(&ctx->bt); if (tcnt != NULL) { tcnt->epoch++; diff --git a/src/prof.c b/src/prof.c index 4f95fdb..1b396af 100644 --- a/src/prof.c +++ b/src/prof.c @@ -87,41 +87,6 @@ bt_init(prof_bt_t *bt, void **vec) bt->len = 0; } -static void -bt_destroy(prof_bt_t *bt) -{ - - cassert(config_prof); - - idalloc(bt); -} - -static prof_bt_t * -bt_dup(prof_bt_t *bt) -{ - prof_bt_t *ret; - - cassert(config_prof); - - /* - * Create a single allocation that has space for vec immediately - * following the prof_bt_t structure. The backtraces that get - * stored in the backtrace caches are copied from stack-allocated - * temporary variables, so size is known at creation time. Making this - * a contiguous object improves cache locality. - */ - ret = (prof_bt_t *)imalloc(QUANTUM_CEILING(sizeof(prof_bt_t)) + - (bt->len * sizeof(void *))); - if (ret == NULL) - return (NULL); - ret->vec = (void **)((uintptr_t)ret + - QUANTUM_CEILING(sizeof(prof_bt_t))); - memcpy(ret->vec, bt->vec, bt->len * sizeof(void *)); - ret->len = bt->len; - - return (ret); -} - static inline void prof_enter(prof_tdata_t *prof_tdata) { @@ -388,11 +353,16 @@ prof_ctx_mutex_choose(void) return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]); } -static void -prof_ctx_init(prof_ctx_t *ctx, prof_bt_t *bt) +static prof_ctx_t * +prof_ctx_create(prof_bt_t *bt) { - - ctx->bt = bt; + /* + * Create a single allocation that has space for vec of length bt->len. + */ + prof_ctx_t *ctx = (prof_ctx_t *)imalloc(offsetof(prof_ctx_t, vec) + + (bt->len * sizeof(void *))); + if (ctx == NULL) + return (NULL); ctx->lock = prof_ctx_mutex_choose(); /* * Set nlimbo to 1, in order to avoid a race condition with @@ -402,6 +372,11 @@ prof_ctx_init(prof_ctx_t *ctx, prof_bt_t *bt) ql_elm_new(ctx, dump_link); memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t)); ql_new(&ctx->cnts_ql); + /* Duplicate bt. */ + memcpy(ctx->vec, bt->vec, bt->len * sizeof(void *)); + ctx->bt.vec = ctx->vec; + ctx->bt.len = bt->len; + return (ctx); } static void @@ -428,12 +403,11 @@ prof_ctx_destroy(prof_ctx_t *ctx) assert(ctx->cnt_merged.accumobjs == 0); assert(ctx->cnt_merged.accumbytes == 0); /* Remove ctx from bt2ctx. */ - if (ckh_remove(&bt2ctx, ctx->bt, NULL, NULL)) + if (ckh_remove(&bt2ctx, &ctx->bt, NULL, NULL)) not_reached(); prof_leave(prof_tdata); /* Destroy ctx. */ malloc_mutex_unlock(ctx->lock); - bt_destroy(ctx->bt); idalloc(ctx); } else { /* @@ -501,22 +475,15 @@ prof_lookup_global(prof_bt_t *bt, prof_tdata_t *prof_tdata, void **p_btkey, prof_enter(prof_tdata); if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { /* bt has never been seen before. Insert it. */ - ctx.v = imalloc(sizeof(prof_ctx_t)); + ctx.p = prof_ctx_create(bt); if (ctx.v == NULL) { prof_leave(prof_tdata); return (true); } - btkey.p = bt_dup(bt); - if (btkey.v == NULL) { - prof_leave(prof_tdata); - idalloc(ctx.v); - return (true); - } - prof_ctx_init(ctx.p, btkey.p); + btkey.p = &ctx.p->bt; if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { /* OOM. 
*/ prof_leave(prof_tdata); - idalloc(btkey.v); idalloc(ctx.v); return (true); } @@ -1039,7 +1006,7 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) /* Dump per ctx profile stats. */ while ((ctx.p = ql_first(&ctx_ql)) != NULL) { - if (prof_dump_ctx(propagate_err, ctx.p, ctx.p->bt, &ctx_ql)) + if (prof_dump_ctx(propagate_err, ctx.p, &ctx.p->bt, &ctx_ql)) goto label_write_error; } -- cgit v0.12 From 3a81cbd2d4f2d8c052f11f4b0b73ee5c84a33d4f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 16 Aug 2014 12:58:55 -0700 Subject: Dump heap profile backtraces in a stable order. Also iterate over per thread stats in a stable order, which prepares the way for stable ordering of per thread heap profile dumps. --- include/jemalloc/internal/prof.h | 24 +++--- src/prof.c | 157 ++++++++++++++++++++++++++------------- 2 files changed, 119 insertions(+), 62 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 9be908d..9398ad9 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -1,6 +1,7 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +typedef uint64_t prof_thr_uid_t; typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; typedef struct prof_thr_cnt_s prof_thr_cnt_t; @@ -81,15 +82,17 @@ struct prof_cnt_s { }; struct prof_thr_cnt_s { - /* Linkage into prof_ctx_t's cnts_ql. */ - ql_elm(prof_thr_cnt_t) cnts_link; + prof_thr_uid_t thr_uid; + + /* Linkage into prof_ctx_t's thr_cnts. */ + rb_node(prof_thr_cnt_t) thr_cnt_link; /* * Associated context. If a thread frees an object that it did not - * allocate, it is possible that the context is not cached in the + * allocate, it is possible that the context is not present in the * thread's hash table, in which case it must be able to look up the * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's cnts_ql. + * and link it into the prof_ctx_t's thr_cnts. */ prof_ctx_t *ctx; @@ -113,9 +116,10 @@ struct prof_thr_cnt_s { /* Profiling counters. */ prof_cnt_t cnts; }; +typedef rb_tree(prof_thr_cnt_t) prof_thr_cnt_tree_t; struct prof_ctx_s { - /* Protects nlimbo, cnt_merged, and cnts_ql. */ + /* Protects nlimbo, cnt_merged, and thr_cnts. */ malloc_mutex_t *lock; /* @@ -136,13 +140,13 @@ struct prof_ctx_s { prof_cnt_t cnt_merged; /* - * List of profile counters, one for each thread that has allocated in + * Tree of profile counters, one for each thread that has allocated in * this context. */ - ql_head(prof_thr_cnt_t) cnts_ql; + prof_thr_cnt_tree_t thr_cnts; - /* Linkage for list of contexts to be dumped. */ - ql_elm(prof_ctx_t) dump_link; + /* Linkage for tree of contexts to be dumped. */ + rb_node(prof_ctx_t) dump_link; /* Associated backtrace. */ prof_bt_t bt; @@ -150,7 +154,7 @@ struct prof_ctx_s { /* Backtrace vector, variable size, referred to by bt. 
*/ void *vec[1]; }; -typedef ql_head(prof_ctx_t) prof_ctx_list_t; +typedef rb_tree(prof_ctx_t) prof_ctx_tree_t; struct prof_tdata_s { /* diff --git a/src/prof.c b/src/prof.c index 1b396af..497ccf4 100644 --- a/src/prof.c +++ b/src/prof.c @@ -77,6 +77,33 @@ static bool prof_booted = false; /******************************************************************************/ +JEMALLOC_INLINE_C int +prof_thr_cnt_comp(const prof_thr_cnt_t *a, const prof_thr_cnt_t *b) +{ + prof_thr_uid_t a_uid = a->thr_uid; + prof_thr_uid_t b_uid = b->thr_uid; + + return ((a_uid > b_uid) - (a_uid < b_uid)); +} + +rb_gen(static UNUSED, thr_cnt_tree_, prof_thr_cnt_tree_t, prof_thr_cnt_t, + thr_cnt_link, prof_thr_cnt_comp) + +JEMALLOC_INLINE_C int +prof_ctx_comp(const prof_ctx_t *a, const prof_ctx_t *b) +{ + unsigned a_len = a->bt.len; + unsigned b_len = b->bt.len; + unsigned comp_len = (a_len < b_len) ? a_len : b_len; + int ret = memcmp(a->bt.vec, b->bt.vec, comp_len * sizeof(void *)); + if (ret == 0) + ret = (a_len > b_len) - (a_len < b_len); + return (ret); +} + +rb_gen(static UNUSED, ctx_tree_, prof_ctx_tree_t, prof_ctx_t, dump_link, + prof_ctx_comp) + void bt_init(prof_bt_t *bt, void **vec) { @@ -369,9 +396,8 @@ prof_ctx_create(prof_bt_t *bt) * prof_ctx_merge()/prof_ctx_destroy(). */ ctx->nlimbo = 1; - ql_elm_new(ctx, dump_link); memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t)); - ql_new(&ctx->cnts_ql); + thr_cnt_tree_new(&ctx->thr_cnts); /* Duplicate bt. */ memcpy(ctx->vec, bt->vec, bt->len * sizeof(void *)); ctx->bt.vec = ctx->vec; @@ -397,8 +423,8 @@ prof_ctx_destroy(prof_ctx_t *ctx) assert((uintptr_t)prof_tdata > (uintptr_t)PROF_TDATA_STATE_MAX); prof_enter(prof_tdata); malloc_mutex_lock(ctx->lock); - if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0 && - ctx->nlimbo == 1) { + if (thr_cnt_tree_first(&ctx->thr_cnts) == NULL && + ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 1) { assert(ctx->cnt_merged.curbytes == 0); assert(ctx->cnt_merged.accumobjs == 0); assert(ctx->cnt_merged.accumbytes == 0); @@ -433,9 +459,9 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) ctx->cnt_merged.curbytes += cnt->cnts.curbytes; ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; - ql_remove(&ctx->cnts_ql, cnt, cnts_link); - if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL && - ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) { + thr_cnt_tree_remove(&ctx->thr_cnts, cnt); + if (opt_prof_accum == false && thr_cnt_tree_first(&ctx->thr_cnts) == + NULL && ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) { /* * Increment ctx->nlimbo in order to keep another thread from * winning the race to destroy ctx while this one has ctx->lock @@ -540,7 +566,6 @@ prof_lookup(prof_bt_t *bt) prof_ctx_destroy(ctx); return (NULL); } - ql_elm_new(ret.p, cnts_link); ret.p->ctx = ctx; ret.p->epoch = 0; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); @@ -551,7 +576,7 @@ prof_lookup(prof_bt_t *bt) return (NULL); } malloc_mutex_lock(ctx->lock); - ql_tail_insert(&ctx->cnts_ql, ret.p, cnts_link); + thr_cnt_tree_insert(&ctx->thr_cnts, ret.p); ctx->nlimbo--; malloc_mutex_unlock(ctx->lock); } @@ -745,12 +770,41 @@ prof_dump_printf(bool propagate_err, const char *format, ...) return (ret); } +static prof_thr_cnt_t * +ctx_sum_iter(prof_thr_cnt_tree_t *thr_cnts, prof_thr_cnt_t *thr_cnt, void *arg) +{ + prof_ctx_t *ctx = (prof_ctx_t *)arg; + volatile unsigned *epoch = &thr_cnt->epoch; + prof_cnt_t tcnt; + + while (true) { + unsigned epoch0 = *epoch; + + /* Make sure epoch is even. 
*/ + if (epoch0 & 1U) + continue; + + memcpy(&tcnt, &thr_cnt->cnts, sizeof(prof_cnt_t)); + + /* Terminate if epoch didn't change while reading. */ + if (*epoch == epoch0) + break; + } + + ctx->cnt_summed.curobjs += tcnt.curobjs; + ctx->cnt_summed.curbytes += tcnt.curbytes; + if (opt_prof_accum) { + ctx->cnt_summed.accumobjs += tcnt.accumobjs; + ctx->cnt_summed.accumbytes += tcnt.accumbytes; + } + + return (NULL); +} + static void prof_dump_ctx_prep(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx, - prof_ctx_list_t *ctx_ql) + prof_ctx_tree_t *ctxs) { - prof_thr_cnt_t *thr_cnt; - prof_cnt_t tcnt; cassert(config_prof); @@ -762,33 +816,10 @@ prof_dump_ctx_prep(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx, * prof_dump()'s second pass. */ ctx->nlimbo++; - ql_tail_insert(ctx_ql, ctx, dump_link); + ctx_tree_insert(ctxs, ctx); memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t)); - ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) { - volatile unsigned *epoch = &thr_cnt->epoch; - - while (true) { - unsigned epoch0 = *epoch; - - /* Make sure epoch is even. */ - if (epoch0 & 1U) - continue; - - memcpy(&tcnt, &thr_cnt->cnts, sizeof(prof_cnt_t)); - - /* Terminate if epoch didn't change while reading. */ - if (*epoch == epoch0) - break; - } - - ctx->cnt_summed.curobjs += tcnt.curobjs; - ctx->cnt_summed.curbytes += tcnt.curbytes; - if (opt_prof_accum) { - ctx->cnt_summed.accumobjs += tcnt.accumobjs; - ctx->cnt_summed.accumbytes += tcnt.accumbytes; - } - } + thr_cnt_tree_iter(&ctx->thr_cnts, NULL, ctx_sum_iter, (void *)ctx); if (ctx->cnt_summed.curobjs != 0) (*leak_nctx)++; @@ -829,25 +860,24 @@ prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) } static void -prof_dump_ctx_cleanup_locked(prof_ctx_t *ctx, prof_ctx_list_t *ctx_ql) +prof_dump_ctx_cleanup_locked(prof_ctx_t *ctx, prof_ctx_tree_t *ctxs) { ctx->nlimbo--; - ql_remove(ctx_ql, ctx, dump_link); } static void -prof_dump_ctx_cleanup(prof_ctx_t *ctx, prof_ctx_list_t *ctx_ql) +prof_dump_ctx_cleanup(prof_ctx_t *ctx, prof_ctx_tree_t *ctxs) { malloc_mutex_lock(ctx->lock); - prof_dump_ctx_cleanup_locked(ctx, ctx_ql); + prof_dump_ctx_cleanup_locked(ctx, ctxs); malloc_mutex_unlock(ctx->lock); } static bool prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt, - prof_ctx_list_t *ctx_ql) + prof_ctx_tree_t *ctxs) { bool ret; unsigned i; @@ -895,7 +925,7 @@ prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt, ret = false; label_return: - prof_dump_ctx_cleanup_locked(ctx, ctx_ql); + prof_dump_ctx_cleanup_locked(ctx, ctxs); malloc_mutex_unlock(ctx->lock); return (ret); } @@ -966,6 +996,26 @@ prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_nctx, } } +static prof_ctx_t * +prof_ctx_dump_iter(prof_ctx_tree_t *ctxs, prof_ctx_t *ctx, void *arg) +{ + bool propagate_err = *(bool *)arg; + + if (prof_dump_ctx(propagate_err, ctx, &ctx->bt, ctxs)) + return (ctx_tree_next(ctxs, ctx)); + + return (NULL); +} + +static prof_ctx_t * +prof_ctx_cleanup_iter(prof_ctx_tree_t *ctxs, prof_ctx_t *ctx, void *arg) +{ + + prof_dump_ctx_cleanup(ctx, ctxs); + + return (NULL); +} + static bool prof_dump(bool propagate_err, const char *filename, bool leakcheck) { @@ -977,7 +1027,8 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) void *v; } ctx; size_t leak_nctx; - prof_ctx_list_t ctx_ql; + prof_ctx_tree_t ctxs; + prof_ctx_t *cleanup_start = NULL; cassert(config_prof); @@ -990,10 +1041,10 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) /* Merge per thread profile stats, 
and sum them in cnt_all. */ memset(&cnt_all, 0, sizeof(prof_cnt_t)); leak_nctx = 0; - ql_new(&ctx_ql); + ctx_tree_new(&ctxs); prof_enter(prof_tdata); for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) - prof_dump_ctx_prep(ctx.p, &cnt_all, &leak_nctx, &ctx_ql); + prof_dump_ctx_prep(ctx.p, &cnt_all, &leak_nctx, &ctxs); prof_leave(prof_tdata); /* Create dump file. */ @@ -1005,10 +1056,10 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) goto label_write_error; /* Dump per ctx profile stats. */ - while ((ctx.p = ql_first(&ctx_ql)) != NULL) { - if (prof_dump_ctx(propagate_err, ctx.p, &ctx.p->bt, &ctx_ql)) - goto label_write_error; - } + cleanup_start = ctx_tree_iter(&ctxs, NULL, prof_ctx_dump_iter, + (void *)&propagate_err); + if (cleanup_start != NULL) + goto label_write_error; /* Dump /proc//maps if possible. */ if (prof_dump_maps(propagate_err)) @@ -1026,8 +1077,10 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) label_write_error: prof_dump_close(propagate_err); label_open_close_error: - while ((ctx.p = ql_first(&ctx_ql)) != NULL) - prof_dump_ctx_cleanup(ctx.p, &ctx_ql); + if (cleanup_start != NULL) { + ctx_tree_iter(&ctxs, cleanup_start, prof_ctx_cleanup_iter, + NULL); + } malloc_mutex_unlock(&prof_dump_mtx); return (true); } -- cgit v0.12 From 1628e8615ed6c82ded14d6013ac775274eb426e6 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 19 Aug 2014 01:28:49 -0700 Subject: Add rb_empty(). --- include/jemalloc/internal/rb.h | 13 +++++++++++++ test/unit/rb.c | 3 +++ 2 files changed, 16 insertions(+) diff --git a/include/jemalloc/internal/rb.h b/include/jemalloc/internal/rb.h index 423802e..ffe3bb0 100644 --- a/include/jemalloc/internal/rb.h +++ b/include/jemalloc/internal/rb.h @@ -158,6 +158,8 @@ struct { \ #define rb_proto(a_attr, a_prefix, a_rbt_type, a_type) \ a_attr void \ a_prefix##new(a_rbt_type *rbtree); \ +a_attr bool \ +a_prefix##empty(a_rbt_type *rbtree); \ a_attr a_type * \ a_prefix##first(a_rbt_type *rbtree); \ a_attr a_type * \ @@ -224,6 +226,13 @@ a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ * Args: * tree: Pointer to an uninitialized red-black tree object. * + * static bool + * ex_empty(ex_t *tree); + * Description: Determine whether tree is empty. + * Args: + * tree: Pointer to an initialized red-black tree object. + * Ret: True if tree is empty, false otherwise. 
+ * * static ex_node_t * * ex_first(ex_t *tree); * static ex_node_t * @@ -309,6 +318,10 @@ a_attr void \ a_prefix##new(a_rbt_type *rbtree) { \ rb_new(a_type, a_field, rbtree); \ } \ +a_attr bool \ +a_prefix##empty(a_rbt_type *rbtree) { \ + return (rbtree->rbt_root == &rbtree->rbt_nil); \ +} \ a_attr a_type * \ a_prefix##first(a_rbt_type *rbtree) { \ a_type *ret; \ diff --git a/test/unit/rb.c b/test/unit/rb.c index b737485..e43907f 100644 --- a/test/unit/rb.c +++ b/test/unit/rb.c @@ -49,6 +49,7 @@ TEST_BEGIN(test_rb_empty) tree_new(&tree); + assert_true(tree_empty(&tree), "Tree should be empty"); assert_ptr_null(tree_first(&tree), "Unexpected node"); assert_ptr_null(tree_last(&tree), "Unexpected node"); @@ -265,6 +266,8 @@ TEST_BEGIN(test_rb_random) assert_u_eq(tree_iterate_reverse(&tree), k+1, "Unexpected node iteration count"); + assert_false(tree_empty(&tree), + "Tree should not be empty"); assert_ptr_not_null(tree_first(&tree), "Tree should not be empty"); assert_ptr_not_null(tree_last(&tree), -- cgit v0.12 From 602c8e0971160e4b85b08b16cf8a2375aa24bc04 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 18 Aug 2014 16:22:13 -0700 Subject: Implement per thread heap profiling. Rename data structures (prof_thr_cnt_t-->prof_tctx_t, prof_ctx_t-->prof_gctx_t), and convert to storing a prof_tctx_t for sampled objects. Convert PROF_ALLOC_PREP() to prof_alloc_prep(), since precise backtrace depth within jemalloc functions is no longer an issue (pprof prunes irrelevant frames). Implement mallctl's: - prof.reset implements full sample data reset, and optional change of sample interval. - prof.lg_sample reads the current sample interval (opt.lg_prof_sample was the permanent source of truth prior to prof.reset). - thread.prof.name provides naming capability for threads within heap profile dumps. - thread.prof.active makes it possible to activate/deactivate heap profiling for individual threads. Modify the heap dump files to contain per thread heap profile data. This change is incompatible with the existing pprof, which will require enhancements to read and process the enriched data. --- doc/jemalloc.xml.in | 56 +- include/jemalloc/internal/arena.h | 22 +- include/jemalloc/internal/extent.h | 2 +- include/jemalloc/internal/huge.h | 4 +- include/jemalloc/internal/private_symbols.txt | 21 +- include/jemalloc/internal/prof.h | 440 +++++----- src/ctl.c | 97 ++- src/huge.c | 12 +- src/jemalloc.c | 140 +-- src/prof.c | 1127 +++++++++++++++++-------- src/stats.c | 2 +- 11 files changed, 1217 insertions(+), 706 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 308d0c6..8f4327f 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1047,7 +1047,7 @@ malloc_conf = "xmalloc:true";]]> opt.lg_prof_sample - (ssize_t) + (size_t) r- [] @@ -1243,6 +1243,35 @@ malloc_conf = "xmalloc:true";]]> the developer may find manual flushing useful. + + + thread.prof.name + (const char *) + rw + [] + + Get/set the descriptive name associated with the calling + thread in memory profile dumps. An internal copy of the name string is + created, so the input string need not be maintained after this interface + completes execution. The output string of this interface should be + copied for non-ephemeral uses, because multiple implementation details + can cause asynchronous string deallocation. + + + + + thread.prof.active + (bool) + rw + [] + + Control whether sampling is currently active for the + calling thread. 
This is a deactivation mechanism in addition to prof.active; both must + be active for the calling thread to sample. This flag is enabled by + default. + + arena.<i>.purge @@ -1492,6 +1521,31 @@ malloc_conf = "xmalloc:true";]]> option. + + + prof.reset + (size_t) + -w + [] + + Reset all memory profile statistics, and optionally + update the sample rate (see opt.lg_prof_sample). + + + + + + prof.lg_sample + (size_t) + r- + [] + + Get the sample rate (see opt.lg_prof_sample). + + + prof.interval diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 9351e3b..f3f6426 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -58,7 +58,7 @@ typedef struct arena_s arena_t; struct arena_chunk_map_s { #ifndef JEMALLOC_PROF /* - * Overlay prof_ctx in order to allow it to be referenced by dead code. + * Overlay prof_tctx in order to allow it to be referenced by dead code. * Such antics aren't warranted for per arena data structures, but * chunk map overhead accounts for a percentage of memory, rather than * being just a fixed cost. @@ -75,7 +75,7 @@ struct arena_chunk_map_s { rb_node(arena_chunk_map_t) rb_link; /* Profile counters, used for large object runs. */ - prof_ctx_t *prof_ctx; + prof_tctx_t *prof_tctx; #ifndef JEMALLOC_PROF }; /* union { ... }; */ #endif @@ -472,8 +472,8 @@ size_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits); size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr); -prof_ctx_t *arena_prof_ctx_get(const void *ptr); -void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +prof_tctx_t *arena_prof_tctx_get(const void *ptr); +void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); @@ -987,10 +987,10 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) return (regind); } -JEMALLOC_INLINE prof_ctx_t * -arena_prof_ctx_get(const void *ptr) +JEMALLOC_INLINE prof_tctx_t * +arena_prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; arena_chunk_t *chunk; size_t pageind, mapbits; @@ -1003,15 +1003,15 @@ arena_prof_ctx_get(const void *ptr) mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if ((mapbits & CHUNK_MAP_LARGE) == 0) - ret = (prof_ctx_t *)(uintptr_t)1U; + ret = (prof_tctx_t *)(uintptr_t)1U; else - ret = arena_mapp_get(chunk, pageind)->prof_ctx; + ret = arena_mapp_get(chunk, pageind)->prof_tctx; return (ret); } JEMALLOC_INLINE void -arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { arena_chunk_t *chunk; size_t pageind; @@ -1025,7 +1025,7 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) assert(arena_mapbits_allocated_get(chunk, pageind) != 0); if (arena_mapbits_large_get(chunk, pageind) != 0) - arena_mapp_get(chunk, pageind)->prof_ctx = ctx; + arena_mapp_get(chunk, pageind)->prof_tctx = tctx; } JEMALLOC_ALWAYS_INLINE void * diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 000ef6d..5b00076 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -16,7 +16,7 @@ struct extent_node_s { rb_node(extent_node_t) link_ad; /* Profile counters, used for huge objects. 
*/ - prof_ctx_t *prof_ctx; + prof_tctx_t *prof_tctx; /* Pointer to the extent that this tree node is responsible for. */ void *addr; diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 1e54536..2ec7752 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -21,8 +21,8 @@ extern huge_dalloc_junk_t *huge_dalloc_junk; #endif void huge_dalloc(void *ptr); size_t huge_salloc(const void *ptr); -prof_ctx_t *huge_prof_ctx_get(const void *ptr); -void huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +prof_tctx_t *huge_prof_tctx_get(const void *ptr); +void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); bool huge_boot(void); void huge_prefork(void); void huge_postfork_parent(void); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 3401301..1350545 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -48,9 +48,9 @@ arena_prefork arena_prof_accum arena_prof_accum_impl arena_prof_accum_locked -arena_prof_ctx_get -arena_prof_ctx_set arena_prof_promoted +arena_prof_tctx_get +arena_prof_tctx_set arena_ptr_small_binind_get arena_purge_all arena_quarantine_junk_small @@ -208,8 +208,8 @@ huge_palloc huge_postfork_child huge_postfork_parent huge_prefork -huge_prof_ctx_get -huge_prof_ctx_set +huge_prof_tctx_get +huge_prof_tctx_set huge_ralloc huge_ralloc_no_move huge_salloc @@ -287,28 +287,31 @@ opt_zero p2rz pages_purge pow2_ceil +prof_alloc_prep prof_backtrace prof_boot0 prof_boot1 prof_boot2 prof_bt_count -prof_ctx_get -prof_ctx_set prof_dump_open prof_free +prof_free_sampled_object prof_gdump prof_idump prof_interval prof_lookup prof_malloc -prof_malloc_record_object +prof_malloc_sample_object prof_mdump prof_postfork_child prof_postfork_parent prof_prefork prof_realloc +prof_reset prof_sample_accum_update prof_sample_threshold_update +prof_tctx_get +prof_tctx_set prof_tdata_booted prof_tdata_cleanup prof_tdata_get @@ -322,6 +325,10 @@ prof_tdata_tsd_get prof_tdata_tsd_get_wrapper prof_tdata_tsd_init_head prof_tdata_tsd_set +prof_thread_active_get +prof_thread_active_set +prof_thread_name_get +prof_thread_name_set quarantine quarantine_alloc_hook quarantine_boot diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 9398ad9..104bfad 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -1,11 +1,10 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES -typedef uint64_t prof_thr_uid_t; typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; -typedef struct prof_thr_cnt_s prof_thr_cnt_t; -typedef struct prof_ctx_s prof_ctx_t; +typedef struct prof_tctx_s prof_tctx_t; +typedef struct prof_gctx_s prof_gctx_t; typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ @@ -34,12 +33,18 @@ typedef struct prof_tdata_s prof_tdata_t; #define PROF_PRINTF_BUFSIZE 128 /* - * Number of mutexes shared among all ctx's. No space is allocated for these + * Number of mutexes shared among all gctx's. No space is allocated for these * unless profiling is enabled, so it's okay to over-provision. */ #define PROF_NCTX_LOCKS 1024 /* + * Number of mutexes shared among all tdata's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. 
+ */ +#define PROF_NTDATA_LOCKS 256 + +/* * prof_tdata pointers close to NULL are used to encode state information that * is used for cleaning up during thread shutdown. */ @@ -66,87 +71,70 @@ typedef struct { #endif struct prof_cnt_s { - /* - * Profiling counters. An allocation/deallocation pair can operate on - * different prof_thr_cnt_t objects that are linked into the same - * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go - * negative. In principle it is possible for the *bytes counters to - * overflow/underflow, but a general solution would require something - * like 128-bit counters; this implementation doesn't bother to solve - * that problem. - */ - int64_t curobjs; - int64_t curbytes; + /* Profiling counters. */ + uint64_t curobjs; + uint64_t curbytes; uint64_t accumobjs; uint64_t accumbytes; }; -struct prof_thr_cnt_s { - prof_thr_uid_t thr_uid; +typedef enum { + prof_tctx_state_nominal, + prof_tctx_state_dumping, + prof_tctx_state_purgatory /* Dumper must finish destroying. */ +} prof_tctx_state_t; - /* Linkage into prof_ctx_t's thr_cnts. */ - rb_node(prof_thr_cnt_t) thr_cnt_link; +struct prof_tctx_s { + /* Thread data for thread that performed the allocation. */ + prof_tdata_t *tdata; - /* - * Associated context. If a thread frees an object that it did not - * allocate, it is possible that the context is not present in the - * thread's hash table, in which case it must be able to look up the - * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's thr_cnts. - */ - prof_ctx_t *ctx; + /* Profiling counters, protected by tdata->lock. */ + prof_cnt_t cnts; + + /* Associated global context. */ + prof_gctx_t *gctx; + + /* Linkage into gctx's tctxs. */ + rb_node(prof_tctx_t) tctx_link; + + /* Current dump-related state, protected by gctx->lock. */ + prof_tctx_state_t state; /* - * Threads use memory barriers to update the counters. Since there is - * only ever one writer, the only challenge is for the reader to get a - * consistent read of the counters. - * - * The writer uses this series of operations: - * - * 1) Increment epoch to an odd number. - * 2) Update counters. - * 3) Increment epoch to an even number. - * - * The reader must assure 1) that the epoch is even while it reads the - * counters, and 2) that the epoch doesn't change between the time it - * starts and finishes reading the counters. + * Copy of cnts snapshotted during early dump phase, protected by + * dump_mtx. */ - unsigned epoch; - - /* Profiling counters. */ - prof_cnt_t cnts; + prof_cnt_t dump_cnts; }; -typedef rb_tree(prof_thr_cnt_t) prof_thr_cnt_tree_t; +typedef rb_tree(prof_tctx_t) prof_tctx_tree_t; -struct prof_ctx_s { - /* Protects nlimbo, cnt_merged, and thr_cnts. */ +struct prof_gctx_s { + /* Protects nlimbo, cnt_summed, and tctxs. */ malloc_mutex_t *lock; /* - * Number of threads that currently cause this ctx to be in a state of + * Number of threads that currently cause this gctx to be in a state of * limbo due to one of: - * - Initializing per thread counters associated with this ctx. - * - Preparing to destroy this ctx. - * - Dumping a heap profile that includes this ctx. + * - Initializing this gctx. + * - Initializing per thread counters associated with this gctx. + * - Preparing to destroy this gctx. + * - Dumping a heap profile that includes this gctx. * nlimbo must be 1 (single destroyer) in order to safely destroy the - * ctx. + * gctx. */ unsigned nlimbo; - /* Temporary storage for summation during dump. 
*/ - prof_cnt_t cnt_summed; - - /* When threads exit, they merge their stats into cnt_merged. */ - prof_cnt_t cnt_merged; - /* * Tree of profile counters, one for each thread that has allocated in * this context. */ - prof_thr_cnt_tree_t thr_cnts; + prof_tctx_tree_t tctxs; /* Linkage for tree of contexts to be dumped. */ - rb_node(prof_ctx_t) dump_link; + rb_node(prof_gctx_t) dump_link; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; /* Associated backtrace. */ prof_bt_t bt; @@ -154,21 +142,34 @@ struct prof_ctx_s { /* Backtrace vector, variable size, referred to by bt. */ void *vec[1]; }; -typedef rb_tree(prof_ctx_t) prof_ctx_tree_t; +typedef rb_tree(prof_gctx_t) prof_gctx_tree_t; + +typedef enum { + prof_tdata_state_attached, /* Active thread attached, data valid. */ + prof_tdata_state_detached, /* Defunct thread, data remain valid. */ + prof_tdata_state_expired /* Predates reset, omit data from dump. */ +} prof_tdata_state_t; struct prof_tdata_s { + malloc_mutex_t *lock; + + /* Monotonically increasing unique thread identifier. */ + uint64_t thr_uid; + + /* Included in heap profile dumps if non-NULL. */ + char *thread_name; + + prof_tdata_state_t state; + + rb_node(prof_tdata_t) tdata_link; + /* - * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread tracks + * Hash of (prof_bt_t *)-->(prof_tctx_t *). Each thread tracks * backtraces for which it has non-zero allocation/deallocation counters - * associated with thread-specific prof_thr_cnt_t objects. Other - * threads may read the prof_thr_cnt_t contents, but no others will ever - * write them. - * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t - * counter data into the associated prof_ctx_t objects, and unlink/free - * the prof_thr_cnt_t objects. + * associated with thread-specific prof_tctx_t objects. Other threads + * may write to prof_tctx_t contents when freeing associated objects. */ - ckh_t bt2cnt; + ckh_t bt2tctx; /* Sampling state. */ uint64_t prng_state; @@ -179,9 +180,27 @@ struct prof_tdata_s { bool enq_idump; bool enq_gdump; + /* + * Set to true during an early dump phase for tdata's which are + * currently being dumped. New threads' tdata's have this initialized + * to false so that they aren't accidentally included in later dump + * phases. + */ + bool dumping; + + /* + * True if profiling is active for this tdata's thread + * (thread.prof.active mallctl). + */ + bool active; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + /* Backtrace vector, used for calls to prof_backtrace(). */ void *vec[PROF_BT_MAX]; }; +typedef rb_tree(prof_tdata_t) prof_tdata_tree_t; #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ @@ -217,9 +236,18 @@ extern char opt_prof_prefix[ */ extern uint64_t prof_interval; +/* + * Initialized as opt_lg_prof_sample, and potentially modified during profiling + * resets. 
+ */ +extern size_t lg_prof_sample; + +void prof_malloc_sample_object(const void *ptr, size_t usize, + prof_tctx_t *tctx); +void prof_free_sampled_object(size_t usize, prof_tctx_t *tctx); void bt_init(prof_bt_t *bt, void **vec); void prof_backtrace(prof_bt_t *bt); -prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); +prof_tctx_t *prof_lookup(prof_bt_t *bt); #ifdef JEMALLOC_JET size_t prof_bt_count(void); typedef int (prof_dump_open_t)(bool, const char *); @@ -229,53 +257,44 @@ void prof_idump(void); bool prof_mdump(const char *filename); void prof_gdump(void); prof_tdata_t *prof_tdata_init(void); +prof_tdata_t *prof_tdata_reinit(prof_tdata_t *tdata); +void prof_reset(size_t lg_sample); void prof_tdata_cleanup(void *arg); +const char *prof_thread_name_get(void); +bool prof_thread_name_set(const char *thread_name); +bool prof_thread_active_get(void); +bool prof_thread_active_set(bool active); void prof_boot0(void); void prof_boot1(void); bool prof_boot2(void); void prof_prefork(void); void prof_postfork_parent(void); void prof_postfork_child(void); -void prof_sample_threshold_update(prof_tdata_t *prof_tdata); +void prof_sample_threshold_update(prof_tdata_t *tdata); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES -#define PROF_ALLOC_PREP(size, ret) do { \ - prof_tdata_t *prof_tdata; \ - prof_bt_t bt; \ - \ - assert(size == s2u(size)); \ - \ - if (!opt_prof_active || \ - prof_sample_accum_update(size, false, &prof_tdata)) { \ - ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ - } else { \ - bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt); \ - ret = prof_lookup(&bt); \ - } \ -} while (0) - #ifndef JEMALLOC_ENABLE_INLINE malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) prof_tdata_t *prof_tdata_get(bool create); -bool prof_sample_accum_update(size_t size, bool commit, - prof_tdata_t **prof_tdata_out); -prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); -void prof_malloc_record_object(const void *ptr, size_t usize, - prof_thr_cnt_t *cnt); -void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, - size_t old_usize, prof_ctx_t *old_ctx); -void prof_free(const void *ptr, size_t size); +bool prof_sample_accum_update(size_t usize, bool commit, + prof_tdata_t **tdata_out); +prof_tctx_t *prof_alloc_prep(size_t usize); +prof_tctx_t *prof_tctx_get(const void *ptr); +void prof_tctx_set(const void *ptr, prof_tctx_t *tctx); +void prof_malloc_sample_object(const void *ptr, size_t usize, + prof_tctx_t *tctx); +void prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx); +void prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, + size_t old_usize, prof_tctx_t *old_tctx); +void prof_free(const void *ptr, size_t usize); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) -/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +/* Thread-specific backtrace cache, used to reduce bt2gctx contention. 
*/ malloc_tsd_externs(prof_tdata, prof_tdata_t *) malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, prof_tdata_cleanup) @@ -283,21 +302,27 @@ malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, JEMALLOC_INLINE prof_tdata_t * prof_tdata_get(bool create) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); - prof_tdata = *prof_tdata_tsd_get(); - if (create && prof_tdata == NULL) - prof_tdata = prof_tdata_init(); + tdata = *prof_tdata_tsd_get(); + if (create) { + if (tdata == NULL) + tdata = prof_tdata_init(); + else if (tdata->state == prof_tdata_state_expired) + tdata = prof_tdata_reinit(tdata); + assert(tdata == NULL || tdata->state == + prof_tdata_state_attached); + } - return (prof_tdata); + return (tdata); } -JEMALLOC_INLINE prof_ctx_t * -prof_ctx_get(const void *ptr) +JEMALLOC_INLINE prof_tctx_t * +prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; arena_chunk_t *chunk; cassert(config_prof); @@ -306,15 +331,15 @@ prof_ctx_get(const void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - ret = arena_prof_ctx_get(ptr); + ret = arena_prof_tctx_get(ptr); } else - ret = huge_prof_ctx_get(ptr); + ret = huge_prof_tctx_get(ptr); return (ret); } JEMALLOC_INLINE void -prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { arena_chunk_t *chunk; @@ -324,66 +349,62 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - arena_prof_ctx_set(ptr, ctx); + arena_prof_tctx_set(ptr, tctx); } else - huge_prof_ctx_set(ptr, ctx); + huge_prof_tctx_set(ptr, tctx); } JEMALLOC_INLINE bool -prof_sample_accum_update(size_t size, bool commit, - prof_tdata_t **prof_tdata_out) +prof_sample_accum_update(size_t usize, bool commit, prof_tdata_t **tdata_out) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); - prof_tdata = prof_tdata_get(true); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) - prof_tdata = NULL; + tdata = prof_tdata_get(true); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = NULL; - if (prof_tdata_out != NULL) - *prof_tdata_out = prof_tdata; + if (tdata_out != NULL) + *tdata_out = tdata; - if (prof_tdata == NULL) + if (tdata == NULL) return (true); - if (prof_tdata->bytes_until_sample >= size) { + if (tdata->bytes_until_sample >= usize) { if (commit) - prof_tdata->bytes_until_sample -= size; + tdata->bytes_until_sample -= usize; return (true); } else { /* Compute new sample threshold. 
*/ if (commit) - prof_sample_threshold_update(prof_tdata); - return (false); + prof_sample_threshold_update(tdata); + return (tdata->active == false); } } -JEMALLOC_INLINE void -prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) { - prof_ctx_set(ptr, cnt->ctx); - - cnt->epoch++; - /*********/ - mb_write(); - /*********/ - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; +JEMALLOC_INLINE prof_tctx_t * +prof_alloc_prep(size_t usize) +{ + prof_tctx_t *ret; + prof_tdata_t *tdata; + prof_bt_t bt; + + assert(usize == s2u(usize)); + + if (!opt_prof_active || prof_sample_accum_update(usize, false, &tdata)) + ret = (prof_tctx_t *)(uintptr_t)1U; + else { + bt_init(&bt, tdata->vec); + prof_backtrace(&bt); + ret = prof_lookup(&bt); } - /*********/ - mb_write(); - /*********/ - cnt->epoch++; - /*********/ - mb_write(); - /*********/ + + return (ret); } JEMALLOC_INLINE void -prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) +prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) { cassert(config_prof); @@ -392,131 +413,60 @@ prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt) if (prof_sample_accum_update(usize, true, NULL)) { /* - * Don't sample. For malloc()-like allocation, it is - * always possible to tell in advance how large an - * object's usable size will be, so there should never - * be a difference between the usize passed to - * PROF_ALLOC_PREP() and prof_malloc(). + * Don't sample. For malloc()-like allocation, it is always + * possible to tell in advance how large an object's usable size + * will be, so there should never be a difference between the + * usize passed to PROF_ALLOC_PREP() and prof_malloc(). */ - assert((uintptr_t)cnt == (uintptr_t)1U); + assert((uintptr_t)tctx == (uintptr_t)1U); } - if ((uintptr_t)cnt > (uintptr_t)1U) - prof_malloc_record_object(ptr, usize, cnt); + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_malloc_sample_object(ptr, usize, tctx); else - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); + prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); } JEMALLOC_INLINE void -prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt, - size_t old_usize, prof_ctx_t *old_ctx) +prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, size_t old_usize, + prof_tctx_t *old_tctx) { - prof_thr_cnt_t *told_cnt; cassert(config_prof); - assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); + assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U); if (ptr != NULL) { assert(usize == isalloc(ptr, true)); if (prof_sample_accum_update(usize, true, NULL)) { /* - * Don't sample. The usize passed to - * PROF_ALLOC_PREP() was larger than what - * actually got allocated, so a backtrace was - * captured for this allocation, even though - * its actual usize was insufficient to cross - * the sample threshold. + * Don't sample. The usize passed to PROF_ALLOC_PREP() + * was larger than what actually got allocated, so a + * backtrace was captured for this allocation, even + * though its actual usize was insufficient to cross the + * sample threshold. */ - cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + tctx = (prof_tctx_t *)(uintptr_t)1U; } } - if ((uintptr_t)old_ctx > (uintptr_t)1U) { - told_cnt = prof_lookup(&old_ctx->bt); - if (told_cnt == NULL) { - /* - * It's too late to propagate OOM for this realloc(), - * so operate directly on old_cnt->ctx->cnt_merged. 
- */ - malloc_mutex_lock(old_ctx->lock); - old_ctx->cnt_merged.curobjs--; - old_ctx->cnt_merged.curbytes -= old_usize; - malloc_mutex_unlock(old_ctx->lock); - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - } - } else - told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; - - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) { - prof_ctx_set(ptr, cnt->ctx); - cnt->epoch++; - } else if (ptr != NULL) - prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) { - told_cnt->cnts.curobjs--; - told_cnt->cnts.curbytes -= old_usize; - } - if ((uintptr_t)cnt > (uintptr_t)1U) { - cnt->cnts.curobjs++; - cnt->cnts.curbytes += usize; - if (opt_prof_accum) { - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += usize; - } - } - /*********/ - mb_write(); - /*********/ - if ((uintptr_t)told_cnt > (uintptr_t)1U) - told_cnt->epoch++; - if ((uintptr_t)cnt > (uintptr_t)1U) - cnt->epoch++; - /*********/ - mb_write(); /* Not strictly necessary. */ + if ((uintptr_t)old_tctx > (uintptr_t)1U) + prof_free_sampled_object(old_usize, old_tctx); + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_malloc_sample_object(ptr, usize, tctx); + else + prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); } JEMALLOC_INLINE void -prof_free(const void *ptr, size_t size) +prof_free(const void *ptr, size_t usize) { - prof_ctx_t *ctx = prof_ctx_get(ptr); + prof_tctx_t *tctx = prof_tctx_get(ptr); cassert(config_prof); + assert(usize == isalloc(ptr, true)); - if ((uintptr_t)ctx > (uintptr_t)1) { - prof_thr_cnt_t *tcnt; - assert(size == isalloc(ptr, true)); - tcnt = prof_lookup(&ctx->bt); - - if (tcnt != NULL) { - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - tcnt->cnts.curobjs--; - tcnt->cnts.curbytes -= size; - /*********/ - mb_write(); - /*********/ - tcnt->epoch++; - /*********/ - mb_write(); - /*********/ - } else { - /* - * OOM during free() cannot be propagated, so operate - * directly on cnt->ctx->cnt_merged. 
- */ - malloc_mutex_lock(ctx->lock); - ctx->cnt_merged.curobjs--; - ctx->cnt_merged.curbytes -= size; - malloc_mutex_unlock(ctx->lock); - } - } + if ((uintptr_t)tctx > (uintptr_t)1U) + prof_free_sampled_object(usize, tctx); } #endif diff --git a/src/ctl.c b/src/ctl.c index fa52a6c..b816c84 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -68,6 +68,8 @@ CTL_PROTO(version) CTL_PROTO(epoch) CTL_PROTO(thread_tcache_enabled) CTL_PROTO(thread_tcache_flush) +CTL_PROTO(thread_prof_name) +CTL_PROTO(thread_prof_active) CTL_PROTO(thread_arena) CTL_PROTO(thread_allocated) CTL_PROTO(thread_allocatedp) @@ -132,7 +134,9 @@ CTL_PROTO(arenas_nlruns) CTL_PROTO(arenas_extend) CTL_PROTO(prof_active) CTL_PROTO(prof_dump) +CTL_PROTO(prof_reset) CTL_PROTO(prof_interval) +CTL_PROTO(lg_prof_sample) CTL_PROTO(stats_chunks_current) CTL_PROTO(stats_chunks_total) CTL_PROTO(stats_chunks_high) @@ -196,18 +200,24 @@ CTL_PROTO(stats_mapped) */ #define INDEX(i) {false}, i##_index -static const ctl_named_node_t tcache_node[] = { +static const ctl_named_node_t thread_tcache_node[] = { {NAME("enabled"), CTL(thread_tcache_enabled)}, {NAME("flush"), CTL(thread_tcache_flush)} }; +static const ctl_named_node_t thread_prof_node[] = { + {NAME("name"), CTL(thread_prof_name)}, + {NAME("active"), CTL(thread_prof_active)} +}; + static const ctl_named_node_t thread_node[] = { {NAME("arena"), CTL(thread_arena)}, {NAME("allocated"), CTL(thread_allocated)}, {NAME("allocatedp"), CTL(thread_allocatedp)}, {NAME("deallocated"), CTL(thread_deallocated)}, {NAME("deallocatedp"), CTL(thread_deallocatedp)}, - {NAME("tcache"), CHILD(named, tcache)} + {NAME("tcache"), CHILD(named, thread_tcache)}, + {NAME("prof"), CHILD(named, thread_prof)} }; static const ctl_named_node_t config_node[] = { @@ -311,7 +321,9 @@ static const ctl_named_node_t arenas_node[] = { static const ctl_named_node_t prof_node[] = { {NAME("active"), CTL(prof_active)}, {NAME("dump"), CTL(prof_dump)}, - {NAME("interval"), CTL(prof_interval)} + {NAME("reset"), CTL(prof_reset)}, + {NAME("interval"), CTL(prof_interval)}, + {NAME("lg_sample"), CTL(lg_prof_sample)} }; static const ctl_named_node_t stats_chunks_node[] = { @@ -1281,6 +1293,62 @@ label_return: return (ret); } +static int +thread_prof_name_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + const char *oldname; + + if (config_prof == false) + return (ENOENT); + + oldname = prof_thread_name_get(); + if (newp != NULL) { + if (newlen != sizeof(const char *)) { + ret = EINVAL; + goto label_return; + } + if (prof_thread_name_set(*(const char **)newp)) { + ret = EAGAIN; + goto label_return; + } + } + READ(oldname, const char *); + + ret = 0; +label_return: + return (ret); +} + +static int +thread_prof_active_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + bool oldval; + + if (config_prof == false) + return (ENOENT); + + oldval = prof_thread_active_get(); + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + if (prof_thread_active_set(*(bool *)newp)) { + ret = EAGAIN; + goto label_return; + } + } + READ(oldval, bool); + + ret = 0; +label_return: + return (ret); +} + /******************************************************************************/ /* ctl_mutex must be held during execution of this function. 
*/ @@ -1601,7 +1669,30 @@ label_return: return (ret); } +static int +prof_reset_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + size_t lg_sample = lg_prof_sample; + + if (config_prof == false) + return (ENOENT); + + WRITEONLY(); + WRITE(lg_sample, size_t); + if (lg_sample >= (sizeof(uint64_t) << 3)) + lg_sample = (sizeof(uint64_t) << 3) - 1; + + prof_reset(lg_sample); + + ret = 0; +label_return: + return (ret); +} + CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t) +CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t) /******************************************************************************/ diff --git a/src/huge.c b/src/huge.c index d08ed4a..5f0c698 100644 --- a/src/huge.c +++ b/src/huge.c @@ -197,10 +197,10 @@ huge_salloc(const void *ptr) return (ret); } -prof_ctx_t * -huge_prof_ctx_get(const void *ptr) +prof_tctx_t * +huge_prof_tctx_get(const void *ptr) { - prof_ctx_t *ret; + prof_tctx_t *ret; extent_node_t *node, key; malloc_mutex_lock(&huge_mtx); @@ -210,7 +210,7 @@ huge_prof_ctx_get(const void *ptr) node = extent_tree_ad_search(&huge, &key); assert(node != NULL); - ret = node->prof_ctx; + ret = node->prof_tctx; malloc_mutex_unlock(&huge_mtx); @@ -218,7 +218,7 @@ huge_prof_ctx_get(const void *ptr) } void -huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { extent_node_t *node, key; @@ -229,7 +229,7 @@ huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) node = extent_tree_ad_search(&huge, &key); assert(node != NULL); - node->prof_ctx = ctx; + node->prof_tctx = tctx; malloc_mutex_unlock(&huge_mtx); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 0983c00..2d01272 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -636,9 +636,9 @@ malloc_conf_init(void) "prof_prefix", "jeprof") CONF_HANDLE_BOOL(opt_prof_active, "prof_active", true) - CONF_HANDLE_SSIZE_T(opt_lg_prof_sample, + CONF_HANDLE_SIZE_T(opt_lg_prof_sample, "lg_prof_sample", 0, - (sizeof(uint64_t) << 3) - 1) + (sizeof(uint64_t) << 3) - 1, true) CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum", true) CONF_HANDLE_SSIZE_T(opt_lg_prof_interval, @@ -863,11 +863,11 @@ malloc_init_hard(void) */ static void * -imalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) +imalloc_prof_sample(size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = imalloc(SMALL_MAXCLASS+1); @@ -884,16 +884,16 @@ JEMALLOC_ALWAYS_INLINE_C void * imalloc_prof(size_t usize) { void *p; - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); - if ((uintptr_t)cnt != (uintptr_t)1U) - p = imalloc_prof_sample(usize, cnt); + tctx = prof_alloc_prep(usize); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = imalloc_prof_sample(usize, tctx); else p = imalloc(usize); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -943,11 +943,11 @@ je_malloc(size_t size) } static void * -imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) +imemalign_prof_sample(size_t alignment, size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { assert(sa2u(SMALL_MAXCLASS+1, alignment) != 0); @@ -963,17 +963,17 @@ imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -imemalign_prof(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) 
+imemalign_prof(size_t alignment, size_t usize, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) - p = imemalign_prof_sample(alignment, usize, cnt); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = imemalign_prof_sample(alignment, usize, tctx); else p = ipalloc(usize, alignment, false); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1015,10 +1015,10 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) } if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); - result = imemalign_prof(alignment, usize, cnt); + tctx = prof_alloc_prep(usize); + result = imemalign_prof(alignment, usize, tctx); } else result = ipalloc(usize, alignment, false); if (result == NULL) @@ -1070,11 +1070,11 @@ je_aligned_alloc(size_t alignment, size_t size) } static void * -icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) +icalloc_prof_sample(size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = icalloc(SMALL_MAXCLASS+1); @@ -1088,17 +1088,17 @@ icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -icalloc_prof(size_t usize, prof_thr_cnt_t *cnt) +icalloc_prof(size_t usize, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) - p = icalloc_prof_sample(usize, cnt); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = icalloc_prof_sample(usize, tctx); else p = icalloc(usize); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1137,11 +1137,11 @@ je_calloc(size_t num, size_t size) } if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = s2u(num_size); - PROF_ALLOC_PREP(usize, cnt); - ret = icalloc_prof(usize, cnt); + tctx = prof_alloc_prep(usize); + ret = icalloc_prof(usize, tctx); } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(num_size); @@ -1167,11 +1167,11 @@ label_return: } static void * -irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt) +irealloc_prof_sample(void *oldptr, size_t usize, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = iralloc(oldptr, SMALL_MAXCLASS+1, 0, 0, false); @@ -1185,19 +1185,19 @@ irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt) } JEMALLOC_ALWAYS_INLINE_C void * -irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_thr_cnt_t *cnt) +irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_tctx_t *tctx) { void *p; - prof_ctx_t *old_ctx; + prof_tctx_t *old_tctx; - old_ctx = prof_ctx_get(oldptr); - if ((uintptr_t)cnt != (uintptr_t)1U) - p = irealloc_prof_sample(oldptr, usize, cnt); + old_tctx = prof_tctx_get(oldptr); + if ((uintptr_t)tctx != (uintptr_t)1U) + p = irealloc_prof_sample(oldptr, usize, tctx); else p = iralloc(oldptr, usize, 0, 0, false); if (p == NULL) return (NULL); - prof_realloc(p, usize, cnt, old_usize, old_ctx); + prof_realloc(p, usize, tctx, old_usize, old_tctx); return (p); } @@ -1253,11 +1253,11 @@ je_realloc(void *ptr, size_t size) old_rzsize = config_prof ? 
p2rz(ptr) : u2rz(old_usize); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = s2u(size); - PROF_ALLOC_PREP(usize, cnt); - ret = irealloc_prof(ptr, old_usize, usize, cnt); + tctx = prof_alloc_prep(usize); + ret = irealloc_prof(ptr, old_usize, usize, tctx); } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(size); @@ -1379,11 +1379,11 @@ imallocx(size_t usize, size_t alignment, bool zero, bool try_tcache, static void * imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { size_t usize_promoted = (alignment == 0) ? @@ -1402,18 +1402,18 @@ imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, JEMALLOC_ALWAYS_INLINE_C void * imallocx_prof(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - if ((uintptr_t)cnt != (uintptr_t)1U) { + if ((uintptr_t)tctx != (uintptr_t)1U) { p = imallocx_prof_sample(usize, alignment, zero, try_tcache, - arena, cnt); + arena, tctx); } else p = imallocx(usize, alignment, zero, try_tcache, arena); if (p == NULL) return (NULL); - prof_malloc(p, usize, cnt); + prof_malloc(p, usize, tctx); return (p); } @@ -1447,11 +1447,11 @@ je_mallocx(size_t size, int flags) assert(usize != 0); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; - PROF_ALLOC_PREP(usize, cnt); + tctx = prof_alloc_prep(usize); p = imallocx_prof(usize, alignment, zero, try_tcache, arena, - cnt); + tctx); } else p = imallocx(usize, alignment, zero, try_tcache, arena); if (p == NULL) @@ -1476,11 +1476,11 @@ label_oom: static void * irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena, - prof_thr_cnt_t *cnt) + prof_tctx_t *tctx) { void *p; - if (cnt == NULL) + if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { p = iralloct(oldptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= @@ -1500,15 +1500,15 @@ irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, JEMALLOC_ALWAYS_INLINE_C void * irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, size_t *usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena, prof_thr_cnt_t *cnt) + arena_t *arena, prof_tctx_t *tctx) { void *p; - prof_ctx_t *old_ctx; + prof_tctx_t *old_tctx; - old_ctx = prof_ctx_get(oldptr); - if ((uintptr_t)cnt != (uintptr_t)1U) + old_tctx = prof_tctx_get(oldptr); + if ((uintptr_t)tctx != (uintptr_t)1U) p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, - try_tcache_alloc, try_tcache_dalloc, arena, cnt); + try_tcache_alloc, try_tcache_dalloc, arena, tctx); else { p = iralloct(oldptr, size, 0, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); @@ -1527,7 +1527,7 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, */ *usize = isalloc(p, config_prof); } - prof_realloc(p, *usize, cnt, old_usize, old_ctx); + prof_realloc(p, *usize, tctx, old_usize, old_tctx); return (p); } @@ -1570,13 +1570,13 @@ je_rallocx(void *ptr, size_t size, int flags) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; usize = (alignment == 0) ? 
s2u(size) : sa2u(size, alignment); assert(usize != 0); - PROF_ALLOC_PREP(usize, cnt); + tctx = prof_alloc_prep(usize); p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero, - try_tcache_alloc, try_tcache_dalloc, arena, cnt); + try_tcache_alloc, try_tcache_dalloc, arena, tctx); if (p == NULL) goto label_oom; } else { @@ -1623,11 +1623,11 @@ ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra, static size_t ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, size_t max_usize, bool zero, arena_t *arena, - prof_thr_cnt_t *cnt) + prof_tctx_t *tctx) { size_t usize; - if (cnt == NULL) + if (tctx == NULL) return (old_usize); /* Use minimum usize to determine whether promotion may happen. */ if (((alignment == 0) ? s2u(size) : sa2u(size, alignment)) <= @@ -1650,22 +1650,22 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, JEMALLOC_ALWAYS_INLINE_C size_t ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, size_t max_usize, bool zero, arena_t *arena, - prof_thr_cnt_t *cnt) + prof_tctx_t *tctx) { size_t usize; - prof_ctx_t *old_ctx; + prof_tctx_t *old_tctx; - old_ctx = prof_ctx_get(ptr); - if ((uintptr_t)cnt != (uintptr_t)1U) { + old_tctx = prof_tctx_get(ptr); + if ((uintptr_t)tctx != (uintptr_t)1U) { usize = ixallocx_prof_sample(ptr, old_usize, size, extra, - alignment, zero, max_usize, arena, cnt); + alignment, zero, max_usize, arena, tctx); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); } if (usize == old_usize) return (usize); - prof_realloc(ptr, usize, cnt, old_usize, old_ctx); + prof_realloc(ptr, usize, tctx, old_usize, old_tctx); return (usize); } @@ -1697,19 +1697,19 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { - prof_thr_cnt_t *cnt; + prof_tctx_t *tctx; /* * usize isn't knowable before ixalloc() returns when extra is * non-zero. Therefore, compute its maximum possible value and - * use that in PROF_ALLOC_PREP() to decide whether to capture a + * use that in prof_alloc_prep() to decide whether to capture a * backtrace. prof_realloc() will use the actual usize to * decide whether to sample. */ size_t max_usize = (alignment == 0) ? s2u(size+extra) : sa2u(size+extra, alignment); - PROF_ALLOC_PREP(max_usize, cnt); + tctx = prof_alloc_prep(max_usize); usize = ixallocx_prof(ptr, old_usize, size, extra, alignment, - max_usize, zero, arena, cnt); + max_usize, zero, arena, tctx); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); diff --git a/src/prof.c b/src/prof.c index 497ccf4..044acd8 100644 --- a/src/prof.c +++ b/src/prof.c @@ -33,22 +33,41 @@ char opt_prof_prefix[ uint64_t prof_interval = 0; +size_t lg_prof_sample; + /* - * Table of mutexes that are shared among ctx's. These are leaf locks, so - * there is no problem with using them for more than one ctx at the same time. - * The primary motivation for this sharing though is that ctx's are ephemeral, + * Table of mutexes that are shared among gctx's. These are leaf locks, so + * there is no problem with using them for more than one gctx at the same time. + * The primary motivation for this sharing though is that gctx's are ephemeral, * and destroying mutexes causes complications for systems that allocate when * creating/destroying mutexes. */ -static malloc_mutex_t *ctx_locks; -static unsigned cum_ctxs; /* Atomic counter. 
*/ +static malloc_mutex_t *gctx_locks; +static unsigned cum_gctxs; /* Atomic counter. */ /* - * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data + * Table of mutexes that are shared among tdata's. No operations require + * holding multiple tdata locks, so there is no problem with using them for more + * than one tdata at the same time, even though a gctx lock may be acquired + * while holding a tdata lock. + */ +static malloc_mutex_t *tdata_locks; + +/* + * Global hash of (prof_bt_t *)-->(prof_gctx_t *). This is the master data * structure that knows about all backtraces currently captured. */ -static ckh_t bt2ctx; -static malloc_mutex_t bt2ctx_mtx; +static ckh_t bt2gctx; +static malloc_mutex_t bt2gctx_mtx; + +/* + * Tree of all extant prof_tdata_t structures, regardless of state, + * {attached,detached,expired}. + */ +static prof_tdata_tree_t tdatas; +static malloc_mutex_t tdatas_mtx; + +static uint64_t next_thr_uid; static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; @@ -76,21 +95,33 @@ static int prof_dump_fd; static bool prof_booted = false; /******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static bool prof_tctx_should_destroy(prof_tctx_t *tctx); +static void prof_tctx_destroy(prof_tctx_t *tctx); +static bool prof_tdata_should_destroy(prof_tdata_t *tdata); +static void prof_tdata_destroy(prof_tdata_t *tdata); + +/******************************************************************************/ +/* Red-black trees. */ JEMALLOC_INLINE_C int -prof_thr_cnt_comp(const prof_thr_cnt_t *a, const prof_thr_cnt_t *b) +prof_tctx_comp(const prof_tctx_t *a, const prof_tctx_t *b) { - prof_thr_uid_t a_uid = a->thr_uid; - prof_thr_uid_t b_uid = b->thr_uid; + uint64_t a_uid = a->tdata->thr_uid; + uint64_t b_uid = b->tdata->thr_uid; return ((a_uid > b_uid) - (a_uid < b_uid)); } -rb_gen(static UNUSED, thr_cnt_tree_, prof_thr_cnt_tree_t, prof_thr_cnt_t, - thr_cnt_link, prof_thr_cnt_comp) +rb_gen(static UNUSED, tctx_tree_, prof_tctx_tree_t, prof_tctx_t, + tctx_link, prof_tctx_comp) JEMALLOC_INLINE_C int -prof_ctx_comp(const prof_ctx_t *a, const prof_ctx_t *b) +prof_gctx_comp(const prof_gctx_t *a, const prof_gctx_t *b) { unsigned a_len = a->bt.len; unsigned b_len = b->bt.len; @@ -101,8 +132,52 @@ prof_ctx_comp(const prof_ctx_t *a, const prof_ctx_t *b) return (ret); } -rb_gen(static UNUSED, ctx_tree_, prof_ctx_tree_t, prof_ctx_t, dump_link, - prof_ctx_comp) +rb_gen(static UNUSED, gctx_tree_, prof_gctx_tree_t, prof_gctx_t, dump_link, + prof_gctx_comp) + +JEMALLOC_INLINE_C int +prof_tdata_comp(const prof_tdata_t *a, const prof_tdata_t *b) +{ + uint64_t a_uid = a->thr_uid; + uint64_t b_uid = b->thr_uid; + + return ((a_uid > b_uid) - (a_uid < b_uid)); +} + +rb_gen(static UNUSED, tdata_tree_, prof_tdata_tree_t, prof_tdata_t, tdata_link, + prof_tdata_comp) + +/******************************************************************************/ + +void +prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx) { + prof_tctx_set(ptr, tctx); + + malloc_mutex_lock(tctx->tdata->lock); + tctx->cnts.curobjs++; + tctx->cnts.curbytes += usize; + if (opt_prof_accum) { + tctx->cnts.accumobjs++; + tctx->cnts.accumbytes += usize; + } + malloc_mutex_unlock(tctx->tdata->lock); +} + +void +prof_free_sampled_object(size_t usize, prof_tctx_t *tctx) +{ + + malloc_mutex_lock(tctx->tdata->lock); + assert(tctx->cnts.curobjs > 0); + 
assert(tctx->cnts.curbytes >= usize); + tctx->cnts.curobjs--; + tctx->cnts.curbytes -= usize; + + if (prof_tctx_should_destroy(tctx)) + prof_tctx_destroy(tctx); + else + malloc_mutex_unlock(tctx->tdata->lock); +} void bt_init(prof_bt_t *bt, void **vec) @@ -115,32 +190,32 @@ bt_init(prof_bt_t *bt, void **vec) } static inline void -prof_enter(prof_tdata_t *prof_tdata) +prof_enter(prof_tdata_t *tdata) { cassert(config_prof); - assert(prof_tdata->enq == false); - prof_tdata->enq = true; + assert(tdata->enq == false); + tdata->enq = true; - malloc_mutex_lock(&bt2ctx_mtx); + malloc_mutex_lock(&bt2gctx_mtx); } static inline void -prof_leave(prof_tdata_t *prof_tdata) +prof_leave(prof_tdata_t *tdata) { bool idump, gdump; cassert(config_prof); - malloc_mutex_unlock(&bt2ctx_mtx); + malloc_mutex_unlock(&bt2gctx_mtx); - assert(prof_tdata->enq); - prof_tdata->enq = false; - idump = prof_tdata->enq_idump; - prof_tdata->enq_idump = false; - gdump = prof_tdata->enq_gdump; - prof_tdata->enq_gdump = false; + assert(tdata->enq); + tdata->enq = false; + idump = tdata->enq_idump; + tdata->enq_idump = false; + gdump = tdata->enq_gdump; + tdata->enq_gdump = false; if (idump) prof_idump(); @@ -373,220 +448,268 @@ prof_backtrace(prof_bt_t *bt) #endif static malloc_mutex_t * -prof_ctx_mutex_choose(void) +prof_gctx_mutex_choose(void) +{ + unsigned ngctxs = atomic_add_u(&cum_gctxs, 1); + + return (&gctx_locks[(ngctxs - 1) % PROF_NCTX_LOCKS]); +} + +static malloc_mutex_t * +prof_tdata_mutex_choose(uint64_t thr_uid) { - unsigned nctxs = atomic_add_u(&cum_ctxs, 1); - return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]); + return (&tdata_locks[thr_uid % PROF_NTDATA_LOCKS]); } -static prof_ctx_t * -prof_ctx_create(prof_bt_t *bt) +static prof_gctx_t * +prof_gctx_create(prof_bt_t *bt) { /* * Create a single allocation that has space for vec of length bt->len. */ - prof_ctx_t *ctx = (prof_ctx_t *)imalloc(offsetof(prof_ctx_t, vec) + + prof_gctx_t *gctx = (prof_gctx_t *)imalloc(offsetof(prof_gctx_t, vec) + (bt->len * sizeof(void *))); - if (ctx == NULL) + if (gctx == NULL) return (NULL); - ctx->lock = prof_ctx_mutex_choose(); + gctx->lock = prof_gctx_mutex_choose(); /* * Set nlimbo to 1, in order to avoid a race condition with - * prof_ctx_merge()/prof_ctx_destroy(). + * prof_tctx_destroy()/prof_gctx_maybe_destroy(). */ - ctx->nlimbo = 1; - memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t)); - thr_cnt_tree_new(&ctx->thr_cnts); + gctx->nlimbo = 1; + tctx_tree_new(&gctx->tctxs); /* Duplicate bt. */ - memcpy(ctx->vec, bt->vec, bt->len * sizeof(void *)); - ctx->bt.vec = ctx->vec; - ctx->bt.len = bt->len; - return (ctx); + memcpy(gctx->vec, bt->vec, bt->len * sizeof(void *)); + gctx->bt.vec = gctx->vec; + gctx->bt.len = bt->len; + return (gctx); } static void -prof_ctx_destroy(prof_ctx_t *ctx) +prof_gctx_maybe_destroy(prof_gctx_t *gctx) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); /* - * Check that ctx is still unused by any thread cache before destroying - * it. prof_lookup() increments ctx->nlimbo in order to avoid a race - * condition with this function, as does prof_ctx_merge() in order to - * avoid a race between the main body of prof_ctx_merge() and entry + * Check that gctx is still unused by any thread cache before destroying + * it. prof_lookup() increments gctx->nlimbo in order to avoid a race + * condition with this function, as does prof_tctx_destroy() in order to + * avoid a race between the main body of prof_tctx_destroy() and entry * into this function. 
*/ - prof_tdata = prof_tdata_get(false); - assert((uintptr_t)prof_tdata > (uintptr_t)PROF_TDATA_STATE_MAX); - prof_enter(prof_tdata); - malloc_mutex_lock(ctx->lock); - if (thr_cnt_tree_first(&ctx->thr_cnts) == NULL && - ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 1) { - assert(ctx->cnt_merged.curbytes == 0); - assert(ctx->cnt_merged.accumobjs == 0); - assert(ctx->cnt_merged.accumbytes == 0); - /* Remove ctx from bt2ctx. */ - if (ckh_remove(&bt2ctx, &ctx->bt, NULL, NULL)) + tdata = prof_tdata_get(false); + assert((uintptr_t)tdata > (uintptr_t)PROF_TDATA_STATE_MAX); + prof_enter(tdata); + malloc_mutex_lock(gctx->lock); + if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { + /* Remove gctx from bt2gctx. */ + if (ckh_remove(&bt2gctx, &gctx->bt, NULL, NULL)) not_reached(); - prof_leave(prof_tdata); - /* Destroy ctx. */ - malloc_mutex_unlock(ctx->lock); - idalloc(ctx); + prof_leave(tdata); + /* Destroy gctx. */ + malloc_mutex_unlock(gctx->lock); + idalloc(gctx); } else { /* - * Compensate for increment in prof_ctx_merge() or + * Compensate for increment in prof_tctx_destroy() or * prof_lookup(). */ - ctx->nlimbo--; - malloc_mutex_unlock(ctx->lock); - prof_leave(prof_tdata); + gctx->nlimbo--; + malloc_mutex_unlock(gctx->lock); + prof_leave(tdata); } } -static void -prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) +/* tctx->tdata->lock must be held. */ +static bool +prof_tctx_should_destroy(prof_tctx_t *tctx) { - bool destroy; - cassert(config_prof); + if (opt_prof_accum) + return (false); + if (tctx->cnts.curobjs != 0) + return (false); + return (true); +} + +static bool +prof_gctx_should_destroy(prof_gctx_t *gctx) +{ - /* Merge cnt stats and detach from ctx. */ - malloc_mutex_lock(ctx->lock); - ctx->cnt_merged.curobjs += cnt->cnts.curobjs; - ctx->cnt_merged.curbytes += cnt->cnts.curbytes; - ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; - ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; - thr_cnt_tree_remove(&ctx->thr_cnts, cnt); - if (opt_prof_accum == false && thr_cnt_tree_first(&ctx->thr_cnts) == - NULL && ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) { + if (opt_prof_accum) + return (false); + if (tctx_tree_empty(&gctx->tctxs) == false) + return (false); + if (gctx->nlimbo != 0) + return (false); + return (true); +} + +/* tctx->tdata->lock is held upon entry, and released before return. */ +static void +prof_tctx_destroy(prof_tctx_t *tctx) +{ + prof_gctx_t *gctx = tctx->gctx; + bool destroy_gctx; + + assert(tctx->cnts.curobjs == 0); + assert(tctx->cnts.curbytes == 0); + assert(opt_prof_accum == false); + assert(tctx->cnts.accumobjs == 0); + assert(tctx->cnts.accumbytes == 0); + + { + prof_tdata_t *tdata = tctx->tdata; + bool tdata_destroy; + + ckh_remove(&tdata->bt2tctx, &gctx->bt, NULL, NULL); + tdata_destroy = prof_tdata_should_destroy(tdata); + malloc_mutex_unlock(tdata->lock); + if (tdata_destroy) + prof_tdata_destroy(tdata); + } + + malloc_mutex_lock(gctx->lock); + tctx_tree_remove(&gctx->tctxs, tctx); + if (prof_gctx_should_destroy(gctx)) { /* - * Increment ctx->nlimbo in order to keep another thread from - * winning the race to destroy ctx while this one has ctx->lock - * dropped. Without this, it would be possible for another - * thread to: + * Increment gctx->nlimbo in order to keep another thread from + * winning the race to destroy gctx while this one has + * gctx->lock dropped. Without this, it would be possible for + * another thread to: * - * 1) Sample an allocation associated with ctx. + * 1) Sample an allocation associated with gctx. 
* 2) Deallocate the sampled object. - * 3) Successfully prof_ctx_destroy(ctx). + * 3) Successfully prof_gctx_maybe_destroy(gctx). * - * The result would be that ctx no longer exists by the time - * this thread accesses it in prof_ctx_destroy(). + * The result would be that gctx no longer exists by the time + * this thread accesses it in prof_gctx_maybe_destroy(). */ - ctx->nlimbo++; - destroy = true; + gctx->nlimbo++; + destroy_gctx = true; } else - destroy = false; - malloc_mutex_unlock(ctx->lock); - if (destroy) - prof_ctx_destroy(ctx); + destroy_gctx = false; + malloc_mutex_unlock(gctx->lock); + if (destroy_gctx) + prof_gctx_maybe_destroy(gctx); + + idalloc(tctx); } static bool -prof_lookup_global(prof_bt_t *bt, prof_tdata_t *prof_tdata, void **p_btkey, - prof_ctx_t **p_ctx, bool *p_new_ctx) +prof_lookup_global(prof_bt_t *bt, prof_tdata_t *tdata, void **p_btkey, + prof_gctx_t **p_gctx, bool *p_new_gctx) { union { - prof_ctx_t *p; + prof_gctx_t *p; void *v; - } ctx; + } gctx; union { prof_bt_t *p; void *v; } btkey; - bool new_ctx; + bool new_gctx; - prof_enter(prof_tdata); - if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { + prof_enter(tdata); + if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { /* bt has never been seen before. Insert it. */ - ctx.p = prof_ctx_create(bt); - if (ctx.v == NULL) { - prof_leave(prof_tdata); + gctx.p = prof_gctx_create(bt); + if (gctx.v == NULL) { + prof_leave(tdata); return (true); } - btkey.p = &ctx.p->bt; - if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { + btkey.p = &gctx.p->bt; + if (ckh_insert(&bt2gctx, btkey.v, gctx.v)) { /* OOM. */ - prof_leave(prof_tdata); - idalloc(ctx.v); + prof_leave(tdata); + idalloc(gctx.v); return (true); } - new_ctx = true; + new_gctx = true; } else { /* * Increment nlimbo, in order to avoid a race condition with - * prof_ctx_merge()/prof_ctx_destroy(). + * prof_tctx_destroy()/prof_gctx_maybe_destroy(). */ - malloc_mutex_lock(ctx.p->lock); - ctx.p->nlimbo++; - malloc_mutex_unlock(ctx.p->lock); - new_ctx = false; + malloc_mutex_lock(gctx.p->lock); + gctx.p->nlimbo++; + malloc_mutex_unlock(gctx.p->lock); + new_gctx = false; } - prof_leave(prof_tdata); + prof_leave(tdata); *p_btkey = btkey.v; - *p_ctx = ctx.p; - *p_new_ctx = new_ctx; + *p_gctx = gctx.p; + *p_new_gctx = new_gctx; return (false); } -prof_thr_cnt_t * +prof_tctx_t * prof_lookup(prof_bt_t *bt) { union { - prof_thr_cnt_t *p; + prof_tctx_t *p; void *v; } ret; - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; + bool not_found; cassert(config_prof); - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (NULL); - if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) { + malloc_mutex_lock(tdata->lock); + not_found = ckh_search(&tdata->bt2tctx, bt, NULL, &ret.v); + malloc_mutex_unlock(tdata->lock); + if (not_found) { void *btkey; - prof_ctx_t *ctx; - bool new_ctx; + prof_gctx_t *gctx; + bool new_gctx, error; /* * This thread's cache lacks bt. Look for it in the global * cache. */ - if (prof_lookup_global(bt, prof_tdata, &btkey, &ctx, &new_ctx)) + if (prof_lookup_global(bt, tdata, &btkey, &gctx, + &new_gctx)) return (NULL); - /* Link a prof_thd_cnt_t into ctx for this thread. */ - ret.v = imalloc(sizeof(prof_thr_cnt_t)); + /* Link a prof_tctx_t into gctx for this thread. 
*/ + ret.v = imalloc(sizeof(prof_tctx_t)); if (ret.p == NULL) { - if (new_ctx) - prof_ctx_destroy(ctx); + if (new_gctx) + prof_gctx_maybe_destroy(gctx); return (NULL); } - ret.p->ctx = ctx; - ret.p->epoch = 0; + ret.p->tdata = tdata; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(&prof_tdata->bt2cnt, btkey, ret.v)) { - if (new_ctx) - prof_ctx_destroy(ctx); + ret.p->gctx = gctx; + ret.p->state = prof_tctx_state_nominal; + malloc_mutex_lock(tdata->lock); + error = ckh_insert(&tdata->bt2tctx, btkey, ret.v); + malloc_mutex_unlock(tdata->lock); + if (error) { + if (new_gctx) + prof_gctx_maybe_destroy(gctx); idalloc(ret.v); return (NULL); } - malloc_mutex_lock(ctx->lock); - thr_cnt_tree_insert(&ctx->thr_cnts, ret.p); - ctx->nlimbo--; - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(gctx->lock); + tctx_tree_insert(&gctx->tctxs, ret.p); + gctx->nlimbo--; + malloc_mutex_unlock(gctx->lock); } return (ret.p); } - void -prof_sample_threshold_update(prof_tdata_t *prof_tdata) +prof_sample_threshold_update(prof_tdata_t *tdata) { /* * The body of this function is compiled out unless heap profiling is @@ -608,23 +731,20 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) if (!config_prof) return; - if (prof_tdata == NULL) - prof_tdata = prof_tdata_get(false); - - if (opt_lg_prof_sample == 0) { - prof_tdata->bytes_until_sample = 0; + if (lg_prof_sample == 0) { + tdata->bytes_until_sample = 0; return; } /* - * Compute sample threshold as a geometrically distributed random - * variable with mean (2^opt_lg_prof_sample). + * Compute sample interval as a geometrically distributed random + * variable with mean (2^lg_prof_sample). * - * __ __ - * | log(u) | 1 - * prof_tdata->threshold = | -------- |, where p = ------------------- - * | log(1-p) | opt_lg_prof_sample - * 2 + * __ __ + * | log(u) | 1 + * tdata->bytes_until_sample = | -------- |, where p = --------------- + * | log(1-p) | lg_prof_sample + * 2 * * For more information on the math, see: * @@ -634,30 +754,29 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) * pp 500 * (http://luc.devroye.org/rnbookindex.html) */ - prng64(r, 53, prof_tdata->prng_state, - UINT64_C(6364136223846793005), UINT64_C(1442695040888963407)); + prng64(r, 53, tdata->prng_state, UINT64_C(6364136223846793005), + UINT64_C(1442695040888963407)); u = (double)r * (1.0/9007199254740992.0L); - prof_tdata->bytes_until_sample = (uint64_t)(log(u) / - log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + tdata->bytes_until_sample = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + (uint64_t)1U; #endif } - #ifdef JEMALLOC_JET size_t prof_bt_count(void) { size_t bt_count; - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (0); - prof_enter(prof_tdata); - bt_count = ckh_count(&bt2ctx); - prof_leave(prof_tdata); + prof_enter(tdata); + bt_count = ckh_count(&bt2gctx); + prof_leave(tdata); return (bt_count); } @@ -770,146 +889,249 @@ prof_dump_printf(bool propagate_err, const char *format, ...) return (ret); } -static prof_thr_cnt_t * -ctx_sum_iter(prof_thr_cnt_tree_t *thr_cnts, prof_thr_cnt_t *thr_cnt, void *arg) +/* tctx->tdata->lock is held. 
*/ +static void +prof_tctx_merge_tdata(prof_tctx_t *tctx, prof_tdata_t *tdata) { - prof_ctx_t *ctx = (prof_ctx_t *)arg; - volatile unsigned *epoch = &thr_cnt->epoch; - prof_cnt_t tcnt; - while (true) { - unsigned epoch0 = *epoch; + assert(tctx->state == prof_tctx_state_nominal); + tctx->state = prof_tctx_state_dumping; + memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); - /* Make sure epoch is even. */ - if (epoch0 & 1U) - continue; + tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + if (opt_prof_accum) { + tdata->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + tdata->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + } +} - memcpy(&tcnt, &thr_cnt->cnts, sizeof(prof_cnt_t)); +/* gctx->lock is held. */ +static void +prof_tctx_merge_gctx(prof_tctx_t *tctx, prof_gctx_t *gctx) +{ - /* Terminate if epoch didn't change while reading. */ - if (*epoch == epoch0) - break; + gctx->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + gctx->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + if (opt_prof_accum) { + gctx->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + gctx->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; } +} - ctx->cnt_summed.curobjs += tcnt.curobjs; - ctx->cnt_summed.curbytes += tcnt.curbytes; - if (opt_prof_accum) { - ctx->cnt_summed.accumobjs += tcnt.accumobjs; - ctx->cnt_summed.accumbytes += tcnt.accumbytes; +/* tctx->gctx is held. */ +static prof_tctx_t * +prof_tctx_merge_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. */ + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + prof_tctx_merge_gctx(tctx, tctx->gctx); + break; + default: + not_reached(); } return (NULL); } +/* gctx->lock is held. */ +static prof_tctx_t * +prof_tctx_dump_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + bool propagate_err = *(bool *)arg; + + if (prof_dump_printf(propagate_err, + " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + tctx->tdata->thr_uid, tctx->dump_cnts.curobjs, + tctx->dump_cnts.curbytes, tctx->dump_cnts.accumobjs, + tctx->dump_cnts.accumbytes)) + return (tctx); + return (NULL); +} + +/* tctx->gctx is held. */ +static prof_tctx_t * +prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) +{ + prof_tctx_t *ret; + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. */ + break; + case prof_tctx_state_dumping: + tctx->state = prof_tctx_state_nominal; + break; + case prof_tctx_state_purgatory: + ret = tctx_tree_next(tctxs, tctx); + tctx_tree_remove(tctxs, tctx); + idalloc(tctx); + goto label_return; + default: + not_reached(); + } + + ret = NULL; +label_return: + return (ret); +} + static void -prof_dump_ctx_prep(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx, - prof_ctx_tree_t *ctxs) +prof_dump_gctx_prep(prof_gctx_t *gctx, prof_gctx_tree_t *gctxs) { cassert(config_prof); - malloc_mutex_lock(ctx->lock); + malloc_mutex_lock(gctx->lock); /* - * Increment nlimbo so that ctx won't go away before dump. - * Additionally, link ctx into the dump list so that it is included in + * Increment nlimbo so that gctx won't go away before dump. + * Additionally, link gctx into the dump list so that it is included in * prof_dump()'s second pass. 
*/ - ctx->nlimbo++; - ctx_tree_insert(ctxs, ctx); + gctx->nlimbo++; + gctx_tree_insert(gctxs, gctx); - memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t)); - thr_cnt_tree_iter(&ctx->thr_cnts, NULL, ctx_sum_iter, (void *)ctx); + memset(&gctx->cnt_summed, 0, sizeof(prof_cnt_t)); - if (ctx->cnt_summed.curobjs != 0) - (*leak_nctx)++; + malloc_mutex_unlock(gctx->lock); +} - /* Add to cnt_all. */ - cnt_all->curobjs += ctx->cnt_summed.curobjs; - cnt_all->curbytes += ctx->cnt_summed.curbytes; - if (opt_prof_accum) { - cnt_all->accumobjs += ctx->cnt_summed.accumobjs; - cnt_all->accumbytes += ctx->cnt_summed.accumbytes; - } +static prof_gctx_t * +prof_gctx_merge_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) +{ + size_t *leak_ngctx = (size_t *)arg; - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(gctx->lock); + tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_merge_iter, NULL); + if (gctx->cnt_summed.curobjs != 0) + (*leak_ngctx)++; + malloc_mutex_unlock(gctx->lock); + + return (NULL); } -static bool -prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) +static prof_gctx_t * +prof_gctx_finish_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) { + prof_tctx_t *next; + bool destroy_gctx; - if (opt_lg_prof_sample == 0) { - if (prof_dump_printf(propagate_err, - "heap profile: %"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @ heapprofile\n", - cnt_all->curobjs, cnt_all->curbytes, - cnt_all->accumobjs, cnt_all->accumbytes)) - return (true); - } else { - if (prof_dump_printf(propagate_err, - "heap profile: %"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @ heap_v2/%"PRIu64"\n", - cnt_all->curobjs, cnt_all->curbytes, - cnt_all->accumobjs, cnt_all->accumbytes, - ((uint64_t)1U << opt_lg_prof_sample))) - return (true); - } + malloc_mutex_lock(gctx->lock); + next = NULL; + do { + next = tctx_tree_iter(&gctx->tctxs, next, prof_tctx_finish_iter, + NULL); + } while (next != NULL); + gctx->nlimbo--; + destroy_gctx = prof_gctx_should_destroy(gctx); + malloc_mutex_unlock(gctx->lock); + if (destroy_gctx) + prof_gctx_maybe_destroy(gctx); - return (false); + return (NULL); } -static void -prof_dump_ctx_cleanup_locked(prof_ctx_t *ctx, prof_ctx_tree_t *ctxs) +static prof_tdata_t * +prof_tdata_merge_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) { + prof_cnt_t *cnt_all = (prof_cnt_t *)arg; + + malloc_mutex_lock(tdata->lock); + if (tdata->state != prof_tdata_state_expired) { + size_t tabind; + union { + prof_tctx_t *p; + void *v; + } tctx; + + tdata->dumping = true; + memset(&tdata->cnt_summed, 0, sizeof(prof_cnt_t)); + for (tabind = 0; ckh_iter(&tdata->bt2tctx, &tabind, NULL, + &tctx.v) == false;) + prof_tctx_merge_tdata(tctx.p, tdata); + + cnt_all->curobjs += tdata->cnt_summed.curobjs; + cnt_all->curbytes += tdata->cnt_summed.curbytes; + if (opt_prof_accum) { + cnt_all->accumobjs += tdata->cnt_summed.accumobjs; + cnt_all->accumbytes += tdata->cnt_summed.accumbytes; + } + } else + tdata->dumping = false; + malloc_mutex_unlock(tdata->lock); - ctx->nlimbo--; + return (NULL); } -static void -prof_dump_ctx_cleanup(prof_ctx_t *ctx, prof_ctx_tree_t *ctxs) +static prof_tdata_t * +prof_tdata_dump_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) +{ + bool propagate_err = *(bool *)arg; + + if (tdata->dumping == false) + return (NULL); + + if (prof_dump_printf(propagate_err, + " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]%s%s\n", + tdata->thr_uid, tdata->cnt_summed.curobjs, + tdata->cnt_summed.curbytes, tdata->cnt_summed.accumobjs, + 
tdata->cnt_summed.accumbytes, + (tdata->thread_name != NULL) ? " " : "", + (tdata->thread_name != NULL) ? tdata->thread_name : "")) + return (tdata); + return (NULL); +} + +static bool +prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) { + bool ret; + + if (prof_dump_printf(propagate_err, + "heap_v2/%"PRIu64"\n" + " t*: %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + ((uint64_t)1U << lg_prof_sample), cnt_all->curobjs, + cnt_all->curbytes, cnt_all->accumobjs, cnt_all->accumbytes)) + return (true); - malloc_mutex_lock(ctx->lock); - prof_dump_ctx_cleanup_locked(ctx, ctxs); - malloc_mutex_unlock(ctx->lock); + malloc_mutex_lock(&tdatas_mtx); + ret = (tdata_tree_iter(&tdatas, NULL, prof_tdata_dump_iter, + (void *)&propagate_err) != NULL); + malloc_mutex_unlock(&tdatas_mtx); + return (ret); } +/* gctx->lock is held. */ static bool -prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt, - prof_ctx_tree_t *ctxs) +prof_dump_gctx(bool propagate_err, prof_gctx_t *gctx, const prof_bt_t *bt, + prof_gctx_tree_t *gctxs) { bool ret; unsigned i; cassert(config_prof); - /* - * Current statistics can sum to 0 as a result of unmerged per thread - * statistics. Additionally, interval- and growth-triggered dumps can - * occur between the time a ctx is created and when its statistics are - * filled in. Avoid dumping any ctx that is an artifact of either - * implementation detail. - */ - malloc_mutex_lock(ctx->lock); - if ((opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) || - (opt_prof_accum && ctx->cnt_summed.accumobjs == 0)) { - assert(ctx->cnt_summed.curobjs == 0); - assert(ctx->cnt_summed.curbytes == 0); - assert(ctx->cnt_summed.accumobjs == 0); - assert(ctx->cnt_summed.accumbytes == 0); + /* Avoid dumping such gctx's that have no useful data. 
*/ + if ((opt_prof_accum == false && gctx->cnt_summed.curobjs == 0) || + (opt_prof_accum && gctx->cnt_summed.accumobjs == 0)) { + assert(gctx->cnt_summed.curobjs == 0); + assert(gctx->cnt_summed.curbytes == 0); + assert(gctx->cnt_summed.accumobjs == 0); + assert(gctx->cnt_summed.accumbytes == 0); ret = false; goto label_return; } - if (prof_dump_printf(propagate_err, "%"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @", - ctx->cnt_summed.curobjs, ctx->cnt_summed.curbytes, - ctx->cnt_summed.accumobjs, ctx->cnt_summed.accumbytes)) { + if (prof_dump_printf(propagate_err, "@")) { ret = true; goto label_return; } - for (i = 0; i < bt->len; i++) { if (prof_dump_printf(propagate_err, " %#"PRIxPTR, (uintptr_t)bt->vec[i])) { @@ -918,15 +1140,23 @@ prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt, } } - if (prof_dump_write(propagate_err, "\n")) { + if (prof_dump_printf(propagate_err, + "\n" + " t*: %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + gctx->cnt_summed.curobjs, gctx->cnt_summed.curbytes, + gctx->cnt_summed.accumobjs, gctx->cnt_summed.accumbytes)) { + ret = true; + goto label_return; + } + + if (tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_dump_iter, + (void *)&propagate_err) != NULL) { ret = true; goto label_return; } ret = false; label_return: - prof_dump_ctx_cleanup_locked(ctx, ctxs); - malloc_mutex_unlock(ctx->lock); return (ret); } @@ -980,72 +1210,85 @@ label_return: } static void -prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_nctx, +prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_ngctx, const char *filename) { if (cnt_all->curbytes != 0) { - malloc_printf(": Leak summary: %"PRId64" byte%s, %" - PRId64" object%s, %zu context%s\n", + malloc_printf(": Leak summary: %"PRIu64" byte%s, %" + PRIu64" object%s, %zu context%s\n", cnt_all->curbytes, (cnt_all->curbytes != 1) ? "s" : "", cnt_all->curobjs, (cnt_all->curobjs != 1) ? "s" : "", - leak_nctx, (leak_nctx != 1) ? "s" : ""); + leak_ngctx, (leak_ngctx != 1) ? "s" : ""); malloc_printf( ": Run pprof on \"%s\" for leak detail\n", filename); } } -static prof_ctx_t * -prof_ctx_dump_iter(prof_ctx_tree_t *ctxs, prof_ctx_t *ctx, void *arg) +static prof_gctx_t * +prof_gctx_dump_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) { + prof_gctx_t *ret; bool propagate_err = *(bool *)arg; - if (prof_dump_ctx(propagate_err, ctx, &ctx->bt, ctxs)) - return (ctx_tree_next(ctxs, ctx)); + malloc_mutex_lock(gctx->lock); - return (NULL); -} - -static prof_ctx_t * -prof_ctx_cleanup_iter(prof_ctx_tree_t *ctxs, prof_ctx_t *ctx, void *arg) -{ - - prof_dump_ctx_cleanup(ctx, ctxs); + if (prof_dump_gctx(propagate_err, gctx, &gctx->bt, gctxs)) { + ret = gctx_tree_next(gctxs, gctx); + goto label_return; + } - return (NULL); + ret = NULL; +label_return: + malloc_mutex_unlock(gctx->lock); + return (ret); } static bool prof_dump(bool propagate_err, const char *filename, bool leakcheck) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; prof_cnt_t cnt_all; size_t tabind; union { - prof_ctx_t *p; + prof_gctx_t *p; void *v; - } ctx; - size_t leak_nctx; - prof_ctx_tree_t ctxs; - prof_ctx_t *cleanup_start = NULL; + } gctx; + size_t leak_ngctx; + prof_gctx_tree_t gctxs; cassert(config_prof); - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (true); malloc_mutex_lock(&prof_dump_mtx); + prof_enter(tdata); - /* Merge per thread profile stats, and sum them in cnt_all. 
*/ + /* + * Put gctx's in limbo and clear their counters in preparation for + * summing. + */ + gctx_tree_new(&gctxs); + for (tabind = 0; ckh_iter(&bt2gctx, &tabind, NULL, &gctx.v) == false;) + prof_dump_gctx_prep(gctx.p, &gctxs); + + /* + * Iterate over tdatas, and for the non-expired ones snapshot their tctx + * stats and merge them into the associated gctx's. + */ memset(&cnt_all, 0, sizeof(prof_cnt_t)); - leak_nctx = 0; - ctx_tree_new(&ctxs); - prof_enter(prof_tdata); - for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) - prof_dump_ctx_prep(ctx.p, &cnt_all, &leak_nctx, &ctxs); - prof_leave(prof_tdata); + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_merge_iter, (void *)&cnt_all); + malloc_mutex_unlock(&tdatas_mtx); + + /* Merge tctx stats into gctx's. */ + leak_ngctx = 0; + gctx_tree_iter(&gctxs, NULL, prof_gctx_merge_iter, (void *)&leak_ngctx); + + prof_leave(tdata); /* Create dump file. */ if ((prof_dump_fd = prof_dump_open(propagate_err, filename)) == -1) @@ -1055,10 +1298,9 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) if (prof_dump_header(propagate_err, &cnt_all)) goto label_write_error; - /* Dump per ctx profile stats. */ - cleanup_start = ctx_tree_iter(&ctxs, NULL, prof_ctx_dump_iter, - (void *)&propagate_err); - if (cleanup_start != NULL) + /* Dump per gctx profile stats. */ + if (gctx_tree_iter(&gctxs, NULL, prof_gctx_dump_iter, + (void *)&propagate_err) != NULL) goto label_write_error; /* Dump /proc//maps if possible. */ @@ -1068,19 +1310,17 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) if (prof_dump_close(propagate_err)) goto label_open_close_error; + gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, NULL); malloc_mutex_unlock(&prof_dump_mtx); if (leakcheck) - prof_leakcheck(&cnt_all, leak_nctx, filename); + prof_leakcheck(&cnt_all, leak_ngctx, filename); return (false); label_write_error: prof_dump_close(propagate_err); label_open_close_error: - if (cleanup_start != NULL) { - ctx_tree_iter(&ctxs, cleanup_start, prof_ctx_cleanup_iter, - NULL); - } + gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, NULL); malloc_mutex_unlock(&prof_dump_mtx); return (true); } @@ -1128,18 +1368,18 @@ prof_fdump(void) void prof_idump(void) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; char filename[PATH_MAX + 1]; cassert(config_prof); if (prof_booted == false) return; - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return; - if (prof_tdata->enq) { - prof_tdata->enq_idump = true; + if (tdata->enq) { + tdata->enq_idump = true; return; } @@ -1178,18 +1418,18 @@ prof_mdump(const char *filename) void prof_gdump(void) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; char filename[DUMP_FILENAME_BUFSIZE]; cassert(config_prof); if (prof_booted == false) return; - prof_tdata = prof_tdata_get(false); - if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(false); + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return; - if (prof_tdata->enq) { - prof_tdata->enq_gdump = true; + if (tdata->enq) { + tdata->enq_gdump = true; return; } @@ -1225,81 +1465,233 @@ prof_bt_keycomp(const void *k1, const void *k2) return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); } -prof_tdata_t * -prof_tdata_init(void) +JEMALLOC_INLINE_C uint64_t +prof_thr_uid_alloc(void) +{ + + return 
(atomic_add_uint64(&next_thr_uid, 1) - 1); +} + +static prof_tdata_t * +prof_tdata_init_impl(uint64_t thr_uid) { - prof_tdata_t *prof_tdata; + prof_tdata_t *tdata; cassert(config_prof); /* Initialize an empty cache for this thread. */ - prof_tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); - if (prof_tdata == NULL) + tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); + if (tdata == NULL) return (NULL); - if (ckh_new(&prof_tdata->bt2cnt, PROF_CKH_MINITEMS, + tdata->lock = prof_tdata_mutex_choose(thr_uid); + tdata->thr_uid = thr_uid; + tdata->thread_name = NULL; + tdata->state = prof_tdata_state_attached; + + if (ckh_new(&tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { - idalloc(prof_tdata); + idalloc(tdata); return (NULL); } - prof_tdata->prng_state = (uint64_t)(uintptr_t)prof_tdata; - prof_sample_threshold_update(prof_tdata); + tdata->prng_state = (uint64_t)(uintptr_t)tdata; + prof_sample_threshold_update(tdata); + + tdata->enq = false; + tdata->enq_idump = false; + tdata->enq_gdump = false; + + tdata->dumping = false; + tdata->active = true; + + prof_tdata_tsd_set(&tdata); + + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_insert(&tdatas, tdata); + malloc_mutex_unlock(&tdatas_mtx); + + return (tdata); +} + +prof_tdata_t * +prof_tdata_init(void) +{ + + return (prof_tdata_init_impl(prof_thr_uid_alloc())); +} + +prof_tdata_t * +prof_tdata_reinit(prof_tdata_t *tdata) +{ + + return (prof_tdata_init_impl(tdata->thr_uid)); +} + +/* tdata->lock must be held. */ +static bool +prof_tdata_should_destroy(prof_tdata_t *tdata) +{ + + if (tdata->state == prof_tdata_state_attached) + return (false); + if (ckh_count(&tdata->bt2tctx) != 0) + return (false); + return (true); +} + +static void +prof_tdata_destroy(prof_tdata_t *tdata) +{ + + assert(prof_tdata_should_destroy(tdata)); + + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_remove(&tdatas, tdata); + malloc_mutex_unlock(&tdatas_mtx); + + if (tdata->thread_name != NULL) + idalloc(tdata->thread_name); + ckh_delete(&tdata->bt2tctx); + idalloc(tdata); +} + +static void +prof_tdata_state_transition(prof_tdata_t *tdata, prof_tdata_state_t state) +{ + bool destroy_tdata; + + malloc_mutex_lock(tdata->lock); + if (tdata->state != state) { + tdata->state = state; + destroy_tdata = prof_tdata_should_destroy(tdata); + } else + destroy_tdata = false; + malloc_mutex_unlock(tdata->lock); + if (destroy_tdata) + prof_tdata_destroy(tdata); +} - prof_tdata->enq = false; - prof_tdata->enq_idump = false; - prof_tdata->enq_gdump = false; +static void +prof_tdata_detach(prof_tdata_t *tdata) +{ - prof_tdata_tsd_set(&prof_tdata); + prof_tdata_state_transition(tdata, prof_tdata_state_detached); +} - return (prof_tdata); +static void +prof_tdata_expire(prof_tdata_t *tdata) +{ + + prof_tdata_state_transition(tdata, prof_tdata_state_expired); +} + +static prof_tdata_t * +prof_tdata_reset_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) +{ + + prof_tdata_expire(tdata); + return (NULL); +} + +void +prof_reset(size_t lg_sample) +{ + + assert(lg_sample < (sizeof(uint64_t) << 3)); + + malloc_mutex_lock(&prof_dump_mtx); + malloc_mutex_lock(&tdatas_mtx); + + lg_prof_sample = lg_sample; + tdata_tree_iter(&tdatas, NULL, prof_tdata_reset_iter, NULL); + + malloc_mutex_unlock(&tdatas_mtx); + malloc_mutex_unlock(&prof_dump_mtx); } void prof_tdata_cleanup(void *arg) { - prof_tdata_t *prof_tdata = *(prof_tdata_t **)arg; + prof_tdata_t *tdata = *(prof_tdata_t **)arg; cassert(config_prof); - if (prof_tdata == PROF_TDATA_STATE_REINCARNATED) { + if (tdata 
== PROF_TDATA_STATE_REINCARNATED) { /* * Another destructor deallocated memory after this destructor - * was called. Reset prof_tdata to PROF_TDATA_STATE_PURGATORY - * in order to receive another callback. + * was called. Reset tdata to PROF_TDATA_STATE_PURGATORY in + * order to receive another callback. */ - prof_tdata = PROF_TDATA_STATE_PURGATORY; - prof_tdata_tsd_set(&prof_tdata); - } else if (prof_tdata == PROF_TDATA_STATE_PURGATORY) { + tdata = PROF_TDATA_STATE_PURGATORY; + prof_tdata_tsd_set(&tdata); + } else if (tdata == PROF_TDATA_STATE_PURGATORY) { /* * The previous time this destructor was called, we set the key * to PROF_TDATA_STATE_PURGATORY so that other destructors - * wouldn't cause re-creation of the prof_tdata. This time, do + * wouldn't cause re-creation of the tdata. This time, do * nothing, so that the destructor will not be called again. */ - } else if (prof_tdata != NULL) { - union { - prof_thr_cnt_t *p; - void *v; - } cnt; - size_t tabind; - - /* - * Iteratively merge cnt's into the global stats and delete - * them. - */ - for (tabind = 0; ckh_iter(&prof_tdata->bt2cnt, &tabind, NULL, - &cnt.v);) { - prof_ctx_merge(cnt.p->ctx, cnt.p); - idalloc(cnt.v); - } - ckh_delete(&prof_tdata->bt2cnt); - idalloc(prof_tdata); - prof_tdata = PROF_TDATA_STATE_PURGATORY; - prof_tdata_tsd_set(&prof_tdata); + } else if (tdata != NULL) { + prof_tdata_detach(tdata); + tdata = PROF_TDATA_STATE_PURGATORY; + prof_tdata_tsd_set(&tdata); } } +const char * +prof_thread_name_get(void) +{ + prof_tdata_t *tdata = prof_tdata_get(true); + if (tdata == NULL) + return (NULL); + return (tdata->thread_name); +} + +bool +prof_thread_name_set(const char *thread_name) +{ + prof_tdata_t *tdata; + size_t size; + char *s; + + tdata = prof_tdata_get(true); + if (tdata == NULL) + return (true); + + size = strlen(thread_name) + 1; + s = imalloc(size); + if (s == NULL) + return (true); + + memcpy(s, thread_name, size); + if (tdata->thread_name != NULL) + idalloc(tdata->thread_name); + tdata->thread_name = s; + return (false); +} + +bool +prof_thread_active_get(void) +{ + prof_tdata_t *tdata = prof_tdata_get(true); + if (tdata == NULL) + return (false); + return (tdata->active); +} + +bool +prof_thread_active_set(bool active) +{ + prof_tdata_t *tdata; + + tdata = prof_tdata_get(true); + if (tdata == NULL) + return (true); + tdata->active = active; + return (false); +} + void prof_boot0(void) { @@ -1345,10 +1737,12 @@ prof_boot2(void) if (opt_prof) { unsigned i; - if (ckh_new(&bt2ctx, PROF_CKH_MINITEMS, prof_bt_hash, + lg_prof_sample = opt_lg_prof_sample; + + if (ckh_new(&bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) return (true); - if (malloc_mutex_init(&bt2ctx_mtx)) + if (malloc_mutex_init(&bt2gctx_mtx)) return (true); if (prof_tdata_tsd_boot()) { malloc_write( @@ -1356,6 +1750,12 @@ prof_boot2(void) abort(); } + tdata_tree_new(&tdatas); + if (malloc_mutex_init(&tdatas_mtx)) + return (true); + + next_thr_uid = 0; + if (malloc_mutex_init(&prof_dump_seq_mtx)) return (true); if (malloc_mutex_init(&prof_dump_mtx)) @@ -1367,12 +1767,21 @@ prof_boot2(void) abort(); } - ctx_locks = (malloc_mutex_t *)base_alloc(PROF_NCTX_LOCKS * + gctx_locks = (malloc_mutex_t *)base_alloc(PROF_NCTX_LOCKS * sizeof(malloc_mutex_t)); - if (ctx_locks == NULL) + if (gctx_locks == NULL) return (true); for (i = 0; i < PROF_NCTX_LOCKS; i++) { - if (malloc_mutex_init(&ctx_locks[i])) + if (malloc_mutex_init(&gctx_locks[i])) + return (true); + } + + tdata_locks = (malloc_mutex_t *)base_alloc(PROF_NTDATA_LOCKS * + 
sizeof(malloc_mutex_t)); + if (tdata_locks == NULL) + return (true); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + if (malloc_mutex_init(&tdata_locks[i])) return (true); } } @@ -1397,10 +1806,10 @@ prof_prefork(void) if (opt_prof) { unsigned i; - malloc_mutex_prefork(&bt2ctx_mtx); + malloc_mutex_prefork(&bt2gctx_mtx); malloc_mutex_prefork(&prof_dump_seq_mtx); for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_prefork(&ctx_locks[i]); + malloc_mutex_prefork(&gctx_locks[i]); } } @@ -1412,9 +1821,9 @@ prof_postfork_parent(void) unsigned i; for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_postfork_parent(&ctx_locks[i]); + malloc_mutex_postfork_parent(&gctx_locks[i]); malloc_mutex_postfork_parent(&prof_dump_seq_mtx); - malloc_mutex_postfork_parent(&bt2ctx_mtx); + malloc_mutex_postfork_parent(&bt2gctx_mtx); } } @@ -1426,9 +1835,9 @@ prof_postfork_child(void) unsigned i; for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_postfork_child(&ctx_locks[i]); + malloc_mutex_postfork_child(&gctx_locks[i]); malloc_mutex_postfork_child(&prof_dump_seq_mtx); - malloc_mutex_postfork_child(&bt2ctx_mtx); + malloc_mutex_postfork_child(&bt2gctx_mtx); } } diff --git a/src/stats.c b/src/stats.c index a0eb297..db34275 100644 --- a/src/stats.c +++ b/src/stats.c @@ -441,7 +441,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } if ((err = je_mallctl("opt.prof", &bv, &bsz, NULL, 0)) == 0 && bv) { - CTL_GET("opt.lg_prof_sample", &sv, size_t); + CTL_GET("prof.lg_sample", &sv, size_t); malloc_cprintf(write_cb, cbopaque, "Average profile sample interval: %"PRIu64 " (2^%zu)\n", (((uint64_t)1U) << sv), sv); -- cgit v0.12 From 3e24afa28e01b743a9f7fa1d42acb67e079d8187 Mon Sep 17 00:00:00 2001 From: Sara Golemon Date: Mon, 18 Aug 2014 13:06:39 -0700 Subject: Test for availability of malloc hooks via autoconf __*_hook() is glibc, but on at least one glibc platform (homebrew), the __GLIBC__ define isn't set correctly and we miss being able to use these hooks. Do a feature test for it during configuration so that we enable it anywhere the hooks are actually available. 
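For reference, a standalone sketch of the JE_COMPILABLE probe this change adds: if the translation unit below compiles and links against the target libc, configure records je_cv_glibc_malloc_hook=yes and defines JEMALLOC_GLIBC_MALLOC_HOOK. The #include assumed here is <stddef.h> (for size_t), and the program is never meant to run; it only has to build.

/*
 * Sketch of the glibc malloc hook probe (assumes <stddef.h>; build-only).
 */
#include <stddef.h>

extern void (*__free_hook)(void *ptr);
extern void *(*__malloc_hook)(size_t size);
extern void *(*__realloc_hook)(void *ptr, size_t size);

int
main(void)
{
	void *ptr = 0L;

	if (__malloc_hook) ptr = __malloc_hook(1);
	if (__realloc_hook) ptr = __realloc_hook(ptr, 2);
	if (__free_hook && ptr) __free_hook(ptr);
	return (0);
}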
--- configure.ac | 31 ++++++++++++++++++++++ .../jemalloc/internal/jemalloc_internal_defs.h.in | 6 +++++ src/jemalloc.c | 4 ++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index ede5f70..6f8fd3f 100644 --- a/configure.ac +++ b/configure.ac @@ -1341,6 +1341,37 @@ if test "x${enable_zone_allocator}" = "x1" ; then fi dnl ============================================================================ +dnl Check for glibc malloc hooks + +JE_COMPILABLE([glibc malloc hook], [ +#include + +extern void (* __free_hook)(void *ptr); +extern void *(* __malloc_hook)(size_t size); +extern void *(* __realloc_hook)(void *ptr, size_t size); +], [ + void *ptr = 0L; + if (__malloc_hook) ptr = __malloc_hook(1); + if (__realloc_hook) ptr = __realloc_hook(ptr, 2); + if (__free_hook && ptr) __free_hook(ptr); +], [je_cv_glibc_malloc_hook]) +if test "x${je_cv_glibc_malloc_hook}" = "xyes" ; then + AC_DEFINE([JEMALLOC_GLIBC_MALLOC_HOOK], [ ]) +fi + +JE_COMPILABLE([glibc memalign hook], [ +#include + +extern void *(* __memalign_hook)(size_t alignment, size_t size); +], [ + void *ptr = 0L; + if (__memalign_hook) ptr = __memalign_hook(16, 7); +], [je_cv_glibc_memalign_hook]) +if test "x${je_cv_glibc_memalign_hook}" = "xyes" ; then + AC_DEFINE([JEMALLOC_GLIBC_MEMALIGN_HOOK], [ ]) +fi + +dnl ============================================================================ dnl Check for typedefs, structures, and compiler characteristics. AC_HEADER_STDBOOL diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 93716b0..955582e 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -209,4 +209,10 @@ /* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ #undef LG_SIZEOF_INTMAX_T +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook) */ +#undef JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook */ +#undef JEMALLOC_GLIBC_MEMALIGN_HOOK + #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/src/jemalloc.c b/src/jemalloc.c index 2d01272..9df7001 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1336,7 +1336,7 @@ je_valloc(size_t size) #define is_malloc_(a) malloc_is_ ## a #define is_malloc(a) is_malloc_(a) -#if ((is_malloc(je_malloc) == 1) && defined(__GLIBC__) && !defined(__UCLIBC__)) +#if ((is_malloc(je_malloc) == 1) && defined(JEMALLOC_GLIBC_MALLOC_HOOK)) /* * glibc provides the RTLD_DEEPBIND flag for dlopen which can make it possible * to inconsistently reference libc's malloc(3)-compatible functions @@ -1349,8 +1349,10 @@ je_valloc(size_t size) JEMALLOC_EXPORT void (*__free_hook)(void *ptr) = je_free; JEMALLOC_EXPORT void *(*__malloc_hook)(size_t size) = je_malloc; JEMALLOC_EXPORT void *(*__realloc_hook)(void *ptr, size_t size) = je_realloc; +# ifdef JEMALLOC_GLIBC_MEMALIGN_HOOK JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = je_memalign; +# endif #endif /* -- cgit v0.12 From 58799f6d1c1f58053f4aac1b100ce9049c868039 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Tue, 26 Aug 2014 21:28:31 -0700 Subject: Remove junk filling in tcache_bin_flush_small(). Junk filling is done in arena_dalloc_bin_locked(), so arena_alloc_junk_small() is redundant. Also, we should use arena_dalloc_junk_small() instead of arena_alloc_junk_small(). 
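As background for why the removed call was redundant: with junk filling enabled, each freed small region is poisoned exactly once, in arena_dalloc_bin_locked(). The sketch below shows what that dalloc-side fill amounts to; the helper name is illustrative rather than copied from the tree, and 0x5a is jemalloc's conventional free-junk pattern.

#include <stddef.h>
#include <string.h>

/* Illustrative stand-in for the dalloc-side junk fill (not the tree's code). */
static void
dalloc_junk_small_sketch(void *ptr, size_t reg_size)
{
	/* Poison the freed region so use-after-free reads stand out. */
	memset(ptr, 0x5a, reg_size);
}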
--- src/tcache.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/tcache.c b/src/tcache.c index 868f2d7..4fbc94c 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -120,10 +120,6 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem, (uintptr_t)chunk) >> LG_PAGE; arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); - if (config_fill && opt_junk) { - arena_alloc_junk_small(ptr, - &arena_bin_info[binind], true); - } arena_dalloc_bin_locked(arena, chunk, ptr, mapelm); } else { -- cgit v0.12 From a5a658ab48f7dfa7fd134e505ef23304eaa0ce54 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 2 Sep 2014 15:07:07 -0700 Subject: Make VERSION generation more robust. Relax the "are we in a git repo?" check to succeed even if the top level jemalloc directory is not at the top level of the git repo. Add git tag filtering so that only version triplets match when generating VERSION. Add fallback bogus VERSION creation, so that in the worst case, rather than generating empty values for e.g. JEMALLOC_VERSION_MAJOR, configuration ends up generating useless constants. --- configure.ac | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 6f8fd3f..3b65885 100644 --- a/configure.ac +++ b/configure.ac @@ -1029,11 +1029,33 @@ dnl ============================================================================ dnl jemalloc configuration. dnl -dnl Set VERSION if source directory has an embedded git repository or is a git submodule. -if test -e "${srcroot}.git" ; then - git describe --long --abbrev=40 > ${srcroot}VERSION +dnl Set VERSION if source directory is inside a git repository. +if test "x`git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then + dnl Pattern globs aren't powerful enough to match both single- and + dnl double-digit version numbers, so iterate over patterns to support up to + dnl version 99.99.99 without any accidental matches. + rm -f "${srcroot}VERSION" + for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \ + '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \ + '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \ + '[0-9][0-9].[0-9][0-9].[0-9]' \ + '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do + if test ! -e "${srcroot}VERSION" ; then + git describe --long --abbrev=40 --match="${pattern}" > "${srcroot}VERSION.tmp" 2>/dev/null + if test $? -eq 0 ; then + mv "${srcroot}VERSION.tmp" "${srcroot}VERSION" + break + fi + fi + done +fi +rm -f "${srcroot}VERSION.tmp" +if test ! -e "${srcroot}VERSION" ; then + AC_MSG_RESULT( + [Missing VERSION file, and unable to generate it; creating bogus VERSION]) + echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${srcroot}VERSION" fi -jemalloc_version=`cat ${srcroot}VERSION` +jemalloc_version=`cat "${srcroot}VERSION"` jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'` jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'` jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'` -- cgit v0.12 From f34f6037e8d9836f7cddc02ad349dc72964bbcc7 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 2 Sep 2014 17:49:29 -0700 Subject: Disable autom4te cache. 
--- .autom4te.cfg | 3 +++ .gitignore | 2 -- Makefile.in | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 .autom4te.cfg diff --git a/.autom4te.cfg b/.autom4te.cfg new file mode 100644 index 0000000..fe2424d --- /dev/null +++ b/.autom4te.cfg @@ -0,0 +1,3 @@ +begin-language: "Autoconf-without-aclocal-m4" +args: --no-cache +end-language: "Autoconf-without-aclocal-m4" diff --git a/.gitignore b/.gitignore index 4c408ec..ec9c0b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,5 @@ /*.gcov.* -/autom4te.cache/ - /bin/jemalloc.sh /config.stamp diff --git a/Makefile.in b/Makefile.in index dfafe45..b5f0ee9 100644 --- a/Makefile.in +++ b/Makefile.in @@ -400,7 +400,6 @@ clean: rm -f $(objroot)*.gcov.* distclean: clean - rm -rf $(objroot)autom4te.cache rm -f $(objroot)bin/jemalloc.sh rm -f $(objroot)config.log rm -f $(objroot)config.status -- cgit v0.12 From ff6a31d3b92b7c63446ce645341d2bbd77b67dc6 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Fri, 29 Aug 2014 13:34:40 -0700 Subject: Refactor chunk map. Break the chunk map into two separate arrays, in order to improve cache locality. This is related to issue #23. --- include/jemalloc/internal/arena.h | 108 +++++++------ include/jemalloc/internal/chunk.h | 1 + include/jemalloc/internal/private_symbols.txt | 4 +- include/jemalloc/internal/size_classes.sh | 2 +- src/arena.c | 208 ++++++++++++++------------ src/chunk.c | 1 + src/tcache.c | 11 +- 7 files changed, 186 insertions(+), 149 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index f3f6426..986bea9 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -43,7 +43,8 @@ */ #define LG_DIRTY_MULT_DEFAULT 3 -typedef struct arena_chunk_map_s arena_chunk_map_t; +typedef struct arena_chunk_map_bits_s arena_chunk_map_bits_t; +typedef struct arena_chunk_map_misc_s arena_chunk_map_misc_t; typedef struct arena_chunk_s arena_chunk_t; typedef struct arena_run_s arena_run_t; typedef struct arena_bin_info_s arena_bin_info_t; @@ -55,34 +56,7 @@ typedef struct arena_s arena_t; #ifdef JEMALLOC_H_STRUCTS /* Each element of the chunk map corresponds to one page within the chunk. */ -struct arena_chunk_map_s { -#ifndef JEMALLOC_PROF - /* - * Overlay prof_tctx in order to allow it to be referenced by dead code. - * Such antics aren't warranted for per arena data structures, but - * chunk map overhead accounts for a percentage of memory, rather than - * being just a fixed cost. - */ - union { -#endif - /* - * Linkage for run trees. There are two disjoint uses: - * - * 1) arena_t's runs_avail tree. - * 2) arena_run_t conceptually uses this linkage for in-use non-full - * runs, rather than directly embedding linkage. - */ - rb_node(arena_chunk_map_t) rb_link; - - /* Profile counters, used for large object runs. */ - prof_tctx_t *prof_tctx; -#ifndef JEMALLOC_PROF - }; /* union { ... }; */ -#endif - - /* Linkage for list of dirty runs. */ - ql_elm(arena_chunk_map_t) dr_link; - +struct arena_chunk_map_bits_s { /* * Run address (or size) and various flags are stored together. 
The bit * layout looks like (assuming 32-bit system): @@ -149,9 +123,43 @@ struct arena_chunk_map_s { #define CHUNK_MAP_ALLOCATED ((size_t)0x1U) #define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED }; -typedef rb_tree(arena_chunk_map_t) arena_avail_tree_t; -typedef rb_tree(arena_chunk_map_t) arena_run_tree_t; -typedef ql_head(arena_chunk_map_t) arena_chunk_mapelms_t; + +/* + * Each arena_chunk_map_misc_t corresponds to one page within the chunk, just + * like arena_chunk_map_bits_t. Two separate arrays are stored within each + * chunk header in order to improve cache locality. + */ +struct arena_chunk_map_misc_s { +#ifndef JEMALLOC_PROF + /* + * Overlay prof_tctx in order to allow it to be referenced by dead code. + * Such antics aren't warranted for per arena data structures, but + * chunk map overhead accounts for a percentage of memory, rather than + * being just a fixed cost. + */ + union { +#endif + /* + * Linkage for run trees. There are two disjoint uses: + * + * 1) arena_t's runs_avail tree. + * 2) arena_run_t conceptually uses this linkage for in-use non-full + * runs, rather than directly embedding linkage. + */ + rb_node(arena_chunk_map_misc_t) rb_link; + + /* Profile counters, used for large object runs. */ + prof_tctx_t *prof_tctx; +#ifndef JEMALLOC_PROF + }; /* union { ... }; */ +#endif + + /* Linkage for list of dirty runs. */ + ql_elm(arena_chunk_map_misc_t) dr_link; +}; +typedef rb_tree(arena_chunk_map_misc_t) arena_avail_tree_t; +typedef rb_tree(arena_chunk_map_misc_t) arena_run_tree_t; +typedef ql_head(arena_chunk_map_misc_t) arena_chunk_miscelms_t; /* Arena chunk header. */ struct arena_chunk_s { @@ -164,7 +172,7 @@ struct arena_chunk_s { * need to be tracked in the map. This omission saves a header page * for common chunk sizes (e.g. 4 MiB). */ - arena_chunk_map_t map[1]; /* Dynamically sized. */ + arena_chunk_map_bits_t map_bits[1]; /* Dynamically sized. */ }; struct arena_run_s { @@ -335,7 +343,7 @@ struct arena_s { arena_avail_tree_t runs_avail; /* List of dirty runs this arena manages. */ - arena_chunk_mapelms_t runs_dirty; + arena_chunk_miscelms_t runs_dirty; /* * user-configureable chunk allocation and deallocation functions. 
@@ -393,9 +401,9 @@ void *arena_malloc_large(arena_t *arena, size_t size, bool zero); void *arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero); void arena_prof_promoted(const void *ptr, size_t size); void arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, - arena_chunk_map_t *mapelm); + arena_chunk_map_bits_t *bitselm); void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t pageind, arena_chunk_map_t *mapelm); + size_t pageind, arena_chunk_map_bits_t *bitselm); void arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t pageind); #ifdef JEMALLOC_JET @@ -439,7 +447,10 @@ size_t small_bin2size(size_t binind); size_t small_s2u_compute(size_t size); size_t small_s2u_lookup(size_t size); size_t small_s2u(size_t size); -arena_chunk_map_t *arena_mapp_get(arena_chunk_t *chunk, size_t pageind); +arena_chunk_map_bits_t *arena_bitselm_get(arena_chunk_t *chunk, + size_t pageind); +arena_chunk_map_misc_t *arena_miscelm_get(arena_chunk_t *chunk, + size_t pageind); size_t *arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbitsp_read(size_t *mapbitsp); size_t arena_mapbits_get(arena_chunk_t *chunk, size_t pageind); @@ -623,21 +634,32 @@ small_s2u(size_t size) # endif /* JEMALLOC_ARENA_INLINE_A */ # ifdef JEMALLOC_ARENA_INLINE_B -JEMALLOC_ALWAYS_INLINE arena_chunk_map_t * -arena_mapp_get(arena_chunk_t *chunk, size_t pageind) +JEMALLOC_ALWAYS_INLINE arena_chunk_map_bits_t * +arena_bitselm_get(arena_chunk_t *chunk, size_t pageind) +{ + + assert(pageind >= map_bias); + assert(pageind < chunk_npages); + + return (&chunk->map_bits[pageind-map_bias]); +} + +JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t * +arena_miscelm_get(arena_chunk_t *chunk, size_t pageind) { assert(pageind >= map_bias); assert(pageind < chunk_npages); - return (&chunk->map[pageind-map_bias]); + return ((arena_chunk_map_misc_t *)((uintptr_t)chunk + + (uintptr_t)map_misc_offset) + pageind-map_bias); } JEMALLOC_ALWAYS_INLINE size_t * arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind) { - return (&arena_mapp_get(chunk, pageind)->bits); + return (&arena_bitselm_get(chunk, pageind)->bits); } JEMALLOC_ALWAYS_INLINE size_t @@ -1005,7 +1027,7 @@ arena_prof_tctx_get(const void *ptr) if ((mapbits & CHUNK_MAP_LARGE) == 0) ret = (prof_tctx_t *)(uintptr_t)1U; else - ret = arena_mapp_get(chunk, pageind)->prof_tctx; + ret = arena_miscelm_get(chunk, pageind)->prof_tctx; return (ret); } @@ -1025,7 +1047,7 @@ arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) assert(arena_mapbits_allocated_get(chunk, pageind) != 0); if (arena_mapbits_large_get(chunk, pageind) != 0) - arena_mapp_get(chunk, pageind)->prof_tctx = tctx; + arena_miscelm_get(chunk, pageind)->prof_tctx = tctx; } JEMALLOC_ALWAYS_INLINE void * diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index f3bfbe0..27aa0ad 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -41,6 +41,7 @@ extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ extern size_t chunk_npages; extern size_t map_bias; /* Number of arena chunk header pages. */ +extern size_t map_misc_offset; extern size_t arena_maxclass; /* Max size class for arenas. 
*/ void *chunk_alloc_base(size_t size); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 1350545..9ca139a 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -4,6 +4,7 @@ a0malloc arena_alloc_junk_small arena_bin_index arena_bin_info +arena_bitselm_get arena_boot arena_chunk_alloc_huge arena_chunk_dalloc_huge @@ -38,8 +39,8 @@ arena_mapbits_unzeroed_set arena_mapbitsp_get arena_mapbitsp_read arena_mapbitsp_write -arena_mapp_get arena_maxclass +arena_miscelm_get arena_new arena_palloc arena_postfork_child @@ -254,6 +255,7 @@ malloc_vcprintf malloc_vsnprintf malloc_write map_bias +map_misc_offset mb_write mutex_boot narenas_auto diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 3edebf2..379d36c 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -240,7 +240,7 @@ cat < 255) # error "Too many small size classes" diff --git a/src/arena.c b/src/arena.c index 1263269..d9dda83 100644 --- a/src/arena.c +++ b/src/arena.c @@ -61,55 +61,57 @@ static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, /******************************************************************************/ JEMALLOC_INLINE_C size_t -arena_mapelm_to_pageind(arena_chunk_map_t *mapelm) +arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm) { - uintptr_t map_offset = - CHUNK_ADDR2OFFSET(mapelm) - offsetof(arena_chunk_t, map); + size_t offset = CHUNK_ADDR2OFFSET(miscelm); - return ((map_offset / sizeof(arena_chunk_map_t)) + map_bias); + return ((offset - map_misc_offset) / sizeof(arena_chunk_map_misc_t) + + map_bias); } JEMALLOC_INLINE_C size_t -arena_mapelm_to_bits(arena_chunk_map_t *mapelm) +arena_miscelm_to_bits(arena_chunk_map_misc_t *miscelm) { + arena_chunk_t *chunk = CHUNK_ADDR2BASE(miscelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); - return (mapelm->bits); + return arena_mapbits_get(chunk, pageind); } static inline int -arena_run_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) +arena_run_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) { - uintptr_t a_mapelm = (uintptr_t)a; - uintptr_t b_mapelm = (uintptr_t)b; + uintptr_t a_miscelm = (uintptr_t)a; + uintptr_t b_miscelm = (uintptr_t)b; assert(a != NULL); assert(b != NULL); - return ((a_mapelm > b_mapelm) - (a_mapelm < b_mapelm)); + return ((a_miscelm > b_miscelm) - (a_miscelm < b_miscelm)); } /* Generate red-black tree functions. 
*/ -rb_gen(static UNUSED, arena_run_tree_, arena_run_tree_t, arena_chunk_map_t, +rb_gen(static UNUSED, arena_run_tree_, arena_run_tree_t, arena_chunk_map_misc_t, rb_link, arena_run_comp) static inline int -arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) +arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) { int ret; size_t a_size; - size_t b_size = arena_mapelm_to_bits(b) & ~PAGE_MASK; - uintptr_t a_mapelm = (uintptr_t)a; - uintptr_t b_mapelm = (uintptr_t)b; + size_t b_size = arena_miscelm_to_bits(b) & ~PAGE_MASK; + uintptr_t a_miscelm = (uintptr_t)a; + uintptr_t b_miscelm = (uintptr_t)b; - if (a_mapelm & CHUNK_MAP_KEY) - a_size = a_mapelm & ~PAGE_MASK; - else - a_size = arena_mapelm_to_bits(a) & ~PAGE_MASK; + if (a_miscelm & CHUNK_MAP_KEY) + a_size = a_miscelm & ~PAGE_MASK; + else + a_size = arena_miscelm_to_bits(a) & ~PAGE_MASK; ret = (a_size > b_size) - (a_size < b_size); if (ret == 0) { - if (!(a_mapelm & CHUNK_MAP_KEY)) - ret = (a_mapelm > b_mapelm) - (a_mapelm < b_mapelm); + if (!(a_miscelm & CHUNK_MAP_KEY)) + ret = (a_miscelm > b_miscelm) - (a_miscelm < b_miscelm); else { /* * Treat keys as if they are lower than anything else. @@ -122,8 +124,8 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b) } /* Generate red-black tree functions. */ -rb_gen(static UNUSED, arena_avail_tree_, arena_avail_tree_t, arena_chunk_map_t, - rb_link, arena_avail_comp) +rb_gen(static UNUSED, arena_avail_tree_, arena_avail_tree_t, + arena_chunk_map_misc_t, rb_link, arena_avail_comp) static void arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, @@ -132,7 +134,7 @@ arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); - arena_avail_tree_insert(&arena->runs_avail, arena_mapp_get(chunk, + arena_avail_tree_insert(&arena->runs_avail, arena_miscelm_get(chunk, pageind)); } @@ -143,7 +145,7 @@ arena_avail_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); - arena_avail_tree_remove(&arena->runs_avail, arena_mapp_get(chunk, + arena_avail_tree_remove(&arena->runs_avail, arena_miscelm_get(chunk, pageind)); } @@ -151,14 +153,14 @@ static void arena_dirty_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, size_t npages) { - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); + arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); assert(arena_mapbits_dirty_get(chunk, pageind) == CHUNK_MAP_DIRTY); assert(arena_mapbits_dirty_get(chunk, pageind+npages-1) == CHUNK_MAP_DIRTY); - ql_elm_new(mapelm, dr_link); - ql_tail_insert(&arena->runs_dirty, mapelm, dr_link); + ql_elm_new(miscelm, dr_link); + ql_tail_insert(&arena->runs_dirty, miscelm, dr_link); arena->ndirty += npages; } @@ -166,13 +168,13 @@ static void arena_dirty_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, size_t npages) { - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); + arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); assert(arena_mapbits_dirty_get(chunk, pageind) == CHUNK_MAP_DIRTY); assert(arena_mapbits_dirty_get(chunk, pageind+npages-1) == CHUNK_MAP_DIRTY); - ql_remove(&arena->runs_dirty, mapelm, dr_link); + ql_remove(&arena->runs_dirty, miscelm, dr_link); 
arena->ndirty -= npages; } @@ -532,16 +534,17 @@ arena_chunk_init_hard(arena_t *arena) */ if (zero == false) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( - (void *)arena_mapp_get(chunk, map_bias+1), - (size_t)((uintptr_t) arena_mapp_get(chunk, chunk_npages-1) - - (uintptr_t)arena_mapp_get(chunk, map_bias+1))); + (void *)arena_bitselm_get(chunk, map_bias+1), + (size_t)((uintptr_t) arena_bitselm_get(chunk, + chunk_npages-1) - (uintptr_t)arena_bitselm_get(chunk, + map_bias+1))); for (i = map_bias+1; i < chunk_npages-1; i++) arena_mapbits_unzeroed_set(chunk, i, unzeroed); } else { - JEMALLOC_VALGRIND_MAKE_MEM_DEFINED((void *)arena_mapp_get(chunk, - map_bias+1), (size_t)((uintptr_t) arena_mapp_get(chunk, - chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk, - map_bias+1))); + JEMALLOC_VALGRIND_MAKE_MEM_DEFINED((void + *)arena_bitselm_get(chunk, map_bias+1), (size_t)((uintptr_t) + arena_bitselm_get(chunk, chunk_npages-1) - + (uintptr_t)arena_bitselm_get(chunk, map_bias+1))); if (config_debug) { for (i = map_bias+1; i < chunk_npages-1; i++) { assert(arena_mapbits_unzeroed_get(chunk, i) == @@ -641,14 +644,14 @@ static arena_run_t * arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) { arena_run_t *run; - arena_chunk_map_t *mapelm; - arena_chunk_map_t *key; + arena_chunk_map_misc_t *miscelm; + arena_chunk_map_misc_t *key; - key = (arena_chunk_map_t *)(size | CHUNK_MAP_KEY); - mapelm = arena_avail_tree_nsearch(&arena->runs_avail, key); - if (mapelm != NULL) { - arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = arena_mapelm_to_pageind(mapelm); + key = (arena_chunk_map_misc_t *)(size | CHUNK_MAP_KEY); + miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); + if (miscelm != NULL) { + arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(miscelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << LG_PAGE)); @@ -695,14 +698,14 @@ static arena_run_t * arena_run_alloc_small_helper(arena_t *arena, size_t size, size_t binind) { arena_run_t *run; - arena_chunk_map_t *mapelm; - arena_chunk_map_t *key; + arena_chunk_map_misc_t *miscelm; + arena_chunk_map_misc_t *key; - key = (arena_chunk_map_t *)(size | CHUNK_MAP_KEY); - mapelm = arena_avail_tree_nsearch(&arena->runs_avail, key); - if (mapelm != NULL) { - arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); - size_t pageind = arena_mapelm_to_pageind(mapelm); + key = (arena_chunk_map_misc_t *)(size | CHUNK_MAP_KEY); + miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); + if (miscelm != NULL) { + arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(miscelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << LG_PAGE)); @@ -769,13 +772,13 @@ static size_t arena_dirty_count(arena_t *arena) { size_t ndirty = 0; - arena_chunk_map_t *mapelm; + arena_chunk_map_misc_t *miscelm; arena_chunk_t *chunk; size_t pageind, npages; - ql_foreach(mapelm, &arena->runs_dirty, dr_link) { - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = arena_mapelm_to_pageind(mapelm); + ql_foreach(miscelm, &arena->runs_dirty, dr_link) { + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + pageind = arena_miscelm_to_pageind(miscelm); assert(arena_mapbits_allocated_get(chunk, pageind) == 0); assert(arena_mapbits_large_get(chunk, pageind) == 0); assert(arena_mapbits_dirty_get(chunk, pageind) != 0); @@ -808,16 +811,17 @@ arena_compute_npurge(arena_t *arena, bool all) static size_t arena_stash_dirty(arena_t *arena, bool all, size_t npurge, - 
arena_chunk_mapelms_t *mapelms) + arena_chunk_miscelms_t *miscelms) { - arena_chunk_map_t *mapelm; + arena_chunk_map_misc_t *miscelm; size_t nstashed = 0; /* Add at least npurge pages to purge_list. */ - for (mapelm = ql_first(&arena->runs_dirty); mapelm != NULL; - mapelm = ql_first(&arena->runs_dirty)) { - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - size_t pageind = arena_mapelm_to_pageind(mapelm); + for (miscelm = ql_first(&arena->runs_dirty); miscelm != NULL; + miscelm = ql_first(&arena->runs_dirty)) { + arena_chunk_t *chunk = + (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); size_t run_size = arena_mapbits_unallocated_size_get(chunk, pageind); size_t npages = run_size >> LG_PAGE; @@ -838,8 +842,8 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, /* Temporarily allocate the free dirty run. */ arena_run_split_large(arena, run, run_size, false); /* Append to purge_list for later processing. */ - ql_elm_new(mapelm, dr_link); - ql_tail_insert(mapelms, mapelm, dr_link); + ql_elm_new(miscelm, dr_link); + ql_tail_insert(miscelms, miscelm, dr_link); nstashed += npages; @@ -851,10 +855,10 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, } static size_t -arena_purge_stashed(arena_t *arena, arena_chunk_mapelms_t *mapelms) +arena_purge_stashed(arena_t *arena, arena_chunk_miscelms_t *miscelms) { size_t npurged, nmadvise; - arena_chunk_map_t *mapelm; + arena_chunk_map_misc_t *miscelm; if (config_stats) nmadvise = 0; @@ -862,13 +866,13 @@ arena_purge_stashed(arena_t *arena, arena_chunk_mapelms_t *mapelms) malloc_mutex_unlock(&arena->lock); - ql_foreach(mapelm, mapelms, dr_link) { + ql_foreach(miscelm, miscelms, dr_link) { arena_chunk_t *chunk; size_t pageind, run_size, npages, flag_unzeroed, i; bool unzeroed; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = arena_mapelm_to_pageind(mapelm); + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + pageind = arena_miscelm_to_pageind(miscelm); run_size = arena_mapbits_large_size_get(chunk, pageind); npages = run_size >> LG_PAGE; @@ -908,18 +912,19 @@ arena_purge_stashed(arena_t *arena, arena_chunk_mapelms_t *mapelms) } static void -arena_unstash_purged(arena_t *arena, arena_chunk_mapelms_t *mapelms) +arena_unstash_purged(arena_t *arena, arena_chunk_miscelms_t *miscelms) { - arena_chunk_map_t *mapelm; + arena_chunk_map_misc_t *miscelm; /* Deallocate runs. 
*/ - for (mapelm = ql_first(mapelms); mapelm != NULL; - mapelm = ql_first(mapelms)) { - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - size_t pageind = arena_mapelm_to_pageind(mapelm); + for (miscelm = ql_first(miscelms); miscelm != NULL; + miscelm = ql_first(miscelms)) { + arena_chunk_t *chunk = + (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << LG_PAGE)); - ql_remove(mapelms, mapelm, dr_link); + ql_remove(miscelms, miscelm, dr_link); arena_run_dalloc(arena, run, false, true); } } @@ -928,7 +933,7 @@ void arena_purge(arena_t *arena, bool all) { size_t npurge, npurgeable, npurged; - arena_chunk_mapelms_t purge_list; + arena_chunk_miscelms_t purge_list; if (config_debug) { size_t ndirty = arena_dirty_count(arena); @@ -1180,14 +1185,14 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, static arena_run_t * arena_bin_runs_first(arena_bin_t *bin) { - arena_chunk_map_t *mapelm = arena_run_tree_first(&bin->runs); - if (mapelm != NULL) { + arena_chunk_map_misc_t *miscelm = arena_run_tree_first(&bin->runs); + if (miscelm != NULL) { arena_chunk_t *chunk; size_t pageind; arena_run_t *run; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mapelm); - pageind = arena_mapelm_to_pageind(mapelm); + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + pageind = arena_miscelm_to_pageind(miscelm); run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - arena_mapbits_small_runind_get(chunk, pageind)) << LG_PAGE)); @@ -1202,11 +1207,11 @@ arena_bin_runs_insert(arena_bin_t *bin, arena_run_t *run) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(run); size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE; - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); + arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); - assert(arena_run_tree_search(&bin->runs, mapelm) == NULL); + assert(arena_run_tree_search(&bin->runs, miscelm) == NULL); - arena_run_tree_insert(&bin->runs, mapelm); + arena_run_tree_insert(&bin->runs, miscelm); } static void @@ -1214,11 +1219,11 @@ arena_bin_runs_remove(arena_bin_t *bin, arena_run_t *run) { arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE; - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); + arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); - assert(arena_run_tree_search(&bin->runs, mapelm) != NULL); + assert(arena_run_tree_search(&bin->runs, miscelm) != NULL); - arena_run_tree_remove(&bin->runs, mapelm); + arena_run_tree_remove(&bin->runs, miscelm); } static arena_run_t * @@ -1684,9 +1689,8 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t npages, run_ind, past; assert(run != bin->runcur); - assert(arena_run_tree_search(&bin->runs, - arena_mapp_get(chunk, ((uintptr_t)run-(uintptr_t)chunk)>>LG_PAGE)) - == NULL); + assert(arena_run_tree_search(&bin->runs, arena_miscelm_get(chunk, + ((uintptr_t)run-(uintptr_t)chunk)>>LG_PAGE)) == NULL); binind = arena_bin_index(chunk->arena, run->bin); bin_info = &arena_bin_info[binind]; @@ -1749,7 +1753,7 @@ arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, void arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, - arena_chunk_map_t *mapelm) + arena_chunk_map_bits_t *bitselm) { size_t pageind; arena_run_t *run; @@ -1761,7 +1765,8 @@ arena_dalloc_bin_locked(arena_t 
*arena, arena_chunk_t *chunk, void *ptr, run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - arena_mapbits_small_runind_get(chunk, pageind)) << LG_PAGE)); bin = run->bin; - binind = arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, pageind)); + binind = arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, + pageind)); bin_info = &arena_bin_info[binind]; if (config_fill || config_stats) size = bin_info->reg_size; @@ -1784,7 +1789,7 @@ arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t pageind, arena_chunk_map_t *mapelm) + size_t pageind, arena_chunk_map_bits_t *bitselm) { arena_run_t *run; arena_bin_t *bin; @@ -1793,7 +1798,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_mapbits_small_runind_get(chunk, pageind)) << LG_PAGE)); bin = run->bin; malloc_mutex_lock(&bin->lock); - arena_dalloc_bin_locked(arena, chunk, ptr, mapelm); + arena_dalloc_bin_locked(arena, chunk, ptr, bitselm); malloc_mutex_unlock(&bin->lock); } @@ -1801,15 +1806,15 @@ void arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t pageind) { - arena_chunk_map_t *mapelm; + arena_chunk_map_bits_t *bitselm; if (config_debug) { /* arena_ptr_small_binind_get() does extra sanity checking. */ assert(arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, pageind)) != BININD_INVALID); } - mapelm = arena_mapp_get(chunk, pageind); - arena_dalloc_bin(arena, chunk, ptr, pageind, mapelm); + bitselm = arena_bitselm_get(chunk, pageind); + arena_dalloc_bin(arena, chunk, ptr, pageind, bitselm); } #ifdef JEMALLOC_JET @@ -2268,7 +2273,8 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) * be twice as large in order to maintain alignment. */ if (config_fill && opt_redzone) { - size_t align_min = ZU(1) << (jemalloc_ffs(bin_info->reg_size) - 1); + size_t align_min = ZU(1) << (jemalloc_ffs(bin_info->reg_size) - + 1); if (align_min <= REDZONE_MINSIZE) { bin_info->redzone_size = REDZONE_MINSIZE; pad_size = 0; @@ -2404,13 +2410,17 @@ arena_boot(void) */ map_bias = 0; for (i = 0; i < 3; i++) { - header_size = offsetof(arena_chunk_t, map) + - (sizeof(arena_chunk_map_t) * (chunk_npages-map_bias)); + header_size = offsetof(arena_chunk_t, map_bits) + + ((sizeof(arena_chunk_map_bits_t) + + sizeof(arena_chunk_map_misc_t)) * (chunk_npages-map_bias)); map_bias = (header_size >> LG_PAGE) + ((header_size & PAGE_MASK) != 0); } assert(map_bias > 0); + map_misc_offset = offsetof(arena_chunk_t, map_bits) + + sizeof(arena_chunk_map_bits_t) * (chunk_npages-map_bias); + arena_maxclass = chunksize - (map_bias << LG_PAGE); bin_info_init(); diff --git a/src/chunk.c b/src/chunk.c index 38d0286..874002c 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -28,6 +28,7 @@ size_t chunksize; size_t chunksize_mask; /* (chunksize - 1). */ size_t chunk_npages; size_t map_bias; +size_t map_misc_offset; size_t arena_maxclass; /* Max size class for arenas. 
*/ /******************************************************************************/ diff --git a/src/tcache.c b/src/tcache.c index 4fbc94c..f86a46e 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -118,10 +118,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem, if (chunk->arena == arena) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - arena_chunk_map_t *mapelm = - arena_mapp_get(chunk, pageind); + arena_chunk_map_bits_t *bitselm = + arena_bitselm_get(chunk, pageind); arena_dalloc_bin_locked(arena, chunk, ptr, - mapelm); + bitselm); } else { /* * This object was allocated via a different @@ -393,9 +393,10 @@ tcache_destroy(tcache_t *tcache) arena_t *arena = chunk->arena; size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >> LG_PAGE; - arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); + arena_chunk_map_bits_t *bitselm = arena_bitselm_get(chunk, + pageind); - arena_dalloc_bin(arena, chunk, tcache, pageind, mapelm); + arena_dalloc_bin(arena, chunk, tcache, pageind, bitselm); } else if (tcache_size <= tcache_maxclass) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); arena_t *arena = chunk->arena; -- cgit v0.12 From c21b05ea09874222266b3e36ceb18765fcb4a00b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 4 Sep 2014 22:27:26 -0700 Subject: Whitespace cleanups. --- INSTALL | 6 +++--- include/jemalloc/internal/prng.h | 2 +- src/zone.c | 14 +++++++------- test/src/SFMT.c | 20 ++++++++++---------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/INSTALL b/INSTALL index 2df667c..6c46100 100644 --- a/INSTALL +++ b/INSTALL @@ -56,7 +56,7 @@ any of the following arguments (not a definitive list) to 'configure': replace the "malloc", "calloc", etc. symbols. --without-export - Don't export public APIs. This can be useful when building jemalloc as a + Don't export public APIs. This can be useful when building jemalloc as a static library, or to avoid exporting public APIs when using the zone allocator on OSX. @@ -96,7 +96,7 @@ any of the following arguments (not a definitive list) to 'configure': --enable-ivsalloc Enable validation code, which verifies that pointers reside within - jemalloc-owned chunks before dereferencing them. This incurs a substantial + jemalloc-owned chunks before dereferencing them. This incurs a substantial performance hit. --disable-stats @@ -148,7 +148,7 @@ any of the following arguments (not a definitive list) to 'configure': Disable support for Valgrind. --disable-zone-allocator - Disable zone allocator for Darwin. This means jemalloc won't be hooked as + Disable zone allocator for Darwin. This means jemalloc won't be hooked as the default allocator on OSX/iOS. --enable-utrace diff --git a/include/jemalloc/internal/prng.h b/include/jemalloc/internal/prng.h index 7b2b065..c6b1797 100644 --- a/include/jemalloc/internal/prng.h +++ b/include/jemalloc/internal/prng.h @@ -15,7 +15,7 @@ * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints. * * This choice of m has the disadvantage that the quality of the bits is - * proportional to bit position. For example. the lowest bit has a cycle of 2, + * proportional to bit position. For example, the lowest bit has a cycle of 2, * the next has a cycle of 4, etc. For this reason, we prefer to use the upper * bits. 
* diff --git a/src/zone.c b/src/zone.c index a722287..c6bd533 100644 --- a/src/zone.c +++ b/src/zone.c @@ -258,13 +258,13 @@ register_zone(void) /* * On OSX 10.6, having the default purgeable zone appear before * the default zone makes some things crash because it thinks it - * owns the default zone allocated pointers. We thus unregister/ - * re-register it in order to ensure it's always after the - * default zone. On OSX < 10.6, there is no purgeable zone, so - * this does nothing. On OSX >= 10.6, unregistering replaces the - * purgeable zone with the last registered zone above, i.e the - * default zone. Registering it again then puts it at the end, - * obviously after the default zone. + * owns the default zone allocated pointers. We thus + * unregister/re-register it in order to ensure it's always + * after the default zone. On OSX < 10.6, there is no purgeable + * zone, so this does nothing. On OSX >= 10.6, unregistering + * replaces the purgeable zone with the last registered zone + * above, i.e the default zone. Registering it again then puts + * it at the end, obviously after the default zone. */ if (purgeable_zone) { malloc_zone_unregister(purgeable_zone); diff --git a/test/src/SFMT.c b/test/src/SFMT.c index 22a5ac5..80cabe0 100644 --- a/test/src/SFMT.c +++ b/test/src/SFMT.c @@ -463,11 +463,11 @@ uint32_t gen_rand32_range(sfmt_t *ctx, uint32_t limit) { above = 0xffffffffU - (0xffffffffU % limit); while (1) { - ret = gen_rand32(ctx); - if (ret < above) { - ret %= limit; - break; - } + ret = gen_rand32(ctx); + if (ret < above) { + ret %= limit; + break; + } } return ret; } @@ -513,11 +513,11 @@ uint64_t gen_rand64_range(sfmt_t *ctx, uint64_t limit) { above = KQU(0xffffffffffffffff) - (KQU(0xffffffffffffffff) % limit); while (1) { - ret = gen_rand64(ctx); - if (ret < above) { - ret %= limit; - break; - } + ret = gen_rand64(ctx); + if (ret < above) { + ret %= limit; + break; + } } return ret; } -- cgit v0.12 From b718cf77e9917f6ae1995c2e2b219ff4219c9f46 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 7 Sep 2014 14:40:19 -0700 Subject: Optimize [nmd]alloc() fast paths. Optimize [nmd]alloc() fast paths such that the (flags == 0) case is streamlined, flags decoding only happens to the minimum degree necessary, and no conditionals are repeated. 
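From the caller's perspective the streamlining looks roughly like this sketch (public mallocx()/dallocx() API only; the sizes and flags are arbitrary examples): a zero-flags call now decodes nothing, and nonzero flags are decoded exactly once before allocation.

#include <jemalloc/jemalloc.h>

void
mallocx_paths_sketch(void)
{
	/* flags == 0: streamlined path, equivalent to plain malloc(). */
	void *p = mallocx(4096, 0);

	/* Nonzero flags: 64-byte alignment plus zero fill, decoded up front. */
	void *q = mallocx(4096, MALLOCX_LG_ALIGN(6) | MALLOCX_ZERO);

	if (q != NULL)
		dallocx(q, 0);
	if (p != NULL)
		dallocx(p, 0);
}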
--- include/jemalloc/internal/arena.h | 4 +- include/jemalloc/internal/jemalloc_internal.h.in | 52 ++--- include/jemalloc/internal/private_symbols.txt | 1 - include/jemalloc/internal/size_classes.sh | 3 + src/arena.c | 2 +- src/huge.c | 2 +- src/jemalloc.c | 239 +++++++++++++---------- 7 files changed, 172 insertions(+), 131 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 986bea9..166d052 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -577,7 +577,7 @@ small_bin2size_lookup(size_t binind) assert(binind < NBINS); { - size_t ret = ((size_t)(small_bin2size_tab[binind])); + size_t ret = (size_t)small_bin2size_tab[binind]; assert(ret == small_bin2size_compute(binind)); return (ret); } @@ -615,7 +615,7 @@ small_s2u_compute(size_t size) JEMALLOC_ALWAYS_INLINE size_t small_s2u_lookup(size_t size) { - size_t ret = (small_bin2size(small_size2bin(size))); + size_t ret = small_bin2size(small_size2bin(size)); assert(ret == small_s2u_compute(size)); return (ret); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 1c2f3d4..59ae8d5 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -165,7 +165,17 @@ static const bool config_ivsalloc = #include "jemalloc/internal/jemalloc_internal_macros.h" +#define MALLOCX_ARENA_MASK ((int)~0xff) #define MALLOCX_LG_ALIGN_MASK ((int)0x3f) +/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */ +#define MALLOCX_ALIGN_GET_SPECIFIED(flags) \ + (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)) +#define MALLOCX_ALIGN_GET(flags) \ + (MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX-1)) +#define MALLOCX_ZERO_GET(flags) \ + ((bool)(flags & MALLOCX_ZERO)) +#define MALLOCX_ARENA_GET(flags) \ + (((unsigned)(flags >> 8)) - 1) /* Smallest size class to support. 
*/ #define LG_TINY_MIN 3 @@ -625,15 +635,13 @@ size_t u2rz(size_t usize); size_t p2rz(const void *ptr); void idalloct(void *ptr, bool try_tcache); void idalloc(void *ptr); -void iqalloct(void *ptr, bool try_tcache); -void iqalloc(void *ptr); +void iqalloc(void *ptr, bool try_tcache); void *iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); -void *iralloct(void *ptr, size_t size, size_t extra, size_t alignment, - bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); -void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment, - bool zero); +void *iralloct(void *ptr, size_t size, size_t alignment, bool zero, + bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); +void *iralloc(void *ptr, size_t size, size_t alignment, bool zero); bool ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero); malloc_tsd_protos(JEMALLOC_ATTR(unused), thread_allocated, thread_allocated_t) @@ -787,7 +795,7 @@ idalloc(void *ptr) } JEMALLOC_ALWAYS_INLINE void -iqalloct(void *ptr, bool try_tcache) +iqalloc(void *ptr, bool try_tcache) { if (config_fill && opt_quarantine) @@ -796,13 +804,6 @@ iqalloct(void *ptr, bool try_tcache) idalloct(ptr, try_tcache); } -JEMALLOC_ALWAYS_INLINE void -iqalloc(void *ptr) -{ - - iqalloct(ptr, true); -} - JEMALLOC_ALWAYS_INLINE void * iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, @@ -832,12 +833,12 @@ iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, */ copysize = (size < oldsize) ? size : oldsize; memcpy(p, ptr, copysize); - iqalloct(ptr, try_tcache_dalloc); + iqalloc(ptr, try_tcache_dalloc); return (p); } JEMALLOC_ALWAYS_INLINE void * -iralloct(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, +iralloct(void *ptr, size_t size, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena) { size_t oldsize; @@ -853,25 +854,24 @@ iralloct(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, * Existing object alignment is inadequate; allocate new space * and copy. 
*/ - return (iralloct_realign(ptr, oldsize, size, extra, alignment, - zero, try_tcache_alloc, try_tcache_dalloc, arena)); + return (iralloct_realign(ptr, oldsize, size, 0, alignment, zero, + try_tcache_alloc, try_tcache_dalloc, arena)); } - if (size + extra <= arena_maxclass) { - return (arena_ralloc(arena, ptr, oldsize, size, extra, - alignment, zero, try_tcache_alloc, - try_tcache_dalloc)); + if (size <= arena_maxclass) { + return (arena_ralloc(arena, ptr, oldsize, size, 0, alignment, + zero, try_tcache_alloc, try_tcache_dalloc)); } else { - return (huge_ralloc(arena, ptr, oldsize, size, extra, - alignment, zero, try_tcache_dalloc)); + return (huge_ralloc(arena, ptr, oldsize, size, 0, alignment, + zero, try_tcache_dalloc)); } } JEMALLOC_ALWAYS_INLINE void * -iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero) +iralloc(void *ptr, size_t size, size_t alignment, bool zero) { - return (iralloct(ptr, size, extra, alignment, zero, true, true, NULL)); + return (iralloct(ptr, size, alignment, zero, true, true, NULL)); } JEMALLOC_ALWAYS_INLINE bool diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 9ca139a..84f0591 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -224,7 +224,6 @@ in_valgrind ipalloc ipalloct iqalloc -iqalloct iralloc iralloct iralloct_realign diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 379d36c..0cfac72 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -202,6 +202,7 @@ cat <deallocated += usize; if (config_valgrind && in_valgrind) rzsize = p2rz(ptr); - iqalloc(ptr); + iqalloc(ptr, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); } @@ -1236,7 +1235,7 @@ je_realloc(void *ptr, size_t size) if (ptr != NULL) { /* realloc(ptr, 0) is equivalent to free(ptr). */ UTRACE(ptr, 0, 0); - ifree(ptr); + ifree(ptr, true); return (NULL); } size = 1; @@ -1261,7 +1260,7 @@ je_realloc(void *ptr, size_t size) } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(size); - ret = iralloc(ptr, size, 0, 0, false); + ret = iralloc(ptr, size, 0, false); } } else { /* realloc(NULL, size) is equivalent to malloc(size). */ @@ -1295,7 +1294,7 @@ je_free(void *ptr) UTRACE(ptr, 0, 0); if (ptr != NULL) - ifree(ptr); + ifree(ptr, true); } /* @@ -1363,99 +1362,153 @@ JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = * Begin non-standard functions. 
*/ +JEMALLOC_ALWAYS_INLINE_C void +imallocx_flags_decode_hard(size_t size, int flags, size_t *usize, + size_t *alignment, bool *zero, bool *try_tcache, arena_t **arena) +{ + + if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) { + *alignment = 0; + *usize = s2u(size); + } else { + *alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags); + *usize = sa2u(size, *alignment); + } + *zero = MALLOCX_ZERO_GET(flags); + if ((flags & MALLOCX_ARENA_MASK) != 0) { + unsigned arena_ind = MALLOCX_ARENA_GET(flags); + *try_tcache = false; + *arena = arenas[arena_ind]; + } else { + *try_tcache = true; + *arena = NULL; + } +} + +JEMALLOC_ALWAYS_INLINE_C void +imallocx_flags_decode(size_t size, int flags, size_t *usize, size_t *alignment, + bool *zero, bool *try_tcache, arena_t **arena) +{ + + if (flags == 0) { + *usize = s2u(size); + assert(usize != 0); + *alignment = 0; + *zero = false; + *try_tcache = true; + *arena = NULL; + } else { + imallocx_flags_decode_hard(size, flags, usize, alignment, zero, + try_tcache, arena); + } +} + JEMALLOC_ALWAYS_INLINE_C void * -imallocx(size_t usize, size_t alignment, bool zero, bool try_tcache, +imallocx_flags(size_t usize, size_t alignment, bool zero, bool try_tcache, arena_t *arena) { - assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize, - alignment))); - if (alignment != 0) return (ipalloct(usize, alignment, zero, try_tcache, arena)); - else if (zero) + if (zero) return (icalloct(usize, try_tcache, arena)); - else - return (imalloct(usize, try_tcache, arena)); + return (imalloct(usize, try_tcache, arena)); +} + + +JEMALLOC_ALWAYS_INLINE_C void * +imallocx_maybe_flags(size_t size, int flags, size_t usize, size_t alignment, + bool zero, bool try_tcache, arena_t *arena) +{ + + if (flags == 0) + return (imalloc(size)); + return (imallocx_flags(usize, alignment, zero, try_tcache, arena)); } static void * -imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena, prof_tctx_t *tctx) +imallocx_prof_sample(size_t size, int flags, size_t usize, size_t alignment, + bool zero, bool try_tcache, arena_t *arena) { void *p; - if (tctx == NULL) - return (NULL); if (usize <= SMALL_MAXCLASS) { - size_t usize_promoted = (alignment == 0) ? - s2u(SMALL_MAXCLASS+1) : sa2u(SMALL_MAXCLASS+1, alignment); - assert(usize_promoted != 0); - p = imallocx(usize_promoted, alignment, zero, try_tcache, - arena); + assert(((alignment == 0) ? 
s2u(LARGE_MINCLASS) : + sa2u(LARGE_MINCLASS, alignment)) == LARGE_MINCLASS); + p = imalloc(LARGE_MINCLASS); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); - } else - p = imallocx(usize, alignment, zero, try_tcache, arena); + } else { + p = imallocx_maybe_flags(size, flags, usize, alignment, zero, + try_tcache, arena); + } return (p); } JEMALLOC_ALWAYS_INLINE_C void * -imallocx_prof(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena, prof_tctx_t *tctx) +imallocx_prof(size_t size, int flags, size_t *usize) { void *p; + size_t alignment; + bool zero; + bool try_tcache; + arena_t *arena; + prof_tctx_t *tctx; - if ((uintptr_t)tctx != (uintptr_t)1U) { - p = imallocx_prof_sample(usize, alignment, zero, try_tcache, - arena, tctx); + imallocx_flags_decode(size, flags, usize, &alignment, &zero, + &try_tcache, &arena); + tctx = prof_alloc_prep(*usize); + if ((uintptr_t)tctx == (uintptr_t)1U) { + p = imallocx_maybe_flags(size, flags, *usize, alignment, zero, + try_tcache, arena); + } else if ((uintptr_t)tctx > (uintptr_t)1U) { + p = imallocx_prof_sample(size, flags, *usize, alignment, zero, + try_tcache, arena); } else - p = imallocx(usize, alignment, zero, try_tcache, arena); + p = NULL; if (p == NULL) return (NULL); - prof_malloc(p, usize, tctx); + prof_malloc(p, *usize, tctx); return (p); } +JEMALLOC_ALWAYS_INLINE_C void * +imallocx_no_prof(size_t size, int flags, size_t *usize) +{ + size_t alignment; + bool zero; + bool try_tcache; + arena_t *arena; + + if (flags == 0) { + if (config_stats || (config_valgrind && in_valgrind)) + *usize = s2u(size); + return (imalloc(size)); + } + + imallocx_flags_decode_hard(size, flags, usize, &alignment, &zero, + &try_tcache, &arena); + return (imallocx_flags(*usize, alignment, zero, try_tcache, arena)); +} + void * je_mallocx(size_t size, int flags) { void *p; size_t usize; - size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK) - & (SIZE_T_MAX-1)); - bool zero = flags & MALLOCX_ZERO; - unsigned arena_ind = ((unsigned)(flags >> 8)) - 1; - arena_t *arena; - bool try_tcache; assert(size != 0); if (malloc_init()) goto label_oom; - if (arena_ind != UINT_MAX) { - arena = arenas[arena_ind]; - try_tcache = false; - } else { - arena = NULL; - try_tcache = true; - } - - usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment); - assert(usize != 0); - - if (config_prof && opt_prof) { - prof_tctx_t *tctx; - - tctx = prof_alloc_prep(usize); - p = imallocx_prof(usize, alignment, zero, try_tcache, arena, - tctx); - } else - p = imallocx(usize, alignment, zero, try_tcache, arena); + if (config_prof && opt_prof) + p = imallocx_prof(size, flags, &usize); + else + p = imallocx_no_prof(size, flags, &usize); if (p == NULL) goto label_oom; @@ -1464,7 +1517,7 @@ je_mallocx(size_t size, int flags) thread_allocated_tsd_get()->allocated += usize; } UTRACE(0, size, p); - JEMALLOC_VALGRIND_MALLOC(true, p, usize, zero); + JEMALLOC_VALGRIND_MALLOC(true, p, usize, MALLOCX_ZERO_GET(flags)); return (p); label_oom: if (config_xmalloc && opt_xmalloc) { @@ -1485,15 +1538,14 @@ irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { - p = iralloct(oldptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= - size) ? 
0 : size - (SMALL_MAXCLASS+1), alignment, zero, + p = iralloct(oldptr, LARGE_MINCLASS, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else { - p = iralloct(oldptr, size, 0, alignment, zero, - try_tcache_alloc, try_tcache_dalloc, arena); + p = iralloct(oldptr, size, alignment, zero, try_tcache_alloc, + try_tcache_dalloc, arena); } return (p); @@ -1512,8 +1564,8 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, try_tcache_alloc, try_tcache_dalloc, arena, tctx); else { - p = iralloct(oldptr, size, 0, alignment, zero, - try_tcache_alloc, try_tcache_dalloc, arena); + p = iralloct(oldptr, size, alignment, zero, try_tcache_alloc, + try_tcache_dalloc, arena); } if (p == NULL) return (NULL); @@ -1540,10 +1592,8 @@ je_rallocx(void *ptr, size_t size, int flags) void *p; size_t usize, old_usize; UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); - size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK) - & (SIZE_T_MAX-1)); + size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; - unsigned arena_ind = ((unsigned)(flags >> 8)) - 1; bool try_tcache_alloc, try_tcache_dalloc; arena_t *arena; @@ -1552,7 +1602,8 @@ je_rallocx(void *ptr, size_t size, int flags) assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); - if (arena_ind != UINT_MAX) { + if ((flags & MALLOCX_ARENA_MASK) != 0) { + unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena_chunk_t *chunk; try_tcache_alloc = false; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); @@ -1582,7 +1633,7 @@ je_rallocx(void *ptr, size_t size, int flags) if (p == NULL) goto label_oom; } else { - p = iralloct(ptr, size, 0, alignment, zero, try_tcache_alloc, + p = iralloct(ptr, size, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); if (p == NULL) goto label_oom; @@ -1677,10 +1728,8 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { size_t usize, old_usize; UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); - size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK) - & (SIZE_T_MAX-1)); + size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; - unsigned arena_ind = ((unsigned)(flags >> 8)) - 1; arena_t *arena; assert(ptr != NULL); @@ -1689,9 +1738,10 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); - if (arena_ind != UINT_MAX) + if ((flags & MALLOCX_ARENA_MASK) != 0) { + unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena = arenas[arena_ind]; - else + } else arena = NULL; old_usize = isalloc(ptr, config_prof); @@ -1753,15 +1803,13 @@ je_sallocx(const void *ptr, int flags) void je_dallocx(void *ptr, int flags) { - size_t usize; - UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); - unsigned arena_ind = ((unsigned)(flags >> 8)) - 1; bool try_tcache; assert(ptr != NULL); assert(malloc_initialized || IS_INITIALIZER); - if (arena_ind != UINT_MAX) { + if ((flags & MALLOCX_ARENA_MASK) != 0) { + unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); try_tcache = (chunk == ptr || chunk->arena != arenas[arena_ind]); @@ -1769,34 +1817,25 @@ je_dallocx(void *ptr, int flags) try_tcache = true; UTRACE(ptr, 0, 0); - if (config_stats || config_valgrind) - usize = isalloc(ptr, config_prof); - if (config_prof && opt_prof) { - if (config_stats == false && config_valgrind == false) - 
usize = isalloc(ptr, config_prof); - prof_free(ptr, usize); - } - if (config_stats) - thread_allocated_tsd_get()->deallocated += usize; - if (config_valgrind && in_valgrind) - rzsize = p2rz(ptr); - iqalloct(ptr, try_tcache); - JEMALLOC_VALGRIND_FREE(ptr, rzsize); + ifree(ptr, try_tcache); } size_t je_nallocx(size_t size, int flags) { size_t usize; - size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK) - & (SIZE_T_MAX-1)); assert(size != 0); if (malloc_init()) return (0); - usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment); + if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) + usize = s2u(size); + else { + size_t alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags); + usize = sa2u(size, alignment); + } assert(usize != 0); return (usize); } -- cgit v0.12 From 82e88d1ecfe3d7bf700355cb5023ab61559f9578 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 7 Sep 2014 19:55:03 -0700 Subject: Move typedefs from jemalloc_protos.h.in to jemalloc_typedefs.h.in. Move typedefs from jemalloc_protos.h.in to jemalloc_typedefs.h.in, so that typedefs aren't redefined when compiling stress tests. --- .gitignore | 1 + configure.ac | 3 +++ include/jemalloc/jemalloc.sh | 2 +- include/jemalloc/jemalloc_protos.h.in | 3 --- include/jemalloc/jemalloc_typedefs.h.in | 2 ++ 5 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 include/jemalloc/jemalloc_typedefs.h.in diff --git a/.gitignore b/.gitignore index ec9c0b9..79d454f 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ /include/jemalloc/jemalloc_protos.h /include/jemalloc/jemalloc_protos_jet.h /include/jemalloc/jemalloc_rename.h +/include/jemalloc/jemalloc_typedefs.h /src/*.[od] /src/*.gcda diff --git a/configure.ac b/configure.ac index 3b65885..ce4af21 100644 --- a/configure.ac +++ b/configure.ac @@ -545,6 +545,7 @@ cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in" cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in" cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_macros.h.in" cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_protos.h.in" +cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_typedefs.h.in" cfgoutputs_in="${cfgoutputs_in} include/jemalloc/internal/jemalloc_internal.h.in" cfgoutputs_in="${cfgoutputs_in} test/test.sh.in" cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in" @@ -555,6 +556,7 @@ cfgoutputs_out="${cfgoutputs_out} doc/manpages.xsl" cfgoutputs_out="${cfgoutputs_out} doc/jemalloc.xml" cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_macros.h" cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_protos.h" +cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_typedefs.h" cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_internal.h" cfgoutputs_out="${cfgoutputs_out} test/test.sh" cfgoutputs_out="${cfgoutputs_out} test/include/test/jemalloc_test.h" @@ -565,6 +567,7 @@ cfgoutputs_tup="${cfgoutputs_tup} doc/manpages.xsl:doc/manpages.xsl.in" cfgoutputs_tup="${cfgoutputs_tup} doc/jemalloc.xml:doc/jemalloc.xml.in" cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_macros.h:include/jemalloc/jemalloc_macros.h.in" cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_protos.h:include/jemalloc/jemalloc_protos.h.in" +cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_typedefs.h:include/jemalloc/jemalloc_typedefs.h.in" cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_internal.h" cfgoutputs_tup="${cfgoutputs_tup} test/test.sh:test/test.sh.in" cfgoutputs_tup="${cfgoutputs_tup} 
test/include/test/jemalloc_test.h:test/include/test/jemalloc_test.h.in" diff --git a/include/jemalloc/jemalloc.sh b/include/jemalloc/jemalloc.sh index e4738eb..7e1c8be 100755 --- a/include/jemalloc/jemalloc.sh +++ b/include/jemalloc/jemalloc.sh @@ -12,7 +12,7 @@ extern "C" { EOF for hdr in jemalloc_defs.h jemalloc_rename.h jemalloc_macros.h \ - jemalloc_protos.h jemalloc_mangle.h ; do + jemalloc_protos.h jemalloc_typedefs.h jemalloc_mangle.h ; do cat "${objroot}include/jemalloc/${hdr}" \ | grep -v 'Generated from .* by configure\.' \ | sed -e 's/^#define /#define /g' \ diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index 67268c4..59aeee1 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -44,6 +44,3 @@ JEMALLOC_EXPORT void * @je_@memalign(size_t alignment, size_t size) #ifdef JEMALLOC_OVERRIDE_VALLOC JEMALLOC_EXPORT void * @je_@valloc(size_t size) JEMALLOC_ATTR(malloc); #endif - -typedef void *(chunk_alloc_t)(size_t, size_t, bool *, unsigned); -typedef bool (chunk_dalloc_t)(void *, size_t, unsigned); diff --git a/include/jemalloc/jemalloc_typedefs.h.in b/include/jemalloc/jemalloc_typedefs.h.in new file mode 100644 index 0000000..47e57ca --- /dev/null +++ b/include/jemalloc/jemalloc_typedefs.h.in @@ -0,0 +1,2 @@ +typedef void *(chunk_alloc_t)(size_t, size_t, bool *, unsigned); +typedef bool (chunk_dalloc_t)(void *, size_t, unsigned); -- cgit v0.12 From b67ec3c4973e8f7ca272c13472aa98c8a3ba4de4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 7 Sep 2014 19:57:24 -0700 Subject: Add a simple timer implementation for use in benchmarking. --- Makefile.in | 2 +- test/include/test/jemalloc_test.h.in | 2 ++ test/include/test/timer.h | 15 ++++++++++ test/src/timer.c | 57 ++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 test/include/test/timer.h create mode 100644 test/src/timer.c diff --git a/Makefile.in b/Makefile.in index b5f0ee9..d3e91b5 100644 --- a/Makefile.in +++ b/Makefile.in @@ -108,7 +108,7 @@ DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.3) DOCS := $(DOCS_HTML) $(DOCS_MAN3) C_TESTLIB_SRCS := $(srcroot)test/src/math.c $(srcroot)test/src/mtx.c \ $(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \ - $(srcroot)test/src/thd.c + $(srcroot)test/src/thd.c $(srcroot)test/src/timer.c C_UTIL_INTEGRATION_SRCS := $(srcroot)src/util.c TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/bitmap.c \ diff --git a/test/include/test/jemalloc_test.h.in b/test/include/test/jemalloc_test.h.in index 730a55d..a93c4f6 100644 --- a/test/include/test/jemalloc_test.h.in +++ b/test/include/test/jemalloc_test.h.in @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef _WIN32 # include @@ -136,6 +137,7 @@ #include "test/mtx.h" #include "test/mq.h" #include "test/test.h" +#include "test/timer.h" #include "test/thd.h" #define MEXP 19937 #include "test/SFMT.h" diff --git a/test/include/test/timer.h b/test/include/test/timer.h new file mode 100644 index 0000000..f21ccf1 --- /dev/null +++ b/test/include/test/timer.h @@ -0,0 +1,15 @@ +/* + * Simple timer, for use in benchmark reporting. 
+ */ + +#include + +typedef struct { + struct timeval tv0; + struct timeval tv1; +} timer_t; + +void timer_start(timer_t *timer); +void timer_stop(timer_t *timer); +uint64_t timer_usec(const timer_t *timer); +void timer_ratio(timer_t *a, timer_t *b, char *buf, size_t buflen); diff --git a/test/src/timer.c b/test/src/timer.c new file mode 100644 index 0000000..17ead17 --- /dev/null +++ b/test/src/timer.c @@ -0,0 +1,57 @@ +#include "test/jemalloc_test.h" + +void +timer_start(timer_t *timer) +{ + + gettimeofday(&timer->tv0, NULL); +} + +void +timer_stop(timer_t *timer) +{ + + gettimeofday(&timer->tv1, NULL); +} + +uint64_t +timer_usec(const timer_t *timer) +{ + + return (((timer->tv1.tv_sec - timer->tv0.tv_sec) * 1000000) + + timer->tv1.tv_usec - timer->tv0.tv_usec); +} + +void +timer_ratio(timer_t *a, timer_t *b, char *buf, size_t buflen) +{ + uint64_t t0 = timer_usec(a); + uint64_t t1 = timer_usec(b); + uint64_t mult; + unsigned i = 0; + unsigned j; + int n; + + /* Whole. */ + n = malloc_snprintf(&buf[i], buflen-i, "%"PRIu64, t0 / t1); + i += n; + if (i >= buflen) + return; + mult = 1; + for (j = 0; j < n; j++) + mult *= 10; + + /* Decimal. */ + n = malloc_snprintf(&buf[i], buflen-i, "."); + i += n; + + /* Fraction. */ + while (i < buflen-1) { + uint64_t round = (i+1 == buflen-1 && ((t0 * mult * 10 / t1) % 10 + >= 5)) ? 1 : 0; + n = malloc_snprintf(&buf[i], buflen-i, + "%"PRIu64, (t0 * mult / t1) % 10 + round); + i += n; + mult *= 10; + } +} -- cgit v0.12 From 423d78a21bc6c9a038bdf436ad2cee194560d488 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 7 Sep 2014 19:58:04 -0700 Subject: Add microbench tests. --- Makefile.in | 2 +- test/stress/microbench.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 test/stress/microbench.c diff --git a/Makefile.in b/Makefile.in index d3e91b5..1446dbe 100644 --- a/Makefile.in +++ b/Makefile.in @@ -144,7 +144,7 @@ TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/thread_tcache_enabled.c \ $(srcroot)test/integration/xallocx.c \ $(srcroot)test/integration/chunk.c -TESTS_STRESS := +TESTS_STRESS := $(srcroot)test/stress/microbench.c TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_STRESS) C_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.$(O)) diff --git a/test/stress/microbench.c b/test/stress/microbench.c new file mode 100644 index 0000000..8c25215 --- /dev/null +++ b/test/stress/microbench.c @@ -0,0 +1,142 @@ +#include "test/jemalloc_test.h" + +JEMALLOC_INLINE_C void +time_func(timer_t *timer, uint64_t nwarmup, uint64_t niter, void (*func)(void)) +{ + uint64_t i; + + for (i = 0; i < nwarmup; i++) + func(); + timer_start(timer); + for (i = 0; i < niter; i++) + func(); + timer_stop(timer); +} + +void +compare_funcs(uint64_t nwarmup, uint64_t niter, const char *name_a, + void (*func_a), const char *name_b, void (*func_b)) +{ + timer_t timer_a, timer_b; + char ratio_buf[6]; + + time_func(&timer_a, nwarmup, niter, func_a); + time_func(&timer_b, nwarmup, niter, func_b); + + timer_ratio(&timer_a, &timer_b, ratio_buf, sizeof(ratio_buf)); + malloc_printf("%"PRIu64" iterations, %s=%"PRIu64"us, " + "%s=%"PRIu64"us, ratio=1:%s\n", + niter, name_a, timer_usec(&timer_a), name_b, timer_usec(&timer_b), + ratio_buf); +} + +static void +malloc_vs_mallocx_malloc(void) +{ + + free(malloc(1)); +} + +static void +malloc_vs_mallocx_mallocx(void) +{ + + free(mallocx(1, 0)); +} + +TEST_BEGIN(test_malloc_vs_mallocx) +{ + + compare_funcs(10*1000*1000, 100*1000*1000, 
"malloc", + malloc_vs_mallocx_malloc, "mallocx", malloc_vs_mallocx_mallocx); +} +TEST_END + +static void +free_vs_dallocx_free(void) +{ + + free(malloc(1)); +} + +static void +free_vs_dallocx_dallocx(void) +{ + + dallocx(malloc(1), 0); +} + +TEST_BEGIN(test_free_vs_dallocx) +{ + + compare_funcs(10*1000*1000, 100*1000*1000, "free", free_vs_dallocx_free, + "dallocx", free_vs_dallocx_dallocx); +} +TEST_END + +static void +mus_vs_sallocx_mus(void) +{ + void *p; + + p = malloc(1); + malloc_usable_size(p); + free(p); +} + +static void +mus_vs_sallocx_sallocx(void) +{ + void *p; + + p = malloc(1); + sallocx(p, 0); + free(p); +} + +TEST_BEGIN(test_mus_vs_sallocx) +{ + + compare_funcs(10*1000*1000, 100*1000*1000, "malloc_usable_size", + mus_vs_sallocx_mus, "sallocx", mus_vs_sallocx_sallocx); +} +TEST_END + +static void +sallocx_vs_nallocx_sallocx(void) +{ + void *p; + + p = malloc(1); + sallocx(p, 0); + free(p); +} + +static void +sallocx_vs_nallocx_nallocx(void) +{ + void *p; + + p = malloc(1); + nallocx(1, 0); + free(p); +} + +TEST_BEGIN(test_sallocx_vs_nallocx) +{ + + compare_funcs(10*1000*1000, 100*1000*1000, "sallocx", + sallocx_vs_nallocx_sallocx, "nallocx", sallocx_vs_nallocx_nallocx); +} +TEST_END + +int +main(void) +{ + + return (test( + test_malloc_vs_mallocx, + test_free_vs_dallocx, + test_mus_vs_sallocx, + test_sallocx_vs_nallocx)); +} -- cgit v0.12 From c3bfe9569a9927dc881b6d8ac025c423d66a541f Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 8 Sep 2014 00:46:12 -0400 Subject: avoid conflict with the POSIX timer_t type It hits a compilation error with glibc 2.19 without a rename. --- test/include/test/timer.h | 10 +++++----- test/src/timer.c | 8 ++++---- test/stress/microbench.c | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/include/test/timer.h b/test/include/test/timer.h index f21ccf1..6877e4a 100644 --- a/test/include/test/timer.h +++ b/test/include/test/timer.h @@ -7,9 +7,9 @@ typedef struct { struct timeval tv0; struct timeval tv1; -} timer_t; +} timedelta_t; -void timer_start(timer_t *timer); -void timer_stop(timer_t *timer); -uint64_t timer_usec(const timer_t *timer); -void timer_ratio(timer_t *a, timer_t *b, char *buf, size_t buflen); +void timer_start(timedelta_t *timer); +void timer_stop(timedelta_t *timer); +uint64_t timer_usec(const timedelta_t *timer); +void timer_ratio(timedelta_t *a, timedelta_t *b, char *buf, size_t buflen); diff --git a/test/src/timer.c b/test/src/timer.c index 17ead17..36fbedd 100644 --- a/test/src/timer.c +++ b/test/src/timer.c @@ -1,21 +1,21 @@ #include "test/jemalloc_test.h" void -timer_start(timer_t *timer) +timer_start(timedelta_t *timer) { gettimeofday(&timer->tv0, NULL); } void -timer_stop(timer_t *timer) +timer_stop(timedelta_t *timer) { gettimeofday(&timer->tv1, NULL); } uint64_t -timer_usec(const timer_t *timer) +timer_usec(const timedelta_t *timer) { return (((timer->tv1.tv_sec - timer->tv0.tv_sec) * 1000000) + @@ -23,7 +23,7 @@ timer_usec(const timer_t *timer) } void -timer_ratio(timer_t *a, timer_t *b, char *buf, size_t buflen) +timer_ratio(timedelta_t *a, timedelta_t *b, char *buf, size_t buflen) { uint64_t t0 = timer_usec(a); uint64_t t1 = timer_usec(b); diff --git a/test/stress/microbench.c b/test/stress/microbench.c index 8c25215..616f361 100644 --- a/test/stress/microbench.c +++ b/test/stress/microbench.c @@ -1,7 +1,7 @@ #include "test/jemalloc_test.h" JEMALLOC_INLINE_C void -time_func(timer_t *timer, uint64_t nwarmup, uint64_t niter, void (*func)(void)) +time_func(timedelta_t *timer, uint64_t 
nwarmup, uint64_t niter, void (*func)(void)) { uint64_t i; @@ -17,7 +17,7 @@ void compare_funcs(uint64_t nwarmup, uint64_t niter, const char *name_a, void (*func_a), const char *name_b, void (*func_b)) { - timer_t timer_a, timer_b; + timedelta_t timer_a, timer_b; char ratio_buf[6]; time_func(&timer_a, nwarmup, niter, func_a); -- cgit v0.12 From a1f3929ffd1bd958734a2747cf2000a9b2a5db0b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 8 Sep 2014 16:23:48 -0700 Subject: Thwart optimization of free(malloc(1)) in microbench. --- test/stress/microbench.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/test/stress/microbench.c b/test/stress/microbench.c index 616f361..8e1017c 100644 --- a/test/stress/microbench.c +++ b/test/stress/microbench.c @@ -31,46 +31,52 @@ compare_funcs(uint64_t nwarmup, uint64_t niter, const char *name_a, } static void -malloc_vs_mallocx_malloc(void) +malloc_free(void) { - - free(malloc(1)); + /* The compiler can optimize away free(malloc(1))! */ + void *p = malloc(1); + if (p == NULL) { + test_fail("Unexpected malloc() failure"); + return; + } + free(p); } static void -malloc_vs_mallocx_mallocx(void) +mallocx_free(void) { - - free(mallocx(1, 0)); + void *p = mallocx(1, 0); + if (p == NULL) { + test_fail("Unexpected mallocx() failure"); + return; + } + free(p); } TEST_BEGIN(test_malloc_vs_mallocx) { compare_funcs(10*1000*1000, 100*1000*1000, "malloc", - malloc_vs_mallocx_malloc, "mallocx", malloc_vs_mallocx_mallocx); + malloc_free, "mallocx", mallocx_free); } TEST_END static void -free_vs_dallocx_free(void) -{ - - free(malloc(1)); -} - -static void -free_vs_dallocx_dallocx(void) +malloc_dallocx(void) { - - dallocx(malloc(1), 0); + void *p = malloc(1); + if (p == NULL) { + test_fail("Unexpected malloc() failure"); + return; + } + dallocx(p, 0); } TEST_BEGIN(test_free_vs_dallocx) { - compare_funcs(10*1000*1000, 100*1000*1000, "free", free_vs_dallocx_free, - "dallocx", free_vs_dallocx_dallocx); + compare_funcs(10*1000*1000, 100*1000*1000, "free", malloc_free, + "dallocx", malloc_dallocx); } TEST_END -- cgit v0.12 From c3f865074923bf388742da3ec52dca857a0960a2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 8 Sep 2014 16:47:51 -0700 Subject: Add relevant function attributes to [msn]allocx(). 
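For context, the attribute macros added in this commit typically amount to GCC-style function attributes; the exact expansion of JEMALLOC_ATTR() depends on configure detection, so the spellings below are an assumption rather than a quote of the generated header. A minimal sketch of what the compiler is being told, and why the microbenchmark bodies now have to consume their results:

    #include <stddef.h>
    #include <stdlib.h>

    /*
     * Hypothetical expansion on a GCC-compatible compiler:
     *   JEMALLOC_ATTR(malloc) -> __attribute__((malloc)) (result aliases nothing)
     *   JEMALLOC_ATTR(pure)   -> __attribute__((pure))   (no side effects; value only)
     */
    void    *mallocx(size_t size, int flags) __attribute__((malloc));
    size_t  nallocx(size_t size, int flags) __attribute__((pure));

    void
    attr_example(void)
    {
        size_t n;

        n = nallocx(1, 0);      /* Kept: the result is inspected below. */
        if (n < 1)
            abort();
        nallocx(1, 0);          /* Pure call, unused result: may be elided. */
    }

This is why the updated benchmarks check the sallocx()/nallocx() return values instead of discarding them; otherwise an optimizing compiler could remove the very calls being timed.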
--- include/jemalloc/jemalloc_protos.h.in | 9 ++++++--- test/stress/microbench.c | 26 +++++++++----------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index 59aeee1..b365eb4 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -17,13 +17,16 @@ JEMALLOC_EXPORT void *@je_@aligned_alloc(size_t alignment, size_t size) JEMALLOC_EXPORT void *@je_@realloc(void *ptr, size_t size); JEMALLOC_EXPORT void @je_@free(void *ptr); -JEMALLOC_EXPORT void *@je_@mallocx(size_t size, int flags); +JEMALLOC_EXPORT void *@je_@mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc); JEMALLOC_EXPORT void *@je_@rallocx(void *ptr, size_t size, int flags); JEMALLOC_EXPORT size_t @je_@xallocx(void *ptr, size_t size, size_t extra, int flags); -JEMALLOC_EXPORT size_t @je_@sallocx(const void *ptr, int flags); +JEMALLOC_EXPORT size_t @je_@sallocx(const void *ptr, int flags) + JEMALLOC_ATTR(pure); JEMALLOC_EXPORT void @je_@dallocx(void *ptr, int flags); -JEMALLOC_EXPORT size_t @je_@nallocx(size_t size, int flags); +JEMALLOC_EXPORT size_t @je_@nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); JEMALLOC_EXPORT int @je_@mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen); diff --git a/test/stress/microbench.c b/test/stress/microbench.c index 8e1017c..60c02db 100644 --- a/test/stress/microbench.c +++ b/test/stress/microbench.c @@ -81,7 +81,7 @@ TEST_BEGIN(test_free_vs_dallocx) TEST_END static void -mus_vs_sallocx_mus(void) +malloc_mus_free(void) { void *p; @@ -91,12 +91,13 @@ mus_vs_sallocx_mus(void) } static void -mus_vs_sallocx_sallocx(void) +malloc_sallocx_free(void) { void *p; p = malloc(1); - sallocx(p, 0); + if (sallocx(p, 0) < 1) + test_fail("Unexpected sallocx() failure"); free(p); } @@ -104,27 +105,18 @@ TEST_BEGIN(test_mus_vs_sallocx) { compare_funcs(10*1000*1000, 100*1000*1000, "malloc_usable_size", - mus_vs_sallocx_mus, "sallocx", mus_vs_sallocx_sallocx); + malloc_mus_free, "sallocx", malloc_sallocx_free); } TEST_END static void -sallocx_vs_nallocx_sallocx(void) +malloc_nallocx_free(void) { void *p; p = malloc(1); - sallocx(p, 0); - free(p); -} - -static void -sallocx_vs_nallocx_nallocx(void) -{ - void *p; - - p = malloc(1); - nallocx(1, 0); + if (nallocx(1, 0) < 1) + test_fail("Unexpected nallocx() failure"); free(p); } @@ -132,7 +124,7 @@ TEST_BEGIN(test_sallocx_vs_nallocx) { compare_funcs(10*1000*1000, 100*1000*1000, "sallocx", - sallocx_vs_nallocx_sallocx, "nallocx", sallocx_vs_nallocx_nallocx); + malloc_sallocx_free, "nallocx", malloc_nallocx_free); } TEST_END -- cgit v0.12 From 4cfe55166e0173be745c53adb0fecf50d11d1227 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 28 Aug 2014 15:41:48 -0400 Subject: Add support for sized deallocation. This adds a new `sdallocx` function to the external API, allowing the size to be passed by the caller. It avoids some extra reads in the thread cache fast path. In the case where stats are enabled, this avoids the work of calculating the size from the pointer. An assertion validates the size that's passed in, so enabling debugging will allow users of the API to debug cases where an incorrect size is passed in. The performance win for a contrived microbenchmark doing an allocation and immediately freeing it is ~10%. It may have a different impact on a real workload. 
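As a caller-side illustration of the interface described above, a minimal usage sketch (assuming the standard installed <jemalloc/jemalloc.h> header and unprefixed entry points; any size between the originally requested size and the corresponding nallocx()/sallocx() value is valid):

    #include <jemalloc/jemalloc.h>

    void
    sized_dalloc_example(void)
    {
        size_t sz = 100;
        void *p = mallocx(sz, 0);

        if (p == NULL)
            return;
        /* ... use p ... */

        /* Pass the requested size back; no size lookup from metadata. */
        sdallocx(p, sz, 0);

        /* The usable size for the same size/flags is equally valid. */
        p = mallocx(sz, 0);
        if (p != NULL)
            sdallocx(p, nallocx(sz, 0), 0);
    }

The flags passed to sdallocx() should describe the allocation the same way they did at allocation time (notably MALLOCX_ALIGN()), since the size/flags pair is what the debug assertion and the fast path reason about.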
Closes #28 --- Makefile.in | 1 + configure.ac | 2 +- doc/jemalloc.xml.in | 19 +++++++- include/jemalloc/internal/arena.h | 33 ++++++++++++-- include/jemalloc/internal/jemalloc_internal.h.in | 26 +++++++++++ include/jemalloc/internal/private_symbols.txt | 3 ++ include/jemalloc/jemalloc_protos.h.in | 1 + src/jemalloc.c | 44 ++++++++++++++++++ test/integration/sdallocx.c | 57 ++++++++++++++++++++++++ test/stress/microbench.c | 20 +++++++++ 10 files changed, 201 insertions(+), 5 deletions(-) create mode 100644 test/integration/sdallocx.c diff --git a/Makefile.in b/Makefile.in index 1446dbe..ac56d8f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -136,6 +136,7 @@ TESTS_UNIT_AUX := $(srcroot)test/unit/prof_accum_a.c \ $(srcroot)test/unit/prof_accum_b.c TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/allocated.c \ + $(srcroot)test/integration/sdallocx.c \ $(srcroot)test/integration/mallocx.c \ $(srcroot)test/integration/MALLOCX_ARENA.c \ $(srcroot)test/integration/posix_memalign.c \ diff --git a/configure.ac b/configure.ac index ce4af21..d221876 100644 --- a/configure.ac +++ b/configure.ac @@ -452,7 +452,7 @@ AC_PROG_RANLIB AC_PATH_PROG([LD], [ld], [false], [$PATH]) AC_PATH_PROG([AUTOCONF], [autoconf], [false], [$PATH]) -public_syms="malloc_conf malloc_message malloc calloc posix_memalign aligned_alloc realloc free mallocx rallocx xallocx sallocx dallocx nallocx mallctl mallctlnametomib mallctlbymib malloc_stats_print malloc_usable_size" +public_syms="malloc_conf malloc_message malloc calloc posix_memalign aligned_alloc realloc free mallocx rallocx xallocx sallocx dallocx sdallocx nallocx mallctl mallctlnametomib mallctlbymib malloc_stats_print malloc_usable_size" dnl Check for allocator-related functions that should be wrapped. AC_CHECK_FUNC([memalign], diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 8f4327f..e5c229f 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -38,6 +38,7 @@ xallocx sallocx dallocx + sdallocx nallocx mallctl mallctlnametomib @@ -121,6 +122,12 @@ int flags + void sdallocx + void *ptr + size_t size + int flags + + size_t nallocx size_t size int flags @@ -228,7 +235,8 @@ rallocx, xallocx, sallocx, - dallocx, and + dallocx, + sdallocx, and nallocx functions all have a flags argument that can be used to specify options. The functions only check the options that are contextually @@ -312,6 +320,15 @@ memory referenced by ptr to be made available for future allocations. + The sdallocx function is an + extension of dallocx with a + size parameter to allow the caller to pass in the + allocation size as an optimization. The minimum valid input size is the + original requested size of the allocation, and the maximum valid input + size is the corresponding value returned by + nallocx or + sallocx. 
+ The nallocx function allocates no memory, but it performs the same size computation as the mallocx function, and returns the real diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 166d052..6ab0ae7 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -488,6 +488,7 @@ void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); +void arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) @@ -1139,9 +1140,7 @@ arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) if ((mapbits & CHUNK_MAP_LARGE) == 0) { /* Small allocation. */ if (try_tcache && (tcache = tcache_get(false)) != NULL) { - size_t binind; - - binind = arena_ptr_small_binind_get(ptr, mapbits); + size_t binind = arena_ptr_small_binind_get(ptr, mapbits); tcache_dalloc_small(tcache, ptr, binind); } else arena_dalloc_small(chunk->arena, chunk, ptr, pageind); @@ -1157,6 +1156,34 @@ arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) arena_dalloc_large(chunk->arena, chunk, ptr); } } + +JEMALLOC_ALWAYS_INLINE void +arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache) +{ + tcache_t *tcache; + + assert(ptr != NULL); + assert(CHUNK_ADDR2BASE(ptr) != ptr); + + if (size < PAGE) { + /* Small allocation. */ + if (try_tcache && (tcache = tcache_get(false)) != NULL) { + size_t binind = small_size2bin(size); + tcache_dalloc_small(tcache, ptr, binind); + } else { + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + arena_dalloc_small(chunk->arena, chunk, ptr, pageind); + } + } else { + assert(((uintptr_t)ptr & PAGE_MASK) == 0); + + if (try_tcache && size <= tcache_maxclass && (tcache = + tcache_get(false)) != NULL) { + tcache_dalloc_large(tcache, ptr, size); + } else + arena_dalloc_large(chunk->arena, chunk, ptr); + } +} # endif /* JEMALLOC_ARENA_INLINE_C */ #endif diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 59ae8d5..c0e326d 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -634,8 +634,10 @@ size_t ivsalloc(const void *ptr, bool demote); size_t u2rz(size_t usize); size_t p2rz(const void *ptr); void idalloct(void *ptr, bool try_tcache); +void isdalloct(void *ptr, size_t size, bool try_tcache); void idalloc(void *ptr); void iqalloc(void *ptr, bool try_tcache); +void isqalloc(void *ptr, size_t size, bool try_tcache); void *iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); @@ -788,6 +790,20 @@ idalloct(void *ptr, bool try_tcache) } JEMALLOC_ALWAYS_INLINE void +isdalloct(void *ptr, size_t size, bool try_tcache) +{ + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) + arena_sdalloc(chunk, ptr, size, try_tcache); + else + huge_dalloc(ptr); +} + +JEMALLOC_ALWAYS_INLINE void idalloc(void *ptr) { @@ -804,6 +820,16 @@ iqalloc(void *ptr, bool try_tcache) idalloct(ptr, try_tcache); } +JEMALLOC_ALWAYS_INLINE void +isqalloc(void *ptr, size_t size, bool try_tcache) +{ + + if (config_fill && opt_quarantine) + 
quarantine(ptr); + else + idalloct(ptr, try_tcache); +} + JEMALLOC_ALWAYS_INLINE void * iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 84f0591..3b990b0 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -61,6 +61,7 @@ arena_ralloc_no_move arena_redzone_corruption arena_run_regind arena_salloc +arena_sdalloc arena_stats_merge arena_tcache_fill_small arenas @@ -228,7 +229,9 @@ iralloc iralloct iralloct_realign isalloc +isdalloct isthreaded +isqalloc ivsalloc ixalloc jemalloc_postfork_child diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index b365eb4..f81adc1 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -25,6 +25,7 @@ JEMALLOC_EXPORT size_t @je_@xallocx(void *ptr, size_t size, size_t extra, JEMALLOC_EXPORT size_t @je_@sallocx(const void *ptr, int flags) JEMALLOC_ATTR(pure); JEMALLOC_EXPORT void @je_@dallocx(void *ptr, int flags); +JEMALLOC_EXPORT void @je_@sdallocx(void *ptr, size_t size, int flags); JEMALLOC_EXPORT size_t @je_@nallocx(size_t size, int flags) JEMALLOC_ATTR(pure); diff --git a/src/jemalloc.c b/src/jemalloc.c index 71e921b..527782e 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1223,6 +1223,24 @@ ifree(void *ptr, bool try_tcache) JEMALLOC_VALGRIND_FREE(ptr, rzsize); } +JEMALLOC_INLINE_C void +isfree(void *ptr, size_t usize, bool try_tcache) +{ + UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); + + assert(ptr != NULL); + assert(malloc_initialized || IS_INITIALIZER); + + if (config_prof && opt_prof) + prof_free(ptr, usize); + if (config_stats) + thread_allocated_tsd_get()->deallocated += usize; + if (config_valgrind && in_valgrind) + rzsize = p2rz(ptr); + isqalloc(ptr, usize, try_tcache); + JEMALLOC_VALGRIND_FREE(ptr, rzsize); +} + void * je_realloc(void *ptr, size_t size) { @@ -1820,6 +1838,32 @@ je_dallocx(void *ptr, int flags) ifree(ptr, try_tcache); } +void +je_sdallocx(void *ptr, size_t size, int flags) +{ + bool try_tcache; + + assert(ptr != NULL); + assert(malloc_initialized || IS_INITIALIZER); + assert(size == isalloc(ptr, config_prof)); + + if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) + size = s2u(size); + else + size = sa2u(size, MALLOCX_ALIGN_GET_SPECIFIED(flags)); + + if ((flags & MALLOCX_ARENA_MASK) != 0) { + unsigned arena_ind = MALLOCX_ARENA_GET(flags); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + try_tcache = (chunk == ptr || chunk->arena != + arenas[arena_ind]); + } else + try_tcache = true; + + UTRACE(ptr, 0, 0); + isfree(ptr, size, try_tcache); +} + size_t je_nallocx(size_t size, int flags) { diff --git a/test/integration/sdallocx.c b/test/integration/sdallocx.c new file mode 100644 index 0000000..b84817d --- /dev/null +++ b/test/integration/sdallocx.c @@ -0,0 +1,57 @@ +#include "test/jemalloc_test.h" + +#define MAXALIGN (((size_t)1) << 25) +#define NITER 4 + +TEST_BEGIN(test_basic) +{ + void *ptr = mallocx(64, 0); + sdallocx(ptr, 64, 0); +} +TEST_END + +TEST_BEGIN(test_alignment_and_size) +{ + size_t nsz, sz, alignment, total; + unsigned i; + void *ps[NITER]; + + for (i = 0; i < NITER; i++) + ps[i] = NULL; + + for (alignment = 8; + alignment <= MAXALIGN; + alignment <<= 1) { + total = 0; + for (sz = 1; + sz < 3 * alignment && sz < (1U << 31); + sz += (alignment >> 
(LG_SIZEOF_PTR-1)) - 1) { + for (i = 0; i < NITER; i++) { + nsz = nallocx(sz, MALLOCX_ALIGN(alignment) | + MALLOCX_ZERO); + ps[i] = mallocx(sz, MALLOCX_ALIGN(alignment) | + MALLOCX_ZERO); + total += nsz; + if (total >= (MAXALIGN << 1)) + break; + } + for (i = 0; i < NITER; i++) { + if (ps[i] != NULL) { + sdallocx(ps[i], sz, + MALLOCX_ALIGN(alignment)); + ps[i] = NULL; + } + } + } + } +} +TEST_END + +int +main(void) +{ + + return (test( + test_basic, + test_alignment_and_size)); +} diff --git a/test/stress/microbench.c b/test/stress/microbench.c index 60c02db..a8267c3 100644 --- a/test/stress/microbench.c +++ b/test/stress/microbench.c @@ -72,6 +72,17 @@ malloc_dallocx(void) dallocx(p, 0); } +static void +malloc_sdallocx(void) +{ + void *p = malloc(1); + if (p == NULL) { + test_fail("Unexpected malloc() failure"); + return; + } + sdallocx(p, 1, 0); +} + TEST_BEGIN(test_free_vs_dallocx) { @@ -80,6 +91,14 @@ TEST_BEGIN(test_free_vs_dallocx) } TEST_END +TEST_BEGIN(test_dallocx_vs_sdallocx) +{ + + compare_funcs(10*1000*1000, 100*1000*1000, "dallocx", malloc_dallocx, + "sdallocx", malloc_sdallocx); +} +TEST_END + static void malloc_mus_free(void) { @@ -135,6 +154,7 @@ main(void) return (test( test_malloc_vs_mallocx, test_free_vs_dallocx, + test_dallocx_vs_sdallocx, test_mus_vs_sallocx, test_sallocx_vs_nallocx)); } -- cgit v0.12 From a62812eacca8ac3ce81f27c9480b44b2a97ff66c Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 8 Sep 2014 21:43:21 -0400 Subject: fix isqalloct (should call isdalloct) --- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index c0e326d..81d46fc 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -827,7 +827,7 @@ isqalloc(void *ptr, size_t size, bool try_tcache) if (config_fill && opt_quarantine) quarantine(ptr); else - idalloct(ptr, try_tcache); + isdalloct(ptr, size, try_tcache); } JEMALLOC_ALWAYS_INLINE void * -- cgit v0.12 From d95e704feadd44cc6d9eb8695b9cff7ac6d4c88f Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 5 Sep 2014 14:10:37 -0700 Subject: Support threaded heap profiles in pprof - Add a --thread N option to select profile for thread N (otherwise, all threads will be printed) - The $profile map now has a {threads} element that is a map from thread id to a profile that has the same format as the {profile} element - Refactor ReadHeapProfile into smaller components and use them to implement ReadThreadedHeapProfile --- bin/pprof | 377 +++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 251 insertions(+), 126 deletions(-) diff --git a/bin/pprof b/bin/pprof index 328138c..52da600 100755 --- a/bin/pprof +++ b/bin/pprof @@ -2,11 +2,11 @@ # Copyright (c) 1998-2007, Google Inc. # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: -# +# # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above @@ -16,7 +16,7 @@ # * Neither the name of Google Inc. nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. 
-# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -223,6 +223,7 @@ Call-graph Options: --edgefraction= Hide edges below *total [default=.001] --maxdegree= Max incoming/outgoing edges per node [default=8] --focus= Focus on nodes matching + --thread= Show profile for thread --ignore= Ignore nodes matching --scale= Set GV scaling [default=0] --heapcheck Make nodes with non-0 object counts @@ -332,6 +333,7 @@ sub Init() { $main::opt_edgefraction = 0.001; $main::opt_maxdegree = 8; $main::opt_focus = ''; + $main::opt_thread = undef; $main::opt_ignore = ''; $main::opt_scale = 0; $main::opt_heapcheck = 0; @@ -402,6 +404,7 @@ sub Init() { "edgefraction=f" => \$main::opt_edgefraction, "maxdegree=i" => \$main::opt_maxdegree, "focus=s" => \$main::opt_focus, + "thread=i" => \$main::opt_thread, "ignore=s" => \$main::opt_ignore, "scale=i" => \$main::opt_scale, "heapcheck" => \$main::opt_heapcheck, @@ -562,66 +565,12 @@ sub Init() { } } -sub Main() { - Init(); - $main::collected_profile = undef; - @main::profile_files = (); - $main::op_time = time(); - - # Printing symbols is special and requires a lot less info that most. - if ($main::opt_symbols) { - PrintSymbols(*STDIN); # Get /proc/maps and symbols output from stdin - return; - } - - # Fetch all profile data - FetchDynamicProfiles(); - - # this will hold symbols that we read from the profile files - my $symbol_map = {}; - - # Read one profile, pick the last item on the list - my $data = ReadProfile($main::prog, pop(@main::profile_files)); - my $profile = $data->{profile}; - my $pcs = $data->{pcs}; - my $libs = $data->{libs}; # Info about main program and shared libraries - $symbol_map = MergeSymbols($symbol_map, $data->{symbols}); - - # Add additional profiles, if available. - if (scalar(@main::profile_files) > 0) { - foreach my $pname (@main::profile_files) { - my $data2 = ReadProfile($main::prog, $pname); - $profile = AddProfile($profile, $data2->{profile}); - $pcs = AddPcs($pcs, $data2->{pcs}); - $symbol_map = MergeSymbols($symbol_map, $data2->{symbols}); - } - } - - # Subtract base from profile, if specified - if ($main::opt_base ne '') { - my $base = ReadProfile($main::prog, $main::opt_base); - $profile = SubtractProfile($profile, $base->{profile}); - $pcs = AddPcs($pcs, $base->{pcs}); - $symbol_map = MergeSymbols($symbol_map, $base->{symbols}); - } +sub FilterAndPrint { + my ($profile, $symbols, $libs, $thread) = @_; # Get total data in profile my $total = TotalProfile($profile); - # Collect symbols - my $symbols; - if ($main::use_symbolized_profile) { - $symbols = FetchSymbols($pcs, $symbol_map); - } elsif ($main::use_symbol_page) { - $symbols = FetchSymbols($pcs); - } else { - # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, - # which may differ from the data from subsequent profiles, especially - # if they were run on different machines. Use appropriate libs for - # each pc somehow. - $symbols = ExtractSymbols($libs, $pcs); - } - # Remove uniniteresting stack items $profile = RemoveUninterestingFrames($symbols, $profile); @@ -656,7 +605,9 @@ sub Main() { # (only matters when --heapcheck is given but we must be # compatible with old branches that did not pass --heapcheck always): if ($total != 0) { - printf("Total: %s %s\n", Unparse($total), Units()); + printf("Total%s: %s %s\n", + (defined($thread) ? 
" (t$thread)" : ""), + Unparse($total), Units()); } PrintText($symbols, $flat, $cumulative, -1); } elsif ($main::opt_raw) { @@ -692,6 +643,76 @@ sub Main() { } else { InteractiveMode($profile, $symbols, $libs, $total); } +} + +sub Main() { + Init(); + $main::collected_profile = undef; + @main::profile_files = (); + $main::op_time = time(); + + # Printing symbols is special and requires a lot less info that most. + if ($main::opt_symbols) { + PrintSymbols(*STDIN); # Get /proc/maps and symbols output from stdin + return; + } + + # Fetch all profile data + FetchDynamicProfiles(); + + # this will hold symbols that we read from the profile files + my $symbol_map = {}; + + # Read one profile, pick the last item on the list + my $data = ReadProfile($main::prog, pop(@main::profile_files)); + my $profile = $data->{profile}; + my $pcs = $data->{pcs}; + my $libs = $data->{libs}; # Info about main program and shared libraries + $symbol_map = MergeSymbols($symbol_map, $data->{symbols}); + + # Add additional profiles, if available. + if (scalar(@main::profile_files) > 0) { + foreach my $pname (@main::profile_files) { + my $data2 = ReadProfile($main::prog, $pname); + $profile = AddProfile($profile, $data2->{profile}); + $pcs = AddPcs($pcs, $data2->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $data2->{symbols}); + } + } + + # Subtract base from profile, if specified + if ($main::opt_base ne '') { + my $base = ReadProfile($main::prog, $main::opt_base); + $profile = SubtractProfile($profile, $base->{profile}); + $pcs = AddPcs($pcs, $base->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $base->{symbols}); + } + + # Collect symbols + my $symbols; + if ($main::use_symbolized_profile) { + $symbols = FetchSymbols($pcs, $symbol_map); + } elsif ($main::use_symbol_page) { + $symbols = FetchSymbols($pcs); + } else { + # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, + # which may differ from the data from subsequent profiles, especially + # if they were run on different machines. Use appropriate libs for + # each pc somehow. + $symbols = ExtractSymbols($libs, $pcs); + } + + if (!defined($main::opt_thread)) { + FilterAndPrint($profile, $symbols, $libs); + } + if (defined($data->{threads})) { + foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) { + if (!defined($main::opt_thread) || $main::opt_thread == $thread) { + my $thread_profile = $data->{threads}{$thread}; + FilterAndPrint($thread_profile, $symbols, $libs, $thread); + } + } + } cleanup(); exit(0); @@ -1683,23 +1704,23 @@ sub PrintSource { HtmlPrintNumber($c2), UnparseAddress($offset, $e->[0]), CleanDisassembly($e->[3])); - + # Append the most specific source line associated with this instruction if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) }; $dis = HtmlEscape($dis); my $f = $e->[5]; my $l = $e->[6]; if ($f ne $last_dis_filename) { - $dis .= sprintf("%s:%d", + $dis .= sprintf("%s:%d", HtmlEscape(CleanFileName($f)), $l); } elsif ($l ne $last_dis_linenum) { # De-emphasize the unchanged file name portion $dis .= sprintf("%s" . - ":%d", + ":%d", HtmlEscape(CleanFileName($f)), $l); } else { # De-emphasize the entire location - $dis .= sprintf("%s:%d", + $dis .= sprintf("%s:%d", HtmlEscape(CleanFileName($f)), $l); } $last_dis_filename = $f; @@ -1788,8 +1809,8 @@ sub PrintSource { if (defined($dis) && $dis ne '') { $asm = "" . $dis . ""; } - my $source_class = (($n1 + $n2 > 0) - ? "livesrc" + my $source_class = (($n1 + $n2 > 0) + ? "livesrc" : (($asm ne "") ? "deadsrc" : "nop")); printf $output ( "%5d " . 
@@ -3689,6 +3710,7 @@ sub IsSymbolizedProfileFile { # $result->{version} Version number of profile file # $result->{period} Sampling period (in microseconds) # $result->{profile} Profile object +# $result->{threads} Map of thread IDs to profile objects # $result->{map} Memory map info from profile # $result->{pcs} Hash of all PC values seen, key is hex address sub ReadProfile { @@ -3737,6 +3759,9 @@ sub ReadProfile { } elsif ($header =~ m/^heap profile:/) { $main::profile_type = 'heap'; $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap/) { + $main::profile_type = 'heap'; + $result = ReadThreadedHeapProfile($prog, $fname, $header); } elsif ($header =~ m/^--- *$contention_marker/o) { $main::profile_type = 'contention'; $result = ReadSynchProfile($prog, *PROFILE); @@ -3879,11 +3904,7 @@ sub ReadCPUProfile { return $r; } -sub ReadHeapProfile { - my $prog = shift; - local *PROFILE = shift; - my $header = shift; - +sub HeapProfileIndex { my $index = 1; if ($main::opt_inuse_space) { $index = 1; @@ -3894,6 +3915,84 @@ sub ReadHeapProfile { } elsif ($main::opt_alloc_objects) { $index = 2; } + return $index; +} + +sub ReadMappedLibraries { + my $fh = shift; + my $map = ""; + # Read the /proc/self/maps data + while (<$fh>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + $map .= $_; + } + return $map; +} + +sub ReadMemoryMap { + my $fh = shift; + my $map = ""; + # Read /proc/self/maps data as formatted by DumpAddressMap() + my $buildvar = ""; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Parse "build=" specification if supplied + if (m/^\s*build=(.*)\n/) { + $buildvar = $1; + } + + # Expand "$build" variable if available + $_ =~ s/\$build\b/$buildvar/g; + + $map .= $_; + } + return $map; +} + +sub AdjustSamples { + my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_; + if ($sample_adjustment) { + if ($sampling_algorithm == 2) { + # Remote-heap version 2 + # The sampling frequency is the rate of a Poisson process. + # This means that the probability of sampling an allocation of + # size X with sampling rate Y is 1 - exp(-X/Y) + if ($n1 != 0) { + my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n1 *= $scale_factor; + $s1 *= $scale_factor; + } + if ($n2 != 0) { + my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n2 *= $scale_factor; + $s2 *= $scale_factor; + } + } else { + # Remote-heap version 1 + my $ratio; + $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + if ($ratio < 1) { + $n1 /= $ratio; + $s1 /= $ratio; + } + $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + if ($ratio < 1) { + $n2 /= $ratio; + $s2 /= $ratio; + } + } + } + return ($n1, $s1, $n2, $s2); +} + +sub ReadHeapProfile { + my $prog = shift; + local *PROFILE = shift; + my $header = shift; + + my $index = HeapProfileIndex(); # Find the type of this profile. 
The header line looks like: # heap profile: 1246: 8800744 [ 1246: 8800744] @ /266053 @@ -3983,29 +4082,12 @@ sub ReadHeapProfile { while () { s/\r//g; # turn windows-looking lines into unix-looking lines if (/^MAPPED_LIBRARIES:/) { - # Read the /proc/self/maps data - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - $map .= $_; - } + $map .= ReadMappedLibraries(*PROFILE); last; } if (/^--- Memory map:/) { - # Read /proc/self/maps data as formatted by DumpAddressMap() - my $buildvar = ""; - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - # Parse "build=" specification if supplied - if (m/^\s*build=(.*)\n/) { - $buildvar = $1; - } - - # Expand "$build" variable if available - $_ =~ s/\$build\b/$buildvar/g; - - $map .= $_; - } + $map .= ReadMemoryMap(*PROFILE); last; } @@ -4016,43 +4098,85 @@ sub ReadHeapProfile { if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) { my $stack = $5; my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); + my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, + $n1, $s1, $n2, $s2); + AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); + } + } - if ($sample_adjustment) { - if ($sampling_algorithm == 2) { - # Remote-heap version 2 - # The sampling frequency is the rate of a Poisson process. - # This means that the probability of sampling an allocation of - # size X with sampling rate Y is 1 - exp(-X/Y) - if ($n1 != 0) { - my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); - my $scale_factor = 1/(1 - exp(-$ratio)); - $n1 *= $scale_factor; - $s1 *= $scale_factor; - } - if ($n2 != 0) { - my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); - my $scale_factor = 1/(1 - exp(-$ratio)); - $n2 *= $scale_factor; - $s2 *= $scale_factor; - } - } else { - # Remote-heap version 1 - my $ratio; - $ratio = (($s1*1.0)/$n1)/($sample_adjustment); - if ($ratio < 1) { - $n1 /= $ratio; - $s1 /= $ratio; - } - $ratio = (($s2*1.0)/$n2)/($sample_adjustment); - if ($ratio < 1) { - $n2 /= $ratio; - $s2 /= $ratio; - } + my $r = {}; + $r->{version} = "heap"; + $r->{period} = 1; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +sub ReadThreadedHeapProfile { + my ($prog, $fname, $header) = @_; + + my $index = HeapProfileIndex(); + my $sampling_algorithm = 0; + my $sample_adjustment = 0; + chomp($header); + my $type = "unknown"; + # Assuming a very specific type of header for now. + if ($header =~ m"^heap_v2/(\d+)") { + $type = "_v2"; + $sampling_algorithm = 2; + $sample_adjustment = int($1); + } + if ($type ne "_v2" || !defined($sample_adjustment)) { + die "Threaded heap profiles require v2 sampling with a sample rate\n"; + } + + my $profile = {}; + my $thread_profiles = {}; + my $pcs = {}; + my $map = ""; + my $stack = ""; + + while () { + s/\r//g; + if (/^MAPPED_LIBRARIES:/) { + $map .= ReadMappedLibraries(*PROFILE); + last; + } + + if (/^--- Memory map:/) { + $map .= ReadMemoryMap(*PROFILE); + last; + } + + # Read entry of the form: + # @ a1 a2 ... an + # t*: : [: ] + # t1: : [: ] + # ... + # tn: : [: ] + s/^\s*//; + s/\s*$//; + if (m/^@\s+(.*)$/) { + $stack = $1; + } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) { + if ($stack eq "") { + # Still in the header, so this is just a per-thread summary. 
+ next; + } + my $thread = $2; + my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6); + my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, + $n1, $s2, $n2, $s2); + if ($thread eq "*") { + AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); + } else { + if (!exists($thread_profiles->{$thread})) { + $thread_profiles->{$thread} = {}; } + AddEntries($thread_profiles->{$thread}, $pcs, + FixCallerAddresses($stack), $counts[$index]); } - - my @counts = ($n1, $s1, $n2, $s2); - AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); } } @@ -4060,6 +4184,7 @@ sub ReadHeapProfile { $r->{version} = "heap"; $r->{period} = 1; $r->{profile} = $profile; + $r->{threads} = $thread_profiles; $r->{libs} = ParseLibraries($prog, $map, $pcs); $r->{pcs} = $pcs; return $r; @@ -4756,7 +4881,7 @@ sub MapToSymbols { } } } - + # Prepend to accumulated symbols for pcstr # (so that caller comes before callee) my $sym = $symbols->{$pcstr}; @@ -4950,7 +5075,7 @@ sub ConfigureTool { my $dirname = $`; # this is everything up to and including the last slash if (-x "$dirname$tool") { $path = "$dirname$tool"; - } else { + } else { $path = $tool; } } -- cgit v0.12 From a2260c95cd717c06c28b61d40b2157254d594219 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 9 Sep 2014 10:29:26 -0700 Subject: Fix sdallocx() assertion. Refactor sdallocx() and nallocx() to share inallocx(), and fix an sdallocx() assertion to check usize rather than size. --- src/jemalloc.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 527782e..3f29a85 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1838,19 +1838,29 @@ je_dallocx(void *ptr, int flags) ifree(ptr, try_tcache); } +JEMALLOC_ALWAYS_INLINE_C size_t +inallocx(size_t size, int flags) +{ + size_t usize; + + if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) + usize = s2u(size); + else + usize = sa2u(size, MALLOCX_ALIGN_GET_SPECIFIED(flags)); + assert(usize != 0); + return (usize); +} + void je_sdallocx(void *ptr, size_t size, int flags) { bool try_tcache; + size_t usize; assert(ptr != NULL); assert(malloc_initialized || IS_INITIALIZER); - assert(size == isalloc(ptr, config_prof)); - - if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) - size = s2u(size); - else - size = sa2u(size, MALLOCX_ALIGN_GET_SPECIFIED(flags)); + usize = inallocx(size, flags); + assert(usize == isalloc(ptr, config_prof)); if ((flags & MALLOCX_ARENA_MASK) != 0) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); @@ -1861,27 +1871,19 @@ je_sdallocx(void *ptr, size_t size, int flags) try_tcache = true; UTRACE(ptr, 0, 0); - isfree(ptr, size, try_tcache); + isfree(ptr, usize, try_tcache); } size_t je_nallocx(size_t size, int flags) { - size_t usize; assert(size != 0); if (malloc_init()) return (0); - if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) - usize = s2u(size); - else { - size_t alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags); - usize = sa2u(size, alignment); - } - assert(usize != 0); - return (usize); + return (inallocx(size, flags)); } int -- cgit v0.12 From 7c17e1670d7294db4b3c483ad7173dd056b42268 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 9 Sep 2014 15:27:52 -0700 Subject: Fix threaded heap profile bug in pprof. Fix ReadThreadedHeapProfile to pass the correct parameters to AdjustSamples. 
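For reference, the arithmetic that AdjustSamples() applies to v2 ("remote-heap version 2") profiles, restated as a small C sketch; the Perl in the patch above is authoritative, and this only transcribes the unsampling step so the effect of the wrong argument is easier to see:

    #include <math.h>

    /*
     * Scale a sampled (count, bytes) pair up to its estimated true value.
     * With a Poisson sampler of rate `rate`, an allocation of average size
     * bytes/count is sampled with probability 1 - exp(-(bytes/count)/rate).
     */
    static void
    unsample_v2(double *count, double *bytes, double rate)
    {
        double ratio, scale;

        if (*count == 0)
            return;
        ratio = (*bytes / *count) / rate;
        scale = 1.0 / (1.0 - exp(-ratio));
        *count *= scale;
        *bytes *= scale;
    }

Because the scale factor depends on the byte count, passing the allocated-bytes column ($s2) where the in-use-bytes column ($s1) belongs, as ReadThreadedHeapProfile did, skews the in-use numbers; the fix below simply restores $s1.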
--- bin/pprof | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/pprof b/bin/pprof index 52da600..87313f4 100755 --- a/bin/pprof +++ b/bin/pprof @@ -4167,7 +4167,7 @@ sub ReadThreadedHeapProfile { my $thread = $2; my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6); my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, - $n1, $s2, $n2, $s2); + $n1, $s1, $n2, $s2); if ($thread eq "*") { AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); } else { -- cgit v0.12 From 6fd53da030b5e9161a49d6010a8b38499ca2a124 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 9 Sep 2014 12:45:53 -0700 Subject: Fix prof_tdata_get()-related regressions. Fix prof_tdata_get() to avoid dereferencing an invalid tdata pointer (when it's PROF_TDATA_STATE_{REINCARNATED,PURGATORY}). Fix prof_tdata_get() callers to check for invalid results besides NULL (PROF_TDATA_STATE_{REINCARNATED,PURGATORY}). These regressions were caused by 602c8e0971160e4b85b08b16cf8a2375aa24bc04 (Implement per thread heap profiling.), which did not make it into any releases prior to these fixes. --- include/jemalloc/internal/prof.h | 11 +++++----- src/prof.c | 45 ++++++++++++++++++---------------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 104bfad..a990328 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -308,12 +308,13 @@ prof_tdata_get(bool create) tdata = *prof_tdata_tsd_get(); if (create) { - if (tdata == NULL) - tdata = prof_tdata_init(); - else if (tdata->state == prof_tdata_state_expired) + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { + if (tdata == NULL) + tdata = prof_tdata_init(); + } else if (tdata->state == prof_tdata_state_expired) tdata = prof_tdata_reinit(tdata); - assert(tdata == NULL || tdata->state == - prof_tdata_state_attached); + assert((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX || + tdata->state == prof_tdata_state_attached); } return (tdata); diff --git a/src/prof.c b/src/prof.c index 044acd8..941e53b 100644 --- a/src/prof.c +++ b/src/prof.c @@ -487,9 +487,8 @@ prof_gctx_create(prof_bt_t *bt) } static void -prof_gctx_maybe_destroy(prof_gctx_t *gctx) +prof_gctx_maybe_destroy(prof_gctx_t *gctx, prof_tdata_t *tdata) { - prof_tdata_t *tdata; cassert(config_prof); @@ -500,8 +499,6 @@ prof_gctx_maybe_destroy(prof_gctx_t *gctx) * avoid a race between the main body of prof_tctx_destroy() and entry * into this function. 
*/ - tdata = prof_tdata_get(false); - assert((uintptr_t)tdata > (uintptr_t)PROF_TDATA_STATE_MAX); prof_enter(tdata); malloc_mutex_lock(gctx->lock); if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { @@ -552,8 +549,9 @@ prof_gctx_should_destroy(prof_gctx_t *gctx) static void prof_tctx_destroy(prof_tctx_t *tctx) { + prof_tdata_t *tdata = tctx->tdata; prof_gctx_t *gctx = tctx->gctx; - bool destroy_gctx; + bool destroy_tdata, destroy_gctx; assert(tctx->cnts.curobjs == 0); assert(tctx->cnts.curbytes == 0); @@ -561,16 +559,9 @@ prof_tctx_destroy(prof_tctx_t *tctx) assert(tctx->cnts.accumobjs == 0); assert(tctx->cnts.accumbytes == 0); - { - prof_tdata_t *tdata = tctx->tdata; - bool tdata_destroy; - - ckh_remove(&tdata->bt2tctx, &gctx->bt, NULL, NULL); - tdata_destroy = prof_tdata_should_destroy(tdata); - malloc_mutex_unlock(tdata->lock); - if (tdata_destroy) - prof_tdata_destroy(tdata); - } + ckh_remove(&tdata->bt2tctx, &gctx->bt, NULL, NULL); + destroy_tdata = prof_tdata_should_destroy(tdata); + malloc_mutex_unlock(tdata->lock); malloc_mutex_lock(gctx->lock); tctx_tree_remove(&gctx->tctxs, tctx); @@ -594,7 +585,10 @@ prof_tctx_destroy(prof_tctx_t *tctx) destroy_gctx = false; malloc_mutex_unlock(gctx->lock); if (destroy_gctx) - prof_gctx_maybe_destroy(gctx); + prof_gctx_maybe_destroy(gctx, tdata); + + if (destroy_tdata) + prof_tdata_destroy(tdata); idalloc(tctx); } @@ -683,7 +677,7 @@ prof_lookup(prof_bt_t *bt) ret.v = imalloc(sizeof(prof_tctx_t)); if (ret.p == NULL) { if (new_gctx) - prof_gctx_maybe_destroy(gctx); + prof_gctx_maybe_destroy(gctx, tdata); return (NULL); } ret.p->tdata = tdata; @@ -695,7 +689,7 @@ prof_lookup(prof_bt_t *bt) malloc_mutex_unlock(tdata->lock); if (error) { if (new_gctx) - prof_gctx_maybe_destroy(gctx); + prof_gctx_maybe_destroy(gctx, tdata); idalloc(ret.v); return (NULL); } @@ -1019,6 +1013,7 @@ prof_gctx_merge_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) static prof_gctx_t * prof_gctx_finish_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) { + prof_tdata_t *tdata = (prof_tdata_t *)arg; prof_tctx_t *next; bool destroy_gctx; @@ -1032,7 +1027,7 @@ prof_gctx_finish_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) destroy_gctx = prof_gctx_should_destroy(gctx); malloc_mutex_unlock(gctx->lock); if (destroy_gctx) - prof_gctx_maybe_destroy(gctx); + prof_gctx_maybe_destroy(gctx, tdata); return (NULL); } @@ -1310,7 +1305,7 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) if (prof_dump_close(propagate_err)) goto label_open_close_error; - gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, NULL); + gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, tdata); malloc_mutex_unlock(&prof_dump_mtx); if (leakcheck) @@ -1320,7 +1315,7 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) label_write_error: prof_dump_close(propagate_err); label_open_close_error: - gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, NULL); + gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, tdata); malloc_mutex_unlock(&prof_dump_mtx); return (true); } @@ -1643,7 +1638,7 @@ const char * prof_thread_name_get(void) { prof_tdata_t *tdata = prof_tdata_get(true); - if (tdata == NULL) + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (NULL); return (tdata->thread_name); } @@ -1656,7 +1651,7 @@ prof_thread_name_set(const char *thread_name) char *s; tdata = prof_tdata_get(true); - if (tdata == NULL) + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (true); size = strlen(thread_name) + 1; @@ -1675,7 
+1670,7 @@ bool prof_thread_active_get(void) { prof_tdata_t *tdata = prof_tdata_get(true); - if (tdata == NULL) + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (false); return (tdata->active); } @@ -1686,7 +1681,7 @@ prof_thread_active_set(bool active) prof_tdata_t *tdata; tdata = prof_tdata_get(true); - if (tdata == NULL) + if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (true); tdata->active = active; return (false); -- cgit v0.12 From 6e73dc194ee9682d3eacaf725a989f04629718f7 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 9 Sep 2014 19:37:26 -0700 Subject: Fix a profile sampling race. Fix a profile sampling race that was due to preparing to sample, yet doing nothing to assure that the context remains valid until the stats are updated. These regressions were caused by 602c8e0971160e4b85b08b16cf8a2375aa24bc04 (Implement per thread heap profiling.), which did not make it into any releases prior to these fixes. --- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/internal/prof.h | 37 ++++----- src/jemalloc.c | 109 +++++++++++++------------- src/prof.c | 35 +++++++++ 4 files changed, 109 insertions(+), 73 deletions(-) diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 3b990b0..b899017 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -292,6 +292,7 @@ p2rz pages_purge pow2_ceil prof_alloc_prep +prof_alloc_rollback prof_backtrace prof_boot0 prof_boot1 diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index a990328..920ec63 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -97,6 +97,12 @@ struct prof_tctx_s { /* Linkage into gctx's tctxs. */ rb_node(prof_tctx_t) tctx_link; + /* + * True during prof_alloc_prep()..prof_malloc_sample_object(), prevents + * sample vs destroy race. + */ + bool prepared; + /* Current dump-related state, protected by gctx->lock. 
*/ prof_tctx_state_t state; @@ -242,6 +248,7 @@ extern uint64_t prof_interval; */ extern size_t lg_prof_sample; +void prof_alloc_rollback(prof_tctx_t *tctx, bool updated); void prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx); void prof_free_sampled_object(size_t usize, prof_tctx_t *tctx); @@ -282,14 +289,14 @@ malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) prof_tdata_t *prof_tdata_get(bool create); bool prof_sample_accum_update(size_t usize, bool commit, prof_tdata_t **tdata_out); -prof_tctx_t *prof_alloc_prep(size_t usize); +prof_tctx_t *prof_alloc_prep(size_t usize, bool update); prof_tctx_t *prof_tctx_get(const void *ptr); void prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx); void prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx); void prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, - size_t old_usize, prof_tctx_t *old_tctx); + bool updated, size_t old_usize, prof_tctx_t *old_tctx); void prof_free(const void *ptr, size_t usize); #endif @@ -356,7 +363,7 @@ prof_tctx_set(const void *ptr, prof_tctx_t *tctx) } JEMALLOC_INLINE bool -prof_sample_accum_update(size_t usize, bool commit, prof_tdata_t **tdata_out) +prof_sample_accum_update(size_t usize, bool update, prof_tdata_t **tdata_out) { prof_tdata_t *tdata; @@ -373,19 +380,19 @@ prof_sample_accum_update(size_t usize, bool commit, prof_tdata_t **tdata_out) return (true); if (tdata->bytes_until_sample >= usize) { - if (commit) + if (update) tdata->bytes_until_sample -= usize; return (true); } else { /* Compute new sample threshold. */ - if (commit) + if (update) prof_sample_threshold_update(tdata); return (tdata->active == false); } } JEMALLOC_INLINE prof_tctx_t * -prof_alloc_prep(size_t usize) +prof_alloc_prep(size_t usize, bool update) { prof_tctx_t *ret; prof_tdata_t *tdata; @@ -393,7 +400,7 @@ prof_alloc_prep(size_t usize) assert(usize == s2u(usize)); - if (!opt_prof_active || prof_sample_accum_update(usize, false, &tdata)) + if (!opt_prof_active || prof_sample_accum_update(usize, update, &tdata)) ret = (prof_tctx_t *)(uintptr_t)1U; else { bt_init(&bt, tdata->vec); @@ -412,16 +419,6 @@ prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) assert(ptr != NULL); assert(usize == isalloc(ptr, true)); - if (prof_sample_accum_update(usize, true, NULL)) { - /* - * Don't sample. For malloc()-like allocation, it is always - * possible to tell in advance how large an object's usable size - * will be, so there should never be a difference between the - * usize passed to PROF_ALLOC_PREP() and prof_malloc(). 
- */ - assert((uintptr_t)tctx == (uintptr_t)1U); - } - if ((uintptr_t)tctx > (uintptr_t)1U) prof_malloc_sample_object(ptr, usize, tctx); else @@ -429,14 +426,14 @@ prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) } JEMALLOC_INLINE void -prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, size_t old_usize, - prof_tctx_t *old_tctx) +prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, bool updated, + size_t old_usize, prof_tctx_t *old_tctx) { cassert(config_prof); assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U); - if (ptr != NULL) { + if (!updated && ptr != NULL) { assert(usize == isalloc(ptr, true)); if (prof_sample_accum_update(usize, true, NULL)) { /* diff --git a/src/jemalloc.c b/src/jemalloc.c index 3f29a85..1d4d1a8 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -886,13 +886,15 @@ imalloc_prof(size_t usize) void *p; prof_tctx_t *tctx; - tctx = prof_alloc_prep(usize); + tctx = prof_alloc_prep(usize, true); if ((uintptr_t)tctx != (uintptr_t)1U) p = imalloc_prof_sample(usize, tctx); else p = imalloc(usize); - if (p == NULL) + if (p == NULL) { + prof_alloc_rollback(tctx, true); return (NULL); + } prof_malloc(p, usize, tctx); return (p); @@ -962,16 +964,20 @@ imemalign_prof_sample(size_t alignment, size_t usize, prof_tctx_t *tctx) } JEMALLOC_ALWAYS_INLINE_C void * -imemalign_prof(size_t alignment, size_t usize, prof_tctx_t *tctx) +imemalign_prof(size_t alignment, size_t usize) { void *p; + prof_tctx_t *tctx; + tctx = prof_alloc_prep(usize, true); if ((uintptr_t)tctx != (uintptr_t)1U) p = imemalign_prof_sample(alignment, usize, tctx); else p = ipalloc(usize, alignment, false); - if (p == NULL) + if (p == NULL) { + prof_alloc_rollback(tctx, true); return (NULL); + } prof_malloc(p, usize, tctx); return (p); @@ -1013,12 +1019,9 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) goto label_oom; } - if (config_prof && opt_prof) { - prof_tctx_t *tctx; - - tctx = prof_alloc_prep(usize); - result = imemalign_prof(alignment, usize, tctx); - } else + if (config_prof && opt_prof) + result = imemalign_prof(alignment, usize); + else result = ipalloc(usize, alignment, false); if (result == NULL) goto label_oom; @@ -1087,16 +1090,20 @@ icalloc_prof_sample(size_t usize, prof_tctx_t *tctx) } JEMALLOC_ALWAYS_INLINE_C void * -icalloc_prof(size_t usize, prof_tctx_t *tctx) +icalloc_prof(size_t usize) { void *p; + prof_tctx_t *tctx; + tctx = prof_alloc_prep(usize, true); if ((uintptr_t)tctx != (uintptr_t)1U) p = icalloc_prof_sample(usize, tctx); else p = icalloc(usize); - if (p == NULL) + if (p == NULL) { + prof_alloc_rollback(tctx, true); return (NULL); + } prof_malloc(p, usize, tctx); return (p); @@ -1136,11 +1143,8 @@ je_calloc(size_t num, size_t size) } if (config_prof && opt_prof) { - prof_tctx_t *tctx; - usize = s2u(num_size); - tctx = prof_alloc_prep(usize); - ret = icalloc_prof(usize, tctx); + ret = icalloc_prof(usize); } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(num_size); @@ -1184,19 +1188,20 @@ irealloc_prof_sample(void *oldptr, size_t usize, prof_tctx_t *tctx) } JEMALLOC_ALWAYS_INLINE_C void * -irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_tctx_t *tctx) +irealloc_prof(void *oldptr, size_t old_usize, size_t usize) { void *p; - prof_tctx_t *old_tctx; + prof_tctx_t *old_tctx, *tctx; old_tctx = prof_tctx_get(oldptr); + tctx = prof_alloc_prep(usize, true); if ((uintptr_t)tctx != (uintptr_t)1U) p = irealloc_prof_sample(oldptr, usize, tctx); else p = iralloc(oldptr, usize, 0, false); 
if (p == NULL) return (NULL); - prof_realloc(p, usize, tctx, old_usize, old_tctx); + prof_realloc(p, usize, tctx, true, old_usize, old_tctx); return (p); } @@ -1270,11 +1275,8 @@ je_realloc(void *ptr, size_t size) old_rzsize = config_prof ? p2rz(ptr) : u2rz(old_usize); if (config_prof && opt_prof) { - prof_tctx_t *tctx; - usize = s2u(size); - tctx = prof_alloc_prep(usize); - ret = irealloc_prof(ptr, old_usize, usize, tctx); + ret = irealloc_prof(ptr, old_usize, usize); } else { if (config_stats || (config_valgrind && in_valgrind)) usize = s2u(size); @@ -1477,7 +1479,7 @@ imallocx_prof(size_t size, int flags, size_t *usize) imallocx_flags_decode(size, flags, usize, &alignment, &zero, &try_tcache, &arena); - tctx = prof_alloc_prep(*usize); + tctx = prof_alloc_prep(*usize, true); if ((uintptr_t)tctx == (uintptr_t)1U) { p = imallocx_maybe_flags(size, flags, *usize, alignment, zero, try_tcache, arena); @@ -1486,8 +1488,10 @@ imallocx_prof(size_t size, int flags, size_t *usize) try_tcache, arena); } else p = NULL; - if (p == NULL) + if (p == NULL) { + prof_alloc_rollback(tctx, true); return (NULL); + } prof_malloc(p, *usize, tctx); return (p); @@ -1572,21 +1576,24 @@ irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, JEMALLOC_ALWAYS_INLINE_C void * irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, size_t *usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena, prof_tctx_t *tctx) + arena_t *arena) { void *p; - prof_tctx_t *old_tctx; + prof_tctx_t *old_tctx, *tctx; old_tctx = prof_tctx_get(oldptr); - if ((uintptr_t)tctx != (uintptr_t)1U) + tctx = prof_alloc_prep(*usize, true); + if ((uintptr_t)tctx != (uintptr_t)1U) { p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, try_tcache_alloc, try_tcache_dalloc, arena, tctx); - else { + } else { p = iralloct(oldptr, size, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); } - if (p == NULL) + if (p == NULL) { + prof_alloc_rollback(tctx, true); return (NULL); + } if (p == oldptr && alignment != 0) { /* @@ -1599,7 +1606,7 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, */ *usize = isalloc(p, config_prof); } - prof_realloc(p, *usize, tctx, old_usize, old_tctx); + prof_realloc(p, *usize, tctx, true, old_usize, old_tctx); return (p); } @@ -1641,13 +1648,10 @@ je_rallocx(void *ptr, size_t size, int flags) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { - prof_tctx_t *tctx; - usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment); assert(usize != 0); - tctx = prof_alloc_prep(usize); p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero, - try_tcache_alloc, try_tcache_dalloc, arena, tctx); + try_tcache_alloc, try_tcache_dalloc, arena); if (p == NULL) goto label_oom; } else { @@ -1720,13 +1724,21 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, JEMALLOC_ALWAYS_INLINE_C size_t ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, - size_t alignment, size_t max_usize, bool zero, arena_t *arena, - prof_tctx_t *tctx) + size_t alignment, bool zero, arena_t *arena) { - size_t usize; - prof_tctx_t *old_tctx; + size_t max_usize, usize; + prof_tctx_t *old_tctx, *tctx; old_tctx = prof_tctx_get(ptr); + /* + * usize isn't knowable before ixalloc() returns when extra is non-zero. + * Therefore, compute its maximum possible value and use that in + * prof_alloc_prep() to decide whether to capture a backtrace. 
+ * prof_realloc() will use the actual usize to decide whether to sample. + */ + max_usize = (alignment == 0) ? s2u(size+extra) : sa2u(size+extra, + alignment); + tctx = prof_alloc_prep(max_usize, false); if ((uintptr_t)tctx != (uintptr_t)1U) { usize = ixallocx_prof_sample(ptr, old_usize, size, extra, alignment, zero, max_usize, arena, tctx); @@ -1734,9 +1746,11 @@ ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); } - if (usize == old_usize) + if (usize == old_usize) { + prof_alloc_rollback(tctx, false); return (usize); - prof_realloc(ptr, usize, tctx, old_usize, old_tctx); + } + prof_realloc(ptr, usize, tctx, false, old_usize, old_tctx); return (usize); } @@ -1767,19 +1781,8 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { - prof_tctx_t *tctx; - /* - * usize isn't knowable before ixalloc() returns when extra is - * non-zero. Therefore, compute its maximum possible value and - * use that in prof_alloc_prep() to decide whether to capture a - * backtrace. prof_realloc() will use the actual usize to - * decide whether to sample. - */ - size_t max_usize = (alignment == 0) ? s2u(size+extra) : - sa2u(size+extra, alignment); - tctx = prof_alloc_prep(max_usize); usize = ixallocx_prof(ptr, old_usize, size, extra, alignment, - max_usize, zero, arena, tctx); + zero, arena); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); diff --git a/src/prof.c b/src/prof.c index 941e53b..9495afc 100644 --- a/src/prof.c +++ b/src/prof.c @@ -150,6 +150,35 @@ rb_gen(static UNUSED, tdata_tree_, prof_tdata_tree_t, prof_tdata_t, tdata_link, /******************************************************************************/ void +prof_alloc_rollback(prof_tctx_t *tctx, bool updated) +{ + prof_tdata_t *tdata; + + cassert(config_prof); + + if (updated) { + /* + * Compute a new sample threshold. This isn't very important in + * practice, because this function is rarely executed, so the + * potential for sample bias is minimal except in contrived + * programs. + */ + tdata = prof_tdata_get(true); + if ((uintptr_t)tdata > (uintptr_t)PROF_TDATA_STATE_MAX) + prof_sample_threshold_update(tctx->tdata); + } + + if ((uintptr_t)tctx > (uintptr_t)1U) { + malloc_mutex_lock(tctx->tdata->lock); + tctx->prepared = false; + if (prof_tctx_should_destroy(tctx)) + prof_tctx_destroy(tctx); + else + malloc_mutex_unlock(tctx->tdata->lock); + } +} + +void prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx) { prof_tctx_set(ptr, tctx); @@ -160,6 +189,7 @@ prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx) { tctx->cnts.accumobjs++; tctx->cnts.accumbytes += usize; } + tctx->prepared = false; malloc_mutex_unlock(tctx->tdata->lock); } @@ -529,6 +559,8 @@ prof_tctx_should_destroy(prof_tctx_t *tctx) return (false); if (tctx->cnts.curobjs != 0) return (false); + if (tctx->prepared) + return (false); return (true); } @@ -659,6 +691,8 @@ prof_lookup(prof_bt_t *bt) malloc_mutex_lock(tdata->lock); not_found = ckh_search(&tdata->bt2tctx, bt, NULL, &ret.v); + if (!not_found) /* Note double negative! 
*/ + ret.p->prepared = true; malloc_mutex_unlock(tdata->lock); if (not_found) { void *btkey; @@ -683,6 +717,7 @@ prof_lookup(prof_bt_t *bt) ret.p->tdata = tdata; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); ret.p->gctx = gctx; + ret.p->prepared = true; ret.p->state = prof_tctx_state_nominal; malloc_mutex_lock(tdata->lock); error = ckh_insert(&tdata->bt2tctx, btkey, ret.v); -- cgit v0.12 From 61beeb9f69f2f1fd5669b2411245cc7197b5d66a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 10 Sep 2014 08:49:29 -0700 Subject: Add sdallocx() to list of functions to prune in pprof. --- bin/pprof | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/pprof b/bin/pprof index 87313f4..5a4c6cd 100755 --- a/bin/pprof +++ b/bin/pprof @@ -2840,6 +2840,7 @@ sub RemoveUninterestingFrames { 'rallocx', # jemalloc 'xallocx', # jemalloc 'dallocx', # jemalloc + 'sdallocx', # jemalloc 'tc_calloc', 'tc_cfree', 'tc_malloc', -- cgit v0.12 From 6b5609d23bf49423fdc6506281e0deac7c3a524e Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 8 Sep 2014 22:18:49 -0400 Subject: add likely / unlikely macros --- include/jemalloc/internal/util.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index d2b7a96..82a453d 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -27,6 +27,14 @@ # define JEMALLOC_CC_SILENCE_INIT(v) #endif +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) !!(x) +#define unlikely(x) !!(x) +#endif + /* * Define a custom assert() in order to reduce the chances of deadlock during * assertion failure. -- cgit v0.12 From 23fdf8b359a690f457c5300338f4994d06402b95 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Tue, 9 Sep 2014 15:26:05 -0400 Subject: mark some conditions as unlikely * assertion failure * malloc_init failure * malloc not already initialized (in malloc_init) * running in valgrind * thread cache disabled at runtime Clang and GCC already consider a comparison with NULL or -1 to be cold, so many branches (out-of-memory) are already correctly considered as cold and marking them is not important. 
--- include/jemalloc/internal/tcache.h | 2 +- include/jemalloc/internal/util.h | 6 +++--- include/jemalloc/internal/valgrind.h | 12 +++++------ src/jemalloc.c | 42 ++++++++++++++++++------------------ 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index c0d48b9..292ce46 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -218,7 +218,7 @@ tcache_get(bool create) return (NULL); tcache = *tcache_tsd_get(); - if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) { + if (unlikely((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX)) { if (tcache == TCACHE_STATE_DISABLED) return (NULL); tcache = tcache_get_hard(tcache, create); diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index 82a453d..cc7806d 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -41,7 +41,7 @@ */ #ifndef assert #define assert(e) do { \ - if (config_debug && !(e)) { \ + if (unlikely(config_debug && !(e))) { \ malloc_printf( \ ": %s:%d: Failed assertion: \"%s\"\n", \ __FILE__, __LINE__, #e); \ @@ -73,14 +73,14 @@ #ifndef assert_not_implemented #define assert_not_implemented(e) do { \ - if (config_debug && !(e)) \ + if (unlikely(config_debug && !(e))) \ not_implemented(); \ } while (0) #endif /* Use to assert a particular configuration, e.g., cassert(config_debug). */ #define cassert(c) do { \ - if ((c) == false) \ + if (unlikely(!(c))) \ not_reached(); \ } while (0) diff --git a/include/jemalloc/internal/valgrind.h b/include/jemalloc/internal/valgrind.h index 52c93f2..a3380df 100644 --- a/include/jemalloc/internal/valgrind.h +++ b/include/jemalloc/internal/valgrind.h @@ -14,15 +14,15 @@ * usable space. */ #define JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(ptr, usize) do { \ - if (in_valgrind) \ + if (unlikely(in_valgrind)) \ valgrind_make_mem_noaccess(ptr, usize); \ } while (0) #define JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ptr, usize) do { \ - if (in_valgrind) \ + if (unlikely(in_valgrind)) \ valgrind_make_mem_undefined(ptr, usize); \ } while (0) #define JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ptr, usize) do { \ - if (in_valgrind) \ + if (unlikely(in_valgrind)) \ valgrind_make_mem_defined(ptr, usize); \ } while (0) /* @@ -31,13 +31,13 @@ * Valgrind reports errors, there are no extra stack frames in the backtraces. 
*/ #define JEMALLOC_VALGRIND_MALLOC(cond, ptr, usize, zero) do { \ - if (in_valgrind && cond) \ + if (unlikely(in_valgrind && cond)) \ VALGRIND_MALLOCLIKE_BLOCK(ptr, usize, p2rz(ptr), zero); \ } while (0) #define JEMALLOC_VALGRIND_REALLOC(maybe_moved, ptr, usize, \ ptr_maybe_null, old_ptr, old_usize, old_rzsize, old_ptr_maybe_null, \ zero) do { \ - if (in_valgrind) { \ + if (unlikely(in_valgrind)) { \ size_t rzsize = p2rz(ptr); \ \ if (!maybe_moved || ptr == old_ptr) { \ @@ -73,7 +73,7 @@ } \ } while (0) #define JEMALLOC_VALGRIND_FREE(ptr, rzsize) do { \ - if (in_valgrind) \ + if (unlikely(in_valgrind)) \ valgrind_freelike_block(ptr, rzsize); \ } while (0) #else diff --git a/src/jemalloc.c b/src/jemalloc.c index 1d4d1a8..9874361 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -291,7 +291,7 @@ JEMALLOC_ALWAYS_INLINE_C bool malloc_init(void) { - if (malloc_initialized == false && malloc_init_hard()) + if (unlikely(!malloc_initialized) && malloc_init_hard()) return (true); malloc_thread_init(); @@ -904,7 +904,7 @@ JEMALLOC_ALWAYS_INLINE_C void * imalloc_body(size_t size, size_t *usize) { - if (malloc_init()) + if (unlikely(malloc_init())) return (NULL); if (config_prof && opt_prof) { @@ -912,7 +912,7 @@ imalloc_body(size_t size, size_t *usize) return (imalloc_prof(*usize)); } - if (config_stats || (config_valgrind && in_valgrind)) + if (config_stats || (unlikely(config_valgrind && in_valgrind))) *usize = s2u(size); return (imalloc(size)); } @@ -993,7 +993,7 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) assert(min_alignment != 0); - if (malloc_init()) { + if (unlikely(malloc_init())) { result = NULL; goto label_oom; } else { @@ -1116,7 +1116,7 @@ je_calloc(size_t num, size_t size) size_t num_size; size_t usize JEMALLOC_CC_SILENCE_INIT(0); - if (malloc_init()) { + if (unlikely(malloc_init())) { num_size = 0; ret = NULL; goto label_return; @@ -1146,7 +1146,7 @@ je_calloc(size_t num, size_t size) usize = s2u(num_size); ret = icalloc_prof(usize); } else { - if (config_stats || (config_valgrind && in_valgrind)) + if (config_stats || unlikely(config_valgrind && in_valgrind)) usize = s2u(num_size); ret = icalloc(num_size); } @@ -1222,7 +1222,7 @@ ifree(void *ptr, bool try_tcache) usize = isalloc(ptr, config_prof); if (config_stats) thread_allocated_tsd_get()->deallocated += usize; - if (config_valgrind && in_valgrind) + if (unlikely(config_valgrind && in_valgrind)) rzsize = p2rz(ptr); iqalloc(ptr, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); @@ -1240,7 +1240,7 @@ isfree(void *ptr, size_t usize, bool try_tcache) prof_free(ptr, usize); if (config_stats) thread_allocated_tsd_get()->deallocated += usize; - if (config_valgrind && in_valgrind) + if (unlikely(config_valgrind && in_valgrind)) rzsize = p2rz(ptr); isqalloc(ptr, usize, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); @@ -1269,16 +1269,16 @@ je_realloc(void *ptr, size_t size) malloc_thread_init(); if ((config_prof && opt_prof) || config_stats || - (config_valgrind && in_valgrind)) + unlikely(config_valgrind && in_valgrind)) old_usize = isalloc(ptr, config_prof); - if (config_valgrind && in_valgrind) + if (unlikely(config_valgrind && in_valgrind)) old_rzsize = config_prof ? 
p2rz(ptr) : u2rz(old_usize); if (config_prof && opt_prof) { usize = s2u(size); ret = irealloc_prof(ptr, old_usize, usize); } else { - if (config_stats || (config_valgrind && in_valgrind)) + if (config_stats || unlikely(config_valgrind && in_valgrind)) usize = s2u(size); ret = iralloc(ptr, size, 0, false); } @@ -1506,7 +1506,7 @@ imallocx_no_prof(size_t size, int flags, size_t *usize) arena_t *arena; if (flags == 0) { - if (config_stats || (config_valgrind && in_valgrind)) + if (config_stats || unlikely(config_valgrind && in_valgrind)) *usize = s2u(size); return (imalloc(size)); } @@ -1524,7 +1524,7 @@ je_mallocx(size_t size, int flags) assert(size != 0); - if (malloc_init()) + if (unlikely(malloc_init())) goto label_oom; if (config_prof && opt_prof) @@ -1642,9 +1642,9 @@ je_rallocx(void *ptr, size_t size, int flags) } if ((config_prof && opt_prof) || config_stats || - (config_valgrind && in_valgrind)) + (unlikely(config_valgrind && in_valgrind))) old_usize = isalloc(ptr, config_prof); - if (config_valgrind && in_valgrind) + if (unlikely(config_valgrind && in_valgrind)) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { @@ -1777,7 +1777,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) arena = NULL; old_usize = isalloc(ptr, config_prof); - if (config_valgrind && in_valgrind) + if (unlikely(config_valgrind && in_valgrind)) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { @@ -1883,7 +1883,7 @@ je_nallocx(size_t size, int flags) assert(size != 0); - if (malloc_init()) + if (unlikely(malloc_init())) return (0); return (inallocx(size, flags)); @@ -1894,7 +1894,7 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { - if (malloc_init()) + if (unlikely(malloc_init())) return (EAGAIN); return (ctl_byname(name, oldp, oldlenp, newp, newlen)); @@ -1904,7 +1904,7 @@ int je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) { - if (malloc_init()) + if (unlikely(malloc_init())) return (EAGAIN); return (ctl_nametomib(name, mibp, miblenp)); @@ -1915,7 +1915,7 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { - if (malloc_init()) + if (unlikely(malloc_init())) return (EAGAIN); return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen)); @@ -2064,7 +2064,7 @@ static void * a0alloc(size_t size, bool zero) { - if (malloc_init()) + if (unlikely(malloc_init())) return (NULL); if (size == 0) -- cgit v0.12 From 91566fc079cfaeaf2b424b7f40d6b9d8669d0470 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 11 Sep 2014 13:15:33 -0700 Subject: Fix mallocx() to always honor MALLOCX_ARENA() when profiling. --- src/jemalloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 9874361..f6be751 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1435,7 +1435,6 @@ imallocx_flags(size_t usize, size_t alignment, bool zero, bool try_tcache, return (imalloct(usize, try_tcache, arena)); } - JEMALLOC_ALWAYS_INLINE_C void * imallocx_maybe_flags(size_t size, int flags, size_t usize, size_t alignment, bool zero, bool try_tcache, arena_t *arena) @@ -1455,7 +1454,7 @@ imallocx_prof_sample(size_t size, int flags, size_t usize, size_t alignment, if (usize <= SMALL_MAXCLASS) { assert(((alignment == 0) ? 
s2u(LARGE_MINCLASS) : sa2u(LARGE_MINCLASS, alignment)) == LARGE_MINCLASS); - p = imalloc(LARGE_MINCLASS); + p = imalloct(LARGE_MINCLASS, try_tcache, arena); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); -- cgit v0.12 From 9c640bfdd4e2f25180a32ed3704ce8e4c4cc21f1 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 11 Sep 2014 16:20:44 -0700 Subject: Apply likely()/unlikely() to allocation/deallocation fast paths. --- include/jemalloc/internal/arena.h | 52 +++++---- include/jemalloc/internal/jemalloc_internal.h.in | 4 +- include/jemalloc/internal/prof.h | 11 +- include/jemalloc/internal/tcache.h | 32 +++--- src/arena.c | 28 ++--- src/huge.c | 6 +- src/jemalloc.c | 130 ++++++++++++----------- src/quarantine.c | 4 +- 8 files changed, 138 insertions(+), 129 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 6ab0ae7..bfb0b3c 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -488,7 +488,8 @@ void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); -void arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache); +void arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, + bool try_tcache); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) @@ -539,7 +540,7 @@ small_size2bin(size_t size) { assert(size > 0); - if (size <= LOOKUP_MAXCLASS) + if (likely(size <= LOOKUP_MAXCLASS)) return (small_size2bin_lookup(size)); else return (small_size2bin_compute(size)); @@ -627,7 +628,7 @@ small_s2u(size_t size) { assert(size > 0); - if (size <= LOOKUP_MAXCLASS) + if (likely(size <= LOOKUP_MAXCLASS)) return (small_s2u_lookup(size)); else return (small_s2u_compute(size)); @@ -864,7 +865,7 @@ arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes) cassert(config_prof); - if (prof_interval == 0) + if (likely(prof_interval == 0)) return (false); return (arena_prof_accum_impl(arena, accumbytes)); } @@ -875,7 +876,7 @@ arena_prof_accum(arena_t *arena, uint64_t accumbytes) cassert(config_prof); - if (prof_interval == 0) + if (likely(prof_interval == 0)) return (false); { @@ -995,8 +996,8 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31) }; - if (interval <= ((sizeof(interval_invs) / sizeof(unsigned)) + - 2)) { + if (likely(interval <= ((sizeof(interval_invs) / + sizeof(unsigned)) + 2))) { regind = (diff * interval_invs[interval - 3]) >> SIZE_INV_SHIFT; } else @@ -1025,7 +1026,7 @@ arena_prof_tctx_get(const void *ptr) pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) + if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) ret = (prof_tctx_t *)(uintptr_t)1U; else ret = arena_miscelm_get(chunk, pageind)->prof_tctx; @@ -1047,7 +1048,7 @@ arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - if (arena_mapbits_large_get(chunk, pageind) != 0) + if (unlikely(arena_mapbits_large_get(chunk, pageind) != 0)) arena_miscelm_get(chunk, pageind)->prof_tctx = tctx; } @@ -1059,8 +1060,9 @@ arena_malloc(arena_t *arena, size_t size, bool zero, bool 
try_tcache) assert(size != 0); assert(size <= arena_maxclass); - if (size <= SMALL_MAXCLASS) { - if (try_tcache && (tcache = tcache_get(true)) != NULL) + if (likely(size <= SMALL_MAXCLASS)) { + if (likely(try_tcache) && likely((tcache = tcache_get(true)) != + NULL)) return (tcache_alloc_small(tcache, size, zero)); else { return (arena_malloc_small(choose_arena(arena), size, @@ -1071,8 +1073,8 @@ arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache) * Initialize tcache after checking size in order to avoid * infinite recursion during tcache initialization. */ - if (try_tcache && size <= tcache_maxclass && (tcache = - tcache_get(true)) != NULL) + if (try_tcache && size <= tcache_maxclass && likely((tcache = + tcache_get(true)) != NULL)) return (tcache_alloc_large(tcache, size, zero)); else { return (arena_malloc_large(choose_arena(arena), size, @@ -1096,8 +1098,8 @@ arena_salloc(const void *ptr, bool demote) pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; assert(arena_mapbits_allocated_get(chunk, pageind) != 0); binind = arena_mapbits_binind_get(chunk, pageind); - if (binind == BININD_INVALID || (config_prof && demote == false && - arena_mapbits_large_get(chunk, pageind) != 0)) { + if (unlikely(binind == BININD_INVALID || (config_prof && demote == false + && arena_mapbits_large_get(chunk, pageind) != 0))) { /* * Large allocation. In the common case (demote == true), and * as this is an inline function, most callers will only end up @@ -1137,10 +1139,12 @@ arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; mapbits = arena_mapbits_get(chunk, pageind); assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - if ((mapbits & CHUNK_MAP_LARGE) == 0) { + if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) { /* Small allocation. */ - if (try_tcache && (tcache = tcache_get(false)) != NULL) { - size_t binind = arena_ptr_small_binind_get(ptr, mapbits); + if (likely(try_tcache) && likely((tcache = tcache_get(false)) != + NULL)) { + size_t binind = arena_ptr_small_binind_get(ptr, + mapbits); tcache_dalloc_small(tcache, ptr, binind); } else arena_dalloc_small(chunk->arena, chunk, ptr, pageind); @@ -1149,8 +1153,8 @@ arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) assert(((uintptr_t)ptr & PAGE_MASK) == 0); - if (try_tcache && size <= tcache_maxclass && (tcache = - tcache_get(false)) != NULL) { + if (try_tcache && size <= tcache_maxclass && likely((tcache = + tcache_get(false)) != NULL)) { tcache_dalloc_large(tcache, ptr, size); } else arena_dalloc_large(chunk->arena, chunk, ptr); @@ -1165,13 +1169,15 @@ arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache) assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); - if (size < PAGE) { + if (likely(size <= SMALL_MAXCLASS)) { /* Small allocation. 
*/ - if (try_tcache && (tcache = tcache_get(false)) != NULL) { + if (likely(try_tcache) && likely((tcache = tcache_get(false)) != + NULL)) { size_t binind = small_size2bin(size); tcache_dalloc_small(tcache, ptr, binind); } else { - size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> + LG_PAGE; arena_dalloc_small(chunk->arena, chunk, ptr, pageind); } } else { diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 81d46fc..a380a41 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -814,7 +814,7 @@ JEMALLOC_ALWAYS_INLINE void iqalloc(void *ptr, bool try_tcache) { - if (config_fill && opt_quarantine) + if (config_fill && unlikely(opt_quarantine)) quarantine(ptr); else idalloct(ptr, try_tcache); @@ -824,7 +824,7 @@ JEMALLOC_ALWAYS_INLINE void isqalloc(void *ptr, size_t size, bool try_tcache) { - if (config_fill && opt_quarantine) + if (config_fill && unlikely(opt_quarantine)) quarantine(ptr); else isdalloct(ptr, size, try_tcache); diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 920ec63..a1e7ac5 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -400,7 +400,8 @@ prof_alloc_prep(size_t usize, bool update) assert(usize == s2u(usize)); - if (!opt_prof_active || prof_sample_accum_update(usize, update, &tdata)) + if (!opt_prof_active || likely(prof_sample_accum_update(usize, update, + &tdata))) ret = (prof_tctx_t *)(uintptr_t)1U; else { bt_init(&bt, tdata->vec); @@ -419,7 +420,7 @@ prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) assert(ptr != NULL); assert(usize == isalloc(ptr, true)); - if ((uintptr_t)tctx > (uintptr_t)1U) + if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) prof_malloc_sample_object(ptr, usize, tctx); else prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); @@ -447,9 +448,9 @@ prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, bool updated, } } - if ((uintptr_t)old_tctx > (uintptr_t)1U) + if (unlikely((uintptr_t)old_tctx > (uintptr_t)1U)) prof_free_sampled_object(old_usize, old_tctx); - if ((uintptr_t)tctx > (uintptr_t)1U) + if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) prof_malloc_sample_object(ptr, usize, tctx); else prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); @@ -463,7 +464,7 @@ prof_free(const void *ptr, size_t usize) cassert(config_prof); assert(usize == isalloc(ptr, true)); - if ((uintptr_t)tctx > (uintptr_t)1U) + if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) prof_free_sampled_object(usize, tctx); } #endif diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 292ce46..c9d723a 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -236,7 +236,7 @@ tcache_event(tcache_t *tcache) tcache->ev_cnt++; assert(tcache->ev_cnt <= TCACHE_GC_INCR); - if (tcache->ev_cnt == TCACHE_GC_INCR) + if (unlikely(tcache->ev_cnt == TCACHE_GC_INCR)) tcache_event_hard(tcache); } @@ -245,12 +245,12 @@ tcache_alloc_easy(tcache_bin_t *tbin) { void *ret; - if (tbin->ncached == 0) { + if (unlikely(tbin->ncached == 0)) { tbin->low_water = -1; return (NULL); } tbin->ncached--; - if ((int)tbin->ncached < tbin->low_water) + if (unlikely((int)tbin->ncached < tbin->low_water)) tbin->low_water = tbin->ncached; ret = tbin->avail[tbin->ncached]; return (ret); @@ -268,23 +268,23 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) 
tbin = &tcache->tbins[binind]; size = small_bin2size(binind); ret = tcache_alloc_easy(tbin); - if (ret == NULL) { + if (unlikely(ret == NULL)) { ret = tcache_alloc_small_hard(tcache, tbin, binind); if (ret == NULL) return (NULL); } assert(tcache_salloc(ret) == size); - if (zero == false) { + if (likely(zero == false)) { if (config_fill) { - if (opt_junk) { + if (unlikely(opt_junk)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], false); - } else if (opt_zero) + } else if (unlikely(opt_zero)) memset(ret, 0, size); } } else { - if (config_fill && opt_junk) { + if (config_fill && unlikely(opt_junk)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } @@ -312,7 +312,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) assert(binind < nhbins); tbin = &tcache->tbins[binind]; ret = tcache_alloc_easy(tbin); - if (ret == NULL) { + if (unlikely(ret == NULL)) { /* * Only allocate one large object at a time, because it's quite * expensive to create one and not use it. @@ -329,11 +329,11 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) arena_mapbits_large_binind_set(chunk, pageind, BININD_INVALID); } - if (zero == false) { + if (likely(zero == false)) { if (config_fill) { - if (opt_junk) + if (unlikely(opt_junk)) memset(ret, 0xa5, size); - else if (opt_zero) + else if (unlikely(opt_zero)) memset(ret, 0, size); } } else @@ -357,12 +357,12 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind) assert(tcache_salloc(ptr) <= SMALL_MAXCLASS); - if (config_fill && opt_junk) + if (config_fill && unlikely(opt_junk)) arena_dalloc_junk_small(ptr, &arena_bin_info[binind]); tbin = &tcache->tbins[binind]; tbin_info = &tcache_bin_info[binind]; - if (tbin->ncached == tbin_info->ncached_max) { + if (unlikely(tbin->ncached == tbin_info->ncached_max)) { tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >> 1), tcache); } @@ -386,12 +386,12 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) binind = NBINS + (size >> LG_PAGE) - 1; - if (config_fill && opt_junk) + if (config_fill && unlikely(opt_junk)) memset(ptr, 0x5a, size); tbin = &tcache->tbins[binind]; tbin_info = &tcache_bin_info[binind]; - if (tbin->ncached == tbin_info->ncached_max) { + if (unlikely(tbin->ncached == tbin_info->ncached_max)) { tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >> 1), tcache); } diff --git a/src/arena.c b/src/arena.c index 8d34cf6..35d792a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1365,7 +1365,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind, ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) break; - if (config_fill && opt_junk) { + if (config_fill && unlikely(opt_junk)) { arena_alloc_junk_small(ptr, &arena_bin_info[binind], true); } @@ -1519,15 +1519,15 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) if (zero == false) { if (config_fill) { - if (opt_junk) { + if (unlikely(opt_junk)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], false); - } else if (opt_zero) + } else if (unlikely(opt_zero)) memset(ret, 0, size); } JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); } else { - if (config_fill && opt_junk) { + if (config_fill && unlikely(opt_junk)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } @@ -1568,9 +1568,9 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) if (zero == false) { if (config_fill) { - if (opt_junk) + if (unlikely(opt_junk)) memset(ret, 0xa5, size); - else if (opt_zero) + else if (unlikely(opt_zero)) memset(ret, 0, size); } } @@ -1626,9 
+1626,9 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) malloc_mutex_unlock(&arena->lock); if (config_fill && zero == false) { - if (opt_junk) + if (unlikely(opt_junk)) memset(ret, 0xa5, size); - else if (opt_zero) + else if (unlikely(opt_zero)) memset(ret, 0, size); } return (ret); @@ -1771,7 +1771,7 @@ arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, if (config_fill || config_stats) size = bin_info->reg_size; - if (config_fill && opt_junk) + if (config_fill && unlikely(opt_junk)) arena_dalloc_junk_small(ptr, bin_info); arena_run_reg_dalloc(run, ptr); @@ -1825,7 +1825,7 @@ static void arena_dalloc_junk_large(void *ptr, size_t usize) { - if (config_fill && opt_junk) + if (config_fill && unlikely(opt_junk)) memset(ptr, 0x5a, usize); } #ifdef JEMALLOC_JET @@ -1967,7 +1967,7 @@ static void arena_ralloc_junk_large(void *ptr, size_t old_usize, size_t usize) { - if (config_fill && opt_junk) { + if (config_fill && unlikely(opt_junk)) { memset((void *)((uintptr_t)ptr + usize), 0x5a, old_usize - usize); } @@ -2011,11 +2011,11 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, oldsize, PAGE_CEILING(size), psize - PAGE_CEILING(size), zero); if (config_fill && ret == false && zero == false) { - if (opt_junk) { + if (unlikely(opt_junk)) { memset((void *)((uintptr_t)ptr + oldsize), 0xa5, isalloc(ptr, config_prof) - oldsize); - } else if (opt_zero) { + } else if (unlikely(opt_zero)) { memset((void *)((uintptr_t)ptr + oldsize), 0, isalloc(ptr, config_prof) - oldsize); @@ -2272,7 +2272,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) * minimum alignment; without the padding, each redzone would have to * be twice as large in order to maintain alignment. */ - if (config_fill && opt_redzone) { + if (config_fill && unlikely(opt_redzone)) { size_t align_min = ZU(1) << (jemalloc_ffs(bin_info->reg_size) - 1); if (align_min <= REDZONE_MINSIZE) { diff --git a/src/huge.c b/src/huge.c index e773309..0b7db7f 100644 --- a/src/huge.c +++ b/src/huge.c @@ -62,9 +62,9 @@ huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) malloc_mutex_unlock(&huge_mtx); if (config_fill && zero == false) { - if (opt_junk) + if (unlikely(opt_junk)) memset(ret, 0xa5, csize); - else if (opt_zero && is_zeroed == false) + else if (unlikely(opt_zero) && is_zeroed == false) memset(ret, 0, csize); } @@ -141,7 +141,7 @@ static void huge_dalloc_junk(void *ptr, size_t usize) { - if (config_fill && have_dss && opt_junk) { + if (config_fill && have_dss && unlikely(opt_junk)) { /* * Only bother junk filling if the chunk isn't about to be * unmapped. diff --git a/src/jemalloc.c b/src/jemalloc.c index f6be751..dfb1266 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -87,7 +87,7 @@ typedef struct { #ifdef JEMALLOC_UTRACE # define UTRACE(a, b, c) do { \ - if (opt_utrace) { \ + if (unlikely(opt_utrace)) { \ int utrace_serrno = errno; \ malloc_utrace_t ut; \ ut.p = (a); \ @@ -283,7 +283,7 @@ malloc_thread_init(void) * a best effort attempt at initializing its TSD by hooking all * allocation events. */ - if (config_fill && opt_quarantine) + if (config_fill && unlikely(opt_quarantine)) quarantine_alloc_hook(); } @@ -397,13 +397,13 @@ malloc_conf_init(void) */ if (config_valgrind) { in_valgrind = (RUNNING_ON_VALGRIND != 0) ? 
true : false; - if (config_fill && in_valgrind) { + if (config_fill && unlikely(in_valgrind)) { opt_junk = false; assert(opt_zero == false); opt_quarantine = JEMALLOC_VALGRIND_QUARANTINE_DEFAULT; opt_redzone = true; } - if (config_tcache && in_valgrind) + if (config_tcache && unlikely(in_valgrind)) opt_tcache = false; } @@ -887,7 +887,7 @@ imalloc_prof(size_t usize) prof_tctx_t *tctx; tctx = prof_alloc_prep(usize, true); - if ((uintptr_t)tctx != (uintptr_t)1U) + if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) p = imalloc_prof_sample(usize, tctx); else p = imalloc(usize); @@ -912,7 +912,7 @@ imalloc_body(size_t size, size_t *usize) return (imalloc_prof(*usize)); } - if (config_stats || (unlikely(config_valgrind && in_valgrind))) + if (config_stats || (config_valgrind && unlikely(in_valgrind))) *usize = s2u(size); return (imalloc(size)); } @@ -927,15 +927,15 @@ je_malloc(size_t size) size = 1; ret = imalloc_body(size, &usize); - if (ret == NULL) { - if (config_xmalloc && opt_xmalloc) { + if (unlikely(ret == NULL)) { + if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error in malloc(): " "out of memory\n"); abort(); } set_errno(ENOMEM); } - if (config_stats && ret != NULL) { + if (config_stats && likely(ret != NULL)) { assert(usize == isalloc(ret, config_prof)); thread_allocated_tsd_get()->allocated += usize; } @@ -970,7 +970,7 @@ imemalign_prof(size_t alignment, size_t usize) prof_tctx_t *tctx; tctx = prof_alloc_prep(usize, true); - if ((uintptr_t)tctx != (uintptr_t)1U) + if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) p = imemalign_prof_sample(alignment, usize, tctx); else p = ipalloc(usize, alignment, false); @@ -1001,9 +1001,9 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) size = 1; /* Make sure that alignment is a large enough power of 2. 
*/ - if (((alignment - 1) & alignment) != 0 - || (alignment < min_alignment)) { - if (config_xmalloc && opt_xmalloc) { + if (unlikely(((alignment - 1) & alignment) != 0 + || (alignment < min_alignment))) { + if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error allocating " "aligned memory: invalid alignment\n"); abort(); @@ -1014,7 +1014,7 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) } usize = sa2u(size, alignment); - if (usize == 0) { + if (unlikely(usize == 0)) { result = NULL; goto label_oom; } @@ -1023,14 +1023,14 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) result = imemalign_prof(alignment, usize); else result = ipalloc(usize, alignment, false); - if (result == NULL) + if (unlikely(result == NULL)) goto label_oom; } *memptr = result; ret = 0; label_return: - if (config_stats && result != NULL) { + if (config_stats && likely(result != NULL)) { assert(usize == isalloc(result, config_prof)); thread_allocated_tsd_get()->allocated += usize; } @@ -1038,7 +1038,7 @@ label_return: return (ret); label_oom: assert(result == NULL); - if (config_xmalloc && opt_xmalloc) { + if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error allocating aligned memory: " "out of memory\n"); abort(); @@ -1062,7 +1062,7 @@ je_aligned_alloc(size_t alignment, size_t size) void *ret; int err; - if ((err = imemalign(&ret, alignment, size, 1)) != 0) { + if (unlikely((err = imemalign(&ret, alignment, size, 1)) != 0)) { ret = NULL; set_errno(err); } @@ -1096,7 +1096,7 @@ icalloc_prof(size_t usize) prof_tctx_t *tctx; tctx = prof_alloc_prep(usize, true); - if ((uintptr_t)tctx != (uintptr_t)1U) + if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) p = icalloc_prof_sample(usize, tctx); else p = icalloc(usize); @@ -1123,7 +1123,7 @@ je_calloc(size_t num, size_t size) } num_size = num * size; - if (num_size == 0) { + if (unlikely(num_size == 0)) { if (num == 0 || size == 0) num_size = 1; else { @@ -1135,8 +1135,8 @@ je_calloc(size_t num, size_t size) * overflow during multiplication if neither operand uses any of the * most significant half of the bits in a size_t. */ - } else if (((num | size) & (SIZE_T_MAX << (sizeof(size_t) << 2))) - && (num_size / size != num)) { + } else if (unlikely(((num | size) & (SIZE_T_MAX << (sizeof(size_t) << + 2))) && (num_size / size != num))) { /* size_t overflow. 
*/ ret = NULL; goto label_return; @@ -1146,21 +1146,21 @@ je_calloc(size_t num, size_t size) usize = s2u(num_size); ret = icalloc_prof(usize); } else { - if (config_stats || unlikely(config_valgrind && in_valgrind)) + if (config_stats || (config_valgrind && unlikely(in_valgrind))) usize = s2u(num_size); ret = icalloc(num_size); } label_return: - if (ret == NULL) { - if (config_xmalloc && opt_xmalloc) { + if (unlikely(ret == NULL)) { + if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error in calloc(): out of " "memory\n"); abort(); } set_errno(ENOMEM); } - if (config_stats && ret != NULL) { + if (config_stats && likely(ret != NULL)) { assert(usize == isalloc(ret, config_prof)); thread_allocated_tsd_get()->allocated += usize; } @@ -1195,7 +1195,7 @@ irealloc_prof(void *oldptr, size_t old_usize, size_t usize) old_tctx = prof_tctx_get(oldptr); tctx = prof_alloc_prep(usize, true); - if ((uintptr_t)tctx != (uintptr_t)1U) + if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) p = irealloc_prof_sample(oldptr, usize, tctx); else p = iralloc(oldptr, usize, 0, false); @@ -1222,7 +1222,7 @@ ifree(void *ptr, bool try_tcache) usize = isalloc(ptr, config_prof); if (config_stats) thread_allocated_tsd_get()->deallocated += usize; - if (unlikely(config_valgrind && in_valgrind)) + if (config_valgrind && unlikely(in_valgrind)) rzsize = p2rz(ptr); iqalloc(ptr, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); @@ -1240,7 +1240,7 @@ isfree(void *ptr, size_t usize, bool try_tcache) prof_free(ptr, usize); if (config_stats) thread_allocated_tsd_get()->deallocated += usize; - if (unlikely(config_valgrind && in_valgrind)) + if (config_valgrind && unlikely(in_valgrind)) rzsize = p2rz(ptr); isqalloc(ptr, usize, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); @@ -1254,7 +1254,7 @@ je_realloc(void *ptr, size_t size) size_t old_usize = 0; UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); - if (size == 0) { + if (unlikely(size == 0)) { if (ptr != NULL) { /* realloc(ptr, 0) is equivalent to free(ptr). */ UTRACE(ptr, 0, 0); @@ -1264,21 +1264,22 @@ je_realloc(void *ptr, size_t size) size = 1; } - if (ptr != NULL) { + if (likely(ptr != NULL)) { assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); if ((config_prof && opt_prof) || config_stats || - unlikely(config_valgrind && in_valgrind)) + (config_valgrind && unlikely(in_valgrind))) old_usize = isalloc(ptr, config_prof); - if (unlikely(config_valgrind && in_valgrind)) + if (config_valgrind && unlikely(in_valgrind)) old_rzsize = config_prof ? 
p2rz(ptr) : u2rz(old_usize); if (config_prof && opt_prof) { usize = s2u(size); ret = irealloc_prof(ptr, old_usize, usize); } else { - if (config_stats || unlikely(config_valgrind && in_valgrind)) + if (config_stats || (config_valgrind && + unlikely(in_valgrind))) usize = s2u(size); ret = iralloc(ptr, size, 0, false); } @@ -1287,15 +1288,15 @@ je_realloc(void *ptr, size_t size) ret = imalloc_body(size, &usize); } - if (ret == NULL) { - if (config_xmalloc && opt_xmalloc) { + if (unlikely(ret == NULL)) { + if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error in realloc(): " "out of memory\n"); abort(); } set_errno(ENOMEM); } - if (config_stats && ret != NULL) { + if (config_stats && likely(ret != NULL)) { thread_allocated_t *ta; assert(usize == isalloc(ret, config_prof)); ta = thread_allocated_tsd_get(); @@ -1313,7 +1314,7 @@ je_free(void *ptr) { UTRACE(ptr, 0, 0); - if (ptr != NULL) + if (likely(ptr != NULL)) ifree(ptr, true); } @@ -1410,7 +1411,7 @@ imallocx_flags_decode(size_t size, int flags, size_t *usize, size_t *alignment, bool *zero, bool *try_tcache, arena_t **arena) { - if (flags == 0) { + if (likely(flags == 0)) { *usize = s2u(size); assert(usize != 0); *alignment = 0; @@ -1440,7 +1441,7 @@ imallocx_maybe_flags(size_t size, int flags, size_t usize, size_t alignment, bool zero, bool try_tcache, arena_t *arena) { - if (flags == 0) + if (likely(flags == 0)) return (imalloc(size)); return (imallocx_flags(usize, alignment, zero, try_tcache, arena)); } @@ -1479,7 +1480,7 @@ imallocx_prof(size_t size, int flags, size_t *usize) imallocx_flags_decode(size, flags, usize, &alignment, &zero, &try_tcache, &arena); tctx = prof_alloc_prep(*usize, true); - if ((uintptr_t)tctx == (uintptr_t)1U) { + if (likely((uintptr_t)tctx == (uintptr_t)1U)) { p = imallocx_maybe_flags(size, flags, *usize, alignment, zero, try_tcache, arena); } else if ((uintptr_t)tctx > (uintptr_t)1U) { @@ -1487,7 +1488,7 @@ imallocx_prof(size_t size, int flags, size_t *usize) try_tcache, arena); } else p = NULL; - if (p == NULL) { + if (unlikely(p == NULL)) { prof_alloc_rollback(tctx, true); return (NULL); } @@ -1504,8 +1505,8 @@ imallocx_no_prof(size_t size, int flags, size_t *usize) bool try_tcache; arena_t *arena; - if (flags == 0) { - if (config_stats || unlikely(config_valgrind && in_valgrind)) + if (likely(flags == 0)) { + if (config_stats || (config_valgrind && unlikely(in_valgrind))) *usize = s2u(size); return (imalloc(size)); } @@ -1530,7 +1531,7 @@ je_mallocx(size_t size, int flags) p = imallocx_prof(size, flags, &usize); else p = imallocx_no_prof(size, flags, &usize); - if (p == NULL) + if (unlikely(p == NULL)) goto label_oom; if (config_stats) { @@ -1541,7 +1542,7 @@ je_mallocx(size_t size, int flags) JEMALLOC_VALGRIND_MALLOC(true, p, usize, MALLOCX_ZERO_GET(flags)); return (p); label_oom: - if (config_xmalloc && opt_xmalloc) { + if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error in mallocx(): out of memory\n"); abort(); } @@ -1582,14 +1583,14 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, old_tctx = prof_tctx_get(oldptr); tctx = prof_alloc_prep(*usize, true); - if ((uintptr_t)tctx != (uintptr_t)1U) { + if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, try_tcache_alloc, try_tcache_dalloc, arena, tctx); } else { p = iralloct(oldptr, size, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); } - if (p == NULL) { + if (unlikely(p == NULL)) { prof_alloc_rollback(tctx, true); return 
(NULL); } @@ -1614,7 +1615,8 @@ void * je_rallocx(void *ptr, size_t size, int flags) { void *p; - size_t usize, old_usize; + size_t usize; + UNUSED size_t old_usize JEMALLOC_CC_SILENCE_INIT(0); UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; @@ -1626,7 +1628,7 @@ je_rallocx(void *ptr, size_t size, int flags) assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); - if ((flags & MALLOCX_ARENA_MASK) != 0) { + if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena_chunk_t *chunk; try_tcache_alloc = false; @@ -1641,9 +1643,9 @@ je_rallocx(void *ptr, size_t size, int flags) } if ((config_prof && opt_prof) || config_stats || - (unlikely(config_valgrind && in_valgrind))) + ((config_valgrind && unlikely(in_valgrind)))) old_usize = isalloc(ptr, config_prof); - if (unlikely(config_valgrind && in_valgrind)) + if (config_valgrind && unlikely(in_valgrind)) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { @@ -1651,14 +1653,14 @@ je_rallocx(void *ptr, size_t size, int flags) assert(usize != 0); p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero, try_tcache_alloc, try_tcache_dalloc, arena); - if (p == NULL) + if (unlikely(p == NULL)) goto label_oom; } else { p = iralloct(ptr, size, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); - if (p == NULL) + if (unlikely(p == NULL)) goto label_oom; - if (config_stats || (config_valgrind && in_valgrind)) + if (config_stats || (config_valgrind && unlikely(in_valgrind))) usize = isalloc(p, config_prof); } @@ -1673,7 +1675,7 @@ je_rallocx(void *ptr, size_t size, int flags) old_rzsize, false, zero); return (p); label_oom: - if (config_xmalloc && opt_xmalloc) { + if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error in rallocx(): out of memory\n"); abort(); } @@ -1738,14 +1740,14 @@ ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, max_usize = (alignment == 0) ? 
s2u(size+extra) : sa2u(size+extra, alignment); tctx = prof_alloc_prep(max_usize, false); - if ((uintptr_t)tctx != (uintptr_t)1U) { + if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { usize = ixallocx_prof_sample(ptr, old_usize, size, extra, alignment, zero, max_usize, arena, tctx); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); } - if (usize == old_usize) { + if (unlikely(usize == old_usize)) { prof_alloc_rollback(tctx, false); return (usize); } @@ -1769,14 +1771,14 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); - if ((flags & MALLOCX_ARENA_MASK) != 0) { + if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena = arenas[arena_ind]; } else arena = NULL; old_usize = isalloc(ptr, config_prof); - if (unlikely(config_valgrind && in_valgrind)) + if (config_valgrind && unlikely(in_valgrind)) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { @@ -1786,7 +1788,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); } - if (usize == old_usize) + if (unlikely(usize == old_usize)) goto label_not_resized; if (config_stats) { @@ -1828,7 +1830,7 @@ je_dallocx(void *ptr, int flags) assert(ptr != NULL); assert(malloc_initialized || IS_INITIALIZER); - if ((flags & MALLOCX_ARENA_MASK) != 0) { + if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); try_tcache = (chunk == ptr || chunk->arena != @@ -1845,7 +1847,7 @@ inallocx(size_t size, int flags) { size_t usize; - if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) + if (likely((flags & MALLOCX_LG_ALIGN_MASK) == 0)) usize = s2u(size); else usize = sa2u(size, MALLOCX_ALIGN_GET_SPECIFIED(flags)); @@ -1864,7 +1866,7 @@ je_sdallocx(void *ptr, size_t size, int flags) usize = inallocx(size, flags); assert(usize == isalloc(ptr, config_prof)); - if ((flags & MALLOCX_ARENA_MASK) != 0) { + if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); try_tcache = (chunk == ptr || chunk->arena != diff --git a/src/quarantine.c b/src/quarantine.c index 3b87442..efddeae 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -141,12 +141,12 @@ quarantine(void *ptr) obj->usize = usize; quarantine->curbytes += usize; quarantine->curobjs++; - if (config_fill && opt_junk) { + if (config_fill && unlikely(opt_junk)) { /* * Only do redzone validation if Valgrind isn't in * operation. */ - if ((config_valgrind == false || in_valgrind == false) + if ((!config_valgrind || likely(!in_valgrind)) && usize <= SMALL_MAXCLASS) arena_quarantine_junk_small(ptr, usize); else -- cgit v0.12 From c3e9e7b0412e97e4976507f914fd39901b023537 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 11 Sep 2014 17:04:03 -0700 Subject: Fix irallocx_prof() sample logic. Fix irallocx_prof() sample logic to only update the threshold counter after it knows what size the allocation ended up being. This regression was caused by 6e73dc194ee9682d3eacaf725a989f04629718f7 (Fix a profile sampling race.), which did not make it into any releases prior to this fix. 
--- src/jemalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index dfb1266..c5b8f52 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1582,7 +1582,7 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, prof_tctx_t *old_tctx, *tctx; old_tctx = prof_tctx_get(oldptr); - tctx = prof_alloc_prep(*usize, true); + tctx = prof_alloc_prep(*usize, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, try_tcache_alloc, try_tcache_dalloc, arena, tctx); @@ -1591,7 +1591,7 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, try_tcache_dalloc, arena); } if (unlikely(p == NULL)) { - prof_alloc_rollback(tctx, true); + prof_alloc_rollback(tctx, false); return (NULL); } @@ -1606,7 +1606,7 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, */ *usize = isalloc(p, config_prof); } - prof_realloc(p, *usize, tctx, true, old_usize, old_tctx); + prof_realloc(p, *usize, tctx, false, old_usize, old_tctx); return (p); } -- cgit v0.12 From 9d8f3d203327a7ee9ba92814e1fd8a7d1b9c421b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 11 Sep 2014 18:06:30 -0700 Subject: Fix prof regressions. Don't use atomic_add_uint64(), because it isn't available on 32-bit platforms. Fix forking support functions to manage all prof-related mutexes. These regressions were introduced by 602c8e0971160e4b85b08b16cf8a2375aa24bc04 (Implement per thread heap profiling.), which did not make it into any releases prior to these fixes. --- src/prof.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/prof.c b/src/prof.c index 9495afc..a773e22 100644 --- a/src/prof.c +++ b/src/prof.c @@ -68,6 +68,7 @@ static prof_tdata_tree_t tdatas; static malloc_mutex_t tdatas_mtx; static uint64_t next_thr_uid; +static malloc_mutex_t next_thr_uid_mtx; static malloc_mutex_t prof_dump_seq_mtx; static uint64_t prof_dump_seq; @@ -1498,8 +1499,14 @@ prof_bt_keycomp(const void *k1, const void *k2) JEMALLOC_INLINE_C uint64_t prof_thr_uid_alloc(void) { + uint64_t thr_uid; - return (atomic_add_uint64(&next_thr_uid, 1) - 1); + malloc_mutex_lock(&next_thr_uid_mtx); + thr_uid = next_thr_uid; + next_thr_uid++; + malloc_mutex_unlock(&next_thr_uid_mtx); + + return (thr_uid); } static prof_tdata_t * @@ -1785,6 +1792,8 @@ prof_boot2(void) return (true); next_thr_uid = 0; + if (malloc_mutex_init(&next_thr_uid_mtx)) + return (true); if (malloc_mutex_init(&prof_dump_seq_mtx)) return (true); @@ -1836,10 +1845,14 @@ prof_prefork(void) if (opt_prof) { unsigned i; + malloc_mutex_prefork(&tdatas_mtx); malloc_mutex_prefork(&bt2gctx_mtx); + malloc_mutex_prefork(&next_thr_uid_mtx); malloc_mutex_prefork(&prof_dump_seq_mtx); for (i = 0; i < PROF_NCTX_LOCKS; i++) malloc_mutex_prefork(&gctx_locks[i]); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) + malloc_mutex_prefork(&tdata_locks[i]); } } @@ -1850,10 +1863,14 @@ prof_postfork_parent(void) if (opt_prof) { unsigned i; + for (i = 0; i < PROF_NTDATA_LOCKS; i++) + malloc_mutex_postfork_parent(&tdata_locks[i]); for (i = 0; i < PROF_NCTX_LOCKS; i++) malloc_mutex_postfork_parent(&gctx_locks[i]); malloc_mutex_postfork_parent(&prof_dump_seq_mtx); + malloc_mutex_postfork_parent(&next_thr_uid_mtx); malloc_mutex_postfork_parent(&bt2gctx_mtx); + malloc_mutex_postfork_parent(&tdatas_mtx); } } @@ -1864,10 +1881,14 @@ prof_postfork_child(void) if (opt_prof) { unsigned i; + for (i = 0; i < PROF_NTDATA_LOCKS; 
i++) + malloc_mutex_postfork_child(&tdata_locks[i]); for (i = 0; i < PROF_NCTX_LOCKS; i++) malloc_mutex_postfork_child(&gctx_locks[i]); malloc_mutex_postfork_child(&prof_dump_seq_mtx); + malloc_mutex_postfork_child(&next_thr_uid_mtx); malloc_mutex_postfork_child(&bt2gctx_mtx); + malloc_mutex_postfork_child(&tdatas_mtx); } } -- cgit v0.12 From ebca69c9fb07dd7b0be7aa008215389581b193a0 Mon Sep 17 00:00:00 2001 From: Valerii Hiora Date: Fri, 12 Sep 2014 07:24:28 +0300 Subject: Fixed iOS build after OR1 changes --- config.sub | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config.sub b/config.sub index d654d03..0ccff77 100755 --- a/config.sub +++ b/config.sub @@ -1404,6 +1404,9 @@ case $os in -mac*) os=`echo $os | sed -e 's|mac|macos|'` ;; + # Apple iOS + -ios*) + ;; -linux-dietlibc) os=-linux-dietlibc ;; -- cgit v0.12 From f1cf3ea4753260d37c9a43463bae2140e00e16f0 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Tue, 16 Sep 2014 04:42:33 -0400 Subject: fix tls_model autoconf test It has an unused variable, so it was always failing (at least with gcc 4.9.1). Alternatively, the `-Werror` flag could be removed if it isn't strictly necessary. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index d221876..1d2e890 100644 --- a/configure.ac +++ b/configure.ac @@ -411,7 +411,7 @@ SAVED_CFLAGS="${CFLAGS}" JE_CFLAGS_APPEND([-Werror]) JE_COMPILABLE([tls_model attribute], [], [static __thread int - __attribute__((tls_model("initial-exec"))) foo; + __attribute__((tls_model("initial-exec"), unused)) foo; foo = 0;], [je_cv_tls_model]) CFLAGS="${SAVED_CFLAGS}" -- cgit v0.12 From 913e9a8a853a693c5b5d6c13ab86f1b46a3404f7 Mon Sep 17 00:00:00 2001 From: Nick White Date: Fri, 19 Sep 2014 22:01:23 +0100 Subject: Generate a pkg-config file --- Makefile.in | 10 +++++++++- configure.ac | 3 +++ jemalloc.pc.in | 11 +++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 jemalloc.pc.in diff --git a/Makefile.in b/Makefile.in index ac56d8f..41328b9 100644 --- a/Makefile.in +++ b/Makefile.in @@ -101,6 +101,7 @@ DSOS := $(objroot)lib/$(LIBJEMALLOC).$(SOREV) ifneq ($(SOREV),$(SO)) DSOS += $(objroot)lib/$(LIBJEMALLOC).$(SO) endif +PC := $(srcroot)jemalloc.pc MAN3 := $(objroot)doc/jemalloc$(install_suffix).3 DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.html) @@ -302,7 +303,14 @@ install_lib_static: $(STATIC_LIBS) install -m 755 $$l $(LIBDIR); \ done -install_lib: install_lib_shared install_lib_static +install_lib_pc: $(PC) + install -d $(LIBDIR)/pkgconfig + @for l in $(PC); do \ + echo "install -m 644 $$l $(LIBDIR)/pkgconfig"; \ + install -m 644 $$l $(LIBDIR)/pkgconfig; \ +done + +install_lib: install_lib_shared install_lib_static install_lib_pc install_doc_html: install -d $(DATADIR)/doc/jemalloc$(install_suffix) diff --git a/configure.ac b/configure.ac index 1d2e890..2d5b56a 100644 --- a/configure.ac +++ b/configure.ac @@ -540,6 +540,7 @@ je_="je_" AC_SUBST([je_]) cfgoutputs_in="Makefile.in" +cfgoutputs_in="${cfgoutputs_in} jemalloc.pc.in" cfgoutputs_in="${cfgoutputs_in} doc/html.xsl.in" cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in" cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in" @@ -551,6 +552,7 @@ cfgoutputs_in="${cfgoutputs_in} test/test.sh.in" cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in" cfgoutputs_out="Makefile" +cfgoutputs_out="${cfgoutputs_out} jemalloc.pc" cfgoutputs_out="${cfgoutputs_out} doc/html.xsl" 
cfgoutputs_out="${cfgoutputs_out} doc/manpages.xsl" cfgoutputs_out="${cfgoutputs_out} doc/jemalloc.xml" @@ -562,6 +564,7 @@ cfgoutputs_out="${cfgoutputs_out} test/test.sh" cfgoutputs_out="${cfgoutputs_out} test/include/test/jemalloc_test.h" cfgoutputs_tup="Makefile" +cfgoutputs_tup="${cfgoutputs_tup} jemalloc.pc:jemalloc.pc.in" cfgoutputs_tup="${cfgoutputs_tup} doc/html.xsl:doc/html.xsl.in" cfgoutputs_tup="${cfgoutputs_tup} doc/manpages.xsl:doc/manpages.xsl.in" cfgoutputs_tup="${cfgoutputs_tup} doc/jemalloc.xml:doc/jemalloc.xml.in" diff --git a/jemalloc.pc.in b/jemalloc.pc.in new file mode 100644 index 0000000..af3f945 --- /dev/null +++ b/jemalloc.pc.in @@ -0,0 +1,11 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: jemalloc +Description: A general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support. +URL: http://www.canonware.com/jemalloc +Version: @jemalloc_version@ +Cflags: -I${includedir} +Libs: -L${libdir} -ljemalloc -- cgit v0.12 From 42f59559384ddb1af22607ddb3fe766b7b6ab0b7 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 21 Sep 2014 21:40:38 -0700 Subject: Ignore jemalloc.pc . --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 79d454f..fd68315 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ /doc/jemalloc.html /doc/jemalloc.3 +/jemalloc.pc + /lib/ /Makefile -- cgit v0.12 From 5460aa6f6676c7f253bfcb75c028dfd38cae8aaf Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 22 Sep 2014 21:09:23 -0700 Subject: Convert all tsd variables to reside in a single tsd structure. --- include/jemalloc/internal/arena.h | 48 +-- include/jemalloc/internal/ckh.h | 8 +- include/jemalloc/internal/huge.h | 10 +- include/jemalloc/internal/jemalloc_internal.h.in | 168 +++++------ include/jemalloc/internal/private_symbols.txt | 96 ++---- include/jemalloc/internal/prof.h | 78 +++-- include/jemalloc/internal/quarantine.h | 21 +- include/jemalloc/internal/tcache.h | 88 +++--- include/jemalloc/internal/tsd.h | 341 +++++++++++++-------- src/arena.c | 23 +- src/ckh.c | 38 +-- src/ctl.c | 93 ++++-- src/huge.c | 21 +- src/jemalloc.c | 366 ++++++++++++----------- src/prof.c | 244 +++++++-------- src/quarantine.c | 99 ++---- src/rtree.c | 6 +- src/tcache.c | 101 ++----- src/tsd.c | 51 +++- test/unit/ckh.c | 46 +-- test/unit/rtree.c | 8 +- test/unit/tsd.c | 8 +- 22 files changed, 1027 insertions(+), 935 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index bfb0b3c..f1a1205 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -419,9 +419,9 @@ extern arena_ralloc_junk_large_t *arena_ralloc_junk_large; #endif bool arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); -void *arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, - bool try_tcache_dalloc); +void *arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, + size_t size, size_t extra, size_t alignment, bool zero, + bool try_tcache_alloc, bool try_tcache_dalloc); dss_prec_t arena_dss_prec_get(arena_t *arena); bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, @@ -485,10 +485,12 @@ unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr); prof_tctx_t *arena_prof_tctx_get(const 
void *ptr); void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); -void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache); +void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, + bool try_tcache); size_t arena_salloc(const void *ptr, bool demote); -void arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache); -void arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, +void arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, + bool try_tcache); +void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache); #endif @@ -1053,7 +1055,8 @@ arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) } JEMALLOC_ALWAYS_INLINE void * -arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache) +arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, + bool try_tcache) { tcache_t *tcache; @@ -1061,12 +1064,12 @@ arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache) assert(size <= arena_maxclass); if (likely(size <= SMALL_MAXCLASS)) { - if (likely(try_tcache) && likely((tcache = tcache_get(true)) != - NULL)) + if (likely(try_tcache) && likely((tcache = tcache_get(tsd, + true)) != NULL)) return (tcache_alloc_small(tcache, size, zero)); else { - return (arena_malloc_small(choose_arena(arena), size, - zero)); + return (arena_malloc_small(choose_arena(tsd, arena), + size, zero)); } } else { /* @@ -1074,11 +1077,11 @@ arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache) * infinite recursion during tcache initialization. */ if (try_tcache && size <= tcache_maxclass && likely((tcache = - tcache_get(true)) != NULL)) + tcache_get(tsd, true)) != NULL)) return (tcache_alloc_large(tcache, size, zero)); else { - return (arena_malloc_large(choose_arena(arena), size, - zero)); + return (arena_malloc_large(choose_arena(tsd, arena), + size, zero)); } } } @@ -1128,7 +1131,7 @@ arena_salloc(const void *ptr, bool demote) } JEMALLOC_ALWAYS_INLINE void -arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) +arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache) { size_t pageind, mapbits; tcache_t *tcache; @@ -1141,8 +1144,8 @@ arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) assert(arena_mapbits_allocated_get(chunk, pageind) != 0); if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) { /* Small allocation. */ - if (likely(try_tcache) && likely((tcache = tcache_get(false)) != - NULL)) { + if (likely(try_tcache) && likely((tcache = tcache_get(tsd, + false)) != NULL)) { size_t binind = arena_ptr_small_binind_get(ptr, mapbits); tcache_dalloc_small(tcache, ptr, binind); @@ -1154,7 +1157,7 @@ arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) assert(((uintptr_t)ptr & PAGE_MASK) == 0); if (try_tcache && size <= tcache_maxclass && likely((tcache = - tcache_get(false)) != NULL)) { + tcache_get(tsd, false)) != NULL)) { tcache_dalloc_large(tcache, ptr, size); } else arena_dalloc_large(chunk->arena, chunk, ptr); @@ -1162,7 +1165,8 @@ arena_dalloc(arena_chunk_t *chunk, void *ptr, bool try_tcache) } JEMALLOC_ALWAYS_INLINE void -arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache) +arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, + bool try_tcache) { tcache_t *tcache; @@ -1171,8 +1175,8 @@ arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache) if (likely(size <= SMALL_MAXCLASS)) { /* Small allocation. 
*/ - if (likely(try_tcache) && likely((tcache = tcache_get(false)) != - NULL)) { + if (likely(try_tcache) && likely((tcache = tcache_get(tsd, + false)) != NULL)) { size_t binind = small_size2bin(size); tcache_dalloc_small(tcache, ptr, binind); } else { @@ -1184,7 +1188,7 @@ arena_sdalloc(arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache) assert(((uintptr_t)ptr & PAGE_MASK) == 0); if (try_tcache && size <= tcache_maxclass && (tcache = - tcache_get(false)) != NULL) { + tcache_get(tsd, false)) != NULL) { tcache_dalloc_large(tcache, ptr, size); } else arena_dalloc_large(chunk->arena, chunk, ptr); diff --git a/include/jemalloc/internal/ckh.h b/include/jemalloc/internal/ckh.h index 58712a6..75c1c97 100644 --- a/include/jemalloc/internal/ckh.h +++ b/include/jemalloc/internal/ckh.h @@ -66,13 +66,13 @@ struct ckh_s { /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -bool ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, +bool ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp); -void ckh_delete(ckh_t *ckh); +void ckh_delete(tsd_t *tsd, ckh_t *ckh); size_t ckh_count(ckh_t *ckh); bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data); -bool ckh_insert(ckh_t *ckh, const void *key, const void *data); -bool ckh_remove(ckh_t *ckh, const void *searchkey, void **key, +bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data); +bool ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, void **data); bool ckh_search(ckh_t *ckh, const void *seachkey, void **key, void **data); void ckh_string_hash(const void *key, size_t r_hash[2]); diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 2ec7752..b061e15 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -9,12 +9,14 @@ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -void *huge_malloc(arena_t *arena, size_t size, bool zero); -void *huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero); +void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero); +void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, + bool zero); bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra); -void *huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc); +void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, + size_t size, size_t extra, size_t alignment, bool zero, + bool try_tcache_dalloc); #ifdef JEMALLOC_JET typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index a380a41..bff2bd2 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -350,7 +350,6 @@ static const bool config_ivsalloc = #include "jemalloc/internal/stats.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" -#include "jemalloc/internal/tsd.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" @@ -364,15 +363,7 @@ static const bool config_ivsalloc = #include "jemalloc/internal/quarantine.h" #include "jemalloc/internal/prof.h" -typedef struct { - uint64_t allocated; - 
uint64_t deallocated; -} thread_allocated_t; -/* - * The JEMALLOC_ARG_CONCAT() wrapper is necessary to pass {0, 0} via a cpp macro - * argument. - */ -#define THREAD_ALLOCATED_INITIALIZER JEMALLOC_ARG_CONCAT({0, 0}) +#include "jemalloc/internal/tsd.h" #undef JEMALLOC_H_STRUCTS /******************************************************************************/ @@ -407,8 +398,10 @@ extern unsigned narenas_total; extern unsigned narenas_auto; /* Read-only after initialization. */ arena_t *arenas_extend(unsigned ind); -void arenas_cleanup(void *arg); -arena_t *choose_arena_hard(void); +arena_t *choose_arena_hard(tsd_t *tsd); +void thread_allocated_cleanup(tsd_t *tsd); +void thread_deallocated_cleanup(tsd_t *tsd); +void arena_cleanup(tsd_t *tsd); void jemalloc_prefork(void); void jemalloc_postfork_parent(void); void jemalloc_postfork_child(void); @@ -422,7 +415,6 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/stats.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" -#include "jemalloc/internal/tsd.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" @@ -435,6 +427,7 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" #include "jemalloc/internal/prof.h" +#include "jemalloc/internal/tsd.h" #undef JEMALLOC_H_EXTERNS /******************************************************************************/ @@ -465,24 +458,14 @@ void jemalloc_postfork_child(void); #undef JEMALLOC_ARENA_INLINE_A #ifndef JEMALLOC_ENABLE_INLINE -malloc_tsd_protos(JEMALLOC_ATTR(unused), arenas, arena_t *) - size_t s2u(size_t size); size_t sa2u(size_t size, size_t alignment); unsigned narenas_total_get(void); -arena_t *choose_arena(arena_t *arena); +arena_t *choose_arena(tsd_t *tsd, arena_t *arena); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) /* - * Map of pthread_self() --> arenas[???], used for selecting an arena to use - * for allocations. - */ -malloc_tsd_externs(arenas, arena_t *) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, arenas, arena_t *, NULL, - arenas_cleanup) - -/* * Compute usable size that would result from allocating an object with the * specified size. */ @@ -589,15 +572,15 @@ narenas_total_get(void) /* Choose an arena based on a per-thread value. 
*/ JEMALLOC_INLINE arena_t * -choose_arena(arena_t *arena) +choose_arena(tsd_t *tsd, arena_t *arena) { arena_t *ret; if (arena != NULL) return (arena); - if ((ret = *arenas_tsd_get()) == NULL) { - ret = choose_arena_hard(); + if (unlikely((ret = tsd_arena_get(tsd)) == NULL)) { + ret = choose_arena_hard(tsd); assert(ret != NULL); } @@ -622,72 +605,72 @@ choose_arena(arena_t *arena) #include "jemalloc/internal/quarantine.h" #ifndef JEMALLOC_ENABLE_INLINE -void *imalloct(size_t size, bool try_tcache, arena_t *arena); -void *imalloc(size_t size); -void *icalloct(size_t size, bool try_tcache, arena_t *arena); -void *icalloc(size_t size); -void *ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena); -void *ipalloc(size_t usize, size_t alignment, bool zero); +void *imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena); +void *imalloc(tsd_t *tsd, size_t size); +void *icalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena); +void *icalloc(tsd_t *tsd, size_t size); +void *ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + bool try_tcache, arena_t *arena); +void *ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero); size_t isalloc(const void *ptr, bool demote); size_t ivsalloc(const void *ptr, bool demote); size_t u2rz(size_t usize); size_t p2rz(const void *ptr); -void idalloct(void *ptr, bool try_tcache); -void isdalloct(void *ptr, size_t size, bool try_tcache); -void idalloc(void *ptr); -void iqalloc(void *ptr, bool try_tcache); -void isqalloc(void *ptr, size_t size, bool try_tcache); -void *iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, - size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena); -void *iralloct(void *ptr, size_t size, size_t alignment, bool zero, - bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); -void *iralloc(void *ptr, size_t size, size_t alignment, bool zero); +void idalloct(tsd_t *tsd, void *ptr, bool try_tcache); +void isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache); +void idalloc(tsd_t *tsd, void *ptr); +void iqalloc(tsd_t *tsd, void *ptr, bool try_tcache); +void isqalloc(tsd_t *tsd, void *ptr, size_t size, bool try_tcache); +void *iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, + bool try_tcache_dalloc, arena_t *arena); +void *iralloct(tsd_t *tsd, void *ptr, size_t size, size_t alignment, + bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); +void *iralloc(tsd_t *tsd, void *ptr, size_t size, size_t alignment, + bool zero); bool ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero); -malloc_tsd_protos(JEMALLOC_ATTR(unused), thread_allocated, thread_allocated_t) #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) JEMALLOC_ALWAYS_INLINE void * -imalloct(size_t size, bool try_tcache, arena_t *arena) +imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) { assert(size != 0); if (size <= arena_maxclass) - return (arena_malloc(arena, size, false, try_tcache)); + return (arena_malloc(tsd, arena, size, false, try_tcache)); else - return (huge_malloc(arena, size, false)); + return (huge_malloc(tsd, arena, size, false)); } JEMALLOC_ALWAYS_INLINE void * -imalloc(size_t size) +imalloc(tsd_t *tsd, size_t size) { - return (imalloct(size, true, NULL)); + return (imalloct(tsd, size, true, NULL)); } JEMALLOC_ALWAYS_INLINE void * -icalloct(size_t size, bool 
try_tcache, arena_t *arena) +icalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) { if (size <= arena_maxclass) - return (arena_malloc(arena, size, true, try_tcache)); + return (arena_malloc(tsd, arena, size, true, try_tcache)); else - return (huge_malloc(arena, size, true)); + return (huge_malloc(tsd, arena, size, true)); } JEMALLOC_ALWAYS_INLINE void * -icalloc(size_t size) +icalloc(tsd_t *tsd, size_t size) { - return (icalloct(size, true, NULL)); + return (icalloct(tsd, size, true, NULL)); } JEMALLOC_ALWAYS_INLINE void * -ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache, +ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, arena_t *arena) { void *ret; @@ -696,15 +679,15 @@ ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache, assert(usize == sa2u(usize, alignment)); if (usize <= arena_maxclass && alignment <= PAGE) - ret = arena_malloc(arena, usize, zero, try_tcache); + ret = arena_malloc(tsd, arena, usize, zero, try_tcache); else { if (usize <= arena_maxclass) { - ret = arena_palloc(choose_arena(arena), usize, + ret = arena_palloc(choose_arena(tsd, arena), usize, alignment, zero); } else if (alignment <= chunksize) - ret = huge_malloc(arena, usize, zero); + ret = huge_malloc(tsd, arena, usize, zero); else - ret = huge_palloc(arena, usize, alignment, zero); + ret = huge_palloc(tsd, arena, usize, alignment, zero); } assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); @@ -712,10 +695,10 @@ ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache, } JEMALLOC_ALWAYS_INLINE void * -ipalloc(size_t usize, size_t alignment, bool zero) +ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) { - return (ipalloct(usize, alignment, zero, true, NULL)); + return (ipalloct(tsd, usize, alignment, zero, true, NULL)); } /* @@ -776,7 +759,7 @@ p2rz(const void *ptr) } JEMALLOC_ALWAYS_INLINE void -idalloct(void *ptr, bool try_tcache) +idalloct(tsd_t *tsd, void *ptr, bool try_tcache) { arena_chunk_t *chunk; @@ -784,13 +767,13 @@ idalloct(void *ptr, bool try_tcache) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) - arena_dalloc(chunk, ptr, try_tcache); + arena_dalloc(tsd, chunk, ptr, try_tcache); else huge_dalloc(ptr); } JEMALLOC_ALWAYS_INLINE void -isdalloct(void *ptr, size_t size, bool try_tcache) +isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) { arena_chunk_t *chunk; @@ -798,42 +781,42 @@ isdalloct(void *ptr, size_t size, bool try_tcache) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) - arena_sdalloc(chunk, ptr, size, try_tcache); + arena_sdalloc(tsd, chunk, ptr, size, try_tcache); else huge_dalloc(ptr); } JEMALLOC_ALWAYS_INLINE void -idalloc(void *ptr) +idalloc(tsd_t *tsd, void *ptr) { - idalloct(ptr, true); + idalloct(tsd, ptr, true); } JEMALLOC_ALWAYS_INLINE void -iqalloc(void *ptr, bool try_tcache) +iqalloc(tsd_t *tsd, void *ptr, bool try_tcache) { if (config_fill && unlikely(opt_quarantine)) - quarantine(ptr); + quarantine(tsd, ptr); else - idalloct(ptr, try_tcache); + idalloct(tsd, ptr, try_tcache); } JEMALLOC_ALWAYS_INLINE void -isqalloc(void *ptr, size_t size, bool try_tcache) +isqalloc(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) { if (config_fill && unlikely(opt_quarantine)) - quarantine(ptr); + quarantine(tsd, ptr); else - isdalloct(ptr, size, try_tcache); + isdalloct(tsd, ptr, size, try_tcache); } JEMALLOC_ALWAYS_INLINE void * -iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, - size_t alignment, bool zero, bool 
try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena) +iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, + bool try_tcache_dalloc, arena_t *arena) { void *p; size_t usize, copysize; @@ -841,7 +824,7 @@ iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, usize = sa2u(size + extra, alignment); if (usize == 0) return (NULL); - p = ipalloct(usize, alignment, zero, try_tcache_alloc, arena); + p = ipalloct(tsd, usize, alignment, zero, try_tcache_alloc, arena); if (p == NULL) { if (extra == 0) return (NULL); @@ -849,7 +832,8 @@ iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, usize = sa2u(size, alignment); if (usize == 0) return (NULL); - p = ipalloct(usize, alignment, zero, try_tcache_alloc, arena); + p = ipalloct(tsd, usize, alignment, zero, try_tcache_alloc, + arena); if (p == NULL) return (NULL); } @@ -859,12 +843,12 @@ iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra, */ copysize = (size < oldsize) ? size : oldsize; memcpy(p, ptr, copysize); - iqalloc(ptr, try_tcache_dalloc); + iqalloc(tsd, ptr, try_tcache_dalloc); return (p); } JEMALLOC_ALWAYS_INLINE void * -iralloct(void *ptr, size_t size, size_t alignment, bool zero, +iralloct(tsd_t *tsd, void *ptr, size_t size, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena) { size_t oldsize; @@ -880,24 +864,24 @@ iralloct(void *ptr, size_t size, size_t alignment, bool zero, * Existing object alignment is inadequate; allocate new space * and copy. */ - return (iralloct_realign(ptr, oldsize, size, 0, alignment, zero, - try_tcache_alloc, try_tcache_dalloc, arena)); + return (iralloct_realign(tsd, ptr, oldsize, size, 0, alignment, + zero, try_tcache_alloc, try_tcache_dalloc, arena)); } if (size <= arena_maxclass) { - return (arena_ralloc(arena, ptr, oldsize, size, 0, alignment, - zero, try_tcache_alloc, try_tcache_dalloc)); + return (arena_ralloc(tsd, arena, ptr, oldsize, size, 0, + alignment, zero, try_tcache_alloc, try_tcache_dalloc)); } else { - return (huge_ralloc(arena, ptr, oldsize, size, 0, alignment, - zero, try_tcache_dalloc)); + return (huge_ralloc(tsd, arena, ptr, oldsize, size, 0, + alignment, zero, try_tcache_dalloc)); } } JEMALLOC_ALWAYS_INLINE void * -iralloc(void *ptr, size_t size, size_t alignment, bool zero) +iralloc(tsd_t *tsd, void *ptr, size_t size, size_t alignment, bool zero) { - return (iralloct(ptr, size, alignment, zero, true, true, NULL)); + return (iralloct(tsd, ptr, size, alignment, zero, true, true, NULL)); } JEMALLOC_ALWAYS_INLINE bool @@ -920,10 +904,6 @@ ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero) else return (huge_ralloc_no_move(ptr, oldsize, size, extra)); } - -malloc_tsd_externs(thread_allocated, thread_allocated_t) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, thread_allocated, thread_allocated_t, - THREAD_ALLOCATED_INITIALIZER, malloc_tsd_no_cleanup) #endif #include "jemalloc/internal/prof.h" diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index b899017..84d48d1 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -8,6 +8,7 @@ arena_bitselm_get arena_boot arena_chunk_alloc_huge arena_chunk_dalloc_huge +arena_cleanup arena_dalloc arena_dalloc_bin arena_dalloc_bin_locked @@ -65,19 +66,9 @@ arena_sdalloc arena_stats_merge arena_tcache_fill_small arenas -arenas_booted arenas_cleanup 
arenas_extend -arenas_initialized arenas_lock -arenas_tls -arenas_tsd -arenas_tsd_boot -arenas_tsd_cleanup_wrapper -arenas_tsd_get -arenas_tsd_get_wrapper -arenas_tsd_init_head -arenas_tsd_set atomic_add_u atomic_add_uint32 atomic_add_uint64 @@ -317,37 +308,17 @@ prof_sample_accum_update prof_sample_threshold_update prof_tctx_get prof_tctx_set -prof_tdata_booted prof_tdata_cleanup prof_tdata_get prof_tdata_init -prof_tdata_initialized -prof_tdata_tls -prof_tdata_tsd -prof_tdata_tsd_boot -prof_tdata_tsd_cleanup_wrapper -prof_tdata_tsd_get -prof_tdata_tsd_get_wrapper -prof_tdata_tsd_init_head -prof_tdata_tsd_set prof_thread_active_get prof_thread_active_set prof_thread_name_get prof_thread_name_set quarantine quarantine_alloc_hook -quarantine_boot -quarantine_booted quarantine_cleanup quarantine_init -quarantine_tls -quarantine_tsd -quarantine_tsd_boot -quarantine_tsd_cleanup_wrapper -quarantine_tsd_get -quarantine_tsd_get_wrapper -quarantine_tsd_init_head -quarantine_tsd_set register_zone rtree_delete rtree_get @@ -386,55 +357,52 @@ tcache_arena_dissociate tcache_bin_flush_large tcache_bin_flush_small tcache_bin_info -tcache_boot0 -tcache_boot1 -tcache_booted +tcache_boot +tcache_cleanup tcache_create tcache_dalloc_large tcache_dalloc_small -tcache_destroy -tcache_enabled_booted +tcache_enabled_cleanup tcache_enabled_get -tcache_enabled_initialized tcache_enabled_set -tcache_enabled_tls -tcache_enabled_tsd -tcache_enabled_tsd_boot -tcache_enabled_tsd_cleanup_wrapper -tcache_enabled_tsd_get -tcache_enabled_tsd_get_wrapper -tcache_enabled_tsd_init_head -tcache_enabled_tsd_set tcache_event tcache_event_hard tcache_flush tcache_get tcache_get_hard -tcache_initialized tcache_maxclass tcache_salloc tcache_stats_merge -tcache_thread_cleanup -tcache_tls -tcache_tsd -tcache_tsd_boot -tcache_tsd_cleanup_wrapper -tcache_tsd_get -tcache_tsd_get_wrapper -tcache_tsd_init_head -tcache_tsd_set -thread_allocated_booted -thread_allocated_initialized -thread_allocated_tls -thread_allocated_tsd -thread_allocated_tsd_boot -thread_allocated_tsd_cleanup_wrapper -thread_allocated_tsd_get -thread_allocated_tsd_get_wrapper -thread_allocated_tsd_init_head -thread_allocated_tsd_set +thread_allocated_cleanup +thread_deallocated_cleanup +tsd_booted +tsd_arena_get +tsd_arena_set +tsd_boot +tsd_cleanup +tsd_cleanup_wrapper +tsd_get +tsd_get_wrapper +tsd_initialized tsd_init_check_recursion tsd_init_finish +tsd_init_head +tsd_quarantine_get +tsd_quarantine_set +tsd_set +tsd_tcache_enabled_get +tsd_tcache_enabled_set +tsd_tcache_get +tsd_tcache_set +tsd_tls +tsd_tsd +tsd_prof_tdata_get +tsd_prof_tdata_set +tsd_thread_allocated_get +tsd_thread_allocated_set +tsd_thread_deallocated_get +tsd_thread_deallocated_set +tsd_tryget u2rz valgrind_freelike_block valgrind_make_mem_defined diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index a1e7ac5..b8a8b41 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -248,13 +248,13 @@ extern uint64_t prof_interval; */ extern size_t lg_prof_sample; -void prof_alloc_rollback(prof_tctx_t *tctx, bool updated); +void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated); void prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx); -void prof_free_sampled_object(size_t usize, prof_tctx_t *tctx); +void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx); void bt_init(prof_bt_t *bt, void **vec); void prof_backtrace(prof_bt_t *bt); -prof_tctx_t *prof_lookup(prof_bt_t *bt); 
+prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt); #ifdef JEMALLOC_JET size_t prof_bt_count(void); typedef int (prof_dump_open_t)(bool, const char *); @@ -263,12 +263,12 @@ extern prof_dump_open_t *prof_dump_open; void prof_idump(void); bool prof_mdump(const char *filename); void prof_gdump(void); -prof_tdata_t *prof_tdata_init(void); -prof_tdata_t *prof_tdata_reinit(prof_tdata_t *tdata); -void prof_reset(size_t lg_sample); -void prof_tdata_cleanup(void *arg); +prof_tdata_t *prof_tdata_init(tsd_t *tsd); +prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata); +void prof_reset(tsd_t *tsd, size_t lg_sample); +void prof_tdata_cleanup(tsd_t *tsd); const char *prof_thread_name_get(void); -bool prof_thread_name_set(const char *thread_name); +bool prof_thread_name_set(tsd_t *tsd, const char *thread_name); bool prof_thread_active_get(void); bool prof_thread_active_set(bool active); void prof_boot0(void); @@ -284,43 +284,38 @@ void prof_sample_threshold_update(prof_tdata_t *tdata); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) - -prof_tdata_t *prof_tdata_get(bool create); -bool prof_sample_accum_update(size_t usize, bool commit, +prof_tdata_t *prof_tdata_get(tsd_t *tsd, bool create); +bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool commit, prof_tdata_t **tdata_out); -prof_tctx_t *prof_alloc_prep(size_t usize, bool update); +prof_tctx_t *prof_alloc_prep(tsd_t *tsd, size_t usize, bool update); prof_tctx_t *prof_tctx_get(const void *ptr); void prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx); void prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx); -void prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, - bool updated, size_t old_usize, prof_tctx_t *old_tctx); -void prof_free(const void *ptr, size_t usize); +void prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, + prof_tctx_t *tctx, bool updated, size_t old_usize, prof_tctx_t *old_tctx); +void prof_free(tsd_t *tsd, const void *ptr, size_t usize); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) -/* Thread-specific backtrace cache, used to reduce bt2gctx contention. 
*/ -malloc_tsd_externs(prof_tdata, prof_tdata_t *) -malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, - prof_tdata_cleanup) - JEMALLOC_INLINE prof_tdata_t * -prof_tdata_get(bool create) +prof_tdata_get(tsd_t *tsd, bool create) { prof_tdata_t *tdata; cassert(config_prof); - tdata = *prof_tdata_tsd_get(); + tdata = tsd_prof_tdata_get(tsd); if (create) { - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { - if (tdata == NULL) - tdata = prof_tdata_init(); - } else if (tdata->state == prof_tdata_state_expired) - tdata = prof_tdata_reinit(tdata); - assert((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX || + if (unlikely(tdata == NULL)) { + tdata = prof_tdata_init(tsd); + tsd_prof_tdata_set(tsd, tdata); + } else if (unlikely(tdata->state == prof_tdata_state_expired)) { + tdata = prof_tdata_reinit(tsd, tdata); + tsd_prof_tdata_set(tsd, tdata); + } + assert(tdata == NULL || tdata->state == prof_tdata_state_attached); } @@ -363,13 +358,14 @@ prof_tctx_set(const void *ptr, prof_tctx_t *tctx) } JEMALLOC_INLINE bool -prof_sample_accum_update(size_t usize, bool update, prof_tdata_t **tdata_out) +prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, + prof_tdata_t **tdata_out) { prof_tdata_t *tdata; cassert(config_prof); - tdata = prof_tdata_get(true); + tdata = prof_tdata_get(tsd, true); if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) tdata = NULL; @@ -392,7 +388,7 @@ prof_sample_accum_update(size_t usize, bool update, prof_tdata_t **tdata_out) } JEMALLOC_INLINE prof_tctx_t * -prof_alloc_prep(size_t usize, bool update) +prof_alloc_prep(tsd_t *tsd, size_t usize, bool update) { prof_tctx_t *ret; prof_tdata_t *tdata; @@ -400,13 +396,13 @@ prof_alloc_prep(size_t usize, bool update) assert(usize == s2u(usize)); - if (!opt_prof_active || likely(prof_sample_accum_update(usize, update, - &tdata))) + if (!opt_prof_active || likely(prof_sample_accum_update(tsd, usize, + update, &tdata))) ret = (prof_tctx_t *)(uintptr_t)1U; else { bt_init(&bt, tdata->vec); prof_backtrace(&bt); - ret = prof_lookup(&bt); + ret = prof_lookup(tsd, &bt); } return (ret); @@ -427,8 +423,8 @@ prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) } JEMALLOC_INLINE void -prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, bool updated, - size_t old_usize, prof_tctx_t *old_tctx) +prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx, + bool updated, size_t old_usize, prof_tctx_t *old_tctx) { cassert(config_prof); @@ -436,7 +432,7 @@ prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, bool updated, if (!updated && ptr != NULL) { assert(usize == isalloc(ptr, true)); - if (prof_sample_accum_update(usize, true, NULL)) { + if (prof_sample_accum_update(tsd, usize, true, NULL)) { /* * Don't sample. 
The usize passed to PROF_ALLOC_PREP() * was larger than what actually got allocated, so a @@ -449,7 +445,7 @@ prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, bool updated, } if (unlikely((uintptr_t)old_tctx > (uintptr_t)1U)) - prof_free_sampled_object(old_usize, old_tctx); + prof_free_sampled_object(tsd, old_usize, old_tctx); if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) prof_malloc_sample_object(ptr, usize, tctx); else @@ -457,7 +453,7 @@ prof_realloc(const void *ptr, size_t usize, prof_tctx_t *tctx, bool updated, } JEMALLOC_INLINE void -prof_free(const void *ptr, size_t usize) +prof_free(tsd_t *tsd, const void *ptr, size_t usize) { prof_tctx_t *tctx = prof_tctx_get(ptr); @@ -465,7 +461,7 @@ prof_free(const void *ptr, size_t usize) assert(usize == isalloc(ptr, true)); if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) - prof_free_sampled_object(usize, tctx); + prof_free_sampled_object(tsd, usize, tctx); } #endif diff --git a/include/jemalloc/internal/quarantine.h b/include/jemalloc/internal/quarantine.h index 16f677f..3a75598 100644 --- a/include/jemalloc/internal/quarantine.h +++ b/include/jemalloc/internal/quarantine.h @@ -29,36 +29,29 @@ struct quarantine_s { /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -quarantine_t *quarantine_init(size_t lg_maxobjs); -void quarantine(void *ptr); -void quarantine_cleanup(void *arg); -bool quarantine_boot(void); +quarantine_t *quarantine_init(tsd_t *tsd, size_t lg_maxobjs); +void quarantine(tsd_t *tsd, void *ptr); +void quarantine_cleanup(tsd_t *tsd); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -malloc_tsd_protos(JEMALLOC_ATTR(unused), quarantine, quarantine_t *) - void quarantine_alloc_hook(void); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_QUARANTINE_C_)) -malloc_tsd_externs(quarantine, quarantine_t *) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, quarantine, quarantine_t *, NULL, - quarantine_cleanup) - JEMALLOC_ALWAYS_INLINE void quarantine_alloc_hook(void) { - quarantine_t *quarantine; + tsd_t *tsd; assert(config_fill && opt_quarantine); - quarantine = *quarantine_tsd_get(); - if (quarantine == NULL) - quarantine_init(LG_MAXOBJS_INIT); + tsd = tsd_tryget(); + if (tsd != NULL && tsd_quarantine_get(tsd) == NULL) + tsd_quarantine_set(tsd, quarantine_init(tsd, LG_MAXOBJS_INIT)); } #endif diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index c9d723a..6804668 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -110,26 +110,22 @@ void tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem, tcache_t *tcache); void tcache_arena_associate(tcache_t *tcache, arena_t *arena); void tcache_arena_dissociate(tcache_t *tcache); -tcache_t *tcache_get_hard(tcache_t *tcache, bool create); +tcache_t *tcache_get_hard(tsd_t *tsd); tcache_t *tcache_create(arena_t *arena); -void tcache_destroy(tcache_t *tcache); -void tcache_thread_cleanup(void *arg); +void tcache_cleanup(tsd_t *tsd); +void tcache_enabled_cleanup(tsd_t *tsd); void tcache_stats_merge(tcache_t *tcache, arena_t *arena); -bool tcache_boot0(void); -bool tcache_boot1(void); +bool tcache_boot(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -malloc_tsd_protos(JEMALLOC_ATTR(unused), 
tcache, tcache_t *) -malloc_tsd_protos(JEMALLOC_ATTR(unused), tcache_enabled, tcache_enabled_t) - void tcache_event(tcache_t *tcache); void tcache_flush(void); bool tcache_enabled_get(void); -tcache_t *tcache_get(bool create); +tcache_t *tcache_get(tsd_t *tsd, bool create); void tcache_enabled_set(bool enabled); void *tcache_alloc_easy(tcache_bin_t *tbin); void *tcache_alloc_small(tcache_t *tcache, size_t size, bool zero); @@ -139,41 +135,33 @@ void tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_)) -/* Map of thread-specific caches. */ -malloc_tsd_externs(tcache, tcache_t *) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, tcache, tcache_t *, NULL, - tcache_thread_cleanup) -/* Per thread flag that allows thread caches to be disabled. */ -malloc_tsd_externs(tcache_enabled, tcache_enabled_t) -malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, tcache_enabled, tcache_enabled_t, - tcache_enabled_default, malloc_tsd_no_cleanup) - JEMALLOC_INLINE void tcache_flush(void) { - tcache_t *tcache; + tsd_t *tsd; cassert(config_tcache); - tcache = *tcache_tsd_get(); - if ((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX) - return; - tcache_destroy(tcache); - tcache = NULL; - tcache_tsd_set(&tcache); + tsd = tsd_tryget(); + if (tsd != NULL) + tcache_cleanup(tsd); } JEMALLOC_INLINE bool tcache_enabled_get(void) { + tsd_t *tsd; tcache_enabled_t tcache_enabled; cassert(config_tcache); - tcache_enabled = *tcache_enabled_tsd_get(); + tsd = tsd_tryget(); + if (tsd == NULL) + return (false); + tcache_enabled = tsd_tcache_enabled_get(tsd); if (tcache_enabled == tcache_enabled_default) { tcache_enabled = (tcache_enabled_t)opt_tcache; - tcache_enabled_tsd_set(&tcache_enabled); + tsd_tcache_enabled_set(tsd, tcache_enabled); } return ((bool)tcache_enabled); @@ -182,33 +170,24 @@ tcache_enabled_get(void) JEMALLOC_INLINE void tcache_enabled_set(bool enabled) { + tsd_t *tsd; tcache_enabled_t tcache_enabled; - tcache_t *tcache; cassert(config_tcache); + tsd = tsd_tryget(); + if (tsd == NULL) + return; + tcache_enabled = (tcache_enabled_t)enabled; - tcache_enabled_tsd_set(&tcache_enabled); - tcache = *tcache_tsd_get(); - if (enabled) { - if (tcache == TCACHE_STATE_DISABLED) { - tcache = NULL; - tcache_tsd_set(&tcache); - } - } else /* disabled */ { - if (tcache > TCACHE_STATE_MAX) { - tcache_destroy(tcache); - tcache = NULL; - } - if (tcache == NULL) { - tcache = TCACHE_STATE_DISABLED; - tcache_tsd_set(&tcache); - } - } + tsd_tcache_enabled_set(tsd, tcache_enabled); + + if (!enabled) + tcache_cleanup(tsd); } JEMALLOC_ALWAYS_INLINE tcache_t * -tcache_get(bool create) +tcache_get(tsd_t *tsd, bool create) { tcache_t *tcache; @@ -216,12 +195,19 @@ tcache_get(bool create) return (NULL); if (config_lazy_lock && isthreaded == false) return (NULL); + /* + * If create is true, the caller has already assured that tsd is + * non-NULL. 
+ */ + if (!create && unlikely(tsd == NULL)) + return (NULL); - tcache = *tcache_tsd_get(); - if (unlikely((uintptr_t)tcache <= (uintptr_t)TCACHE_STATE_MAX)) { - if (tcache == TCACHE_STATE_DISABLED) - return (NULL); - tcache = tcache_get_hard(tcache, create); + tcache = tsd_tcache_get(tsd); + if (!create) + return (tcache); + if (unlikely(tcache == NULL)) { + tcache = tcache_get_hard(tsd); + tsd_tcache_set(tsd, tcache); } return (tcache); diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 9fb4a23..44952ee 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -12,6 +12,15 @@ typedef struct tsd_init_block_s tsd_init_block_t; typedef struct tsd_init_head_s tsd_init_head_t; #endif +typedef struct tsd_s tsd_t; + +typedef enum { + tsd_state_uninitialized, + tsd_state_nominal, + tsd_state_purgatory, + tsd_state_reincarnated +} tsd_state_t; + /* * TLS/TSD-agnostic macro-based implementation of thread-specific data. There * are four macros that support (at least) three use cases: file-private, @@ -24,11 +33,11 @@ typedef struct tsd_init_head_s tsd_init_head_t; * int y; * } example_t; * #define EX_INITIALIZER JEMALLOC_CONCAT({0, 0}) - * malloc_tsd_protos(, example, example_t *) - * malloc_tsd_externs(example, example_t *) + * malloc_tsd_protos(, example_, example_t *) + * malloc_tsd_externs(example_, example_t *) * In example.c: - * malloc_tsd_data(, example, example_t *, EX_INITIALIZER) - * malloc_tsd_funcs(, example, example_t *, EX_INITIALIZER, + * malloc_tsd_data(, example_, example_t *, EX_INITIALIZER) + * malloc_tsd_funcs(, example_, example_t *, EX_INITIALIZER, * example_tsd_cleanup) * * The result is a set of generated functions, e.g.: @@ -43,15 +52,13 @@ typedef struct tsd_init_head_s tsd_init_head_t; * cast to (void *). This means that the cleanup function needs to cast *and* * dereference the function argument, e.g.: * - * void + * bool * example_tsd_cleanup(void *arg) * { * example_t *example = *(example_t **)arg; * * [...] - * if ([want the cleanup function to be called again]) { - * example_tsd_set(&example); - * } + * return ([want the cleanup function to be called again]); * } * * If example_tsd_set() is called within example_tsd_cleanup(), it will be @@ -63,60 +70,60 @@ typedef struct tsd_init_head_s tsd_init_head_t; /* malloc_tsd_protos(). */ #define malloc_tsd_protos(a_attr, a_name, a_type) \ a_attr bool \ -a_name##_tsd_boot(void); \ +a_name##tsd_boot(void); \ a_attr a_type * \ -a_name##_tsd_get(void); \ +a_name##tsd_get(void); \ a_attr void \ -a_name##_tsd_set(a_type *val); +a_name##tsd_set(a_type *val); /* malloc_tsd_externs(). 
*/ #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP #define malloc_tsd_externs(a_name, a_type) \ -extern __thread a_type a_name##_tls; \ -extern __thread bool a_name##_initialized; \ -extern bool a_name##_booted; +extern __thread a_type a_name##tsd_tls; \ +extern __thread bool a_name##tsd_initialized; \ +extern bool a_name##tsd_booted; #elif (defined(JEMALLOC_TLS)) #define malloc_tsd_externs(a_name, a_type) \ -extern __thread a_type a_name##_tls; \ -extern pthread_key_t a_name##_tsd; \ -extern bool a_name##_booted; +extern __thread a_type a_name##tsd_tls; \ +extern pthread_key_t a_name##tsd_tsd; \ +extern bool a_name##tsd_booted; #elif (defined(_WIN32)) #define malloc_tsd_externs(a_name, a_type) \ -extern DWORD a_name##_tsd; \ -extern bool a_name##_booted; +extern DWORD a_name##tsd_tsd; \ +extern bool a_name##tsd_booted; #else #define malloc_tsd_externs(a_name, a_type) \ -extern pthread_key_t a_name##_tsd; \ -extern tsd_init_head_t a_name##_tsd_init_head; \ -extern bool a_name##_booted; +extern pthread_key_t a_name##tsd_tsd; \ +extern tsd_init_head_t a_name##tsd_init_head; \ +extern bool a_name##tsd_booted; #endif /* malloc_tsd_data(). */ #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ a_attr __thread a_type JEMALLOC_TLS_MODEL \ - a_name##_tls = a_initializer; \ + a_name##tsd_tls = a_initializer; \ a_attr __thread bool JEMALLOC_TLS_MODEL \ - a_name##_initialized = false; \ -a_attr bool a_name##_booted = false; + a_name##tsd_initialized = false; \ +a_attr bool a_name##tsd_booted = false; #elif (defined(JEMALLOC_TLS)) #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ a_attr __thread a_type JEMALLOC_TLS_MODEL \ - a_name##_tls = a_initializer; \ -a_attr pthread_key_t a_name##_tsd; \ -a_attr bool a_name##_booted = false; + a_name##tsd_tls = a_initializer; \ +a_attr pthread_key_t a_name##tsd_tsd; \ +a_attr bool a_name##tsd_booted = false; #elif (defined(_WIN32)) #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ -a_attr DWORD a_name##_tsd; \ -a_attr bool a_name##_booted = false; +a_attr DWORD a_name##tsd_tsd; \ +a_attr bool a_name##tsd_booted = false; #else #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ -a_attr pthread_key_t a_name##_tsd; \ -a_attr tsd_init_head_t a_name##_tsd_init_head = { \ +a_attr pthread_key_t a_name##tsd_tsd; \ +a_attr tsd_init_head_t a_name##tsd_init_head = { \ ql_head_initializer(blocks), \ MALLOC_MUTEX_INITIALIZER \ }; \ -a_attr bool a_name##_booted = false; +a_attr bool a_name##tsd_booted = false; #endif /* malloc_tsd_funcs(). */ @@ -125,75 +132,76 @@ a_attr bool a_name##_booted = false; a_cleanup) \ /* Initialization/cleanup. */ \ a_attr bool \ -a_name##_tsd_cleanup_wrapper(void) \ +a_name##tsd_cleanup_wrapper(void) \ { \ \ - if (a_name##_initialized) { \ - a_name##_initialized = false; \ - a_cleanup(&a_name##_tls); \ + if (a_name##tsd_initialized) { \ + a_name##tsd_initialized = false; \ + a_cleanup(&a_name##tsd_tls); \ } \ - return (a_name##_initialized); \ + return (a_name##tsd_initialized); \ } \ a_attr bool \ -a_name##_tsd_boot(void) \ +a_name##tsd_boot(void) \ { \ \ if (a_cleanup != malloc_tsd_no_cleanup) { \ malloc_tsd_cleanup_register( \ - &a_name##_tsd_cleanup_wrapper); \ + &a_name##tsd_cleanup_wrapper); \ } \ - a_name##_booted = true; \ + a_name##tsd_booted = true; \ return (false); \ } \ /* Get/set. 
*/ \ a_attr a_type * \ -a_name##_tsd_get(void) \ +a_name##tsd_get(void) \ { \ \ - assert(a_name##_booted); \ - return (&a_name##_tls); \ + assert(a_name##tsd_booted); \ + return (&a_name##tsd_tls); \ } \ a_attr void \ -a_name##_tsd_set(a_type *val) \ +a_name##tsd_set(a_type *val) \ { \ \ - assert(a_name##_booted); \ - a_name##_tls = (*val); \ + assert(a_name##tsd_booted); \ + a_name##tsd_tls = (*val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ - a_name##_initialized = true; \ + a_name##tsd_initialized = true; \ } #elif (defined(JEMALLOC_TLS)) #define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ a_cleanup) \ /* Initialization/cleanup. */ \ a_attr bool \ -a_name##_tsd_boot(void) \ +a_name##tsd_boot(void) \ { \ \ if (a_cleanup != malloc_tsd_no_cleanup) { \ - if (pthread_key_create(&a_name##_tsd, a_cleanup) != 0) \ + if (pthread_key_create(&a_name##tsd_tsd, a_cleanup) != \ + 0) \ return (true); \ } \ - a_name##_booted = true; \ + a_name##tsd_booted = true; \ return (false); \ } \ /* Get/set. */ \ a_attr a_type * \ -a_name##_tsd_get(void) \ +a_name##tsd_get(void) \ { \ \ - assert(a_name##_booted); \ - return (&a_name##_tls); \ + assert(a_name##tsd_booted); \ + return (&a_name##tsd_tls); \ } \ a_attr void \ -a_name##_tsd_set(a_type *val) \ +a_name##tsd_set(a_type *val) \ { \ \ - assert(a_name##_booted); \ - a_name##_tls = (*val); \ + assert(a_name##tsd_booted); \ + a_name##tsd_tls = (*val); \ if (a_cleanup != malloc_tsd_no_cleanup) { \ - if (pthread_setspecific(a_name##_tsd, \ - (void *)(&a_name##_tls))) { \ + if (pthread_setspecific(a_name##tsd_tsd, \ + (void *)(&a_name##tsd_tls))) { \ malloc_write(": Error" \ " setting TSD for "#a_name"\n"); \ if (opt_abort) \ @@ -208,23 +216,20 @@ a_name##_tsd_set(a_type *val) \ typedef struct { \ bool initialized; \ a_type val; \ -} a_name##_tsd_wrapper_t; \ +} a_name##tsd_wrapper_t; \ /* Initialization/cleanup. */ \ a_attr bool \ -a_name##_tsd_cleanup_wrapper(void) \ +a_name##tsd_cleanup_wrapper(void) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - wrapper = (a_name##_tsd_wrapper_t *) TlsGetValue(a_name##_tsd); \ + wrapper = (a_name##tsd_wrapper_t *)TlsGetValue(a_name##tsd_tsd);\ if (wrapper == NULL) \ return (false); \ if (a_cleanup != malloc_tsd_no_cleanup && \ wrapper->initialized) { \ - a_type val = wrapper->val; \ - a_type tsd_static_data = a_initializer; \ wrapper->initialized = false; \ - wrapper->val = tsd_static_data; \ - a_cleanup(&val); \ + a_cleanup(&wrapper->val); \ if (wrapper->initialized) { \ /* Trigger another cleanup round. */ \ return (true); \ @@ -234,39 +239,38 @@ a_name##_tsd_cleanup_wrapper(void) \ return (false); \ } \ a_attr bool \ -a_name##_tsd_boot(void) \ +a_name##tsd_boot(void) \ { \ \ - a_name##_tsd = TlsAlloc(); \ - if (a_name##_tsd == TLS_OUT_OF_INDEXES) \ + a_name##tsd_tsd = TlsAlloc(); \ + if (a_name##tsd_tsd == TLS_OUT_OF_INDEXES) \ return (true); \ if (a_cleanup != malloc_tsd_no_cleanup) { \ malloc_tsd_cleanup_register( \ - &a_name##_tsd_cleanup_wrapper); \ + &a_name##tsd_cleanup_wrapper); \ } \ - a_name##_booted = true; \ + a_name##tsd_booted = true; \ return (false); \ } \ /* Get/set. 
*/ \ -a_attr a_name##_tsd_wrapper_t * \ -a_name##_tsd_get_wrapper(void) \ +a_attr a_name##tsd_wrapper_t * \ +a_name##tsd_get_wrapper(void) \ { \ - a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *) \ - TlsGetValue(a_name##_tsd); \ + a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ + TlsGetValue(a_name##tsd_tsd); \ \ - if (wrapper == NULL) { \ - wrapper = (a_name##_tsd_wrapper_t *) \ - malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t)); \ + if (unlikely(wrapper == NULL)) { \ + wrapper = (a_name##tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ if (wrapper == NULL) { \ malloc_write(": Error allocating" \ " TSD for "#a_name"\n"); \ abort(); \ } else { \ - static a_type tsd_static_data = a_initializer; \ wrapper->initialized = false; \ - wrapper->val = tsd_static_data; \ + wrapper->val = a_initializer; \ } \ - if (!TlsSetValue(a_name##_tsd, (void *)wrapper)) { \ + if (!TlsSetValue(a_name##tsd_tsd, (void *)wrapper)) { \ malloc_write(": Error setting" \ " TSD for "#a_name"\n"); \ abort(); \ @@ -275,21 +279,21 @@ a_name##_tsd_get_wrapper(void) \ return (wrapper); \ } \ a_attr a_type * \ -a_name##_tsd_get(void) \ +a_name##tsd_get(void) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - assert(a_name##_booted); \ - wrapper = a_name##_tsd_get_wrapper(); \ + assert(a_name##tsd_booted); \ + wrapper = a_name##tsd_get_wrapper(); \ return (&wrapper->val); \ } \ a_attr void \ -a_name##_tsd_set(a_type *val) \ +a_name##tsd_set(a_type *val) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - assert(a_name##_booted); \ - wrapper = a_name##_tsd_get_wrapper(); \ + assert(a_name##tsd_booted); \ + wrapper = a_name##tsd_get_wrapper(); \ wrapper->val = *(val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ wrapper->initialized = true; \ @@ -301,12 +305,12 @@ a_name##_tsd_set(a_type *val) \ typedef struct { \ bool initialized; \ a_type val; \ -} a_name##_tsd_wrapper_t; \ +} a_name##tsd_wrapper_t; \ /* Initialization/cleanup. */ \ a_attr void \ -a_name##_tsd_cleanup_wrapper(void *arg) \ +a_name##tsd_cleanup_wrapper(void *arg) \ { \ - a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *)arg;\ + a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *)arg; \ \ if (a_cleanup != malloc_tsd_no_cleanup && \ wrapper->initialized) { \ @@ -314,7 +318,7 @@ a_name##_tsd_cleanup_wrapper(void *arg) \ a_cleanup(&wrapper->val); \ if (wrapper->initialized) { \ /* Trigger another cleanup round. */ \ - if (pthread_setspecific(a_name##_tsd, \ + if (pthread_setspecific(a_name##tsd_tsd, \ (void *)wrapper)) { \ malloc_write(": Error" \ " setting TSD for "#a_name"\n"); \ @@ -327,66 +331,65 @@ a_name##_tsd_cleanup_wrapper(void *arg) \ malloc_tsd_dalloc(wrapper); \ } \ a_attr bool \ -a_name##_tsd_boot(void) \ +a_name##tsd_boot(void) \ { \ \ - if (pthread_key_create(&a_name##_tsd, \ - a_name##_tsd_cleanup_wrapper) != 0) \ + if (pthread_key_create(&a_name##tsd_tsd, \ + a_name##tsd_cleanup_wrapper) != 0) \ return (true); \ - a_name##_booted = true; \ + a_name##tsd_booted = true; \ return (false); \ } \ /* Get/set. 
*/ \ -a_attr a_name##_tsd_wrapper_t * \ -a_name##_tsd_get_wrapper(void) \ +a_attr a_name##tsd_wrapper_t * \ +a_name##tsd_get_wrapper(void) \ { \ - a_name##_tsd_wrapper_t *wrapper = (a_name##_tsd_wrapper_t *) \ - pthread_getspecific(a_name##_tsd); \ + a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ + pthread_getspecific(a_name##tsd_tsd); \ \ - if (wrapper == NULL) { \ + if (unlikely(wrapper == NULL)) { \ tsd_init_block_t block; \ wrapper = tsd_init_check_recursion( \ - &a_name##_tsd_init_head, &block); \ + &a_name##tsd_init_head, &block); \ if (wrapper) \ return (wrapper); \ - wrapper = (a_name##_tsd_wrapper_t *) \ - malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t)); \ + wrapper = (a_name##tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ block.data = wrapper; \ if (wrapper == NULL) { \ malloc_write(": Error allocating" \ " TSD for "#a_name"\n"); \ abort(); \ } else { \ - static a_type tsd_static_data = a_initializer; \ wrapper->initialized = false; \ - wrapper->val = tsd_static_data; \ + wrapper->val = a_initializer; \ } \ - if (pthread_setspecific(a_name##_tsd, \ + if (pthread_setspecific(a_name##tsd_tsd, \ (void *)wrapper)) { \ malloc_write(": Error setting" \ " TSD for "#a_name"\n"); \ abort(); \ } \ - tsd_init_finish(&a_name##_tsd_init_head, &block); \ + tsd_init_finish(&a_name##tsd_init_head, &block); \ } \ return (wrapper); \ } \ a_attr a_type * \ -a_name##_tsd_get(void) \ +a_name##tsd_get(void) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - assert(a_name##_booted); \ - wrapper = a_name##_tsd_get_wrapper(); \ + assert(a_name##tsd_booted); \ + wrapper = a_name##tsd_get_wrapper(); \ return (&wrapper->val); \ } \ a_attr void \ -a_name##_tsd_set(a_type *val) \ +a_name##tsd_set(a_type *val) \ { \ - a_name##_tsd_wrapper_t *wrapper; \ + a_name##tsd_wrapper_t *wrapper; \ \ - assert(a_name##_booted); \ - wrapper = a_name##_tsd_get_wrapper(); \ + assert(a_name##tsd_booted); \ + wrapper = a_name##tsd_get_wrapper(); \ wrapper->val = *(val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ wrapper->initialized = true; \ @@ -410,25 +413,123 @@ struct tsd_init_head_s { }; #endif +#define MALLOC_TSD \ +/* O(name, type) */ \ + O(tcache, tcache_t *) \ + O(thread_allocated, uint64_t) \ + O(thread_deallocated, uint64_t) \ + O(prof_tdata, prof_tdata_t *) \ + O(arena, arena_t *) \ + O(tcache_enabled, tcache_enabled_t) \ + O(quarantine, quarantine_t *) \ + +#define TSD_INITIALIZER { \ + tsd_state_uninitialized, \ + NULL, \ + 0, \ + 0, \ + NULL, \ + NULL, \ + tcache_enabled_default, \ + NULL \ +} + +struct tsd_s { + tsd_state_t state; +#define O(n, t) \ + t n; +MALLOC_TSD +#undef O +}; + +static const tsd_t tsd_initializer = TSD_INITIALIZER; + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS void *malloc_tsd_malloc(size_t size); void malloc_tsd_dalloc(void *wrapper); -void malloc_tsd_no_cleanup(void *); +void malloc_tsd_no_cleanup(void *arg); void malloc_tsd_cleanup_register(bool (*f)(void)); -void malloc_tsd_boot(void); +bool malloc_tsd_boot(void); #if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \ !defined(_WIN32)) void *tsd_init_check_recursion(tsd_init_head_t *head, tsd_init_block_t *block); void tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block); #endif +void tsd_cleanup(void *arg); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef 
JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +malloc_tsd_protos(JEMALLOC_ATTR(unused), , tsd_t) + +tsd_t *tsd_tryget(void); +#define O(n, t) \ +t *tsd_##n##p_get(tsd_t *tsd); \ +t tsd_##n##_get(tsd_t *tsd); \ +void tsd_##n##_set(tsd_t *tsd, t n); +MALLOC_TSD +#undef O +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TSD_C_)) +malloc_tsd_externs(, tsd_t) +malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, , tsd_t, tsd_initializer, tsd_cleanup) + +JEMALLOC_INLINE tsd_t * +tsd_tryget(void) +{ + tsd_t *tsd; + + tsd = tsd_get(); + if (unlikely(tsd == NULL)) + return (NULL); + + if (likely(tsd->state == tsd_state_nominal)) + return (tsd); + else if (tsd->state == tsd_state_uninitialized) { + tsd->state = tsd_state_nominal; + tsd_set(tsd); + return (tsd); + } else if (tsd->state == tsd_state_purgatory) { + tsd->state = tsd_state_reincarnated; + tsd_set(tsd); + return (NULL); + } else { + assert(tsd->state == tsd_state_reincarnated); + return (NULL); + } +} + +#define O(n, t) \ +JEMALLOC_INLINE t * \ +tsd_##n##p_get(tsd_t *tsd) \ +{ \ + \ + return (&tsd->n); \ +} \ + \ +JEMALLOC_INLINE t \ +tsd_##n##_get(tsd_t *tsd) \ +{ \ + \ + return (*tsd_##n##p_get(tsd)); \ +} \ + \ +JEMALLOC_INLINE void \ +tsd_##n##_set(tsd_t *tsd, t n) \ +{ \ + \ + tsd->n = n; \ +} +MALLOC_TSD +#undef O +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ diff --git a/src/arena.c b/src/arena.c index 35d792a..40da9f4 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2058,7 +2058,7 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, } void * -arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, +arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc) { @@ -2078,9 +2078,12 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t usize = sa2u(size + extra, alignment); if (usize == 0) return (NULL); - ret = ipalloct(usize, alignment, zero, try_tcache_alloc, arena); - } else - ret = arena_malloc(arena, size + extra, zero, try_tcache_alloc); + ret = ipalloct(tsd, usize, alignment, zero, try_tcache_alloc, + arena); + } else { + ret = arena_malloc(tsd, arena, size + extra, zero, + try_tcache_alloc); + } if (ret == NULL) { if (extra == 0) @@ -2090,10 +2093,12 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t usize = sa2u(size, alignment); if (usize == 0) return (NULL); - ret = ipalloct(usize, alignment, zero, try_tcache_alloc, - arena); - } else - ret = arena_malloc(arena, size, zero, try_tcache_alloc); + ret = ipalloct(tsd, usize, alignment, zero, + try_tcache_alloc, arena); + } else { + ret = arena_malloc(tsd, arena, size, zero, + try_tcache_alloc); + } if (ret == NULL) return (NULL); @@ -2108,7 +2113,7 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, copysize = (size < oldsize) ? size : oldsize; JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize); memcpy(ret, ptr, copysize); - iqalloc(ptr, try_tcache_dalloc); + iqalloc(tsd, ptr, try_tcache_dalloc); return (ret); } diff --git a/src/ckh.c b/src/ckh.c index 04c5296..7c7cc09 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -40,8 +40,8 @@ /******************************************************************************/ /* Function prototypes for non-inline static functions. 
*/ -static bool ckh_grow(ckh_t *ckh); -static void ckh_shrink(ckh_t *ckh); +static bool ckh_grow(tsd_t *tsd, ckh_t *ckh); +static void ckh_shrink(tsd_t *tsd, ckh_t *ckh); /******************************************************************************/ @@ -243,7 +243,7 @@ ckh_rebuild(ckh_t *ckh, ckhc_t *aTab) } static bool -ckh_grow(ckh_t *ckh) +ckh_grow(tsd_t *tsd, ckh_t *ckh) { bool ret; ckhc_t *tab, *ttab; @@ -270,7 +270,7 @@ ckh_grow(ckh_t *ckh) ret = true; goto label_return; } - tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); + tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true); if (tab == NULL) { ret = true; goto label_return; @@ -282,12 +282,12 @@ ckh_grow(ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (ckh_rebuild(ckh, tab) == false) { - idalloc(tab); + idalloc(tsd, tab); break; } /* Rebuilding failed, so back out partially rebuilt table. */ - idalloc(ckh->tab); + idalloc(tsd, ckh->tab); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; } @@ -298,7 +298,7 @@ label_return: } static void -ckh_shrink(ckh_t *ckh) +ckh_shrink(tsd_t *tsd, ckh_t *ckh) { ckhc_t *tab, *ttab; size_t lg_curcells, usize; @@ -313,7 +313,7 @@ ckh_shrink(ckh_t *ckh) usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); if (usize == 0) return; - tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); + tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -328,7 +328,7 @@ ckh_shrink(ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (ckh_rebuild(ckh, tab) == false) { - idalloc(tab); + idalloc(tsd, tab); #ifdef CKH_COUNT ckh->nshrinks++; #endif @@ -336,7 +336,7 @@ ckh_shrink(ckh_t *ckh) } /* Rebuilding failed, so back out partially rebuilt table. */ - idalloc(ckh->tab); + idalloc(tsd, ckh->tab); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; #ifdef CKH_COUNT @@ -345,7 +345,8 @@ ckh_shrink(ckh_t *ckh) } bool -ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) +ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, + ckh_keycomp_t *keycomp) { bool ret; size_t mincells, usize; @@ -388,7 +389,7 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) ret = true; goto label_return; } - ckh->tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); + ckh->tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true); if (ckh->tab == NULL) { ret = true; goto label_return; @@ -400,7 +401,7 @@ label_return: } void -ckh_delete(ckh_t *ckh) +ckh_delete(tsd_t *tsd, ckh_t *ckh) { assert(ckh != NULL); @@ -417,7 +418,7 @@ ckh_delete(ckh_t *ckh) (unsigned long long)ckh->nrelocs); #endif - idalloc(ckh->tab); + idalloc(tsd, ckh->tab); if (config_debug) memset(ckh, 0x5a, sizeof(ckh_t)); } @@ -452,7 +453,7 @@ ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data) } bool -ckh_insert(ckh_t *ckh, const void *key, const void *data) +ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data) { bool ret; @@ -464,7 +465,7 @@ ckh_insert(ckh_t *ckh, const void *key, const void *data) #endif while (ckh_try_insert(ckh, &key, &data)) { - if (ckh_grow(ckh)) { + if (ckh_grow(tsd, ckh)) { ret = true; goto label_return; } @@ -476,7 +477,8 @@ label_return: } bool -ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data) +ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, + void **data) { size_t cell; @@ -497,7 +499,7 @@ ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data) + LG_CKH_BUCKET_CELLS - 2)) && 
ckh->lg_curbuckets > ckh->lg_minbuckets) { /* Ignore error due to OOM. */ - ckh_shrink(ckh); + ckh_shrink(tsd, ckh); } return (false); diff --git a/src/ctl.c b/src/ctl.c index b816c84..c55f6e4 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -565,18 +565,23 @@ ctl_arena_refresh(arena_t *arena, unsigned i) static bool ctl_grow(void) { + tsd_t *tsd; ctl_arena_stats_t *astats; arena_t **tarenas; + tsd = tsd_tryget(); + if (tsd == NULL) + return (true); + /* Allocate extended arena stats and arenas arrays. */ - astats = (ctl_arena_stats_t *)imalloc((ctl_stats.narenas + 2) * + astats = (ctl_arena_stats_t *)imalloc(tsd, (ctl_stats.narenas + 2) * sizeof(ctl_arena_stats_t)); if (astats == NULL) return (true); - tarenas = (arena_t **)imalloc((ctl_stats.narenas + 1) * + tarenas = (arena_t **)imalloc(tsd, (ctl_stats.narenas + 1) * sizeof(arena_t *)); if (tarenas == NULL) { - idalloc(astats); + idalloc(tsd, astats); return (true); } @@ -585,8 +590,8 @@ ctl_grow(void) sizeof(ctl_arena_stats_t)); memset(&astats[ctl_stats.narenas + 1], 0, sizeof(ctl_arena_stats_t)); if (ctl_arena_init(&astats[ctl_stats.narenas + 1])) { - idalloc(tarenas); - idalloc(astats); + idalloc(tsd, tarenas); + idalloc(tsd, astats); return (true); } /* Swap merged stats to their new location. */ @@ -623,7 +628,7 @@ ctl_grow(void) * base_alloc()). */ if (ctl_stats.narenas != narenas_auto) - idalloc(arenas_old); + idalloc(tsd, arenas_old); } ctl_stats.arenas = astats; ctl_stats.narenas++; @@ -1105,6 +1110,31 @@ label_return: \ return (ret); \ } +#define CTL_TSD_RO_NL_CGEN(c, n, m, t) \ +static int \ +n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, \ + void *newp, size_t newlen) \ +{ \ + int ret; \ + t oldval; \ + tsd_t *tsd; \ + \ + if ((c) == false) \ + return (ENOENT); \ + READONLY(); \ + tsd = tsd_tryget(); \ + if (tsd == NULL) { \ + ret = EAGAIN; \ + goto label_return; \ + } \ + oldval = (m(tsd)); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + return (ret); \ +} + #define CTL_RO_BOOL_CONFIG_GEN(n) \ static int \ n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, \ @@ -1194,10 +1224,15 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; + tsd_t *tsd; unsigned newind, oldind; + tsd = tsd_tryget(); + if (tsd == NULL) + return (EAGAIN); + malloc_mutex_lock(&ctl_mtx); - newind = oldind = choose_arena(NULL)->ind; + newind = oldind = choose_arena(tsd, NULL)->ind; WRITE(newind, unsigned); READ(oldind, unsigned); if (newind != oldind) { @@ -1224,14 +1259,14 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, /* Set new arena association. 
*/ if (config_tcache) { - tcache_t *tcache; - if ((uintptr_t)(tcache = *tcache_tsd_get()) > - (uintptr_t)TCACHE_STATE_MAX) { + tcache_t *tcache = tsd_tcache_get(tsd); + if (tcache != NULL) { tcache_arena_dissociate(tcache); tcache_arena_associate(tcache, arena); } } - arenas_tsd_set(&arena); + + tsd_arena_set(tsd, arena); } ret = 0; @@ -1240,14 +1275,14 @@ label_return: return (ret); } -CTL_RO_NL_CGEN(config_stats, thread_allocated, - thread_allocated_tsd_get()->allocated, uint64_t) -CTL_RO_NL_CGEN(config_stats, thread_allocatedp, - &thread_allocated_tsd_get()->allocated, uint64_t *) -CTL_RO_NL_CGEN(config_stats, thread_deallocated, - thread_allocated_tsd_get()->deallocated, uint64_t) -CTL_RO_NL_CGEN(config_stats, thread_deallocatedp, - &thread_allocated_tsd_get()->deallocated, uint64_t *) +CTL_TSD_RO_NL_CGEN(config_stats, thread_allocated, tsd_thread_allocated_get, + uint64_t) +CTL_TSD_RO_NL_CGEN(config_stats, thread_allocatedp, tsd_thread_allocatedp_get, + uint64_t *) +CTL_TSD_RO_NL_CGEN(config_stats, thread_deallocated, tsd_thread_deallocated_get, + uint64_t) +CTL_TSD_RO_NL_CGEN(config_stats, thread_deallocatedp, + tsd_thread_deallocatedp_get, uint64_t *) static int thread_tcache_enabled_ctl(const size_t *mib, size_t miblen, void *oldp, @@ -1305,11 +1340,20 @@ thread_prof_name_ctl(const size_t *mib, size_t miblen, void *oldp, oldname = prof_thread_name_get(); if (newp != NULL) { + tsd_t *tsd; + if (newlen != sizeof(const char *)) { ret = EINVAL; goto label_return; } - if (prof_thread_name_set(*(const char **)newp)) { + + tsd = tsd_tryget(); + if (tsd == NULL) { + ret = EAGAIN; + goto label_return; + } + + if (prof_thread_name_set(tsd, *(const char **)newp)) { ret = EAGAIN; goto label_return; } @@ -1675,6 +1719,7 @@ prof_reset_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, { int ret; size_t lg_sample = lg_prof_sample; + tsd_t *tsd; if (config_prof == false) return (ENOENT); @@ -1684,7 +1729,13 @@ prof_reset_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, if (lg_sample >= (sizeof(uint64_t) << 3)) lg_sample = (sizeof(uint64_t) << 3) - 1; - prof_reset(lg_sample); + tsd = tsd_tryget(); + if (tsd == NULL) { + ret = EAGAIN; + goto label_return; + } + + prof_reset(tsd, lg_sample); ret = 0; label_return: diff --git a/src/huge.c b/src/huge.c index 0b7db7f..2e30ccf 100644 --- a/src/huge.c +++ b/src/huge.c @@ -13,14 +13,15 @@ static malloc_mutex_t huge_mtx; static extent_tree_t huge; void * -huge_malloc(arena_t *arena, size_t size, bool zero) +huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero) { - return (huge_palloc(arena, size, chunksize, zero)); + return (huge_palloc(tsd, arena, size, chunksize, zero)); } void * -huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) +huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, + bool zero) { void *ret; size_t csize; @@ -45,7 +46,7 @@ huge_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) * it is possible to make correct junk/zero fill decisions below. 
*/ is_zeroed = zero; - arena = choose_arena(arena); + arena = choose_arena(tsd, arena); ret = arena_chunk_alloc_huge(arena, csize, alignment, &is_zeroed); if (ret == NULL) { base_node_dalloc(node); @@ -90,7 +91,7 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) } void * -huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, +huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc) { void *ret; @@ -106,18 +107,18 @@ huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, * space and copying. */ if (alignment > chunksize) - ret = huge_palloc(arena, size + extra, alignment, zero); + ret = huge_palloc(tsd, arena, size + extra, alignment, zero); else - ret = huge_malloc(arena, size + extra, zero); + ret = huge_malloc(tsd, arena, size + extra, zero); if (ret == NULL) { if (extra == 0) return (NULL); /* Try again, this time without extra. */ if (alignment > chunksize) - ret = huge_palloc(arena, size, alignment, zero); + ret = huge_palloc(tsd, arena, size, alignment, zero); else - ret = huge_malloc(arena, size, zero); + ret = huge_malloc(tsd, arena, size, zero); if (ret == NULL) return (NULL); @@ -129,7 +130,7 @@ huge_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, */ copysize = (size < oldsize) ? size : oldsize; memcpy(ret, ptr, copysize); - iqalloc(ptr, try_tcache_dalloc); + iqalloc(tsd, ptr, try_tcache_dalloc); return (ret); } diff --git a/src/jemalloc.c b/src/jemalloc.c index c5b8f52..4d3b22e 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -5,8 +5,6 @@ /* Data. */ malloc_tsd_data(, arenas, arena_t *, NULL) -malloc_tsd_data(, thread_allocated, thread_allocated_t, - THREAD_ALLOCATED_INITIALIZER) /* Runtime configuration options. */ const char *je_malloc_conf; @@ -142,7 +140,7 @@ arenas_extend(unsigned ind) /* Slow path, called only by choose_arena(). */ arena_t * -choose_arena_hard(void) +choose_arena_hard(tsd_t *tsd) { arena_t *ret; @@ -196,11 +194,32 @@ choose_arena_hard(void) malloc_mutex_unlock(&arenas_lock); } - arenas_tsd_set(&ret); + tsd_arena_set(tsd, ret); return (ret); } +void +thread_allocated_cleanup(tsd_t *tsd) +{ + + /* Do nothing. */ +} + +void +thread_deallocated_cleanup(tsd_t *tsd) +{ + + /* Do nothing. */ +} + +void +arena_cleanup(tsd_t *tsd) +{ + + /* Do nothing. */ +} + static void stats_print_atexit(void) { @@ -691,7 +710,11 @@ malloc_init_hard(void) #endif malloc_initializer = INITIALIZER; - malloc_tsd_boot(); + if (malloc_tsd_boot()) { + malloc_mutex_unlock(&init_lock); + return (true); + } + if (config_prof) prof_boot0(); @@ -726,7 +749,7 @@ malloc_init_hard(void) arena_boot(); - if (config_tcache && tcache_boot0()) { + if (config_tcache && tcache_boot()) { malloc_mutex_unlock(&init_lock); return (true); } @@ -759,27 +782,6 @@ malloc_init_hard(void) return (true); } - /* Initialize allocation counters before any allocations can occur. 
*/ - if (config_stats && thread_allocated_tsd_boot()) { - malloc_mutex_unlock(&init_lock); - return (true); - } - - if (arenas_tsd_boot()) { - malloc_mutex_unlock(&init_lock); - return (true); - } - - if (config_tcache && tcache_boot1()) { - malloc_mutex_unlock(&init_lock); - return (true); - } - - if (config_fill && quarantine_boot()) { - malloc_mutex_unlock(&init_lock); - return (true); - } - if (config_prof && prof_boot2()) { malloc_mutex_unlock(&init_lock); return (true); @@ -863,36 +865,36 @@ malloc_init_hard(void) */ static void * -imalloc_prof_sample(size_t usize, prof_tctx_t *tctx) +imalloc_prof_sample(tsd_t *tsd, size_t usize, prof_tctx_t *tctx) { void *p; if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { - p = imalloc(LARGE_MINCLASS); + p = imalloc(tsd, LARGE_MINCLASS); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else - p = imalloc(usize); + p = imalloc(tsd, usize); return (p); } JEMALLOC_ALWAYS_INLINE_C void * -imalloc_prof(size_t usize) +imalloc_prof(tsd_t *tsd, size_t usize) { void *p; prof_tctx_t *tctx; - tctx = prof_alloc_prep(usize, true); + tctx = prof_alloc_prep(tsd, usize, true); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) - p = imalloc_prof_sample(usize, tctx); + p = imalloc_prof_sample(tsd, usize, tctx); else - p = imalloc(usize); + p = imalloc(tsd, usize); if (p == NULL) { - prof_alloc_rollback(tctx, true); + prof_alloc_rollback(tsd, tctx, true); return (NULL); } prof_malloc(p, usize, tctx); @@ -901,32 +903,33 @@ imalloc_prof(size_t usize) } JEMALLOC_ALWAYS_INLINE_C void * -imalloc_body(size_t size, size_t *usize) +imalloc_body(size_t size, tsd_t **tsd, size_t *usize) { - if (unlikely(malloc_init())) + if (unlikely(malloc_init()) || unlikely((*tsd = tsd_tryget()) == NULL)) return (NULL); if (config_prof && opt_prof) { *usize = s2u(size); - return (imalloc_prof(*usize)); + return (imalloc_prof(*tsd, *usize)); } if (config_stats || (config_valgrind && unlikely(in_valgrind))) *usize = s2u(size); - return (imalloc(size)); + return (imalloc(*tsd, size)); } void * je_malloc(size_t size) { void *ret; + tsd_t *tsd; size_t usize JEMALLOC_CC_SILENCE_INIT(0); if (size == 0) size = 1; - ret = imalloc_body(size, &usize); + ret = imalloc_body(size, &tsd, &usize); if (unlikely(ret == NULL)) { if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error in malloc(): " @@ -937,7 +940,7 @@ je_malloc(size_t size) } if (config_stats && likely(ret != NULL)) { assert(usize == isalloc(ret, config_prof)); - thread_allocated_tsd_get()->allocated += usize; + *tsd_thread_allocatedp_get(tsd) += usize; } UTRACE(0, size, ret); JEMALLOC_VALGRIND_MALLOC(ret != NULL, ret, usize, false); @@ -945,7 +948,8 @@ je_malloc(size_t size) } static void * -imemalign_prof_sample(size_t alignment, size_t usize, prof_tctx_t *tctx) +imemalign_prof_sample(tsd_t *tsd, size_t alignment, size_t usize, + prof_tctx_t *tctx) { void *p; @@ -953,29 +957,29 @@ imemalign_prof_sample(size_t alignment, size_t usize, prof_tctx_t *tctx) return (NULL); if (usize <= SMALL_MAXCLASS) { assert(sa2u(LARGE_MINCLASS, alignment) == LARGE_MINCLASS); - p = imalloc(LARGE_MINCLASS); + p = imalloc(tsd, LARGE_MINCLASS); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else - p = ipalloc(usize, alignment, false); + p = ipalloc(tsd, usize, alignment, false); return (p); } JEMALLOC_ALWAYS_INLINE_C void * -imemalign_prof(size_t alignment, size_t usize) +imemalign_prof(tsd_t *tsd, size_t alignment, size_t usize) { void *p; prof_tctx_t *tctx; - tctx = prof_alloc_prep(usize, true); + tctx = 
prof_alloc_prep(tsd, usize, true); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) - p = imemalign_prof_sample(alignment, usize, tctx); + p = imemalign_prof_sample(tsd, alignment, usize, tctx); else - p = ipalloc(usize, alignment, false); + p = ipalloc(tsd, usize, alignment, false); if (p == NULL) { - prof_alloc_rollback(tctx, true); + prof_alloc_rollback(tsd, tctx, true); return (NULL); } prof_malloc(p, usize, tctx); @@ -988,12 +992,13 @@ static int imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) { int ret; + tsd_t *tsd; size_t usize; void *result; assert(min_alignment != 0); - if (unlikely(malloc_init())) { + if (unlikely(malloc_init()) || unlikely((tsd = tsd_tryget()) == NULL)) { result = NULL; goto label_oom; } else { @@ -1020,9 +1025,9 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) } if (config_prof && opt_prof) - result = imemalign_prof(alignment, usize); + result = imemalign_prof(tsd, alignment, usize); else - result = ipalloc(usize, alignment, false); + result = ipalloc(tsd, usize, alignment, false); if (unlikely(result == NULL)) goto label_oom; } @@ -1032,7 +1037,7 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) label_return: if (config_stats && likely(result != NULL)) { assert(usize == isalloc(result, config_prof)); - thread_allocated_tsd_get()->allocated += usize; + *tsd_thread_allocatedp_get(tsd) += usize; } UTRACE(0, size, result); return (ret); @@ -1072,36 +1077,36 @@ je_aligned_alloc(size_t alignment, size_t size) } static void * -icalloc_prof_sample(size_t usize, prof_tctx_t *tctx) +icalloc_prof_sample(tsd_t *tsd, size_t usize, prof_tctx_t *tctx) { void *p; if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { - p = icalloc(LARGE_MINCLASS); + p = icalloc(tsd, LARGE_MINCLASS); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else - p = icalloc(usize); + p = icalloc(tsd, usize); return (p); } JEMALLOC_ALWAYS_INLINE_C void * -icalloc_prof(size_t usize) +icalloc_prof(tsd_t *tsd, size_t usize) { void *p; prof_tctx_t *tctx; - tctx = prof_alloc_prep(usize, true); + tctx = prof_alloc_prep(tsd, usize, true); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) - p = icalloc_prof_sample(usize, tctx); + p = icalloc_prof_sample(tsd, usize, tctx); else - p = icalloc(usize); + p = icalloc(tsd, usize); if (p == NULL) { - prof_alloc_rollback(tctx, true); + prof_alloc_rollback(tsd, tctx, true); return (NULL); } prof_malloc(p, usize, tctx); @@ -1113,10 +1118,11 @@ void * je_calloc(size_t num, size_t size) { void *ret; + tsd_t *tsd; size_t num_size; size_t usize JEMALLOC_CC_SILENCE_INIT(0); - if (unlikely(malloc_init())) { + if (unlikely(malloc_init()) || unlikely((tsd = tsd_tryget()) == NULL)) { num_size = 0; ret = NULL; goto label_return; @@ -1144,11 +1150,11 @@ je_calloc(size_t num, size_t size) if (config_prof && opt_prof) { usize = s2u(num_size); - ret = icalloc_prof(usize); + ret = icalloc_prof(tsd, usize); } else { if (config_stats || (config_valgrind && unlikely(in_valgrind))) usize = s2u(num_size); - ret = icalloc(num_size); + ret = icalloc(tsd, num_size); } label_return: @@ -1162,7 +1168,7 @@ label_return: } if (config_stats && likely(ret != NULL)) { assert(usize == isalloc(ret, config_prof)); - thread_allocated_tsd_get()->allocated += usize; + *tsd_thread_allocatedp_get(tsd) += usize; } UTRACE(0, num_size, ret); JEMALLOC_VALGRIND_MALLOC(ret != NULL, ret, usize, true); @@ -1170,44 +1176,44 @@ label_return: } static void * -irealloc_prof_sample(void *oldptr, size_t 
usize, prof_tctx_t *tctx) +irealloc_prof_sample(tsd_t *tsd, void *oldptr, size_t usize, prof_tctx_t *tctx) { void *p; if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { - p = iralloc(oldptr, LARGE_MINCLASS, 0, false); + p = iralloc(tsd, oldptr, LARGE_MINCLASS, 0, false); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else - p = iralloc(oldptr, usize, 0, false); + p = iralloc(tsd, oldptr, usize, 0, false); return (p); } JEMALLOC_ALWAYS_INLINE_C void * -irealloc_prof(void *oldptr, size_t old_usize, size_t usize) +irealloc_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t usize) { void *p; prof_tctx_t *old_tctx, *tctx; old_tctx = prof_tctx_get(oldptr); - tctx = prof_alloc_prep(usize, true); + tctx = prof_alloc_prep(tsd, usize, true); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) - p = irealloc_prof_sample(oldptr, usize, tctx); + p = irealloc_prof_sample(tsd, oldptr, usize, tctx); else - p = iralloc(oldptr, usize, 0, false); + p = iralloc(tsd, oldptr, usize, 0, false); if (p == NULL) return (NULL); - prof_realloc(p, usize, tctx, true, old_usize, old_tctx); + prof_realloc(tsd, p, usize, tctx, true, old_usize, old_tctx); return (p); } JEMALLOC_INLINE_C void -ifree(void *ptr, bool try_tcache) +ifree(tsd_t *tsd, void *ptr, bool try_tcache) { size_t usize; UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); @@ -1217,19 +1223,19 @@ ifree(void *ptr, bool try_tcache) if (config_prof && opt_prof) { usize = isalloc(ptr, config_prof); - prof_free(ptr, usize); + prof_free(tsd, ptr, usize); } else if (config_stats || config_valgrind) usize = isalloc(ptr, config_prof); - if (config_stats) - thread_allocated_tsd_get()->deallocated += usize; + if (config_stats && likely(tsd != NULL)) + *tsd_thread_deallocatedp_get(tsd) += usize; if (config_valgrind && unlikely(in_valgrind)) rzsize = p2rz(ptr); - iqalloc(ptr, try_tcache); + iqalloc(tsd, ptr, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); } JEMALLOC_INLINE_C void -isfree(void *ptr, size_t usize, bool try_tcache) +isfree(tsd_t *tsd, void *ptr, size_t usize, bool try_tcache) { UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); @@ -1237,12 +1243,12 @@ isfree(void *ptr, size_t usize, bool try_tcache) assert(malloc_initialized || IS_INITIALIZER); if (config_prof && opt_prof) - prof_free(ptr, usize); - if (config_stats) - thread_allocated_tsd_get()->deallocated += usize; + prof_free(tsd, ptr, usize); + if (config_stats && likely(tsd != NULL)) + *tsd_thread_deallocatedp_get(tsd) += usize; if (config_valgrind && unlikely(in_valgrind)) rzsize = p2rz(ptr); - isqalloc(ptr, usize, try_tcache); + isqalloc(tsd, ptr, usize, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); } @@ -1250,6 +1256,7 @@ void * je_realloc(void *ptr, size_t size) { void *ret; + tsd_t *tsd; size_t usize JEMALLOC_CC_SILENCE_INIT(0); size_t old_usize = 0; UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); @@ -1258,7 +1265,8 @@ je_realloc(void *ptr, size_t size) if (ptr != NULL) { /* realloc(ptr, 0) is equivalent to free(ptr). */ UTRACE(ptr, 0, 0); - ifree(ptr, true); + tsd = tsd_tryget(); + ifree(tsd, ptr, true); return (NULL); } size = 1; @@ -1268,24 +1276,29 @@ je_realloc(void *ptr, size_t size) assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); - if ((config_prof && opt_prof) || config_stats || - (config_valgrind && unlikely(in_valgrind))) - old_usize = isalloc(ptr, config_prof); - if (config_valgrind && unlikely(in_valgrind)) - old_rzsize = config_prof ? 
p2rz(ptr) : u2rz(old_usize); + if ((tsd = tsd_tryget()) != NULL) { + if ((config_prof && opt_prof) || config_stats || + (config_valgrind && unlikely(in_valgrind))) + old_usize = isalloc(ptr, config_prof); + if (config_valgrind && unlikely(in_valgrind)) { + old_rzsize = config_prof ? p2rz(ptr) : + u2rz(old_usize); + } - if (config_prof && opt_prof) { - usize = s2u(size); - ret = irealloc_prof(ptr, old_usize, usize); - } else { - if (config_stats || (config_valgrind && - unlikely(in_valgrind))) + if (config_prof && opt_prof) { usize = s2u(size); - ret = iralloc(ptr, size, 0, false); - } + ret = irealloc_prof(tsd, ptr, old_usize, usize); + } else { + if (config_stats || (config_valgrind && + unlikely(in_valgrind))) + usize = s2u(size); + ret = iralloc(tsd, ptr, size, 0, false); + } + } else + ret = NULL; } else { /* realloc(NULL, size) is equivalent to malloc(size). */ - ret = imalloc_body(size, &usize); + ret = imalloc_body(size, &tsd, &usize); } if (unlikely(ret == NULL)) { @@ -1297,11 +1310,11 @@ je_realloc(void *ptr, size_t size) set_errno(ENOMEM); } if (config_stats && likely(ret != NULL)) { - thread_allocated_t *ta; assert(usize == isalloc(ret, config_prof)); - ta = thread_allocated_tsd_get(); - ta->allocated += usize; - ta->deallocated += old_usize; + if (tsd != NULL) { + *tsd_thread_allocatedp_get(tsd) += usize; + *tsd_thread_deallocatedp_get(tsd) += old_usize; + } } UTRACE(ptr, size, ret); JEMALLOC_VALGRIND_REALLOC(true, ret, usize, true, ptr, old_usize, @@ -1315,7 +1328,7 @@ je_free(void *ptr) UTRACE(ptr, 0, 0); if (likely(ptr != NULL)) - ifree(ptr, true); + ifree(tsd_tryget(), ptr, true); } /* @@ -1425,50 +1438,52 @@ imallocx_flags_decode(size_t size, int flags, size_t *usize, size_t *alignment, } JEMALLOC_ALWAYS_INLINE_C void * -imallocx_flags(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena) +imallocx_flags(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + bool try_tcache, arena_t *arena) { - if (alignment != 0) - return (ipalloct(usize, alignment, zero, try_tcache, arena)); + if (alignment != 0) { + return (ipalloct(tsd, usize, alignment, zero, try_tcache, + arena)); + } if (zero) - return (icalloct(usize, try_tcache, arena)); - return (imalloct(usize, try_tcache, arena)); + return (icalloct(tsd, usize, try_tcache, arena)); + return (imalloct(tsd, usize, try_tcache, arena)); } JEMALLOC_ALWAYS_INLINE_C void * -imallocx_maybe_flags(size_t size, int flags, size_t usize, size_t alignment, - bool zero, bool try_tcache, arena_t *arena) +imallocx_maybe_flags(tsd_t *tsd, size_t size, int flags, size_t usize, + size_t alignment, bool zero, bool try_tcache, arena_t *arena) { if (likely(flags == 0)) - return (imalloc(size)); - return (imallocx_flags(usize, alignment, zero, try_tcache, arena)); + return (imalloc(tsd, size)); + return (imallocx_flags(tsd, usize, alignment, zero, try_tcache, arena)); } static void * -imallocx_prof_sample(size_t size, int flags, size_t usize, size_t alignment, - bool zero, bool try_tcache, arena_t *arena) +imallocx_prof_sample(tsd_t *tsd, size_t size, int flags, size_t usize, + size_t alignment, bool zero, bool try_tcache, arena_t *arena) { void *p; if (usize <= SMALL_MAXCLASS) { assert(((alignment == 0) ? 
s2u(LARGE_MINCLASS) : sa2u(LARGE_MINCLASS, alignment)) == LARGE_MINCLASS); - p = imalloct(LARGE_MINCLASS, try_tcache, arena); + p = imalloct(tsd, LARGE_MINCLASS, try_tcache, arena); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else { - p = imallocx_maybe_flags(size, flags, usize, alignment, zero, - try_tcache, arena); + p = imallocx_maybe_flags(tsd, size, flags, usize, alignment, + zero, try_tcache, arena); } return (p); } JEMALLOC_ALWAYS_INLINE_C void * -imallocx_prof(size_t size, int flags, size_t *usize) +imallocx_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) { void *p; size_t alignment; @@ -1479,17 +1494,17 @@ imallocx_prof(size_t size, int flags, size_t *usize) imallocx_flags_decode(size, flags, usize, &alignment, &zero, &try_tcache, &arena); - tctx = prof_alloc_prep(*usize, true); + tctx = prof_alloc_prep(tsd, *usize, true); if (likely((uintptr_t)tctx == (uintptr_t)1U)) { - p = imallocx_maybe_flags(size, flags, *usize, alignment, zero, - try_tcache, arena); + p = imallocx_maybe_flags(tsd, size, flags, *usize, alignment, + zero, try_tcache, arena); } else if ((uintptr_t)tctx > (uintptr_t)1U) { - p = imallocx_prof_sample(size, flags, *usize, alignment, zero, - try_tcache, arena); + p = imallocx_prof_sample(tsd, size, flags, *usize, alignment, + zero, try_tcache, arena); } else p = NULL; if (unlikely(p == NULL)) { - prof_alloc_rollback(tctx, true); + prof_alloc_rollback(tsd, tctx, true); return (NULL); } prof_malloc(p, *usize, tctx); @@ -1498,7 +1513,7 @@ imallocx_prof(size_t size, int flags, size_t *usize) } JEMALLOC_ALWAYS_INLINE_C void * -imallocx_no_prof(size_t size, int flags, size_t *usize) +imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) { size_t alignment; bool zero; @@ -1508,35 +1523,39 @@ imallocx_no_prof(size_t size, int flags, size_t *usize) if (likely(flags == 0)) { if (config_stats || (config_valgrind && unlikely(in_valgrind))) *usize = s2u(size); - return (imalloc(size)); + return (imalloc(tsd, size)); } imallocx_flags_decode_hard(size, flags, usize, &alignment, &zero, &try_tcache, &arena); - return (imallocx_flags(*usize, alignment, zero, try_tcache, arena)); + return (imallocx_flags(tsd, *usize, alignment, zero, try_tcache, + arena)); } void * je_mallocx(size_t size, int flags) { + tsd_t *tsd; void *p; size_t usize; assert(size != 0); - if (unlikely(malloc_init())) + if (unlikely(malloc_init()) || unlikely((tsd = tsd_tryget()) == NULL)) goto label_oom; if (config_prof && opt_prof) - p = imallocx_prof(size, flags, &usize); + p = imallocx_prof(tsd, size, flags, &usize); else - p = imallocx_no_prof(size, flags, &usize); + p = imallocx_no_prof(tsd, size, flags, &usize); if (unlikely(p == NULL)) goto label_oom; if (config_stats) { + tsd_t *tsd = tsd_tryget(); assert(usize == isalloc(p, config_prof)); - thread_allocated_tsd_get()->allocated += usize; + if (tsd != NULL) + *tsd_thread_allocatedp_get(tsd) += usize; } UTRACE(0, size, p); JEMALLOC_VALGRIND_MALLOC(true, p, usize, MALLOCX_ZERO_GET(flags)); @@ -1551,47 +1570,47 @@ label_oom: } static void * -irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, - bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena, - prof_tctx_t *tctx) +irallocx_prof_sample(tsd_t *tsd, void *oldptr, size_t size, size_t alignment, + size_t usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, + arena_t *arena, prof_tctx_t *tctx) { void *p; if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { - p = iralloct(oldptr, LARGE_MINCLASS, 
alignment, zero, + p = iralloct(tsd, oldptr, LARGE_MINCLASS, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else { - p = iralloct(oldptr, size, alignment, zero, try_tcache_alloc, - try_tcache_dalloc, arena); + p = iralloct(tsd, oldptr, size, alignment, zero, + try_tcache_alloc, try_tcache_dalloc, arena); } return (p); } JEMALLOC_ALWAYS_INLINE_C void * -irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, - size_t *usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena) +irallocx_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, + size_t alignment, size_t *usize, bool zero, bool try_tcache_alloc, + bool try_tcache_dalloc, arena_t *arena) { void *p; prof_tctx_t *old_tctx, *tctx; old_tctx = prof_tctx_get(oldptr); - tctx = prof_alloc_prep(*usize, false); + tctx = prof_alloc_prep(tsd, *usize, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { - p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, - try_tcache_alloc, try_tcache_dalloc, arena, tctx); + p = irallocx_prof_sample(tsd, oldptr, size, alignment, *usize, + zero, try_tcache_alloc, try_tcache_dalloc, arena, tctx); } else { - p = iralloct(oldptr, size, alignment, zero, try_tcache_alloc, - try_tcache_dalloc, arena); + p = iralloct(tsd, oldptr, size, alignment, zero, + try_tcache_alloc, try_tcache_dalloc, arena); } if (unlikely(p == NULL)) { - prof_alloc_rollback(tctx, false); + prof_alloc_rollback(tsd, tctx, false); return (NULL); } @@ -1606,7 +1625,7 @@ irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, */ *usize = isalloc(p, config_prof); } - prof_realloc(p, *usize, tctx, false, old_usize, old_tctx); + prof_realloc(tsd, p, *usize, tctx, false, old_usize, old_tctx); return (p); } @@ -1615,6 +1634,7 @@ void * je_rallocx(void *ptr, size_t size, int flags) { void *p; + tsd_t *tsd; size_t usize; UNUSED size_t old_usize JEMALLOC_CC_SILENCE_INIT(0); UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); @@ -1628,6 +1648,9 @@ je_rallocx(void *ptr, size_t size, int flags) assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); + if (unlikely((tsd = tsd_tryget()) == NULL)) + goto label_oom; + if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena_chunk_t *chunk; @@ -1651,12 +1674,12 @@ je_rallocx(void *ptr, size_t size, int flags) if (config_prof && opt_prof) { usize = (alignment == 0) ? 
s2u(size) : sa2u(size, alignment); assert(usize != 0); - p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero, - try_tcache_alloc, try_tcache_dalloc, arena); + p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, + zero, try_tcache_alloc, try_tcache_dalloc, arena); if (unlikely(p == NULL)) goto label_oom; } else { - p = iralloct(ptr, size, alignment, zero, try_tcache_alloc, + p = iralloct(tsd, ptr, size, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); if (unlikely(p == NULL)) goto label_oom; @@ -1665,10 +1688,8 @@ je_rallocx(void *ptr, size_t size, int flags) } if (config_stats) { - thread_allocated_t *ta; - ta = thread_allocated_tsd_get(); - ta->allocated += usize; - ta->deallocated += old_usize; + *tsd_thread_allocatedp_get(tsd) += usize; + *tsd_thread_deallocatedp_get(tsd) += old_usize; } UTRACE(ptr, size, p); JEMALLOC_VALGRIND_REALLOC(true, p, usize, false, ptr, old_usize, @@ -1724,8 +1745,8 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, } JEMALLOC_ALWAYS_INLINE_C size_t -ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, - size_t alignment, bool zero, arena_t *arena) +ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, + size_t extra, size_t alignment, bool zero, arena_t *arena) { size_t max_usize, usize; prof_tctx_t *old_tctx, *tctx; @@ -1739,7 +1760,7 @@ ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, */ max_usize = (alignment == 0) ? s2u(size+extra) : sa2u(size+extra, alignment); - tctx = prof_alloc_prep(max_usize, false); + tctx = prof_alloc_prep(tsd, max_usize, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { usize = ixallocx_prof_sample(ptr, old_usize, size, extra, alignment, zero, max_usize, arena, tctx); @@ -1748,10 +1769,10 @@ ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, zero, arena); } if (unlikely(usize == old_usize)) { - prof_alloc_rollback(tctx, false); + prof_alloc_rollback(tsd, tctx, false); return (usize); } - prof_realloc(ptr, usize, tctx, false, old_usize, old_tctx); + prof_realloc(tsd, ptr, usize, tctx, false, old_usize, old_tctx); return (usize); } @@ -1759,6 +1780,7 @@ ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, size_t je_xallocx(void *ptr, size_t size, size_t extra, int flags) { + tsd_t *tsd; size_t usize, old_usize; UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); size_t alignment = MALLOCX_ALIGN_GET(flags); @@ -1778,12 +1800,16 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) arena = NULL; old_usize = isalloc(ptr, config_prof); + if (unlikely((tsd = tsd_tryget()) == NULL)) { + usize = old_usize; + goto label_not_resized; + } if (config_valgrind && unlikely(in_valgrind)) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { - usize = ixallocx_prof(ptr, old_usize, size, extra, alignment, - zero, arena); + usize = ixallocx_prof(tsd, ptr, old_usize, size, extra, + alignment, zero, arena); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, zero, arena); @@ -1792,10 +1818,8 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) goto label_not_resized; if (config_stats) { - thread_allocated_t *ta; - ta = thread_allocated_tsd_get(); - ta->allocated += usize; - ta->deallocated += old_usize; + *tsd_thread_allocatedp_get(tsd) += usize; + *tsd_thread_deallocatedp_get(tsd) += old_usize; } JEMALLOC_VALGRIND_REALLOC(false, ptr, usize, false, ptr, old_usize, old_rzsize, false, zero); @@ -1839,7 +1863,7 @@ je_dallocx(void *ptr, int flags) 
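/*
 * Editorial sketch, not part of the patch: how the consolidated tsd_t is
 * consumed on the allocation fast paths in this file after the change.
 * tsd_tryget() may return NULL (before TSD boot or during thread
 * teardown), so the stats update is guarded the same way ifree() guards
 * it above.  example_count_alloc() is a hypothetical helper used only for
 * illustration; tsd_tryget() and tsd_thread_allocatedp_get() are the
 * accessors generated by the MALLOC_TSD/O() macros added to tsd.h earlier
 * in this patch.
 */
static void
example_count_alloc(size_t usize)
{
	tsd_t *tsd = tsd_tryget();

	if (config_stats && tsd != NULL)
		*tsd_thread_allocatedp_get(tsd) += usize;
}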
try_tcache = true; UTRACE(ptr, 0, 0); - ifree(ptr, try_tcache); + ifree(tsd_tryget(), ptr, try_tcache); } JEMALLOC_ALWAYS_INLINE_C size_t @@ -1875,7 +1899,7 @@ je_sdallocx(void *ptr, size_t size, int flags) try_tcache = true; UTRACE(ptr, 0, 0); - isfree(ptr, usize, try_tcache); + isfree(tsd_tryget(), ptr, usize, try_tcache); } size_t @@ -2072,9 +2096,9 @@ a0alloc(size_t size, bool zero) size = 1; if (size <= arena_maxclass) - return (arena_malloc(arenas[0], size, zero, false)); + return (arena_malloc(NULL, arenas[0], size, zero, false)); else - return (huge_malloc(NULL, size, zero)); + return (huge_malloc(NULL, arenas[0], size, zero)); } void * @@ -2101,7 +2125,7 @@ a0free(void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) - arena_dalloc(chunk, ptr, false); + arena_dalloc(NULL, chunk, ptr, false); else huge_dalloc(ptr); } diff --git a/src/prof.c b/src/prof.c index a773e22..dd84f53 100644 --- a/src/prof.c +++ b/src/prof.c @@ -14,8 +14,6 @@ /******************************************************************************/ /* Data. */ -malloc_tsd_data(, prof_tdata, prof_tdata_t *, NULL) - bool opt_prof = false; bool opt_prof_active = true; size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; @@ -102,9 +100,9 @@ static bool prof_booted = false; */ static bool prof_tctx_should_destroy(prof_tctx_t *tctx); -static void prof_tctx_destroy(prof_tctx_t *tctx); +static void prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx); static bool prof_tdata_should_destroy(prof_tdata_t *tdata); -static void prof_tdata_destroy(prof_tdata_t *tdata); +static void prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata); /******************************************************************************/ /* Red-black trees. */ @@ -151,7 +149,7 @@ rb_gen(static UNUSED, tdata_tree_, prof_tdata_tree_t, prof_tdata_t, tdata_link, /******************************************************************************/ void -prof_alloc_rollback(prof_tctx_t *tctx, bool updated) +prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated) { prof_tdata_t *tdata; @@ -164,8 +162,8 @@ prof_alloc_rollback(prof_tctx_t *tctx, bool updated) * potential for sample bias is minimal except in contrived * programs. */ - tdata = prof_tdata_get(true); - if ((uintptr_t)tdata > (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(tsd, true); + if (tdata != NULL) prof_sample_threshold_update(tctx->tdata); } @@ -173,7 +171,7 @@ prof_alloc_rollback(prof_tctx_t *tctx, bool updated) malloc_mutex_lock(tctx->tdata->lock); tctx->prepared = false; if (prof_tctx_should_destroy(tctx)) - prof_tctx_destroy(tctx); + prof_tctx_destroy(tsd, tctx); else malloc_mutex_unlock(tctx->tdata->lock); } @@ -195,7 +193,7 @@ prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx) { } void -prof_free_sampled_object(size_t usize, prof_tctx_t *tctx) +prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx) { malloc_mutex_lock(tctx->tdata->lock); @@ -205,7 +203,7 @@ prof_free_sampled_object(size_t usize, prof_tctx_t *tctx) tctx->cnts.curbytes -= usize; if (prof_tctx_should_destroy(tctx)) - prof_tctx_destroy(tctx); + prof_tctx_destroy(tsd, tctx); else malloc_mutex_unlock(tctx->tdata->lock); } @@ -494,13 +492,13 @@ prof_tdata_mutex_choose(uint64_t thr_uid) } static prof_gctx_t * -prof_gctx_create(prof_bt_t *bt) +prof_gctx_create(tsd_t *tsd, prof_bt_t *bt) { /* * Create a single allocation that has space for vec of length bt->len. 
*/ - prof_gctx_t *gctx = (prof_gctx_t *)imalloc(offsetof(prof_gctx_t, vec) + - (bt->len * sizeof(void *))); + prof_gctx_t *gctx = (prof_gctx_t *)imalloc(tsd, offsetof(prof_gctx_t, + vec) + (bt->len * sizeof(void *))); if (gctx == NULL) return (NULL); gctx->lock = prof_gctx_mutex_choose(); @@ -518,7 +516,7 @@ prof_gctx_create(prof_bt_t *bt) } static void -prof_gctx_maybe_destroy(prof_gctx_t *gctx, prof_tdata_t *tdata) +prof_gctx_maybe_destroy(tsd_t *tsd, prof_gctx_t *gctx, prof_tdata_t *tdata) { cassert(config_prof); @@ -534,12 +532,12 @@ prof_gctx_maybe_destroy(prof_gctx_t *gctx, prof_tdata_t *tdata) malloc_mutex_lock(gctx->lock); if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { /* Remove gctx from bt2gctx. */ - if (ckh_remove(&bt2gctx, &gctx->bt, NULL, NULL)) + if (ckh_remove(tsd, &bt2gctx, &gctx->bt, NULL, NULL)) not_reached(); prof_leave(tdata); /* Destroy gctx. */ malloc_mutex_unlock(gctx->lock); - idalloc(gctx); + idalloc(tsd, gctx); } else { /* * Compensate for increment in prof_tctx_destroy() or @@ -580,7 +578,7 @@ prof_gctx_should_destroy(prof_gctx_t *gctx) /* tctx->tdata->lock is held upon entry, and released before return. */ static void -prof_tctx_destroy(prof_tctx_t *tctx) +prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) { prof_tdata_t *tdata = tctx->tdata; prof_gctx_t *gctx = tctx->gctx; @@ -592,7 +590,7 @@ prof_tctx_destroy(prof_tctx_t *tctx) assert(tctx->cnts.accumobjs == 0); assert(tctx->cnts.accumbytes == 0); - ckh_remove(&tdata->bt2tctx, &gctx->bt, NULL, NULL); + ckh_remove(tsd, &tdata->bt2tctx, &gctx->bt, NULL, NULL); destroy_tdata = prof_tdata_should_destroy(tdata); malloc_mutex_unlock(tdata->lock); @@ -618,17 +616,17 @@ prof_tctx_destroy(prof_tctx_t *tctx) destroy_gctx = false; malloc_mutex_unlock(gctx->lock); if (destroy_gctx) - prof_gctx_maybe_destroy(gctx, tdata); + prof_gctx_maybe_destroy(tsd, gctx, tdata); if (destroy_tdata) - prof_tdata_destroy(tdata); + prof_tdata_destroy(tsd, tdata); - idalloc(tctx); + idalloc(tsd, tctx); } static bool -prof_lookup_global(prof_bt_t *bt, prof_tdata_t *tdata, void **p_btkey, - prof_gctx_t **p_gctx, bool *p_new_gctx) +prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, + void **p_btkey, prof_gctx_t **p_gctx, bool *p_new_gctx) { union { prof_gctx_t *p; @@ -643,16 +641,16 @@ prof_lookup_global(prof_bt_t *bt, prof_tdata_t *tdata, void **p_btkey, prof_enter(tdata); if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { /* bt has never been seen before. Insert it. */ - gctx.p = prof_gctx_create(bt); + gctx.p = prof_gctx_create(tsd, bt); if (gctx.v == NULL) { prof_leave(tdata); return (true); } btkey.p = &gctx.p->bt; - if (ckh_insert(&bt2gctx, btkey.v, gctx.v)) { + if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { /* OOM. */ prof_leave(tdata); - idalloc(gctx.v); + idalloc(tsd, gctx.v); return (true); } new_gctx = true; @@ -675,7 +673,7 @@ prof_lookup_global(prof_bt_t *bt, prof_tdata_t *tdata, void **p_btkey, } prof_tctx_t * -prof_lookup(prof_bt_t *bt) +prof_lookup(tsd_t *tsd, prof_bt_t *bt) { union { prof_tctx_t *p; @@ -686,8 +684,8 @@ prof_lookup(prof_bt_t *bt) cassert(config_prof); - tdata = prof_tdata_get(false); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) return (NULL); malloc_mutex_lock(tdata->lock); @@ -704,15 +702,15 @@ prof_lookup(prof_bt_t *bt) * This thread's cache lacks bt. Look for it in the global * cache. 
*/ - if (prof_lookup_global(bt, tdata, &btkey, &gctx, + if (prof_lookup_global(tsd, bt, tdata, &btkey, &gctx, &new_gctx)) return (NULL); /* Link a prof_tctx_t into gctx for this thread. */ - ret.v = imalloc(sizeof(prof_tctx_t)); + ret.v = imalloc(tsd, sizeof(prof_tctx_t)); if (ret.p == NULL) { if (new_gctx) - prof_gctx_maybe_destroy(gctx, tdata); + prof_gctx_maybe_destroy(tsd, gctx, tdata); return (NULL); } ret.p->tdata = tdata; @@ -721,12 +719,12 @@ prof_lookup(prof_bt_t *bt) ret.p->prepared = true; ret.p->state = prof_tctx_state_nominal; malloc_mutex_lock(tdata->lock); - error = ckh_insert(&tdata->bt2tctx, btkey, ret.v); + error = ckh_insert(tsd, &tdata->bt2tctx, btkey, ret.v); malloc_mutex_unlock(tdata->lock); if (error) { if (new_gctx) - prof_gctx_maybe_destroy(gctx, tdata); - idalloc(ret.v); + prof_gctx_maybe_destroy(tsd, gctx, tdata); + idalloc(tsd, ret.v); return (NULL); } malloc_mutex_lock(gctx->lock); @@ -798,10 +796,13 @@ size_t prof_bt_count(void) { size_t bt_count; + tsd_t *tsd; prof_tdata_t *tdata; - tdata = prof_tdata_get(false); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + if ((tsd = tsd_tryget()) == NULL) + return (0); + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) return (0); prof_enter(tdata); @@ -989,6 +990,7 @@ static prof_tctx_t * prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) { prof_tctx_t *ret; + tsd_t *tsd = (tsd_t *)arg; switch (tctx->state) { case prof_tctx_state_nominal: @@ -1000,7 +1002,7 @@ prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) case prof_tctx_state_purgatory: ret = tctx_tree_next(tctxs, tctx); tctx_tree_remove(tctxs, tctx); - idalloc(tctx); + idalloc(tsd, tctx); goto label_return; default: not_reached(); @@ -1049,7 +1051,8 @@ prof_gctx_merge_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) static prof_gctx_t * prof_gctx_finish_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) { - prof_tdata_t *tdata = (prof_tdata_t *)arg; + tsd_t *tsd = (tsd_t *)arg; + prof_tdata_t *tdata = prof_tdata_get(tsd, false); prof_tctx_t *next; bool destroy_gctx; @@ -1057,13 +1060,13 @@ prof_gctx_finish_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) next = NULL; do { next = tctx_tree_iter(&gctx->tctxs, next, prof_tctx_finish_iter, - NULL); + tsd); } while (next != NULL); gctx->nlimbo--; destroy_gctx = prof_gctx_should_destroy(gctx); malloc_mutex_unlock(gctx->lock); if (destroy_gctx) - prof_gctx_maybe_destroy(gctx, tdata); + prof_gctx_maybe_destroy(tsd, gctx, tdata); return (NULL); } @@ -1277,7 +1280,7 @@ label_return: } static bool -prof_dump(bool propagate_err, const char *filename, bool leakcheck) +prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, bool leakcheck) { prof_tdata_t *tdata; prof_cnt_t cnt_all; @@ -1291,8 +1294,8 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) cassert(config_prof); - tdata = prof_tdata_get(false); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) return (true); malloc_mutex_lock(&prof_dump_mtx); @@ -1341,7 +1344,7 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) if (prof_dump_close(propagate_err)) goto label_open_close_error; - gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, tdata); + gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, tsd); malloc_mutex_unlock(&prof_dump_mtx); if (leakcheck) @@ -1351,7 +1354,7 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) label_write_error: 
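/*
 * Editorial sketch, not part of the patch: the calling convention the
 * prof_*() entry points follow after this change -- fetch the thread's
 * tsd_t first, then derive the prof_tdata_t from it, and treat a NULL at
 * either step as "profiling unavailable" instead of comparing against the
 * old PROF_TDATA_STATE_* sentinel values.  example_prof_active() is a
 * hypothetical wrapper mirroring prof_thread_active_get() as modified
 * later in this patch.
 */
static bool
example_prof_active(void)
{
	tsd_t *tsd;
	prof_tdata_t *tdata;

	if ((tsd = tsd_tryget()) == NULL)
		return (false);
	tdata = prof_tdata_get(tsd, true);
	if (tdata == NULL)
		return (false);
	return (tdata->active);
}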
prof_dump_close(propagate_err); label_open_close_error: - gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, tdata); + gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, tsd); malloc_mutex_unlock(&prof_dump_mtx); return (true); } @@ -1381,24 +1384,28 @@ prof_dump_filename(char *filename, char v, uint64_t vseq) static void prof_fdump(void) { + tsd_t *tsd; char filename[DUMP_FILENAME_BUFSIZE]; cassert(config_prof); if (prof_booted == false) return; + if ((tsd = tsd_tryget()) == NULL) + return; if (opt_prof_final && opt_prof_prefix[0] != '\0') { malloc_mutex_lock(&prof_dump_seq_mtx); prof_dump_filename(filename, 'f', VSEQ_INVALID); malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(false, filename, opt_prof_leak); + prof_dump(tsd, false, filename, opt_prof_leak); } } void prof_idump(void) { + tsd_t *tsd; prof_tdata_t *tdata; char filename[PATH_MAX + 1]; @@ -1406,8 +1413,10 @@ prof_idump(void) if (prof_booted == false) return; - tdata = prof_tdata_get(false); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + if ((tsd = tsd_tryget()) == NULL) + return; + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) return; if (tdata->enq) { tdata->enq_idump = true; @@ -1419,19 +1428,22 @@ prof_idump(void) prof_dump_filename(filename, 'i', prof_dump_iseq); prof_dump_iseq++; malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(false, filename, false); + prof_dump(tsd, false, filename, false); } } bool prof_mdump(const char *filename) { + tsd_t *tsd; char filename_buf[DUMP_FILENAME_BUFSIZE]; cassert(config_prof); if (opt_prof == false || prof_booted == false) return (true); + if ((tsd = tsd_tryget()) == NULL) + return (true); if (filename == NULL) { /* No filename specified, so automatically generate one. */ @@ -1443,12 +1455,13 @@ prof_mdump(const char *filename) malloc_mutex_unlock(&prof_dump_seq_mtx); filename = filename_buf; } - return (prof_dump(true, filename, false)); + return (prof_dump(tsd, true, filename, false)); } void prof_gdump(void) { + tsd_t *tsd; prof_tdata_t *tdata; char filename[DUMP_FILENAME_BUFSIZE]; @@ -1456,8 +1469,10 @@ prof_gdump(void) if (prof_booted == false) return; - tdata = prof_tdata_get(false); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + if ((tsd = tsd_tryget()) == NULL) + return; + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) return; if (tdata->enq) { tdata->enq_gdump = true; @@ -1469,7 +1484,7 @@ prof_gdump(void) prof_dump_filename(filename, 'u', prof_dump_useq); prof_dump_useq++; malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(false, filename, false); + prof_dump(tsd, false, filename, false); } } @@ -1510,14 +1525,14 @@ prof_thr_uid_alloc(void) } static prof_tdata_t * -prof_tdata_init_impl(uint64_t thr_uid) +prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid) { prof_tdata_t *tdata; cassert(config_prof); /* Initialize an empty cache for this thread. 
*/ - tdata = (prof_tdata_t *)imalloc(sizeof(prof_tdata_t)); + tdata = (prof_tdata_t *)imalloc(tsd, sizeof(prof_tdata_t)); if (tdata == NULL) return (NULL); @@ -1526,9 +1541,9 @@ prof_tdata_init_impl(uint64_t thr_uid) tdata->thread_name = NULL; tdata->state = prof_tdata_state_attached; - if (ckh_new(&tdata->bt2tctx, PROF_CKH_MINITEMS, + if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { - idalloc(tdata); + idalloc(tsd, tdata); return (NULL); } @@ -1542,8 +1557,6 @@ prof_tdata_init_impl(uint64_t thr_uid) tdata->dumping = false; tdata->active = true; - prof_tdata_tsd_set(&tdata); - malloc_mutex_lock(&tdatas_mtx); tdata_tree_insert(&tdatas, tdata); malloc_mutex_unlock(&tdatas_mtx); @@ -1552,17 +1565,17 @@ prof_tdata_init_impl(uint64_t thr_uid) } prof_tdata_t * -prof_tdata_init(void) +prof_tdata_init(tsd_t *tsd) { - return (prof_tdata_init_impl(prof_thr_uid_alloc())); + return (prof_tdata_init_impl(tsd, prof_thr_uid_alloc())); } prof_tdata_t * -prof_tdata_reinit(prof_tdata_t *tdata) +prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata) { - return (prof_tdata_init_impl(tdata->thr_uid)); + return (prof_tdata_init_impl(tsd, tdata->thr_uid)); } /* tdata->lock must be held. */ @@ -1578,7 +1591,7 @@ prof_tdata_should_destroy(prof_tdata_t *tdata) } static void -prof_tdata_destroy(prof_tdata_t *tdata) +prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata) { assert(prof_tdata_should_destroy(tdata)); @@ -1588,13 +1601,14 @@ prof_tdata_destroy(prof_tdata_t *tdata) malloc_mutex_unlock(&tdatas_mtx); if (tdata->thread_name != NULL) - idalloc(tdata->thread_name); - ckh_delete(&tdata->bt2tctx); - idalloc(tdata); + idalloc(tsd, tdata->thread_name); + ckh_delete(tsd, &tdata->bt2tctx); + idalloc(tsd, tdata); } static void -prof_tdata_state_transition(prof_tdata_t *tdata, prof_tdata_state_t state) +prof_tdata_state_transition(tsd_t *tsd, prof_tdata_t *tdata, + prof_tdata_state_t state) { bool destroy_tdata; @@ -1606,33 +1620,34 @@ prof_tdata_state_transition(prof_tdata_t *tdata, prof_tdata_state_t state) destroy_tdata = false; malloc_mutex_unlock(tdata->lock); if (destroy_tdata) - prof_tdata_destroy(tdata); + prof_tdata_destroy(tsd, tdata); } static void -prof_tdata_detach(prof_tdata_t *tdata) +prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata) { - prof_tdata_state_transition(tdata, prof_tdata_state_detached); + prof_tdata_state_transition(tsd, tdata, prof_tdata_state_detached); } static void -prof_tdata_expire(prof_tdata_t *tdata) +prof_tdata_expire(tsd_t *tsd, prof_tdata_t *tdata) { - prof_tdata_state_transition(tdata, prof_tdata_state_expired); + prof_tdata_state_transition(tsd, tdata, prof_tdata_state_expired); } static prof_tdata_t * prof_tdata_reset_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) { + tsd_t *tsd = (tsd_t *)arg; - prof_tdata_expire(tdata); + prof_tdata_expire(tsd, tdata); return (NULL); } void -prof_reset(size_t lg_sample) +prof_reset(tsd_t *tsd, size_t lg_sample) { assert(lg_sample < (sizeof(uint64_t) << 3)); @@ -1641,69 +1656,58 @@ prof_reset(size_t lg_sample) malloc_mutex_lock(&tdatas_mtx); lg_prof_sample = lg_sample; - tdata_tree_iter(&tdatas, NULL, prof_tdata_reset_iter, NULL); + tdata_tree_iter(&tdatas, NULL, prof_tdata_reset_iter, tsd); malloc_mutex_unlock(&tdatas_mtx); malloc_mutex_unlock(&prof_dump_mtx); } void -prof_tdata_cleanup(void *arg) +prof_tdata_cleanup(tsd_t *tsd) { - prof_tdata_t *tdata = *(prof_tdata_t **)arg; + prof_tdata_t *tdata; - cassert(config_prof); + if (!config_prof) + return; - if (tdata == 
PROF_TDATA_STATE_REINCARNATED) { - /* - * Another destructor deallocated memory after this destructor - * was called. Reset tdata to PROF_TDATA_STATE_PURGATORY in - * order to receive another callback. - */ - tdata = PROF_TDATA_STATE_PURGATORY; - prof_tdata_tsd_set(&tdata); - } else if (tdata == PROF_TDATA_STATE_PURGATORY) { - /* - * The previous time this destructor was called, we set the key - * to PROF_TDATA_STATE_PURGATORY so that other destructors - * wouldn't cause re-creation of the tdata. This time, do - * nothing, so that the destructor will not be called again. - */ - } else if (tdata != NULL) { - prof_tdata_detach(tdata); - tdata = PROF_TDATA_STATE_PURGATORY; - prof_tdata_tsd_set(&tdata); - } + tdata = tsd_prof_tdata_get(tsd); + if (tdata != NULL) + prof_tdata_detach(tsd, tdata); } const char * prof_thread_name_get(void) { - prof_tdata_t *tdata = prof_tdata_get(true); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tsd_t *tsd; + prof_tdata_t *tdata; + + if ((tsd = tsd_tryget()) == NULL) + return (NULL); + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) return (NULL); return (tdata->thread_name); } bool -prof_thread_name_set(const char *thread_name) +prof_thread_name_set(tsd_t *tsd, const char *thread_name) { prof_tdata_t *tdata; size_t size; char *s; - tdata = prof_tdata_get(true); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) return (true); size = strlen(thread_name) + 1; - s = imalloc(size); + s = imalloc(tsd, size); if (s == NULL) return (true); memcpy(s, thread_name, size); if (tdata->thread_name != NULL) - idalloc(tdata->thread_name); + idalloc(tsd, tdata->thread_name); tdata->thread_name = s; return (false); } @@ -1711,8 +1715,13 @@ prof_thread_name_set(const char *thread_name) bool prof_thread_active_get(void) { - prof_tdata_t *tdata = prof_tdata_get(true); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + tsd_t *tsd; + prof_tdata_t *tdata; + + if ((tsd = tsd_tryget()) == NULL) + return (false); + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) return (false); return (tdata->active); } @@ -1720,10 +1729,13 @@ prof_thread_active_get(void) bool prof_thread_active_set(bool active) { + tsd_t *tsd; prof_tdata_t *tdata; - tdata = prof_tdata_get(true); - if ((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + if ((tsd = tsd_tryget()) == NULL) + return (true); + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) return (true); tdata->active = active; return (false); @@ -1772,20 +1784,18 @@ prof_boot2(void) cassert(config_prof); if (opt_prof) { + tsd_t *tsd; unsigned i; lg_prof_sample = opt_lg_prof_sample; - if (ckh_new(&bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, + if ((tsd = tsd_tryget()) == NULL) + return (true); + if (ckh_new(tsd, &bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) return (true); if (malloc_mutex_init(&bt2gctx_mtx)) return (true); - if (prof_tdata_tsd_boot()) { - malloc_write( - ": Error in pthread_key_create()\n"); - abort(); - } tdata_tree_new(&tdatas); if (malloc_mutex_init(&tdatas_mtx)) diff --git a/src/quarantine.c b/src/quarantine.c index efddeae..1301b47 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -10,25 +10,21 @@ #define QUARANTINE_STATE_MAX QUARANTINE_STATE_PURGATORY /******************************************************************************/ -/* Data. 
*/ - -malloc_tsd_data(, quarantine, quarantine_t *, NULL) - -/******************************************************************************/ /* Function prototypes for non-inline static functions. */ -static quarantine_t *quarantine_grow(quarantine_t *quarantine); -static void quarantine_drain_one(quarantine_t *quarantine); -static void quarantine_drain(quarantine_t *quarantine, size_t upper_bound); +static quarantine_t *quarantine_grow(tsd_t *tsd, quarantine_t *quarantine); +static void quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine); +static void quarantine_drain(tsd_t *tsd, quarantine_t *quarantine, + size_t upper_bound); /******************************************************************************/ quarantine_t * -quarantine_init(size_t lg_maxobjs) +quarantine_init(tsd_t *tsd, size_t lg_maxobjs) { quarantine_t *quarantine; - quarantine = (quarantine_t *)imalloc(offsetof(quarantine_t, objs) + + quarantine = (quarantine_t *)imalloc(tsd, offsetof(quarantine_t, objs) + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t))); if (quarantine == NULL) return (NULL); @@ -37,19 +33,17 @@ quarantine_init(size_t lg_maxobjs) quarantine->first = 0; quarantine->lg_maxobjs = lg_maxobjs; - quarantine_tsd_set(&quarantine); - return (quarantine); } static quarantine_t * -quarantine_grow(quarantine_t *quarantine) +quarantine_grow(tsd_t *tsd, quarantine_t *quarantine) { quarantine_t *ret; - ret = quarantine_init(quarantine->lg_maxobjs + 1); + ret = quarantine_init(tsd, quarantine->lg_maxobjs + 1); if (ret == NULL) { - quarantine_drain_one(quarantine); + quarantine_drain_one(tsd, quarantine); return (quarantine); } @@ -71,17 +65,17 @@ quarantine_grow(quarantine_t *quarantine) memcpy(&ret->objs[ncopy_a], quarantine->objs, ncopy_b * sizeof(quarantine_obj_t)); } - idalloc(quarantine); + idalloc(tsd, quarantine); return (ret); } static void -quarantine_drain_one(quarantine_t *quarantine) +quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine) { quarantine_obj_t *obj = &quarantine->objs[quarantine->first]; assert(obj->usize == isalloc(obj->ptr, config_prof)); - idalloc(obj->ptr); + idalloc(tsd, obj->ptr); quarantine->curbytes -= obj->usize; quarantine->curobjs--; quarantine->first = (quarantine->first + 1) & ((ZU(1) << @@ -89,15 +83,15 @@ quarantine_drain_one(quarantine_t *quarantine) } static void -quarantine_drain(quarantine_t *quarantine, size_t upper_bound) +quarantine_drain(tsd_t *tsd, quarantine_t *quarantine, size_t upper_bound) { while (quarantine->curbytes > upper_bound && quarantine->curobjs > 0) - quarantine_drain_one(quarantine); + quarantine_drain_one(tsd, quarantine); } void -quarantine(void *ptr) +quarantine(tsd_t *tsd, void *ptr) { quarantine_t *quarantine; size_t usize = isalloc(ptr, config_prof); @@ -105,17 +99,8 @@ quarantine(void *ptr) cassert(config_fill); assert(opt_quarantine); - quarantine = *quarantine_tsd_get(); - if ((uintptr_t)quarantine <= (uintptr_t)QUARANTINE_STATE_MAX) { - if (quarantine == QUARANTINE_STATE_PURGATORY) { - /* - * Make a note that quarantine() was called after - * quarantine_cleanup() was called. - */ - quarantine = QUARANTINE_STATE_REINCARNATED; - quarantine_tsd_set(&quarantine); - } - idalloc(ptr); + if ((quarantine = tsd_quarantine_get(tsd)) == NULL) { + idalloc(tsd, ptr); return; } /* @@ -125,11 +110,11 @@ quarantine(void *ptr) if (quarantine->curbytes + usize > opt_quarantine) { size_t upper_bound = (opt_quarantine >= usize) ? 
opt_quarantine - usize : 0; - quarantine_drain(quarantine, upper_bound); + quarantine_drain(tsd, quarantine, upper_bound); } /* Grow the quarantine ring buffer if it's full. */ if (quarantine->curobjs == (ZU(1) << quarantine->lg_maxobjs)) - quarantine = quarantine_grow(quarantine); + quarantine = quarantine_grow(tsd, quarantine); /* quarantine_grow() must free a slot if it fails to grow. */ assert(quarantine->curobjs < (ZU(1) << quarantine->lg_maxobjs)); /* Append ptr if its size doesn't exceed the quarantine size. */ @@ -154,46 +139,22 @@ quarantine(void *ptr) } } else { assert(quarantine->curbytes == 0); - idalloc(ptr); + idalloc(tsd, ptr); } } void -quarantine_cleanup(void *arg) -{ - quarantine_t *quarantine = *(quarantine_t **)arg; - - if (quarantine == QUARANTINE_STATE_REINCARNATED) { - /* - * Another destructor deallocated memory after this destructor - * was called. Reset quarantine to QUARANTINE_STATE_PURGATORY - * in order to receive another callback. - */ - quarantine = QUARANTINE_STATE_PURGATORY; - quarantine_tsd_set(&quarantine); - } else if (quarantine == QUARANTINE_STATE_PURGATORY) { - /* - * The previous time this destructor was called, we set the key - * to QUARANTINE_STATE_PURGATORY so that other destructors - * wouldn't cause re-creation of the quarantine. This time, do - * nothing, so that the destructor will not be called again. - */ - } else if (quarantine != NULL) { - quarantine_drain(quarantine, 0); - idalloc(quarantine); - quarantine = QUARANTINE_STATE_PURGATORY; - quarantine_tsd_set(&quarantine); - } -} - -bool -quarantine_boot(void) +quarantine_cleanup(tsd_t *tsd) { + quarantine_t *quarantine; - cassert(config_fill); - - if (quarantine_tsd_boot()) - return (true); + if (!config_fill) + return; - return (false); + quarantine = tsd_quarantine_get(tsd); + if (quarantine != NULL) { + quarantine_drain(tsd, quarantine, 0); + idalloc(tsd, quarantine); + tsd_quarantine_set(tsd, NULL); + } } diff --git a/src/rtree.c b/src/rtree.c index 87b0b15..2ff93db 100644 --- a/src/rtree.c +++ b/src/rtree.c @@ -9,8 +9,10 @@ rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc) assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3)); - bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1; - bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1; + bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void + *)))) - 1; + bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / + sizeof(uint8_t)))) - 1; if (bits > bits_in_leaf) { height = 1 + (bits - bits_in_leaf) / bits_per_level; if ((height-1) * bits_per_level + bits_in_leaf != bits) diff --git a/src/tcache.c b/src/tcache.c index f86a46e..bb4c3cc 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -4,9 +4,6 @@ /******************************************************************************/ /* Data. */ -malloc_tsd_data(, tcache, tcache_t *, NULL) -malloc_tsd_data(, tcache_enabled, tcache_enabled_t, tcache_enabled_default) - bool opt_tcache = true; ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; @@ -262,43 +259,14 @@ tcache_arena_dissociate(tcache_t *tcache) } tcache_t * -tcache_get_hard(tcache_t *tcache, bool create) +tcache_get_hard(tsd_t *tsd) { - if (tcache == NULL) { - if (create == false) { - /* - * Creating a tcache here would cause - * allocation as a side effect of free(). - * Ordinarily that would be okay since - * tcache_create() failure is a soft failure - * that doesn't propagate. 
However, if TLS - * data are freed via free() as in glibc, - * subtle corruption could result from setting - * a TLS variable after its backing memory is - * freed. - */ - return (NULL); - } - if (tcache_enabled_get() == false) { - tcache_enabled_set(false); /* Memoize. */ - return (NULL); - } - return (tcache_create(choose_arena(NULL))); - } - if (tcache == TCACHE_STATE_PURGATORY) { - /* - * Make a note that an allocator function was called - * after tcache_thread_cleanup() was called. - */ - tcache = TCACHE_STATE_REINCARNATED; - tcache_tsd_set(&tcache); + if (tcache_enabled_get() == false) { + tcache_enabled_set(false); /* Memoize. */ return (NULL); } - if (tcache == TCACHE_STATE_REINCARNATED) - return (NULL); - not_reached(); - return (NULL); + return (tcache_create(choose_arena(tsd, NULL))); } tcache_t * @@ -328,7 +296,7 @@ tcache_create(arena_t *arena) else if (size <= tcache_maxclass) tcache = (tcache_t *)arena_malloc_large(arena, size, true); else - tcache = (tcache_t *)icalloct(size, false, arena); + tcache = (tcache_t *)icalloct(NULL, size, false, arena); if (tcache == NULL) return (NULL); @@ -343,13 +311,11 @@ tcache_create(arena_t *arena) stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); } - tcache_tsd_set(&tcache); - return (tcache); } -void -tcache_destroy(tcache_t *tcache) +static void +tcache_destroy(tsd_t *tsd, tcache_t *tcache) { unsigned i; size_t tcache_size; @@ -403,39 +369,30 @@ tcache_destroy(tcache_t *tcache) arena_dalloc_large(arena, chunk, tcache); } else - idalloct(tcache, false); + idalloct(tsd, tcache, false); } void -tcache_thread_cleanup(void *arg) +tcache_cleanup(tsd_t *tsd) { - tcache_t *tcache = *(tcache_t **)arg; + tcache_t *tcache; - if (tcache == TCACHE_STATE_DISABLED) { - /* Do nothing. */ - } else if (tcache == TCACHE_STATE_REINCARNATED) { - /* - * Another destructor called an allocator function after this - * destructor was called. Reset tcache to - * TCACHE_STATE_PURGATORY in order to receive another callback. - */ - tcache = TCACHE_STATE_PURGATORY; - tcache_tsd_set(&tcache); - } else if (tcache == TCACHE_STATE_PURGATORY) { - /* - * The previous time this destructor was called, we set the key - * to TCACHE_STATE_PURGATORY so that other destructors wouldn't - * cause re-creation of the tcache. This time, do nothing, so - * that the destructor will not be called again. - */ - } else if (tcache != NULL) { - assert(tcache != TCACHE_STATE_PURGATORY); - tcache_destroy(tcache); - tcache = TCACHE_STATE_PURGATORY; - tcache_tsd_set(&tcache); + if (!config_tcache) + return; + + if ((tcache = tsd_tcache_get(tsd)) != NULL) { + tcache_destroy(tsd, tcache); + tsd_tcache_set(tsd, NULL); } } +void +tcache_enabled_cleanup(tsd_t *tsd) +{ + + /* Do nothing. */ +} + /* Caller must own arena->lock. 
*/ void tcache_stats_merge(tcache_t *tcache, arena_t *arena) @@ -464,7 +421,7 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) } bool -tcache_boot0(void) +tcache_boot(void) { unsigned i; @@ -504,13 +461,3 @@ tcache_boot0(void) return (false); } - -bool -tcache_boot1(void) -{ - - if (tcache_tsd_boot() || tcache_enabled_tsd_boot()) - return (true); - - return (false); -} diff --git a/src/tsd.c b/src/tsd.c index 700caab..27a70ee 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -7,6 +7,8 @@ static unsigned ncleanups; static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX]; +malloc_tsd_data(, , tsd_t, TSD_INITIALIZER) + /******************************************************************************/ void * @@ -14,14 +16,15 @@ malloc_tsd_malloc(size_t size) { /* Avoid choose_arena() in order to dodge bootstrapping issues. */ - return (arena_malloc(arenas[0], size, false, false)); + return (arena_malloc(NULL, arenas[0], CACHELINE_CEILING(size), false, + false)); } void malloc_tsd_dalloc(void *wrapper) { - idalloct(wrapper, false); + idalloct(NULL, wrapper, false); } void @@ -67,10 +70,54 @@ malloc_tsd_cleanup_register(bool (*f)(void)) } void +tsd_cleanup(void *arg) +{ + tsd_t *tsd = (tsd_t *)arg; + + if (tsd == NULL) { + /* OOM during re-initialization. */ + return; + } + + switch (tsd->state) { + case tsd_state_nominal: +#define O(n, t) \ + n##_cleanup(tsd); +MALLOC_TSD +#undef O + tsd->state = tsd_state_purgatory; + tsd_set(tsd); + break; + case tsd_state_purgatory: + /* + * The previous time this destructor was called, we set the + * state to tsd_state_purgatory so that other destructors + * wouldn't cause re-creation of the tsd. This time, do + * nothing, and do not request another callback. + */ + break; + case tsd_state_reincarnated: + /* + * Another destructor deallocated memory after this destructor + * was called. Reset state to tsd_state_purgatory and request + * another callback. 
+ */ + tsd->state = tsd_state_purgatory; + tsd_set(tsd); + break; + default: + not_reached(); + } +} + +bool malloc_tsd_boot(void) { ncleanups = 0; + if (tsd_boot()) + return (true); + return (false); } #ifdef _WIN32 diff --git a/test/unit/ckh.c b/test/unit/ckh.c index b214c27..148b81e 100644 --- a/test/unit/ckh.c +++ b/test/unit/ckh.c @@ -2,20 +2,25 @@ TEST_BEGIN(test_new_delete) { + tsd_t *tsd; ckh_t ckh; - assert_false(ckh_new(&ckh, 2, ckh_string_hash, ckh_string_keycomp), - "Unexpected ckh_new() error"); - ckh_delete(&ckh); + tsd = tsd_tryget(); + assert_ptr_not_null(tsd, "Unexpected tsd failure"); - assert_false(ckh_new(&ckh, 3, ckh_pointer_hash, ckh_pointer_keycomp), + assert_false(ckh_new(tsd, &ckh, 2, ckh_string_hash, ckh_string_keycomp), "Unexpected ckh_new() error"); - ckh_delete(&ckh); + ckh_delete(tsd, &ckh); + + assert_false(ckh_new(tsd, &ckh, 3, ckh_pointer_hash, + ckh_pointer_keycomp), "Unexpected ckh_new() error"); + ckh_delete(tsd, &ckh); } TEST_END TEST_BEGIN(test_count_insert_search_remove) { + tsd_t *tsd; ckh_t ckh; const char *strs[] = { "a string", @@ -26,7 +31,10 @@ TEST_BEGIN(test_count_insert_search_remove) const char *missing = "A string not in the hash table."; size_t i; - assert_false(ckh_new(&ckh, 2, ckh_string_hash, ckh_string_keycomp), + tsd = tsd_tryget(); + assert_ptr_not_null(tsd, "Unexpected tsd failure"); + + assert_false(ckh_new(tsd, &ckh, 2, ckh_string_hash, ckh_string_keycomp), "Unexpected ckh_new() error"); assert_zu_eq(ckh_count(&ckh), 0, "ckh_count() should return %zu, but it returned %zu", ZU(0), @@ -34,7 +42,7 @@ TEST_BEGIN(test_count_insert_search_remove) /* Insert. */ for (i = 0; i < sizeof(strs)/sizeof(const char *); i++) { - ckh_insert(&ckh, strs[i], strs[i]); + ckh_insert(tsd, &ckh, strs[i], strs[i]); assert_zu_eq(ckh_count(&ckh), i+1, "ckh_count() should return %zu, but it returned %zu", i+1, ckh_count(&ckh)); @@ -79,7 +87,7 @@ TEST_BEGIN(test_count_insert_search_remove) vp = (i & 2) ? &v.p : NULL; k.p = NULL; v.p = NULL; - assert_false(ckh_remove(&ckh, strs[i], kp, vp), + assert_false(ckh_remove(tsd, &ckh, strs[i], kp, vp), "Unexpected ckh_remove() error"); ks = (i & 1) ? 
strs[i] : (const char *)NULL; @@ -95,20 +103,24 @@ TEST_BEGIN(test_count_insert_search_remove) ckh_count(&ckh)); } - ckh_delete(&ckh); + ckh_delete(tsd, &ckh); } TEST_END TEST_BEGIN(test_insert_iter_remove) { #define NITEMS ZU(1000) + tsd_t *tsd; ckh_t ckh; void **p[NITEMS]; void *q, *r; size_t i; - assert_false(ckh_new(&ckh, 2, ckh_pointer_hash, ckh_pointer_keycomp), - "Unexpected ckh_new() error"); + tsd = tsd_tryget(); + assert_ptr_not_null(tsd, "Unexpected tsd failure"); + + assert_false(ckh_new(tsd, &ckh, 2, ckh_pointer_hash, + ckh_pointer_keycomp), "Unexpected ckh_new() error"); for (i = 0; i < NITEMS; i++) { p[i] = mallocx(i+1, 0); @@ -119,7 +131,7 @@ TEST_BEGIN(test_insert_iter_remove) size_t j; for (j = i; j < NITEMS; j++) { - assert_false(ckh_insert(&ckh, p[j], p[j]), + assert_false(ckh_insert(tsd, &ckh, p[j], p[j]), "Unexpected ckh_insert() failure"); assert_false(ckh_search(&ckh, p[j], &q, &r), "Unexpected ckh_search() failure"); @@ -134,13 +146,13 @@ TEST_BEGIN(test_insert_iter_remove) for (j = i + 1; j < NITEMS; j++) { assert_false(ckh_search(&ckh, p[j], NULL, NULL), "Unexpected ckh_search() failure"); - assert_false(ckh_remove(&ckh, p[j], &q, &r), + assert_false(ckh_remove(tsd, &ckh, p[j], &q, &r), "Unexpected ckh_remove() failure"); assert_ptr_eq(p[j], q, "Key pointer mismatch"); assert_ptr_eq(p[j], r, "Value pointer mismatch"); assert_true(ckh_search(&ckh, p[j], NULL, NULL), "Unexpected ckh_search() success"); - assert_true(ckh_remove(&ckh, p[j], &q, &r), + assert_true(ckh_remove(tsd, &ckh, p[j], &q, &r), "Unexpected ckh_remove() success"); } @@ -176,13 +188,13 @@ TEST_BEGIN(test_insert_iter_remove) for (i = 0; i < NITEMS; i++) { assert_false(ckh_search(&ckh, p[i], NULL, NULL), "Unexpected ckh_search() failure"); - assert_false(ckh_remove(&ckh, p[i], &q, &r), + assert_false(ckh_remove(tsd, &ckh, p[i], &q, &r), "Unexpected ckh_remove() failure"); assert_ptr_eq(p[i], q, "Key pointer mismatch"); assert_ptr_eq(p[i], r, "Value pointer mismatch"); assert_true(ckh_search(&ckh, p[i], NULL, NULL), "Unexpected ckh_search() success"); - assert_true(ckh_remove(&ckh, p[i], &q, &r), + assert_true(ckh_remove(tsd, &ckh, p[i], &q, &r), "Unexpected ckh_remove() success"); dallocx(p[i], 0); } @@ -190,7 +202,7 @@ TEST_BEGIN(test_insert_iter_remove) assert_zu_eq(ckh_count(&ckh), 0, "ckh_count() should return %zu, but it returned %zu", ZU(0), ckh_count(&ckh)); - ckh_delete(&ckh); + ckh_delete(tsd, &ckh); #undef NITEMS } TEST_END diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 5463055..77a947d 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -5,7 +5,7 @@ TEST_BEGIN(test_rtree_get_empty) unsigned i; for (i = 1; i <= (sizeof(uintptr_t) << 3); i++) { - rtree_t *rtree = rtree_new(i, imalloc, idalloc); + rtree_t *rtree = rtree_new(i, malloc, free); assert_u_eq(rtree_get(rtree, 0), 0, "rtree_get() should return NULL for empty tree"); rtree_delete(rtree); @@ -18,7 +18,7 @@ TEST_BEGIN(test_rtree_extrema) unsigned i; for (i = 1; i <= (sizeof(uintptr_t) << 3); i++) { - rtree_t *rtree = rtree_new(i, imalloc, idalloc); + rtree_t *rtree = rtree_new(i, malloc, free); rtree_set(rtree, 0, 1); assert_u_eq(rtree_get(rtree, 0), 1, @@ -40,7 +40,7 @@ TEST_BEGIN(test_rtree_bits) for (i = 1; i < (sizeof(uintptr_t) << 3); i++) { uintptr_t keys[] = {0, 1, (((uintptr_t)1) << (sizeof(uintptr_t)*8-i)) - 1}; - rtree_t *rtree = rtree_new(i, imalloc, idalloc); + rtree_t *rtree = rtree_new(i, malloc, free); for (j = 0; j < sizeof(keys)/sizeof(uintptr_t); j++) { rtree_set(rtree, keys[j], 1); @@ -73,7 +73,7 @@ 
TEST_BEGIN(test_rtree_random) sfmt = init_gen_rand(SEED); for (i = 1; i <= (sizeof(uintptr_t) << 3); i++) { - rtree_t *rtree = rtree_new(i, imalloc, idalloc); + rtree_t *rtree = rtree_new(i, malloc, free); uintptr_t keys[NSET]; unsigned j; diff --git a/test/unit/tsd.c b/test/unit/tsd.c index f421c1a..391a780 100644 --- a/test/unit/tsd.c +++ b/test/unit/tsd.c @@ -16,11 +16,11 @@ data_cleanup(void *arg) data_cleanup_executed = true; } -malloc_tsd_protos(, data, data_t) -malloc_tsd_externs(data, data_t) +malloc_tsd_protos(, data_, data_t) +malloc_tsd_externs(data_, data_t) #define DATA_INIT 0x12345678 -malloc_tsd_data(, data, data_t, DATA_INIT) -malloc_tsd_funcs(, data, data_t, DATA_INIT, data_cleanup) +malloc_tsd_data(, data_, data_t, DATA_INIT) +malloc_tsd_funcs(, data_, data_t, DATA_INIT, data_cleanup) static void * thd_start(void *arg) -- cgit v0.12 From eb5376ab9e61d96daa0d1f03b4474baf5232478f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 23 Sep 2014 09:21:49 -0700 Subject: Add instructions for installing from non-packaged sources. --- INSTALL | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/INSTALL b/INSTALL index 6c46100..9af2336 100644 --- a/INSTALL +++ b/INSTALL @@ -1,10 +1,23 @@ -Building and installing jemalloc can be as simple as typing the following while -in the root directory of the source tree: +Building and installing a packaged release of jemalloc can be as simple as +typing the following while in the root directory of the source tree: ./configure make make install +If building from unpackaged developer sources, the simplest command sequence +that might work is: + + ./autogen.sh + make dist + make + make install + +Note that documentation is not built by the default target because doing so +would create a dependency on xsltproc in packaged releases, hence the +requirement to either run 'make dist' or avoid installing docs via the various +install_* targets documented below. + === Advanced configuration ===================================================== The 'configure' script supports numerous options that allow control of which -- cgit v0.12 From 70bdee07d9e3942580e576b94010108c342d609d Mon Sep 17 00:00:00 2001 From: Dave Rigby Date: Mon, 22 Sep 2014 15:53:16 +0100 Subject: autoconf: Support cygwin in addition to mingw --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 2d5b56a..ab4bcc3 100644 --- a/configure.ac +++ b/configure.ac @@ -330,7 +330,7 @@ case "${host}" in fi abi="xcoff" ;; - *-*-mingw*) + *-*-mingw* | *-*-cygwin*) abi="pecoff" force_tls="0" RPATH="" -- cgit v0.12 From 112704cfbfacfc9cecdfb732741df47eb4133902 Mon Sep 17 00:00:00 2001 From: Dave Rigby Date: Mon, 22 Sep 2014 15:54:33 +0100 Subject: Use MSVC intrinsics for lg_floor When using MSVC make use of its intrinsic functions (supported on x86, amd64 & ARM) for lg_floor. 
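For illustration only (this sketch is not part of the patch below): lg_floor(x) computes floor(log2(x)), i.e. the index of the highest set bit of x. Assuming a 64-bit MSVC build (the LG_SIZEOF_PTR == 3 case) and a nonzero argument, the intrinsic-based approach taken by the patch boils down to:

    #include <intrin.h>
    #include <stddef.h>

    /* Sketch: floor(log2(x)) via the MSVC bit-scan intrinsic.  Assumes
     * x != 0 and a 64-bit build; the patch also covers 32-bit builds
     * via _BitScanReverse. */
    static size_t
    lg_floor_sketch(size_t x)
    {
            unsigned long ret;

            _BitScanReverse64(&ret, x);     /* Index of highest set bit. */
            return ((size_t)ret);           /* e.g. lg_floor_sketch(12) == 3 */
    }

On x86/amd64 the intrinsic typically compiles to a single bsr instruction, which is the motivation for preferring it over the portable fallbacks when building with MSVC.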
--- include/jemalloc/internal/util.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index cc7806d..5af6832 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -176,6 +176,21 @@ lg_floor(size_t x) ); return (ret); } +#elif (defined(_MSC_VER)) +JEMALLOC_INLINE size_t +lg_floor(size_t x) +{ + unsigned long ret; + +#if (LG_SIZEOF_PTR == 3) + _BitScanReverse64(&ret, x); +#elif (LG_SIZEOF_PTR == 2) + _BitScanReverse(&ret, x); +#else +# error "Unsupported type sizes for lg_floor()" +#endif + return (ret); +} #elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ)) JEMALLOC_INLINE size_t lg_floor(size_t x) -- cgit v0.12 From 6ef80d68f092caf3b3802a73b8d716057b41864c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 24 Sep 2014 22:14:21 -0700 Subject: Fix profile dumping race. Fix a race that caused a non-critical assertion failure. To trigger the race, a thread had to be part way through initializing a new sample, such that it was discoverable by the dumping thread, but not yet linked into its gctx by the time a later dump phase would normally have reset its state to 'nominal'. Additionally, lock access to the state field during modification to transition to the dumping state. It's not apparent that this oversight could have caused an actual problem due to outer locking that protects the dumping machinery, but the added locking pedantically follows the stated locking protocol for the state field. --- include/jemalloc/internal/prof.h | 1 + src/prof.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index b8a8b41..3872c7a 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -79,6 +79,7 @@ struct prof_cnt_s { }; typedef enum { + prof_tctx_state_initializing, prof_tctx_state_nominal, prof_tctx_state_dumping, prof_tctx_state_purgatory /* Dumper must finish destroying. */ diff --git a/src/prof.c b/src/prof.c index dd84f53..9f10b53 100644 --- a/src/prof.c +++ b/src/prof.c @@ -717,7 +717,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); ret.p->gctx = gctx; ret.p->prepared = true; - ret.p->state = prof_tctx_state_nominal; + ret.p->state = prof_tctx_state_initializing; malloc_mutex_lock(tdata->lock); error = ckh_insert(tsd, &tdata->bt2tctx, btkey, ret.v); malloc_mutex_unlock(tdata->lock); @@ -728,6 +728,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) return (NULL); } malloc_mutex_lock(gctx->lock); + ret.p->state = prof_tctx_state_nominal; tctx_tree_insert(&gctx->tctxs, ret.p); gctx->nlimbo--; malloc_mutex_unlock(gctx->lock); @@ -925,8 +926,15 @@ static void prof_tctx_merge_tdata(prof_tctx_t *tctx, prof_tdata_t *tdata) { + malloc_mutex_lock(tctx->gctx->lock); + if (tctx->state == prof_tctx_state_initializing) { + malloc_mutex_unlock(tctx->gctx->lock); + return; + } assert(tctx->state == prof_tctx_state_nominal); tctx->state = prof_tctx_state_dumping; + malloc_mutex_unlock(tctx->gctx->lock); + memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; -- cgit v0.12 From f97e5ac4ec8a5ae7ed74829e6c1bf6ce814947f5 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 28 Sep 2014 14:43:11 -0700 Subject: Implement compile-time bitmap size computation. 
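As a worked example of the compile-time group arithmetic introduced below (illustrative only; it assumes 64-bit bitmap groups, i.e. LG_BITMAP_GROUP_NBITS == 6, and uses hypothetical SKETCH_* names rather than the real macros): a 2048-bit bitmap needs ceil(2048/64) = 32 level-0 groups plus ceil(32/64) = 1 level-1 group, so a two-level bitmap occupies 33 groups in total.

    #include <assert.h>
    #include <stddef.h>

    /* Hypothetical sketch of BITMAP_BITS2GROUPS-style arithmetic,
     * assuming 64-bit groups (1 << 6 bits per group). */
    #define SKETCH_LG_GROUP_NBITS   6
    #define SKETCH_GROUP_NBITS      ((size_t)1 << SKETCH_LG_GROUP_NBITS)
    #define SKETCH_BITS2GROUPS(nbits)                                       \
            (((nbits) + SKETCH_GROUP_NBITS - 1) >> SKETCH_LG_GROUP_NBITS)

    int
    main(void)
    {
            size_t l0 = SKETCH_BITS2GROUPS((size_t)2048);   /* 32 groups */
            size_t l1 = SKETCH_BITS2GROUPS(l0);             /* 1 group */

            assert(l0 == 32 && l1 == 1);    /* 33 groups across 2 levels */
            return (0);
    }

Because the group counts are pure macro arithmetic, BITMAP_GROUPS_MAX can be evaluated at compile time; the later "Move small run metadata into the arena chunk header" patch relies on this when it embeds bitmap_t bitmap[BITMAP_GROUPS_MAX] directly in arena_run_t.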
--- include/jemalloc/internal/bitmap.h | 46 ++++++++++++++++++++++++++++++++++++++ src/bitmap.c | 18 +++------------ test/unit/bitmap.c | 16 +++++-------- 3 files changed, 54 insertions(+), 26 deletions(-) diff --git a/include/jemalloc/internal/bitmap.h b/include/jemalloc/internal/bitmap.h index 6db4ab7..4ca40ff 100644 --- a/include/jemalloc/internal/bitmap.h +++ b/include/jemalloc/internal/bitmap.h @@ -3,6 +3,7 @@ /* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ #define LG_BITMAP_MAXBITS LG_RUN_MAXREGS +#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS) typedef struct bitmap_level_s bitmap_level_t; typedef struct bitmap_info_s bitmap_info_t; @@ -14,6 +15,51 @@ typedef unsigned long bitmap_t; #define BITMAP_GROUP_NBITS (ZU(1) << LG_BITMAP_GROUP_NBITS) #define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1) +/* Number of groups required to store a given number of bits. */ +#define BITMAP_BITS2GROUPS(nbits) \ + ((nbits + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS) + +/* + * Number of groups required at a particular level for a given number of bits. + */ +#define BITMAP_GROUPS_L0(nbits) \ + BITMAP_BITS2GROUPS(nbits) +#define BITMAP_GROUPS_L1(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits)) +#define BITMAP_GROUPS_L2(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits)))) +#define BITMAP_GROUPS_L3(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \ + BITMAP_BITS2GROUPS((nbits))))) + +/* + * Assuming the number of levels, number of groups required for a given number + * of bits. + */ +#define BITMAP_GROUPS_1_LEVEL(nbits) \ + BITMAP_GROUPS_L0(nbits) +#define BITMAP_GROUPS_2_LEVEL(nbits) \ + (BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits)) +#define BITMAP_GROUPS_3_LEVEL(nbits) \ + (BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits)) +#define BITMAP_GROUPS_4_LEVEL(nbits) \ + (BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits)) + +/* + * Maximum number of groups required to support LG_BITMAP_MAXBITS. + */ +#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2 +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3 +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4 +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS) +#else +# error "Unsupported bitmap size" +#endif + /* Maximum number of levels possible. */ #define BITMAP_MAX_LEVELS \ (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \ diff --git a/src/bitmap.c b/src/bitmap.c index e2bd907..c733372 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -2,19 +2,6 @@ #include "jemalloc/internal/jemalloc_internal.h" /******************************************************************************/ -/* Function prototypes for non-inline static functions. */ - -static size_t bits2groups(size_t nbits); - -/******************************************************************************/ - -static size_t -bits2groups(size_t nbits) -{ - - return ((nbits >> LG_BITMAP_GROUP_NBITS) + - !!(nbits & BITMAP_GROUP_NBITS_MASK)); -} void bitmap_info_init(bitmap_info_t *binfo, size_t nbits) @@ -31,15 +18,16 @@ bitmap_info_init(bitmap_info_t *binfo, size_t nbits) * that requires only one group. 
*/ binfo->levels[0].group_offset = 0; - group_count = bits2groups(nbits); + group_count = BITMAP_BITS2GROUPS(nbits); for (i = 1; group_count > 1; i++) { assert(i < BITMAP_MAX_LEVELS); binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + group_count; - group_count = bits2groups(group_count); + group_count = BITMAP_BITS2GROUPS(group_count); } binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + group_count; + assert(binfo->levels[i].group_offset <= BITMAP_GROUPS_MAX); binfo->nlevels = i; binfo->nbits = nbits; } diff --git a/test/unit/bitmap.c b/test/unit/bitmap.c index 8086b88..4ea94f8 100644 --- a/test/unit/bitmap.c +++ b/test/unit/bitmap.c @@ -1,17 +1,11 @@ #include "test/jemalloc_test.h" -#if (LG_BITMAP_MAXBITS > 12) -# define MAXBITS 4500 -#else -# define MAXBITS (1U << LG_BITMAP_MAXBITS) -#endif - TEST_BEGIN(test_bitmap_size) { size_t i, prev_size; prev_size = 0; - for (i = 1; i <= MAXBITS; i++) { + for (i = 1; i <= BITMAP_MAXBITS; i++) { size_t size = bitmap_size(i); assert_true(size >= prev_size, "Bitmap size is smaller than expected"); @@ -24,7 +18,7 @@ TEST_BEGIN(test_bitmap_init) { size_t i; - for (i = 1; i <= MAXBITS; i++) { + for (i = 1; i <= BITMAP_MAXBITS; i++) { bitmap_info_t binfo; bitmap_info_init(&binfo, i); { @@ -47,7 +41,7 @@ TEST_BEGIN(test_bitmap_set) { size_t i; - for (i = 1; i <= MAXBITS; i++) { + for (i = 1; i <= BITMAP_MAXBITS; i++) { bitmap_info_t binfo; bitmap_info_init(&binfo, i); { @@ -70,7 +64,7 @@ TEST_BEGIN(test_bitmap_unset) { size_t i; - for (i = 1; i <= MAXBITS; i++) { + for (i = 1; i <= BITMAP_MAXBITS; i++) { bitmap_info_t binfo; bitmap_info_init(&binfo, i); { @@ -99,7 +93,7 @@ TEST_BEGIN(test_bitmap_sfu) { size_t i; - for (i = 1; i <= MAXBITS; i++) { + for (i = 1; i <= BITMAP_MAXBITS; i++) { bitmap_info_t binfo; bitmap_info_init(&binfo, i); { -- cgit v0.12 From 0c5dd03e889d0269170b5db9fa872738d906eb78 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 29 Sep 2014 01:31:39 -0700 Subject: Move small run metadata into the arena chunk header. Move small run metadata into the arena chunk header, with multiple expected benefits: - Lower run fragmentation due to reduced run sizes; runs are more likely to completely drain when there are fewer total regions. - Improved cache behavior. Prior to this change, run headers were always page-aligned, which put extra pressure on some CPU cache sets. The degree to which this was a problem was hardware dependent, but it likely hurt some even for the most advanced modern hardware. - Buffer overruns/underruns are less likely to corrupt allocator metadata. - Size classes between 4 KiB and 16 KiB become reasonable to support without any special handling, and the runs are small enough that dirty unused pages aren't a significant concern. --- include/jemalloc/internal/arena.h | 144 ++++++----- include/jemalloc/internal/private_symbols.txt | 3 + src/arena.c | 347 ++++++++++++-------------- 3 files changed, 233 insertions(+), 261 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index f1a1205..48fd205 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -1,30 +1,8 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES -/* - * RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized - * as small as possible such that this setting is still honored, without - * violating other constraints. 
The goal is to make runs as small as possible - * without exceeding a per run external fragmentation threshold. - * - * We use binary fixed point math for overhead computations, where the binary - * point is implicitly RUN_BFP bits to the left. - * - * Note that it is possible to set RUN_MAX_OVRHD low enough that it cannot be - * honored for some/all object sizes, since when heap profiling is enabled - * there is one pointer of header overhead per object (plus a constant). This - * constraint is relaxed (ignored) for runs that are so small that the - * per-region overhead is greater than: - * - * (RUN_MAX_OVRHD / (reg_interval << (3+RUN_BFP)) - */ -#define RUN_BFP 12 -/* \/ Implicit binary fixed point. */ -#define RUN_MAX_OVRHD 0x0000003dU -#define RUN_MAX_OVRHD_RELAX 0x00001800U - /* Maximum number of regions in one run. */ -#define LG_RUN_MAXREGS 11 +#define LG_RUN_MAXREGS (LG_PAGE - LG_TINY_MIN) #define RUN_MAXREGS (1U << LG_RUN_MAXREGS) /* @@ -43,10 +21,10 @@ */ #define LG_DIRTY_MULT_DEFAULT 3 +typedef struct arena_run_s arena_run_t; typedef struct arena_chunk_map_bits_s arena_chunk_map_bits_t; typedef struct arena_chunk_map_misc_s arena_chunk_map_misc_t; typedef struct arena_chunk_s arena_chunk_t; -typedef struct arena_run_s arena_run_t; typedef struct arena_bin_info_s arena_bin_info_t; typedef struct arena_bin_s arena_bin_t; typedef struct arena_s arena_t; @@ -55,6 +33,20 @@ typedef struct arena_s arena_t; /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS +struct arena_run_s { + /* Bin this run is associated with. */ + arena_bin_t *bin; + + /* Index of next region that has never been allocated, or nregs. */ + uint32_t nextind; + + /* Number of free regions in run. */ + unsigned nfree; + + /* Per region allocated/deallocated bitmap. */ + bitmap_t bitmap[BITMAP_GROUPS_MAX]; +}; + /* Each element of the chunk map corresponds to one page within the chunk. */ struct arena_chunk_map_bits_s { /* @@ -130,15 +122,6 @@ struct arena_chunk_map_bits_s { * chunk header in order to improve cache locality. */ struct arena_chunk_map_misc_s { -#ifndef JEMALLOC_PROF - /* - * Overlay prof_tctx in order to allow it to be referenced by dead code. - * Such antics aren't warranted for per arena data structures, but - * chunk map overhead accounts for a percentage of memory, rather than - * being just a fixed cost. - */ - union { -#endif /* * Linkage for run trees. There are two disjoint uses: * @@ -146,16 +129,18 @@ struct arena_chunk_map_misc_s { * 2) arena_run_t conceptually uses this linkage for in-use non-full * runs, rather than directly embedding linkage. */ - rb_node(arena_chunk_map_misc_t) rb_link; + rb_node(arena_chunk_map_misc_t) rb_link; - /* Profile counters, used for large object runs. */ - prof_tctx_t *prof_tctx; -#ifndef JEMALLOC_PROF - }; /* union { ... }; */ -#endif + union { + /* Linkage for list of dirty runs. */ + ql_elm(arena_chunk_map_misc_t) dr_link; - /* Linkage for list of dirty runs. */ - ql_elm(arena_chunk_map_misc_t) dr_link; + /* Profile counters, used for large object runs. */ + prof_tctx_t *prof_tctx; + + /* Small region run metadata. */ + arena_run_t run; + }; }; typedef rb_tree(arena_chunk_map_misc_t) arena_avail_tree_t; typedef rb_tree(arena_chunk_map_misc_t) arena_run_tree_t; @@ -175,17 +160,6 @@ struct arena_chunk_s { arena_chunk_map_bits_t map_bits[1]; /* Dynamically sized. */ }; -struct arena_run_s { - /* Bin this run is associated with. 
*/ - arena_bin_t *bin; - - /* Index of next region that has never been allocated, or nregs. */ - uint32_t nextind; - - /* Number of free regions in run. */ - unsigned nfree; -}; - /* * Read-only information associated with each element of arena_t's bins array * is stored separately, partly to reduce memory usage (only one copy, rather @@ -194,10 +168,7 @@ struct arena_run_s { * Each run has the following layout: * * /--------------------\ - * | arena_run_t header | - * | ... | - * bitmap_offset | bitmap | - * | ... | + * | pad? | * |--------------------| * | redzone | * reg0_offset | region 0 | @@ -239,12 +210,6 @@ struct arena_bin_info_s { uint32_t nregs; /* - * Offset of first bitmap_t element in a run header for this bin's size - * class. - */ - uint32_t bitmap_offset; - - /* * Metadata used to manipulate bitmaps for runs associated with this * bin. */ @@ -451,6 +416,9 @@ arena_chunk_map_bits_t *arena_bitselm_get(arena_chunk_t *chunk, size_t pageind); arena_chunk_map_misc_t *arena_miscelm_get(arena_chunk_t *chunk, size_t pageind); +size_t arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm); +void *arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm); +arena_chunk_map_misc_t *arena_run_to_miscelm(arena_run_t *run); size_t *arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbitsp_read(size_t *mapbitsp); size_t arena_mapbits_get(arena_chunk_t *chunk, size_t pageind); @@ -659,6 +627,40 @@ arena_miscelm_get(arena_chunk_t *chunk, size_t pageind) (uintptr_t)map_misc_offset) + pageind-map_bias); } +JEMALLOC_ALWAYS_INLINE size_t +arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm) +{ + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + size_t pageind = ((uintptr_t)miscelm - ((uintptr_t)chunk + + map_misc_offset)) / sizeof(arena_chunk_map_misc_t) + map_bias; + + assert(pageind >= map_bias); + assert(pageind < chunk_npages); + + return (pageind); +} + +JEMALLOC_ALWAYS_INLINE void * +arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm) +{ + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); + + return ((void *)((uintptr_t)chunk + (pageind << LG_PAGE))); +} + +JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t * +arena_run_to_miscelm(arena_run_t *run) +{ + arena_chunk_map_misc_t *miscelm = (arena_chunk_map_misc_t + *)((uintptr_t)run - offsetof(arena_chunk_map_misc_t, run)); + + assert(arena_miscelm_to_pageind(miscelm) >= map_bias); + assert(arena_miscelm_to_pageind(miscelm) < chunk_npages); + + return (miscelm); +} + JEMALLOC_ALWAYS_INLINE size_t * arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind) { @@ -903,10 +905,13 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) arena_t *arena; size_t pageind; size_t actual_mapbits; + size_t rpages_ind; arena_run_t *run; arena_bin_t *bin; size_t actual_binind; arena_bin_info_t *bin_info; + arena_chunk_map_misc_t *miscelm; + void *rpages; assert(binind != BININD_INVALID); assert(binind < NBINS); @@ -917,13 +922,16 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) assert(mapbits == actual_mapbits); assert(arena_mapbits_large_get(chunk, pageind) == 0); assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - - (actual_mapbits >> LG_PAGE)) << LG_PAGE)); + rpages_ind = pageind - arena_mapbits_small_runind_get(chunk, + pageind); + miscelm = arena_miscelm_get(chunk, rpages_ind); + run = &miscelm->run; bin = run->bin; actual_binind = bin - 
arena->bins; assert(binind == actual_binind); bin_info = &arena_bin_info[actual_binind]; - assert(((uintptr_t)ptr - ((uintptr_t)run + + rpages = arena_miscelm_to_rpages(miscelm); + assert(((uintptr_t)ptr - ((uintptr_t)rpages + (uintptr_t)bin_info->reg0_offset)) % bin_info->reg_interval == 0); } @@ -946,19 +954,21 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) { unsigned shift, diff, regind; size_t interval; + arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run); + void *rpages = arena_miscelm_to_rpages(miscelm); /* * Freeing a pointer lower than region zero can cause assertion * failure. */ - assert((uintptr_t)ptr >= (uintptr_t)run + + assert((uintptr_t)ptr >= (uintptr_t)rpages + (uintptr_t)bin_info->reg0_offset); /* * Avoid doing division with a variable divisor if possible. Using * actual division here can reduce allocator throughput by over 20%! */ - diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - + diff = (unsigned)((uintptr_t)ptr - (uintptr_t)rpages - bin_info->reg0_offset); /* Rescale (factor powers of 2 out of the numerator and denominator). */ diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 84d48d1..5ac82f5 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -42,6 +42,8 @@ arena_mapbitsp_read arena_mapbitsp_write arena_maxclass arena_miscelm_get +arena_miscelm_to_pageind +arena_miscelm_to_rpages arena_new arena_palloc arena_postfork_child @@ -61,6 +63,7 @@ arena_ralloc_junk_large arena_ralloc_no_move arena_redzone_corruption arena_run_regind +arena_run_to_miscelm arena_salloc arena_sdalloc arena_stats_merge diff --git a/src/arena.c b/src/arena.c index 40da9f4..ef391b1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -61,15 +61,6 @@ static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, /******************************************************************************/ JEMALLOC_INLINE_C size_t -arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm) -{ - size_t offset = CHUNK_ADDR2OFFSET(miscelm); - - return ((offset - map_misc_offset) / sizeof(arena_chunk_map_misc_t) + - map_bias); -} - -JEMALLOC_INLINE_C size_t arena_miscelm_to_bits(arena_chunk_map_misc_t *miscelm) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(miscelm); @@ -183,14 +174,16 @@ arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { void *ret; unsigned regind; - bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + - (uintptr_t)bin_info->bitmap_offset); + arena_chunk_map_misc_t *miscelm; + void *rpages; assert(run->nfree > 0); - assert(bitmap_full(bitmap, &bin_info->bitmap_info) == false); + assert(bitmap_full(run->bitmap, &bin_info->bitmap_info) == false); - regind = bitmap_sfu(bitmap, &bin_info->bitmap_info); - ret = (void *)((uintptr_t)run + (uintptr_t)bin_info->reg0_offset + + regind = bitmap_sfu(run->bitmap, &bin_info->bitmap_info); + miscelm = arena_run_to_miscelm(run); + rpages = arena_miscelm_to_rpages(miscelm); + ret = (void *)((uintptr_t)rpages + (uintptr_t)bin_info->reg0_offset + (uintptr_t)(bin_info->reg_interval * regind)); run->nfree--; if (regind == run->nextind) @@ -208,20 +201,20 @@ arena_run_reg_dalloc(arena_run_t *run, void *ptr) size_t binind = arena_ptr_small_binind_get(ptr, mapbits); arena_bin_info_t *bin_info = &arena_bin_info[binind]; unsigned regind = arena_run_regind(run, bin_info, ptr); - bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + - (uintptr_t)bin_info->bitmap_offset); assert(run->nfree < 
bin_info->nregs); /* Freeing an interior pointer can cause assertion failure. */ - assert(((uintptr_t)ptr - ((uintptr_t)run + + assert(((uintptr_t)ptr - + ((uintptr_t)arena_miscelm_to_rpages(arena_run_to_miscelm(run)) + (uintptr_t)bin_info->reg0_offset)) % (uintptr_t)bin_info->reg_interval == 0); - assert((uintptr_t)ptr >= (uintptr_t)run + + assert((uintptr_t)ptr >= + (uintptr_t)arena_miscelm_to_rpages(arena_run_to_miscelm(run)) + (uintptr_t)bin_info->reg0_offset); /* Freeing an unallocated pointer can cause assertion failure. */ - assert(bitmap_get(bitmap, &bin_info->bitmap_info, regind)); + assert(bitmap_get(run->bitmap, &bin_info->bitmap_info, regind)); - bitmap_unset(bitmap, &bin_info->bitmap_info, regind); + bitmap_unset(run->bitmap, &bin_info->bitmap_info, regind); run->nfree++; } @@ -316,10 +309,12 @@ arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, bool remove, bool zero) { arena_chunk_t *chunk; + arena_chunk_map_misc_t *miscelm; size_t flag_dirty, run_ind, need_pages, i; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); + miscelm = arena_run_to_miscelm(run); + run_ind = arena_miscelm_to_pageind(miscelm); flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); need_pages = (size >> LG_PAGE); assert(need_pages > 0); @@ -383,12 +378,14 @@ arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, size_t binind) { arena_chunk_t *chunk; + arena_chunk_map_misc_t *miscelm; size_t flag_dirty, run_ind, need_pages, i; assert(binind != BININD_INVALID); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); + miscelm = arena_run_to_miscelm(run); + run_ind = arena_miscelm_to_pageind(miscelm); flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); need_pages = (size >> LG_PAGE); assert(need_pages > 0); @@ -401,11 +398,6 @@ arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, * clean pages. */ arena_mapbits_small_set(chunk, run_ind, 0, binind, flag_dirty); - /* - * The first page will always be dirtied during small run - * initialization, so a validation failure here would not actually - * cause an observable failure. 
- */ if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk, run_ind) == 0) arena_run_page_validate_zeroed(chunk, run_ind); @@ -643,19 +635,14 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) static arena_run_t * arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) { - arena_run_t *run; arena_chunk_map_misc_t *miscelm; arena_chunk_map_misc_t *key; key = (arena_chunk_map_misc_t *)(size | CHUNK_MAP_KEY); miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (miscelm != NULL) { - arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(miscelm); - size_t pageind = arena_miscelm_to_pageind(miscelm); - - run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << - LG_PAGE)); - arena_run_split_large(arena, run, size, zero); + arena_run_t *run = &miscelm->run; + arena_run_split_large(arena, &miscelm->run, size, zero); return (run); } @@ -681,7 +668,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) */ chunk = arena_chunk_alloc(arena); if (chunk != NULL) { - run = (arena_run_t *)((uintptr_t)chunk + (map_bias << LG_PAGE)); + run = &arena_miscelm_get(chunk, map_bias)->run; arena_run_split_large(arena, run, size, zero); return (run); } @@ -704,11 +691,7 @@ arena_run_alloc_small_helper(arena_t *arena, size_t size, size_t binind) key = (arena_chunk_map_misc_t *)(size | CHUNK_MAP_KEY); miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (miscelm != NULL) { - arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(miscelm); - size_t pageind = arena_miscelm_to_pageind(miscelm); - - run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << - LG_PAGE)); + run = &miscelm->run; arena_run_split_small(arena, run, size, binind); return (run); } @@ -736,7 +719,7 @@ arena_run_alloc_small(arena_t *arena, size_t size, size_t binind) */ chunk = arena_chunk_alloc(arena); if (chunk != NULL) { - run = (arena_run_t *)((uintptr_t)chunk + (map_bias << LG_PAGE)); + run = &arena_miscelm_get(chunk, map_bias)->run; arena_run_split_small(arena, run, size, binind); return (run); } @@ -825,8 +808,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, size_t run_size = arena_mapbits_unallocated_size_get(chunk, pageind); size_t npages = run_size >> LG_PAGE; - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)(pageind << LG_PAGE)); + arena_run_t *run = &miscelm->run; assert(pageind + npages <= chunk_npages); assert(arena_mapbits_dirty_get(chunk, pageind) == @@ -919,11 +901,7 @@ arena_unstash_purged(arena_t *arena, arena_chunk_miscelms_t *miscelms) /* Deallocate runs. 
*/ for (miscelm = ql_first(miscelms); miscelm != NULL; miscelm = ql_first(miscelms)) { - arena_chunk_t *chunk = - (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); - size_t pageind = arena_miscelm_to_pageind(miscelm); - arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + - (uintptr_t)(pageind << LG_PAGE)); + arena_run_t *run = &miscelm->run; ql_remove(miscelms, miscelm, dr_link); arena_run_dalloc(arena, run, false, true); } @@ -1042,10 +1020,12 @@ static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) { arena_chunk_t *chunk; + arena_chunk_map_misc_t *miscelm; size_t size, run_ind, run_pages, flag_dirty; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); + miscelm = arena_run_to_miscelm(run); + run_ind = arena_miscelm_to_pageind(miscelm); assert(run_ind >= map_bias); assert(run_ind < chunk_npages); if (arena_mapbits_large_get(chunk, run_ind) != 0) { @@ -1086,8 +1066,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_mapbits_unzeroed_get(chunk, run_ind+run_pages-1)); } - arena_run_coalesce(arena, chunk, &size, &run_ind, &run_pages, - flag_dirty); + arena_run_coalesce(arena, chunk, &size, &run_ind, &run_pages, flag_dirty); /* Insert into runs_avail, now that coalescing is complete. */ assert(arena_mapbits_unallocated_size_get(chunk, run_ind) == @@ -1121,7 +1100,8 @@ static void arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize) { - size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE; + arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run); + size_t pageind = arena_miscelm_to_pageind(miscelm); size_t head_npages = (oldsize - newsize) >> LG_PAGE; size_t flag_dirty = arena_mapbits_dirty_get(chunk, pageind); @@ -1153,9 +1133,12 @@ static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize, bool dirty) { - size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE; + arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run); + size_t pageind = arena_miscelm_to_pageind(miscelm); size_t head_npages = newsize >> LG_PAGE; size_t flag_dirty = arena_mapbits_dirty_get(chunk, pageind); + arena_chunk_map_misc_t *tail_miscelm; + arena_run_t *tail_run; assert(oldsize > newsize); @@ -1178,26 +1161,17 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_mapbits_large_set(chunk, pageind+head_npages, oldsize-newsize, flag_dirty); - arena_run_dalloc(arena, (arena_run_t *)((uintptr_t)run + newsize), - dirty, false); + tail_miscelm = arena_miscelm_get(chunk, pageind + head_npages); + tail_run = &tail_miscelm->run; + arena_run_dalloc(arena, tail_run, dirty, false); } static arena_run_t * arena_bin_runs_first(arena_bin_t *bin) { arena_chunk_map_misc_t *miscelm = arena_run_tree_first(&bin->runs); - if (miscelm != NULL) { - arena_chunk_t *chunk; - size_t pageind; - arena_run_t *run; - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); - pageind = arena_miscelm_to_pageind(miscelm); - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - - arena_mapbits_small_runind_get(chunk, pageind)) << - LG_PAGE)); - return (run); - } + if (miscelm != NULL) + return (&miscelm->run); return (NULL); } @@ -1205,9 +1179,7 @@ arena_bin_runs_first(arena_bin_t *bin) static void arena_bin_runs_insert(arena_bin_t *bin, arena_run_t *run) { - arena_chunk_t *chunk = CHUNK_ADDR2BASE(run); - size_t pageind = ((uintptr_t)run - 
(uintptr_t)chunk) >> LG_PAGE; - arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); + arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run); assert(arena_run_tree_search(&bin->runs, miscelm) == NULL); @@ -1217,9 +1189,7 @@ arena_bin_runs_insert(arena_bin_t *bin, arena_run_t *run) static void arena_bin_runs_remove(arena_bin_t *bin, arena_run_t *run) { - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - size_t pageind = ((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE; - arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); + arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run); assert(arena_run_tree_search(&bin->runs, miscelm) != NULL); @@ -1260,14 +1230,11 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) malloc_mutex_lock(&arena->lock); run = arena_run_alloc_small(arena, bin_info->run_size, binind); if (run != NULL) { - bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + - (uintptr_t)bin_info->bitmap_offset); - /* Initialize run internals. */ run->bin = bin; run->nextind = 0; run->nfree = bin_info->nregs; - bitmap_init(bitmap, &bin_info->bitmap_info); + bitmap_init(run->bitmap, &bin_info->bitmap_info); } malloc_mutex_unlock(&arena->lock); /********************************/ @@ -1542,16 +1509,20 @@ void * arena_malloc_large(arena_t *arena, size_t size, bool zero) { void *ret; + arena_run_t *run; + arena_chunk_map_misc_t *miscelm; UNUSED bool idump; /* Large allocation. */ size = PAGE_CEILING(size); malloc_mutex_lock(&arena->lock); - ret = (void *)arena_run_alloc_large(arena, size, zero); - if (ret == NULL) { + run = arena_run_alloc_large(arena, size, zero); + if (run == NULL) { malloc_mutex_unlock(&arena->lock); return (NULL); } + miscelm = arena_run_to_miscelm(run); + ret = arena_miscelm_to_rpages(miscelm); if (config_stats) { arena->stats.nmalloc_large++; arena->stats.nrequests_large++; @@ -1586,6 +1557,8 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) size_t alloc_size, leadsize, trailsize; arena_run_t *run; arena_chunk_t *chunk; + arena_chunk_map_misc_t *miscelm; + void *rpages; assert((size & PAGE_MASK) == 0); @@ -1599,21 +1572,31 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) return (NULL); } chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + miscelm = arena_run_to_miscelm(run); + rpages = arena_miscelm_to_rpages(miscelm); - leadsize = ALIGNMENT_CEILING((uintptr_t)run, alignment) - - (uintptr_t)run; + leadsize = ALIGNMENT_CEILING((uintptr_t)rpages, alignment) - + (uintptr_t)rpages; assert(alloc_size >= leadsize + size); trailsize = alloc_size - leadsize - size; - ret = (void *)((uintptr_t)run + leadsize); if (leadsize != 0) { - arena_run_trim_head(arena, chunk, run, alloc_size, alloc_size - - leadsize); + arena_chunk_map_misc_t *head_miscelm = miscelm; + arena_run_t *head_run = run; + + miscelm = arena_miscelm_get(chunk, + arena_miscelm_to_pageind(head_miscelm) + (leadsize >> + LG_PAGE)); + run = &miscelm->run; + + arena_run_trim_head(arena, chunk, head_run, alloc_size, + alloc_size - leadsize); } if (trailsize != 0) { - arena_run_trim_tail(arena, chunk, ret, size + trailsize, size, + arena_run_trim_tail(arena, chunk, run, size + trailsize, size, false); } - arena_run_init_large(arena, (arena_run_t *)ret, size, zero); + arena_run_init_large(arena, run, size, zero); + ret = arena_miscelm_to_rpages(miscelm); if (config_stats) { arena->stats.nmalloc_large++; @@ -1687,10 +1670,12 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t binind; 
arena_bin_info_t *bin_info; size_t npages, run_ind, past; + arena_chunk_map_misc_t *miscelm; + void *rpages; assert(run != bin->runcur); - assert(arena_run_tree_search(&bin->runs, arena_miscelm_get(chunk, - ((uintptr_t)run-(uintptr_t)chunk)>>LG_PAGE)) == NULL); + assert(arena_run_tree_search(&bin->runs, arena_run_to_miscelm(run)) == + NULL); binind = arena_bin_index(chunk->arena, run->bin); bin_info = &arena_bin_info[binind]; @@ -1698,8 +1683,10 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, malloc_mutex_unlock(&bin->lock); /******************************/ npages = bin_info->run_size >> LG_PAGE; - run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); - past = (size_t)(PAGE_CEILING((uintptr_t)run + + miscelm = arena_run_to_miscelm(run); + run_ind = arena_miscelm_to_pageind(miscelm); + rpages = arena_miscelm_to_rpages(miscelm); + past = (size_t)(PAGE_CEILING((uintptr_t)rpages + (uintptr_t)bin_info->reg0_offset + (uintptr_t)(run->nextind * bin_info->reg_interval - bin_info->redzone_size) - (uintptr_t)chunk) >> LG_PAGE); @@ -1716,13 +1703,18 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, npages) { /* Trim clean pages. Convert to large run beforehand. */ assert(npages > 0); - arena_mapbits_large_set(chunk, run_ind, bin_info->run_size, 0); - arena_mapbits_large_set(chunk, run_ind+npages-1, 0, 0); - arena_run_trim_tail(arena, chunk, run, (npages << LG_PAGE), - ((past - run_ind) << LG_PAGE), false); + if (past > run_ind) { + arena_mapbits_large_set(chunk, run_ind, + bin_info->run_size, 0); + arena_mapbits_large_set(chunk, run_ind+npages-1, 0, 0); + arena_run_trim_tail(arena, chunk, run, (npages << + LG_PAGE), ((past - run_ind) << LG_PAGE), false); + arena_run_dalloc(arena, run, true, false); + } else + arena_run_dalloc(arena, run, false, false); /* npages = past - run_ind; */ - } - arena_run_dalloc(arena, run, true, false); + } else + arena_run_dalloc(arena, run, true, false); malloc_mutex_unlock(&arena->lock); /****************************/ malloc_mutex_lock(&bin->lock); @@ -1755,15 +1747,15 @@ void arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_chunk_map_bits_t *bitselm) { - size_t pageind; + size_t pageind, rpages_ind; arena_run_t *run; arena_bin_t *bin; arena_bin_info_t *bin_info; size_t size, binind; pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - - arena_mapbits_small_runind_get(chunk, pageind)) << LG_PAGE)); + rpages_ind = pageind - arena_mapbits_small_runind_get(chunk, pageind); + run = &arena_miscelm_get(chunk, rpages_ind)->run; bin = run->bin; binind = arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, pageind)); @@ -1793,9 +1785,10 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, { arena_run_t *run; arena_bin_t *bin; + size_t rpages_ind; - run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - - arena_mapbits_small_runind_get(chunk, pageind)) << LG_PAGE)); + rpages_ind = pageind - arena_mapbits_small_runind_get(chunk, pageind); + run = &arena_miscelm_get(chunk, rpages_ind)->run; bin = run->bin; malloc_mutex_lock(&bin->lock); arena_dalloc_bin_locked(arena, chunk, ptr, bitselm); @@ -1838,9 +1831,11 @@ arena_dalloc_junk_large_t *arena_dalloc_junk_large = void arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr) { + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); 
+ arena_run_t *run = &miscelm->run; if (config_fill || config_stats) { - size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; size_t usize = arena_mapbits_large_size_get(chunk, pageind); arena_dalloc_junk_large(ptr, usize); @@ -1852,7 +1847,7 @@ arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr) } } - arena_run_dalloc(arena, (arena_run_t *)ptr, true, false); + arena_run_dalloc(arena, run, true, false); } void @@ -1868,6 +1863,9 @@ static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t oldsize, size_t size) { + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); + arena_run_t *run = &miscelm->run; assert(size < oldsize); @@ -1876,8 +1874,7 @@ arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, * allocations. */ malloc_mutex_lock(&arena->lock); - arena_run_trim_tail(arena, chunk, (arena_run_t *)ptr, oldsize, size, - true); + arena_run_trim_tail(arena, chunk, run, oldsize, size, true); if (config_stats) { arena->stats.ndalloc_large++; arena->stats.allocated_large -= oldsize; @@ -1919,8 +1916,9 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t flag_dirty; size_t splitsize = (oldsize + followsize <= size + extra) ? followsize : size + extra - oldsize; - arena_run_split_large(arena, (arena_run_t *)((uintptr_t)chunk + - ((pageind+npages) << LG_PAGE)), splitsize, zero); + arena_run_t *run = &arena_miscelm_get(chunk, + pageind+npages)->run; + arena_run_split_large(arena, run, splitsize, zero); size = oldsize + splitsize; npages = size >> LG_PAGE; @@ -2249,26 +2247,18 @@ arena_new(arena_t *arena, unsigned ind) /* * Calculate bin_info->run_size such that it meets the following constraints: * - * *) bin_info->run_size >= min_run_size * *) bin_info->run_size <= arena_maxclass - * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). * *) bin_info->nregs <= RUN_MAXREGS * - * bin_info->nregs, bin_info->bitmap_offset, and bin_info->reg0_offset are also - * calculated here, since these settings are all interdependent. + * bin_info->nregs and bin_info->reg0_offset are also calculated here, since + * these settings are all interdependent. */ -static size_t -bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) +static void +bin_info_run_size_calc(arena_bin_info_t *bin_info) { size_t pad_size; - size_t try_run_size, good_run_size; - uint32_t try_nregs, good_nregs; - uint32_t try_hdr_size, good_hdr_size; - uint32_t try_bitmap_offset, good_bitmap_offset; - uint32_t try_redzone0_offset, good_redzone0_offset; - - assert(min_run_size >= PAGE); - assert(min_run_size <= arena_maxclass); + size_t try_run_size, perfect_run_size, actual_run_size; + uint32_t try_nregs, perfect_nregs, actual_nregs; /* * Determine redzone size based on minimum alignment and minimum @@ -2295,96 +2285,66 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) (bin_info->redzone_size << 1); /* - * Calculate known-valid settings before entering the run_size - * expansion loop, so that the first part of the loop always copies - * valid settings. - * - * The do..while loop iteratively reduces the number of regions until - * the run header and the regions no longer overlap. A closed formula - * would be quite messy, since there is an interdependency between the - * header's mask length and the number of regions. 
+ * Compute run size under ideal conditions (no redzones, no limit on run + * size). */ - try_run_size = min_run_size; - try_nregs = ((try_run_size - sizeof(arena_run_t)) / - bin_info->reg_interval) - + 1; /* Counter-act try_nregs-- in loop. */ - if (try_nregs > RUN_MAXREGS) { - try_nregs = RUN_MAXREGS - + 1; /* Counter-act try_nregs-- in loop. */ - } + try_run_size = PAGE; + try_nregs = try_run_size / bin_info->reg_size; do { - try_nregs--; - try_hdr_size = sizeof(arena_run_t); - /* Pad to a long boundary. */ - try_hdr_size = LONG_CEILING(try_hdr_size); - try_bitmap_offset = try_hdr_size; - /* Add space for bitmap. */ - try_hdr_size += bitmap_size(try_nregs); - try_redzone0_offset = try_run_size - (try_nregs * - bin_info->reg_interval) - pad_size; - } while (try_hdr_size > try_redzone0_offset); - - /* run_size expansion loop. */ - do { - /* - * Copy valid settings before trying more aggressive settings. - */ - good_run_size = try_run_size; - good_nregs = try_nregs; - good_hdr_size = try_hdr_size; - good_bitmap_offset = try_bitmap_offset; - good_redzone0_offset = try_redzone0_offset; + perfect_run_size = try_run_size; + perfect_nregs = try_nregs; - /* Try more aggressive settings. */ try_run_size += PAGE; - try_nregs = ((try_run_size - sizeof(arena_run_t) - pad_size) / - bin_info->reg_interval) - + 1; /* Counter-act try_nregs-- in loop. */ - if (try_nregs > RUN_MAXREGS) { - try_nregs = RUN_MAXREGS - + 1; /* Counter-act try_nregs-- in loop. */ - } - do { - try_nregs--; - try_hdr_size = sizeof(arena_run_t); - /* Pad to a long boundary. */ - try_hdr_size = LONG_CEILING(try_hdr_size); - try_bitmap_offset = try_hdr_size; - /* Add space for bitmap. */ - try_hdr_size += bitmap_size(try_nregs); - try_redzone0_offset = try_run_size - (try_nregs * - bin_info->reg_interval) - pad_size; - } while (try_hdr_size > try_redzone0_offset); - } while (try_run_size <= arena_maxclass - && RUN_MAX_OVRHD * (bin_info->reg_interval << 3) > - RUN_MAX_OVRHD_RELAX - && (try_redzone0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size - && try_nregs < RUN_MAXREGS); - - assert(good_hdr_size <= good_redzone0_offset); + try_nregs = try_run_size / bin_info->reg_size; + } while (perfect_run_size != perfect_nregs * bin_info->reg_size); + assert(perfect_nregs <= RUN_MAXREGS); + + actual_run_size = perfect_run_size; + actual_nregs = (actual_run_size - pad_size) / bin_info->reg_interval; + + /* + * Redzones can require enough padding that not even a single region can + * fit within the number of pages that would normally be dedicated to a + * run for this size class. Increase the run size until at least one + * region fits. + */ + while (actual_nregs == 0) { + assert(config_fill && unlikely(opt_redzone)); + + actual_run_size += PAGE; + actual_nregs = (actual_run_size - pad_size) / + bin_info->reg_interval; + } + + /* + * Make sure that the run will fit within an arena chunk. + */ + while (actual_run_size > arena_maxclass) { + actual_run_size -= PAGE; + actual_nregs = (actual_run_size - pad_size) / + bin_info->reg_interval; + } + assert(actual_nregs > 0); /* Copy final settings. 
*/ - bin_info->run_size = good_run_size; - bin_info->nregs = good_nregs; - bin_info->bitmap_offset = good_bitmap_offset; - bin_info->reg0_offset = good_redzone0_offset + bin_info->redzone_size; + bin_info->run_size = actual_run_size; + bin_info->nregs = actual_nregs; + bin_info->reg0_offset = actual_run_size - (actual_nregs * + bin_info->reg_interval) - pad_size + bin_info->redzone_size; assert(bin_info->reg0_offset - bin_info->redzone_size + (bin_info->nregs * bin_info->reg_interval) + pad_size == bin_info->run_size); - - return (good_run_size); } static void bin_info_init(void) { arena_bin_info_t *bin_info; - size_t prev_run_size = PAGE; #define BIN_INFO_INIT_bin_yes(index, size) \ bin_info = &arena_bin_info[index]; \ bin_info->reg_size = size; \ - prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size);\ + bin_info_run_size_calc(bin_info); \ bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); #define BIN_INFO_INIT_bin_no(index, size) #define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ @@ -2418,8 +2378,7 @@ arena_boot(void) header_size = offsetof(arena_chunk_t, map_bits) + ((sizeof(arena_chunk_map_bits_t) + sizeof(arena_chunk_map_misc_t)) * (chunk_npages-map_bias)); - map_bias = (header_size >> LG_PAGE) + ((header_size & PAGE_MASK) - != 0); + map_bias = (header_size + PAGE_MASK) >> LG_PAGE; } assert(map_bias > 0); -- cgit v0.12 From e3a16fce5eb0c62a49e751f156d040c9f77fbc23 Mon Sep 17 00:00:00 2001 From: Dave Rigby Date: Wed, 24 Sep 2014 14:19:28 +0100 Subject: Mark malloc_conf as a weak symbol This fixes issue #113 - je_malloc_conf is not respected on OS X --- src/jemalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 4d3b22e..3012f55 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -7,7 +7,7 @@ malloc_tsd_data(, arenas, arena_t *, NULL) /* Runtime configuration options. */ -const char *je_malloc_conf; +const char *je_malloc_conf JEMALLOC_ATTR(weak); bool opt_abort = #ifdef JEMALLOC_DEBUG true -- cgit v0.12 From 4dcf04bfc03b9e9eb50015a8fc8735de28c23090 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 31 Aug 2014 03:57:06 +0000 Subject: correctly detect adaptive mutexes in pthreads PTHREAD_MUTEX_ADAPTIVE_NP is an enum on glibc and not a macro, we must test for their existence by attempting compilation. --- configure.ac | 12 ++++++++++++ include/jemalloc/internal/jemalloc_internal_defs.h.in | 3 +++ include/jemalloc/internal/mutex.h | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index ab4bcc3..1ee2ed8 100644 --- a/configure.ac +++ b/configure.ac @@ -1399,6 +1399,18 @@ if test "x${je_cv_glibc_memalign_hook}" = "xyes" ; then AC_DEFINE([JEMALLOC_GLIBC_MEMALIGN_HOOK], [ ]) fi +JE_COMPILABLE([pthreads adaptive mutexes], [ +#include +], [ + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); + pthread_mutexattr_destroy(&attr); +], [je_cv_pthread_mutex_adaptive_np]) +if test "x${je_cv_pthread_mutex_adaptive_np}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP], [ ]) +fi + dnl ============================================================================ dnl Check for typedefs, structures, and compiler characteristics. 
AC_HEADER_STDBOOL diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 955582e..fd85e5c 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -215,4 +215,7 @@ /* glibc memalign hook */ #undef JEMALLOC_GLIBC_MEMALIGN_HOOK +/* adaptive mutex support in pthreads */ +#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h index de44e14..8a03d82 100644 --- a/include/jemalloc/internal/mutex.h +++ b/include/jemalloc/internal/mutex.h @@ -10,7 +10,7 @@ typedef struct malloc_mutex_s malloc_mutex_t; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) # define MALLOC_MUTEX_INITIALIZER {PTHREAD_MUTEX_INITIALIZER, NULL} #else -# if (defined(PTHREAD_MUTEX_ADAPTIVE_NP) && \ +# if (defined(JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP) && \ defined(PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP)) # define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_ADAPTIVE_NP # define MALLOC_MUTEX_INITIALIZER {PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP} -- cgit v0.12 From f8034540a16a6f4fc7948e4783747ca1e9055823 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Tue, 30 Sep 2014 10:33:46 -0400 Subject: Implement in-place huge allocation shrinking. Trivial example: #include int main(void) { void *ptr = malloc(1024 * 1024 * 8); if (!ptr) return 1; ptr = realloc(ptr, 1024 * 1024 * 4); if (!ptr) return 1; } Before: mmap(NULL, 8388608, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fcfff000000 mmap(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fcffec00000 madvise(0x7fcfff000000, 8388608, MADV_DONTNEED) = 0 After: mmap(NULL, 8388608, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f1934800000 madvise(0x7f1934c00000, 4194304, MADV_DONTNEED) = 0 Closes #134 --- src/huge.c | 89 +++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 27 deletions(-) diff --git a/src/huge.c b/src/huge.c index 2e30ccf..40d1362 100644 --- a/src/huge.c +++ b/src/huge.c @@ -72,21 +72,79 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, return (ret); } +#ifdef JEMALLOC_JET +#undef huge_dalloc_junk +#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl) +#endif +static void +huge_dalloc_junk(void *ptr, size_t usize) +{ + + if (config_fill && have_dss && unlikely(opt_junk)) { + /* + * Only bother junk filling if the chunk isn't about to be + * unmapped. + */ + if (config_munmap == false || (have_dss && chunk_in_dss(ptr))) + memset(ptr, 0x5a, usize); + } +} +#ifdef JEMALLOC_JET +#undef huge_dalloc_junk +#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk) +huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); +#endif + bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) { + /* Both allocations must be huge to avoid a move. */ + if (oldsize <= arena_maxclass) + return (true); + + assert(CHUNK_CEILING(oldsize) == oldsize); + /* * Avoid moving the allocation if the size class can be left the same. */ - if (oldsize > arena_maxclass - && CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size) + if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size) && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { - assert(CHUNK_CEILING(oldsize) == oldsize); return (false); } - /* Reallocation would require a move. */ + /* Overflow. 
*/ + if (CHUNK_CEILING(size) == 0) + return (true); + + /* Shrink the allocation in-place. */ + if (CHUNK_CEILING(oldsize) > CHUNK_CEILING(size)) { + extent_node_t *node, key; + void *excess_addr; + size_t excess_size; + + malloc_mutex_lock(&huge_mtx); + + key.addr = ptr; + node = extent_tree_ad_search(&huge, &key); + assert(node != NULL); + assert(node->addr == ptr); + + /* Update the size of the huge allocation. */ + node->size = CHUNK_CEILING(size); + + malloc_mutex_unlock(&huge_mtx); + + excess_addr = node->addr + CHUNK_CEILING(size); + excess_size = CHUNK_CEILING(oldsize) - CHUNK_CEILING(size); + + /* Zap the excess chunks. */ + huge_dalloc_junk(excess_addr, excess_size); + arena_chunk_dalloc_huge(node->arena, excess_addr, excess_size); + + return (false); + } + return (true); } @@ -134,29 +192,6 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, return (ret); } -#ifdef JEMALLOC_JET -#undef huge_dalloc_junk -#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl) -#endif -static void -huge_dalloc_junk(void *ptr, size_t usize) -{ - - if (config_fill && have_dss && unlikely(opt_junk)) { - /* - * Only bother junk filling if the chunk isn't about to be - * unmapped. - */ - if (config_munmap == false || (have_dss && chunk_in_dss(ptr))) - memset(ptr, 0x5a, usize); - } -} -#ifdef JEMALLOC_JET -#undef huge_dalloc_junk -#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk) -huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); -#endif - void huge_dalloc(void *ptr) { -- cgit v0.12 From cc9e626ea97eb294f337c674685b8b5c9d5524b7 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 1 Oct 2014 17:51:52 -0700 Subject: Refactor permuted backtrace test allocation. Refactor permuted backtrace test allocation that was originally used only by the prof_accum test, so that it can be used by other heap profiling test binaries. 
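For example, a minimal sketch (assuming the btalloc() interface added by this patch) of how a test can obtain one distinct allocation backtrace per call:

#include "test/jemalloc_test.h"

/*
 * Each distinct bits value routes the allocation through a different
 * chain of btalloc_0()/btalloc_1() frames, so a sampling heap profiler
 * records a separate backtrace for each permutation.
 */
static void
alloc_permuted(unsigned nsites)
{
	unsigned i;

	for (i = 0; i < nsites; i++) {
		void *p = btalloc(1, i);

		assert_ptr_not_null(p, "Unexpected btalloc() failure");
		dallocx(p, 0);
	}
}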
--- Makefile.in | 20 +++++++------------- test/include/test/btalloc.h | 31 +++++++++++++++++++++++++++++++ test/include/test/jemalloc_test.h.in | 1 + test/src/btalloc.c | 8 ++++++++ test/src/btalloc_0.c | 3 +++ test/src/btalloc_1.c | 3 +++ test/unit/prof_accum.c | 9 +++++++-- test/unit/prof_accum.h | 35 ----------------------------------- test/unit/prof_accum_a.c | 3 --- test/unit/prof_accum_b.c | 3 --- 10 files changed, 60 insertions(+), 56 deletions(-) create mode 100644 test/include/test/btalloc.h create mode 100644 test/src/btalloc.c create mode 100644 test/src/btalloc_0.c create mode 100644 test/src/btalloc_1.c delete mode 100644 test/unit/prof_accum.h delete mode 100644 test/unit/prof_accum_a.c delete mode 100644 test/unit/prof_accum_b.c diff --git a/Makefile.in b/Makefile.in index 41328b9..5267bea 100644 --- a/Makefile.in +++ b/Makefile.in @@ -107,9 +107,11 @@ DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.html) DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.3) DOCS := $(DOCS_HTML) $(DOCS_MAN3) -C_TESTLIB_SRCS := $(srcroot)test/src/math.c $(srcroot)test/src/mtx.c \ - $(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \ - $(srcroot)test/src/thd.c $(srcroot)test/src/timer.c +C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \ + $(srcroot)test/src/btalloc_1.c $(srcroot)test/src/math.c \ + $(srcroot)test/src/mtx.c $(srcroot)test/src/SFMT.c \ + $(srcroot)test/src/test.c $(srcroot)test/src/thd.c \ + $(srcroot)test/src/timer.c C_UTIL_INTEGRATION_SRCS := $(srcroot)src/util.c TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/bitmap.c \ @@ -123,6 +125,7 @@ TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/prof_accum.c \ $(srcroot)test/unit/prof_gdump.c \ $(srcroot)test/unit/prof_idump.c \ + $(srcroot)test/unit/prof_reset.c \ $(srcroot)test/unit/ql.c \ $(srcroot)test/unit/qr.c \ $(srcroot)test/unit/quarantine.c \ @@ -133,8 +136,6 @@ TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/tsd.c \ $(srcroot)test/unit/util.c \ $(srcroot)test/unit/zero.c -TESTS_UNIT_AUX := $(srcroot)test/unit/prof_accum_a.c \ - $(srcroot)test/unit/prof_accum_b.c TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/allocated.c \ $(srcroot)test/integration/sdallocx.c \ @@ -159,10 +160,9 @@ C_TESTLIB_STRESS_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.stress.$(O)) C_TESTLIB_OBJS := $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(C_TESTLIB_STRESS_OBJS) TESTS_UNIT_OBJS := $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%.$(O)) -TESTS_UNIT_AUX_OBJS := $(TESTS_UNIT_AUX:$(srcroot)%.c=$(objroot)%.$(O)) TESTS_INTEGRATION_OBJS := $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%.$(O)) TESTS_STRESS_OBJS := $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%.$(O)) -TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_UNIT_AUX_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_STRESS_OBJS) +TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_STRESS_OBJS) .PHONY: all dist build_doc_html build_doc_man build_doc .PHONY: install_bin install_include install_lib @@ -211,12 +211,6 @@ $(C_TESTLIB_STRESS_OBJS): $(objroot)test/src/%.stress.$(O): $(srcroot)test/src/% $(C_TESTLIB_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST -DJEMALLOC_STRESS_TESTLIB $(C_TESTLIB_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include $(TESTS_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST -$(TESTS_UNIT_AUX_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST -define 
make-unit-link-dep -$(1): TESTS_UNIT_LINK_OBJS += $(2) -$(1): $(2) -endef -$(foreach test, $(TESTS_UNIT:$(srcroot)test/unit/%.c=$(objroot)test/unit/%$(EXE)), $(eval $(call make-unit-link-dep,$(test),$(filter $(test:%$(EXE)=%_a.$(O)) $(test:%$(EXE)=%_b.$(O)),$(TESTS_UNIT_AUX_OBJS))))) $(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST $(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST $(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c diff --git a/test/include/test/btalloc.h b/test/include/test/btalloc.h new file mode 100644 index 0000000..c3f9d4d --- /dev/null +++ b/test/include/test/btalloc.h @@ -0,0 +1,31 @@ +/* btalloc() provides a mechanism for allocating via permuted backtraces. */ +void *btalloc(size_t size, unsigned bits); + +#define btalloc_n_proto(n) \ +void *btalloc_##n(size_t size, unsigned bits); +btalloc_n_proto(0) +btalloc_n_proto(1) + +#define btalloc_n_gen(n) \ +void * \ +btalloc_##n(size_t size, unsigned bits) \ +{ \ + void *p; \ + \ + if (bits == 0) \ + p = mallocx(size, 0); \ + else { \ + switch (bits & 0x1U) { \ + case 0: \ + p = (btalloc_0(size, bits >> 1)); \ + break; \ + case 1: \ + p = (btalloc_1(size, bits >> 1)); \ + break; \ + default: not_reached(); \ + } \ + } \ + /* Intentionally sabotage tail call optimization. */ \ + assert_ptr_not_null(p, "Unexpected mallocx() failure"); \ + return (p); \ +} diff --git a/test/include/test/jemalloc_test.h.in b/test/include/test/jemalloc_test.h.in index a93c4f6..6018e58 100644 --- a/test/include/test/jemalloc_test.h.in +++ b/test/include/test/jemalloc_test.h.in @@ -133,6 +133,7 @@ /* * Common test utilities. */ +#include "test/btalloc.h" #include "test/math.h" #include "test/mtx.h" #include "test/mq.h" diff --git a/test/src/btalloc.c b/test/src/btalloc.c new file mode 100644 index 0000000..9a253d9 --- /dev/null +++ b/test/src/btalloc.c @@ -0,0 +1,8 @@ +#include "test/jemalloc_test.h" + +void * +btalloc(size_t size, unsigned bits) +{ + + return (btalloc_0(size, bits)); +} diff --git a/test/src/btalloc_0.c b/test/src/btalloc_0.c new file mode 100644 index 0000000..77d8904 --- /dev/null +++ b/test/src/btalloc_0.c @@ -0,0 +1,3 @@ +#include "test/jemalloc_test.h" + +btalloc_n_gen(0) diff --git a/test/src/btalloc_1.c b/test/src/btalloc_1.c new file mode 100644 index 0000000..4c126c3 --- /dev/null +++ b/test/src/btalloc_1.c @@ -0,0 +1,3 @@ +#include "test/jemalloc_test.h" + +btalloc_n_gen(1) diff --git a/test/unit/prof_accum.c b/test/unit/prof_accum.c index 050a8a7..fd229e0 100644 --- a/test/unit/prof_accum.c +++ b/test/unit/prof_accum.c @@ -1,4 +1,9 @@ -#include "prof_accum.h" +#include "test/jemalloc_test.h" + +#define NTHREADS 4 +#define NALLOCS_PER_THREAD 50 +#define DUMP_INTERVAL 1 +#define BT_COUNT_CHECK_INTERVAL 5 #ifdef JEMALLOC_PROF const char *malloc_conf = @@ -20,7 +25,7 @@ static void * alloc_from_permuted_backtrace(unsigned thd_ind, unsigned iteration) { - return (alloc_0(thd_ind*NALLOCS_PER_THREAD + iteration)); + return (btalloc(1, thd_ind*NALLOCS_PER_THREAD + iteration)); } static void * diff --git a/test/unit/prof_accum.h b/test/unit/prof_accum.h deleted file mode 100644 index 109d86b..0000000 --- a/test/unit/prof_accum.h +++ /dev/null @@ -1,35 +0,0 @@ -#include "test/jemalloc_test.h" - -#define NTHREADS 4 -#define NALLOCS_PER_THREAD 50 -#define DUMP_INTERVAL 1 -#define BT_COUNT_CHECK_INTERVAL 5 - -#define alloc_n_proto(n) \ -void *alloc_##n(unsigned bits); -alloc_n_proto(0) -alloc_n_proto(1) - -#define alloc_n_gen(n) \ -void * \ -alloc_##n(unsigned bits) \ -{ \ - void *p; \ - \ - if 
(bits == 0) \ - p = mallocx(1, 0); \ - else { \ - switch (bits & 0x1U) { \ - case 0: \ - p = (alloc_0(bits >> 1)); \ - break; \ - case 1: \ - p = (alloc_1(bits >> 1)); \ - break; \ - default: not_reached(); \ - } \ - } \ - /* Intentionally sabotage tail call optimization. */ \ - assert_ptr_not_null(p, "Unexpected mallocx() failure"); \ - return (p); \ -} diff --git a/test/unit/prof_accum_a.c b/test/unit/prof_accum_a.c deleted file mode 100644 index 42ad521..0000000 --- a/test/unit/prof_accum_a.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "prof_accum.h" - -alloc_n_gen(0) diff --git a/test/unit/prof_accum_b.c b/test/unit/prof_accum_b.c deleted file mode 100644 index 60d9dab..0000000 --- a/test/unit/prof_accum_b.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "prof_accum.h" - -alloc_n_gen(1) -- cgit v0.12 From 20c31deaae38ed9aa4fe169ed65e0c45cd542955 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 2 Oct 2014 23:01:10 -0700 Subject: Test prof.reset mallctl and fix numerous discovered bugs. --- doc/jemalloc.xml.in | 5 +- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/internal/prof.h | 24 +-- src/prof.c | 213 ++++++++++++++++------- test/unit/prof_reset.c | 238 ++++++++++++++++++++++++++ 5 files changed, 405 insertions(+), 76 deletions(-) create mode 100644 test/unit/prof_reset.c diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index e5c229f..b586e69 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1547,7 +1547,8 @@ malloc_conf = "xmalloc:true";]]> Reset all memory profile statistics, and optionally update the sample rate (see opt.lg_prof_sample). + linkend="opt.lg_prof_sample">opt.lg_prof_sample + and prof.lg_sample). @@ -1558,7 +1559,7 @@ malloc_conf = "xmalloc:true";]]> r- [] - Get the sample rate (see Get the current sample rate (see opt.lg_prof_sample). diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 5ac82f5..33f8ce0 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -292,6 +292,7 @@ prof_boot0 prof_boot1 prof_boot2 prof_bt_count +prof_dump_header prof_dump_open prof_free prof_free_sampled_object diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 3872c7a..91c871d 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -151,22 +151,23 @@ struct prof_gctx_s { }; typedef rb_tree(prof_gctx_t) prof_gctx_tree_t; -typedef enum { - prof_tdata_state_attached, /* Active thread attached, data valid. */ - prof_tdata_state_detached, /* Defunct thread, data remain valid. */ - prof_tdata_state_expired /* Predates reset, omit data from dump. */ -} prof_tdata_state_t; - struct prof_tdata_s { malloc_mutex_t *lock; /* Monotonically increasing unique thread identifier. */ uint64_t thr_uid; + /* + * Monotonically increasing discriminator among tdata structures + * associated with the same thr_uid. + */ + uint64_t thr_discrim; + /* Included in heap profile dumps if non-NULL. 
*/ char *thread_name; - prof_tdata_state_t state; + bool attached; + bool expired; rb_node(prof_tdata_t) tdata_link; @@ -257,9 +258,13 @@ void bt_init(prof_bt_t *bt, void **vec); void prof_backtrace(prof_bt_t *bt); prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt); #ifdef JEMALLOC_JET +size_t prof_tdata_count(void); size_t prof_bt_count(void); +const prof_cnt_t *prof_cnt_all(void); typedef int (prof_dump_open_t)(bool, const char *); extern prof_dump_open_t *prof_dump_open; +typedef bool (prof_dump_header_t)(bool, const prof_cnt_t *); +extern prof_dump_header_t *prof_dump_header; #endif void prof_idump(void); bool prof_mdump(const char *filename); @@ -312,12 +317,11 @@ prof_tdata_get(tsd_t *tsd, bool create) if (unlikely(tdata == NULL)) { tdata = prof_tdata_init(tsd); tsd_prof_tdata_set(tsd, tdata); - } else if (unlikely(tdata->state == prof_tdata_state_expired)) { + } else if (unlikely(tdata->expired)) { tdata = prof_tdata_reinit(tsd, tdata); tsd_prof_tdata_set(tsd, tdata); } - assert(tdata == NULL || - tdata->state == prof_tdata_state_attached); + assert(tdata == NULL || tdata->attached); } return (tdata); diff --git a/src/prof.c b/src/prof.c index 9f10b53..0a96d85 100644 --- a/src/prof.c +++ b/src/prof.c @@ -137,10 +137,18 @@ rb_gen(static UNUSED, gctx_tree_, prof_gctx_tree_t, prof_gctx_t, dump_link, JEMALLOC_INLINE_C int prof_tdata_comp(const prof_tdata_t *a, const prof_tdata_t *b) { + int ret; uint64_t a_uid = a->thr_uid; uint64_t b_uid = b->thr_uid; - return ((a_uid > b_uid) - (a_uid < b_uid)); + ret = ((a_uid > b_uid) - (a_uid < b_uid)); + if (ret == 0) { + uint64_t a_discrim = a->thr_discrim; + uint64_t b_discrim = b->thr_discrim; + + ret = ((a_discrim > b_discrim) - (a_discrim < b_discrim)); + } + return (ret); } rb_gen(static UNUSED, tdata_tree_, prof_tdata_tree_t, prof_tdata_t, tdata_link, @@ -504,7 +512,7 @@ prof_gctx_create(tsd_t *tsd, prof_bt_t *bt) gctx->lock = prof_gctx_mutex_choose(); /* * Set nlimbo to 1, in order to avoid a race condition with - * prof_tctx_destroy()/prof_gctx_maybe_destroy(). + * prof_tctx_destroy()/prof_gctx_try_destroy(). */ gctx->nlimbo = 1; tctx_tree_new(&gctx->tctxs); @@ -516,7 +524,7 @@ prof_gctx_create(tsd_t *tsd, prof_bt_t *bt) } static void -prof_gctx_maybe_destroy(tsd_t *tsd, prof_gctx_t *gctx, prof_tdata_t *tdata) +prof_gctx_try_destroy(tsd_t *tsd, prof_gctx_t *gctx, prof_tdata_t *tdata) { cassert(config_prof); @@ -530,6 +538,7 @@ prof_gctx_maybe_destroy(tsd_t *tsd, prof_gctx_t *gctx, prof_tdata_t *tdata) */ prof_enter(tdata); malloc_mutex_lock(gctx->lock); + assert(gctx->nlimbo != 0); if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { /* Remove gctx from bt2gctx. */ if (ckh_remove(tsd, &bt2gctx, &gctx->bt, NULL, NULL)) @@ -605,10 +614,10 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) * * 1) Sample an allocation associated with gctx. * 2) Deallocate the sampled object. - * 3) Successfully prof_gctx_maybe_destroy(gctx). + * 3) Successfully prof_gctx_try_destroy(gctx). * * The result would be that gctx no longer exists by the time - * this thread accesses it in prof_gctx_maybe_destroy(). + * this thread accesses it in prof_gctx_try_destroy(). 
*/ gctx->nlimbo++; destroy_gctx = true; @@ -616,7 +625,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) destroy_gctx = false; malloc_mutex_unlock(gctx->lock); if (destroy_gctx) - prof_gctx_maybe_destroy(tsd, gctx, tdata); + prof_gctx_try_destroy(tsd, gctx, tdata); if (destroy_tdata) prof_tdata_destroy(tsd, tdata); @@ -657,7 +666,7 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, } else { /* * Increment nlimbo, in order to avoid a race condition with - * prof_tctx_destroy()/prof_gctx_maybe_destroy(). + * prof_tctx_destroy()/prof_gctx_try_destroy(). */ malloc_mutex_lock(gctx.p->lock); gctx.p->nlimbo++; @@ -710,7 +719,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) ret.v = imalloc(tsd, sizeof(prof_tctx_t)); if (ret.p == NULL) { if (new_gctx) - prof_gctx_maybe_destroy(tsd, gctx, tdata); + prof_gctx_try_destroy(tsd, gctx, tdata); return (NULL); } ret.p->tdata = tdata; @@ -723,7 +732,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) malloc_mutex_unlock(tdata->lock); if (error) { if (new_gctx) - prof_gctx_maybe_destroy(tsd, gctx, tdata); + prof_gctx_try_destroy(tsd, gctx, tdata); idalloc(tsd, ret.v); return (NULL); } @@ -793,6 +802,31 @@ prof_sample_threshold_update(prof_tdata_t *tdata) } #ifdef JEMALLOC_JET +static prof_tdata_t * +prof_tdata_count_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) +{ + size_t *tdata_count = (size_t *)arg; + + (*tdata_count)++; + + return (NULL); +} + +size_t +prof_tdata_count(void) +{ + size_t tdata_count = 0; + + malloc_mutex_lock(&tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_count_iter, + (void *)&tdata_count); + malloc_mutex_unlock(&tdatas_mtx); + + return (tdata_count); +} +#endif + +#ifdef JEMALLOC_JET size_t prof_bt_count(void) { @@ -998,7 +1032,6 @@ static prof_tctx_t * prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) { prof_tctx_t *ret; - tsd_t *tsd = (tsd_t *)arg; switch (tctx->state) { case prof_tctx_state_nominal: @@ -1008,9 +1041,7 @@ prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) tctx->state = prof_tctx_state_nominal; break; case prof_tctx_state_purgatory: - ret = tctx_tree_next(tctxs, tctx); - tctx_tree_remove(tctxs, tctx); - idalloc(tsd, tctx); + ret = tctx; goto label_return; default: not_reached(); @@ -1056,27 +1087,47 @@ prof_gctx_merge_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) return (NULL); } -static prof_gctx_t * -prof_gctx_finish_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) +static void +prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs) { - tsd_t *tsd = (tsd_t *)arg; prof_tdata_t *tdata = prof_tdata_get(tsd, false); - prof_tctx_t *next; - bool destroy_gctx; - - malloc_mutex_lock(gctx->lock); - next = NULL; - do { - next = tctx_tree_iter(&gctx->tctxs, next, prof_tctx_finish_iter, - tsd); - } while (next != NULL); - gctx->nlimbo--; - destroy_gctx = prof_gctx_should_destroy(gctx); - malloc_mutex_unlock(gctx->lock); - if (destroy_gctx) - prof_gctx_maybe_destroy(tsd, gctx, tdata); + prof_gctx_t *gctx; - return (NULL); + /* + * Standard tree iteration won't work here, because as soon as we + * decrement gctx->nlimbo and unlock gctx, another thread can + * concurrently destroy it, which will corrupt the tree. Therefore, + * tear down the tree one node at a time during iteration. 
+ */ + while ((gctx = gctx_tree_first(gctxs)) != NULL) { + gctx_tree_remove(gctxs, gctx); + malloc_mutex_lock(gctx->lock); + { + prof_tctx_t *next; + + next = NULL; + do { + prof_tctx_t *to_destroy = + tctx_tree_iter(&gctx->tctxs, next, + prof_tctx_finish_iter, NULL); + if (to_destroy != NULL) { + next = tctx_tree_next(&gctx->tctxs, + to_destroy); + tctx_tree_remove(&gctx->tctxs, + to_destroy); + idalloc(tsd, to_destroy); + } else + next = NULL; + } while (next != NULL); + } + gctx->nlimbo--; + if (prof_gctx_should_destroy(gctx)) { + gctx->nlimbo++; + malloc_mutex_unlock(gctx->lock); + prof_gctx_try_destroy(tsd, gctx, tdata); + } else + malloc_mutex_unlock(gctx->lock); + } } static prof_tdata_t * @@ -1085,7 +1136,7 @@ prof_tdata_merge_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) prof_cnt_t *cnt_all = (prof_cnt_t *)arg; malloc_mutex_lock(tdata->lock); - if (tdata->state != prof_tdata_state_expired) { + if (!tdata->expired) { size_t tabind; union { prof_tctx_t *p; @@ -1130,6 +1181,10 @@ prof_tdata_dump_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) return (NULL); } +#ifdef JEMALLOC_JET +#undef prof_dump_header +#define prof_dump_header JEMALLOC_N(prof_dump_header_impl) +#endif static bool prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) { @@ -1148,6 +1203,11 @@ prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) malloc_mutex_unlock(&tdatas_mtx); return (ret); } +#ifdef JEMALLOC_JET +#undef prof_dump_header +#define prof_dump_header JEMALLOC_N(prof_dump_header) +prof_dump_header_t *prof_dump_header = JEMALLOC_N(prof_dump_header_impl); +#endif /* gctx->lock is held. */ static bool @@ -1277,7 +1337,7 @@ prof_gctx_dump_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *arg) malloc_mutex_lock(gctx->lock); if (prof_dump_gctx(propagate_err, gctx, &gctx->bt, gctxs)) { - ret = gctx_tree_next(gctxs, gctx); + ret = gctx; goto label_return; } @@ -1302,7 +1362,7 @@ prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, bool leakcheck) cassert(config_prof); - tdata = prof_tdata_get(tsd, false); + tdata = prof_tdata_get(tsd, true); if (tdata == NULL) return (true); @@ -1352,7 +1412,7 @@ prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, bool leakcheck) if (prof_dump_close(propagate_err)) goto label_open_close_error; - gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, tsd); + prof_gctx_finish(tsd, &gctxs); malloc_mutex_unlock(&prof_dump_mtx); if (leakcheck) @@ -1362,7 +1422,7 @@ prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, bool leakcheck) label_write_error: prof_dump_close(propagate_err); label_open_close_error: - gctx_tree_iter(&gctxs, NULL, prof_gctx_finish_iter, tsd); + prof_gctx_finish(tsd, &gctxs); malloc_mutex_unlock(&prof_dump_mtx); return (true); } @@ -1533,7 +1593,7 @@ prof_thr_uid_alloc(void) } static prof_tdata_t * -prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid) +prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim) { prof_tdata_t *tdata; @@ -1546,8 +1606,10 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid) tdata->lock = prof_tdata_mutex_choose(thr_uid); tdata->thr_uid = thr_uid; + tdata->thr_discrim = thr_discrim; tdata->thread_name = NULL; - tdata->state = prof_tdata_state_attached; + tdata->attached = true; + tdata->expired = false; if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { @@ -1576,14 +1638,7 @@ prof_tdata_t * prof_tdata_init(tsd_t *tsd) { - return (prof_tdata_init_impl(tsd, prof_thr_uid_alloc())); -} - 
-prof_tdata_t * -prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata) -{ - - return (prof_tdata_init_impl(tsd, tdata->thr_uid)); + return (prof_tdata_init_impl(tsd, prof_thr_uid_alloc(), 0)); } /* tdata->lock must be held. */ @@ -1591,22 +1646,21 @@ static bool prof_tdata_should_destroy(prof_tdata_t *tdata) { - if (tdata->state == prof_tdata_state_attached) + if (tdata->attached) return (false); if (ckh_count(&tdata->bt2tctx) != 0) return (false); return (true); } +/* tdatas_mtx must be held. */ static void -prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata) +prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata) { assert(prof_tdata_should_destroy(tdata)); - malloc_mutex_lock(&tdatas_mtx); tdata_tree_remove(&tdatas, tdata); - malloc_mutex_unlock(&tdatas_mtx); if (tdata->thread_name != NULL) idalloc(tsd, tdata->thread_name); @@ -1615,14 +1669,22 @@ prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata) } static void -prof_tdata_state_transition(tsd_t *tsd, prof_tdata_t *tdata, - prof_tdata_state_t state) +prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata) +{ + + malloc_mutex_lock(&tdatas_mtx); + prof_tdata_destroy_locked(tsd, tdata); + malloc_mutex_unlock(&tdatas_mtx); +} + +static void +prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata) { bool destroy_tdata; malloc_mutex_lock(tdata->lock); - if (tdata->state != state) { - tdata->state = state; + if (tdata->attached) { + tdata->attached = false; destroy_tdata = prof_tdata_should_destroy(tdata); } else destroy_tdata = false; @@ -1631,32 +1693,44 @@ prof_tdata_state_transition(tsd_t *tsd, prof_tdata_t *tdata, prof_tdata_destroy(tsd, tdata); } -static void -prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata) +prof_tdata_t * +prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata) { + uint64_t thr_uid = tdata->thr_uid; + uint64_t thr_discrim = tdata->thr_discrim + 1; - prof_tdata_state_transition(tsd, tdata, prof_tdata_state_detached); + prof_tdata_detach(tsd, tdata); + return (prof_tdata_init_impl(tsd, thr_uid, thr_discrim)); } -static void -prof_tdata_expire(tsd_t *tsd, prof_tdata_t *tdata) +static bool +prof_tdata_expire(prof_tdata_t *tdata) { + bool destroy_tdata; - prof_tdata_state_transition(tsd, tdata, prof_tdata_state_expired); + malloc_mutex_lock(tdata->lock); + if (!tdata->expired) { + tdata->expired = true; + destroy_tdata = tdata->attached ? false : + prof_tdata_should_destroy(tdata); + } else + destroy_tdata = false; + malloc_mutex_unlock(tdata->lock); + + return (destroy_tdata); } static prof_tdata_t * prof_tdata_reset_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) { - tsd_t *tsd = (tsd_t *)arg; - prof_tdata_expire(tsd, tdata); - return (NULL); + return (prof_tdata_expire(tdata) ? 
tdata : NULL); } void prof_reset(tsd_t *tsd, size_t lg_sample) { + prof_tdata_t *next; assert(lg_sample < (sizeof(uint64_t) << 3)); @@ -1664,7 +1738,18 @@ prof_reset(tsd_t *tsd, size_t lg_sample) malloc_mutex_lock(&tdatas_mtx); lg_prof_sample = lg_sample; - tdata_tree_iter(&tdatas, NULL, prof_tdata_reset_iter, tsd); + + next = NULL; + do { + prof_tdata_t *to_destroy = tdata_tree_iter(&tdatas, next, + prof_tdata_reset_iter, NULL); + if (to_destroy != NULL) { + next = tdata_tree_next(&tdatas, to_destroy); + tdata_tree_remove(&tdatas, to_destroy); + prof_tdata_destroy(tsd, to_destroy); + } else + next = NULL; + } while (next != NULL); malloc_mutex_unlock(&tdatas_mtx); malloc_mutex_unlock(&prof_dump_mtx); diff --git a/test/unit/prof_reset.c b/test/unit/prof_reset.c new file mode 100644 index 0000000..73fda41 --- /dev/null +++ b/test/unit/prof_reset.c @@ -0,0 +1,238 @@ +#include "test/jemalloc_test.h" + +#ifdef JEMALLOC_PROF +const char *malloc_conf = + "prof:true,prof_active:false,lg_prof_sample:0"; +#endif + +static int +prof_dump_open_intercept(bool propagate_err, const char *filename) +{ + int fd; + + fd = open("/dev/null", O_WRONLY); + assert_d_ne(fd, -1, "Unexpected open() failure"); + + return (fd); +} + +TEST_BEGIN(test_prof_reset_basic) +{ + size_t lg_prof_sample_orig, lg_prof_sample, lg_prof_sample_next; + size_t sz; + unsigned i; + + sz = sizeof(size_t); + assert_d_eq(mallctl("opt.lg_prof_sample", &lg_prof_sample_orig, &sz, + NULL, 0), 0, + "Unexpected mallctl failure while reading profiling sample rate"); + assert_zu_eq(lg_prof_sample_orig, 0, + "Unexpected profiling sample rate"); + sz = sizeof(size_t); + assert_d_eq(mallctl("prof.lg_sample", &lg_prof_sample, &sz, NULL, 0), 0, + "Unexpected mallctl failure while reading profiling sample rate"); + assert_zu_eq(lg_prof_sample_orig, lg_prof_sample, + "Unexpected disagreement between \"opt.lg_prof_sample\" and " + "\"prof.lg_sample\""); + + /* Test simple resets. */ + for (i = 0; i < 2; i++) { + assert_d_eq(mallctl("prof.reset", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl failure while resetting profile data"); + sz = sizeof(size_t); + assert_d_eq(mallctl("prof.lg_sample", &lg_prof_sample, &sz, + NULL, 0), 0, "Unexpected mallctl failure while reading " + "profiling sample rate"); + assert_zu_eq(lg_prof_sample_orig, lg_prof_sample, + "Unexpected profile sample rate change"); + } + + /* Test resets with prof.lg_sample changes. */ + lg_prof_sample_next = 1; + for (i = 0; i < 2; i++) { + assert_d_eq(mallctl("prof.reset", NULL, NULL, + &lg_prof_sample_next, sizeof(size_t)), 0, + "Unexpected mallctl failure while resetting profile data"); + sz = sizeof(size_t); + assert_d_eq(mallctl("prof.lg_sample", &lg_prof_sample, &sz, + NULL, 0), 0, "Unexpected mallctl failure while reading " + "profiling sample rate"); + assert_zu_eq(lg_prof_sample, lg_prof_sample_next, + "Expected profile sample rate change"); + lg_prof_sample_next = lg_prof_sample_orig; + } + + /* Make sure the test code restored prof.lg_sample. 
*/ + sz = sizeof(size_t); + assert_d_eq(mallctl("prof.lg_sample", &lg_prof_sample, &sz, NULL, 0), 0, + "Unexpected mallctl failure while reading profiling sample rate"); + assert_zu_eq(lg_prof_sample_orig, lg_prof_sample, + "Unexpected disagreement between \"opt.lg_prof_sample\" and " + "\"prof.lg_sample\""); +} +TEST_END + +bool prof_dump_header_intercepted = false; +prof_cnt_t cnt_all_copy = {0, 0, 0, 0}; +static bool +prof_dump_header_intercept(bool propagate_err, const prof_cnt_t *cnt_all) +{ + + prof_dump_header_intercepted = true; + memcpy(&cnt_all_copy, cnt_all, sizeof(prof_cnt_t)); + + return (false); +} + +TEST_BEGIN(test_prof_reset_cleanup) +{ + bool active; + void *p; + prof_dump_header_t *prof_dump_header_orig; + + active = true; + assert_d_eq(mallctl("prof.active", NULL, NULL, &active, sizeof(active)), + 0, "Unexpected mallctl failure while activating profiling"); + + // XXX Verify that reset actually drops backtrace count to 0. Alloc an + // object, reset, check bt count, free. prof_bt_count() doesn't do the + // right thing; we need to iterate during dump and count backtraces. + // Or, just intercept prof_dump_header(), which has enough information + // for these purposes. + + assert_zu_eq(prof_bt_count(), 0, "Expected 0 backtraces"); + p = mallocx(1, 0); + assert_ptr_not_null(p, "Unexpected mallocx() failure"); + assert_zu_eq(prof_bt_count(), 1, "Expected 1 backtrace"); + + prof_dump_header_orig = prof_dump_header; + prof_dump_header = prof_dump_header_intercept; + assert_false(prof_dump_header_intercepted, "Unexpected intercept"); + + assert_d_eq(mallctl("prof.dump", NULL, NULL, NULL, 0), + 0, "Unexpected error while dumping heap profile"); + assert_true(prof_dump_header_intercepted, "Expected intercept"); + assert_u64_eq(cnt_all_copy.curobjs, 1, "Expected 1 allocation"); + + assert_d_eq(mallctl("prof.reset", NULL, NULL, NULL, 0), 0, + "Unexpected error while resetting heap profile data"); + assert_d_eq(mallctl("prof.dump", NULL, NULL, NULL, 0), + 0, "Unexpected error while dumping heap profile"); + assert_u64_eq(cnt_all_copy.curobjs, 0, "Expected 0 allocations"); + assert_zu_eq(prof_bt_count(), 1, "Expected 1 backtrace"); + + prof_dump_header = prof_dump_header_orig; + + dallocx(p, 0); + assert_zu_eq(prof_bt_count(), 0, "Expected 0 backtraces"); + + active = false; + assert_d_eq(mallctl("prof.active", NULL, NULL, &active, sizeof(active)), + 0, "Unexpected mallctl failure while deactivating profiling"); +} +TEST_END + +#define NTHREADS 4 +#define NALLOCS_PER_THREAD (1U << 13) +#define OBJ_RING_BUF_COUNT 1531 +#define RESET_INTERVAL (1U << 10) +#define DUMP_INTERVAL 3677 +static void * +thd_start(void *varg) +{ + unsigned thd_ind = *(unsigned *)varg; + unsigned i; + void *objs[OBJ_RING_BUF_COUNT]; + + memset(objs, 0, sizeof(objs)); + + for (i = 0; i < NALLOCS_PER_THREAD; i++) { + if (i % RESET_INTERVAL == 0) { + assert_d_eq(mallctl("prof.reset", NULL, NULL, NULL, 0), + 0, "Unexpected error while resetting heap profile " + "data"); + } + + if (i % DUMP_INTERVAL == 0) { + assert_d_eq(mallctl("prof.dump", NULL, NULL, NULL, 0), + 0, "Unexpected error while dumping heap profile"); + } + + { + void **pp = &objs[i % OBJ_RING_BUF_COUNT]; + if (*pp != NULL) { + dallocx(*pp, 0); + *pp = NULL; + } + *pp = btalloc(1, thd_ind*NALLOCS_PER_THREAD + i); + assert_ptr_not_null(*pp, + "Unexpected btalloc() failure"); + } + } + + /* Clean up any remaining objects. 
*/ + for (i = 0; i < OBJ_RING_BUF_COUNT; i++) { + void **pp = &objs[i % OBJ_RING_BUF_COUNT]; + if (*pp != NULL) { + dallocx(*pp, 0); + *pp = NULL; + } + } + + return (NULL); +} + +TEST_BEGIN(test_prof_reset) +{ + bool active; + thd_t thds[NTHREADS]; + unsigned thd_args[NTHREADS]; + unsigned i; + size_t bt_count, tdata_count; + + test_skip_if(!config_prof); + + bt_count = prof_bt_count(); + assert_zu_eq(bt_count, 0, + "Unexpected pre-existing tdata structures"); + tdata_count = prof_tdata_count(); + + active = true; + assert_d_eq(mallctl("prof.active", NULL, NULL, &active, sizeof(active)), + 0, "Unexpected mallctl failure while activating profiling"); + + for (i = 0; i < NTHREADS; i++) { + thd_args[i] = i; + thd_create(&thds[i], thd_start, (void *)&thd_args[i]); + } + for (i = 0; i < NTHREADS; i++) + thd_join(thds[i], NULL); + + assert_zu_eq(prof_bt_count(), bt_count, + "Unexpected bactrace count change"); + assert_zu_eq(prof_tdata_count(), tdata_count, + "Unexpected remaining tdata structures"); + + active = false; + assert_d_eq(mallctl("prof.active", NULL, NULL, &active, sizeof(active)), + 0, "Unexpected mallctl failure while deactivating profiling"); +} +TEST_END +#undef NTHREADS +#undef NALLOCS_PER_THREAD +#undef OBJ_RING_BUF_COUNT +#undef RESET_INTERVAL +#undef DUMP_INTERVAL + +int +main(void) +{ + + /* Intercept dumping prior to running any tests. */ + prof_dump_open = prof_dump_open_intercept; + + return (test( + test_prof_reset_basic, + test_prof_reset_cleanup, + test_prof_reset)); +} -- cgit v0.12 From ebbd0c91f0935421c04d05c8bdc6e38762a1e561 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 2 Oct 2014 23:05:23 -0700 Subject: Remove obsolete comment. --- test/unit/prof_reset.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/unit/prof_reset.c b/test/unit/prof_reset.c index 73fda41..62a4d5a 100644 --- a/test/unit/prof_reset.c +++ b/test/unit/prof_reset.c @@ -94,12 +94,6 @@ TEST_BEGIN(test_prof_reset_cleanup) assert_d_eq(mallctl("prof.active", NULL, NULL, &active, sizeof(active)), 0, "Unexpected mallctl failure while activating profiling"); - // XXX Verify that reset actually drops backtrace count to 0. Alloc an - // object, reset, check bt count, free. prof_bt_count() doesn't do the - // right thing; we need to iterate during dump and count backtraces. - // Or, just intercept prof_dump_header(), which has enough information - // for these purposes. 
- assert_zu_eq(prof_bt_count(), 0, "Expected 0 backtraces"); p = mallocx(1, 0); assert_ptr_not_null(p, "Unexpected mallocx() failure"); -- cgit v0.12 From 551ebc43647521bdd0bc78558b106762b3388928 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 3 Oct 2014 10:16:09 -0700 Subject: Convert to uniform style: cond == false --> !cond --- include/jemalloc/internal/arena.h | 11 ++++--- include/jemalloc/internal/bitmap.h | 8 ++--- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- include/jemalloc/internal/prof.h | 2 +- include/jemalloc/internal/rb.h | 7 ++--- include/jemalloc/internal/tcache.h | 8 ++--- src/arena.c | 28 +++++++++--------- src/chunk.c | 16 +++++----- src/chunk_dss.c | 4 +-- src/chunk_mmap.c | 4 +-- src/ckh.c | 10 +++---- src/ctl.c | 37 ++++++++++++------------ src/huge.c | 8 ++--- src/jemalloc.c | 22 +++++++------- src/prof.c | 30 +++++++++---------- src/stats.c | 2 +- src/tcache.c | 8 ++--- src/util.c | 12 ++++---- test/unit/ckh.c | 3 +- test/unit/rb.c | 4 +-- 20 files changed, 111 insertions(+), 115 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 48fd205..2e9920c 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -1111,13 +1111,12 @@ arena_salloc(const void *ptr, bool demote) pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; assert(arena_mapbits_allocated_get(chunk, pageind) != 0); binind = arena_mapbits_binind_get(chunk, pageind); - if (unlikely(binind == BININD_INVALID || (config_prof && demote == false - && arena_mapbits_large_get(chunk, pageind) != 0))) { + if (unlikely(binind == BININD_INVALID || (config_prof && !demote && + arena_mapbits_large_get(chunk, pageind) != 0))) { /* - * Large allocation. In the common case (demote == true), and - * as this is an inline function, most callers will only end up - * looking at binind to determine that ptr is a small - * allocation. + * Large allocation. In the common case (demote), and as this + * is an inline function, most callers will only end up looking + * at binind to determine that ptr is a small allocation. */ assert(((uintptr_t)ptr & PAGE_MASK) == 0); ret = arena_mapbits_large_size_get(chunk, pageind); diff --git a/include/jemalloc/internal/bitmap.h b/include/jemalloc/internal/bitmap.h index 4ca40ff..fcc6005 100644 --- a/include/jemalloc/internal/bitmap.h +++ b/include/jemalloc/internal/bitmap.h @@ -139,7 +139,7 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) bitmap_t g; assert(bit < binfo->nbits); - assert(bitmap_get(bitmap, binfo, bit) == false); + assert(!bitmap_get(bitmap, binfo, bit)); goff = bit >> LG_BITMAP_GROUP_NBITS; gp = &bitmap[goff]; g = *gp; @@ -172,7 +172,7 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) bitmap_t g; unsigned i; - assert(bitmap_full(bitmap, binfo) == false); + assert(!bitmap_full(bitmap, binfo)); i = binfo->nlevels - 1; g = bitmap[binfo->levels[i].group_offset]; @@ -204,7 +204,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0); g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); *gp = g; - assert(bitmap_get(bitmap, binfo, bit) == false); + assert(!bitmap_get(bitmap, binfo, bit)); /* Propagate group state transitions up the tree. 
*/ if (propagate) { unsigned i; @@ -218,7 +218,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) == 0); g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); *gp = g; - if (propagate == false) + if (!propagate) break; } } diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index bff2bd2..ed25172 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -714,7 +714,7 @@ isalloc(const void *ptr, bool demote) assert(ptr != NULL); /* Demotion only makes sense if config_prof is true. */ - assert(config_prof || demote == false); + assert(config_prof || !demote); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 91c871d..ea52a63 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -388,7 +388,7 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, /* Compute new sample threshold. */ if (update) prof_sample_threshold_update(tdata); - return (tdata->active == false); + return (!tdata->active); } } diff --git a/include/jemalloc/internal/rb.h b/include/jemalloc/internal/rb.h index ffe3bb0..64fab89 100644 --- a/include/jemalloc/internal/rb.h +++ b/include/jemalloc/internal/rb.h @@ -593,7 +593,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ if (left != &rbtree->rbt_nil) { \ /* node has no successor, but it has a left child. */\ /* Splice node out, without losing the left child. */\ - assert(rbtn_red_get(a_type, a_field, node) == false); \ + assert(!rbtn_red_get(a_type, a_field, node)); \ assert(rbtn_red_get(a_type, a_field, left)); \ rbtn_black_set(a_type, a_field, left); \ if (pathp == path) { \ @@ -629,8 +629,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ if (pathp->cmp < 0) { \ rbtn_left_set(a_type, a_field, pathp->node, \ pathp[1].node); \ - assert(rbtn_red_get(a_type, a_field, pathp[1].node) \ - == false); \ + assert(!rbtn_red_get(a_type, a_field, pathp[1].node)); \ if (rbtn_red_get(a_type, a_field, pathp->node)) { \ a_type *right = rbtn_right_get(a_type, a_field, \ pathp->node); \ @@ -862,7 +861,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ } \ /* Set root. 
*/ \ rbtree->rbt_root = path->node; \ - assert(rbtn_red_get(a_type, a_field, rbtree->rbt_root) == false); \ + assert(!rbtn_red_get(a_type, a_field, rbtree->rbt_root)); \ } \ a_attr a_type * \ a_prefix##iter_recurse(a_rbt_type *rbtree, a_type *node, \ diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 6804668..bc0b41c 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -191,9 +191,9 @@ tcache_get(tsd_t *tsd, bool create) { tcache_t *tcache; - if (config_tcache == false) + if (!config_tcache) return (NULL); - if (config_lazy_lock && isthreaded == false) + if (config_lazy_lock && !isthreaded) return (NULL); /* * If create is true, the caller has already assured that tsd is @@ -261,7 +261,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) } assert(tcache_salloc(ret) == size); - if (likely(zero == false)) { + if (likely(!zero)) { if (config_fill) { if (unlikely(opt_junk)) { arena_alloc_junk_small(ret, @@ -315,7 +315,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) arena_mapbits_large_binind_set(chunk, pageind, BININD_INVALID); } - if (likely(zero == false)) { + if (likely(!zero)) { if (config_fill) { if (unlikely(opt_junk)) memset(ret, 0xa5, size); diff --git a/src/arena.c b/src/arena.c index ef391b1..79fea72 100644 --- a/src/arena.c +++ b/src/arena.c @@ -178,7 +178,7 @@ arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) void *rpages; assert(run->nfree > 0); - assert(bitmap_full(run->bitmap, &bin_info->bitmap_info) == false); + assert(!bitmap_full(run->bitmap, &bin_info->bitmap_info)); regind = bitmap_sfu(run->bitmap, &bin_info->bitmap_info); miscelm = arena_run_to_miscelm(run); @@ -524,7 +524,7 @@ arena_chunk_init_hard(arena_t *arena) * There is no need to initialize the internal page map entries unless * the chunk is not zeroed. */ - if (zero == false) { + if (!zero) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( (void *)arena_bitselm_get(chunk, map_bias+1), (size_t)((uintptr_t) arena_bitselm_get(chunk, @@ -782,7 +782,7 @@ arena_compute_npurge(arena_t *arena, bool all) * Compute the minimum number of pages that this thread should try to * purge. */ - if (all == false) { + if (!all) { size_t threshold = (arena->nactive >> opt_lg_dirty_mult); npurge = arena->ndirty - threshold; @@ -829,7 +829,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, nstashed += npages; - if (all == false && nstashed >= npurge) + if (!all && nstashed >= npurge) break; } @@ -1049,7 +1049,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) */ assert(arena_mapbits_dirty_get(chunk, run_ind) == arena_mapbits_dirty_get(chunk, run_ind+run_pages-1)); - if (cleaned == false && arena_mapbits_dirty_get(chunk, run_ind) != 0) + if (!cleaned && arena_mapbits_dirty_get(chunk, run_ind) != 0) dirty = true; flag_dirty = dirty ? 
CHUNK_MAP_DIRTY : 0; @@ -1481,10 +1481,10 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) bin->stats.nrequests++; } malloc_mutex_unlock(&bin->lock); - if (config_prof && isthreaded == false && arena_prof_accum(arena, size)) + if (config_prof && !isthreaded && arena_prof_accum(arena, size)) prof_idump(); - if (zero == false) { + if (!zero) { if (config_fill) { if (unlikely(opt_junk)) { arena_alloc_junk_small(ret, @@ -1537,7 +1537,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) if (config_prof && idump) prof_idump(); - if (zero == false) { + if (!zero) { if (config_fill) { if (unlikely(opt_junk)) memset(ret, 0xa5, size); @@ -1608,7 +1608,7 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) } malloc_mutex_unlock(&arena->lock); - if (config_fill && zero == false) { + if (config_fill && !zero) { if (unlikely(opt_junk)) memset(ret, 0xa5, size); else if (unlikely(opt_zero)) @@ -2008,7 +2008,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, bool ret = arena_ralloc_large_grow(arena, chunk, ptr, oldsize, PAGE_CEILING(size), psize - PAGE_CEILING(size), zero); - if (config_fill && ret == false && zero == false) { + if (config_fill && !ret && !zero) { if (unlikely(opt_junk)) { memset((void *)((uintptr_t)ptr + oldsize), 0xa5, isalloc(ptr, @@ -2044,8 +2044,8 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, } else { assert(size <= arena_maxclass); if (size + extra > SMALL_MAXCLASS) { - if (arena_ralloc_large(ptr, oldsize, size, - extra, zero) == false) + if (!arena_ralloc_large(ptr, oldsize, size, + extra, zero)) return (false); } } @@ -2064,7 +2064,7 @@ arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t copysize; /* Try to avoid moving the allocation. */ - if (arena_ralloc_no_move(ptr, oldsize, size, extra, zero) == false) + if (!arena_ralloc_no_move(ptr, oldsize, size, extra, zero)) return (ptr); /* @@ -2130,7 +2130,7 @@ bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec) { - if (have_dss == false) + if (!have_dss) return (dss_prec != dss_prec_disabled); malloc_mutex_lock(&arena->lock); arena->dss_prec = dss_prec; diff --git a/src/chunk.c b/src/chunk.c index 874002c..cde8606 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -121,7 +121,7 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, if (node != NULL) base_node_dalloc(node); if (*zero) { - if (zeroed == false) + if (!zeroed) memset(ret, 0, size); else if (config_debug) { size_t i; @@ -136,10 +136,10 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, } /* - * If the caller specifies (*zero == false), it is still possible to receive - * zeroed memory, in which case *zero is toggled to true. arena_chunk_alloc() - * takes advantage of this to avoid demanding zeroed chunks, but taking - * advantage of them if they are returned. + * If the caller specifies (!*zero), it is still possible to receive zeroed + * memory, in which case *zero is toggled to true. arena_chunk_alloc() takes + * advantage of this to avoid demanding zeroed chunks, but taking advantage of + * them if they are returned. 
*/ static void * chunk_alloc_core(size_t size, size_t alignment, bool base, bool *zero, @@ -186,7 +186,7 @@ chunk_register(void *chunk, size_t size, bool base) assert(chunk != NULL); assert(CHUNK_ADDR2BASE(chunk) == chunk); - if (config_ivsalloc && base == false) { + if (config_ivsalloc && !base) { if (rtree_set(chunks_rtree, (uintptr_t)chunk, 1)) return (true); } @@ -288,7 +288,7 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk, extent_tree_szad_remove(chunks_szad, node); node->addr = chunk; node->size += size; - node->zeroed = (node->zeroed && (unzeroed == false)); + node->zeroed = (node->zeroed && !unzeroed); extent_tree_szad_insert(chunks_szad, node); } else { /* Coalescing forward failed, so insert a new node. */ @@ -305,7 +305,7 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk, xnode = NULL; /* Prevent deallocation below. */ node->addr = chunk; node->size = size; - node->zeroed = (unzeroed == false); + node->zeroed = !unzeroed; extent_tree_ad_insert(chunks_ad, node); extent_tree_szad_insert(chunks_szad, node); } diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 82faf91..cce7104 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -45,7 +45,7 @@ chunk_dss_prec_get(void) { dss_prec_t ret; - if (have_dss == false) + if (!have_dss) return (dss_prec_disabled); malloc_mutex_lock(&dss_mtx); ret = dss_prec_default; @@ -57,7 +57,7 @@ bool chunk_dss_prec_set(dss_prec_t dss_prec) { - if (have_dss == false) + if (!have_dss) return (dss_prec != dss_prec_disabled); malloc_mutex_lock(&dss_mtx); dss_prec_default = dss_prec; diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index 65137b4..7e02c10 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -132,7 +132,7 @@ pages_purge(void *addr, size_t length) # error "No madvise(2) flag defined for purging unused dirty pages." # endif int err = madvise(addr, length, JEMALLOC_MADV_PURGE); - unzeroed = (JEMALLOC_MADV_ZEROS == false || err != 0); + unzeroed = (!JEMALLOC_MADV_ZEROS || err != 0); # undef JEMALLOC_MADV_PURGE # undef JEMALLOC_MADV_ZEROS #else @@ -209,5 +209,5 @@ chunk_dalloc_mmap(void *chunk, size_t size) if (config_munmap) pages_unmap(chunk, size); - return (config_munmap == false); + return (!config_munmap); } diff --git a/src/ckh.c b/src/ckh.c index 7c7cc09..3a54596 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -185,7 +185,7 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey, } bucket = tbucket; - if (ckh_try_bucket_insert(ckh, bucket, key, data) == false) + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) return (false); } } @@ -201,12 +201,12 @@ ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata) /* Try to insert in primary bucket. */ bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1); - if (ckh_try_bucket_insert(ckh, bucket, key, data) == false) + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) return (false); /* Try to insert in secondary bucket. 
*/ bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); - if (ckh_try_bucket_insert(ckh, bucket, key, data) == false) + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) return (false); /* @@ -281,7 +281,7 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) tab = ttab; ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; - if (ckh_rebuild(ckh, tab) == false) { + if (!ckh_rebuild(ckh, tab)) { idalloc(tsd, tab); break; } @@ -327,7 +327,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) tab = ttab; ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; - if (ckh_rebuild(ckh, tab) == false) { + if (!ckh_rebuild(ckh, tab)) { idalloc(tsd, tab); #ifdef CKH_COUNT ckh->nshrinks++; diff --git a/src/ctl.c b/src/ctl.c index c55f6e4..b85710c 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -36,8 +36,7 @@ static inline const ctl_indexed_node_t * ctl_indexed_node(const ctl_node_t *node) { - return ((node->named == false) ? (const ctl_indexed_node_t *)node : - NULL); + return (!node->named ? (const ctl_indexed_node_t *)node : NULL); } /******************************************************************************/ @@ -693,7 +692,7 @@ ctl_init(void) bool ret; malloc_mutex_lock(&ctl_mtx); - if (ctl_initialized == false) { + if (!ctl_initialized) { /* * Allocate space for one extra arena stats element, which * contains summed stats across all arenas. @@ -843,7 +842,7 @@ ctl_byname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t mib[CTL_MAX_DEPTH]; const ctl_named_node_t *node; - if (ctl_initialized == false && ctl_init()) { + if (!ctl_initialized && ctl_init()) { ret = EAGAIN; goto label_return; } @@ -870,7 +869,7 @@ ctl_nametomib(const char *name, size_t *mibp, size_t *miblenp) { int ret; - if (ctl_initialized == false && ctl_init()) { + if (!ctl_initialized && ctl_init()) { ret = EAGAIN; goto label_return; } @@ -888,7 +887,7 @@ ctl_bymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, const ctl_named_node_t *node; size_t i; - if (ctl_initialized == false && ctl_init()) { + if (!ctl_initialized && ctl_init()) { ret = EAGAIN; goto label_return; } @@ -1015,7 +1014,7 @@ n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, \ int ret; \ t oldval; \ \ - if ((c) == false) \ + if (!(c)) \ return (ENOENT); \ if (l) \ malloc_mutex_lock(&ctl_mtx); \ @@ -1038,7 +1037,7 @@ n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, \ int ret; \ t oldval; \ \ - if ((c) == false) \ + if (!(c)) \ return (ENOENT); \ malloc_mutex_lock(&ctl_mtx); \ READONLY(); \ @@ -1082,7 +1081,7 @@ n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, \ int ret; \ t oldval; \ \ - if ((c) == false) \ + if (!(c)) \ return (ENOENT); \ READONLY(); \ oldval = (v); \ @@ -1119,7 +1118,7 @@ n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, \ t oldval; \ tsd_t *tsd; \ \ - if ((c) == false) \ + if (!(c)) \ return (ENOENT); \ READONLY(); \ tsd = tsd_tryget(); \ @@ -1291,7 +1290,7 @@ thread_tcache_enabled_ctl(const size_t *mib, size_t miblen, void *oldp, int ret; bool oldval; - if (config_tcache == false) + if (!config_tcache) return (ENOENT); oldval = tcache_enabled_get(); @@ -1315,7 +1314,7 @@ thread_tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp, { int ret; - if (config_tcache == false) + if (!config_tcache) return (ENOENT); READONLY(); @@ -1335,7 +1334,7 @@ thread_prof_name_ctl(const size_t *mib, size_t miblen, void *oldp, int ret; const char *oldname; - if (config_prof == false) + if (!config_prof) return (ENOENT); oldname = prof_thread_name_get(); @@ 
-1372,7 +1371,7 @@ thread_prof_active_ctl(const size_t *mib, size_t miblen, void *oldp, int ret; bool oldval; - if (config_prof == false) + if (!config_prof) return (ENOENT); oldval = prof_thread_active_get(); @@ -1459,7 +1458,7 @@ arena_i_dss_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, } } - if (match == false) { + if (!match) { ret = EINVAL; goto label_return; } @@ -1668,7 +1667,7 @@ prof_active_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, int ret; bool oldval; - if (config_prof == false) + if (!config_prof) return (ENOENT); malloc_mutex_lock(&ctl_mtx); /* Protect opt_prof_active. */ @@ -1697,7 +1696,7 @@ prof_dump_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, int ret; const char *filename = NULL; - if (config_prof == false) + if (!config_prof) return (ENOENT); WRITEONLY(); @@ -1721,7 +1720,7 @@ prof_reset_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, size_t lg_sample = lg_prof_sample; tsd_t *tsd; - if (config_prof == false) + if (!config_prof) return (ENOENT); WRITEONLY(); @@ -1847,7 +1846,7 @@ stats_arenas_i_index(const size_t *mib, size_t miblen, size_t i) const ctl_named_node_t * ret; malloc_mutex_lock(&ctl_mtx); - if (i > ctl_stats.narenas || ctl_stats.arenas[i].initialized == false) { + if (i > ctl_stats.narenas || !ctl_stats.arenas[i].initialized) { ret = NULL; goto label_return; } diff --git a/src/huge.c b/src/huge.c index 40d1362..2f059b4 100644 --- a/src/huge.c +++ b/src/huge.c @@ -62,10 +62,10 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, extent_tree_ad_insert(&huge, node); malloc_mutex_unlock(&huge_mtx); - if (config_fill && zero == false) { + if (config_fill && !zero) { if (unlikely(opt_junk)) memset(ret, 0xa5, csize); - else if (unlikely(opt_zero) && is_zeroed == false) + else if (unlikely(opt_zero) && !is_zeroed) memset(ret, 0, csize); } @@ -85,7 +85,7 @@ huge_dalloc_junk(void *ptr, size_t usize) * Only bother junk filling if the chunk isn't about to be * unmapped. */ - if (config_munmap == false || (have_dss && chunk_in_dss(ptr))) + if (!config_munmap || (have_dss && chunk_in_dss(ptr))) memset(ptr, 0x5a, usize); } } @@ -156,7 +156,7 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t copysize; /* Try to avoid moving the allocation. */ - if (huge_ralloc_no_move(ptr, oldsize, size, extra) == false) + if (!huge_ralloc_no_move(ptr, oldsize, size, extra)) return (ptr); /* diff --git a/src/jemalloc.c b/src/jemalloc.c index 3012f55..0d04131 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -119,7 +119,7 @@ arenas_extend(unsigned ind) arena_t *ret; ret = (arena_t *)base_alloc(sizeof(arena_t)); - if (ret != NULL && arena_new(ret, ind) == false) { + if (ret != NULL && !arena_new(ret, ind)) { arenas[ind] = ret; return (ret); } @@ -326,7 +326,7 @@ malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, *k_p = opts; - for (accept = false; accept == false;) { + for (accept = false; !accept;) { switch (*opts) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': @@ -361,7 +361,7 @@ malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, } } - for (accept = false; accept == false;) { + for (accept = false; !accept;) { switch (*opts) { case ',': opts++; @@ -418,7 +418,7 @@ malloc_conf_init(void) in_valgrind = (RUNNING_ON_VALGRIND != 0) ? 
true : false; if (config_fill && unlikely(in_valgrind)) { opt_junk = false; - assert(opt_zero == false); + assert(!opt_zero); opt_quarantine = JEMALLOC_VALGRIND_QUARANTINE_DEFAULT; opt_redzone = true; } @@ -496,8 +496,8 @@ malloc_conf_init(void) opts = buf; } - while (*opts != '\0' && malloc_conf_next(&opts, &k, &klen, &v, - &vlen) == false) { + while (*opts != '\0' && !malloc_conf_next(&opts, &k, &klen, &v, + &vlen)) { #define CONF_MATCH(n) \ (sizeof(n)-1 == klen && strncmp(n, k, klen) == 0) #define CONF_HANDLE_BOOL(o, n, cont) \ @@ -607,7 +607,7 @@ malloc_conf_init(void) } } } - if (match == false) { + if (!match) { malloc_conf_error("Invalid conf value", k, klen, v, vlen); } @@ -697,13 +697,13 @@ malloc_init_hard(void) return (false); } #ifdef JEMALLOC_THREADED_INIT - if (malloc_initializer != NO_INITIALIZER && IS_INITIALIZER == false) { + if (malloc_initializer != NO_INITIALIZER && !IS_INITIALIZER) { /* Busy-wait until the initializing thread completes. */ do { malloc_mutex_unlock(&init_lock); CPU_SPINWAIT; malloc_mutex_lock(&init_lock); - } while (malloc_initialized == false); + } while (!malloc_initialized); malloc_mutex_unlock(&init_lock); return (false); } @@ -2011,7 +2011,7 @@ _malloc_prefork(void) unsigned i; #ifdef JEMALLOC_MUTEX_INIT_CB - if (malloc_initialized == false) + if (!malloc_initialized) return; #endif assert(malloc_initialized); @@ -2040,7 +2040,7 @@ _malloc_postfork(void) unsigned i; #ifdef JEMALLOC_MUTEX_INIT_CB - if (malloc_initialized == false) + if (!malloc_initialized) return; #endif assert(malloc_initialized); diff --git a/src/prof.c b/src/prof.c index 0a96d85..29b4baa 100644 --- a/src/prof.c +++ b/src/prof.c @@ -232,7 +232,7 @@ prof_enter(prof_tdata_t *tdata) cassert(config_prof); - assert(tdata->enq == false); + assert(!tdata->enq); tdata->enq = true; malloc_mutex_lock(&bt2gctx_mtx); @@ -578,7 +578,7 @@ prof_gctx_should_destroy(prof_gctx_t *gctx) if (opt_prof_accum) return (false); - if (tctx_tree_empty(&gctx->tctxs) == false) + if (!tctx_tree_empty(&gctx->tctxs)) return (false); if (gctx->nlimbo != 0) return (false); @@ -595,7 +595,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) assert(tctx->cnts.curobjs == 0); assert(tctx->cnts.curbytes == 0); - assert(opt_prof_accum == false); + assert(!opt_prof_accum); assert(tctx->cnts.accumobjs == 0); assert(tctx->cnts.accumbytes == 0); @@ -858,7 +858,7 @@ prof_dump_open(bool propagate_err, const char *filename) int fd; fd = creat(filename, 0644); - if (fd == -1 && propagate_err == false) { + if (fd == -1 && !propagate_err) { malloc_printf(": creat(\"%s\"), 0644) failed\n", filename); if (opt_abort) @@ -883,7 +883,7 @@ prof_dump_flush(bool propagate_err) err = write(prof_dump_fd, prof_dump_buf, prof_dump_buf_end); if (err == -1) { - if (propagate_err == false) { + if (!propagate_err) { malloc_write(": write() failed during heap " "profile flush\n"); if (opt_abort) @@ -1145,8 +1145,8 @@ prof_tdata_merge_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) tdata->dumping = true; memset(&tdata->cnt_summed, 0, sizeof(prof_cnt_t)); - for (tabind = 0; ckh_iter(&tdata->bt2tctx, &tabind, NULL, - &tctx.v) == false;) + for (tabind = 0; !ckh_iter(&tdata->bt2tctx, &tabind, NULL, + &tctx.v);) prof_tctx_merge_tdata(tctx.p, tdata); cnt_all->curobjs += tdata->cnt_summed.curobjs; @@ -1167,7 +1167,7 @@ prof_tdata_dump_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) { bool propagate_err = *(bool *)arg; - if (tdata->dumping == false) + if (!tdata->dumping) return (NULL); if (prof_dump_printf(propagate_err, 
@@ -1220,7 +1220,7 @@ prof_dump_gctx(bool propagate_err, prof_gctx_t *gctx, const prof_bt_t *bt, cassert(config_prof); /* Avoid dumping such gctx's that have no useful data. */ - if ((opt_prof_accum == false && gctx->cnt_summed.curobjs == 0) || + if ((!opt_prof_accum && gctx->cnt_summed.curobjs == 0) || (opt_prof_accum && gctx->cnt_summed.accumobjs == 0)) { assert(gctx->cnt_summed.curobjs == 0); assert(gctx->cnt_summed.curbytes == 0); @@ -1374,7 +1374,7 @@ prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, bool leakcheck) * summing. */ gctx_tree_new(&gctxs); - for (tabind = 0; ckh_iter(&bt2gctx, &tabind, NULL, &gctx.v) == false;) + for (tabind = 0; !ckh_iter(&bt2gctx, &tabind, NULL, &gctx.v);) prof_dump_gctx_prep(gctx.p, &gctxs); /* @@ -1457,7 +1457,7 @@ prof_fdump(void) cassert(config_prof); - if (prof_booted == false) + if (!prof_booted) return; if ((tsd = tsd_tryget()) == NULL) return; @@ -1479,7 +1479,7 @@ prof_idump(void) cassert(config_prof); - if (prof_booted == false) + if (!prof_booted) return; if ((tsd = tsd_tryget()) == NULL) return; @@ -1508,7 +1508,7 @@ prof_mdump(const char *filename) cassert(config_prof); - if (opt_prof == false || prof_booted == false) + if (!opt_prof || !prof_booted) return (true); if ((tsd = tsd_tryget()) == NULL) return (true); @@ -1535,7 +1535,7 @@ prof_gdump(void) cassert(config_prof); - if (prof_booted == false) + if (!prof_booted) return; if ((tsd = tsd_tryget()) == NULL) return; @@ -1855,7 +1855,7 @@ prof_boot1(void) * initialized, so this function must be executed early. */ - if (opt_prof_leak && opt_prof == false) { + if (opt_prof_leak && !opt_prof) { /* * Enable opt_prof, but in such a way that profiles are never * automatically dumped. diff --git a/src/stats.c b/src/stats.c index db34275..aa09550 100644 --- a/src/stats.c +++ b/src/stats.c @@ -505,7 +505,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, ninitialized++; } - if (ninitialized > 1 || unmerged == false) { + if (ninitialized > 1 || !unmerged) { /* Print merged arena stats. */ malloc_cprintf(write_cb, cbopaque, "\nMerged arenas stats:\n"); diff --git a/src/tcache.c b/src/tcache.c index bb4c3cc..6f3408c 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -101,7 +101,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem, malloc_mutex_lock(&bin->lock); if (config_stats && arena == tcache->arena) { - assert(merged_stats == false); + assert(!merged_stats); merged_stats = true; bin->stats.nflushes++; bin->stats.nrequests += tbin->tstats.nrequests; @@ -132,7 +132,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem, } malloc_mutex_unlock(&bin->lock); } - if (config_stats && merged_stats == false) { + if (config_stats && !merged_stats) { /* * The flush loop didn't happen to flush to this thread's * arena, so the stats didn't get merged. Manually do so now. @@ -210,7 +210,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem, if (config_prof && idump) prof_idump(); } - if (config_stats && merged_stats == false) { + if (config_stats && !merged_stats) { /* * The flush loop didn't happen to flush to this thread's * arena, so the stats didn't get merged. Manually do so now. @@ -262,7 +262,7 @@ tcache_t * tcache_get_hard(tsd_t *tsd) { - if (tcache_enabled_get() == false) { + if (!tcache_enabled_get()) { tcache_enabled_set(false); /* Memoize. 
*/ return (NULL); } diff --git a/src/util.c b/src/util.c index 1717f08..bfd86af 100644 --- a/src/util.c +++ b/src/util.c @@ -266,7 +266,7 @@ d2s(intmax_t x, char sign, char *s, size_t *slen_p) sign = '-'; switch (sign) { case '-': - if (neg == false) + if (!neg) break; /* Fall through. */ case ' ': @@ -329,7 +329,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) /* Left padding. */ \ size_t pad_len = (width == -1) ? 0 : ((slen < (size_t)width) ? \ (size_t)width - slen : 0); \ - if (left_justify == false && pad_len != 0) { \ + if (!left_justify && pad_len != 0) { \ size_t j; \ for (j = 0; j < pad_len; j++) \ APPEND_C(' '); \ @@ -406,19 +406,19 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) while (true) { switch (*f) { case '#': - assert(alt_form == false); + assert(!alt_form); alt_form = true; break; case '-': - assert(left_justify == false); + assert(!left_justify); left_justify = true; break; case ' ': - assert(plus_space == false); + assert(!plus_space); plus_space = true; break; case '+': - assert(plus_plus == false); + assert(!plus_plus); plus_plus = true; break; default: goto label_width; diff --git a/test/unit/ckh.c b/test/unit/ckh.c index 148b81e..03b4f71 100644 --- a/test/unit/ckh.c +++ b/test/unit/ckh.c @@ -162,8 +162,7 @@ TEST_BEGIN(test_insert_iter_remove) memset(seen, 0, sizeof(seen)); - for (tabind = 0; ckh_iter(&ckh, &tabind, &q, &r) == - false;) { + for (tabind = 0; !ckh_iter(&ckh, &tabind, &q, &r);) { size_t k; assert_ptr_eq(q, r, "Key and val not equal"); diff --git a/test/unit/rb.c b/test/unit/rb.c index e43907f..b38eb0e 100644 --- a/test/unit/rb.c +++ b/test/unit/rb.c @@ -5,7 +5,7 @@ for (rbp_bh_t = (a_rbt)->rbt_root, (r_height) = 0; \ rbp_bh_t != &(a_rbt)->rbt_nil; \ rbp_bh_t = rbtn_left_get(a_type, a_field, rbp_bh_t)) { \ - if (rbtn_red_get(a_type, a_field, rbp_bh_t) == false) { \ + if (!rbtn_red_get(a_type, a_field, rbp_bh_t)) { \ (r_height)++; \ } \ } \ @@ -75,7 +75,7 @@ tree_recurse(node_t *node, unsigned black_height, unsigned black_depth, node_t *left_node = rbtn_left_get(node_t, link, node); node_t *right_node = rbtn_right_get(node_t, link, node); - if (rbtn_red_get(node_t, link, node) == false) + if (!rbtn_red_get(node_t, link, node)) black_depth++; /* Red nodes must be interleaved with black nodes. */ -- cgit v0.12 From fc12c0b8bc1160530d1e3e641b76d2a4f793136f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 3 Oct 2014 23:25:30 -0700 Subject: Implement/test/fix prof-related mallctl's. Implement/test/fix the opt.prof_thread_active_init, prof.thread_active_init, and thread.prof.active mallctl's. Test/fix the thread.prof.name mallctl. Refactor opt_prof_active to be read-only and move mutable state into the prof_active variable. Stop leaning on ctl-related locking for protection. 
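As a rough illustration of the interfaces this commit introduces and reworks (a minimal sketch, not part of the patch: it assumes an unprefixed build configured with --enable-prof, since the controls return ENOENT otherwise, and a run with prof:true so that sampling actually takes place), an application could drive the controls through the public mallctl() entry point along these lines:

    #include <stdbool.h>
    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        bool active, oldval;
        size_t sz = sizeof(bool);

        /* Read the global sampling switch. */
        if (mallctl("prof.active", &active, &sz, NULL, 0) != 0)
            return (1);
        printf("prof.active: %s\n", active ? "true" : "false");

        /* Enable sampling for the calling thread; oldval receives the
         * previous per-thread setting. */
        active = true;
        if (mallctl("thread.prof.active", &oldval, &sz, &active,
            sizeof(active)) != 0)
            return (1);

        /* Have newly created threads start with sampling enabled too. */
        if (mallctl("prof.thread_active_init", NULL, NULL, &active,
            sizeof(active)) != 0)
            return (1);
        return (0);
    }

As the diff below shows, writes to prof.active and prof.thread_active_init are serialized by dedicated mutexes, while the allocation fast path reads prof_active without locking, so a toggle may take a moment to become visible to all threads.
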
--- Makefile.in | 2 + doc/jemalloc.xml.in | 52 ++++++++-- include/jemalloc/internal/private_symbols.txt | 5 + include/jemalloc/internal/prof.h | 34 +++++-- src/ctl.c | 73 ++++++++++---- src/jemalloc.c | 2 + src/prof.c | 140 ++++++++++++++++++++++---- src/stats.c | 33 +++--- test/unit/prof_active.c | 136 +++++++++++++++++++++++++ test/unit/prof_reset.c | 4 + test/unit/prof_thread_name.c | 128 +++++++++++++++++++++++ 11 files changed, 544 insertions(+), 65 deletions(-) create mode 100644 test/unit/prof_active.c create mode 100644 test/unit/prof_thread_name.c diff --git a/Makefile.in b/Makefile.in index 5267bea..52f5a9d 100644 --- a/Makefile.in +++ b/Makefile.in @@ -123,9 +123,11 @@ TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/mq.c \ $(srcroot)test/unit/mtx.c \ $(srcroot)test/unit/prof_accum.c \ + $(srcroot)test/unit/prof_active.c \ $(srcroot)test/unit/prof_gdump.c \ $(srcroot)test/unit/prof_idump.c \ $(srcroot)test/unit/prof_reset.c \ + $(srcroot)test/unit/prof_thread_name.c \ $(srcroot)test/unit/ql.c \ $(srcroot)test/unit/qr.c \ $(srcroot)test/unit/quarantine.c \ diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index b586e69..6abb50b 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1061,6 +1061,21 @@ malloc_conf = "xmalloc:true";]]> This option is enabled by default. + + + opt.prof_thread_active_init + (bool) + r- + [] + + Initial setting for thread.prof.active + in newly created threads. The initial setting for newly created threads + can also be changed during execution via the prof.thread_active_init + mallctl. This option is enabled by default. + + opt.lg_prof_sample @@ -1264,7 +1279,8 @@ malloc_conf = "xmalloc:true";]]> thread.prof.name (const char *) - rw + r- or + -w [] Get/set the descriptive name associated with the calling @@ -1272,7 +1288,15 @@ malloc_conf = "xmalloc:true";]]> created, so the input string need not be maintained after this interface completes execution. The output string of this interface should be copied for non-ephemeral uses, because multiple implementation details - can cause asynchronous string deallocation. + can cause asynchronous string deallocation. Furthermore, each + invocation of this interface can only read or write; simultaneous + read/write is not supported due to string lifetime limitations. The + name string must nil-terminated and comprised only of characters in the + sets recognized + by isgraph + 3 and + isblank + 3. @@ -1283,7 +1307,7 @@ malloc_conf = "xmalloc:true";]]> [] Control whether sampling is currently active for the - calling thread. This is a deactivation mechanism in addition to prof.active; both must be active for the calling thread to sample. This flag is enabled by default. @@ -1508,6 +1532,20 @@ malloc_conf = "xmalloc:true";]]> and returning the new arena index. + + + prof.thread_active_init + (bool) + rw + [] + + Control the initial setting for thread.prof.active + in newly created threads. See the opt.prof_thread_active_init + option for additional information. + + prof.active @@ -1518,8 +1556,9 @@ malloc_conf = "xmalloc:true";]]> Control whether sampling is currently active. See the opt.prof_active - option for additional information. - + option for additional information, as well as the interrelated thread.prof.active + mallctl. @@ -1548,7 +1587,8 @@ malloc_conf = "xmalloc:true";]]> Reset all memory profile statistics, and optionally update the sample rate (see opt.lg_prof_sample - and prof.lg_sample). + and prof.lg_sample). 
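The revised thread.prof.name protocol documented above (each call either reads or writes, never both, and names are limited to isgraph()/isblank() characters) can be exercised as in the following sketch, which mirrors the pattern of the new unit tests and, like the example above, assumes a profiling-enabled, unprefixed build; the helper name is illustrative only:

    #include <stddef.h>
    #include <jemalloc/jemalloc.h>

    /*
     * Sketch: name the calling thread for heap profile output, then read
     * the name back with a second, read-only call.  Returns 0 on success
     * or an errno-style value (e.g. EFAULT for an invalid name; a single
     * call that tried to read and write at once would get EPERM).
     */
    static int
    thread_prof_name_roundtrip(const char *name, const char **out)
    {
        size_t sz = sizeof(*out);
        int err;

        err = mallctl("thread.prof.name", NULL, NULL, &name, sizeof(name));
        if (err != 0)
            return (err);
        return (mallctl("thread.prof.name", out, &sz, NULL, 0));
    }
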
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 33f8ce0..6365783 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -285,6 +285,9 @@ opt_zero p2rz pages_purge pow2_ceil +prof_active_get +prof_active_get_unlocked +prof_active_set prof_alloc_prep prof_alloc_rollback prof_backtrace @@ -316,6 +319,8 @@ prof_tdata_cleanup prof_tdata_get prof_tdata_init prof_thread_active_get +prof_thread_active_init_get +prof_thread_active_init_set prof_thread_active_set prof_thread_name_get prof_thread_name_set diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index ea52a63..3d3f8f4 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -215,13 +215,8 @@ typedef rb_tree(prof_tdata_t) prof_tdata_tree_t; #ifdef JEMALLOC_H_EXTERNS extern bool opt_prof; -/* - * Even if opt_prof is true, sampling can be temporarily disabled by setting - * opt_prof_active to false. No locking is used when updating opt_prof_active, - * so there are no guarantees regarding how long it will take for all threads - * to notice state changes. - */ extern bool opt_prof_active; +extern bool opt_prof_thread_active_init; extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ extern bool opt_prof_gdump; /* High-water memory dumping. */ @@ -235,6 +230,9 @@ extern char opt_prof_prefix[ #endif 1]; +/* Accessed via prof_active_[gs]et{_unlocked,}(). */ +extern bool prof_active; + /* * Profile dump interval, measured in bytes allocated. Each arena triggers a * profile dump when it reaches this threshold. The effect is that the @@ -274,9 +272,13 @@ prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata); void prof_reset(tsd_t *tsd, size_t lg_sample); void prof_tdata_cleanup(tsd_t *tsd); const char *prof_thread_name_get(void); -bool prof_thread_name_set(tsd_t *tsd, const char *thread_name); +bool prof_active_get(void); +bool prof_active_set(bool active); +int prof_thread_name_set(tsd_t *tsd, const char *thread_name); bool prof_thread_active_get(void); bool prof_thread_active_set(bool active); +bool prof_thread_active_init_get(void); +bool prof_thread_active_init_set(bool active_init); void prof_boot0(void); void prof_boot1(void); bool prof_boot2(void); @@ -290,6 +292,7 @@ void prof_sample_threshold_update(prof_tdata_t *tdata); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +bool prof_active_get_unlocked(void); prof_tdata_t *prof_tdata_get(tsd_t *tsd, bool create); bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool commit, prof_tdata_t **tdata_out); @@ -305,6 +308,19 @@ void prof_free(tsd_t *tsd, const void *ptr, size_t usize); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) +JEMALLOC_INLINE bool +prof_active_get_unlocked(void) +{ + + /* + * Even if opt_prof is true, sampling can be temporarily disabled by + * setting prof_active to false. No locking is used when reading + * prof_active in the fast path, so there are no guarantees regarding + * how long it will take for all threads to notice state changes. 
+ */ + return (prof_active); +} + JEMALLOC_INLINE prof_tdata_t * prof_tdata_get(tsd_t *tsd, bool create) { @@ -401,8 +417,8 @@ prof_alloc_prep(tsd_t *tsd, size_t usize, bool update) assert(usize == s2u(usize)); - if (!opt_prof_active || likely(prof_sample_accum_update(tsd, usize, - update, &tdata))) + if (!prof_active_get_unlocked() || likely(prof_sample_accum_update(tsd, + usize, update, &tdata))) ret = (prof_tctx_t *)(uintptr_t)1U; else { bt_init(&bt, tdata->vec); diff --git a/src/ctl.c b/src/ctl.c index b85710c..8f9faa5 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -7,7 +7,6 @@ /* * ctl_mtx protects the following: * - ctl_stats.* - * - opt_prof_active */ static malloc_mutex_t ctl_mtx; static bool ctl_initialized; @@ -104,6 +103,7 @@ CTL_PROTO(opt_lg_tcache_max) CTL_PROTO(opt_prof) CTL_PROTO(opt_prof_prefix) CTL_PROTO(opt_prof_active) +CTL_PROTO(opt_prof_thread_active_init) CTL_PROTO(opt_lg_prof_sample) CTL_PROTO(opt_lg_prof_interval) CTL_PROTO(opt_prof_gdump) @@ -131,6 +131,7 @@ CTL_PROTO(arenas_nbins) CTL_PROTO(arenas_nhbins) CTL_PROTO(arenas_nlruns) CTL_PROTO(arenas_extend) +CTL_PROTO(prof_thread_active_init) CTL_PROTO(prof_active) CTL_PROTO(prof_dump) CTL_PROTO(prof_reset) @@ -253,6 +254,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("prof"), CTL(opt_prof)}, {NAME("prof_prefix"), CTL(opt_prof_prefix)}, {NAME("prof_active"), CTL(opt_prof_active)}, + {NAME("prof_thread_active_init"), CTL(opt_prof_thread_active_init)}, {NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)}, {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, {NAME("prof_gdump"), CTL(opt_prof_gdump)}, @@ -318,6 +320,7 @@ static const ctl_named_node_t arenas_node[] = { }; static const ctl_named_node_t prof_node[] = { + {NAME("thread_active_init"), CTL(prof_thread_active_init)}, {NAME("active"), CTL(prof_active)}, {NAME("dump"), CTL(prof_dump)}, {NAME("reset"), CTL(prof_reset)}, @@ -979,6 +982,14 @@ ctl_postfork_child(void) } \ } while (0) +#define READ_XOR_WRITE() do { \ + if ((oldp != NULL && oldlenp != NULL) && (newp != NULL || \ + newlen != 0)) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + #define READ(v, t) do { \ if (oldp != NULL && oldlenp != NULL) { \ if (*oldlenp != sizeof(t)) { \ @@ -1208,7 +1219,9 @@ CTL_RO_NL_CGEN(config_tcache, opt_tcache, opt_tcache, bool) CTL_RO_NL_CGEN(config_tcache, opt_lg_tcache_max, opt_lg_tcache_max, ssize_t) CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool) CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *) -CTL_RO_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) /* Mutable. 
*/ +CTL_RO_NL_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_thread_active_init, + opt_prof_thread_active_init, bool) CTL_RO_NL_CGEN(config_prof, opt_lg_prof_sample, opt_lg_prof_sample, size_t) CTL_RO_NL_CGEN(config_prof, opt_prof_accum, opt_prof_accum, bool) CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) @@ -1332,12 +1345,12 @@ thread_prof_name_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; - const char *oldname; if (!config_prof) return (ENOENT); - oldname = prof_thread_name_get(); + READ_XOR_WRITE(); + if (newp != NULL) { tsd_t *tsd; @@ -1352,12 +1365,13 @@ thread_prof_name_ctl(const size_t *mib, size_t miblen, void *oldp, goto label_return; } - if (prof_thread_name_set(tsd, *(const char **)newp)) { - ret = EAGAIN; + if ((ret = prof_thread_name_set(tsd, *(const char **)newp)) != + 0) goto label_return; - } + } else { + const char *oldname = prof_thread_name_get(); + READ(oldname, const char *); } - READ(oldname, const char *); ret = 0; label_return: @@ -1661,6 +1675,31 @@ label_return: /******************************************************************************/ static int +prof_thread_active_init_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + bool oldval; + + if (!config_prof) + return (ENOENT); + + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = prof_thread_active_init_set(*(bool *)newp); + } else + oldval = prof_thread_active_init_get(); + READ(oldval, bool); + + ret = 0; +label_return: + return (ret); +} + +static int prof_active_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { @@ -1670,22 +1709,18 @@ prof_active_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, if (!config_prof) return (ENOENT); - malloc_mutex_lock(&ctl_mtx); /* Protect opt_prof_active. */ - oldval = opt_prof_active; if (newp != NULL) { - /* - * The memory barriers will tend to make opt_prof_active - * propagate faster on systems with weak memory ordering. - */ - mb_write(); - WRITE(opt_prof_active, bool); - mb_write(); - } + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = prof_active_set(*(bool *)newp); + } else + oldval = prof_active_get(); READ(oldval, bool); ret = 0; label_return: - malloc_mutex_unlock(&ctl_mtx); return (ret); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 0d04131..2e96705 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -655,6 +655,8 @@ malloc_conf_init(void) "prof_prefix", "jeprof") CONF_HANDLE_BOOL(opt_prof_active, "prof_active", true) + CONF_HANDLE_BOOL(opt_prof_thread_active_init, + "prof_thread_active_init", true) CONF_HANDLE_SIZE_T(opt_lg_prof_sample, "lg_prof_sample", 0, (sizeof(uint64_t) << 3) - 1, true) diff --git a/src/prof.c b/src/prof.c index 29b4baa..5b97998 100644 --- a/src/prof.c +++ b/src/prof.c @@ -16,6 +16,7 @@ bool opt_prof = false; bool opt_prof_active = true; +bool opt_prof_thread_active_init = true; size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; bool opt_prof_gdump = false; @@ -29,6 +30,20 @@ char opt_prof_prefix[ #endif 1]; +/* + * Initialized as opt_prof_active, and accessed via + * prof_active_[gs]et{_unlocked,}(). 
+ */ +bool prof_active; +static malloc_mutex_t prof_active_mtx; + +/* + * Initialized as opt_prof_thread_active_init, and accessed via + * prof_thread_active_init_[gs]et(). + */ +static bool prof_thread_active_init; +static malloc_mutex_t prof_thread_active_init_mtx; + uint64_t prof_interval = 0; size_t lg_prof_sample; @@ -103,6 +118,7 @@ static bool prof_tctx_should_destroy(prof_tctx_t *tctx); static void prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx); static bool prof_tdata_should_destroy(prof_tdata_t *tdata); static void prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata); +static char *prof_thread_name_alloc(tsd_t *tsd, const char *thread_name); /******************************************************************************/ /* Red-black trees. */ @@ -1593,7 +1609,8 @@ prof_thr_uid_alloc(void) } static prof_tdata_t * -prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim) +prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, + char *thread_name, bool active) { prof_tdata_t *tdata; @@ -1607,7 +1624,7 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim) tdata->lock = prof_tdata_mutex_choose(thr_uid); tdata->thr_uid = thr_uid; tdata->thr_discrim = thr_discrim; - tdata->thread_name = NULL; + tdata->thread_name = thread_name; tdata->attached = true; tdata->expired = false; @@ -1625,7 +1642,7 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim) tdata->enq_gdump = false; tdata->dumping = false; - tdata->active = true; + tdata->active = active; malloc_mutex_lock(&tdatas_mtx); tdata_tree_insert(&tdatas, tdata); @@ -1638,7 +1655,8 @@ prof_tdata_t * prof_tdata_init(tsd_t *tsd) { - return (prof_tdata_init_impl(tsd, prof_thr_uid_alloc(), 0)); + return (prof_tdata_init_impl(tsd, prof_thr_uid_alloc(), 0, NULL, + prof_thread_active_init_get())); } /* tdata->lock must be held. */ @@ -1698,9 +1716,13 @@ prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata) { uint64_t thr_uid = tdata->thr_uid; uint64_t thr_discrim = tdata->thr_discrim + 1; + char *thread_name = (tdata->thread_name != NULL) ? + prof_thread_name_alloc(tsd, tdata->thread_name) : NULL; + bool active = tdata->active; prof_tdata_detach(tsd, tdata); - return (prof_tdata_init_impl(tsd, thr_uid, thr_discrim)); + return (prof_tdata_init_impl(tsd, thr_uid, thr_discrim, thread_name, + active)); } static bool @@ -1768,6 +1790,29 @@ prof_tdata_cleanup(tsd_t *tsd) prof_tdata_detach(tsd, tdata); } +bool +prof_active_get(void) +{ + bool prof_active_current; + + malloc_mutex_lock(&prof_active_mtx); + prof_active_current = prof_active; + malloc_mutex_unlock(&prof_active_mtx); + return (prof_active_current); +} + +bool +prof_active_set(bool active) +{ + bool prof_active_old; + + malloc_mutex_lock(&prof_active_mtx); + prof_active_old = prof_active; + prof_active = active; + malloc_mutex_unlock(&prof_active_mtx); + return (prof_active_old); +} + const char * prof_thread_name_get(void) { @@ -1775,34 +1820,64 @@ prof_thread_name_get(void) prof_tdata_t *tdata; if ((tsd = tsd_tryget()) == NULL) - return (NULL); + return (""); tdata = prof_tdata_get(tsd, true); if (tdata == NULL) + return (""); + return (tdata->thread_name != NULL ? 
tdata->thread_name : ""); +} + +static char * +prof_thread_name_alloc(tsd_t *tsd, const char *thread_name) +{ + char *ret; + size_t size; + + if (thread_name == NULL) + return (NULL); + + size = strlen(thread_name) + 1; + if (size == 1) + return (""); + + ret = imalloc(tsd, size); + if (ret == NULL) return (NULL); - return (tdata->thread_name); + memcpy(ret, thread_name, size); + return (ret); } -bool +int prof_thread_name_set(tsd_t *tsd, const char *thread_name) { prof_tdata_t *tdata; - size_t size; + unsigned i; char *s; tdata = prof_tdata_get(tsd, true); if (tdata == NULL) - return (true); + return (EAGAIN); + + /* Validate input. */ + if (thread_name == NULL) + return (EFAULT); + for (i = 0; thread_name[i] != '\0'; i++) { + char c = thread_name[i]; + if (!isgraph(c) && !isblank(c)) + return (EFAULT); + } - size = strlen(thread_name) + 1; - s = imalloc(tsd, size); + s = prof_thread_name_alloc(tsd, thread_name); if (s == NULL) - return (true); + return (EAGAIN); - memcpy(s, thread_name, size); - if (tdata->thread_name != NULL) + if (tdata->thread_name != NULL) { idalloc(tsd, tdata->thread_name); - tdata->thread_name = s; - return (false); + tdata->thread_name = NULL; + } + if (strlen(s) > 0) + tdata->thread_name = s; + return (0); } bool @@ -1834,6 +1909,29 @@ prof_thread_active_set(bool active) return (false); } +bool +prof_thread_active_init_get(void) +{ + bool active_init; + + malloc_mutex_lock(&prof_thread_active_init_mtx); + active_init = prof_thread_active_init; + malloc_mutex_unlock(&prof_thread_active_init_mtx); + return (active_init); +} + +bool +prof_thread_active_init_set(bool active_init) +{ + bool active_init_old; + + malloc_mutex_lock(&prof_thread_active_init_mtx); + active_init_old = prof_thread_active_init; + prof_thread_active_init = active_init; + malloc_mutex_unlock(&prof_thread_active_init_mtx); + return (active_init_old); +} + void prof_boot0(void) { @@ -1882,6 +1980,14 @@ prof_boot2(void) lg_prof_sample = opt_lg_prof_sample; + prof_active = opt_prof_active; + if (malloc_mutex_init(&prof_active_mtx)) + return (true); + + prof_thread_active_init = opt_prof_thread_active_init; + if (malloc_mutex_init(&prof_thread_active_init_mtx)) + return (true); + if ((tsd = tsd_tryget()) == NULL) return (true); if (ckh_new(tsd, &bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, diff --git a/src/stats.c b/src/stats.c index aa09550..5c3d701 100644 --- a/src/stats.c +++ b/src/stats.c @@ -336,7 +336,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, "___ Begin jemalloc statistics ___\n"); if (general) { - int err; const char *cpv; bool bv; unsigned uv; @@ -355,26 +354,31 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, bv ? "enabled" : "disabled"); #define OPT_WRITE_BOOL(n) \ - if ((err = je_mallctl("opt."#n, &bv, &bsz, NULL, 0)) \ - == 0) { \ + if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %s\n", bv ? "true" : "false"); \ } +#define OPT_WRITE_BOOL_MUTABLE(n, m) { \ + bool bv2; \ + if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0 && \ + je_mallctl(#m, &bv2, &bsz, NULL, 0) == 0) { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": %s ("#m": %s)\n", bv ? "true" \ + : "false", bv2 ? 
"true" : "false"); \ + } \ +} #define OPT_WRITE_SIZE_T(n) \ - if ((err = je_mallctl("opt."#n, &sv, &ssz, NULL, 0)) \ - == 0) { \ + if (je_mallctl("opt."#n, &sv, &ssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %zu\n", sv); \ } #define OPT_WRITE_SSIZE_T(n) \ - if ((err = je_mallctl("opt."#n, &ssv, &sssz, NULL, 0)) \ - == 0) { \ + if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %zd\n", ssv); \ } #define OPT_WRITE_CHAR_P(n) \ - if ((err = je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0)) \ - == 0) { \ + if (je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ " opt."#n": \"%s\"\n", cpv); \ } @@ -398,7 +402,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, OPT_WRITE_SSIZE_T(lg_tcache_max) OPT_WRITE_BOOL(prof) OPT_WRITE_CHAR_P(prof_prefix) - OPT_WRITE_BOOL(prof_active) + OPT_WRITE_BOOL_MUTABLE(prof_active, prof.active) + OPT_WRITE_BOOL_MUTABLE(prof_thread_active_init, + prof.thread_active_init) OPT_WRITE_SSIZE_T(lg_prof_sample) OPT_WRITE_BOOL(prof_accum) OPT_WRITE_SSIZE_T(lg_prof_interval) @@ -407,6 +413,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, OPT_WRITE_BOOL(prof_leak) #undef OPT_WRITE_BOOL +#undef OPT_WRITE_BOOL_MUTABLE #undef OPT_WRITE_SIZE_T #undef OPT_WRITE_SSIZE_T #undef OPT_WRITE_CHAR_P @@ -434,13 +441,11 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, "Min active:dirty page ratio per arena: N/A\n"); } - if ((err = je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0)) - == 0) { + if (je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0) == 0) { malloc_cprintf(write_cb, cbopaque, "Maximum thread-cached size class: %zu\n", sv); } - if ((err = je_mallctl("opt.prof", &bv, &bsz, NULL, 0)) == 0 && - bv) { + if (je_mallctl("opt.prof", &bv, &bsz, NULL, 0) == 0 && bv) { CTL_GET("prof.lg_sample", &sv, size_t); malloc_cprintf(write_cb, cbopaque, "Average profile sample interval: %"PRIu64 diff --git a/test/unit/prof_active.c b/test/unit/prof_active.c new file mode 100644 index 0000000..d4bab8d --- /dev/null +++ b/test/unit/prof_active.c @@ -0,0 +1,136 @@ +#include "test/jemalloc_test.h" + +#ifdef JEMALLOC_PROF +const char *malloc_conf = + "prof:true,prof_thread_active_init:false,lg_prof_sample:0,prof_final:false"; +#endif + +static void +mallctl_bool_get(const char *name, bool expected, const char *func, int line) +{ + bool old; + size_t sz; + + sz = sizeof(old); + assert_d_eq(mallctl(name, &old, &sz, NULL, 0), 0, + "%s():%d: Unexpected mallctl failure reading %s", func, line, name); + assert_b_eq(old, expected, "%s():%d: Unexpected %s value", func, line, + name); +} + +static void +mallctl_bool_set(const char *name, bool old_expected, bool val_new, + const char *func, int line) +{ + bool old; + size_t sz; + + sz = sizeof(old); + assert_d_eq(mallctl(name, &old, &sz, &val_new, sizeof(val_new)), 0, + "%s():%d: Unexpected mallctl failure reading/writing %s", func, + line, name); + assert_b_eq(old, old_expected, "%s():%d: Unexpected %s value", func, + line, name); +} + +static void +mallctl_prof_active_get_impl(bool prof_active_old_expected, const char *func, + int line) +{ + + mallctl_bool_get("prof.active", prof_active_old_expected, func, line); +} +#define mallctl_prof_active_get(a) \ + mallctl_prof_active_get_impl(a, __func__, __LINE__) + +static void +mallctl_prof_active_set_impl(bool prof_active_old_expected, + bool prof_active_new, const char *func, int line) +{ + + 
mallctl_bool_set("prof.active", prof_active_old_expected, + prof_active_new, func, line); +} +#define mallctl_prof_active_set(a, b) \ + mallctl_prof_active_set_impl(a, b, __func__, __LINE__) + +static void +mallctl_thread_prof_active_get_impl(bool thread_prof_active_old_expected, + const char *func, int line) +{ + + mallctl_bool_get("thread.prof.active", thread_prof_active_old_expected, + func, line); +} +#define mallctl_thread_prof_active_get(a) \ + mallctl_thread_prof_active_get_impl(a, __func__, __LINE__) + +static void +mallctl_thread_prof_active_set_impl(bool thread_prof_active_old_expected, + bool thread_prof_active_new, const char *func, int line) +{ + + mallctl_bool_set("thread.prof.active", thread_prof_active_old_expected, + thread_prof_active_new, func, line); +} +#define mallctl_thread_prof_active_set(a, b) \ + mallctl_thread_prof_active_set_impl(a, b, __func__, __LINE__) + +static void +prof_sampling_probe_impl(bool expect_sample, const char *func, int line) +{ + void *p; + size_t expected_backtraces = expect_sample ? 1 : 0; + + assert_zu_eq(prof_bt_count(), 0, "%s():%d: Expected 0 backtraces", func, + line); + p = mallocx(1, 0); + assert_ptr_not_null(p, "Unexpected mallocx() failure"); + assert_zu_eq(prof_bt_count(), expected_backtraces, + "%s():%d: Unexpected backtrace count", func, line); + dallocx(p, 0); +} +#define prof_sampling_probe(a) \ + prof_sampling_probe_impl(a, __func__, __LINE__) + +TEST_BEGIN(test_prof_active) +{ + + test_skip_if(!config_prof); + + mallctl_prof_active_get(true); + mallctl_thread_prof_active_get(false); + + mallctl_prof_active_set(true, true); + mallctl_thread_prof_active_set(false, false); + /* prof.active, !thread.prof.active. */ + prof_sampling_probe(false); + + mallctl_prof_active_set(true, false); + mallctl_thread_prof_active_set(false, false); + /* !prof.active, !thread.prof.active. */ + prof_sampling_probe(false); + + mallctl_prof_active_set(false, false); + mallctl_thread_prof_active_set(false, true); + /* !prof.active, thread.prof.active. */ + prof_sampling_probe(false); + + mallctl_prof_active_set(false, true); + mallctl_thread_prof_active_set(true, true); + /* prof.active, thread.prof.active. */ + prof_sampling_probe(true); + + /* Restore settings. 
*/ + mallctl_prof_active_set(true, true); + mallctl_thread_prof_active_set(true, false); +} +TEST_END + +int +main(void) +{ + + return (test( + test_prof_active)); +} diff --git a/test/unit/prof_reset.c b/test/unit/prof_reset.c index 62a4d5a..3af1964 100644 --- a/test/unit/prof_reset.c +++ b/test/unit/prof_reset.c @@ -22,6 +22,8 @@ TEST_BEGIN(test_prof_reset_basic) size_t sz; unsigned i; + test_skip_if(!config_prof); + sz = sizeof(size_t); assert_d_eq(mallctl("opt.lg_prof_sample", &lg_prof_sample_orig, &sz, NULL, 0), 0, @@ -90,6 +92,8 @@ TEST_BEGIN(test_prof_reset_cleanup) void *p; prof_dump_header_t *prof_dump_header_orig; + test_skip_if(!config_prof); + active = true; assert_d_eq(mallctl("prof.active", NULL, NULL, &active, sizeof(active)), 0, "Unexpected mallctl failure while activating profiling"); diff --git a/test/unit/prof_thread_name.c b/test/unit/prof_thread_name.c new file mode 100644 index 0000000..7fb8038 --- /dev/null +++ b/test/unit/prof_thread_name.c @@ -0,0 +1,128 @@ +#include "test/jemalloc_test.h" + +#ifdef JEMALLOC_PROF +const char *malloc_conf = + "prof:true,prof_active:false,prof_final:false"; +#endif + +static void +mallctl_thread_name_get_impl(const char *thread_name_expected, const char *func, + int line) +{ + const char *thread_name_old; + size_t sz; + + sz = sizeof(thread_name_old); + assert_d_eq(mallctl("thread.prof.name", &thread_name_old, &sz, NULL, 0), + 0, "%s():%d: Unexpected mallctl failure reading thread.prof.name", + func, line); + assert_str_eq(thread_name_old, thread_name_expected, + "%s():%d: Unexpected thread.prof.name value", func, line); +} +#define mallctl_thread_name_get(a) \ + mallctl_thread_name_get_impl(a, __func__, __LINE__) + +static void +mallctl_thread_name_set_impl(const char *thread_name, const char *func, + int line) +{ + + assert_d_eq(mallctl("thread.prof.name", NULL, NULL, &thread_name, + sizeof(thread_name)), 0, + "%s():%d: Unexpected mallctl failure reading thread.prof.name", + func, line); + mallctl_thread_name_get_impl(thread_name, func, line); +} +#define mallctl_thread_name_set(a) \ + mallctl_thread_name_set_impl(a, __func__, __LINE__) + +TEST_BEGIN(test_prof_thread_name_validation) +{ + const char *thread_name; + + mallctl_thread_name_get(""); + mallctl_thread_name_set("hi there"); + + /* NULL input shouldn't be allowed. */ + thread_name = NULL; + assert_d_eq(mallctl("thread.prof.name", NULL, NULL, &thread_name, + sizeof(thread_name)), EFAULT, + "Unexpected mallctl result writing \"%s\" to thread.prof.name", + thread_name); + + /* '\n' shouldn't be allowed. */ + thread_name = "hi\nthere"; + assert_d_eq(mallctl("thread.prof.name", NULL, NULL, &thread_name, + sizeof(thread_name)), EFAULT, + "Unexpected mallctl result writing \"%s\" to thread.prof.name", + thread_name); + + /* Simultaneous read/write shouldn't be allowed. 
*/ + { + const char *thread_name_old; + size_t sz; + + sz = sizeof(thread_name_old); + assert_d_eq(mallctl("thread.prof.name", &thread_name_old, &sz, + &thread_name, sizeof(thread_name)), EPERM, + "Unexpected mallctl result writing \"%s\" to " + "thread.prof.name", thread_name); + } + + mallctl_thread_name_set(""); +} +TEST_END + +#define NTHREADS 4 +#define NRESET 25 +static void * +thd_start(void *varg) +{ + unsigned thd_ind = *(unsigned *)varg; + char thread_name[16] = ""; + unsigned i; + + malloc_snprintf(thread_name, sizeof(thread_name), "thread %u", thd_ind); + + mallctl_thread_name_get(""); + mallctl_thread_name_set(thread_name); + + for (i = 0; i < NRESET; i++) { + assert_d_eq(mallctl("prof.reset", NULL, NULL, NULL, 0), 0, + "Unexpected error while resetting heap profile data"); + mallctl_thread_name_get(thread_name); + } + + mallctl_thread_name_set(thread_name); + mallctl_thread_name_set(""); + + return (NULL); +} + +TEST_BEGIN(test_prof_thread_name_threaded) +{ + thd_t thds[NTHREADS]; + unsigned thd_args[NTHREADS]; + unsigned i; + + test_skip_if(!config_prof); + + for (i = 0; i < NTHREADS; i++) { + thd_args[i] = i; + thd_create(&thds[i], thd_start, (void *)&thd_args[i]); + } + for (i = 0; i < NTHREADS; i++) + thd_join(thds[i], NULL); +} +TEST_END +#undef NTHREADS +#undef NRESET + +int +main(void) +{ + + return (test( + test_prof_thread_name_validation, + test_prof_thread_name_threaded)); +} -- cgit v0.12 From b72d4abc5fb1185e4017c014d521693a99f9175b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 3 Oct 2014 23:41:53 -0700 Subject: Skip test_prof_thread_name_validation if !config_prof. --- test/unit/prof_thread_name.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/unit/prof_thread_name.c b/test/unit/prof_thread_name.c index 7fb8038..6066dba 100644 --- a/test/unit/prof_thread_name.c +++ b/test/unit/prof_thread_name.c @@ -40,6 +40,8 @@ TEST_BEGIN(test_prof_thread_name_validation) { const char *thread_name; + test_skip_if(!config_prof); + mallctl_thread_name_get(""); mallctl_thread_name_set("hi there"); -- cgit v0.12 From a4a972d9a163a57183f851535104f4e8ac78f511 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 4 Oct 2014 00:35:07 -0700 Subject: Fix install_lib target (incorrect jemalloc.pc path). --- Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.in b/Makefile.in index 52f5a9d..50f6596 100644 --- a/Makefile.in +++ b/Makefile.in @@ -101,7 +101,7 @@ DSOS := $(objroot)lib/$(LIBJEMALLOC).$(SOREV) ifneq ($(SOREV),$(SO)) DSOS += $(objroot)lib/$(LIBJEMALLOC).$(SO) endif -PC := $(srcroot)jemalloc.pc +PC := $(objroot)jemalloc.pc MAN3 := $(objroot)doc/jemalloc$(install_suffix).3 DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.html) -- cgit v0.12 From 029d44cf8b22aa7b749747bfd585887fb59e0030 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 4 Oct 2014 11:12:53 -0700 Subject: Fix tsd cleanup regressions. Fix tsd cleanup regressions that were introduced in 5460aa6f6676c7f253bfcb75c028dfd38cae8aaf (Convert all tsd variables to reside in a single tsd structure.). These regressions were twofold: 1) tsd_tryget() should never (and need never) return NULL. Rename it to tsd_fetch() and simplify all callers. 2) tsd_*_set() must only be called when tsd is in the nominal state, because cleanup happens during the nominal-->purgatory transition, and re-initialization must not happen while in the purgatory state. Add tsd_nominal() and use it as needed. 
Note that tsd_*{p,}_get() can still be used as long as no re-initialization that would require cleanup occurs. This means that e.g. the thread_allocated counter can be updated unconditionally. --- include/jemalloc/internal/private_symbols.txt | 3 +- include/jemalloc/internal/prof.h | 6 ++- include/jemalloc/internal/quarantine.h | 4 +- include/jemalloc/internal/tcache.h | 21 ++------ include/jemalloc/internal/tsd.h | 65 ++++++++++++---------- src/ctl.c | 26 ++------- src/jemalloc.c | 78 ++++++++++++--------------- src/prof.c | 29 ++++------ src/tcache.c | 3 +- src/tsd.c | 5 -- test/unit/ckh.c | 9 ++-- test/unit/tsd.c | 35 ++++++++++-- 12 files changed, 137 insertions(+), 147 deletions(-) diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 6365783..4ea9a95 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -390,12 +390,14 @@ tsd_arena_set tsd_boot tsd_cleanup tsd_cleanup_wrapper +tsd_fetch tsd_get tsd_get_wrapper tsd_initialized tsd_init_check_recursion tsd_init_finish tsd_init_head +tsd_nominal tsd_quarantine_get tsd_quarantine_set tsd_set @@ -411,7 +413,6 @@ tsd_thread_allocated_get tsd_thread_allocated_set tsd_thread_deallocated_get tsd_thread_deallocated_set -tsd_tryget u2rz valgrind_freelike_block valgrind_make_mem_defined diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 3d3f8f4..0ec7c18 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -331,8 +331,10 @@ prof_tdata_get(tsd_t *tsd, bool create) tdata = tsd_prof_tdata_get(tsd); if (create) { if (unlikely(tdata == NULL)) { - tdata = prof_tdata_init(tsd); - tsd_prof_tdata_set(tsd, tdata); + if (tsd_nominal(tsd)) { + tdata = prof_tdata_init(tsd); + tsd_prof_tdata_set(tsd, tdata); + } } else if (unlikely(tdata->expired)) { tdata = prof_tdata_reinit(tsd, tdata); tsd_prof_tdata_set(tsd, tdata); diff --git a/include/jemalloc/internal/quarantine.h b/include/jemalloc/internal/quarantine.h index 3a75598..4e9c710 100644 --- a/include/jemalloc/internal/quarantine.h +++ b/include/jemalloc/internal/quarantine.h @@ -49,8 +49,8 @@ quarantine_alloc_hook(void) assert(config_fill && opt_quarantine); - tsd = tsd_tryget(); - if (tsd != NULL && tsd_quarantine_get(tsd) == NULL) + tsd = tsd_fetch(); + if (tsd_quarantine_get(tsd) == NULL && tsd_nominal(tsd)) tsd_quarantine_set(tsd, quarantine_init(tsd, LG_MAXOBJS_INIT)); } #endif diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index bc0b41c..1a70972 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -142,9 +142,8 @@ tcache_flush(void) cassert(config_tcache); - tsd = tsd_tryget(); - if (tsd != NULL) - tcache_cleanup(tsd); + tsd = tsd_fetch(); + tcache_cleanup(tsd); } JEMALLOC_INLINE bool @@ -155,9 +154,7 @@ tcache_enabled_get(void) cassert(config_tcache); - tsd = tsd_tryget(); - if (tsd == NULL) - return (false); + tsd = tsd_fetch(); tcache_enabled = tsd_tcache_enabled_get(tsd); if (tcache_enabled == tcache_enabled_default) { tcache_enabled = (tcache_enabled_t)opt_tcache; @@ -175,9 +172,7 @@ tcache_enabled_set(bool enabled) cassert(config_tcache); - tsd = tsd_tryget(); - if (tsd == NULL) - return; + tsd = tsd_fetch(); tcache_enabled = (tcache_enabled_t)enabled; tsd_tcache_enabled_set(tsd, tcache_enabled); @@ -195,17 +190,11 @@ tcache_get(tsd_t *tsd, bool create) return (NULL); if (config_lazy_lock && !isthreaded) return (NULL); - /* - * If 
create is true, the caller has already assured that tsd is - * non-NULL. - */ - if (!create && unlikely(tsd == NULL)) - return (NULL); tcache = tsd_tcache_get(tsd); if (!create) return (tcache); - if (unlikely(tcache == NULL)) { + if (unlikely(tcache == NULL) && tsd_nominal(tsd)) { tcache = tcache_get_hard(tsd); tsd_tcache_set(tsd, tcache); } diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 44952ee..2545039 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -49,16 +49,19 @@ typedef enum { * Note that all of the functions deal in terms of (a_type *) rather than * (a_type) so that it is possible to support non-pointer types (unlike * pthreads TSD). example_tsd_cleanup() is passed an (a_type *) pointer that is - * cast to (void *). This means that the cleanup function needs to cast *and* - * dereference the function argument, e.g.: + * cast to (void *). This means that the cleanup function needs to cast the + * function argument to (a_type *), then dereference the resulting pointer to + * access fields, e.g. * - * bool + * void * example_tsd_cleanup(void *arg) * { - * example_t *example = *(example_t **)arg; + * example_t *example = (example_t *)arg; * + * example->x = 42; * [...] - * return ([want the cleanup function to be called again]); + * if ([want the cleanup function to be called again]) + * example_tsd_set(example); * } * * If example_tsd_set() is called within example_tsd_cleanup(), it will be @@ -468,7 +471,8 @@ void tsd_cleanup(void *arg); #ifndef JEMALLOC_ENABLE_INLINE malloc_tsd_protos(JEMALLOC_ATTR(unused), , tsd_t) -tsd_t *tsd_tryget(void); +tsd_t *tsd_fetch(void); +bool tsd_nominal(tsd_t *tsd); #define O(n, t) \ t *tsd_##n##p_get(tsd_t *tsd); \ t tsd_##n##_get(tsd_t *tsd); \ @@ -481,50 +485,53 @@ MALLOC_TSD malloc_tsd_externs(, tsd_t) malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, , tsd_t, tsd_initializer, tsd_cleanup) -JEMALLOC_INLINE tsd_t * -tsd_tryget(void) +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch(void) { - tsd_t *tsd; + tsd_t *tsd = tsd_get(); - tsd = tsd_get(); - if (unlikely(tsd == NULL)) - return (NULL); - - if (likely(tsd->state == tsd_state_nominal)) - return (tsd); - else if (tsd->state == tsd_state_uninitialized) { - tsd->state = tsd_state_nominal; - tsd_set(tsd); - return (tsd); - } else if (tsd->state == tsd_state_purgatory) { - tsd->state = tsd_state_reincarnated; - tsd_set(tsd); - return (NULL); - } else { - assert(tsd->state == tsd_state_reincarnated); - return (NULL); + if (unlikely(tsd->state != tsd_state_nominal)) { + if (tsd->state == tsd_state_uninitialized) { + tsd->state = tsd_state_nominal; + /* Trigger cleanup handler registration. 
*/ + tsd_set(tsd); + } else if (tsd->state == tsd_state_purgatory) { + tsd->state = tsd_state_reincarnated; + tsd_set(tsd); + } else + assert(tsd->state == tsd_state_reincarnated); } + + return (tsd); +} + +JEMALLOC_INLINE bool +tsd_nominal(tsd_t *tsd) +{ + + return (tsd->state == tsd_state_nominal); } #define O(n, t) \ -JEMALLOC_INLINE t * \ +JEMALLOC_ALWAYS_INLINE t * \ tsd_##n##p_get(tsd_t *tsd) \ { \ \ return (&tsd->n); \ } \ \ -JEMALLOC_INLINE t \ +JEMALLOC_ALWAYS_INLINE t \ tsd_##n##_get(tsd_t *tsd) \ { \ \ return (*tsd_##n##p_get(tsd)); \ } \ \ -JEMALLOC_INLINE void \ +JEMALLOC_ALWAYS_INLINE void \ tsd_##n##_set(tsd_t *tsd, t n) \ { \ \ + assert(tsd->state == tsd_state_nominal); \ tsd->n = n; \ } MALLOC_TSD diff --git a/src/ctl.c b/src/ctl.c index 8f9faa5..309f1f6 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -571,9 +571,7 @@ ctl_grow(void) ctl_arena_stats_t *astats; arena_t **tarenas; - tsd = tsd_tryget(); - if (tsd == NULL) - return (true); + tsd = tsd_fetch(); /* Allocate extended arena stats and arenas arrays. */ astats = (ctl_arena_stats_t *)imalloc(tsd, (ctl_stats.narenas + 2) * @@ -1132,11 +1130,7 @@ n##_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, \ if (!(c)) \ return (ENOENT); \ READONLY(); \ - tsd = tsd_tryget(); \ - if (tsd == NULL) { \ - ret = EAGAIN; \ - goto label_return; \ - } \ + tsd = tsd_fetch(); \ oldval = (m(tsd)); \ READ(oldval, t); \ \ @@ -1239,9 +1233,7 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, tsd_t *tsd; unsigned newind, oldind; - tsd = tsd_tryget(); - if (tsd == NULL) - return (EAGAIN); + tsd = tsd_fetch(); malloc_mutex_lock(&ctl_mtx); newind = oldind = choose_arena(tsd, NULL)->ind; @@ -1359,11 +1351,7 @@ thread_prof_name_ctl(const size_t *mib, size_t miblen, void *oldp, goto label_return; } - tsd = tsd_tryget(); - if (tsd == NULL) { - ret = EAGAIN; - goto label_return; - } + tsd = tsd_fetch(); if ((ret = prof_thread_name_set(tsd, *(const char **)newp)) != 0) @@ -1763,11 +1751,7 @@ prof_reset_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, if (lg_sample >= (sizeof(uint64_t) << 3)) lg_sample = (sizeof(uint64_t) << 3) - 1; - tsd = tsd_tryget(); - if (tsd == NULL) { - ret = EAGAIN; - goto label_return; - } + tsd = tsd_fetch(); prof_reset(tsd, lg_sample); diff --git a/src/jemalloc.c b/src/jemalloc.c index 2e96705..4a3d968 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -194,7 +194,8 @@ choose_arena_hard(tsd_t *tsd) malloc_mutex_unlock(&arenas_lock); } - tsd_arena_set(tsd, ret); + if (tsd_nominal(tsd)) + tsd_arena_set(tsd, ret); return (ret); } @@ -908,8 +909,9 @@ JEMALLOC_ALWAYS_INLINE_C void * imalloc_body(size_t size, tsd_t **tsd, size_t *usize) { - if (unlikely(malloc_init()) || unlikely((*tsd = tsd_tryget()) == NULL)) + if (unlikely(malloc_init())) return (NULL); + *tsd = tsd_fetch(); if (config_prof && opt_prof) { *usize = s2u(size); @@ -1000,10 +1002,11 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) assert(min_alignment != 0); - if (unlikely(malloc_init()) || unlikely((tsd = tsd_tryget()) == NULL)) { + if (unlikely(malloc_init())) { result = NULL; goto label_oom; } else { + tsd = tsd_fetch(); if (size == 0) size = 1; @@ -1124,11 +1127,12 @@ je_calloc(size_t num, size_t size) size_t num_size; size_t usize JEMALLOC_CC_SILENCE_INIT(0); - if (unlikely(malloc_init()) || unlikely((tsd = tsd_tryget()) == NULL)) { + if (unlikely(malloc_init())) { num_size = 0; ret = NULL; goto label_return; } + tsd = tsd_fetch(); num_size = num * size; if 
(unlikely(num_size == 0)) { @@ -1228,7 +1232,7 @@ ifree(tsd_t *tsd, void *ptr, bool try_tcache) prof_free(tsd, ptr, usize); } else if (config_stats || config_valgrind) usize = isalloc(ptr, config_prof); - if (config_stats && likely(tsd != NULL)) + if (config_stats) *tsd_thread_deallocatedp_get(tsd) += usize; if (config_valgrind && unlikely(in_valgrind)) rzsize = p2rz(ptr); @@ -1246,7 +1250,7 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, bool try_tcache) if (config_prof && opt_prof) prof_free(tsd, ptr, usize); - if (config_stats && likely(tsd != NULL)) + if (config_stats) *tsd_thread_deallocatedp_get(tsd) += usize; if (config_valgrind && unlikely(in_valgrind)) rzsize = p2rz(ptr); @@ -1267,7 +1271,7 @@ je_realloc(void *ptr, size_t size) if (ptr != NULL) { /* realloc(ptr, 0) is equivalent to free(ptr). */ UTRACE(ptr, 0, 0); - tsd = tsd_tryget(); + tsd = tsd_fetch(); ifree(tsd, ptr, true); return (NULL); } @@ -1277,27 +1281,23 @@ je_realloc(void *ptr, size_t size) if (likely(ptr != NULL)) { assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); + tsd = tsd_fetch(); - if ((tsd = tsd_tryget()) != NULL) { - if ((config_prof && opt_prof) || config_stats || - (config_valgrind && unlikely(in_valgrind))) - old_usize = isalloc(ptr, config_prof); - if (config_valgrind && unlikely(in_valgrind)) { - old_rzsize = config_prof ? p2rz(ptr) : - u2rz(old_usize); - } + if ((config_prof && opt_prof) || config_stats || + (config_valgrind && unlikely(in_valgrind))) + old_usize = isalloc(ptr, config_prof); + if (config_valgrind && unlikely(in_valgrind)) + old_rzsize = config_prof ? p2rz(ptr) : u2rz(old_usize); - if (config_prof && opt_prof) { + if (config_prof && opt_prof) { + usize = s2u(size); + ret = irealloc_prof(tsd, ptr, old_usize, usize); + } else { + if (config_stats || (config_valgrind && + unlikely(in_valgrind))) usize = s2u(size); - ret = irealloc_prof(tsd, ptr, old_usize, usize); - } else { - if (config_stats || (config_valgrind && - unlikely(in_valgrind))) - usize = s2u(size); - ret = iralloc(tsd, ptr, size, 0, false); - } - } else - ret = NULL; + ret = iralloc(tsd, ptr, size, 0, false); + } } else { /* realloc(NULL, size) is equivalent to malloc(size). 
*/ ret = imalloc_body(size, &tsd, &usize); @@ -1313,10 +1313,8 @@ je_realloc(void *ptr, size_t size) } if (config_stats && likely(ret != NULL)) { assert(usize == isalloc(ret, config_prof)); - if (tsd != NULL) { - *tsd_thread_allocatedp_get(tsd) += usize; - *tsd_thread_deallocatedp_get(tsd) += old_usize; - } + *tsd_thread_allocatedp_get(tsd) += usize; + *tsd_thread_deallocatedp_get(tsd) += old_usize; } UTRACE(ptr, size, ret); JEMALLOC_VALGRIND_REALLOC(true, ret, usize, true, ptr, old_usize, @@ -1330,7 +1328,7 @@ je_free(void *ptr) UTRACE(ptr, 0, 0); if (likely(ptr != NULL)) - ifree(tsd_tryget(), ptr, true); + ifree(tsd_fetch(), ptr, true); } /* @@ -1543,8 +1541,9 @@ je_mallocx(size_t size, int flags) assert(size != 0); - if (unlikely(malloc_init()) || unlikely((tsd = tsd_tryget()) == NULL)) + if (unlikely(malloc_init())) goto label_oom; + tsd = tsd_fetch(); if (config_prof && opt_prof) p = imallocx_prof(tsd, size, flags, &usize); @@ -1554,10 +1553,8 @@ je_mallocx(size_t size, int flags) goto label_oom; if (config_stats) { - tsd_t *tsd = tsd_tryget(); assert(usize == isalloc(p, config_prof)); - if (tsd != NULL) - *tsd_thread_allocatedp_get(tsd) += usize; + *tsd_thread_allocatedp_get(tsd) += usize; } UTRACE(0, size, p); JEMALLOC_VALGRIND_MALLOC(true, p, usize, MALLOCX_ZERO_GET(flags)); @@ -1649,9 +1646,7 @@ je_rallocx(void *ptr, size_t size, int flags) assert(size != 0); assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); - - if (unlikely((tsd = tsd_tryget()) == NULL)) - goto label_oom; + tsd = tsd_fetch(); if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); @@ -1794,6 +1789,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) assert(SIZE_T_MAX - size >= extra); assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); + tsd = tsd_fetch(); if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); @@ -1802,10 +1798,6 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) arena = NULL; old_usize = isalloc(ptr, config_prof); - if (unlikely((tsd = tsd_tryget()) == NULL)) { - usize = old_usize; - goto label_not_resized; - } if (config_valgrind && unlikely(in_valgrind)) old_rzsize = u2rz(old_usize); @@ -1865,7 +1857,7 @@ je_dallocx(void *ptr, int flags) try_tcache = true; UTRACE(ptr, 0, 0); - ifree(tsd_tryget(), ptr, try_tcache); + ifree(tsd_fetch(), ptr, try_tcache); } JEMALLOC_ALWAYS_INLINE_C size_t @@ -1901,7 +1893,7 @@ je_sdallocx(void *ptr, size_t size, int flags) try_tcache = true; UTRACE(ptr, 0, 0); - isfree(tsd_tryget(), ptr, usize, try_tcache); + isfree(tsd_fetch(), ptr, usize, try_tcache); } size_t diff --git a/src/prof.c b/src/prof.c index 5b97998..262f0ba 100644 --- a/src/prof.c +++ b/src/prof.c @@ -850,8 +850,7 @@ prof_bt_count(void) tsd_t *tsd; prof_tdata_t *tdata; - if ((tsd = tsd_tryget()) == NULL) - return (0); + tsd = tsd_fetch(); tdata = prof_tdata_get(tsd, false); if (tdata == NULL) return (0); @@ -1475,8 +1474,7 @@ prof_fdump(void) if (!prof_booted) return; - if ((tsd = tsd_tryget()) == NULL) - return; + tsd = tsd_fetch(); if (opt_prof_final && opt_prof_prefix[0] != '\0') { malloc_mutex_lock(&prof_dump_seq_mtx); @@ -1497,8 +1495,7 @@ prof_idump(void) if (!prof_booted) return; - if ((tsd = tsd_tryget()) == NULL) - return; + tsd = tsd_fetch(); tdata = prof_tdata_get(tsd, false); if (tdata == NULL) return; @@ -1526,8 +1523,7 @@ prof_mdump(const char *filename) if (!opt_prof || !prof_booted) return (true); - if ((tsd = tsd_tryget()) == NULL) - return 
(true); + tsd = tsd_fetch(); if (filename == NULL) { /* No filename specified, so automatically generate one. */ @@ -1553,8 +1549,7 @@ prof_gdump(void) if (!prof_booted) return; - if ((tsd = tsd_tryget()) == NULL) - return; + tsd = tsd_fetch(); tdata = prof_tdata_get(tsd, false); if (tdata == NULL) return; @@ -1677,6 +1672,7 @@ prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata) { assert(prof_tdata_should_destroy(tdata)); + assert(tsd_prof_tdata_get(tsd) != tdata); tdata_tree_remove(&tdatas, tdata); @@ -1704,6 +1700,7 @@ prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata) if (tdata->attached) { tdata->attached = false; destroy_tdata = prof_tdata_should_destroy(tdata); + tsd_prof_tdata_set(tsd, NULL); } else destroy_tdata = false; malloc_mutex_unlock(tdata->lock); @@ -1819,8 +1816,7 @@ prof_thread_name_get(void) tsd_t *tsd; prof_tdata_t *tdata; - if ((tsd = tsd_tryget()) == NULL) - return (""); + tsd = tsd_fetch(); tdata = prof_tdata_get(tsd, true); if (tdata == NULL) return (""); @@ -1886,8 +1882,7 @@ prof_thread_active_get(void) tsd_t *tsd; prof_tdata_t *tdata; - if ((tsd = tsd_tryget()) == NULL) - return (false); + tsd = tsd_fetch(); tdata = prof_tdata_get(tsd, true); if (tdata == NULL) return (false); @@ -1900,8 +1895,7 @@ prof_thread_active_set(bool active) tsd_t *tsd; prof_tdata_t *tdata; - if ((tsd = tsd_tryget()) == NULL) - return (true); + tsd = tsd_fetch(); tdata = prof_tdata_get(tsd, true); if (tdata == NULL) return (true); @@ -1988,8 +1982,7 @@ prof_boot2(void) if (malloc_mutex_init(&prof_thread_active_init_mtx)) return (true); - if ((tsd = tsd_tryget()) == NULL) - return (true); + tsd = tsd_fetch(); if (ckh_new(tsd, &bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) return (true); diff --git a/src/tcache.c b/src/tcache.c index 6f3408c..07167b6 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -263,7 +263,8 @@ tcache_get_hard(tsd_t *tsd) { if (!tcache_enabled_get()) { - tcache_enabled_set(false); /* Memoize. */ + if (tsd_nominal(tsd)) + tcache_enabled_set(false); /* Memoize. */ return (NULL); } return (tcache_create(choose_arena(tsd, NULL))); diff --git a/src/tsd.c b/src/tsd.c index 27a70ee..cbc64e4 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -74,11 +74,6 @@ tsd_cleanup(void *arg) { tsd_t *tsd = (tsd_t *)arg; - if (tsd == NULL) { - /* OOM during re-initialization. 
*/ - return; - } - switch (tsd->state) { case tsd_state_nominal: #define O(n, t) \ diff --git a/test/unit/ckh.c b/test/unit/ckh.c index 03b4f71..c212648 100644 --- a/test/unit/ckh.c +++ b/test/unit/ckh.c @@ -5,8 +5,7 @@ TEST_BEGIN(test_new_delete) tsd_t *tsd; ckh_t ckh; - tsd = tsd_tryget(); - assert_ptr_not_null(tsd, "Unexpected tsd failure"); + tsd = tsd_fetch(); assert_false(ckh_new(tsd, &ckh, 2, ckh_string_hash, ckh_string_keycomp), "Unexpected ckh_new() error"); @@ -31,8 +30,7 @@ TEST_BEGIN(test_count_insert_search_remove) const char *missing = "A string not in the hash table."; size_t i; - tsd = tsd_tryget(); - assert_ptr_not_null(tsd, "Unexpected tsd failure"); + tsd = tsd_fetch(); assert_false(ckh_new(tsd, &ckh, 2, ckh_string_hash, ckh_string_keycomp), "Unexpected ckh_new() error"); @@ -116,8 +114,7 @@ TEST_BEGIN(test_insert_iter_remove) void *q, *r; size_t i; - tsd = tsd_tryget(); - assert_ptr_not_null(tsd, "Unexpected tsd failure"); + tsd = tsd_fetch(); assert_false(ckh_new(tsd, &ckh, 2, ckh_pointer_hash, ckh_pointer_keycomp), "Unexpected ckh_new() error"); diff --git a/test/unit/tsd.c b/test/unit/tsd.c index 391a780..eb1c597 100644 --- a/test/unit/tsd.c +++ b/test/unit/tsd.c @@ -6,17 +6,46 @@ typedef unsigned int data_t; static bool data_cleanup_executed; +malloc_tsd_protos(, data_, data_t) + void data_cleanup(void *arg) { data_t *data = (data_t *)arg; - assert_x_eq(*data, THREAD_DATA, - "Argument passed into cleanup function should match tsd value"); + if (!data_cleanup_executed) { + assert_x_eq(*data, THREAD_DATA, + "Argument passed into cleanup function should match tsd " + "value"); + } data_cleanup_executed = true; + + /* + * Allocate during cleanup for two rounds, in order to assure that + * jemalloc's internal tsd reinitialization happens. + */ + switch (*data) { + case THREAD_DATA: + *data = 1; + data_tsd_set(data); + break; + case 1: + *data = 2; + data_tsd_set(data); + break; + case 2: + return; + default: + not_reached(); + } + + { + void *p = mallocx(1, 0); + assert_ptr_not_null(p, "Unexpeced mallocx() failure"); + dallocx(p, 0); + } } -malloc_tsd_protos(, data_, data_t) malloc_tsd_externs(data_, data_t) #define DATA_INIT 0x12345678 malloc_tsd_data(, data_, data_t, DATA_INIT) -- cgit v0.12 From 29146e9d15250be0b05cb92550a61a6511e58f79 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 4 Oct 2014 11:23:13 -0700 Subject: Don't force TLS on behalf of heap profiling. Revert 6716aa83526b3f866d73a033970cc920bc61c13f (Force use of TLS if heap profiling is enabled.). No existing tests indicate that this is necessary, nor does code inspection uncover any potential issues. Most likely the original commit covered up a bug related to tsd-internal allocation that has since been fixed. --- configure.ac | 5 ----- 1 file changed, 5 deletions(-) diff --git a/configure.ac b/configure.ac index 1ee2ed8..e4afe88 100644 --- a/configure.ac +++ b/configure.ac @@ -793,11 +793,6 @@ fi AC_MSG_CHECKING([configured backtracing method]) AC_MSG_RESULT([$backtrace_method]) if test "x$enable_prof" = "x1" ; then - if test "x${force_tls}" = "x0" ; then - AC_MSG_ERROR([Heap profiling requires TLS]); - fi - force_tls="1" - if test "x$abi" != "xpecoff"; then dnl Heap profiling uses the log(3) function. LIBS="$LIBS -lm" -- cgit v0.12 From 34e85b4182d5ae029b558aae3da25fff7c3efe12 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 4 Oct 2014 11:26:05 -0700 Subject: Make prof-related inline functions always-inline. 
--- include/jemalloc/internal/prof.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 0ec7c18..c801471 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -308,7 +308,7 @@ void prof_free(tsd_t *tsd, const void *ptr, size_t usize); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) -JEMALLOC_INLINE bool +JEMALLOC_ALWAYS_INLINE bool prof_active_get_unlocked(void) { @@ -321,7 +321,7 @@ prof_active_get_unlocked(void) return (prof_active); } -JEMALLOC_INLINE prof_tdata_t * +JEMALLOC_ALWAYS_INLINE prof_tdata_t * prof_tdata_get(tsd_t *tsd, bool create) { prof_tdata_t *tdata; @@ -345,7 +345,7 @@ prof_tdata_get(tsd_t *tsd, bool create) return (tdata); } -JEMALLOC_INLINE prof_tctx_t * +JEMALLOC_ALWAYS_INLINE prof_tctx_t * prof_tctx_get(const void *ptr) { prof_tctx_t *ret; @@ -364,7 +364,7 @@ prof_tctx_get(const void *ptr) return (ret); } -JEMALLOC_INLINE void +JEMALLOC_ALWAYS_INLINE void prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { arena_chunk_t *chunk; @@ -380,7 +380,7 @@ prof_tctx_set(const void *ptr, prof_tctx_t *tctx) huge_prof_tctx_set(ptr, tctx); } -JEMALLOC_INLINE bool +JEMALLOC_ALWAYS_INLINE bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, prof_tdata_t **tdata_out) { @@ -410,7 +410,7 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, } } -JEMALLOC_INLINE prof_tctx_t * +JEMALLOC_ALWAYS_INLINE prof_tctx_t * prof_alloc_prep(tsd_t *tsd, size_t usize, bool update) { prof_tctx_t *ret; @@ -431,7 +431,7 @@ prof_alloc_prep(tsd_t *tsd, size_t usize, bool update) return (ret); } -JEMALLOC_INLINE void +JEMALLOC_ALWAYS_INLINE void prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) { @@ -445,7 +445,7 @@ prof_malloc(const void *ptr, size_t usize, prof_tctx_t *tctx) prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); } -JEMALLOC_INLINE void +JEMALLOC_ALWAYS_INLINE void prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx, bool updated, size_t old_usize, prof_tctx_t *old_tctx) { @@ -475,7 +475,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx, prof_tctx_set(ptr, (prof_tctx_t *)(uintptr_t)1U); } -JEMALLOC_INLINE void +JEMALLOC_ALWAYS_INLINE void prof_free(tsd_t *tsd, const void *ptr, size_t usize) { prof_tctx_t *tctx = prof_tctx_get(ptr); -- cgit v0.12 From 0800afd03f6f4bc2d722bffedb3398d8ac762c5f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 4 Oct 2014 14:59:17 -0700 Subject: Silence a compiler warning. --- src/jemalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 4a3d968..3490ecd 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1262,7 +1262,7 @@ void * je_realloc(void *ptr, size_t size) { void *ret; - tsd_t *tsd; + tsd_t *tsd JEMALLOC_CC_SILENCE_INIT(NULL); size_t usize JEMALLOC_CC_SILENCE_INIT(0); size_t old_usize = 0; UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); -- cgit v0.12 From 47395a1b4c0793f676b89a763daaed1cbb510529 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 4 Oct 2014 14:59:38 -0700 Subject: Avoid purging in microbench when lazy-lock is enabled. 
--- test/stress/microbench.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/stress/microbench.c b/test/stress/microbench.c index a8267c3..980eca4 100644 --- a/test/stress/microbench.c +++ b/test/stress/microbench.c @@ -19,6 +19,13 @@ compare_funcs(uint64_t nwarmup, uint64_t niter, const char *name_a, { timedelta_t timer_a, timer_b; char ratio_buf[6]; + void *p; + + p = mallocx(1, 0); + if (p == NULL) { + test_fail("Unexpected mallocx() failure"); + return; + } time_func(&timer_a, nwarmup, niter, func_a); time_func(&timer_b, nwarmup, niter, func_b); @@ -28,6 +35,8 @@ compare_funcs(uint64_t nwarmup, uint64_t niter, const char *name_a, "%s=%"PRIu64"us, ratio=1:%s\n", niter, name_a, timer_usec(&timer_a), name_b, timer_usec(&timer_b), ratio_buf); + + dallocx(p, 0); } static void -- cgit v0.12 From 16854ebeb77c9403ebd1b85fdd46ee80bb3f3e9d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 4 Oct 2014 15:00:51 -0700 Subject: Don't disable tcache for lazy-lock. Don't disable tcache when lazy-lock is configured. There already exists a mechanism to disable tcache, but doing so automatically due to lazy-lock causes surprising performance behavior. --- include/jemalloc/internal/tcache.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 1a70972..1b1d8d9 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -188,8 +188,6 @@ tcache_get(tsd_t *tsd, bool create) if (!config_tcache) return (NULL); - if (config_lazy_lock && !isthreaded) - return (NULL); tcache = tsd_tcache_get(tsd); if (!create) -- cgit v0.12 From f04a0bef99e67e11b687a661d6f04e1d7e3bde1f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 4 Oct 2014 15:03:49 -0700 Subject: Fix prof regressions. Fix prof regressions related to tdata (main per thread profiling data structure) destruction: - Deadlock. The fix for this was intended to be part of 20c31deaae38ed9aa4fe169ed65e0c45cd542955 (Test prof.reset mallctl and fix numerous discovered bugs.) but the fix was left incomplete. - Destruction race. Detaching tdata just prior to destruction without holding the tdatas lock made it possible for another thread to destroy the tdata out from under the thread that was on its way to doing so. 
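The destruction race is easiest to see in reduced form: if a thread clears the attached flag and only then decides, outside any lock, whether to destroy, a second thread can observe a detached, unreferenced tdata and destroy it first. The sketch below is illustrative only, with hypothetical names rather than jemalloc's actual structures (the real fix is in the diff that follows); the point is that the destroy decision is made under the per-object lock while still attached, and the attached flag is cleared only when the deciding thread will not destroy.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Hypothetical stand-in for a per-thread profiling record. */
    typedef struct {
            pthread_mutex_t lock;
            bool            attached; /* Still attached to its owning thread? */
            size_t          nlive;    /* Outstanding state keeping it alive. */
    } tdata_sketch_t;

    /* Returns true iff the caller must destroy td after detaching. */
    static bool
    tdata_sketch_detach(tdata_sketch_t *td)
    {
            bool destroy;

            pthread_mutex_lock(&td->lock);
            /* Decide destruction under the lock, while still attached. */
            destroy = (td->nlive == 0);
            if (!destroy) {
                    /*
                     * Clear attached only when not destroying; otherwise
                     * another thread could see an unreferenced, detached
                     * object and race this thread to destroy it.
                     */
                    td->attached = false;
            }
            pthread_mutex_unlock(&td->lock);
            return (destroy);
    }
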
--- src/prof.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/prof.c b/src/prof.c index 262f0ba..a6cea92 100644 --- a/src/prof.c +++ b/src/prof.c @@ -116,8 +116,10 @@ static bool prof_booted = false; static bool prof_tctx_should_destroy(prof_tctx_t *tctx); static void prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx); -static bool prof_tdata_should_destroy(prof_tdata_t *tdata); -static void prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata); +static bool prof_tdata_should_destroy(prof_tdata_t *tdata, + bool even_if_attached); +static void prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata, + bool even_if_attached); static char *prof_thread_name_alloc(tsd_t *tsd, const char *thread_name); /******************************************************************************/ @@ -616,7 +618,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) assert(tctx->cnts.accumbytes == 0); ckh_remove(tsd, &tdata->bt2tctx, &gctx->bt, NULL, NULL); - destroy_tdata = prof_tdata_should_destroy(tdata); + destroy_tdata = prof_tdata_should_destroy(tdata, false); malloc_mutex_unlock(tdata->lock); malloc_mutex_lock(gctx->lock); @@ -644,7 +646,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) prof_gctx_try_destroy(tsd, gctx, tdata); if (destroy_tdata) - prof_tdata_destroy(tsd, tdata); + prof_tdata_destroy(tsd, tdata, false); idalloc(tsd, tctx); } @@ -1656,10 +1658,10 @@ prof_tdata_init(tsd_t *tsd) /* tdata->lock must be held. */ static bool -prof_tdata_should_destroy(prof_tdata_t *tdata) +prof_tdata_should_destroy(prof_tdata_t *tdata, bool even_if_attached) { - if (tdata->attached) + if (tdata->attached && !even_if_attached) return (false); if (ckh_count(&tdata->bt2tctx) != 0) return (false); @@ -1668,10 +1670,11 @@ prof_tdata_should_destroy(prof_tdata_t *tdata) /* tdatas_mtx must be held. */ static void -prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata) +prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, + bool even_if_attached) { - assert(prof_tdata_should_destroy(tdata)); + assert(prof_tdata_should_destroy(tdata, even_if_attached)); assert(tsd_prof_tdata_get(tsd) != tdata); tdata_tree_remove(&tdatas, tdata); @@ -1683,11 +1686,11 @@ prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata) } static void -prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata) +prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached) { malloc_mutex_lock(&tdatas_mtx); - prof_tdata_destroy_locked(tsd, tdata); + prof_tdata_destroy_locked(tsd, tdata, even_if_attached); malloc_mutex_unlock(&tdatas_mtx); } @@ -1698,14 +1701,19 @@ prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata) malloc_mutex_lock(tdata->lock); if (tdata->attached) { - tdata->attached = false; - destroy_tdata = prof_tdata_should_destroy(tdata); + destroy_tdata = prof_tdata_should_destroy(tdata, true); + /* + * Only detach if !destroy_tdata, because detaching would allow + * another thread to win the race to destroy tdata. + */ + if (!destroy_tdata) + tdata->attached = false; tsd_prof_tdata_set(tsd, NULL); } else destroy_tdata = false; malloc_mutex_unlock(tdata->lock); if (destroy_tdata) - prof_tdata_destroy(tsd, tdata); + prof_tdata_destroy(tsd, tdata, true); } prof_tdata_t * @@ -1731,7 +1739,7 @@ prof_tdata_expire(prof_tdata_t *tdata) if (!tdata->expired) { tdata->expired = true; destroy_tdata = tdata->attached ? 
false : - prof_tdata_should_destroy(tdata); + prof_tdata_should_destroy(tdata, false); } else destroy_tdata = false; malloc_mutex_unlock(tdata->lock); @@ -1764,8 +1772,7 @@ prof_reset(tsd_t *tsd, size_t lg_sample) prof_tdata_reset_iter, NULL); if (to_destroy != NULL) { next = tdata_tree_next(&tdatas, to_destroy); - tdata_tree_remove(&tdatas, to_destroy); - prof_tdata_destroy(tsd, to_destroy); + prof_tdata_destroy_locked(tsd, to_destroy, false); } else next = NULL; } while (next != NULL); -- cgit v0.12 From e9a3fa2e091a48df272e6a7d5d3e92b1a12c489b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 5 Oct 2014 12:05:37 -0700 Subject: Add missing header includes in jemalloc/jemalloc.h . Add stdlib.h, stdbool.h, and stdint.h to jemalloc/jemalloc.h so that applications only have to #include . This resolves #132. --- doc/jemalloc.xml.in | 3 +-- include/jemalloc/jemalloc_macros.h.in | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 6abb50b..fcbb472 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -57,8 +57,7 @@ SYNOPSIS - #include <stdlib.h> -#include <jemalloc/jemalloc.h> + #include <jemalloc/jemalloc.h> Standard API diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 1530f9c..99f1261 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -1,3 +1,6 @@ +#include +#include +#include #include #include -- cgit v0.12 From f11a6776c78a09059f8418b718c996a065b33fca Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 5 Oct 2014 13:05:10 -0700 Subject: Fix OOM-related regression in arena_tcache_fill_small(). Fix an OOM-related regression in arena_tcache_fill_small() that caused cache corruption that would almost certainly expose the application to undefined behavior, usually in the form of an allocation request returning an already-allocated region, or somewhat less likely, a freed region that had already been returned to the arena, thus making it available to the arena for any purpose. This regression was introduced by 9c43c13a35220c10d97a886616899189daceb359 (Reverse tcache fill order.), and was present in all releases from 2.2.0 through 3.6.0. This resolves #98. --- src/arena.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index 79fea72..c223946 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1330,8 +1330,19 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind, ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ptr = arena_bin_malloc_hard(arena, bin); - if (ptr == NULL) + if (ptr == NULL) { + /* + * OOM. tbin->avail isn't yet filled down to its first + * element, so the successful allocations (if any) must + * be moved to the base of tbin->avail before bailing + * out. + */ + if (i > 0) { + memmove(tbin->avail, &tbin->avail[nfill - i], + i * sizeof(void *)); + } break; + } if (config_fill && unlikely(opt_junk)) { arena_alloc_junk_small(ptr, &arena_bin_info[binind], true); -- cgit v0.12 From a95018ee819abf897562d9d1f3bc31d4dd725a8d Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sat, 4 Oct 2014 01:39:32 -0400 Subject: Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. 
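In outline, the approach is: take the address immediately past the end of the existing huge allocation, ask the chunk allocator for exactly that range, and treat the allocation as grown in place only if the request succeeds at that address. The following is a minimal sketch of the idea under assumed helpers: alloc_at() is hypothetical and is faked here with a plain mmap() probe rather than jemalloc's chunk recycling; the actual implementation is huge_ralloc_no_move_expand() in the diff below.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <sys/mman.h>

    /*
     * Stand-in for a fixed-address request to the chunk allocator: returns
     * addr on success, NULL otherwise.  This version just probes with mmap()
     * and backs out if the kernel placed the mapping elsewhere.
     */
    static void *
    alloc_at(void *addr, size_t size)
    {
            void *p = mmap(addr, size, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return (NULL);
            if (p != addr) {
                    munmap(p, size);
                    return (NULL);
            }
            return (p);
    }

    /* Grow [ptr, ptr+oldsize) to newsize without moving it, if possible. */
    static bool
    grow_in_place(void *ptr, size_t oldsize, size_t newsize)
    {
            void *expand_addr;
            size_t expand_size;

            if (newsize <= oldsize)
                    return (true);
            expand_addr = (void *)((uintptr_t)ptr + oldsize);
            expand_size = newsize - oldsize;
            /* Succeed only if the range starting at the old end is free. */
            return (alloc_at(expand_addr, expand_size) != NULL);
    }
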
On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include #include int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137 --- doc/jemalloc.xml.in | 7 ++- include/jemalloc/internal/arena.h | 4 +- include/jemalloc/internal/chunk.h | 8 +-- include/jemalloc/internal/huge.h | 2 +- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- include/jemalloc/jemalloc_typedefs.h.in | 2 +- src/arena.c | 8 +-- src/chunk.c | 47 ++++++++------- src/huge.c | 74 ++++++++++++++++++++++-- test/integration/chunk.c | 5 +- 10 files changed, 118 insertions(+), 41 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index fcbb472..f9d464c 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1351,6 +1351,7 @@ malloc_conf = "xmalloc:true";]]> function that knows how to deallocate the chunks. typedef void *(chunk_alloc_t) + void *chunk size_t size size_t alignment bool *zero @@ -1367,8 +1368,10 @@ malloc_conf = "xmalloc:true";]]> size parameter is always a multiple of the chunk size. The alignment parameter is always a power of two at least as large as the chunk size. Zeroing is mandatory if - *zero is true upon function - entry. + *zero is true upon function entry. If + chunk is not NULL, the + returned pointer must be chunk or + NULL if it could not be allocated. 
Note that replacing the default chunk allocation function makes the arena's chunk_dalloc; malloc_mutex_unlock(&arena->lock); chunk = (arena_chunk_t *)chunk_alloc_arena(chunk_alloc, chunk_dalloc, - arena->ind, size, alignment, zero); + arena->ind, NULL, size, alignment, zero); malloc_mutex_lock(&arena->lock); if (config_stats && chunk != NULL) arena->stats.mapped += chunksize; @@ -459,8 +459,8 @@ arena_chunk_alloc_internal(arena_t *arena, size_t size, size_t alignment, } void * -arena_chunk_alloc_huge(arena_t *arena, size_t size, size_t alignment, - bool *zero) +arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero) { void *ret; chunk_alloc_t *chunk_alloc; @@ -480,7 +480,7 @@ arena_chunk_alloc_huge(arena_t *arena, size_t size, size_t alignment, malloc_mutex_unlock(&arena->lock); ret = chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, - size, alignment, zero); + new_addr, size, alignment, zero); if (config_stats) { if (ret != NULL) stats_cactive_add(size); diff --git a/src/chunk.c b/src/chunk.c index cde8606..32b8b3a 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -42,8 +42,8 @@ static void chunk_dalloc_core(void *chunk, size_t size); /******************************************************************************/ static void * -chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, - size_t alignment, bool base, bool *zero) +chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, + void *new_addr, size_t size, size_t alignment, bool base, bool *zero) { void *ret; extent_node_t *node; @@ -65,11 +65,11 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, /* Beware size_t wrap-around. */ if (alloc_size < size) return (NULL); - key.addr = NULL; + key.addr = new_addr; key.size = alloc_size; malloc_mutex_lock(&chunks_mtx); node = extent_tree_szad_nsearch(chunks_szad, &key); - if (node == NULL) { + if (node == NULL || (new_addr && node->addr != new_addr)) { malloc_mutex_unlock(&chunks_mtx); return (NULL); } @@ -142,8 +142,8 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, * them if they are returned. */ static void * -chunk_alloc_core(size_t size, size_t alignment, bool base, bool *zero, - dss_prec_t dss_prec) +chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, + bool *zero, dss_prec_t dss_prec) { void *ret; @@ -154,24 +154,30 @@ chunk_alloc_core(size_t size, size_t alignment, bool base, bool *zero, /* "primary" dss. */ if (have_dss && dss_prec == dss_prec_primary) { - if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size, - alignment, base, zero)) != NULL) + if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, + new_addr, size, alignment, base, zero)) != NULL) return (ret); - if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL) + /* requesting an address only implemented for recycle */ + if (new_addr == NULL + && (ret = chunk_alloc_dss(size, alignment, zero)) != NULL) return (ret); } /* mmap. */ - if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, size, - alignment, base, zero)) != NULL) + if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, new_addr, + size, alignment, base, zero)) != NULL) return (ret); - if ((ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) + /* requesting an address only implemented for recycle */ + if (new_addr == NULL && + (ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) return (ret); /* "secondary" dss. 
*/ if (have_dss && dss_prec == dss_prec_secondary) { - if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size, - alignment, base, zero)) != NULL) + if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, + new_addr, size, alignment, base, zero)) != NULL) return (ret); - if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL) + /* requesting an address only implemented for recycle */ + if (new_addr == NULL && + (ret = chunk_alloc_dss(size, alignment, zero)) != NULL) return (ret); } @@ -219,7 +225,7 @@ chunk_alloc_base(size_t size) bool zero; zero = false; - ret = chunk_alloc_core(size, chunksize, true, &zero, + ret = chunk_alloc_core(NULL, size, chunksize, true, &zero, chunk_dss_prec_get()); if (ret == NULL) return (NULL); @@ -232,11 +238,12 @@ chunk_alloc_base(size_t size) void * chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc, - unsigned arena_ind, size_t size, size_t alignment, bool *zero) + unsigned arena_ind, void *new_addr, size_t size, size_t alignment, + bool *zero) { void *ret; - ret = chunk_alloc(size, alignment, zero, arena_ind); + ret = chunk_alloc(new_addr, size, alignment, zero, arena_ind); if (ret != NULL && chunk_register(ret, size, false)) { chunk_dalloc(ret, size, arena_ind); ret = NULL; @@ -247,11 +254,11 @@ chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc, /* Default arena chunk allocation routine in the absence of user override. */ void * -chunk_alloc_default(size_t size, size_t alignment, bool *zero, +chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind) { - return (chunk_alloc_core(size, alignment, false, zero, + return (chunk_alloc_core(new_addr, size, alignment, false, zero, arenas[arena_ind]->dss_prec)); } diff --git a/src/huge.c b/src/huge.c index 2f059b4..6bdc076 100644 --- a/src/huge.c +++ b/src/huge.c @@ -47,7 +47,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, */ is_zeroed = zero; arena = choose_arena(tsd, arena); - ret = arena_chunk_alloc_huge(arena, csize, alignment, &is_zeroed); + ret = arena_chunk_alloc_huge(arena, NULL, csize, alignment, &is_zeroed); if (ret == NULL) { base_node_dalloc(node); return (NULL); @@ -95,8 +95,66 @@ huge_dalloc_junk(void *ptr, size_t usize) huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); #endif +static bool +huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { + size_t csize; + void *expand_addr; + size_t expand_size; + extent_node_t *node, key; + arena_t *arena; + bool is_zeroed; + void *ret; + + csize = CHUNK_CEILING(size); + if (csize == 0) { + /* size is large enough to cause size_t wrap-around. */ + return (true); + } + + expand_addr = ptr + oldsize; + expand_size = csize - oldsize; + + malloc_mutex_lock(&huge_mtx); + + key.addr = ptr; + node = extent_tree_ad_search(&huge, &key); + assert(node != NULL); + assert(node->addr == ptr); + + /* Find the current arena. */ + arena = node->arena; + + malloc_mutex_unlock(&huge_mtx); + + /* + * Copy zero into is_zeroed and pass the copy to chunk_alloc(), so that + * it is possible to make correct junk/zero fill decisions below. + */ + is_zeroed = zero; + ret = arena_chunk_alloc_huge(arena, expand_addr, expand_size, chunksize, + &is_zeroed); + if (ret == NULL) + return (true); + + assert(ret == expand_addr); + + malloc_mutex_lock(&huge_mtx); + /* Update the size of the huge allocation. 
*/ + node->size = csize; + malloc_mutex_unlock(&huge_mtx); + + if (config_fill && !zero) { + if (unlikely(opt_junk)) + memset(expand_addr, 0xa5, expand_size); + else if (unlikely(opt_zero) && !is_zeroed) + memset(expand_addr, 0, expand_size); + } + return (false); +} + bool -huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) +huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, + bool zero) { /* Both allocations must be huge to avoid a move. */ @@ -145,7 +203,15 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) return (false); } - return (true); + /* Attempt to expand the allocation in-place. */ + if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra, zero)) { + if (extra == 0) + return (true); + + /* Try again, this time without extra. */ + return (huge_ralloc_no_move_expand(ptr, oldsize, size, zero)); + } + return (false); } void * @@ -156,7 +222,7 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t copysize; /* Try to avoid moving the allocation. */ - if (!huge_ralloc_no_move(ptr, oldsize, size, extra)) + if (!huge_ralloc_no_move(ptr, oldsize, size, extra, zero)) return (ptr); /* diff --git a/test/integration/chunk.c b/test/integration/chunk.c index 2853709..8993850 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -11,10 +11,11 @@ chunk_dalloc(void *chunk, size_t size, unsigned arena_ind) } void * -chunk_alloc(size_t size, size_t alignment, bool *zero, unsigned arena_ind) +chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero, + unsigned arena_ind) { - return (old_alloc(size, alignment, zero, arena_ind)); + return (old_alloc(new_addr, size, alignment, zero, arena_ind)); } TEST_BEGIN(test_chunk) -- cgit v0.12 From 3c3b3b1a94705c8019b973fb679dd99bd19305af Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 5 Oct 2014 14:48:44 -0700 Subject: Fix a docbook element nesting nit. According to the docbook documentation for , its parent must be ; fix accordingly. Nonetheless, the man page processor fails badly when this construct is embedded in a (which is documented to be legal), although the html processor does fine. --- doc/jemalloc.xml.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index f9d464c..1f692f7 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1349,14 +1349,14 @@ malloc_conf = "xmalloc:true";]]> also be set via arena.<i>.chunk.dalloc to a companion function that knows how to deallocate the chunks. - + typedef void *(chunk_alloc_t) void *chunk size_t size size_t alignment bool *zero unsigned arena_ind - + A chunk allocation function conforms to the chunk_alloc_t type and upon success returns a pointer to size bytes of memory on behalf of arena arena_ind such @@ -1397,12 +1397,12 @@ malloc_conf = "xmalloc:true";]]> arena creation), but the automatically created arenas may have already created chunks prior to the application having an opportunity to take over chunk allocation. - + typedef void (chunk_dalloc_t) void *chunk size_t size unsigned arena_ind - + A chunk deallocation function conforms to the chunk_dalloc_t type and deallocates a chunk of given size on -- cgit v0.12 From 155bfa7da18cab0d21d87aa2dce4554166836f5d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 5 Oct 2014 17:54:10 -0700 Subject: Normalize size classes. 
Normalize size classes to use the same number of size classes per size doubling (currently hard coded to 4), across the intire range of size classes. Small size classes already used this spacing, but in order to support this change, additional small size classes now fill [4 KiB .. 16 KiB). Large size classes range from [16 KiB .. 4 MiB). Huge size classes now support non-multiples of the chunk size in order to fill (4 MiB .. 16 MiB). --- include/jemalloc/internal/arena.h | 231 +++--------------- include/jemalloc/internal/chunk.h | 3 - include/jemalloc/internal/huge.h | 2 +- include/jemalloc/internal/jemalloc_internal.h.in | 297 +++++++++++++++++------ include/jemalloc/internal/private_symbols.txt | 22 +- include/jemalloc/internal/size_classes.sh | 15 +- include/jemalloc/internal/stats.h | 7 +- include/jemalloc/internal/tcache.h | 52 ++-- src/arena.c | 223 ++++++++--------- src/chunk.c | 3 - src/ctl.c | 2 +- src/huge.c | 113 ++++++--- src/jemalloc.c | 34 ++- src/tcache.c | 8 +- test/unit/junk.c | 17 +- test/unit/mallctl.c | 2 +- 16 files changed, 557 insertions(+), 474 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 1f98572..681b580 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -1,6 +1,8 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +#define LARGE_MINCLASS (ZU(1) << LG_LARGE_MINCLASS) + /* Maximum number of regions in one run. */ #define LG_RUN_MAXREGS (LG_PAGE - LG_TINY_MIN) #define RUN_MAXREGS (1U << LG_RUN_MAXREGS) @@ -96,11 +98,15 @@ struct arena_chunk_map_bits_s { * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx * -------- -------- ----++++ ++++D-LA * - * Large (sampled, size <= PAGE): + * Large (sampled, size <= LARGE_MINCLASS): * ssssssss ssssssss ssssnnnn nnnnD-LA + * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx + * -------- -------- ----++++ ++++D-LA * - * Large (not sampled, size == PAGE): + * Large (not sampled, size == LARGE_MINCLASS): * ssssssss ssssssss ssss++++ ++++D-LA + * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx + * -------- -------- ----++++ ++++D-LA */ size_t bits; #define CHUNK_MAP_BININD_SHIFT 4 @@ -325,30 +331,21 @@ struct arena_s { #ifdef JEMALLOC_H_EXTERNS extern ssize_t opt_lg_dirty_mult; -/* - * small_size2bin_tab is a compact lookup table that rounds request sizes up to - * size classes. In order to reduce cache footprint, the table is compressed, - * and all accesses are via small_size2bin(). - */ -extern uint8_t const small_size2bin_tab[]; -/* - * small_bin2size_tab duplicates information in arena_bin_info, but in a const - * array, for which it is easier for the compiler to optimize repeated - * dereferences. - */ -extern uint32_t const small_bin2size_tab[NBINS]; extern arena_bin_info_t arena_bin_info[NBINS]; -/* Number of large size classes. */ -#define nlclasses (chunk_npages - map_bias) +extern size_t map_bias; /* Number of arena chunk header pages. */ +extern size_t map_misc_offset; +extern size_t arena_maxrun; /* Max run size for arenas. */ +extern size_t arena_maxclass; /* Max size class for arenas. */ +extern size_t nlclasses; /* Number of large size classes. 
*/ void *arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t size, size_t alignment, bool *zero); void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t size); void arena_purge_all(arena_t *arena); void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, - size_t binind, uint64_t prof_accumbytes); + index_t binind, uint64_t prof_accumbytes); void arena_alloc_junk_small(void *ptr, arena_bin_info_t *bin_info, bool zero); #ifdef JEMALLOC_JET @@ -403,15 +400,6 @@ void arena_postfork_child(arena_t *arena); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -size_t small_size2bin_compute(size_t size); -size_t small_size2bin_lookup(size_t size); -size_t small_size2bin(size_t size); -size_t small_bin2size_compute(size_t binind); -size_t small_bin2size_lookup(size_t binind); -size_t small_bin2size(size_t binind); -size_t small_s2u_compute(size_t size); -size_t small_s2u_lookup(size_t size); -size_t small_s2u(size_t size); arena_chunk_map_bits_t *arena_bitselm_get(arena_chunk_t *chunk, size_t pageind); arena_chunk_map_misc_t *arena_miscelm_get(arena_chunk_t *chunk, @@ -426,7 +414,7 @@ size_t arena_mapbits_unallocated_size_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_large_size_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_small_runind_get(arena_chunk_t *chunk, size_t pageind); -size_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind); +index_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_dirty_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_unzeroed_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_large_get(arena_chunk_t *chunk, size_t pageind); @@ -439,16 +427,16 @@ void arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind, void arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags); void arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind, - size_t binind); + index_t binind); void arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, - size_t runind, size_t binind, size_t flags); + size_t runind, index_t binind, size_t flags); void arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind, size_t unzeroed); bool arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes); bool arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes); bool arena_prof_accum(arena_t *arena, uint64_t accumbytes); -size_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits); -size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); +index_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits); +index_t arena_bin_index(arena_t *arena, arena_bin_t *bin); unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr); prof_tctx_t *arena_prof_tctx_get(const void *ptr); @@ -464,148 +452,6 @@ void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) # ifdef JEMALLOC_ARENA_INLINE_A -JEMALLOC_INLINE size_t -small_size2bin_compute(size_t size) -{ -#if (NTBINS != 0) - if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { - size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; - size_t lg_ceil = lg_floor(pow2_ceil(size)); - return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin); - } else -#endif - { - size_t x = lg_floor((size<<1)-1); - size_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 
0 : - x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM); - size_t grp = shift << LG_SIZE_CLASS_GROUP; - - size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) - ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; - - size_t delta_inverse_mask = ZI(-1) << lg_delta; - size_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & - ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); - - size_t bin = NTBINS + grp + mod; - return (bin); - } -} - -JEMALLOC_ALWAYS_INLINE size_t -small_size2bin_lookup(size_t size) -{ - - assert(size <= LOOKUP_MAXCLASS); - { - size_t ret = ((size_t)(small_size2bin_tab[(size-1) >> - LG_TINY_MIN])); - assert(ret == small_size2bin_compute(size)); - return (ret); - } -} - -JEMALLOC_ALWAYS_INLINE size_t -small_size2bin(size_t size) -{ - - assert(size > 0); - if (likely(size <= LOOKUP_MAXCLASS)) - return (small_size2bin_lookup(size)); - else - return (small_size2bin_compute(size)); -} - -JEMALLOC_INLINE size_t -small_bin2size_compute(size_t binind) -{ -#if (NTBINS > 0) - if (binind < NTBINS) - return (ZU(1) << (LG_TINY_MAXCLASS - NTBINS + 1 + binind)); - else -#endif - { - size_t reduced_binind = binind - NTBINS; - size_t grp = reduced_binind >> LG_SIZE_CLASS_GROUP; - size_t mod = reduced_binind & ((ZU(1) << LG_SIZE_CLASS_GROUP) - - 1); - - size_t grp_size_mask = ~((!!grp)-1); - size_t grp_size = ((ZU(1) << (LG_QUANTUM + - (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask; - - size_t shift = (grp == 0) ? 1 : grp; - size_t lg_delta = shift + (LG_QUANTUM-1); - size_t mod_size = (mod+1) << lg_delta; - - size_t usize = grp_size + mod_size; - return (usize); - } -} - -JEMALLOC_ALWAYS_INLINE size_t -small_bin2size_lookup(size_t binind) -{ - - assert(binind < NBINS); - { - size_t ret = (size_t)small_bin2size_tab[binind]; - assert(ret == small_bin2size_compute(binind)); - return (ret); - } -} - -JEMALLOC_ALWAYS_INLINE size_t -small_bin2size(size_t binind) -{ - - return (small_bin2size_lookup(binind)); -} - -JEMALLOC_ALWAYS_INLINE size_t -small_s2u_compute(size_t size) -{ -#if (NTBINS > 0) - if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { - size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; - size_t lg_ceil = lg_floor(pow2_ceil(size)); - return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : - (ZU(1) << lg_ceil)); - } else -#endif - { - size_t x = lg_floor((size<<1)-1); - size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) - ? 
LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; - size_t delta = ZU(1) << lg_delta; - size_t delta_mask = delta - 1; - size_t usize = (size + delta_mask) & ~delta_mask; - return (usize); - } -} - -JEMALLOC_ALWAYS_INLINE size_t -small_s2u_lookup(size_t size) -{ - size_t ret = small_bin2size(small_size2bin(size)); - - assert(ret == small_s2u_compute(size)); - return (ret); -} - -JEMALLOC_ALWAYS_INLINE size_t -small_s2u(size_t size) -{ - - assert(size > 0); - if (likely(size <= LOOKUP_MAXCLASS)) - return (small_s2u_lookup(size)); - else - return (small_s2u_compute(size)); -} -# endif /* JEMALLOC_ARENA_INLINE_A */ - -# ifdef JEMALLOC_ARENA_INLINE_B JEMALLOC_ALWAYS_INLINE arena_chunk_map_bits_t * arena_bitselm_get(arena_chunk_t *chunk, size_t pageind) { @@ -714,11 +560,11 @@ arena_mapbits_small_runind_get(arena_chunk_t *chunk, size_t pageind) return (mapbits >> LG_PAGE); } -JEMALLOC_ALWAYS_INLINE size_t +JEMALLOC_ALWAYS_INLINE index_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind) { size_t mapbits; - size_t binind; + index_t binind; mapbits = arena_mapbits_get(chunk, pageind); binind = (mapbits & CHUNK_MAP_BININD_MASK) >> CHUNK_MAP_BININD_SHIFT; @@ -810,20 +656,20 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, JEMALLOC_ALWAYS_INLINE void arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind, - size_t binind) + index_t binind) { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); size_t mapbits = arena_mapbitsp_read(mapbitsp); assert(binind <= BININD_INVALID); - assert(arena_mapbits_large_size_get(chunk, pageind) == PAGE); + assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS); arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_BININD_MASK) | (binind << CHUNK_MAP_BININD_SHIFT)); } JEMALLOC_ALWAYS_INLINE void arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind, - size_t binind, size_t flags) + index_t binind, size_t flags) { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); size_t mapbits = arena_mapbitsp_read(mapbitsp); @@ -893,10 +739,10 @@ arena_prof_accum(arena_t *arena, uint64_t accumbytes) } } -JEMALLOC_ALWAYS_INLINE size_t +JEMALLOC_ALWAYS_INLINE index_t arena_ptr_small_binind_get(const void *ptr, size_t mapbits) { - size_t binind; + index_t binind; binind = (mapbits & CHUNK_MAP_BININD_MASK) >> CHUNK_MAP_BININD_SHIFT; @@ -908,7 +754,7 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) size_t rpages_ind; arena_run_t *run; arena_bin_t *bin; - size_t actual_binind; + index_t actual_binind; arena_bin_info_t *bin_info; arena_chunk_map_misc_t *miscelm; void *rpages; @@ -938,13 +784,13 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) return (binind); } -# endif /* JEMALLOC_ARENA_INLINE_B */ +# endif /* JEMALLOC_ARENA_INLINE_A */ -# ifdef JEMALLOC_ARENA_INLINE_C -JEMALLOC_INLINE size_t +# ifdef JEMALLOC_ARENA_INLINE_B +JEMALLOC_INLINE index_t arena_bin_index(arena_t *arena, arena_bin_t *bin) { - size_t binind = bin - arena->bins; + index_t binind = bin - arena->bins; assert(binind < NBINS); return (binind); } @@ -1102,7 +948,8 @@ arena_salloc(const void *ptr, bool demote) { size_t ret; arena_chunk_t *chunk; - size_t pageind, binind; + size_t pageind; + index_t binind; assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); @@ -1122,10 +969,6 @@ arena_salloc(const void *ptr, bool demote) ret = arena_mapbits_large_size_get(chunk, pageind); assert(ret != 0); assert(pageind + (ret>>LG_PAGE) <= chunk_npages); - assert(ret == PAGE || arena_mapbits_large_size_get(chunk, - 
pageind+(ret>>LG_PAGE)-1) == 0); - assert(binind == arena_mapbits_binind_get(chunk, - pageind+(ret>>LG_PAGE)-1)); assert(arena_mapbits_dirty_get(chunk, pageind) == arena_mapbits_dirty_get(chunk, pageind+(ret>>LG_PAGE)-1)); } else { @@ -1133,7 +976,7 @@ arena_salloc(const void *ptr, bool demote) assert(arena_mapbits_large_get(chunk, pageind) != 0 || arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, pageind)) == binind); - ret = small_bin2size(binind); + ret = index2size(binind); } return (ret); @@ -1155,7 +998,7 @@ arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache) /* Small allocation. */ if (likely(try_tcache) && likely((tcache = tcache_get(tsd, false)) != NULL)) { - size_t binind = arena_ptr_small_binind_get(ptr, + index_t binind = arena_ptr_small_binind_get(ptr, mapbits); tcache_dalloc_small(tcache, ptr, binind); } else @@ -1186,7 +1029,7 @@ arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, /* Small allocation. */ if (likely(try_tcache) && likely((tcache = tcache_get(tsd, false)) != NULL)) { - size_t binind = small_size2bin(size); + index_t binind = size2index(size); tcache_dalloc_small(tcache, ptr, binind); } else { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> @@ -1203,7 +1046,7 @@ arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, arena_dalloc_large(chunk->arena, chunk, ptr); } } -# endif /* JEMALLOC_ARENA_INLINE_C */ +# endif /* JEMALLOC_ARENA_INLINE_B */ #endif #endif /* JEMALLOC_H_INLINES */ diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 2e68a02..764b7ac 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -40,9 +40,6 @@ extern rtree_t *chunks_rtree; extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ extern size_t chunk_npages; -extern size_t map_bias; /* Number of arena chunk header pages. */ -extern size_t map_misc_offset; -extern size_t arena_maxclass; /* Max size class for arenas. */ void *chunk_alloc_base(size_t size); void *chunk_alloc_arena(chunk_alloc_t *chunk_alloc, diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 00d8c09..939993f 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -10,7 +10,7 @@ #ifdef JEMALLOC_H_EXTERNS void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero); -void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, +void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, bool zero); bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index a169221..8f0beb9 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -165,6 +165,9 @@ static const bool config_ivsalloc = #include "jemalloc/internal/jemalloc_internal_macros.h" +/* Size class index type. */ +typedef unsigned index_t; + #define MALLOCX_ARENA_MASK ((int)~0xff) #define MALLOCX_LG_ALIGN_MASK ((int)0x3f) /* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */ @@ -397,6 +400,18 @@ extern arena_t **arenas; extern unsigned narenas_total; extern unsigned narenas_auto; /* Read-only after initialization. */ +/* + * index2size_tab encodes the same information as could be computed (at + * unacceptable cost in some code paths) by index2size_compute(). 
+ */ +extern size_t const index2size_tab[NSIZES]; +/* + * size2index_tab is a compact lookup table that rounds request sizes up to + * size classes. In order to reduce cache footprint, the table is compressed, + * and all accesses are via size2index(). + */ +extern uint8_t const size2index_tab[]; + arena_t *arenas_extend(unsigned ind); arena_t *choose_arena_hard(tsd_t *tsd); void thread_allocated_cleanup(tsd_t *tsd); @@ -449,15 +464,15 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" -/* - * Include arena.h the first time in order to provide inline functions for this - * header's inlines. - */ -#define JEMALLOC_ARENA_INLINE_A -#include "jemalloc/internal/arena.h" -#undef JEMALLOC_ARENA_INLINE_A - #ifndef JEMALLOC_ENABLE_INLINE +index_t size2index_compute(size_t size); +index_t size2index_lookup(size_t size); +index_t size2index(size_t size); +size_t index2size_compute(index_t index); +size_t index2size_lookup(index_t index); +size_t index2size(index_t index); +size_t s2u_compute(size_t size); +size_t s2u_lookup(size_t size); size_t s2u(size_t size); size_t sa2u(size_t size, size_t alignment); unsigned narenas_total_get(void); @@ -465,6 +480,135 @@ arena_t *choose_arena(tsd_t *tsd, arena_t *arena); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +JEMALLOC_INLINE index_t +size2index_compute(size_t size) +{ + +#if (NTBINS != 0) + if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { + size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; + size_t lg_ceil = lg_floor(pow2_ceil(size)); + return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin); + } else +#endif + { + size_t x = lg_floor((size<<1)-1); + size_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 0 : + x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM); + size_t grp = shift << LG_SIZE_CLASS_GROUP; + + size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; + + size_t delta_inverse_mask = ZI(-1) << lg_delta; + size_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & + ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); + + size_t index = NTBINS + grp + mod; + return (index); + } +} + +JEMALLOC_ALWAYS_INLINE index_t +size2index_lookup(size_t size) +{ + + assert(size <= LOOKUP_MAXCLASS); + { + size_t ret = ((size_t)(size2index_tab[(size-1) >> + LG_TINY_MIN])); + assert(ret == size2index_compute(size)); + return (ret); + } +} + +JEMALLOC_ALWAYS_INLINE index_t +size2index(size_t size) +{ + + assert(size > 0); + if (likely(size <= LOOKUP_MAXCLASS)) + return (size2index_lookup(size)); + else + return (size2index_compute(size)); +} + +JEMALLOC_INLINE size_t +index2size_compute(index_t index) +{ + +#if (NTBINS > 0) + if (index < NTBINS) + return (ZU(1) << (LG_TINY_MAXCLASS - NTBINS + 1 + index)); + else +#endif + { + size_t reduced_index = index - NTBINS; + size_t grp = reduced_index >> LG_SIZE_CLASS_GROUP; + size_t mod = reduced_index & ((ZU(1) << LG_SIZE_CLASS_GROUP) - + 1); + + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_QUANTUM + + (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask; + + size_t shift = (grp == 0) ? 
1 : grp; + size_t lg_delta = shift + (LG_QUANTUM-1); + size_t mod_size = (mod+1) << lg_delta; + + size_t usize = grp_size + mod_size; + return (usize); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +index2size_lookup(index_t index) +{ + size_t ret = (size_t)index2size_tab[index]; + assert(ret == index2size_compute(index)); + return (ret); +} + +JEMALLOC_ALWAYS_INLINE size_t +index2size(index_t index) +{ + + assert(index < NSIZES); + return (index2size_lookup(index)); +} + +JEMALLOC_ALWAYS_INLINE size_t +s2u_compute(size_t size) +{ + +#if (NTBINS > 0) + if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { + size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; + size_t lg_ceil = lg_floor(pow2_ceil(size)); + return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : + (ZU(1) << lg_ceil)); + } else +#endif + { + size_t x = lg_floor((size<<1)-1); + size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (size + delta_mask) & ~delta_mask; + return (usize); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +s2u_lookup(size_t size) +{ + size_t ret = index2size_lookup(size2index_lookup(size)); + + assert(ret == s2u_compute(size)); + return (ret); +} + /* * Compute usable size that would result from allocating an object with the * specified size. @@ -473,11 +617,11 @@ JEMALLOC_ALWAYS_INLINE size_t s2u(size_t size) { - if (size <= SMALL_MAXCLASS) - return (small_s2u(size)); - if (size <= arena_maxclass) - return (PAGE_CEILING(size)); - return (CHUNK_CEILING(size)); + assert(size > 0); + if (likely(size <= LOOKUP_MAXCLASS)) + return (s2u_lookup(size)); + else + return (s2u_compute(size)); } /* @@ -491,71 +635,78 @@ sa2u(size_t size, size_t alignment) assert(alignment != 0 && ((alignment - 1) & alignment) == 0); - /* - * Round size up to the nearest multiple of alignment. - * - * This done, we can take advantage of the fact that for each small - * size class, every object is aligned at the smallest power of two - * that is non-zero in the base two representation of the size. For - * example: - * - * Size | Base 2 | Minimum alignment - * -----+----------+------------------ - * 96 | 1100000 | 32 - * 144 | 10100000 | 32 - * 192 | 11000000 | 64 - */ - usize = ALIGNMENT_CEILING(size, alignment); - /* - * (usize < size) protects against the combination of maximal - * alignment and size greater than maximal alignment. - */ - if (usize < size) { - /* size_t overflow. */ - return (0); + /* Try for a small size class. */ + if (size <= SMALL_MAXCLASS && alignment < PAGE) { + /* + * Round size up to the nearest multiple of alignment. + * + * This done, we can take advantage of the fact that for each + * small size class, every object is aligned at the smallest + * power of two that is non-zero in the base two representation + * of the size. For example: + * + * Size | Base 2 | Minimum alignment + * -----+----------+------------------ + * 96 | 1100000 | 32 + * 144 | 10100000 | 32 + * 192 | 11000000 | 64 + */ + usize = s2u(ALIGNMENT_CEILING(size, alignment)); + if (usize < LARGE_MINCLASS) + return (usize); } - if (usize <= arena_maxclass && alignment <= PAGE) { - if (usize <= SMALL_MAXCLASS) - return (small_s2u(usize)); - return (PAGE_CEILING(usize)); - } else { - size_t run_size; - + /* Try for a large size class. */ + if (size <= arena_maxclass && alignment < chunksize) { /* * We can't achieve subpage alignment, so round up alignment - * permanently; it makes later calculations simpler. 
+ * to the minimum that can actually be supported. */ alignment = PAGE_CEILING(alignment); - usize = PAGE_CEILING(size); + + /* Make sure result is a large size class. */ + usize = (size <= LARGE_MINCLASS) ? LARGE_MINCLASS : s2u(size); + /* - * (usize < size) protects against very large sizes within - * PAGE of SIZE_T_MAX. - * - * (usize + alignment < usize) protects against the - * combination of maximal alignment and usize large enough - * to cause overflow. This is similar to the first overflow - * check above, but it needs to be repeated due to the new - * usize value, which may now be *equal* to maximal - * alignment, whereas before we only detected overflow if the - * original size was *greater* than maximal alignment. + * Calculate the size of the over-size run that arena_palloc() + * would need to allocate in order to guarantee the alignment. */ - if (usize < size || usize + alignment < usize) { + if (usize + alignment - PAGE <= arena_maxrun) + return (usize); + } + + /* Huge size class. Beware of size_t overflow. */ + + /* + * We can't achieve subchunk alignment, so round up alignment to the + * minimum that can actually be supported. + */ + alignment = CHUNK_CEILING(alignment); + if (alignment == 0) { + /* size_t overflow. */ + return (0); + } + + /* Make sure result is a huge size class. */ + if (size <= chunksize) + usize = chunksize; + else { + usize = s2u(size); + if (usize < size) { /* size_t overflow. */ return (0); } + } - /* - * Calculate the size of the over-size run that arena_palloc() - * would need to allocate in order to guarantee the alignment. - * If the run wouldn't fit within a chunk, round up to a huge - * allocation size. - */ - run_size = usize + alignment - PAGE; - if (run_size <= arena_maxclass) - return (PAGE_CEILING(usize)); - return (CHUNK_CEILING(usize)); + /* + * Calculate the multi-chunk mapping that huge_palloc() would need in + * order to guarantee the alignment. + */ + if (usize + alignment - PAGE < usize) { + /* size_t overflow. */ + return (0); } + return (usize); } JEMALLOC_INLINE unsigned @@ -591,16 +742,16 @@ choose_arena(tsd_t *tsd, arena_t *arena) #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/rtree.h" /* - * Include arena.h the second and third times in order to resolve circular - * dependencies with tcache.h. + * Include portions of arena.h interleaved with tcache.h in order to resolve + * circular dependencies. 
*/ -#define JEMALLOC_ARENA_INLINE_B +#define JEMALLOC_ARENA_INLINE_A #include "jemalloc/internal/arena.h" -#undef JEMALLOC_ARENA_INLINE_B +#undef JEMALLOC_ARENA_INLINE_A #include "jemalloc/internal/tcache.h" -#define JEMALLOC_ARENA_INLINE_C +#define JEMALLOC_ARENA_INLINE_B #include "jemalloc/internal/arena.h" -#undef JEMALLOC_ARENA_INLINE_C +#undef JEMALLOC_ARENA_INLINE_B #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" @@ -678,7 +829,7 @@ ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, assert(usize != 0); assert(usize == sa2u(usize, alignment)); - if (usize <= arena_maxclass && alignment <= PAGE) + if (usize <= SMALL_MAXCLASS && alignment < PAGE) ret = arena_malloc(tsd, arena, usize, zero, try_tcache); else { if (usize <= arena_maxclass) { @@ -742,7 +893,7 @@ u2rz(size_t usize) size_t ret; if (usize <= SMALL_MAXCLASS) { - size_t binind = small_size2bin(usize); + index_t binind = size2index(usize); ret = arena_bin_info[binind].redzone_size; } else ret = 0; diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 4ea9a95..1a7fde4 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -41,6 +41,7 @@ arena_mapbitsp_get arena_mapbitsp_read arena_mapbitsp_write arena_maxclass +arena_maxrun arena_miscelm_get arena_miscelm_to_pageind arena_miscelm_to_rpages @@ -216,6 +217,10 @@ idalloct imalloc imalloct in_valgrind +index2size +index2size_compute +index2size_lookup +index2size_tab ipalloc ipalloct iqalloc @@ -338,19 +343,14 @@ rtree_postfork_parent rtree_prefork rtree_set s2u +s2u_compute +s2u_lookup sa2u set_errno -small_bin2size -small_bin2size_compute -small_bin2size_lookup -small_bin2size_tab -small_s2u -small_s2u_compute -small_s2u_lookup -small_size2bin -small_size2bin_compute -small_size2bin_lookup -small_size2bin_tab +size2index +size2index_compute +size2index_lookup +size2index_tab stats_cactive stats_cactive_add stats_cactive_get diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 0cfac72..897570c 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -61,7 +61,7 @@ size_class() { rem="yes" fi - if [ ${lg_size} -lt ${lg_p} ] ; then + if [ ${lg_size} -lt $((${lg_p} + ${lg_g})) ] ; then bin="yes" else bin="no" @@ -159,6 +159,7 @@ size_classes() { nbins=$((${index} + 1)) # Final written value is correct: small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" + lg_large_minclass=$((${lg_grp} + 1)) fi index=$((${index} + 1)) ndelta=$((${ndelta} + 1)) @@ -167,14 +168,17 @@ size_classes() { lg_delta=$((${lg_delta} + 1)) done echo + nsizes=${index} # Defined upon completion: # - ntbins # - nlbins # - nbins + # - nsizes # - lg_tiny_maxclass # - lookup_maxclass # - small_maxclass + # - lg_large_minclass } cat <tbins[binind]; - size = small_bin2size(binind); + usize = index2size(binind); ret = tcache_alloc_easy(tbin); if (unlikely(ret == NULL)) { ret = tcache_alloc_small_hard(tcache, tbin, binind); if (ret == NULL) return (NULL); } - assert(tcache_salloc(ret) == size); + assert(tcache_salloc(ret) == usize); if (likely(!zero)) { if (config_fill) { @@ -254,20 +255,20 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) arena_alloc_junk_small(ret, &arena_bin_info[binind], false); } else if (unlikely(opt_zero)) - memset(ret, 0, size); + memset(ret, 0, usize); } } else { if (config_fill 
&& unlikely(opt_junk)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } - memset(ret, 0, size); + memset(ret, 0, usize); } if (config_stats) tbin->tstats.nrequests++; if (config_prof) - tcache->prof_accumbytes += size; + tcache->prof_accumbytes += usize; tcache_event(tcache); return (ret); } @@ -276,12 +277,13 @@ JEMALLOC_ALWAYS_INLINE void * tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) { void *ret; - size_t binind; + index_t binind; + size_t usize; tcache_bin_t *tbin; - size = PAGE_CEILING(size); - assert(size <= tcache_maxclass); - binind = NBINS + (size >> LG_PAGE) - 1; + binind = size2index(size); + usize = index2size(binind); + assert(usize <= tcache_maxclass); assert(binind < nhbins); tbin = &tcache->tbins[binind]; ret = tcache_alloc_easy(tbin); @@ -290,11 +292,11 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) * Only allocate one large object at a time, because it's quite * expensive to create one and not use it. */ - ret = arena_malloc_large(tcache->arena, size, zero); + ret = arena_malloc_large(tcache->arena, usize, zero); if (ret == NULL) return (NULL); } else { - if (config_prof && size == PAGE) { + if (config_prof && usize == LARGE_MINCLASS) { arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> @@ -305,17 +307,17 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) if (likely(!zero)) { if (config_fill) { if (unlikely(opt_junk)) - memset(ret, 0xa5, size); + memset(ret, 0xa5, usize); else if (unlikely(opt_zero)) - memset(ret, 0, size); + memset(ret, 0, usize); } } else - memset(ret, 0, size); + memset(ret, 0, usize); if (config_stats) tbin->tstats.nrequests++; if (config_prof) - tcache->prof_accumbytes += size; + tcache->prof_accumbytes += usize; } tcache_event(tcache); @@ -323,7 +325,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) } JEMALLOC_ALWAYS_INLINE void -tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind) +tcache_dalloc_small(tcache_t *tcache, void *ptr, index_t binind) { tcache_bin_t *tbin; tcache_bin_info_t *tbin_info; @@ -349,7 +351,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr, size_t binind) JEMALLOC_ALWAYS_INLINE void tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) { - size_t binind; + index_t binind; tcache_bin_t *tbin; tcache_bin_info_t *tbin_info; @@ -357,7 +359,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) assert(tcache_salloc(ptr) > SMALL_MAXCLASS); assert(tcache_salloc(ptr) <= tcache_maxclass); - binind = NBINS + (size >> LG_PAGE) - 1; + binind = size2index(size); if (config_fill && unlikely(opt_junk)) memset(ptr, 0x5a, size); diff --git a/src/arena.c b/src/arena.c index b7300a9..49a3057 100644 --- a/src/arena.c +++ b/src/arena.c @@ -7,42 +7,11 @@ ssize_t opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT; arena_bin_info_t arena_bin_info[NBINS]; -JEMALLOC_ALIGNED(CACHELINE) -const uint32_t small_bin2size_tab[NBINS] = { -#define B2S_bin_yes(size) \ - size, -#define B2S_bin_no(size) -#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ - B2S_bin_##bin((ZU(1)<> LG_PAGE; size_t mapbits = arena_mapbits_get(chunk, pageind); - size_t binind = arena_ptr_small_binind_get(ptr, mapbits); + index_t binind = arena_ptr_small_binind_get(ptr, mapbits); arena_bin_info_t *bin_info = &arena_bin_info[binind]; unsigned regind = arena_run_regind(run, bin_info, ptr); @@ -375,7 +344,7 @@ arena_run_init_large(arena_t *arena, arena_run_t *run, size_t size, bool zero) static void 
arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, - size_t binind) + index_t binind) { arena_chunk_t *chunk; arena_chunk_map_misc_t *miscelm; @@ -429,9 +398,9 @@ arena_chunk_init_spare(arena_t *arena) assert(arena_mapbits_allocated_get(chunk, map_bias) == 0); assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0); assert(arena_mapbits_unallocated_size_get(chunk, map_bias) == - arena_maxclass); + arena_maxrun); assert(arena_mapbits_unallocated_size_get(chunk, chunk_npages-1) == - arena_maxclass); + arena_maxrun); assert(arena_mapbits_dirty_get(chunk, map_bias) == arena_mapbits_dirty_get(chunk, chunk_npages-1)); @@ -518,8 +487,7 @@ arena_chunk_init_hard(arena_t *arena) * the pages as zeroed iff chunk_alloc() returned a zeroed chunk. */ unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED; - arena_mapbits_unallocated_set(chunk, map_bias, arena_maxclass, - unzeroed); + arena_mapbits_unallocated_set(chunk, map_bias, arena_maxrun, unzeroed); /* * There is no need to initialize the internal page map entries unless * the chunk is not zeroed. @@ -544,7 +512,7 @@ arena_chunk_init_hard(arena_t *arena) } } } - arena_mapbits_unallocated_set(chunk, chunk_npages-1, arena_maxclass, + arena_mapbits_unallocated_set(chunk, chunk_npages-1, arena_maxrun, unzeroed); return (chunk); @@ -607,9 +575,9 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) assert(arena_mapbits_allocated_get(chunk, map_bias) == 0); assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0); assert(arena_mapbits_unallocated_size_get(chunk, map_bias) == - arena_maxclass); + arena_maxrun); assert(arena_mapbits_unallocated_size_get(chunk, chunk_npages-1) == - arena_maxclass); + arena_maxrun); assert(arena_mapbits_dirty_get(chunk, map_bias) == arena_mapbits_dirty_get(chunk, chunk_npages-1)); @@ -682,7 +650,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) } static arena_run_t * -arena_run_alloc_small_helper(arena_t *arena, size_t size, size_t binind) +arena_run_alloc_small_helper(arena_t *arena, size_t size, index_t binind) { arena_run_t *run; arena_chunk_map_misc_t *miscelm; @@ -700,7 +668,7 @@ arena_run_alloc_small_helper(arena_t *arena, size_t size, size_t binind) } static arena_run_t * -arena_run_alloc_small(arena_t *arena, size_t size, size_t binind) +arena_run_alloc_small(arena_t *arena, size_t size, index_t binind) { arena_chunk_t *chunk; arena_run_t *run; @@ -1034,7 +1002,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_mapbits_large_size_get(chunk, run_ind+(size>>LG_PAGE)-1) == 0); } else { - size_t binind = arena_bin_index(arena, run->bin); + index_t binind = arena_bin_index(arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; size = bin_info->run_size; } @@ -1079,9 +1047,9 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_dirty_insert(arena, chunk, run_ind, run_pages); /* Deallocate chunk if it is now completely unused. */ - if (size == arena_maxclass) { + if (size == arena_maxrun) { assert(run_ind == map_bias); - assert(run_pages == (arena_maxclass >> LG_PAGE)); + assert(run_pages == (arena_maxrun >> LG_PAGE)); arena_chunk_dalloc(arena, chunk); } @@ -1212,7 +1180,7 @@ static arena_run_t * arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) { arena_run_t *run; - size_t binind; + index_t binind; arena_bin_info_t *bin_info; /* Look for a usable run. 
*/ @@ -1264,7 +1232,7 @@ static void * arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) { void *ret; - size_t binind; + index_t binind; arena_bin_info_t *bin_info; arena_run_t *run; @@ -1310,7 +1278,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) } void -arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind, +arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, index_t binind, uint64_t prof_accumbytes) { unsigned i, nfill; @@ -1450,14 +1418,14 @@ arena_dalloc_junk_small_t *arena_dalloc_junk_small = void arena_quarantine_junk_small(void *ptr, size_t usize) { - size_t binind; + index_t binind; arena_bin_info_t *bin_info; cassert(config_fill); assert(opt_junk); assert(opt_quarantine); assert(usize <= SMALL_MAXCLASS); - binind = small_size2bin(usize); + binind = size2index(usize); bin_info = &arena_bin_info[binind]; arena_redzones_validate(ptr, bin_info, true); } @@ -1468,12 +1436,12 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) void *ret; arena_bin_t *bin; arena_run_t *run; - size_t binind; + index_t binind; - binind = small_size2bin(size); + binind = size2index(size); assert(binind < NBINS); bin = &arena->bins[binind]; - size = small_bin2size(binind); + size = index2size(binind); malloc_mutex_lock(&bin->lock); if ((run = bin->runcur) != NULL && run->nfree > 0) @@ -1520,14 +1488,15 @@ void * arena_malloc_large(arena_t *arena, size_t size, bool zero) { void *ret; + size_t usize; arena_run_t *run; arena_chunk_map_misc_t *miscelm; UNUSED bool idump; /* Large allocation. */ - size = PAGE_CEILING(size); + usize = s2u(size); malloc_mutex_lock(&arena->lock); - run = arena_run_alloc_large(arena, size, zero); + run = arena_run_alloc_large(arena, usize, zero); if (run == NULL) { malloc_mutex_unlock(&arena->lock); return (NULL); @@ -1535,15 +1504,17 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) miscelm = arena_run_to_miscelm(run); ret = arena_miscelm_to_rpages(miscelm); if (config_stats) { + index_t index = size2index(usize) - NBINS; + arena->stats.nmalloc_large++; arena->stats.nrequests_large++; - arena->stats.allocated_large += size; - arena->stats.lstats[(size >> LG_PAGE) - 1].nmalloc++; - arena->stats.lstats[(size >> LG_PAGE) - 1].nrequests++; - arena->stats.lstats[(size >> LG_PAGE) - 1].curruns++; + arena->stats.allocated_large += usize; + arena->stats.lstats[index].nmalloc++; + arena->stats.lstats[index].nrequests++; + arena->stats.lstats[index].curruns++; } if (config_prof) - idump = arena_prof_accum_locked(arena, size); + idump = arena_prof_accum_locked(arena, usize); malloc_mutex_unlock(&arena->lock); if (config_prof && idump) prof_idump(); @@ -1551,9 +1522,9 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) if (!zero) { if (config_fill) { if (unlikely(opt_junk)) - memset(ret, 0xa5, size); + memset(ret, 0xa5, usize); else if (unlikely(opt_zero)) - memset(ret, 0, size); + memset(ret, 0, usize); } } @@ -1610,12 +1581,14 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) ret = arena_miscelm_to_rpages(miscelm); if (config_stats) { + index_t index = size2index(size) - NBINS; + arena->stats.nmalloc_large++; arena->stats.nrequests_large++; arena->stats.allocated_large += size; - arena->stats.lstats[(size >> LG_PAGE) - 1].nmalloc++; - arena->stats.lstats[(size >> LG_PAGE) - 1].nrequests++; - arena->stats.lstats[(size >> LG_PAGE) - 1].curruns++; + arena->stats.lstats[index].nmalloc++; + arena->stats.lstats[index].nrequests++; + arena->stats.lstats[index].curruns++; } 
malloc_mutex_unlock(&arena->lock); @@ -1632,22 +1605,23 @@ void arena_prof_promoted(const void *ptr, size_t size) { arena_chunk_t *chunk; - size_t pageind, binind; + size_t pageind; + index_t binind; cassert(config_prof); assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); - assert(isalloc(ptr, false) == PAGE); - assert(isalloc(ptr, true) == PAGE); + assert(isalloc(ptr, false) == LARGE_MINCLASS); + assert(isalloc(ptr, true) == LARGE_MINCLASS); assert(size <= SMALL_MAXCLASS); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - binind = small_size2bin(size); + binind = size2index(size); assert(binind < NBINS); arena_mapbits_large_binind_set(chunk, pageind, binind); - assert(isalloc(ptr, false) == PAGE); + assert(isalloc(ptr, false) == LARGE_MINCLASS); assert(isalloc(ptr, true) == size); } @@ -1660,7 +1634,7 @@ arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, if (run == bin->runcur) bin->runcur = NULL; else { - size_t binind = arena_bin_index(chunk->arena, bin); + index_t binind = arena_bin_index(chunk->arena, bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; if (bin_info->nregs != 1) { @@ -1678,7 +1652,7 @@ static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin) { - size_t binind; + index_t binind; arena_bin_info_t *bin_info; size_t npages, run_ind, past; arena_chunk_map_misc_t *miscelm; @@ -1762,7 +1736,8 @@ arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_run_t *run; arena_bin_t *bin; arena_bin_info_t *bin_info; - size_t size, binind; + size_t size; + index_t binind; pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; rpages_ind = pageind - arena_mapbits_small_runind_get(chunk, pageind); @@ -1851,10 +1826,12 @@ arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr) arena_dalloc_junk_large(ptr, usize); if (config_stats) { + index_t index = size2index(usize) - NBINS; + arena->stats.ndalloc_large++; arena->stats.allocated_large -= usize; - arena->stats.lstats[(usize >> LG_PAGE) - 1].ndalloc++; - arena->stats.lstats[(usize >> LG_PAGE) - 1].curruns--; + arena->stats.lstats[index].ndalloc++; + arena->stats.lstats[index].curruns--; } } @@ -1887,17 +1864,20 @@ arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, malloc_mutex_lock(&arena->lock); arena_run_trim_tail(arena, chunk, run, oldsize, size, true); if (config_stats) { + index_t oldindex = size2index(oldsize) - NBINS; + index_t index = size2index(size) - NBINS; + arena->stats.ndalloc_large++; arena->stats.allocated_large -= oldsize; - arena->stats.lstats[(oldsize >> LG_PAGE) - 1].ndalloc++; - arena->stats.lstats[(oldsize >> LG_PAGE) - 1].curruns--; + arena->stats.lstats[oldindex].ndalloc++; + arena->stats.lstats[oldindex].curruns--; arena->stats.nmalloc_large++; arena->stats.nrequests_large++; arena->stats.allocated_large += size; - arena->stats.lstats[(size >> LG_PAGE) - 1].nmalloc++; - arena->stats.lstats[(size >> LG_PAGE) - 1].nrequests++; - arena->stats.lstats[(size >> LG_PAGE) - 1].curruns++; + arena->stats.lstats[index].nmalloc++; + arena->stats.lstats[index].nrequests++; + arena->stats.lstats[index].curruns++; } malloc_mutex_unlock(&arena->lock); } @@ -1909,24 +1889,30 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; size_t npages = oldsize >> LG_PAGE; size_t followsize; + size_t usize_min = s2u(size); assert(oldsize == 
arena_mapbits_large_size_get(chunk, pageind)); /* Try to extend the run. */ - assert(size + extra > oldsize); + assert(usize_min > oldsize); malloc_mutex_lock(&arena->lock); if (pageind + npages < chunk_npages && arena_mapbits_allocated_get(chunk, pageind+npages) == 0 && (followsize = arena_mapbits_unallocated_size_get(chunk, - pageind+npages)) >= size - oldsize) { + pageind+npages)) >= usize_min - oldsize) { /* * The next run is available and sufficiently large. Split the * following run, then merge the first part with the existing * allocation. */ - size_t flag_dirty; - size_t splitsize = (oldsize + followsize <= size + extra) - ? followsize : size + extra - oldsize; + size_t flag_dirty, splitsize, usize; + + usize = s2u(size + extra); + while (oldsize + followsize < usize) + usize = index2size(size2index(usize)-1); + assert(usize >= usize_min); + splitsize = usize - oldsize; + arena_run_t *run = &arena_miscelm_get(chunk, pageind+npages)->run; arena_run_split_large(arena, run, splitsize, zero); @@ -1948,17 +1934,20 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_mapbits_large_set(chunk, pageind+npages-1, 0, flag_dirty); if (config_stats) { + index_t oldindex = size2index(oldsize) - NBINS; + index_t index = size2index(size) - NBINS; + arena->stats.ndalloc_large++; arena->stats.allocated_large -= oldsize; - arena->stats.lstats[(oldsize >> LG_PAGE) - 1].ndalloc++; - arena->stats.lstats[(oldsize >> LG_PAGE) - 1].curruns--; + arena->stats.lstats[oldindex].ndalloc++; + arena->stats.lstats[oldindex].curruns--; arena->stats.nmalloc_large++; arena->stats.nrequests_large++; arena->stats.allocated_large += size; - arena->stats.lstats[(size >> LG_PAGE) - 1].nmalloc++; - arena->stats.lstats[(size >> LG_PAGE) - 1].nrequests++; - arena->stats.lstats[(size >> LG_PAGE) - 1].curruns++; + arena->stats.lstats[index].nmalloc++; + arena->stats.lstats[index].nrequests++; + arena->stats.lstats[index].curruns++; } malloc_mutex_unlock(&arena->lock); return (false); @@ -1996,10 +1985,14 @@ static bool arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero) { - size_t psize; + size_t usize; - psize = PAGE_CEILING(size + extra); - if (psize == oldsize) { + /* Make sure extra can't cause size_t overflow. */ + if (extra >= arena_maxclass) + return (true); + + usize = s2u(size + extra); + if (usize == oldsize) { /* Same size class. */ return (false); } else { @@ -2009,16 +2002,15 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - if (psize < oldsize) { + if (usize < oldsize) { /* Fill before shrinking in order avoid a race. 
*/ - arena_ralloc_junk_large(ptr, oldsize, psize); + arena_ralloc_junk_large(ptr, oldsize, usize); arena_ralloc_large_shrink(arena, chunk, ptr, oldsize, - psize); + usize); return (false); } else { bool ret = arena_ralloc_large_grow(arena, chunk, ptr, - oldsize, PAGE_CEILING(size), - psize - PAGE_CEILING(size), zero); + oldsize, size, extra, zero); if (config_fill && !ret && !zero) { if (unlikely(opt_junk)) { memset((void *)((uintptr_t)ptr + @@ -2045,12 +2037,11 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, */ if (oldsize <= arena_maxclass) { if (oldsize <= SMALL_MAXCLASS) { - assert(arena_bin_info[small_size2bin(oldsize)].reg_size + assert(arena_bin_info[size2index(oldsize)].reg_size == oldsize); - if ((size + extra <= SMALL_MAXCLASS && - small_size2bin(size + extra) == - small_size2bin(oldsize)) || (size <= oldsize && - size + extra >= oldsize)) + if ((size + extra <= SMALL_MAXCLASS && size2index(size + + extra) == size2index(oldsize)) || (size <= oldsize + && size + extra >= oldsize)) return (false); } else { assert(size <= arena_maxclass); @@ -2258,7 +2249,7 @@ arena_new(arena_t *arena, unsigned ind) /* * Calculate bin_info->run_size such that it meets the following constraints: * - * *) bin_info->run_size <= arena_maxclass + * *) bin_info->run_size <= arena_maxrun * *) bin_info->nregs <= RUN_MAXREGS * * bin_info->nregs and bin_info->reg0_offset are also calculated here, since @@ -2330,7 +2321,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info) /* * Make sure that the run will fit within an arena chunk. */ - while (actual_run_size > arena_maxclass) { + while (actual_run_size > arena_maxrun) { actual_run_size -= PAGE; actual_nregs = (actual_run_size - pad_size) / bin_info->reg_interval; @@ -2396,7 +2387,17 @@ arena_boot(void) map_misc_offset = offsetof(arena_chunk_t, map_bits) + sizeof(arena_chunk_map_bits_t) * (chunk_npages-map_bias); - arena_maxclass = chunksize - (map_bias << LG_PAGE); + arena_maxrun = chunksize - (map_bias << LG_PAGE); + arena_maxclass = index2size(size2index(chunksize)-1); + if (arena_maxclass > arena_maxrun) { + /* + * For small chunk sizes it's possible for there to be fewer + * non-header pages available than are necessary to serve the + * size classes just below chunksize. + */ + arena_maxclass = arena_maxrun; + } + nlclasses = size2index(arena_maxclass) - size2index(SMALL_MAXCLASS); bin_info_init(); } diff --git a/src/chunk.c b/src/chunk.c index 32b8b3a..618aaca 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -27,9 +27,6 @@ rtree_t *chunks_rtree; size_t chunksize; size_t chunksize_mask; /* (chunksize - 1). */ size_t chunk_npages; -size_t map_bias; -size_t map_misc_offset; -size_t arena_maxclass; /* Max size class for arenas. 
*/ /******************************************************************************/ /* diff --git a/src/ctl.c b/src/ctl.c index 309f1f6..f1f3234 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1628,7 +1628,7 @@ arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i) } CTL_RO_NL_GEN(arenas_nlruns, nlclasses, size_t) -CTL_RO_NL_GEN(arenas_lrun_i_size, ((mib[2]+1) << LG_PAGE), size_t) +CTL_RO_NL_GEN(arenas_lrun_i_size, index2size(NBINS+mib[2]), size_t) static const ctl_named_node_t * arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i) { diff --git a/src/huge.c b/src/huge.c index 6bdc076..ae41625 100644 --- a/src/huge.c +++ b/src/huge.c @@ -15,12 +15,19 @@ static extent_tree_t huge; void * huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero) { + size_t usize; - return (huge_palloc(tsd, arena, size, chunksize, zero)); + usize = s2u(size); + if (usize == 0) { + /* size_t overflow. */ + return (NULL); + } + + return (huge_palloc(tsd, arena, usize, chunksize, zero)); } void * -huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, +huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, bool zero) { void *ret; @@ -30,11 +37,8 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, /* Allocate one or more contiguous chunks for this request. */ - csize = CHUNK_CEILING(size); - if (csize == 0) { - /* size is large enough to cause size_t wrap-around. */ - return (NULL); - } + csize = CHUNK_CEILING(usize); + assert(csize >= usize); /* Allocate an extent node with which to track the chunk. */ node = base_node_alloc(); @@ -55,7 +59,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, /* Insert node into huge. */ node->addr = ret; - node->size = csize; + node->size = usize; node->arena = arena; malloc_mutex_lock(&huge_mtx); @@ -64,9 +68,9 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, if (config_fill && !zero) { if (unlikely(opt_junk)) - memset(ret, 0xa5, csize); + memset(ret, 0xa5, usize); else if (unlikely(opt_zero) && !is_zeroed) - memset(ret, 0, csize); + memset(ret, 0, usize); } return (ret); @@ -97,7 +101,7 @@ huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); static bool huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { - size_t csize; + size_t usize; void *expand_addr; size_t expand_size; extent_node_t *node, key; @@ -105,14 +109,14 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { bool is_zeroed; void *ret; - csize = CHUNK_CEILING(size); - if (csize == 0) { - /* size is large enough to cause size_t wrap-around. */ + usize = s2u(size); + if (usize == 0) { + /* size_t overflow. */ return (true); } - expand_addr = ptr + oldsize; - expand_size = csize - oldsize; + expand_addr = ptr + CHUNK_CEILING(oldsize); + expand_size = CHUNK_CEILING(usize) - CHUNK_CEILING(oldsize); malloc_mutex_lock(&huge_mtx); @@ -140,14 +144,14 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { malloc_mutex_lock(&huge_mtx); /* Update the size of the huge allocation. 
*/ - node->size = csize; + node->size = usize; malloc_mutex_unlock(&huge_mtx); if (config_fill && !zero) { if (unlikely(opt_junk)) - memset(expand_addr, 0xa5, expand_size); + memset(ptr + oldsize, 0xa5, usize - oldsize); else if (unlikely(opt_zero) && !is_zeroed) - memset(expand_addr, 0, expand_size); + memset(ptr + oldsize, 0, usize - oldsize); } return (false); } @@ -156,27 +160,71 @@ bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero) { + size_t usize; /* Both allocations must be huge to avoid a move. */ - if (oldsize <= arena_maxclass) + if (oldsize < chunksize) return (true); - assert(CHUNK_CEILING(oldsize) == oldsize); + assert(s2u(oldsize) == oldsize); + usize = s2u(size); + if (usize == 0) { + /* size_t overflow. */ + return (true); + } /* - * Avoid moving the allocation if the size class can be left the same. + * Avoid moving the allocation if the existing chunk size accommodates + * the new size. */ - if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size) + if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize) && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { + size_t usize_next; + + /* Increase usize to incorporate extra. */ + while (usize < s2u(size+extra) && (usize_next = s2u(usize+1)) < + oldsize) + usize = usize_next; + + /* Update the size of the huge allocation if it changed. */ + if (oldsize != usize) { + extent_node_t *node, key; + + malloc_mutex_lock(&huge_mtx); + + key.addr = ptr; + node = extent_tree_ad_search(&huge, &key); + assert(node != NULL); + assert(node->addr == ptr); + + assert(node->size != usize); + node->size = usize; + + malloc_mutex_unlock(&huge_mtx); + + if (oldsize < usize) { + if (zero || (config_fill && + unlikely(opt_zero))) { + memset(ptr + oldsize, 0, usize - + oldsize); + } else if (config_fill && unlikely(opt_junk)) { + memset(ptr + oldsize, 0xa5, usize - + oldsize); + } + } else if (config_fill && unlikely(opt_junk) && oldsize + > usize) + memset(ptr + usize, 0x5a, oldsize - usize); + } return (false); } - /* Overflow. */ - if (CHUNK_CEILING(size) == 0) - return (true); + if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size) + && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { + return (false); + } /* Shrink the allocation in-place. */ - if (CHUNK_CEILING(oldsize) > CHUNK_CEILING(size)) { + if (CHUNK_CEILING(oldsize) > CHUNK_CEILING(usize)) { extent_node_t *node, key; void *excess_addr; size_t excess_size; @@ -189,15 +237,15 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, assert(node->addr == ptr); /* Update the size of the huge allocation. */ - node->size = CHUNK_CEILING(size); + node->size = usize; malloc_mutex_unlock(&huge_mtx); - excess_addr = node->addr + CHUNK_CEILING(size); - excess_size = CHUNK_CEILING(oldsize) - CHUNK_CEILING(size); + excess_addr = node->addr + CHUNK_CEILING(usize); + excess_size = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize); /* Zap the excess chunks. 
*/ - huge_dalloc_junk(excess_addr, excess_size); + huge_dalloc_junk(ptr + usize, oldsize - usize); arena_chunk_dalloc_huge(node->arena, excess_addr, excess_size); return (false); @@ -275,7 +323,8 @@ huge_dalloc(void *ptr) malloc_mutex_unlock(&huge_mtx); huge_dalloc_junk(node->addr, node->size); - arena_chunk_dalloc_huge(node->arena, node->addr, node->size); + arena_chunk_dalloc_huge(node->arena, node->addr, + CHUNK_CEILING(node->size)); base_node_dalloc(node); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 3490ecd..f3750b4 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -42,6 +42,38 @@ unsigned narenas_auto; /* Set to true once the allocator has been initialized. */ static bool malloc_initialized = false; +JEMALLOC_ALIGNED(CACHELINE) +const size_t index2size_tab[NSIZES] = { +#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ + ((ZU(1)<next_gc_bin; + index_t binind = tcache->next_gc_bin; tcache_bin_t *tbin = &tcache->tbins[binind]; tcache_bin_info_t *tbin_info = &tcache_bin_info[binind]; @@ -62,7 +62,7 @@ tcache_event_hard(tcache_t *tcache) } void * -tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind) +tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, index_t binind) { void *ret; @@ -76,7 +76,7 @@ tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, size_t binind) } void -tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem, +tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, tcache_t *tcache) { void *ptr; @@ -153,7 +153,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem, } void -tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem, +tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, tcache_t *tcache) { void *ptr; diff --git a/test/unit/junk.c b/test/unit/junk.c index 301428f..5b35a87 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -88,7 +88,6 @@ test_junk(size_t sz_min, size_t sz_max) if (xallocx(s, sz+1, 0, 0) == sz) { void *junked = (void *)s; - s = (char *)rallocx(s, sz+1, 0); assert_ptr_not_null((void *)s, "Unexpected rallocx() failure"); @@ -134,13 +133,25 @@ TEST_END arena_ralloc_junk_large_t *arena_ralloc_junk_large_orig; static void *most_recently_trimmed; +static size_t +shrink_size(size_t size) +{ + size_t shrink_size; + + for (shrink_size = size - 1; nallocx(shrink_size, 0) == size; + shrink_size--) + ; /* Do nothing. 
*/ + + return (shrink_size); +} + static void arena_ralloc_junk_large_intercept(void *ptr, size_t old_usize, size_t usize) { arena_ralloc_junk_large_orig(ptr, old_usize, usize); assert_zu_eq(old_usize, arena_maxclass, "Unexpected old_usize"); - assert_zu_eq(usize, arena_maxclass-PAGE, "Unexpected usize"); + assert_zu_eq(usize, shrink_size(arena_maxclass), "Unexpected usize"); most_recently_trimmed = ptr; } @@ -154,7 +165,7 @@ TEST_BEGIN(test_junk_large_ralloc_shrink) arena_ralloc_junk_large_orig = arena_ralloc_junk_large; arena_ralloc_junk_large = arena_ralloc_junk_large_intercept; - p2 = rallocx(p1, arena_maxclass-PAGE, 0); + p2 = rallocx(p1, shrink_size(arena_maxclass), 0); assert_ptr_eq(p1, p2, "Unexpected move during shrink"); arena_ralloc_junk_large = arena_ralloc_junk_large_orig; diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index c70473c..e62e54f 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -357,7 +357,7 @@ TEST_BEGIN(test_arenas_lrun_constants) assert_zu_eq(name, expected, "Incorrect "#name" size"); \ } while (0) - TEST_ARENAS_LRUN_CONSTANT(size_t, size, (1 << LG_PAGE)); + TEST_ARENAS_LRUN_CONSTANT(size_t, size, (1 << (LG_PAGE+2))); #undef TEST_ARENAS_LRUN_CONSTANT } -- cgit v0.12 From bf40641c5c9496d2912ad9ff2c38ee9ce2bfbde6 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 6 Oct 2014 16:35:11 -0700 Subject: Fix a prof_tctx_t destruction race. --- src/prof.c | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/prof.c b/src/prof.c index a6cea92..b3150a2 100644 --- a/src/prof.c +++ b/src/prof.c @@ -609,7 +609,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) { prof_tdata_t *tdata = tctx->tdata; prof_gctx_t *gctx = tctx->gctx; - bool destroy_tdata, destroy_gctx; + bool destroy_tdata, destroy_tctx, destroy_gctx; assert(tctx->cnts.curobjs == 0); assert(tctx->cnts.curbytes == 0); @@ -622,25 +622,38 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) malloc_mutex_unlock(tdata->lock); malloc_mutex_lock(gctx->lock); - tctx_tree_remove(&gctx->tctxs, tctx); - if (prof_gctx_should_destroy(gctx)) { + if (tctx->state != prof_tctx_state_dumping) { + tctx_tree_remove(&gctx->tctxs, tctx); + destroy_tctx = true; + if (prof_gctx_should_destroy(gctx)) { + /* + * Increment gctx->nlimbo in order to keep another + * thread from winning the race to destroy gctx while + * this one has gctx->lock dropped. Without this, it + * would be possible for another thread to: + * + * 1) Sample an allocation associated with gctx. + * 2) Deallocate the sampled object. + * 3) Successfully prof_gctx_try_destroy(gctx). + * + * The result would be that gctx no longer exists by the + * time this thread accesses it in + * prof_gctx_try_destroy(). + */ + gctx->nlimbo++; + destroy_gctx = true; + } else + destroy_gctx = false; + } else { /* - * Increment gctx->nlimbo in order to keep another thread from - * winning the race to destroy gctx while this one has - * gctx->lock dropped. Without this, it would be possible for - * another thread to: - * - * 1) Sample an allocation associated with gctx. - * 2) Deallocate the sampled object. - * 3) Successfully prof_gctx_try_destroy(gctx). - * - * The result would be that gctx no longer exists by the time - * this thread accesses it in prof_gctx_try_destroy(). + * A dumping thread needs tctx to remain valid until dumping + * has finished. Change state such that the dumping thread will + * complete destruction during a late dump iteration phase. 
*/ - gctx->nlimbo++; - destroy_gctx = true; - } else + tctx->state = prof_tctx_state_purgatory; + destroy_tctx = false; destroy_gctx = false; + } malloc_mutex_unlock(gctx->lock); if (destroy_gctx) prof_gctx_try_destroy(tsd, gctx, tdata); @@ -648,7 +661,8 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) if (destroy_tdata) prof_tdata_destroy(tsd, tdata, false); - idalloc(tsd, tctx); + if (destroy_tctx) + idalloc(tsd, tctx); } static bool -- cgit v0.12 From 8bb3198f72fc7587dc93527f9f19fb5be52fa553 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 7 Oct 2014 23:14:57 -0700 Subject: Refactor/fix arenas manipulation. Abstract arenas access to use arena_get() (or a0get() where appropriate) rather than directly reading e.g. arenas[ind]. Prior to the addition of the arenas.extend mallctl, the worst possible outcome of directly accessing arenas was a stale read, but arenas.extend may allocate and assign a new array to arenas. Add a tsd-based arenas_cache, which amortizes arenas reads. This introduces some subtle bootstrapping issues, with tsd_boot() now being split into tsd_boot[01]() to support tsd wrapper allocation bootstrapping, as well as an arenas_cache_bypass tsd variable which dynamically terminates allocation of arenas_cache itself. Promote a0malloc(), a0calloc(), and a0free() to be generally useful for internal allocation, and use them in several places (more may be appropriate). Abstract arena->nthreads management and fix a missing decrement during thread destruction (recent tsd refactoring left arenas_cleanup() unused). Change arena_choose() to propagate OOM, and handle OOM in all callers. This is important for providing consistent allocation behavior when the MALLOCX_ARENA() flag is being used. Prior to this fix, it was possible for an OOM to result in allocation silently allocating from a different arena than the one specified. 
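
For example (illustrative only, and using just the public mallctl()/mallocx() API rather than anything internal added by this patch), a caller that creates an arena via the arenas.extend mallctl and pins allocations to it now observes OOM directly instead of silently receiving memory from a different arena:

    #include <jemalloc/jemalloc.h>

    /*
     * Minimal sketch, not part of this patch: create an arena and
     * allocate from it explicitly.  With arena_choose()/arena_get()
     * propagating OOM, the mallocx() call either succeeds from the
     * requested arena or returns NULL; it no longer silently falls
     * back to another arena.
     */
    int
    main(void)
    {
            unsigned arena_ind;
            size_t sz = sizeof(arena_ind);
            void *p;

            if (mallctl("arenas.extend", &arena_ind, &sz, NULL, 0) != 0)
                    return (1);     /* Could not create an arena. */
            p = mallocx(4096, MALLOCX_ARENA(arena_ind));
            if (p == NULL)
                    return (1);     /* OOM is reported, not masked. */
            dallocx(p, 0);
            return (0);
    }
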
--- include/jemalloc/internal/arena.h | 14 +- include/jemalloc/internal/jemalloc_internal.h.in | 90 ++-- include/jemalloc/internal/private_symbols.txt | 28 +- include/jemalloc/internal/tcache.h | 1 + include/jemalloc/internal/tsd.h | 239 ++++++++--- src/arena.c | 30 +- src/chunk.c | 10 +- src/ctl.c | 119 ++---- src/huge.c | 6 +- src/jemalloc.c | 516 ++++++++++++++++------- src/tcache.c | 14 +- src/tsd.c | 19 +- test/unit/tsd.c | 1 + 13 files changed, 740 insertions(+), 347 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 681b580..894ce9a 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -389,7 +389,7 @@ bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats); -bool arena_new(arena_t *arena, unsigned ind); +arena_t *arena_new(unsigned ind); void arena_boot(void); void arena_prefork(arena_t *arena); void arena_postfork_parent(arena_t *arena); @@ -924,8 +924,10 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, true)) != NULL)) return (tcache_alloc_small(tcache, size, zero)); else { - return (arena_malloc_small(choose_arena(tsd, arena), - size, zero)); + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) + return (NULL); + return (arena_malloc_small(arena, size, zero)); } } else { /* @@ -936,8 +938,10 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, tcache_get(tsd, true)) != NULL)) return (tcache_alloc_large(tcache, size, zero)); else { - return (arena_malloc_large(choose_arena(tsd, arena), - size, zero)); + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) + return (NULL); + return (arena_malloc_large(arena, size, zero)); } } } diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 8f0beb9..c7a5fd8 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -386,20 +386,6 @@ extern bool in_valgrind; /* Number of CPUs. */ extern unsigned ncpus; -/* Protects arenas initialization (arenas, arenas_total). */ -extern malloc_mutex_t arenas_lock; -/* - * Arenas that are used to service external requests. Not all elements of the - * arenas array are necessarily used; arenas are created lazily as needed. - * - * arenas[0..narenas_auto) are used for automatic multiplexing of threads and - * arenas. arenas[narenas_auto..narenas_total) are only used if the application - * takes some action to create them and allocate from them. - */ -extern arena_t **arenas; -extern unsigned narenas_total; -extern unsigned narenas_auto; /* Read-only after initialization. */ - /* * index2size_tab encodes the same information as could be computed (at * unacceptable cost in some code paths) by index2size_compute(). 
@@ -412,11 +398,23 @@ extern size_t const index2size_tab[NSIZES]; */ extern uint8_t const size2index_tab[]; +arena_t *a0get(void); +void *a0malloc(size_t size); +void *a0calloc(size_t num, size_t size); +void a0free(void *ptr); arena_t *arenas_extend(unsigned ind); -arena_t *choose_arena_hard(tsd_t *tsd); +arena_t *arena_init(unsigned ind); +unsigned narenas_total_get(void); +arena_t *arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing); +arena_t *arena_choose_hard(tsd_t *tsd); +void arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind); +unsigned arena_nbound(unsigned ind); void thread_allocated_cleanup(tsd_t *tsd); void thread_deallocated_cleanup(tsd_t *tsd); void arena_cleanup(tsd_t *tsd); +void arenas_cache_cleanup(tsd_t *tsd); +void narenas_cache_cleanup(tsd_t *tsd); +void arenas_cache_bypass_cleanup(tsd_t *tsd); void jemalloc_prefork(void); void jemalloc_postfork_parent(void); void jemalloc_postfork_child(void); @@ -475,8 +473,9 @@ size_t s2u_compute(size_t size); size_t s2u_lookup(size_t size); size_t s2u(size_t size); size_t sa2u(size_t size, size_t alignment); -unsigned narenas_total_get(void); -arena_t *choose_arena(tsd_t *tsd, arena_t *arena); +arena_t *arena_choose(tsd_t *tsd, arena_t *arena); +arena_t *arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, + bool refresh_if_missing); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) @@ -709,34 +708,51 @@ sa2u(size_t size, size_t alignment) return (usize); } -JEMALLOC_INLINE unsigned -narenas_total_get(void) -{ - unsigned narenas; - - malloc_mutex_lock(&arenas_lock); - narenas = narenas_total; - malloc_mutex_unlock(&arenas_lock); - - return (narenas); -} - /* Choose an arena based on a per-thread value. */ JEMALLOC_INLINE arena_t * -choose_arena(tsd_t *tsd, arena_t *arena) +arena_choose(tsd_t *tsd, arena_t *arena) { arena_t *ret; if (arena != NULL) return (arena); - if (unlikely((ret = tsd_arena_get(tsd)) == NULL)) { - ret = choose_arena_hard(tsd); - assert(ret != NULL); - } + if (unlikely((ret = tsd_arena_get(tsd)) == NULL)) + ret = arena_choose_hard(tsd); return (ret); } + +JEMALLOC_INLINE arena_t * +arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, + bool refresh_if_missing) +{ + arena_t *arena; + arena_t **arenas_cache = tsd_arenas_cache_get(tsd); + + /* init_if_missing requires refresh_if_missing. */ + assert(!init_if_missing || refresh_if_missing); + + if (unlikely(arenas_cache == NULL)) { + /* arenas_cache hasn't been initialized yet. */ + return (arena_get_hard(tsd, ind, init_if_missing)); + } + if (unlikely(ind >= tsd_narenas_cache_get(tsd))) { + /* + * ind is invalid, cache is old (too small), or arena to be + * initialized. + */ + return (refresh_if_missing ? 
arena_get_hard(tsd, ind, + init_if_missing) : NULL); + } + arena = arenas_cache[ind]; + if (likely(arena != NULL) || !refresh_if_missing) + return (arena); + if (init_if_missing) + return (arena_get_hard(tsd, ind, init_if_missing)); + else + return (NULL); +} #endif #include "jemalloc/internal/bitmap.h" @@ -833,8 +849,10 @@ ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, ret = arena_malloc(tsd, arena, usize, zero, try_tcache); else { if (usize <= arena_maxclass) { - ret = arena_palloc(choose_arena(tsd, arena), usize, - alignment, zero); + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) + return (NULL); + ret = arena_palloc(arena, usize, alignment, zero); } else if (alignment <= chunksize) ret = huge_malloc(tsd, arena, usize, zero); else diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 1a7fde4..d5e6fdc 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -1,11 +1,16 @@ a0calloc a0free +a0get a0malloc +arena_get +arena_get_hard arena_alloc_junk_small arena_bin_index arena_bin_info arena_bitselm_get arena_boot +arena_choose +arena_choose_hard arena_chunk_alloc_huge arena_chunk_dalloc_huge arena_cleanup @@ -19,6 +24,7 @@ arena_dalloc_large_locked arena_dalloc_small arena_dss_prec_get arena_dss_prec_set +arena_init arena_malloc arena_malloc_large arena_malloc_small @@ -42,9 +48,11 @@ arena_mapbitsp_read arena_mapbitsp_write arena_maxclass arena_maxrun +arena_migrate arena_miscelm_get arena_miscelm_to_pageind arena_miscelm_to_rpages +arena_nbound arena_new arena_palloc arena_postfork_child @@ -69,10 +77,8 @@ arena_salloc arena_sdalloc arena_stats_merge arena_tcache_fill_small -arenas -arenas_cleanup -arenas_extend -arenas_lock +arenas_cache_bypass_cleanup +arenas_cache_cleanup atomic_add_u atomic_add_uint32 atomic_add_uint64 @@ -100,8 +106,6 @@ bitmap_size bitmap_unset bt_init buferror -choose_arena -choose_arena_hard chunk_alloc_arena chunk_alloc_base chunk_alloc_default @@ -247,7 +251,8 @@ malloc_mutex_unlock malloc_printf malloc_snprintf malloc_strtoumax -malloc_tsd_boot +malloc_tsd_boot0 +malloc_tsd_boot1 malloc_tsd_cleanup_register malloc_tsd_dalloc malloc_tsd_malloc @@ -259,8 +264,7 @@ map_bias map_misc_offset mb_write mutex_boot -narenas_auto -narenas_total +narenas_cache_cleanup narenas_total_get ncpus nhbins @@ -363,6 +367,7 @@ tcache_alloc_small tcache_alloc_small_hard tcache_arena_associate tcache_arena_dissociate +tcache_arena_reassociate tcache_bin_flush_large tcache_bin_flush_small tcache_bin_info @@ -388,11 +393,14 @@ tsd_booted tsd_arena_get tsd_arena_set tsd_boot +tsd_boot0 +tsd_boot1 tsd_cleanup tsd_cleanup_wrapper tsd_fetch tsd_get -tsd_get_wrapper +tsd_wrapper_get +tsd_wrapper_set tsd_initialized tsd_init_check_recursion tsd_init_finish diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index da8e4ef..02eec5d 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -109,6 +109,7 @@ void tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, void tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, tcache_t *tcache); void tcache_arena_associate(tcache_t *tcache, arena_t *arena); +void tcache_arena_reassociate(tcache_t *tcache, arena_t *arena); void tcache_arena_dissociate(tcache_t *tcache); tcache_t *tcache_get_hard(tsd_t *tsd); tcache_t *tcache_create(arena_t *arena); diff --git 
a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 2545039..b5658f8 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -2,7 +2,7 @@ #ifdef JEMALLOC_H_TYPES /* Maximum number of malloc_tsd users with cleanup functions. */ -#define MALLOC_TSD_CLEANUPS_MAX 8 +#define MALLOC_TSD_CLEANUPS_MAX 2 typedef bool (*malloc_tsd_cleanup_t)(void); @@ -23,7 +23,7 @@ typedef enum { /* * TLS/TSD-agnostic macro-based implementation of thread-specific data. There - * are four macros that support (at least) three use cases: file-private, + * are five macros that support (at least) three use cases: file-private, * library-private, and library-private inlined. Following is an example * library-private tsd variable: * @@ -33,18 +33,19 @@ typedef enum { * int y; * } example_t; * #define EX_INITIALIZER JEMALLOC_CONCAT({0, 0}) - * malloc_tsd_protos(, example_, example_t *) - * malloc_tsd_externs(example_, example_t *) + * malloc_tsd_types(example_, example_t) + * malloc_tsd_protos(, example_, example_t) + * malloc_tsd_externs(example_, example_t) * In example.c: - * malloc_tsd_data(, example_, example_t *, EX_INITIALIZER) - * malloc_tsd_funcs(, example_, example_t *, EX_INITIALIZER, + * malloc_tsd_data(, example_, example_t, EX_INITIALIZER) + * malloc_tsd_funcs(, example_, example_t, EX_INITIALIZER, * example_tsd_cleanup) * * The result is a set of generated functions, e.g.: * * bool example_tsd_boot(void) {...} - * example_t **example_tsd_get() {...} - * void example_tsd_set(example_t **val) {...} + * example_t *example_tsd_get() {...} + * void example_tsd_set(example_t *val) {...} * * Note that all of the functions deal in terms of (a_type *) rather than * (a_type) so that it is possible to support non-pointer types (unlike @@ -70,9 +71,32 @@ typedef enum { * non-NULL. */ +/* malloc_tsd_types(). */ +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +#define malloc_tsd_types(a_name, a_type) +#elif (defined(JEMALLOC_TLS)) +#define malloc_tsd_types(a_name, a_type) +#elif (defined(_WIN32)) +#define malloc_tsd_types(a_name, a_type) \ +typedef struct { \ + bool initialized; \ + a_type val; \ +} a_name##tsd_wrapper_t; +#else +#define malloc_tsd_types(a_name, a_type) \ +typedef struct { \ + bool initialized; \ + a_type val; \ +} a_name##tsd_wrapper_t; +#endif + /* malloc_tsd_protos(). 
*/ #define malloc_tsd_protos(a_attr, a_name, a_type) \ a_attr bool \ +a_name##tsd_boot0(void); \ +a_attr void \ +a_name##tsd_boot1(void); \ +a_attr bool \ a_name##tsd_boot(void); \ a_attr a_type * \ a_name##tsd_get(void); \ @@ -93,11 +117,13 @@ extern bool a_name##tsd_booted; #elif (defined(_WIN32)) #define malloc_tsd_externs(a_name, a_type) \ extern DWORD a_name##tsd_tsd; \ +extern a_name##tsd_wrapper_t a_name##tsd_boot_wrapper; \ extern bool a_name##tsd_booted; #else #define malloc_tsd_externs(a_name, a_type) \ extern pthread_key_t a_name##tsd_tsd; \ extern tsd_init_head_t a_name##tsd_init_head; \ +extern a_name##tsd_wrapper_t a_name##tsd_boot_wrapper; \ extern bool a_name##tsd_booted; #endif @@ -118,6 +144,10 @@ a_attr bool a_name##tsd_booted = false; #elif (defined(_WIN32)) #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ a_attr DWORD a_name##tsd_tsd; \ +a_attr a_name##tsd_wrapper_t a_name##tsd_boot_wrapper = { \ + false, \ + a_initializer \ +}; \ a_attr bool a_name##tsd_booted = false; #else #define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \ @@ -126,6 +156,10 @@ a_attr tsd_init_head_t a_name##tsd_init_head = { \ ql_head_initializer(blocks), \ MALLOC_MUTEX_INITIALIZER \ }; \ +a_attr a_name##tsd_wrapper_t a_name##tsd_boot_wrapper = { \ + false, \ + a_initializer \ +}; \ a_attr bool a_name##tsd_booted = false; #endif @@ -145,7 +179,7 @@ a_name##tsd_cleanup_wrapper(void) \ return (a_name##tsd_initialized); \ } \ a_attr bool \ -a_name##tsd_boot(void) \ +a_name##tsd_boot0(void) \ { \ \ if (a_cleanup != malloc_tsd_no_cleanup) { \ @@ -155,6 +189,18 @@ a_name##tsd_boot(void) \ a_name##tsd_booted = true; \ return (false); \ } \ +a_attr void \ +a_name##tsd_boot1() \ +{ \ + \ + /* Do nothing. */ \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + return (a_name##tsd_boot0()); \ +} \ /* Get/set. */ \ a_attr a_type * \ a_name##tsd_get(void) \ @@ -177,7 +223,7 @@ a_name##tsd_set(a_type *val) \ a_cleanup) \ /* Initialization/cleanup. */ \ a_attr bool \ -a_name##tsd_boot(void) \ +a_name##tsd_boot0(void) \ { \ \ if (a_cleanup != malloc_tsd_no_cleanup) { \ @@ -188,6 +234,18 @@ a_name##tsd_boot(void) \ a_name##tsd_booted = true; \ return (false); \ } \ +a_attr void \ +a_name##tsd_boot1() \ +{ \ + \ + /* Do nothing. */ \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + return (a_name##tsd_boot0()); \ +} \ /* Get/set. */ \ a_attr a_type * \ a_name##tsd_get(void) \ @@ -215,11 +273,6 @@ a_name##tsd_set(a_type *val) \ #elif (defined(_WIN32)) #define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ a_cleanup) \ -/* Data structure. */ \ -typedef struct { \ - bool initialized; \ - a_type val; \ -} a_name##tsd_wrapper_t; \ /* Initialization/cleanup. */ \ a_attr bool \ a_name##tsd_cleanup_wrapper(void) \ @@ -241,23 +294,18 @@ a_name##tsd_cleanup_wrapper(void) \ malloc_tsd_dalloc(wrapper); \ return (false); \ } \ -a_attr bool \ -a_name##tsd_boot(void) \ +a_attr void \ +a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \ { \ \ - a_name##tsd_tsd = TlsAlloc(); \ - if (a_name##tsd_tsd == TLS_OUT_OF_INDEXES) \ - return (true); \ - if (a_cleanup != malloc_tsd_no_cleanup) { \ - malloc_tsd_cleanup_register( \ - &a_name##tsd_cleanup_wrapper); \ + if (!TlsSetValue(a_name##tsd_tsd, (void *)wrapper)) { \ + malloc_write(": Error setting" \ + " TSD for "#a_name"\n"); \ + abort(); \ } \ - a_name##tsd_booted = true; \ - return (false); \ } \ -/* Get/set. 
*/ \ a_attr a_name##tsd_wrapper_t * \ -a_name##tsd_get_wrapper(void) \ +a_name##tsd_wrapper_get(void) \ { \ a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ TlsGetValue(a_name##tsd_tsd); \ @@ -273,21 +321,63 @@ a_name##tsd_get_wrapper(void) \ wrapper->initialized = false; \ wrapper->val = a_initializer; \ } \ - if (!TlsSetValue(a_name##tsd_tsd, (void *)wrapper)) { \ - malloc_write(": Error setting" \ - " TSD for "#a_name"\n"); \ - abort(); \ - } \ + a_name##tsd_wrapper_set(wrapper); \ } \ return (wrapper); \ } \ +a_attr bool \ +a_name##tsd_boot0(void) \ +{ \ + \ + a_name##tsd_tsd = TlsAlloc(); \ + if (a_name##tsd_tsd == TLS_OUT_OF_INDEXES) \ + return (true); \ + if (a_cleanup != malloc_tsd_no_cleanup) { \ + malloc_tsd_cleanup_register( \ + &a_name##tsd_cleanup_wrapper); \ + } \ + a_name##tsd_wrapper_set(&a_name##tsd_boot_wrapper); \ + a_name##tsd_booted = true; \ + return (false); \ +} \ +a_attr void \ +a_name##tsd_boot1() \ +{ \ + a_name##tsd_wrapper_t *wrapper; \ + wrapper = (a_name##tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ + if (wrapper == NULL) { \ + malloc_write(": Error allocating" \ + " TSD for "#a_name"\n"); \ + abort(); \ + } \ + memcpy(wrapper, &a_name##tsd_boot_wrapper, \ + sizeof(a_name##tsd_wrapper_t)); \ + a_name##tsd_wrapper_set(wrapper); \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + if (a_name##tsd_boot0()) \ + return (true); \ + a_name##tsd_boot1(); \ + return (false); \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + return (false); \ +} \ +/* Get/set. */ \ a_attr a_type * \ a_name##tsd_get(void) \ { \ a_name##tsd_wrapper_t *wrapper; \ \ assert(a_name##tsd_booted); \ - wrapper = a_name##tsd_get_wrapper(); \ + wrapper = a_name##tsd_wrapper_get(); \ return (&wrapper->val); \ } \ a_attr void \ @@ -296,7 +386,7 @@ a_name##tsd_set(a_type *val) \ a_name##tsd_wrapper_t *wrapper; \ \ assert(a_name##tsd_booted); \ - wrapper = a_name##tsd_get_wrapper(); \ + wrapper = a_name##tsd_wrapper_get(); \ wrapper->val = *(val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ wrapper->initialized = true; \ @@ -304,11 +394,6 @@ a_name##tsd_set(a_type *val) \ #else #define malloc_tsd_funcs(a_attr, a_name, a_type, a_initializer, \ a_cleanup) \ -/* Data structure. */ \ -typedef struct { \ - bool initialized; \ - a_type val; \ -} a_name##tsd_wrapper_t; \ /* Initialization/cleanup. */ \ a_attr void \ a_name##tsd_cleanup_wrapper(void *arg) \ @@ -333,19 +418,19 @@ a_name##tsd_cleanup_wrapper(void *arg) \ } \ malloc_tsd_dalloc(wrapper); \ } \ -a_attr bool \ -a_name##tsd_boot(void) \ +a_attr void \ +a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \ { \ \ - if (pthread_key_create(&a_name##tsd_tsd, \ - a_name##tsd_cleanup_wrapper) != 0) \ - return (true); \ - a_name##tsd_booted = true; \ - return (false); \ + if (pthread_setspecific(a_name##tsd_tsd, \ + (void *)wrapper)) { \ + malloc_write(": Error setting" \ + " TSD for "#a_name"\n"); \ + abort(); \ + } \ } \ -/* Get/set. 
*/ \ a_attr a_name##tsd_wrapper_t * \ -a_name##tsd_get_wrapper(void) \ +a_name##tsd_wrapper_get(void) \ { \ a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ pthread_getspecific(a_name##tsd_tsd); \ @@ -367,23 +452,54 @@ a_name##tsd_get_wrapper(void) \ wrapper->initialized = false; \ wrapper->val = a_initializer; \ } \ - if (pthread_setspecific(a_name##tsd_tsd, \ - (void *)wrapper)) { \ - malloc_write(": Error setting" \ - " TSD for "#a_name"\n"); \ - abort(); \ - } \ + a_name##tsd_wrapper_set(wrapper); \ tsd_init_finish(&a_name##tsd_init_head, &block); \ } \ return (wrapper); \ } \ +a_attr bool \ +a_name##tsd_boot0(void) \ +{ \ + \ + if (pthread_key_create(&a_name##tsd_tsd, \ + a_name##tsd_cleanup_wrapper) != 0) \ + return (true); \ + a_name##tsd_wrapper_set(&a_name##tsd_boot_wrapper); \ + a_name##tsd_booted = true; \ + return (false); \ +} \ +a_attr void \ +a_name##tsd_boot1() \ +{ \ + a_name##tsd_wrapper_t *wrapper; \ + wrapper = (a_name##tsd_wrapper_t *) \ + malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ + if (wrapper == NULL) { \ + malloc_write(": Error allocating" \ + " TSD for "#a_name"\n"); \ + abort(); \ + } \ + memcpy(wrapper, &a_name##tsd_boot_wrapper, \ + sizeof(a_name##tsd_wrapper_t)); \ + a_name##tsd_wrapper_set(wrapper); \ +} \ +a_attr bool \ +a_name##tsd_boot(void) \ +{ \ + \ + if (a_name##tsd_boot0()) \ + return (true); \ + a_name##tsd_boot1(); \ + return (false); \ +} \ +/* Get/set. */ \ a_attr a_type * \ a_name##tsd_get(void) \ { \ a_name##tsd_wrapper_t *wrapper; \ \ assert(a_name##tsd_booted); \ - wrapper = a_name##tsd_get_wrapper(); \ + wrapper = a_name##tsd_wrapper_get(); \ return (&wrapper->val); \ } \ a_attr void \ @@ -392,7 +508,7 @@ a_name##tsd_set(a_type *val) \ a_name##tsd_wrapper_t *wrapper; \ \ assert(a_name##tsd_booted); \ - wrapper = a_name##tsd_get_wrapper(); \ + wrapper = a_name##tsd_wrapper_get(); \ wrapper->val = *(val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ wrapper->initialized = true; \ @@ -423,6 +539,9 @@ struct tsd_init_head_s { O(thread_deallocated, uint64_t) \ O(prof_tdata, prof_tdata_t *) \ O(arena, arena_t *) \ + O(arenas_cache, arena_t **) \ + O(narenas_cache, unsigned) \ + O(arenas_cache_bypass, bool) \ O(tcache_enabled, tcache_enabled_t) \ O(quarantine, quarantine_t *) \ @@ -433,6 +552,9 @@ struct tsd_init_head_s { 0, \ NULL, \ NULL, \ + NULL, \ + 0, \ + false, \ tcache_enabled_default, \ NULL \ } @@ -447,6 +569,8 @@ MALLOC_TSD static const tsd_t tsd_initializer = TSD_INITIALIZER; +malloc_tsd_types(, tsd_t) + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS @@ -455,7 +579,8 @@ void *malloc_tsd_malloc(size_t size); void malloc_tsd_dalloc(void *wrapper); void malloc_tsd_no_cleanup(void *arg); void malloc_tsd_cleanup_register(bool (*f)(void)); -bool malloc_tsd_boot(void); +bool malloc_tsd_boot0(void); +void malloc_tsd_boot1(void); #if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \ !defined(_WIN32)) void *tsd_init_check_recursion(tsd_init_head_t *head, diff --git a/src/arena.c b/src/arena.c index 49a3057..86e5440 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2192,27 +2192,37 @@ arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, } } -bool -arena_new(arena_t *arena, unsigned ind) +arena_t * +arena_new(unsigned ind) { + arena_t *arena; unsigned i; arena_bin_t *bin; + /* + * Allocate arena and arena->lstats contiguously, mainly because there + * is no way to clean up if base_alloc() OOMs. 
+ */ + if (config_stats) { + arena = (arena_t *)base_alloc(CACHELINE_CEILING(sizeof(arena_t)) + + nlclasses * sizeof(malloc_large_stats_t)); + } else + arena = (arena_t *)base_alloc(sizeof(arena_t)); + if (arena == NULL) + return (NULL); + arena->ind = ind; arena->nthreads = 0; arena->chunk_alloc = chunk_alloc_default; arena->chunk_dalloc = chunk_dalloc_default; if (malloc_mutex_init(&arena->lock)) - return (true); + return (NULL); if (config_stats) { memset(&arena->stats, 0, sizeof(arena_stats_t)); - arena->stats.lstats = - (malloc_large_stats_t *)base_alloc(nlclasses * - sizeof(malloc_large_stats_t)); - if (arena->stats.lstats == NULL) - return (true); + arena->stats.lstats = (malloc_large_stats_t *)(((void *)arena) + + CACHELINE_CEILING(sizeof(arena_t))); memset(arena->stats.lstats, 0, nlclasses * sizeof(malloc_large_stats_t)); if (config_tcache) @@ -2236,14 +2246,14 @@ arena_new(arena_t *arena, unsigned ind) for (i = 0; i < NBINS; i++) { bin = &arena->bins[i]; if (malloc_mutex_init(&bin->lock)) - return (true); + return (NULL); bin->runcur = NULL; arena_run_tree_new(&bin->runs); if (config_stats) memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); } - return (false); + return (arena); } /* diff --git a/src/chunk.c b/src/chunk.c index 618aaca..f65b67a 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -254,9 +254,17 @@ void * chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind) { + arena_t *arena; + + arena = arena_get(tsd_fetch(), arena_ind, false, true); + /* + * The arena we're allocating on behalf of must have been initialized + * already. + */ + assert(arena != NULL); return (chunk_alloc_core(new_addr, size, alignment, false, zero, - arenas[arena_ind]->dss_prec)); + arena->dss_prec)); } static void diff --git a/src/ctl.c b/src/ctl.c index f1f3234..37f8f42 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -447,7 +447,7 @@ ctl_arena_init(ctl_arena_stats_t *astats) { if (astats->lstats == NULL) { - astats->lstats = (malloc_large_stats_t *)base_alloc(nlclasses * + astats->lstats = (malloc_large_stats_t *)a0malloc(nlclasses * sizeof(malloc_large_stats_t)); if (astats->lstats == NULL) return (true); @@ -567,31 +567,24 @@ ctl_arena_refresh(arena_t *arena, unsigned i) static bool ctl_grow(void) { - tsd_t *tsd; ctl_arena_stats_t *astats; - arena_t **tarenas; - tsd = tsd_fetch(); + /* Initialize new arena. */ + if (arena_init(ctl_stats.narenas) == NULL) + return (true); - /* Allocate extended arena stats and arenas arrays. */ - astats = (ctl_arena_stats_t *)imalloc(tsd, (ctl_stats.narenas + 2) * + /* Allocate extended arena stats. */ + astats = (ctl_arena_stats_t *)a0malloc((ctl_stats.narenas + 2) * sizeof(ctl_arena_stats_t)); if (astats == NULL) return (true); - tarenas = (arena_t **)imalloc(tsd, (ctl_stats.narenas + 1) * - sizeof(arena_t *)); - if (tarenas == NULL) { - idalloc(tsd, astats); - return (true); - } /* Initialize the new astats element. */ memcpy(astats, ctl_stats.arenas, (ctl_stats.narenas + 1) * sizeof(ctl_arena_stats_t)); memset(&astats[ctl_stats.narenas + 1], 0, sizeof(ctl_arena_stats_t)); if (ctl_arena_init(&astats[ctl_stats.narenas + 1])) { - idalloc(tsd, tarenas); - idalloc(tsd, astats); + a0free(astats); return (true); } /* Swap merged stats to their new location. */ @@ -604,32 +597,7 @@ ctl_grow(void) memcpy(&astats[ctl_stats.narenas + 1], &tstats, sizeof(ctl_arena_stats_t)); } - /* Initialize the new arenas element. 
*/ - tarenas[ctl_stats.narenas] = NULL; - { - arena_t **arenas_old = arenas; - /* - * Swap extended arenas array into place. Although ctl_mtx - * protects this function from other threads extending the - * array, it does not protect from other threads mutating it - * (i.e. initializing arenas and setting array elements to - * point to them). Therefore, array copying must happen under - * the protection of arenas_lock. - */ - malloc_mutex_lock(&arenas_lock); - arenas = tarenas; - memcpy(arenas, arenas_old, ctl_stats.narenas * - sizeof(arena_t *)); - narenas_total++; - arenas_extend(narenas_total - 1); - malloc_mutex_unlock(&arenas_lock); - /* - * Deallocate arenas_old only if it came from imalloc() (not - * base_alloc()). - */ - if (ctl_stats.narenas != narenas_auto) - idalloc(tsd, arenas_old); - } + a0free(ctl_stats.arenas); ctl_stats.arenas = astats; ctl_stats.narenas++; @@ -639,6 +607,7 @@ ctl_grow(void) static void ctl_refresh(void) { + tsd_t *tsd; unsigned i; VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas); @@ -657,15 +626,17 @@ ctl_refresh(void) ctl_stats.arenas[ctl_stats.narenas].nthreads = 0; ctl_arena_clear(&ctl_stats.arenas[ctl_stats.narenas]); - malloc_mutex_lock(&arenas_lock); - memcpy(tarenas, arenas, sizeof(arena_t *) * ctl_stats.narenas); + tsd = tsd_fetch(); + for (i = 0; i < ctl_stats.narenas; i++) + tarenas[i] = arena_get(tsd, i, false, (i == 0)); + for (i = 0; i < ctl_stats.narenas; i++) { - if (arenas[i] != NULL) - ctl_stats.arenas[i].nthreads = arenas[i]->nthreads; + if (tarenas[i] != NULL) + ctl_stats.arenas[i].nthreads = arena_nbound(i); else ctl_stats.arenas[i].nthreads = 0; } - malloc_mutex_unlock(&arenas_lock); + for (i = 0; i < ctl_stats.narenas; i++) { bool initialized = (tarenas[i] != NULL); @@ -698,9 +669,8 @@ ctl_init(void) * Allocate space for one extra arena stats element, which * contains summed stats across all arenas. */ - assert(narenas_auto == narenas_total_get()); - ctl_stats.narenas = narenas_auto; - ctl_stats.arenas = (ctl_arena_stats_t *)base_alloc( + ctl_stats.narenas = narenas_total_get(); + ctl_stats.arenas = (ctl_arena_stats_t *)a0malloc( (ctl_stats.narenas + 1) * sizeof(ctl_arena_stats_t)); if (ctl_stats.arenas == NULL) { ret = true; @@ -718,6 +688,13 @@ ctl_init(void) unsigned i; for (i = 0; i <= ctl_stats.narenas; i++) { if (ctl_arena_init(&ctl_stats.arenas[i])) { + unsigned j; + for (j = 0; j < i; j++) { + a0free( + ctl_stats.arenas[j].lstats); + } + a0free(ctl_stats.arenas); + ctl_stats.arenas = NULL; ret = true; goto label_return; } @@ -1231,17 +1208,19 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, { int ret; tsd_t *tsd; + arena_t *arena; unsigned newind, oldind; tsd = tsd_fetch(); + arena = arena_choose(tsd, NULL); + if (arena == NULL) + return (EAGAIN); malloc_mutex_lock(&ctl_mtx); - newind = oldind = choose_arena(tsd, NULL)->ind; + newind = oldind = arena->ind; WRITE(newind, unsigned); READ(oldind, unsigned); if (newind != oldind) { - arena_t *arena; - if (newind >= ctl_stats.narenas) { /* New arena index is out of range. */ ret = EFAULT; @@ -1249,28 +1228,18 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, } /* Initialize arena if necessary. 
*/ - malloc_mutex_lock(&arenas_lock); - if ((arena = arenas[newind]) == NULL && (arena = - arenas_extend(newind)) == NULL) { - malloc_mutex_unlock(&arenas_lock); + arena = arena_get(tsd, newind, true, true); + if (arena == NULL) { ret = EAGAIN; goto label_return; } - assert(arena == arenas[newind]); - arenas[oldind]->nthreads--; - arenas[newind]->nthreads++; - malloc_mutex_unlock(&arenas_lock); - - /* Set new arena association. */ + /* Set new arena/tcache associations. */ + arena_migrate(tsd, oldind, newind); if (config_tcache) { tcache_t *tcache = tsd_tcache_get(tsd); - if (tcache != NULL) { - tcache_arena_dissociate(tcache); - tcache_arena_associate(tcache, arena); - } + if (tcache != NULL) + tcache_arena_reassociate(tcache, arena); } - - tsd_arena_set(tsd, arena); } ret = 0; @@ -1400,11 +1369,13 @@ label_return: static void arena_purge(unsigned arena_ind) { + tsd_t *tsd; + unsigned i; VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas); - malloc_mutex_lock(&arenas_lock); - memcpy(tarenas, arenas, sizeof(arena_t *) * ctl_stats.narenas); - malloc_mutex_unlock(&arenas_lock); + tsd = tsd_fetch(); + for (i = 0; i < ctl_stats.narenas; i++) + tarenas[i] = arena_get(tsd, i, false, (i == 0)); if (arena_ind == ctl_stats.narenas) { unsigned i; @@ -1467,7 +1438,7 @@ arena_i_dss_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, } if (arena_ind < ctl_stats.narenas) { - arena_t *arena = arenas[arena_ind]; + arena_t *arena = arena_get(tsd_fetch(), arena_ind, false, true); if (arena == NULL || (dss_prec != dss_prec_limit && arena_dss_prec_set(arena, dss_prec))) { ret = EFAULT; @@ -1501,7 +1472,8 @@ arena_i_chunk_alloc_ctl(const size_t *mib, size_t miblen, void *oldp, arena_t *arena; malloc_mutex_lock(&ctl_mtx); - if (arena_ind < narenas_total && (arena = arenas[arena_ind]) != NULL) { + if (arena_ind < narenas_total_get() && (arena = arena_get(tsd_fetch(), + arena_ind, false, true)) != NULL) { malloc_mutex_lock(&arena->lock); READ(arena->chunk_alloc, chunk_alloc_t *); WRITE(arena->chunk_alloc, chunk_alloc_t *); @@ -1527,7 +1499,8 @@ arena_i_chunk_dalloc_ctl(const size_t *mib, size_t miblen, void *oldp, arena_t *arena; malloc_mutex_lock(&ctl_mtx); - if (arena_ind < narenas_total && (arena = arenas[arena_ind]) != NULL) { + if (arena_ind < narenas_total_get() && (arena = arena_get(tsd_fetch(), + arena_ind, false, true)) != NULL) { malloc_mutex_lock(&arena->lock); READ(arena->chunk_dalloc, chunk_dalloc_t *); WRITE(arena->chunk_dalloc, chunk_dalloc_t *); diff --git a/src/huge.c b/src/huge.c index ae41625..1376729 100644 --- a/src/huge.c +++ b/src/huge.c @@ -50,7 +50,11 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, * it is possible to make correct junk/zero fill decisions below. */ is_zeroed = zero; - arena = choose_arena(tsd, arena); + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) { + base_node_dalloc(node); + return (NULL); + } ret = arena_chunk_alloc_huge(arena, NULL, csize, alignment, &is_zeroed); if (ret == NULL) { base_node_dalloc(node); diff --git a/src/jemalloc.c b/src/jemalloc.c index f3750b4..3c889e8 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -4,8 +4,6 @@ /******************************************************************************/ /* Data. */ -malloc_tsd_data(, arenas, arena_t *, NULL) - /* Runtime configuration options. 
*/ const char *je_malloc_conf JEMALLOC_ATTR(weak); bool opt_abort = @@ -34,10 +32,20 @@ bool in_valgrind; unsigned ncpus; -malloc_mutex_t arenas_lock; -arena_t **arenas; -unsigned narenas_total; -unsigned narenas_auto; +/* Protects arenas initialization (arenas, narenas_total). */ +static malloc_mutex_t arenas_lock; +/* + * Arenas that are used to service external requests. Not all elements of the + * arenas array are necessarily used; arenas are created lazily as needed. + * + * arenas[0..narenas_auto) are used for automatic multiplexing of threads and + * arenas. arenas[narenas_auto..narenas_total) are only used if the application + * takes some action to create them and allocate from them. + */ +static arena_t **arenas; +static unsigned narenas_total; +static arena_t *a0; /* arenas[0]; read-only after initialization. */ +static unsigned narenas_auto; /* Read-only after initialization. */ /* Set to true once the allocator has been initialized. */ static bool malloc_initialized = false; @@ -144,35 +152,288 @@ static bool malloc_init_hard(void); * Begin miscellaneous support functions. */ +JEMALLOC_ALWAYS_INLINE_C void +malloc_thread_init(void) +{ + + /* + * TSD initialization can't be safely done as a side effect of + * deallocation, because it is possible for a thread to do nothing but + * deallocate its TLS data via free(), in which case writing to TLS + * would cause write-after-free memory corruption. The quarantine + * facility *only* gets used as a side effect of deallocation, so make + * a best effort attempt at initializing its TSD by hooking all + * allocation events. + */ + if (config_fill && unlikely(opt_quarantine)) + quarantine_alloc_hook(); +} + +JEMALLOC_ALWAYS_INLINE_C bool +malloc_init(void) +{ + + if (unlikely(!malloc_initialized) && malloc_init_hard()) + return (true); + malloc_thread_init(); + + return (false); +} + +/* + * The a0*() functions are used instead of i[mcd]alloc() in bootstrap-sensitive + * situations that cannot tolerate TLS variable access. These functions are + * also exposed for use in static binaries on FreeBSD, hence the old-style + * malloc() API. + */ + +arena_t * +a0get(void) +{ + + assert(a0 != NULL); + return (a0); +} + +static void * +a0alloc(size_t size, bool zero) +{ + void *ret; + + if (unlikely(malloc_init())) + return (NULL); + + if (size == 0) + size = 1; + + if (size <= arena_maxclass) + ret = arena_malloc(NULL, a0get(), size, zero, false); + else + ret = huge_malloc(NULL, a0get(), size, zero); + + return (ret); +} + +void * +a0malloc(size_t size) +{ + + return (a0alloc(size, false)); +} + +void * +a0calloc(size_t num, size_t size) +{ + + return (a0alloc(num * size, true)); +} + +void +a0free(void *ptr) +{ + arena_chunk_t *chunk; + + if (ptr == NULL) + return; + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) + arena_dalloc(NULL, chunk, ptr, false); + else + huge_dalloc(ptr); +} + /* Create a new arena and insert it into the arenas array at index ind. */ arena_t * -arenas_extend(unsigned ind) +arena_init(unsigned ind) { - arena_t *ret; + arena_t *arena; + + malloc_mutex_lock(&arenas_lock); - ret = (arena_t *)base_alloc(sizeof(arena_t)); - if (ret != NULL && !arena_new(ret, ind)) { - arenas[ind] = ret; - return (ret); + /* Expand arenas if necessary. 
*/ + assert(ind <= narenas_total); + if (ind == narenas_total) { + unsigned narenas_new = narenas_total + 1; + arena_t **arenas_new = + (arena_t **)a0malloc(CACHELINE_CEILING(narenas_new * + sizeof(arena_t *))); + if (arenas_new == NULL) { + arena = NULL; + goto label_return; + } + memcpy(arenas_new, arenas, narenas_total * sizeof(arena_t *)); + arenas_new[ind] = NULL; + /* + * Deallocate only if arenas came from a0malloc() (not + * base_alloc()). + */ + if (narenas_total != narenas_auto) + a0free(arenas); + arenas = arenas_new; + narenas_total = narenas_new; } - /* Only reached if there is an OOM error. */ /* - * OOM here is quite inconvenient to propagate, since dealing with it - * would require a check for failure in the fast path. Instead, punt - * by using arenas[0]. In practice, this is an extremely unlikely - * failure. + * Another thread may have already initialized arenas[ind] if it's an + * auto arena. */ - malloc_write(": Error initializing arena\n"); - if (opt_abort) - abort(); + arena = arenas[ind]; + if (arena != NULL) { + assert(ind < narenas_auto); + goto label_return; + } + + /* Actually initialize the arena. */ + arena = arenas[ind] = arena_new(ind); +label_return: + malloc_mutex_unlock(&arenas_lock); + return (arena); +} + +unsigned +narenas_total_get(void) +{ + unsigned narenas; + + malloc_mutex_lock(&arenas_lock); + narenas = narenas_total; + malloc_mutex_unlock(&arenas_lock); + + return (narenas); +} + +static void +arena_bind_locked(tsd_t *tsd, unsigned ind) +{ + arena_t *arena; + + arena = arenas[ind]; + arena->nthreads++; + + if (tsd_nominal(tsd)) + tsd_arena_set(tsd, arena); +} + +static void +arena_bind(tsd_t *tsd, unsigned ind) +{ + + malloc_mutex_lock(&arenas_lock); + arena_bind_locked(tsd, ind); + malloc_mutex_unlock(&arenas_lock); +} + +void +arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind) +{ + arena_t *oldarena, *newarena; + + malloc_mutex_lock(&arenas_lock); + oldarena = arenas[oldind]; + newarena = arenas[newind]; + oldarena->nthreads--; + newarena->nthreads++; + malloc_mutex_unlock(&arenas_lock); + tsd_arena_set(tsd, newarena); +} + +unsigned +arena_nbound(unsigned ind) +{ + unsigned nthreads; + + malloc_mutex_lock(&arenas_lock); + nthreads = arenas[ind]->nthreads; + malloc_mutex_unlock(&arenas_lock); + return (nthreads); +} + +static void +arena_unbind(tsd_t *tsd, unsigned ind) +{ + arena_t *arena; + + malloc_mutex_lock(&arenas_lock); + arena = arenas[ind]; + arena->nthreads--; + malloc_mutex_unlock(&arenas_lock); + tsd_arena_set(tsd, NULL); +} + +arena_t * +arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing) +{ + arena_t *arena; + arena_t **arenas_cache = tsd_arenas_cache_get(tsd); + unsigned narenas_cache = tsd_narenas_cache_get(tsd); + unsigned narenas_actual = narenas_total_get(); + + /* Deallocate old cache if it's too small. */ + if (arenas_cache != NULL && narenas_cache < narenas_actual) { + a0free(arenas_cache); + arenas_cache = NULL; + narenas_cache = 0; + tsd_arenas_cache_set(tsd, arenas_cache); + tsd_narenas_cache_set(tsd, narenas_cache); + } + + /* Allocate cache if it's missing. */ + if (arenas_cache == NULL) { + bool *arenas_cache_bypassp = tsd_arenas_cache_bypassp_get(tsd); + assert(ind < narenas_actual || !init_if_missing); + narenas_cache = (ind < narenas_actual) ? 
narenas_actual : ind+1; + + if (!*arenas_cache_bypassp) { + *arenas_cache_bypassp = true; + arenas_cache = (arena_t **)a0malloc(sizeof(arena_t *) * + narenas_cache); + *arenas_cache_bypassp = false; + } else + arenas_cache = NULL; + if (arenas_cache == NULL) { + /* + * This function must always tell the truth, even if + * it's slow, so don't let OOM or recursive allocation + * avoidance (note arenas_cache_bypass check) get in the + * way. + */ + if (ind >= narenas_actual) + return (NULL); + malloc_mutex_lock(&arenas_lock); + arena = arenas[ind]; + malloc_mutex_unlock(&arenas_lock); + return (arena); + } + tsd_arenas_cache_set(tsd, arenas_cache); + tsd_narenas_cache_set(tsd, narenas_cache); + } - return (arenas[0]); + /* + * Copy to cache. It's possible that the actual number of arenas has + * increased since narenas_total_get() was called above, but that causes + * no correctness issues unless two threads concurrently execute the + * arenas.extend mallctl, which we trust mallctl synchronization to + * prevent. + */ + malloc_mutex_lock(&arenas_lock); + memcpy(arenas_cache, arenas, sizeof(arena_t *) * narenas_actual); + malloc_mutex_unlock(&arenas_lock); + if (narenas_cache > narenas_actual) { + memset(&arenas_cache[narenas_actual], 0, sizeof(arena_t *) * + (narenas_cache - narenas_actual)); + } + + /* Read the refreshed cache, and init the arena if necessary. */ + arena = arenas_cache[ind]; + if (init_if_missing && arena == NULL) + arena = arenas_cache[ind] = arena_init(ind); + return (arena); } -/* Slow path, called only by choose_arena(). */ +/* Slow path, called only by arena_choose(). */ arena_t * -choose_arena_hard(tsd_t *tsd) +arena_choose_hard(tsd_t *tsd) { arena_t *ret; @@ -182,7 +443,7 @@ choose_arena_hard(tsd_t *tsd) choose = 0; first_null = narenas_auto; malloc_mutex_lock(&arenas_lock); - assert(arenas[0] != NULL); + assert(a0get() != NULL); for (i = 1; i < narenas_auto; i++) { if (arenas[i] != NULL) { /* @@ -215,20 +476,20 @@ choose_arena_hard(tsd_t *tsd) ret = arenas[choose]; } else { /* Initialize a new arena. */ - ret = arenas_extend(first_null); + choose = first_null; + ret = arena_init(choose); + if (ret == NULL) { + malloc_mutex_unlock(&arenas_lock); + return (NULL); + } } - ret->nthreads++; + arena_bind_locked(tsd, choose); malloc_mutex_unlock(&arenas_lock); } else { - ret = arenas[0]; - malloc_mutex_lock(&arenas_lock); - ret->nthreads++; - malloc_mutex_unlock(&arenas_lock); + ret = a0get(); + arena_bind(tsd, 0); } - if (tsd_nominal(tsd)) - tsd_arena_set(tsd, ret); - return (ret); } @@ -249,6 +510,33 @@ thread_deallocated_cleanup(tsd_t *tsd) void arena_cleanup(tsd_t *tsd) { + arena_t *arena; + + arena = tsd_arena_get(tsd); + if (arena != NULL) + arena_unbind(tsd, arena->ind); +} + +void +arenas_cache_cleanup(tsd_t *tsd) +{ + arena_t **arenas_cache; + + arenas_cache = tsd_arenas_cache_get(tsd); + if (arenas != NULL) + a0free(arenas_cache); +} + +void +narenas_cache_cleanup(tsd_t *tsd) +{ + + /* Do nothing. */ +} + +void +arenas_cache_bypass_cleanup(tsd_t *tsd) +{ /* Do nothing. */ } @@ -312,44 +600,6 @@ malloc_ncpus(void) return ((result == -1) ? 
1 : (unsigned)result); } -void -arenas_cleanup(void *arg) -{ - arena_t *arena = *(arena_t **)arg; - - malloc_mutex_lock(&arenas_lock); - arena->nthreads--; - malloc_mutex_unlock(&arenas_lock); -} - -JEMALLOC_ALWAYS_INLINE_C void -malloc_thread_init(void) -{ - - /* - * TSD initialization can't be safely done as a side effect of - * deallocation, because it is possible for a thread to do nothing but - * deallocate its TLS data via free(), in which case writing to TLS - * would cause write-after-free memory corruption. The quarantine - * facility *only* gets used as a side effect of deallocation, so make - * a best effort attempt at initializing its TSD by hooking all - * allocation events. - */ - if (config_fill && unlikely(opt_quarantine)) - quarantine_alloc_hook(); -} - -JEMALLOC_ALWAYS_INLINE_C bool -malloc_init(void) -{ - - if (unlikely(!malloc_initialized) && malloc_init_hard()) - return (true); - malloc_thread_init(); - - return (false); -} - static bool malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, char const **v_p, size_t *vlen_p) @@ -745,7 +995,7 @@ malloc_init_hard(void) #endif malloc_initializer = INITIALIZER; - if (malloc_tsd_boot()) { + if (malloc_tsd_boot0()) { malloc_mutex_unlock(&init_lock); return (true); } @@ -809,10 +1059,10 @@ malloc_init_hard(void) /* * Initialize one arena here. The rest are lazily created in - * choose_arena_hard(). + * arena_choose_hard(). */ - arenas_extend(0); - if (arenas[0] == NULL) { + a0 = arena_init(0); + if (a0 == NULL) { malloc_mutex_unlock(&init_lock); return (true); } @@ -887,6 +1137,7 @@ malloc_init_hard(void) malloc_initialized = true; malloc_mutex_unlock(&init_lock); + malloc_tsd_boot1(); return (false); } @@ -1428,8 +1679,8 @@ JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = * Begin non-standard functions. 
*/ -JEMALLOC_ALWAYS_INLINE_C void -imallocx_flags_decode_hard(size_t size, int flags, size_t *usize, +JEMALLOC_ALWAYS_INLINE_C bool +imallocx_flags_decode_hard(tsd_t *tsd, size_t size, int flags, size_t *usize, size_t *alignment, bool *zero, bool *try_tcache, arena_t **arena) { @@ -1444,16 +1695,19 @@ imallocx_flags_decode_hard(size_t size, int flags, size_t *usize, if ((flags & MALLOCX_ARENA_MASK) != 0) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); *try_tcache = false; - *arena = arenas[arena_ind]; + *arena = arena_get(tsd, arena_ind, true, true); + if (unlikely(*arena == NULL)) + return (true); } else { *try_tcache = true; *arena = NULL; } + return (false); } -JEMALLOC_ALWAYS_INLINE_C void -imallocx_flags_decode(size_t size, int flags, size_t *usize, size_t *alignment, - bool *zero, bool *try_tcache, arena_t **arena) +JEMALLOC_ALWAYS_INLINE_C bool +imallocx_flags_decode(tsd_t *tsd, size_t size, int flags, size_t *usize, + size_t *alignment, bool *zero, bool *try_tcache, arena_t **arena) { if (likely(flags == 0)) { @@ -1463,9 +1717,10 @@ imallocx_flags_decode(size_t size, int flags, size_t *usize, size_t *alignment, *zero = false; *try_tcache = true; *arena = NULL; + return (false); } else { - imallocx_flags_decode_hard(size, flags, usize, alignment, zero, - try_tcache, arena); + return (imallocx_flags_decode_hard(tsd, size, flags, usize, + alignment, zero, try_tcache, arena)); } } @@ -1524,8 +1779,9 @@ imallocx_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) arena_t *arena; prof_tctx_t *tctx; - imallocx_flags_decode(size, flags, usize, &alignment, &zero, - &try_tcache, &arena); + if (unlikely(imallocx_flags_decode(tsd, size, flags, usize, &alignment, + &zero, &try_tcache, &arena))) + return (NULL); tctx = prof_alloc_prep(tsd, *usize, true); if (likely((uintptr_t)tctx == (uintptr_t)1U)) { p = imallocx_maybe_flags(tsd, size, flags, *usize, alignment, @@ -1558,8 +1814,9 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) return (imalloc(tsd, size)); } - imallocx_flags_decode_hard(size, flags, usize, &alignment, &zero, - &try_tcache, &arena); + if (unlikely(imallocx_flags_decode_hard(tsd, size, flags, usize, + &alignment, &zero, &try_tcache, &arena))) + return (NULL); return (imallocx_flags(tsd, *usize, alignment, zero, try_tcache, arena)); } @@ -1685,9 +1942,10 @@ je_rallocx(void *ptr, size_t size, int flags) arena_chunk_t *chunk; try_tcache_alloc = false; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - try_tcache_dalloc = (chunk == ptr || chunk->arena != - arenas[arena_ind]); - arena = arenas[arena_ind]; + arena = arena_get(tsd, arena_ind, true, true); + if (unlikely(arena == NULL)) + goto label_oom; + try_tcache_dalloc = (chunk == ptr || chunk->arena != arena); } else { try_tcache_alloc = true; try_tcache_dalloc = true; @@ -1825,6 +2083,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); + // XX Dangerous arenas read. 
arena = arenas[arena_ind]; } else arena = NULL; @@ -1875,16 +2134,24 @@ je_sallocx(const void *ptr, int flags) void je_dallocx(void *ptr, int flags) { + tsd_t *tsd; bool try_tcache; assert(ptr != NULL); assert(malloc_initialized || IS_INITIALIZER); + tsd = tsd_fetch(); if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - try_tcache = (chunk == ptr || chunk->arena != - arenas[arena_ind]); + arena_t *arena = arena_get(tsd, arena_ind, true, true); + /* + * If arena is NULL, the application passed an arena that has + * never been used before, which is unsupported during + * deallocation. + */ + assert(arena != NULL); + try_tcache = (chunk == ptr || chunk->arena != arena); } else try_tcache = true; @@ -1908,6 +2175,7 @@ inallocx(size_t size, int flags) void je_sdallocx(void *ptr, size_t size, int flags) { + tsd_t *tsd; bool try_tcache; size_t usize; @@ -1916,16 +2184,22 @@ je_sdallocx(void *ptr, size_t size, int flags) usize = inallocx(size, flags); assert(usize == isalloc(ptr, config_prof)); + tsd = tsd_fetch(); if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - try_tcache = (chunk == ptr || chunk->arena != - arenas[arena_ind]); + arena_t *arena = arena_get(tsd, arena_ind, true, true); + /* + * If arena is NULL, the application passed an arena that has + * never been used before, which is unsupported during + * deallocation. + */ + try_tcache = (chunk == ptr || chunk->arena != arena); } else try_tcache = true; UTRACE(ptr, 0, 0); - isfree(tsd_fetch(), ptr, usize, try_tcache); + isfree(tsd, ptr, usize, try_tcache); } size_t @@ -2105,55 +2379,3 @@ jemalloc_postfork_child(void) } /******************************************************************************/ -/* - * The following functions are used for TLS allocation/deallocation in static - * binaries on FreeBSD. The primary difference between these and i[mcd]alloc() - * is that these avoid accessing TLS variables. - */ - -static void * -a0alloc(size_t size, bool zero) -{ - - if (unlikely(malloc_init())) - return (NULL); - - if (size == 0) - size = 1; - - if (size <= arena_maxclass) - return (arena_malloc(NULL, arenas[0], size, zero, false)); - else - return (huge_malloc(NULL, arenas[0], size, zero)); -} - -void * -a0malloc(size_t size) -{ - - return (a0alloc(size, false)); -} - -void * -a0calloc(size_t num, size_t size) -{ - - return (a0alloc(num * size, true)); -} - -void -a0free(void *ptr) -{ - arena_chunk_t *chunk; - - if (ptr == NULL) - return; - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) - arena_dalloc(NULL, chunk, ptr, false); - else - huge_dalloc(ptr); -} - -/******************************************************************************/ diff --git a/src/tcache.c b/src/tcache.c index 2c968c6..1bf7026 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -246,6 +246,14 @@ tcache_arena_associate(tcache_t *tcache, arena_t *arena) } void +tcache_arena_reassociate(tcache_t *tcache, arena_t *arena) +{ + + tcache_arena_dissociate(tcache); + tcache_arena_associate(tcache, arena); +} + +void tcache_arena_dissociate(tcache_t *tcache) { @@ -261,13 +269,17 @@ tcache_arena_dissociate(tcache_t *tcache) tcache_t * tcache_get_hard(tsd_t *tsd) { + arena_t *arena; if (!tcache_enabled_get()) { if (tsd_nominal(tsd)) tcache_enabled_set(false); /* Memoize. 
*/ return (NULL); } - return (tcache_create(choose_arena(tsd, NULL))); + arena = arena_choose(tsd, NULL); + if (unlikely(arena == NULL)) + return (NULL); + return (tcache_create(arena)); } tcache_t * diff --git a/src/tsd.c b/src/tsd.c index cbc64e4..59253fe 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -15,16 +15,14 @@ void * malloc_tsd_malloc(size_t size) { - /* Avoid choose_arena() in order to dodge bootstrapping issues. */ - return (arena_malloc(NULL, arenas[0], CACHELINE_CEILING(size), false, - false)); + return (a0malloc(CACHELINE_CEILING(size))); } void malloc_tsd_dalloc(void *wrapper) { - idalloct(NULL, wrapper, false); + a0free(wrapper); } void @@ -106,15 +104,24 @@ MALLOC_TSD } bool -malloc_tsd_boot(void) +malloc_tsd_boot0(void) { ncleanups = 0; - if (tsd_boot()) + if (tsd_boot0()) return (true); + *tsd_arenas_cache_bypassp_get(tsd_fetch()) = true; return (false); } +void +malloc_tsd_boot1(void) +{ + + tsd_boot1(); + *tsd_arenas_cache_bypassp_get(tsd_fetch()) = false; +} + #ifdef _WIN32 static BOOL WINAPI _tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) diff --git a/test/unit/tsd.c b/test/unit/tsd.c index eb1c597..b031c48 100644 --- a/test/unit/tsd.c +++ b/test/unit/tsd.c @@ -6,6 +6,7 @@ typedef unsigned int data_t; static bool data_cleanup_executed; +malloc_tsd_types(data_, data_t) malloc_tsd_protos(, data_, data_t) void -- cgit v0.12 From f22214a29ddd3bed005cbcc8f2aff7c61ef4940b Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 6 Oct 2014 03:42:10 -0400 Subject: Use regular arena allocation for huge tree nodes. This avoids grabbing the base mutex, as a step towards fine-grained locking for huge allocations. The thread cache also provides a tiny (~3%) improvement for serial huge allocations. --- include/jemalloc/internal/huge.h | 2 +- include/jemalloc/internal/jemalloc_internal.h.in | 4 ++-- src/huge.c | 9 ++++---- src/jemalloc.c | 2 +- test/unit/junk.c | 27 ++++++++++++++++++------ 5 files changed, 29 insertions(+), 15 deletions(-) diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 939993f..5d4d3a1 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -21,7 +21,7 @@ void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif -void huge_dalloc(void *ptr); +void huge_dalloc(tsd_t *tsd, void *ptr); size_t huge_salloc(const void *ptr); prof_tctx_t *huge_prof_tctx_get(const void *ptr); void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index c7a5fd8..f4d5de6 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -938,7 +938,7 @@ idalloct(tsd_t *tsd, void *ptr, bool try_tcache) if (chunk != ptr) arena_dalloc(tsd, chunk, ptr, try_tcache); else - huge_dalloc(ptr); + huge_dalloc(tsd, ptr); } JEMALLOC_ALWAYS_INLINE void @@ -952,7 +952,7 @@ isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) if (chunk != ptr) arena_sdalloc(tsd, chunk, ptr, size, try_tcache); else - huge_dalloc(ptr); + huge_dalloc(tsd, ptr); } JEMALLOC_ALWAYS_INLINE void diff --git a/src/huge.c b/src/huge.c index 1376729..541df60 100644 --- a/src/huge.c +++ b/src/huge.c @@ -41,7 +41,8 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, assert(csize >= usize); /* Allocate an extent node with which to 
track the chunk. */ - node = base_node_alloc(); + node = ipalloct(tsd, CACHELINE_CEILING(sizeof(extent_node_t)), + CACHELINE, false, tsd != NULL, NULL); if (node == NULL) return (NULL); @@ -57,7 +58,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, } ret = arena_chunk_alloc_huge(arena, NULL, csize, alignment, &is_zeroed); if (ret == NULL) { - base_node_dalloc(node); + idalloct(tsd, node, tsd != NULL); return (NULL); } @@ -311,7 +312,7 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, } void -huge_dalloc(void *ptr) +huge_dalloc(tsd_t *tsd, void *ptr) { extent_node_t *node, key; @@ -329,7 +330,7 @@ huge_dalloc(void *ptr) huge_dalloc_junk(node->addr, node->size); arena_chunk_dalloc_huge(node->arena, node->addr, CHUNK_CEILING(node->size)); - base_node_dalloc(node); + idalloct(tsd, node, tsd != NULL); } size_t diff --git a/src/jemalloc.c b/src/jemalloc.c index 3c889e8..38b5aaf 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -240,7 +240,7 @@ a0free(void *ptr) if (chunk != ptr) arena_dalloc(NULL, chunk, ptr, false); else - huge_dalloc(ptr); + huge_dalloc(NULL, ptr); } /* Create a new arena and insert it into the arenas array at index ind. */ diff --git a/test/unit/junk.c b/test/unit/junk.c index 5b35a87..1522a61 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -8,7 +8,16 @@ const char *malloc_conf = static arena_dalloc_junk_small_t *arena_dalloc_junk_small_orig; static arena_dalloc_junk_large_t *arena_dalloc_junk_large_orig; static huge_dalloc_junk_t *huge_dalloc_junk_orig; -static void *most_recently_junked; +static void *watch_for_junking; +static bool saw_junking; + +static void +watch_junking(void *p) +{ + + watch_for_junking = p; + saw_junking = false; +} static void arena_dalloc_junk_small_intercept(void *ptr, arena_bin_info_t *bin_info) @@ -21,7 +30,8 @@ arena_dalloc_junk_small_intercept(void *ptr, arena_bin_info_t *bin_info) "Missing junk fill for byte %zu/%zu of deallocated region", i, bin_info->reg_size); } - most_recently_junked = ptr; + if (ptr == watch_for_junking) + saw_junking = true; } static void @@ -35,7 +45,8 @@ arena_dalloc_junk_large_intercept(void *ptr, size_t usize) "Missing junk fill for byte %zu/%zu of deallocated region", i, usize); } - most_recently_junked = ptr; + if (ptr == watch_for_junking) + saw_junking = true; } static void @@ -48,7 +59,8 @@ huge_dalloc_junk_intercept(void *ptr, size_t usize) * enough that it doesn't make sense to duplicate the decision logic in * test code, so don't actually check that the region is junk-filled. */ - most_recently_junked = ptr; + if (ptr == watch_for_junking) + saw_junking = true; } static void @@ -87,18 +99,19 @@ test_junk(size_t sz_min, size_t sz_max) } if (xallocx(s, sz+1, 0, 0) == sz) { - void *junked = (void *)s; + watch_junking(s); s = (char *)rallocx(s, sz+1, 0); assert_ptr_not_null((void *)s, "Unexpected rallocx() failure"); - assert_ptr_eq(most_recently_junked, junked, + assert_true(saw_junking, "Expected region of size %zu to be junk-filled", sz); } } + watch_junking(s); dallocx(s, 0); - assert_ptr_eq(most_recently_junked, (void *)s, + assert_true(saw_junking, "Expected region of size %zu to be junk-filled", sz); arena_dalloc_junk_small = arena_dalloc_junk_small_orig; -- cgit v0.12 From 3a8b9b1fd95b1bb9b3dc00f6798eeb40d5100b7b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 8 Oct 2014 00:54:16 -0700 Subject: Fix a recursive lock acquisition regression. 
Fix a recursive lock acquisition regression, which was introduced by 8bb3198f72fc7587dc93527f9f19fb5be52fa553 (Refactor/fix arenas manipulation.). --- src/jemalloc.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 38b5aaf..c62d8ce 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -244,13 +244,11 @@ a0free(void *ptr) } /* Create a new arena and insert it into the arenas array at index ind. */ -arena_t * -arena_init(unsigned ind) +static arena_t * +arena_init_locked(unsigned ind) { arena_t *arena; - malloc_mutex_lock(&arenas_lock); - /* Expand arenas if necessary. */ assert(ind <= narenas_total); if (ind == narenas_total) { @@ -258,10 +256,8 @@ arena_init(unsigned ind) arena_t **arenas_new = (arena_t **)a0malloc(CACHELINE_CEILING(narenas_new * sizeof(arena_t *))); - if (arenas_new == NULL) { - arena = NULL; - goto label_return; - } + if (arenas_new == NULL) + return (NULL); memcpy(arenas_new, arenas, narenas_total * sizeof(arena_t *)); arenas_new[ind] = NULL; /* @@ -281,12 +277,21 @@ arena_init(unsigned ind) arena = arenas[ind]; if (arena != NULL) { assert(ind < narenas_auto); - goto label_return; + return (arena); } /* Actually initialize the arena. */ arena = arenas[ind] = arena_new(ind); -label_return: + return (arena); +} + +arena_t * +arena_init(unsigned ind) +{ + arena_t *arena; + + malloc_mutex_lock(&arenas_lock); + arena = arena_init_locked(ind); malloc_mutex_unlock(&arenas_lock); return (arena); } @@ -477,7 +482,7 @@ arena_choose_hard(tsd_t *tsd) } else { /* Initialize a new arena. */ choose = first_null; - ret = arena_init(choose); + ret = arena_init_locked(choose); if (ret == NULL) { malloc_mutex_unlock(&arenas_lock); return (NULL); -- cgit v0.12 From 57efa7bb0e284805c940472190bc9924327635a1 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 8 Oct 2014 17:57:19 -0700 Subject: Avoid atexit(3) when possible, disable prof_final by default. atexit(3) can deadlock internally during its own initialization if jemalloc calls atexit() during jemalloc initialization. Mitigate the impact by restructuring prof initialization to avoid calling atexit() unless the registered function will actually dump a final heap profile. Additionally, disable prof_final by default so that this land mine is opt-in rather than opt-out. This resolves #144. --- doc/jemalloc.xml.in | 18 +++++++++++++++--- src/prof.c | 17 +++++++++-------- test/unit/prof_active.c | 2 +- test/unit/prof_thread_name.c | 3 +-- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 1f692f7..7da1498 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -857,8 +857,14 @@ for (i = 0; i < nbins; i++) { is specified during configuration, this has the potential to cause deadlock for a multi-threaded process that exits while one or more threads are executing in the memory allocation - functions. Therefore, this option should only be used with care; it is - primarily intended as a performance tuning aid during application + functions. Furthermore, atexit may + allocate memory during application initialization and then deadlock + internally when jemalloc in turn calls + atexit, so this option is not + univerally usable (though the application can register its own + atexit function with equivalent + functionality). Therefore, this option should only be used with care; + it is primarily intended as a performance tuning aid during application development. This option is disabled by default. 
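As an illustration of the workaround suggested above (a minimal sketch, not part of the diffs, assuming jemalloc's public non-standard mallctl() interface and a build with heap profiling enabled — the "prof.dump" control simply fails otherwise), an application can register its own handler that performs the final dump instead of relying on the allocator's atexit() registration:

#include <stdlib.h>
#include <jemalloc/jemalloc.h>

/* Application-registered stand-in for opt.prof_final's final dump. */
static void
final_prof_dump(void)
{

	/* Dump a heap profile; fails harmlessly if profiling is inactive. */
	mallctl("prof.dump", NULL, NULL, NULL, 0);
}

int
main(void)
{

	if (atexit(final_prof_dump) != 0)
		abort();
	/* ... application work ... */
	return (0);
}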
@@ -1155,7 +1161,13 @@ malloc_conf = "xmalloc:true";]]> <prefix>.<pid>.<seq>.f.heap, where <prefix> is controlled by the opt.prof_prefix - option. This option is enabled by default. + option. Note that atexit may allocate + memory during application initialization and then deadlock internally + when jemalloc in turn calls atexit, so + this option is not univerally usable (though the application can + register its own atexit function with + equivalent functionality). This option is disabled by + default. diff --git a/src/prof.c b/src/prof.c index b3150a2..3e2e427 100644 --- a/src/prof.c +++ b/src/prof.c @@ -20,7 +20,7 @@ bool opt_prof_thread_active_init = true; size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; bool opt_prof_gdump = false; -bool opt_prof_final = true; +bool opt_prof_final = false; bool opt_prof_leak = false; bool opt_prof_accum = false; char opt_prof_prefix[ @@ -1487,17 +1487,17 @@ prof_fdump(void) char filename[DUMP_FILENAME_BUFSIZE]; cassert(config_prof); + assert(opt_prof_final); + assert(opt_prof_prefix[0] != '\0'); if (!prof_booted) return; tsd = tsd_fetch(); - if (opt_prof_final && opt_prof_prefix[0] != '\0') { - malloc_mutex_lock(&prof_dump_seq_mtx); - prof_dump_filename(filename, 'f', VSEQ_INVALID); - malloc_mutex_unlock(&prof_dump_seq_mtx); - prof_dump(tsd, false, filename, opt_prof_leak); - } + malloc_mutex_lock(&prof_dump_seq_mtx); + prof_dump_filename(filename, 'f', VSEQ_INVALID); + malloc_mutex_unlock(&prof_dump_seq_mtx); + prof_dump(tsd, false, filename, opt_prof_leak); } void @@ -2023,7 +2023,8 @@ prof_boot2(void) if (malloc_mutex_init(&prof_dump_mtx)) return (true); - if (atexit(prof_fdump) != 0) { + if (opt_prof_final && opt_prof_prefix[0] != '\0' && + atexit(prof_fdump) != 0) { malloc_write(": Error in atexit()\n"); if (opt_abort) abort(); diff --git a/test/unit/prof_active.c b/test/unit/prof_active.c index d4bab8d..8149095 100644 --- a/test/unit/prof_active.c +++ b/test/unit/prof_active.c @@ -2,7 +2,7 @@ #ifdef JEMALLOC_PROF const char *malloc_conf = - "prof:true,prof_thread_active_init:false,lg_prof_sample:0,prof_final:false"; + "prof:true,prof_thread_active_init:false,lg_prof_sample:0"; #endif static void diff --git a/test/unit/prof_thread_name.c b/test/unit/prof_thread_name.c index 6066dba..f501158 100644 --- a/test/unit/prof_thread_name.c +++ b/test/unit/prof_thread_name.c @@ -1,8 +1,7 @@ #include "test/jemalloc_test.h" #ifdef JEMALLOC_PROF -const char *malloc_conf = - "prof:true,prof_active:false,prof_final:false"; +const char *malloc_conf = "prof:true,prof_active:false"; #endif static void -- cgit v0.12 From b123ddc760e5b53dde17c6a19a130173067c0e30 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 8 Oct 2014 18:18:03 -0700 Subject: Don't configure HAVE_SSE2. Don't configure HAVE_SSE2 (on behalf of SFMT), because its dependencies are notoriously unportable in practice. This resolves #119. --- configure.ac | 10 ---------- test/include/test/jemalloc_test_defs.h.in | 5 ++++- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/configure.ac b/configure.ac index e4afe88..1d79ded 100644 --- a/configure.ac +++ b/configure.ac @@ -206,8 +206,6 @@ AC_CANONICAL_HOST dnl CPU-specific settings. 
CPU_SPINWAIT="" case "${host_cpu}" in - i[[345]]86) - ;; i686|x86_64) JE_COMPILABLE([pause instruction], [], [[__asm__ volatile("pause"); return 0;]], @@ -215,14 +213,6 @@ case "${host_cpu}" in if test "x${je_cv_pause}" = "xyes" ; then CPU_SPINWAIT='__asm__ volatile("pause")' fi - dnl emmintrin.h fails to compile unless MMX, SSE, and SSE2 are - dnl supported. - JE_COMPILABLE([SSE2 intrinsics], [ -#include -], [], [je_cv_sse2]) - if test "x${je_cv_sse2}" = "xyes" ; then - AC_DEFINE_UNQUOTED([HAVE_SSE2], [ ]) - fi ;; powerpc) AC_DEFINE_UNQUOTED([HAVE_ALTIVEC], [ ]) diff --git a/test/include/test/jemalloc_test_defs.h.in b/test/include/test/jemalloc_test_defs.h.in index aaaaec1..5cc8532 100644 --- a/test/include/test/jemalloc_test_defs.h.in +++ b/test/include/test/jemalloc_test_defs.h.in @@ -1,6 +1,9 @@ #include "jemalloc/internal/jemalloc_internal_defs.h" #include "jemalloc/internal/jemalloc_internal_decls.h" -/* For use by SFMT. */ +/* + * For use by SFMT. configure.ac doesn't actually define HAVE_SSE2 because its + * dependencies are notoriously unportable in practice. + */ #undef HAVE_SSE2 #undef HAVE_ALTIVEC -- cgit v0.12 From fc0b3b7383373d66cfed2cd4e2faa272a6868d32 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 9 Oct 2014 17:54:06 -0700 Subject: Add configure options. Add: --with-lg-page --with-lg-page-sizes --with-lg-size-class-group --with-lg-quantum Get rid of STATIC_PAGE_SHIFT, in favor of directly setting LG_PAGE. Fix various edge conditions exposed by the configure options. --- INSTALL | 67 ++++++++++++++++++++++ Makefile.in | 1 + configure.ac | 45 +++++++++++---- include/jemalloc/internal/arena.h | 8 ++- include/jemalloc/internal/huge.h | 9 +-- include/jemalloc/internal/jemalloc_internal.h.in | 28 ++++----- .../jemalloc/internal/jemalloc_internal_defs.h.in | 10 +++- include/jemalloc/internal/private_symbols.txt | 4 +- include/jemalloc/internal/size_classes.sh | 12 +++- include/jemalloc/internal/tcache.h | 4 +- src/arena.c | 42 ++++++++++---- src/huge.c | 52 +++++++++-------- src/jemalloc.c | 50 +++++++++------- src/tcache.c | 53 ++++------------- test/unit/lg_chunk.c | 26 +++++++++ test/unit/mallctl.c | 2 +- 16 files changed, 277 insertions(+), 136 deletions(-) create mode 100644 test/unit/lg_chunk.c diff --git a/INSTALL b/INSTALL index 9af2336..73bf718 100644 --- a/INSTALL +++ b/INSTALL @@ -189,6 +189,73 @@ any of the following arguments (not a definitive list) to 'configure': Specify where to find DocBook XSL stylesheets when building the documentation. +--with-lg-page= + Specify the base 2 log of the system page size. This option is only useful + when cross compiling, since the configure script automatically determines the + host's page size by default. + +--with-lg-page-sizes= + Specify the comma-separated base 2 logs of the page sizes to support. This + option may be useful when cross-compiling in combination with + --with-lg-page, but its primary use case is for integration with FreeBSD's + libc, wherein jemalloc is embedded. + +--with-lg-size-class-group= + Specify the base 2 log of how many size classes to use for each doubling in + size. By default jemalloc uses =2, which results in + e.g. the following size classes: + + [...], 64, + 80, 96, 112, 128, + 160, [...] + + =3 results in e.g. the following size classes: + + [...], 64, + 72, 80, 88, 96, 104, 112, 120, 128, + 144, [...] + + The minimal =0 causes jemalloc to only provide size + classes that are powers of 2: + + [...], + 64, + 128, + 256, + [...] 
+ + An implementation detail currently limits the total number of small size + classes to 255, and a compilation error will result if the + you specify cannot be supported. The limit is + roughly =4, depending on page size. + +--with-lg-quantum= + Specify the base 2 log of the minimum allocation alignment (only + =3 and =4 are supported). jemalloc needs to know + the minimum alignment that meets the following C standard requirement + (quoted from the April 12, 2011 draft of the C11 standard): + + The pointer returned if the allocation succeeds is suitably aligned so + that it may be assigned to a pointer to any type of object with a + fundamental alignment requirement and then used to access such an object + or an array of such objects in the space allocated [...] + + This setting is architecture-specific, and although jemalloc includes known + safe values for the most commonly used modern architectures, there is a + wrinkle related to GNU libc (glibc) that may impact your choice of + . On most modern architectures, this mandates 16-byte alignment + (=4), but the glibc developers chose not to meet this requirement + for performance reasons. An old discussion can be found at + https://sourceware.org/bugzilla/show_bug.cgi?id=206 . Unlike glibc, + jemalloc does follow the C standard by default (caveat: jemalloc technically + cheats by only providing 8-byte alignment for 8-byte allocation requests), + but the fact that Linux systems already work around this allocator + noncompliance means that it is generally safe in practice to let jemalloc's + minimum alignment follow glibc's lead. If you specify --with-lg-quantum=3 + during configuration, jemalloc will provide additional size classes that + are not 16-byte-aligned (24, 40, and 56, assuming + --with-lg-size-class-group=2). 
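As an aside on how these options shape request rounding (a sketch under the assumption that jemalloc's non-standard nallocx() entry point is available in the build being configured), the size class chosen for a given request can be observed directly:

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	/* Arbitrary request sizes to probe. */
	size_t sizes[] = {1, 9, 65, 129};
	unsigned i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		/*
		 * nallocx() reports the real (rounded-up) size the request
		 * would consume, i.e. its size class.  With the default
		 * --with-lg-size-class-group=2 and a 16-byte quantum, a
		 * 65-byte request is expected to report 80 (see the size
		 * class listing above).
		 */
		printf("%zu -> %zu\n", sizes[i], nallocx(sizes[i], 0));
	}
	return (0);
}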
+ The following environment variables (not a definitive list) impact configure's behavior: diff --git a/Makefile.in b/Makefile.in index 50f6596..40644ce 100644 --- a/Makefile.in +++ b/Makefile.in @@ -118,6 +118,7 @@ TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/ckh.c \ $(srcroot)test/unit/hash.c \ $(srcroot)test/unit/junk.c \ + $(srcroot)test/unit/lg_chunk.c \ $(srcroot)test/unit/mallctl.c \ $(srcroot)test/unit/math.c \ $(srcroot)test/unit/mq.c \ diff --git a/configure.ac b/configure.ac index 1d79ded..f8c09c4 100644 --- a/configure.ac +++ b/configure.ac @@ -969,8 +969,17 @@ else fi fi -AC_CACHE_CHECK([STATIC_PAGE_SHIFT], - [je_cv_static_page_shift], +AC_ARG_WITH([lg_quantum], + [AS_HELP_STRING([--with-lg-quantum=], + [Base 2 log of minimum allocation alignment])], + [AC_DEFINE_UNQUOTED([LG_QUANTUM], [$with_lg_quantum])]) + +AC_ARG_WITH([lg_page], + [AS_HELP_STRING([--with-lg-page=], [Base 2 log of system page size])], + [LG_PAGE="$with_lg_page"], [LG_PAGE="detect"]) +if test "x$LG_PAGE" == "xdetect"; then + AC_CACHE_CHECK([LG_PAGE], + [je_cv_lg_page], AC_RUN_IFELSE([AC_LANG_PROGRAM( [[ #include @@ -1006,16 +1015,30 @@ AC_CACHE_CHECK([STATIC_PAGE_SHIFT], return 0; ]])], - [je_cv_static_page_shift=`cat conftest.out`], - [je_cv_static_page_shift=undefined], - [je_cv_static_page_shift=12])) - -if test "x$je_cv_static_page_shift" != "xundefined"; then - AC_DEFINE_UNQUOTED([STATIC_PAGE_SHIFT], [$je_cv_static_page_shift]) + [je_cv_lg_page=`cat conftest.out`], + [je_cv_lg_page=undefined], + [je_cv_lg_page=12])) +fi +if test "x${je_cv_lg_page}" != "x" ; then + LG_PAGE="${je_cv_lg_page}" +fi +if test "x${LG_PAGE}" != "xundefined" ; then + AC_DEFINE_UNQUOTED([LG_PAGE], [$LG_PAGE]) else - AC_MSG_ERROR([cannot determine value for STATIC_PAGE_SHIFT]) + AC_MSG_ERROR([cannot determine value for LG_PAGE]) fi +AC_ARG_WITH([lg_page_sizes], + [AS_HELP_STRING([--with-lg-page-sizes=], + [Base 2 logs of system page sizes to support])], + [LG_PAGE_SIZES="$with_lg_page_sizes"], [LG_PAGE_SIZES="$LG_PAGE"]) + +AC_ARG_WITH([lg_size_class_group], + [AS_HELP_STRING([--with-lg-size-class-group=], + [Base 2 log of size classes per doubling])], + [LG_SIZE_CLASS_GROUP="$with_lg_size_class_group"], + [LG_SIZE_CLASS_GROUP="2"]) + dnl ============================================================================ dnl jemalloc configuration. 
dnl @@ -1456,10 +1479,12 @@ AC_CONFIG_COMMANDS([include/jemalloc/internal/public_unnamespace.h], [ ]) AC_CONFIG_COMMANDS([include/jemalloc/internal/size_classes.h], [ mkdir -p "${objroot}include/jemalloc/internal" - "${srcdir}/include/jemalloc/internal/size_classes.sh" > "${objroot}include/jemalloc/internal/size_classes.h" + "${srcdir}/include/jemalloc/internal/size_classes.sh" ${LG_PAGE_SIZES} ${LG_SIZE_CLASS_GROUP} > "${objroot}include/jemalloc/internal/size_classes.h" ], [ srcdir="${srcdir}" objroot="${objroot}" + LG_PAGE_SIZES=${LG_PAGE_SIZES} + LG_SIZE_CLASS_GROUP=${LG_SIZE_CLASS_GROUP} ]) AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_protos_jet.h], [ mkdir -p "${objroot}include/jemalloc" diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 894ce9a..f5b9fc6 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -362,8 +362,8 @@ void *arena_malloc_small(arena_t *arena, size_t size, bool zero); void *arena_malloc_large(arena_t *arena, size_t size, bool zero); void *arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero); void arena_prof_promoted(const void *ptr, size_t size); -void arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, - arena_chunk_map_bits_t *bitselm); +void arena_dalloc_bin_junked_locked(arena_t *arena, arena_chunk_t *chunk, + void *ptr, arena_chunk_map_bits_t *bitselm); void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t pageind, arena_chunk_map_bits_t *bitselm); void arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, @@ -371,8 +371,10 @@ void arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, #ifdef JEMALLOC_JET typedef void (arena_dalloc_junk_large_t)(void *, size_t); extern arena_dalloc_junk_large_t *arena_dalloc_junk_large; +#else +void arena_dalloc_junk_large(void *ptr, size_t usize); #endif -void arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, +void arena_dalloc_large_junked_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr); void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr); #ifdef JEMALLOC_JET diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 5d4d3a1..39d8aa5 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -9,19 +9,20 @@ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero); +void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, + bool try_tcache); void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, - bool zero); + bool zero, bool try_tcache); bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, - bool try_tcache_dalloc); + bool try_tcache_alloc, bool try_tcache_dalloc); #ifdef JEMALLOC_JET typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif -void huge_dalloc(tsd_t *tsd, void *ptr); +void huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache); size_t huge_salloc(const void *ptr); prof_tctx_t *huge_prof_tctx_get(const void *ptr); void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 
f4d5de6..3f65fad 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -185,7 +185,7 @@ typedef unsigned index_t; #define TINY_MIN (1U << LG_TINY_MIN) /* - * Minimum alignment of allocations is 2^LG_QUANTUM bytes (ignoring tiny size + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size * classes). */ #ifndef LG_QUANTUM @@ -235,7 +235,8 @@ typedef unsigned index_t; # define LG_QUANTUM 4 # endif # ifndef LG_QUANTUM -# error "No LG_QUANTUM definition for architecture; specify via CPPFLAGS" +# error "Unknown minimum alignment for architecture; specify via " + "--with-lg-quantum" # endif #endif @@ -275,12 +276,11 @@ typedef unsigned index_t; #define CACHELINE_CEILING(s) \ (((s) + CACHELINE_MASK) & ~CACHELINE_MASK) -/* Page size. STATIC_PAGE_SHIFT is determined by the configure script. */ +/* Page size. LG_PAGE is determined by the configure script. */ #ifdef PAGE_MASK # undef PAGE_MASK #endif -#define LG_PAGE STATIC_PAGE_SHIFT -#define PAGE ((size_t)(1U << STATIC_PAGE_SHIFT)) +#define PAGE ((size_t)(1U << LG_PAGE)) #define PAGE_MASK ((size_t)(PAGE - 1)) /* Return the smallest pagesize multiple that is >= s. */ @@ -809,7 +809,7 @@ imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) if (size <= arena_maxclass) return (arena_malloc(tsd, arena, size, false, try_tcache)); else - return (huge_malloc(tsd, arena, size, false)); + return (huge_malloc(tsd, arena, size, false, try_tcache)); } JEMALLOC_ALWAYS_INLINE void * @@ -826,7 +826,7 @@ icalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) if (size <= arena_maxclass) return (arena_malloc(tsd, arena, size, true, try_tcache)); else - return (huge_malloc(tsd, arena, size, true)); + return (huge_malloc(tsd, arena, size, true, try_tcache)); } JEMALLOC_ALWAYS_INLINE void * @@ -854,9 +854,11 @@ ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, return (NULL); ret = arena_palloc(arena, usize, alignment, zero); } else if (alignment <= chunksize) - ret = huge_malloc(tsd, arena, usize, zero); - else - ret = huge_palloc(tsd, arena, usize, alignment, zero); + ret = huge_malloc(tsd, arena, usize, zero, try_tcache); + else { + ret = huge_palloc(tsd, arena, usize, alignment, zero, + try_tcache); + } } assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); @@ -938,7 +940,7 @@ idalloct(tsd_t *tsd, void *ptr, bool try_tcache) if (chunk != ptr) arena_dalloc(tsd, chunk, ptr, try_tcache); else - huge_dalloc(tsd, ptr); + huge_dalloc(tsd, ptr, try_tcache); } JEMALLOC_ALWAYS_INLINE void @@ -952,7 +954,7 @@ isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) if (chunk != ptr) arena_sdalloc(tsd, chunk, ptr, size, try_tcache); else - huge_dalloc(tsd, ptr); + huge_dalloc(tsd, ptr, try_tcache); } JEMALLOC_ALWAYS_INLINE void @@ -1042,7 +1044,7 @@ iralloct(tsd_t *tsd, void *ptr, size_t size, size_t alignment, bool zero, alignment, zero, try_tcache_alloc, try_tcache_dalloc)); } else { return (huge_ralloc(tsd, arena, ptr, oldsize, size, 0, - alignment, zero, try_tcache_dalloc)); + alignment, zero, try_tcache_alloc, try_tcache_dalloc)); } } diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index fd85e5c..0ff939c 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -144,8 +144,14 @@ /* Support lazy locking (avoid locking unless a second thread is launched). 
*/ #undef JEMALLOC_LAZY_LOCK -/* One page is 2^STATIC_PAGE_SHIFT bytes. */ -#undef STATIC_PAGE_SHIFT +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +#undef LG_QUANTUM + +/* One page is 2^LG_PAGE bytes. */ +#undef LG_PAGE /* * If defined, use munmap() to unmap freed chunks, rather than storing them for diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index d5e6fdc..66d4822 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -16,11 +16,11 @@ arena_chunk_dalloc_huge arena_cleanup arena_dalloc arena_dalloc_bin -arena_dalloc_bin_locked +arena_dalloc_bin_junked_locked arena_dalloc_junk_large arena_dalloc_junk_small arena_dalloc_large -arena_dalloc_large_locked +arena_dalloc_large_junked_locked arena_dalloc_small arena_dss_prec_get arena_dss_prec_set diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 897570c..733338c 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -1,4 +1,6 @@ #!/bin/sh +# +# Usage: size_classes.sh # The following limits are chosen such that they cover all supported platforms. @@ -15,10 +17,10 @@ lg_tmin=3 lg_kmax=12 # Page sizes. -lg_parr="12 13 16" +lg_parr=`echo $1 | tr ',' ' '` # Size class group size (number of size classes for each size doubling). -lg_g=2 +lg_g=$2 pow2() { e=$1 @@ -159,7 +161,11 @@ size_classes() { nbins=$((${index} + 1)) # Final written value is correct: small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" - lg_large_minclass=$((${lg_grp} + 1)) + if [ ${lg_g} -gt 0 ] ; then + lg_large_minclass=$((${lg_grp} + 1)) + else + lg_large_minclass=$((${lg_grp} + 2)) + fi fi index=$((${index} + 1)) ndelta=$((${ndelta} + 1)) diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 02eec5d..fe9c47e 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -112,7 +112,7 @@ void tcache_arena_associate(tcache_t *tcache, arena_t *arena); void tcache_arena_reassociate(tcache_t *tcache, arena_t *arena); void tcache_arena_dissociate(tcache_t *tcache); tcache_t *tcache_get_hard(tsd_t *tsd); -tcache_t *tcache_create(arena_t *arena); +tcache_t *tcache_create(tsd_t *tsd, arena_t *arena); void tcache_cleanup(tsd_t *tsd); void tcache_enabled_cleanup(tsd_t *tsd); void tcache_stats_merge(tcache_t *tcache, arena_t *arena); @@ -363,7 +363,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) binind = size2index(size); if (config_fill && unlikely(opt_junk)) - memset(ptr, 0x5a, size); + arena_dalloc_junk_large(ptr, size); tbin = &tcache->tbins[binind]; tbin_info = &tcache_bin_info[binind]; diff --git a/src/arena.c b/src/arena.c index 86e5440..bbe58fa 100644 --- a/src/arena.c +++ b/src/arena.c @@ -623,7 +623,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) arena_chunk_t *chunk; arena_run_t *run; - assert(size <= arena_maxclass); + assert(size <= arena_maxrun); assert((size & PAGE_MASK) == 0); /* Search the arena's chunks for the lowest best fit. 
*/ @@ -673,7 +673,7 @@ arena_run_alloc_small(arena_t *arena, size_t size, index_t binind) arena_chunk_t *chunk; arena_run_t *run; - assert(size <= arena_maxclass); + assert(size <= arena_maxrun); assert((size & PAGE_MASK) == 0); assert(binind != BININD_INVALID); @@ -1728,9 +1728,9 @@ arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_runs_insert(bin, run); } -void -arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, - arena_chunk_map_bits_t *bitselm) +static void +arena_dalloc_bin_locked_impl(arena_t *arena, arena_chunk_t *chunk, void *ptr, + arena_chunk_map_bits_t *bitselm, bool junked) { size_t pageind, rpages_ind; arena_run_t *run; @@ -1749,7 +1749,7 @@ arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, if (config_fill || config_stats) size = bin_info->reg_size; - if (config_fill && unlikely(opt_junk)) + if (!junked && config_fill && unlikely(opt_junk)) arena_dalloc_junk_small(ptr, bin_info); arena_run_reg_dalloc(run, ptr); @@ -1766,6 +1766,14 @@ arena_dalloc_bin_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, } void +arena_dalloc_bin_junked_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, + arena_chunk_map_bits_t *bitselm) +{ + + arena_dalloc_bin_locked_impl(arena, chunk, ptr, bitselm, true); +} + +void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t pageind, arena_chunk_map_bits_t *bitselm) { @@ -1777,7 +1785,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, run = &arena_miscelm_get(chunk, rpages_ind)->run; bin = run->bin; malloc_mutex_lock(&bin->lock); - arena_dalloc_bin_locked(arena, chunk, ptr, bitselm); + arena_dalloc_bin_locked_impl(arena, chunk, ptr, bitselm, false); malloc_mutex_unlock(&bin->lock); } @@ -1800,7 +1808,7 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, #undef arena_dalloc_junk_large #define arena_dalloc_junk_large JEMALLOC_N(arena_dalloc_junk_large_impl) #endif -static void +void arena_dalloc_junk_large(void *ptr, size_t usize) { @@ -1815,7 +1823,8 @@ arena_dalloc_junk_large_t *arena_dalloc_junk_large = #endif void -arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr) +arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk, + void *ptr, bool junked) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); @@ -1824,7 +1833,8 @@ arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr) if (config_fill || config_stats) { size_t usize = arena_mapbits_large_size_get(chunk, pageind); - arena_dalloc_junk_large(ptr, usize); + if (!junked) + arena_dalloc_junk_large(ptr, usize); if (config_stats) { index_t index = size2index(usize) - NBINS; @@ -1839,11 +1849,19 @@ arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr) } void +arena_dalloc_large_junked_locked(arena_t *arena, arena_chunk_t *chunk, + void *ptr) +{ + + arena_dalloc_large_locked_impl(arena, chunk, ptr, true); +} + +void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr) { malloc_mutex_lock(&arena->lock); - arena_dalloc_large_locked(arena, chunk, ptr); + arena_dalloc_large_locked_impl(arena, chunk, ptr, false); malloc_mutex_unlock(&arena->lock); } @@ -2398,6 +2416,7 @@ arena_boot(void) sizeof(arena_chunk_map_bits_t) * (chunk_npages-map_bias); arena_maxrun = chunksize - (map_bias << LG_PAGE); + assert(arena_maxrun > 0); arena_maxclass = index2size(size2index(chunksize)-1); if 
(arena_maxclass > arena_maxrun) { /* @@ -2407,6 +2426,7 @@ arena_boot(void) */ arena_maxclass = arena_maxrun; } + assert(arena_maxclass > 0); nlclasses = size2index(arena_maxclass) - size2index(SMALL_MAXCLASS); bin_info_init(); diff --git a/src/huge.c b/src/huge.c index 541df60..6c9b97b 100644 --- a/src/huge.c +++ b/src/huge.c @@ -13,7 +13,7 @@ static malloc_mutex_t huge_mtx; static extent_tree_t huge; void * -huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero) +huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, bool try_tcache) { size_t usize; @@ -23,12 +23,12 @@ huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero) return (NULL); } - return (huge_palloc(tsd, arena, usize, chunksize, zero)); + return (huge_palloc(tsd, arena, usize, chunksize, zero, try_tcache)); } void * huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, - bool zero) + bool zero, bool try_tcache) { void *ret; size_t csize; @@ -42,7 +42,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, /* Allocate an extent node with which to track the chunk. */ node = ipalloct(tsd, CACHELINE_CEILING(sizeof(extent_node_t)), - CACHELINE, false, tsd != NULL, NULL); + CACHELINE, false, try_tcache, NULL); if (node == NULL) return (NULL); @@ -58,7 +58,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, } ret = arena_chunk_alloc_huge(arena, NULL, csize, alignment, &is_zeroed); if (ret == NULL) { - idalloct(tsd, node, tsd != NULL); + idalloct(tsd, node, try_tcache); return (NULL); } @@ -122,6 +122,7 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { expand_addr = ptr + CHUNK_CEILING(oldsize); expand_size = CHUNK_CEILING(usize) - CHUNK_CEILING(oldsize); + assert(expand_size > 0); malloc_mutex_lock(&huge_mtx); @@ -223,13 +224,8 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, return (false); } - if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size) - && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { - return (false); - } - /* Shrink the allocation in-place. */ - if (CHUNK_CEILING(oldsize) > CHUNK_CEILING(usize)) { + if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize)) { extent_node_t *node, key; void *excess_addr; size_t excess_size; @@ -251,7 +247,10 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, /* Zap the excess chunks. */ huge_dalloc_junk(ptr + usize, oldsize - usize); - arena_chunk_dalloc_huge(node->arena, excess_addr, excess_size); + if (excess_size > 0) { + arena_chunk_dalloc_huge(node->arena, excess_addr, + excess_size); + } return (false); } @@ -269,7 +268,8 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, void * huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_dalloc) + size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, + bool try_tcache_dalloc) { void *ret; size_t copysize; @@ -283,19 +283,25 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, * different size class. In that case, fall back to allocating new * space and copying. 
*/ - if (alignment > chunksize) - ret = huge_palloc(tsd, arena, size + extra, alignment, zero); - else - ret = huge_malloc(tsd, arena, size + extra, zero); + if (alignment > chunksize) { + ret = huge_palloc(tsd, arena, size + extra, alignment, zero, + try_tcache_alloc); + } else { + ret = huge_malloc(tsd, arena, size + extra, zero, + try_tcache_alloc); + } if (ret == NULL) { if (extra == 0) return (NULL); /* Try again, this time without extra. */ - if (alignment > chunksize) - ret = huge_palloc(tsd, arena, size, alignment, zero); - else - ret = huge_malloc(tsd, arena, size, zero); + if (alignment > chunksize) { + ret = huge_palloc(tsd, arena, size, alignment, zero, + try_tcache_alloc); + } else { + ret = huge_malloc(tsd, arena, size, zero, + try_tcache_alloc); + } if (ret == NULL) return (NULL); @@ -312,7 +318,7 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, } void -huge_dalloc(tsd_t *tsd, void *ptr) +huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache) { extent_node_t *node, key; @@ -330,7 +336,7 @@ huge_dalloc(tsd_t *tsd, void *ptr) huge_dalloc_junk(node->addr, node->size); arena_chunk_dalloc_huge(node->arena, node->addr, CHUNK_CEILING(node->size)); - idalloct(tsd, node, tsd != NULL); + idalloct(tsd, node, try_tcache); } size_t diff --git a/src/jemalloc.c b/src/jemalloc.c index c62d8ce..a862104 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -67,6 +67,8 @@ const uint8_t size2index_tab[] = { #define S2B_7(i) S2B_6(i) S2B_6(i) #define S2B_8(i) S2B_7(i) S2B_7(i) #define S2B_9(i) S2B_8(i) S2B_8(i) +#define S2B_10(i) S2B_9(i) S2B_9(i) +#define S2B_11(i) S2B_10(i) S2B_10(i) #define S2B_no(i) #define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ S2B_##lg_delta_lookup(index) @@ -78,6 +80,8 @@ const uint8_t size2index_tab[] = { #undef S2B_7 #undef S2B_8 #undef S2B_9 +#undef S2B_10 +#undef S2B_11 #undef S2B_no #undef SC }; @@ -199,6 +203,7 @@ static void * a0alloc(size_t size, bool zero) { void *ret; + tsd_t *tsd; if (unlikely(malloc_init())) return (NULL); @@ -206,10 +211,11 @@ a0alloc(size_t size, bool zero) if (size == 0) size = 1; + tsd = tsd_fetch(); if (size <= arena_maxclass) - ret = arena_malloc(NULL, a0get(), size, zero, false); + ret = arena_malloc(tsd, a0get(), size, zero, false); else - ret = huge_malloc(NULL, a0get(), size, zero); + ret = huge_malloc(tsd, a0get(), size, zero, false); return (ret); } @@ -231,16 +237,18 @@ a0calloc(size_t num, size_t size) void a0free(void *ptr) { + tsd_t *tsd; arena_chunk_t *chunk; if (ptr == NULL) return; + tsd = tsd_fetch(); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) - arena_dalloc(NULL, chunk, ptr, false); + arena_dalloc(tsd, chunk, ptr, false); else - huge_dalloc(NULL, ptr); + huge_dalloc(tsd, ptr, false); } /* Create a new arena and insert it into the arenas array at index ind. 
*/ @@ -817,15 +825,15 @@ malloc_conf_init(void) "Invalid conf value", \ k, klen, v, vlen); \ } else if (clip) { \ - if (min != 0 && um < min) \ - o = min; \ - else if (um > max) \ - o = max; \ + if ((min) != 0 && um < (min)) \ + o = (min); \ + else if (um > (max)) \ + o = (max); \ else \ o = um; \ } else { \ - if ((min != 0 && um < min) || \ - um > max) { \ + if (((min) != 0 && um < (min)) \ + || um > (max)) { \ malloc_conf_error( \ "Out-of-range " \ "conf value", \ @@ -847,8 +855,8 @@ malloc_conf_init(void) malloc_conf_error( \ "Invalid conf value", \ k, klen, v, vlen); \ - } else if (l < (ssize_t)min || l > \ - (ssize_t)max) { \ + } else if (l < (ssize_t)(min) || l > \ + (ssize_t)(max)) { \ malloc_conf_error( \ "Out-of-range conf value", \ k, klen, v, vlen); \ @@ -868,15 +876,16 @@ malloc_conf_init(void) CONF_HANDLE_BOOL(opt_abort, "abort", true) /* - * Chunks always require at least one header page, plus - * one data page in the absence of redzones, or three - * pages in the presence of redzones. In order to - * simplify options processing, fix the limit based on - * config_fill. + * Chunks always require at least one header page, + * as many as 2^(LG_SIZE_CLASS_GROUP+1) data pages, and + * possibly an additional page in the presence of + * redzones. In order to simplify options processing, + * use a conservative bound that accommodates all these + * constraints. */ CONF_HANDLE_SIZE_T(opt_lg_chunk, "lg_chunk", LG_PAGE + - (config_fill ? 2 : 1), (sizeof(size_t) << 3) - 1, - true) + LG_SIZE_CLASS_GROUP + (config_fill ? 2 : 1), + (sizeof(size_t) << 3) - 1, true) if (strncmp("dss", k, klen) == 0) { int i; bool match = false; @@ -2088,8 +2097,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); - // XX Dangerous arenas read. - arena = arenas[arena_ind]; + arena = arena_get(tsd, arena_ind, true, true); } else arena = NULL; diff --git a/src/tcache.c b/src/tcache.c index 1bf7026..34224ec 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -117,8 +117,8 @@ tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, (uintptr_t)chunk) >> LG_PAGE; arena_chunk_map_bits_t *bitselm = arena_bitselm_get(chunk, pageind); - arena_dalloc_bin_locked(arena, chunk, ptr, - bitselm); + arena_dalloc_bin_junked_locked(arena, chunk, + ptr, bitselm); } else { /* * This object was allocated via a different @@ -193,9 +193,10 @@ tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, ptr = tbin->avail[i]; assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk->arena == arena) - arena_dalloc_large_locked(arena, chunk, ptr); - else { + if (chunk->arena == arena) { + arena_dalloc_large_junked_locked(arena, chunk, + ptr); + } else { /* * This object was allocated via a different * arena than the one that is currently locked. @@ -279,11 +280,11 @@ tcache_get_hard(tsd_t *tsd) arena = arena_choose(tsd, NULL); if (unlikely(arena == NULL)) return (NULL); - return (tcache_create(arena)); + return (tcache_create(tsd, arena)); } tcache_t * -tcache_create(arena_t *arena) +tcache_create(tsd_t *tsd, arena_t *arena) { tcache_t *tcache; size_t size, stack_offset; @@ -294,23 +295,10 @@ tcache_create(arena_t *arena) size = PTR_CEILING(size); stack_offset = size; size += stack_nelms * sizeof(void *); - /* - * Round up to the nearest multiple of the cacheline size, in order to - * avoid the possibility of false cacheline sharing. 
- * - * That this works relies on the same logic as in ipalloc(), but we - * cannot directly call ipalloc() here due to tcache bootstrapping - * issues. - */ - size = (size + CACHELINE_MASK) & (-CACHELINE); - - if (size <= SMALL_MAXCLASS) - tcache = (tcache_t *)arena_malloc_small(arena, size, true); - else if (size <= tcache_maxclass) - tcache = (tcache_t *)arena_malloc_large(arena, size, true); - else - tcache = (tcache_t *)icalloct(NULL, size, false, arena); + /* Avoid false cacheline sharing. */ + size = sa2u(size, CACHELINE); + tcache = ipalloct(tsd, size, CACHELINE, true, false, arena); if (tcache == NULL) return (NULL); @@ -331,7 +319,6 @@ static void tcache_destroy(tsd_t *tsd, tcache_t *tcache) { unsigned i; - size_t tcache_size; tcache_arena_dissociate(tcache); @@ -366,23 +353,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) arena_prof_accum(tcache->arena, tcache->prof_accumbytes)) prof_idump(); - tcache_size = arena_salloc(tcache, false); - if (tcache_size <= SMALL_MAXCLASS) { - arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); - arena_t *arena = chunk->arena; - size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >> - LG_PAGE; - arena_chunk_map_bits_t *bitselm = arena_bitselm_get(chunk, - pageind); - - arena_dalloc_bin(arena, chunk, tcache, pageind, bitselm); - } else if (tcache_size <= tcache_maxclass) { - arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); - arena_t *arena = chunk->arena; - - arena_dalloc_large(arena, chunk, tcache); - } else - idalloct(tsd, tcache, false); + idalloct(tsd, tcache, false); } void diff --git a/test/unit/lg_chunk.c b/test/unit/lg_chunk.c new file mode 100644 index 0000000..7f0b31c --- /dev/null +++ b/test/unit/lg_chunk.c @@ -0,0 +1,26 @@ +#include "test/jemalloc_test.h" + +/* + * Make sure that opt.lg_chunk clamping is sufficient. In practice, this test + * program will fail a debug assertion during initialization and abort (rather + * than the test soft-failing) if clamping is insufficient. + */ +const char *malloc_conf = "lg_chunk:0"; + +TEST_BEGIN(test_lg_chunk_clamp) +{ + void *p; + + p = mallocx(1, 0); + assert_ptr_not_null(p, "Unexpected mallocx() failure"); + dallocx(p, 0); +} +TEST_END + +int +main(void) +{ + + return (test( + test_lg_chunk_clamp)); +} diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index e62e54f..a8f7aed 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -357,7 +357,7 @@ TEST_BEGIN(test_arenas_lrun_constants) assert_zu_eq(name, expected, "Incorrect "#name" size"); \ } while (0) - TEST_ARENAS_LRUN_CONSTANT(size_t, size, (1 << (LG_PAGE+2))); + TEST_ARENAS_LRUN_CONSTANT(size_t, size, LARGE_MINCLASS); #undef TEST_ARENAS_LRUN_CONSTANT } -- cgit v0.12 From 9b75677e538836b284a0d26a593963187c24a153 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 10 Oct 2014 18:19:20 -0700 Subject: Don't fetch tsd in a0{d,}alloc(). Don't fetch tsd in a0{d,}alloc(), because doing so can cause infinite recursion on systems that require an allocated tsd wrapper. 
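The cycle being avoided: on systems whose TSD implementation lazily allocates a wrapper object, fetching tsd from inside the bootstrap a0 allocator re-enters that same allocator before the fetch can complete, so the diff below goes back to passing NULL rather than a fetched tsd in the a0 paths. A minimal sketch of the recursion, assuming a hypothetical lazily allocated wrapper (the demo_* names are illustrative only, not jemalloc source):

    #include <stdio.h>
    #include <stdlib.h>

    static void *demo_a0alloc(size_t size);

    static void *demo_tsd_wrapper; /* lazily allocated TSD wrapper */
    static int demo_depth;         /* recursion guard for the demo */

    /*
     * The first tsd fetch allocates the wrapper via the same bootstrap
     * allocator, which is exactly the cycle this commit removes.
     */
    static void *
    demo_tsd_fetch(void)
    {

        if (demo_tsd_wrapper == NULL)
            demo_tsd_wrapper = demo_a0alloc(64); /* re-enters a0alloc */
        return (demo_tsd_wrapper);
    }

    static void *
    demo_a0alloc(size_t size)
    {

        if (++demo_depth > 3) {
            /* Stand-in for the stack overflow this would cause. */
            fprintf(stderr, "unbounded recursion\n");
            exit(1);
        }
        (void)demo_tsd_fetch(); /* the problematic fetch */
        return (malloc(size));
    }

    int
    main(void)
    {

        demo_a0alloc(16);
        return (0);
    }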
--- src/jemalloc.c | 18 +++++++----------- test/unit/mq.c | 1 + 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index a862104..fc490eb 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -203,7 +203,6 @@ static void * a0alloc(size_t size, bool zero) { void *ret; - tsd_t *tsd; if (unlikely(malloc_init())) return (NULL); @@ -211,11 +210,10 @@ a0alloc(size_t size, bool zero) if (size == 0) size = 1; - tsd = tsd_fetch(); if (size <= arena_maxclass) - ret = arena_malloc(tsd, a0get(), size, zero, false); + ret = arena_malloc(NULL, a0get(), size, zero, false); else - ret = huge_malloc(tsd, a0get(), size, zero, false); + ret = huge_malloc(NULL, a0get(), size, zero, false); return (ret); } @@ -237,18 +235,16 @@ a0calloc(size_t num, size_t size) void a0free(void *ptr) { - tsd_t *tsd; arena_chunk_t *chunk; if (ptr == NULL) return; - tsd = tsd_fetch(); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) - arena_dalloc(tsd, chunk, ptr, false); + arena_dalloc(NULL, chunk, ptr, false); else - huge_dalloc(tsd, ptr, false); + huge_dalloc(NULL, ptr, false); } /* Create a new arena and insert it into the arenas array at index ind. */ @@ -2301,9 +2297,9 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) * fork/malloc races via the following functions it registers during * initialization using pthread_atfork(), but of course that does no good if * the allocator isn't fully initialized at fork time. The following library - * constructor is a partial solution to this problem. It may still possible to - * trigger the deadlock described above, but doing so would involve forking via - * a library constructor that runs before jemalloc's runs. + * constructor is a partial solution to this problem. It may still be possible + * to trigger the deadlock described above, but doing so would involve forking + * via a library constructor that runs before jemalloc's runs. */ JEMALLOC_ATTR(constructor) static void diff --git a/test/unit/mq.c b/test/unit/mq.c index bd289c5..bde2a48 100644 --- a/test/unit/mq.c +++ b/test/unit/mq.c @@ -85,6 +85,7 @@ TEST_END int main(void) { + return (test( test_mq_basic, test_mq_threaded)); -- cgit v0.12 From 2eb941a3d3a69fa8a73902b29564294f854fc3b0 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 10 Oct 2014 20:40:43 -0700 Subject: Add AC_CACHE_CHECK() for pause instruction. This supports cross compilation. --- configure.ac | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index f8c09c4..cc30da9 100644 --- a/configure.ac +++ b/configure.ac @@ -207,9 +207,10 @@ dnl CPU-specific settings. CPU_SPINWAIT="" case "${host_cpu}" in i686|x86_64) - JE_COMPILABLE([pause instruction], [], - [[__asm__ volatile("pause"); return 0;]], - [je_cv_pause]) + AC_CACHE_CHECK([whether pause instruction is compilable], [je_cv_pause], + [JE_COMPILABLE([pause instruction], [], + [[__asm__ volatile("pause"); return 0;]], + [je_cv_pause])]) if test "x${je_cv_pause}" = "xyes" ; then CPU_SPINWAIT='__asm__ volatile("pause")' fi -- cgit v0.12 From 81e547566e9bd55db7c317c5848ab9dc189047cb Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 10 Oct 2014 22:34:25 -0700 Subject: Add --with-lg-tiny-min, generalize --with-lg-quantum. 
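How the two knobs interact, per the INSTALL text in the diff below: 2^LG_QUANTUM is the minimum allocation alignment jemalloc guarantees, and tiny size classes are the powers of 2 less than the quantum, starting at 2^LG_TINY_MIN (so they only exist when LG_TINY_MIN is smaller than LG_QUANTUM). A small sketch with illustrative values; the DEMO_* macros are placeholders rather than the real jemalloc definitions, and the defaults mentioned below are LG_TINY_MIN == 3 with LG_QUANTUM == 3 or 4 depending on the architecture:

    #include <stdio.h>

    /*
     * Illustrative values only.  The corresponding configure invocation
     * added by this commit would be, e.g.,
     * "./configure --with-lg-quantum=4 --with-lg-tiny-min=3".
     */
    #define DEMO_LG_TINY_MIN 3
    #define DEMO_LG_QUANTUM  4

    int
    main(void)
    {
        size_t quantum = (size_t)1 << DEMO_LG_QUANTUM;
        size_t s;

        /* Tiny size classes are the powers of 2 less than the quantum. */
        for (s = (size_t)1 << DEMO_LG_TINY_MIN; s < quantum; s <<= 1)
            printf("tiny size class: %zu bytes\n", s);
        printf("quantum (minimum alignment): %zu bytes\n", quantum);
        return (0);
    }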
--- INSTALL | 32 ++++++++++--- configure.ac | 21 +++++++-- include/jemalloc/internal/jemalloc_internal.h.in | 1 - .../jemalloc/internal/jemalloc_internal_defs.h.in | 3 ++ include/jemalloc/internal/size_classes.sh | 10 ++-- src/jemalloc.c | 54 ++++++++++++++++++++++ 6 files changed, 105 insertions(+), 16 deletions(-) diff --git a/INSTALL b/INSTALL index 73bf718..a00960a 100644 --- a/INSTALL +++ b/INSTALL @@ -230,10 +230,9 @@ any of the following arguments (not a definitive list) to 'configure': roughly =4, depending on page size. --with-lg-quantum= - Specify the base 2 log of the minimum allocation alignment (only - =3 and =4 are supported). jemalloc needs to know - the minimum alignment that meets the following C standard requirement - (quoted from the April 12, 2011 draft of the C11 standard): + Specify the base 2 log of the minimum allocation alignment. jemalloc needs + to know the minimum alignment that meets the following C standard + requirement (quoted from the April 12, 2011 draft of the C11 standard): The pointer returned if the allocation succeeds is suitably aligned so that it may be assigned to a pointer to any type of object with a @@ -247,8 +246,8 @@ any of the following arguments (not a definitive list) to 'configure': (=4), but the glibc developers chose not to meet this requirement for performance reasons. An old discussion can be found at https://sourceware.org/bugzilla/show_bug.cgi?id=206 . Unlike glibc, - jemalloc does follow the C standard by default (caveat: jemalloc technically - cheats by only providing 8-byte alignment for 8-byte allocation requests), + jemalloc does follow the C standard by default (caveat: jemalloc + technically cheats if --with-lg-tiny-min is smaller than --with-lg-quantum), but the fact that Linux systems already work around this allocator noncompliance means that it is generally safe in practice to let jemalloc's minimum alignment follow glibc's lead. If you specify --with-lg-quantum=3 @@ -256,6 +255,27 @@ any of the following arguments (not a definitive list) to 'configure': are not 16-byte-aligned (24, 40, and 56, assuming --with-lg-size-class-group=2). +--with-lg-tiny-min= + Specify the base 2 log of the minimum tiny size class to support. Tiny + size classes are powers of 2 less than the quantum, and are only + incorporated if is less than (see + --with-lg-quantum). Tiny size classes technically violate the C standard + requirement for minimum alignment, and crashes could conceivably result if + the compiler were to generate instructions that made alignment assumptions, + both because illegal instruction traps could result, and because accesses + could straddle page boundaries and cause segmentation faults due to + accessing unmapped addresses. + + The default of =3 works well in practice even on architectures + that technically require 16-byte alignment, probably for the same reason + --with-lg-quantum=3 works. Smaller tiny size classes can, and will, cause + crashes (see https://bugzilla.mozilla.org/show_bug.cgi?id=691003 for an + example). + + This option is rarely useful, and is mainly provided as documentation of a + subtle implementation detail. If you do use this option, specify a + value in [3, ..., ]. + The following environment variables (not a definitive list) impact configure's behavior: diff --git a/configure.ac b/configure.ac index cc30da9..a7bf103 100644 --- a/configure.ac +++ b/configure.ac @@ -207,7 +207,7 @@ dnl CPU-specific settings. 
CPU_SPINWAIT="" case "${host_cpu}" in i686|x86_64) - AC_CACHE_CHECK([whether pause instruction is compilable], [je_cv_pause], + AC_CACHE_VAL([je_cv_pause], [JE_COMPILABLE([pause instruction], [], [[__asm__ volatile("pause"); return 0;]], [je_cv_pause])]) @@ -970,10 +970,21 @@ else fi fi +AC_ARG_WITH([lg_tiny_min], + [AS_HELP_STRING([--with-lg-tiny-min=], + [Base 2 log of minimum tiny size class to support])], + [LG_TINY_MIN="$with_lg_tiny_min"], + [LG_TINY_MIN="3"]) +AC_DEFINE_UNQUOTED([LG_TINY_MIN], [$LG_TINY_MIN]) + AC_ARG_WITH([lg_quantum], [AS_HELP_STRING([--with-lg-quantum=], [Base 2 log of minimum allocation alignment])], - [AC_DEFINE_UNQUOTED([LG_QUANTUM], [$with_lg_quantum])]) + [LG_QUANTA="$with_lg_quantum"], + [LG_QUANTA="3 4"]) +if test "x$with_lg_quantum" != "x" ; then + AC_DEFINE_UNQUOTED([LG_QUANTUM], [$with_lg_quantum]) +fi AC_ARG_WITH([lg_page], [AS_HELP_STRING([--with-lg-page=], [Base 2 log of system page size])], @@ -1480,11 +1491,13 @@ AC_CONFIG_COMMANDS([include/jemalloc/internal/public_unnamespace.h], [ ]) AC_CONFIG_COMMANDS([include/jemalloc/internal/size_classes.h], [ mkdir -p "${objroot}include/jemalloc/internal" - "${srcdir}/include/jemalloc/internal/size_classes.sh" ${LG_PAGE_SIZES} ${LG_SIZE_CLASS_GROUP} > "${objroot}include/jemalloc/internal/size_classes.h" + "${srcdir}/include/jemalloc/internal/size_classes.sh" "${LG_QUANTA}" ${LG_TINY_MIN} "${LG_PAGE_SIZES}" ${LG_SIZE_CLASS_GROUP} > "${objroot}include/jemalloc/internal/size_classes.h" ], [ srcdir="${srcdir}" objroot="${objroot}" - LG_PAGE_SIZES=${LG_PAGE_SIZES} + LG_QUANTA="${LG_QUANTA}" + LG_TINY_MIN=${LG_TINY_MIN} + LG_PAGE_SIZES="${LG_PAGE_SIZES}" LG_SIZE_CLASS_GROUP=${LG_SIZE_CLASS_GROUP} ]) AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_protos_jet.h], [ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 3f65fad..294e2cc 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -181,7 +181,6 @@ typedef unsigned index_t; (((unsigned)(flags >> 8)) - 1) /* Smallest size class to support. */ -#define LG_TINY_MIN 3 #define TINY_MIN (1U << LG_TINY_MIN) /* diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 0ff939c..dccbb1e 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -144,6 +144,9 @@ /* Support lazy locking (avoid locking unless a second thread is launched). */ #undef JEMALLOC_LAZY_LOCK +/* Minimum size class to support is 2^LG_TINY_MIN bytes. */ +#undef LG_TINY_MIN + /* * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size * classes). diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 733338c..38020dc 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Usage: size_classes.sh +# Usage: size_classes.sh # The following limits are chosen such that they cover all supported platforms. @@ -8,19 +8,19 @@ lg_zarr="2 3" # Quanta. -lg_qarr="3 4" +lg_qarr=$1 # The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)]. -lg_tmin=3 +lg_tmin=$2 # Maximum lookup size. lg_kmax=12 # Page sizes. -lg_parr=`echo $1 | tr ',' ' '` +lg_parr=`echo $3 | tr ',' ' '` # Size class group size (number of size classes for each size doubling). 
-lg_g=$2 +lg_g=$4 pow2() { e=$1 diff --git a/src/jemalloc.c b/src/jemalloc.c index fc490eb..4543959 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -60,15 +60,69 @@ const size_t index2size_tab[NSIZES] = { JEMALLOC_ALIGNED(CACHELINE) const uint8_t size2index_tab[] = { +#if LG_TINY_MIN == 0 +#warning "Dangerous LG_TINY_MIN" +#define S2B_0(i) i, +#elif LG_TINY_MIN == 1 +#warning "Dangerous LG_TINY_MIN" +#define S2B_1(i) i, +#elif LG_TINY_MIN == 2 +#warning "Dangerous LG_TINY_MIN" +#define S2B_2(i) i, +#elif LG_TINY_MIN == 3 #define S2B_3(i) i, +#elif LG_TINY_MIN == 4 +#define S2B_4(i) i, +#elif LG_TINY_MIN == 5 +#define S2B_5(i) i, +#elif LG_TINY_MIN == 6 +#define S2B_6(i) i, +#elif LG_TINY_MIN == 7 +#define S2B_7(i) i, +#elif LG_TINY_MIN == 8 +#define S2B_8(i) i, +#elif LG_TINY_MIN == 9 +#define S2B_9(i) i, +#elif LG_TINY_MIN == 10 +#define S2B_10(i) i, +#elif LG_TINY_MIN == 11 +#define S2B_11(i) i, +#else +#error "Unsupported LG_TINY_MIN" +#endif +#if LG_TINY_MIN < 1 +#define S2B_1(i) S2B_0(i) S2B_0(i) +#endif +#if LG_TINY_MIN < 2 +#define S2B_2(i) S2B_1(i) S2B_1(i) +#endif +#if LG_TINY_MIN < 3 +#define S2B_3(i) S2B_2(i) S2B_2(i) +#endif +#if LG_TINY_MIN < 4 #define S2B_4(i) S2B_3(i) S2B_3(i) +#endif +#if LG_TINY_MIN < 5 #define S2B_5(i) S2B_4(i) S2B_4(i) +#endif +#if LG_TINY_MIN < 6 #define S2B_6(i) S2B_5(i) S2B_5(i) +#endif +#if LG_TINY_MIN < 7 #define S2B_7(i) S2B_6(i) S2B_6(i) +#endif +#if LG_TINY_MIN < 8 #define S2B_8(i) S2B_7(i) S2B_7(i) +#endif +#if LG_TINY_MIN < 9 #define S2B_9(i) S2B_8(i) S2B_8(i) +#endif +#if LG_TINY_MIN < 10 #define S2B_10(i) S2B_9(i) S2B_9(i) +#endif +#if LG_TINY_MIN < 11 #define S2B_11(i) S2B_10(i) S2B_10(i) +#endif #define S2B_no(i) #define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ S2B_##lg_delta_lookup(index) -- cgit v0.12 From 381c23dd9d3bf019cc4c7523a900be1e888802a7 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 10 Oct 2014 23:01:03 -0700 Subject: Remove arena_dalloc_bin_run() clean page preservation. Remove code in arena_dalloc_bin_run() that preserved the "clean" state of trailing clean pages by splitting them into a separate run during deallocation. This was a useful mechanism for reducing dirty page churn when bin runs comprised many pages, but bin runs are now quite small. Remove the nextind field from arena_run_t now that it is no longer needed, and change arena_run_t's bin field (arena_bin_t *) to binind (index_t). These two changes remove 8 bytes of chunk header overhead per page, which saves 1/512 of all arena chunk memory. --- include/jemalloc/internal/arena.h | 14 ++++---- src/arena.c | 73 ++++----------------------------------- 2 files changed, 13 insertions(+), 74 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index f5b9fc6..28ff727 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -36,11 +36,8 @@ typedef struct arena_s arena_t; #ifdef JEMALLOC_H_STRUCTS struct arena_run_s { - /* Bin this run is associated with. */ - arena_bin_t *bin; - - /* Index of next region that has never been allocated, or nregs. */ - uint32_t nextind; + /* Index of bin this run is associated with. */ + index_t binind; /* Number of free regions in run. 
*/ unsigned nfree; @@ -756,7 +753,7 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) size_t rpages_ind; arena_run_t *run; arena_bin_t *bin; - index_t actual_binind; + index_t run_binind, actual_binind; arena_bin_info_t *bin_info; arena_chunk_map_misc_t *miscelm; void *rpages; @@ -774,9 +771,10 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) pageind); miscelm = arena_miscelm_get(chunk, rpages_ind); run = &miscelm->run; - bin = run->bin; + run_binind = run->binind; + bin = &arena->bins[run_binind]; actual_binind = bin - arena->bins; - assert(binind == actual_binind); + assert(run_binind == actual_binind); bin_info = &arena_bin_info[actual_binind]; rpages = arena_miscelm_to_rpages(miscelm); assert(((uintptr_t)ptr - ((uintptr_t)rpages + diff --git a/src/arena.c b/src/arena.c index bbe58fa..8872331 100644 --- a/src/arena.c +++ b/src/arena.c @@ -155,9 +155,6 @@ arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) ret = (void *)((uintptr_t)rpages + (uintptr_t)bin_info->reg0_offset + (uintptr_t)(bin_info->reg_interval * regind)); run->nfree--; - if (regind == run->nextind) - run->nextind++; - assert(regind < run->nextind); return (ret); } @@ -361,26 +358,12 @@ arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, arena_run_split_remove(arena, chunk, run_ind, flag_dirty, need_pages); - /* - * Propagate the dirty and unzeroed flags to the allocated small run, - * so that arena_dalloc_bin_run() has the ability to conditionally trim - * clean pages. - */ - arena_mapbits_small_set(chunk, run_ind, 0, binind, flag_dirty); - if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk, - run_ind) == 0) - arena_run_page_validate_zeroed(chunk, run_ind); - for (i = 1; i < need_pages - 1; i++) { + for (i = 0; i < need_pages; i++) { arena_mapbits_small_set(chunk, run_ind+i, i, binind, 0); if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk, run_ind+i) == 0) arena_run_page_validate_zeroed(chunk, run_ind+i); } - arena_mapbits_small_set(chunk, run_ind+need_pages-1, need_pages-1, - binind, flag_dirty); - if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk, - run_ind+need_pages-1) == 0) - arena_run_page_validate_zeroed(chunk, run_ind+need_pages-1); JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + (run_ind << LG_PAGE)), (need_pages << LG_PAGE)); } @@ -1002,8 +985,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_mapbits_large_size_get(chunk, run_ind+(size>>LG_PAGE)-1) == 0); } else { - index_t binind = arena_bin_index(arena, run->bin); - arena_bin_info_t *bin_info = &arena_bin_info[binind]; + arena_bin_info_t *bin_info = &arena_bin_info[run->binind]; size = bin_info->run_size; } run_pages = (size >> LG_PAGE); @@ -1199,8 +1181,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) run = arena_run_alloc_small(arena, bin_info->run_size, binind); if (run != NULL) { /* Initialize run internals. 
*/ - run->bin = bin; - run->nextind = 0; + run->binind = binind; run->nfree = bin_info->nregs; bitmap_init(run->bitmap, &bin_info->bitmap_info); } @@ -1652,54 +1633,15 @@ static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin) { - index_t binind; - arena_bin_info_t *bin_info; - size_t npages, run_ind, past; - arena_chunk_map_misc_t *miscelm; - void *rpages; assert(run != bin->runcur); assert(arena_run_tree_search(&bin->runs, arena_run_to_miscelm(run)) == NULL); - binind = arena_bin_index(chunk->arena, run->bin); - bin_info = &arena_bin_info[binind]; - malloc_mutex_unlock(&bin->lock); /******************************/ - npages = bin_info->run_size >> LG_PAGE; - miscelm = arena_run_to_miscelm(run); - run_ind = arena_miscelm_to_pageind(miscelm); - rpages = arena_miscelm_to_rpages(miscelm); - past = (size_t)(PAGE_CEILING((uintptr_t)rpages + - (uintptr_t)bin_info->reg0_offset + (uintptr_t)(run->nextind * - bin_info->reg_interval - bin_info->redzone_size) - - (uintptr_t)chunk) >> LG_PAGE); malloc_mutex_lock(&arena->lock); - - /* - * If the run was originally clean, and some pages were never touched, - * trim the clean pages before deallocating the dirty portion of the - * run. - */ - assert(arena_mapbits_dirty_get(chunk, run_ind) == - arena_mapbits_dirty_get(chunk, run_ind+npages-1)); - if (arena_mapbits_dirty_get(chunk, run_ind) == 0 && past - run_ind < - npages) { - /* Trim clean pages. Convert to large run beforehand. */ - assert(npages > 0); - if (past > run_ind) { - arena_mapbits_large_set(chunk, run_ind, - bin_info->run_size, 0); - arena_mapbits_large_set(chunk, run_ind+npages-1, 0, 0); - arena_run_trim_tail(arena, chunk, run, (npages << - LG_PAGE), ((past - run_ind) << LG_PAGE), false); - arena_run_dalloc(arena, run, true, false); - } else - arena_run_dalloc(arena, run, false, false); - /* npages = past - run_ind; */ - } else - arena_run_dalloc(arena, run, true, false); + arena_run_dalloc(arena, run, true, false); malloc_mutex_unlock(&arena->lock); /****************************/ malloc_mutex_lock(&bin->lock); @@ -1742,9 +1684,8 @@ arena_dalloc_bin_locked_impl(arena_t *arena, arena_chunk_t *chunk, void *ptr, pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; rpages_ind = pageind - arena_mapbits_small_runind_get(chunk, pageind); run = &arena_miscelm_get(chunk, rpages_ind)->run; - bin = run->bin; - binind = arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, - pageind)); + binind = run->binind; + bin = &arena->bins[binind]; bin_info = &arena_bin_info[binind]; if (config_fill || config_stats) size = bin_info->reg_size; @@ -1783,7 +1724,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, rpages_ind = pageind - arena_mapbits_small_runind_get(chunk, pageind); run = &arena_miscelm_get(chunk, rpages_ind)->run; - bin = run->bin; + bin = &arena->bins[run->binind]; malloc_mutex_lock(&bin->lock); arena_dalloc_bin_locked_impl(arena, chunk, ptr, bitselm, false); malloc_mutex_unlock(&bin->lock); -- cgit v0.12 From 44c97b712ef1669a4c75ea97e8d47c0535e9ec71 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 12 Oct 2014 13:03:20 -0700 Subject: Fix a prof_tctx_t/prof_tdata_t cleanup race. Fix a prof_tctx_t/prof_tdata_t cleanup race by storing a copy of thr_uid in prof_tctx_t, so that the associated tdata need not be present during tctx teardown. 
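The race in brief: prof_tctx_comp() and the dump path used to reach the thread id through tctx->tdata->thr_uid, but a tctx can outlive its owning thread's tdata, so that dereference could hit defunct memory during teardown. Caching thr_uid in the tctx at creation time (the ret.p->thr_uid = tdata->thr_uid line in the diff below) removes the dependency. A simplified before/after sketch; the demo_* types abbreviate the real structs and are not jemalloc source:

    #include <stdio.h>
    #include <stdint.h>

    struct demo_tdata {
        uint64_t thr_uid;          /* freed when the owning thread exits */
    };

    struct demo_tctx {
        struct demo_tdata *tdata;  /* may be defunct during tctx teardown */
        uint64_t thr_uid;          /* new: copy taken at tctx creation */
    };

    /* Before: chases tdata, racing with tdata teardown. */
    static int
    demo_tctx_comp_old(const struct demo_tctx *a, const struct demo_tctx *b)
    {
        uint64_t a_uid = a->tdata->thr_uid; /* unsafe if tdata is gone */
        uint64_t b_uid = b->tdata->thr_uid;

        return ((a_uid > b_uid) - (a_uid < b_uid));
    }

    /* After: uses the cached copy, so tdata need not be present. */
    static int
    demo_tctx_comp_new(const struct demo_tctx *a, const struct demo_tctx *b)
    {

        return ((a->thr_uid > b->thr_uid) - (a->thr_uid < b->thr_uid));
    }

    int
    main(void)
    {
        struct demo_tdata td_a = {1}, td_b = {2};
        struct demo_tctx a = {&td_a, td_a.thr_uid};
        struct demo_tctx b = {&td_b, td_b.thr_uid};

        printf("old: %d, new: %d\n", demo_tctx_comp_old(&a, &b),
            demo_tctx_comp_new(&a, &b));
        return (0);
    }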
--- include/jemalloc/internal/prof.h | 6 ++++++ src/prof.c | 10 +++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index c801471..5103146 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -89,6 +89,12 @@ struct prof_tctx_s { /* Thread data for thread that performed the allocation. */ prof_tdata_t *tdata; + /* + * Copy of tdata->thr_uid, necessary because tdata may be defunct during + * teardown. + */ + uint64_t thr_uid; + /* Profiling counters, protected by tdata->lock. */ prof_cnt_t cnts; diff --git a/src/prof.c b/src/prof.c index 3e2e427..4016327 100644 --- a/src/prof.c +++ b/src/prof.c @@ -128,8 +128,8 @@ static char *prof_thread_name_alloc(tsd_t *tsd, const char *thread_name); JEMALLOC_INLINE_C int prof_tctx_comp(const prof_tctx_t *a, const prof_tctx_t *b) { - uint64_t a_uid = a->tdata->thr_uid; - uint64_t b_uid = b->tdata->thr_uid; + uint64_t a_uid = a->thr_uid; + uint64_t b_uid = b->thr_uid; return ((a_uid > b_uid) - (a_uid < b_uid)); } @@ -755,6 +755,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) return (NULL); } ret.p->tdata = tdata; + ret.p->thr_uid = tdata->thr_uid; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); ret.p->gctx = gctx; ret.p->prepared = true; @@ -1051,9 +1052,8 @@ prof_tctx_dump_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) if (prof_dump_printf(propagate_err, " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", - tctx->tdata->thr_uid, tctx->dump_cnts.curobjs, - tctx->dump_cnts.curbytes, tctx->dump_cnts.accumobjs, - tctx->dump_cnts.accumbytes)) + tctx->thr_uid, tctx->dump_cnts.curobjs, tctx->dump_cnts.curbytes, + tctx->dump_cnts.accumobjs, tctx->dump_cnts.accumbytes)) return (tctx); return (NULL); } -- cgit v0.12 From 3c4d92e82a31f652a7c77ca937a02d0185085b06 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 12 Oct 2014 22:53:59 -0700 Subject: Add per size class huge allocation statistics. Add per size class huge allocation statistics, and normalize various stats: - Change the arenas.nlruns type from size_t to unsigned. - Add the arenas.nhchunks and arenas.hchunks..size mallctl's. - Replace the stats.arenas..bins..allocated mallctl with stats.arenas..bins..curregs . - Add the stats.arenas..hchunks..nmalloc, stats.arenas..hchunks..ndalloc, stats.arenas..hchunks..nrequests, and stats.arenas..hchunks..curhchunks mallctl's. --- doc/jemalloc.xml.in | 98 +++++++++-- include/jemalloc/internal/arena.h | 9 +- include/jemalloc/internal/ctl.h | 1 + include/jemalloc/internal/stats.h | 34 +++- src/arena.c | 79 +++++---- src/ctl.c | 334 ++++++++++++++++++++++++-------------- src/huge.c | 168 +++++++++++-------- src/stats.c | 215 +++++++++++++++--------- test/unit/mallctl.c | 21 ++- test/unit/stats.c | 103 +++++++++++- 10 files changed, 724 insertions(+), 338 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 7da1498..8111fc1 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -406,11 +406,12 @@ for (i = 0; i < nbins; i++) { functions simultaneously. If is specified during configuration, “m” and “a” can be specified to omit merged arena and per arena statistics, respectively; - “b” and “l” can be specified to omit per size - class statistics for bins and large objects, respectively. Unrecognized - characters are silently ignored. 
Note that thread caching may prevent - some statistics from being completely up to date, since extra locking - would be required to merge counters that track thread cache operations. + “b”, “l”, and “h” can be specified to + omit per size class statistics for bins, large objects, and huge objects, + respectively. Unrecognized characters are silently ignored. Note that + thread caching may prevent some statistics from being completely up to + date, since extra locking would be required to merge counters that track + thread cache operations. The malloc_usable_size function @@ -1520,7 +1521,7 @@ malloc_conf = "xmalloc:true";]]> arenas.nlruns - (size_t) + (unsigned) r- Total number of large size classes. @@ -1536,6 +1537,25 @@ malloc_conf = "xmalloc:true";]]> class. + + + arenas.nhchunks + (unsigned) + r- + + Total number of huge size classes. + + + + + arenas.hchunks.<i>.size + (size_t) + r- + + Maximum size supported by this huge size + class. + + arenas.extend @@ -1945,17 +1965,6 @@ malloc_conf = "xmalloc:true";]]> - - - stats.arenas.<i>.bins.<j>.allocated - (size_t) - r- - [] - - Current number of bytes allocated by - bin. - - stats.arenas.<i>.bins.<j>.nmalloc @@ -1989,6 +1998,17 @@ malloc_conf = "xmalloc:true";]]> requests. + + + stats.arenas.<i>.bins.<j>.curregs + (size_t) + r- + [] + + Current number of regions for this size + class. + + stats.arenas.<i>.bins.<j>.nfills @@ -2083,6 +2103,50 @@ malloc_conf = "xmalloc:true";]]> Current number of runs for this size class. + + + + stats.arenas.<i>.hchunks.<j>.nmalloc + (uint64_t) + r- + [] + + Cumulative number of allocation requests for this size + class served directly by the arena. + + + + + stats.arenas.<i>.hchunks.<j>.ndalloc + (uint64_t) + r- + [] + + Cumulative number of deallocation requests for this + size class served directly by the arena. + + + + + stats.arenas.<i>.hchunks.<j>.nrequests + (uint64_t) + r- + [] + + Cumulative number of allocation requests for this size + class. + + + + + stats.arenas.<i>.hchunks.<j>.curhchunks + (size_t) + r- + [] + + Current number of huge allocations for this size class. + + diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 28ff727..c31c8d7 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -335,11 +335,12 @@ extern size_t map_bias; /* Number of arena chunk header pages. */ extern size_t map_misc_offset; extern size_t arena_maxrun; /* Max run size for arenas. */ extern size_t arena_maxclass; /* Max size class for arenas. */ -extern size_t nlclasses; /* Number of large size classes. */ +extern unsigned nlclasses; /* Number of large size classes. */ +extern unsigned nhclasses; /* Number of huge size classes. 
*/ -void *arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t size, +void *arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t usize, size_t alignment, bool *zero); -void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t size); +void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize); void arena_purge_all(arena_t *arena); void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, index_t binind, uint64_t prof_accumbytes); @@ -387,7 +388,7 @@ dss_prec_t arena_dss_prec_get(arena_t *arena); bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, - malloc_large_stats_t *lstats); + malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats); arena_t *arena_new(unsigned ind); void arena_boot(void); void arena_prefork(arena_t *arena); diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index 2d301bf..a3e899e 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -46,6 +46,7 @@ struct ctl_arena_stats_s { malloc_bin_stats_t bstats[NBINS]; malloc_large_stats_t *lstats; /* nlclasses elements. */ + malloc_huge_stats_t *hstats; /* nhclasses elements. */ }; struct ctl_stats_s { diff --git a/include/jemalloc/internal/stats.h b/include/jemalloc/internal/stats.h index 6104cb3..d8600ed 100644 --- a/include/jemalloc/internal/stats.h +++ b/include/jemalloc/internal/stats.h @@ -4,6 +4,7 @@ typedef struct tcache_bin_stats_s tcache_bin_stats_t; typedef struct malloc_bin_stats_s malloc_bin_stats_t; typedef struct malloc_large_stats_s malloc_large_stats_t; +typedef struct malloc_huge_stats_s malloc_huge_stats_t; typedef struct arena_stats_s arena_stats_t; typedef struct chunk_stats_s chunk_stats_t; @@ -21,12 +22,6 @@ struct tcache_bin_stats_s { struct malloc_bin_stats_s { /* - * Current number of bytes allocated, including objects currently - * cached by tcache. - */ - size_t allocated; - - /* * Total number of allocation/deallocation requests served directly by * the bin. Note that tcache may allocate an object, then recycle it * many times, resulting many increments to nrequests, but only one @@ -42,6 +37,12 @@ struct malloc_bin_stats_s { */ uint64_t nrequests; + /* + * Current number of regions of this size class, including regions + * currently cached by tcache. + */ + size_t curregs; + /* Number of tcache fills from this bin. */ uint64_t nfills; @@ -78,10 +79,25 @@ struct malloc_large_stats_s { */ uint64_t nrequests; - /* Current number of runs of this size class. */ + /* + * Current number of runs of this size class, including runs currently + * cached by tcache. + */ size_t curruns; }; +struct malloc_huge_stats_s { + /* + * Total number of allocation/deallocation requests served directly by + * the arena. + */ + uint64_t nmalloc; + uint64_t ndalloc; + + /* Current number of (multi-)chunk allocations of this size class. */ + size_t curhchunks; +}; + struct arena_stats_s { /* Number of bytes currently mapped. */ size_t mapped; @@ -104,10 +120,12 @@ struct arena_stats_s { size_t allocated_huge; uint64_t nmalloc_huge; uint64_t ndalloc_huge; - uint64_t nrequests_huge; /* One element for each large size class. */ malloc_large_stats_t *lstats; + + /* One element for each huge size class. 
*/ + malloc_huge_stats_t *hstats; }; struct chunk_stats_s { diff --git a/src/arena.c b/src/arena.c index 8872331..74c3632 100644 --- a/src/arena.c +++ b/src/arena.c @@ -11,7 +11,8 @@ size_t map_bias; size_t map_misc_offset; size_t arena_maxrun; /* Max run size for arenas. */ size_t arena_maxclass; /* Max size class for arenas. */ -size_t nlclasses; /* Number of large size classes. */ +unsigned nlclasses; /* Number of large size classes. */ +unsigned nhclasses; /* Number of huge size classes. */ /******************************************************************************/ /* @@ -411,7 +412,7 @@ arena_chunk_alloc_internal(arena_t *arena, size_t size, size_t alignment, } void * -arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t size, +arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t usize, size_t alignment, bool *zero) { void *ret; @@ -422,26 +423,33 @@ arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t size, chunk_alloc = arena->chunk_alloc; chunk_dalloc = arena->chunk_dalloc; if (config_stats) { + index_t index = size2index(usize) - nlclasses - NBINS; + /* Optimistically update stats prior to unlocking. */ - arena->stats.mapped += size; - arena->stats.allocated_huge += size; + arena->stats.allocated_huge += usize; arena->stats.nmalloc_huge++; - arena->stats.nrequests_huge++; + arena->stats.hstats[index].nmalloc++; + arena->stats.hstats[index].curhchunks++; + arena->stats.mapped += usize; } - arena->nactive += (size >> LG_PAGE); + arena->nactive += (usize >> LG_PAGE); malloc_mutex_unlock(&arena->lock); ret = chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, - new_addr, size, alignment, zero); + new_addr, usize, alignment, zero); if (config_stats) { if (ret != NULL) - stats_cactive_add(size); + stats_cactive_add(usize); else { - /* Revert optimistic stats updates. */ + index_t index = size2index(usize) - nlclasses - NBINS; + malloc_mutex_lock(&arena->lock); - arena->stats.mapped -= size; - arena->stats.allocated_huge -= size; + /* Revert optimistic stats updates. 
*/ + arena->stats.allocated_huge -= usize; arena->stats.nmalloc_huge--; + arena->stats.hstats[index].nmalloc--; + arena->stats.hstats[index].curhchunks--; + arena->stats.mapped -= usize; malloc_mutex_unlock(&arena->lock); } } @@ -534,21 +542,25 @@ arena_chunk_dalloc_internal(arena_t *arena, arena_chunk_t *chunk) } void -arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t size) +arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize) { chunk_dalloc_t *chunk_dalloc; malloc_mutex_lock(&arena->lock); chunk_dalloc = arena->chunk_dalloc; if (config_stats) { - arena->stats.mapped -= size; - arena->stats.allocated_huge -= size; + index_t index = size2index(usize) - nlclasses - NBINS; + arena->stats.ndalloc_huge++; - stats_cactive_sub(size); + arena->stats.allocated_huge -= usize; + arena->stats.hstats[index].ndalloc++; + arena->stats.hstats[index].curhchunks--; + arena->stats.mapped -= usize; + stats_cactive_sub(usize); } - arena->nactive -= (size >> LG_PAGE); + arena->nactive -= (usize >> LG_PAGE); malloc_mutex_unlock(&arena->lock); - chunk_dalloc(chunk, size, arena->ind); + chunk_dalloc(chunk, usize, arena->ind); } static void @@ -1300,9 +1312,9 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, index_t binind, tbin->avail[nfill - 1 - i] = ptr; } if (config_stats) { - bin->stats.allocated += i * arena_bin_info[binind].reg_size; bin->stats.nmalloc += i; bin->stats.nrequests += tbin->tstats.nrequests; + bin->stats.curregs += i; bin->stats.nfills++; tbin->tstats.nrequests = 0; } @@ -1436,9 +1448,9 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) } if (config_stats) { - bin->stats.allocated += size; bin->stats.nmalloc++; bin->stats.nrequests++; + bin->stats.curregs++; } malloc_mutex_unlock(&bin->lock); if (config_prof && !isthreaded && arena_prof_accum(arena, size)) @@ -1678,7 +1690,6 @@ arena_dalloc_bin_locked_impl(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_run_t *run; arena_bin_t *bin; arena_bin_info_t *bin_info; - size_t size; index_t binind; pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; @@ -1687,8 +1698,6 @@ arena_dalloc_bin_locked_impl(arena_t *arena, arena_chunk_t *chunk, void *ptr, binind = run->binind; bin = &arena->bins[binind]; bin_info = &arena_bin_info[binind]; - if (config_fill || config_stats) - size = bin_info->reg_size; if (!junked && config_fill && unlikely(opt_junk)) arena_dalloc_junk_small(ptr, bin_info); @@ -1701,8 +1710,8 @@ arena_dalloc_bin_locked_impl(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_bin_lower_run(arena, chunk, run, bin); if (config_stats) { - bin->stats.allocated -= size; bin->stats.ndalloc++; + bin->stats.curregs--; } } @@ -2102,7 +2111,7 @@ arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec) void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, - malloc_large_stats_t *lstats) + malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats) { unsigned i; @@ -2122,7 +2131,6 @@ arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, astats->allocated_huge += arena->stats.allocated_huge; astats->nmalloc_huge += arena->stats.nmalloc_huge; astats->ndalloc_huge += arena->stats.ndalloc_huge; - astats->nrequests_huge += arena->stats.nrequests_huge; for (i = 0; i < nlclasses; i++) { lstats[i].nmalloc += arena->stats.lstats[i].nmalloc; @@ -2130,16 +2138,22 @@ arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, lstats[i].nrequests += arena->stats.lstats[i].nrequests; 
lstats[i].curruns += arena->stats.lstats[i].curruns; } + + for (i = 0; i < nhclasses; i++) { + hstats[i].nmalloc += arena->stats.hstats[i].nmalloc; + hstats[i].ndalloc += arena->stats.hstats[i].ndalloc; + hstats[i].curhchunks += arena->stats.hstats[i].curhchunks; + } malloc_mutex_unlock(&arena->lock); for (i = 0; i < NBINS; i++) { arena_bin_t *bin = &arena->bins[i]; malloc_mutex_lock(&bin->lock); - bstats[i].allocated += bin->stats.allocated; bstats[i].nmalloc += bin->stats.nmalloc; bstats[i].ndalloc += bin->stats.ndalloc; bstats[i].nrequests += bin->stats.nrequests; + bstats[i].curregs += bin->stats.curregs; if (config_tcache) { bstats[i].nfills += bin->stats.nfills; bstats[i].nflushes += bin->stats.nflushes; @@ -2159,12 +2173,13 @@ arena_new(unsigned ind) arena_bin_t *bin; /* - * Allocate arena and arena->lstats contiguously, mainly because there - * is no way to clean up if base_alloc() OOMs. + * Allocate arena, arena->lstats, and arena->hstats contiguously, mainly + * because there is no way to clean up if base_alloc() OOMs. */ if (config_stats) { arena = (arena_t *)base_alloc(CACHELINE_CEILING(sizeof(arena_t)) - + nlclasses * sizeof(malloc_large_stats_t)); + + QUANTUM_CEILING(nlclasses * sizeof(malloc_large_stats_t) + + nhclasses) * sizeof(malloc_huge_stats_t)); } else arena = (arena_t *)base_alloc(sizeof(arena_t)); if (arena == NULL) @@ -2184,6 +2199,11 @@ arena_new(unsigned ind) CACHELINE_CEILING(sizeof(arena_t))); memset(arena->stats.lstats, 0, nlclasses * sizeof(malloc_large_stats_t)); + arena->stats.hstats = (malloc_huge_stats_t *)(((void *)arena) + + CACHELINE_CEILING(sizeof(arena_t)) + + QUANTUM_CEILING(nlclasses * sizeof(malloc_large_stats_t))); + memset(arena->stats.hstats, 0, nhclasses * + sizeof(malloc_huge_stats_t)); if (config_tcache) ql_new(&arena->tcache_ql); } @@ -2369,6 +2389,7 @@ arena_boot(void) } assert(arena_maxclass > 0); nlclasses = size2index(arena_maxclass) - size2index(SMALL_MAXCLASS); + nhclasses = NSIZES - nlclasses - NBINS; bin_info_init(); } diff --git a/src/ctl.c b/src/ctl.c index 37f8f42..72598b3 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -122,6 +122,8 @@ CTL_PROTO(arenas_bin_i_run_size) INDEX_PROTO(arenas_bin_i) CTL_PROTO(arenas_lrun_i_size) INDEX_PROTO(arenas_lrun_i) +CTL_PROTO(arenas_hchunk_i_size) +INDEX_PROTO(arenas_hchunk_i) CTL_PROTO(arenas_narenas) CTL_PROTO(arenas_initialized) CTL_PROTO(arenas_quantum) @@ -130,6 +132,7 @@ CTL_PROTO(arenas_tcache_max) CTL_PROTO(arenas_nbins) CTL_PROTO(arenas_nhbins) CTL_PROTO(arenas_nlruns) +CTL_PROTO(arenas_nhchunks) CTL_PROTO(arenas_extend) CTL_PROTO(prof_thread_active_init) CTL_PROTO(prof_active) @@ -152,10 +155,10 @@ CTL_PROTO(stats_arenas_i_huge_allocated) CTL_PROTO(stats_arenas_i_huge_nmalloc) CTL_PROTO(stats_arenas_i_huge_ndalloc) CTL_PROTO(stats_arenas_i_huge_nrequests) -CTL_PROTO(stats_arenas_i_bins_j_allocated) CTL_PROTO(stats_arenas_i_bins_j_nmalloc) CTL_PROTO(stats_arenas_i_bins_j_ndalloc) CTL_PROTO(stats_arenas_i_bins_j_nrequests) +CTL_PROTO(stats_arenas_i_bins_j_curregs) CTL_PROTO(stats_arenas_i_bins_j_nfills) CTL_PROTO(stats_arenas_i_bins_j_nflushes) CTL_PROTO(stats_arenas_i_bins_j_nruns) @@ -167,6 +170,11 @@ CTL_PROTO(stats_arenas_i_lruns_j_ndalloc) CTL_PROTO(stats_arenas_i_lruns_j_nrequests) CTL_PROTO(stats_arenas_i_lruns_j_curruns) INDEX_PROTO(stats_arenas_i_lruns_j) +CTL_PROTO(stats_arenas_i_hchunks_j_nmalloc) +CTL_PROTO(stats_arenas_i_hchunks_j_ndalloc) +CTL_PROTO(stats_arenas_i_hchunks_j_nrequests) +CTL_PROTO(stats_arenas_i_hchunks_j_curhchunks) +INDEX_PROTO(stats_arenas_i_hchunks_j) 
CTL_PROTO(stats_arenas_i_nthreads) CTL_PROTO(stats_arenas_i_dss) CTL_PROTO(stats_arenas_i_pactive) @@ -221,60 +229,60 @@ static const ctl_named_node_t thread_node[] = { }; static const ctl_named_node_t config_node[] = { - {NAME("debug"), CTL(config_debug)}, - {NAME("fill"), CTL(config_fill)}, - {NAME("lazy_lock"), CTL(config_lazy_lock)}, - {NAME("munmap"), CTL(config_munmap)}, - {NAME("prof"), CTL(config_prof)}, - {NAME("prof_libgcc"), CTL(config_prof_libgcc)}, - {NAME("prof_libunwind"), CTL(config_prof_libunwind)}, - {NAME("stats"), CTL(config_stats)}, - {NAME("tcache"), CTL(config_tcache)}, - {NAME("tls"), CTL(config_tls)}, - {NAME("utrace"), CTL(config_utrace)}, - {NAME("valgrind"), CTL(config_valgrind)}, - {NAME("xmalloc"), CTL(config_xmalloc)} + {NAME("debug"), CTL(config_debug)}, + {NAME("fill"), CTL(config_fill)}, + {NAME("lazy_lock"), CTL(config_lazy_lock)}, + {NAME("munmap"), CTL(config_munmap)}, + {NAME("prof"), CTL(config_prof)}, + {NAME("prof_libgcc"), CTL(config_prof_libgcc)}, + {NAME("prof_libunwind"), CTL(config_prof_libunwind)}, + {NAME("stats"), CTL(config_stats)}, + {NAME("tcache"), CTL(config_tcache)}, + {NAME("tls"), CTL(config_tls)}, + {NAME("utrace"), CTL(config_utrace)}, + {NAME("valgrind"), CTL(config_valgrind)}, + {NAME("xmalloc"), CTL(config_xmalloc)} }; static const ctl_named_node_t opt_node[] = { - {NAME("abort"), CTL(opt_abort)}, - {NAME("dss"), CTL(opt_dss)}, - {NAME("lg_chunk"), CTL(opt_lg_chunk)}, - {NAME("narenas"), CTL(opt_narenas)}, - {NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)}, - {NAME("stats_print"), CTL(opt_stats_print)}, - {NAME("junk"), CTL(opt_junk)}, - {NAME("zero"), CTL(opt_zero)}, - {NAME("quarantine"), CTL(opt_quarantine)}, - {NAME("redzone"), CTL(opt_redzone)}, - {NAME("utrace"), CTL(opt_utrace)}, - {NAME("xmalloc"), CTL(opt_xmalloc)}, - {NAME("tcache"), CTL(opt_tcache)}, - {NAME("lg_tcache_max"), CTL(opt_lg_tcache_max)}, - {NAME("prof"), CTL(opt_prof)}, - {NAME("prof_prefix"), CTL(opt_prof_prefix)}, - {NAME("prof_active"), CTL(opt_prof_active)}, + {NAME("abort"), CTL(opt_abort)}, + {NAME("dss"), CTL(opt_dss)}, + {NAME("lg_chunk"), CTL(opt_lg_chunk)}, + {NAME("narenas"), CTL(opt_narenas)}, + {NAME("lg_dirty_mult"), CTL(opt_lg_dirty_mult)}, + {NAME("stats_print"), CTL(opt_stats_print)}, + {NAME("junk"), CTL(opt_junk)}, + {NAME("zero"), CTL(opt_zero)}, + {NAME("quarantine"), CTL(opt_quarantine)}, + {NAME("redzone"), CTL(opt_redzone)}, + {NAME("utrace"), CTL(opt_utrace)}, + {NAME("xmalloc"), CTL(opt_xmalloc)}, + {NAME("tcache"), CTL(opt_tcache)}, + {NAME("lg_tcache_max"), CTL(opt_lg_tcache_max)}, + {NAME("prof"), CTL(opt_prof)}, + {NAME("prof_prefix"), CTL(opt_prof_prefix)}, + {NAME("prof_active"), CTL(opt_prof_active)}, {NAME("prof_thread_active_init"), CTL(opt_prof_thread_active_init)}, - {NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)}, - {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, - {NAME("prof_gdump"), CTL(opt_prof_gdump)}, - {NAME("prof_final"), CTL(opt_prof_final)}, - {NAME("prof_leak"), CTL(opt_prof_leak)}, - {NAME("prof_accum"), CTL(opt_prof_accum)} + {NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)}, + {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, + {NAME("prof_gdump"), CTL(opt_prof_gdump)}, + {NAME("prof_final"), CTL(opt_prof_final)}, + {NAME("prof_leak"), CTL(opt_prof_leak)}, + {NAME("prof_accum"), CTL(opt_prof_accum)} }; static const ctl_named_node_t chunk_node[] = { - {NAME("alloc"), CTL(arena_i_chunk_alloc)}, - {NAME("dalloc"), CTL(arena_i_chunk_dalloc)} + {NAME("alloc"), CTL(arena_i_chunk_alloc)}, + 
{NAME("dalloc"), CTL(arena_i_chunk_dalloc)} }; static const ctl_named_node_t arena_i_node[] = { - {NAME("purge"), CTL(arena_i_purge)}, - {NAME("dss"), CTL(arena_i_dss)}, - {NAME("chunk"), CHILD(named, chunk)}, + {NAME("purge"), CTL(arena_i_purge)}, + {NAME("dss"), CTL(arena_i_dss)}, + {NAME("chunk"), CHILD(named, chunk)}, }; static const ctl_named_node_t super_arena_i_node[] = { - {NAME(""), CHILD(named, arena_i)} + {NAME(""), CHILD(named, arena_i)} }; static const ctl_indexed_node_t arena_node[] = { @@ -282,12 +290,12 @@ static const ctl_indexed_node_t arena_node[] = { }; static const ctl_named_node_t arenas_bin_i_node[] = { - {NAME("size"), CTL(arenas_bin_i_size)}, - {NAME("nregs"), CTL(arenas_bin_i_nregs)}, - {NAME("run_size"), CTL(arenas_bin_i_run_size)} + {NAME("size"), CTL(arenas_bin_i_size)}, + {NAME("nregs"), CTL(arenas_bin_i_nregs)}, + {NAME("run_size"), CTL(arenas_bin_i_run_size)} }; static const ctl_named_node_t super_arenas_bin_i_node[] = { - {NAME(""), CHILD(named, arenas_bin_i)} + {NAME(""), CHILD(named, arenas_bin_i)} }; static const ctl_indexed_node_t arenas_bin_node[] = { @@ -295,28 +303,41 @@ static const ctl_indexed_node_t arenas_bin_node[] = { }; static const ctl_named_node_t arenas_lrun_i_node[] = { - {NAME("size"), CTL(arenas_lrun_i_size)} + {NAME("size"), CTL(arenas_lrun_i_size)} }; static const ctl_named_node_t super_arenas_lrun_i_node[] = { - {NAME(""), CHILD(named, arenas_lrun_i)} + {NAME(""), CHILD(named, arenas_lrun_i)} }; static const ctl_indexed_node_t arenas_lrun_node[] = { {INDEX(arenas_lrun_i)} }; +static const ctl_named_node_t arenas_hchunk_i_node[] = { + {NAME("size"), CTL(arenas_hchunk_i_size)} +}; +static const ctl_named_node_t super_arenas_hchunk_i_node[] = { + {NAME(""), CHILD(named, arenas_hchunk_i)} +}; + +static const ctl_indexed_node_t arenas_hchunk_node[] = { + {INDEX(arenas_hchunk_i)} +}; + static const ctl_named_node_t arenas_node[] = { - {NAME("narenas"), CTL(arenas_narenas)}, - {NAME("initialized"), CTL(arenas_initialized)}, - {NAME("quantum"), CTL(arenas_quantum)}, - {NAME("page"), CTL(arenas_page)}, - {NAME("tcache_max"), CTL(arenas_tcache_max)}, - {NAME("nbins"), CTL(arenas_nbins)}, - {NAME("nhbins"), CTL(arenas_nhbins)}, - {NAME("bin"), CHILD(indexed, arenas_bin)}, - {NAME("nlruns"), CTL(arenas_nlruns)}, - {NAME("lrun"), CHILD(indexed, arenas_lrun)}, - {NAME("extend"), CTL(arenas_extend)} + {NAME("narenas"), CTL(arenas_narenas)}, + {NAME("initialized"), CTL(arenas_initialized)}, + {NAME("quantum"), CTL(arenas_quantum)}, + {NAME("page"), CTL(arenas_page)}, + {NAME("tcache_max"), CTL(arenas_tcache_max)}, + {NAME("nbins"), CTL(arenas_nbins)}, + {NAME("nhbins"), CTL(arenas_nhbins)}, + {NAME("bin"), CHILD(indexed, arenas_bin)}, + {NAME("nlruns"), CTL(arenas_nlruns)}, + {NAME("lrun"), CHILD(indexed, arenas_lrun)}, + {NAME("nhchunks"), CTL(arenas_nhchunks)}, + {NAME("hchunk"), CHILD(indexed, arenas_hchunk)}, + {NAME("extend"), CTL(arenas_extend)} }; static const ctl_named_node_t prof_node[] = { @@ -329,45 +350,45 @@ static const ctl_named_node_t prof_node[] = { }; static const ctl_named_node_t stats_chunks_node[] = { - {NAME("current"), CTL(stats_chunks_current)}, - {NAME("total"), CTL(stats_chunks_total)}, - {NAME("high"), CTL(stats_chunks_high)} + {NAME("current"), CTL(stats_chunks_current)}, + {NAME("total"), CTL(stats_chunks_total)}, + {NAME("high"), CTL(stats_chunks_high)} }; static const ctl_named_node_t stats_arenas_i_small_node[] = { - {NAME("allocated"), CTL(stats_arenas_i_small_allocated)}, - {NAME("nmalloc"), 
CTL(stats_arenas_i_small_nmalloc)}, - {NAME("ndalloc"), CTL(stats_arenas_i_small_ndalloc)}, - {NAME("nrequests"), CTL(stats_arenas_i_small_nrequests)} + {NAME("allocated"), CTL(stats_arenas_i_small_allocated)}, + {NAME("nmalloc"), CTL(stats_arenas_i_small_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_small_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_small_nrequests)} }; static const ctl_named_node_t stats_arenas_i_large_node[] = { - {NAME("allocated"), CTL(stats_arenas_i_large_allocated)}, - {NAME("nmalloc"), CTL(stats_arenas_i_large_nmalloc)}, - {NAME("ndalloc"), CTL(stats_arenas_i_large_ndalloc)}, - {NAME("nrequests"), CTL(stats_arenas_i_large_nrequests)} + {NAME("allocated"), CTL(stats_arenas_i_large_allocated)}, + {NAME("nmalloc"), CTL(stats_arenas_i_large_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_large_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_large_nrequests)} }; static const ctl_named_node_t stats_arenas_i_huge_node[] = { - {NAME("allocated"), CTL(stats_arenas_i_huge_allocated)}, - {NAME("nmalloc"), CTL(stats_arenas_i_huge_nmalloc)}, - {NAME("ndalloc"), CTL(stats_arenas_i_huge_ndalloc)}, - {NAME("nrequests"), CTL(stats_arenas_i_huge_nrequests)}, + {NAME("allocated"), CTL(stats_arenas_i_huge_allocated)}, + {NAME("nmalloc"), CTL(stats_arenas_i_huge_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_huge_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_huge_nrequests)} }; static const ctl_named_node_t stats_arenas_i_bins_j_node[] = { - {NAME("allocated"), CTL(stats_arenas_i_bins_j_allocated)}, - {NAME("nmalloc"), CTL(stats_arenas_i_bins_j_nmalloc)}, - {NAME("ndalloc"), CTL(stats_arenas_i_bins_j_ndalloc)}, - {NAME("nrequests"), CTL(stats_arenas_i_bins_j_nrequests)}, - {NAME("nfills"), CTL(stats_arenas_i_bins_j_nfills)}, - {NAME("nflushes"), CTL(stats_arenas_i_bins_j_nflushes)}, - {NAME("nruns"), CTL(stats_arenas_i_bins_j_nruns)}, - {NAME("nreruns"), CTL(stats_arenas_i_bins_j_nreruns)}, - {NAME("curruns"), CTL(stats_arenas_i_bins_j_curruns)} + {NAME("nmalloc"), CTL(stats_arenas_i_bins_j_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_bins_j_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_bins_j_nrequests)}, + {NAME("curregs"), CTL(stats_arenas_i_bins_j_curregs)}, + {NAME("nfills"), CTL(stats_arenas_i_bins_j_nfills)}, + {NAME("nflushes"), CTL(stats_arenas_i_bins_j_nflushes)}, + {NAME("nruns"), CTL(stats_arenas_i_bins_j_nruns)}, + {NAME("nreruns"), CTL(stats_arenas_i_bins_j_nreruns)}, + {NAME("curruns"), CTL(stats_arenas_i_bins_j_curruns)} }; static const ctl_named_node_t super_stats_arenas_i_bins_j_node[] = { - {NAME(""), CHILD(named, stats_arenas_i_bins_j)} + {NAME(""), CHILD(named, stats_arenas_i_bins_j)} }; static const ctl_indexed_node_t stats_arenas_i_bins_node[] = { @@ -375,36 +396,51 @@ static const ctl_indexed_node_t stats_arenas_i_bins_node[] = { }; static const ctl_named_node_t stats_arenas_i_lruns_j_node[] = { - {NAME("nmalloc"), CTL(stats_arenas_i_lruns_j_nmalloc)}, - {NAME("ndalloc"), CTL(stats_arenas_i_lruns_j_ndalloc)}, - {NAME("nrequests"), CTL(stats_arenas_i_lruns_j_nrequests)}, - {NAME("curruns"), CTL(stats_arenas_i_lruns_j_curruns)} + {NAME("nmalloc"), CTL(stats_arenas_i_lruns_j_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_lruns_j_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_lruns_j_nrequests)}, + {NAME("curruns"), CTL(stats_arenas_i_lruns_j_curruns)} }; static const ctl_named_node_t super_stats_arenas_i_lruns_j_node[] = { - {NAME(""), CHILD(named, stats_arenas_i_lruns_j)} + {NAME(""), CHILD(named, 
stats_arenas_i_lruns_j)} }; static const ctl_indexed_node_t stats_arenas_i_lruns_node[] = { {INDEX(stats_arenas_i_lruns_j)} }; +static const ctl_named_node_t stats_arenas_i_hchunks_j_node[] = { + {NAME("nmalloc"), CTL(stats_arenas_i_hchunks_j_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_hchunks_j_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_hchunks_j_nrequests)}, + {NAME("curhchunks"), CTL(stats_arenas_i_hchunks_j_curhchunks)} +}; +static const ctl_named_node_t super_stats_arenas_i_hchunks_j_node[] = { + {NAME(""), CHILD(named, stats_arenas_i_hchunks_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_hchunks_node[] = { + {INDEX(stats_arenas_i_hchunks_j)} +}; + static const ctl_named_node_t stats_arenas_i_node[] = { - {NAME("nthreads"), CTL(stats_arenas_i_nthreads)}, - {NAME("dss"), CTL(stats_arenas_i_dss)}, - {NAME("pactive"), CTL(stats_arenas_i_pactive)}, - {NAME("pdirty"), CTL(stats_arenas_i_pdirty)}, - {NAME("mapped"), CTL(stats_arenas_i_mapped)}, - {NAME("npurge"), CTL(stats_arenas_i_npurge)}, - {NAME("nmadvise"), CTL(stats_arenas_i_nmadvise)}, - {NAME("purged"), CTL(stats_arenas_i_purged)}, - {NAME("small"), CHILD(named, stats_arenas_i_small)}, - {NAME("large"), CHILD(named, stats_arenas_i_large)}, - {NAME("huge"), CHILD(named, stats_arenas_i_huge)}, - {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)}, - {NAME("lruns"), CHILD(indexed, stats_arenas_i_lruns)} + {NAME("nthreads"), CTL(stats_arenas_i_nthreads)}, + {NAME("dss"), CTL(stats_arenas_i_dss)}, + {NAME("pactive"), CTL(stats_arenas_i_pactive)}, + {NAME("pdirty"), CTL(stats_arenas_i_pdirty)}, + {NAME("mapped"), CTL(stats_arenas_i_mapped)}, + {NAME("npurge"), CTL(stats_arenas_i_npurge)}, + {NAME("nmadvise"), CTL(stats_arenas_i_nmadvise)}, + {NAME("purged"), CTL(stats_arenas_i_purged)}, + {NAME("small"), CHILD(named, stats_arenas_i_small)}, + {NAME("large"), CHILD(named, stats_arenas_i_large)}, + {NAME("huge"), CHILD(named, stats_arenas_i_huge)}, + {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)}, + {NAME("lruns"), CHILD(indexed, stats_arenas_i_lruns)}, + {NAME("hchunks"), CHILD(indexed, stats_arenas_i_hchunks)} }; static const ctl_named_node_t super_stats_arenas_i_node[] = { - {NAME(""), CHILD(named, stats_arenas_i)} + {NAME(""), CHILD(named, stats_arenas_i)} }; static const ctl_indexed_node_t stats_arenas_node[] = { @@ -412,12 +448,12 @@ static const ctl_indexed_node_t stats_arenas_node[] = { }; static const ctl_named_node_t stats_node[] = { - {NAME("cactive"), CTL(stats_cactive)}, - {NAME("allocated"), CTL(stats_allocated)}, - {NAME("active"), CTL(stats_active)}, - {NAME("mapped"), CTL(stats_mapped)}, - {NAME("chunks"), CHILD(named, stats_chunks)}, - {NAME("arenas"), CHILD(indexed, stats_arenas)} + {NAME("cactive"), CTL(stats_cactive)}, + {NAME("allocated"), CTL(stats_allocated)}, + {NAME("active"), CTL(stats_active)}, + {NAME("mapped"), CTL(stats_mapped)}, + {NAME("chunks"), CHILD(named, stats_chunks)}, + {NAME("arenas"), CHILD(indexed, stats_arenas)} }; static const ctl_named_node_t root_node[] = { @@ -453,6 +489,13 @@ ctl_arena_init(ctl_arena_stats_t *astats) return (true); } + if (astats->hstats == NULL) { + astats->hstats = (malloc_huge_stats_t *)a0malloc(nhclasses * + sizeof(malloc_huge_stats_t)); + if (astats->hstats == NULL) + return (true); + } + return (false); } @@ -472,6 +515,8 @@ ctl_arena_clear(ctl_arena_stats_t *astats) memset(astats->bstats, 0, NBINS * sizeof(malloc_bin_stats_t)); memset(astats->lstats, 0, nlclasses * sizeof(malloc_large_stats_t)); + memset(astats->hstats, 0, nhclasses * + 
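The per-arena, per-class huge counters added here are read the same way as the existing bin and large-run stats: write "epoch" to refresh the snapshot, then read the stats.arenas.<i>.hchunks.<j> leaves. A usage sketch for arena 0, class 0 (error handling omitted):

/* Usage sketch: read per-class huge stats for arena 0 via mallctl(). */
#include <stdint.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	uint64_t epoch = 1, nmalloc, ndalloc;
	size_t curhchunks, sz;

	/* Refresh the stats snapshot. */
	mallctl("epoch", NULL, NULL, &epoch, sizeof(epoch));

	sz = sizeof(uint64_t);
	mallctl("stats.arenas.0.hchunks.0.nmalloc", &nmalloc, &sz, NULL, 0);
	mallctl("stats.arenas.0.hchunks.0.ndalloc", &ndalloc, &sz, NULL, 0);
	sz = sizeof(size_t);
	mallctl("stats.arenas.0.hchunks.0.curhchunks", &curhchunks, &sz,
	    NULL, 0);

	printf("nmalloc=%llu ndalloc=%llu curhchunks=%zu\n",
	    (unsigned long long)nmalloc, (unsigned long long)ndalloc,
	    curhchunks);
	return (0);
}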
sizeof(malloc_huge_stats_t)); } } @@ -481,10 +526,12 @@ ctl_arena_stats_amerge(ctl_arena_stats_t *cstats, arena_t *arena) unsigned i; arena_stats_merge(arena, &cstats->dss, &cstats->pactive, - &cstats->pdirty, &cstats->astats, cstats->bstats, cstats->lstats); + &cstats->pdirty, &cstats->astats, cstats->bstats, cstats->lstats, + cstats->hstats); for (i = 0; i < NBINS; i++) { - cstats->allocated_small += cstats->bstats[i].allocated; + cstats->allocated_small += cstats->bstats[i].curregs * + index2size(i); cstats->nmalloc_small += cstats->bstats[i].nmalloc; cstats->ndalloc_small += cstats->bstats[i].ndalloc; cstats->nrequests_small += cstats->bstats[i].nrequests; @@ -517,20 +564,12 @@ ctl_arena_stats_smerge(ctl_arena_stats_t *sstats, ctl_arena_stats_t *astats) sstats->astats.allocated_huge += astats->astats.allocated_huge; sstats->astats.nmalloc_huge += astats->astats.nmalloc_huge; sstats->astats.ndalloc_huge += astats->astats.ndalloc_huge; - sstats->astats.nrequests_huge += astats->astats.nrequests_huge; - - for (i = 0; i < nlclasses; i++) { - sstats->lstats[i].nmalloc += astats->lstats[i].nmalloc; - sstats->lstats[i].ndalloc += astats->lstats[i].ndalloc; - sstats->lstats[i].nrequests += astats->lstats[i].nrequests; - sstats->lstats[i].curruns += astats->lstats[i].curruns; - } for (i = 0; i < NBINS; i++) { - sstats->bstats[i].allocated += astats->bstats[i].allocated; sstats->bstats[i].nmalloc += astats->bstats[i].nmalloc; sstats->bstats[i].ndalloc += astats->bstats[i].ndalloc; sstats->bstats[i].nrequests += astats->bstats[i].nrequests; + sstats->bstats[i].curregs += astats->bstats[i].curregs; if (config_tcache) { sstats->bstats[i].nfills += astats->bstats[i].nfills; sstats->bstats[i].nflushes += @@ -540,6 +579,19 @@ ctl_arena_stats_smerge(ctl_arena_stats_t *sstats, ctl_arena_stats_t *astats) sstats->bstats[i].reruns += astats->bstats[i].reruns; sstats->bstats[i].curruns += astats->bstats[i].curruns; } + + for (i = 0; i < nlclasses; i++) { + sstats->lstats[i].nmalloc += astats->lstats[i].nmalloc; + sstats->lstats[i].ndalloc += astats->lstats[i].ndalloc; + sstats->lstats[i].nrequests += astats->lstats[i].nrequests; + sstats->lstats[i].curruns += astats->lstats[i].curruns; + } + + for (i = 0; i < nhclasses; i++) { + sstats->hstats[i].nmalloc += astats->hstats[i].nmalloc; + sstats->hstats[i].ndalloc += astats->hstats[i].ndalloc; + sstats->hstats[i].curhchunks += astats->hstats[i].curhchunks; + } } static void @@ -692,6 +744,8 @@ ctl_init(void) for (j = 0; j < i; j++) { a0free( ctl_stats.arenas[j].lstats); + a0free( + ctl_stats.arenas[j].hstats); } a0free(ctl_stats.arenas); ctl_stats.arenas = NULL; @@ -1600,7 +1654,7 @@ arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i) return (super_arenas_bin_i_node); } -CTL_RO_NL_GEN(arenas_nlruns, nlclasses, size_t) +CTL_RO_NL_GEN(arenas_nlruns, nlclasses, unsigned) CTL_RO_NL_GEN(arenas_lrun_i_size, index2size(NBINS+mib[2]), size_t) static const ctl_named_node_t * arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i) @@ -1611,6 +1665,17 @@ arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i) return (super_arenas_lrun_i_node); } +CTL_RO_NL_GEN(arenas_nhchunks, nhclasses, unsigned) +CTL_RO_NL_GEN(arenas_hchunk_i_size, index2size(NBINS+nlclasses+mib[2]), size_t) +static const ctl_named_node_t * +arenas_hchunk_i_index(const size_t *mib, size_t miblen, size_t i) +{ + + if (i > nhclasses) + return (NULL); + return (super_arenas_hchunk_i_node); +} + static int arenas_extend_ctl(const size_t *mib, size_t miblen, void *oldp, size_t 
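With bin->stats.allocated gone, ctl_arena_stats_amerge() reconstructs the small allocated byte count from live region counts, as shown above: curregs times the class size, summed over bins. The bookkeeping in isolation (the bin sizes and counts below are made up):

/*
 * Sketch only: derive small "allocated" bytes from per-bin live region
 * counts instead of a separately maintained allocated counter.
 */
#include <stddef.h>
#include <stdio.h>

int
main(void)
{
	static const size_t bin_size[] = {8, 16, 32, 48};
	static const size_t curregs[] = {10, 0, 3, 1};
	size_t allocated_small = 0;
	size_t i;

	for (i = 0; i < sizeof(bin_size) / sizeof(bin_size[0]); i++)
		allocated_small += curregs[i] * bin_size[i];
	printf("allocated_small = %zu bytes\n", allocated_small);
	return (0);
}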
*oldlenp, void *newp, size_t newlen) @@ -1784,16 +1849,16 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_huge_nmalloc, CTL_RO_CGEN(config_stats, stats_arenas_i_huge_ndalloc, ctl_stats.arenas[mib[2]].astats.ndalloc_huge, uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_huge_nrequests, - ctl_stats.arenas[mib[2]].astats.nrequests_huge, uint64_t) + ctl_stats.arenas[mib[2]].astats.nmalloc_huge, uint64_t) /* Intentional. */ -CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_allocated, - ctl_stats.arenas[mib[2]].bstats[mib[4]].allocated, size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nmalloc, ctl_stats.arenas[mib[2]].bstats[mib[4]].nmalloc, uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_ndalloc, ctl_stats.arenas[mib[2]].bstats[mib[4]].ndalloc, uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nrequests, ctl_stats.arenas[mib[2]].bstats[mib[4]].nrequests, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curregs, + ctl_stats.arenas[mib[2]].bstats[mib[4]].curregs, size_t) CTL_RO_CGEN(config_stats && config_tcache, stats_arenas_i_bins_j_nfills, ctl_stats.arenas[mib[2]].bstats[mib[4]].nfills, uint64_t) CTL_RO_CGEN(config_stats && config_tcache, stats_arenas_i_bins_j_nflushes, @@ -1832,6 +1897,25 @@ stats_arenas_i_lruns_j_index(const size_t *mib, size_t miblen, size_t j) return (super_stats_arenas_i_lruns_j_node); } +CTL_RO_CGEN(config_stats, stats_arenas_i_hchunks_j_nmalloc, + ctl_stats.arenas[mib[2]].hstats[mib[4]].nmalloc, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_hchunks_j_ndalloc, + ctl_stats.arenas[mib[2]].hstats[mib[4]].ndalloc, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_hchunks_j_nrequests, + ctl_stats.arenas[mib[2]].hstats[mib[4]].nmalloc, /* Intentional. */ + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_hchunks_j_curhchunks, + ctl_stats.arenas[mib[2]].hstats[mib[4]].curhchunks, size_t) + +static const ctl_named_node_t * +stats_arenas_i_hchunks_j_index(const size_t *mib, size_t miblen, size_t j) +{ + + if (j > nhclasses) + return (NULL); + return (super_stats_arenas_i_hchunks_j_node); +} + static const ctl_named_node_t * stats_arenas_i_index(const size_t *mib, size_t miblen, size_t i) { diff --git a/src/huge.c b/src/huge.c index 6c9b97b..5f46241 100644 --- a/src/huge.c +++ b/src/huge.c @@ -104,6 +104,101 @@ huge_dalloc_junk(void *ptr, size_t usize) huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); #endif +static void +huge_ralloc_no_move_stats_update(arena_t *arena, size_t oldsize, size_t usize) +{ + index_t oldindex = size2index(oldsize) - nlclasses - NBINS; + index_t index = size2index(usize) - nlclasses - NBINS; + + cassert(config_stats); + + arena->stats.ndalloc_huge++; + arena->stats.allocated_huge -= oldsize; + arena->stats.hstats[oldindex].ndalloc++; + arena->stats.hstats[oldindex].curhchunks--; + + arena->stats.nmalloc_huge++; + arena->stats.allocated_huge += usize; + arena->stats.hstats[index].nmalloc++; + arena->stats.hstats[index].curhchunks++; +} + +static void +huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, + size_t size, size_t extra, bool zero) +{ + size_t usize_next; + extent_node_t *node, key; + arena_t *arena; + + /* Increase usize to incorporate extra. 
*/ + while (usize < s2u(size+extra) && (usize_next = s2u(usize+1)) < oldsize) + usize = usize_next; + + malloc_mutex_lock(&huge_mtx); + + key.addr = ptr; + node = extent_tree_ad_search(&huge, &key); + assert(node != NULL); + assert(node->addr == ptr); + + arena = node->arena; + + /* Update the size of the huge allocation if it changed. */ + if (oldsize != usize) { + assert(node->size != usize); + node->size = usize; + } + + malloc_mutex_unlock(&huge_mtx); + + /* Fill if necessary. */ + if (oldsize < usize) { + if (zero || (config_fill && unlikely(opt_zero))) + memset(ptr + oldsize, 0, usize - oldsize); + else if (config_fill && unlikely(opt_junk)) + memset(ptr + oldsize, 0xa5, usize - oldsize); + } else if (config_fill && unlikely(opt_junk) && oldsize > usize) + memset(ptr + usize, 0x5a, oldsize - usize); + + if (config_stats) + huge_ralloc_no_move_stats_update(arena, oldsize, usize); +} + +static void +huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) +{ + extent_node_t *node, key; + arena_t *arena; + void *excess_addr; + size_t excess_size; + + malloc_mutex_lock(&huge_mtx); + + key.addr = ptr; + node = extent_tree_ad_search(&huge, &key); + assert(node != NULL); + assert(node->addr == ptr); + + arena = node->arena; + + /* Update the size of the huge allocation. */ + node->size = usize; + + malloc_mutex_unlock(&huge_mtx); + + excess_addr = node->addr + CHUNK_CEILING(usize); + excess_size = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize); + + /* Zap the excess chunks. */ + huge_dalloc_junk(ptr + usize, oldsize - usize); + if (excess_size > 0) + arena_chunk_dalloc_huge(arena, excess_addr, excess_size); + + if (config_stats) + huge_ralloc_no_move_stats_update(arena, oldsize, usize); +} + static bool huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { size_t usize; @@ -131,7 +226,6 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { assert(node != NULL); assert(node->addr == ptr); - /* Find the current arena. */ arena = node->arena; malloc_mutex_unlock(&huge_mtx); @@ -159,6 +253,10 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { else if (unlikely(opt_zero) && !is_zeroed) memset(ptr + oldsize, 0, usize - oldsize); } + + if (config_stats) + huge_ralloc_no_move_stats_update(arena, oldsize, usize); + return (false); } @@ -185,78 +283,20 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, */ if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize) && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { - size_t usize_next; - - /* Increase usize to incorporate extra. */ - while (usize < s2u(size+extra) && (usize_next = s2u(usize+1)) < - oldsize) - usize = usize_next; - - /* Update the size of the huge allocation if it changed. 
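huge_ralloc_no_move_shrink() can only hand back whole trailing chunks, so the excess region is computed purely from chunk-ceiling differences. A standalone illustration, assuming a 4 MiB chunk size:

/*
 * Sketch only: compute the excess region released when a huge
 * allocation shrinks in place.  CHUNKSZ is an assumed 4 MiB chunk size.
 */
#include <stddef.h>
#include <stdio.h>

#define CHUNKSZ ((size_t)4 << 20)
#define CHUNK_CEILING(s) (((s) + CHUNKSZ - 1) & ~(CHUNKSZ - 1))

int
main(void)
{
	size_t oldsize = (size_t)11 << 20;	/* 11 MiB -> 3 chunks */
	size_t usize = (size_t)6 << 20;		/*  6 MiB -> 2 chunks */
	size_t excess_off = CHUNK_CEILING(usize);
	size_t excess_size = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize);

	/* One whole trailing chunk (4 MiB at offset 8 MiB) can be freed. */
	printf("excess at offset %zu, size %zu\n", excess_off, excess_size);
	return (0);
}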
*/ - if (oldsize != usize) { - extent_node_t *node, key; - - malloc_mutex_lock(&huge_mtx); - - key.addr = ptr; - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - assert(node->addr == ptr); - - assert(node->size != usize); - node->size = usize; - - malloc_mutex_unlock(&huge_mtx); - - if (oldsize < usize) { - if (zero || (config_fill && - unlikely(opt_zero))) { - memset(ptr + oldsize, 0, usize - - oldsize); - } else if (config_fill && unlikely(opt_junk)) { - memset(ptr + oldsize, 0xa5, usize - - oldsize); - } - } else if (config_fill && unlikely(opt_junk) && oldsize - > usize) - memset(ptr + usize, 0x5a, oldsize - usize); - } + huge_ralloc_no_move_similar(ptr, oldsize, usize, size, extra, + zero); return (false); } /* Shrink the allocation in-place. */ if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize)) { - extent_node_t *node, key; - void *excess_addr; - size_t excess_size; - - malloc_mutex_lock(&huge_mtx); - - key.addr = ptr; - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - assert(node->addr == ptr); - - /* Update the size of the huge allocation. */ - node->size = usize; - - malloc_mutex_unlock(&huge_mtx); - - excess_addr = node->addr + CHUNK_CEILING(usize); - excess_size = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize); - - /* Zap the excess chunks. */ - huge_dalloc_junk(ptr + usize, oldsize - usize); - if (excess_size > 0) { - arena_chunk_dalloc_huge(node->arena, excess_addr, - excess_size); - } - + huge_ralloc_no_move_shrink(ptr, oldsize, usize); return (false); } /* Attempt to expand the allocation in-place. */ - if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra, zero)) { + if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra, + zero)) { if (extra == 0) return (true); diff --git a/src/stats.c b/src/stats.c index 5c3d701..16a18c5 100644 --- a/src/stats.c +++ b/src/stats.c @@ -48,8 +48,10 @@ static void stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, unsigned i); static void stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, unsigned i); +static void stats_arena_hchunks_print( + void (*write_cb)(void *, const char *), void *cbopaque, unsigned i); static void stats_arena_print(void (*write_cb)(void *, const char *), - void *cbopaque, unsigned i, bool bins, bool large); + void *cbopaque, unsigned i, bool bins, bool large, bool huge); /******************************************************************************/ @@ -58,62 +60,55 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, unsigned i) { size_t page; - bool config_tcache; - unsigned nbins, j, gap_start; + bool config_tcache, in_gap; + unsigned nbins, j; CTL_GET("arenas.page", &page, size_t); CTL_GET("config.tcache", &config_tcache, bool); if (config_tcache) { malloc_cprintf(write_cb, cbopaque, - "bins: bin size regs pgs allocated nmalloc" - " ndalloc nrequests nfills nflushes" - " newruns reruns curruns\n"); + "bins: size ind allocated nmalloc" + " ndalloc nrequests curregs regs pgs" + " nfills nflushes newruns reruns" + " curruns\n"); } else { malloc_cprintf(write_cb, cbopaque, - "bins: bin size regs pgs allocated nmalloc" - " ndalloc newruns reruns curruns\n"); + "bins: size ind allocated nmalloc" + " ndalloc nrequests curregs regs pgs" + " newruns reruns curruns\n"); } CTL_GET("arenas.nbins", &nbins, unsigned); - for (j = 0, gap_start = UINT_MAX; j < nbins; j++) { + for (j = 0, in_gap = false; j < nbins; j++) { uint64_t nruns; CTL_IJ_GET("stats.arenas.0.bins.0.nruns", &nruns, uint64_t); - 
if (nruns == 0) { - if (gap_start == UINT_MAX) - gap_start = j; - } else { - size_t reg_size, run_size, allocated; + if (nruns == 0) + in_gap = true; + else { + size_t reg_size, run_size, curregs; uint32_t nregs; uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes; uint64_t reruns; size_t curruns; - if (gap_start != UINT_MAX) { - if (j > gap_start + 1) { - /* Gap of more than one size class. */ - malloc_cprintf(write_cb, cbopaque, - "[%u..%u]\n", gap_start, - j - 1); - } else { - /* Gap of one size class. */ - malloc_cprintf(write_cb, cbopaque, - "[%u]\n", gap_start); - } - gap_start = UINT_MAX; + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + in_gap = false; } CTL_J_GET("arenas.bin.0.size", ®_size, size_t); CTL_J_GET("arenas.bin.0.nregs", &nregs, uint32_t); CTL_J_GET("arenas.bin.0.run_size", &run_size, size_t); - CTL_IJ_GET("stats.arenas.0.bins.0.allocated", - &allocated, size_t); CTL_IJ_GET("stats.arenas.0.bins.0.nmalloc", &nmalloc, uint64_t); CTL_IJ_GET("stats.arenas.0.bins.0.ndalloc", &ndalloc, uint64_t); + CTL_IJ_GET("stats.arenas.0.bins.0.curregs", + &curregs, size_t); + CTL_IJ_GET("stats.arenas.0.bins.0.nrequests", + &nrequests, uint64_t); if (config_tcache) { - CTL_IJ_GET("stats.arenas.0.bins.0.nrequests", - &nrequests, uint64_t); CTL_IJ_GET("stats.arenas.0.bins.0.nfills", &nfills, uint64_t); CTL_IJ_GET("stats.arenas.0.bins.0.nflushes", @@ -125,33 +120,28 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, size_t); if (config_tcache) { malloc_cprintf(write_cb, cbopaque, - "%13u %5zu %4u %3zu %12zu %12"PRIu64 - " %12"PRIu64" %12"PRIu64" %12"PRIu64 + "%20zu %3u %12zu %12"PRIu64" %12"PRIu64 + " %12"PRIu64" %12zu %4u %3zu %12"PRIu64 " %12"PRIu64" %12"PRIu64" %12"PRIu64 " %12zu\n", - j, reg_size, nregs, run_size / page, - allocated, nmalloc, ndalloc, nrequests, - nfills, nflushes, nruns, reruns, curruns); + reg_size, j, curregs * reg_size, nmalloc, + ndalloc, nrequests, curregs, nregs, run_size + / page, nfills, nflushes, nruns, reruns, + curruns); } else { malloc_cprintf(write_cb, cbopaque, - "%13u %5zu %4u %3zu %12zu %12"PRIu64 - " %12"PRIu64" %12"PRIu64" %12"PRIu64 - " %12zu\n", - j, reg_size, nregs, run_size / page, - allocated, nmalloc, ndalloc, nruns, reruns, - curruns); + "%20zu %3u %12zu %12"PRIu64" %12"PRIu64 + " %12"PRIu64" %12zu %4u %3zu %12"PRIu64 + " %12"PRIu64" %12zu\n", + reg_size, j, curregs * reg_size, nmalloc, + ndalloc, nrequests, curregs, nregs, + run_size / page, nruns, reruns, curruns); } } } - if (gap_start != UINT_MAX) { - if (j > gap_start + 1) { - /* Gap of more than one size class. */ - malloc_cprintf(write_cb, cbopaque, "[%u..%u]\n", - gap_start, j - 1); - } else { - /* Gap of one size class. 
*/ - malloc_cprintf(write_cb, cbopaque, "[%u]\n", gap_start); - } + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); } } @@ -159,16 +149,15 @@ static void stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, unsigned i) { - size_t page, nlruns, j; - ssize_t gap_start; - - CTL_GET("arenas.page", &page, size_t); + unsigned nbins, nlruns, j; + bool in_gap; malloc_cprintf(write_cb, cbopaque, - "large: size pages nmalloc ndalloc nrequests" - " curruns\n"); - CTL_GET("arenas.nlruns", &nlruns, size_t); - for (j = 0, gap_start = -1; j < nlruns; j++) { + "large: size ind allocated nmalloc ndalloc" + " nrequests curruns\n"); + CTL_GET("arenas.nbins", &nbins, unsigned); + CTL_GET("arenas.nlruns", &nlruns, unsigned); + for (j = 0, in_gap = false; j < nlruns; j++) { uint64_t nmalloc, ndalloc, nrequests; size_t run_size, curruns; @@ -178,32 +167,82 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, uint64_t); CTL_IJ_GET("stats.arenas.0.lruns.0.nrequests", &nrequests, uint64_t); - if (nrequests == 0) { - if (gap_start == -1) - gap_start = j; - } else { + if (nrequests == 0) + in_gap = true; + else { CTL_J_GET("arenas.lrun.0.size", &run_size, size_t); CTL_IJ_GET("stats.arenas.0.lruns.0.curruns", &curruns, size_t); - if (gap_start != -1) { - malloc_cprintf(write_cb, cbopaque, "[%zu]\n", - j - gap_start); - gap_start = -1; + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + in_gap = false; + } + malloc_cprintf(write_cb, cbopaque, + "%20zu %3u %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 + " %12zu\n", + run_size, nbins + j, curruns * run_size, nmalloc, + ndalloc, nrequests, curruns); + } + } + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + } +} + +static void +stats_arena_hchunks_print(void (*write_cb)(void *, const char *), + void *cbopaque, unsigned i) +{ + unsigned nbins, nlruns, nhchunks, j; + bool in_gap; + + malloc_cprintf(write_cb, cbopaque, + "huge: size ind allocated nmalloc ndalloc" + " nrequests curhchunks\n"); + CTL_GET("arenas.nbins", &nbins, unsigned); + CTL_GET("arenas.nlruns", &nlruns, unsigned); + CTL_GET("arenas.nhchunks", &nhchunks, unsigned); + for (j = 0, in_gap = false; j < nhchunks; j++) { + uint64_t nmalloc, ndalloc, nrequests; + size_t hchunk_size, curhchunks; + + CTL_IJ_GET("stats.arenas.0.hchunks.0.nmalloc", &nmalloc, + uint64_t); + CTL_IJ_GET("stats.arenas.0.hchunks.0.ndalloc", &ndalloc, + uint64_t); + CTL_IJ_GET("stats.arenas.0.hchunks.0.nrequests", &nrequests, + uint64_t); + if (nrequests == 0) + in_gap = true; + else { + CTL_J_GET("arenas.hchunk.0.size", &hchunk_size, + size_t); + CTL_IJ_GET("stats.arenas.0.hchunks.0.curhchunks", + &curhchunks, size_t); + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + in_gap = false; } malloc_cprintf(write_cb, cbopaque, - "%13zu %5zu %12"PRIu64" %12"PRIu64" %12"PRIu64 + "%20zu %3u %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 " %12zu\n", - run_size, run_size / page, nmalloc, ndalloc, - nrequests, curruns); + hchunk_size, nbins + nlruns + j, + curhchunks * hchunk_size, nmalloc, ndalloc, + nrequests, curhchunks); } } - if (gap_start != -1) - malloc_cprintf(write_cb, cbopaque, "[%zu]\n", j - gap_start); + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + } } static void stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, - unsigned i, bool bins, bool large) + unsigned i, bool bins, bool large, bool huge) { unsigned nthreads; const char *dss; @@ -236,42 +275,51 @@ stats_arena_print(void 
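The rewritten printers drop the old "[start..end]" gap notation in favor of a single in_gap flag: any run of size classes with no activity collapses to one " ---" line. The pattern in isolation:

/*
 * Sketch only: collapse runs of inactive size classes into a single
 * "---" marker line, as the rewritten stats printers do.
 */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	static const unsigned long nrequests[] = {5, 0, 0, 0, 7, 0, 2};
	bool in_gap = false;
	size_t j;

	for (j = 0; j < sizeof(nrequests) / sizeof(nrequests[0]); j++) {
		if (nrequests[j] == 0) {
			in_gap = true;
			continue;
		}
		if (in_gap) {
			printf(" ---\n");
			in_gap = false;
		}
		printf("class %zu: %lu requests\n", j, nrequests[j]);
	}
	if (in_gap)
		printf(" ---\n");
	return (0);
}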
(*write_cb)(void *, const char *), void *cbopaque, nmadvise, nmadvise == 1 ? "" : "s", purged); malloc_cprintf(write_cb, cbopaque, - " allocated nmalloc ndalloc nrequests\n"); + " allocated nmalloc ndalloc" + " nrequests\n"); CTL_I_GET("stats.arenas.0.small.allocated", &small_allocated, size_t); CTL_I_GET("stats.arenas.0.small.nmalloc", &small_nmalloc, uint64_t); CTL_I_GET("stats.arenas.0.small.ndalloc", &small_ndalloc, uint64_t); CTL_I_GET("stats.arenas.0.small.nrequests", &small_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "small: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", + "small: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 + "\n", small_allocated, small_nmalloc, small_ndalloc, small_nrequests); CTL_I_GET("stats.arenas.0.large.allocated", &large_allocated, size_t); CTL_I_GET("stats.arenas.0.large.nmalloc", &large_nmalloc, uint64_t); CTL_I_GET("stats.arenas.0.large.ndalloc", &large_ndalloc, uint64_t); CTL_I_GET("stats.arenas.0.large.nrequests", &large_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "large: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", + "large: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 + "\n", large_allocated, large_nmalloc, large_ndalloc, large_nrequests); CTL_I_GET("stats.arenas.0.huge.allocated", &huge_allocated, size_t); CTL_I_GET("stats.arenas.0.huge.nmalloc", &huge_nmalloc, uint64_t); CTL_I_GET("stats.arenas.0.huge.ndalloc", &huge_ndalloc, uint64_t); CTL_I_GET("stats.arenas.0.huge.nrequests", &huge_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "huge: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", + "huge: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 + "\n", huge_allocated, huge_nmalloc, huge_ndalloc, huge_nrequests); malloc_cprintf(write_cb, cbopaque, - "total: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", + "total: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 + "\n", small_allocated + large_allocated + huge_allocated, small_nmalloc + large_nmalloc + huge_nmalloc, small_ndalloc + large_ndalloc + huge_ndalloc, small_nrequests + large_nrequests + huge_nrequests); - malloc_cprintf(write_cb, cbopaque, "active: %12zu\n", pactive * page); + malloc_cprintf(write_cb, cbopaque, "active: %12zu\n", + pactive * page); CTL_I_GET("stats.arenas.0.mapped", &mapped, size_t); - malloc_cprintf(write_cb, cbopaque, "mapped: %12zu\n", mapped); + malloc_cprintf(write_cb, cbopaque, "mapped: %12zu\n", + mapped); if (bins) stats_arena_bins_print(write_cb, cbopaque, i); if (large) stats_arena_lruns_print(write_cb, cbopaque, i); + if (huge) + stats_arena_hchunks_print(write_cb, cbopaque, i); } void @@ -286,6 +334,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, bool unmerged = true; bool bins = true; bool large = true; + bool huge = true; /* * Refresh stats, in case mallctl() was called by the application. 
@@ -328,6 +377,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, case 'l': large = false; break; + case 'h': + huge = false; + break; default:; } } @@ -515,7 +567,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, "\nMerged arenas stats:\n"); stats_arena_print(write_cb, cbopaque, - narenas, bins, large); + narenas, bins, large, huge); } } } @@ -541,7 +593,8 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, cbopaque, "\narenas[%u]:\n", i); stats_arena_print(write_cb, - cbopaque, i, bins, large); + cbopaque, i, bins, large, + huge); } } } diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index a8f7aed..028a971 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -321,7 +321,8 @@ TEST_BEGIN(test_arenas_constants) TEST_ARENAS_CONSTANT(size_t, quantum, QUANTUM); TEST_ARENAS_CONSTANT(size_t, page, PAGE); TEST_ARENAS_CONSTANT(unsigned, nbins, NBINS); - TEST_ARENAS_CONSTANT(size_t, nlruns, nlclasses); + TEST_ARENAS_CONSTANT(unsigned, nlruns, nlclasses); + TEST_ARENAS_CONSTANT(unsigned, nhchunks, nhclasses); #undef TEST_ARENAS_CONSTANT } @@ -363,6 +364,23 @@ TEST_BEGIN(test_arenas_lrun_constants) } TEST_END +TEST_BEGIN(test_arenas_hchunk_constants) +{ + +#define TEST_ARENAS_HCHUNK_CONSTANT(t, name, expected) do { \ + t name; \ + size_t sz = sizeof(t); \ + assert_d_eq(mallctl("arenas.hchunk.0."#name, &name, &sz, NULL, \ + 0), 0, "Unexpected mallctl() failure"); \ + assert_zu_eq(name, expected, "Incorrect "#name" size"); \ +} while (0) + + TEST_ARENAS_HCHUNK_CONSTANT(size_t, size, chunksize); + +#undef TEST_ARENAS_HCHUNK_CONSTANT +} +TEST_END + TEST_BEGIN(test_arenas_extend) { unsigned narenas_before, arena, narenas_after; @@ -420,6 +438,7 @@ main(void) test_arenas_constants, test_arenas_bin_constants, test_arenas_lrun_constants, + test_arenas_hchunk_constants, test_arenas_extend, test_stats_arenas)); } diff --git a/test/unit/stats.c b/test/unit/stats.c index 78c78cd..fd92d54 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -97,7 +97,7 @@ TEST_END TEST_BEGIN(test_stats_arenas_summary) { unsigned arena; - void *little, *large; + void *little, *large, *huge; uint64_t epoch; size_t sz; int expected = config_stats ? 0 : ENOENT; @@ -112,6 +112,8 @@ TEST_BEGIN(test_stats_arenas_summary) assert_ptr_not_null(little, "Unexpected mallocx() failure"); large = mallocx(arena_maxclass, 0); assert_ptr_not_null(large, "Unexpected mallocx() failure"); + huge = mallocx(chunksize, 0); + assert_ptr_not_null(huge, "Unexpected mallocx() failure"); assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, "Unexpected mallctl() failure"); @@ -139,6 +141,7 @@ TEST_BEGIN(test_stats_arenas_summary) dallocx(little, 0); dallocx(large, 0); + dallocx(huge, 0); } TEST_END @@ -251,11 +254,51 @@ TEST_BEGIN(test_stats_arenas_large) } TEST_END +TEST_BEGIN(test_stats_arenas_huge) +{ + unsigned arena; + void *p; + size_t sz, allocated; + uint64_t epoch, nmalloc, ndalloc; + int expected = config_stats ? 
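Because stats_print() now recognizes 'h' in its opts string, callers can suppress the per-size-class huge table just as 'b' and 'l' already suppress the bin and large-run tables. For example, via the public API:

/* Usage sketch: print jemalloc stats but skip the per-class huge table. */
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	void *p = malloc(1);	/* touch the allocator so stats are nonzero */

	malloc_stats_print(NULL, NULL, "h");
	free(p);
	return (0);
}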
0 : ENOENT; + + arena = 0; + assert_d_eq(mallctl("thread.arena", NULL, NULL, &arena, sizeof(arena)), + 0, "Unexpected mallctl() failure"); + + p = mallocx(chunksize, 0); + assert_ptr_not_null(p, "Unexpected mallocx() failure"); + + assert_d_eq(mallctl("epoch", NULL, NULL, &epoch, sizeof(epoch)), 0, + "Unexpected mallctl() failure"); + + sz = sizeof(size_t); + assert_d_eq(mallctl("stats.arenas.0.huge.allocated", &allocated, &sz, + NULL, 0), expected, "Unexpected mallctl() result"); + sz = sizeof(uint64_t); + assert_d_eq(mallctl("stats.arenas.0.huge.nmalloc", &nmalloc, &sz, + NULL, 0), expected, "Unexpected mallctl() result"); + assert_d_eq(mallctl("stats.arenas.0.huge.ndalloc", &ndalloc, &sz, + NULL, 0), expected, "Unexpected mallctl() result"); + + if (config_stats) { + assert_zu_gt(allocated, 0, + "allocated should be greater than zero"); + assert_zu_gt(nmalloc, 0, + "nmalloc should be greater than zero"); + assert_zu_ge(nmalloc, ndalloc, + "nmalloc should be at least as large as ndalloc"); + } + + dallocx(p, 0); +} +TEST_END + TEST_BEGIN(test_stats_arenas_bins) { unsigned arena; void *p; - size_t sz, allocated, curruns; + size_t sz, curruns, curregs; uint64_t epoch, nmalloc, ndalloc, nrequests, nfills, nflushes; uint64_t nruns, nreruns; int expected = config_stats ? 0 : ENOENT; @@ -273,9 +316,6 @@ TEST_BEGIN(test_stats_arenas_bins) assert_d_eq(mallctl("epoch", NULL, NULL, &epoch, sizeof(epoch)), 0, "Unexpected mallctl() failure"); - sz = sizeof(size_t); - assert_d_eq(mallctl("stats.arenas.0.bins.0.allocated", &allocated, &sz, - NULL, 0), expected, "Unexpected mallctl() result"); sz = sizeof(uint64_t); assert_d_eq(mallctl("stats.arenas.0.bins.0.nmalloc", &nmalloc, &sz, NULL, 0), expected, "Unexpected mallctl() result"); @@ -283,6 +323,9 @@ TEST_BEGIN(test_stats_arenas_bins) NULL, 0), expected, "Unexpected mallctl() result"); assert_d_eq(mallctl("stats.arenas.0.bins.0.nrequests", &nrequests, &sz, NULL, 0), expected, "Unexpected mallctl() result"); + sz = sizeof(size_t); + assert_d_eq(mallctl("stats.arenas.0.bins.0.curregs", &curregs, &sz, + NULL, 0), expected, "Unexpected mallctl() result"); assert_d_eq(mallctl("stats.arenas.0.bins.0.nfills", &nfills, &sz, NULL, 0), config_tcache ? expected : ENOENT, @@ -300,14 +343,14 @@ TEST_BEGIN(test_stats_arenas_bins) NULL, 0), expected, "Unexpected mallctl() result"); if (config_stats) { - assert_zu_gt(allocated, 0, - "allocated should be greater than zero"); assert_u64_gt(nmalloc, 0, "nmalloc should be greater than zero"); assert_u64_ge(nmalloc, ndalloc, "nmalloc should be at least as large as ndalloc"); assert_u64_gt(nrequests, 0, "nrequests should be greater than zero"); + assert_zu_gt(curregs, 0, + "allocated should be greater than zero"); if (config_tcache) { assert_u64_gt(nfills, 0, "At least one fill should have occurred"); @@ -336,7 +379,7 @@ TEST_BEGIN(test_stats_arenas_lruns) assert_d_eq(mallctl("thread.arena", NULL, NULL, &arena, sizeof(arena)), 0, "Unexpected mallctl() failure"); - p = mallocx(SMALL_MAXCLASS+1, 0); + p = mallocx(LARGE_MINCLASS, 0); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("epoch", NULL, NULL, &epoch, sizeof(epoch)), 0, @@ -368,6 +411,46 @@ TEST_BEGIN(test_stats_arenas_lruns) } TEST_END +TEST_BEGIN(test_stats_arenas_hchunks) +{ + unsigned arena; + void *p; + uint64_t epoch, nmalloc, ndalloc; + size_t curhchunks, sz; + int expected = config_stats ? 
0 : ENOENT; + + arena = 0; + assert_d_eq(mallctl("thread.arena", NULL, NULL, &arena, sizeof(arena)), + 0, "Unexpected mallctl() failure"); + + p = mallocx(chunksize, 0); + assert_ptr_not_null(p, "Unexpected mallocx() failure"); + + assert_d_eq(mallctl("epoch", NULL, NULL, &epoch, sizeof(epoch)), 0, + "Unexpected mallctl() failure"); + + sz = sizeof(uint64_t); + assert_d_eq(mallctl("stats.arenas.0.hchunks.0.nmalloc", &nmalloc, &sz, + NULL, 0), expected, "Unexpected mallctl() result"); + assert_d_eq(mallctl("stats.arenas.0.hchunks.0.ndalloc", &ndalloc, &sz, + NULL, 0), expected, "Unexpected mallctl() result"); + sz = sizeof(size_t); + assert_d_eq(mallctl("stats.arenas.0.hchunks.0.curhchunks", &curhchunks, + &sz, NULL, 0), expected, "Unexpected mallctl() result"); + + if (config_stats) { + assert_u64_gt(nmalloc, 0, + "nmalloc should be greater than zero"); + assert_u64_ge(nmalloc, ndalloc, + "nmalloc should be at least as large as ndalloc"); + assert_u64_gt(curhchunks, 0, + "At least one chunk should be currently allocated"); + } + + dallocx(p, 0); +} +TEST_END + int main(void) { @@ -379,6 +462,8 @@ main(void) test_stats_arenas_summary, test_stats_arenas_small, test_stats_arenas_large, + test_stats_arenas_huge, test_stats_arenas_bins, - test_stats_arenas_lruns)); + test_stats_arenas_lruns, + test_stats_arenas_hchunks)); } -- cgit v0.12 From 0cdabd2d489133e3cea8a00bdb9a986b24e57a66 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 14 Oct 2014 22:19:21 -0700 Subject: Update size class documentation. --- doc/jemalloc.xml.in | 110 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 26 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 8111fc1..fc01ad1 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -501,13 +501,11 @@ for (i = 0; i < nbins; i++) { possible to find metadata for user objects very quickly. User objects are broken into three categories according to size: - small, large, and huge. Small objects are smaller than one page. Large - objects are smaller than the chunk size. Huge objects are a multiple of - the chunk size. Small and large objects are managed entirely by arenas; - huge objects are additionally aggregated in a single data structure that is - shared by all threads. Huge objects are typically used by applications - infrequently enough that this single data structure is not a scalability - issue. + small, large, and huge. Small and large objects are managed entirely by + arenas; huge objects are additionally aggregated in a single data structure + that is shared by all threads. Huge objects are typically used by + applications infrequently enough that this single data structure is not a + scalability issue. Each chunk that is managed by an arena tracks its contents as runs of contiguous pages (unused, backing a set of small objects, or backing one @@ -516,18 +514,18 @@ for (i = 0; i < nbins; i++) { allocations in constant time. Small objects are managed in groups by page runs. Each run maintains - a frontier and free list to track which regions are in use. Allocation - requests that are no more than half the quantum (8 or 16, depending on - architecture) are rounded up to the nearest power of two that is at least - sizeof(double). All other small - object size classes are multiples of the quantum, spaced such that internal - fragmentation is limited to approximately 25% for all but the smallest size - classes. 
Allocation requests that are larger than the maximum small size - class, but small enough to fit in an arena-managed chunk (see the opt.lg_chunk option), are - rounded up to the nearest run size. Allocation requests that are too large - to fit in an arena-managed chunk are rounded up to the nearest multiple of - the chunk size. + a bitmap to track which regions are in use. Allocation requests that are no + more than half the quantum (8 or 16, depending on architecture) are rounded + up to the nearest power of two that is at least sizeof(double). All other object size + classes are multiples of the quantum, spaced such that there are four size + classes for each doubling in size, which limits internal fragmentation to + approximately 20% for all but the smallest size classes. Small size classes + are smaller than four times the page size, large size classes are smaller + than the chunk size (see the opt.lg_chunk option), and + huge size classes extend from the chunk size up to one size class less than + the full address space size. Allocations are packed tightly together, which can be an issue for multi-threaded applications. If you need to assure that allocations do not @@ -554,13 +552,13 @@ for (i = 0; i < nbins; i++) { - Small + Small lg [8] 16 - [16, 32, 48, ..., 128] + [16, 32, 48, 64, 80, 96, 112, 128] 32 @@ -580,17 +578,77 @@ for (i = 0; i < nbins; i++) { 512 - [2560, 3072, 3584] + [2560, 3072, 3584, 4096] + + + 1 KiB + [5 KiB, 6 KiB, 7 KiB, 8 KiB] + + + 2 KiB + [10 KiB, 12 KiB, 14 KiB] + + + Large + 2 KiB + [16 KiB] - Large 4 KiB - [4 KiB, 8 KiB, 12 KiB, ..., 4072 KiB] + [20 KiB, 24 KiB, 28 KiB, 32 KiB] + + + 8 KiB + [40 KiB, 48 KiB, 54 KiB, 64 KiB] + + + 16 KiB + [80 KiB, 96 KiB, 112 KiB, 128 KiB] + + + 32 KiB + [160 KiB, 192 KiB, 224 KiB, 256 KiB] + + + 64 KiB + [320 KiB, 384 KiB, 448 KiB, 512 KiB] + + + 128 KiB + [640 KiB, 768 KiB, 896 KiB, 1024 KiB] + + + 256 KiB + [1280 KiB, 1536 KiB, 1792 KiB, 2048 KiB] + + + 512 KiB + [2560 KiB, 3072 KiB, 3584 KiB] + + + Huge + 512 KiB + [4 MiB] + + + 1 MiB + [5 MiB, 6 MiB, 7 MiB, 8 MiB] + + + 2 MiB + [10 MiB, 12 MiB, 14 MiB, 16 MiB] - Huge 4 MiB - [4 MiB, 8 MiB, 12 MiB, ...] + [20 MiB, 24 MiB, 28 MiB, 32 MiB] + + + 8 MiB + [40 MiB, 48 MiB, 56 MiB, 64 MiB] + + + ... + ... -- cgit v0.12 From 9b41ac909facf4f09bb1b637b78ba647348e572e Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 14 Oct 2014 22:20:00 -0700 Subject: Fix huge allocation statistics. --- doc/jemalloc.xml.in | 5 +- include/jemalloc/internal/arena.h | 10 +- include/jemalloc/internal/private_symbols.txt | 3 + src/arena.c | 301 +++++++++++++++++++------- src/huge.c | 93 ++------ 5 files changed, 252 insertions(+), 160 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index fc01ad1..71b4cd1 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1719,9 +1719,8 @@ malloc_conf = "xmalloc:true";]]> Pointer to a counter that contains an approximate count of the current number of bytes in active pages. The estimate may be - high, but never low, because each arena rounds up to the nearest - multiple of the chunk size when computing its contribution to the - counter. Note that the epoch mallctl has no bearing on this counter. 
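The "four size classes for each doubling" spacing documented above is what bounds internal fragmentation near 20%: within a group the spacing is one quarter of the group's base, so the worst case rounds a request of 4*2^k + 1 bytes up to 5*2^k. A small model of that rounding (a simplified sketch, not jemalloc's actual size2index()/index2size() code, and only meaningful above the smallest classes):

/*
 * Sketch only: round a request up to a class of the form (4 + n) * delta,
 * n in 1..4, and report the worst-case waste for that rounding.
 */
#include <stddef.h>
#include <stdio.h>

static size_t
round_to_class(size_t s)
{
	size_t delta = 1;

	/* delta ends up as 1/8 of the smallest power of two >= s,
	 * i.e. the spacing within the size-class group containing s. */
	while ((delta << 3) < s)
		delta <<= 1;
	return ((s + delta - 1) & ~(delta - 1));
}

int
main(void)
{
	size_t s = 4097;			/* just over 4 KiB */
	size_t c = round_to_class(s);		/* -> 5120 (5 KiB) */

	printf("%zu -> %zu, waste %.1f%%\n", s, c,
	    100.0 * (double)(c - s) / (double)c);
	return (0);
}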
Furthermore, counter consistency is maintained via atomic operations, so it is necessary to use an atomic operation in diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index c31c8d7..16c04d2 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -338,9 +338,15 @@ extern size_t arena_maxclass; /* Max size class for arenas. */ extern unsigned nlclasses; /* Number of large size classes. */ extern unsigned nhclasses; /* Number of huge size classes. */ -void *arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t usize, - size_t alignment, bool *zero); +void *arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, + bool *zero); void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize); +void arena_chunk_ralloc_huge_similar(arena_t *arena, void *chunk, + size_t oldsize, size_t usize); +void arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, + size_t oldsize, size_t usize); +bool arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, + size_t oldsize, size_t usize, bool *zero); void arena_purge_all(arena_t *arena); void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, index_t binind, uint64_t prof_accumbytes); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 66d4822..8eec874 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -13,6 +13,9 @@ arena_choose arena_choose_hard arena_chunk_alloc_huge arena_chunk_dalloc_huge +arena_chunk_ralloc_huge_expand +arena_chunk_ralloc_huge_shrink +arena_chunk_ralloc_huge_similar arena_cleanup arena_dalloc arena_dalloc_bin diff --git a/src/arena.c b/src/arena.c index 74c3632..586e3c7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -411,52 +411,6 @@ arena_chunk_alloc_internal(arena_t *arena, size_t size, size_t alignment, return (chunk); } -void * -arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t usize, - size_t alignment, bool *zero) -{ - void *ret; - chunk_alloc_t *chunk_alloc; - chunk_dalloc_t *chunk_dalloc; - - malloc_mutex_lock(&arena->lock); - chunk_alloc = arena->chunk_alloc; - chunk_dalloc = arena->chunk_dalloc; - if (config_stats) { - index_t index = size2index(usize) - nlclasses - NBINS; - - /* Optimistically update stats prior to unlocking. */ - arena->stats.allocated_huge += usize; - arena->stats.nmalloc_huge++; - arena->stats.hstats[index].nmalloc++; - arena->stats.hstats[index].curhchunks++; - arena->stats.mapped += usize; - } - arena->nactive += (usize >> LG_PAGE); - malloc_mutex_unlock(&arena->lock); - - ret = chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, - new_addr, usize, alignment, zero); - if (config_stats) { - if (ret != NULL) - stats_cactive_add(usize); - else { - index_t index = size2index(usize) - nlclasses - NBINS; - - malloc_mutex_lock(&arena->lock); - /* Revert optimistic stats updates. 
*/ - arena->stats.allocated_huge -= usize; - arena->stats.nmalloc_huge--; - arena->stats.hstats[index].nmalloc--; - arena->stats.hstats[index].curhchunks--; - arena->stats.mapped -= usize; - malloc_mutex_unlock(&arena->lock); - } - } - - return (ret); -} - static arena_chunk_t * arena_chunk_init_hard(arena_t *arena) { @@ -529,16 +483,150 @@ arena_chunk_alloc(arena_t *arena) } static void -arena_chunk_dalloc_internal(arena_t *arena, arena_chunk_t *chunk) +arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) +{ + + assert(arena_mapbits_allocated_get(chunk, map_bias) == 0); + assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0); + assert(arena_mapbits_unallocated_size_get(chunk, map_bias) == + arena_maxrun); + assert(arena_mapbits_unallocated_size_get(chunk, chunk_npages-1) == + arena_maxrun); + assert(arena_mapbits_dirty_get(chunk, map_bias) == + arena_mapbits_dirty_get(chunk, chunk_npages-1)); + + /* + * Remove run from the runs_avail tree, so that the arena does not use + * it. + */ + arena_avail_remove(arena, chunk, map_bias, chunk_npages-map_bias); + + if (arena->spare != NULL) { + arena_chunk_t *spare = arena->spare; + chunk_dalloc_t *chunk_dalloc; + + arena->spare = chunk; + if (arena_mapbits_dirty_get(spare, map_bias) != 0) { + arena_dirty_remove(arena, spare, map_bias, + chunk_npages-map_bias); + } + chunk_dalloc = arena->chunk_dalloc; + malloc_mutex_unlock(&arena->lock); + chunk_dalloc((void *)spare, chunksize, arena->ind); + malloc_mutex_lock(&arena->lock); + if (config_stats) + arena->stats.mapped -= chunksize; + } else + arena->spare = chunk; +} + +static void +arena_huge_malloc_stats_update(arena_t *arena, size_t usize) +{ + index_t index = size2index(usize) - nlclasses - NBINS; + + cassert(config_stats); + + arena->stats.nmalloc_huge++; + arena->stats.allocated_huge += usize; + arena->stats.hstats[index].nmalloc++; + arena->stats.hstats[index].curhchunks++; +} + +static void +arena_huge_malloc_stats_update_undo(arena_t *arena, size_t usize) +{ + index_t index = size2index(usize) - nlclasses - NBINS; + + cassert(config_stats); + + arena->stats.nmalloc_huge--; + arena->stats.allocated_huge -= usize; + arena->stats.hstats[index].nmalloc--; + arena->stats.hstats[index].curhchunks--; +} + +static void +arena_huge_dalloc_stats_update(arena_t *arena, size_t usize) +{ + index_t index = size2index(usize) - nlclasses - NBINS; + + cassert(config_stats); + + arena->stats.ndalloc_huge++; + arena->stats.allocated_huge -= usize; + arena->stats.hstats[index].ndalloc++; + arena->stats.hstats[index].curhchunks--; +} + +static void +arena_huge_dalloc_stats_update_undo(arena_t *arena, size_t usize) +{ + index_t index = size2index(usize) - nlclasses - NBINS; + + cassert(config_stats); + + arena->stats.ndalloc_huge--; + arena->stats.allocated_huge += usize; + arena->stats.hstats[index].ndalloc--; + arena->stats.hstats[index].curhchunks++; +} + +static void +arena_huge_ralloc_stats_update(arena_t *arena, size_t oldsize, size_t usize) +{ + + arena_huge_dalloc_stats_update(arena, oldsize); + arena_huge_malloc_stats_update(arena, usize); +} + +static void +arena_huge_ralloc_stats_update_undo(arena_t *arena, size_t oldsize, + size_t usize) +{ + + arena_huge_dalloc_stats_update_undo(arena, oldsize); + arena_huge_malloc_stats_update_undo(arena, usize); +} + +void * +arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, + bool *zero) { + void *ret; + chunk_alloc_t *chunk_alloc; chunk_dalloc_t *chunk_dalloc; + size_t csize = CHUNK_CEILING(usize); + 
malloc_mutex_lock(&arena->lock); + chunk_alloc = arena->chunk_alloc; chunk_dalloc = arena->chunk_dalloc; + if (config_stats) { + /* Optimistically update stats prior to unlocking. */ + arena_huge_malloc_stats_update(arena, usize); + arena->stats.mapped += usize; + } + arena->nactive += (usize >> LG_PAGE); malloc_mutex_unlock(&arena->lock); - chunk_dalloc((void *)chunk, chunksize, arena->ind); - malloc_mutex_lock(&arena->lock); + + ret = chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, NULL, + csize, alignment, zero); + if (ret == NULL) { + /* Revert optimistic stats updates. */ + malloc_mutex_lock(&arena->lock); + if (config_stats) { + arena_huge_malloc_stats_update_undo(arena, usize); + arena->stats.mapped -= usize; + } + arena->nactive -= (usize >> LG_PAGE); + malloc_mutex_unlock(&arena->lock); + return (NULL); + } + if (config_stats) - arena->stats.mapped -= chunksize; + stats_cactive_add(usize); + + return (ret); } void @@ -549,50 +637,101 @@ arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize) malloc_mutex_lock(&arena->lock); chunk_dalloc = arena->chunk_dalloc; if (config_stats) { - index_t index = size2index(usize) - nlclasses - NBINS; - - arena->stats.ndalloc_huge++; - arena->stats.allocated_huge -= usize; - arena->stats.hstats[index].ndalloc++; - arena->stats.hstats[index].curhchunks--; + arena_huge_dalloc_stats_update(arena, usize); arena->stats.mapped -= usize; stats_cactive_sub(usize); } arena->nactive -= (usize >> LG_PAGE); malloc_mutex_unlock(&arena->lock); - chunk_dalloc(chunk, usize, arena->ind); + chunk_dalloc(chunk, CHUNK_CEILING(usize), arena->ind); } -static void -arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) +void +arena_chunk_ralloc_huge_similar(arena_t *arena, void *chunk, size_t oldsize, + size_t usize) { - assert(arena_mapbits_allocated_get(chunk, map_bias) == 0); - assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0); - assert(arena_mapbits_unallocated_size_get(chunk, map_bias) == - arena_maxrun); - assert(arena_mapbits_unallocated_size_get(chunk, chunk_npages-1) == - arena_maxrun); - assert(arena_mapbits_dirty_get(chunk, map_bias) == - arena_mapbits_dirty_get(chunk, chunk_npages-1)); + assert(CHUNK_CEILING(oldsize) == CHUNK_CEILING(usize)); + assert(oldsize != usize); - /* - * Remove run from the runs_avail tree, so that the arena does not use - * it. 
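The new arena_chunk_alloc_huge() updates the huge stats optimistically so the arena lock is not held across the chunk allocation itself, then re-takes the lock and backs the update out if the allocation fails. The locking shape in isolation (stand-in types, pthreads for the mutex, plain malloc standing in for chunk_alloc_arena()):

/*
 * Sketch only: optimistic stats update outside the expensive operation,
 * with an undo path on failure.  Not jemalloc source.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdlib.h>

typedef struct {
	pthread_mutex_t lock;
	size_t nmalloc_huge;
	size_t allocated_huge;
} stats_t;

static void *
alloc_huge(stats_t *stats, size_t usize)
{
	void *ret;

	/* Optimistically update stats, then drop the lock. */
	pthread_mutex_lock(&stats->lock);
	stats->nmalloc_huge++;
	stats->allocated_huge += usize;
	pthread_mutex_unlock(&stats->lock);

	ret = malloc(usize);	/* stand-in for the real chunk allocation */
	if (ret == NULL) {
		/* Revert the optimistic update. */
		pthread_mutex_lock(&stats->lock);
		stats->nmalloc_huge--;
		stats->allocated_huge -= usize;
		pthread_mutex_unlock(&stats->lock);
	}
	return (ret);
}

int
main(void)
{
	stats_t stats;
	void *p;

	pthread_mutex_init(&stats.lock, NULL);
	stats.nmalloc_huge = 0;
	stats.allocated_huge = 0;
	p = alloc_huge(&stats, (size_t)1 << 20);
	free(p);
	pthread_mutex_destroy(&stats.lock);
	return (0);
}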
- */ - arena_avail_remove(arena, chunk, map_bias, chunk_npages-map_bias); + malloc_mutex_lock(&arena->lock); + if (config_stats) + arena_huge_ralloc_stats_update(arena, oldsize, usize); + if (oldsize < usize) { + size_t udiff = usize - oldsize; + arena->nactive += udiff >> LG_PAGE; + if (config_stats) + stats_cactive_add(udiff); + } else { + size_t udiff = oldsize - usize; + arena->nactive -= udiff >> LG_PAGE; + if (config_stats) + stats_cactive_sub(udiff); + } + malloc_mutex_unlock(&arena->lock); +} - if (arena->spare != NULL) { - arena_chunk_t *spare = arena->spare; +void +arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize, + size_t usize) +{ + chunk_dalloc_t *chunk_dalloc; + size_t udiff = oldsize - usize; + size_t cdiff = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize); - arena->spare = chunk; - if (arena_mapbits_dirty_get(spare, map_bias) != 0) { - arena_dirty_remove(arena, spare, map_bias, - chunk_npages-map_bias); + malloc_mutex_lock(&arena->lock); + chunk_dalloc = arena->chunk_dalloc; + if (config_stats) { + arena_huge_ralloc_stats_update(arena, oldsize, usize); + if (cdiff != 0) { + arena->stats.mapped -= cdiff; + stats_cactive_sub(udiff); } - arena_chunk_dalloc_internal(arena, spare); - } else - arena->spare = chunk; + } + arena->nactive -= udiff >> LG_PAGE; + malloc_mutex_unlock(&arena->lock); + if (cdiff != 0) + chunk_dalloc(chunk + CHUNK_CEILING(usize), cdiff, arena->ind); +} + +bool +arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, + size_t usize, bool *zero) +{ + chunk_alloc_t *chunk_alloc; + chunk_dalloc_t *chunk_dalloc; + size_t udiff = usize - oldsize; + size_t cdiff = CHUNK_CEILING(usize) - CHUNK_CEILING(oldsize); + + malloc_mutex_lock(&arena->lock); + chunk_alloc = arena->chunk_alloc; + chunk_dalloc = arena->chunk_dalloc; + if (config_stats) { + /* Optimistically update stats prior to unlocking. */ + arena_huge_ralloc_stats_update(arena, oldsize, usize); + arena->stats.mapped += cdiff; + } + arena->nactive += (udiff >> LG_PAGE); + malloc_mutex_unlock(&arena->lock); + + if (chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, chunk + + CHUNK_CEILING(oldsize), cdiff, chunksize, zero) == NULL) { + /* Revert optimistic stats updates. */ + malloc_mutex_lock(&arena->lock); + if (config_stats) { + arena_huge_ralloc_stats_update_undo(arena, + oldsize, usize); + arena->stats.mapped -= cdiff; + } + arena->nactive -= (udiff >> LG_PAGE); + malloc_mutex_unlock(&arena->lock); + return (true); + } + + if (config_stats) + stats_cactive_add(udiff); + + return (false); } static arena_run_t * diff --git a/src/huge.c b/src/huge.c index 5f46241..740a93f 100644 --- a/src/huge.c +++ b/src/huge.c @@ -31,15 +31,11 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, bool zero, bool try_tcache) { void *ret; - size_t csize; extent_node_t *node; bool is_zeroed; /* Allocate one or more contiguous chunks for this request. */ - csize = CHUNK_CEILING(usize); - assert(csize >= usize); - /* Allocate an extent node with which to track the chunk. 
*/ node = ipalloct(tsd, CACHELINE_CEILING(sizeof(extent_node_t)), CACHELINE, false, try_tcache, NULL); @@ -56,7 +52,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, base_node_dalloc(node); return (NULL); } - ret = arena_chunk_alloc_huge(arena, NULL, csize, alignment, &is_zeroed); + ret = arena_chunk_alloc_huge(arena, usize, alignment, &is_zeroed); if (ret == NULL) { idalloct(tsd, node, try_tcache); return (NULL); @@ -105,25 +101,6 @@ huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); #endif static void -huge_ralloc_no_move_stats_update(arena_t *arena, size_t oldsize, size_t usize) -{ - index_t oldindex = size2index(oldsize) - nlclasses - NBINS; - index_t index = size2index(usize) - nlclasses - NBINS; - - cassert(config_stats); - - arena->stats.ndalloc_huge++; - arena->stats.allocated_huge -= oldsize; - arena->stats.hstats[oldindex].ndalloc++; - arena->stats.hstats[oldindex].curhchunks--; - - arena->stats.nmalloc_huge++; - arena->stats.allocated_huge += usize; - arena->stats.hstats[index].nmalloc++; - arena->stats.hstats[index].curhchunks++; -} - -static void huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, size_t size, size_t extra, bool zero) { @@ -135,34 +112,33 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, while (usize < s2u(size+extra) && (usize_next = s2u(usize+1)) < oldsize) usize = usize_next; - malloc_mutex_lock(&huge_mtx); + if (oldsize == usize) + return; + malloc_mutex_lock(&huge_mtx); key.addr = ptr; node = extent_tree_ad_search(&huge, &key); assert(node != NULL); assert(node->addr == ptr); - arena = node->arena; + /* Update the size of the huge allocation. */ + assert(node->size != usize); + node->size = usize; + malloc_mutex_unlock(&huge_mtx); - /* Update the size of the huge allocation if it changed. */ - if (oldsize != usize) { - assert(node->size != usize); - node->size = usize; - } + /* Fill if necessary (shrinking). */ + if (config_fill && unlikely(opt_junk) && oldsize > usize) + memset(ptr + usize, 0x5a, oldsize - usize); - malloc_mutex_unlock(&huge_mtx); + arena_chunk_ralloc_huge_similar(arena, ptr, oldsize, usize); - /* Fill if necessary. */ + /* Fill if necessary (growing). */ if (oldsize < usize) { if (zero || (config_fill && unlikely(opt_zero))) memset(ptr + oldsize, 0, usize - oldsize); else if (config_fill && unlikely(opt_junk)) memset(ptr + oldsize, 0xa5, usize - oldsize); - } else if (config_fill && unlikely(opt_junk) && oldsize > usize) - memset(ptr + usize, 0x5a, oldsize - usize); - - if (config_stats) - huge_ralloc_no_move_stats_update(arena, oldsize, usize); + } } static void @@ -170,44 +146,28 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) { extent_node_t *node, key; arena_t *arena; - void *excess_addr; - size_t excess_size; malloc_mutex_lock(&huge_mtx); - key.addr = ptr; node = extent_tree_ad_search(&huge, &key); assert(node != NULL); assert(node->addr == ptr); - arena = node->arena; - /* Update the size of the huge allocation. */ node->size = usize; - malloc_mutex_unlock(&huge_mtx); - excess_addr = node->addr + CHUNK_CEILING(usize); - excess_size = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize); - /* Zap the excess chunks. 
*/ huge_dalloc_junk(ptr + usize, oldsize - usize); - if (excess_size > 0) - arena_chunk_dalloc_huge(arena, excess_addr, excess_size); - - if (config_stats) - huge_ralloc_no_move_stats_update(arena, oldsize, usize); + arena_chunk_ralloc_huge_shrink(arena, ptr, oldsize, usize); } static bool huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { size_t usize; - void *expand_addr; - size_t expand_size; extent_node_t *node, key; arena_t *arena; bool is_zeroed; - void *ret; usize = s2u(size); if (usize == 0) { @@ -215,19 +175,12 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { return (true); } - expand_addr = ptr + CHUNK_CEILING(oldsize); - expand_size = CHUNK_CEILING(usize) - CHUNK_CEILING(oldsize); - assert(expand_size > 0); - malloc_mutex_lock(&huge_mtx); - key.addr = ptr; node = extent_tree_ad_search(&huge, &key); assert(node != NULL); assert(node->addr == ptr); - arena = node->arena; - malloc_mutex_unlock(&huge_mtx); /* @@ -235,12 +188,10 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { * it is possible to make correct junk/zero fill decisions below. */ is_zeroed = zero; - ret = arena_chunk_alloc_huge(arena, expand_addr, expand_size, chunksize, - &is_zeroed); - if (ret == NULL) - return (true); - assert(ret == expand_addr); + if (arena_chunk_ralloc_huge_expand(arena, ptr, oldsize, usize, + &is_zeroed)) + return (true); malloc_mutex_lock(&huge_mtx); /* Update the size of the huge allocation. */ @@ -254,9 +205,6 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { memset(ptr + oldsize, 0, usize - oldsize); } - if (config_stats) - huge_ralloc_no_move_stats_update(arena, oldsize, usize); - return (false); } @@ -363,19 +311,16 @@ huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache) extent_node_t *node, key; malloc_mutex_lock(&huge_mtx); - /* Extract from tree of huge allocations. */ key.addr = ptr; node = extent_tree_ad_search(&huge, &key); assert(node != NULL); assert(node->addr == ptr); extent_tree_ad_remove(&huge, node); - malloc_mutex_unlock(&huge_mtx); huge_dalloc_junk(node->addr, node->size); - arena_chunk_dalloc_huge(node->arena, node->addr, - CHUNK_CEILING(node->size)); + arena_chunk_dalloc_huge(node->arena, node->addr, node->size); idalloct(tsd, node, try_tcache); } -- cgit v0.12 From d1f3ab4008f95e8928777d28a40aff9708701503 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 14 Oct 2014 22:31:49 -0700 Subject: Fix line wrapping. --- INSTALL | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/INSTALL b/INSTALL index a00960a..b8459a8 100644 --- a/INSTALL +++ b/INSTALL @@ -191,8 +191,8 @@ any of the following arguments (not a definitive list) to 'configure': --with-lg-page= Specify the base 2 log of the system page size. This option is only useful - when cross compiling, since the configure script automatically determines the - host's page size by default. + when cross compiling, since the configure script automatically determines + the host's page size by default. --with-lg-page-sizes= Specify the comma-separated base 2 logs of the page sizes to support. This @@ -243,16 +243,16 @@ any of the following arguments (not a definitive list) to 'configure': safe values for the most commonly used modern architectures, there is a wrinkle related to GNU libc (glibc) that may impact your choice of . 
On most modern architectures, this mandates 16-byte alignment - (=4), but the glibc developers chose not to meet this requirement - for performance reasons. An old discussion can be found at + (=4), but the glibc developers chose not to meet this + requirement for performance reasons. An old discussion can be found at https://sourceware.org/bugzilla/show_bug.cgi?id=206 . Unlike glibc, jemalloc does follow the C standard by default (caveat: jemalloc - technically cheats if --with-lg-tiny-min is smaller than --with-lg-quantum), - but the fact that Linux systems already work around this allocator - noncompliance means that it is generally safe in practice to let jemalloc's - minimum alignment follow glibc's lead. If you specify --with-lg-quantum=3 - during configuration, jemalloc will provide additional size classes that - are not 16-byte-aligned (24, 40, and 56, assuming + technically cheats if --with-lg-tiny-min is smaller than + --with-lg-quantum), but the fact that Linux systems already work around + this allocator noncompliance means that it is generally safe in practice to + let jemalloc's minimum alignment follow glibc's lead. If you specify + --with-lg-quantum=3 during configuration, jemalloc will provide additional + size classes that are not 16-byte-aligned (24, 40, and 56, assuming --with-lg-size-class-group=2). --with-lg-tiny-min= -- cgit v0.12 From acbcbad1e18d3082ee6ce851994ed03f63ae55bd Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 15 Oct 2014 14:49:14 -0700 Subject: Thwart compiler optimizations. --- test/stress/microbench.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/stress/microbench.c b/test/stress/microbench.c index 980eca4..aefbe6a 100644 --- a/test/stress/microbench.c +++ b/test/stress/microbench.c @@ -114,6 +114,10 @@ malloc_mus_free(void) void *p; p = malloc(1); + if (p == NULL) { + test_fail("Unexpected malloc() failure"); + return; + } malloc_usable_size(p); free(p); } @@ -124,6 +128,10 @@ malloc_sallocx_free(void) void *p; p = malloc(1); + if (p == NULL) { + test_fail("Unexpected malloc() failure"); + return; + } if (sallocx(p, 0) < 1) test_fail("Unexpected sallocx() failure"); free(p); @@ -143,6 +151,10 @@ malloc_nallocx_free(void) void *p; p = malloc(1); + if (p == NULL) { + test_fail("Unexpected malloc() failure"); + return; + } if (nallocx(1, 0) < 1) test_fail("Unexpected nallocx() failure"); free(p); -- cgit v0.12 From bf8d6a109200bf10f1c942ad914aa8cb5f279e17 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 15 Oct 2014 16:18:42 -0700 Subject: Add small run utilization to stats output. Add the 'util' column, which reports the proportion of available regions that are currently in use for each small size class. Small run utilization is the complement of external fragmentation. For example, utilization of 0.75 indicates that 25% of small run memory is consumed by external fragmentation, in other (more obtuse) words, 33% external fragmentation overhead. This resolves #27. 
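For concreteness, the arithmetic behind the new column can be exercised on its own. The standalone C sketch below mirrors the milli-unit rounding used in the stats.c hunk that follows (availregs = nregs * curruns, milli = 1000 * curregs / availregs); the bin shape and counts are invented, and util_format() is a made-up helper rather than a jemalloc function:

    #include <assert.h>
    #include <stdio.h>

    /* Render curregs/(nregs*curruns) as "x.yyy" in milli units. */
    static void
    util_format(char *util, size_t len, size_t curregs, unsigned nregs,
        size_t curruns)
    {
            size_t availregs = (size_t)nregs * curruns;
            size_t milli = (availregs != 0) ? (1000 * curregs) / availregs : 1000;

            assert(milli <= 1000);
            if (milli < 10)
                    snprintf(util, len, "0.00%zu", milli);
            else if (milli < 100)
                    snprintf(util, len, "0.0%zu", milli);
            else if (milli < 1000)
                    snprintf(util, len, "0.%zu", milli);
            else
                    snprintf(util, len, "1");
    }

    int
    main(void)
    {
            char util[6];   /* "x.yyy" */

            /* 3 runs of 64 regions each, 144 regions live. */
            util_format(util, sizeof(util), 144, 64, 3);
            printf("util: %s\n", util);     /* Prints "util: 0.750". */
            return (0);
    }

With 144 of 192 regions live the column reads 0.750; the 48 free regions are 25% of capacity, but 48/144 is roughly 33% relative to the live regions, which is the "33% external fragmentation overhead" reading above.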
--- src/stats.c | 50 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/src/stats.c b/src/stats.c index 16a18c5..054f033 100644 --- a/src/stats.c +++ b/src/stats.c @@ -69,14 +69,14 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, if (config_tcache) { malloc_cprintf(write_cb, cbopaque, "bins: size ind allocated nmalloc" - " ndalloc nrequests curregs regs pgs" - " nfills nflushes newruns reruns" - " curruns\n"); + " ndalloc nrequests curregs curruns regs" + " pgs util nfills nflushes newruns" + " reruns\n"); } else { malloc_cprintf(write_cb, cbopaque, "bins: size ind allocated nmalloc" - " ndalloc nrequests curregs regs pgs" - " newruns reruns curruns\n"); + " ndalloc nrequests curregs curruns regs" + " pgs util newruns reruns\n"); } CTL_GET("arenas.nbins", &nbins, unsigned); for (j = 0, in_gap = false; j < nbins; j++) { @@ -86,11 +86,12 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, if (nruns == 0) in_gap = true; else { - size_t reg_size, run_size, curregs; + size_t reg_size, run_size, curregs, availregs, milli; + size_t curruns; uint32_t nregs; uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes; uint64_t reruns; - size_t curruns; + char util[6]; /* "x.yyy". */ if (in_gap) { malloc_cprintf(write_cb, cbopaque, @@ -118,24 +119,41 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, uint64_t); CTL_IJ_GET("stats.arenas.0.bins.0.curruns", &curruns, size_t); + + availregs = nregs * curruns; + milli = (availregs != 0) ? (1000 * curregs) / availregs + : 1000; + assert(milli <= 1000); + if (milli < 10) { + malloc_snprintf(util, sizeof(util), "0.00%zu", + milli); + } else if (milli < 100) { + malloc_snprintf(util, sizeof(util), "0.0%zu", + milli); + } else if (milli < 1000) { + malloc_snprintf(util, sizeof(util), "0.%zu", + milli); + } else + malloc_snprintf(util, sizeof(util), "1"); + if (config_tcache) { malloc_cprintf(write_cb, cbopaque, "%20zu %3u %12zu %12"PRIu64" %12"PRIu64 - " %12"PRIu64" %12zu %4u %3zu %12"PRIu64 + " %12"PRIu64" %12zu %12zu %4u %3zu %-5s" " %12"PRIu64" %12"PRIu64" %12"PRIu64 - " %12zu\n", + " %12"PRIu64"\n", reg_size, j, curregs * reg_size, nmalloc, - ndalloc, nrequests, curregs, nregs, run_size - / page, nfills, nflushes, nruns, reruns, - curruns); + ndalloc, nrequests, curregs, curruns, nregs, + run_size / page, util, nfills, nflushes, + nruns, reruns); } else { malloc_cprintf(write_cb, cbopaque, "%20zu %3u %12zu %12"PRIu64" %12"PRIu64 - " %12"PRIu64" %12zu %4u %3zu %12"PRIu64 - " %12"PRIu64" %12zu\n", + " %12"PRIu64" %12zu %12zu %4u %3zu %-5s" + " %12"PRIu64" %12"PRIu64"\n", reg_size, j, curregs * reg_size, nmalloc, - ndalloc, nrequests, curregs, nregs, - run_size / page, nruns, reruns, curruns); + ndalloc, nrequests, curregs, curruns, nregs, + run_size / page, util, nruns, reruns); } } } -- cgit v0.12 From 9673983443a0782d975fbcb5d8457cfd411b8b56 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 15 Oct 2014 18:02:02 -0700 Subject: Purge/zero sub-chunk huge allocations as necessary. Purge trailing pages during shrinking huge reallocation when resulting size is not a multiple of the chunk size. Similarly, zero pages if necessary during growing huge reallocation when the resulting size is not a multiple of the chunk size. 
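To make the sub-chunk case concrete: with a hypothetical 4 MiB chunk size (EX_CHUNKSIZE below is an invented stand-in for jemalloc's chunksize and CHUNK_CEILING()), shrinking an 8 MiB huge allocation to 5 MiB leaves the final chunk mapped, so only its 3 MiB tail can be purged rather than unmapped. A minimal sketch of that arithmetic:

    #include <stdio.h>

    /* Invented 4 MiB chunk size; the real value is jemalloc's chunksize. */
    #define EX_CHUNKSIZE            ((size_t)4 << 20)
    #define EX_CHUNK_CEILING(s)     (((s) + EX_CHUNKSIZE - 1) & ~(EX_CHUNKSIZE - 1))

    int
    main(void)
    {
            size_t oldsize = (size_t)8 << 20;       /* 8 MiB huge allocation. */
            size_t usize = (size_t)5 << 20;         /* Shrink in place to 5 MiB. */

            /* Whole trailing chunks can be handed back to the chunk layer... */
            size_t cdiff = EX_CHUNK_CEILING(oldsize) - EX_CHUNK_CEILING(usize);
            /* ...but the sub-chunk tail stays mapped and can only be purged. */
            size_t sdiff = EX_CHUNK_CEILING(usize) - usize;

            printf("unmap %zu bytes, purge %zu bytes\n", cdiff, sdiff);
            /* Prints: unmap 0 bytes, purge 3145728 bytes. */
            return (0);
    }

The growing direction is symmetric: the tail of the old final chunk may have been purged earlier, so it has to be re-zeroed when zeroed memory is expected, which is what the node->zeroed bookkeeping in the huge.c changes below tracks.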
--- src/huge.c | 75 ++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 24 deletions(-) diff --git a/src/huge.c b/src/huge.c index 740a93f..1734ff6 100644 --- a/src/huge.c +++ b/src/huge.c @@ -61,18 +61,18 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, /* Insert node into huge. */ node->addr = ret; node->size = usize; + node->zeroed = is_zeroed; node->arena = arena; malloc_mutex_lock(&huge_mtx); extent_tree_ad_insert(&huge, node); malloc_mutex_unlock(&huge_mtx); - if (config_fill && !zero) { - if (unlikely(opt_junk)) - memset(ret, 0xa5, usize); - else if (unlikely(opt_zero) && !is_zeroed) + if (zero || (config_fill && unlikely(opt_zero))) { + if (!is_zeroed) memset(ret, 0, usize); - } + } else if (config_fill && unlikely(opt_junk)) + memset(ret, 0xa5, usize); return (ret); } @@ -105,6 +105,7 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, size_t size, size_t extra, bool zero) { size_t usize_next; + bool zeroed; extent_node_t *node, key; arena_t *arena; @@ -115,6 +116,17 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, if (oldsize == usize) return; + /* Fill if necessary (shrinking). */ + if (oldsize > usize) { + size_t sdiff = CHUNK_CEILING(usize) - usize; + zeroed = (sdiff != 0) ? !pages_purge(ptr + usize, sdiff) : true; + if (config_fill && unlikely(opt_junk)) { + memset(ptr + usize, 0x5a, oldsize - usize); + zeroed = false; + } + } else + zeroed = true; + malloc_mutex_lock(&huge_mtx); key.addr = ptr; node = extent_tree_ad_search(&huge, &key); @@ -124,19 +136,18 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, /* Update the size of the huge allocation. */ assert(node->size != usize); node->size = usize; + /* Clear node->zeroed if zeroing failed above. */ + node->zeroed = (node->zeroed && zeroed); malloc_mutex_unlock(&huge_mtx); - /* Fill if necessary (shrinking). */ - if (config_fill && unlikely(opt_junk) && oldsize > usize) - memset(ptr + usize, 0x5a, oldsize - usize); - arena_chunk_ralloc_huge_similar(arena, ptr, oldsize, usize); /* Fill if necessary (growing). */ if (oldsize < usize) { - if (zero || (config_fill && unlikely(opt_zero))) - memset(ptr + oldsize, 0, usize - oldsize); - else if (config_fill && unlikely(opt_junk)) + if (zero || (config_fill && unlikely(opt_zero))) { + if (!zeroed) + memset(ptr + oldsize, 0, usize - oldsize); + } else if (config_fill && unlikely(opt_junk)) memset(ptr + oldsize, 0xa5, usize - oldsize); } } @@ -144,9 +155,18 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, static void huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) { + size_t sdiff; + bool zeroed; extent_node_t *node, key; arena_t *arena; + sdiff = CHUNK_CEILING(usize) - usize; + zeroed = (sdiff != 0) ? !pages_purge(ptr + usize, sdiff) : true; + if (config_fill && unlikely(opt_junk)) { + huge_dalloc_junk(ptr + usize, oldsize - usize); + zeroed = false; + } + malloc_mutex_lock(&huge_mtx); key.addr = ptr; node = extent_tree_ad_search(&huge, &key); @@ -155,10 +175,11 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) arena = node->arena; /* Update the size of the huge allocation. */ node->size = usize; + /* Clear node->zeroed if zeroing failed above. */ + node->zeroed = (node->zeroed && zeroed); malloc_mutex_unlock(&huge_mtx); /* Zap the excess chunks. 
*/ - huge_dalloc_junk(ptr + usize, oldsize - usize); arena_chunk_ralloc_huge_shrink(arena, ptr, oldsize, usize); } @@ -167,7 +188,7 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { size_t usize; extent_node_t *node, key; arena_t *arena; - bool is_zeroed; + bool is_zeroed_subchunk, is_zeroed_chunk; usize = s2u(size); if (usize == 0) { @@ -181,16 +202,17 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { assert(node != NULL); assert(node->addr == ptr); arena = node->arena; + is_zeroed_subchunk = node->zeroed; malloc_mutex_unlock(&huge_mtx); /* - * Copy zero into is_zeroed and pass the copy to chunk_alloc(), so that - * it is possible to make correct junk/zero fill decisions below. + * Copy zero into is_zeroed_chunk and pass the copy to chunk_alloc(), so + * that it is possible to make correct junk/zero fill decisions below. */ - is_zeroed = zero; + is_zeroed_chunk = zero; if (arena_chunk_ralloc_huge_expand(arena, ptr, oldsize, usize, - &is_zeroed)) + &is_zeroed_chunk)) return (true); malloc_mutex_lock(&huge_mtx); @@ -198,12 +220,17 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { node->size = usize; malloc_mutex_unlock(&huge_mtx); - if (config_fill && !zero) { - if (unlikely(opt_junk)) - memset(ptr + oldsize, 0xa5, usize - oldsize); - else if (unlikely(opt_zero) && !is_zeroed) - memset(ptr + oldsize, 0, usize - oldsize); - } + if (zero || (config_fill && unlikely(opt_zero))) { + if (!is_zeroed_subchunk) { + memset(ptr + oldsize, 0, CHUNK_CEILING(oldsize) - + oldsize); + } + if (!is_zeroed_chunk) { + memset(ptr + CHUNK_CEILING(oldsize), 0, usize - + CHUNK_CEILING(oldsize)); + } + } else if (config_fill && unlikely(opt_junk)) + memset(ptr + oldsize, 0xa5, usize - oldsize); return (false); } -- cgit v0.12 From c83bccd27396cbb6e818d83cc360a58aef96558d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 16 Oct 2014 12:33:18 -0700 Subject: Initialize chunks_mtx for all configurations. This resolves #150. --- src/chunk.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/chunk.c b/src/chunk.c index f65b67a..a776116 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -409,11 +409,10 @@ chunk_boot(void) chunksize_mask = chunksize - 1; chunk_npages = (chunksize >> LG_PAGE); - if (config_stats || config_prof) { - if (malloc_mutex_init(&chunks_mtx)) - return (true); + if (malloc_mutex_init(&chunks_mtx)) + return (true); + if (config_stats || config_prof) memset(&stats_chunks, 0, sizeof(chunk_stats_t)); - } if (have_dss && chunk_dss_boot()) return (true); extent_tree_szad_new(&chunks_szad_mmap); -- cgit v0.12 From a9ea10d27c320926cab2e59c66ebcd25c49df24c Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 16 Oct 2014 15:05:02 -0400 Subject: use sized deallocation internally for ralloc The size of the source allocation is known at this point, so reading the chunk header can be avoided for the small size class fast path. This is not very useful right now, but it provides a significant performance boost with an alternate ralloc entry point taking the old size. --- src/arena.c | 2 +- src/huge.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index 586e3c7..d7377ae 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2220,7 +2220,7 @@ arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, copysize = (size < oldsize) ? 
size : oldsize; JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize); memcpy(ret, ptr, copysize); - iqalloc(tsd, ptr, try_tcache_dalloc); + isqalloc(tsd, ptr, oldsize, try_tcache_dalloc); return (ret); } diff --git a/src/huge.c b/src/huge.c index 1734ff6..826464c 100644 --- a/src/huge.c +++ b/src/huge.c @@ -328,7 +328,7 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, */ copysize = (size < oldsize) ? size : oldsize; memcpy(ret, ptr, copysize); - iqalloc(tsd, ptr, try_tcache_dalloc); + isqalloc(tsd, ptr, oldsize, try_tcache_dalloc); return (ret); } -- cgit v0.12 From 79725aa6f6823bf0703374cb4b89b64133321138 Mon Sep 17 00:00:00 2001 From: Guilherme Goncalves Date: Mon, 20 Oct 2014 14:08:37 -0200 Subject: Fix variable declaration with no type in the configure script. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index a7bf103..5c51f27 100644 --- a/configure.ac +++ b/configure.ac @@ -1363,7 +1363,7 @@ if test "x${enable_zone_allocator}" = "x1" ; then AC_DEFUN([JE_ZONE_PROGRAM], [AC_LANG_PROGRAM( [#include ], - [static foo[[sizeof($1) $2 sizeof(void *) * $3 ? 1 : -1]]] + [static int foo[[sizeof($1) $2 sizeof(void *) * $3 ? 1 : -1]]] )]) AC_COMPILE_IFELSE([JE_ZONE_PROGRAM(malloc_zone_t,==,14)],[JEMALLOC_ZONE_VERSION=3],[ -- cgit v0.12 From af1f5927633ee2cb98c095de0fcc67b8aacdc9c0 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 30 Oct 2014 16:38:08 -0700 Subject: Use JEMALLOC_INLINE_C everywhere it's appropriate. --- src/arena.c | 16 ++++++++-------- src/ctl.c | 6 +++--- src/extent.c | 4 ++-- src/prof.c | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/arena.c b/src/arena.c index d7377ae..795f530 100644 --- a/src/arena.c +++ b/src/arena.c @@ -39,7 +39,7 @@ arena_miscelm_to_bits(arena_chunk_map_misc_t *miscelm) return arena_mapbits_get(chunk, pageind); } -static inline int +JEMALLOC_INLINE_C int arena_run_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) { uintptr_t a_miscelm = (uintptr_t)a; @@ -55,7 +55,7 @@ arena_run_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) rb_gen(static UNUSED, arena_run_tree_, arena_run_tree_t, arena_chunk_map_misc_t, rb_link, arena_run_comp) -static inline int +JEMALLOC_INLINE_C int arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) { int ret; @@ -139,7 +139,7 @@ arena_dirty_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, arena->ndirty -= npages; } -static inline void * +JEMALLOC_INLINE_C void * arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { void *ret; @@ -159,7 +159,7 @@ arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) return (ret); } -static inline void +JEMALLOC_INLINE_C void arena_run_reg_dalloc(arena_run_t *run, void *ptr) { arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); @@ -185,7 +185,7 @@ arena_run_reg_dalloc(arena_run_t *run, void *ptr) run->nfree++; } -static inline void +JEMALLOC_INLINE_C void arena_run_zero(arena_chunk_t *chunk, size_t run_ind, size_t npages) { @@ -195,7 +195,7 @@ arena_run_zero(arena_chunk_t *chunk, size_t run_ind, size_t npages) (npages << LG_PAGE)); } -static inline void +JEMALLOC_INLINE_C void arena_run_page_mark_zeroed(arena_chunk_t *chunk, size_t run_ind) { @@ -203,7 +203,7 @@ arena_run_page_mark_zeroed(arena_chunk_t *chunk, size_t run_ind) << LG_PAGE)), PAGE); } -static inline void +JEMALLOC_INLINE_C void arena_run_page_validate_zeroed(arena_chunk_t *chunk, size_t run_ind) { size_t i; @@ -834,7 +834,7 
@@ arena_run_alloc_small(arena_t *arena, size_t size, index_t binind) return (arena_run_alloc_small_helper(arena, size, binind)); } -static inline void +JEMALLOC_INLINE_C void arena_maybe_purge(arena_t *arena) { size_t threshold; diff --git a/src/ctl.c b/src/ctl.c index 72598b3..b367c9f 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -16,14 +16,14 @@ static ctl_stats_t ctl_stats; /******************************************************************************/ /* Helpers for named and indexed nodes. */ -static inline const ctl_named_node_t * +JEMALLOC_INLINE_C const ctl_named_node_t * ctl_named_node(const ctl_node_t *node) { return ((node->named) ? (const ctl_named_node_t *)node : NULL); } -static inline const ctl_named_node_t * +JEMALLOC_INLINE_C const ctl_named_node_t * ctl_named_children(const ctl_named_node_t *node, int index) { const ctl_named_node_t *children = ctl_named_node(node->children); @@ -31,7 +31,7 @@ ctl_named_children(const ctl_named_node_t *node, int index) return (children ? &children[index] : NULL); } -static inline const ctl_indexed_node_t * +JEMALLOC_INLINE_C const ctl_indexed_node_t * ctl_indexed_node(const ctl_node_t *node) { diff --git a/src/extent.c b/src/extent.c index 8c09b48..ca85201 100644 --- a/src/extent.c +++ b/src/extent.c @@ -3,7 +3,7 @@ /******************************************************************************/ -static inline int +JEMALLOC_INLINE_C int extent_szad_comp(extent_node_t *a, extent_node_t *b) { int ret; @@ -25,7 +25,7 @@ extent_szad_comp(extent_node_t *a, extent_node_t *b) rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, link_szad, extent_szad_comp) -static inline int +JEMALLOC_INLINE_C int extent_ad_comp(extent_node_t *a, extent_node_t *b) { uintptr_t a_addr = (uintptr_t)a->addr; diff --git a/src/prof.c b/src/prof.c index 4016327..36ee758 100644 --- a/src/prof.c +++ b/src/prof.c @@ -244,7 +244,7 @@ bt_init(prof_bt_t *bt, void **vec) bt->len = 0; } -static inline void +JEMALLOC_INLINE_C void prof_enter(prof_tdata_t *tdata) { @@ -256,7 +256,7 @@ prof_enter(prof_tdata_t *tdata) malloc_mutex_lock(&bt2gctx_mtx); } -static inline void +JEMALLOC_INLINE_C void prof_leave(prof_tdata_t *tdata) { bool idump, gdump; -- cgit v0.12 From c93ed81cd06ae46906ae7a386fd6312caca391fb Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 30 Oct 2014 16:50:33 -0700 Subject: Fix prof_{enter,leave}() calls to pass tdata_self. 
--- src/prof.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/prof.c b/src/prof.c index 36ee758..71b0994 100644 --- a/src/prof.c +++ b/src/prof.c @@ -245,10 +245,11 @@ bt_init(prof_bt_t *bt, void **vec) } JEMALLOC_INLINE_C void -prof_enter(prof_tdata_t *tdata) +prof_enter(tsd_t *tsd, prof_tdata_t *tdata) { cassert(config_prof); + assert(tdata == prof_tdata_get(tsd, false)); assert(!tdata->enq); tdata->enq = true; @@ -257,11 +258,12 @@ prof_enter(prof_tdata_t *tdata) } JEMALLOC_INLINE_C void -prof_leave(prof_tdata_t *tdata) +prof_leave(tsd_t *tsd, prof_tdata_t *tdata) { bool idump, gdump; cassert(config_prof); + assert(tdata == prof_tdata_get(tsd, false)); malloc_mutex_unlock(&bt2gctx_mtx); @@ -542,7 +544,8 @@ prof_gctx_create(tsd_t *tsd, prof_bt_t *bt) } static void -prof_gctx_try_destroy(tsd_t *tsd, prof_gctx_t *gctx, prof_tdata_t *tdata) +prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, prof_gctx_t *gctx, + prof_tdata_t *tdata) { cassert(config_prof); @@ -554,14 +557,14 @@ prof_gctx_try_destroy(tsd_t *tsd, prof_gctx_t *gctx, prof_tdata_t *tdata) * avoid a race between the main body of prof_tctx_destroy() and entry * into this function. */ - prof_enter(tdata); + prof_enter(tsd, tdata_self); malloc_mutex_lock(gctx->lock); assert(gctx->nlimbo != 0); if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { /* Remove gctx from bt2gctx. */ if (ckh_remove(tsd, &bt2gctx, &gctx->bt, NULL, NULL)) not_reached(); - prof_leave(tdata); + prof_leave(tsd, tdata_self); /* Destroy gctx. */ malloc_mutex_unlock(gctx->lock); idalloc(tsd, gctx); @@ -572,7 +575,7 @@ prof_gctx_try_destroy(tsd_t *tsd, prof_gctx_t *gctx, prof_tdata_t *tdata) */ gctx->nlimbo--; malloc_mutex_unlock(gctx->lock); - prof_leave(tdata); + prof_leave(tsd, tdata_self); } } @@ -655,8 +658,10 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) destroy_gctx = false; } malloc_mutex_unlock(gctx->lock); - if (destroy_gctx) - prof_gctx_try_destroy(tsd, gctx, tdata); + if (destroy_gctx) { + prof_gctx_try_destroy(tsd, prof_tdata_get(tsd, false), gctx, + tdata); + } if (destroy_tdata) prof_tdata_destroy(tsd, tdata, false); @@ -679,18 +684,18 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, } btkey; bool new_gctx; - prof_enter(tdata); + prof_enter(tsd, tdata); if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { /* bt has never been seen before. Insert it. */ gctx.p = prof_gctx_create(tsd, bt); if (gctx.v == NULL) { - prof_leave(tdata); + prof_leave(tsd, tdata); return (true); } btkey.p = &gctx.p->bt; if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { /* OOM. 
*/ - prof_leave(tdata); + prof_leave(tsd, tdata); idalloc(tsd, gctx.v); return (true); } @@ -705,7 +710,7 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, malloc_mutex_unlock(gctx.p->lock); new_gctx = false; } - prof_leave(tdata); + prof_leave(tsd, tdata); *p_btkey = btkey.v; *p_gctx = gctx.p; @@ -751,7 +756,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) ret.v = imalloc(tsd, sizeof(prof_tctx_t)); if (ret.p == NULL) { if (new_gctx) - prof_gctx_try_destroy(tsd, gctx, tdata); + prof_gctx_try_destroy(tsd, tdata, gctx, tdata); return (NULL); } ret.p->tdata = tdata; @@ -765,7 +770,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) malloc_mutex_unlock(tdata->lock); if (error) { if (new_gctx) - prof_gctx_try_destroy(tsd, gctx, tdata); + prof_gctx_try_destroy(tsd, tdata, gctx, tdata); idalloc(tsd, ret.v); return (NULL); } @@ -872,9 +877,9 @@ prof_bt_count(void) if (tdata == NULL) return (0); - prof_enter(tdata); + malloc_mutex_lock(&bt2gctx_mtx); bt_count = ckh_count(&bt2gctx); - prof_leave(tdata); + malloc_mutex_unlock(&bt2gctx_mtx); return (bt_count); } @@ -1155,7 +1160,7 @@ prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs) if (prof_gctx_should_destroy(gctx)) { gctx->nlimbo++; malloc_mutex_unlock(gctx->lock); - prof_gctx_try_destroy(tsd, gctx, tdata); + prof_gctx_try_destroy(tsd, tdata, gctx, tdata); } else malloc_mutex_unlock(gctx->lock); } @@ -1398,7 +1403,7 @@ prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, bool leakcheck) return (true); malloc_mutex_lock(&prof_dump_mtx); - prof_enter(tdata); + prof_enter(tsd, tdata); /* * Put gctx's in limbo and clear their counters in preparation for @@ -1421,7 +1426,7 @@ prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, bool leakcheck) leak_ngctx = 0; gctx_tree_iter(&gctxs, NULL, prof_gctx_merge_iter, (void *)&leak_ngctx); - prof_leave(tdata); + prof_leave(tsd, tdata); /* Create dump file. */ if ((prof_dump_fd = prof_dump_open(propagate_err, filename)) == -1) -- cgit v0.12 From 809b0ac3919da60c20ad59517ef560d0df639f3b Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 23 Oct 2014 10:30:52 -0400 Subject: mark huge allocations as unlikely This cleans up the fast path a bit more by moving away more code. --- include/jemalloc/internal/jemalloc_internal.h.in | 20 ++++++++++---------- include/jemalloc/internal/prof.h | 4 ++-- src/arena.c | 4 ++-- src/jemalloc.c | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 294e2cc..3ce5aba 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -655,7 +655,7 @@ sa2u(size_t size, size_t alignment) } /* Try for a large size class. */ - if (size <= arena_maxclass && alignment < chunksize) { + if (likely(size <= arena_maxclass) && likely(alignment < chunksize)) { /* * We can't achieve subpage alignment, so round up alignment * to the minimum that can actually be supported. 
@@ -805,7 +805,7 @@ imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) assert(size != 0); - if (size <= arena_maxclass) + if (likely(size <= arena_maxclass)) return (arena_malloc(tsd, arena, size, false, try_tcache)); else return (huge_malloc(tsd, arena, size, false, try_tcache)); @@ -822,7 +822,7 @@ JEMALLOC_ALWAYS_INLINE void * icalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) { - if (size <= arena_maxclass) + if (likely(size <= arena_maxclass)) return (arena_malloc(tsd, arena, size, true, try_tcache)); else return (huge_malloc(tsd, arena, size, true, try_tcache)); @@ -847,12 +847,12 @@ ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, if (usize <= SMALL_MAXCLASS && alignment < PAGE) ret = arena_malloc(tsd, arena, usize, zero, try_tcache); else { - if (usize <= arena_maxclass) { + if (likely(usize <= arena_maxclass)) { arena = arena_choose(tsd, arena); if (unlikely(arena == NULL)) return (NULL); ret = arena_palloc(arena, usize, alignment, zero); - } else if (alignment <= chunksize) + } else if (likely(alignment <= chunksize)) ret = huge_malloc(tsd, arena, usize, zero, try_tcache); else { ret = huge_palloc(tsd, arena, usize, alignment, zero, @@ -887,7 +887,7 @@ isalloc(const void *ptr, bool demote) assert(config_prof || !demote); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) + if (likely(chunk != ptr)) ret = arena_salloc(ptr, demote); else ret = huge_salloc(ptr); @@ -936,7 +936,7 @@ idalloct(tsd_t *tsd, void *ptr, bool try_tcache) assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) + if (likely(chunk != ptr)) arena_dalloc(tsd, chunk, ptr, try_tcache); else huge_dalloc(tsd, ptr, try_tcache); @@ -950,7 +950,7 @@ isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) + if (likely(chunk != ptr)) arena_sdalloc(tsd, chunk, ptr, size, try_tcache); else huge_dalloc(tsd, ptr, try_tcache); @@ -1038,7 +1038,7 @@ iralloct(tsd_t *tsd, void *ptr, size_t size, size_t alignment, bool zero, zero, try_tcache_alloc, try_tcache_dalloc, arena)); } - if (size <= arena_maxclass) { + if (likely(size <= arena_maxclass)) { return (arena_ralloc(tsd, arena, ptr, oldsize, size, 0, alignment, zero, try_tcache_alloc, try_tcache_dalloc)); } else { @@ -1069,7 +1069,7 @@ ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero) return (true); } - if (size <= arena_maxclass) + if (likely(size <= arena_maxclass)) return (arena_ralloc_no_move(ptr, oldsize, size, extra, zero)); else return (huge_ralloc_no_move(ptr, oldsize, size, extra, zero)); diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 5103146..e0d5f10 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -361,7 +361,7 @@ prof_tctx_get(const void *ptr) assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { + if (likely(chunk != ptr)) { /* Region. */ ret = arena_prof_tctx_get(ptr); } else @@ -379,7 +379,7 @@ prof_tctx_set(const void *ptr, prof_tctx_t *tctx) assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) { + if (likely(chunk != ptr)) { /* Region. 
*/ arena_prof_tctx_set(ptr, tctx); } else diff --git a/src/arena.c b/src/arena.c index 795f530..347d58e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2095,7 +2095,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, size_t usize; /* Make sure extra can't cause size_t overflow. */ - if (extra >= arena_maxclass) + if (unlikely(extra >= arena_maxclass)) return (true); usize = s2u(size + extra); @@ -2142,7 +2142,7 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, /* * Avoid moving the allocation if the size class can be left the same. */ - if (oldsize <= arena_maxclass) { + if (likely(oldsize <= arena_maxclass)) { if (oldsize <= SMALL_MAXCLASS) { assert(arena_bin_info[size2index(oldsize)].reg_size == oldsize); diff --git a/src/jemalloc.c b/src/jemalloc.c index 4543959..f130e99 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -264,7 +264,7 @@ a0alloc(size_t size, bool zero) if (size == 0) size = 1; - if (size <= arena_maxclass) + if (likely(size <= arena_maxclass)) ret = arena_malloc(NULL, a0get(), size, zero, false); else ret = huge_malloc(NULL, a0get(), size, zero, false); @@ -295,7 +295,7 @@ a0free(void *ptr) return; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk != ptr) + if (likely(chunk != ptr)) arena_dalloc(NULL, chunk, ptr, false); else huge_dalloc(NULL, ptr, false); -- cgit v0.12 From d33f834591a2459f22da7a165c524340b5fc3a0c Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Fri, 24 Oct 2014 13:18:57 -0400 Subject: avoid redundant chunk header reads * use sized deallocation in iralloct_realign * iralloc and ixalloc always need the old size, so pass it in from the caller where it's often already calculated --- include/jemalloc/internal/jemalloc_internal.h.in | 33 +++++++-------- src/jemalloc.c | 54 ++++++++++++------------ 2 files changed, 42 insertions(+), 45 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 3ce5aba..6f13093 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -790,12 +790,13 @@ void isqalloc(tsd_t *tsd, void *ptr, size_t size, bool try_tcache); void *iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); -void *iralloct(tsd_t *tsd, void *ptr, size_t size, size_t alignment, - bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena); -void *iralloc(tsd_t *tsd, void *ptr, size_t size, size_t alignment, - bool zero); -bool ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, - bool zero); +void *iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, + arena_t *arena); +void *iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t alignment, bool zero); +bool ixalloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) @@ -1013,21 +1014,18 @@ iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, */ copysize = (size < oldsize) ? 
size : oldsize; memcpy(p, ptr, copysize); - iqalloc(tsd, ptr, try_tcache_dalloc); + isqalloc(tsd, ptr, oldsize, try_tcache_dalloc); return (p); } JEMALLOC_ALWAYS_INLINE void * -iralloct(tsd_t *tsd, void *ptr, size_t size, size_t alignment, bool zero, - bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena) +iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, + bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena) { - size_t oldsize; assert(ptr != NULL); assert(size != 0); - oldsize = isalloc(ptr, config_prof); - if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) != 0) { /* @@ -1048,21 +1046,22 @@ iralloct(tsd_t *tsd, void *ptr, size_t size, size_t alignment, bool zero, } JEMALLOC_ALWAYS_INLINE void * -iralloc(tsd_t *tsd, void *ptr, size_t size, size_t alignment, bool zero) +iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, + bool zero) { - return (iralloct(tsd, ptr, size, alignment, zero, true, true, NULL)); + return (iralloct(tsd, ptr, oldsize, size, alignment, zero, true, true, + NULL)); } JEMALLOC_ALWAYS_INLINE bool -ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero) +ixalloc(void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, + bool zero) { - size_t oldsize; assert(ptr != NULL); assert(size != 0); - oldsize = isalloc(ptr, config_prof); if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) != 0) { /* Existing object alignment is inadequate. */ diff --git a/src/jemalloc.c b/src/jemalloc.c index f130e99..7d559ef 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1529,19 +1529,20 @@ label_return: } static void * -irealloc_prof_sample(tsd_t *tsd, void *oldptr, size_t usize, prof_tctx_t *tctx) +irealloc_prof_sample(tsd_t *tsd, void *oldptr, size_t old_usize, size_t usize, + prof_tctx_t *tctx) { void *p; if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { - p = iralloc(tsd, oldptr, LARGE_MINCLASS, 0, false); + p = iralloc(tsd, oldptr, old_usize, LARGE_MINCLASS, 0, false); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else - p = iralloc(tsd, oldptr, usize, 0, false); + p = iralloc(tsd, oldptr, old_usize, usize, 0, false); return (p); } @@ -1555,9 +1556,9 @@ irealloc_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t usize) old_tctx = prof_tctx_get(oldptr); tctx = prof_alloc_prep(tsd, usize, true); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) - p = irealloc_prof_sample(tsd, oldptr, usize, tctx); + p = irealloc_prof_sample(tsd, oldptr, old_usize, usize, tctx); else - p = iralloc(tsd, oldptr, usize, 0, false); + p = iralloc(tsd, oldptr, old_usize, usize, 0, false); if (p == NULL) return (NULL); prof_realloc(tsd, p, usize, tctx, true, old_usize, old_tctx); @@ -1630,9 +1631,7 @@ je_realloc(void *ptr, size_t size) malloc_thread_init(); tsd = tsd_fetch(); - if ((config_prof && opt_prof) || config_stats || - (config_valgrind && unlikely(in_valgrind))) - old_usize = isalloc(ptr, config_prof); + old_usize = isalloc(ptr, config_prof); if (config_valgrind && unlikely(in_valgrind)) old_rzsize = config_prof ? p2rz(ptr) : u2rz(old_usize); @@ -1643,7 +1642,7 @@ je_realloc(void *ptr, size_t size) if (config_stats || (config_valgrind && unlikely(in_valgrind))) usize = s2u(size); - ret = iralloc(tsd, ptr, size, 0, false); + ret = iralloc(tsd, ptr, old_usize, size, 0, false); } } else { /* realloc(NULL, size) is equivalent to malloc(size). 
*/ @@ -1922,22 +1921,22 @@ label_oom: } static void * -irallocx_prof_sample(tsd_t *tsd, void *oldptr, size_t size, size_t alignment, - size_t usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, - arena_t *arena, prof_tctx_t *tctx) +irallocx_prof_sample(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, + size_t alignment, size_t usize, bool zero, bool try_tcache_alloc, + bool try_tcache_dalloc, arena_t *arena, prof_tctx_t *tctx) { void *p; if (tctx == NULL) return (NULL); if (usize <= SMALL_MAXCLASS) { - p = iralloct(tsd, oldptr, LARGE_MINCLASS, alignment, zero, - try_tcache_alloc, try_tcache_dalloc, arena); + p = iralloct(tsd, oldptr, old_usize, LARGE_MINCLASS, alignment, + zero, try_tcache_alloc, try_tcache_dalloc, arena); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else { - p = iralloct(tsd, oldptr, size, alignment, zero, + p = iralloct(tsd, oldptr, old_usize, size, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); } @@ -1955,10 +1954,11 @@ irallocx_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, old_tctx = prof_tctx_get(oldptr); tctx = prof_alloc_prep(tsd, *usize, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { - p = irallocx_prof_sample(tsd, oldptr, size, alignment, *usize, - zero, try_tcache_alloc, try_tcache_dalloc, arena, tctx); + p = irallocx_prof_sample(tsd, oldptr, old_usize, size, + alignment, *usize, zero, try_tcache_alloc, + try_tcache_dalloc, arena, tctx); } else { - p = iralloct(tsd, oldptr, size, alignment, zero, + p = iralloct(tsd, oldptr, old_usize, size, alignment, zero, try_tcache_alloc, try_tcache_dalloc, arena); } if (unlikely(p == NULL)) { @@ -1988,7 +1988,7 @@ je_rallocx(void *ptr, size_t size, int flags) void *p; tsd_t *tsd; size_t usize; - UNUSED size_t old_usize JEMALLOC_CC_SILENCE_INIT(0); + size_t old_usize; UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; @@ -2016,9 +2016,7 @@ je_rallocx(void *ptr, size_t size, int flags) arena = NULL; } - if ((config_prof && opt_prof) || config_stats || - ((config_valgrind && unlikely(in_valgrind)))) - old_usize = isalloc(ptr, config_prof); + old_usize = isalloc(ptr, config_prof); if (config_valgrind && unlikely(in_valgrind)) old_rzsize = u2rz(old_usize); @@ -2030,8 +2028,8 @@ je_rallocx(void *ptr, size_t size, int flags) if (unlikely(p == NULL)) goto label_oom; } else { - p = iralloct(tsd, ptr, size, alignment, zero, try_tcache_alloc, - try_tcache_dalloc, arena); + p = iralloct(tsd, ptr, old_usize, size, alignment, zero, + try_tcache_alloc, try_tcache_dalloc, arena); if (unlikely(p == NULL)) goto label_oom; if (config_stats || (config_valgrind && unlikely(in_valgrind))) @@ -2061,7 +2059,7 @@ ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra, { size_t usize; - if (ixalloc(ptr, size, extra, alignment, zero)) + if (ixalloc(ptr, old_usize, size, extra, alignment, zero)) return (old_usize); usize = isalloc(ptr, config_prof); @@ -2080,9 +2078,9 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, /* Use minimum usize to determine whether promotion may happen. */ if (((alignment == 0) ? s2u(size) : sa2u(size, alignment)) <= SMALL_MAXCLASS) { - if (ixalloc(ptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= - size+extra) ? 0 : size+extra - (SMALL_MAXCLASS+1), - alignment, zero)) + if (ixalloc(ptr, old_usize, SMALL_MAXCLASS+1, + (SMALL_MAXCLASS+1 >= size+extra) ? 
0 : size+extra - + (SMALL_MAXCLASS+1), alignment, zero)) return (old_usize); usize = isalloc(ptr, config_prof); if (max_usize < LARGE_MINCLASS) -- cgit v0.12 From cfc5706f6977a48f3b82d69cd68aa1cf8802fb8d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 30 Oct 2014 23:18:45 -0700 Subject: Miscellaneous cleanups. --- include/jemalloc/internal/prof.h | 10 ++++------ src/jemalloc.c | 6 +++--- src/prof.c | 4 +++- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index e0d5f10..e081884 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -361,10 +361,9 @@ prof_tctx_get(const void *ptr) assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) { - /* Region. */ + if (likely(chunk != ptr)) ret = arena_prof_tctx_get(ptr); - } else + else ret = huge_prof_tctx_get(ptr); return (ret); @@ -379,10 +378,9 @@ prof_tctx_set(const void *ptr, prof_tctx_t *tctx) assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) { - /* Region. */ + if (likely(chunk != ptr)) arena_prof_tctx_set(ptr, tctx); - } else + else huge_prof_tctx_set(ptr, tctx); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 7d559ef..23947f4 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1243,7 +1243,7 @@ imalloc_prof(tsd_t *tsd, size_t usize) p = imalloc_prof_sample(tsd, usize, tctx); else p = imalloc(tsd, usize); - if (p == NULL) { + if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, true); return (NULL); } @@ -1329,7 +1329,7 @@ imemalign_prof(tsd_t *tsd, size_t alignment, size_t usize) p = imemalign_prof_sample(tsd, alignment, usize, tctx); else p = ipalloc(tsd, usize, alignment, false); - if (p == NULL) { + if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, true); return (NULL); } @@ -1457,7 +1457,7 @@ icalloc_prof(tsd_t *tsd, size_t usize) p = icalloc_prof_sample(tsd, usize, tctx); else p = icalloc(tsd, usize); - if (p == NULL) { + if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, true); return (NULL); } diff --git a/src/prof.c b/src/prof.c index 71b0994..4f5d405 100644 --- a/src/prof.c +++ b/src/prof.c @@ -204,7 +204,9 @@ prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated) } void -prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx) { +prof_malloc_sample_object(const void *ptr, size_t usize, prof_tctx_t *tctx) +{ + prof_tctx_set(ptr, tctx); malloc_mutex_lock(tctx->tdata->lock); -- cgit v0.12 From dc652131110abb480df608d17b20cf5bd4cfe2d4 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 30 Oct 2014 23:23:16 -0400 Subject: rm unused arena wrangling from xallocx It has no use for the arena_t since unlike rallocx it never makes a new memory allocation. It's just an unused parameter in ixalloc_helper. 
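The distinction is visible from the public non-standard API: xallocx() only ever resizes ptr in place and returns the resulting usable size, so there is never a new allocation whose arena could be chosen, whereas rallocx() may allocate and move. A rough usage sketch, assuming a build that exposes the unprefixed *allocx entry points via <jemalloc/jemalloc.h>:

    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
            void *p, *q;
            size_t usize;

            p = mallocx(4096, 0);
            if (p == NULL)
                    return (1);

            /* In-place only: no new allocation, hence no arena to pick. */
            usize = xallocx(p, 8192, 0, 0);
            if (usize < 8192) {
                    /* Could not grow in place; rallocx() may allocate and move. */
                    q = rallocx(p, 8192, 0);
                    if (q != NULL)
                            p = q;
            }
            dallocx(p, 0);
            return (0);
    }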
--- src/jemalloc.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 23947f4..8b2ab8d 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2055,7 +2055,7 @@ label_oom: JEMALLOC_ALWAYS_INLINE_C size_t ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra, - size_t alignment, bool zero, arena_t *arena) + size_t alignment, bool zero) { size_t usize; @@ -2068,8 +2068,7 @@ ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra, static size_t ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, - size_t alignment, size_t max_usize, bool zero, arena_t *arena, - prof_tctx_t *tctx) + size_t alignment, size_t max_usize, bool zero, prof_tctx_t *tctx) { size_t usize; @@ -2087,7 +2086,7 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, arena_prof_promoted(ptr, usize); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, - zero, arena); + zero); } return (usize); @@ -2095,7 +2094,7 @@ ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, JEMALLOC_ALWAYS_INLINE_C size_t ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, - size_t extra, size_t alignment, bool zero, arena_t *arena) + size_t extra, size_t alignment, bool zero) { size_t max_usize, usize; prof_tctx_t *old_tctx, *tctx; @@ -2112,10 +2111,10 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, tctx = prof_alloc_prep(tsd, max_usize, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { usize = ixallocx_prof_sample(ptr, old_usize, size, extra, - alignment, zero, max_usize, arena, tctx); + alignment, zero, max_usize, tctx); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, - zero, arena); + zero); } if (unlikely(usize == old_usize)) { prof_alloc_rollback(tsd, tctx, false); @@ -2134,7 +2133,6 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; - arena_t *arena; assert(ptr != NULL); assert(size != 0); @@ -2143,22 +2141,16 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) malloc_thread_init(); tsd = tsd_fetch(); - if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { - unsigned arena_ind = MALLOCX_ARENA_GET(flags); - arena = arena_get(tsd, arena_ind, true, true); - } else - arena = NULL; - old_usize = isalloc(ptr, config_prof); if (config_valgrind && unlikely(in_valgrind)) old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { usize = ixallocx_prof(tsd, ptr, old_usize, size, extra, - alignment, zero, arena); + alignment, zero); } else { usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, - zero, arena); + zero); } if (unlikely(usize == old_usize)) goto label_not_resized; -- cgit v0.12 From 6da2e9d4f6fdccf5108296c99b2b839a4f474bae Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 31 Oct 2014 17:08:13 -0700 Subject: Fix arena_sdalloc() to use promoted size. 
--- include/jemalloc/internal/arena.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 16c04d2..8782b19 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -1020,9 +1020,9 @@ arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache) assert(((uintptr_t)ptr & PAGE_MASK) == 0); if (try_tcache && size <= tcache_maxclass && likely((tcache = - tcache_get(tsd, false)) != NULL)) { + tcache_get(tsd, false)) != NULL)) tcache_dalloc_large(tcache, ptr, size); - } else + else arena_dalloc_large(chunk->arena, chunk, ptr); } } @@ -1031,18 +1031,26 @@ JEMALLOC_ALWAYS_INLINE void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache) { + index_t binind; tcache_t *tcache; assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); + if (config_prof && opt_prof) { + /* Use promoted size, not request size. */ + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + binind = arena_mapbits_binind_get(chunk, pageind); + size = index2size(binind); + } else + binind = size2index(size); + if (likely(size <= SMALL_MAXCLASS)) { /* Small allocation. */ if (likely(try_tcache) && likely((tcache = tcache_get(tsd, - false)) != NULL)) { - index_t binind = size2index(size); + false)) != NULL)) tcache_dalloc_small(tcache, ptr, binind); - } else { + else { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; arena_dalloc_small(chunk->arena, chunk, ptr, pageind); @@ -1051,9 +1059,9 @@ arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, assert(((uintptr_t)ptr & PAGE_MASK) == 0); if (try_tcache && size <= tcache_maxclass && (tcache = - tcache_get(tsd, false)) != NULL) { + tcache_get(tsd, false)) != NULL) tcache_dalloc_large(tcache, ptr, size); - } else + else arena_dalloc_large(chunk->arena, chunk, ptr); } } -- cgit v0.12 From d7a9bab92db5dd3acc02e4f58e95637c6338c285 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 31 Oct 2014 22:26:24 -0700 Subject: Fix arena_sdalloc() to use promoted size (second attempt). Unlike the preceding attempted fix, this version avoids the potential for converting an invalid bin index to a size class. --- include/jemalloc/internal/arena.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 8782b19..a42522d 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -1031,26 +1031,29 @@ JEMALLOC_ALWAYS_INLINE void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, bool try_tcache) { - index_t binind; tcache_t *tcache; assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); if (config_prof && opt_prof) { - /* Use promoted size, not request size. */ size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - binind = arena_mapbits_binind_get(chunk, pageind); - size = index2size(binind); - } else - binind = size2index(size); + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + if (arena_mapbits_large_get(chunk, pageind) != 0) { + /* Make sure to use promoted size, not request size. */ + assert(((uintptr_t)ptr & PAGE_MASK) == 0); + size = arena_mapbits_large_size_get(chunk, pageind); + } + } + assert(s2u(size) == s2u(arena_salloc(ptr, false))); if (likely(size <= SMALL_MAXCLASS)) { /* Small allocation.
*/ if (likely(try_tcache) && likely((tcache = tcache_get(tsd, - false)) != NULL)) + false)) != NULL)) { + index_t binind = size2index(size); tcache_dalloc_small(tcache, ptr, binind); - else { + } else { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; arena_dalloc_small(chunk->arena, chunk, ptr, pageind); -- cgit v0.12 From 82cb603ed799f29e387f37fb44cdfbe98fd2e4ee Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 1 Nov 2014 00:20:28 -0700 Subject: Don't dereference NULL tdata in prof_{enter,leave}(). It is possible for the thread's tdata to be NULL late during thread destruction, so take care not to dereference a NULL pointer in such cases. --- src/prof.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/prof.c b/src/prof.c index 4f5d405..1103cc9 100644 --- a/src/prof.c +++ b/src/prof.c @@ -253,8 +253,10 @@ prof_enter(tsd_t *tsd, prof_tdata_t *tdata) cassert(config_prof); assert(tdata == prof_tdata_get(tsd, false)); - assert(!tdata->enq); - tdata->enq = true; + if (tdata != NULL) { + assert(!tdata->enq); + tdata->enq = true; + } malloc_mutex_lock(&bt2gctx_mtx); } @@ -262,24 +264,27 @@ prof_enter(tsd_t *tsd, prof_tdata_t *tdata) JEMALLOC_INLINE_C void prof_leave(tsd_t *tsd, prof_tdata_t *tdata) { - bool idump, gdump; cassert(config_prof); assert(tdata == prof_tdata_get(tsd, false)); malloc_mutex_unlock(&bt2gctx_mtx); - assert(tdata->enq); - tdata->enq = false; - idump = tdata->enq_idump; - tdata->enq_idump = false; - gdump = tdata->enq_gdump; - tdata->enq_gdump = false; + if (tdata != NULL) { + bool idump, gdump; + + assert(tdata->enq); + tdata->enq = false; + idump = tdata->enq_idump; + tdata->enq_idump = false; + gdump = tdata->enq_gdump; + tdata->enq_gdump = false; - if (idump) - prof_idump(); - if (gdump) - prof_gdump(); + if (idump) + prof_idump(); + if (gdump) + prof_gdump(); + } } #ifdef JEMALLOC_PROF_LIBUNWIND -- cgit v0.12 From 2b2f6dc1e45808c31fb2f3ae33306d224ec0b2d2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 1 Nov 2014 02:29:10 -0700 Subject: Disable arena_dirty_count() validation. --- src/arena.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index 347d58e..ef42771 100644 --- a/src/arena.c +++ b/src/arena.c @@ -872,7 +872,7 @@ arena_dirty_count(arena_t *arena) ndirty += npages; } - return (ndirty); + return (ndirty); } static size_t @@ -1015,7 +1015,11 @@ arena_purge(arena_t *arena, bool all) size_t npurge, npurgeable, npurged; arena_chunk_miscelms_t purge_list; - if (config_debug) { + /* + * Calls to arena_dirty_count() are disabled even for debug builds + * because overhead grows nonlinearly as memory usage increases. + */ + if (false && config_debug) { size_t ndirty = arena_dirty_count(arena); assert(ndirty == arena->ndirty); } -- cgit v0.12 From c002a5c80058ee27acb234ef34f69b0cf6836836 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 4 Nov 2014 18:03:11 -0800 Subject: Fix two quarantine regressions. Fix quarantine to actually update tsd when expanding, and to avoid double initialization (leaking the first quarantine) due to recursive initialization. This resolves #161. 
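The second of the two quarantine regressions stems from lazy, allocation-triggered initialization: the init path itself allocates, which can re-enter the hook and install a quarantine before the outer call finishes, leaking one of the two objects. A sketch of the re-check-and-discard pattern the fix adopts, using a plain __thread pointer as a stand-in for jemalloc's tsd machinery (names are illustrative):

    #include <stdlib.h>

    static __thread void *tls_state;    /* stand-in for the tsd slot */

    static void *
    state_init(void)
    {

        return (malloc(64));    /* in the real code this can recurse into the hook */
    }

    static void
    state_alloc_hook(void)
    {
        void *state;

        if (tls_state != NULL)
            return;
        state = state_init();
        if (state == NULL)
            return;
        if (tls_state == NULL)
            tls_state = state;  /* still unset: publish ours */
        else
            free(state);        /* recursive init won the race: drop the duplicate */
    }
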
--- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/internal/quarantine.h | 5 +++-- src/quarantine.c | 22 ++++++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 8eec874..1988c6e 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -339,6 +339,7 @@ prof_thread_name_set quarantine quarantine_alloc_hook quarantine_cleanup +quarantine_alloc_hook_work quarantine_init register_zone rtree_delete diff --git a/include/jemalloc/internal/quarantine.h b/include/jemalloc/internal/quarantine.h index 4e9c710..a399faa 100644 --- a/include/jemalloc/internal/quarantine.h +++ b/include/jemalloc/internal/quarantine.h @@ -30,6 +30,7 @@ struct quarantine_s { #ifdef JEMALLOC_H_EXTERNS quarantine_t *quarantine_init(tsd_t *tsd, size_t lg_maxobjs); +void quarantine_alloc_hook_work(tsd_t *tsd); void quarantine(tsd_t *tsd, void *ptr); void quarantine_cleanup(tsd_t *tsd); @@ -50,8 +51,8 @@ quarantine_alloc_hook(void) assert(config_fill && opt_quarantine); tsd = tsd_fetch(); - if (tsd_quarantine_get(tsd) == NULL && tsd_nominal(tsd)) - tsd_quarantine_set(tsd, quarantine_init(tsd, LG_MAXOBJS_INIT)); + if (tsd_quarantine_get(tsd) == NULL) + quarantine_alloc_hook_work(tsd); } #endif diff --git a/src/quarantine.c b/src/quarantine.c index 1301b47..aa1c3b0 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -24,6 +24,8 @@ quarantine_init(tsd_t *tsd, size_t lg_maxobjs) { quarantine_t *quarantine; + assert(tsd_nominal(tsd)); + quarantine = (quarantine_t *)imalloc(tsd, offsetof(quarantine_t, objs) + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t))); if (quarantine == NULL) @@ -36,6 +38,25 @@ quarantine_init(tsd_t *tsd, size_t lg_maxobjs) return (quarantine); } +void +quarantine_alloc_hook_work(tsd_t *tsd) +{ + quarantine_t *quarantine; + + if (!tsd_nominal(tsd)) + return; + + quarantine = quarantine_init(tsd, LG_MAXOBJS_INIT); + /* + * Check again whether quarantine has been initialized, because + * qurantine_init() may have triggered recursive initialization. + */ + if (tsd_quarantine_get(tsd) == NULL) + tsd_quarantine_set(tsd, quarantine); + else + idalloc(tsd, quarantine); +} + static quarantine_t * quarantine_grow(tsd_t *tsd, quarantine_t *quarantine) { @@ -67,6 +88,7 @@ quarantine_grow(tsd_t *tsd, quarantine_t *quarantine) } idalloc(tsd, quarantine); + tsd_quarantine_set(tsd, ret); return (ret); } -- cgit v0.12 From 9cf2be0a81b77d4586591c19fb469a51fe6684fa Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 7 Nov 2014 14:50:38 -0800 Subject: Make quarantine_init() static. 
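A note on the change whose diff follows: once quarantine_init() is only referenced from within src/quarantine.c, internal linkage keeps it out of the library's symbol set entirely, which is why the matching entry can also be dropped from private_symbols.txt. A trivial, non-jemalloc illustration of the pattern:

    /* helper() is only referenced inside this translation unit. */
    static int
    helper(int x)
    {

        return (x + 1);
    }

    int
    module_entry(int x)
    {

        return (helper(x));
    }
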
--- include/jemalloc/internal/private_symbols.txt | 3 +-- include/jemalloc/internal/quarantine.h | 1 - src/quarantine.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 1988c6e..ee973c9 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -338,9 +338,8 @@ prof_thread_name_get prof_thread_name_set quarantine quarantine_alloc_hook -quarantine_cleanup quarantine_alloc_hook_work -quarantine_init +quarantine_cleanup register_zone rtree_delete rtree_get diff --git a/include/jemalloc/internal/quarantine.h b/include/jemalloc/internal/quarantine.h index a399faa..ae60739 100644 --- a/include/jemalloc/internal/quarantine.h +++ b/include/jemalloc/internal/quarantine.h @@ -29,7 +29,6 @@ struct quarantine_s { /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -quarantine_t *quarantine_init(tsd_t *tsd, size_t lg_maxobjs); void quarantine_alloc_hook_work(tsd_t *tsd); void quarantine(tsd_t *tsd, void *ptr); void quarantine_cleanup(tsd_t *tsd); diff --git a/src/quarantine.c b/src/quarantine.c index aa1c3b0..ddacc6e 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -19,7 +19,7 @@ static void quarantine_drain(tsd_t *tsd, quarantine_t *quarantine, /******************************************************************************/ -quarantine_t * +static quarantine_t * quarantine_init(tsd_t *tsd, size_t lg_maxobjs) { quarantine_t *quarantine; -- cgit v0.12 From 2012d5a5601c787ce464fac0cbd2b16e3754cfa2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 17 Nov 2014 09:54:49 -0800 Subject: Fix pointer arithmetic undefined behavior. Reported by Denis Denisov. --- src/arena.c | 11 +++++++---- src/huge.c | 37 ++++++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/arena.c b/src/arena.c index ef42771..1ecc5d0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -690,8 +690,10 @@ arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize, } arena->nactive -= udiff >> LG_PAGE; malloc_mutex_unlock(&arena->lock); - if (cdiff != 0) - chunk_dalloc(chunk + CHUNK_CEILING(usize), cdiff, arena->ind); + if (cdiff != 0) { + chunk_dalloc((void *)((uintptr_t)chunk + CHUNK_CEILING(usize)), + cdiff, arena->ind); + } } bool @@ -714,8 +716,9 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, arena->nactive += (udiff >> LG_PAGE); malloc_mutex_unlock(&arena->lock); - if (chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, chunk + - CHUNK_CEILING(oldsize), cdiff, chunksize, zero) == NULL) { + if (chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, + (void *)((uintptr_t)chunk + CHUNK_CEILING(oldsize)), cdiff, + chunksize, zero) == NULL) { /* Revert optimistic stats updates. */ malloc_mutex_lock(&arena->lock); if (config_stats) { diff --git a/src/huge.c b/src/huge.c index 826464c..7ad9b66 100644 --- a/src/huge.c +++ b/src/huge.c @@ -119,9 +119,11 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, /* Fill if necessary (shrinking). */ if (oldsize > usize) { size_t sdiff = CHUNK_CEILING(usize) - usize; - zeroed = (sdiff != 0) ? !pages_purge(ptr + usize, sdiff) : true; + zeroed = (sdiff != 0) ? 
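The pattern applied throughout the diff below: arithmetic on void * is a GCC extension and undefined behavior in standard C, so byte offsets are routed through uintptr_t and cast back. A minimal sketch of the helper shape being substituted in (the function itself is illustrative; the patch open-codes the casts at each site):

    #include <stddef.h>
    #include <stdint.h>

    static void *
    ptr_add(void *base, size_t offset)
    {

        return ((void *)((uintptr_t)base + offset));
    }
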
!pages_purge((void *)((uintptr_t)ptr + + usize), sdiff) : true; if (config_fill && unlikely(opt_junk)) { - memset(ptr + usize, 0x5a, oldsize - usize); + memset((void *)((uintptr_t)ptr + usize), 0x5a, oldsize - + usize); zeroed = false; } } else @@ -145,10 +147,14 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, /* Fill if necessary (growing). */ if (oldsize < usize) { if (zero || (config_fill && unlikely(opt_zero))) { - if (!zeroed) - memset(ptr + oldsize, 0, usize - oldsize); - } else if (config_fill && unlikely(opt_junk)) - memset(ptr + oldsize, 0xa5, usize - oldsize); + if (!zeroed) { + memset((void *)((uintptr_t)ptr + oldsize), 0, + usize - oldsize); + } + } else if (config_fill && unlikely(opt_junk)) { + memset((void *)((uintptr_t)ptr + oldsize), 0xa5, usize - + oldsize); + } } } @@ -161,9 +167,11 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) arena_t *arena; sdiff = CHUNK_CEILING(usize) - usize; - zeroed = (sdiff != 0) ? !pages_purge(ptr + usize, sdiff) : true; + zeroed = (sdiff != 0) ? !pages_purge((void *)((uintptr_t)ptr + usize), + sdiff) : true; if (config_fill && unlikely(opt_junk)) { - huge_dalloc_junk(ptr + usize, oldsize - usize); + huge_dalloc_junk((void *)((uintptr_t)ptr + usize), oldsize - + usize); zeroed = false; } @@ -222,15 +230,18 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { if (zero || (config_fill && unlikely(opt_zero))) { if (!is_zeroed_subchunk) { - memset(ptr + oldsize, 0, CHUNK_CEILING(oldsize) - - oldsize); + memset((void *)((uintptr_t)ptr + oldsize), 0, + CHUNK_CEILING(oldsize) - oldsize); } if (!is_zeroed_chunk) { - memset(ptr + CHUNK_CEILING(oldsize), 0, usize - + memset((void *)((uintptr_t)ptr + + CHUNK_CEILING(oldsize)), 0, usize - CHUNK_CEILING(oldsize)); } - } else if (config_fill && unlikely(opt_junk)) - memset(ptr + oldsize, 0xa5, usize - oldsize); + } else if (config_fill && unlikely(opt_junk)) { + memset((void *)((uintptr_t)ptr + oldsize), 0xa5, usize - + oldsize); + } return (false); } -- cgit v0.12 From d49cb68b9e8b57169240e16686f4f60d6b5a089f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 17 Nov 2014 10:31:59 -0800 Subject: Fix more pointer arithmetic undefined behavior. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by Guilherme Gonçalves. This resolves #166. --- src/arena.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/arena.c b/src/arena.c index 1ecc5d0..f351c09 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2341,12 +2341,12 @@ arena_new(unsigned ind) if (config_stats) { memset(&arena->stats, 0, sizeof(arena_stats_t)); - arena->stats.lstats = (malloc_large_stats_t *)(((void *)arena) + - CACHELINE_CEILING(sizeof(arena_t))); + arena->stats.lstats = (malloc_large_stats_t *)((uintptr_t)arena + + CACHELINE_CEILING(sizeof(arena_t))); memset(arena->stats.lstats, 0, nlclasses * sizeof(malloc_large_stats_t)); - arena->stats.hstats = (malloc_huge_stats_t *)(((void *)arena) + - CACHELINE_CEILING(sizeof(arena_t)) + + arena->stats.hstats = (malloc_huge_stats_t *)((uintptr_t)arena + + CACHELINE_CEILING(sizeof(arena_t)) + QUANTUM_CEILING(nlclasses * sizeof(malloc_large_stats_t))); memset(arena->stats.hstats, 0, nhclasses * sizeof(malloc_huge_stats_t)); -- cgit v0.12 From a2136025c4c4861b91f361a90c1dc94214848708 Mon Sep 17 00:00:00 2001 From: Guilherme Goncalves Date: Tue, 18 Nov 2014 18:48:48 -0200 Subject: Remove extra definition of je_tsd_boot on win32. 
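For context on the removal below: the TSD code generator in tsd.h expands to a full set of functions per consumer, and the Windows branch ended up emitting tsd_boot() twice, which is a plain redefinition error once both copies land in the same translation unit. A toy illustration with made-up names:

    #define DEFINE_GET(prefix)	\
    static int			\
    prefix##_get(void)		\
    {				\
        return (42);		\
    }

    DEFINE_GET(example)	/* first expansion: fine */
    /*
     * DEFINE_GET(example)
     * A second expansion in the same translation unit fails with
     * "redefinition of 'example_get'".
     */
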
--- include/jemalloc/internal/tsd.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index b5658f8..35dd862 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -364,12 +364,6 @@ a_name##tsd_boot(void) \ a_name##tsd_boot1(); \ return (false); \ } \ -a_attr bool \ -a_name##tsd_boot(void) \ -{ \ - \ - return (false); \ -} \ /* Get/set. */ \ a_attr a_type * \ a_name##tsd_get(void) \ -- cgit v0.12 From 879e76a9e57e725e927e77900940967d301a4958 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 3 Nov 2014 14:02:52 -0500 Subject: teach the dss chunk allocator to handle new_addr This provides in-place expansion of huge allocations when the end of the allocation is at the end of the sbrk heap. There's already the ability to extend in-place via recycled chunks but this handles the initial growth of the heap via repeated vector / string reallocations. A possible future extension could allow realloc to go from the following: | huge allocation | recycled chunks | ^ dss_end To a larger allocation built from recycled *and* new chunks: | huge allocation | ^ dss_end Doing that would involve teaching the chunk recycling code to request new chunks to satisfy the request. The chunk_dss code wouldn't require any further changes. #include int main(void) { size_t chunk = 4 * 1024 * 1024; void *ptr = NULL; for (size_t size = chunk; size < chunk * 128; size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; } } dss:secondary: 0.083s dss:primary: 0.083s After: dss:secondary: 0.083s dss:primary: 0.003s The dss heap grows in the upwards direction, so the oldest chunks are at the low addresses and they are used first. Linux prefers to grow the mmap heap downwards, so the trick will not work in the *current* mmap chunk allocator as a huge allocation will only be at the top of the heap in a contrived case. --- include/jemalloc/internal/chunk_dss.h | 3 ++- src/chunk.c | 12 +++++------- src/chunk_dss.c | 11 ++++++++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/jemalloc/internal/chunk_dss.h b/include/jemalloc/internal/chunk_dss.h index 4535ce0..0989647 100644 --- a/include/jemalloc/internal/chunk_dss.h +++ b/include/jemalloc/internal/chunk_dss.h @@ -23,7 +23,8 @@ extern const char *dss_prec_names[]; dss_prec_t chunk_dss_prec_get(void); bool chunk_dss_prec_set(dss_prec_t dss_prec); -void *chunk_alloc_dss(size_t size, size_t alignment, bool *zero); +void *chunk_alloc_dss(void *new_addr, size_t size, size_t alignment, + bool *zero); bool chunk_in_dss(void *chunk); bool chunk_dss_boot(void); void chunk_dss_prefork(void); diff --git a/src/chunk.c b/src/chunk.c index a776116..b373718 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -154,16 +154,15 @@ chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, new_addr, size, alignment, base, zero)) != NULL) return (ret); - /* requesting an address only implemented for recycle */ - if (new_addr == NULL - && (ret = chunk_alloc_dss(size, alignment, zero)) != NULL) + if ((ret = chunk_alloc_dss(new_addr, size, alignment, zero)) + != NULL) return (ret); } /* mmap. 
*/ if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, new_addr, size, alignment, base, zero)) != NULL) return (ret); - /* requesting an address only implemented for recycle */ + /* requesting an address not implemented for chunk_alloc_mmap */ if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) return (ret); @@ -172,9 +171,8 @@ chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, new_addr, size, alignment, base, zero)) != NULL) return (ret); - /* requesting an address only implemented for recycle */ - if (new_addr == NULL && - (ret = chunk_alloc_dss(size, alignment, zero)) != NULL) + if ((ret = chunk_alloc_dss(new_addr, size, alignment, zero)) + != NULL) return (ret); } diff --git a/src/chunk_dss.c b/src/chunk_dss.c index cce7104..edba3b2 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -66,7 +66,7 @@ chunk_dss_prec_set(dss_prec_t dss_prec) } void * -chunk_alloc_dss(size_t size, size_t alignment, bool *zero) +chunk_alloc_dss(void *new_addr, size_t size, size_t alignment, bool *zero) { void *ret; @@ -93,8 +93,17 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero) * malloc. */ do { + /* Avoid an unnecessary system call. */ + if (new_addr != NULL && dss_max != new_addr) + break; + /* Get the current end of the DSS. */ dss_max = chunk_dss_sbrk(0); + + /* Make sure the earlier condition still holds. */ + if (new_addr != NULL && dss_max != new_addr) + break; + /* * Calculate how much padding is necessary to * chunk-align the end of the DSS. -- cgit v0.12 From f79e01f75b79058c3be0ce6de0d46f8a9a990176 Mon Sep 17 00:00:00 2001 From: Yuriy Kaminskiy Date: Tue, 2 Dec 2014 16:24:11 -0800 Subject: Fix test_stats_arenas_bins for 32-bit builds. --- test/unit/stats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/test/unit/stats.c b/test/unit/stats.c index fd92d54..946e737 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -327,6 +327,7 @@ TEST_BEGIN(test_stats_arenas_bins) assert_d_eq(mallctl("stats.arenas.0.bins.0.curregs", &curregs, &sz, NULL, 0), expected, "Unexpected mallctl() result"); + sz = sizeof(uint64_t); assert_d_eq(mallctl("stats.arenas.0.bins.0.nfills", &nfills, &sz, NULL, 0), config_tcache ? expected : ENOENT, "Unexpected mallctl() result"); -- cgit v0.12 From 1036ddbf11b7e9ec566b92b3dd50e105fc5f6932 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 4 Dec 2014 16:42:42 -0800 Subject: Fix OOM cleanup in huge_palloc(). Fix OOM cleanup in huge_palloc() to call idalloct() rather than base_node_dalloc(). This bug is a result of incomplete refactoring, and has no impact other than leaking memory during OOM. --- src/huge.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/huge.c b/src/huge.c index 7ad9b66..6883903 100644 --- a/src/huge.c +++ b/src/huge.c @@ -48,12 +48,8 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, */ is_zeroed = zero; arena = arena_choose(tsd, arena); - if (unlikely(arena == NULL)) { - base_node_dalloc(node); - return (NULL); - } - ret = arena_chunk_alloc_huge(arena, usize, alignment, &is_zeroed); - if (ret == NULL) { + if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(arena, + usize, alignment, &is_zeroed)) == NULL) { idalloct(tsd, node, try_tcache); return (NULL); } -- cgit v0.12 From a18c2b1f152b4334474ed32fc46d762d4fa54c2b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 5 Dec 2014 17:49:47 -0800 Subject: Style fixes. 
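Stepping back to the DSS new_addr change a few hunks above: in-place growth is only possible when the existing allocation ends exactly at the current program break, which is why chunk_alloc_dss() now compares new_addr against dss_max both before and after refreshing it. A self-contained toy sketch of the underlying idea (a fake break pointer stands in for sbrk(), and all locking and alignment concerns are omitted):

    #include <stddef.h>

    static char heap[1 << 20];      /* toy "heap" standing in for the DSS */
    static char *brk_cur = heap;    /* toy program break */

    static void *
    toy_sbrk(size_t incr)
    {
        void *old = brk_cur;

        if ((size_t)(heap + sizeof(heap) - brk_cur) < incr)
            return ((void *)-1);
        brk_cur += incr;
        return (old);
    }

    static void *
    extend_in_place(void *alloc_end, size_t incr)
    {

        /* Only possible when the allocation ends exactly at the break. */
        if ((void *)brk_cur != alloc_end)
            return (NULL);
        if (toy_sbrk(incr) == (void *)-1)
            return (NULL);
        return (alloc_end);     /* the old break is the start of the new space */
    }
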
--- include/jemalloc/internal/atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index a048815..8b743b8 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -58,7 +58,7 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (__sync_sub_and_fetch(p, x)); } -#elif (defined(_MSC_VER)) +# elif (defined(_MSC_VER)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { @@ -72,7 +72,7 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x); } -#elif (defined(JEMALLOC_OSATOMIC)) +# elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { -- cgit v0.12 From 59cd80e6c6e36c26a880e86f6cde9f71808b256c Mon Sep 17 00:00:00 2001 From: Chih-hung Hsieh Date: Fri, 5 Dec 2014 17:42:41 -0800 Subject: Add a C11 atomics-based implementation of atomic.h API. --- configure.ac | 21 ++++++++++++++++ include/jemalloc/internal/atomic.h | 28 ++++++++++++++++++++++ include/jemalloc/internal/jemalloc_internal.h.in | 4 ++++ .../jemalloc/internal/jemalloc_internal_defs.h.in | 3 +++ 4 files changed, 56 insertions(+) diff --git a/configure.ac b/configure.ac index 5c51f27..8b1e55e 100644 --- a/configure.ac +++ b/configure.ac @@ -1200,6 +1200,27 @@ elif test "x${force_tls}" = "x1" ; then fi dnl ============================================================================ +dnl Check for C11 atomics. + +JE_COMPILABLE([C11 atomics], [ +#include +#if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) +#include +#else +#error Atomics not available +#endif +], [ + uint64_t *p = (uint64_t *)0; + uint64_t x = 1; + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + uint64_t r = atomic_fetch_add(a, x) + x; + return (r == 0); +], [je_cv_c11atomics]) +if test "x${je_cv_c11atomics}" = "xyes" ; then + AC_DEFINE([JEMALLOC_C11ATOMICS]) +fi + +dnl ============================================================================ dnl Check for atomic(9) operations as provided on FreeBSD. 
JE_COMPILABLE([atomic(9)], [ diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index 8b743b8..23ac93f 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -72,6 +72,20 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x); } +# elif (defined(JEMALLOC_C11ATOMICS)) +JEMALLOC_INLINE uint64_t +atomic_add_uint64(uint64_t *p, uint64_t x) +{ + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + return (atomic_fetch_add(a, x) + x); +} + +JEMALLOC_INLINE uint64_t +atomic_sub_uint64(uint64_t *p, uint64_t x) +{ + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + return (atomic_fetch_sub(a, x) - x); +} # elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) @@ -187,6 +201,20 @@ atomic_sub_uint32(uint32_t *p, uint32_t x) return (InterlockedExchangeAdd(p, -((int32_t)x)) - x); } +# elif (defined(JEMALLOC_C11ATOMICS)) +JEMALLOC_INLINE uint32_t +atomic_add_uint32(uint32_t *p, uint32_t x) +{ + volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; + return (atomic_fetch_add(a, x) + x); +} + +JEMALLOC_INLINE uint32_t +atomic_sub_uint32(uint32_t *p, uint32_t x) +{ + volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; + return (atomic_fetch_sub(a, x) - x); +} #elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 6f13093..bf10617 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -127,6 +127,10 @@ static const bool config_ivsalloc = #endif ; +#ifdef JEMALLOC_C11ATOMICS +#include +#endif + #ifdef JEMALLOC_ATOMIC9 #include #endif diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index dccbb1e..2923e83 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -22,6 +22,9 @@ */ #undef CPU_SPINWAIT +/* Defined if C11 atomics are available. */ +#undef JEMALLOC_C11ATOMICS + /* Defined if the equivalent of FreeBSD's atomic(9) functions are available. */ #undef JEMALLOC_ATOMIC9 -- cgit v0.12 From e12eaf93dca308a426c182956197b0eeb5f2cff3 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 8 Dec 2014 14:40:14 -0800 Subject: Style and spelling fixes. 
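On the C11 atomics commit above: the configure probe and the new #elif branches map jemalloc's add/sub wrappers onto atomic_fetch_add()/atomic_fetch_sub(), which return the value held *before* the operation, hence the trailing "+ x" / "- x" adjustments. A standalone usage sketch of that standard API (requires a C11 toolchain that does not define __STDC_NO_ATOMICS__):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        atomic_uint_least64_t counter = ATOMIC_VAR_INIT(0);
        uint64_t after;

        /* atomic_fetch_add() returns the old value; add x to get the new one. */
        after = atomic_fetch_add(&counter, 2) + 2;
        printf("counter is now %llu\n", (unsigned long long)after);
        return (0);
    }
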
--- include/jemalloc/internal/arena.h | 5 ++--- include/jemalloc/internal/extent.h | 2 +- include/jemalloc/internal/hash.h | 5 +++-- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- include/jemalloc/internal/jemalloc_internal_decls.h | 2 +- include/jemalloc/internal/jemalloc_internal_defs.h.in | 14 ++++++-------- include/jemalloc/internal/ql.h | 4 +--- include/jemalloc/internal/qr.h | 6 ++++-- include/jemalloc/internal/rb.h | 4 ++-- include/jemalloc/internal/tcache.h | 2 +- include/jemalloc/internal/util.h | 6 +++--- src/arena.c | 2 +- src/chunk.c | 2 +- src/ckh.c | 4 ++-- src/jemalloc.c | 2 +- src/quarantine.c | 4 ++-- src/zone.c | 2 +- test/include/test/math.h | 2 +- test/include/test/thd.h | 2 +- test/include/test/timer.h | 4 +--- 20 files changed, 36 insertions(+), 40 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index a42522d..1e19023 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -263,8 +263,7 @@ struct arena_s { /* * There are three classes of arena operations from a locking * perspective: - * 1) Thread asssignment (modifies nthreads) is protected by - * arenas_lock. + * 1) Thread assignment (modifies nthreads) is protected by arenas_lock. * 2) Bin-related operations are protected by bin locks. * 3) Chunk- and run-related operations are protected by this mutex. */ @@ -314,7 +313,7 @@ struct arena_s { arena_chunk_miscelms_t runs_dirty; /* - * user-configureable chunk allocation and deallocation functions. + * User-configurable chunk allocation and deallocation functions. */ chunk_alloc_t *chunk_alloc; chunk_dalloc_t *chunk_dalloc; diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 5b00076..cbfc20a 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -24,7 +24,7 @@ struct extent_node_s { /* Total region size. */ size_t size; - /* Arena from which this extent came, if any */ + /* Arena from which this extent came, if any. */ arena_t *arena; /* True if zero-filled; used by chunk recycling code. */ diff --git a/include/jemalloc/internal/hash.h b/include/jemalloc/internal/hash.h index a43bbbe..bcead33 100644 --- a/include/jemalloc/internal/hash.h +++ b/include/jemalloc/internal/hash.h @@ -35,13 +35,14 @@ JEMALLOC_INLINE uint32_t hash_rotl_32(uint32_t x, int8_t r) { - return (x << r) | (x >> (32 - r)); + return ((x << r) | (x >> (32 - r))); } JEMALLOC_INLINE uint64_t hash_rotl_64(uint64_t x, int8_t r) { - return (x << r) | (x >> (64 - r)); + + return ((x << r) | (x >> (64 - r))); } JEMALLOC_INLINE uint32_t diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index bf10617..9bd501c 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -302,7 +302,7 @@ typedef unsigned index_t; #define ALIGNMENT_CEILING(s, alignment) \ (((s) + (alignment - 1)) & (-(alignment))) -/* Declare a variable length array */ +/* Declare a variable-length array. 
*/ #if __STDC_VERSION__ < 199901L # ifdef _MSC_VER # include diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h index fa59040..fb2effb 100644 --- a/include/jemalloc/internal/jemalloc_internal_decls.h +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -50,7 +50,7 @@ typedef intptr_t ssize_t; # define PATH_MAX 1024 # define STDERR_FILENO 2 # define __func__ __FUNCTION__ -/* Disable warnings about deprecated system functions */ +/* Disable warnings about deprecated system functions. */ # pragma warning(disable: 4996) #else # include diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 2923e83..e172c66 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -38,7 +38,7 @@ * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the - * functions are defined in libgcc instead of being inlines) + * functions are defined in libgcc instead of being inlines). */ #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4 @@ -46,7 +46,7 @@ * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the - * functions are defined in libgcc instead of being inlines) + * functions are defined in libgcc instead of being inlines). */ #undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8 @@ -201,9 +201,7 @@ #undef JEMALLOC_PURGE_MADVISE_DONTNEED #undef JEMALLOC_PURGE_MADVISE_FREE -/* - * Define if operating system has alloca.h header. - */ +/* Define if operating system has alloca.h header. */ #undef JEMALLOC_HAS_ALLOCA_H /* C99 restrict keyword supported. */ @@ -221,13 +219,13 @@ /* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ #undef LG_SIZEOF_INTMAX_T -/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook) */ +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ #undef JEMALLOC_GLIBC_MALLOC_HOOK -/* glibc memalign hook */ +/* glibc memalign hook. */ #undef JEMALLOC_GLIBC_MEMALIGN_HOOK -/* adaptive mutex support in pthreads */ +/* Adaptive mutex support in pthreads. */ #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/include/jemalloc/internal/ql.h b/include/jemalloc/internal/ql.h index f70c5f6..1834bb8 100644 --- a/include/jemalloc/internal/ql.h +++ b/include/jemalloc/internal/ql.h @@ -1,6 +1,4 @@ -/* - * List definitions. - */ +/* List definitions. */ #define ql_head(a_type) \ struct { \ a_type *qlh_first; \ diff --git a/include/jemalloc/internal/qr.h b/include/jemalloc/internal/qr.h index 602944b..0fbaec2 100644 --- a/include/jemalloc/internal/qr.h +++ b/include/jemalloc/internal/qr.h @@ -40,8 +40,10 @@ struct { \ (a_qr_b)->a_field.qre_prev = t; \ } while (0) -/* qr_meld() and qr_split() are functionally equivalent, so there's no need to - * have two copies of the code. */ +/* + * qr_meld() and qr_split() are functionally equivalent, so there's no need to + * have two copies of the code. 
+ */ #define qr_split(a_qr_a, a_qr_b, a_field) \ qr_meld((a_qr_a), (a_qr_b), a_field) diff --git a/include/jemalloc/internal/rb.h b/include/jemalloc/internal/rb.h index 64fab89..2ca8e59 100644 --- a/include/jemalloc/internal/rb.h +++ b/include/jemalloc/internal/rb.h @@ -200,7 +200,7 @@ a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ * int (a_cmp *)(a_type *a_node, a_type *a_other); * ^^^^^^ * or a_key - * Interpretation of comparision function return values: + * Interpretation of comparison function return values: * -1 : a_node < a_other * 0 : a_node == a_other * 1 : a_node > a_other @@ -693,7 +693,7 @@ a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ rbtn_rotate_left(a_type, a_field, pathp->node, \ tnode); \ /* Balance restored, but rotation modified */\ - /* subree root, which may actually be the tree */\ + /* subtree root, which may actually be the tree */\ /* root. */\ if (pathp == path) { \ /* Set root. */ \ diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index fe9c47e..3a3fd49 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -69,7 +69,7 @@ struct tcache_bin_s { struct tcache_s { ql_elm(tcache_t) link; /* Used for aggregating stats. */ - uint64_t prof_accumbytes;/* Cleared after arena_prof_accum() */ + uint64_t prof_accumbytes;/* Cleared after arena_prof_accum(). */ arena_t *arena; /* This thread's arena. */ unsigned ev_cnt; /* Event count since incremental GC. */ index_t next_gc_bin; /* Next bin to GC. */ diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index 5af6832..b2b4ab7 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -127,7 +127,7 @@ int get_errno(void); #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_UTIL_C_)) -/* Sanity check: */ +/* Sanity check. */ #if !defined(JEMALLOC_INTERNAL_FFSL) || !defined(JEMALLOC_INTERNAL_FFS) # error Both JEMALLOC_INTERNAL_FFSL && JEMALLOC_INTERNAL_FFS should have been defined by configure #endif @@ -231,7 +231,7 @@ lg_floor(size_t x) } #endif -/* Sets error code */ +/* Set error code. */ JEMALLOC_INLINE void set_errno(int errnum) { @@ -243,7 +243,7 @@ set_errno(int errnum) #endif } -/* Get last error code */ +/* Get last error code. */ JEMALLOC_INLINE int get_errno(void) { diff --git a/src/arena.c b/src/arena.c index f351c09..6f2410a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,7 +36,7 @@ arena_miscelm_to_bits(arena_chunk_map_misc_t *miscelm) arena_chunk_t *chunk = CHUNK_ADDR2BASE(miscelm); size_t pageind = arena_miscelm_to_pageind(miscelm); - return arena_mapbits_get(chunk, pageind); + return (arena_mapbits_get(chunk, pageind)); } JEMALLOC_INLINE_C int diff --git a/src/chunk.c b/src/chunk.c index b373718..7926452 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -162,7 +162,7 @@ chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, new_addr, size, alignment, base, zero)) != NULL) return (ret); - /* requesting an address not implemented for chunk_alloc_mmap */ + /* Requesting an address not implemented for chunk_alloc_mmap(). 
*/ if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) return (ret); diff --git a/src/ckh.c b/src/ckh.c index 3a54596..db2ae39 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -367,10 +367,10 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh->count = 0; /* - * Find the minimum power of 2 that is large enough to fit aBaseCount + * Find the minimum power of 2 that is large enough to fit minitems * entries. We are using (2+,2) cuckoo hashing, which has an expected * maximum load factor of at least ~0.86, so 0.75 is a conservative load - * factor that will typically allow 2^aLgMinItems to fit without ever + * factor that will typically allow mincells items to fit without ever * growing the table. */ assert(LG_CKH_BUCKET_CELLS > 0); diff --git a/src/jemalloc.c b/src/jemalloc.c index 8b2ab8d..f7cc457 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -808,7 +808,7 @@ malloc_conf_init(void) if (linklen == -1) { /* No configuration specified. */ linklen = 0; - /* restore errno */ + /* Restore errno. */ set_errno(saved_errno); } #endif diff --git a/src/quarantine.c b/src/quarantine.c index ddacc6e..c5fa566 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -2,7 +2,7 @@ #include "jemalloc/internal/jemalloc_internal.h" /* - * quarantine pointers close to NULL are used to encode state information that + * Quarantine pointers close to NULL are used to encode state information that * is used for cleaning up during thread shutdown. */ #define QUARANTINE_STATE_REINCARNATED ((quarantine_t *)(uintptr_t)1) @@ -49,7 +49,7 @@ quarantine_alloc_hook_work(tsd_t *tsd) quarantine = quarantine_init(tsd, LG_MAXOBJS_INIT); /* * Check again whether quarantine has been initialized, because - * qurantine_init() may have triggered recursive initialization. + * quarantine_init() may have triggered recursive initialization. */ if (tsd_quarantine_get(tsd) == NULL) tsd_quarantine_set(tsd, quarantine); diff --git a/src/zone.c b/src/zone.c index c6bd533..12e1734 100644 --- a/src/zone.c +++ b/src/zone.c @@ -263,7 +263,7 @@ register_zone(void) * after the default zone. On OSX < 10.6, there is no purgeable * zone, so this does nothing. On OSX >= 10.6, unregistering * replaces the purgeable zone with the last registered zone - * above, i.e the default zone. Registering it again then puts + * above, i.e. the default zone. Registering it again then puts * it at the end, obviously after the default zone. */ if (purgeable_zone) { diff --git a/test/include/test/math.h b/test/include/test/math.h index a862ed7..b057b29 100644 --- a/test/include/test/math.h +++ b/test/include/test/math.h @@ -299,7 +299,7 @@ pt_chi2(double p, double df, double ln_gamma_df_2) /* * Given a value p in [0..1] and Gamma distribution shape and scale parameters, - * compute the upper limit on the definite integeral from [0..z] that satisfies + * compute the upper limit on the definite integral from [0..z] that satisfies * p. */ JEMALLOC_INLINE double diff --git a/test/include/test/thd.h b/test/include/test/thd.h index f941d7a..47a5126 100644 --- a/test/include/test/thd.h +++ b/test/include/test/thd.h @@ -1,4 +1,4 @@ -/* Abstraction layer for threading in tests */ +/* Abstraction layer for threading in tests. */ #ifdef _WIN32 typedef HANDLE thd_t; #else diff --git a/test/include/test/timer.h b/test/include/test/timer.h index 6877e4a..496072a 100644 --- a/test/include/test/timer.h +++ b/test/include/test/timer.h @@ -1,6 +1,4 @@ -/* - * Simple timer, for use in benchmark reporting. 
- */ +/* Simple timer, for use in benchmark reporting. */ #include -- cgit v0.12 From b74041fb6e279bd8bbc133250241249f90cd619f Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Tue, 9 Dec 2014 17:41:34 -0500 Subject: Ignore MALLOC_CONF in set{uid,gid,cap} binaries. This eliminates the malloc tunables as tools for an attacker. Closes #173 --- configure.ac | 18 +++++++++++++++++ .../jemalloc/internal/jemalloc_internal_defs.h.in | 10 ++++++++++ src/jemalloc.c | 23 +++++++++++++++++++++- 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 8b1e55e..82bdefd 100644 --- a/configure.ac +++ b/configure.ac @@ -1108,6 +1108,24 @@ fi CPPFLAGS="$CPPFLAGS -D_REENTRANT" +dnl Check if the GNU-specific secure_getenv function exists. +AC_CHECK_FUNC([secure_getenv], + [have_secure_getenv="1"], + [have_secure_getenv="0"] + ) +if test "x$have_secure_getenv" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_SECURE_GETENV], [ ]) +fi + +dnl Check if the Solaris/BSD issetugid function exists. +AC_CHECK_FUNC([issetugid], + [have_issetugid="1"], + [have_issetugid="0"] + ) +if test "x$have_issetugid" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_ISSETUGID], [ ]) +fi + dnl Check whether the BSD-specific _malloc_thread_cleanup() exists. If so, use dnl it rather than pthreads TSD cleanup functions to support cleanup during dnl thread exit, in order to avoid pthreads library recursion during diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index e172c66..c8d7daf 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -67,6 +67,16 @@ #undef JEMALLOC_OSSPIN /* + * Defined if secure_getenv(3) is available. + */ +#undef JEMALLOC_HAVE_SECURE_GETENV + +/* + * Defined if issetugid(2) is available. + */ +#undef JEMALLOC_HAVE_ISSETUGID + +/* * Defined if _malloc_thread_cleanup() exists. At least in the case of * FreeBSD, pthread_key_create() allocates, which if used during malloc * bootstrapping will cause recursion into the pthreads library. Therefore, if diff --git a/src/jemalloc.c b/src/jemalloc.c index f7cc457..48de0da 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -648,6 +648,27 @@ stats_print_atexit(void) * Begin initialization functions. */ +#ifndef JEMALLOC_HAVE_SECURE_GETENV +# ifdef JEMALLOC_HAVE_ISSETUGID +static char * +secure_getenv(const char *name) +{ + + if (issetugid() == 0) + return (getenv(name)); + else + return (NULL); +} +# else +static char * +secure_getenv(const char *name) +{ + + return (getenv(name)); +} +# endif +#endif + static unsigned malloc_ncpus(void) { @@ -824,7 +845,7 @@ malloc_conf_init(void) #endif ; - if ((opts = getenv(envname)) != NULL) { + if ((opts = secure_getenv(envname)) != NULL) { /* * Do nothing; opts is already initialized to * the value of the MALLOC_CONF environment -- cgit v0.12 From 2c5cb613dfbdf58f88152321b63e60c58cd23972 Mon Sep 17 00:00:00 2001 From: Guilherme Goncalves Date: Mon, 8 Dec 2014 19:12:41 -0200 Subject: Introduce two new modes of junk filling: "alloc" and "free". In addition to true/false, opt.junk can now be either "alloc" or "free", giving applications the possibility of junking memory only on allocation or deallocation. This resolves #172. 
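Because opt.junk is now a string, readers of the option must treat it as const char * rather than bool (the mallctl.c test update further down reflects this). A small usage sketch, assuming a program linked against an unprefixed jemalloc build and run with e.g. MALLOC_CONF=junk:alloc:

    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        const char *junk;
        size_t sz = sizeof(junk);

        /* Reads "true", "false", "alloc", or "free". */
        if (mallctl("opt.junk", &junk, &sz, NULL, 0) == 0)
            printf("opt.junk = %s\n", junk);
        return (0);
    }
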
--- Makefile.in | 2 + doc/jemalloc.xml.in | 20 +++++---- include/jemalloc/internal/jemalloc_internal.h.in | 4 +- include/jemalloc/internal/private_symbols.txt | 2 + include/jemalloc/internal/tcache.h | 10 ++--- src/arena.c | 53 ++++++++++++----------- src/ctl.c | 2 +- src/huge.c | 12 +++--- src/jemalloc.c | 54 +++++++++++++++++++++--- src/quarantine.c | 2 +- test/unit/junk.c | 41 +++++++++++------- test/unit/junk_alloc.c | 3 ++ test/unit/junk_free.c | 3 ++ test/unit/mallctl.c | 2 +- 14 files changed, 139 insertions(+), 71 deletions(-) create mode 100644 test/unit/junk_alloc.c create mode 100644 test/unit/junk_free.c diff --git a/Makefile.in b/Makefile.in index 40644ce..c268d00 100644 --- a/Makefile.in +++ b/Makefile.in @@ -118,6 +118,8 @@ TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/ckh.c \ $(srcroot)test/unit/hash.c \ $(srcroot)test/unit/junk.c \ + $(srcroot)test/unit/junk_alloc.c \ + $(srcroot)test/unit/junk_free.c \ $(srcroot)test/unit/lg_chunk.c \ $(srcroot)test/unit/mallctl.c \ $(srcroot)test/unit/math.c \ diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 71b4cd1..0148f03 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -930,18 +930,20 @@ for (i = 0; i < nbins; i++) { opt.junk - (bool) + (const char *) r- [] - Junk filling enabled/disabled. If enabled, each byte - of uninitialized allocated memory will be initialized to - 0xa5. All deallocated memory will be initialized to - 0x5a. This is intended for debugging and will - impact performance negatively. This option is disabled by default - unless is specified during - configuration, in which case it is enabled by default unless running - inside Junk filling. If set to "alloc", each byte of + uninitialized allocated memory will be initialized to + 0xa5. If set to "free", all deallocated memory will + be initialized to 0x5a. If set to "true", both + allocated and deallocated memory will be initialized, and if set to + "false", junk filling be disabled entirely. This is intended for + debugging and will impact performance negatively. This option is + "false" by default unless is specified + during configuration, in which case it is "true" by default unless + running inside Valgrind. 
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 9bd501c..b7617df 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -376,7 +376,9 @@ typedef unsigned index_t; #define JEMALLOC_H_EXTERNS extern bool opt_abort; -extern bool opt_junk; +extern const char *opt_junk; +extern bool opt_junk_alloc; +extern bool opt_junk_free; extern size_t opt_quarantine; extern bool opt_redzone; extern bool opt_utrace; diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index ee973c9..7e33915 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -274,6 +274,8 @@ nhbins opt_abort opt_dss opt_junk +opt_junk_alloc +opt_junk_free opt_lg_chunk opt_lg_dirty_mult opt_lg_prof_interval diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 3a3fd49..6e97b3d 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -252,14 +252,14 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) if (likely(!zero)) { if (config_fill) { - if (unlikely(opt_junk)) { + if (unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], false); } else if (unlikely(opt_zero)) memset(ret, 0, usize); } } else { - if (config_fill && unlikely(opt_junk)) { + if (config_fill && unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } @@ -307,7 +307,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) } if (likely(!zero)) { if (config_fill) { - if (unlikely(opt_junk)) + if (unlikely(opt_junk_alloc)) memset(ret, 0xa5, usize); else if (unlikely(opt_zero)) memset(ret, 0, usize); @@ -333,7 +333,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr, index_t binind) assert(tcache_salloc(ptr) <= SMALL_MAXCLASS); - if (config_fill && unlikely(opt_junk)) + if (config_fill && unlikely(opt_junk_free)) arena_dalloc_junk_small(ptr, &arena_bin_info[binind]); tbin = &tcache->tbins[binind]; @@ -362,7 +362,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) binind = size2index(size); - if (config_fill && unlikely(opt_junk)) + if (config_fill && unlikely(opt_junk_free)) arena_dalloc_junk_large(ptr, size); tbin = &tcache->tbins[binind]; diff --git a/src/arena.c b/src/arena.c index 6f2410a..bf78995 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1450,7 +1450,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, index_t binind, } break; } - if (config_fill && unlikely(opt_junk)) { + if (config_fill && unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ptr, &arena_bin_info[binind], true); } @@ -1512,24 +1512,27 @@ arena_redzones_validate(void *ptr, arena_bin_info_t *bin_info, bool reset) size_t i; bool error = false; - for (i = 1; i <= redzone_size; i++) { - uint8_t *byte = (uint8_t *)((uintptr_t)ptr - i); - if (*byte != 0xa5) { - error = true; - arena_redzone_corruption(ptr, size, false, i, *byte); - if (reset) - *byte = 0xa5; + if (opt_junk_alloc) { + for (i = 1; i <= redzone_size; i++) { + uint8_t *byte = (uint8_t *)((uintptr_t)ptr - i); + if (*byte != 0xa5) { + error = true; + arena_redzone_corruption(ptr, size, false, i, *byte); + if (reset) + *byte = 0xa5; + } } - } - for (i = 0; i < redzone_size; i++) { - uint8_t *byte = (uint8_t *)((uintptr_t)ptr + size + i); - if (*byte != 0xa5) { - error = true; - arena_redzone_corruption(ptr, size, true, i, 
*byte); - if (reset) - *byte = 0xa5; + for (i = 0; i < redzone_size; i++) { + uint8_t *byte = (uint8_t *)((uintptr_t)ptr + size + i); + if (*byte != 0xa5) { + error = true; + arena_redzone_corruption(ptr, size, true, i, *byte); + if (reset) + *byte = 0xa5; + } } } + if (opt_abort && error) abort(); } @@ -1560,7 +1563,7 @@ arena_quarantine_junk_small(void *ptr, size_t usize) index_t binind; arena_bin_info_t *bin_info; cassert(config_fill); - assert(opt_junk); + assert(opt_junk_free); assert(opt_quarantine); assert(usize <= SMALL_MAXCLASS); @@ -1604,7 +1607,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) if (!zero) { if (config_fill) { - if (unlikely(opt_junk)) { + if (unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], false); } else if (unlikely(opt_zero)) @@ -1612,7 +1615,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) } JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); } else { - if (config_fill && unlikely(opt_junk)) { + if (config_fill && unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, &arena_bin_info[binind], true); } @@ -1660,7 +1663,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) if (!zero) { if (config_fill) { - if (unlikely(opt_junk)) + if (unlikely(opt_junk_alloc)) memset(ret, 0xa5, usize); else if (unlikely(opt_zero)) memset(ret, 0, usize); @@ -1732,7 +1735,7 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) malloc_mutex_unlock(&arena->lock); if (config_fill && !zero) { - if (unlikely(opt_junk)) + if (unlikely(opt_junk_alloc)) memset(ret, 0xa5, size); else if (unlikely(opt_zero)) memset(ret, 0, size); @@ -1845,7 +1848,7 @@ arena_dalloc_bin_locked_impl(arena_t *arena, arena_chunk_t *chunk, void *ptr, bin = &arena->bins[binind]; bin_info = &arena_bin_info[binind]; - if (!junked && config_fill && unlikely(opt_junk)) + if (!junked && config_fill && unlikely(opt_junk_free)) arena_dalloc_junk_small(ptr, bin_info); arena_run_reg_dalloc(run, ptr); @@ -1908,7 +1911,7 @@ void arena_dalloc_junk_large(void *ptr, size_t usize) { - if (config_fill && unlikely(opt_junk)) + if (config_fill && unlikely(opt_junk_free)) memset(ptr, 0x5a, usize); } #ifdef JEMALLOC_JET @@ -2079,7 +2082,7 @@ static void arena_ralloc_junk_large(void *ptr, size_t old_usize, size_t usize) { - if (config_fill && unlikely(opt_junk)) { + if (config_fill && unlikely(opt_junk_free)) { memset((void *)((uintptr_t)ptr + usize), 0x5a, old_usize - usize); } @@ -2126,7 +2129,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, bool ret = arena_ralloc_large_grow(arena, chunk, ptr, oldsize, size, extra, zero); if (config_fill && !ret && !zero) { - if (unlikely(opt_junk)) { + if (unlikely(opt_junk_alloc)) { memset((void *)((uintptr_t)ptr + oldsize), 0xa5, isalloc(ptr, config_prof) - oldsize); diff --git a/src/ctl.c b/src/ctl.c index b367c9f..90bad7e 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1234,7 +1234,7 @@ CTL_RO_NL_GEN(opt_lg_chunk, opt_lg_chunk, size_t) CTL_RO_NL_GEN(opt_narenas, opt_narenas, size_t) CTL_RO_NL_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t) CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool) -CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, bool) +CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, const char *) CTL_RO_NL_CGEN(config_fill, opt_quarantine, opt_quarantine, size_t) CTL_RO_NL_CGEN(config_fill, opt_redzone, opt_redzone, bool) CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool) diff --git a/src/huge.c b/src/huge.c index 6883903..416cb17 100644 --- a/src/huge.c +++ 
b/src/huge.c @@ -67,7 +67,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, if (zero || (config_fill && unlikely(opt_zero))) { if (!is_zeroed) memset(ret, 0, usize); - } else if (config_fill && unlikely(opt_junk)) + } else if (config_fill && unlikely(opt_junk_alloc)) memset(ret, 0xa5, usize); return (ret); @@ -81,7 +81,7 @@ static void huge_dalloc_junk(void *ptr, size_t usize) { - if (config_fill && have_dss && unlikely(opt_junk)) { + if (config_fill && have_dss && unlikely(opt_junk_free)) { /* * Only bother junk filling if the chunk isn't about to be * unmapped. @@ -117,7 +117,7 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, size_t sdiff = CHUNK_CEILING(usize) - usize; zeroed = (sdiff != 0) ? !pages_purge((void *)((uintptr_t)ptr + usize), sdiff) : true; - if (config_fill && unlikely(opt_junk)) { + if (config_fill && unlikely(opt_junk_free)) { memset((void *)((uintptr_t)ptr + usize), 0x5a, oldsize - usize); zeroed = false; @@ -147,7 +147,7 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, memset((void *)((uintptr_t)ptr + oldsize), 0, usize - oldsize); } - } else if (config_fill && unlikely(opt_junk)) { + } else if (config_fill && unlikely(opt_junk_alloc)) { memset((void *)((uintptr_t)ptr + oldsize), 0xa5, usize - oldsize); } @@ -165,7 +165,7 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) sdiff = CHUNK_CEILING(usize) - usize; zeroed = (sdiff != 0) ? !pages_purge((void *)((uintptr_t)ptr + usize), sdiff) : true; - if (config_fill && unlikely(opt_junk)) { + if (config_fill && unlikely(opt_junk_free)) { huge_dalloc_junk((void *)((uintptr_t)ptr + usize), oldsize - usize); zeroed = false; @@ -234,7 +234,7 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { CHUNK_CEILING(oldsize)), 0, usize - CHUNK_CEILING(oldsize)); } - } else if (config_fill && unlikely(opt_junk)) { + } else if (config_fill && unlikely(opt_junk_alloc)) { memset((void *)((uintptr_t)ptr + oldsize), 0xa5, usize - oldsize); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 48de0da..e63dab3 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -13,13 +13,28 @@ bool opt_abort = false #endif ; -bool opt_junk = +const char *opt_junk = +#if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) + "true" +#else + "false" +#endif + ; +bool opt_junk_alloc = #if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) true #else false #endif ; +bool opt_junk_free = +#if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) + true +#else + false +#endif + ; + size_t opt_quarantine = ZU(0); bool opt_redzone = false; bool opt_utrace = false; @@ -784,7 +799,9 @@ malloc_conf_init(void) if (config_valgrind) { in_valgrind = (RUNNING_ON_VALGRIND != 0) ? 
true : false; if (config_fill && unlikely(in_valgrind)) { - opt_junk = false; + opt_junk = "false"; + opt_junk_alloc = false; + opt_junk_free = false; assert(!opt_zero); opt_quarantine = JEMALLOC_VALGRIND_QUARANTINE_DEFAULT; opt_redzone = true; @@ -867,13 +884,13 @@ malloc_conf_init(void) &vlen)) { #define CONF_MATCH(n) \ (sizeof(n)-1 == klen && strncmp(n, k, klen) == 0) +#define CONF_MATCH_VALUE(n) \ + (sizeof(n)-1 == vlen && strncmp(n, v, vlen) == 0) #define CONF_HANDLE_BOOL(o, n, cont) \ if (CONF_MATCH(n)) { \ - if (strncmp("true", v, vlen) == 0 && \ - vlen == sizeof("true")-1) \ + if (CONF_MATCH_VALUE("true")) \ o = true; \ - else if (strncmp("false", v, vlen) == \ - 0 && vlen == sizeof("false")-1) \ + else if (CONF_MATCH_VALUE("false")) \ o = false; \ else { \ malloc_conf_error( \ @@ -987,7 +1004,30 @@ malloc_conf_init(void) -1, (sizeof(size_t) << 3) - 1) CONF_HANDLE_BOOL(opt_stats_print, "stats_print", true) if (config_fill) { - CONF_HANDLE_BOOL(opt_junk, "junk", true) + if (CONF_MATCH("junk")) { + if (CONF_MATCH_VALUE("true")) { + opt_junk = "true"; + opt_junk_alloc = opt_junk_free = + true; + } else if (CONF_MATCH_VALUE("false")) { + opt_junk = "false"; + opt_junk_alloc = opt_junk_free = + false; + } else if (CONF_MATCH_VALUE("alloc")) { + opt_junk = "alloc"; + opt_junk_alloc = true; + opt_junk_free = false; + } else if (CONF_MATCH_VALUE("free")) { + opt_junk = "free"; + opt_junk_alloc = false; + opt_junk_free = true; + } else { + malloc_conf_error( + "Invalid conf value", k, + klen, v, vlen); + } + continue; + } CONF_HANDLE_SIZE_T(opt_quarantine, "quarantine", 0, SIZE_T_MAX, false) CONF_HANDLE_BOOL(opt_redzone, "redzone", true) diff --git a/src/quarantine.c b/src/quarantine.c index c5fa566..12c37e0 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -148,7 +148,7 @@ quarantine(tsd_t *tsd, void *ptr) obj->usize = usize; quarantine->curbytes += usize; quarantine->curobjs++; - if (config_fill && unlikely(opt_junk)) { + if (config_fill && unlikely(opt_junk_free)) { /* * Only do redzone validation if Valgrind isn't in * operation. 
diff --git a/test/unit/junk.c b/test/unit/junk.c index 1522a61..733f661 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -1,8 +1,11 @@ #include "test/jemalloc_test.h" #ifdef JEMALLOC_FILL +# ifndef JEMALLOC_TEST_JUNK_OPT +# define JEMALLOC_TEST_JUNK_OPT "junk:true" +# endif const char *malloc_conf = - "abort:false,junk:true,zero:false,redzone:true,quarantine:0"; + "abort:false,zero:false,redzone:true,quarantine:0," JEMALLOC_TEST_JUNK_OPT; #endif static arena_dalloc_junk_small_t *arena_dalloc_junk_small_orig; @@ -69,12 +72,14 @@ test_junk(size_t sz_min, size_t sz_max) char *s; size_t sz_prev, sz, i; - arena_dalloc_junk_small_orig = arena_dalloc_junk_small; - arena_dalloc_junk_small = arena_dalloc_junk_small_intercept; - arena_dalloc_junk_large_orig = arena_dalloc_junk_large; - arena_dalloc_junk_large = arena_dalloc_junk_large_intercept; - huge_dalloc_junk_orig = huge_dalloc_junk; - huge_dalloc_junk = huge_dalloc_junk_intercept; + if (opt_junk_free) { + arena_dalloc_junk_small_orig = arena_dalloc_junk_small; + arena_dalloc_junk_small = arena_dalloc_junk_small_intercept; + arena_dalloc_junk_large_orig = arena_dalloc_junk_large; + arena_dalloc_junk_large = arena_dalloc_junk_large_intercept; + huge_dalloc_junk_orig = huge_dalloc_junk; + huge_dalloc_junk = huge_dalloc_junk_intercept; + } sz_prev = 0; s = (char *)mallocx(sz_min, 0); @@ -92,9 +97,11 @@ test_junk(size_t sz_min, size_t sz_max) } for (i = sz_prev; i < sz; i++) { - assert_c_eq(s[i], 0xa5, - "Newly allocated byte %zu/%zu isn't junk-filled", - i, sz); + if (opt_junk_alloc) { + assert_c_eq(s[i], 0xa5, + "Newly allocated byte %zu/%zu isn't " + "junk-filled", i, sz); + } s[i] = 'a'; } @@ -103,7 +110,7 @@ test_junk(size_t sz_min, size_t sz_max) s = (char *)rallocx(s, sz+1, 0); assert_ptr_not_null((void *)s, "Unexpected rallocx() failure"); - assert_true(saw_junking, + assert_true(!opt_junk_free || saw_junking, "Expected region of size %zu to be junk-filled", sz); } @@ -111,12 +118,14 @@ test_junk(size_t sz_min, size_t sz_max) watch_junking(s); dallocx(s, 0); - assert_true(saw_junking, + assert_true(!opt_junk_free || saw_junking, "Expected region of size %zu to be junk-filled", sz); - arena_dalloc_junk_small = arena_dalloc_junk_small_orig; - arena_dalloc_junk_large = arena_dalloc_junk_large_orig; - huge_dalloc_junk = huge_dalloc_junk_orig; + if (opt_junk_free) { + arena_dalloc_junk_small = arena_dalloc_junk_small_orig; + arena_dalloc_junk_large = arena_dalloc_junk_large_orig; + huge_dalloc_junk = huge_dalloc_junk_orig; + } } TEST_BEGIN(test_junk_small) @@ -204,6 +213,7 @@ TEST_BEGIN(test_junk_redzone) arena_redzone_corruption_t *arena_redzone_corruption_orig; test_skip_if(!config_fill); + test_skip_if(!opt_junk_alloc || !opt_junk_free); arena_redzone_corruption_orig = arena_redzone_corruption; arena_redzone_corruption = arena_redzone_corruption_replacement; @@ -234,6 +244,7 @@ int main(void) { + assert(opt_junk_alloc || opt_junk_free); return (test( test_junk_small, test_junk_large, diff --git a/test/unit/junk_alloc.c b/test/unit/junk_alloc.c new file mode 100644 index 0000000..8db3331 --- /dev/null +++ b/test/unit/junk_alloc.c @@ -0,0 +1,3 @@ +#define JEMALLOC_TEST_JUNK_OPT "junk:alloc" +#include "junk.c" +#undef JEMALLOC_TEST_JUNK_OPT diff --git a/test/unit/junk_free.c b/test/unit/junk_free.c new file mode 100644 index 0000000..482a61d --- /dev/null +++ b/test/unit/junk_free.c @@ -0,0 +1,3 @@ +#define JEMALLOC_TEST_JUNK_OPT "junk:free" +#include "junk.c" +#undef JEMALLOC_TEST_JUNK_OPT diff --git a/test/unit/mallctl.c 
b/test/unit/mallctl.c index 028a971..f4b7d1a 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -164,7 +164,7 @@ TEST_BEGIN(test_mallctl_opt) TEST_MALLCTL_OPT(size_t, narenas, always); TEST_MALLCTL_OPT(ssize_t, lg_dirty_mult, always); TEST_MALLCTL_OPT(bool, stats_print, always); - TEST_MALLCTL_OPT(bool, junk, fill); + TEST_MALLCTL_OPT(const char *, junk, fill); TEST_MALLCTL_OPT(size_t, quarantine, fill); TEST_MALLCTL_OPT(bool, redzone, fill); TEST_MALLCTL_OPT(bool, zero, fill); -- cgit v0.12 From b4acf7300a4ca3423ca36fe227e9bc2e23f25b9f Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 24 Oct 2014 14:09:42 -0700 Subject: [pprof] Produce global profile unless thread-local profile requested Currently pprof will print output for all threads if a single thread is not specified, but this doesn't play well with many output formats (e.g., any of the dot-based formats). Instead, default to printing just the overall profile when no specific thread is requested. This resolves #157. --- bin/pprof | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/pprof b/bin/pprof index 5a4c6cd..df503ae 100755 --- a/bin/pprof +++ b/bin/pprof @@ -404,7 +404,7 @@ sub Init() { "edgefraction=f" => \$main::opt_edgefraction, "maxdegree=i" => \$main::opt_maxdegree, "focus=s" => \$main::opt_focus, - "thread=i" => \$main::opt_thread, + "thread=s" => \$main::opt_thread, "ignore=s" => \$main::opt_ignore, "scale=i" => \$main::opt_scale, "heapcheck" => \$main::opt_heapcheck, @@ -707,7 +707,8 @@ sub Main() { } if (defined($data->{threads})) { foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) { - if (!defined($main::opt_thread) || $main::opt_thread == $thread) { + if (defined($main::opt_thread) && + ($main::opt_thread eq '*' || $main::opt_thread == $thread)) { my $thread_profile = $data->{threads}{$thread}; FilterAndPrint($thread_profile, $symbols, $libs, $thread); } -- cgit v0.12 From 9c6a8d3b0cc14fd26b119ad08f190e537771464f Mon Sep 17 00:00:00 2001 From: Guilherme Goncalves Date: Wed, 17 Dec 2014 14:46:35 -0200 Subject: Move variable declaration to the top of its block for MSVC compatibility. --- src/arena.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index bf78995..1eb4000 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2022,6 +2022,7 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, * following run, then merge the first part with the existing * allocation. */ + arena_run_t *run; size_t flag_dirty, splitsize, usize; usize = s2u(size + extra); @@ -2030,8 +2031,7 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, assert(usize >= usize_min); splitsize = usize - oldsize; - arena_run_t *run = &arena_miscelm_get(chunk, - pageind+npages)->run; + run = &arena_miscelm_get(chunk, pageind+npages)->run; arena_run_split_large(arena, run, splitsize, zero); size = oldsize + splitsize; -- cgit v0.12 From b7b44dfad09186cf74080818075eb0bfc0805e3b Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Thu, 18 Dec 2014 15:12:53 +0900 Subject: Make mixed declarations an error It often happens that code changes introduce mixed declarations, that then break building with Visual Studio. Since the code style is to not use mixed declarations anyways, we might as well enforce it with -Werror.
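A small hypothetical example (not taken from the tree) of the pattern this flag catches: under -Werror=declaration-after-statement the first function below fails to compile, while the second, C89-style form, which is what the MSVC-compatibility fix above applied to arena.c, is accepted.

/* Rejected under -Werror=declaration-after-statement (and by MSVC as C89). */
static int
mixed(int x)
{
	x += 1;
	int y = x * 2;	/* Declaration after a statement. */
	return (y);
}

/* Accepted: all declarations at the top of the block. */
static int
unmixed(int x)
{
	int y;

	x += 1;
	y = x * 2;
	return (y);
}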
--- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index 82bdefd..95133c4 100644 --- a/configure.ac +++ b/configure.ac @@ -134,6 +134,7 @@ if test "x$CFLAGS" = "x" ; then AC_DEFINE_UNQUOTED([JEMALLOC_HAS_RESTRICT]) fi JE_CFLAGS_APPEND([-Wall]) + JE_CFLAGS_APPEND([-Werror=declaration-after-statement]) JE_CFLAGS_APPEND([-pipe]) JE_CFLAGS_APPEND([-g3]) elif test "x$je_cv_msvc" = "xyes" ; then -- cgit v0.12 From 51f86346c000aa2a44abaab08caeedcb151e6556 Mon Sep 17 00:00:00 2001 From: Guilherme Goncalves Date: Thu, 18 Dec 2014 15:01:21 +0900 Subject: Add a isblank definition for MSVC < 2013 --- include/jemalloc/internal/jemalloc_internal_decls.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h index fb2effb..b10561c 100644 --- a/include/jemalloc/internal/jemalloc_internal_decls.h +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -52,6 +52,14 @@ typedef intptr_t ssize_t; # define __func__ __FUNCTION__ /* Disable warnings about deprecated system functions. */ # pragma warning(disable: 4996) +#if _MSC_VER < 1800 +static int +isblank(int c) +{ + + return (c == '\t' || c == ' '); +} +#endif #else # include #endif -- cgit v0.12 From 24057f3da8cd1b23955068a368165eba2eefb5c4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 14 Jan 2015 16:27:31 -0800 Subject: Fix an infinite recursion bug related to a0/tsd bootstrapping. This resolves #184. --- src/chunk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/chunk.c b/src/chunk.c index 7926452..b9a2441 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -254,7 +254,9 @@ chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, { arena_t *arena; - arena = arena_get(tsd_fetch(), arena_ind, false, true); + /* Dodge tsd for a0 in order to avoid bootstrapping issues. */ + arena = (arena_ind == 0) ? a0get() : arena_get(tsd_fetch(), arena_ind, + false, true); /* * The arena we're allocating on behalf of must have been initialized * already. -- cgit v0.12 From 44b57b8e8b25797b94c7cccc0e32705f76fcf03b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 16 Jan 2015 18:04:17 -0800 Subject: Fix OOM handling in memalign() and valloc(). Fix memalign() and valloc() to heed imemalign()'s return value. Reported by Kurt Wampler. --- src/jemalloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index e63dab3..aecdce3 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1751,7 +1751,8 @@ void * je_memalign(size_t alignment, size_t size) { void *ret JEMALLOC_CC_SILENCE_INIT(NULL); - imemalign(&ret, alignment, size, 1); + if (unlikely(imemalign(&ret, alignment, size, 1) != 0)) + ret = NULL; JEMALLOC_VALGRIND_MALLOC(ret != NULL, ret, size, false); return (ret); } @@ -1762,7 +1763,8 @@ void * je_valloc(size_t size) { void *ret JEMALLOC_CC_SILENCE_INIT(NULL); - imemalign(&ret, PAGE, size, 1); + if (unlikely(imemalign(&ret, PAGE, size, 1) != 0)) + ret = NULL; JEMALLOC_VALGRIND_MALLOC(ret != NULL, ret, size, false); return (ret); } -- cgit v0.12 From b617df81bbd35b2d7124b16df4024f9541644f6e Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Wed, 21 Jan 2015 15:02:42 -0500 Subject: Add missing symbols to private_symbols.txt. This resolves #185. 
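For context, a rough sketch of what listing a symbol in private_symbols.txt accomplishes: the build generates a header that renames each listed internal symbol behind the JEMALLOC_N() mangling macro so that static linking does not expose unprefixed names; symbols missing from the list escape the private namespace. The generated output shown here is an approximation (the default "je_" prefix is assumed); the symbol names themselves come from the diff below.

/* Approximation of the generated private-namespace mappings. */
#define	JEMALLOC_N(n)	je_##n	/* Assumed default private prefix. */

/* One mapping per entry in private_symbols.txt, e.g.: */
#define	extent_tree_ad_empty		JEMALLOC_N(extent_tree_ad_empty)
#define	extent_tree_szad_empty		JEMALLOC_N(extent_tree_szad_empty)
#define	opt_prof_thread_active_init	JEMALLOC_N(opt_prof_thread_active_init)
#define	prof_tdata_reinit		JEMALLOC_N(prof_tdata_reinit)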
--- include/jemalloc/internal/private_symbols.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 7e33915..39132b2 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -158,6 +158,7 @@ ctl_postfork_child ctl_postfork_parent ctl_prefork dss_prec_names +extent_tree_ad_empty extent_tree_ad_first extent_tree_ad_insert extent_tree_ad_iter @@ -174,6 +175,7 @@ extent_tree_ad_reverse_iter extent_tree_ad_reverse_iter_recurse extent_tree_ad_reverse_iter_start extent_tree_ad_search +extent_tree_szad_empty extent_tree_szad_first extent_tree_szad_insert extent_tree_szad_iter @@ -289,6 +291,7 @@ opt_prof_final opt_prof_gdump opt_prof_leak opt_prof_prefix +opt_prof_thread_active_init opt_quarantine opt_redzone opt_stats_print @@ -332,6 +335,7 @@ prof_tctx_set prof_tdata_cleanup prof_tdata_get prof_tdata_init +prof_tdata_reinit prof_thread_active_get prof_thread_active_init_get prof_thread_active_init_set -- cgit v0.12 From bc96876f99e89705817630b503ac54a5c48789ab Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 21 Jan 2015 09:01:43 -0800 Subject: Fix arenas_cache_cleanup(). Fix arenas_cache_cleanup() to check whether arenas_cache is NULL before deallocation, rather than checking arenas. --- src/jemalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index aecdce3..c53129a 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -601,7 +601,7 @@ arenas_cache_cleanup(tsd_t *tsd) arena_t **arenas_cache; arenas_cache = tsd_arenas_cache_get(tsd); - if (arenas != NULL) + if (arenas_cache != NULL) a0free(arenas_cache); } -- cgit v0.12 From 10aff3f3e1b8b3ac0348b259c439c9fe870a6b95 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 20 Jan 2015 15:37:51 -0800 Subject: Refactor bootstrapping to delay tsd initialization. Refactor bootstrapping to delay tsd initialization, primarily to support integration with FreeBSD's libc. Refactor a0*() for internal-only use, and add the bootstrap_{malloc,calloc,free}() API for use by FreeBSD's libc. This separation limits use of the a0*() functions to metadata allocation, which doesn't require malloc/calloc/free API compatibility. This resolves #170. 
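Before the diff, a hypothetical caller-side sketch of the new bootstrap API this change introduces for FreeBSD's libc. The bootstrap_malloc/bootstrap_calloc/bootstrap_free prototypes are taken from the patch; the early-TLS scenario wrapped around them is illustrative only.

#include <stddef.h>

/* Prototypes added by this change (normally provided by jemalloc). */
void	*bootstrap_malloc(size_t size);
void	*bootstrap_calloc(size_t num, size_t size);
void	bootstrap_free(void *ptr);

/*
 * Hypothetical libc-side helpers: during early startup, before TLS (and thus
 * tsd) is usable, route allocations through the bootstrap API, which serves
 * them from arena 0 without touching thread-specific data.
 */
static void *
early_calloc(size_t num, size_t size)
{
	return (bootstrap_calloc(num, size));
}

static void
early_free(void *ptr)
{
	bootstrap_free(ptr);
}

This keeps the malloc/calloc/free-compatible surface separate from the internal a0*() functions, which the patch narrows to metadata allocation only.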
--- include/jemalloc/internal/jemalloc_internal.h.in | 5 +- include/jemalloc/internal/private_symbols.txt | 6 +- include/jemalloc/internal/tsd.h | 2 +- src/ctl.c | 18 +- src/jemalloc.c | 293 ++++++++++++++--------- src/tsd.c | 4 +- 6 files changed, 203 insertions(+), 125 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index b7617df..4107860 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -404,9 +404,8 @@ extern size_t const index2size_tab[NSIZES]; extern uint8_t const size2index_tab[]; arena_t *a0get(void); -void *a0malloc(size_t size); -void *a0calloc(size_t num, size_t size); -void a0free(void *ptr); +void *a0malloc(size_t size, bool zero); +void a0dalloc(void *ptr); arena_t *arenas_extend(unsigned ind); arena_t *arena_init(unsigned ind); unsigned narenas_total_get(void); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 39132b2..1aaf80b 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -1,5 +1,4 @@ -a0calloc -a0free +a0dalloc a0get a0malloc arena_get @@ -107,6 +106,9 @@ bitmap_set bitmap_sfu bitmap_size bitmap_unset +bootstrap_calloc +bootstrap_free +bootstrap_malloc bt_init buferror chunk_alloc_arena diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 35dd862..dbb91a2 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -48,7 +48,7 @@ typedef enum { * void example_tsd_set(example_t *val) {...} * * Note that all of the functions deal in terms of (a_type *) rather than - * (a_type) so that it is possible to support non-pointer types (unlike + * (a_type) so that it is possible to support non-pointer types (unlike * pthreads TSD). example_tsd_cleanup() is passed an (a_type *) pointer that is * cast to (void *). This means that the cleanup function needs to cast the * function argument to (a_type *), then dereference the resulting pointer to diff --git a/src/ctl.c b/src/ctl.c index 90bad7e..6b95584 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -484,14 +484,14 @@ ctl_arena_init(ctl_arena_stats_t *astats) if (astats->lstats == NULL) { astats->lstats = (malloc_large_stats_t *)a0malloc(nlclasses * - sizeof(malloc_large_stats_t)); + sizeof(malloc_large_stats_t), false); if (astats->lstats == NULL) return (true); } if (astats->hstats == NULL) { astats->hstats = (malloc_huge_stats_t *)a0malloc(nhclasses * - sizeof(malloc_huge_stats_t)); + sizeof(malloc_huge_stats_t), false); if (astats->hstats == NULL) return (true); } @@ -627,7 +627,7 @@ ctl_grow(void) /* Allocate extended arena stats. */ astats = (ctl_arena_stats_t *)a0malloc((ctl_stats.narenas + 2) * - sizeof(ctl_arena_stats_t)); + sizeof(ctl_arena_stats_t), false); if (astats == NULL) return (true); @@ -636,7 +636,7 @@ ctl_grow(void) sizeof(ctl_arena_stats_t)); memset(&astats[ctl_stats.narenas + 1], 0, sizeof(ctl_arena_stats_t)); if (ctl_arena_init(&astats[ctl_stats.narenas + 1])) { - a0free(astats); + a0dalloc(astats); return (true); } /* Swap merged stats to their new location. 
*/ @@ -649,7 +649,7 @@ ctl_grow(void) memcpy(&astats[ctl_stats.narenas + 1], &tstats, sizeof(ctl_arena_stats_t)); } - a0free(ctl_stats.arenas); + a0dalloc(ctl_stats.arenas); ctl_stats.arenas = astats; ctl_stats.narenas++; @@ -723,7 +723,7 @@ ctl_init(void) */ ctl_stats.narenas = narenas_total_get(); ctl_stats.arenas = (ctl_arena_stats_t *)a0malloc( - (ctl_stats.narenas + 1) * sizeof(ctl_arena_stats_t)); + (ctl_stats.narenas + 1) * sizeof(ctl_arena_stats_t), false); if (ctl_stats.arenas == NULL) { ret = true; goto label_return; @@ -742,12 +742,12 @@ ctl_init(void) if (ctl_arena_init(&ctl_stats.arenas[i])) { unsigned j; for (j = 0; j < i; j++) { - a0free( + a0dalloc( ctl_stats.arenas[j].lstats); - a0free( + a0dalloc( ctl_stats.arenas[j].hstats); } - a0free(ctl_stats.arenas); + a0dalloc(ctl_stats.arenas); ctl_stats.arenas = NULL; ret = true; goto label_return; diff --git a/src/jemalloc.c b/src/jemalloc.c index c53129a..632c8d3 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -62,8 +62,13 @@ static unsigned narenas_total; static arena_t *a0; /* arenas[0]; read-only after initialization. */ static unsigned narenas_auto; /* Read-only after initialization. */ -/* Set to true once the allocator has been initialized. */ -static bool malloc_initialized = false; +typedef enum { + malloc_init_uninitialized = 3, + malloc_init_a0_initialized = 2, + malloc_init_recursible = 1, + malloc_init_initialized = 0 /* Common case --> jnz. */ +} malloc_init_t; +static malloc_init_t malloc_init_state = malloc_init_uninitialized; JEMALLOC_ALIGNED(CACHELINE) const size_t index2size_tab[NSIZES] = { @@ -218,6 +223,7 @@ typedef struct { * definition. */ +static bool malloc_init_hard_a0(void); static bool malloc_init_hard(void); /******************************************************************************/ @@ -225,6 +231,13 @@ static bool malloc_init_hard(void); * Begin miscellaneous support functions. */ +JEMALLOC_ALWAYS_INLINE_C bool +malloc_initialized(void) +{ + + return (malloc_init_state == malloc_init_initialized); +} + JEMALLOC_ALWAYS_INLINE_C void malloc_thread_init(void) { @@ -243,10 +256,19 @@ malloc_thread_init(void) } JEMALLOC_ALWAYS_INLINE_C bool +malloc_init_a0(void) +{ + + if (unlikely(malloc_init_state == malloc_init_uninitialized)) + return (malloc_init_hard_a0()); + return (false); +} + +JEMALLOC_ALWAYS_INLINE_C bool malloc_init(void) { - if (unlikely(!malloc_initialized) && malloc_init_hard()) + if (unlikely(!malloc_initialized()) && malloc_init_hard()) return (true); malloc_thread_init(); @@ -254,10 +276,8 @@ malloc_init(void) } /* - * The a0*() functions are used instead of i[mcd]alloc() in bootstrap-sensitive - * situations that cannot tolerate TLS variable access. These functions are - * also exposed for use in static binaries on FreeBSD, hence the old-style - * malloc() API. + * The a0*() functions are used instead of i[mcd]alloc() in situations that + * cannot tolerate TLS variable access. 
*/ arena_t * @@ -269,16 +289,13 @@ a0get(void) } static void * -a0alloc(size_t size, bool zero) +a0imalloc(size_t size, bool zero) { void *ret; - if (unlikely(malloc_init())) + if (unlikely(malloc_init_a0())) return (NULL); - if (size == 0) - size = 1; - if (likely(size <= arena_maxclass)) ret = arena_malloc(NULL, a0get(), size, zero, false); else @@ -287,33 +304,70 @@ a0alloc(size_t size, bool zero) return (ret); } +static void +a0idalloc(void *ptr) +{ + arena_chunk_t *chunk; + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) + arena_dalloc(NULL, chunk, ptr, false); + else + huge_dalloc(NULL, ptr, false); +} + void * -a0malloc(size_t size) +a0malloc(size_t size, bool zero) { - return (a0alloc(size, false)); + return (a0imalloc(size, zero)); } +void +a0dalloc(void *ptr) +{ + + a0idalloc(ptr); +} + +/* + * FreeBSD's libc uses the bootstrap_*() functions in bootstrap-senstive + * situations that cannot tolerate TLS variable access (TLS allocation and very + * early internal data structure initialization). + */ + void * -a0calloc(size_t num, size_t size) +bootstrap_malloc(size_t size) { - return (a0alloc(num * size, true)); + if (unlikely(size == 0)) + size = 1; + + return (a0imalloc(size, false)); +} + +void * +bootstrap_calloc(size_t num, size_t size) +{ + size_t num_size; + + num_size = num * size; + if (unlikely(num_size == 0)) { + assert(num == 0 || size == 0); + num_size = 1; + } + + return (a0imalloc(num_size, true)); } void -a0free(void *ptr) +bootstrap_free(void *ptr) { - arena_chunk_t *chunk; - if (ptr == NULL) + if (unlikely(ptr == NULL)) return; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - arena_dalloc(NULL, chunk, ptr, false); - else - huge_dalloc(NULL, ptr, false); + a0idalloc(ptr); } /* Create a new arena and insert it into the arenas array at index ind. */ @@ -328,7 +382,7 @@ arena_init_locked(unsigned ind) unsigned narenas_new = narenas_total + 1; arena_t **arenas_new = (arena_t **)a0malloc(CACHELINE_CEILING(narenas_new * - sizeof(arena_t *))); + sizeof(arena_t *)), false); if (arenas_new == NULL) return (NULL); memcpy(arenas_new, arenas, narenas_total * sizeof(arena_t *)); @@ -338,7 +392,7 @@ arena_init_locked(unsigned ind) * base_alloc()). */ if (narenas_total != narenas_auto) - a0free(arenas); + a0dalloc(arenas); arenas = arenas_new; narenas_total = narenas_new; } @@ -449,7 +503,7 @@ arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing) /* Deallocate old cache if it's too small. */ if (arenas_cache != NULL && narenas_cache < narenas_actual) { - a0free(arenas_cache); + a0dalloc(arenas_cache); arenas_cache = NULL; narenas_cache = 0; tsd_arenas_cache_set(tsd, arenas_cache); @@ -465,7 +519,7 @@ arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing) if (!*arenas_cache_bypassp) { *arenas_cache_bypassp = true; arenas_cache = (arena_t **)a0malloc(sizeof(arena_t *) * - narenas_cache); + narenas_cache, false); *arenas_cache_bypassp = false; } else arenas_cache = NULL; @@ -602,7 +656,7 @@ arenas_cache_cleanup(tsd_t *tsd) arenas_cache = tsd_arenas_cache_get(tsd); if (arenas_cache != NULL) - a0free(arenas_cache); + a0dalloc(arenas_cache); } void @@ -1091,19 +1145,18 @@ malloc_conf_init(void) } } +/* init_lock must be held. 
*/ static bool -malloc_init_hard(void) +malloc_init_hard_needed(void) { - arena_t *init_arenas[1]; - malloc_mutex_lock(&init_lock); - if (malloc_initialized || IS_INITIALIZER) { + if (malloc_initialized() || (IS_INITIALIZER && malloc_init_state == + malloc_init_recursible)) { /* * Another thread initialized the allocator before this one * acquired init_lock, or this thread is the initializing * thread, and it is recursively allocating. */ - malloc_mutex_unlock(&init_lock); return (false); } #ifdef JEMALLOC_THREADED_INIT @@ -1113,23 +1166,23 @@ malloc_init_hard(void) malloc_mutex_unlock(&init_lock); CPU_SPINWAIT; malloc_mutex_lock(&init_lock); - } while (!malloc_initialized); - malloc_mutex_unlock(&init_lock); + } while (!malloc_initialized()); return (false); } #endif - malloc_initializer = INITIALIZER; + return (true); +} - if (malloc_tsd_boot0()) { - malloc_mutex_unlock(&init_lock); - return (true); - } +/* init_lock must be held. */ +static bool +malloc_init_hard_a0_locked(void) +{ + + malloc_initializer = INITIALIZER; if (config_prof) prof_boot0(); - malloc_conf_init(); - if (opt_stats_print) { /* Print statistics at exit. */ if (atexit(stats_print_atexit) != 0) { @@ -1138,68 +1191,60 @@ malloc_init_hard(void) abort(); } } - - if (base_boot()) { - malloc_mutex_unlock(&init_lock); + if (base_boot()) return (true); - } - - if (chunk_boot()) { - malloc_mutex_unlock(&init_lock); + if (chunk_boot()) return (true); - } - - if (ctl_boot()) { - malloc_mutex_unlock(&init_lock); + if (ctl_boot()) return (true); - } - if (config_prof) prof_boot1(); - arena_boot(); - - if (config_tcache && tcache_boot()) { - malloc_mutex_unlock(&init_lock); + if (config_tcache && tcache_boot()) return (true); - } - - if (huge_boot()) { - malloc_mutex_unlock(&init_lock); + if (huge_boot()) return (true); - } - - if (malloc_mutex_init(&arenas_lock)) { - malloc_mutex_unlock(&init_lock); + if (malloc_mutex_init(&arenas_lock)) return (true); - } - /* * Create enough scaffolding to allow recursive allocation in * malloc_ncpus(). */ narenas_total = narenas_auto = 1; - arenas = init_arenas; + arenas = &a0; memset(arenas, 0, sizeof(arena_t *) * narenas_auto); - /* * Initialize one arena here. The rest are lazily created in * arena_choose_hard(). */ - a0 = arena_init(0); - if (a0 == NULL) { - malloc_mutex_unlock(&init_lock); + if (arena_init(0) == NULL) return (true); - } + malloc_init_state = malloc_init_a0_initialized; + return (false); +} - if (config_prof && prof_boot2()) { - malloc_mutex_unlock(&init_lock); - return (true); - } +static bool +malloc_init_hard_a0(void) +{ + bool ret; + malloc_mutex_lock(&init_lock); + ret = malloc_init_hard_a0_locked(); + malloc_mutex_unlock(&init_lock); + return (ret); +} + +/* + * Initialize data structures which may trigger recursive allocation. + * + * init_lock must be held. + */ +static void +malloc_init_hard_recursible(void) +{ + + malloc_init_state = malloc_init_recursible; malloc_mutex_unlock(&init_lock); - /**********************************************************************/ - /* Recursive allocation may follow. */ ncpus = malloc_ncpus(); @@ -1213,15 +1258,16 @@ malloc_init_hard(void) abort(); } #endif - - /* Done recursively allocating. */ - /**********************************************************************/ malloc_mutex_lock(&init_lock); +} - if (mutex_boot()) { - malloc_mutex_unlock(&init_lock); +/* init_lock must be held. 
*/ +static bool +malloc_init_hard_finish(void) +{ + + if (mutex_boot()) return (true); - } if (opt_narenas == 0) { /* @@ -1248,22 +1294,53 @@ malloc_init_hard(void) /* Allocate and initialize arenas. */ arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas_total); - if (arenas == NULL) { - malloc_mutex_unlock(&init_lock); + if (arenas == NULL) return (true); - } /* * Zero the array. In practice, this should always be pre-zeroed, * since it was just mmap()ed, but let's be sure. */ memset(arenas, 0, sizeof(arena_t *) * narenas_total); /* Copy the pointer to the one arena that was already initialized. */ - arenas[0] = init_arenas[0]; + arenas[0] = a0; + + malloc_init_state = malloc_init_initialized; + return (false); +} + +static bool +malloc_init_hard(void) +{ + + malloc_mutex_lock(&init_lock); + if (!malloc_init_hard_needed()) { + malloc_mutex_unlock(&init_lock); + return (false); + } + + if (malloc_init_state != malloc_init_a0_initialized && + malloc_init_hard_a0_locked()) { + malloc_mutex_unlock(&init_lock); + return (true); + } + if (malloc_tsd_boot0()) { + malloc_mutex_unlock(&init_lock); + return (true); + } + if (config_prof && prof_boot2()) { + malloc_mutex_unlock(&init_lock); + return (true); + } + + malloc_init_hard_recursible(); + + if (malloc_init_hard_finish()) { + malloc_mutex_unlock(&init_lock); + return (true); + } - malloc_initialized = true; malloc_mutex_unlock(&init_lock); malloc_tsd_boot1(); - return (false); } @@ -1634,7 +1711,7 @@ ifree(tsd_t *tsd, void *ptr, bool try_tcache) UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); assert(ptr != NULL); - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); if (config_prof && opt_prof) { usize = isalloc(ptr, config_prof); @@ -1655,7 +1732,7 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, bool try_tcache) UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); assert(ptr != NULL); - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); if (config_prof && opt_prof) prof_free(tsd, ptr, usize); @@ -1688,7 +1765,7 @@ je_realloc(void *ptr, size_t size) } if (likely(ptr != NULL)) { - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); tsd = tsd_fetch(); @@ -2060,7 +2137,7 @@ je_rallocx(void *ptr, size_t size, int flags) assert(ptr != NULL); assert(size != 0); - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); tsd = tsd_fetch(); @@ -2200,7 +2277,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) assert(ptr != NULL); assert(size != 0); assert(SIZE_T_MAX - size >= extra); - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); tsd = tsd_fetch(); @@ -2234,7 +2311,7 @@ je_sallocx(const void *ptr, int flags) { size_t usize; - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); if (config_ivsalloc) @@ -2254,7 +2331,7 @@ je_dallocx(void *ptr, int flags) bool try_tcache; assert(ptr != NULL); - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); tsd = tsd_fetch(); if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { @@ -2296,7 +2373,7 @@ je_sdallocx(void *ptr, size_t size, int flags) size_t usize; assert(ptr != NULL); - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); usize = inallocx(size, flags); 
assert(usize == isalloc(ptr, config_prof)); @@ -2375,7 +2452,7 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) { size_t ret; - assert(malloc_initialized || IS_INITIALIZER); + assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); if (config_ivsalloc) @@ -2427,10 +2504,10 @@ _malloc_prefork(void) unsigned i; #ifdef JEMALLOC_MUTEX_INIT_CB - if (!malloc_initialized) + if (!malloc_initialized()) return; #endif - assert(malloc_initialized); + assert(malloc_initialized()); /* Acquire all mutexes in a safe order. */ ctl_prefork(); @@ -2456,10 +2533,10 @@ _malloc_postfork(void) unsigned i; #ifdef JEMALLOC_MUTEX_INIT_CB - if (!malloc_initialized) + if (!malloc_initialized()) return; #endif - assert(malloc_initialized); + assert(malloc_initialized()); /* Release all mutexes, now that fork() has completed. */ huge_postfork_parent(); @@ -2479,7 +2556,7 @@ jemalloc_postfork_child(void) { unsigned i; - assert(malloc_initialized); + assert(malloc_initialized()); /* Release all mutexes, now that fork() has completed. */ huge_postfork_child(); diff --git a/src/tsd.c b/src/tsd.c index 59253fe..00d8f95 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -15,14 +15,14 @@ void * malloc_tsd_malloc(size_t size) { - return (a0malloc(CACHELINE_CEILING(size))); + return (a0malloc(CACHELINE_CEILING(size), false)); } void malloc_tsd_dalloc(void *wrapper) { - a0free(wrapper); + a0dalloc(wrapper); } void -- cgit v0.12 From 228b2e92421d8cc7990e931e3144b6f1c3398501 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 22 Jan 2015 15:28:25 -0800 Subject: Document under what circumstances in-place resizing succeeds. This resolves #100. --- doc/jemalloc.xml.in | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 0148f03..858572d 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -533,6 +533,22 @@ for (i = 0; i < nbins; i++) { nearest multiple of the cacheline size, or specify cacheline alignment when allocating. + The realloc, + rallocx, and + xallocx functions may resize allocations + without moving them under limited circumstances. Unlike the + *allocx API, the standard API does not + officially round up the usable size of an allocation to the nearest size + class, so technically it is necessary to call + realloc to grow e.g. a 9-byte allocation to + 16 bytes, or shrink a 16-byte allocation to 9 bytes. Growth and shrinkage + trivially succeeds in place as long as the pre-size and post-size both round + up to the same size class. No other API guarantees are made regarding + in-place resizing, but the current implementation also tries to resize large + and huge allocations in place, as long as the pre-size and post-size are + both large or both huge. In such cases shrinkage always succeeds, but + growth only succeeds if the trailing memory is currently available. + Assuming 4 MiB chunks, 4 KiB pages, and a 16-byte quantum on a 64-bit system, the size classes in each category are as shown in . -- cgit v0.12 From 8afcaa9d8133f0a147820799222492d1c251d285 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 22 Jan 2015 16:03:00 -0800 Subject: Update copyright dates for 2015. 
--- COPYING | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/COPYING b/COPYING index bdda0fe..611968c 100644 --- a/COPYING +++ b/COPYING @@ -1,10 +1,10 @@ Unless otherwise specified, files in the jemalloc source distribution are subject to the following license: -------------------------------------------------------------------------------- -Copyright (C) 2002-2014 Jason Evans . +Copyright (C) 2002-2015 Jason Evans . All rights reserved. Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. -Copyright (C) 2009-2014 Facebook, Inc. All rights reserved. +Copyright (C) 2009-2015 Facebook, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -- cgit v0.12 From bec6a8da39e8cb7e59550541d429cff5e3dfb6d8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 22 Jan 2015 17:55:58 -0800 Subject: Implement the jemalloc-config script. This resolves #133. --- .gitignore | 1 + Makefile.in | 3 +- bin/jemalloc-config.in | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 10 +++++-- 4 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 bin/jemalloc-config.in diff --git a/.gitignore b/.gitignore index fd68315..5cd3e92 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ /*.gcov.* +/bin/jemalloc-config /bin/jemalloc.sh /config.stamp diff --git a/Makefile.in b/Makefile.in index c268d00..da397c3 100644 --- a/Makefile.in +++ b/Makefile.in @@ -73,7 +73,7 @@ endif LIBJEMALLOC := $(LIBPREFIX)jemalloc$(install_suffix) # Lists of files. -BINS := $(srcroot)bin/pprof $(objroot)bin/jemalloc.sh +BINS := $(srcroot)bin/pprof $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh C_HDRS := $(objroot)include/jemalloc/jemalloc$(install_suffix).h C_SRCS := $(srcroot)src/jemalloc.c $(srcroot)src/arena.c \ $(srcroot)src/atomic.c $(srcroot)src/base.c $(srcroot)src/bitmap.c \ @@ -408,6 +408,7 @@ clean: rm -f $(objroot)*.gcov.* distclean: clean + rm -f $(objroot)bin/jemalloc-config rm -f $(objroot)bin/jemalloc.sh rm -f $(objroot)config.log rm -f $(objroot)config.status diff --git a/bin/jemalloc-config.in b/bin/jemalloc-config.in new file mode 100644 index 0000000..b016c8d --- /dev/null +++ b/bin/jemalloc-config.in @@ -0,0 +1,79 @@ +#!/bin/sh + +usage() { + cat < +Options: + --help | -h : Print usage. + --version : Print jemalloc version. + --revision : Print shared library revision number. + --config : Print configure options used to build jemalloc. + --prefix : Print installation directory prefix. + --bindir : Print binary installation directory. + --datadir : Print data installation directory. + --includedir : Print include installation directory. + --libdir : Print library installation directory. + --mandir : Print manual page installation directory. + --cc : Print compiler used to build jemalloc. + --cflags : Print compiler flags used to build jemalloc. + --cppflags : Print preprocessor flags used to build jemalloc. + --ldflags : Print library flags used to build jemalloc. + --libs : Print libraries jemalloc was linked against. 
+EOF +} + +prefix="@prefix@" +exec_prefix="@exec_prefix@" + +case "$1" in +--help | -h) + usage + exit 0 + ;; +--version) + echo "@jemalloc_version@" + ;; +--revision) + echo "@rev@" + ;; +--config) + echo "@CONFIG@" + ;; +--prefix) + echo "@PREFIX@" + ;; +--bindir) + echo "@BINDIR@" + ;; +--datadir) + echo "@DATADIR@" + ;; +--includedir) + echo "@INCLUDEDIR@" + ;; +--libdir) + echo "@LIBDIR@" + ;; +--mandir) + echo "@MANDIR@" + ;; +--cc) + echo "@CC@" + ;; +--cflags) + echo "@CFLAGS@" + ;; +--cppflags) + echo "@CPPFLAGS@" + ;; +--ldflags) + echo "@LDFLAGS@ @EXTRA_LDFLAGS@" + ;; +--libs) + echo "@LIBS@" + ;; +*) + usage + exit 1 +esac diff --git a/configure.ac b/configure.ac index 95133c4..0a4f01e 100644 --- a/configure.ac +++ b/configure.ac @@ -43,6 +43,9 @@ AC_CACHE_CHECK([whether $1 is compilable], dnl ============================================================================ +CONFIG=`echo ${ac_configure_args} | sed -e "s#\'\([^ ]*\)\'#\1#g"` +AC_SUBST([CONFIG]) + dnl Library revision. rev=2 AC_SUBST([rev]) @@ -1585,7 +1588,7 @@ AC_CONFIG_HEADERS([$cfghdrs_tup]) dnl ============================================================================ dnl Generate outputs. -AC_CONFIG_FILES([$cfgoutputs_tup config.stamp bin/jemalloc.sh]) +AC_CONFIG_FILES([$cfgoutputs_tup config.stamp bin/jemalloc-config bin/jemalloc.sh]) AC_SUBST([cfgoutputs_in]) AC_SUBST([cfgoutputs_out]) AC_OUTPUT @@ -1596,9 +1599,10 @@ AC_MSG_RESULT([================================================================= AC_MSG_RESULT([jemalloc version : ${jemalloc_version}]) AC_MSG_RESULT([library revision : ${rev}]) AC_MSG_RESULT([]) +AC_MSG_RESULT([CONFIG : ${CONFIG}]) AC_MSG_RESULT([CC : ${CC}]) -AC_MSG_RESULT([CPPFLAGS : ${CPPFLAGS}]) AC_MSG_RESULT([CFLAGS : ${CFLAGS}]) +AC_MSG_RESULT([CPPFLAGS : ${CPPFLAGS}]) AC_MSG_RESULT([LDFLAGS : ${LDFLAGS}]) AC_MSG_RESULT([EXTRA_LDFLAGS : ${EXTRA_LDFLAGS}]) AC_MSG_RESULT([LIBS : ${LIBS}]) @@ -1609,9 +1613,9 @@ AC_MSG_RESULT([XSLROOT : ${XSLROOT}]) AC_MSG_RESULT([]) AC_MSG_RESULT([PREFIX : ${PREFIX}]) AC_MSG_RESULT([BINDIR : ${BINDIR}]) +AC_MSG_RESULT([DATADIR : ${DATADIR}]) AC_MSG_RESULT([INCLUDEDIR : ${INCLUDEDIR}]) AC_MSG_RESULT([LIBDIR : ${LIBDIR}]) -AC_MSG_RESULT([DATADIR : ${DATADIR}]) AC_MSG_RESULT([MANDIR : ${MANDIR}]) AC_MSG_RESULT([]) AC_MSG_RESULT([srcroot : ${srcroot}]) -- cgit v0.12 From ec98a44662a82aff30a54ed86bd9b24f36cfe67e Mon Sep 17 00:00:00 2001 From: Guilherme Goncalves Date: Fri, 23 Jan 2015 10:52:13 -0200 Subject: Use the correct type for opt.junk when printing stats. --- src/stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stats.c b/src/stats.c index 054f033..2b3da64 100644 --- a/src/stats.c +++ b/src/stats.c @@ -461,7 +461,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, OPT_WRITE_SIZE_T(narenas) OPT_WRITE_SSIZE_T(lg_dirty_mult) OPT_WRITE_BOOL(stats_print) - OPT_WRITE_BOOL(junk) + OPT_WRITE_CHAR_P(junk) OPT_WRITE_SIZE_T(quarantine) OPT_WRITE_BOOL(redzone) OPT_WRITE_BOOL(zero) -- cgit v0.12 From 4581b97809e7e545c38b996870a4e7284a620bc5 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 27 Nov 2014 17:22:36 -0200 Subject: Implement metadata statistics. There are three categories of metadata: - Base allocations are used for bootstrap-sensitive internal allocator data structures. - Arena chunk headers comprise pages which track the states of the non-metadata pages. - Internal allocations differ from application-originated allocations in that they are for internal use, and that they are omitted from heap profiles. 
The metadata statistics comprise the metadata categories as follows: - stats.metadata: All metadata -- base + arena chunk headers + internal allocations. - stats.arenas..metadata.mapped: Arena chunk headers. - stats.arenas..metadata.allocated: Internal allocations. This is reported separately from the other metadata statistics because it overlaps with the allocated and active statistics, whereas the other metadata statistics do not. Base allocations are not reported separately, though their magnitude can be computed by subtracting the arena-specific metadata. This resolves #163. --- doc/jemalloc.xml.in | 47 +++++++ include/jemalloc/internal/arena.h | 34 +++++ include/jemalloc/internal/base.h | 1 + include/jemalloc/internal/ctl.h | 1 + include/jemalloc/internal/huge.h | 1 + include/jemalloc/internal/jemalloc_internal.h.in | 156 ++++++++++++++++------- include/jemalloc/internal/private_symbols.txt | 10 ++ include/jemalloc/internal/stats.h | 7 + src/arena.c | 10 +- src/base.c | 15 +++ src/ctl.c | 30 ++++- src/huge.c | 113 +++++++--------- src/jemalloc.c | 37 ++---- src/prof.c | 30 +++-- src/quarantine.c | 11 +- src/stats.c | 14 +- src/tcache.c | 4 +- src/tsd.c | 2 +- 18 files changed, 356 insertions(+), 167 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 858572d..08fd4eb 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1773,6 +1773,21 @@ malloc_conf = "xmalloc:true";]]> entirely devoted to allocator metadata. + + + stats.metadata + (size_t) + r- + [] + + Total number of bytes dedicated to metadata, which + comprise base allocations used for bootstrap-sensitive internal + allocator data structures, arena chunk headers (see stats.arenas.<i>.metadata.mapped), + and internal allocations (see stats.arenas.<i>.metadata.allocated). + + stats.mapped @@ -1875,6 +1890,38 @@ malloc_conf = "xmalloc:true";]]> Number of mapped bytes. + + + stats.arenas.<i>.metadata.mapped + (size_t) + r- + [] + + Number of mapped bytes in arena chunk headers, which + track the states of the non-metadata pages. + + + + + stats.arenas.<i>.metadata.allocated + (size_t) + r- + [] + + Number of bytes dedicated to internal allocations. + Internal allocations differ from application-originated allocations in + that they are for internal use, and that they are omitted from heap + profiles. This statistic is reported separately from stats.metadata and + stats.arenas.<i>.metadata.mapped + because it overlaps with e.g. the stats.allocated and + stats.active + statistics, whereas the other metadata statistics do + not. 
+ + stats.arenas.<i>.npurge diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 1e19023..46367f6 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -437,6 +437,9 @@ void arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind, index_t binind, size_t flags); void arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind, size_t unzeroed); +void arena_metadata_allocated_add(arena_t *arena, size_t size); +void arena_metadata_allocated_sub(arena_t *arena, size_t size); +size_t arena_metadata_allocated_get(arena_t *arena); bool arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes); bool arena_prof_accum_locked(arena_t *arena, uint64_t accumbytes); bool arena_prof_accum(arena_t *arena, uint64_t accumbytes); @@ -448,6 +451,7 @@ prof_tctx_t *arena_prof_tctx_get(const void *ptr); void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, bool try_tcache); +arena_t *arena_aalloc(const void *ptr); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache); @@ -699,6 +703,27 @@ arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind, unzeroed); } +JEMALLOC_INLINE void +arena_metadata_allocated_add(arena_t *arena, size_t size) +{ + + atomic_add_z(&arena->stats.metadata_allocated, size); +} + +JEMALLOC_INLINE void +arena_metadata_allocated_sub(arena_t *arena, size_t size) +{ + + atomic_sub_z(&arena->stats.metadata_allocated, size); +} + +JEMALLOC_INLINE size_t +arena_metadata_allocated_get(arena_t *arena) +{ + + return (atomic_read_z(&arena->stats.metadata_allocated)); +} + JEMALLOC_INLINE bool arena_prof_accum_impl(arena_t *arena, uint64_t accumbytes) { @@ -952,6 +977,15 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, } } +JEMALLOC_ALWAYS_INLINE arena_t * +arena_aalloc(const void *ptr) +{ + arena_chunk_t *chunk; + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + return (chunk->arena); +} + /* Return the size of the allocation pointed to by ptr. 
*/ JEMALLOC_ALWAYS_INLINE size_t arena_salloc(const void *ptr, bool demote) diff --git a/include/jemalloc/internal/base.h b/include/jemalloc/internal/base.h index 3fb80b9..18b7a72 100644 --- a/include/jemalloc/internal/base.h +++ b/include/jemalloc/internal/base.h @@ -13,6 +13,7 @@ void *base_alloc(size_t size); void *base_calloc(size_t number, size_t size); extent_node_t *base_node_alloc(void); void base_node_dalloc(extent_node_t *node); +size_t base_allocated_get(void); bool base_boot(void); void base_prefork(void); void base_postfork_parent(void); diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index a3e899e..65617bc 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -52,6 +52,7 @@ struct ctl_arena_stats_s { struct ctl_stats_s { size_t allocated; size_t active; + size_t metadata; size_t mapped; struct { size_t current; /* stats_chunks.curchunks */ diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 39d8aa5..decb024 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -23,6 +23,7 @@ typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif void huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache); +arena_t *huge_aalloc(const void *ptr); size_t huge_salloc(const void *ptr); prof_tctx_t *huge_prof_tctx_get(const void *ptr); void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 4107860..a477855 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -404,8 +404,9 @@ extern size_t const index2size_tab[NSIZES]; extern uint8_t const size2index_tab[]; arena_t *a0get(void); -void *a0malloc(size_t size, bool zero); +void *a0malloc(size_t size); void a0dalloc(void *ptr); +size_t a0allocated(void); arena_t *arenas_extend(unsigned ind); arena_t *arena_init(unsigned ind); unsigned narenas_total_get(void); @@ -776,21 +777,27 @@ arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, #include "jemalloc/internal/quarantine.h" #ifndef JEMALLOC_ENABLE_INLINE +arena_t *iaalloc(const void *ptr); +size_t isalloc(const void *ptr, bool demote); +void *iallocztm(tsd_t *tsd, size_t size, bool zero, bool try_tcache, + bool is_metadata, arena_t *arena); void *imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena); void *imalloc(tsd_t *tsd, size_t size); void *icalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena); void *icalloc(tsd_t *tsd, size_t size); +void *ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + bool try_tcache, bool is_metadata, arena_t *arena); void *ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, arena_t *arena); void *ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero); -size_t isalloc(const void *ptr, bool demote); size_t ivsalloc(const void *ptr, bool demote); size_t u2rz(size_t usize); size_t p2rz(const void *ptr); +void idalloctm(tsd_t *tsd, void *ptr, bool try_tcache, bool is_metadata); void idalloct(tsd_t *tsd, void *ptr, bool try_tcache); -void isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache); void idalloc(tsd_t *tsd, void *ptr); void iqalloc(tsd_t *tsd, void *ptr, bool try_tcache); +void isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache); void isqalloc(tsd_t *tsd, void *ptr, size_t size, bool try_tcache); void 
*iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, @@ -805,45 +812,97 @@ bool ixalloc(void *ptr, size_t oldsize, size_t size, size_t extra, #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +JEMALLOC_ALWAYS_INLINE arena_t * +iaalloc(const void *ptr) +{ + arena_t *arena; + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) + arena = arena_aalloc(ptr); + else + arena = huge_aalloc(ptr); + + return (arena); +} + +/* + * Typical usage: + * void *ptr = [...] + * size_t sz = isalloc(ptr, config_prof); + */ +JEMALLOC_ALWAYS_INLINE size_t +isalloc(const void *ptr, bool demote) +{ + size_t ret; + arena_chunk_t *chunk; + + assert(ptr != NULL); + /* Demotion only makes sense if config_prof is true. */ + assert(config_prof || !demote); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) + ret = arena_salloc(ptr, demote); + else + ret = huge_salloc(ptr); + + return (ret); +} + JEMALLOC_ALWAYS_INLINE void * -imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) +iallocztm(tsd_t *tsd, size_t size, bool zero, bool try_tcache, bool is_metadata, + arena_t *arena) { + void *ret; assert(size != 0); if (likely(size <= arena_maxclass)) - return (arena_malloc(tsd, arena, size, false, try_tcache)); + ret = arena_malloc(tsd, arena, size, zero, try_tcache); else - return (huge_malloc(tsd, arena, size, false, try_tcache)); + ret = huge_malloc(tsd, arena, size, zero, try_tcache); + if (config_stats && is_metadata && likely(ret != NULL)) { + arena_metadata_allocated_add(iaalloc(ret), isalloc(ret, + config_prof)); + } + return (ret); +} + +JEMALLOC_ALWAYS_INLINE void * +imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) +{ + + return (iallocztm(tsd, size, false, try_tcache, false, arena)); } JEMALLOC_ALWAYS_INLINE void * imalloc(tsd_t *tsd, size_t size) { - return (imalloct(tsd, size, true, NULL)); + return (iallocztm(tsd, size, false, true, false, NULL)); } JEMALLOC_ALWAYS_INLINE void * icalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) { - if (likely(size <= arena_maxclass)) - return (arena_malloc(tsd, arena, size, true, try_tcache)); - else - return (huge_malloc(tsd, arena, size, true, try_tcache)); + return (iallocztm(tsd, size, true, try_tcache, false, arena)); } JEMALLOC_ALWAYS_INLINE void * icalloc(tsd_t *tsd, size_t size) { - return (icalloct(tsd, size, true, NULL)); + return (iallocztm(tsd, size, true, true, false, NULL)); } JEMALLOC_ALWAYS_INLINE void * -ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena) +ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + bool try_tcache, bool is_metadata, arena_t *arena) { void *ret; @@ -865,40 +924,28 @@ ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, try_tcache); } } - assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); + if (config_stats && is_metadata && likely(ret != NULL)) { + arena_metadata_allocated_add(iaalloc(ret), isalloc(ret, + config_prof)); + } return (ret); } JEMALLOC_ALWAYS_INLINE void * -ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) +ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, + arena_t *arena) { - return (ipalloct(tsd, usize, alignment, zero, true, NULL)); + return (ipallocztm(tsd, usize, alignment, zero, try_tcache, false, + arena)); } -/* - * Typical usage: - 
* void *ptr = [...] - * size_t sz = isalloc(ptr, config_prof); - */ -JEMALLOC_ALWAYS_INLINE size_t -isalloc(const void *ptr, bool demote) +JEMALLOC_ALWAYS_INLINE void * +ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) { - size_t ret; - arena_chunk_t *chunk; - - assert(ptr != NULL); - /* Demotion only makes sense if config_prof is true. */ - assert(config_prof || !demote); - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - ret = arena_salloc(ptr, demote); - else - ret = huge_salloc(ptr); - return (ret); + return (ipallocztm(tsd, usize, alignment, zero, true, false, NULL)); } JEMALLOC_ALWAYS_INLINE size_t @@ -935,11 +982,15 @@ p2rz(const void *ptr) } JEMALLOC_ALWAYS_INLINE void -idalloct(tsd_t *tsd, void *ptr, bool try_tcache) +idalloctm(tsd_t *tsd, void *ptr, bool try_tcache, bool is_metadata) { arena_chunk_t *chunk; assert(ptr != NULL); + if (config_stats && is_metadata) { + arena_metadata_allocated_sub(iaalloc(ptr), isalloc(ptr, + config_prof)); + } chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (likely(chunk != ptr)) @@ -949,24 +1000,17 @@ idalloct(tsd_t *tsd, void *ptr, bool try_tcache) } JEMALLOC_ALWAYS_INLINE void -isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) +idalloct(tsd_t *tsd, void *ptr, bool try_tcache) { - arena_chunk_t *chunk; - - assert(ptr != NULL); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - arena_sdalloc(tsd, chunk, ptr, size, try_tcache); - else - huge_dalloc(tsd, ptr, try_tcache); + idalloctm(tsd, ptr, try_tcache, false); } JEMALLOC_ALWAYS_INLINE void idalloc(tsd_t *tsd, void *ptr) { - idalloct(tsd, ptr, true); + idalloctm(tsd, ptr, true, false); } JEMALLOC_ALWAYS_INLINE void @@ -976,7 +1020,21 @@ iqalloc(tsd_t *tsd, void *ptr, bool try_tcache) if (config_fill && unlikely(opt_quarantine)) quarantine(tsd, ptr); else - idalloct(tsd, ptr, try_tcache); + idalloctm(tsd, ptr, try_tcache, false); +} + +JEMALLOC_ALWAYS_INLINE void +isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) +{ + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) + arena_sdalloc(tsd, chunk, ptr, size, try_tcache); + else + huge_dalloc(tsd, ptr, try_tcache); } JEMALLOC_ALWAYS_INLINE void diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 1aaf80b..dfa8755 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -1,6 +1,7 @@ a0dalloc a0get a0malloc +arena_aalloc arena_get arena_get_hard arena_alloc_junk_small @@ -50,6 +51,9 @@ arena_mapbitsp_read arena_mapbitsp_write arena_maxclass arena_maxrun +arena_metadata_allocated_add +arena_metadata_allocated_get +arena_metadata_allocated_sub arena_migrate arena_miscelm_get arena_miscelm_to_pageind @@ -90,6 +94,7 @@ atomic_sub_uint32 atomic_sub_uint64 atomic_sub_z base_alloc +base_allocated_get base_boot base_calloc base_node_alloc @@ -205,6 +210,7 @@ hash_rotl_64 hash_x64_128 hash_x86_128 hash_x86_32 +huge_aalloc huge_allocated huge_boot huge_dalloc @@ -221,10 +227,13 @@ huge_prof_tctx_set huge_ralloc huge_ralloc_no_move huge_salloc +iaalloc +iallocztm icalloc icalloct idalloc idalloct +idalloctm imalloc imalloct in_valgrind @@ -234,6 +243,7 @@ index2size_lookup index2size_tab ipalloc ipalloct +ipallocztm iqalloc iralloc iralloct diff --git a/include/jemalloc/internal/stats.h b/include/jemalloc/internal/stats.h index d8600ed..7cba77b 100644 --- 
a/include/jemalloc/internal/stats.h +++ b/include/jemalloc/internal/stats.h @@ -111,6 +111,13 @@ struct arena_stats_s { uint64_t nmadvise; uint64_t purged; + /* + * Number of bytes currently mapped purely for metadata purposes, and + * number of bytes currently allocated for internal metadata. + */ + size_t metadata_mapped; + size_t metadata_allocated; /* Protected via atomic_*_z(). */ + /* Per-size-category statistics. */ size_t allocated_large; uint64_t nmalloc_large; diff --git a/src/arena.c b/src/arena.c index 1eb4000..984b8ad 100644 --- a/src/arena.c +++ b/src/arena.c @@ -405,8 +405,10 @@ arena_chunk_alloc_internal(arena_t *arena, size_t size, size_t alignment, chunk = (arena_chunk_t *)chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, NULL, size, alignment, zero); malloc_mutex_lock(&arena->lock); - if (config_stats && chunk != NULL) + if (config_stats && chunk != NULL) { arena->stats.mapped += chunksize; + arena->stats.metadata_mapped += (map_bias << LG_PAGE); + } return (chunk); } @@ -514,8 +516,10 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) malloc_mutex_unlock(&arena->lock); chunk_dalloc((void *)spare, chunksize, arena->ind); malloc_mutex_lock(&arena->lock); - if (config_stats) + if (config_stats) { arena->stats.mapped -= chunksize; + arena->stats.metadata_mapped -= (map_bias << LG_PAGE); + } } else arena->spare = chunk; } @@ -2273,6 +2277,8 @@ arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, astats->npurge += arena->stats.npurge; astats->nmadvise += arena->stats.nmadvise; astats->purged += arena->stats.purged; + astats->metadata_mapped += arena->stats.metadata_mapped; + astats->metadata_allocated += arena_metadata_allocated_get(arena); astats->allocated_large += arena->stats.allocated_large; astats->nmalloc_large += arena->stats.nmalloc_large; astats->ndalloc_large += arena->stats.ndalloc_large; diff --git a/src/base.c b/src/base.c index 409c7bb..22f3613 100644 --- a/src/base.c +++ b/src/base.c @@ -16,6 +16,8 @@ static void *base_next_addr; static void *base_past_addr; /* Addr immediately past base_pages. */ static extent_node_t *base_nodes; +static size_t base_allocated; + /******************************************************************************/ static bool @@ -54,6 +56,8 @@ base_alloc(size_t size) /* Allocate. 
*/ ret = base_next_addr; base_next_addr = (void *)((uintptr_t)base_next_addr + csize); + if (config_stats) + base_allocated += csize; malloc_mutex_unlock(&base_mtx); JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, csize); @@ -102,6 +106,17 @@ base_node_dalloc(extent_node_t *node) malloc_mutex_unlock(&base_mtx); } +size_t +base_allocated_get(void) +{ + size_t ret; + + malloc_mutex_lock(&base_mtx); + ret = base_allocated; + malloc_mutex_unlock(&base_mtx); + return (ret); +} + bool base_boot(void) { diff --git a/src/ctl.c b/src/ctl.c index 6b95584..b65af52 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -183,10 +183,13 @@ CTL_PROTO(stats_arenas_i_mapped) CTL_PROTO(stats_arenas_i_npurge) CTL_PROTO(stats_arenas_i_nmadvise) CTL_PROTO(stats_arenas_i_purged) +CTL_PROTO(stats_arenas_i_metadata_mapped) +CTL_PROTO(stats_arenas_i_metadata_allocated) INDEX_PROTO(stats_arenas_i) CTL_PROTO(stats_cactive) CTL_PROTO(stats_allocated) CTL_PROTO(stats_active) +CTL_PROTO(stats_metadata) CTL_PROTO(stats_mapped) /******************************************************************************/ @@ -355,6 +358,11 @@ static const ctl_named_node_t stats_chunks_node[] = { {NAME("high"), CTL(stats_chunks_high)} }; +static const ctl_named_node_t stats_arenas_i_metadata_node[] = { + {NAME("mapped"), CTL(stats_arenas_i_metadata_mapped)}, + {NAME("allocated"), CTL(stats_arenas_i_metadata_allocated)} +}; + static const ctl_named_node_t stats_arenas_i_small_node[] = { {NAME("allocated"), CTL(stats_arenas_i_small_allocated)}, {NAME("nmalloc"), CTL(stats_arenas_i_small_nmalloc)}, @@ -432,6 +440,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = { {NAME("npurge"), CTL(stats_arenas_i_npurge)}, {NAME("nmadvise"), CTL(stats_arenas_i_nmadvise)}, {NAME("purged"), CTL(stats_arenas_i_purged)}, + {NAME("metadata"), CHILD(named, stats_arenas_i_metadata)}, {NAME("small"), CHILD(named, stats_arenas_i_small)}, {NAME("large"), CHILD(named, stats_arenas_i_large)}, {NAME("huge"), CHILD(named, stats_arenas_i_huge)}, @@ -451,6 +460,7 @@ static const ctl_named_node_t stats_node[] = { {NAME("cactive"), CTL(stats_cactive)}, {NAME("allocated"), CTL(stats_allocated)}, {NAME("active"), CTL(stats_active)}, + {NAME("metadata"), CTL(stats_metadata)}, {NAME("mapped"), CTL(stats_mapped)}, {NAME("chunks"), CHILD(named, stats_chunks)}, {NAME("arenas"), CHILD(indexed, stats_arenas)} @@ -484,14 +494,14 @@ ctl_arena_init(ctl_arena_stats_t *astats) if (astats->lstats == NULL) { astats->lstats = (malloc_large_stats_t *)a0malloc(nlclasses * - sizeof(malloc_large_stats_t), false); + sizeof(malloc_large_stats_t)); if (astats->lstats == NULL) return (true); } if (astats->hstats == NULL) { astats->hstats = (malloc_huge_stats_t *)a0malloc(nhclasses * - sizeof(malloc_huge_stats_t), false); + sizeof(malloc_huge_stats_t)); if (astats->hstats == NULL) return (true); } @@ -551,6 +561,9 @@ ctl_arena_stats_smerge(ctl_arena_stats_t *sstats, ctl_arena_stats_t *astats) sstats->astats.nmadvise += astats->astats.nmadvise; sstats->astats.purged += astats->astats.purged; + sstats->astats.metadata_mapped += astats->astats.metadata_mapped; + sstats->astats.metadata_allocated += astats->astats.metadata_allocated; + sstats->allocated_small += astats->allocated_small; sstats->nmalloc_small += astats->nmalloc_small; sstats->ndalloc_small += astats->ndalloc_small; @@ -627,7 +640,7 @@ ctl_grow(void) /* Allocate extended arena stats. 
*/ astats = (ctl_arena_stats_t *)a0malloc((ctl_stats.narenas + 2) * - sizeof(ctl_arena_stats_t), false); + sizeof(ctl_arena_stats_t)); if (astats == NULL) return (true); @@ -704,6 +717,10 @@ ctl_refresh(void) + ctl_stats.arenas[ctl_stats.narenas].astats.allocated_huge; ctl_stats.active = (ctl_stats.arenas[ctl_stats.narenas].pactive << LG_PAGE); + ctl_stats.metadata = base_allocated_get() + + ctl_stats.arenas[ctl_stats.narenas].astats.metadata_mapped + + ctl_stats.arenas[ctl_stats.narenas].astats + .metadata_allocated; ctl_stats.mapped = (ctl_stats.chunks.current << opt_lg_chunk); } @@ -723,7 +740,7 @@ ctl_init(void) */ ctl_stats.narenas = narenas_total_get(); ctl_stats.arenas = (ctl_arena_stats_t *)a0malloc( - (ctl_stats.narenas + 1) * sizeof(ctl_arena_stats_t), false); + (ctl_stats.narenas + 1) * sizeof(ctl_arena_stats_t)); if (ctl_stats.arenas == NULL) { ret = true; goto label_return; @@ -1806,6 +1823,7 @@ CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t) CTL_RO_CGEN(config_stats, stats_cactive, &stats_cactive, size_t *) CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats.allocated, size_t) CTL_RO_CGEN(config_stats, stats_active, ctl_stats.active, size_t) +CTL_RO_CGEN(config_stats, stats_metadata, ctl_stats.metadata, size_t) CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t) CTL_RO_CGEN(config_stats, stats_chunks_current, ctl_stats.chunks.current, @@ -1825,6 +1843,10 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_nmadvise, ctl_stats.arenas[mib[2]].astats.nmadvise, uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_purged, ctl_stats.arenas[mib[2]].astats.purged, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_mapped, + ctl_stats.arenas[mib[2]].astats.metadata_mapped, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_allocated, + ctl_stats.arenas[mib[2]].astats.metadata_allocated, size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated, ctl_stats.arenas[mib[2]].allocated_small, size_t) diff --git a/src/huge.c b/src/huge.c index 416cb17..c4d1ebc 100644 --- a/src/huge.c +++ b/src/huge.c @@ -37,8 +37,8 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, /* Allocate one or more contiguous chunks for this request. */ /* Allocate an extent node with which to track the chunk. */ - node = ipalloct(tsd, CACHELINE_CEILING(sizeof(extent_node_t)), - CACHELINE, false, try_tcache, NULL); + node = ipallocztm(tsd, CACHELINE_CEILING(sizeof(extent_node_t)), + CACHELINE, false, try_tcache, true, arena); if (node == NULL) return (NULL); @@ -50,7 +50,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, arena = arena_choose(tsd, arena); if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(arena, usize, alignment, &is_zeroed)) == NULL) { - idalloct(tsd, node, try_tcache); + idalloctm(tsd, node, try_tcache, true); return (NULL); } @@ -73,6 +73,33 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, return (ret); } +static extent_node_t * +huge_node_locked(const void *ptr) +{ + extent_node_t *node, key; + + /* Extract from tree of huge allocations. 
*/ + key.addr = __DECONST(void *, ptr); + node = extent_tree_ad_search(&huge, &key); + assert(node != NULL); + assert(node->addr == ptr); + malloc_mutex_unlock(&huge_mtx); + + return (node); +} + +static extent_node_t * +huge_node(const void *ptr) +{ + extent_node_t *node; + + malloc_mutex_lock(&huge_mtx); + node = huge_node_locked(ptr); + malloc_mutex_unlock(&huge_mtx); + + return (node); +} + #ifdef JEMALLOC_JET #undef huge_dalloc_junk #define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl) @@ -102,7 +129,7 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, { size_t usize_next; bool zeroed; - extent_node_t *node, key; + extent_node_t *node; arena_t *arena; /* Increase usize to incorporate extra. */ @@ -126,10 +153,7 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, zeroed = true; malloc_mutex_lock(&huge_mtx); - key.addr = ptr; - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - assert(node->addr == ptr); + node = huge_node_locked(ptr); arena = node->arena; /* Update the size of the huge allocation. */ assert(node->size != usize); @@ -159,7 +183,7 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) { size_t sdiff; bool zeroed; - extent_node_t *node, key; + extent_node_t *node; arena_t *arena; sdiff = CHUNK_CEILING(usize) - usize; @@ -172,10 +196,7 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) } malloc_mutex_lock(&huge_mtx); - key.addr = ptr; - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - assert(node->addr == ptr); + node = huge_node_locked(ptr); arena = node->arena; /* Update the size of the huge allocation. */ node->size = usize; @@ -190,7 +211,7 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) static bool huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { size_t usize; - extent_node_t *node, key; + extent_node_t *node; arena_t *arena; bool is_zeroed_subchunk, is_zeroed_chunk; @@ -201,10 +222,7 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { } malloc_mutex_lock(&huge_mtx); - key.addr = ptr; - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - assert(node->addr == ptr); + node = huge_node_locked(ptr); arena = node->arena; is_zeroed_subchunk = node->zeroed; malloc_mutex_unlock(&huge_mtx); @@ -342,77 +360,44 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, void huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache) { - extent_node_t *node, key; + extent_node_t *node; malloc_mutex_lock(&huge_mtx); - /* Extract from tree of huge allocations. */ - key.addr = ptr; - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - assert(node->addr == ptr); + node = huge_node_locked(ptr); extent_tree_ad_remove(&huge, node); malloc_mutex_unlock(&huge_mtx); huge_dalloc_junk(node->addr, node->size); arena_chunk_dalloc_huge(node->arena, node->addr, node->size); - idalloct(tsd, node, try_tcache); + idalloctm(tsd, node, try_tcache, true); } -size_t -huge_salloc(const void *ptr) +arena_t * +huge_aalloc(const void *ptr) { - size_t ret; - extent_node_t *node, key; - - malloc_mutex_lock(&huge_mtx); - - /* Extract from tree of huge allocations. 
*/ - key.addr = __DECONST(void *, ptr); - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - ret = node->size; + return (huge_node(ptr)->arena); +} - malloc_mutex_unlock(&huge_mtx); +size_t +huge_salloc(const void *ptr) +{ - return (ret); + return (huge_node(ptr)->size); } prof_tctx_t * huge_prof_tctx_get(const void *ptr) { - prof_tctx_t *ret; - extent_node_t *node, key; - - malloc_mutex_lock(&huge_mtx); - /* Extract from tree of huge allocations. */ - key.addr = __DECONST(void *, ptr); - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - - ret = node->prof_tctx; - - malloc_mutex_unlock(&huge_mtx); - - return (ret); + return (huge_node(ptr)->prof_tctx); } void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { - extent_node_t *node, key; - malloc_mutex_lock(&huge_mtx); - - /* Extract from tree of huge allocations. */ - key.addr = __DECONST(void *, ptr); - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - - node->prof_tctx = tctx; - - malloc_mutex_unlock(&huge_mtx); + huge_node(ptr)->prof_tctx = tctx; } bool diff --git a/src/jemalloc.c b/src/jemalloc.c index 632c8d3..d1fa674 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -289,45 +289,34 @@ a0get(void) } static void * -a0imalloc(size_t size, bool zero) +a0ialloc(size_t size, bool zero, bool is_metadata) { - void *ret; if (unlikely(malloc_init_a0())) return (NULL); - if (likely(size <= arena_maxclass)) - ret = arena_malloc(NULL, a0get(), size, zero, false); - else - ret = huge_malloc(NULL, a0get(), size, zero, false); - - return (ret); + return (iallocztm(NULL, size, zero, false, is_metadata, a0get())); } static void -a0idalloc(void *ptr) +a0idalloc(void *ptr, bool is_metadata) { - arena_chunk_t *chunk; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - arena_dalloc(NULL, chunk, ptr, false); - else - huge_dalloc(NULL, ptr, false); + idalloctm(NULL, ptr, false, is_metadata); } void * -a0malloc(size_t size, bool zero) +a0malloc(size_t size) { - return (a0imalloc(size, zero)); + return (a0ialloc(size, false, true)); } void a0dalloc(void *ptr) { - a0idalloc(ptr); + a0idalloc(ptr, true); } /* @@ -343,7 +332,7 @@ bootstrap_malloc(size_t size) if (unlikely(size == 0)) size = 1; - return (a0imalloc(size, false)); + return (a0ialloc(size, false, false)); } void * @@ -357,7 +346,7 @@ bootstrap_calloc(size_t num, size_t size) num_size = 1; } - return (a0imalloc(num_size, true)); + return (a0ialloc(num_size, true, false)); } void @@ -367,7 +356,7 @@ bootstrap_free(void *ptr) if (unlikely(ptr == NULL)) return; - a0idalloc(ptr); + a0idalloc(ptr, false); } /* Create a new arena and insert it into the arenas array at index ind. 
*/ @@ -382,7 +371,7 @@ arena_init_locked(unsigned ind) unsigned narenas_new = narenas_total + 1; arena_t **arenas_new = (arena_t **)a0malloc(CACHELINE_CEILING(narenas_new * - sizeof(arena_t *)), false); + sizeof(arena_t *))); if (arenas_new == NULL) return (NULL); memcpy(arenas_new, arenas, narenas_total * sizeof(arena_t *)); @@ -519,7 +508,7 @@ arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing) if (!*arenas_cache_bypassp) { *arenas_cache_bypassp = true; arenas_cache = (arena_t **)a0malloc(sizeof(arena_t *) * - narenas_cache, false); + narenas_cache); *arenas_cache_bypassp = false; } else arenas_cache = NULL; @@ -1202,6 +1191,8 @@ malloc_init_hard_a0_locked(void) arena_boot(); if (config_tcache && tcache_boot()) return (true); + if (config_tcache && tcache_boot()) + malloc_mutex_unlock(&init_lock); if (huge_boot()) return (true); if (malloc_mutex_init(&arenas_lock)) diff --git a/src/prof.c b/src/prof.c index 1103cc9..06f5499 100644 --- a/src/prof.c +++ b/src/prof.c @@ -532,8 +532,8 @@ prof_gctx_create(tsd_t *tsd, prof_bt_t *bt) /* * Create a single allocation that has space for vec of length bt->len. */ - prof_gctx_t *gctx = (prof_gctx_t *)imalloc(tsd, offsetof(prof_gctx_t, - vec) + (bt->len * sizeof(void *))); + prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsd, offsetof(prof_gctx_t, + vec) + (bt->len * sizeof(void *)), false, true, true, NULL); if (gctx == NULL) return (NULL); gctx->lock = prof_gctx_mutex_choose(); @@ -574,7 +574,7 @@ prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, prof_gctx_t *gctx, prof_leave(tsd, tdata_self); /* Destroy gctx. */ malloc_mutex_unlock(gctx->lock); - idalloc(tsd, gctx); + idalloctm(tsd, gctx, true, true); } else { /* * Compensate for increment in prof_tctx_destroy() or @@ -674,7 +674,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) prof_tdata_destroy(tsd, tdata, false); if (destroy_tctx) - idalloc(tsd, tctx); + idalloctm(tsd, tctx, true, true); } static bool @@ -703,7 +703,7 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { /* OOM. */ prof_leave(tsd, tdata); - idalloc(tsd, gctx.v); + idalloctm(tsd, gctx.v, true, true); return (true); } new_gctx = true; @@ -760,7 +760,8 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) return (NULL); /* Link a prof_tctx_t into gctx for this thread. */ - ret.v = imalloc(tsd, sizeof(prof_tctx_t)); + ret.v = iallocztm(tsd, sizeof(prof_tctx_t), false, true, true, + NULL); if (ret.p == NULL) { if (new_gctx) prof_gctx_try_destroy(tsd, tdata, gctx, tdata); @@ -778,7 +779,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) if (error) { if (new_gctx) prof_gctx_try_destroy(tsd, tdata, gctx, tdata); - idalloc(tsd, ret.v); + idalloctm(tsd, ret.v, true, true); return (NULL); } malloc_mutex_lock(gctx->lock); @@ -1158,7 +1159,7 @@ prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs) to_destroy); tctx_tree_remove(&gctx->tctxs, to_destroy); - idalloc(tsd, to_destroy); + idalloctm(tsd, to_destroy, true, true); } else next = NULL; } while (next != NULL); @@ -1640,7 +1641,8 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, cassert(config_prof); /* Initialize an empty cache for this thread. 
*/ - tdata = (prof_tdata_t *)imalloc(tsd, sizeof(prof_tdata_t)); + tdata = (prof_tdata_t *)iallocztm(tsd, sizeof(prof_tdata_t), false, + true, true, NULL); if (tdata == NULL) return (NULL); @@ -1653,7 +1655,7 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { - idalloc(tsd, tdata); + idalloctm(tsd, tdata, true, true); return (NULL); } @@ -1706,9 +1708,9 @@ prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, tdata_tree_remove(&tdatas, tdata); if (tdata->thread_name != NULL) - idalloc(tsd, tdata->thread_name); + idalloctm(tsd, tdata->thread_name, true, true); ckh_delete(tsd, &tdata->bt2tctx); - idalloc(tsd, tdata); + idalloctm(tsd, tdata, true, true); } static void @@ -1869,7 +1871,7 @@ prof_thread_name_alloc(tsd_t *tsd, const char *thread_name) if (size == 1) return (""); - ret = imalloc(tsd, size); + ret = iallocztm(tsd, size, false, true, true, NULL); if (ret == NULL) return (NULL); memcpy(ret, thread_name, size); @@ -1901,7 +1903,7 @@ prof_thread_name_set(tsd_t *tsd, const char *thread_name) return (EAGAIN); if (tdata->thread_name != NULL) { - idalloc(tsd, tdata->thread_name); + idalloctm(tsd, tdata->thread_name, true, true); tdata->thread_name = NULL; } if (strlen(s) > 0) diff --git a/src/quarantine.c b/src/quarantine.c index 12c37e0..094b44d 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -26,8 +26,9 @@ quarantine_init(tsd_t *tsd, size_t lg_maxobjs) assert(tsd_nominal(tsd)); - quarantine = (quarantine_t *)imalloc(tsd, offsetof(quarantine_t, objs) + - ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t))); + quarantine = (quarantine_t *)iallocztm(tsd, offsetof(quarantine_t, objs) + + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)), false, true, + true, NULL); if (quarantine == NULL) return (NULL); quarantine->curbytes = 0; @@ -54,7 +55,7 @@ quarantine_alloc_hook_work(tsd_t *tsd) if (tsd_quarantine_get(tsd) == NULL) tsd_quarantine_set(tsd, quarantine); else - idalloc(tsd, quarantine); + idalloctm(tsd, quarantine, true, true); } static quarantine_t * @@ -86,7 +87,7 @@ quarantine_grow(tsd_t *tsd, quarantine_t *quarantine) memcpy(&ret->objs[ncopy_a], quarantine->objs, ncopy_b * sizeof(quarantine_obj_t)); } - idalloc(tsd, quarantine); + idalloctm(tsd, quarantine, true, true); tsd_quarantine_set(tsd, ret); return (ret); @@ -176,7 +177,7 @@ quarantine_cleanup(tsd_t *tsd) quarantine = tsd_quarantine_get(tsd); if (quarantine != NULL) { quarantine_drain(tsd, quarantine, 0); - idalloc(tsd, quarantine); + idalloctm(tsd, quarantine, true, true); tsd_quarantine_set(tsd, NULL); } } diff --git a/src/stats.c b/src/stats.c index 2b3da64..865f775 100644 --- a/src/stats.c +++ b/src/stats.c @@ -265,6 +265,7 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, unsigned nthreads; const char *dss; size_t page, pactive, pdirty, mapped; + size_t metadata_mapped, metadata_allocated; uint64_t npurge, nmadvise, purged; size_t small_allocated; uint64_t small_nmalloc, small_ndalloc, small_nrequests; @@ -331,6 +332,12 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_I_GET("stats.arenas.0.mapped", &mapped, size_t); malloc_cprintf(write_cb, cbopaque, "mapped: %12zu\n", mapped); + CTL_I_GET("stats.arenas.0.metadata.mapped", &metadata_mapped, size_t); + CTL_I_GET("stats.arenas.0.metadata.allocated", &metadata_allocated, + size_t); + malloc_cprintf(write_cb, cbopaque, + "metadata: mapped: %zu, allocated: %zu\n", metadata_mapped, + 
metadata_allocated); if (bins) stats_arena_bins_print(write_cb, cbopaque, i); @@ -539,17 +546,18 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, if (config_stats) { size_t *cactive; - size_t allocated, active, mapped; + size_t allocated, active, metadata, mapped; size_t chunks_current, chunks_high; uint64_t chunks_total; CTL_GET("stats.cactive", &cactive, size_t *); CTL_GET("stats.allocated", &allocated, size_t); CTL_GET("stats.active", &active, size_t); + CTL_GET("stats.metadata", &metadata, size_t); CTL_GET("stats.mapped", &mapped, size_t); malloc_cprintf(write_cb, cbopaque, - "Allocated: %zu, active: %zu, mapped: %zu\n", - allocated, active, mapped); + "Allocated: %zu, active: %zu, metadata: %zu, mapped: %zu\n", + allocated, active, metadata, mapped); malloc_cprintf(write_cb, cbopaque, "Current active ceiling: %zu\n", atomic_read_z(cactive)); diff --git a/src/tcache.c b/src/tcache.c index 34224ec..d638015 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -298,7 +298,7 @@ tcache_create(tsd_t *tsd, arena_t *arena) /* Avoid false cacheline sharing. */ size = sa2u(size, CACHELINE); - tcache = ipalloct(tsd, size, CACHELINE, true, false, arena); + tcache = ipallocztm(tsd, size, CACHELINE, true, false, true, arena); if (tcache == NULL) return (NULL); @@ -353,7 +353,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) arena_prof_accum(tcache->arena, tcache->prof_accumbytes)) prof_idump(); - idalloct(tsd, tcache, false); + idalloctm(tsd, tcache, false, true); } void diff --git a/src/tsd.c b/src/tsd.c index 00d8f95..3b59acf 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -15,7 +15,7 @@ void * malloc_tsd_malloc(size_t size) { - return (a0malloc(CACHELINE_CEILING(size), false)); + return (a0malloc(CACHELINE_CEILING(size))); } void -- cgit v0.12 From eee27b2a38d6bb741d9de5e028d5b23e2f4ec4cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Marie?= Date: Sun, 25 Jan 2015 15:12:28 +0100 Subject: huge_node_locked don't have to unlock huge_mtx in src/huge.c, after each call of huge_node_locked(), huge_mtx is already unlocked. don't unlock it twice (it is a undefined behaviour). --- src/huge.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/huge.c b/src/huge.c index c4d1ebc..84a1ab2 100644 --- a/src/huge.c +++ b/src/huge.c @@ -83,7 +83,6 @@ huge_node_locked(const void *ptr) node = extent_tree_ad_search(&huge, &key); assert(node != NULL); assert(node->addr == ptr); - malloc_mutex_unlock(&huge_mtx); return (node); } -- cgit v0.12 From 77d597ebb23aa47a4a0112c294ad6a68857f450c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Marie?= Date: Sun, 25 Jan 2015 10:18:32 +0100 Subject: add openbsd support --- configure.ac | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configure.ac b/configure.ac index 0a4f01e..5e93a5d 100644 --- a/configure.ac +++ b/configure.ac @@ -283,6 +283,11 @@ case "${host}" in abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) ;; + *-*-openbsd*) + CFLAGS="$CFLAGS" + abi="elf" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) + ;; *-*-linux*) CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE" -- cgit v0.12 From 0fd663e9c5336089a98e8a2a0cf5419b534f045f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 25 Jan 2015 17:31:24 -0800 Subject: Avoid pointless chunk_recycle() call. Avoid calling chunk_recycle() for mmap()ed chunks if config_munmap is disabled, in which case there are never any recyclable chunks. This resolves #164. 
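In outline, the post-patch chunk_alloc_core() tries its allocation sources in a fixed order. The condensed sketch below only restates the diff that follows, keeping the names used there (chunk_alloc_core_dss(), chunk_recycle(), chunk_alloc_mmap(), config_munmap) while eliding error handling and the base/zero bookkeeping; it is an illustration, not a drop-in replacement for the patch.

    /*
     * Condensed sketch of chunk_alloc_core() after this patch.
     */
    static void *
    chunk_alloc_core_sketch(void *new_addr, size_t size, size_t alignment,
        bool base, bool *zero, dss_prec_t dss_prec)
    {
        void *ret;

        /* 1: dss, when it is the primary preference (recycle, then extend). */
        if (have_dss && dss_prec == dss_prec_primary && (ret =
            chunk_alloc_core_dss(new_addr, size, alignment, base, zero)) != NULL)
            return (ret);
        /*
         * 2: recycled mmap chunks; the new config_munmap guard skips the tree
         * lookup when it can never succeed.
         */
        if (!config_munmap && (ret = chunk_recycle(&chunks_szad_mmap,
            &chunks_ad_mmap, new_addr, size, alignment, base, zero)) != NULL)
            return (ret);
        /* 3: fresh mmap, only when no specific address was requested. */
        if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment,
            zero)) != NULL)
            return (ret);
        /* 4: dss as the secondary preference. */
        if (have_dss && dss_prec == dss_prec_secondary && (ret =
            chunk_alloc_core_dss(new_addr, size, alignment, base, zero)) != NULL)
            return (ret);
        /* All strategies failed. */
        return (NULL);
    }
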
--- src/chunk.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/chunk.c b/src/chunk.c index b9a2441..6d5f84f 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -132,6 +132,19 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, return (ret); } +static void * +chunk_alloc_core_dss(void *new_addr, size_t size, size_t alignment, bool base, + bool *zero) +{ + void *ret; + + if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, + new_addr, size, alignment, base, zero)) != NULL) + return (ret); + ret = chunk_alloc_dss(new_addr, size, alignment, zero); + return (ret); +} + /* * If the caller specifies (!*zero), it is still possible to receive zeroed * memory, in which case *zero is toggled to true. arena_chunk_alloc() takes @@ -150,31 +163,26 @@ chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, assert((alignment & chunksize_mask) == 0); /* "primary" dss. */ - if (have_dss && dss_prec == dss_prec_primary) { - if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, - new_addr, size, alignment, base, zero)) != NULL) - return (ret); - if ((ret = chunk_alloc_dss(new_addr, size, alignment, zero)) - != NULL) - return (ret); - } + if (have_dss && dss_prec == dss_prec_primary && (ret = + chunk_alloc_core_dss(new_addr, size, alignment, base, zero)) != + NULL) + return (ret); /* mmap. */ - if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, new_addr, - size, alignment, base, zero)) != NULL) + if (!config_munmap && (ret = chunk_recycle(&chunks_szad_mmap, + &chunks_ad_mmap, new_addr, size, alignment, base, zero)) != NULL) return (ret); - /* Requesting an address not implemented for chunk_alloc_mmap(). */ - if (new_addr == NULL && - (ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) + /* + * Requesting an address is not implemented for chunk_alloc_mmap(), so + * only call it if (new_addr == NULL). + */ + if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment, zero)) + != NULL) return (ret); /* "secondary" dss. */ - if (have_dss && dss_prec == dss_prec_secondary) { - if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, - new_addr, size, alignment, base, zero)) != NULL) - return (ret); - if ((ret = chunk_alloc_dss(new_addr, size, alignment, zero)) - != NULL) - return (ret); - } + if (have_dss && dss_prec == dss_prec_secondary && (ret = + chunk_alloc_core_dss(new_addr, size, alignment, base, zero)) != + NULL) + return (ret); /* All strategies for allocation failed. */ return (NULL); -- cgit v0.12 From 41f2e692f664da683ae694b17630f5e186aa454c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 25 Jan 2015 20:15:13 -0800 Subject: Fix quoting for CONFIG-related sed expression. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 5e93a5d..4c20259 100644 --- a/configure.ac +++ b/configure.ac @@ -43,7 +43,7 @@ AC_CACHE_CHECK([whether $1 is compilable], dnl ============================================================================ -CONFIG=`echo ${ac_configure_args} | sed -e "s#\'\([^ ]*\)\'#\1#g"` +CONFIG=`echo ${ac_configure_args} | sed -e 's#'"'"'\([^ ]*\)'"'"'#\1#g'` AC_SUBST([CONFIG]) dnl Library revision. -- cgit v0.12 From 5b8ed5b7c91939f64f14fc48be84ed20e3f023f4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 25 Jan 2015 21:16:57 -0800 Subject: Implement the prof.gdump mallctl. 
This feature makes it possible to toggle the gdump feature on/off during program execution, whereas the the opt.prof_dump mallctl value can only be set during program startup. This resolves #72. --- doc/jemalloc.xml.in | 28 ++++++++++++++++------ include/jemalloc/internal/private_symbols.txt | 4 ++++ include/jemalloc/internal/prof.h | 18 ++++++++++++++ src/chunk.c | 3 ++- src/ctl.c | 27 +++++++++++++++++++++ src/prof.c | 34 +++++++++++++++++++++++++++ test/unit/prof_gdump.c | 29 +++++++++++++++++++++-- 7 files changed, 133 insertions(+), 10 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 08fd4eb..739b33a 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1215,13 +1215,11 @@ malloc_conf = "xmalloc:true";]]> r- [] - Trigger a memory profile dump every time the total - virtual memory exceeds the previous maximum. Profiles are dumped to - files named according to the pattern - <prefix>.<pid>.<seq>.u<useq>.heap, - where <prefix> is controlled by the opt.prof_prefix - option. This option is disabled by default. + Set the initial state of prof.gdump, which when + enabled triggers a memory profile dump every time the total virtual + memory exceeds the previous maximum. This option is disabled by + default. @@ -1687,6 +1685,22 @@ malloc_conf = "xmalloc:true";]]> option. + + + prof.gdump + (bool) + rw + [] + + When enabled, trigger a memory profile dump every time + the total virtual memory exceeds the previous maximum. Profiles are + dumped to files named according to the pattern + <prefix>.<pid>.<seq>.u<useq>.heap, + where <prefix> is controlled by the opt.prof_prefix + option. + + prof.reset diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index dfa8755..f3fd826 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -329,6 +329,10 @@ prof_dump_open prof_free prof_free_sampled_object prof_gdump +prof_gdump_get +prof_gdump_get_unlocked +prof_gdump_set +prof_gdump_val prof_idump prof_interval prof_lookup diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index e081884..b2db685 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -239,6 +239,9 @@ extern char opt_prof_prefix[ /* Accessed via prof_active_[gs]et{_unlocked,}(). */ extern bool prof_active; +/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */ +extern bool prof_gdump_val; + /* * Profile dump interval, measured in bytes allocated. Each arena triggers a * profile dump when it reaches this threshold. 
The effect is that the @@ -285,6 +288,8 @@ bool prof_thread_active_get(void); bool prof_thread_active_set(bool active); bool prof_thread_active_init_get(void); bool prof_thread_active_init_set(bool active_init); +bool prof_gdump_get(void); +bool prof_gdump_set(bool active); void prof_boot0(void); void prof_boot1(void); bool prof_boot2(void); @@ -299,6 +304,7 @@ void prof_sample_threshold_update(prof_tdata_t *tdata); #ifndef JEMALLOC_ENABLE_INLINE bool prof_active_get_unlocked(void); +bool prof_gdump_get_unlocked(void); prof_tdata_t *prof_tdata_get(tsd_t *tsd, bool create); bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool commit, prof_tdata_t **tdata_out); @@ -327,6 +333,18 @@ prof_active_get_unlocked(void) return (prof_active); } +JEMALLOC_ALWAYS_INLINE bool +prof_gdump_get_unlocked(void) +{ + + /* + * No locking is used when reading prof_gdump_val in the fast path, so + * there are no guarantees regarding how long it will take for all + * threads to notice state changes. + */ + return (prof_gdump_val); +} + JEMALLOC_ALWAYS_INLINE prof_tdata_t * prof_tdata_get(tsd_t *tsd, bool create) { diff --git a/src/chunk.c b/src/chunk.c index 6d5f84f..7bfcdb8 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -213,7 +213,8 @@ chunk_register(void *chunk, size_t size, bool base) } else if (config_prof) gdump = false; malloc_mutex_unlock(&chunks_mtx); - if (config_prof && opt_prof && opt_prof_gdump && gdump) + if (config_prof && opt_prof && prof_gdump_get_unlocked() && + gdump) prof_gdump(); } if (config_valgrind) diff --git a/src/ctl.c b/src/ctl.c index b65af52..63a689a 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -137,6 +137,7 @@ CTL_PROTO(arenas_extend) CTL_PROTO(prof_thread_active_init) CTL_PROTO(prof_active) CTL_PROTO(prof_dump) +CTL_PROTO(prof_gdump) CTL_PROTO(prof_reset) CTL_PROTO(prof_interval) CTL_PROTO(lg_prof_sample) @@ -347,6 +348,7 @@ static const ctl_named_node_t prof_node[] = { {NAME("thread_active_init"), CTL(prof_thread_active_init)}, {NAME("active"), CTL(prof_active)}, {NAME("dump"), CTL(prof_dump)}, + {NAME("gdump"), CTL(prof_gdump)}, {NAME("reset"), CTL(prof_reset)}, {NAME("interval"), CTL(prof_interval)}, {NAME("lg_sample"), CTL(lg_prof_sample)} @@ -1791,6 +1793,31 @@ label_return: } static int +prof_gdump_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + bool oldval; + + if (!config_prof) + return (ENOENT); + + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = prof_gdump_set(*(bool *)newp); + } else + oldval = prof_gdump_get(); + READ(oldval, bool); + + ret = 0; +label_return: + return (ret); +} + +static int prof_reset_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { diff --git a/src/prof.c b/src/prof.c index 06f5499..04b2591 100644 --- a/src/prof.c +++ b/src/prof.c @@ -44,6 +44,13 @@ static malloc_mutex_t prof_active_mtx; static bool prof_thread_active_init; static malloc_mutex_t prof_thread_active_init_mtx; +/* + * Initialized as opt_prof_gdump, and accessed via + * prof_gdump_[gs]et{_unlocked,}(). 
+ */ +bool prof_gdump_val; +static malloc_mutex_t prof_gdump_mtx; + uint64_t prof_interval = 0; size_t lg_prof_sample; @@ -1961,6 +1968,29 @@ prof_thread_active_init_set(bool active_init) return (active_init_old); } +bool +prof_gdump_get(void) +{ + bool prof_gdump_current; + + malloc_mutex_lock(&prof_gdump_mtx); + prof_gdump_current = prof_gdump_val; + malloc_mutex_unlock(&prof_gdump_mtx); + return (prof_gdump_current); +} + +bool +prof_gdump_set(bool gdump) +{ + bool prof_gdump_old; + + malloc_mutex_lock(&prof_gdump_mtx); + prof_gdump_old = prof_gdump_val; + prof_gdump_val = gdump; + malloc_mutex_unlock(&prof_gdump_mtx); + return (prof_gdump_old); +} + void prof_boot0(void) { @@ -2013,6 +2043,10 @@ prof_boot2(void) if (malloc_mutex_init(&prof_active_mtx)) return (true); + prof_gdump_val = opt_prof_gdump; + if (malloc_mutex_init(&prof_gdump_mtx)) + return (true); + prof_thread_active_init = opt_prof_thread_active_init; if (malloc_mutex_init(&prof_thread_active_init_mtx)) return (true); diff --git a/test/unit/prof_gdump.c b/test/unit/prof_gdump.c index a00b105..a0e6ee9 100644 --- a/test/unit/prof_gdump.c +++ b/test/unit/prof_gdump.c @@ -21,8 +21,9 @@ prof_dump_open_intercept(bool propagate_err, const char *filename) TEST_BEGIN(test_gdump) { - bool active; - void *p, *q; + bool active, gdump, gdump_old; + void *p, *q, *r, *s; + size_t sz; test_skip_if(!config_prof); @@ -42,8 +43,32 @@ TEST_BEGIN(test_gdump) assert_ptr_not_null(q, "Unexpected mallocx() failure"); assert_true(did_prof_dump_open, "Expected a profile dump"); + gdump = false; + sz = sizeof(gdump_old); + assert_d_eq(mallctl("prof.gdump", &gdump_old, &sz, &gdump, + sizeof(gdump)), 0, + "Unexpected mallctl failure while disabling prof.gdump"); + assert(gdump_old); + did_prof_dump_open = false; + r = mallocx(chunksize, 0); + assert_ptr_not_null(q, "Unexpected mallocx() failure"); + assert_false(did_prof_dump_open, "Unexpected profile dump"); + + gdump = true; + sz = sizeof(gdump_old); + assert_d_eq(mallctl("prof.gdump", &gdump_old, &sz, &gdump, + sizeof(gdump)), 0, + "Unexpected mallctl failure while enabling prof.gdump"); + assert(!gdump_old); + did_prof_dump_open = false; + s = mallocx(chunksize, 0); + assert_ptr_not_null(q, "Unexpected mallocx() failure"); + assert_true(did_prof_dump_open, "Expected a profile dump"); + dallocx(p, 0); dallocx(q, 0); + dallocx(r, 0); + dallocx(s, 0); } TEST_END -- cgit v0.12 From 008267b9f6a0e4d92a78f0e8c0697248020fc8d3 Mon Sep 17 00:00:00 2001 From: Felix Janda Date: Tue, 3 Feb 2015 18:58:02 +0100 Subject: util.c: strerror_r returns char* only on glibc --- src/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.c b/src/util.c index bfd86af..a964d70 100644 --- a/src/util.c +++ b/src/util.c @@ -84,7 +84,7 @@ buferror(int err, char *buf, size_t buflen) FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), 0, (LPSTR)buf, buflen, NULL); return (0); -#elif defined(_GNU_SOURCE) +#elif defined(__GLIBC__) && defined(_GNU_SOURCE) char *b = strerror_r(err, buf, buflen); if (b != buf) { strncpy(buf, b, buflen); -- cgit v0.12 From 6505733012458d8fcd0ae8e1f1acdc9ffe33ff35 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 4 Feb 2015 07:16:55 +0900 Subject: Make opt.lg_dirty_mult work as documented The documentation for opt.lg_dirty_mult says: Per-arena minimum ratio (log base 2) of active to dirty pages. Some dirty unused pages may be allowed to accumulate, within the limit set by the ratio (or one chunk worth of dirty pages, whichever is greater) (...) 
The restriction in parentheses currently doesn't happen. This makes jemalloc aggressively madvise(), which in turns increases the amount of page faults significantly. For instance, this resulted in several(!) hundred(!) milliseconds startup regression on Firefox for Android. This may require further tweaking, but starting with actually doing what the documentation says is a good start. --- src/arena.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/arena.c b/src/arena.c index 984b8ad..a5033bf 100644 --- a/src/arena.c +++ b/src/arena.c @@ -850,6 +850,7 @@ arena_maybe_purge(arena_t *arena) if (opt_lg_dirty_mult < 0) return; threshold = (arena->nactive >> opt_lg_dirty_mult); + threshold = threshold < chunk_npages ? chunk_npages : threshold; /* * Don't purge unless the number of purgeable pages exceeds the * threshold. @@ -893,6 +894,7 @@ arena_compute_npurge(arena_t *arena, bool all) */ if (!all) { size_t threshold = (arena->nactive >> opt_lg_dirty_mult); + threshold = threshold < chunk_npages ? chunk_npages : threshold; npurge = arena->ndirty - threshold; } else -- cgit v0.12 From b0808d5f635592cf7b9c487efbf26f13dc60b223 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 3 Feb 2015 12:39:31 -0800 Subject: Fix shell test to use = instead of ==. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 4c20259..dc8aa02 100644 --- a/configure.ac +++ b/configure.ac @@ -998,7 +998,7 @@ fi AC_ARG_WITH([lg_page], [AS_HELP_STRING([--with-lg-page=], [Base 2 log of system page size])], [LG_PAGE="$with_lg_page"], [LG_PAGE="detect"]) -if test "x$LG_PAGE" == "xdetect"; then +if test "x$LG_PAGE" = "xdetect"; then AC_CACHE_CHECK([LG_PAGE], [je_cv_lg_page], AC_RUN_IFELSE([AC_LANG_PROGRAM( -- cgit v0.12 From f8723572d8b3418f145fc1d5466cca6b8e2530ef Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 3 Feb 2015 12:39:55 -0800 Subject: Add missing prototypes for bootstrap_{malloc,calloc,free}(). --- include/jemalloc/internal/jemalloc_internal.h.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index a477855..79a23e5 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -406,7 +406,9 @@ extern uint8_t const size2index_tab[]; arena_t *a0get(void); void *a0malloc(size_t size); void a0dalloc(void *ptr); -size_t a0allocated(void); +void *bootstrap_malloc(size_t size); +void *bootstrap_calloc(size_t num, size_t size); +void bootstrap_free(void *ptr); arena_t *arenas_extend(unsigned ind); arena_t *arena_init(unsigned ind); unsigned narenas_total_get(void); -- cgit v0.12 From 8ddc93293cd8370870f221225ef1e013fbff6d65 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 30 Jan 2015 21:22:54 -0800 Subject: Fix chunk_recycle()'s new_addr functionality. Fix chunk_recycle()'s new_addr functionality to search by address rather than just size if new_addr is specified. The functionality added by a95018ee819abf897562d9d1f3bc31d4dd725a8d (Attempt to expand huge allocations in-place.) only worked if the two search orders happened to return the same results (e.g. in simple test cases). 
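In essence, the fix replaces a best-fit (size-ordered) search with an exact address search whenever the caller pins new_addr, as the in-place huge expansion path does. The fragment below is a condensed, illustrative restatement of the corrected lookup in chunk_recycle(), using only the tree routines named in the diff that follows; locking and the subsequent carve-up of the found extent are omitted.

    extent_node_t *node, key;

    /* A pinned address implies chunk alignment (asserted in the patch). */
    key.addr = new_addr;
    key.size = alloc_size;
    node = (new_addr != NULL) ? extent_tree_ad_search(chunks_ad, &key) :
        extent_tree_szad_nsearch(chunks_szad, &key);
    if (node == NULL)
        return (NULL);	/* Nothing recyclable satisfies the request. */
    /* For an exact-address hit, the patch also asserts leadsize == 0. */
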
--- src/chunk.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/chunk.c b/src/chunk.c index 7bfcdb8..a3ae548 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -48,6 +48,8 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t alloc_size, leadsize, trailsize; bool zeroed; + assert(new_addr == NULL || alignment == chunksize); + if (base) { /* * This function may need to call base_node_{,de}alloc(), but @@ -65,13 +67,15 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, key.addr = new_addr; key.size = alloc_size; malloc_mutex_lock(&chunks_mtx); - node = extent_tree_szad_nsearch(chunks_szad, &key); - if (node == NULL || (new_addr && node->addr != new_addr)) { + node = (new_addr != NULL) ? extent_tree_ad_search(chunks_ad, &key) : + extent_tree_szad_nsearch(chunks_szad, &key); + if (node == NULL) { malloc_mutex_unlock(&chunks_mtx); return (NULL); } leadsize = ALIGNMENT_CEILING((uintptr_t)node->addr, alignment) - (uintptr_t)node->addr; + assert(new_addr == NULL || leadsize == 0); assert(node->size >= leadsize + size); trailsize = node->size - leadsize - size; ret = (void *)((uintptr_t)node->addr + leadsize); -- cgit v0.12 From a55dfa4b0af68f372782e130031483ad73cf7eec Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 2 Feb 2015 13:49:08 -0800 Subject: Implement more atomic operations. - atomic_*_p(). - atomic_cas_*(). - atomic_write_*(). --- include/jemalloc/internal/atomic.h | 466 +++++++++++++++++++++----- include/jemalloc/internal/private_symbols.txt | 7 + test/unit/atomic.c | 85 +++-- 3 files changed, 446 insertions(+), 112 deletions(-) diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index 23ac93f..f8bd62e 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -11,6 +11,7 @@ #define atomic_read_uint64(p) atomic_add_uint64(p, 0) #define atomic_read_uint32(p) atomic_add_uint32(p, 0) +#define atomic_read_p(p) atomic_add_p(p, NULL) #define atomic_read_z(p) atomic_add_z(p, 0) #define atomic_read_u(p) atomic_add_u(p, 0) @@ -19,58 +20,110 @@ #ifdef JEMALLOC_H_INLINES /* - * All functions return the arithmetic result of the atomic operation. Some - * atomic operation APIs return the value prior to mutation, in which case the - * following functions must redundantly compute the result so that it can be - * returned. These functions are normally inlined, so the extra operations can - * be optimized away if the return values aren't used by the callers. + * All arithmetic functions return the arithmetic result of the atomic + * operation. Some atomic operation APIs return the value prior to mutation, in + * which case the following functions must redundantly compute the result so + * that it can be returned. These functions are normally inlined, so the extra + * operations can be optimized away if the return values aren't used by the + * callers. 
* + * atomic_read_( *p) { return (*p); } * atomic_add_( *p, x) { return (*p + x); } * atomic_sub_( *p, x) { return (*p - x); } + * bool atomic_cas_( *p, c, s) + * { + * if (*p != c) + * return (true); + * *p = s; + * return (false); + * } + * void atomic_write_( *p, x) { *p = x; } */ #ifndef JEMALLOC_ENABLE_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x); uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x); +bool atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s); +void atomic_write_uint64(uint64_t *p, uint64_t x); uint32_t atomic_add_uint32(uint32_t *p, uint32_t x); uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x); +bool atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s); +void atomic_write_uint32(uint32_t *p, uint32_t x); +void *atomic_add_p(void **p, void *x); +void *atomic_sub_p(void **p, void *x); +bool atomic_cas_p(void **p, void *c, void *s); +void atomic_write_p(void **p, void *x); size_t atomic_add_z(size_t *p, size_t x); size_t atomic_sub_z(size_t *p, size_t x); +bool atomic_cas_z(size_t *p, size_t c, size_t s); +void atomic_write_z(size_t *p, size_t x); unsigned atomic_add_u(unsigned *p, unsigned x); unsigned atomic_sub_u(unsigned *p, unsigned x); +bool atomic_cas_u(unsigned *p, unsigned c, unsigned s); +void atomic_write_u(unsigned *p, unsigned x); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_)) /******************************************************************************/ /* 64-bit operations. */ #if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3) -# ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 +# if (defined(__amd64__) || defined(__x86_64__)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { + uint64_t t = x; - return (__sync_add_and_fetch(p, x)); + asm volatile ( + "lock; xaddq %0, %1;" + : "+r" (t), "=m" (*p) /* Outputs. */ + : "m" (*p) /* Inputs. */ + ); + + return (t + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { + uint64_t t; - return (__sync_sub_and_fetch(p, x)); + x = (uint64_t)(-(int64_t)x); + t = x; + asm volatile ( + "lock; xaddq %0, %1;" + : "+r" (t), "=m" (*p) /* Outputs. */ + : "m" (*p) /* Inputs. */ + ); + + return (t + x); } -# elif (defined(_MSC_VER)) -JEMALLOC_INLINE uint64_t -atomic_add_uint64(uint64_t *p, uint64_t x) + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) { + uint8_t success; - return (InterlockedExchangeAdd64(p, x) + x); + asm volatile ( + "lock; cmpxchgq %4, %0;" + "sete %1;" + : "=m" (*p), "=a" (success) /* Outputs. */ + : "m" (*p), "a" (c), "r" (s) /* Inputs. */ + : "memory" /* Clobbers. */ + ); + + return (!(bool)success); } -JEMALLOC_INLINE uint64_t -atomic_sub_uint64(uint64_t *p, uint64_t x) +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) { - return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x); + asm volatile ( + "lock; xchgq %1, %0;" + : "=m" (*p), "+r" (x) /* Outputs. */ + : "m" (*p) /* Inputs. */ + : "memory" /* Clobbers. 
*/ + ); } # elif (defined(JEMALLOC_C11ATOMICS)) JEMALLOC_INLINE uint64_t @@ -86,73 +139,124 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; return (atomic_fetch_sub(a, x) - x); } -# elif (defined(JEMALLOC_OSATOMIC)) + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + + return (!atomic_compare_exchange_strong(p, &c, s)); +} + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + + atomic_store(p, x); +} +# elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { - return (OSAtomicAdd64((int64_t)x, (int64_t *)p)); + /* + * atomic_fetchadd_64() doesn't exist, but we only ever use this + * function on LP64 systems, so atomic_fetchadd_long() will do. + */ + assert(sizeof(uint64_t) == sizeof(unsigned long)); + + return (atomic_fetchadd_long(p, (unsigned long)x) + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { - return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p)); + assert(sizeof(uint64_t) == sizeof(unsigned long)); + + return (atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x); +} + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + + assert(sizeof(uint64_t) == sizeof(unsigned long)); + + return (!atomic_cmpset_long(p, (unsigned long)c, (unsigned long)s)); +} + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + + assert(sizeof(uint64_t) == sizeof(unsigned long)); + + atomic_store_rel_long(p, x); } -# elif (defined(__amd64__) || defined(__x86_64__)) +# elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { - uint64_t t = x; - - asm volatile ( - "lock; xaddq %0, %1;" - : "+r" (t), "=m" (*p) /* Outputs. */ - : "m" (*p) /* Inputs. */ - ); - return (t + x); + return (OSAtomicAdd64((int64_t)x, (int64_t *)p)); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { - uint64_t t; - x = (uint64_t)(-(int64_t)x); - t = x; - asm volatile ( - "lock; xaddq %0, %1;" - : "+r" (t), "=m" (*p) /* Outputs. */ - : "m" (*p) /* Inputs. */ - ); + return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p)); +} - return (t + x); +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + + return (!OSAtomicCompareAndSwap64(c, s, (int64_t *)p)); } -# elif (defined(JEMALLOC_ATOMIC9)) + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + uint64_t o; + + /*The documented OSAtomic*() API does not expose an atomic exchange. */ + do { + o = atomic_read_uint64(p); + } while (atomic_cas_uint64(p, o, x)); +} +# elif (defined(_MSC_VER)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { - /* - * atomic_fetchadd_64() doesn't exist, but we only ever use this - * function on LP64 systems, so atomic_fetchadd_long() will do. 
- */ - assert(sizeof(uint64_t) == sizeof(unsigned long)); - - return (atomic_fetchadd_long(p, (unsigned long)x) + x); + return (InterlockedExchangeAdd64(p, x) + x); } JEMALLOC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x) { - assert(sizeof(uint64_t) == sizeof(unsigned long)); + return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x); +} - return (atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x); +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + uint64_t o; + + o = InterlockedCompareExchange64(p, s, c); + return (o != c); } -# elif (defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8)) + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + + InterlockedExchange64(p, x); +} +# elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || \ + defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { @@ -166,6 +270,20 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (__sync_sub_and_fetch(p, x)); } + +JEMALLOC_INLINE bool +atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) +{ + + return (!__sync_bool_compare_and_swap(p, c, s)); +} + +JEMALLOC_INLINE void +atomic_write_uint64(uint64_t *p, uint64_t x) +{ + + __sync_lock_test_and_set(p, x); +} # else # error "Missing implementation for 64-bit atomic operations" # endif @@ -173,33 +291,63 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) /******************************************************************************/ /* 32-bit operations. */ -#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 +#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { + uint32_t t = x; - return (__sync_add_and_fetch(p, x)); + asm volatile ( + "lock; xaddl %0, %1;" + : "+r" (t), "=m" (*p) /* Outputs. */ + : "m" (*p) /* Inputs. */ + ); + + return (t + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { + uint32_t t; - return (__sync_sub_and_fetch(p, x)); + x = (uint32_t)(-(int32_t)x); + t = x; + asm volatile ( + "lock; xaddl %0, %1;" + : "+r" (t), "=m" (*p) /* Outputs. */ + : "m" (*p) /* Inputs. */ + ); + + return (t + x); } -#elif (defined(_MSC_VER)) -JEMALLOC_INLINE uint32_t -atomic_add_uint32(uint32_t *p, uint32_t x) + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) { + uint8_t success; - return (InterlockedExchangeAdd(p, x) + x); + asm volatile ( + "lock; cmpxchgl %4, %0;" + "sete %1;" + : "=m" (*p), "=a" (success) /* Outputs. */ + : "m" (*p), "a" (c), "r" (s) /* Inputs. */ + : "memory" + ); + + return (!(bool)success); } -JEMALLOC_INLINE uint32_t -atomic_sub_uint32(uint32_t *p, uint32_t x) +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) { - return (InterlockedExchangeAdd(p, -((int32_t)x)) - x); + asm volatile ( + "lock; xchgl %1, %0;" + : "=m" (*p), "+r" (x) /* Outputs. */ + : "m" (*p) /* Inputs. */ + : "memory" /* Clobbers. 
*/ + ); } # elif (defined(JEMALLOC_C11ATOMICS)) JEMALLOC_INLINE uint32_t @@ -215,65 +363,112 @@ atomic_sub_uint32(uint32_t *p, uint32_t x) volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; return (atomic_fetch_sub(a, x) - x); } -#elif (defined(JEMALLOC_OSATOMIC)) + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + + return (!atomic_compare_exchange_strong(p, &c, s)); +} + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + + atomic_store(p, x); +} +#elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { - return (OSAtomicAdd32((int32_t)x, (int32_t *)p)); + return (atomic_fetchadd_32(p, x) + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { - return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p)); + return (atomic_fetchadd_32(p, (uint32_t)(-(int32_t)x)) - x); } -#elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__)) + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + + return (!atomic_cmpset_32(p, c, s)); +} + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + + atomic_store_rel_32(p, x); +} +#elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { - uint32_t t = x; - asm volatile ( - "lock; xaddl %0, %1;" - : "+r" (t), "=m" (*p) /* Outputs. */ - : "m" (*p) /* Inputs. */ - ); - - return (t + x); + return (OSAtomicAdd32((int32_t)x, (int32_t *)p)); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { - uint32_t t; - x = (uint32_t)(-(int32_t)x); - t = x; - asm volatile ( - "lock; xaddl %0, %1;" - : "+r" (t), "=m" (*p) /* Outputs. */ - : "m" (*p) /* Inputs. */ - ); + return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p)); +} - return (t + x); +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + + return (!OSAtomicCompareAndSwap32(c, s, (int32_t *)p)); } -#elif (defined(JEMALLOC_ATOMIC9)) + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + uint32_t o; + + /*The documented OSAtomic*() API does not expose an atomic exchange. 
*/ + do { + o = atomic_read_uint32(p); + } while (atomic_cas_uint32(p, o, x)); +} +#elif (defined(_MSC_VER)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { - return (atomic_fetchadd_32(p, x) + x); + return (InterlockedExchangeAdd(p, x) + x); } JEMALLOC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x) { - return (atomic_fetchadd_32(p, (uint32_t)(-(int32_t)x)) - x); + return (InterlockedExchangeAdd(p, -((int32_t)x)) - x); +} + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + uint32_t o; + + o = InterlockedCompareExchange32(p, s, c); + return (o != c); } -#elif (defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4)) + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + + InterlockedExchange(p, x); +} +#elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || \ + defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4)) JEMALLOC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x) { @@ -287,11 +482,73 @@ atomic_sub_uint32(uint32_t *p, uint32_t x) return (__sync_sub_and_fetch(p, x)); } + +JEMALLOC_INLINE bool +atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) +{ + + return (!__sync_bool_compare_and_swap(p, c, s)); +} + +JEMALLOC_INLINE void +atomic_write_uint32(uint32_t *p, uint32_t x) +{ + + __sync_lock_test_and_set(p, x); +} #else # error "Missing implementation for 32-bit atomic operations" #endif /******************************************************************************/ +/* Pointer operations. */ +JEMALLOC_INLINE void * +atomic_add_p(void **p, void *x) +{ + +#if (LG_SIZEOF_PTR == 3) + return ((void *)atomic_add_uint64((uint64_t *)p, (uint64_t)x)); +#elif (LG_SIZEOF_PTR == 2) + return ((void *)atomic_add_uint32((uint32_t *)p, (uint32_t)x)); +#endif +} + +JEMALLOC_INLINE void * +atomic_sub_p(void **p, void *x) +{ + +#if (LG_SIZEOF_PTR == 3) + return ((void *)atomic_add_uint64((uint64_t *)p, + (uint64_t)-((int64_t)x))); +#elif (LG_SIZEOF_PTR == 2) + return ((void *)atomic_add_uint32((uint32_t *)p, + (uint32_t)-((int32_t)x))); +#endif +} + +JEMALLOC_INLINE bool +atomic_cas_p(void **p, void *c, void *s) +{ + +#if (LG_SIZEOF_PTR == 3) + return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s)); +#elif (LG_SIZEOF_PTR == 2) + return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s)); +#endif +} + +JEMALLOC_INLINE void +atomic_write_p(void **p, void *x) +{ + +#if (LG_SIZEOF_PTR == 3) + atomic_write_uint64((uint64_t *)p, (uint64_t)x); +#elif (LG_SIZEOF_PTR == 2) + atomic_write_uint32((uint32_t *)p, (uint32_t)x); +#endif +} + +/******************************************************************************/ /* size_t operations. */ JEMALLOC_INLINE size_t atomic_add_z(size_t *p, size_t x) @@ -317,6 +574,28 @@ atomic_sub_z(size_t *p, size_t x) #endif } +JEMALLOC_INLINE bool +atomic_cas_z(size_t *p, size_t c, size_t s) +{ + +#if (LG_SIZEOF_PTR == 3) + return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s)); +#elif (LG_SIZEOF_PTR == 2) + return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s)); +#endif +} + +JEMALLOC_INLINE void +atomic_write_z(size_t *p, size_t x) +{ + +#if (LG_SIZEOF_PTR == 3) + atomic_write_uint64((uint64_t *)p, (uint64_t)x); +#elif (LG_SIZEOF_PTR == 2) + atomic_write_uint32((uint32_t *)p, (uint32_t)x); +#endif +} + /******************************************************************************/ /* unsigned operations. 
*/ JEMALLOC_INLINE unsigned @@ -342,6 +621,29 @@ atomic_sub_u(unsigned *p, unsigned x) (uint32_t)-((int32_t)x))); #endif } + +JEMALLOC_INLINE bool +atomic_cas_u(unsigned *p, unsigned c, unsigned s) +{ + +#if (LG_SIZEOF_INT == 3) + return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s)); +#elif (LG_SIZEOF_INT == 2) + return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s)); +#endif +} + +JEMALLOC_INLINE void +atomic_write_u(unsigned *p, unsigned x) +{ + +#if (LG_SIZEOF_INT == 3) + atomic_write_uint64((uint64_t *)p, (uint64_t)x); +#elif (LG_SIZEOF_INT == 2) + atomic_write_uint32((uint32_t *)p, (uint32_t)x); +#endif +} + /******************************************************************************/ #endif diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index f3fd826..ba7ab38 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -85,10 +85,17 @@ arena_stats_merge arena_tcache_fill_small arenas_cache_bypass_cleanup arenas_cache_cleanup +atomic_add_p atomic_add_u atomic_add_uint32 atomic_add_uint64 atomic_add_z +atomic_cas_p +atomic_cas_u +atomic_cas_uint32 +atomic_cas_uint64 +atomic_cas_z +atomic_sub_p atomic_sub_u atomic_sub_uint32 atomic_sub_uint64 diff --git a/test/unit/atomic.c b/test/unit/atomic.c index eb6136c..a774836 100644 --- a/test/unit/atomic.c +++ b/test/unit/atomic.c @@ -4,48 +4,64 @@ struct p##_test_s { \ t accum0; \ t x; \ + t s; \ }; \ typedef struct p##_test_s p##_test_t; -#define TEST_BODY(p, t, PRI) do { \ +#define TEST_BODY(p, t, tc, ta, PRI) do { \ const p##_test_t tests[] = { \ - {-1, -1}, \ - {-1, 0}, \ - {-1, 1}, \ + {(t)-1, (t)-1, (t)-2}, \ + {(t)-1, (t) 0, (t)-2}, \ + {(t)-1, (t) 1, (t)-2}, \ \ - { 0, -1}, \ - { 0, 0}, \ - { 0, 1}, \ + {(t) 0, (t)-1, (t)-2}, \ + {(t) 0, (t) 0, (t)-2}, \ + {(t) 0, (t) 1, (t)-2}, \ \ - { 1, -1}, \ - { 1, 0}, \ - { 1, 1}, \ + {(t) 1, (t)-1, (t)-2}, \ + {(t) 1, (t) 0, (t)-2}, \ + {(t) 1, (t) 1, (t)-2}, \ \ - {0, -(1 << 22)}, \ - {0, (1 << 22)}, \ - {(1 << 22), -(1 << 22)}, \ - {(1 << 22), (1 << 22)} \ + {(t)0, (t)-(1 << 22), (t)-2}, \ + {(t)0, (t)(1 << 22), (t)-2}, \ + {(t)(1 << 22), (t)-(1 << 22), (t)-2}, \ + {(t)(1 << 22), (t)(1 << 22), (t)-2} \ }; \ unsigned i; \ \ for (i = 0; i < sizeof(tests)/sizeof(p##_test_t); i++) { \ + bool err; \ t accum = tests[i].accum0; \ - assert_u64_eq(atomic_read_##p(&accum), tests[i].accum0, \ - "i=%u", i); \ - assert_u64_eq(atomic_add_##p(&accum, tests[i].x), \ - tests[i].accum0 + tests[i].x, \ - "i=%u, accum=%#"PRI", x=%#"PRI, \ + assert_##ta##_eq(atomic_read_##p(&accum), \ + tests[i].accum0, \ + "Erroneous read, i=%u", i); \ + \ + assert_##ta##_eq(atomic_add_##p(&accum, tests[i].x), \ + (t)((tc)tests[i].accum0 + (tc)tests[i].x), \ + "i=%u, accum=%"PRI", x=%"PRI, \ i, tests[i].accum0, tests[i].x); \ - assert_u64_eq(atomic_read_##p(&accum), accum, \ - "i=%u", i); \ + assert_##ta##_eq(atomic_read_##p(&accum), accum, \ + "Erroneous add, i=%u", i); \ \ accum = tests[i].accum0; \ - assert_u64_eq(atomic_sub_##p(&accum, tests[i].x), \ - tests[i].accum0 - tests[i].x, \ - "i=%u, accum=%#"PRI", x=%#"PRI, \ + assert_##ta##_eq(atomic_sub_##p(&accum, tests[i].x), \ + (t)((tc)tests[i].accum0 - (tc)tests[i].x), \ + "i=%u, accum=%"PRI", x=%"PRI, \ i, tests[i].accum0, tests[i].x); \ - assert_u64_eq(atomic_read_##p(&accum), accum, \ - "i=%u", i); \ + assert_##ta##_eq(atomic_read_##p(&accum), accum, \ + "Erroneous sub, i=%u", i); \ + \ + accum = tests[i].accum0; \ + err = 
atomic_cas_##p(&accum, tests[i].x, tests[i].s); \ + assert_b_eq(err, tests[i].accum0 != tests[i].x, \ + "Erroneous cas success/failure result"); \ + assert_##ta##_eq(accum, err ? tests[i].accum0 : \ + tests[i].s, "Erroneous cas effect, i=%u", i); \ + \ + accum = tests[i].accum0; \ + atomic_write_##p(&accum, tests[i].s); \ + assert_##ta##_eq(accum, tests[i].s, \ + "Erroneous write, i=%u", i); \ } \ } while (0) @@ -56,7 +72,7 @@ TEST_BEGIN(test_atomic_uint64) #if !(LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3) test_skip("64-bit atomic operations not supported"); #else - TEST_BODY(uint64, uint64_t, PRIx64); + TEST_BODY(uint64, uint64_t, uint64_t, u64, PRIx64); #endif } TEST_END @@ -65,7 +81,15 @@ TEST_STRUCT(uint32, uint32_t) TEST_BEGIN(test_atomic_uint32) { - TEST_BODY(uint32, uint32_t, PRIx32); + TEST_BODY(uint32, uint32_t, uint32_t, u32, "#"PRIx32); +} +TEST_END + +TEST_STRUCT(p, void *) +TEST_BEGIN(test_atomic_p) +{ + + TEST_BODY(p, void *, uintptr_t, ptr, "p"); } TEST_END @@ -73,7 +97,7 @@ TEST_STRUCT(z, size_t) TEST_BEGIN(test_atomic_z) { - TEST_BODY(z, size_t, "zx"); + TEST_BODY(z, size_t, size_t, zu, "#zx"); } TEST_END @@ -81,7 +105,7 @@ TEST_STRUCT(u, unsigned) TEST_BEGIN(test_atomic_u) { - TEST_BODY(u, unsigned, "x"); + TEST_BODY(u, unsigned, unsigned, u, "#x"); } TEST_END @@ -92,6 +116,7 @@ main(void) return (test( test_atomic_uint64, test_atomic_uint32, + test_atomic_p, test_atomic_z, test_atomic_u)); } -- cgit v0.12 From 918a1a5b3f09cb456c25be9a2555a8fea6a9bb94 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 30 Jan 2015 21:21:16 -0800 Subject: Reduce extent_node_t size to fit in one cache line. --- include/jemalloc/internal/extent.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index cbfc20a..f45940c 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -15,9 +15,6 @@ struct extent_node_s { /* Linkage for the address-ordered tree. */ rb_node(extent_node_t) link_ad; - /* Profile counters, used for huge objects. */ - prof_tctx_t *prof_tctx; - /* Pointer to the extent that this tree node is responsible for. */ void *addr; @@ -27,8 +24,17 @@ struct extent_node_s { /* Arena from which this extent came, if any. */ arena_t *arena; - /* True if zero-filled; used by chunk recycling code. */ - bool zeroed; + /* + * 'prof_tctx' and 'zeroed' are never needed at the same time, so + * overlay them in order to fit extent_node_t in one cache line. + */ + union { + /* Profile counters, used for huge objects. */ + prof_tctx_t *prof_tctx; + + /* True if zero-filled; used by chunk recycling code. */ + bool zeroed; + }; }; typedef rb_tree(extent_node_t) extent_tree_t; -- cgit v0.12 From f500a10b2e94852b867334703ad77467dcfd2ddd Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 30 Jan 2015 21:49:19 -0800 Subject: Refactor base_alloc() to guarantee demand-zeroed memory. Refactor base_alloc() to guarantee that allocations are carved from demand-zeroed virtual memory. This supports sparse data structures such as multi-page radix tree nodes. Enhance base_alloc() to keep track of fragments which were too small to support previous allocation requests, and try to consume them during subsequent requests. This becomes important when request sizes commonly approach or exceed the chunk size (as could radix tree node allocations). 
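The mechanism is easiest to see in isolation: leftover space is kept as extent nodes in a size/address-ordered tree, each request takes the front of the smallest adequate extent, and any unused tail goes back into the tree for a later, smaller request. The following is a condensed, illustrative restatement of base_alloc_locked() from the diff below; mutex handling and the internals of base_chunk_alloc() are left out.

    extent_node_t *node, key;
    void *ret;
    size_t csize = CACHELINE_CEILING(size);	/* Avoid false cacheline sharing. */

    key.addr = NULL;
    key.size = csize;
    node = extent_tree_szad_nsearch(&base_avail_szad, &key);
    if (node != NULL)
        extent_tree_szad_remove(&base_avail_szad, node);	/* Reuse a fragment. */
    else
        node = base_chunk_alloc(csize);	/* Map fresh demand-zeroed space. */
    if (node == NULL)
        return (NULL);
    ret = node->addr;
    if (node->size > csize) {
        /* Put the unused tail back for future requests. */
        node->addr = (void *)((uintptr_t)ret + csize);
        node->size -= csize;
        extent_tree_szad_insert(&base_avail_szad, node);
    } else
        base_node_dalloc_locked(node);	/* Node itself is recycled. */
    return (ret);
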
--- include/jemalloc/internal/base.h | 1 - include/jemalloc/internal/private_symbols.txt | 1 - src/base.c | 147 ++++++++++++++++---------- src/chunk.c | 17 +-- src/mutex.c | 6 +- 5 files changed, 104 insertions(+), 68 deletions(-) diff --git a/include/jemalloc/internal/base.h b/include/jemalloc/internal/base.h index 18b7a72..a0798ee 100644 --- a/include/jemalloc/internal/base.h +++ b/include/jemalloc/internal/base.h @@ -10,7 +10,6 @@ #ifdef JEMALLOC_H_EXTERNS void *base_alloc(size_t size); -void *base_calloc(size_t number, size_t size); extent_node_t *base_node_alloc(void); void base_node_dalloc(extent_node_t *node); size_t base_allocated_get(void); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index ba7ab38..105e664 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -103,7 +103,6 @@ atomic_sub_z base_alloc base_allocated_get base_boot -base_calloc base_node_alloc base_node_dalloc base_postfork_child diff --git a/src/base.c b/src/base.c index 22f3613..0d1de7f 100644 --- a/src/base.c +++ b/src/base.c @@ -5,73 +5,117 @@ /* Data. */ static malloc_mutex_t base_mtx; - -/* - * Current pages that are being used for internal memory allocations. These - * pages are carved up in cacheline-size quanta, so that there is no chance of - * false cache line sharing. - */ -static void *base_pages; -static void *base_next_addr; -static void *base_past_addr; /* Addr immediately past base_pages. */ +static extent_tree_t base_avail_szad; static extent_node_t *base_nodes; - static size_t base_allocated; /******************************************************************************/ -static bool -base_pages_alloc(size_t minsize) +static extent_node_t * +base_node_try_alloc_locked(void) { - size_t csize; + extent_node_t *node; - assert(minsize != 0); - csize = CHUNK_CEILING(minsize); - base_pages = chunk_alloc_base(csize); - if (base_pages == NULL) - return (true); - base_next_addr = base_pages; - base_past_addr = (void *)((uintptr_t)base_pages + csize); + if (base_nodes == NULL) + return (NULL); + node = base_nodes; + base_nodes = *(extent_node_t **)node; + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t)); + return (node); +} - return (false); +static void +base_node_dalloc_locked(extent_node_t *node) +{ + + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t)); + *(extent_node_t **)node = base_nodes; + base_nodes = node; } -void * -base_alloc(size_t size) +/* base_mtx must be held. */ +static extent_node_t * +base_chunk_alloc(size_t minsize) +{ + extent_node_t *node; + size_t csize, nsize; + void *addr; + + assert(minsize != 0); + node = base_node_try_alloc_locked(); + /* Allocate enough space to also carve a node out if necessary. */ + nsize = (node == NULL) ? CACHELINE_CEILING(sizeof(extent_node_t)) : 0; + csize = CHUNK_CEILING(minsize + nsize); + addr = chunk_alloc_base(csize); + if (addr == NULL) { + if (node != NULL) + base_node_dalloc_locked(node); + return (NULL); + } + if (node == NULL) { + csize -= nsize; + node = (extent_node_t *)((uintptr_t)addr + csize); + if (config_stats) + base_allocated += nsize; + } + node->addr = addr; + node->size = csize; + return (node); +} + +static void * +base_alloc_locked(size_t size) { void *ret; size_t csize; + extent_node_t *node; + extent_node_t key; - /* Round size up to nearest multiple of the cacheline size. 
*/ + /* + * Round size up to nearest multiple of the cacheline size, so that + * there is no chance of false cache line sharing. + */ csize = CACHELINE_CEILING(size); - malloc_mutex_lock(&base_mtx); - /* Make sure there's enough space for the allocation. */ - if ((uintptr_t)base_next_addr + csize > (uintptr_t)base_past_addr) { - if (base_pages_alloc(csize)) { - malloc_mutex_unlock(&base_mtx); - return (NULL); - } + key.addr = NULL; + key.size = csize; + node = extent_tree_szad_nsearch(&base_avail_szad, &key); + if (node != NULL) { + /* Use existing space. */ + extent_tree_szad_remove(&base_avail_szad, node); + } else { + /* Try to allocate more space. */ + node = base_chunk_alloc(csize); } - /* Allocate. */ - ret = base_next_addr; - base_next_addr = (void *)((uintptr_t)base_next_addr + csize); + if (node == NULL) + return (NULL); + + ret = node->addr; + if (node->size > csize) { + node->addr = (void *)((uintptr_t)ret + csize); + node->size -= csize; + extent_tree_szad_insert(&base_avail_szad, node); + } else + base_node_dalloc_locked(node); if (config_stats) base_allocated += csize; - malloc_mutex_unlock(&base_mtx); JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, csize); - return (ret); } +/* + * base_alloc() guarantees demand-zeroed memory, in order to make multi-page + * sparse data structures such as radix tree nodes efficient with respect to + * physical memory usage. + */ void * -base_calloc(size_t number, size_t size) +base_alloc(size_t size) { - void *ret = base_alloc(number * size); - - if (ret != NULL) - memset(ret, 0, number * size); + void *ret; + malloc_mutex_lock(&base_mtx); + ret = base_alloc_locked(size); + malloc_mutex_unlock(&base_mtx); return (ret); } @@ -81,17 +125,9 @@ base_node_alloc(void) extent_node_t *ret; malloc_mutex_lock(&base_mtx); - if (base_nodes != NULL) { - ret = base_nodes; - base_nodes = *(extent_node_t **)ret; - malloc_mutex_unlock(&base_mtx); - JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, - sizeof(extent_node_t)); - } else { - malloc_mutex_unlock(&base_mtx); - ret = (extent_node_t *)base_alloc(sizeof(extent_node_t)); - } - + if ((ret = base_node_try_alloc_locked()) == NULL) + ret = (extent_node_t *)base_alloc_locked(sizeof(extent_node_t)); + malloc_mutex_unlock(&base_mtx); return (ret); } @@ -99,10 +135,8 @@ void base_node_dalloc(extent_node_t *node) { - JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t)); malloc_mutex_lock(&base_mtx); - *(extent_node_t **)node = base_nodes; - base_nodes = node; + base_node_dalloc_locked(node); malloc_mutex_unlock(&base_mtx); } @@ -121,9 +155,10 @@ bool base_boot(void) { - base_nodes = NULL; if (malloc_mutex_init(&base_mtx)) return (true); + extent_tree_szad_new(&base_avail_szad); + base_nodes = NULL; return (false); } diff --git a/src/chunk.c b/src/chunk.c index a3ae548..01180a7 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -232,15 +232,18 @@ chunk_alloc_base(size_t size) void *ret; bool zero; - zero = false; - ret = chunk_alloc_core(NULL, size, chunksize, true, &zero, - chunk_dss_prec_get()); - if (ret == NULL) - return (NULL); - if (chunk_register(ret, size, true)) { + /* + * Directly call chunk_alloc_mmap() rather than chunk_alloc_core() + * because it's critical that chunk_alloc_base() return untouched + * demand-zeroed virtual memory. 
+ */ + zero = true; + ret = chunk_alloc_mmap(size, chunksize, &zero); + if (ret != NULL && chunk_register(ret, size, true)) { chunk_dalloc_core(ret, size); - return (NULL); + ret = NULL; } + return (ret); } diff --git a/src/mutex.c b/src/mutex.c index 788eca3..d86887e 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -83,8 +83,8 @@ malloc_mutex_init(malloc_mutex_t *mutex) mutex->postponed_next = postponed_mutexes; postponed_mutexes = mutex; } else { - if (_pthread_mutex_init_calloc_cb(&mutex->lock, base_calloc) != - 0) + if (_pthread_mutex_init_calloc_cb(&mutex->lock, + bootstrap_calloc) != 0) return (true); } #else @@ -140,7 +140,7 @@ mutex_boot(void) postpone_init = false; while (postponed_mutexes != NULL) { if (_pthread_mutex_init_calloc_cb(&postponed_mutexes->lock, - base_calloc) != 0) + bootstrap_calloc) != 0) return (true); postponed_mutexes = postponed_mutexes->postponed_next; } -- cgit v0.12 From c810fcea1fa7983ef5bcabe6556cdc19dde6dd8d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 4 Feb 2015 16:41:55 -0800 Subject: Add (x != 0) assertion to lg_floor(x). lg_floor(0) is undefined, but depending on compiler options may not cause a crash. This assertion makes it harder to accidentally abuse lg_floor(). --- include/jemalloc/internal/util.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index b2b4ab7..5ad4933 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -136,14 +136,14 @@ JEMALLOC_ALWAYS_INLINE int jemalloc_ffsl(long bitmap) { - return (JEMALLOC_INTERNAL_FFSL(bitmap)); + return (JEMALLOC_INTERNAL_FFSL(bitmap)); } JEMALLOC_ALWAYS_INLINE int jemalloc_ffs(int bitmap) { - return (JEMALLOC_INTERNAL_FFS(bitmap)); + return (JEMALLOC_INTERNAL_FFS(bitmap)); } /* Compute the smallest power of 2 that is >= x. */ @@ -170,6 +170,8 @@ lg_floor(size_t x) { size_t ret; + assert(x != 0); + asm ("bsr %1, %0" : "=r"(ret) // Outputs. : "r"(x) // Inputs. @@ -180,22 +182,26 @@ lg_floor(size_t x) JEMALLOC_INLINE size_t lg_floor(size_t x) { - unsigned long ret; + unsigned long ret; + + assert(x != 0); #if (LG_SIZEOF_PTR == 3) - _BitScanReverse64(&ret, x); + _BitScanReverse64(&ret, x); #elif (LG_SIZEOF_PTR == 2) - _BitScanReverse(&ret, x); + _BitScanReverse(&ret, x); #else # error "Unsupported type sizes for lg_floor()" #endif - return (ret); + return (ret); } #elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ)) JEMALLOC_INLINE size_t lg_floor(size_t x) { + assert(x != 0); + #if (LG_SIZEOF_PTR == LG_SIZEOF_INT) return (((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x)); #elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG) @@ -209,6 +215,8 @@ JEMALLOC_INLINE size_t lg_floor(size_t x) { + assert(x != 0); + x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); -- cgit v0.12 From 8d0e04d42f4750970ac3052a6c76379b60aba5dc Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 30 Jan 2015 22:54:08 -0800 Subject: Refactor rtree to be lock-free. Recent huge allocation refactoring associates huge allocations with arenas, but it remains necessary to quickly look up huge allocation metadata during reallocation/deallocation. A global radix tree remains a good solution to this problem, but locking would have become the primary bottleneck after (upcoming) migration of chunk management from global to per arena data structures. This lock-free implementation uses double-checked reads to traverse the tree, so that in the steady state, each read or write requires only a single atomic operation. 
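For readers unfamiliar with the pattern, the double-checked read boils down to the shape sketched below. This is not the patch code itself: it assumes the rtree_node_elm_t, rtree_node_valid(), and atomic_read_p() definitions added later in this patch, and node_init_hard() is a placeholder for the CAS-guarded slow path that allocates and publishes a missing node.

    static rtree_node_elm_t *
    child_read_sketch(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level)
    {
            /* First read: unsynchronized, so it may observe a stale value. */
            rtree_node_elm_t *child = elm->child;

            if (!rtree_node_valid(child)) {
                    /* Second read: atomic, observes any published child. */
                    child = atomic_read_p((void **)&elm->child);
            }
            if (!rtree_node_valid(child)) {
                    /*
                     * Slow path: allocate and publish the node, or spin-wait
                     * while another thread finishes doing so.
                     */
                    child = node_init_hard(rtree, elm, level);
            }
            return (child);
    }

In the steady state the first read already sees a valid node, so a lookup costs one plain load per level plus a single atomic read of the leaf value.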
This implementation also assures that no more than two tree levels actually exist, through a combination of careful virtual memory allocation which makes large sparse nodes cheap, and skipping the root node on x64 (possible because the top 16 bits are all 0 in practice). --- include/jemalloc/internal/chunk.h | 2 +- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- include/jemalloc/internal/private_symbols.txt | 15 +- include/jemalloc/internal/rtree.h | 344 +++++++++++++++-------- src/chunk.c | 25 +- src/rtree.c | 138 +++++---- test/unit/rtree.c | 77 +++-- 7 files changed, 379 insertions(+), 224 deletions(-) diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 764b7ac..62ac3e7 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -35,7 +35,7 @@ extern malloc_mutex_t chunks_mtx; /* Chunk statistics. */ extern chunk_stats_t stats_chunks; -extern rtree_t *chunks_rtree; +extern rtree_t chunks_rtree; extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 79a23e5..280501d 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -955,7 +955,7 @@ ivsalloc(const void *ptr, bool demote) { /* Return 0 if ptr is not within a chunk managed by jemalloc. */ - if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == 0) + if (rtree_get(&chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == 0) return (0); return (isalloc(ptr, demote)); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 105e664..7a78f58 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -369,14 +369,21 @@ quarantine_alloc_hook quarantine_alloc_hook_work quarantine_cleanup register_zone +rtree_child_read +rtree_child_read_hard +rtree_child_tryread rtree_delete rtree_get -rtree_get_locked rtree_new -rtree_postfork_child -rtree_postfork_parent -rtree_prefork +rtree_node_valid rtree_set +rtree_start_level +rtree_subkey +rtree_subtree_read +rtree_subtree_read_hard +rtree_subtree_tryread +rtree_val_read +rtree_val_write s2u s2u_compute s2u_lookup diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index bc74769..e86e17c 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -1,170 +1,270 @@ /* * This radix tree implementation is tailored to the singular purpose of - * tracking which chunks are currently owned by jemalloc. This functionality - * is mandatory for OS X, where jemalloc must be able to respond to object - * ownership queries. + * associating metadata with chunks that are currently owned by jemalloc. * ******************************************************************************* */ #ifdef JEMALLOC_H_TYPES +typedef struct rtree_node_elm_s rtree_node_elm_t; +typedef struct rtree_level_s rtree_level_t; typedef struct rtree_s rtree_t; /* - * Size of each radix tree node (must be a power of 2). This impacts tree - * depth. + * RTREE_BITS_PER_LEVEL must be a power of two that is no larger than the + * machine address width. 
*/ -#define RTREE_NODESIZE (1U << 16) +#define LG_RTREE_BITS_PER_LEVEL 4 +#define RTREE_BITS_PER_LEVEL (ZU(1) << LG_RTREE_BITS_PER_LEVEL) +#define RTREE_HEIGHT_MAX \ + ((ZU(1) << (LG_SIZEOF_PTR+3)) / RTREE_BITS_PER_LEVEL) -typedef void *(rtree_alloc_t)(size_t); -typedef void (rtree_dalloc_t)(void *); +/* Used for two-stage lock-free node initialization. */ +#define RTREE_NODE_INITIALIZING ((rtree_node_elm_t *)0x1) + +/* + * The node allocation callback function's argument is the number of contiguous + * rtree_node_elm_t structures to allocate, and the resulting memory must be + * zeroed. + */ +typedef rtree_node_elm_t *(rtree_node_alloc_t)(size_t); +typedef void (rtree_node_dalloc_t)(rtree_node_elm_t *); #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS +struct rtree_node_elm_s { + union { + rtree_node_elm_t *child; + void *val; + }; +}; + +struct rtree_level_s { + /* + * A non-NULL subtree points to a subtree rooted along the hypothetical + * path to the leaf node corresponding to key 0. Depending on what keys + * have been used to store to the tree, an arbitrary combination of + * subtree pointers may remain NULL. + * + * Suppose keys comprise 48 bits, and LG_RTREE_BITS_PER_LEVEL is 4. + * This results in a 3-level tree, and the leftmost leaf can be directly + * accessed via subtrees[2], the subtree prefixed by 0x0000 (excluding + * 0x00000000) can be accessed via subtrees[1], and the remainder of the + * tree can be accessed via subtrees[0]. + * + * levels[0] : [ | 0x0001******** | 0x0002******** | ...] + * + * levels[1] : [ | 0x00000001**** | 0x00000002**** | ... ] + * + * levels[2] : [val(0x000000000000) | val(0x000000000001) | ...] + * + * This has practical implications on x64, which currently uses only the + * lower 47 bits of virtual address space in userland, thus leaving + * subtrees[0] unused and avoiding a level of tree traversal. + */ + rtree_node_elm_t *subtree; + /* Number of key bits distinguished by this level. */ + unsigned bits; + /* + * Cumulative number of key bits distinguished by traversing to + * corresponding tree level. + */ + unsigned cumbits; +}; + struct rtree_s { - rtree_alloc_t *alloc; - rtree_dalloc_t *dalloc; - malloc_mutex_t mutex; - void **root; - unsigned height; - unsigned level2bits[1]; /* Dynamically sized. */ + rtree_node_alloc_t *alloc; + rtree_node_dalloc_t *dalloc; + unsigned height; + /* + * Precomputed table used to convert from the number of leading 0 key + * bits to which subtree level to start at. 
+ */ + unsigned start_level[RTREE_HEIGHT_MAX]; + rtree_level_t levels[RTREE_HEIGHT_MAX]; }; #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -rtree_t *rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc); +bool rtree_new(rtree_t *rtree, unsigned bits, rtree_node_alloc_t *alloc, + rtree_node_dalloc_t *dalloc); void rtree_delete(rtree_t *rtree); -void rtree_prefork(rtree_t *rtree); -void rtree_postfork_parent(rtree_t *rtree); -void rtree_postfork_child(rtree_t *rtree); +rtree_node_elm_t *rtree_subtree_read_hard(rtree_t *rtree, + unsigned level); +rtree_node_elm_t *rtree_child_read_hard(rtree_t *rtree, + rtree_node_elm_t *elm, unsigned level); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -#ifdef JEMALLOC_DEBUG -uint8_t rtree_get_locked(rtree_t *rtree, uintptr_t key); -#endif -uint8_t rtree_get(rtree_t *rtree, uintptr_t key); -bool rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val); +unsigned rtree_start_level(rtree_t *rtree, uintptr_t key); +uintptr_t rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level); + +bool rtree_node_valid(rtree_node_elm_t *node); +rtree_node_elm_t *rtree_child_tryread(rtree_node_elm_t *elm); +rtree_node_elm_t *rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, + unsigned level); +void *rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm); +void rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, void *val); +rtree_node_elm_t *rtree_subtree_tryread(rtree_t *rtree, unsigned level); +rtree_node_elm_t *rtree_subtree_read(rtree_t *rtree, unsigned level); + +void *rtree_get(rtree_t *rtree, uintptr_t key); +bool rtree_set(rtree_t *rtree, uintptr_t key, void *val); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_)) -#define RTREE_GET_GENERATE(f) \ -/* The least significant bits of the key are ignored. */ \ -JEMALLOC_INLINE uint8_t \ -f(rtree_t *rtree, uintptr_t key) \ -{ \ - uint8_t ret; \ - uintptr_t subkey; \ - unsigned i, lshift, height, bits; \ - void **node, **child; \ - \ - RTREE_LOCK(&rtree->mutex); \ - for (i = lshift = 0, height = rtree->height, node = rtree->root;\ - i < height - 1; \ - i++, lshift += bits, node = child) { \ - bits = rtree->level2bits[i]; \ - subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \ - 3)) - bits); \ - child = (void**)node[subkey]; \ - if (child == NULL) { \ - RTREE_UNLOCK(&rtree->mutex); \ - return (0); \ - } \ - } \ - \ - /* \ - * node is a leaf, so it contains values rather than node \ - * pointers. 
\ - */ \ - bits = rtree->level2bits[i]; \ - subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - \ - bits); \ - { \ - uint8_t *leaf = (uint8_t *)node; \ - ret = leaf[subkey]; \ - } \ - RTREE_UNLOCK(&rtree->mutex); \ - \ - RTREE_GET_VALIDATE \ - return (ret); \ +JEMALLOC_INLINE unsigned +rtree_start_level(rtree_t *rtree, uintptr_t key) +{ + unsigned start_level; + + if (unlikely(key == 0)) + return (rtree->height - 1); + + start_level = rtree->start_level[lg_floor(key) >> + LG_RTREE_BITS_PER_LEVEL]; + assert(start_level < rtree->height); + return (start_level); } -#ifdef JEMALLOC_DEBUG -# define RTREE_LOCK(l) malloc_mutex_lock(l) -# define RTREE_UNLOCK(l) malloc_mutex_unlock(l) -# define RTREE_GET_VALIDATE -RTREE_GET_GENERATE(rtree_get_locked) -# undef RTREE_LOCK -# undef RTREE_UNLOCK -# undef RTREE_GET_VALIDATE -#endif +JEMALLOC_INLINE uintptr_t +rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level) +{ -#define RTREE_LOCK(l) -#define RTREE_UNLOCK(l) -#ifdef JEMALLOC_DEBUG - /* - * Suppose that it were possible for a jemalloc-allocated chunk to be - * munmap()ped, followed by a different allocator in another thread re-using - * overlapping virtual memory, all without invalidating the cached rtree - * value. The result would be a false positive (the rtree would claim that - * jemalloc owns memory that it had actually discarded). This scenario - * seems impossible, but the following assertion is a prudent sanity check. - */ -# define RTREE_GET_VALIDATE \ - assert(rtree_get_locked(rtree, key) == ret); -#else -# define RTREE_GET_VALIDATE -#endif -RTREE_GET_GENERATE(rtree_get) -#undef RTREE_LOCK -#undef RTREE_UNLOCK -#undef RTREE_GET_VALIDATE + return ((key >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - + rtree->levels[level].cumbits)) & ((ZU(1) << + rtree->levels[level].bits) - 1)); +} JEMALLOC_INLINE bool -rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val) +rtree_node_valid(rtree_node_elm_t *node) +{ + + return ((uintptr_t)node > (uintptr_t)RTREE_NODE_INITIALIZING); +} + +JEMALLOC_INLINE rtree_node_elm_t * +rtree_child_tryread(rtree_node_elm_t *elm) +{ + rtree_node_elm_t *child; + + /* Double-checked read (first read may be stale. */ + child = elm->child; + if (!rtree_node_valid(child)) + child = atomic_read_p((void **)&elm->child); + return (child); +} + +JEMALLOC_INLINE rtree_node_elm_t * +rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level) +{ + rtree_node_elm_t *child; + + child = rtree_child_tryread(elm); + if (unlikely(!rtree_node_valid(child))) + child = rtree_child_read_hard(rtree, elm, level); + return (child); +} + +JEMALLOC_INLINE void * +rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm) +{ + + return (atomic_read_p(&elm->val)); +} + +JEMALLOC_INLINE void +rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, void *val) +{ + + atomic_write_p(&elm->val, val); +} + +JEMALLOC_INLINE rtree_node_elm_t * +rtree_subtree_tryread(rtree_t *rtree, unsigned level) +{ + rtree_node_elm_t *subtree; + + /* Double-checked read (first read may be stale. 
*/ + subtree = rtree->levels[level].subtree; + if (!rtree_node_valid(subtree)) + subtree = atomic_read_p((void **)&rtree->levels[level].subtree); + return (subtree); +} + +JEMALLOC_INLINE rtree_node_elm_t * +rtree_subtree_read(rtree_t *rtree, unsigned level) +{ + rtree_node_elm_t *subtree; + + subtree = rtree_subtree_tryread(rtree, level); + if (unlikely(!rtree_node_valid(subtree))) + subtree = rtree_subtree_read_hard(rtree, level); + return (subtree); +} + +JEMALLOC_INLINE void * +rtree_get(rtree_t *rtree, uintptr_t key) { uintptr_t subkey; - unsigned i, lshift, height, bits; - void **node, **child; - - malloc_mutex_lock(&rtree->mutex); - for (i = lshift = 0, height = rtree->height, node = rtree->root; - i < height - 1; - i++, lshift += bits, node = child) { - bits = rtree->level2bits[i]; - subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - - bits); - child = (void**)node[subkey]; - if (child == NULL) { - size_t size = ((i + 1 < height - 1) ? sizeof(void *) - : (sizeof(uint8_t))) << rtree->level2bits[i+1]; - child = (void**)rtree->alloc(size); - if (child == NULL) { - malloc_mutex_unlock(&rtree->mutex); - return (true); - } - memset(child, 0, size); - node[subkey] = child; + unsigned i, start_level; + rtree_node_elm_t *node, *child; + + start_level = rtree_start_level(rtree, key); + + for (i = start_level, node = rtree_subtree_tryread(rtree, start_level); + /**/; i++, node = child) { + if (unlikely(!rtree_node_valid(node))) + return (NULL); + subkey = rtree_subkey(rtree, key, i); + if (i == rtree->height - 1) { + /* + * node is a leaf, so it contains values rather than + * child pointers. + */ + return (rtree_val_read(rtree, &node[subkey])); } + assert(i < rtree->height - 1); + child = rtree_child_tryread(&node[subkey]); } + not_reached(); +} - /* node is a leaf, so it contains values rather than node pointers. */ - bits = rtree->level2bits[i]; - subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits); - { - uint8_t *leaf = (uint8_t *)node; - leaf[subkey] = val; - } - malloc_mutex_unlock(&rtree->mutex); +JEMALLOC_INLINE bool +rtree_set(rtree_t *rtree, uintptr_t key, void *val) +{ + uintptr_t subkey; + unsigned i, start_level; + rtree_node_elm_t *node, *child; - return (false); + start_level = rtree_start_level(rtree, key); + + node = rtree_subtree_read(rtree, start_level); + if (node == NULL) + return (true); + for (i = start_level; /**/; i++, node = child) { + subkey = rtree_subkey(rtree, key, i); + if (i == rtree->height - 1) { + /* + * node is a leaf, so it contains values rather than + * child pointers. + */ + rtree_val_write(rtree, &node[subkey], val); + return (false); + } + assert(i < rtree->height - 1); + child = rtree_child_read(rtree, &node[subkey], i); + if (child == NULL) + return (true); + } + not_reached(); } #endif diff --git a/src/chunk.c b/src/chunk.c index 01180a7..9ba0b0c 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -21,7 +21,7 @@ static extent_tree_t chunks_ad_mmap; static extent_tree_t chunks_szad_dss; static extent_tree_t chunks_ad_dss; -rtree_t *chunks_rtree; +rtree_t chunks_rtree; /* Various chunk-related settings. 
*/ size_t chunksize; @@ -200,7 +200,7 @@ chunk_register(void *chunk, size_t size, bool base) assert(CHUNK_ADDR2BASE(chunk) == chunk); if (config_ivsalloc && !base) { - if (rtree_set(chunks_rtree, (uintptr_t)chunk, 1)) + if (rtree_set(&chunks_rtree, (uintptr_t)chunk, chunk)) return (true); } if (config_stats || config_prof) { @@ -395,7 +395,7 @@ chunk_dalloc_core(void *chunk, size_t size) assert((size & chunksize_mask) == 0); if (config_ivsalloc) - rtree_set(chunks_rtree, (uintptr_t)chunk, 0); + rtree_set(&chunks_rtree, (uintptr_t)chunk, NULL); if (config_stats || config_prof) { malloc_mutex_lock(&chunks_mtx); assert(stats_chunks.curchunks >= (size / chunksize)); @@ -415,6 +415,14 @@ chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) return (false); } +static rtree_node_elm_t * +chunks_rtree_node_alloc(size_t nelms) +{ + + return ((rtree_node_elm_t *)base_alloc(nelms * + sizeof(rtree_node_elm_t))); +} + bool chunk_boot(void) { @@ -436,9 +444,8 @@ chunk_boot(void) extent_tree_szad_new(&chunks_szad_dss); extent_tree_ad_new(&chunks_ad_dss); if (config_ivsalloc) { - chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) - - opt_lg_chunk, base_alloc, NULL); - if (chunks_rtree == NULL) + if (rtree_new(&chunks_rtree, (ZU(1) << (LG_SIZEOF_PTR+3)) - + opt_lg_chunk, chunks_rtree_node_alloc, NULL)) return (true); } @@ -450,8 +457,6 @@ chunk_prefork(void) { malloc_mutex_prefork(&chunks_mtx); - if (config_ivsalloc) - rtree_prefork(chunks_rtree); chunk_dss_prefork(); } @@ -460,8 +465,6 @@ chunk_postfork_parent(void) { chunk_dss_postfork_parent(); - if (config_ivsalloc) - rtree_postfork_parent(chunks_rtree); malloc_mutex_postfork_parent(&chunks_mtx); } @@ -470,7 +473,5 @@ chunk_postfork_child(void) { chunk_dss_postfork_child(); - if (config_ivsalloc) - rtree_postfork_child(chunks_rtree); malloc_mutex_postfork_child(&chunks_mtx); } diff --git a/src/rtree.c b/src/rtree.c index 2ff93db..47d9084 100644 --- a/src/rtree.c +++ b/src/rtree.c @@ -1,75 +1,74 @@ #define JEMALLOC_RTREE_C_ #include "jemalloc/internal/jemalloc_internal.h" -rtree_t * -rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc) +static unsigned +hmin(unsigned ha, unsigned hb) { - rtree_t *ret; - unsigned bits_per_level, bits_in_leaf, height, i; + + return (ha < hb ? ha : hb); +} + +/* Only the most significant bits of keys passed to rtree_[gs]et() are used. */ +bool +rtree_new(rtree_t *rtree, unsigned bits, rtree_node_alloc_t *alloc, + rtree_node_dalloc_t *dalloc) +{ + unsigned bits_in_leaf, height, i; assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3)); - bits_per_level = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void - *)))) - 1; - bits_in_leaf = jemalloc_ffs(pow2_ceil((RTREE_NODESIZE / - sizeof(uint8_t)))) - 1; + bits_in_leaf = (bits % RTREE_BITS_PER_LEVEL) == 0 ? RTREE_BITS_PER_LEVEL + : (bits % RTREE_BITS_PER_LEVEL); if (bits > bits_in_leaf) { - height = 1 + (bits - bits_in_leaf) / bits_per_level; - if ((height-1) * bits_per_level + bits_in_leaf != bits) + height = 1 + (bits - bits_in_leaf) / RTREE_BITS_PER_LEVEL; + if ((height-1) * RTREE_BITS_PER_LEVEL + bits_in_leaf != bits) height++; - } else { + } else height = 1; + assert((height-1) * RTREE_BITS_PER_LEVEL + bits_in_leaf == bits); + + rtree->alloc = alloc; + rtree->dalloc = dalloc; + rtree->height = height; + + /* Root level. */ + rtree->levels[0].subtree = NULL; + rtree->levels[0].bits = (height > 1) ? RTREE_BITS_PER_LEVEL : + bits_in_leaf; + rtree->levels[0].cumbits = rtree->levels[0].bits; + /* Interior levels. 
*/ + for (i = 1; i < height-1; i++) { + rtree->levels[i].subtree = NULL; + rtree->levels[i].bits = RTREE_BITS_PER_LEVEL; + rtree->levels[i].cumbits = rtree->levels[i-1].cumbits + + RTREE_BITS_PER_LEVEL; } - assert((height-1) * bits_per_level + bits_in_leaf >= bits); - - ret = (rtree_t*)alloc(offsetof(rtree_t, level2bits) + - (sizeof(unsigned) * height)); - if (ret == NULL) - return (NULL); - memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) * - height)); - - ret->alloc = alloc; - ret->dalloc = dalloc; - if (malloc_mutex_init(&ret->mutex)) { - if (dalloc != NULL) - dalloc(ret); - return (NULL); - } - ret->height = height; + /* Leaf level. */ if (height > 1) { - if ((height-1) * bits_per_level + bits_in_leaf > bits) { - ret->level2bits[0] = (bits - bits_in_leaf) % - bits_per_level; - } else - ret->level2bits[0] = bits_per_level; - for (i = 1; i < height-1; i++) - ret->level2bits[i] = bits_per_level; - ret->level2bits[height-1] = bits_in_leaf; - } else - ret->level2bits[0] = bits; + rtree->levels[height-1].subtree = NULL; + rtree->levels[height-1].bits = bits_in_leaf; + rtree->levels[height-1].cumbits = bits; + } - ret->root = (void**)alloc(sizeof(void *) << ret->level2bits[0]); - if (ret->root == NULL) { - if (dalloc != NULL) - dalloc(ret); - return (NULL); + /* Compute lookup table to be used by rtree_start_level(). */ + for (i = 0; i < RTREE_HEIGHT_MAX; i++) { + rtree->start_level[i] = hmin(RTREE_HEIGHT_MAX - 1 - i, height - + 1); } - memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]); - return (ret); + return (false); } static void -rtree_delete_subtree(rtree_t *rtree, void **node, unsigned level) +rtree_delete_subtree(rtree_t *rtree, rtree_node_elm_t *node, unsigned level) { if (level < rtree->height - 1) { size_t nchildren, i; - nchildren = ZU(1) << rtree->level2bits[level]; + nchildren = ZU(1) << rtree->levels[level].bits; for (i = 0; i < nchildren; i++) { - void **child = (void **)node[i]; + rtree_node_elm_t *child = node[i].child; if (child != NULL) rtree_delete_subtree(rtree, child, level + 1); } @@ -80,28 +79,49 @@ rtree_delete_subtree(rtree_t *rtree, void **node, unsigned level) void rtree_delete(rtree_t *rtree) { + unsigned i; - rtree_delete_subtree(rtree, rtree->root, 0); - rtree->dalloc(rtree); + for (i = 0; i < rtree->height; i++) { + rtree_node_elm_t *subtree = rtree->levels[i].subtree; + if (subtree != NULL) + rtree_delete_subtree(rtree, subtree, i); + } } -void -rtree_prefork(rtree_t *rtree) +static rtree_node_elm_t * +rtree_node_init(rtree_t *rtree, unsigned level, rtree_node_elm_t **elmp) { + rtree_node_elm_t *node; + + if (atomic_cas_p((void **)elmp, NULL, RTREE_NODE_INITIALIZING)) { + /* + * Another thread is already in the process of initializing. + * Spin-wait until initialization is complete. 
+ */ + do { + CPU_SPINWAIT; + node = atomic_read_p((void **)elmp); + } while (node == RTREE_NODE_INITIALIZING); + } else { + node = rtree->alloc(ZU(1) << rtree->levels[level].bits); + if (node == NULL) + return (NULL); + atomic_write_p((void **)elmp, node); + } - malloc_mutex_prefork(&rtree->mutex); + return (node); } -void -rtree_postfork_parent(rtree_t *rtree) +rtree_node_elm_t * +rtree_subtree_read_hard(rtree_t *rtree, unsigned level) { - malloc_mutex_postfork_parent(&rtree->mutex); + return (rtree_node_init(rtree, level, &rtree->levels[level].subtree)); } -void -rtree_postfork_child(rtree_t *rtree) +rtree_node_elm_t * +rtree_child_read_hard(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level) { - malloc_mutex_postfork_child(&rtree->mutex); + return (rtree_node_init(rtree, level, &elm->child)); } diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 77a947d..556c4a8 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -1,14 +1,30 @@ #include "test/jemalloc_test.h" +static rtree_node_elm_t * +node_alloc(size_t nelms) +{ + + return (calloc(nelms, sizeof(rtree_node_elm_t))); +} + +static void +node_dalloc(rtree_node_elm_t *node) +{ + + free(node); +} + TEST_BEGIN(test_rtree_get_empty) { unsigned i; for (i = 1; i <= (sizeof(uintptr_t) << 3); i++) { - rtree_t *rtree = rtree_new(i, malloc, free); - assert_u_eq(rtree_get(rtree, 0), 0, + rtree_t rtree; + assert_false(rtree_new(&rtree, i, node_alloc, node_dalloc), + "Unexpected rtree_new() failure"); + assert_ptr_eq(rtree_get(&rtree, 0), NULL, "rtree_get() should return NULL for empty tree"); - rtree_delete(rtree); + rtree_delete(&rtree); } } TEST_END @@ -16,19 +32,22 @@ TEST_END TEST_BEGIN(test_rtree_extrema) { unsigned i; + extent_node_t node_a, node_b; for (i = 1; i <= (sizeof(uintptr_t) << 3); i++) { - rtree_t *rtree = rtree_new(i, malloc, free); + rtree_t rtree; + assert_false(rtree_new(&rtree, i, node_alloc, node_dalloc), + "Unexpected rtree_new() failure"); - rtree_set(rtree, 0, 1); - assert_u_eq(rtree_get(rtree, 0), 1, + rtree_set(&rtree, 0, &node_a); + assert_ptr_eq(rtree_get(&rtree, 0), &node_a, "rtree_get() should return previously set value"); - rtree_set(rtree, ~((uintptr_t)0), 1); - assert_u_eq(rtree_get(rtree, ~((uintptr_t)0)), 1, + rtree_set(&rtree, ~((uintptr_t)0), &node_b); + assert_ptr_eq(rtree_get(&rtree, ~((uintptr_t)0)), &node_b, "rtree_get() should return previously set value"); - rtree_delete(rtree); + rtree_delete(&rtree); } } TEST_END @@ -40,26 +59,30 @@ TEST_BEGIN(test_rtree_bits) for (i = 1; i < (sizeof(uintptr_t) << 3); i++) { uintptr_t keys[] = {0, 1, (((uintptr_t)1) << (sizeof(uintptr_t)*8-i)) - 1}; - rtree_t *rtree = rtree_new(i, malloc, free); + extent_node_t node; + rtree_t rtree; + + assert_false(rtree_new(&rtree, i, node_alloc, node_dalloc), + "Unexpected rtree_new() failure"); for (j = 0; j < sizeof(keys)/sizeof(uintptr_t); j++) { - rtree_set(rtree, keys[j], 1); + rtree_set(&rtree, keys[j], &node); for (k = 0; k < sizeof(keys)/sizeof(uintptr_t); k++) { - assert_u_eq(rtree_get(rtree, keys[k]), 1, + assert_ptr_eq(rtree_get(&rtree, keys[k]), &node, "rtree_get() should return previously set " "value and ignore insignificant key bits; " "i=%u, j=%u, k=%u, set key=%#"PRIxPTR", " "get key=%#"PRIxPTR, i, j, k, keys[j], keys[k]); } - assert_u_eq(rtree_get(rtree, - (((uintptr_t)1) << (sizeof(uintptr_t)*8-i))), 0, + assert_ptr_eq(rtree_get(&rtree, + (((uintptr_t)1) << (sizeof(uintptr_t)*8-i))), NULL, "Only leftmost rtree leaf should be set; " "i=%u, j=%u", i, j); - rtree_set(rtree, keys[j], 0); + 
rtree_set(&rtree, keys[j], NULL); } - rtree_delete(rtree); + rtree_delete(&rtree); } } TEST_END @@ -68,37 +91,41 @@ TEST_BEGIN(test_rtree_random) { unsigned i; sfmt_t *sfmt; -#define NSET 100 +#define NSET 16 #define SEED 42 sfmt = init_gen_rand(SEED); for (i = 1; i <= (sizeof(uintptr_t) << 3); i++) { - rtree_t *rtree = rtree_new(i, malloc, free); uintptr_t keys[NSET]; + extent_node_t node; unsigned j; + rtree_t rtree; + + assert_false(rtree_new(&rtree, i, node_alloc, node_dalloc), + "Unexpected rtree_new() failure"); for (j = 0; j < NSET; j++) { keys[j] = (uintptr_t)gen_rand64(sfmt); - rtree_set(rtree, keys[j], 1); - assert_u_eq(rtree_get(rtree, keys[j]), 1, + rtree_set(&rtree, keys[j], &node); + assert_ptr_eq(rtree_get(&rtree, keys[j]), &node, "rtree_get() should return previously set value"); } for (j = 0; j < NSET; j++) { - assert_u_eq(rtree_get(rtree, keys[j]), 1, + assert_ptr_eq(rtree_get(&rtree, keys[j]), &node, "rtree_get() should return previously set value"); } for (j = 0; j < NSET; j++) { - rtree_set(rtree, keys[j], 0); - assert_u_eq(rtree_get(rtree, keys[j]), 0, + rtree_set(&rtree, keys[j], NULL); + assert_ptr_eq(rtree_get(&rtree, keys[j]), NULL, "rtree_get() should return previously set value"); } for (j = 0; j < NSET; j++) { - assert_u_eq(rtree_get(rtree, keys[j]), 0, + assert_ptr_eq(rtree_get(&rtree, keys[j]), NULL, "rtree_get() should return previously set value"); } - rtree_delete(rtree); + rtree_delete(&rtree); } fini_gen_rand(sfmt); #undef NSET -- cgit v0.12 From 23694b07457f3aaf9605a4ff6b386f3c897eb624 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 9 Feb 2015 16:19:27 -0800 Subject: Fix arena_get() for (!init_if_missing && refresh_if_missing) case. Fix arena_get() to refresh the cache as needed in the (!init_if_missing && refresh_if_missing) case. This flaw was introduced by the initial arena_get() implementation, which was part of 8bb3198f72fc7587dc93527f9f19fb5be52fa553 (Refactor/fix arenas manipulation.). --- include/jemalloc/internal/jemalloc_internal.h.in | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 280501d..2b16742 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -755,10 +755,7 @@ arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, arena = arenas_cache[ind]; if (likely(arena != NULL) || !refresh_if_missing) return (arena); - if (init_if_missing) - return (arena_get_hard(tsd, ind, init_if_missing)); - else - return (NULL); + return (arena_get_hard(tsd, ind, init_if_missing)); } #endif -- cgit v0.12 From 1cb181ed632e7573fb4eab194e4d216867222d27 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 29 Jan 2015 15:30:47 -0800 Subject: Implement explicit tcache support. Add the MALLOCX_TCACHE() and MALLOCX_TCACHE_NONE macros, which can be used in conjunction with the *allocx() API. Add the tcache.create, tcache.flush, and tcache.destroy mallctls. This resolves #145. 
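As a usage sketch (not part of the patch): the example below shows how the new mallctls and the MALLOCX_TCACHE() flag are meant to compose, relying only on the mallctl/mallocx/dallocx prototypes from <jemalloc/jemalloc.h> and the interfaces documented in the diff that follows; error handling is mostly elided and NSLABS is an illustrative constant.

    #include <jemalloc/jemalloc.h>
    #include <stddef.h>

    #define NSLABS 16

    int
    use_explicit_tcache(void)
    {
            unsigned tc;
            size_t sz = sizeof(tc);
            void *slab[NSLABS];
            unsigned i;

            /* tcache.create returns an identifier for a fresh explicit tcache. */
            if (mallctl("tcache.create", (void *)&tc, &sz, NULL, 0) != 0)
                    return (1);

            /* Route allocations through the explicit tcache... */
            for (i = 0; i < NSLABS; i++)
                    slab[i] = mallocx(4096, MALLOCX_TCACHE(tc));

            /* ...and release them through the same cache. */
            for (i = 0; i < NSLABS; i++)
                    dallocx(slab[i], MALLOCX_TCACHE(tc));

            /* Or bypass thread-specific caching entirely for a one-off allocation. */
            dallocx(mallocx(64, MALLOCX_TCACHE_NONE), MALLOCX_TCACHE_NONE);

            /* tcache.destroy flushes the cache and recycles the identifier. */
            if (mallctl("tcache.destroy", NULL, NULL, (void *)&tc, sizeof(tc)) != 0)
                    return (1);
            return (0);
    }

Note that an explicit tcache may be used by only one thread at a time, and unlike the automatic per-thread tcache it is never flushed or discarded implicitly, so pairing tcache.create with tcache.destroy is the caller's responsibility.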
--- doc/jemalloc.xml.in | 106 ++++++++++++--- include/jemalloc/internal/arena.h | 51 +++---- include/jemalloc/internal/huge.h | 8 +- include/jemalloc/internal/jemalloc_internal.h.in | 134 ++++++++++-------- include/jemalloc/internal/private_symbols.txt | 5 + include/jemalloc/internal/tcache.h | 102 +++++++++----- include/jemalloc/jemalloc_macros.h.in | 12 +- src/arena.c | 24 ++-- src/ckh.c | 7 +- src/ctl.c | 113 +++++++++++++-- src/huge.c | 36 +++-- src/jemalloc.c | 158 +++++++++++---------- src/prof.c | 35 +++-- src/quarantine.c | 10 +- src/tcache.c | 166 +++++++++++++++++------ test/unit/mallctl.c | 110 +++++++++++++++ 16 files changed, 740 insertions(+), 337 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 739b33a..da800de 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -242,7 +242,7 @@ relevant. Use bitwise or (|) operations to specify one or more of the following: - + MALLOCX_LG_ALIGN(la) @@ -252,7 +252,7 @@ that la is within the valid range. - + MALLOCX_ALIGN(a) @@ -262,7 +262,7 @@ validate that a is a power of 2. - + MALLOCX_ZERO Initialize newly allocated memory to contain zero @@ -271,16 +271,38 @@ that are initialized to contain zero bytes. If this macro is absent, newly allocated memory is uninitialized. - + + MALLOCX_TCACHE(tc) + + + Use the thread-specific cache (tcache) specified by + the identifier tc, which must have been + acquired via the tcache.create + mallctl. This macro does not validate that + tc specifies a valid + identifier. + + + MALLOCX_TCACHE_NONE + + Do not use a thread-specific cache (tcache). Unless + MALLOCX_TCACHE(tc) or + MALLOCX_TCACHE_NONE is specified, an + automatically managed tcache will be used under many circumstances. + This macro cannot be used in the same flags + argument as + MALLOCX_TCACHE(tc). + + MALLOCX_ARENA(a) Use the arena specified by the index - a (and by necessity bypass the thread - cache). This macro has no effect for regions that were allocated - via an arena other than the one specified. This macro does not - validate that a specifies an arena index in - the valid range. + a. This macro has no effect for regions that + were allocated via an arena other than the one specified. This + macro does not validate that a specifies an + arena index in the valid range. @@ -1060,12 +1082,11 @@ malloc_conf = "xmalloc:true";]]> r- [] - Thread-specific caching enabled/disabled. When there - are multiple threads, each thread uses a thread-specific cache for - objects up to a certain size. Thread-specific caching allows many - allocations to be satisfied without performing any thread - synchronization, at the cost of increased memory use. See the - Thread-specific caching (tcache) enabled/disabled. When + there are multiple threads, each thread uses a tcache for objects up to + a certain size. Thread-specific caching allows many allocations to be + satisfied without performing any thread synchronization, at the cost of + increased memory use. See the opt.lg_tcache_max option for related tuning information. This option is enabled by default unless running inside [] Maximum size class (log base 2) to cache in the - thread-specific cache. At a minimum, all small size classes are - cached, and at a maximum all large size classes are cached. The + thread-specific cache (tcache). At a minimum, all small size classes + are cached, and at a maximum all large size classes are cached. The default maximum is 32 KiB (2^15). @@ -1339,7 +1360,7 @@ malloc_conf = "xmalloc:true";]]> Enable/disable calling thread's tcache. 
The tcache is implicitly flushed as a side effect of becoming disabled (see thread.tcache.flush). + linkend="thread.tcache.flush">thread.tcache.flush). @@ -1350,9 +1371,9 @@ malloc_conf = "xmalloc:true";]]> -- [] - Flush calling thread's tcache. This interface releases - all cached objects and internal data structures associated with the - calling thread's thread-specific cache. Ordinarily, this interface + Flush calling thread's thread-specific cache (tcache). + This interface releases all cached objects and internal data structures + associated with the calling thread's tcache. Ordinarily, this interface need not be called, since automatic periodic incremental garbage collection occurs, and the thread cache is automatically discarded when a thread exits. However, garbage collection is triggered by allocation @@ -1399,6 +1420,49 @@ malloc_conf = "xmalloc:true";]]> default. + + + tcache.create + (unsigned) + r- + [] + + Create an explicit thread-specific cache (tcache) and + return an identifier that can be passed to the MALLOCX_TCACHE(tc) + macro to explicitly use the specified cache rather than the + automatically managed one that is used by default. Each explicit cache + can be used by only one thread at a time; the application must assure + that this constraint holds. + + + + + + tcache.flush + (unsigned) + -w + [] + + Flush the specified thread-specific cache (tcache). The + same considerations apply to this interface as to thread.tcache.flush, + except that the tcache will never be automatically be discarded. + + + + + + tcache.destroy + (unsigned) + -w + [] + + Flush the specified thread-specific cache (tcache) and + make the identifier available for use during a future tcache creation. + + + arena.<i>.purge diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 46367f6..5476899 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -272,7 +272,8 @@ struct arena_s { arena_stats_t stats; /* * List of tcaches for extant threads associated with this arena. - * Stats from these are merged incrementally, and at exit. + * Stats from these are merged incrementally, and at exit if + * opt_stats_print is enabled. 
*/ ql_head(tcache_t) tcache_ql; @@ -387,8 +388,7 @@ extern arena_ralloc_junk_large_t *arena_ralloc_junk_large; bool arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); void *arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, - size_t size, size_t extra, size_t alignment, bool zero, - bool try_tcache_alloc, bool try_tcache_dalloc); + size_t size, size_t extra, size_t alignment, bool zero, tcache_t *tcache); dss_prec_t arena_dss_prec_get(arena_t *arena); bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, @@ -450,13 +450,13 @@ unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, prof_tctx_t *arena_prof_tctx_get(const void *ptr); void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, - bool try_tcache); + tcache_t *tcache); arena_t *arena_aalloc(const void *ptr); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, - bool try_tcache); + tcache_t *tcache); void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, - bool try_tcache); + tcache_t *tcache); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) @@ -943,17 +943,15 @@ arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) JEMALLOC_ALWAYS_INLINE void * arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, - bool try_tcache) + tcache_t *tcache) { - tcache_t *tcache; assert(size != 0); assert(size <= arena_maxclass); if (likely(size <= SMALL_MAXCLASS)) { - if (likely(try_tcache) && likely((tcache = tcache_get(tsd, - true)) != NULL)) - return (tcache_alloc_small(tcache, size, zero)); + if (likely(tcache != NULL)) + return (tcache_alloc_small(tsd, tcache, size, zero)); else { arena = arena_choose(tsd, arena); if (unlikely(arena == NULL)) @@ -965,9 +963,8 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, * Initialize tcache after checking size in order to avoid * infinite recursion during tcache initialization. */ - if (try_tcache && size <= tcache_maxclass && likely((tcache = - tcache_get(tsd, true)) != NULL)) - return (tcache_alloc_large(tcache, size, zero)); + if (likely(tcache != NULL) && size <= tcache_maxclass) + return (tcache_alloc_large(tsd, tcache, size, zero)); else { arena = arena_choose(tsd, arena); if (unlikely(arena == NULL)) @@ -1027,10 +1024,9 @@ arena_salloc(const void *ptr, bool demote) } JEMALLOC_ALWAYS_INLINE void -arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache) +arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, tcache_t *tcache) { size_t pageind, mapbits; - tcache_t *tcache; assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); @@ -1040,11 +1036,10 @@ arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache) assert(arena_mapbits_allocated_get(chunk, pageind) != 0); if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) { /* Small allocation. 
*/ - if (likely(try_tcache) && likely((tcache = tcache_get(tsd, - false)) != NULL)) { + if (likely(tcache != NULL)) { index_t binind = arena_ptr_small_binind_get(ptr, mapbits); - tcache_dalloc_small(tcache, ptr, binind); + tcache_dalloc_small(tsd, tcache, ptr, binind); } else arena_dalloc_small(chunk->arena, chunk, ptr, pageind); } else { @@ -1052,9 +1047,8 @@ arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache) assert(((uintptr_t)ptr & PAGE_MASK) == 0); - if (try_tcache && size <= tcache_maxclass && likely((tcache = - tcache_get(tsd, false)) != NULL)) - tcache_dalloc_large(tcache, ptr, size); + if (likely(tcache != NULL) && size <= tcache_maxclass) + tcache_dalloc_large(tsd, tcache, ptr, size); else arena_dalloc_large(chunk->arena, chunk, ptr); } @@ -1062,9 +1056,8 @@ arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, bool try_tcache) JEMALLOC_ALWAYS_INLINE void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, - bool try_tcache) + tcache_t *tcache) { - tcache_t *tcache; assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); @@ -1082,10 +1075,9 @@ arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, if (likely(size <= SMALL_MAXCLASS)) { /* Small allocation. */ - if (likely(try_tcache) && likely((tcache = tcache_get(tsd, - false)) != NULL)) { + if (likely(tcache != NULL)) { index_t binind = size2index(size); - tcache_dalloc_small(tcache, ptr, binind); + tcache_dalloc_small(tsd, tcache, ptr, binind); } else { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; @@ -1094,9 +1086,8 @@ arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, } else { assert(((uintptr_t)ptr & PAGE_MASK) == 0); - if (try_tcache && size <= tcache_maxclass && (tcache = - tcache_get(tsd, false)) != NULL) - tcache_dalloc_large(tcache, ptr, size); + if (likely(tcache != NULL) && size <= tcache_maxclass) + tcache_dalloc_large(tsd, tcache, ptr, size); else arena_dalloc_large(chunk->arena, chunk, ptr); } diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index decb024..231cc36 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -10,19 +10,19 @@ #ifdef JEMALLOC_H_EXTERNS void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, - bool try_tcache); + tcache_t *tcache); void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, - bool zero, bool try_tcache); + bool zero, tcache_t *tcache); bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, - bool try_tcache_alloc, bool try_tcache_dalloc); + tcache_t *tcache); #ifdef JEMALLOC_JET typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif -void huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache); +void huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); arena_t *huge_aalloc(const void *ptr); size_t huge_salloc(const void *ptr); prof_tctx_t *huge_prof_tctx_get(const void *ptr); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 2b16742..b8c994c 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -172,7 +172,21 @@ static const bool config_ivsalloc = /* Size class index type. 
*/ typedef unsigned index_t; -#define MALLOCX_ARENA_MASK ((int)~0xff) +/* + * Flags bits: + * + * a: arena + * t: tcache + * 0: unused + * z: zero + * n: alignment + * + * aaaaaaaa aaaatttt tttttttt 0znnnnnn + */ +#define MALLOCX_ARENA_MASK ((int)~0xfffff) +#define MALLOCX_ARENA_MAX 0xffe +#define MALLOCX_TCACHE_MASK ((int)~0xfff000ffU) +#define MALLOCX_TCACHE_MAX 0xffd #define MALLOCX_LG_ALIGN_MASK ((int)0x3f) /* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */ #define MALLOCX_ALIGN_GET_SPECIFIED(flags) \ @@ -181,8 +195,11 @@ typedef unsigned index_t; (MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX-1)) #define MALLOCX_ZERO_GET(flags) \ ((bool)(flags & MALLOCX_ZERO)) + +#define MALLOCX_TCACHE_GET(flags) \ + (((unsigned)((flags & MALLOCX_TCACHE_MASK) >> 8)) - 2) #define MALLOCX_ARENA_GET(flags) \ - (((unsigned)(flags >> 8)) - 1) + (((unsigned)(((unsigned)flags) >> 20)) - 1) /* Smallest size class to support. */ #define TINY_MIN (1U << LG_TINY_MIN) @@ -749,7 +766,7 @@ arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, * ind is invalid, cache is old (too small), or arena to be * initialized. */ - return (refresh_if_missing ? arena_get_hard(tsd, ind, + return (refresh_if_missing ? arena_get_hard(tsd, ind, init_if_missing) : NULL); } arena = arenas_cache[ind]; @@ -778,32 +795,31 @@ arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, #ifndef JEMALLOC_ENABLE_INLINE arena_t *iaalloc(const void *ptr); size_t isalloc(const void *ptr, bool demote); -void *iallocztm(tsd_t *tsd, size_t size, bool zero, bool try_tcache, +void *iallocztm(tsd_t *tsd, size_t size, bool zero, tcache_t *tcache, bool is_metadata, arena_t *arena); -void *imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena); +void *imalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena); void *imalloc(tsd_t *tsd, size_t size); -void *icalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena); +void *icalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena); void *icalloc(tsd_t *tsd, size_t size); void *ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, - bool try_tcache, bool is_metadata, arena_t *arena); + tcache_t *tcache, bool is_metadata, arena_t *arena); void *ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, - bool try_tcache, arena_t *arena); + tcache_t *tcache, arena_t *arena); void *ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero); size_t ivsalloc(const void *ptr, bool demote); size_t u2rz(size_t usize); size_t p2rz(const void *ptr); -void idalloctm(tsd_t *tsd, void *ptr, bool try_tcache, bool is_metadata); -void idalloct(tsd_t *tsd, void *ptr, bool try_tcache); +void idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata); +void idalloct(tsd_t *tsd, void *ptr, tcache_t *tcache); void idalloc(tsd_t *tsd, void *ptr); -void iqalloc(tsd_t *tsd, void *ptr, bool try_tcache); -void isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache); -void isqalloc(tsd_t *tsd, void *ptr, size_t size, bool try_tcache); +void iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); +void isdalloct(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); +void isqalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); void *iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, - bool try_tcache_dalloc, arena_t *arena); -void *iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, - size_t alignment, bool zero, bool try_tcache_alloc, 
bool try_tcache_dalloc, + size_t extra, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena); +void *iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, + size_t alignment, bool zero, tcache_t *tcache, arena_t *arena); void *iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, bool zero); bool ixalloc(void *ptr, size_t oldsize, size_t size, size_t extra, @@ -853,7 +869,7 @@ isalloc(const void *ptr, bool demote) } JEMALLOC_ALWAYS_INLINE void * -iallocztm(tsd_t *tsd, size_t size, bool zero, bool try_tcache, bool is_metadata, +iallocztm(tsd_t *tsd, size_t size, bool zero, tcache_t *tcache, bool is_metadata, arena_t *arena) { void *ret; @@ -861,9 +877,9 @@ iallocztm(tsd_t *tsd, size_t size, bool zero, bool try_tcache, bool is_metadata, assert(size != 0); if (likely(size <= arena_maxclass)) - ret = arena_malloc(tsd, arena, size, zero, try_tcache); + ret = arena_malloc(tsd, arena, size, zero, tcache); else - ret = huge_malloc(tsd, arena, size, zero, try_tcache); + ret = huge_malloc(tsd, arena, size, zero, tcache); if (config_stats && is_metadata && likely(ret != NULL)) { arena_metadata_allocated_add(iaalloc(ret), isalloc(ret, config_prof)); @@ -872,36 +888,36 @@ iallocztm(tsd_t *tsd, size_t size, bool zero, bool try_tcache, bool is_metadata, } JEMALLOC_ALWAYS_INLINE void * -imalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) +imalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena) { - return (iallocztm(tsd, size, false, try_tcache, false, arena)); + return (iallocztm(tsd, size, false, tcache, false, arena)); } JEMALLOC_ALWAYS_INLINE void * imalloc(tsd_t *tsd, size_t size) { - return (iallocztm(tsd, size, false, true, false, NULL)); + return (iallocztm(tsd, size, false, tcache_get(tsd, true), false, NULL)); } JEMALLOC_ALWAYS_INLINE void * -icalloct(tsd_t *tsd, size_t size, bool try_tcache, arena_t *arena) +icalloct(tsd_t *tsd, size_t size, tcache_t *tcache, arena_t *arena) { - return (iallocztm(tsd, size, true, try_tcache, false, arena)); + return (iallocztm(tsd, size, true, tcache, false, arena)); } JEMALLOC_ALWAYS_INLINE void * icalloc(tsd_t *tsd, size_t size) { - return (iallocztm(tsd, size, true, true, false, NULL)); + return (iallocztm(tsd, size, true, tcache_get(tsd, true), false, NULL)); } JEMALLOC_ALWAYS_INLINE void * ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, - bool try_tcache, bool is_metadata, arena_t *arena) + tcache_t *tcache, bool is_metadata, arena_t *arena) { void *ret; @@ -909,7 +925,7 @@ ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, assert(usize == sa2u(usize, alignment)); if (usize <= SMALL_MAXCLASS && alignment < PAGE) - ret = arena_malloc(tsd, arena, usize, zero, try_tcache); + ret = arena_malloc(tsd, arena, usize, zero, tcache); else { if (likely(usize <= arena_maxclass)) { arena = arena_choose(tsd, arena); @@ -917,10 +933,10 @@ ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, return (NULL); ret = arena_palloc(arena, usize, alignment, zero); } else if (likely(alignment <= chunksize)) - ret = huge_malloc(tsd, arena, usize, zero, try_tcache); + ret = huge_malloc(tsd, arena, usize, zero, tcache); else { ret = huge_palloc(tsd, arena, usize, alignment, zero, - try_tcache); + tcache); } } assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); @@ -932,19 +948,19 @@ ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, } JEMALLOC_ALWAYS_INLINE void * -ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t 
*arena) +ipalloct(tsd_t *tsd, size_t usize, size_t alignment, bool zero, + tcache_t *tcache, arena_t *arena) { - return (ipallocztm(tsd, usize, alignment, zero, try_tcache, false, - arena)); + return (ipallocztm(tsd, usize, alignment, zero, tcache, false, arena)); } JEMALLOC_ALWAYS_INLINE void * ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) { - return (ipallocztm(tsd, usize, alignment, zero, true, false, NULL)); + return (ipallocztm(tsd, usize, alignment, zero, tcache_get(tsd, + NULL), false, NULL)); } JEMALLOC_ALWAYS_INLINE size_t @@ -981,7 +997,7 @@ p2rz(const void *ptr) } JEMALLOC_ALWAYS_INLINE void -idalloctm(tsd_t *tsd, void *ptr, bool try_tcache, bool is_metadata) +idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata) { arena_chunk_t *chunk; @@ -993,37 +1009,37 @@ idalloctm(tsd_t *tsd, void *ptr, bool try_tcache, bool is_metadata) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (likely(chunk != ptr)) - arena_dalloc(tsd, chunk, ptr, try_tcache); + arena_dalloc(tsd, chunk, ptr, tcache); else - huge_dalloc(tsd, ptr, try_tcache); + huge_dalloc(tsd, ptr, tcache); } JEMALLOC_ALWAYS_INLINE void -idalloct(tsd_t *tsd, void *ptr, bool try_tcache) +idalloct(tsd_t *tsd, void *ptr, tcache_t *tcache) { - idalloctm(tsd, ptr, try_tcache, false); + idalloctm(tsd, ptr, tcache, false); } JEMALLOC_ALWAYS_INLINE void idalloc(tsd_t *tsd, void *ptr) { - idalloctm(tsd, ptr, true, false); + idalloctm(tsd, ptr, tcache_get(tsd, false), false); } JEMALLOC_ALWAYS_INLINE void -iqalloc(tsd_t *tsd, void *ptr, bool try_tcache) +iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) { if (config_fill && unlikely(opt_quarantine)) quarantine(tsd, ptr); else - idalloctm(tsd, ptr, try_tcache, false); + idalloctm(tsd, ptr, tcache, false); } JEMALLOC_ALWAYS_INLINE void -isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) +isdalloct(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) { arena_chunk_t *chunk; @@ -1031,25 +1047,24 @@ isdalloct(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (likely(chunk != ptr)) - arena_sdalloc(tsd, chunk, ptr, size, try_tcache); + arena_sdalloc(tsd, chunk, ptr, size, tcache); else - huge_dalloc(tsd, ptr, try_tcache); + huge_dalloc(tsd, ptr, tcache); } JEMALLOC_ALWAYS_INLINE void -isqalloc(tsd_t *tsd, void *ptr, size_t size, bool try_tcache) +isqalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) { if (config_fill && unlikely(opt_quarantine)) quarantine(tsd, ptr); else - isdalloct(tsd, ptr, size, try_tcache); + isdalloct(tsd, ptr, size, tcache); } JEMALLOC_ALWAYS_INLINE void * iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, - bool try_tcache_dalloc, arena_t *arena) + size_t extra, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena) { void *p; size_t usize, copysize; @@ -1057,7 +1072,7 @@ iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, usize = sa2u(size + extra, alignment); if (usize == 0) return (NULL); - p = ipalloct(tsd, usize, alignment, zero, try_tcache_alloc, arena); + p = ipalloct(tsd, usize, alignment, zero, tcache, arena); if (p == NULL) { if (extra == 0) return (NULL); @@ -1065,8 +1080,7 @@ iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, usize = sa2u(size, alignment); if (usize == 0) return (NULL); - p = ipalloct(tsd, usize, alignment, zero, try_tcache_alloc, - arena); + p = ipalloct(tsd, usize, alignment, zero, tcache, arena); if (p == NULL) 
return (NULL); } @@ -1076,13 +1090,13 @@ iralloct_realign(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, */ copysize = (size < oldsize) ? size : oldsize; memcpy(p, ptr, copysize); - isqalloc(tsd, ptr, oldsize, try_tcache_dalloc); + isqalloc(tsd, ptr, oldsize, tcache); return (p); } JEMALLOC_ALWAYS_INLINE void * iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, - bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena) + bool zero, tcache_t *tcache, arena_t *arena) { assert(ptr != NULL); @@ -1095,15 +1109,15 @@ iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, * and copy. */ return (iralloct_realign(tsd, ptr, oldsize, size, 0, alignment, - zero, try_tcache_alloc, try_tcache_dalloc, arena)); + zero, tcache, arena)); } if (likely(size <= arena_maxclass)) { return (arena_ralloc(tsd, arena, ptr, oldsize, size, 0, - alignment, zero, try_tcache_alloc, try_tcache_dalloc)); + alignment, zero, tcache)); } else { return (huge_ralloc(tsd, arena, ptr, oldsize, size, 0, - alignment, zero, try_tcache_alloc, try_tcache_dalloc)); + alignment, zero, tcache)); } } @@ -1112,8 +1126,8 @@ iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, bool zero) { - return (iralloct(tsd, ptr, oldsize, size, alignment, zero, true, true, - NULL)); + return (iralloct(tsd, ptr, oldsize, size, alignment, zero, + tcache_get(tsd, true), NULL)); } JEMALLOC_ALWAYS_INLINE bool diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 7a78f58..cf42bea 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -425,6 +425,11 @@ tcache_get_hard tcache_maxclass tcache_salloc tcache_stats_merge +tcaches +tcaches_create +tcaches_destroy +tcaches_flush +tcaches_get thread_allocated_cleanup thread_deallocated_cleanup tsd_booted diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 6e97b3d..2a3952b 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -4,6 +4,7 @@ typedef struct tcache_bin_info_s tcache_bin_info_t; typedef struct tcache_bin_s tcache_bin_t; typedef struct tcache_s tcache_t; +typedef struct tcaches_s tcaches_t; /* * tcache pointers close to NULL are used to encode state information that is @@ -70,7 +71,6 @@ struct tcache_bin_s { struct tcache_s { ql_elm(tcache_t) link; /* Used for aggregating stats. */ uint64_t prof_accumbytes;/* Cleared after arena_prof_accum(). */ - arena_t *arena; /* This thread's arena. */ unsigned ev_cnt; /* Event count since incremental GC. */ index_t next_gc_bin; /* Next bin to GC. */ tcache_bin_t tbins[1]; /* Dynamically sized. */ @@ -82,6 +82,14 @@ struct tcache_s { */ }; +/* Linkage for list of available (previously used) explicit tcache IDs. */ +struct tcaches_s { + union { + tcache_t *tcache; + tcaches_t *next; + }; +}; + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS @@ -95,27 +103,41 @@ extern tcache_bin_info_t *tcache_bin_info; * Number of tcache bins. There are NBINS small-object bins, plus 0 or more * large-object bins. */ -extern size_t nhbins; +extern size_t nhbins; /* Maximum cached size class. */ -extern size_t tcache_maxclass; +extern size_t tcache_maxclass; + +/* + * Explicit tcaches, managed via the tcache.{create,flush,destroy} mallctls and + * usable via the MALLOCX_TCACHE() flag. 
The automatic per thread tcaches are + * completely disjoint from this data structure. tcaches starts off as a sparse + * array, so it has no physical memory footprint until individual pages are + * touched. This allows the entire array to be allocated the first time an + * explicit tcache is created without a disproportionate impact on memory usage. + */ +extern tcaches_t *tcaches; size_t tcache_salloc(const void *ptr); -void tcache_event_hard(tcache_t *tcache); -void *tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, - index_t binind); -void tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, - tcache_t *tcache); -void tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, - tcache_t *tcache); +void tcache_event_hard(tsd_t *tsd, tcache_t *tcache); +void *tcache_alloc_small_hard(tsd_t *tsd, tcache_t *tcache, + tcache_bin_t *tbin, index_t binind); +void tcache_bin_flush_small(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, + unsigned rem, tcache_t *tcache); +void tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, + unsigned rem, tcache_t *tcache); void tcache_arena_associate(tcache_t *tcache, arena_t *arena); -void tcache_arena_reassociate(tcache_t *tcache, arena_t *arena); -void tcache_arena_dissociate(tcache_t *tcache); +void tcache_arena_reassociate(tcache_t *tcache, arena_t *oldarena, + arena_t *newarena); +void tcache_arena_dissociate(tcache_t *tcache, arena_t *arena); tcache_t *tcache_get_hard(tsd_t *tsd); tcache_t *tcache_create(tsd_t *tsd, arena_t *arena); void tcache_cleanup(tsd_t *tsd); void tcache_enabled_cleanup(tsd_t *tsd); void tcache_stats_merge(tcache_t *tcache, arena_t *arena); +bool tcaches_create(tsd_t *tsd, unsigned *r_ind); +void tcaches_flush(tsd_t *tsd, unsigned ind); +void tcaches_destroy(tsd_t *tsd, unsigned ind); bool tcache_boot(void); #endif /* JEMALLOC_H_EXTERNS */ @@ -123,16 +145,21 @@ bool tcache_boot(void); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -void tcache_event(tcache_t *tcache); +void tcache_event(tsd_t *tsd, tcache_t *tcache); void tcache_flush(void); bool tcache_enabled_get(void); tcache_t *tcache_get(tsd_t *tsd, bool create); void tcache_enabled_set(bool enabled); void *tcache_alloc_easy(tcache_bin_t *tbin); -void *tcache_alloc_small(tcache_t *tcache, size_t size, bool zero); -void *tcache_alloc_large(tcache_t *tcache, size_t size, bool zero); -void tcache_dalloc_small(tcache_t *tcache, void *ptr, index_t binind); -void tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size); +void *tcache_alloc_small(tsd_t *tsd, tcache_t *tcache, size_t size, + bool zero); +void *tcache_alloc_large(tsd_t *tsd, tcache_t *tcache, size_t size, + bool zero); +void tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, + index_t binind); +void tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, + size_t size); +tcache_t *tcaches_get(tsd_t *tsd, unsigned ind); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TCACHE_C_)) @@ -202,7 +229,7 @@ tcache_get(tsd_t *tsd, bool create) } JEMALLOC_ALWAYS_INLINE void -tcache_event(tcache_t *tcache) +tcache_event(tsd_t *tsd, tcache_t *tcache) { if (TCACHE_GC_INCR == 0) @@ -211,7 +238,7 @@ tcache_event(tcache_t *tcache) tcache->ev_cnt++; assert(tcache->ev_cnt <= TCACHE_GC_INCR); if (unlikely(tcache->ev_cnt == TCACHE_GC_INCR)) - tcache_event_hard(tcache); + tcache_event_hard(tsd, tcache); } JEMALLOC_ALWAYS_INLINE void * @@ -231,7 +258,7 @@ tcache_alloc_easy(tcache_bin_t *tbin) } JEMALLOC_ALWAYS_INLINE void * 
-tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) +tcache_alloc_small(tsd_t *tsd, tcache_t *tcache, size_t size, bool zero) { void *ret; index_t binind; @@ -244,7 +271,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) usize = index2size(binind); ret = tcache_alloc_easy(tbin); if (unlikely(ret == NULL)) { - ret = tcache_alloc_small_hard(tcache, tbin, binind); + ret = tcache_alloc_small_hard(tsd, tcache, tbin, binind); if (ret == NULL) return (NULL); } @@ -270,12 +297,12 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) tbin->tstats.nrequests++; if (config_prof) tcache->prof_accumbytes += usize; - tcache_event(tcache); + tcache_event(tsd, tcache); return (ret); } JEMALLOC_ALWAYS_INLINE void * -tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) +tcache_alloc_large(tsd_t *tsd, tcache_t *tcache, size_t size, bool zero) { void *ret; index_t binind; @@ -293,7 +320,7 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) * Only allocate one large object at a time, because it's quite * expensive to create one and not use it. */ - ret = arena_malloc_large(tcache->arena, usize, zero); + ret = arena_malloc_large(arena_choose(tsd, NULL), usize, zero); if (ret == NULL) return (NULL); } else { @@ -321,12 +348,12 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) tcache->prof_accumbytes += usize; } - tcache_event(tcache); + tcache_event(tsd, tcache); return (ret); } JEMALLOC_ALWAYS_INLINE void -tcache_dalloc_small(tcache_t *tcache, void *ptr, index_t binind) +tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, index_t binind) { tcache_bin_t *tbin; tcache_bin_info_t *tbin_info; @@ -339,18 +366,18 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr, index_t binind) tbin = &tcache->tbins[binind]; tbin_info = &tcache_bin_info[binind]; if (unlikely(tbin->ncached == tbin_info->ncached_max)) { - tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >> - 1), tcache); + tcache_bin_flush_small(tsd, tbin, binind, + (tbin_info->ncached_max >> 1), tcache); } assert(tbin->ncached < tbin_info->ncached_max); tbin->avail[tbin->ncached] = ptr; tbin->ncached++; - tcache_event(tcache); + tcache_event(tsd, tcache); } JEMALLOC_ALWAYS_INLINE void -tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) +tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, size_t size) { index_t binind; tcache_bin_t *tbin; @@ -368,14 +395,23 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) tbin = &tcache->tbins[binind]; tbin_info = &tcache_bin_info[binind]; if (unlikely(tbin->ncached == tbin_info->ncached_max)) { - tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >> - 1), tcache); + tcache_bin_flush_large(tsd, tbin, binind, + (tbin_info->ncached_max >> 1), tcache); } assert(tbin->ncached < tbin_info->ncached_max); tbin->avail[tbin->ncached] = ptr; tbin->ncached++; - tcache_event(tcache); + tcache_event(tsd, tcache); +} + +JEMALLOC_ALWAYS_INLINE tcache_t * +tcaches_get(tsd_t *tsd, unsigned ind) +{ + tcaches_t *elm = &tcaches[ind]; + if (unlikely(elm->tcache == NULL)) + elm->tcache = tcache_create(tsd, arena_choose(tsd, NULL)); + return (elm->tcache); } #endif diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 99f1261..7d1dcf4 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -19,8 +19,16 @@ ((a < (size_t)INT_MAX) ? 
ffs(a)-1 : ffs(a>>32)+31) # endif # define MALLOCX_ZERO ((int)0x40) -/* Bias arena index bits so that 0 encodes "MALLOCX_ARENA() unspecified". */ -# define MALLOCX_ARENA(a) ((int)(((a)+1) << 8)) +/* + * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 + * encodes MALLOCX_TCACHE_NONE. + */ +# define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) +# define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) +/* + * Bias arena index bits so that 0 encodes "use an automatically chosen arena". + */ +# define MALLOCX_ARENA(a) ((int)(((a)+1) << 20)) #ifdef JEMALLOC_HAVE_ATTR # define JEMALLOC_ATTR(s) __attribute__((s)) diff --git a/src/arena.c b/src/arena.c index a5033bf..907fbd7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2182,8 +2182,7 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, void * arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, - bool try_tcache_dalloc) + size_t extra, size_t alignment, bool zero, tcache_t *tcache) { void *ret; size_t copysize; @@ -2201,12 +2200,9 @@ arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t usize = sa2u(size + extra, alignment); if (usize == 0) return (NULL); - ret = ipalloct(tsd, usize, alignment, zero, try_tcache_alloc, - arena); - } else { - ret = arena_malloc(tsd, arena, size + extra, zero, - try_tcache_alloc); - } + ret = ipalloct(tsd, usize, alignment, zero, tcache, arena); + } else + ret = arena_malloc(tsd, arena, size + extra, zero, tcache); if (ret == NULL) { if (extra == 0) @@ -2216,12 +2212,10 @@ arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t usize = sa2u(size, alignment); if (usize == 0) return (NULL); - ret = ipalloct(tsd, usize, alignment, zero, - try_tcache_alloc, arena); - } else { - ret = arena_malloc(tsd, arena, size, zero, - try_tcache_alloc); - } + ret = ipalloct(tsd, usize, alignment, zero, tcache, + arena); + } else + ret = arena_malloc(tsd, arena, size, zero, tcache); if (ret == NULL) return (NULL); @@ -2236,7 +2230,7 @@ arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, copysize = (size < oldsize) ? 
size : oldsize; JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize); memcpy(ret, ptr, copysize); - isqalloc(tsd, ptr, oldsize, try_tcache_dalloc); + isqalloc(tsd, ptr, oldsize, tcache); return (ret); } diff --git a/src/ckh.c b/src/ckh.c index db2ae39..ad075d6 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -270,7 +270,8 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) ret = true; goto label_return; } - tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true); + tab = (ckhc_t *)ipalloct(tsd, usize, CACHELINE, true, NULL, + NULL); if (tab == NULL) { ret = true; goto label_return; @@ -313,7 +314,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); if (usize == 0) return; - tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true); + tab = (ckhc_t *)ipalloct(tsd, usize, CACHELINE, true, NULL, NULL); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -389,7 +390,7 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ret = true; goto label_return; } - ckh->tab = (ckhc_t *)ipalloc(tsd, usize, CACHELINE, true); + ckh->tab = (ckhc_t *)ipalloct(tsd, usize, CACHELINE, true, NULL, NULL); if (ckh->tab == NULL) { ret = true; goto label_return; diff --git a/src/ctl.c b/src/ctl.c index 63a689a..a283803 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -110,6 +110,9 @@ CTL_PROTO(opt_prof_gdump) CTL_PROTO(opt_prof_final) CTL_PROTO(opt_prof_leak) CTL_PROTO(opt_prof_accum) +CTL_PROTO(tcache_create) +CTL_PROTO(tcache_flush) +CTL_PROTO(tcache_destroy) CTL_PROTO(arena_i_purge) static void arena_purge(unsigned arena_ind); CTL_PROTO(arena_i_dss) @@ -275,6 +278,12 @@ static const ctl_named_node_t opt_node[] = { {NAME("prof_accum"), CTL(opt_prof_accum)} }; +static const ctl_named_node_t tcache_node[] = { + {NAME("create"), CTL(tcache_create)}, + {NAME("flush"), CTL(tcache_flush)}, + {NAME("destroy"), CTL(tcache_destroy)} +}; + static const ctl_named_node_t chunk_node[] = { {NAME("alloc"), CTL(arena_i_chunk_alloc)}, {NAME("dalloc"), CTL(arena_i_chunk_dalloc)} @@ -474,6 +483,7 @@ static const ctl_named_node_t root_node[] = { {NAME("thread"), CHILD(named, thread)}, {NAME("config"), CHILD(named, config)}, {NAME("opt"), CHILD(named, opt)}, + {NAME("tcache"), CHILD(named, tcache)}, {NAME("arena"), CHILD(indexed, arena)}, {NAME("arenas"), CHILD(named, arenas)}, {NAME("prof"), CHILD(named, prof)}, @@ -1281,19 +1291,21 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, { int ret; tsd_t *tsd; - arena_t *arena; + arena_t *oldarena; unsigned newind, oldind; tsd = tsd_fetch(); - arena = arena_choose(tsd, NULL); - if (arena == NULL) + oldarena = arena_choose(tsd, NULL); + if (oldarena == NULL) return (EAGAIN); malloc_mutex_lock(&ctl_mtx); - newind = oldind = arena->ind; + newind = oldind = oldarena->ind; WRITE(newind, unsigned); READ(oldind, unsigned); if (newind != oldind) { + arena_t *newarena; + if (newind >= ctl_stats.narenas) { /* New arena index is out of range. */ ret = EFAULT; @@ -1301,8 +1313,8 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, } /* Initialize arena if necessary. 
*/ - arena = arena_get(tsd, newind, true, true); - if (arena == NULL) { + newarena = arena_get(tsd, newind, true, true); + if (newarena == NULL) { ret = EAGAIN; goto label_return; } @@ -1310,8 +1322,10 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, arena_migrate(tsd, oldind, newind); if (config_tcache) { tcache_t *tcache = tsd_tcache_get(tsd); - if (tcache != NULL) - tcache_arena_reassociate(tcache, arena); + if (tcache != NULL) { + tcache_arena_reassociate(tcache, oldarena, + newarena); + } } } @@ -1438,6 +1452,89 @@ label_return: /******************************************************************************/ +static int +tcache_create_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + tsd_t *tsd; + unsigned tcache_ind; + + if (!config_tcache) + return (ENOENT); + + tsd = tsd_fetch(); + + malloc_mutex_lock(&ctl_mtx); + READONLY(); + if (tcaches_create(tsd, &tcache_ind)) { + ret = EFAULT; + goto label_return; + } + READ(tcache_ind, unsigned); + + ret = 0; +label_return: + malloc_mutex_unlock(&ctl_mtx); + return (ret); +} + +static int +tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + int ret; + tsd_t *tsd; + unsigned tcache_ind; + + if (!config_tcache) + return (ENOENT); + + tsd = tsd_fetch(); + + WRITEONLY(); + tcache_ind = UINT_MAX; + WRITE(tcache_ind, unsigned); + if (tcache_ind == UINT_MAX) { + ret = EFAULT; + goto label_return; + } + tcaches_flush(tsd, tcache_ind); + + ret = 0; +label_return: + return (ret); +} + +static int +tcache_destroy_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + tsd_t *tsd; + unsigned tcache_ind; + + if (!config_tcache) + return (ENOENT); + + tsd = tsd_fetch(); + + WRITEONLY(); + tcache_ind = UINT_MAX; + WRITE(tcache_ind, unsigned); + if (tcache_ind == UINT_MAX) { + ret = EFAULT; + goto label_return; + } + tcaches_destroy(tsd, tcache_ind); + + ret = 0; +label_return: + return (ret); +} + +/******************************************************************************/ + /* ctl_mutex must be held during execution of this function. */ static void arena_purge(unsigned arena_ind) diff --git a/src/huge.c b/src/huge.c index 84a1ab2..db0ecd5 100644 --- a/src/huge.c +++ b/src/huge.c @@ -13,7 +13,8 @@ static malloc_mutex_t huge_mtx; static extent_tree_t huge; void * -huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, bool try_tcache) +huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, + tcache_t *tcache) { size_t usize; @@ -23,12 +24,12 @@ huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, bool try_tcache) return (NULL); } - return (huge_palloc(tsd, arena, usize, chunksize, zero, try_tcache)); + return (huge_palloc(tsd, arena, usize, chunksize, zero, tcache)); } void * huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, - bool zero, bool try_tcache) + bool zero, tcache_t *tcache) { void *ret; extent_node_t *node; @@ -38,7 +39,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, /* Allocate an extent node with which to track the chunk. 
*/ node = ipallocztm(tsd, CACHELINE_CEILING(sizeof(extent_node_t)), - CACHELINE, false, try_tcache, true, arena); + CACHELINE, false, tcache, true, arena); if (node == NULL) return (NULL); @@ -50,7 +51,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, arena = arena_choose(tsd, arena); if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(arena, usize, alignment, &is_zeroed)) == NULL) { - idalloctm(tsd, node, try_tcache, true); + idalloctm(tsd, node, tcache, true); return (NULL); } @@ -307,8 +308,7 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, void * huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, bool try_tcache_alloc, - bool try_tcache_dalloc) + size_t extra, size_t alignment, bool zero, tcache_t *tcache) { void *ret; size_t copysize; @@ -324,11 +324,9 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, */ if (alignment > chunksize) { ret = huge_palloc(tsd, arena, size + extra, alignment, zero, - try_tcache_alloc); - } else { - ret = huge_malloc(tsd, arena, size + extra, zero, - try_tcache_alloc); - } + tcache); + } else + ret = huge_malloc(tsd, arena, size + extra, zero, tcache); if (ret == NULL) { if (extra == 0) @@ -336,11 +334,9 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, /* Try again, this time without extra. */ if (alignment > chunksize) { ret = huge_palloc(tsd, arena, size, alignment, zero, - try_tcache_alloc); - } else { - ret = huge_malloc(tsd, arena, size, zero, - try_tcache_alloc); - } + tcache); + } else + ret = huge_malloc(tsd, arena, size, zero, tcache); if (ret == NULL) return (NULL); @@ -352,12 +348,12 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, */ copysize = (size < oldsize) ? size : oldsize; memcpy(ret, ptr, copysize); - isqalloc(tsd, ptr, oldsize, try_tcache_dalloc); + isqalloc(tsd, ptr, oldsize, tcache); return (ret); } void -huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache) +huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) { extent_node_t *node; @@ -368,7 +364,7 @@ huge_dalloc(tsd_t *tsd, void *ptr, bool try_tcache) huge_dalloc_junk(node->addr, node->size); arena_chunk_dalloc_huge(node->arena, node->addr, node->size); - idalloctm(tsd, node, try_tcache, true); + idalloctm(tsd, node, tcache, true); } arena_t * diff --git a/src/jemalloc.c b/src/jemalloc.c index d1fa674..9447791 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -367,6 +367,8 @@ arena_init_locked(unsigned ind) /* Expand arenas if necessary. 
*/ assert(ind <= narenas_total); + if (ind > MALLOCX_ARENA_MAX) + return (NULL); if (ind == narenas_total) { unsigned narenas_new = narenas_total + 1; arena_t **arenas_new = @@ -1696,7 +1698,7 @@ irealloc_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t usize) } JEMALLOC_INLINE_C void -ifree(tsd_t *tsd, void *ptr, bool try_tcache) +ifree(tsd_t *tsd, void *ptr, tcache_t *tcache) { size_t usize; UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); @@ -1713,12 +1715,12 @@ ifree(tsd_t *tsd, void *ptr, bool try_tcache) *tsd_thread_deallocatedp_get(tsd) += usize; if (config_valgrind && unlikely(in_valgrind)) rzsize = p2rz(ptr); - iqalloc(tsd, ptr, try_tcache); + iqalloc(tsd, ptr, tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); } JEMALLOC_INLINE_C void -isfree(tsd_t *tsd, void *ptr, size_t usize, bool try_tcache) +isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache) { UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); @@ -1731,7 +1733,7 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, bool try_tcache) *tsd_thread_deallocatedp_get(tsd) += usize; if (config_valgrind && unlikely(in_valgrind)) rzsize = p2rz(ptr); - isqalloc(tsd, ptr, usize, try_tcache); + isqalloc(tsd, ptr, usize, tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); } @@ -1749,7 +1751,7 @@ je_realloc(void *ptr, size_t size) /* realloc(ptr, 0) is equivalent to free(ptr). */ UTRACE(ptr, 0, 0); tsd = tsd_fetch(); - ifree(tsd, ptr, true); + ifree(tsd, ptr, tcache_get(tsd, false)); return (NULL); } size = 1; @@ -1802,8 +1804,10 @@ je_free(void *ptr) { UTRACE(ptr, 0, 0); - if (likely(ptr != NULL)) - ifree(tsd_fetch(), ptr, true); + if (likely(ptr != NULL)) { + tsd_t *tsd = tsd_fetch(); + ifree(tsd, ptr, tcache_get(tsd, false)); + } } /* @@ -1875,7 +1879,7 @@ JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = JEMALLOC_ALWAYS_INLINE_C bool imallocx_flags_decode_hard(tsd_t *tsd, size_t size, int flags, size_t *usize, - size_t *alignment, bool *zero, bool *try_tcache, arena_t **arena) + size_t *alignment, bool *zero, tcache_t **tcache, arena_t **arena) { if ((flags & MALLOCX_LG_ALIGN_MASK) == 0) { @@ -1886,22 +1890,26 @@ imallocx_flags_decode_hard(tsd_t *tsd, size_t size, int flags, size_t *usize, *usize = sa2u(size, *alignment); } *zero = MALLOCX_ZERO_GET(flags); + if ((flags & MALLOCX_TCACHE_MASK) != 0) { + if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) + *tcache = NULL; + else + *tcache = tcaches_get(tsd, MALLOCX_TCACHE_GET(flags)); + } else + *tcache = tcache_get(tsd, true); if ((flags & MALLOCX_ARENA_MASK) != 0) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); - *try_tcache = false; *arena = arena_get(tsd, arena_ind, true, true); if (unlikely(*arena == NULL)) return (true); - } else { - *try_tcache = true; + } else *arena = NULL; - } return (false); } JEMALLOC_ALWAYS_INLINE_C bool imallocx_flags_decode(tsd_t *tsd, size_t size, int flags, size_t *usize, - size_t *alignment, bool *zero, bool *try_tcache, arena_t **arena) + size_t *alignment, bool *zero, tcache_t **tcache, arena_t **arena) { if (likely(flags == 0)) { @@ -1909,55 +1917,53 @@ imallocx_flags_decode(tsd_t *tsd, size_t size, int flags, size_t *usize, assert(usize != 0); *alignment = 0; *zero = false; - *try_tcache = true; + *tcache = tcache_get(tsd, true); *arena = NULL; return (false); } else { return (imallocx_flags_decode_hard(tsd, size, flags, usize, - alignment, zero, try_tcache, arena)); + alignment, zero, tcache, arena)); } } JEMALLOC_ALWAYS_INLINE_C void * imallocx_flags(tsd_t *tsd, size_t usize, size_t alignment, bool zero, - bool 
try_tcache, arena_t *arena) + tcache_t *tcache, arena_t *arena) { - if (alignment != 0) { - return (ipalloct(tsd, usize, alignment, zero, try_tcache, - arena)); - } + if (alignment != 0) + return (ipalloct(tsd, usize, alignment, zero, tcache, arena)); if (zero) - return (icalloct(tsd, usize, try_tcache, arena)); - return (imalloct(tsd, usize, try_tcache, arena)); + return (icalloct(tsd, usize, tcache, arena)); + return (imalloct(tsd, usize, tcache, arena)); } JEMALLOC_ALWAYS_INLINE_C void * imallocx_maybe_flags(tsd_t *tsd, size_t size, int flags, size_t usize, - size_t alignment, bool zero, bool try_tcache, arena_t *arena) + size_t alignment, bool zero, tcache_t *tcache, arena_t *arena) { if (likely(flags == 0)) return (imalloc(tsd, size)); - return (imallocx_flags(tsd, usize, alignment, zero, try_tcache, arena)); + return (imallocx_flags(tsd, usize, alignment, zero, tcache, arena)); } static void * imallocx_prof_sample(tsd_t *tsd, size_t size, int flags, size_t usize, - size_t alignment, bool zero, bool try_tcache, arena_t *arena) + size_t alignment, bool zero, tcache_t *tcache, arena_t *arena) { void *p; if (usize <= SMALL_MAXCLASS) { assert(((alignment == 0) ? s2u(LARGE_MINCLASS) : sa2u(LARGE_MINCLASS, alignment)) == LARGE_MINCLASS); - p = imalloct(tsd, LARGE_MINCLASS, try_tcache, arena); + p = imalloct(tsd, LARGE_MINCLASS, tcache, arena); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else { p = imallocx_maybe_flags(tsd, size, flags, usize, alignment, - zero, try_tcache, arena); + zero, tcache, arena); } return (p); @@ -1969,20 +1975,20 @@ imallocx_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) void *p; size_t alignment; bool zero; - bool try_tcache; + tcache_t *tcache; arena_t *arena; prof_tctx_t *tctx; if (unlikely(imallocx_flags_decode(tsd, size, flags, usize, &alignment, - &zero, &try_tcache, &arena))) + &zero, &tcache, &arena))) return (NULL); tctx = prof_alloc_prep(tsd, *usize, true); if (likely((uintptr_t)tctx == (uintptr_t)1U)) { p = imallocx_maybe_flags(tsd, size, flags, *usize, alignment, - zero, try_tcache, arena); + zero, tcache, arena); } else if ((uintptr_t)tctx > (uintptr_t)1U) { p = imallocx_prof_sample(tsd, size, flags, *usize, alignment, - zero, try_tcache, arena); + zero, tcache, arena); } else p = NULL; if (unlikely(p == NULL)) { @@ -1999,7 +2005,7 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) { size_t alignment; bool zero; - bool try_tcache; + tcache_t *tcache; arena_t *arena; if (likely(flags == 0)) { @@ -2009,10 +2015,9 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) } if (unlikely(imallocx_flags_decode_hard(tsd, size, flags, usize, - &alignment, &zero, &try_tcache, &arena))) + &alignment, &zero, &tcache, &arena))) return (NULL); - return (imallocx_flags(tsd, *usize, alignment, zero, try_tcache, - arena)); + return (imallocx_flags(tsd, *usize, alignment, zero, tcache, arena)); } void * @@ -2053,8 +2058,8 @@ label_oom: static void * irallocx_prof_sample(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, - size_t alignment, size_t usize, bool zero, bool try_tcache_alloc, - bool try_tcache_dalloc, arena_t *arena, prof_tctx_t *tctx) + size_t alignment, size_t usize, bool zero, tcache_t *tcache, arena_t *arena, + prof_tctx_t *tctx) { void *p; @@ -2062,13 +2067,13 @@ irallocx_prof_sample(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, return (NULL); if (usize <= SMALL_MAXCLASS) { p = iralloct(tsd, oldptr, old_usize, LARGE_MINCLASS, alignment, - zero, try_tcache_alloc, 
try_tcache_dalloc, arena); + zero, tcache, arena); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); } else { p = iralloct(tsd, oldptr, old_usize, size, alignment, zero, - try_tcache_alloc, try_tcache_dalloc, arena); + tcache, arena); } return (p); @@ -2076,8 +2081,8 @@ irallocx_prof_sample(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, JEMALLOC_ALWAYS_INLINE_C void * irallocx_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, - size_t alignment, size_t *usize, bool zero, bool try_tcache_alloc, - bool try_tcache_dalloc, arena_t *arena) + size_t alignment, size_t *usize, bool zero, tcache_t *tcache, + arena_t *arena) { void *p; prof_tctx_t *old_tctx, *tctx; @@ -2086,11 +2091,10 @@ irallocx_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, tctx = prof_alloc_prep(tsd, *usize, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { p = irallocx_prof_sample(tsd, oldptr, old_usize, size, - alignment, *usize, zero, try_tcache_alloc, - try_tcache_dalloc, arena, tctx); + alignment, *usize, zero, tcache, arena, tctx); } else { p = iralloct(tsd, oldptr, old_usize, size, alignment, zero, - try_tcache_alloc, try_tcache_dalloc, arena); + tcache, arena); } if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, false); @@ -2123,8 +2127,8 @@ je_rallocx(void *ptr, size_t size, int flags) UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; - bool try_tcache_alloc, try_tcache_dalloc; arena_t *arena; + tcache_t *tcache; assert(ptr != NULL); assert(size != 0); @@ -2134,18 +2138,19 @@ je_rallocx(void *ptr, size_t size, int flags) if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); - arena_chunk_t *chunk; - try_tcache_alloc = false; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = arena_get(tsd, arena_ind, true, true); if (unlikely(arena == NULL)) goto label_oom; - try_tcache_dalloc = (chunk == ptr || chunk->arena != arena); - } else { - try_tcache_alloc = true; - try_tcache_dalloc = true; + } else arena = NULL; - } + + if (unlikely((flags & MALLOCX_TCACHE_MASK) != 0)) { + if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) + tcache = NULL; + else + tcache = tcaches_get(tsd, MALLOCX_TCACHE_GET(flags)); + } else + tcache = tcache_get(tsd, true); old_usize = isalloc(ptr, config_prof); if (config_valgrind && unlikely(in_valgrind)) @@ -2155,12 +2160,12 @@ je_rallocx(void *ptr, size_t size, int flags) usize = (alignment == 0) ? 
s2u(size) : sa2u(size, alignment); assert(usize != 0); p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, - zero, try_tcache_alloc, try_tcache_dalloc, arena); + zero, tcache, arena); if (unlikely(p == NULL)) goto label_oom; } else { p = iralloct(tsd, ptr, old_usize, size, alignment, zero, - try_tcache_alloc, try_tcache_dalloc, arena); + tcache, arena); if (unlikely(p == NULL)) goto label_oom; if (config_stats || (config_valgrind && unlikely(in_valgrind))) @@ -2319,28 +2324,22 @@ void je_dallocx(void *ptr, int flags) { tsd_t *tsd; - bool try_tcache; + tcache_t *tcache; assert(ptr != NULL); assert(malloc_initialized() || IS_INITIALIZER); tsd = tsd_fetch(); - if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { - unsigned arena_ind = MALLOCX_ARENA_GET(flags); - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - arena_t *arena = arena_get(tsd, arena_ind, true, true); - /* - * If arena is NULL, the application passed an arena that has - * never been used before, which is unsupported during - * deallocation. - */ - assert(arena != NULL); - try_tcache = (chunk == ptr || chunk->arena != arena); + if (unlikely((flags & MALLOCX_TCACHE_MASK) != 0)) { + if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) + tcache = NULL; + else + tcache = tcaches_get(tsd, MALLOCX_TCACHE_GET(flags)); } else - try_tcache = true; + tcache = tcache_get(tsd, false); UTRACE(ptr, 0, 0); - ifree(tsd_fetch(), ptr, try_tcache); + ifree(tsd_fetch(), ptr, tcache); } JEMALLOC_ALWAYS_INLINE_C size_t @@ -2360,7 +2359,7 @@ void je_sdallocx(void *ptr, size_t size, int flags) { tsd_t *tsd; - bool try_tcache; + tcache_t *tcache; size_t usize; assert(ptr != NULL); @@ -2369,21 +2368,16 @@ je_sdallocx(void *ptr, size_t size, int flags) assert(usize == isalloc(ptr, config_prof)); tsd = tsd_fetch(); - if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { - unsigned arena_ind = MALLOCX_ARENA_GET(flags); - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - arena_t *arena = arena_get(tsd, arena_ind, true, true); - /* - * If arena is NULL, the application passed an arena that has - * never been used before, which is unsupported during - * deallocation. - */ - try_tcache = (chunk == ptr || chunk->arena != arena); + if (unlikely((flags & MALLOCX_TCACHE_MASK) != 0)) { + if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) + tcache = NULL; + else + tcache = tcaches_get(tsd, MALLOCX_TCACHE_GET(flags)); } else - try_tcache = true; + tcache = tcache_get(tsd, false); UTRACE(ptr, 0, 0); - isfree(tsd, ptr, usize, try_tcache); + isfree(tsd, ptr, usize, tcache); } size_t diff --git a/src/prof.c b/src/prof.c index 04b2591..4f1580b 100644 --- a/src/prof.c +++ b/src/prof.c @@ -540,7 +540,8 @@ prof_gctx_create(tsd_t *tsd, prof_bt_t *bt) * Create a single allocation that has space for vec of length bt->len. */ prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsd, offsetof(prof_gctx_t, - vec) + (bt->len * sizeof(void *)), false, true, true, NULL); + vec) + (bt->len * sizeof(void *)), false, tcache_get(tsd, true), + true, NULL); if (gctx == NULL) return (NULL); gctx->lock = prof_gctx_mutex_choose(); @@ -581,7 +582,7 @@ prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, prof_gctx_t *gctx, prof_leave(tsd, tdata_self); /* Destroy gctx. 
*/ malloc_mutex_unlock(gctx->lock); - idalloctm(tsd, gctx, true, true); + idalloctm(tsd, gctx, tcache_get(tsd, false), true); } else { /* * Compensate for increment in prof_tctx_destroy() or @@ -681,7 +682,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) prof_tdata_destroy(tsd, tdata, false); if (destroy_tctx) - idalloctm(tsd, tctx, true, true); + idalloctm(tsd, tctx, tcache_get(tsd, false), true); } static bool @@ -710,7 +711,7 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { /* OOM. */ prof_leave(tsd, tdata); - idalloctm(tsd, gctx.v, true, true); + idalloctm(tsd, gctx.v, tcache_get(tsd, false), true); return (true); } new_gctx = true; @@ -754,6 +755,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) ret.p->prepared = true; malloc_mutex_unlock(tdata->lock); if (not_found) { + tcache_t *tcache; void *btkey; prof_gctx_t *gctx; bool new_gctx, error; @@ -767,7 +769,8 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) return (NULL); /* Link a prof_tctx_t into gctx for this thread. */ - ret.v = iallocztm(tsd, sizeof(prof_tctx_t), false, true, true, + tcache = tcache_get(tsd, true); + ret.v = iallocztm(tsd, sizeof(prof_tctx_t), false, tcache, true, NULL); if (ret.p == NULL) { if (new_gctx) @@ -786,7 +789,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) if (error) { if (new_gctx) prof_gctx_try_destroy(tsd, tdata, gctx, tdata); - idalloctm(tsd, ret.v, true, true); + idalloctm(tsd, ret.v, tcache, true); return (NULL); } malloc_mutex_lock(gctx->lock); @@ -1166,7 +1169,8 @@ prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs) to_destroy); tctx_tree_remove(&gctx->tctxs, to_destroy); - idalloctm(tsd, to_destroy, true, true); + idalloctm(tsd, to_destroy, + tcache_get(tsd, false), true); } else next = NULL; } while (next != NULL); @@ -1644,12 +1648,14 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, char *thread_name, bool active) { prof_tdata_t *tdata; + tcache_t *tcache; cassert(config_prof); /* Initialize an empty cache for this thread. 
*/ + tcache = tcache_get(tsd, true); tdata = (prof_tdata_t *)iallocztm(tsd, sizeof(prof_tdata_t), false, - true, true, NULL); + tcache, true, NULL); if (tdata == NULL) return (NULL); @@ -1662,7 +1668,7 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { - idalloctm(tsd, tdata, true, true); + idalloctm(tsd, tdata, tcache, true); return (NULL); } @@ -1708,16 +1714,18 @@ static void prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached) { + tcache_t *tcache; assert(prof_tdata_should_destroy(tdata, even_if_attached)); assert(tsd_prof_tdata_get(tsd) != tdata); tdata_tree_remove(&tdatas, tdata); + tcache = tcache_get(tsd, false); if (tdata->thread_name != NULL) - idalloctm(tsd, tdata->thread_name, true, true); + idalloctm(tsd, tdata->thread_name, tcache, true); ckh_delete(tsd, &tdata->bt2tctx); - idalloctm(tsd, tdata, true, true); + idalloctm(tsd, tdata, tcache, true); } static void @@ -1878,7 +1886,7 @@ prof_thread_name_alloc(tsd_t *tsd, const char *thread_name) if (size == 1) return (""); - ret = iallocztm(tsd, size, false, true, true, NULL); + ret = iallocztm(tsd, size, false, tcache_get(tsd, true), true, NULL); if (ret == NULL) return (NULL); memcpy(ret, thread_name, size); @@ -1910,7 +1918,8 @@ prof_thread_name_set(tsd_t *tsd, const char *thread_name) return (EAGAIN); if (tdata->thread_name != NULL) { - idalloctm(tsd, tdata->thread_name, true, true); + idalloctm(tsd, tdata->thread_name, tcache_get(tsd, false), + true); tdata->thread_name = NULL; } if (strlen(s) > 0) diff --git a/src/quarantine.c b/src/quarantine.c index 094b44d..adc7305 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -27,8 +27,8 @@ quarantine_init(tsd_t *tsd, size_t lg_maxobjs) assert(tsd_nominal(tsd)); quarantine = (quarantine_t *)iallocztm(tsd, offsetof(quarantine_t, objs) - + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)), false, true, - true, NULL); + + ((ZU(1) << lg_maxobjs) * sizeof(quarantine_obj_t)), false, + tcache_get(tsd, true), true, NULL); if (quarantine == NULL) return (NULL); quarantine->curbytes = 0; @@ -55,7 +55,7 @@ quarantine_alloc_hook_work(tsd_t *tsd) if (tsd_quarantine_get(tsd) == NULL) tsd_quarantine_set(tsd, quarantine); else - idalloctm(tsd, quarantine, true, true); + idalloctm(tsd, quarantine, tcache_get(tsd, false), true); } static quarantine_t * @@ -87,7 +87,7 @@ quarantine_grow(tsd_t *tsd, quarantine_t *quarantine) memcpy(&ret->objs[ncopy_a], quarantine->objs, ncopy_b * sizeof(quarantine_obj_t)); } - idalloctm(tsd, quarantine, true, true); + idalloctm(tsd, quarantine, tcache_get(tsd, false), true); tsd_quarantine_set(tsd, ret); return (ret); @@ -177,7 +177,7 @@ quarantine_cleanup(tsd_t *tsd) quarantine = tsd_quarantine_get(tsd); if (quarantine != NULL) { quarantine_drain(tsd, quarantine, 0); - idalloctm(tsd, quarantine, true, true); + idalloctm(tsd, quarantine, tcache_get(tsd, false), true); tsd_quarantine_set(tsd, NULL); } } diff --git a/src/tcache.c b/src/tcache.c index d638015..c7d4f78 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -13,6 +13,14 @@ static unsigned stack_nelms; /* Total stack elms per tcache. */ size_t nhbins; size_t tcache_maxclass; +tcaches_t *tcaches; + +/* Index of first element within tcaches that has never been used. */ +static unsigned tcaches_past; + +/* Head of singly linked list tracking available tcaches elements. 
*/ +static tcaches_t *tcaches_avail; + /******************************************************************************/ size_t tcache_salloc(const void *ptr) @@ -22,7 +30,7 @@ size_t tcache_salloc(const void *ptr) } void -tcache_event_hard(tcache_t *tcache) +tcache_event_hard(tsd_t *tsd, tcache_t *tcache) { index_t binind = tcache->next_gc_bin; tcache_bin_t *tbin = &tcache->tbins[binind]; @@ -33,11 +41,11 @@ tcache_event_hard(tcache_t *tcache) * Flush (ceiling) 3/4 of the objects below the low water mark. */ if (binind < NBINS) { - tcache_bin_flush_small(tbin, binind, tbin->ncached - - tbin->low_water + (tbin->low_water >> 2), tcache); + tcache_bin_flush_small(tsd, tbin, binind, tbin->ncached + - tbin->low_water + (tbin->low_water >> 2), tcache); } else { - tcache_bin_flush_large(tbin, binind, tbin->ncached - - tbin->low_water + (tbin->low_water >> 2), tcache); + tcache_bin_flush_large(tsd, tbin, binind, tbin->ncached + - tbin->low_water + (tbin->low_water >> 2), tcache); } /* * Reduce fill count by 2X. Limit lg_fill_div such that the @@ -62,11 +70,12 @@ tcache_event_hard(tcache_t *tcache) } void * -tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, index_t binind) +tcache_alloc_small_hard(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, + index_t binind) { void *ret; - arena_tcache_fill_small(tcache->arena, tbin, binind, + arena_tcache_fill_small(arena_choose(tsd, NULL), tbin, binind, config_prof ? tcache->prof_accumbytes : 0); if (config_prof) tcache->prof_accumbytes = 0; @@ -76,9 +85,10 @@ tcache_alloc_small_hard(tcache_t *tcache, tcache_bin_t *tbin, index_t binind) } void -tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, - tcache_t *tcache) +tcache_bin_flush_small(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, + unsigned rem, tcache_t *tcache) { + arena_t *arena; void *ptr; unsigned i, nflush, ndeferred; bool merged_stats = false; @@ -86,21 +96,23 @@ tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, assert(binind < NBINS); assert(rem <= tbin->ncached); + arena = arena_choose(tsd, NULL); + assert(arena != NULL); for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena bin associated with the first object. */ arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( tbin->avail[0]); - arena_t *arena = chunk->arena; + arena_t *bin_arena = chunk->arena; arena_bin_t *bin = &arena->bins[binind]; - if (config_prof && arena == tcache->arena) { + if (config_prof && bin_arena == arena) { if (arena_prof_accum(arena, tcache->prof_accumbytes)) prof_idump(); tcache->prof_accumbytes = 0; } malloc_mutex_lock(&bin->lock); - if (config_stats && arena == tcache->arena) { + if (config_stats && bin_arena == arena) { assert(!merged_stats); merged_stats = true; bin->stats.nflushes++; @@ -112,12 +124,12 @@ tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, ptr = tbin->avail[i]; assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk->arena == arena) { + if (chunk->arena == bin_arena) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; arena_chunk_map_bits_t *bitselm = arena_bitselm_get(chunk, pageind); - arena_dalloc_bin_junked_locked(arena, chunk, + arena_dalloc_bin_junked_locked(bin_arena, chunk, ptr, bitselm); } else { /* @@ -137,7 +149,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, * The flush loop didn't happen to flush to this thread's * arena, so the stats didn't get merged. Manually do so now. 
*/ - arena_bin_t *bin = &tcache->arena->bins[binind]; + arena_bin_t *bin = &arena->bins[binind]; malloc_mutex_lock(&bin->lock); bin->stats.nflushes++; bin->stats.nrequests += tbin->tstats.nrequests; @@ -153,9 +165,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, index_t binind, unsigned rem, } void -tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, - tcache_t *tcache) +tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, + unsigned rem, tcache_t *tcache) { + arena_t *arena; void *ptr; unsigned i, nflush, ndeferred; bool merged_stats = false; @@ -163,17 +176,19 @@ tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, assert(binind < nhbins); assert(rem <= tbin->ncached); + arena = arena_choose(tsd, NULL); + assert(arena != NULL); for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena associated with the first object. */ arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( tbin->avail[0]); - arena_t *arena = chunk->arena; + arena_t *locked_arena = chunk->arena; UNUSED bool idump; if (config_prof) idump = false; - malloc_mutex_lock(&arena->lock); - if ((config_prof || config_stats) && arena == tcache->arena) { + malloc_mutex_lock(&locked_arena->lock); + if ((config_prof || config_stats) && locked_arena == arena) { if (config_prof) { idump = arena_prof_accum_locked(arena, tcache->prof_accumbytes); @@ -193,9 +208,9 @@ tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, ptr = tbin->avail[i]; assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk->arena == arena) { - arena_dalloc_large_junked_locked(arena, chunk, - ptr); + if (chunk->arena == locked_arena) { + arena_dalloc_large_junked_locked(locked_arena, + chunk, ptr); } else { /* * This object was allocated via a different @@ -207,7 +222,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, ndeferred++; } } - malloc_mutex_unlock(&arena->lock); + malloc_mutex_unlock(&locked_arena->lock); if (config_prof && idump) prof_idump(); } @@ -216,7 +231,6 @@ tcache_bin_flush_large(tcache_bin_t *tbin, index_t binind, unsigned rem, * The flush loop didn't happen to flush to this thread's * arena, so the stats didn't get merged. Manually do so now. */ - arena_t *arena = tcache->arena; malloc_mutex_lock(&arena->lock); arena->stats.nrequests_large += tbin->tstats.nrequests; arena->stats.lstats[binind - NBINS].nrequests += @@ -243,27 +257,37 @@ tcache_arena_associate(tcache_t *tcache, arena_t *arena) ql_tail_insert(&arena->tcache_ql, tcache, link); malloc_mutex_unlock(&arena->lock); } - tcache->arena = arena; } void -tcache_arena_reassociate(tcache_t *tcache, arena_t *arena) +tcache_arena_reassociate(tcache_t *tcache, arena_t *oldarena, arena_t *newarena) { - tcache_arena_dissociate(tcache); - tcache_arena_associate(tcache, arena); + tcache_arena_dissociate(tcache, oldarena); + tcache_arena_associate(tcache, newarena); } void -tcache_arena_dissociate(tcache_t *tcache) +tcache_arena_dissociate(tcache_t *tcache, arena_t *arena) { if (config_stats) { /* Unlink from list of extant tcaches. 
*/ - malloc_mutex_lock(&tcache->arena->lock); - ql_remove(&tcache->arena->tcache_ql, tcache, link); - tcache_stats_merge(tcache, tcache->arena); - malloc_mutex_unlock(&tcache->arena->lock); + malloc_mutex_lock(&arena->lock); + if (config_debug) { + bool in_ql = false; + tcache_t *iter; + ql_foreach(iter, &arena->tcache_ql, link) { + if (iter == tcache) { + in_ql = true; + break; + } + } + assert(in_ql); + } + ql_remove(&arena->tcache_ql, tcache, link); + tcache_stats_merge(tcache, arena); + malloc_mutex_unlock(&arena->lock); } } @@ -298,7 +322,7 @@ tcache_create(tsd_t *tsd, arena_t *arena) /* Avoid false cacheline sharing. */ size = sa2u(size, CACHELINE); - tcache = ipallocztm(tsd, size, CACHELINE, true, false, true, arena); + tcache = ipallocztm(tsd, size, CACHELINE, true, false, true, a0get()); if (tcache == NULL) return (NULL); @@ -318,16 +342,17 @@ tcache_create(tsd_t *tsd, arena_t *arena) static void tcache_destroy(tsd_t *tsd, tcache_t *tcache) { + arena_t *arena; unsigned i; - tcache_arena_dissociate(tcache); + arena = arena_choose(tsd, NULL); + tcache_arena_dissociate(tcache, arena); for (i = 0; i < NBINS; i++) { tcache_bin_t *tbin = &tcache->tbins[i]; - tcache_bin_flush_small(tbin, i, 0, tcache); + tcache_bin_flush_small(tsd, tbin, i, 0, tcache); if (config_stats && tbin->tstats.nrequests != 0) { - arena_t *arena = tcache->arena; arena_bin_t *bin = &arena->bins[i]; malloc_mutex_lock(&bin->lock); bin->stats.nrequests += tbin->tstats.nrequests; @@ -337,10 +362,9 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) for (; i < nhbins; i++) { tcache_bin_t *tbin = &tcache->tbins[i]; - tcache_bin_flush_large(tbin, i, 0, tcache); + tcache_bin_flush_large(tsd, tbin, i, 0, tcache); if (config_stats && tbin->tstats.nrequests != 0) { - arena_t *arena = tcache->arena; malloc_mutex_lock(&arena->lock); arena->stats.nrequests_large += tbin->tstats.nrequests; arena->stats.lstats[i - NBINS].nrequests += @@ -350,7 +374,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) } if (config_prof && tcache->prof_accumbytes > 0 && - arena_prof_accum(tcache->arena, tcache->prof_accumbytes)) + arena_prof_accum(arena, tcache->prof_accumbytes)) prof_idump(); idalloctm(tsd, tcache, false, true); @@ -405,6 +429,66 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) } bool +tcaches_create(tsd_t *tsd, unsigned *r_ind) +{ + tcache_t *tcache; + tcaches_t *elm; + + if (tcaches == NULL) { + tcaches = base_alloc(sizeof(tcache_t *) * + (MALLOCX_TCACHE_MAX+1)); + if (tcaches == NULL) + return (true); + } + + if (tcaches_avail == NULL && tcaches_past > MALLOCX_TCACHE_MAX) + return (true); + tcache = tcache_create(tsd, a0get()); + if (tcache == NULL) + return (true); + + if (tcaches_avail != NULL) { + elm = tcaches_avail; + tcaches_avail = tcaches_avail->next; + elm->tcache = tcache; + *r_ind = (elm - tcaches) / sizeof(tcaches_t); + } else { + elm = &tcaches[tcaches_past]; + elm->tcache = tcache; + *r_ind = tcaches_past; + tcaches_past++; + } + + return (false); +} + +static void +tcaches_elm_flush(tsd_t *tsd, tcaches_t *elm) +{ + + if (elm->tcache == NULL) + return; + tcache_destroy(tsd, elm->tcache); + elm->tcache = NULL; +} + +void +tcaches_flush(tsd_t *tsd, unsigned ind) +{ + + tcaches_elm_flush(tsd, &tcaches[ind]); +} + +void +tcaches_destroy(tsd_t *tsd, unsigned ind) +{ + tcaches_t *elm = &tcaches[ind]; + tcaches_elm_flush(tsd, elm); + elm->next = tcaches_avail; + tcaches_avail = elm; +} + +bool tcache_boot(void) { unsigned i; diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index f4b7d1a..10a6fcd 100644 --- 
a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -211,6 +211,114 @@ TEST_BEGIN(test_manpage_example) } TEST_END +TEST_BEGIN(test_tcache_none) +{ + void *p0, *q, *p1; + + test_skip_if(!config_tcache); + + /* Allocate p and q. */ + p0 = mallocx(42, 0); + assert_ptr_not_null(p0, "Unexpected mallocx() failure"); + q = mallocx(42, 0); + assert_ptr_not_null(q, "Unexpected mallocx() failure"); + + /* Deallocate p and q, but bypass the tcache for q. */ + dallocx(p0, 0); + dallocx(q, MALLOCX_TCACHE_NONE); + + /* Make sure that tcache-based allocation returns p, not q. */ + p1 = mallocx(42, 0); + assert_ptr_not_null(p1, "Unexpected mallocx() failure"); + assert_ptr_eq(p0, p1, "Expected tcache to allocate cached region"); + + /* Clean up. */ + dallocx(p1, MALLOCX_TCACHE_NONE); +} +TEST_END + +TEST_BEGIN(test_tcache) +{ +#define NTCACHES 10 + unsigned tis[NTCACHES]; + void *ps[NTCACHES]; + void *qs[NTCACHES]; + unsigned i; + size_t sz, psz, qsz; + + test_skip_if(!config_tcache); + + psz = 42; + qsz = nallocx(psz, 0) + 1; + + /* Create tcaches. */ + for (i = 0; i < NTCACHES; i++) { + sz = sizeof(unsigned); + assert_d_eq(mallctl("tcache.create", &tis[i], &sz, NULL, 0), 0, + "Unexpected mallctl() failure, i=%u", i); + } + + /* Flush empty tcaches. */ + for (i = 0; i < NTCACHES; i++) { + assert_d_eq(mallctl("tcache.flush", NULL, NULL, &tis[i], + sizeof(unsigned)), 0, "Unexpected mallctl() failure, i=%u", + i); + } + + /* Cache some allocations. */ + for (i = 0; i < NTCACHES; i++) { + ps[i] = mallocx(psz, MALLOCX_TCACHE(tis[i])); + assert_ptr_not_null(ps[i], "Unexpected mallocx() failure, i=%u", + i); + dallocx(ps[i], MALLOCX_TCACHE(tis[i])); + + qs[i] = mallocx(qsz, MALLOCX_TCACHE(tis[i])); + assert_ptr_not_null(qs[i], "Unexpected mallocx() failure, i=%u", + i); + dallocx(qs[i], MALLOCX_TCACHE(tis[i])); + } + + /* Verify that tcaches allocate cached regions. */ + for (i = 0; i < NTCACHES; i++) { + void *p0 = ps[i]; + ps[i] = mallocx(psz, MALLOCX_TCACHE(tis[i])); + assert_ptr_not_null(ps[i], "Unexpected mallocx() failure, i=%u", + i); + assert_ptr_eq(ps[i], p0, + "Expected mallocx() to allocate cached region, i=%u", i); + } + + /* Verify that reallocation uses cached regions. */ + for (i = 0; i < NTCACHES; i++) { + void *q0 = qs[i]; + qs[i] = rallocx(ps[i], qsz, MALLOCX_TCACHE(tis[i])); + assert_ptr_not_null(qs[i], "Unexpected rallocx() failure, i=%u", + i); + assert_ptr_eq(qs[i], q0, + "Expected rallocx() to allocate cached region, i=%u", i); + /* Avoid undefined behavior in case of test failure. */ + if (qs[i] == NULL) + qs[i] = ps[i]; + } + for (i = 0; i < NTCACHES; i++) + dallocx(qs[i], MALLOCX_TCACHE(tis[i])); + + /* Flush some non-empty tcaches. */ + for (i = 0; i < NTCACHES/2; i++) { + assert_d_eq(mallctl("tcache.flush", NULL, NULL, &tis[i], + sizeof(unsigned)), 0, "Unexpected mallctl() failure, i=%u", + i); + } + + /* Destroy tcaches. */ + for (i = 0; i < NTCACHES; i++) { + assert_d_eq(mallctl("tcache.destroy", NULL, NULL, &tis[i], + sizeof(unsigned)), 0, "Unexpected mallctl() failure, i=%u", + i); + } +} +TEST_END + TEST_BEGIN(test_thread_arena) { unsigned arena_old, arena_new, narenas; @@ -431,6 +539,8 @@ main(void) test_mallctl_config, test_mallctl_opt, test_manpage_example, + test_tcache_none, + test_tcache, test_thread_arena, test_arena_i_purge, test_arena_i_dss, -- cgit v0.12 From 9e561e8d3f3c625b98b57df069eeac0fa2f522fb Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 10 Feb 2015 09:03:48 -0800 Subject: Test and fix tcache ID recycling. 
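The underlying issue here is plain C pointer arithmetic: subtracting two tcaches_t pointers already yields an element index, so the extra division by sizeof(tcaches_t) in tcaches_create() (removed in the diff below) collapsed most recycled IDs to 0. A minimal standalone sketch of the distinction (illustrative only, not jemalloc code):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
	void	*tcache;
} tcaches_t;

int
main(void)
{
	tcaches_t tcaches[8];
	tcaches_t *elm = &tcaches[3];

	/* Pointer subtraction is already scaled by the element size. */
	ptrdiff_t ind = elm - tcaches;
	/* Dividing again, as the pre-fix code did, loses the index. */
	ptrdiff_t bad = (elm - tcaches) / (ptrdiff_t)sizeof(tcaches_t);

	assert(ind == 3);
	printf("ind=%td bad=%td\n", ind, bad);
	return (0);
}
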
--- src/tcache.c | 2 +- test/unit/mallctl.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/tcache.c b/src/tcache.c index c7d4f78..9fe78c3 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -451,7 +451,7 @@ tcaches_create(tsd_t *tsd, unsigned *r_ind) elm = tcaches_avail; tcaches_avail = tcaches_avail->next; elm->tcache = tcache; - *r_ind = (elm - tcaches) / sizeof(tcaches_t); + *r_ind = elm - tcaches; } else { elm = &tcaches[tcaches_past]; elm->tcache = tcache; diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 10a6fcd..5960496 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -258,6 +258,18 @@ TEST_BEGIN(test_tcache) "Unexpected mallctl() failure, i=%u", i); } + /* Exercise tcache ID recycling. */ + for (i = 0; i < NTCACHES; i++) { + assert_d_eq(mallctl("tcache.destroy", NULL, NULL, &tis[i], + sizeof(unsigned)), 0, "Unexpected mallctl() failure, i=%u", + i); + } + for (i = 0; i < NTCACHES; i++) { + sz = sizeof(unsigned); + assert_d_eq(mallctl("tcache.create", &tis[i], &sz, NULL, 0), 0, + "Unexpected mallctl() failure, i=%u", i); + } + /* Flush empty tcaches. */ for (i = 0; i < NTCACHES; i++) { assert_d_eq(mallctl("tcache.flush", NULL, NULL, &tis[i], -- cgit v0.12 From 051eae8cc591dfa2955cbfa73aae79ab53620c08 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 10 Feb 2015 16:05:52 -0800 Subject: Remove unnecessary xchg* lock prefixes. --- include/jemalloc/internal/atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index f8bd62e..af2c687 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -119,7 +119,7 @@ atomic_write_uint64(uint64_t *p, uint64_t x) { asm volatile ( - "lock; xchgq %1, %0;" + "xchgq %1, %0;" /* Lock is implied by xchgq. */ : "=m" (*p), "+r" (x) /* Outputs. */ : "m" (*p) /* Inputs. */ : "memory" /* Clobbers. */ @@ -343,7 +343,7 @@ atomic_write_uint32(uint32_t *p, uint32_t x) { asm volatile ( - "lock; xchgl %1, %0;" + "xchgl %1, %0;" /* Lock is implied by xchgl. */ : "=m" (*p), "+r" (x) /* Outputs. */ : "m" (*p) /* Inputs. */ : "memory" /* Clobbers. */ -- cgit v0.12 From 064dbfbaf76617643bbbe66cbcc880e7ee9ec00f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 12 Feb 2015 00:09:37 -0800 Subject: Fix a regression in tcache_bin_flush_small(). Fix a serious regression in tcache_bin_flush_small() that was introduced by 1cb181ed632e7573fb4eab194e4d216867222d27 (Implement explicit tcache support.). --- src/tcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tcache.c b/src/tcache.c index 9fe78c3..1166d60 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -103,7 +103,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( tbin->avail[0]); arena_t *bin_arena = chunk->arena; - arena_bin_t *bin = &arena->bins[binind]; + arena_bin_t *bin = &bin_arena->bins[binind]; if (config_prof && bin_arena == arena) { if (arena_prof_accum(arena, tcache->prof_accumbytes)) -- cgit v0.12 From f30e261c5b85d2900224f91c6d426a23dce94fe9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 12 Feb 2015 00:12:44 -0800 Subject: Update ckh to support metadata allocation tracking. 
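The ckh changes below route the cuckoo hash tables through the allocation paths that take an is_metadata flag, so the tables are charged to the allocator's internal metadata accounting rather than to application allocations. A rough, self-contained sketch of that accounting idea, using generic (hypothetical) names rather than jemalloc's API:

#include <stdio.h>
#include <stdlib.h>

/* Running total of bytes attributed to allocator-internal metadata. */
static size_t	metadata_bytes;

/* Allocate, optionally charging the bytes to internal metadata. */
static void *
meta_alloc(size_t size, int is_metadata)
{
	void *p = malloc(size);

	if (p != NULL && is_metadata)
		metadata_bytes += size;
	return (p);
}

/* Deallocate, reversing the metadata charge when applicable. */
static void
meta_dalloc(void *p, size_t size, int is_metadata)
{
	if (p != NULL && is_metadata)
		metadata_bytes -= size;
	free(p);
}

int
main(void)
{
	void *tab = meta_alloc(1024, 1);	/* e.g. a hash table */

	printf("metadata bytes: %zu\n", metadata_bytes);
	meta_dalloc(tab, 1024, 1);
	printf("metadata bytes: %zu\n", metadata_bytes);
	return (0);
}
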
--- src/ckh.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/ckh.c b/src/ckh.c index ad075d6..da78d1b 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -270,8 +270,8 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) ret = true; goto label_return; } - tab = (ckhc_t *)ipalloct(tsd, usize, CACHELINE, true, NULL, - NULL); + tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, + true, NULL); if (tab == NULL) { ret = true; goto label_return; @@ -283,12 +283,12 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (!ckh_rebuild(ckh, tab)) { - idalloc(tsd, tab); + idalloctm(tsd, tab, tcache_get(tsd, false), true); break; } /* Rebuilding failed, so back out partially rebuilt table. */ - idalloc(tsd, ckh->tab); + idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; } @@ -314,7 +314,8 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); if (usize == 0) return; - tab = (ckhc_t *)ipalloct(tsd, usize, CACHELINE, true, NULL, NULL); + tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, true, + NULL); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -329,7 +330,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (!ckh_rebuild(ckh, tab)) { - idalloc(tsd, tab); + idalloctm(tsd, tab, tcache_get(tsd, false), true); #ifdef CKH_COUNT ckh->nshrinks++; #endif @@ -337,7 +338,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) } /* Rebuilding failed, so back out partially rebuilt table. */ - idalloc(tsd, ckh->tab); + idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; #ifdef CKH_COUNT @@ -390,7 +391,8 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ret = true; goto label_return; } - ckh->tab = (ckhc_t *)ipalloct(tsd, usize, CACHELINE, true, NULL, NULL); + ckh->tab = (ckhc_t *)ipallocztm(tsd, usize, CACHELINE, true, NULL, true, + NULL); if (ckh->tab == NULL) { ret = true; goto label_return; @@ -419,7 +421,7 @@ ckh_delete(tsd_t *tsd, ckh_t *ckh) (unsigned long long)ckh->nrelocs); #endif - idalloc(tsd, ckh->tab); + idalloctm(tsd, ckh->tab, tcache_get(tsd, false), true); if (config_debug) memset(ckh, 0x5a, sizeof(ckh_t)); } -- cgit v0.12 From cbf3a6d70371d2390b8b0e76814e04cc6088002c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 11 Feb 2015 12:24:27 -0800 Subject: Move centralized chunk management into arenas. Migrate all centralized data structures related to huge allocations and recyclable chunks into arena_t, so that each arena can manage huge allocations and recyclable virtual memory completely independently of other arenas. Add chunk node caching to arenas, in order to avoid contention on the base allocator. Use chunks_rtree to look up huge allocations rather than a red-black tree. Maintain a per arena unsorted list of huge allocations (which will be needed to enumerate huge allocations during arena reset). Remove the --enable-ivsalloc option, make ivsalloc() always available, and use it for size queries if --enable-debug is enabled. The only practical implications to this removal are that 1) ivsalloc() is now always available during live debugging (and the underlying radix tree is available during core-based debugging), and 2) size query validation can no longer be enabled independent of --enable-debug. 
Remove the stats.chunks.{current,total,high} mallctls, and replace their underlying statistics with simpler atomically updated counters used exclusively for gdump triggering. These statistics are no longer very useful because each arena manages chunks independently, and per arena statistics provide similar information. Simplify chunk synchronization code, now that base chunk allocation cannot cause recursive lock acquisition. --- INSTALL | 6 - configure.ac | 22 +- doc/jemalloc.xml.in | 35 +-- include/jemalloc/internal/arena.h | 60 ++++- include/jemalloc/internal/atomic.h | 4 +- include/jemalloc/internal/base.h | 2 - include/jemalloc/internal/chunk.h | 22 +- include/jemalloc/internal/chunk_dss.h | 4 +- include/jemalloc/internal/ctl.h | 5 - include/jemalloc/internal/extent.h | 25 +- include/jemalloc/internal/huge.h | 4 - include/jemalloc/internal/jemalloc_internal.h.in | 28 +-- .../jemalloc/internal/jemalloc_internal_defs.h.in | 6 - include/jemalloc/internal/private_symbols.txt | 12 +- include/jemalloc/internal/rtree.h | 23 +- include/jemalloc/internal/stats.h | 15 -- src/arena.c | 74 +++++- src/base.c | 65 ++--- src/chunk.c | 275 ++++++++------------- src/chunk_dss.c | 5 +- src/ctl.c | 26 +- src/huge.c | 169 ++++++------- src/jemalloc.c | 15 +- src/stats.c | 12 - src/tcache.c | 8 +- test/unit/stats.c | 27 -- 26 files changed, 394 insertions(+), 555 deletions(-) diff --git a/INSTALL b/INSTALL index b8459a8..517fe02 100644 --- a/INSTALL +++ b/INSTALL @@ -92,7 +92,6 @@ any of the following arguments (not a definitive list) to 'configure': --enable-debug Enable assertions and validation code. This incurs a substantial performance hit, but is very useful during application development. - Implies --enable-ivsalloc. --enable-code-coverage Enable code coverage support, for use during jemalloc test development. @@ -107,11 +106,6 @@ any of the following arguments (not a definitive list) to 'configure': there are interactions between the various coverage targets, so it is usually advisable to run 'make clean' between repeated code coverage runs. ---enable-ivsalloc - Enable validation code, which verifies that pointers reside within - jemalloc-owned chunks before dereferencing them. This incurs a substantial - performance hit. - --disable-stats Disable statistics gathering functionality. See the "opt.stats_print" option documentation for usage details. diff --git a/configure.ac b/configure.ac index dc8aa02..2922880 100644 --- a/configure.ac +++ b/configure.ac @@ -625,7 +625,7 @@ fi dnl Do not compile with debugging by default. AC_ARG_ENABLE([debug], - [AS_HELP_STRING([--enable-debug], [Build debugging code (implies --enable-ivsalloc)])], + [AS_HELP_STRING([--enable-debug], [Build debugging code])], [if test "x$enable_debug" = "xno" ; then enable_debug="0" else @@ -634,27 +634,8 @@ fi ], [enable_debug="0"] ) -if test "x$enable_debug" = "x1" ; then - AC_DEFINE([JEMALLOC_DEBUG], [ ]) - enable_ivsalloc="1" -fi AC_SUBST([enable_debug]) -dnl Do not validate pointers by default. -AC_ARG_ENABLE([ivsalloc], - [AS_HELP_STRING([--enable-ivsalloc], [Validate pointers passed through the public API])], -[if test "x$enable_ivsalloc" = "xno" ; then - enable_ivsalloc="0" -else - enable_ivsalloc="1" -fi -], -[enable_ivsalloc="0"] -) -if test "x$enable_ivsalloc" = "x1" ; then - AC_DEFINE([JEMALLOC_IVSALLOC], [ ]) -fi - dnl Only optimize if not debugging. if test "x$enable_debug" = "x0" -a "x$no_CFLAGS" = "xyes" ; then dnl Make sure that an optimization flag was not specified in EXTRA_CFLAGS. 
@@ -1401,7 +1382,6 @@ if test "x${enable_zone_allocator}" = "x1" ; then if test "x${abi}" != "xmacho"; then AC_MSG_ERROR([--enable-zone-allocator is only supported on Darwin]) fi - AC_DEFINE([JEMALLOC_IVSALLOC], [ ]) AC_DEFINE([JEMALLOC_ZONE], [ ]) dnl The szone version jumped from 3 to 6 between the OS X 10.5.x and 10.6 diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index da800de..b392fa9 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1847,7 +1847,7 @@ malloc_conf = "xmalloc:true";]]> equal to stats.allocated. This does not include - stats.arenas.<i>.pdirty and pages + stats.arenas.<i>.pdirty, nor pages entirely devoted to allocator metadata. @@ -1880,39 +1880,6 @@ malloc_conf = "xmalloc:true";]]> does not include inactive chunks. - - - stats.chunks.current - (size_t) - r- - [] - - Total number of chunks actively mapped on behalf of the - application. This does not include inactive chunks. - - - - - - stats.chunks.total - (uint64_t) - r- - [] - - Cumulative number of chunks allocated. - - - - - stats.chunks.high - (size_t) - r- - [] - - Maximum number of active chunks at any time thus far. - - - stats.arenas.<i>.dss diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 5476899..2ae4609 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -151,8 +151,12 @@ typedef ql_head(arena_chunk_map_misc_t) arena_chunk_miscelms_t; /* Arena chunk header. */ struct arena_chunk_s { - /* Arena that owns the chunk. */ - arena_t *arena; + /* + * The arena that owns the chunk is node.arena. This field as a whole + * is used by chunks_rtree to support both ivsalloc() and core-based + * debugging. + */ + extent_node_t node; /* * Map of pages within chunk that keeps track of free/large/small. The @@ -313,6 +317,27 @@ struct arena_s { /* List of dirty runs this arena manages. */ arena_chunk_miscelms_t runs_dirty; + /* Extant huge allocations. */ + ql_head(extent_node_t) huge; + /* Synchronizes all huge allocation/update/deallocation. */ + malloc_mutex_t huge_mtx; + + /* + * Trees of chunks that were previously allocated (trees differ only in + * node ordering). These are used when allocating chunks, in an attempt + * to re-use address space. Depending on function, different tree + * orderings are needed, which is why there are two trees with the same + * contents. + */ + extent_tree_t chunks_szad_mmap; + extent_tree_t chunks_ad_mmap; + extent_tree_t chunks_szad_dss; + extent_tree_t chunks_ad_dss; + malloc_mutex_t chunks_mtx; + /* Cache of nodes that were allocated via base_alloc(). */ + ql_head(extent_node_t) node_cache; + malloc_mutex_t node_cache_mtx; + /* * User-configurable chunk allocation and deallocation functions. */ @@ -338,6 +363,8 @@ extern size_t arena_maxclass; /* Max size class for arenas. */ extern unsigned nlclasses; /* Number of large size classes. */ extern unsigned nhclasses; /* Number of huge size classes. 
*/ +extent_node_t *arena_node_alloc(arena_t *arena); +void arena_node_dalloc(arena_t *arena, extent_node_t *node); void *arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, bool *zero); void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize); @@ -453,8 +480,7 @@ void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, tcache_t *tcache); arena_t *arena_aalloc(const void *ptr); size_t arena_salloc(const void *ptr, bool demote); -void arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, - tcache_t *tcache); +void arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, tcache_t *tcache); #endif @@ -792,7 +818,7 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) assert(binind != BININD_INVALID); assert(binind < NBINS); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - arena = chunk->arena; + arena = chunk->node.arena; pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; actual_mapbits = arena_mapbits_get(chunk, pageind); assert(mapbits == actual_mapbits); @@ -980,7 +1006,7 @@ arena_aalloc(const void *ptr) arena_chunk_t *chunk; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - return (chunk->arena); + return (chunk->node.arena); } /* Return the size of the allocation pointed to by ptr. */ @@ -1024,11 +1050,18 @@ arena_salloc(const void *ptr, bool demote) } JEMALLOC_ALWAYS_INLINE void -arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, tcache_t *tcache) +arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) { + arena_chunk_t *chunk; size_t pageind, mapbits; assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (unlikely(chunk == ptr)) { + huge_dalloc(tsd, ptr, tcache); + return; + } assert(CHUNK_ADDR2BASE(ptr) != ptr); pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; @@ -1040,8 +1073,10 @@ arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, tcache_t *tcache) index_t binind = arena_ptr_small_binind_get(ptr, mapbits); tcache_dalloc_small(tsd, tcache, ptr, binind); - } else - arena_dalloc_small(chunk->arena, chunk, ptr, pageind); + } else { + arena_dalloc_small(chunk->node.arena, chunk, ptr, + pageind); + } } else { size_t size = arena_mapbits_large_size_get(chunk, pageind); @@ -1050,7 +1085,7 @@ arena_dalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, tcache_t *tcache) if (likely(tcache != NULL) && size <= tcache_maxclass) tcache_dalloc_large(tsd, tcache, ptr, size); else - arena_dalloc_large(chunk->arena, chunk, ptr); + arena_dalloc_large(chunk->node.arena, chunk, ptr); } } @@ -1081,7 +1116,8 @@ arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, } else { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - arena_dalloc_small(chunk->arena, chunk, ptr, pageind); + arena_dalloc_small(chunk->node.arena, chunk, ptr, + pageind); } } else { assert(((uintptr_t)ptr & PAGE_MASK) == 0); @@ -1089,7 +1125,7 @@ arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, if (likely(tcache != NULL) && size <= tcache_maxclass) tcache_dalloc_large(tsd, tcache, ptr, size); else - arena_dalloc_large(chunk->arena, chunk, ptr); + arena_dalloc_large(chunk->node.arena, chunk, ptr); } } # endif /* JEMALLOC_ARENA_INLINE_B */ diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index af2c687..0d33065 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -52,7 +52,7 @@ void atomic_write_uint32(uint32_t *p, uint32_t x); 
void *atomic_add_p(void **p, void *x); void *atomic_sub_p(void **p, void *x); bool atomic_cas_p(void **p, void *c, void *s); -void atomic_write_p(void **p, void *x); +void atomic_write_p(void **p, const void *x); size_t atomic_add_z(size_t *p, size_t x); size_t atomic_sub_z(size_t *p, size_t x); bool atomic_cas_z(size_t *p, size_t c, size_t s); @@ -538,7 +538,7 @@ atomic_cas_p(void **p, void *c, void *s) } JEMALLOC_INLINE void -atomic_write_p(void **p, void *x) +atomic_write_p(void **p, const void *x) { #if (LG_SIZEOF_PTR == 3) diff --git a/include/jemalloc/internal/base.h b/include/jemalloc/internal/base.h index a0798ee..bec76b3 100644 --- a/include/jemalloc/internal/base.h +++ b/include/jemalloc/internal/base.h @@ -10,8 +10,6 @@ #ifdef JEMALLOC_H_EXTERNS void *base_alloc(size_t size); -extent_node_t *base_node_alloc(void); -void base_node_dalloc(extent_node_t *node); size_t base_allocated_get(void); bool base_boot(void); void base_prefork(void); diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 62ac3e7..5e0fb14 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -30,24 +30,21 @@ extern size_t opt_lg_chunk; extern const char *opt_dss; -/* Protects stats_chunks; currently not used for any other purpose. */ -extern malloc_mutex_t chunks_mtx; -/* Chunk statistics. */ -extern chunk_stats_t stats_chunks; - extern rtree_t chunks_rtree; extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ extern size_t chunk_npages; +bool chunk_register(const void *chunk, const extent_node_t *node); +void chunk_deregister(const void *chunk, const extent_node_t *node); void *chunk_alloc_base(size_t size); void *chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc, unsigned arena_ind, void *new_addr, size_t size, size_t alignment, bool *zero); void *chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind); -void chunk_unmap(void *chunk, size_t size); +void chunk_unmap(arena_t *arena, void *chunk, size_t size); bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind); bool chunk_boot(void); void chunk_prefork(void); @@ -58,6 +55,19 @@ void chunk_postfork_child(void); /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +extent_node_t *chunk_lookup(const void *chunk); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_CHUNK_C_)) +JEMALLOC_INLINE extent_node_t * +chunk_lookup(const void *chunk) +{ + + return (rtree_get(&chunks_rtree, (uintptr_t)chunk)); +} +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ diff --git a/include/jemalloc/internal/chunk_dss.h b/include/jemalloc/internal/chunk_dss.h index 0989647..87366a2 100644 --- a/include/jemalloc/internal/chunk_dss.h +++ b/include/jemalloc/internal/chunk_dss.h @@ -23,8 +23,8 @@ extern const char *dss_prec_names[]; dss_prec_t chunk_dss_prec_get(void); bool chunk_dss_prec_set(dss_prec_t dss_prec); -void *chunk_alloc_dss(void *new_addr, size_t size, size_t alignment, - bool *zero); +void *chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero); bool chunk_in_dss(void *chunk); bool chunk_dss_boot(void); void chunk_dss_prefork(void); diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index 65617bc..ab9c986 100644 --- a/include/jemalloc/internal/ctl.h +++ 
b/include/jemalloc/internal/ctl.h @@ -54,11 +54,6 @@ struct ctl_stats_s { size_t active; size_t metadata; size_t mapped; - struct { - size_t current; /* stats_chunks.curchunks */ - uint64_t total; /* stats_chunks.nchunks */ - size_t high; /* stats_chunks.highchunks */ - } chunks; unsigned narenas; ctl_arena_stats_t *arenas; /* (narenas + 1) elements. */ }; diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index f45940c..fbcdcf9 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -9,21 +9,17 @@ typedef struct extent_node_s extent_node_t; /* Tree of extents. */ struct extent_node_s { - /* Linkage for the size/address-ordered tree. */ - rb_node(extent_node_t) link_szad; - - /* Linkage for the address-ordered tree. */ - rb_node(extent_node_t) link_ad; + /* Arena from which this extent came, if any. */ + arena_t *arena; /* Pointer to the extent that this tree node is responsible for. */ void *addr; - /* Total region size. */ + /* + * Total region size, or 0 if this node corresponds to an arena chunk. + */ size_t size; - /* Arena from which this extent came, if any. */ - arena_t *arena; - /* * 'prof_tctx' and 'zeroed' are never needed at the same time, so * overlay them in order to fit extent_node_t in one cache line. @@ -35,6 +31,17 @@ struct extent_node_s { /* True if zero-filled; used by chunk recycling code. */ bool zeroed; }; + + union { + /* Linkage for the size/address-ordered tree. */ + rb_node(extent_node_t) link_szad; + + /* Linkage for huge allocations and cached chunks nodes. */ + ql_elm(extent_node_t) link_ql; + }; + + /* Linkage for the address-ordered tree. */ + rb_node(extent_node_t) link_ad; }; typedef rb_tree(extent_node_t) extent_tree_t; diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index 231cc36..c478d16 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -27,10 +27,6 @@ arena_t *huge_aalloc(const void *ptr); size_t huge_salloc(const void *ptr); prof_tctx_t *huge_prof_tctx_get(const void *ptr); void huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx); -bool huge_boot(void); -void huge_prefork(void); -void huge_postfork_parent(void); -void huge_postfork_child(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index b8c994c..ab93aa5 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -119,13 +119,6 @@ static const bool config_xmalloc = false #endif ; -static const bool config_ivsalloc = -#ifdef JEMALLOC_IVSALLOC - true -#else - false -#endif - ; #ifdef JEMALLOC_C11ATOMICS #include @@ -352,9 +345,9 @@ typedef unsigned index_t; #include "jemalloc/internal/arena.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" -#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" @@ -378,9 +371,9 @@ typedef unsigned index_t; #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" -#include 
"jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" @@ -457,9 +450,9 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" -#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" #include "jemalloc/internal/quarantine.h" @@ -483,6 +476,7 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/mb.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" @@ -777,7 +771,6 @@ arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing, #endif #include "jemalloc/internal/bitmap.h" -#include "jemalloc/internal/rtree.h" /* * Include portions of arena.h interleaved with tcache.h in order to resolve * circular dependencies. @@ -966,10 +959,14 @@ ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) JEMALLOC_ALWAYS_INLINE size_t ivsalloc(const void *ptr, bool demote) { + extent_node_t *node; /* Return 0 if ptr is not within a chunk managed by jemalloc. */ - if (rtree_get(&chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == 0) + node = chunk_lookup(CHUNK_ADDR2BASE(ptr)); + if (node == NULL) return (0); + /* Only arena chunks should be looked up via interior pointers. */ + assert(node->addr == ptr || node->size == 0); return (isalloc(ptr, demote)); } @@ -999,7 +996,6 @@ p2rz(const void *ptr) JEMALLOC_ALWAYS_INLINE void idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata) { - arena_chunk_t *chunk; assert(ptr != NULL); if (config_stats && is_metadata) { @@ -1007,11 +1003,7 @@ idalloctm(tsd_t *tsd, void *ptr, tcache_t *tcache, bool is_metadata) config_prof)); } - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - arena_dalloc(tsd, chunk, ptr, tcache); - else - huge_dalloc(tsd, ptr, tcache); + arena_dalloc(tsd, ptr, tcache); } JEMALLOC_ALWAYS_INLINE void diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index c8d7daf..0f0db8a 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -187,12 +187,6 @@ #undef JEMALLOC_INTERNAL_FFS /* - * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside - * within jemalloc-owned chunks before dereferencing them. - */ -#undef JEMALLOC_IVSALLOC - -/* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. 
*/ #undef JEMALLOC_ZONE diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index cf42bea..d5601a6 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -60,6 +60,8 @@ arena_miscelm_to_pageind arena_miscelm_to_rpages arena_nbound arena_new +arena_node_alloc +arena_node_dalloc arena_palloc arena_postfork_child arena_postfork_parent @@ -103,8 +105,6 @@ atomic_sub_z base_alloc base_allocated_get base_boot -base_node_alloc -base_node_dalloc base_postfork_child base_postfork_parent base_prefork @@ -130,6 +130,7 @@ chunk_alloc_mmap chunk_boot chunk_dalloc_default chunk_dalloc_mmap +chunk_deregister chunk_dss_boot chunk_dss_postfork_child chunk_dss_postfork_parent @@ -137,12 +138,13 @@ chunk_dss_prec_get chunk_dss_prec_set chunk_dss_prefork chunk_in_dss +chunk_lookup chunk_npages chunk_postfork_child chunk_postfork_parent chunk_prefork +chunk_register chunk_unmap -chunks_mtx chunks_rtree chunksize chunksize_mask @@ -218,16 +220,12 @@ hash_x86_128 hash_x86_32 huge_aalloc huge_allocated -huge_boot huge_dalloc huge_dalloc_junk huge_malloc huge_ndalloc huge_nmalloc huge_palloc -huge_postfork_child -huge_postfork_parent -huge_prefork huge_prof_tctx_get huge_prof_tctx_set huge_ralloc diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index e86e17c..2eb726d 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -37,7 +37,7 @@ typedef void (rtree_node_dalloc_t)(rtree_node_elm_t *); struct rtree_node_elm_s { union { rtree_node_elm_t *child; - void *val; + extent_node_t *val; }; }; @@ -110,13 +110,14 @@ bool rtree_node_valid(rtree_node_elm_t *node); rtree_node_elm_t *rtree_child_tryread(rtree_node_elm_t *elm); rtree_node_elm_t *rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level); -void *rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm); -void rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, void *val); +extent_node_t *rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm); +void rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, + const extent_node_t *val); rtree_node_elm_t *rtree_subtree_tryread(rtree_t *rtree, unsigned level); rtree_node_elm_t *rtree_subtree_read(rtree_t *rtree, unsigned level); -void *rtree_get(rtree_t *rtree, uintptr_t key); -bool rtree_set(rtree_t *rtree, uintptr_t key, void *val); +extent_node_t *rtree_get(rtree_t *rtree, uintptr_t key); +bool rtree_set(rtree_t *rtree, uintptr_t key, const extent_node_t *val); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_)) @@ -173,18 +174,18 @@ rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level) return (child); } -JEMALLOC_INLINE void * +JEMALLOC_INLINE extent_node_t * rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm) { - return (atomic_read_p(&elm->val)); + return (atomic_read_p((void **)&elm->val)); } JEMALLOC_INLINE void -rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, void *val) +rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, const extent_node_t *val) { - atomic_write_p(&elm->val, val); + atomic_write_p((void **)&elm->val, val); } JEMALLOC_INLINE rtree_node_elm_t * @@ -210,7 +211,7 @@ rtree_subtree_read(rtree_t *rtree, unsigned level) return (subtree); } -JEMALLOC_INLINE void * +JEMALLOC_INLINE extent_node_t * rtree_get(rtree_t *rtree, uintptr_t key) { uintptr_t subkey; @@ -238,7 +239,7 @@ rtree_get(rtree_t *rtree, uintptr_t key) } JEMALLOC_INLINE bool 
-rtree_set(rtree_t *rtree, uintptr_t key, void *val) +rtree_set(rtree_t *rtree, uintptr_t key, const extent_node_t *val) { uintptr_t subkey; unsigned i, start_level; diff --git a/include/jemalloc/internal/stats.h b/include/jemalloc/internal/stats.h index 7cba77b..c91dba9 100644 --- a/include/jemalloc/internal/stats.h +++ b/include/jemalloc/internal/stats.h @@ -135,21 +135,6 @@ struct arena_stats_s { malloc_huge_stats_t *hstats; }; -struct chunk_stats_s { - /* Number of chunks that were allocated. */ - uint64_t nchunks; - - /* High-water mark for number of chunks allocated. */ - size_t highchunks; - - /* - * Current number of chunks allocated. This value isn't maintained for - * any other purpose, so keep track of it in order to be able to set - * highchunks. - */ - size_t curchunks; -}; - #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS diff --git a/src/arena.c b/src/arena.c index 907fbd7..2bd1a2c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -20,6 +20,7 @@ unsigned nhclasses; /* Number of huge size classes. */ * definition. */ +static void arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk); static void arena_purge(arena_t *arena, bool all); static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned); @@ -392,8 +393,7 @@ arena_chunk_init_spare(arena_t *arena) } static arena_chunk_t * -arena_chunk_alloc_internal(arena_t *arena, size_t size, size_t alignment, - bool *zero) +arena_chunk_alloc_internal(arena_t *arena, bool *zero) { arena_chunk_t *chunk; chunk_alloc_t *chunk_alloc; @@ -403,7 +403,16 @@ arena_chunk_alloc_internal(arena_t *arena, size_t size, size_t alignment, chunk_dalloc = arena->chunk_dalloc; malloc_mutex_unlock(&arena->lock); chunk = (arena_chunk_t *)chunk_alloc_arena(chunk_alloc, chunk_dalloc, - arena->ind, NULL, size, alignment, zero); + arena->ind, NULL, chunksize, chunksize, zero); + if (chunk != NULL) { + chunk->node.arena = arena; + chunk->node.addr = chunk; + chunk->node.size = 0; /* Indicates this is an arena chunk. */ + if (chunk_register(chunk, &chunk->node)) { + chunk_dalloc((void *)chunk, chunksize, arena->ind); + chunk = NULL; + } + } malloc_mutex_lock(&arena->lock); if (config_stats && chunk != NULL) { arena->stats.mapped += chunksize; @@ -423,12 +432,10 @@ arena_chunk_init_hard(arena_t *arena) assert(arena->spare == NULL); zero = false; - chunk = arena_chunk_alloc_internal(arena, chunksize, chunksize, &zero); + chunk = arena_chunk_alloc_internal(arena, &zero); if (chunk == NULL) return (NULL); - chunk->arena = arena; - /* * Initialize the map to contain one maximal free untouched run. Mark * the pages as zeroed iff chunk_alloc() returned a zeroed chunk. 
@@ -514,6 +521,7 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) } chunk_dalloc = arena->chunk_dalloc; malloc_mutex_unlock(&arena->lock); + chunk_deregister(spare, &spare->node); chunk_dalloc((void *)spare, chunksize, arena->ind); malloc_mutex_lock(&arena->lock); if (config_stats) { @@ -593,6 +601,32 @@ arena_huge_ralloc_stats_update_undo(arena_t *arena, size_t oldsize, arena_huge_malloc_stats_update_undo(arena, usize); } +extent_node_t * +arena_node_alloc(arena_t *arena) +{ + extent_node_t *node; + + malloc_mutex_lock(&arena->node_cache_mtx); + node = ql_last(&arena->node_cache, link_ql); + if (node == NULL) { + malloc_mutex_unlock(&arena->node_cache_mtx); + return (base_alloc(sizeof(extent_node_t))); + } + ql_tail_remove(&arena->node_cache, extent_node_t, link_ql); + malloc_mutex_unlock(&arena->node_cache_mtx); + return (node); +} + +void +arena_node_dalloc(arena_t *arena, extent_node_t *node) +{ + + malloc_mutex_lock(&arena->node_cache_mtx); + ql_elm_new(node, link_ql); + ql_tail_insert(&arena->node_cache, node, link_ql); + malloc_mutex_unlock(&arena->node_cache_mtx); +} + void * arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, bool *zero) @@ -1782,7 +1816,7 @@ arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, if (run == bin->runcur) bin->runcur = NULL; else { - index_t binind = arena_bin_index(chunk->arena, bin); + index_t binind = arena_bin_index(chunk->node.arena, bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; if (bin_info->nregs != 1) { @@ -2123,7 +2157,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, arena_t *arena; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - arena = chunk->arena; + arena = chunk->node.arena; if (usize < oldsize) { /* Fill before shrinking in order avoid a race. 
*/ @@ -2338,10 +2372,21 @@ arena_new(unsigned ind) arena->ind = ind; arena->nthreads = 0; + if (malloc_mutex_init(&arena->lock)) + return (NULL); arena->chunk_alloc = chunk_alloc_default; arena->chunk_dalloc = chunk_dalloc_default; - - if (malloc_mutex_init(&arena->lock)) + ql_new(&arena->huge); + if (malloc_mutex_init(&arena->huge_mtx)) + return (NULL); + extent_tree_szad_new(&arena->chunks_szad_mmap); + extent_tree_ad_new(&arena->chunks_ad_mmap); + extent_tree_szad_new(&arena->chunks_szad_dss); + extent_tree_ad_new(&arena->chunks_ad_dss); + ql_new(&arena->node_cache); + if (malloc_mutex_init(&arena->chunks_mtx)) + return (NULL); + if (malloc_mutex_init(&arena->node_cache_mtx)) return (NULL); if (config_stats) { @@ -2551,6 +2596,9 @@ arena_prefork(arena_t *arena) unsigned i; malloc_mutex_prefork(&arena->lock); + malloc_mutex_prefork(&arena->huge_mtx); + malloc_mutex_prefork(&arena->chunks_mtx); + malloc_mutex_prefork(&arena->node_cache_mtx); for (i = 0; i < NBINS; i++) malloc_mutex_prefork(&arena->bins[i].lock); } @@ -2562,6 +2610,9 @@ arena_postfork_parent(arena_t *arena) for (i = 0; i < NBINS; i++) malloc_mutex_postfork_parent(&arena->bins[i].lock); + malloc_mutex_postfork_parent(&arena->node_cache_mtx); + malloc_mutex_postfork_parent(&arena->chunks_mtx); + malloc_mutex_postfork_parent(&arena->huge_mtx); malloc_mutex_postfork_parent(&arena->lock); } @@ -2572,5 +2623,8 @@ arena_postfork_child(arena_t *arena) for (i = 0; i < NBINS; i++) malloc_mutex_postfork_child(&arena->bins[i].lock); + malloc_mutex_postfork_child(&arena->node_cache_mtx); + malloc_mutex_postfork_child(&arena->chunks_mtx); + malloc_mutex_postfork_child(&arena->huge_mtx); malloc_mutex_postfork_child(&arena->lock); } diff --git a/src/base.c b/src/base.c index 0d1de7f..7b5804e 100644 --- a/src/base.c +++ b/src/base.c @@ -11,8 +11,9 @@ static size_t base_allocated; /******************************************************************************/ +/* base_mtx must be held. */ static extent_node_t * -base_node_try_alloc_locked(void) +base_node_try_alloc(void) { extent_node_t *node; @@ -24,8 +25,9 @@ base_node_try_alloc_locked(void) return (node); } +/* base_mtx must be held. */ static void -base_node_dalloc_locked(extent_node_t *node) +base_node_dalloc(extent_node_t *node) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(node, sizeof(extent_node_t)); @@ -42,14 +44,14 @@ base_chunk_alloc(size_t minsize) void *addr; assert(minsize != 0); - node = base_node_try_alloc_locked(); + node = base_node_try_alloc(); /* Allocate enough space to also carve a node out if necessary. */ nsize = (node == NULL) ? CACHELINE_CEILING(sizeof(extent_node_t)) : 0; csize = CHUNK_CEILING(minsize + nsize); addr = chunk_alloc_base(csize); if (addr == NULL) { if (node != NULL) - base_node_dalloc_locked(node); + base_node_dalloc(node); return (NULL); } if (node == NULL) { @@ -63,8 +65,13 @@ base_chunk_alloc(size_t minsize) return (node); } -static void * -base_alloc_locked(size_t size) +/* + * base_alloc() guarantees demand-zeroed memory, in order to make multi-page + * sparse data structures such as radix tree nodes efficient with respect to + * physical memory usage. + */ +void * +base_alloc(size_t size) { void *ret; size_t csize; @@ -79,6 +86,7 @@ base_alloc_locked(size_t size) key.addr = NULL; key.size = csize; + malloc_mutex_lock(&base_mtx); node = extent_tree_szad_nsearch(&base_avail_szad, &key); if (node != NULL) { /* Use existing space. */ @@ -87,8 +95,10 @@ base_alloc_locked(size_t size) /* Try to allocate more space. 
*/ node = base_chunk_alloc(csize); } - if (node == NULL) - return (NULL); + if (node == NULL) { + ret = NULL; + goto label_return; + } ret = node->addr; if (node->size > csize) { @@ -96,50 +106,15 @@ base_alloc_locked(size_t size) node->size -= csize; extent_tree_szad_insert(&base_avail_szad, node); } else - base_node_dalloc_locked(node); + base_node_dalloc(node); if (config_stats) base_allocated += csize; JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, csize); - return (ret); -} - -/* - * base_alloc() guarantees demand-zeroed memory, in order to make multi-page - * sparse data structures such as radix tree nodes efficient with respect to - * physical memory usage. - */ -void * -base_alloc(size_t size) -{ - void *ret; - - malloc_mutex_lock(&base_mtx); - ret = base_alloc_locked(size); +label_return: malloc_mutex_unlock(&base_mtx); return (ret); } -extent_node_t * -base_node_alloc(void) -{ - extent_node_t *ret; - - malloc_mutex_lock(&base_mtx); - if ((ret = base_node_try_alloc_locked()) == NULL) - ret = (extent_node_t *)base_alloc_locked(sizeof(extent_node_t)); - malloc_mutex_unlock(&base_mtx); - return (ret); -} - -void -base_node_dalloc(extent_node_t *node) -{ - - malloc_mutex_lock(&base_mtx); - base_node_dalloc_locked(node); - malloc_mutex_unlock(&base_mtx); -} - size_t base_allocated_get(void) { diff --git a/src/chunk.c b/src/chunk.c index 9ba0b0c..6f705de 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -7,19 +7,9 @@ const char *opt_dss = DSS_DEFAULT; size_t opt_lg_chunk = LG_CHUNK_DEFAULT; -malloc_mutex_t chunks_mtx; -chunk_stats_t stats_chunks; - -/* - * Trees of chunks that were previously allocated (trees differ only in node - * ordering). These are used when allocating chunks, in an attempt to re-use - * address space. Depending on function, different tree orderings are needed, - * which is why there are two trees with the same contents. - */ -static extent_tree_t chunks_szad_mmap; -static extent_tree_t chunks_ad_mmap; -static extent_tree_t chunks_szad_dss; -static extent_tree_t chunks_ad_dss; +/* Used exclusively for gdump triggering. */ +static size_t curchunks; +static size_t highchunks; rtree_t chunks_rtree; @@ -29,18 +19,51 @@ size_t chunksize_mask; /* (chunksize - 1). */ size_t chunk_npages; /******************************************************************************/ -/* - * Function prototypes for static functions that are referenced prior to - * definition. - */ -static void chunk_dalloc_core(void *chunk, size_t size); +bool +chunk_register(const void *chunk, const extent_node_t *node) +{ -/******************************************************************************/ + assert(node->addr == chunk); + + if (rtree_set(&chunks_rtree, (uintptr_t)chunk, node)) + return (true); + if (config_prof && opt_prof) { + size_t nadd = (node->size == 0) ? 1 : node->size / chunksize; + size_t cur = atomic_add_z(&curchunks, nadd); + size_t high = atomic_read_z(&highchunks); + while (cur > high && atomic_cas_z(&highchunks, high, cur)) { + /* + * Don't refresh cur, because it may have decreased + * since this thread lost the highchunks update race. + */ + high = atomic_read_z(&highchunks); + } + if (cur > high && prof_gdump_get_unlocked()) + prof_gdump(); + } + + return (false); +} + +void +chunk_deregister(const void *chunk, const extent_node_t *node) +{ + bool err; + + err = rtree_set(&chunks_rtree, (uintptr_t)chunk, NULL); + assert(!err); + if (config_prof && opt_prof) { + size_t nsub = (node->size == 0) ? 
1 : node->size / chunksize; + assert(atomic_read_z(&curchunks) >= nsub); + atomic_sub_z(&curchunks, nsub); + } +} static void * -chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, - void *new_addr, size_t size, size_t alignment, bool base, bool *zero) +chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, + extent_tree_t *chunks_ad, void *new_addr, size_t size, size_t alignment, + bool *zero) { void *ret; extent_node_t *node; @@ -50,27 +73,17 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, assert(new_addr == NULL || alignment == chunksize); - if (base) { - /* - * This function may need to call base_node_{,de}alloc(), but - * the current chunk allocation request is on behalf of the - * base allocator. Avoid deadlock (and if that weren't an - * issue, potential for infinite recursion) by returning NULL. - */ - return (NULL); - } - alloc_size = size + alignment - chunksize; /* Beware size_t wrap-around. */ if (alloc_size < size) return (NULL); key.addr = new_addr; key.size = alloc_size; - malloc_mutex_lock(&chunks_mtx); + malloc_mutex_lock(&arena->chunks_mtx); node = (new_addr != NULL) ? extent_tree_ad_search(chunks_ad, &key) : extent_tree_szad_nsearch(chunks_szad, &key); if (node == NULL) { - malloc_mutex_unlock(&chunks_mtx); + malloc_mutex_unlock(&arena->chunks_mtx); return (NULL); } leadsize = ALIGNMENT_CEILING((uintptr_t)node->addr, alignment) - @@ -95,20 +108,12 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, if (trailsize != 0) { /* Insert the trailing space as a smaller chunk. */ if (node == NULL) { - /* - * An additional node is required, but - * base_node_alloc() can cause a new base chunk to be - * allocated. Drop chunks_mtx in order to avoid - * deadlock, and if node allocation fails, deallocate - * the result before returning an error. - */ - malloc_mutex_unlock(&chunks_mtx); - node = base_node_alloc(); + node = arena_node_alloc(arena); if (node == NULL) { - chunk_dalloc_core(ret, size); + malloc_mutex_unlock(&arena->chunks_mtx); + chunk_unmap(arena, ret, size); return (NULL); } - malloc_mutex_lock(&chunks_mtx); } node->addr = (void *)((uintptr_t)(ret) + size); node->size = trailsize; @@ -117,10 +122,10 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, extent_tree_ad_insert(chunks_ad, node); node = NULL; } - malloc_mutex_unlock(&chunks_mtx); + malloc_mutex_unlock(&arena->chunks_mtx); if (node != NULL) - base_node_dalloc(node); + arena_node_dalloc(arena, node); if (*zero) { if (!zeroed) memset(ret, 0, size); @@ -137,15 +142,15 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, } static void * -chunk_alloc_core_dss(void *new_addr, size_t size, size_t alignment, bool base, - bool *zero) +chunk_alloc_core_dss(arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero) { void *ret; - if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, - new_addr, size, alignment, base, zero)) != NULL) + if ((ret = chunk_recycle(arena, &arena->chunks_szad_dss, + &arena->chunks_ad_dss, new_addr, size, alignment, zero)) != NULL) return (ret); - ret = chunk_alloc_dss(new_addr, size, alignment, zero); + ret = chunk_alloc_dss(arena, new_addr, size, alignment, zero); return (ret); } @@ -156,7 +161,7 @@ chunk_alloc_core_dss(void *new_addr, size_t size, size_t alignment, bool base, * them if they are returned. 
*/ static void * -chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, +chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, bool *zero, dss_prec_t dss_prec) { void *ret; @@ -168,12 +173,13 @@ chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, /* "primary" dss. */ if (have_dss && dss_prec == dss_prec_primary && (ret = - chunk_alloc_core_dss(new_addr, size, alignment, base, zero)) != + chunk_alloc_core_dss(arena, new_addr, size, alignment, zero)) != NULL) return (ret); /* mmap. */ - if (!config_munmap && (ret = chunk_recycle(&chunks_szad_mmap, - &chunks_ad_mmap, new_addr, size, alignment, base, zero)) != NULL) + if (!config_munmap && (ret = chunk_recycle(arena, + &arena->chunks_szad_mmap, &arena->chunks_ad_mmap, new_addr, size, + alignment, zero)) != NULL) return (ret); /* * Requesting an address is not implemented for chunk_alloc_mmap(), so @@ -184,7 +190,7 @@ chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, return (ret); /* "secondary" dss. */ if (have_dss && dss_prec == dss_prec_secondary && (ret = - chunk_alloc_core_dss(new_addr, size, alignment, base, zero)) != + chunk_alloc_core_dss(arena, new_addr, size, alignment, zero)) != NULL) return (ret); @@ -192,40 +198,6 @@ chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, return (NULL); } -static bool -chunk_register(void *chunk, size_t size, bool base) -{ - - assert(chunk != NULL); - assert(CHUNK_ADDR2BASE(chunk) == chunk); - - if (config_ivsalloc && !base) { - if (rtree_set(&chunks_rtree, (uintptr_t)chunk, chunk)) - return (true); - } - if (config_stats || config_prof) { - bool gdump; - malloc_mutex_lock(&chunks_mtx); - if (config_stats) - stats_chunks.nchunks += (size / chunksize); - stats_chunks.curchunks += (size / chunksize); - if (stats_chunks.curchunks > stats_chunks.highchunks) { - stats_chunks.highchunks = - stats_chunks.curchunks; - if (config_prof) - gdump = true; - } else if (config_prof) - gdump = false; - malloc_mutex_unlock(&chunks_mtx); - if (config_prof && opt_prof && prof_gdump_get_unlocked() && - gdump) - prof_gdump(); - } - if (config_valgrind) - JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(chunk, size); - return (false); -} - void * chunk_alloc_base(size_t size) { @@ -239,10 +211,10 @@ chunk_alloc_base(size_t size) */ zero = true; ret = chunk_alloc_mmap(size, chunksize, &zero); - if (ret != NULL && chunk_register(ret, size, true)) { - chunk_dalloc_core(ret, size); - ret = NULL; - } + if (ret == NULL) + return (NULL); + if (config_valgrind) + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); return (ret); } @@ -255,18 +227,16 @@ chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc, void *ret; ret = chunk_alloc(new_addr, size, alignment, zero, arena_ind); - if (ret != NULL && chunk_register(ret, size, false)) { - chunk_dalloc(ret, size, arena_ind); - ret = NULL; - } + if (ret == NULL) + return (NULL); + if (config_valgrind) + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); return (ret); } -/* Default arena chunk allocation routine in the absence of user override. */ -void * -chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, - unsigned arena_ind) +static arena_t * +chunk_arena_get(unsigned arena_ind) { arena_t *arena; @@ -278,32 +248,32 @@ chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, * already. */ assert(arena != NULL); + return (arena); +} + +/* Default arena chunk allocation routine in the absence of user override. 
*/ +void * +chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, + unsigned arena_ind) +{ + arena_t *arena; - return (chunk_alloc_core(new_addr, size, alignment, false, zero, + arena = chunk_arena_get(arena_ind); + return (chunk_alloc_core(arena, new_addr, size, alignment, zero, arena->dss_prec)); } static void -chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk, - size_t size) +chunk_record(arena_t *arena, extent_tree_t *chunks_szad, + extent_tree_t *chunks_ad, void *chunk, size_t size) { bool unzeroed; - extent_node_t *xnode, *node, *prev, *xprev, key; + extent_node_t *node, *prev, key; unzeroed = pages_purge(chunk, size); JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); - /* - * Allocate a node before acquiring chunks_mtx even though it might not - * be needed, because base_node_alloc() may cause a new base chunk to - * be allocated, which could cause deadlock if chunks_mtx were already - * held. - */ - xnode = base_node_alloc(); - /* Use xprev to implement conditional deferred deallocation of prev. */ - xprev = NULL; - - malloc_mutex_lock(&chunks_mtx); + malloc_mutex_lock(&arena->chunks_mtx); key.addr = (void *)((uintptr_t)chunk + size); node = extent_tree_ad_nsearch(chunks_ad, &key); /* Try to coalesce forward. */ @@ -320,17 +290,16 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk, extent_tree_szad_insert(chunks_szad, node); } else { /* Coalescing forward failed, so insert a new node. */ - if (xnode == NULL) { + node = arena_node_alloc(arena); + if (node == NULL) { /* - * base_node_alloc() failed, which is an exceedingly + * Node allocation failed, which is an exceedingly * unlikely failure. Leak chunk; its pages have * already been purged, so this is only a virtual * memory leak. */ goto label_return; } - node = xnode; - xnode = NULL; /* Prevent deallocation below. */ node->addr = chunk; node->size = size; node->zeroed = !unzeroed; @@ -356,37 +325,15 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk, node->zeroed = (node->zeroed && prev->zeroed); extent_tree_szad_insert(chunks_szad, node); - xprev = prev; + arena_node_dalloc(arena, prev); } label_return: - malloc_mutex_unlock(&chunks_mtx); - /* - * Deallocate xnode and/or xprev after unlocking chunks_mtx in order to - * avoid potential deadlock. 
- */ - if (xnode != NULL) - base_node_dalloc(xnode); - if (xprev != NULL) - base_node_dalloc(xprev); + malloc_mutex_unlock(&arena->chunks_mtx); } void -chunk_unmap(void *chunk, size_t size) -{ - assert(chunk != NULL); - assert(CHUNK_ADDR2BASE(chunk) == chunk); - assert(size != 0); - assert((size & chunksize_mask) == 0); - - if (have_dss && chunk_in_dss(chunk)) - chunk_record(&chunks_szad_dss, &chunks_ad_dss, chunk, size); - else if (chunk_dalloc_mmap(chunk, size)) - chunk_record(&chunks_szad_mmap, &chunks_ad_mmap, chunk, size); -} - -static void -chunk_dalloc_core(void *chunk, size_t size) +chunk_unmap(arena_t *arena, void *chunk, size_t size) { assert(chunk != NULL); @@ -394,16 +341,13 @@ chunk_dalloc_core(void *chunk, size_t size) assert(size != 0); assert((size & chunksize_mask) == 0); - if (config_ivsalloc) - rtree_set(&chunks_rtree, (uintptr_t)chunk, NULL); - if (config_stats || config_prof) { - malloc_mutex_lock(&chunks_mtx); - assert(stats_chunks.curchunks >= (size / chunksize)); - stats_chunks.curchunks -= (size / chunksize); - malloc_mutex_unlock(&chunks_mtx); + if (have_dss && chunk_in_dss(chunk)) { + chunk_record(arena, &arena->chunks_szad_dss, + &arena->chunks_ad_dss, chunk, size); + } else if (chunk_dalloc_mmap(chunk, size)) { + chunk_record(arena, &arena->chunks_szad_mmap, + &arena->chunks_ad_mmap, chunk, size); } - - chunk_unmap(chunk, size); } /* Default arena chunk deallocation routine in the absence of user override. */ @@ -411,7 +355,7 @@ bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) { - chunk_dalloc_core(chunk, size); + chunk_unmap(chunk_arena_get(arena_ind), chunk, size); return (false); } @@ -433,21 +377,11 @@ chunk_boot(void) chunksize_mask = chunksize - 1; chunk_npages = (chunksize >> LG_PAGE); - if (malloc_mutex_init(&chunks_mtx)) - return (true); - if (config_stats || config_prof) - memset(&stats_chunks, 0, sizeof(chunk_stats_t)); if (have_dss && chunk_dss_boot()) return (true); - extent_tree_szad_new(&chunks_szad_mmap); - extent_tree_ad_new(&chunks_ad_mmap); - extent_tree_szad_new(&chunks_szad_dss); - extent_tree_ad_new(&chunks_ad_dss); - if (config_ivsalloc) { - if (rtree_new(&chunks_rtree, (ZU(1) << (LG_SIZEOF_PTR+3)) - - opt_lg_chunk, chunks_rtree_node_alloc, NULL)) - return (true); - } + if (rtree_new(&chunks_rtree, (ZU(1) << (LG_SIZEOF_PTR+3)) - + opt_lg_chunk, chunks_rtree_node_alloc, NULL)) + return (true); return (false); } @@ -456,7 +390,6 @@ void chunk_prefork(void) { - malloc_mutex_prefork(&chunks_mtx); chunk_dss_prefork(); } @@ -465,7 +398,6 @@ chunk_postfork_parent(void) { chunk_dss_postfork_parent(); - malloc_mutex_postfork_parent(&chunks_mtx); } void @@ -473,5 +405,4 @@ chunk_postfork_child(void) { chunk_dss_postfork_child(); - malloc_mutex_postfork_child(&chunks_mtx); } diff --git a/src/chunk_dss.c b/src/chunk_dss.c index edba3b2..9c3eea8 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -66,7 +66,8 @@ chunk_dss_prec_set(dss_prec_t dss_prec) } void * -chunk_alloc_dss(void *new_addr, size_t size, size_t alignment, bool *zero) +chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, size_t alignment, + bool *zero) { void *ret; @@ -133,7 +134,7 @@ chunk_alloc_dss(void *new_addr, size_t size, size_t alignment, bool *zero) dss_max = dss_next; malloc_mutex_unlock(&dss_mtx); if (cpad_size != 0) - chunk_unmap(cpad, cpad_size); + chunk_unmap(arena, cpad, cpad_size); if (*zero) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( ret, size); diff --git a/src/ctl.c b/src/ctl.c index a283803..cd7927f 100644 --- a/src/ctl.c +++ 
b/src/ctl.c @@ -144,9 +144,6 @@ CTL_PROTO(prof_gdump) CTL_PROTO(prof_reset) CTL_PROTO(prof_interval) CTL_PROTO(lg_prof_sample) -CTL_PROTO(stats_chunks_current) -CTL_PROTO(stats_chunks_total) -CTL_PROTO(stats_chunks_high) CTL_PROTO(stats_arenas_i_small_allocated) CTL_PROTO(stats_arenas_i_small_nmalloc) CTL_PROTO(stats_arenas_i_small_ndalloc) @@ -363,12 +360,6 @@ static const ctl_named_node_t prof_node[] = { {NAME("lg_sample"), CTL(lg_prof_sample)} }; -static const ctl_named_node_t stats_chunks_node[] = { - {NAME("current"), CTL(stats_chunks_current)}, - {NAME("total"), CTL(stats_chunks_total)}, - {NAME("high"), CTL(stats_chunks_high)} -}; - static const ctl_named_node_t stats_arenas_i_metadata_node[] = { {NAME("mapped"), CTL(stats_arenas_i_metadata_mapped)}, {NAME("allocated"), CTL(stats_arenas_i_metadata_allocated)} @@ -473,7 +464,6 @@ static const ctl_named_node_t stats_node[] = { {NAME("active"), CTL(stats_active)}, {NAME("metadata"), CTL(stats_metadata)}, {NAME("mapped"), CTL(stats_mapped)}, - {NAME("chunks"), CHILD(named, stats_chunks)}, {NAME("arenas"), CHILD(indexed, stats_arenas)} }; @@ -688,14 +678,6 @@ ctl_refresh(void) unsigned i; VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas); - if (config_stats) { - malloc_mutex_lock(&chunks_mtx); - ctl_stats.chunks.current = stats_chunks.curchunks; - ctl_stats.chunks.total = stats_chunks.nchunks; - ctl_stats.chunks.high = stats_chunks.highchunks; - malloc_mutex_unlock(&chunks_mtx); - } - /* * Clear sum stats, since they will be merged into by * ctl_arena_refresh(). @@ -733,7 +715,8 @@ ctl_refresh(void) + ctl_stats.arenas[ctl_stats.narenas].astats.metadata_mapped + ctl_stats.arenas[ctl_stats.narenas].astats .metadata_allocated; - ctl_stats.mapped = (ctl_stats.chunks.current << opt_lg_chunk); + ctl_stats.mapped = + ctl_stats.arenas[ctl_stats.narenas].astats.mapped; } ctl_epoch++; @@ -1950,11 +1933,6 @@ CTL_RO_CGEN(config_stats, stats_active, ctl_stats.active, size_t) CTL_RO_CGEN(config_stats, stats_metadata, ctl_stats.metadata, size_t) CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t) -CTL_RO_CGEN(config_stats, stats_chunks_current, ctl_stats.chunks.current, - size_t) -CTL_RO_CGEN(config_stats, stats_chunks_total, ctl_stats.chunks.total, uint64_t) -CTL_RO_CGEN(config_stats, stats_chunks_high, ctl_stats.chunks.high, size_t) - CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *) CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned) CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t) diff --git a/src/huge.c b/src/huge.c index db0ecd5..0032727 100644 --- a/src/huge.c +++ b/src/huge.c @@ -2,15 +2,33 @@ #include "jemalloc/internal/jemalloc_internal.h" /******************************************************************************/ -/* Data. */ -/* Protects chunk-related data structures. */ -static malloc_mutex_t huge_mtx; +static extent_node_t * +huge_node_get(const void *ptr) +{ + extent_node_t *node; -/******************************************************************************/ + node = chunk_lookup(ptr); + assert(node->size != 0); + + return (node); +} -/* Tree of chunks that are stand-alone huge allocations. 
*/ -static extent_tree_t huge; +static bool +huge_node_set(const void *ptr, extent_node_t *node) +{ + + assert(node->addr == ptr); + assert(node->size != 0); + return (chunk_register(ptr, node)); +} + +static void +huge_node_unset(const void *ptr, const extent_node_t *node) +{ + + chunk_deregister(ptr, node); +} void * huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, @@ -55,15 +73,22 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, return (NULL); } - /* Insert node into huge. */ node->addr = ret; node->size = usize; node->zeroed = is_zeroed; node->arena = arena; - malloc_mutex_lock(&huge_mtx); - extent_tree_ad_insert(&huge, node); - malloc_mutex_unlock(&huge_mtx); + if (huge_node_set(ret, node)) { + arena_chunk_dalloc_huge(arena, ret, usize); + idalloctm(tsd, node, tcache, true); + return (NULL); + } + + /* Insert node into huge. */ + malloc_mutex_lock(&arena->huge_mtx); + ql_elm_new(node, link_ql); + ql_tail_insert(&arena->huge, node, link_ql); + malloc_mutex_unlock(&arena->huge_mtx); if (zero || (config_fill && unlikely(opt_zero))) { if (!is_zeroed) @@ -74,32 +99,6 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, return (ret); } -static extent_node_t * -huge_node_locked(const void *ptr) -{ - extent_node_t *node, key; - - /* Extract from tree of huge allocations. */ - key.addr = __DECONST(void *, ptr); - node = extent_tree_ad_search(&huge, &key); - assert(node != NULL); - assert(node->addr == ptr); - - return (node); -} - -static extent_node_t * -huge_node(const void *ptr) -{ - extent_node_t *node; - - malloc_mutex_lock(&huge_mtx); - node = huge_node_locked(ptr); - malloc_mutex_unlock(&huge_mtx); - - return (node); -} - #ifdef JEMALLOC_JET #undef huge_dalloc_junk #define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl) @@ -152,15 +151,15 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, } else zeroed = true; - malloc_mutex_lock(&huge_mtx); - node = huge_node_locked(ptr); + node = huge_node_get(ptr); arena = node->arena; + malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ assert(node->size != usize); node->size = usize; /* Clear node->zeroed if zeroing failed above. */ node->zeroed = (node->zeroed && zeroed); - malloc_mutex_unlock(&huge_mtx); + malloc_mutex_unlock(&arena->huge_mtx); arena_chunk_ralloc_huge_similar(arena, ptr, oldsize, usize); @@ -195,14 +194,14 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) zeroed = false; } - malloc_mutex_lock(&huge_mtx); - node = huge_node_locked(ptr); + node = huge_node_get(ptr); arena = node->arena; + malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ node->size = usize; /* Clear node->zeroed if zeroing failed above. */ node->zeroed = (node->zeroed && zeroed); - malloc_mutex_unlock(&huge_mtx); + malloc_mutex_unlock(&arena->huge_mtx); /* Zap the excess chunks. 
*/ arena_chunk_ralloc_huge_shrink(arena, ptr, oldsize, usize); @@ -221,11 +220,11 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { return (true); } - malloc_mutex_lock(&huge_mtx); - node = huge_node_locked(ptr); + node = huge_node_get(ptr); arena = node->arena; + malloc_mutex_lock(&arena->huge_mtx); is_zeroed_subchunk = node->zeroed; - malloc_mutex_unlock(&huge_mtx); + malloc_mutex_unlock(&arena->huge_mtx); /* * Copy zero into is_zeroed_chunk and pass the copy to chunk_alloc(), so @@ -237,10 +236,10 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { &is_zeroed_chunk)) return (true); - malloc_mutex_lock(&huge_mtx); + malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ node->size = usize; - malloc_mutex_unlock(&huge_mtx); + malloc_mutex_unlock(&arena->huge_mtx); if (zero || (config_fill && unlikely(opt_zero))) { if (!is_zeroed_subchunk) { @@ -356,11 +355,14 @@ void huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) { extent_node_t *node; + arena_t *arena; - malloc_mutex_lock(&huge_mtx); - node = huge_node_locked(ptr); - extent_tree_ad_remove(&huge, node); - malloc_mutex_unlock(&huge_mtx); + node = huge_node_get(ptr); + arena = node->arena; + huge_node_unset(ptr, node); + malloc_mutex_lock(&arena->huge_mtx); + ql_remove(&arena->huge, node, link_ql); + malloc_mutex_unlock(&arena->huge_mtx); huge_dalloc_junk(node->addr, node->size); arena_chunk_dalloc_huge(node->arena, node->addr, node->size); @@ -371,59 +373,50 @@ arena_t * huge_aalloc(const void *ptr) { - return (huge_node(ptr)->arena); + return (huge_node_get(ptr)->arena); } size_t huge_salloc(const void *ptr) { + size_t size; + extent_node_t *node; + arena_t *arena; - return (huge_node(ptr)->size); + node = huge_node_get(ptr); + arena = node->arena; + malloc_mutex_lock(&arena->huge_mtx); + size = node->size; + malloc_mutex_unlock(&arena->huge_mtx); + + return (size); } prof_tctx_t * huge_prof_tctx_get(const void *ptr) { + prof_tctx_t *tctx; + extent_node_t *node; + arena_t *arena; - return (huge_node(ptr)->prof_tctx); -} - -void -huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) -{ - - huge_node(ptr)->prof_tctx = tctx; -} - -bool -huge_boot(void) -{ - - /* Initialize chunks data. 
*/ - if (malloc_mutex_init(&huge_mtx)) - return (true); - extent_tree_ad_new(&huge); - - return (false); -} - -void -huge_prefork(void) -{ - - malloc_mutex_prefork(&huge_mtx); -} - -void -huge_postfork_parent(void) -{ + node = huge_node_get(ptr); + arena = node->arena; + malloc_mutex_lock(&arena->huge_mtx); + tctx = node->prof_tctx; + malloc_mutex_unlock(&arena->huge_mtx); - malloc_mutex_postfork_parent(&huge_mtx); + return (tctx); } void -huge_postfork_child(void) +huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { + extent_node_t *node; + arena_t *arena; - malloc_mutex_postfork_child(&huge_mtx); + node = huge_node_get(ptr); + arena = node->arena; + malloc_mutex_lock(&arena->huge_mtx); + node->prof_tctx = tctx; + malloc_mutex_unlock(&arena->huge_mtx); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 9447791..3903209 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1195,8 +1195,6 @@ malloc_init_hard_a0_locked(void) return (true); if (config_tcache && tcache_boot()) malloc_mutex_unlock(&init_lock); - if (huge_boot()) - return (true); if (malloc_mutex_init(&arenas_lock)) return (true); /* @@ -2310,12 +2308,10 @@ je_sallocx(const void *ptr, int flags) assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); - if (config_ivsalloc) + if (config_debug) usize = ivsalloc(ptr, config_prof); - else { - assert(ptr != NULL); + else usize = isalloc(ptr, config_prof); - } return (usize); } @@ -2440,10 +2436,10 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); - if (config_ivsalloc) + if (config_debug) ret = ivsalloc(ptr, config_prof); else - ret = (ptr != NULL) ? isalloc(ptr, config_prof) : 0; + ret = (ptr == NULL) ? 0 : isalloc(ptr, config_prof); return (ret); } @@ -2504,7 +2500,6 @@ _malloc_prefork(void) } chunk_prefork(); base_prefork(); - huge_prefork(); } #ifndef JEMALLOC_MUTEX_INIT_CB @@ -2524,7 +2519,6 @@ _malloc_postfork(void) assert(malloc_initialized()); /* Release all mutexes, now that fork() has completed. */ - huge_postfork_parent(); base_postfork_parent(); chunk_postfork_parent(); for (i = 0; i < narenas_total; i++) { @@ -2544,7 +2538,6 @@ jemalloc_postfork_child(void) assert(malloc_initialized()); /* Release all mutexes, now that fork() has completed. */ - huge_postfork_child(); base_postfork_child(); chunk_postfork_child(); for (i = 0; i < narenas_total; i++) { diff --git a/src/stats.c b/src/stats.c index 865f775..e0f7165 100644 --- a/src/stats.c +++ b/src/stats.c @@ -547,8 +547,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, if (config_stats) { size_t *cactive; size_t allocated, active, metadata, mapped; - size_t chunks_current, chunks_high; - uint64_t chunks_total; CTL_GET("stats.cactive", &cactive, size_t *); CTL_GET("stats.allocated", &allocated, size_t); @@ -561,16 +559,6 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, "Current active ceiling: %zu\n", atomic_read_z(cactive)); - /* Print chunk stats. 
*/ - CTL_GET("stats.chunks.total", &chunks_total, uint64_t); - CTL_GET("stats.chunks.high", &chunks_high, size_t); - CTL_GET("stats.chunks.current", &chunks_current, size_t); - malloc_cprintf(write_cb, cbopaque, "chunks: nchunks " - "highchunks curchunks\n"); - malloc_cprintf(write_cb, cbopaque, - " %13"PRIu64" %12zu %12zu\n", - chunks_total, chunks_high, chunks_current); - if (merged) { unsigned narenas; diff --git a/src/tcache.c b/src/tcache.c index 1166d60..10c85dd 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -102,7 +102,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, /* Lock the arena bin associated with the first object. */ arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( tbin->avail[0]); - arena_t *bin_arena = chunk->arena; + arena_t *bin_arena = chunk->node.arena; arena_bin_t *bin = &bin_arena->bins[binind]; if (config_prof && bin_arena == arena) { @@ -124,7 +124,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, ptr = tbin->avail[i]; assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk->arena == bin_arena) { + if (chunk->node.arena == bin_arena) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; arena_chunk_map_bits_t *bitselm = @@ -182,7 +182,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, /* Lock the arena associated with the first object. */ arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( tbin->avail[0]); - arena_t *locked_arena = chunk->arena; + arena_t *locked_arena = chunk->node.arena; UNUSED bool idump; if (config_prof) @@ -208,7 +208,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, ptr = tbin->avail[i]; assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk->arena == locked_arena) { + if (chunk->node.arena == locked_arena) { arena_dalloc_large_junked_locked(locked_arena, chunk, ptr); } else { diff --git a/test/unit/stats.c b/test/unit/stats.c index 946e737..1099967 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -29,32 +29,6 @@ TEST_BEGIN(test_stats_summary) } TEST_END -TEST_BEGIN(test_stats_chunks) -{ - size_t current, high; - uint64_t total; - size_t sz; - int expected = config_stats ? 0 : ENOENT; - - sz = sizeof(size_t); - assert_d_eq(mallctl("stats.chunks.current", ¤t, &sz, NULL, 0), - expected, "Unexpected mallctl() result"); - sz = sizeof(uint64_t); - assert_d_eq(mallctl("stats.chunks.total", &total, &sz, NULL, 0), - expected, "Unexpected mallctl() result"); - sz = sizeof(size_t); - assert_d_eq(mallctl("stats.chunks.high", &high, &sz, NULL, 0), expected, - "Unexpected mallctl() result"); - - if (config_stats) { - assert_zu_le(current, high, - "current should be no larger than high"); - assert_u64_le((uint64_t)high, total, - "high should be no larger than total"); - } -} -TEST_END - TEST_BEGIN(test_stats_huge) { void *p; @@ -458,7 +432,6 @@ main(void) return (test( test_stats_summary, - test_stats_chunks, test_stats_huge, test_stats_arenas_summary, test_stats_arenas_small, -- cgit v0.12 From 1eaf3b6f345e0b5835549f19e844c81314c90435 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 12 Feb 2015 15:46:30 -0500 Subject: add missing check for new_addr chunk size 8ddc93293cd8370870f221225ef1e013fbff6d65 switched this to over using the address tree in order to avoid false negatives, so it now needs to check that the size of the free extent is large enough to satisfy the request. 
--- src/chunk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chunk.c b/src/chunk.c index 6f705de..b357619 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -82,7 +82,7 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, malloc_mutex_lock(&arena->chunks_mtx); node = (new_addr != NULL) ? extent_tree_ad_search(chunks_ad, &key) : extent_tree_szad_nsearch(chunks_szad, &key); - if (node == NULL) { + if (node == NULL || (new_addr != NULL && node->size < size)) { malloc_mutex_unlock(&arena->chunks_mtx); return (NULL); } -- cgit v0.12 From 88fef7ceda6269598cef0cee8b984c8765673c27 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 12 Feb 2015 14:06:37 -0800 Subject: Refactor huge_*() calls into arena internals. Make redirects to the huge_*() API the arena code's responsibility, since arenas now take responsibility for all allocation sizes. --- include/jemalloc/internal/arena.h | 220 +++++++++++++---------- include/jemalloc/internal/jemalloc_internal.h.in | 64 +------ include/jemalloc/internal/prof.h | 17 +- src/arena.c | 160 +++++++++++------ 4 files changed, 236 insertions(+), 225 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 2ae4609..77a7dcb 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -391,7 +391,8 @@ void arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info); void arena_quarantine_junk_small(void *ptr, size_t usize); void *arena_malloc_small(arena_t *arena, size_t size, bool zero); void *arena_malloc_large(arena_t *arena, size_t size, bool zero); -void *arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero); +void *arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, + size_t alignment, bool zero, tcache_t *tcache); void arena_prof_promoted(const void *ptr, size_t size); void arena_dalloc_bin_junked_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_chunk_map_bits_t *bitselm); @@ -481,8 +482,7 @@ void *arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, arena_t *arena_aalloc(const void *ptr); size_t arena_salloc(const void *ptr, bool demote); void arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache); -void arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, - tcache_t *tcache); +void arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) @@ -931,20 +931,22 @@ arena_prof_tctx_get(const void *ptr) { prof_tctx_t *ret; arena_chunk_t *chunk; - size_t pageind, mapbits; cassert(config_prof); assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - mapbits = arena_mapbits_get(chunk, pageind); - assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); - if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) - ret = (prof_tctx_t *)(uintptr_t)1U; - else - ret = arena_miscelm_get(chunk, pageind)->prof_tctx; + if (likely(chunk != ptr)) { + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + size_t mapbits = arena_mapbits_get(chunk, pageind); + assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); + if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) + ret = (prof_tctx_t *)(uintptr_t)1U; + else + ret = arena_miscelm_get(chunk, pageind)->prof_tctx; + } else + ret = huge_prof_tctx_get(ptr); return (ret); } @@ -953,18 +955,20 @@ JEMALLOC_INLINE void arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) { arena_chunk_t *chunk; - 
size_t pageind; cassert(config_prof); assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + if (likely(chunk != ptr)) { + size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - if (unlikely(arena_mapbits_large_get(chunk, pageind) != 0)) - arena_miscelm_get(chunk, pageind)->prof_tctx = tctx; + if (unlikely(arena_mapbits_large_get(chunk, pageind) != 0)) + arena_miscelm_get(chunk, pageind)->prof_tctx = tctx; + } else + huge_prof_tctx_set(ptr, tctx); } JEMALLOC_ALWAYS_INLINE void * @@ -984,7 +988,7 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, return (NULL); return (arena_malloc_small(arena, size, zero)); } - } else { + } else if (likely(size <= arena_maxclass)) { /* * Initialize tcache after checking size in order to avoid * infinite recursion during tcache initialization. @@ -997,7 +1001,8 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, return (NULL); return (arena_malloc_large(arena, size, zero)); } - } + } else + return (huge_malloc(tsd, arena, size, zero, tcache)); } JEMALLOC_ALWAYS_INLINE arena_t * @@ -1006,7 +1011,10 @@ arena_aalloc(const void *ptr) arena_chunk_t *chunk; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - return (chunk->node.arena); + if (likely(chunk != ptr)) + return (chunk->node.arena); + else + return (huge_aalloc(ptr)); } /* Return the size of the allocation pointed to by ptr. */ @@ -1022,29 +1030,37 @@ arena_salloc(const void *ptr, bool demote) assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - binind = arena_mapbits_binind_get(chunk, pageind); - if (unlikely(binind == BININD_INVALID || (config_prof && !demote && - arena_mapbits_large_get(chunk, pageind) != 0))) { - /* - * Large allocation. In the common case (demote), and as this - * is an inline function, most callers will only end up looking - * at binind to determine that ptr is a small allocation. - */ - assert(((uintptr_t)ptr & PAGE_MASK) == 0); - ret = arena_mapbits_large_size_get(chunk, pageind); - assert(ret != 0); - assert(pageind + (ret>>LG_PAGE) <= chunk_npages); - assert(arena_mapbits_dirty_get(chunk, pageind) == - arena_mapbits_dirty_get(chunk, pageind+(ret>>LG_PAGE)-1)); - } else { - /* Small allocation (possibly promoted to a large object). */ - assert(arena_mapbits_large_get(chunk, pageind) != 0 || - arena_ptr_small_binind_get(ptr, arena_mapbits_get(chunk, - pageind)) == binind); - ret = index2size(binind); - } + if (likely(chunk != ptr)) { + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + binind = arena_mapbits_binind_get(chunk, pageind); + if (unlikely(binind == BININD_INVALID || (config_prof && !demote + && arena_mapbits_large_get(chunk, pageind) != 0))) { + /* + * Large allocation. In the common case (demote), and + * as this is an inline function, most callers will only + * end up looking at binind to determine that ptr is a + * small allocation. 
+ */ + assert(((uintptr_t)ptr & PAGE_MASK) == 0); + ret = arena_mapbits_large_size_get(chunk, pageind); + assert(ret != 0); + assert(pageind + (ret>>LG_PAGE) <= chunk_npages); + assert(arena_mapbits_dirty_get(chunk, pageind) == + arena_mapbits_dirty_get(chunk, + pageind+(ret>>LG_PAGE)-1)); + } else { + /* + * Small allocation (possibly promoted to a large + * object). + */ + assert(arena_mapbits_large_get(chunk, pageind) != 0 || + arena_ptr_small_binind_get(ptr, + arena_mapbits_get(chunk, pageind)) == binind); + ret = index2size(binind); + } + } else + ret = huge_salloc(ptr); return (ret); } @@ -1058,75 +1074,83 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (unlikely(chunk == ptr)) { - huge_dalloc(tsd, ptr, tcache); - return; - } - assert(CHUNK_ADDR2BASE(ptr) != ptr); - - pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - mapbits = arena_mapbits_get(chunk, pageind); - assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) { - /* Small allocation. */ - if (likely(tcache != NULL)) { - index_t binind = arena_ptr_small_binind_get(ptr, - mapbits); - tcache_dalloc_small(tsd, tcache, ptr, binind); + if (likely(chunk != ptr)) { + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; + mapbits = arena_mapbits_get(chunk, pageind); + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) { + /* Small allocation. */ + if (likely(tcache != NULL)) { + index_t binind = arena_ptr_small_binind_get(ptr, + mapbits); + tcache_dalloc_small(tsd, tcache, ptr, binind); + } else { + arena_dalloc_small(chunk->node.arena, chunk, + ptr, pageind); + } } else { - arena_dalloc_small(chunk->node.arena, chunk, ptr, + size_t size = arena_mapbits_large_size_get(chunk, pageind); - } - } else { - size_t size = arena_mapbits_large_size_get(chunk, pageind); - assert(((uintptr_t)ptr & PAGE_MASK) == 0); + assert(((uintptr_t)ptr & PAGE_MASK) == 0); - if (likely(tcache != NULL) && size <= tcache_maxclass) - tcache_dalloc_large(tsd, tcache, ptr, size); - else - arena_dalloc_large(chunk->node.arena, chunk, ptr); - } + if (likely(tcache != NULL) && size <= tcache_maxclass) + tcache_dalloc_large(tsd, tcache, ptr, size); + else { + arena_dalloc_large(chunk->node.arena, chunk, + ptr); + } + } + } else + huge_dalloc(tsd, ptr, tcache); } JEMALLOC_ALWAYS_INLINE void -arena_sdalloc(tsd_t *tsd, arena_chunk_t *chunk, void *ptr, size_t size, - tcache_t *tcache) +arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) { + arena_chunk_t *chunk; - assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); - - if (config_prof && opt_prof) { - size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - if (arena_mapbits_large_get(chunk, pageind) != 0) { - /* Make sure to use promoted size, not request size. */ - assert(((uintptr_t)ptr & PAGE_MASK) == 0); - size = arena_mapbits_large_size_get(chunk, pageind); - } - } - assert(s2u(size) == s2u(arena_salloc(ptr, false))); - - if (likely(size <= SMALL_MAXCLASS)) { - /* Small allocation. 
*/ - if (likely(tcache != NULL)) { - index_t binind = size2index(size); - tcache_dalloc_small(tsd, tcache, ptr, binind); - } else { + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (likely(chunk != ptr)) { + if (config_prof && opt_prof) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - arena_dalloc_small(chunk->node.arena, chunk, ptr, - pageind); + assert(arena_mapbits_allocated_get(chunk, pageind) != 0); + if (arena_mapbits_large_get(chunk, pageind) != 0) { + /* + * Make sure to use promoted size, not request + * size. + */ + assert(((uintptr_t)ptr & PAGE_MASK) == 0); + size = arena_mapbits_large_size_get(chunk, + pageind); + } } - } else { - assert(((uintptr_t)ptr & PAGE_MASK) == 0); + assert(s2u(size) == s2u(arena_salloc(ptr, false))); + + if (likely(size <= SMALL_MAXCLASS)) { + /* Small allocation. */ + if (likely(tcache != NULL)) { + index_t binind = size2index(size); + tcache_dalloc_small(tsd, tcache, ptr, binind); + } else { + size_t pageind = ((uintptr_t)ptr - + (uintptr_t)chunk) >> LG_PAGE; + arena_dalloc_small(chunk->node.arena, chunk, + ptr, pageind); + } + } else { + assert(((uintptr_t)ptr & PAGE_MASK) == 0); - if (likely(tcache != NULL) && size <= tcache_maxclass) - tcache_dalloc_large(tsd, tcache, ptr, size); - else - arena_dalloc_large(chunk->node.arena, chunk, ptr); - } + if (likely(tcache != NULL) && size <= tcache_maxclass) + tcache_dalloc_large(tsd, tcache, ptr, size); + else { + arena_dalloc_large(chunk->node.arena, chunk, + ptr); + } + } + } else + huge_dalloc(tsd, ptr, tcache); } # endif /* JEMALLOC_ARENA_INLINE_B */ #endif diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index ab93aa5..43276c6 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -823,18 +823,10 @@ bool ixalloc(void *ptr, size_t oldsize, size_t size, size_t extra, JEMALLOC_ALWAYS_INLINE arena_t * iaalloc(const void *ptr) { - arena_t *arena; - arena_chunk_t *chunk; assert(ptr != NULL); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - arena = arena_aalloc(ptr); - else - arena = huge_aalloc(ptr); - - return (arena); + return (arena_aalloc(ptr)); } /* @@ -845,20 +837,12 @@ iaalloc(const void *ptr) JEMALLOC_ALWAYS_INLINE size_t isalloc(const void *ptr, bool demote) { - size_t ret; - arena_chunk_t *chunk; assert(ptr != NULL); /* Demotion only makes sense if config_prof is true. 
*/ assert(config_prof || !demote); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - ret = arena_salloc(ptr, demote); - else - ret = huge_salloc(ptr); - - return (ret); + return (arena_salloc(ptr, demote)); } JEMALLOC_ALWAYS_INLINE void * @@ -869,10 +853,7 @@ iallocztm(tsd_t *tsd, size_t size, bool zero, tcache_t *tcache, bool is_metadata assert(size != 0); - if (likely(size <= arena_maxclass)) - ret = arena_malloc(tsd, arena, size, zero, tcache); - else - ret = huge_malloc(tsd, arena, size, zero, tcache); + ret = arena_malloc(tsd, arena, size, zero, tcache); if (config_stats && is_metadata && likely(ret != NULL)) { arena_metadata_allocated_add(iaalloc(ret), isalloc(ret, config_prof)); @@ -917,21 +898,7 @@ ipallocztm(tsd_t *tsd, size_t usize, size_t alignment, bool zero, assert(usize != 0); assert(usize == sa2u(usize, alignment)); - if (usize <= SMALL_MAXCLASS && alignment < PAGE) - ret = arena_malloc(tsd, arena, usize, zero, tcache); - else { - if (likely(usize <= arena_maxclass)) { - arena = arena_choose(tsd, arena); - if (unlikely(arena == NULL)) - return (NULL); - ret = arena_palloc(arena, usize, alignment, zero); - } else if (likely(alignment <= chunksize)) - ret = huge_malloc(tsd, arena, usize, zero, tcache); - else { - ret = huge_palloc(tsd, arena, usize, alignment, zero, - tcache); - } - } + ret = arena_palloc(tsd, arena, usize, alignment, zero, tcache); assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); if (config_stats && is_metadata && likely(ret != NULL)) { arena_metadata_allocated_add(iaalloc(ret), isalloc(ret, @@ -1033,15 +1000,8 @@ iqalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) JEMALLOC_ALWAYS_INLINE void isdalloct(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) { - arena_chunk_t *chunk; - - assert(ptr != NULL); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - arena_sdalloc(tsd, chunk, ptr, size, tcache); - else - huge_dalloc(tsd, ptr, tcache); + arena_sdalloc(tsd, ptr, size, tcache); } JEMALLOC_ALWAYS_INLINE void @@ -1104,13 +1064,8 @@ iralloct(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, zero, tcache, arena)); } - if (likely(size <= arena_maxclass)) { - return (arena_ralloc(tsd, arena, ptr, oldsize, size, 0, - alignment, zero, tcache)); - } else { - return (huge_ralloc(tsd, arena, ptr, oldsize, size, 0, - alignment, zero, tcache)); - } + return (arena_ralloc(tsd, arena, ptr, oldsize, size, 0, alignment, zero, + tcache)); } JEMALLOC_ALWAYS_INLINE void * @@ -1136,10 +1091,7 @@ ixalloc(void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, return (true); } - if (likely(size <= arena_maxclass)) - return (arena_ralloc_no_move(ptr, oldsize, size, extra, zero)); - else - return (huge_ralloc_no_move(ptr, oldsize, size, extra, zero)); + return (arena_ralloc_no_move(ptr, oldsize, size, extra, zero)); } #endif diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index b2db685..f508243 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -372,34 +372,21 @@ prof_tdata_get(tsd_t *tsd, bool create) JEMALLOC_ALWAYS_INLINE prof_tctx_t * prof_tctx_get(const void *ptr) { - prof_tctx_t *ret; - arena_chunk_t *chunk; cassert(config_prof); assert(ptr != NULL); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - ret = arena_prof_tctx_get(ptr); - else - ret = huge_prof_tctx_get(ptr); - - return (ret); + return (arena_prof_tctx_get(ptr)); } JEMALLOC_ALWAYS_INLINE void prof_tctx_set(const void *ptr, 
prof_tctx_t *tctx) { - arena_chunk_t *chunk; cassert(config_prof); assert(ptr != NULL); - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (likely(chunk != ptr)) - arena_prof_tctx_set(ptr, tctx); - else - huge_prof_tctx_set(ptr, tctx); + arena_prof_tctx_set(ptr, tctx); } JEMALLOC_ALWAYS_INLINE bool diff --git a/src/arena.c b/src/arena.c index 2bd1a2c..7b441be 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1714,8 +1714,9 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) } /* Only handles large allocations that require more than page alignment. */ -void * -arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) +static void * +arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, + bool zero) { void *ret; size_t alloc_size, leadsize, trailsize; @@ -1726,6 +1727,10 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) assert((size & PAGE_MASK) == 0); + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) + return (NULL); + alignment = PAGE_CEILING(alignment); alloc_size = size + alignment - PAGE; @@ -1783,6 +1788,28 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) return (ret); } +void * +arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, + bool zero, tcache_t *tcache) +{ + void *ret; + + if (usize <= SMALL_MAXCLASS && alignment < PAGE) + ret = arena_malloc(tsd, arena, usize, zero, tcache); + else { + if (likely(usize <= arena_maxclass)) { + ret = arena_palloc_large(tsd, arena, usize, alignment, + zero); + } else if (likely(alignment <= chunksize)) + ret = huge_malloc(tsd, arena, usize, zero, tcache); + else { + ret = huge_palloc(tsd, arena, usize, alignment, zero, + tcache); + } + } + return (ret); +} + void arena_prof_promoted(const void *ptr, size_t size) { @@ -2189,29 +2216,35 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero) { - /* - * Avoid moving the allocation if the size class can be left the same. - */ - if (likely(oldsize <= arena_maxclass)) { - if (oldsize <= SMALL_MAXCLASS) { - assert(arena_bin_info[size2index(oldsize)].reg_size - == oldsize); - if ((size + extra <= SMALL_MAXCLASS && size2index(size + - extra) == size2index(oldsize)) || (size <= oldsize - && size + extra >= oldsize)) - return (false); - } else { - assert(size <= arena_maxclass); - if (size + extra > SMALL_MAXCLASS) { - if (!arena_ralloc_large(ptr, oldsize, size, - extra, zero)) + if (likely(size <= arena_maxclass)) { + /* + * Avoid moving the allocation if the size class can be left the + * same. + */ + if (likely(oldsize <= arena_maxclass)) { + if (oldsize <= SMALL_MAXCLASS) { + assert( + arena_bin_info[size2index(oldsize)].reg_size + == oldsize); + if ((size + extra <= SMALL_MAXCLASS && + size2index(size + extra) == + size2index(oldsize)) || (size <= oldsize && + size + extra >= oldsize)) return (false); + } else { + assert(size <= arena_maxclass); + if (size + extra > SMALL_MAXCLASS) { + if (!arena_ralloc_large(ptr, oldsize, + size, extra, zero)) + return (false); + } } } - } - /* Reallocation would require a move. */ - return (true); + /* Reallocation would require a move. */ + return (true); + } else + return (huge_ralloc_no_move(ptr, oldsize, size, extra, zero)); } void * @@ -2219,52 +2252,67 @@ arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, tcache_t *tcache) { void *ret; - size_t copysize; - /* Try to avoid moving the allocation. 
*/ - if (!arena_ralloc_no_move(ptr, oldsize, size, extra, zero)) - return (ptr); + if (likely(size <= arena_maxclass)) { + size_t copysize; - /* - * size and oldsize are different enough that we need to move the - * object. In that case, fall back to allocating new space and - * copying. - */ - if (alignment != 0) { - size_t usize = sa2u(size + extra, alignment); - if (usize == 0) - return (NULL); - ret = ipalloct(tsd, usize, alignment, zero, tcache, arena); - } else - ret = arena_malloc(tsd, arena, size + extra, zero, tcache); + /* Try to avoid moving the allocation. */ + if (!arena_ralloc_no_move(ptr, oldsize, size, extra, zero)) + return (ptr); - if (ret == NULL) { - if (extra == 0) - return (NULL); - /* Try again, this time without extra. */ + /* + * size and oldsize are different enough that we need to move + * the object. In that case, fall back to allocating new space + * and copying. + */ if (alignment != 0) { - size_t usize = sa2u(size, alignment); + size_t usize = sa2u(size + extra, alignment); if (usize == 0) return (NULL); ret = ipalloct(tsd, usize, alignment, zero, tcache, arena); - } else - ret = arena_malloc(tsd, arena, size, zero, tcache); + } else { + ret = arena_malloc(tsd, arena, size + extra, zero, + tcache); + } - if (ret == NULL) - return (NULL); - } + if (ret == NULL) { + if (extra == 0) + return (NULL); + /* Try again, this time without extra. */ + if (alignment != 0) { + size_t usize = sa2u(size, alignment); + if (usize == 0) + return (NULL); + ret = ipalloct(tsd, usize, alignment, zero, + tcache, arena); + } else { + ret = arena_malloc(tsd, arena, size, zero, + tcache); + } - /* Junk/zero-filling were already done by ipalloc()/arena_malloc(). */ + if (ret == NULL) + return (NULL); + } - /* - * Copy at most size bytes (not size+extra), since the caller has no - * expectation that the extra bytes will be reliably preserved. - */ - copysize = (size < oldsize) ? size : oldsize; - JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize); - memcpy(ret, ptr, copysize); - isqalloc(tsd, ptr, oldsize, tcache); + /* + * Junk/zero-filling were already done by + * ipalloc()/arena_malloc(). + */ + + /* + * Copy at most size bytes (not size+extra), since the caller + * has no expectation that the extra bytes will be reliably + * preserved. + */ + copysize = (size < oldsize) ? size : oldsize; + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize); + memcpy(ret, ptr, copysize); + isqalloc(tsd, ptr, oldsize, tcache); + } else { + ret = huge_ralloc(tsd, arena, ptr, oldsize, size, extra, + alignment, zero, tcache); + } return (ret); } -- cgit v0.12 From 5f7140b045136232b1bbe66fcf2a7f63d08682a1 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 12 Feb 2015 15:54:53 -0800 Subject: Make prof_tctx accesses atomic. Although exceedingly unlikely, it appears that writes to the prof_tctx field of arena_chunk_map_misc_t could be reordered such that a stale value could be read during deallocation, with profiler metadata corruption and invalid pointer dereferences being the most likely effects. 
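Concretely, the plain load/store of elm->prof_tctx is replaced with jemalloc's atomic pointer helpers, roughly as below (a simplified sketch; the _sketch names are hypothetical, and the real call sites are arena_prof_tctx_get()/arena_prof_tctx_set() in the diff that follows):

/* Sketch: access prof_tctx through atomic pointer accessors. */
static void
prof_tctx_elm_set_sketch(arena_chunk_map_misc_t *elm, prof_tctx_t *tctx)
{

	/* Publish the fully initialized tctx to other threads. */
	atomic_write_p((void **)&elm->prof_tctx, tctx);
}

static prof_tctx_t *
prof_tctx_elm_get_sketch(arena_chunk_map_misc_t *elm)
{

	/* Paired atomic read; avoids observing a stale, reordered value. */
	return (atomic_read_p((void **)&elm->prof_tctx));
}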
--- include/jemalloc/internal/arena.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 77a7dcb..4d88736 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -943,8 +943,11 @@ arena_prof_tctx_get(const void *ptr) assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); if (likely((mapbits & CHUNK_MAP_LARGE) == 0)) ret = (prof_tctx_t *)(uintptr_t)1U; - else - ret = arena_miscelm_get(chunk, pageind)->prof_tctx; + else { + arena_chunk_map_misc_t *elm = arena_miscelm_get(chunk, + pageind); + ret = atomic_read_p((void **)&elm->prof_tctx); + } } else ret = huge_prof_tctx_get(ptr); @@ -965,8 +968,11 @@ arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; assert(arena_mapbits_allocated_get(chunk, pageind) != 0); - if (unlikely(arena_mapbits_large_get(chunk, pageind) != 0)) - arena_miscelm_get(chunk, pageind)->prof_tctx = tctx; + if (unlikely(arena_mapbits_large_get(chunk, pageind) != 0)) { + arena_chunk_map_misc_t *elm = arena_miscelm_get(chunk, + pageind); + atomic_write_p((void **)&elm->prof_tctx, tctx); + } } else huge_prof_tctx_set(ptr, tctx); } -- cgit v0.12 From ab5e3790f6bc2dc0c4d7c3d537387cf2563456ff Mon Sep 17 00:00:00 2001 From: Dan McGregor Date: Tue, 23 Dec 2014 16:09:32 -0600 Subject: Build docs in object directory --- Makefile.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.in b/Makefile.in index da397c3..b1d88af 100644 --- a/Makefile.in +++ b/Makefile.in @@ -104,8 +104,8 @@ endif PC := $(objroot)jemalloc.pc MAN3 := $(objroot)doc/jemalloc$(install_suffix).3 DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml -DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.html) -DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(srcroot)%.3) +DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.html) +DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.3) DOCS := $(DOCS_HTML) $(DOCS_MAN3) C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \ $(srcroot)test/src/btalloc_1.c $(srcroot)test/src/math.c \ @@ -181,10 +181,10 @@ all: build_lib dist: build_doc -$(srcroot)doc/%.html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl +$(objroot)doc/%.html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl $(XSLTPROC) -o $@ $(objroot)doc/html.xsl $< -$(srcroot)doc/%.3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl +$(objroot)doc/%.3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl $(XSLTPROC) -o $@ $(objroot)doc/manpages.xsl $< build_doc_html: $(DOCS_HTML) -- cgit v0.12 From f8880310ebb0ad5e1acce6e9886395e20041a32f Mon Sep 17 00:00:00 2001 From: Dan McGregor Date: Tue, 23 Dec 2014 16:10:08 -0600 Subject: Put VERSION file in object directory Also allow for the possibility that there exists a VERSION file in the srcroot, in case of building from a release tarball out of tree. 
--- Makefile.in | 2 +- configure.ac | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/Makefile.in b/Makefile.in index b1d88af..a105bb1 100644 --- a/Makefile.in +++ b/Makefile.in @@ -418,7 +418,7 @@ distclean: clean relclean: distclean rm -f $(objroot)configure - rm -f $(srcroot)VERSION + rm -f $(objroot)VERSION rm -f $(DOCS_HTML) rm -f $(DOCS_MAN3) diff --git a/configure.ac b/configure.ac index 2922880..240d27a 100644 --- a/configure.ac +++ b/configure.ac @@ -1046,32 +1046,36 @@ dnl jemalloc configuration. dnl dnl Set VERSION if source directory is inside a git repository. -if test "x`git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then +if test "x`test ! \"${srcroot}\" && cd \"${srcroot}\"; git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then dnl Pattern globs aren't powerful enough to match both single- and dnl double-digit version numbers, so iterate over patterns to support up to dnl version 99.99.99 without any accidental matches. - rm -f "${srcroot}VERSION" + rm -f "${objroot}VERSION" for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \ '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \ '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \ '[0-9][0-9].[0-9][0-9].[0-9]' \ '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do - if test ! -e "${srcroot}VERSION" ; then - git describe --long --abbrev=40 --match="${pattern}" > "${srcroot}VERSION.tmp" 2>/dev/null + if test ! -e "${objroot}VERSION" ; then + (test ! "${srcroot}" && cd "${srcroot}"; git describe --long --abbrev=40 --match="${pattern}") > "${objroot}VERSION.tmp" 2>/dev/null if test $? -eq 0 ; then - mv "${srcroot}VERSION.tmp" "${srcroot}VERSION" + mv "${objroot}VERSION.tmp" "${objroot}VERSION" break fi fi done fi -rm -f "${srcroot}VERSION.tmp" -if test ! -e "${srcroot}VERSION" ; then - AC_MSG_RESULT( - [Missing VERSION file, and unable to generate it; creating bogus VERSION]) - echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${srcroot}VERSION" +rm -f "${objroot}VERSION.tmp" +if test ! -e "${objroot}VERSION" ; then + if test ! -e "${srcroot}VERSION" ; then + AC_MSG_RESULT( + [Missing VERSION file, and unable to generate it; creating bogus VERSION]) + echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${objroot}VERSION" + else + cp ${srcroot}VERSION ${objroot}VERSION + fi fi -jemalloc_version=`cat "${srcroot}VERSION"` +jemalloc_version=`cat "${objroot}VERSION"` jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'` jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'` jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'` -- cgit v0.12 From feaaa3df0da9972b9c5016c55b886e54853cc855 Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Wed, 11 Feb 2015 14:38:10 -0500 Subject: Take into account the install suffix that jemalloc was built with in the pkg-config file. Signed-off-by: Abhishek Kulkarni --- jemalloc.pc.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jemalloc.pc.in b/jemalloc.pc.in index af3f945..1a3ad9b 100644 --- a/jemalloc.pc.in +++ b/jemalloc.pc.in @@ -2,10 +2,11 @@ prefix=@prefix@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ +install_suffix=@install_suffix@ Name: jemalloc Description: A general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support. 
URL: http://www.canonware.com/jemalloc Version: @jemalloc_version@ Cflags: -I${includedir} -Libs: -L${libdir} -ljemalloc +Libs: -L${libdir} -ljemalloc${install_suffix} -- cgit v0.12 From 41cfe03f39740fe61cf46d86982f66c24168de32 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 13 Feb 2015 15:28:56 -0800 Subject: If MALLOCX_ARENA(a) is specified, use it during tcache fill. --- include/jemalloc/internal/arena.h | 26 ++++++++++++-------------- include/jemalloc/internal/tcache.h | 28 +++++++++++++++------------- src/tcache.c | 19 ++++++++++--------- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 4d88736..b195daf 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -985,28 +985,26 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, assert(size != 0); assert(size <= arena_maxclass); + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) + return (NULL); + if (likely(size <= SMALL_MAXCLASS)) { - if (likely(tcache != NULL)) - return (tcache_alloc_small(tsd, tcache, size, zero)); - else { - arena = arena_choose(tsd, arena); - if (unlikely(arena == NULL)) - return (NULL); + if (likely(tcache != NULL)) { + return (tcache_alloc_small(tsd, arena, tcache, size, + zero)); + } else return (arena_malloc_small(arena, size, zero)); - } } else if (likely(size <= arena_maxclass)) { /* * Initialize tcache after checking size in order to avoid * infinite recursion during tcache initialization. */ - if (likely(tcache != NULL) && size <= tcache_maxclass) - return (tcache_alloc_large(tsd, tcache, size, zero)); - else { - arena = arena_choose(tsd, arena); - if (unlikely(arena == NULL)) - return (NULL); + if (likely(tcache != NULL) && size <= tcache_maxclass) { + return (tcache_alloc_large(tsd, arena, tcache, size, + zero)); + } else return (arena_malloc_large(arena, size, zero)); - } } else return (huge_malloc(tsd, arena, size, zero, tcache)); } diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 2a3952b..d2443b1 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -120,10 +120,10 @@ extern tcaches_t *tcaches; size_t tcache_salloc(const void *ptr); void tcache_event_hard(tsd_t *tsd, tcache_t *tcache); -void *tcache_alloc_small_hard(tsd_t *tsd, tcache_t *tcache, +void *tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache, tcache_bin_t *tbin, index_t binind); -void tcache_bin_flush_small(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, - unsigned rem, tcache_t *tcache); +void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, + index_t binind, unsigned rem); void tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, unsigned rem, tcache_t *tcache); void tcache_arena_associate(tcache_t *tcache, arena_t *arena); @@ -151,10 +151,10 @@ bool tcache_enabled_get(void); tcache_t *tcache_get(tsd_t *tsd, bool create); void tcache_enabled_set(bool enabled); void *tcache_alloc_easy(tcache_bin_t *tbin); -void *tcache_alloc_small(tsd_t *tsd, tcache_t *tcache, size_t size, - bool zero); -void *tcache_alloc_large(tsd_t *tsd, tcache_t *tcache, size_t size, - bool zero); +void *tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, + size_t size, bool zero); +void *tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, + size_t size, bool zero); void tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, index_t binind); 
void tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, @@ -258,7 +258,8 @@ tcache_alloc_easy(tcache_bin_t *tbin) } JEMALLOC_ALWAYS_INLINE void * -tcache_alloc_small(tsd_t *tsd, tcache_t *tcache, size_t size, bool zero) +tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, + bool zero) { void *ret; index_t binind; @@ -271,7 +272,7 @@ tcache_alloc_small(tsd_t *tsd, tcache_t *tcache, size_t size, bool zero) usize = index2size(binind); ret = tcache_alloc_easy(tbin); if (unlikely(ret == NULL)) { - ret = tcache_alloc_small_hard(tsd, tcache, tbin, binind); + ret = tcache_alloc_small_hard(tsd, arena, tcache, tbin, binind); if (ret == NULL) return (NULL); } @@ -302,7 +303,8 @@ tcache_alloc_small(tsd_t *tsd, tcache_t *tcache, size_t size, bool zero) } JEMALLOC_ALWAYS_INLINE void * -tcache_alloc_large(tsd_t *tsd, tcache_t *tcache, size_t size, bool zero) +tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, + bool zero) { void *ret; index_t binind; @@ -320,7 +322,7 @@ tcache_alloc_large(tsd_t *tsd, tcache_t *tcache, size_t size, bool zero) * Only allocate one large object at a time, because it's quite * expensive to create one and not use it. */ - ret = arena_malloc_large(arena_choose(tsd, NULL), usize, zero); + ret = arena_malloc_large(arena, usize, zero); if (ret == NULL) return (NULL); } else { @@ -366,8 +368,8 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, index_t binind) tbin = &tcache->tbins[binind]; tbin_info = &tcache_bin_info[binind]; if (unlikely(tbin->ncached == tbin_info->ncached_max)) { - tcache_bin_flush_small(tsd, tbin, binind, - (tbin_info->ncached_max >> 1), tcache); + tcache_bin_flush_small(tsd, tcache, tbin, binind, + (tbin_info->ncached_max >> 1)); } assert(tbin->ncached < tbin_info->ncached_max); tbin->avail[tbin->ncached] = ptr; diff --git a/src/tcache.c b/src/tcache.c index 10c85dd..318e0dc 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -41,8 +41,9 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) * Flush (ceiling) 3/4 of the objects below the low water mark. */ if (binind < NBINS) { - tcache_bin_flush_small(tsd, tbin, binind, tbin->ncached - - tbin->low_water + (tbin->low_water >> 2), tcache); + tcache_bin_flush_small(tsd, tcache, tbin, binind, + tbin->ncached - tbin->low_water + (tbin->low_water + >> 2)); } else { tcache_bin_flush_large(tsd, tbin, binind, tbin->ncached - tbin->low_water + (tbin->low_water >> 2), tcache); @@ -70,13 +71,13 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) } void * -tcache_alloc_small_hard(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, - index_t binind) +tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache, + tcache_bin_t *tbin, index_t binind) { void *ret; - arena_tcache_fill_small(arena_choose(tsd, NULL), tbin, binind, - config_prof ? tcache->prof_accumbytes : 0); + arena_tcache_fill_small(arena, tbin, binind, config_prof ? 
+ tcache->prof_accumbytes : 0); if (config_prof) tcache->prof_accumbytes = 0; ret = tcache_alloc_easy(tbin); @@ -85,8 +86,8 @@ tcache_alloc_small_hard(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, } void -tcache_bin_flush_small(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, - unsigned rem, tcache_t *tcache) +tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, + index_t binind, unsigned rem) { arena_t *arena; void *ptr; @@ -350,7 +351,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache) for (i = 0; i < NBINS; i++) { tcache_bin_t *tbin = &tcache->tbins[i]; - tcache_bin_flush_small(tsd, tbin, i, 0, tcache); + tcache_bin_flush_small(tsd, tcache, tbin, i, 0); if (config_stats && tbin->tstats.nrequests != 0) { arena_bin_t *bin = &arena->bins[i]; -- cgit v0.12 From b01186cebd9828e91a488d86980544bacb01e1a6 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 15 Feb 2015 14:04:55 -0800 Subject: Remove redundant tcache_boot() call. --- src/jemalloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 3903209..d511009 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1193,8 +1193,6 @@ malloc_init_hard_a0_locked(void) arena_boot(); if (config_tcache && tcache_boot()) return (true); - if (config_tcache && tcache_boot()) - malloc_mutex_unlock(&init_lock); if (malloc_mutex_init(&arenas_lock)) return (true); /* -- cgit v0.12 From 2195ba4e1f8f262b7e6586106d90f4dc0aea7630 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 15 Feb 2015 16:43:52 -0800 Subject: Normalize *_link and link_* fields to all be *_link. --- include/jemalloc/internal/extent.h | 6 +++--- src/arena.c | 8 ++++---- src/extent.c | 5 ++--- src/huge.c | 6 +++--- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index fbcdcf9..885f475 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -34,14 +34,14 @@ struct extent_node_s { union { /* Linkage for the size/address-ordered tree. */ - rb_node(extent_node_t) link_szad; + rb_node(extent_node_t) szad_link; /* Linkage for huge allocations and cached chunks nodes. */ - ql_elm(extent_node_t) link_ql; + ql_elm(extent_node_t) ql_link; }; /* Linkage for the address-ordered tree. 
*/ - rb_node(extent_node_t) link_ad; + rb_node(extent_node_t) ad_link; }; typedef rb_tree(extent_node_t) extent_tree_t; diff --git a/src/arena.c b/src/arena.c index 7b441be..ce500f4 100644 --- a/src/arena.c +++ b/src/arena.c @@ -607,12 +607,12 @@ arena_node_alloc(arena_t *arena) extent_node_t *node; malloc_mutex_lock(&arena->node_cache_mtx); - node = ql_last(&arena->node_cache, link_ql); + node = ql_last(&arena->node_cache, ql_link); if (node == NULL) { malloc_mutex_unlock(&arena->node_cache_mtx); return (base_alloc(sizeof(extent_node_t))); } - ql_tail_remove(&arena->node_cache, extent_node_t, link_ql); + ql_tail_remove(&arena->node_cache, extent_node_t, ql_link); malloc_mutex_unlock(&arena->node_cache_mtx); return (node); } @@ -622,8 +622,8 @@ arena_node_dalloc(arena_t *arena, extent_node_t *node) { malloc_mutex_lock(&arena->node_cache_mtx); - ql_elm_new(node, link_ql); - ql_tail_insert(&arena->node_cache, node, link_ql); + ql_elm_new(node, ql_link); + ql_tail_insert(&arena->node_cache, node, ql_link); malloc_mutex_unlock(&arena->node_cache_mtx); } diff --git a/src/extent.c b/src/extent.c index ca85201..60e2468 100644 --- a/src/extent.c +++ b/src/extent.c @@ -22,7 +22,7 @@ extent_szad_comp(extent_node_t *a, extent_node_t *b) } /* Generate red-black tree functions. */ -rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, link_szad, +rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, szad_link, extent_szad_comp) JEMALLOC_INLINE_C int @@ -35,5 +35,4 @@ extent_ad_comp(extent_node_t *a, extent_node_t *b) } /* Generate red-black tree functions. */ -rb_gen(, extent_tree_ad_, extent_tree_t, extent_node_t, link_ad, - extent_ad_comp) +rb_gen(, extent_tree_ad_, extent_tree_t, extent_node_t, ad_link, extent_ad_comp) diff --git a/src/huge.c b/src/huge.c index 0032727..bc7d99c 100644 --- a/src/huge.c +++ b/src/huge.c @@ -86,8 +86,8 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, /* Insert node into huge. */ malloc_mutex_lock(&arena->huge_mtx); - ql_elm_new(node, link_ql); - ql_tail_insert(&arena->huge, node, link_ql); + ql_elm_new(node, ql_link); + ql_tail_insert(&arena->huge, node, ql_link); malloc_mutex_unlock(&arena->huge_mtx); if (zero || (config_fill && unlikely(opt_zero))) { @@ -361,7 +361,7 @@ huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) arena = node->arena; huge_node_unset(ptr, node); malloc_mutex_lock(&arena->huge_mtx); - ql_remove(&arena->huge, node, link_ql); + ql_remove(&arena->huge, node, ql_link); malloc_mutex_unlock(&arena->huge_mtx); huge_dalloc_junk(node->addr, node->size); -- cgit v0.12 From 02e5dcf39d4995d2f37d0b18aa8511973938ac51 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 15 Feb 2015 20:12:06 -0800 Subject: Fix --enable-debug regression. Fix --enable-debug to actually enable debug mode. This regression was introduced by cbf3a6d70371d2390b8b0e76814e04cc6088002c (Move centralized chunk management into arenas.). --- configure.ac | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configure.ac b/configure.ac index 240d27a..7a694a2 100644 --- a/configure.ac +++ b/configure.ac @@ -634,6 +634,9 @@ fi ], [enable_debug="0"] ) +if test "x$enable_debug" = "x1" ; then + AC_DEFINE([JEMALLOC_DEBUG], [ ]) +fi AC_SUBST([enable_debug]) dnl Only optimize if not debugging. -- cgit v0.12 From cb9b44914e7e25c6b08af7124d7f8f976e059555 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 15 Feb 2015 20:13:28 -0800 Subject: Remove obsolete (incorrect) assertions. 
This regression was introduced by 88fef7ceda6269598cef0cee8b984c8765673c27 (Refactor huge_*() calls into arena internals.), and went undetected because of the --enable-debug regression. --- include/jemalloc/internal/arena.h | 2 -- test/integration/mallocx.c | 45 +++++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index b195daf..232e9a6 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -983,7 +983,6 @@ arena_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, { assert(size != 0); - assert(size <= arena_maxclass); arena = arena_choose(tsd, arena); if (unlikely(arena == NULL)) @@ -1031,7 +1030,6 @@ arena_salloc(const void *ptr, bool demote) index_t binind; assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (likely(chunk != ptr)) { diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c index 123e041..23129c2 100644 --- a/test/integration/mallocx.c +++ b/test/integration/mallocx.c @@ -2,34 +2,37 @@ #define CHUNK 0x400000 #define MAXALIGN (((size_t)1) << 25) +#define MAXSZ (((size_t)1) << 26) #define NITER 4 TEST_BEGIN(test_basic) { - size_t nsz, rsz, sz; - void *p; + size_t sz; - sz = 42; - nsz = nallocx(sz, 0); - assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); - p = mallocx(sz, 0); - assert_ptr_not_null(p, "Unexpected mallocx() error"); - rsz = sallocx(p, 0); - assert_zu_ge(rsz, sz, "Real size smaller than expected"); - assert_zu_eq(nsz, rsz, "nallocx()/sallocx() size mismatch"); - dallocx(p, 0); + for (sz = 1; sz < MAXSZ; sz = nallocx(sz, 0) + 1) { + size_t nsz, rsz; + void *p; + nsz = nallocx(sz, 0); + assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); + p = mallocx(sz, 0); + assert_ptr_not_null(p, "Unexpected mallocx() error"); + rsz = sallocx(p, 0); + assert_zu_ge(rsz, sz, "Real size smaller than expected"); + assert_zu_eq(nsz, rsz, "nallocx()/sallocx() size mismatch"); + dallocx(p, 0); - p = mallocx(sz, 0); - assert_ptr_not_null(p, "Unexpected mallocx() error"); - dallocx(p, 0); + p = mallocx(sz, 0); + assert_ptr_not_null(p, "Unexpected mallocx() error"); + dallocx(p, 0); - nsz = nallocx(sz, MALLOCX_ZERO); - assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); - p = mallocx(sz, MALLOCX_ZERO); - assert_ptr_not_null(p, "Unexpected mallocx() error"); - rsz = sallocx(p, 0); - assert_zu_eq(nsz, rsz, "nallocx()/sallocx() rsize mismatch"); - dallocx(p, 0); + nsz = nallocx(sz, MALLOCX_ZERO); + assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); + p = mallocx(sz, MALLOCX_ZERO); + assert_ptr_not_null(p, "Unexpected mallocx() error"); + rsz = sallocx(p, 0); + assert_zu_eq(nsz, rsz, "nallocx()/sallocx() rsize mismatch"); + dallocx(p, 0); + } } TEST_END -- cgit v0.12 From 40ab8f98e42fda3816e2a993f136ec4770c202c7 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 15 Feb 2015 20:26:45 -0800 Subject: Remove more obsolete (incorrect) assertions. This regression was introduced by 88fef7ceda6269598cef0cee8b984c8765673c27 (Refactor huge_*() calls into arena internals.), and went undetected because of the --enable-debug regression. 
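The dropped assertions encoded the pre-refactor assumption that a pointer passed to these functions is never chunk-aligned; after the huge_*() refactor, a chunk-aligned pointer simply identifies a huge allocation and is handled by a branch instead. A minimal sketch of the distinction (assumes jemalloc's internal headers; the _sketch name is hypothetical, while CHUNK_ADDR2BASE() is jemalloc's existing macro):

/* Sketch: the test that replaces the removed assertions. */
static bool
ptr_is_huge_sketch(const void *ptr)
{
	arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);

	/*
	 * Huge allocations are chunk-aligned, so CHUNK_ADDR2BASE(ptr) == ptr
	 * is a legitimate case that is dispatched to the huge_*() functions
	 * rather than asserted impossible.
	 */
	return (chunk == ptr);
}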
--- include/jemalloc/internal/arena.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 232e9a6..6341a86 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -934,7 +934,6 @@ arena_prof_tctx_get(const void *ptr) cassert(config_prof); assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (likely(chunk != ptr)) { @@ -961,7 +960,6 @@ arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) cassert(config_prof); assert(ptr != NULL); - assert(CHUNK_ADDR2BASE(ptr) != ptr); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (likely(chunk != ptr)) { -- cgit v0.12 From ee41ad409a43d12900a5a3108f6c14f84e4eb0eb Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 15 Feb 2015 18:04:46 -0800 Subject: Integrate whole chunks into unused dirty page purging machinery. Extend per arena unused dirty page purging to manage unused dirty chunks in addition to unused dirty runs. Rather than immediately unmapping deallocated chunks (or purging them in the --disable-munmap case), store them in a separate set of trees, chunks_[sz]ad_dirty. Preferentially allocate dirty chunks. When excessive unused dirty pages accumulate, purge runs and chunks in integrated LRU order (and unmap chunks in the --enable-munmap case). Refactor extent_node_t to provide accessor functions. --- include/jemalloc/internal/arena.h | 64 ++-- include/jemalloc/internal/chunk.h | 4 +- include/jemalloc/internal/extent.h | 137 +++++++- include/jemalloc/internal/jemalloc_internal.h.in | 8 +- include/jemalloc/internal/private_symbols.txt | 15 + src/arena.c | 403 ++++++++++++++++------- src/base.c | 16 +- src/chunk.c | 144 +++++--- src/chunk_dss.c | 8 +- src/extent.c | 12 +- src/huge.c | 61 ++-- src/tcache.c | 9 +- 12 files changed, 632 insertions(+), 249 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 6341a86..f967be3 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -35,6 +35,7 @@ typedef struct arena_s arena_t; /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS +#ifdef JEMALLOC_ARENA_STRUCTS_A struct arena_run_s { /* Index of bin this run is associated with. */ index_t binind; @@ -136,7 +137,7 @@ struct arena_chunk_map_misc_s { union { /* Linkage for list of dirty runs. */ - ql_elm(arena_chunk_map_misc_t) dr_link; + qr(arena_chunk_map_misc_t) rd_link; /* Profile counters, used for large object runs. */ prof_tctx_t *prof_tctx; @@ -147,14 +148,16 @@ }; typedef rb_tree(arena_chunk_map_misc_t) arena_avail_tree_t; typedef rb_tree(arena_chunk_map_misc_t) arena_run_tree_t; -typedef ql_head(arena_chunk_map_misc_t) arena_chunk_miscelms_t; +typedef qr(arena_chunk_map_misc_t) arena_chunk_miscelms_t; +#endif /* JEMALLOC_ARENA_STRUCTS_A */ +#ifdef JEMALLOC_ARENA_STRUCTS_B /* Arena chunk header. */ struct arena_chunk_s { /* - * The arena that owns the chunk is node.arena. This field as a whole - * is used by chunks_rtree to support both ivsalloc() and core-based - * debugging. + * A pointer to the arena that owns the chunk is stored within the node. + * This field as a whole is used by chunks_rtree to support both + * ivsalloc() and core-based debugging. */ extent_node_t node; @@ -309,13 +312,29 @@ struct arena_s { size_t ndirty; /* - * Size/address-ordered trees of this arena's available runs. 
The trees - * are used for first-best-fit run allocation. + * Size/address-ordered tree of this arena's available runs. The tree + * is used for first-best-fit run allocation. */ arena_avail_tree_t runs_avail; - /* List of dirty runs this arena manages. */ - arena_chunk_miscelms_t runs_dirty; + /* + * Unused dirty memory this arena manages. Dirty memory is conceptually + * tracked as an arbitrarily interleaved LRU of runs and chunks, but the + * list linkage is actually semi-duplicated in order to avoid extra + * arena_chunk_map_misc_t space overhead. + * + * LRU-----------------------------------------------------------MRU + * + * ______________ ___ ___ + * ...-->|chunks_dirty|<--------->|c|<-------------------->|c|<--... + * -------------- |h| |h| + * ____________ _____ |u| _____ _____ |u| + * ...-->|runs_dirty|<-->|run|<-->|n|<-->|run|<-->|run|<-->|n|<--... + * ------------ ----- |k| ----- ----- |k| + * --- --- + */ + arena_chunk_map_misc_t runs_dirty; + extent_node_t chunks_dirty; /* Extant huge allocations. */ ql_head(extent_node_t) huge; @@ -329,6 +348,8 @@ struct arena_s { * orderings are needed, which is why there are two trees with the same * contents. */ + extent_tree_t chunks_szad_dirty; + extent_tree_t chunks_ad_dirty; extent_tree_t chunks_szad_mmap; extent_tree_t chunks_ad_mmap; extent_tree_t chunks_szad_dss; @@ -347,6 +368,7 @@ struct arena_s { /* bins is used to store trees of free regions. */ arena_bin_t bins[NBINS]; }; +#endif /* JEMALLOC_ARENA_STRUCTS_B */ #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ @@ -363,6 +385,10 @@ extern size_t arena_maxclass; /* Max size class for arenas. */ extern unsigned nlclasses; /* Number of large size classes. */ extern unsigned nhclasses; /* Number of huge size classes. 
*/ +void arena_chunk_dirty_maybe_insert(arena_t *arena, extent_node_t *node, + bool dirty); +void arena_chunk_dirty_maybe_remove(arena_t *arena, extent_node_t *node, + bool dirty); extent_node_t *arena_node_alloc(arena_t *arena); void arena_node_dalloc(arena_t *arena, extent_node_t *node); void *arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, @@ -818,7 +844,7 @@ arena_ptr_small_binind_get(const void *ptr, size_t mapbits) assert(binind != BININD_INVALID); assert(binind < NBINS); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - arena = chunk->node.arena; + arena = extent_node_arena_get(&chunk->node); pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; actual_mapbits = arena_mapbits_get(chunk, pageind); assert(mapbits == actual_mapbits); @@ -1013,7 +1039,7 @@ arena_aalloc(const void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (likely(chunk != ptr)) - return (chunk->node.arena); + return (extent_node_arena_get(&chunk->node)); else return (huge_aalloc(ptr)); } @@ -1085,8 +1111,8 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) mapbits); tcache_dalloc_small(tsd, tcache, ptr, binind); } else { - arena_dalloc_small(chunk->node.arena, chunk, - ptr, pageind); + arena_dalloc_small(extent_node_arena_get( + &chunk->node), chunk, ptr, pageind); } } else { size_t size = arena_mapbits_large_size_get(chunk, @@ -1097,8 +1123,8 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) if (likely(tcache != NULL) && size <= tcache_maxclass) tcache_dalloc_large(tsd, tcache, ptr, size); else { - arena_dalloc_large(chunk->node.arena, chunk, - ptr); + arena_dalloc_large(extent_node_arena_get( + &chunk->node), chunk, ptr); } } } else @@ -1136,8 +1162,8 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) } else { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - arena_dalloc_small(chunk->node.arena, chunk, - ptr, pageind); + arena_dalloc_small(extent_node_arena_get( + &chunk->node), chunk, ptr, pageind); } } else { assert(((uintptr_t)ptr & PAGE_MASK) == 0); @@ -1145,8 +1171,8 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) if (likely(tcache != NULL) && size <= tcache_maxclass) tcache_dalloc_large(tsd, tcache, ptr, size); else { - arena_dalloc_large(chunk->node.arena, chunk, - ptr); + arena_dalloc_large(extent_node_arena_get( + &chunk->node), chunk, ptr); } } } else diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 5e0fb14..96b9e15 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -44,8 +44,10 @@ void *chunk_alloc_arena(chunk_alloc_t *chunk_alloc, size_t size, size_t alignment, bool *zero); void *chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind); -void chunk_unmap(arena_t *arena, void *chunk, size_t size); +void chunk_record(arena_t *arena, extent_tree_t *chunks_szad, + extent_tree_t *chunks_ad, bool dirty, void *chunk, size_t size); bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind); +void chunk_unmap(arena_t *arena, void *chunk, size_t size); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 885f475..1060761 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -7,36 +7,48 @@ typedef struct extent_node_s extent_node_t; /******************************************************************************/ #ifdef 
JEMALLOC_H_STRUCTS -/* Tree of extents. */ +/* Tree of extents. Use accessor functions for en_* fields. */ struct extent_node_s { /* Arena from which this extent came, if any. */ - arena_t *arena; + arena_t *en_arena; /* Pointer to the extent that this tree node is responsible for. */ - void *addr; + void *en_addr; + + /* Total region size. */ + size_t en_size; /* - * Total region size, or 0 if this node corresponds to an arena chunk. + * The zeroed flag is used by chunk recycling code to track whether + * memory is zero-filled. */ - size_t size; + bool en_zeroed; /* - * 'prof_tctx' and 'zeroed' are never needed at the same time, so - * overlay them in order to fit extent_node_t in one cache line. + * The achunk flag is used to validate that huge allocation lookups + * don't return arena chunks. */ + bool en_achunk; + union { /* Profile counters, used for huge objects. */ - prof_tctx_t *prof_tctx; - - /* True if zero-filled; used by chunk recycling code. */ - bool zeroed; + prof_tctx_t *en_prof_tctx; + + struct { + /* + * Linkage for arena's runs_dirty and chunks_dirty + * rings. + */ + qr(extent_node_t) cd_link; + arena_chunk_map_misc_t runs_dirty; + }; }; union { /* Linkage for the size/address-ordered tree. */ rb_node(extent_node_t) szad_link; - /* Linkage for huge allocations and cached chunks nodes. */ + /* Linkage for arena's huge and node_cache lists. */ ql_elm(extent_node_t) ql_link; }; @@ -57,6 +69,107 @@ rb_proto(, extent_tree_ad_, extent_tree_t, extent_node_t) /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +arena_t *extent_node_arena_get(const extent_node_t *node); +void *extent_node_addr_get(const extent_node_t *node); +size_t extent_node_size_get(const extent_node_t *node); +bool extent_node_zeroed_get(const extent_node_t *node); +bool extent_node_achunk_get(const extent_node_t *node); +prof_tctx_t *extent_node_prof_tctx_get(const extent_node_t *node); +void extent_node_arena_set(extent_node_t *node, arena_t *arena); +void extent_node_addr_set(extent_node_t *node, void *addr); +void extent_node_size_set(extent_node_t *node, size_t size); +void extent_node_zeroed_set(extent_node_t *node, bool zeroed); +void extent_node_achunk_set(extent_node_t *node, bool achunk); +void extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_EXTENT_C_)) +JEMALLOC_INLINE arena_t * +extent_node_arena_get(const extent_node_t *node) +{ + + return (node->en_arena); +} + +JEMALLOC_INLINE void * +extent_node_addr_get(const extent_node_t *node) +{ + + return (node->en_addr); +} + +JEMALLOC_INLINE size_t +extent_node_size_get(const extent_node_t *node) +{ + + return (node->en_size); +} + +JEMALLOC_INLINE bool +extent_node_zeroed_get(const extent_node_t *node) +{ + + return (node->en_zeroed); +} + +JEMALLOC_INLINE bool +extent_node_achunk_get(const extent_node_t *node) +{ + + return (node->en_achunk); +} + +JEMALLOC_INLINE prof_tctx_t * +extent_node_prof_tctx_get(const extent_node_t *node) +{ + + return (node->en_prof_tctx); +} + +JEMALLOC_INLINE void +extent_node_arena_set(extent_node_t *node, arena_t *arena) +{ + + node->en_arena = arena; +} + +JEMALLOC_INLINE void +extent_node_addr_set(extent_node_t *node, void *addr) +{ + + node->en_addr = addr; +} + +JEMALLOC_INLINE void +extent_node_size_set(extent_node_t *node, size_t size) +{ + + node->en_size = size; +} + +JEMALLOC_INLINE void +extent_node_zeroed_set(extent_node_t *node, bool 
zeroed) +{ + + node->en_zeroed = zeroed; +} + +JEMALLOC_INLINE void +extent_node_achunk_set(extent_node_t *node, bool achunk) +{ + + node->en_achunk = achunk; +} + +JEMALLOC_INLINE void +extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx) +{ + + node->en_prof_tctx = tctx; +} +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 43276c6..8ed69ce 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -368,8 +368,13 @@ typedef unsigned index_t; #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" #include "jemalloc/internal/bitmap.h" +#define JEMALLOC_ARENA_STRUCTS_A +#include "jemalloc/internal/arena.h" +#undef JEMALLOC_ARENA_STRUCTS_A #include "jemalloc/internal/extent.h" +#define JEMALLOC_ARENA_STRUCTS_B #include "jemalloc/internal/arena.h" +#undef JEMALLOC_ARENA_STRUCTS_B #include "jemalloc/internal/base.h" #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/chunk.h" @@ -933,7 +938,8 @@ ivsalloc(const void *ptr, bool demote) if (node == NULL) return (0); /* Only arena chunks should be looked up via interior pointers. */ - assert(node->addr == ptr || node->size == 0); + assert(extent_node_addr_get(node) == ptr || + extent_node_achunk_get(node)); return (isalloc(ptr, demote)); } diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index d5601a6..a1d12cf 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -13,6 +13,8 @@ arena_choose arena_choose_hard arena_chunk_alloc_huge arena_chunk_dalloc_huge +arena_chunk_dirty_maybe_insert +arena_chunk_dirty_maybe_remove arena_chunk_ralloc_huge_expand arena_chunk_ralloc_huge_shrink arena_chunk_ralloc_huge_similar @@ -143,6 +145,7 @@ chunk_npages chunk_postfork_child chunk_postfork_parent chunk_prefork +chunk_record chunk_register chunk_unmap chunks_rtree @@ -173,6 +176,18 @@ ctl_postfork_child ctl_postfork_parent ctl_prefork dss_prec_names +extent_node_achunk_get +extent_node_achunk_set +extent_node_addr_get +extent_node_addr_set +extent_node_arena_get +extent_node_arena_set +extent_node_prof_tctx_get +extent_node_prof_tctx_set +extent_node_size_get +extent_node_size_set +extent_node_zeroed_get +extent_node_zeroed_set extent_tree_ad_empty extent_tree_ad_first extent_tree_ad_insert diff --git a/src/arena.c b/src/arena.c index ce500f4..a7a98e2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -112,34 +112,94 @@ arena_avail_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, } static void -arena_dirty_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, +arena_run_dirty_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, size_t npages) { arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); + assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); assert(arena_mapbits_dirty_get(chunk, pageind) == CHUNK_MAP_DIRTY); assert(arena_mapbits_dirty_get(chunk, pageind+npages-1) == CHUNK_MAP_DIRTY); - ql_elm_new(miscelm, dr_link); - ql_tail_insert(&arena->runs_dirty, miscelm, dr_link); + + qr_new(miscelm, rd_link); + qr_meld(&arena->runs_dirty, miscelm, rd_link); arena->ndirty += npages; } static void -arena_dirty_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, 
+arena_run_dirty_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, size_t npages) { arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); + assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); assert(arena_mapbits_dirty_get(chunk, pageind) == CHUNK_MAP_DIRTY); assert(arena_mapbits_dirty_get(chunk, pageind+npages-1) == CHUNK_MAP_DIRTY); - ql_remove(&arena->runs_dirty, miscelm, dr_link); + + qr_remove(miscelm, rd_link); + assert(arena->ndirty >= npages); arena->ndirty -= npages; } +static size_t +arena_chunk_dirty_npages(const extent_node_t *node) +{ + + return (extent_node_size_get(node) >> LG_PAGE); +} + +static void +arena_chunk_dirty_node_init(extent_node_t *node) +{ + + qr_new(node, cd_link); + qr_new(&node->runs_dirty, rd_link); +} + +static void +arena_chunk_dirty_insert(arena_chunk_map_misc_t *runs_dirty, + extent_node_t *chunks_dirty, extent_node_t *node) +{ + + qr_meld(chunks_dirty, node, cd_link); + qr_meld(runs_dirty, &node->runs_dirty, rd_link); +} + +static void +arena_chunk_dirty_remove(extent_node_t *node) +{ + + qr_remove(node, cd_link); + qr_remove(&node->runs_dirty, rd_link); +} + +void +arena_chunk_dirty_maybe_insert(arena_t *arena, extent_node_t *node, bool dirty) +{ + + arena_chunk_dirty_node_init(node); + if (dirty) { + arena_chunk_dirty_insert(&arena->runs_dirty, + &arena->chunks_dirty, node); + arena->ndirty += arena_chunk_dirty_npages(node); + } +} + +void +arena_chunk_dirty_maybe_remove(arena_t *arena, extent_node_t *node, bool dirty) +{ + + if (dirty) { + arena_chunk_dirty_remove(node); + assert(arena->ndirty >= arena_chunk_dirty_npages(node)); + arena->ndirty -= arena_chunk_dirty_npages(node); + } +} + JEMALLOC_INLINE_C void * arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { @@ -243,7 +303,7 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, arena_avail_remove(arena, chunk, run_ind, total_pages); if (flag_dirty != 0) - arena_dirty_remove(arena, chunk, run_ind, total_pages); + arena_run_dirty_remove(arena, chunk, run_ind, total_pages); arena_cactive_update(arena, need_pages, 0); arena->nactive += need_pages; @@ -256,7 +316,7 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, arena_mapbits_unallocated_set(chunk, run_ind+total_pages-1, (rem_pages << LG_PAGE), flag_dirty); - arena_dirty_insert(arena, chunk, run_ind+need_pages, + arena_run_dirty_insert(arena, chunk, run_ind+need_pages, rem_pages); } else { arena_mapbits_unallocated_set(chunk, run_ind+need_pages, @@ -405,9 +465,10 @@ arena_chunk_alloc_internal(arena_t *arena, bool *zero) chunk = (arena_chunk_t *)chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, NULL, chunksize, chunksize, zero); if (chunk != NULL) { - chunk->node.arena = arena; - chunk->node.addr = chunk; - chunk->node.size = 0; /* Indicates this is an arena chunk. 
*/ + extent_node_arena_set(&chunk->node, arena); + extent_node_addr_set(&chunk->node, chunk); + extent_node_size_set(&chunk->node, chunksize); + extent_node_achunk_set(&chunk->node, true); if (chunk_register(chunk, &chunk->node)) { chunk_dalloc((void *)chunk, chunksize, arena->ind); chunk = NULL; @@ -516,7 +577,7 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) arena->spare = chunk; if (arena_mapbits_dirty_get(spare, map_bias) != 0) { - arena_dirty_remove(arena, spare, map_bias, + arena_run_dirty_remove(arena, spare, map_bias, chunk_npages-map_bias); } chunk_dalloc = arena->chunk_dalloc; @@ -899,18 +960,29 @@ static size_t arena_dirty_count(arena_t *arena) { size_t ndirty = 0; - arena_chunk_map_misc_t *miscelm; - arena_chunk_t *chunk; - size_t pageind, npages; - - ql_foreach(miscelm, &arena->runs_dirty, dr_link) { - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); - pageind = arena_miscelm_to_pageind(miscelm); - assert(arena_mapbits_allocated_get(chunk, pageind) == 0); - assert(arena_mapbits_large_get(chunk, pageind) == 0); - assert(arena_mapbits_dirty_get(chunk, pageind) != 0); - npages = arena_mapbits_unallocated_size_get(chunk, pageind) >> - LG_PAGE; + arena_chunk_map_misc_t *runselm; + extent_node_t *chunkselm; + + for (runselm = qr_next(&arena->runs_dirty, rd_link), + chunkselm = qr_next(&arena->chunks_dirty, cd_link); + runselm != &arena->runs_dirty; runselm = qr_next(runselm, + rd_link)) { + size_t npages; + + if (runselm == &chunkselm->runs_dirty) { + npages = extent_node_size_get(chunkselm) >> LG_PAGE; + chunkselm = qr_next(chunkselm, cd_link); + } else { + arena_chunk_t *chunk = (arena_chunk_t + *)CHUNK_ADDR2BASE(runselm); + size_t pageind = arena_miscelm_to_pageind(runselm); + assert(arena_mapbits_allocated_get(chunk, pageind) == + 0); + assert(arena_mapbits_large_get(chunk, pageind) == 0); + assert(arena_mapbits_dirty_get(chunk, pageind) != 0); + npages = arena_mapbits_unallocated_size_get(chunk, + pageind) >> LG_PAGE; + } ndirty += npages; } @@ -939,41 +1011,94 @@ arena_compute_npurge(arena_t *arena, bool all) static size_t arena_stash_dirty(arena_t *arena, bool all, size_t npurge, - arena_chunk_miscelms_t *miscelms) + arena_chunk_map_misc_t *purge_runs_sentinel, + extent_node_t *purge_chunks_sentinel) { - arena_chunk_map_misc_t *miscelm; + arena_chunk_map_misc_t *runselm, *runselm_next; + extent_node_t *chunkselm; size_t nstashed = 0; - /* Add at least npurge pages to purge_list. */ - for (miscelm = ql_first(&arena->runs_dirty); miscelm != NULL; - miscelm = ql_first(&arena->runs_dirty)) { - arena_chunk_t *chunk = - (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); - size_t pageind = arena_miscelm_to_pageind(miscelm); - size_t run_size = arena_mapbits_unallocated_size_get(chunk, - pageind); - size_t npages = run_size >> LG_PAGE; - arena_run_t *run = &miscelm->run; + /* Stash at least npurge pages. */ + for (runselm = qr_next(&arena->runs_dirty, rd_link), + chunkselm = qr_next(&arena->chunks_dirty, cd_link); + runselm != &arena->runs_dirty; runselm = runselm_next) { + size_t npages; + runselm_next = qr_next(runselm, rd_link); + + if (runselm == &chunkselm->runs_dirty) { + extent_node_t *chunkselm_next, *tnode; + void *addr; + size_t size; + bool zeroed, zero; + UNUSED void *chunk; + + chunkselm_next = qr_next(chunkselm, cd_link); + /* + * Cache contents of chunkselm prior to it being + * destroyed as a side effect of allocating the chunk. 
+ */ + addr = extent_node_addr_get(chunkselm); + size = extent_node_size_get(chunkselm); + zeroed = extent_node_zeroed_get(chunkselm); + /* Allocate. */ + zero = false; + chunk = arena->chunk_alloc(addr, size, chunksize, &zero, + arena->ind); + assert(chunk == addr); + /* + * Create a temporary node to link into the ring of + * stashed allocations. + */ + tnode = arena_node_alloc(arena); + /* + * OOM shouldn't be possible because chunk allocation + * just cached a node. + */ + assert(tnode != NULL); + extent_node_arena_set(tnode, arena); + extent_node_addr_set(tnode, addr); + extent_node_size_set(tnode, size); + extent_node_zeroed_set(tnode, zeroed); + arena_chunk_dirty_node_init(tnode); + /* Stash. */ + arena_chunk_dirty_insert(purge_runs_sentinel, + purge_chunks_sentinel, tnode); + npages = size >> LG_PAGE; + chunkselm = chunkselm_next; + } else { + arena_chunk_t *chunk = + (arena_chunk_t *)CHUNK_ADDR2BASE(runselm); + size_t pageind = arena_miscelm_to_pageind(runselm); + arena_run_t *run = &runselm->run; + size_t run_size = + arena_mapbits_unallocated_size_get(chunk, pageind); - assert(pageind + npages <= chunk_npages); - assert(arena_mapbits_dirty_get(chunk, pageind) == - arena_mapbits_dirty_get(chunk, pageind+npages-1)); + npages = run_size >> LG_PAGE; - /* - * If purging the spare chunk's run, make it available prior to - * allocation. - */ - if (chunk == arena->spare) - arena_chunk_alloc(arena); + assert(pageind + npages <= chunk_npages); + assert(arena_mapbits_dirty_get(chunk, pageind) == + arena_mapbits_dirty_get(chunk, pageind+npages-1)); - /* Temporarily allocate the free dirty run. */ - arena_run_split_large(arena, run, run_size, false); - /* Append to purge_list for later processing. */ - ql_elm_new(miscelm, dr_link); - ql_tail_insert(miscelms, miscelm, dr_link); + /* + * If purging the spare chunk's run, make it available + * prior to allocation. + */ + if (chunk == arena->spare) + arena_chunk_alloc(arena); + + /* Temporarily allocate the free dirty run. */ + arena_run_split_large(arena, run, run_size, false); + /* Append to purge_runs for later processing. */ + if (false) + qr_new(runselm, rd_link); /* Redundant. 
*/ + else { + assert(qr_next(runselm, rd_link) == runselm); + assert(qr_prev(runselm, rd_link) == runselm); + } + qr_meld(purge_runs_sentinel, runselm, rd_link); + } nstashed += npages; - if (!all && nstashed >= npurge) break; } @@ -982,52 +1107,66 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, } static size_t -arena_purge_stashed(arena_t *arena, arena_chunk_miscelms_t *miscelms) +arena_purge_stashed(arena_t *arena, arena_chunk_map_misc_t *purge_runs_sentinel, + extent_node_t *purge_chunks_sentinel) { size_t npurged, nmadvise; - arena_chunk_map_misc_t *miscelm; + arena_chunk_map_misc_t *runselm; + extent_node_t *chunkselm; if (config_stats) nmadvise = 0; npurged = 0; malloc_mutex_unlock(&arena->lock); + for (runselm = qr_next(purge_runs_sentinel, rd_link), + chunkselm = qr_next(purge_chunks_sentinel, cd_link); + runselm != purge_runs_sentinel; runselm = qr_next(runselm, + rd_link)) { + size_t npages; + + if (runselm == &chunkselm->runs_dirty) { + size_t size = extent_node_size_get(chunkselm); + + pages_purge(extent_node_addr_get(chunkselm), size); + npages = size >> LG_PAGE; + chunkselm = qr_next(chunkselm, cd_link); + } else { + arena_chunk_t *chunk; + size_t pageind, run_size, flag_unzeroed, i; + bool unzeroed; - ql_foreach(miscelm, miscelms, dr_link) { - arena_chunk_t *chunk; - size_t pageind, run_size, npages, flag_unzeroed, i; - bool unzeroed; - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); - pageind = arena_miscelm_to_pageind(miscelm); - run_size = arena_mapbits_large_size_get(chunk, pageind); - npages = run_size >> LG_PAGE; + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(runselm); + pageind = arena_miscelm_to_pageind(runselm); + run_size = arena_mapbits_large_size_get(chunk, pageind); + npages = run_size >> LG_PAGE; - assert(pageind + npages <= chunk_npages); - unzeroed = pages_purge((void *)((uintptr_t)chunk + (pageind << - LG_PAGE)), run_size); - flag_unzeroed = unzeroed ? CHUNK_MAP_UNZEROED : 0; + assert(pageind + npages <= chunk_npages); + unzeroed = pages_purge((void *)((uintptr_t)chunk + + (pageind << LG_PAGE)), run_size); + flag_unzeroed = unzeroed ? CHUNK_MAP_UNZEROED : 0; - /* - * Set the unzeroed flag for all pages, now that pages_purge() - * has returned whether the pages were zeroed as a side effect - * of purging. This chunk map modification is safe even though - * the arena mutex isn't currently owned by this thread, - * because the run is marked as allocated, thus protecting it - * from being modified by any other thread. As long as these - * writes don't perturb the first and last elements' - * CHUNK_MAP_ALLOCATED bits, behavior is well defined. - */ - for (i = 0; i < npages; i++) { - arena_mapbits_unzeroed_set(chunk, pageind+i, - flag_unzeroed); + /* + * Set the unzeroed flag for all pages, now that + * pages_purge() has returned whether the pages were + * zeroed as a side effect of purging. This chunk map + * modification is safe even though the arena mutex + * isn't currently owned by this thread, because the run + * is marked as allocated, thus protecting it from being + * modified by any other thread. As long as these + * writes don't perturb the first and last elements' + * CHUNK_MAP_ALLOCATED bits, behavior is well defined. 
+ */ + for (i = 0; i < npages; i++) { + arena_mapbits_unzeroed_set(chunk, pageind+i, + flag_unzeroed); + } } npurged += npages; if (config_stats) nmadvise++; } - malloc_mutex_lock(&arena->lock); if (config_stats) { @@ -1039,16 +1178,31 @@ arena_purge_stashed(arena_t *arena, arena_chunk_miscelms_t *miscelms) } static void -arena_unstash_purged(arena_t *arena, arena_chunk_miscelms_t *miscelms) +arena_unstash_purged(arena_t *arena, + arena_chunk_map_misc_t *purge_runs_sentinel, + extent_node_t *purge_chunks_sentinel) { - arena_chunk_map_misc_t *miscelm; + arena_chunk_map_misc_t *runselm, *runselm_next; + extent_node_t *chunkselm; /* Deallocate runs. */ - for (miscelm = ql_first(miscelms); miscelm != NULL; - miscelm = ql_first(miscelms)) { - arena_run_t *run = &miscelm->run; - ql_remove(miscelms, miscelm, dr_link); - arena_run_dalloc(arena, run, false, true); + for (runselm = qr_next(purge_runs_sentinel, rd_link), + chunkselm = qr_next(purge_chunks_sentinel, cd_link); + runselm != purge_runs_sentinel; runselm = runselm_next) { + runselm_next = qr_next(runselm, rd_link); + if (runselm == &chunkselm->runs_dirty) { + extent_node_t *chunkselm_next = qr_next(chunkselm, + cd_link); + arena_chunk_dirty_remove(chunkselm); + chunk_unmap(arena, extent_node_addr_get(chunkselm), + extent_node_size_get(chunkselm)); + arena_node_dalloc(arena, chunkselm); + chunkselm = chunkselm_next; + } else { + arena_run_t *run = &runselm->run; + qr_remove(runselm, rd_link); + arena_run_dalloc(arena, run, false, true); + } } } @@ -1056,7 +1210,8 @@ void arena_purge(arena_t *arena, bool all) { size_t npurge, npurgeable, npurged; - arena_chunk_miscelms_t purge_list; + arena_chunk_map_misc_t purge_runs_sentinel; + extent_node_t purge_chunks_sentinel; /* * Calls to arena_dirty_count() are disabled even for debug builds @@ -1072,12 +1227,17 @@ arena_purge(arena_t *arena, bool all) arena->stats.npurge++; npurge = arena_compute_npurge(arena, all); - ql_new(&purge_list); - npurgeable = arena_stash_dirty(arena, all, npurge, &purge_list); + qr_new(&purge_runs_sentinel, rd_link); + arena_chunk_dirty_node_init(&purge_chunks_sentinel); + + npurgeable = arena_stash_dirty(arena, all, npurge, &purge_runs_sentinel, + &purge_chunks_sentinel); assert(npurgeable >= npurge); - npurged = arena_purge_stashed(arena, &purge_list); + npurged = arena_purge_stashed(arena, &purge_runs_sentinel, + &purge_chunks_sentinel); assert(npurged == npurgeable); - arena_unstash_purged(arena, &purge_list); + arena_unstash_purged(arena, &purge_runs_sentinel, + &purge_chunks_sentinel); } void @@ -1115,9 +1275,12 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, run_ind+run_pages+nrun_pages-1) == flag_dirty); arena_avail_remove(arena, chunk, run_ind+run_pages, nrun_pages); - /* If the successor is dirty, remove it from runs_dirty. */ + /* + * If the successor is dirty, remove it from the set of dirty + * pages. + */ if (flag_dirty != 0) { - arena_dirty_remove(arena, chunk, run_ind+run_pages, + arena_run_dirty_remove(arena, chunk, run_ind+run_pages, nrun_pages); } @@ -1148,9 +1311,14 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, assert(arena_mapbits_dirty_get(chunk, run_ind) == flag_dirty); arena_avail_remove(arena, chunk, run_ind, prun_pages); - /* If the predecessor is dirty, remove it from runs_dirty. */ - if (flag_dirty != 0) - arena_dirty_remove(arena, chunk, run_ind, prun_pages); + /* + * If the predecessor is dirty, remove it from the set of dirty + * pages. 
+ */ + if (flag_dirty != 0) { + arena_run_dirty_remove(arena, chunk, run_ind, + prun_pages); + } size += prun_size; run_pages += prun_pages; @@ -1224,7 +1392,7 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_avail_insert(arena, chunk, run_ind, run_pages); if (dirty) - arena_dirty_insert(arena, chunk, run_ind, run_pages); + arena_run_dirty_insert(arena, chunk, run_ind, run_pages); /* Deallocate chunk if it is now completely unused. */ if (size == arena_maxrun) { @@ -1843,7 +2011,8 @@ arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, if (run == bin->runcur) bin->runcur = NULL; else { - index_t binind = arena_bin_index(chunk->node.arena, bin); + index_t binind = arena_bin_index(extent_node_arena_get( + &chunk->node), bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; if (bin_info->nregs != 1) { @@ -2184,7 +2353,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, arena_t *arena; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - arena = chunk->node.arena; + arena = extent_node_arena_get(&chunk->node); if (usize < oldsize) { /* Fill before shrinking in order avoid a race. */ @@ -2422,20 +2591,6 @@ arena_new(unsigned ind) arena->nthreads = 0; if (malloc_mutex_init(&arena->lock)) return (NULL); - arena->chunk_alloc = chunk_alloc_default; - arena->chunk_dalloc = chunk_dalloc_default; - ql_new(&arena->huge); - if (malloc_mutex_init(&arena->huge_mtx)) - return (NULL); - extent_tree_szad_new(&arena->chunks_szad_mmap); - extent_tree_ad_new(&arena->chunks_ad_mmap); - extent_tree_szad_new(&arena->chunks_szad_dss); - extent_tree_ad_new(&arena->chunks_ad_dss); - ql_new(&arena->node_cache); - if (malloc_mutex_init(&arena->chunks_mtx)) - return (NULL); - if (malloc_mutex_init(&arena->node_cache_mtx)) - return (NULL); if (config_stats) { memset(&arena->stats, 0, sizeof(arena_stats_t)); @@ -2463,7 +2618,27 @@ arena_new(unsigned ind) arena->ndirty = 0; arena_avail_tree_new(&arena->runs_avail); - ql_new(&arena->runs_dirty); + qr_new(&arena->runs_dirty, rd_link); + qr_new(&arena->chunks_dirty, cd_link); + + ql_new(&arena->huge); + if (malloc_mutex_init(&arena->huge_mtx)) + return (NULL); + + extent_tree_szad_new(&arena->chunks_szad_dirty); + extent_tree_ad_new(&arena->chunks_ad_dirty); + extent_tree_szad_new(&arena->chunks_szad_mmap); + extent_tree_ad_new(&arena->chunks_ad_mmap); + extent_tree_szad_new(&arena->chunks_szad_dss); + extent_tree_ad_new(&arena->chunks_ad_dss); + if (malloc_mutex_init(&arena->chunks_mtx)) + return (NULL); + ql_new(&arena->node_cache); + if (malloc_mutex_init(&arena->node_cache_mtx)) + return (NULL); + + arena->chunk_alloc = chunk_alloc_default; + arena->chunk_dalloc = chunk_dalloc_default; /* Initialize bins. 
*/ for (i = 0; i < NBINS; i++) { diff --git a/src/base.c b/src/base.c index 7b5804e..819fa02 100644 --- a/src/base.c +++ b/src/base.c @@ -60,8 +60,8 @@ base_chunk_alloc(size_t minsize) if (config_stats) base_allocated += nsize; } - node->addr = addr; - node->size = csize; + extent_node_addr_set(node, addr); + extent_node_size_set(node, csize); return (node); } @@ -84,8 +84,8 @@ base_alloc(size_t size) */ csize = CACHELINE_CEILING(size); - key.addr = NULL; - key.size = csize; + extent_node_addr_set(&key, NULL); + extent_node_size_set(&key, csize); malloc_mutex_lock(&base_mtx); node = extent_tree_szad_nsearch(&base_avail_szad, &key); if (node != NULL) { @@ -100,10 +100,10 @@ base_alloc(size_t size) goto label_return; } - ret = node->addr; - if (node->size > csize) { - node->addr = (void *)((uintptr_t)ret + csize); - node->size -= csize; + ret = extent_node_addr_get(node); + if (extent_node_size_get(node) > csize) { + extent_node_addr_set(node, (void *)((uintptr_t)ret + csize)); + extent_node_size_set(node, extent_node_size_get(node) - csize); extent_tree_szad_insert(&base_avail_szad, node); } else base_node_dalloc(node); diff --git a/src/chunk.c b/src/chunk.c index b357619..8bc87be 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -24,12 +24,13 @@ bool chunk_register(const void *chunk, const extent_node_t *node) { - assert(node->addr == chunk); + assert(extent_node_addr_get(node) == chunk); if (rtree_set(&chunks_rtree, (uintptr_t)chunk, node)) return (true); if (config_prof && opt_prof) { - size_t nadd = (node->size == 0) ? 1 : node->size / chunksize; + size_t size = extent_node_size_get(node); + size_t nadd = (size == 0) ? 1 : size / chunksize; size_t cur = atomic_add_z(&curchunks, nadd); size_t high = atomic_read_z(&highchunks); while (cur > high && atomic_cas_z(&highchunks, high, cur)) { @@ -54,7 +55,8 @@ chunk_deregister(const void *chunk, const extent_node_t *node) err = rtree_set(&chunks_rtree, (uintptr_t)chunk, NULL); assert(!err); if (config_prof && opt_prof) { - size_t nsub = (node->size == 0) ? 1 : node->size / chunksize; + size_t size = extent_node_size_get(node); + size_t nsub = (size == 0) ? 1 : size / chunksize; assert(atomic_read_z(&curchunks) >= nsub); atomic_sub_z(&curchunks, nsub); } @@ -62,8 +64,8 @@ chunk_deregister(const void *chunk, const extent_node_t *node) static void * chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, void *new_addr, size_t size, size_t alignment, - bool *zero) + extent_tree_t *chunks_ad, bool dirty, void *new_addr, size_t size, + size_t alignment, bool *zero) { void *ret; extent_node_t *node; @@ -77,32 +79,35 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, /* Beware size_t wrap-around. */ if (alloc_size < size) return (NULL); - key.addr = new_addr; - key.size = alloc_size; + extent_node_addr_set(&key, new_addr); + extent_node_size_set(&key, alloc_size); malloc_mutex_lock(&arena->chunks_mtx); node = (new_addr != NULL) ? 
extent_tree_ad_search(chunks_ad, &key) : extent_tree_szad_nsearch(chunks_szad, &key); - if (node == NULL || (new_addr != NULL && node->size < size)) { + if (node == NULL || (new_addr != NULL && extent_node_size_get(node) < + size)) { malloc_mutex_unlock(&arena->chunks_mtx); return (NULL); } - leadsize = ALIGNMENT_CEILING((uintptr_t)node->addr, alignment) - - (uintptr_t)node->addr; + leadsize = ALIGNMENT_CEILING((uintptr_t)extent_node_addr_get(node), + alignment) - (uintptr_t)extent_node_addr_get(node); assert(new_addr == NULL || leadsize == 0); - assert(node->size >= leadsize + size); - trailsize = node->size - leadsize - size; - ret = (void *)((uintptr_t)node->addr + leadsize); - zeroed = node->zeroed; + assert(extent_node_size_get(node) >= leadsize + size); + trailsize = extent_node_size_get(node) - leadsize - size; + ret = (void *)((uintptr_t)extent_node_addr_get(node) + leadsize); + zeroed = extent_node_zeroed_get(node); if (zeroed) *zero = true; /* Remove node from the tree. */ extent_tree_szad_remove(chunks_szad, node); extent_tree_ad_remove(chunks_ad, node); + arena_chunk_dirty_maybe_remove(arena, node, dirty); if (leadsize != 0) { /* Insert the leading space as a smaller chunk. */ - node->size = leadsize; + extent_node_size_set(node, leadsize); extent_tree_szad_insert(chunks_szad, node); extent_tree_ad_insert(chunks_ad, node); + arena_chunk_dirty_maybe_insert(arena, node, dirty); node = NULL; } if (trailsize != 0) { @@ -111,15 +116,17 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, node = arena_node_alloc(arena); if (node == NULL) { malloc_mutex_unlock(&arena->chunks_mtx); - chunk_unmap(arena, ret, size); + chunk_record(arena, chunks_szad, chunks_ad, + dirty, ret, size); return (NULL); } } - node->addr = (void *)((uintptr_t)(ret) + size); - node->size = trailsize; - node->zeroed = zeroed; + extent_node_addr_set(node, (void *)((uintptr_t)(ret) + size)); + extent_node_size_set(node, trailsize); + extent_node_zeroed_set(node, zeroed); extent_tree_szad_insert(chunks_szad, node); extent_tree_ad_insert(chunks_ad, node); + arena_chunk_dirty_maybe_insert(arena, node, dirty); node = NULL; } malloc_mutex_unlock(&arena->chunks_mtx); @@ -148,7 +155,8 @@ chunk_alloc_core_dss(arena_t *arena, void *new_addr, size_t size, void *ret; if ((ret = chunk_recycle(arena, &arena->chunks_szad_dss, - &arena->chunks_ad_dss, new_addr, size, alignment, zero)) != NULL) + &arena->chunks_ad_dss, false, new_addr, size, alignment, zero)) != + NULL) return (ret); ret = chunk_alloc_dss(arena, new_addr, size, alignment, zero); return (ret); @@ -171,6 +179,11 @@ chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, assert(alignment != 0); assert((alignment & chunksize_mask) == 0); + /* dirty. */ + if ((ret = chunk_recycle(arena, &arena->chunks_szad_dirty, + &arena->chunks_ad_dirty, true, new_addr, size, alignment, zero)) != + NULL) + return (ret); /* "primary" dss. */ if (have_dss && dss_prec == dss_prec_primary && (ret = chunk_alloc_core_dss(arena, new_addr, size, alignment, zero)) != @@ -178,8 +191,8 @@ chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, return (ret); /* mmap. 
*/ if (!config_munmap && (ret = chunk_recycle(arena, - &arena->chunks_szad_mmap, &arena->chunks_ad_mmap, new_addr, size, - alignment, zero)) != NULL) + &arena->chunks_szad_mmap, &arena->chunks_ad_mmap, false, new_addr, + size, alignment, zero)) != NULL) return (ret); /* * Requesting an address is not implemented for chunk_alloc_mmap(), so @@ -263,54 +276,62 @@ chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, arena->dss_prec)); } -static void +void chunk_record(arena_t *arena, extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, void *chunk, size_t size) + extent_tree_t *chunks_ad, bool dirty, void *chunk, size_t size) { bool unzeroed; - extent_node_t *node, *prev, key; + extent_node_t *node, *prev; + extent_node_t key; - unzeroed = pages_purge(chunk, size); + unzeroed = dirty ? true : pages_purge(chunk, size); JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); malloc_mutex_lock(&arena->chunks_mtx); - key.addr = (void *)((uintptr_t)chunk + size); + extent_node_addr_set(&key, (void *)((uintptr_t)chunk + size)); node = extent_tree_ad_nsearch(chunks_ad, &key); /* Try to coalesce forward. */ - if (node != NULL && node->addr == key.addr) { + if (node != NULL && extent_node_addr_get(node) == + extent_node_addr_get(&key)) { /* * Coalesce chunk with the following address range. This does * not change the position within chunks_ad, so only * remove/insert from/into chunks_szad. */ extent_tree_szad_remove(chunks_szad, node); - node->addr = chunk; - node->size += size; - node->zeroed = (node->zeroed && !unzeroed); + arena_chunk_dirty_maybe_remove(arena, node, dirty); + extent_node_addr_set(node, chunk); + extent_node_size_set(node, extent_node_size_get(node) + size); + extent_node_zeroed_set(node, extent_node_zeroed_get(node) && + !unzeroed); extent_tree_szad_insert(chunks_szad, node); + arena_chunk_dirty_maybe_insert(arena, node, dirty); } else { /* Coalescing forward failed, so insert a new node. */ node = arena_node_alloc(arena); if (node == NULL) { /* * Node allocation failed, which is an exceedingly - * unlikely failure. Leak chunk; its pages have - * already been purged, so this is only a virtual - * memory leak. + * unlikely failure. Leak chunk after making sure its + * pages have already been purged, so that this is only + * a virtual memory leak. */ + if (dirty) + pages_purge(chunk, size); goto label_return; } - node->addr = chunk; - node->size = size; - node->zeroed = !unzeroed; + extent_node_addr_set(node, chunk); + extent_node_size_set(node, size); + extent_node_zeroed_set(node, !unzeroed); extent_tree_ad_insert(chunks_ad, node); extent_tree_szad_insert(chunks_szad, node); + arena_chunk_dirty_maybe_insert(arena, node, dirty); } /* Try to coalesce backward. */ prev = extent_tree_ad_prev(chunks_ad, node); - if (prev != NULL && (void *)((uintptr_t)prev->addr + prev->size) == - chunk) { + if (prev != NULL && (void *)((uintptr_t)extent_node_addr_get(prev) + + extent_node_size_get(prev)) == chunk) { /* * Coalesce chunk with the previous address range. 
This does * not change the position within chunks_ad, so only @@ -318,12 +339,16 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, */ extent_tree_szad_remove(chunks_szad, prev); extent_tree_ad_remove(chunks_ad, prev); - + arena_chunk_dirty_maybe_remove(arena, prev, dirty); extent_tree_szad_remove(chunks_szad, node); - node->addr = prev->addr; - node->size += prev->size; - node->zeroed = (node->zeroed && prev->zeroed); + arena_chunk_dirty_maybe_remove(arena, node, dirty); + extent_node_addr_set(node, extent_node_addr_get(prev)); + extent_node_size_set(node, extent_node_size_get(node) + + extent_node_size_get(prev)); + extent_node_zeroed_set(node, extent_node_zeroed_get(node) && + extent_node_zeroed_get(prev)); extent_tree_szad_insert(chunks_szad, node); + arena_chunk_dirty_maybe_insert(arena, node, dirty); arena_node_dalloc(arena, prev); } @@ -332,8 +357,8 @@ label_return: malloc_mutex_unlock(&arena->chunks_mtx); } -void -chunk_unmap(arena_t *arena, void *chunk, size_t size) +static void +chunk_cache(arena_t *arena, void *chunk, size_t size) { assert(chunk != NULL); @@ -341,13 +366,8 @@ chunk_unmap(arena_t *arena, void *chunk, size_t size) assert(size != 0); assert((size & chunksize_mask) == 0); - if (have_dss && chunk_in_dss(chunk)) { - chunk_record(arena, &arena->chunks_szad_dss, - &arena->chunks_ad_dss, chunk, size); - } else if (chunk_dalloc_mmap(chunk, size)) { - chunk_record(arena, &arena->chunks_szad_mmap, - &arena->chunks_ad_mmap, chunk, size); - } + chunk_record(arena, &arena->chunks_szad_dirty, &arena->chunks_ad_dirty, + true, chunk, size); } /* Default arena chunk deallocation routine in the absence of user override. */ @@ -355,10 +375,28 @@ bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) { - chunk_unmap(chunk_arena_get(arena_ind), chunk, size); + chunk_cache(chunk_arena_get(arena_ind), chunk, size); return (false); } +void +chunk_unmap(arena_t *arena, void *chunk, size_t size) +{ + + assert(chunk != NULL); + assert(CHUNK_ADDR2BASE(chunk) == chunk); + assert(size != 0); + assert((size & chunksize_mask) == 0); + + if (have_dss && chunk_in_dss(chunk)) { + chunk_record(arena, &arena->chunks_szad_dss, + &arena->chunks_ad_dss, false, chunk, size); + } else if (chunk_dalloc_mmap(chunk, size)) { + chunk_record(arena, &arena->chunks_szad_mmap, + &arena->chunks_ad_mmap, false, chunk, size); + } +} + static rtree_node_elm_t * chunks_rtree_node_alloc(size_t nelms) { diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 9c3eea8..c3c4848 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -133,8 +133,12 @@ chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, size_t alignment, /* Success. 
*/ dss_max = dss_next; malloc_mutex_unlock(&dss_mtx); - if (cpad_size != 0) - chunk_unmap(arena, cpad, cpad_size); + if (cpad_size != 0) { + chunk_record(arena, + &arena->chunks_szad_dss, + &arena->chunks_ad_dss, false, cpad, + cpad_size); + } if (*zero) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( ret, size); diff --git a/src/extent.c b/src/extent.c index 60e2468..f98e77e 100644 --- a/src/extent.c +++ b/src/extent.c @@ -7,13 +7,13 @@ JEMALLOC_INLINE_C int extent_szad_comp(extent_node_t *a, extent_node_t *b) { int ret; - size_t a_size = a->size; - size_t b_size = b->size; + size_t a_size = extent_node_size_get(a); + size_t b_size = extent_node_size_get(b); ret = (a_size > b_size) - (a_size < b_size); if (ret == 0) { - uintptr_t a_addr = (uintptr_t)a->addr; - uintptr_t b_addr = (uintptr_t)b->addr; + uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a); + uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b); ret = (a_addr > b_addr) - (a_addr < b_addr); } @@ -28,8 +28,8 @@ rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, szad_link, JEMALLOC_INLINE_C int extent_ad_comp(extent_node_t *a, extent_node_t *b) { - uintptr_t a_addr = (uintptr_t)a->addr; - uintptr_t b_addr = (uintptr_t)b->addr; + uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a); + uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b); return ((a_addr > b_addr) - (a_addr < b_addr)); } diff --git a/src/huge.c b/src/huge.c index bc7d99c..b9cae00 100644 --- a/src/huge.c +++ b/src/huge.c @@ -9,7 +9,7 @@ huge_node_get(const void *ptr) extent_node_t *node; node = chunk_lookup(ptr); - assert(node->size != 0); + assert(!extent_node_achunk_get(node)); return (node); } @@ -18,8 +18,8 @@ static bool huge_node_set(const void *ptr, extent_node_t *node) { - assert(node->addr == ptr); - assert(node->size != 0); + assert(extent_node_addr_get(node) == ptr); + assert(!extent_node_achunk_get(node)); return (chunk_register(ptr, node)); } @@ -73,10 +73,11 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, return (NULL); } - node->addr = ret; - node->size = usize; - node->zeroed = is_zeroed; - node->arena = arena; + extent_node_arena_set(node, arena); + extent_node_addr_set(node, ret); + extent_node_size_set(node, usize); + extent_node_achunk_set(node, false); + extent_node_zeroed_set(node, is_zeroed); if (huge_node_set(ret, node)) { arena_chunk_dalloc_huge(arena, ret, usize); @@ -152,13 +153,13 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, zeroed = true; node = huge_node_get(ptr); - arena = node->arena; + arena = extent_node_arena_get(node); malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ - assert(node->size != usize); - node->size = usize; - /* Clear node->zeroed if zeroing failed above. */ - node->zeroed = (node->zeroed && zeroed); + assert(extent_node_size_get(node) != usize); + extent_node_size_set(node, usize); + /* Clear node's zeroed field if zeroing failed above. */ + extent_node_zeroed_set(node, extent_node_zeroed_get(node) && zeroed); malloc_mutex_unlock(&arena->huge_mtx); arena_chunk_ralloc_huge_similar(arena, ptr, oldsize, usize); @@ -195,12 +196,12 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) } node = huge_node_get(ptr); - arena = node->arena; + arena = extent_node_arena_get(node); malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ - node->size = usize; - /* Clear node->zeroed if zeroing failed above. 
*/ - node->zeroed = (node->zeroed && zeroed); + extent_node_size_set(node, usize); + /* Clear node's zeroed field if zeroing failed above. */ + extent_node_zeroed_set(node, extent_node_zeroed_get(node) && zeroed); malloc_mutex_unlock(&arena->huge_mtx); /* Zap the excess chunks. */ @@ -221,9 +222,9 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { } node = huge_node_get(ptr); - arena = node->arena; + arena = extent_node_arena_get(node); malloc_mutex_lock(&arena->huge_mtx); - is_zeroed_subchunk = node->zeroed; + is_zeroed_subchunk = extent_node_zeroed_get(node); malloc_mutex_unlock(&arena->huge_mtx); /* @@ -238,7 +239,7 @@ huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ - node->size = usize; + extent_node_size_set(node, usize); malloc_mutex_unlock(&arena->huge_mtx); if (zero || (config_fill && unlikely(opt_zero))) { @@ -358,14 +359,16 @@ huge_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) arena_t *arena; node = huge_node_get(ptr); - arena = node->arena; + arena = extent_node_arena_get(node); huge_node_unset(ptr, node); malloc_mutex_lock(&arena->huge_mtx); ql_remove(&arena->huge, node, ql_link); malloc_mutex_unlock(&arena->huge_mtx); - huge_dalloc_junk(node->addr, node->size); - arena_chunk_dalloc_huge(node->arena, node->addr, node->size); + huge_dalloc_junk(extent_node_addr_get(node), + extent_node_size_get(node)); + arena_chunk_dalloc_huge(extent_node_arena_get(node), + extent_node_addr_get(node), extent_node_size_get(node)); idalloctm(tsd, node, tcache, true); } @@ -373,7 +376,7 @@ arena_t * huge_aalloc(const void *ptr) { - return (huge_node_get(ptr)->arena); + return (extent_node_arena_get(huge_node_get(ptr))); } size_t @@ -384,9 +387,9 @@ huge_salloc(const void *ptr) arena_t *arena; node = huge_node_get(ptr); - arena = node->arena; + arena = extent_node_arena_get(node); malloc_mutex_lock(&arena->huge_mtx); - size = node->size; + size = extent_node_size_get(node); malloc_mutex_unlock(&arena->huge_mtx); return (size); @@ -400,9 +403,9 @@ huge_prof_tctx_get(const void *ptr) arena_t *arena; node = huge_node_get(ptr); - arena = node->arena; + arena = extent_node_arena_get(node); malloc_mutex_lock(&arena->huge_mtx); - tctx = node->prof_tctx; + tctx = extent_node_prof_tctx_get(node); malloc_mutex_unlock(&arena->huge_mtx); return (tctx); @@ -415,8 +418,8 @@ huge_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) arena_t *arena; node = huge_node_get(ptr); - arena = node->arena; + arena = extent_node_arena_get(node); malloc_mutex_lock(&arena->huge_mtx); - node->prof_tctx = tctx; + extent_node_prof_tctx_set(node, tctx); malloc_mutex_unlock(&arena->huge_mtx); } diff --git a/src/tcache.c b/src/tcache.c index 318e0dc..8d0a6fa 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -103,7 +103,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, /* Lock the arena bin associated with the first object. 
*/ arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( tbin->avail[0]); - arena_t *bin_arena = chunk->node.arena; + arena_t *bin_arena = extent_node_arena_get(&chunk->node); arena_bin_t *bin = &bin_arena->bins[binind]; if (config_prof && bin_arena == arena) { @@ -125,7 +125,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, ptr = tbin->avail[i]; assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk->node.arena == bin_arena) { + if (extent_node_arena_get(&chunk->node) == bin_arena) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; arena_chunk_map_bits_t *bitselm = @@ -183,7 +183,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, /* Lock the arena associated with the first object. */ arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( tbin->avail[0]); - arena_t *locked_arena = chunk->node.arena; + arena_t *locked_arena = extent_node_arena_get(&chunk->node); UNUSED bool idump; if (config_prof) @@ -209,7 +209,8 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, index_t binind, ptr = tbin->avail[i]; assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - if (chunk->node.arena == locked_arena) { + if (extent_node_arena_get(&chunk->node) == + locked_arena) { arena_dalloc_large_junked_locked(locked_arena, chunk, ptr); } else { -- cgit v0.12 From a4e1888d1a12d864f42350f2859e33eb3a0033f2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 17 Feb 2015 15:13:52 -0800 Subject: Simplify extent_node_t and add extent_node_init(). --- include/jemalloc/internal/extent.h | 35 +++++++++++++++++---------- include/jemalloc/internal/private_symbols.txt | 1 + src/arena.c | 9 ++----- src/base.c | 6 ++--- src/chunk.c | 25 +++++++++---------- src/huge.c | 6 +---- 6 files changed, 39 insertions(+), 43 deletions(-) diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 1060761..9ee1b44 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -30,19 +30,12 @@ struct extent_node_s { */ bool en_achunk; - union { - /* Profile counters, used for huge objects. */ - prof_tctx_t *en_prof_tctx; - - struct { - /* - * Linkage for arena's runs_dirty and chunks_dirty - * rings. - */ - qr(extent_node_t) cd_link; - arena_chunk_map_misc_t runs_dirty; - }; - }; + /* Profile counters, used for huge objects. */ + prof_tctx_t *en_prof_tctx; + + /* Linkage for arena's runs_dirty and chunks_dirty rings. */ + qr(extent_node_t) cd_link; + arena_chunk_map_misc_t runs_dirty; union { /* Linkage for the size/address-ordered tree. 
*/ @@ -82,6 +75,8 @@ void extent_node_size_set(extent_node_t *node, size_t size); void extent_node_zeroed_set(extent_node_t *node, bool zeroed); void extent_node_achunk_set(extent_node_t *node, bool achunk); void extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx); +void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, + size_t size, bool zeroed); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_EXTENT_C_)) @@ -168,6 +163,20 @@ extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx) node->en_prof_tctx = tctx; } + +JEMALLOC_INLINE void +extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size, + bool zeroed) +{ + + extent_node_arena_set(node, arena); + extent_node_addr_set(node, addr); + extent_node_size_set(node, size); + extent_node_zeroed_set(node, zeroed); + extent_node_achunk_set(node, false); + if (config_prof) + extent_node_prof_tctx_set(node, NULL); +} #endif #endif /* JEMALLOC_H_INLINES */ diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index a1d12cf..8b55954 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -182,6 +182,7 @@ extent_node_addr_get extent_node_addr_set extent_node_arena_get extent_node_arena_set +extent_node_init extent_node_prof_tctx_get extent_node_prof_tctx_set extent_node_size_get diff --git a/src/arena.c b/src/arena.c index a7a98e2..b068a4d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -465,9 +465,7 @@ arena_chunk_alloc_internal(arena_t *arena, bool *zero) chunk = (arena_chunk_t *)chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, NULL, chunksize, chunksize, zero); if (chunk != NULL) { - extent_node_arena_set(&chunk->node, arena); - extent_node_addr_set(&chunk->node, chunk); - extent_node_size_set(&chunk->node, chunksize); + extent_node_init(&chunk->node, arena, chunk, chunksize, *zero); extent_node_achunk_set(&chunk->node, true); if (chunk_register(chunk, &chunk->node)) { chunk_dalloc((void *)chunk, chunksize, arena->ind); @@ -1055,10 +1053,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, * just cached a node. */ assert(tnode != NULL); - extent_node_arena_set(tnode, arena); - extent_node_addr_set(tnode, addr); - extent_node_size_set(tnode, size); - extent_node_zeroed_set(tnode, zeroed); + extent_node_init(tnode, arena, addr, size, zeroed); arena_chunk_dirty_node_init(tnode); /* Stash. */ arena_chunk_dirty_insert(purge_runs_sentinel, diff --git a/src/base.c b/src/base.c index 819fa02..33e8b6f 100644 --- a/src/base.c +++ b/src/base.c @@ -60,8 +60,7 @@ base_chunk_alloc(size_t minsize) if (config_stats) base_allocated += nsize; } - extent_node_addr_set(node, addr); - extent_node_size_set(node, csize); + extent_node_init(node, NULL, addr, csize, true); return (node); } @@ -84,8 +83,7 @@ base_alloc(size_t size) */ csize = CACHELINE_CEILING(size); - extent_node_addr_set(&key, NULL); - extent_node_size_set(&key, csize); + extent_node_init(&key, NULL, NULL, csize, false); malloc_mutex_lock(&base_mtx); node = extent_tree_szad_nsearch(&base_avail_szad, &key); if (node != NULL) { diff --git a/src/chunk.c b/src/chunk.c index 8bc87be..59d72c9 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -79,8 +79,7 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, /* Beware size_t wrap-around. 
*/ if (alloc_size < size) return (NULL); - extent_node_addr_set(&key, new_addr); - extent_node_size_set(&key, alloc_size); + extent_node_init(&key, arena, new_addr, alloc_size, false); malloc_mutex_lock(&arena->chunks_mtx); node = (new_addr != NULL) ? extent_tree_ad_search(chunks_ad, &key) : extent_tree_szad_nsearch(chunks_szad, &key); @@ -121,9 +120,8 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, return (NULL); } } - extent_node_addr_set(node, (void *)((uintptr_t)(ret) + size)); - extent_node_size_set(node, trailsize); - extent_node_zeroed_set(node, zeroed); + extent_node_init(node, arena, (void *)((uintptr_t)(ret) + size), + trailsize, zeroed); extent_tree_szad_insert(chunks_szad, node); extent_tree_ad_insert(chunks_ad, node); arena_chunk_dirty_maybe_insert(arena, node, dirty); @@ -288,7 +286,8 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); malloc_mutex_lock(&arena->chunks_mtx); - extent_node_addr_set(&key, (void *)((uintptr_t)chunk + size)); + extent_node_init(&key, arena, (void *)((uintptr_t)chunk + size), 0, + false); node = extent_tree_ad_nsearch(chunks_ad, &key); /* Try to coalesce forward. */ if (node != NULL && extent_node_addr_get(node) == @@ -301,7 +300,7 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, extent_tree_szad_remove(chunks_szad, node); arena_chunk_dirty_maybe_remove(arena, node, dirty); extent_node_addr_set(node, chunk); - extent_node_size_set(node, extent_node_size_get(node) + size); + extent_node_size_set(node, size + extent_node_size_get(node)); extent_node_zeroed_set(node, extent_node_zeroed_get(node) && !unzeroed); extent_tree_szad_insert(chunks_szad, node); @@ -320,9 +319,7 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, pages_purge(chunk, size); goto label_return; } - extent_node_addr_set(node, chunk); - extent_node_size_set(node, size); - extent_node_zeroed_set(node, !unzeroed); + extent_node_init(node, arena, chunk, size, !unzeroed); extent_tree_ad_insert(chunks_ad, node); extent_tree_szad_insert(chunks_szad, node); arena_chunk_dirty_maybe_insert(arena, node, dirty); @@ -343,10 +340,10 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, extent_tree_szad_remove(chunks_szad, node); arena_chunk_dirty_maybe_remove(arena, node, dirty); extent_node_addr_set(node, extent_node_addr_get(prev)); - extent_node_size_set(node, extent_node_size_get(node) + - extent_node_size_get(prev)); - extent_node_zeroed_set(node, extent_node_zeroed_get(node) && - extent_node_zeroed_get(prev)); + extent_node_size_set(node, extent_node_size_get(prev) + + extent_node_size_get(node)); + extent_node_zeroed_set(node, extent_node_zeroed_get(prev) && + extent_node_zeroed_get(node)); extent_tree_szad_insert(chunks_szad, node); arena_chunk_dirty_maybe_insert(arena, node, dirty); diff --git a/src/huge.c b/src/huge.c index b9cae00..3092932 100644 --- a/src/huge.c +++ b/src/huge.c @@ -73,11 +73,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, return (NULL); } - extent_node_arena_set(node, arena); - extent_node_addr_set(node, ret); - extent_node_size_set(node, usize); - extent_node_achunk_set(node, false); - extent_node_zeroed_set(node, is_zeroed); + extent_node_init(node, arena, ret, usize, is_zeroed); if (huge_node_set(ret, node)) { arena_chunk_dalloc_huge(arena, ret, usize); -- cgit v0.12 From eafebfdfbe48bf8e95902d89cfa1eb3d5cd2fa5c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 17 Feb 2015 16:12:31 -0800 Subject: Remove obsolete type 
arena_chunk_miscelms_t. --- include/jemalloc/internal/arena.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index f967be3..0383f0c 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -148,7 +148,6 @@ struct arena_chunk_map_misc_s { }; typedef rb_tree(arena_chunk_map_misc_t) arena_avail_tree_t; typedef rb_tree(arena_chunk_map_misc_t) arena_run_tree_t; -typedef qr(arena_chunk_map_misc_t) arena_chunk_miscelms_t; #endif /* JEMALLOC_ARENA_STRUCTS_A */ #ifdef JEMALLOC_ARENA_STRUCTS_B -- cgit v0.12 From 47701b22ee7c0df5e99efa0fcdcf98b9ff805b59 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 17 Feb 2015 22:23:10 -0800 Subject: arena_chunk_dirty_node_init() --> extent_node_dirty_linkage_init() --- include/jemalloc/internal/extent.h | 9 +++++++++ include/jemalloc/internal/private_symbols.txt | 1 + src/arena.c | 14 +++----------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 9ee1b44..2f99deb 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -77,6 +77,7 @@ void extent_node_achunk_set(extent_node_t *node, bool achunk); void extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx); void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size, bool zeroed); +void extent_node_dirty_linkage_init(extent_node_t *node); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_EXTENT_C_)) @@ -177,6 +178,14 @@ extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size, if (config_prof) extent_node_prof_tctx_set(node, NULL); } + +JEMALLOC_INLINE void +extent_node_dirty_linkage_init(extent_node_t *node) +{ + + qr_new(node, cd_link); + qr_new(&node->runs_dirty, rd_link); +} #endif #endif /* JEMALLOC_H_INLINES */ diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 8b55954..0a8654b 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -182,6 +182,7 @@ extent_node_addr_get extent_node_addr_set extent_node_arena_get extent_node_arena_set +extent_node_dirty_linkage_init extent_node_init extent_node_prof_tctx_get extent_node_prof_tctx_set diff --git a/src/arena.c b/src/arena.c index b068a4d..205f598 100644 --- a/src/arena.c +++ b/src/arena.c @@ -153,14 +153,6 @@ arena_chunk_dirty_npages(const extent_node_t *node) } static void -arena_chunk_dirty_node_init(extent_node_t *node) -{ - - qr_new(node, cd_link); - qr_new(&node->runs_dirty, rd_link); -} - -static void arena_chunk_dirty_insert(arena_chunk_map_misc_t *runs_dirty, extent_node_t *chunks_dirty, extent_node_t *node) { @@ -181,8 +173,8 @@ void arena_chunk_dirty_maybe_insert(arena_t *arena, extent_node_t *node, bool dirty) { - arena_chunk_dirty_node_init(node); if (dirty) { + extent_node_dirty_linkage_init(node); arena_chunk_dirty_insert(&arena->runs_dirty, &arena->chunks_dirty, node); arena->ndirty += arena_chunk_dirty_npages(node); @@ -1054,7 +1046,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, */ assert(tnode != NULL); extent_node_init(tnode, arena, addr, size, zeroed); - arena_chunk_dirty_node_init(tnode); + extent_node_dirty_linkage_init(tnode); /* Stash. 
*/ arena_chunk_dirty_insert(purge_runs_sentinel, purge_chunks_sentinel, tnode); @@ -1223,7 +1215,7 @@ arena_purge(arena_t *arena, bool all) npurge = arena_compute_npurge(arena, all); qr_new(&purge_runs_sentinel, rd_link); - arena_chunk_dirty_node_init(&purge_chunks_sentinel); + extent_node_dirty_linkage_init(&purge_chunks_sentinel); npurgeable = arena_stash_dirty(arena, all, npurge, &purge_runs_sentinel, &purge_chunks_sentinel); -- cgit v0.12 From 339c2b23b2d61993ac768afcc72af135662c6771 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 17 Feb 2015 22:25:56 -0800 Subject: Fix chunk_unmap() to propagate dirty state. Fix chunk_unmap() to propagate whether a chunk is dirty, and modify dirty chunk purging to record this information so it can be passed to chunk_unmap(). Since the broken version of chunk_unmap() claimed that all chunks were clean, this resulted in potential memory corruption for purging implementations that do not zero (e.g. MADV_FREE). This regression was introduced by ee41ad409a43d12900a5a3108f6c14f84e4eb0eb (Integrate whole chunks into unused dirty page purging machinery.). --- include/jemalloc/internal/chunk.h | 2 +- src/arena.c | 14 ++++++++++---- src/chunk.c | 6 +++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 96b9e15..8722dd0 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -47,7 +47,7 @@ void *chunk_alloc_default(void *new_addr, size_t size, size_t alignment, void chunk_record(arena_t *arena, extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool dirty, void *chunk, size_t size); bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind); -void chunk_unmap(arena_t *arena, void *chunk, size_t size); +void chunk_unmap(arena_t *arena, bool dirty, void *chunk, size_t size); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); diff --git a/src/arena.c b/src/arena.c index 205f598..3d38386 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1035,6 +1035,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, chunk = arena->chunk_alloc(addr, size, chunksize, &zero, arena->ind); assert(chunk == addr); + assert(zero == zeroed); /* * Create a temporary node to link into the ring of * stashed allocations. @@ -1075,7 +1076,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, /* Temporarily allocate the free dirty run. */ arena_run_split_large(arena, run, run_size, false); - /* Append to purge_runs for later processing. */ + /* Stash. */ if (false) qr_new(runselm, rd_link); /* Redundant. 
*/ else { @@ -1114,9 +1115,12 @@ arena_purge_stashed(arena_t *arena, arena_chunk_map_misc_t *purge_runs_sentinel, if (runselm == &chunkselm->runs_dirty) { size_t size = extent_node_size_get(chunkselm); + bool unzeroed; - pages_purge(extent_node_addr_get(chunkselm), size); npages = size >> LG_PAGE; + unzeroed = pages_purge(extent_node_addr_get(chunkselm), + size); + extent_node_zeroed_set(chunkselm, !unzeroed); chunkselm = qr_next(chunkselm, cd_link); } else { arena_chunk_t *chunk; @@ -1180,11 +1184,13 @@ arena_unstash_purged(arena_t *arena, if (runselm == &chunkselm->runs_dirty) { extent_node_t *chunkselm_next = qr_next(chunkselm, cd_link); + bool dirty = !extent_node_zeroed_get(chunkselm); + void *addr = extent_node_addr_get(chunkselm); + size_t size = extent_node_size_get(chunkselm); arena_chunk_dirty_remove(chunkselm); - chunk_unmap(arena, extent_node_addr_get(chunkselm), - extent_node_size_get(chunkselm)); arena_node_dalloc(arena, chunkselm); chunkselm = chunkselm_next; + chunk_unmap(arena, dirty, addr, size); } else { arena_run_t *run = &runselm->run; qr_remove(runselm, rd_link); diff --git a/src/chunk.c b/src/chunk.c index 59d72c9..774a978 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -377,7 +377,7 @@ chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) } void -chunk_unmap(arena_t *arena, void *chunk, size_t size) +chunk_unmap(arena_t *arena, bool dirty, void *chunk, size_t size) { assert(chunk != NULL); @@ -387,10 +387,10 @@ chunk_unmap(arena_t *arena, void *chunk, size_t size) if (have_dss && chunk_in_dss(chunk)) { chunk_record(arena, &arena->chunks_szad_dss, - &arena->chunks_ad_dss, false, chunk, size); + &arena->chunks_ad_dss, dirty, chunk, size); } else if (chunk_dalloc_mmap(chunk, size)) { chunk_record(arena, &arena->chunks_szad_mmap, - &arena->chunks_ad_mmap, false, chunk, size); + &arena->chunks_ad_mmap, dirty, chunk, size); } } -- cgit v0.12 From 738e089a2e707dbfc70286f7deeebc68e03d2347 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 18 Feb 2015 01:15:50 -0800 Subject: Rename "dirty chunks" to "cached chunks". Rename "dirty chunks" to "cached chunks", in order to avoid overloading the term "dirty". Fix the regression caused by 339c2b23b2d61993ac768afcc72af135662c6771 (Fix chunk_unmap() to propagate dirty state.), and actually address what that change attempted, which is to only purge chunks once, and propagate whether zeroed pages resulted into chunk_record(). --- include/jemalloc/internal/arena.h | 22 ++++----- include/jemalloc/internal/chunk.h | 5 +- include/jemalloc/internal/extent.h | 27 +++++++++-- include/jemalloc/internal/private_symbols.txt | 6 ++- src/arena.c | 70 ++++++++++----------------- src/chunk.c | 45 ++++++++--------- src/chunk_dss.c | 2 +- 7 files changed, 91 insertions(+), 86 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 0383f0c..3d79c62 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -318,14 +318,14 @@ struct arena_s { /* * Unused dirty memory this arena manages. Dirty memory is conceptually - * tracked as an arbitrarily interleaved LRU of runs and chunks, but the - * list linkage is actually semi-duplicated in order to avoid extra - * arena_chunk_map_misc_t space overhead. + * tracked as an arbitrarily interleaved LRU of dirty runs and cached + * chunks, but the list linkage is actually semi-duplicated in order to + * avoid extra arena_chunk_map_misc_t space overhead. 
* * LRU-----------------------------------------------------------MRU * * ______________ ___ ___ - * ...-->|chunks_dirty|<--------->|c|<-------------------->|c|<--... + * ...-->|chunks_cache|<--------->|c|<-------------------->|c|<--... * -------------- |h| |h| * ____________ _____ |u| _____ _____ |u| * ...-->|runs_dirty|<-->|run|<-->|n|<-->|run|<-->|run|<-->|n|<--... @@ -333,7 +333,7 @@ struct arena_s { * --- --- */ arena_chunk_map_misc_t runs_dirty; - extent_node_t chunks_dirty; + extent_node_t chunks_cache; /* Extant huge allocations. */ ql_head(extent_node_t) huge; @@ -347,8 +347,8 @@ struct arena_s { * orderings are needed, which is why there are two trees with the same * contents. */ - extent_tree_t chunks_szad_dirty; - extent_tree_t chunks_ad_dirty; + extent_tree_t chunks_szad_cache; + extent_tree_t chunks_ad_cache; extent_tree_t chunks_szad_mmap; extent_tree_t chunks_ad_mmap; extent_tree_t chunks_szad_dss; @@ -384,10 +384,10 @@ extern size_t arena_maxclass; /* Max size class for arenas. */ extern unsigned nlclasses; /* Number of large size classes. */ extern unsigned nhclasses; /* Number of huge size classes. */ -void arena_chunk_dirty_maybe_insert(arena_t *arena, extent_node_t *node, - bool dirty); -void arena_chunk_dirty_maybe_remove(arena_t *arena, extent_node_t *node, - bool dirty); +void arena_chunk_cache_maybe_insert(arena_t *arena, extent_node_t *node, + bool cache); +void arena_chunk_cache_maybe_remove(arena_t *arena, extent_node_t *node, + bool cache); extent_node_t *arena_node_alloc(arena_t *arena); void arena_node_dalloc(arena_t *arena, extent_node_t *node); void *arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 8722dd0..bf6acbd 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -45,9 +45,10 @@ void *chunk_alloc_arena(chunk_alloc_t *chunk_alloc, void *chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind); void chunk_record(arena_t *arena, extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, bool dirty, void *chunk, size_t size); + extent_tree_t *chunks_ad, bool cache, void *chunk, size_t size, + bool zeroed); bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind); -void chunk_unmap(arena_t *arena, bool dirty, void *chunk, size_t size); +void chunk_unmap(arena_t *arena, void *chunk, size_t size, bool zeroed); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 2f99deb..81ff40b 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -33,9 +33,9 @@ struct extent_node_s { /* Profile counters, used for huge objects. */ prof_tctx_t *en_prof_tctx; - /* Linkage for arena's runs_dirty and chunks_dirty rings. */ - qr(extent_node_t) cd_link; + /* Linkage for arena's runs_dirty and chunks_cache rings. */ arena_chunk_map_misc_t runs_dirty; + qr(extent_node_t) cc_link; union { /* Linkage for the size/address-ordered tree. 
*/ @@ -78,6 +78,9 @@ void extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx); void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size, bool zeroed); void extent_node_dirty_linkage_init(extent_node_t *node); +void extent_node_dirty_insert(extent_node_t *node, + arena_chunk_map_misc_t *runs_dirty, extent_node_t *chunks_dirty); +void extent_node_dirty_remove(extent_node_t *node); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_EXTENT_C_)) @@ -183,9 +186,27 @@ JEMALLOC_INLINE void extent_node_dirty_linkage_init(extent_node_t *node) { - qr_new(node, cd_link); qr_new(&node->runs_dirty, rd_link); + qr_new(node, cc_link); } + +JEMALLOC_INLINE void +extent_node_dirty_insert(extent_node_t *node, + arena_chunk_map_misc_t *runs_dirty, extent_node_t *chunks_dirty) +{ + + qr_meld(runs_dirty, &node->runs_dirty, rd_link); + qr_meld(chunks_dirty, node, cc_link); +} + +JEMALLOC_INLINE void +extent_node_dirty_remove(extent_node_t *node) +{ + + qr_remove(&node->runs_dirty, rd_link); + qr_remove(node, cc_link); +} + #endif #endif /* JEMALLOC_H_INLINES */ diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 0a8654b..dfe62ce 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -12,9 +12,9 @@ arena_boot arena_choose arena_choose_hard arena_chunk_alloc_huge +arena_chunk_cache_maybe_insert +arena_chunk_cache_maybe_remove arena_chunk_dalloc_huge -arena_chunk_dirty_maybe_insert -arena_chunk_dirty_maybe_remove arena_chunk_ralloc_huge_expand arena_chunk_ralloc_huge_shrink arena_chunk_ralloc_huge_similar @@ -182,7 +182,9 @@ extent_node_addr_get extent_node_addr_set extent_node_arena_get extent_node_arena_set +extent_node_dirty_insert extent_node_dirty_linkage_init +extent_node_dirty_remove extent_node_init extent_node_prof_tctx_get extent_node_prof_tctx_set diff --git a/src/arena.c b/src/arena.c index 3d38386..762b818 100644 --- a/src/arena.c +++ b/src/arena.c @@ -152,41 +152,24 @@ arena_chunk_dirty_npages(const extent_node_t *node) return (extent_node_size_get(node) >> LG_PAGE); } -static void -arena_chunk_dirty_insert(arena_chunk_map_misc_t *runs_dirty, - extent_node_t *chunks_dirty, extent_node_t *node) -{ - - qr_meld(chunks_dirty, node, cd_link); - qr_meld(runs_dirty, &node->runs_dirty, rd_link); -} - -static void -arena_chunk_dirty_remove(extent_node_t *node) -{ - - qr_remove(node, cd_link); - qr_remove(&node->runs_dirty, rd_link); -} - void -arena_chunk_dirty_maybe_insert(arena_t *arena, extent_node_t *node, bool dirty) +arena_chunk_cache_maybe_insert(arena_t *arena, extent_node_t *node, bool cache) { - if (dirty) { + if (cache) { extent_node_dirty_linkage_init(node); - arena_chunk_dirty_insert(&arena->runs_dirty, - &arena->chunks_dirty, node); + extent_node_dirty_insert(node, &arena->runs_dirty, + &arena->chunks_cache); arena->ndirty += arena_chunk_dirty_npages(node); } } void -arena_chunk_dirty_maybe_remove(arena_t *arena, extent_node_t *node, bool dirty) +arena_chunk_cache_maybe_remove(arena_t *arena, extent_node_t *node, bool dirty) { if (dirty) { - arena_chunk_dirty_remove(node); + extent_node_dirty_remove(node); assert(arena->ndirty >= arena_chunk_dirty_npages(node)); arena->ndirty -= arena_chunk_dirty_npages(node); } @@ -954,14 +937,14 @@ arena_dirty_count(arena_t *arena) extent_node_t *chunkselm; for (runselm = qr_next(&arena->runs_dirty, rd_link), - chunkselm = qr_next(&arena->chunks_dirty, cd_link); + chunkselm = 
qr_next(&arena->chunks_cache, cc_link); runselm != &arena->runs_dirty; runselm = qr_next(runselm, rd_link)) { size_t npages; if (runselm == &chunkselm->runs_dirty) { npages = extent_node_size_get(chunkselm) >> LG_PAGE; - chunkselm = qr_next(chunkselm, cd_link); + chunkselm = qr_next(chunkselm, cc_link); } else { arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(runselm); @@ -1010,7 +993,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, /* Stash at least npurge pages. */ for (runselm = qr_next(&arena->runs_dirty, rd_link), - chunkselm = qr_next(&arena->chunks_dirty, cd_link); + chunkselm = qr_next(&arena->chunks_cache, cc_link); runselm != &arena->runs_dirty; runselm = runselm_next) { size_t npages; runselm_next = qr_next(runselm, rd_link); @@ -1022,7 +1005,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, bool zeroed, zero; UNUSED void *chunk; - chunkselm_next = qr_next(chunkselm, cd_link); + chunkselm_next = qr_next(chunkselm, cc_link); /* * Cache contents of chunkselm prior to it being * destroyed as a side effect of allocating the chunk. @@ -1038,19 +1021,16 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, assert(zero == zeroed); /* * Create a temporary node to link into the ring of - * stashed allocations. + * stashed allocations. OOM shouldn't be possible + * because chunk allocation just cached a node. */ tnode = arena_node_alloc(arena); - /* - * OOM shouldn't be possible because chunk allocation - * just cached a node. - */ assert(tnode != NULL); + /* Stash. */ extent_node_init(tnode, arena, addr, size, zeroed); extent_node_dirty_linkage_init(tnode); - /* Stash. */ - arena_chunk_dirty_insert(purge_runs_sentinel, - purge_chunks_sentinel, tnode); + extent_node_dirty_insert(tnode, purge_runs_sentinel, + purge_chunks_sentinel); npages = size >> LG_PAGE; chunkselm = chunkselm_next; } else { @@ -1108,7 +1088,7 @@ arena_purge_stashed(arena_t *arena, arena_chunk_map_misc_t *purge_runs_sentinel, malloc_mutex_unlock(&arena->lock); for (runselm = qr_next(purge_runs_sentinel, rd_link), - chunkselm = qr_next(purge_chunks_sentinel, cd_link); + chunkselm = qr_next(purge_chunks_sentinel, cc_link); runselm != purge_runs_sentinel; runselm = qr_next(runselm, rd_link)) { size_t npages; @@ -1121,7 +1101,7 @@ arena_purge_stashed(arena_t *arena, arena_chunk_map_misc_t *purge_runs_sentinel, unzeroed = pages_purge(extent_node_addr_get(chunkselm), size); extent_node_zeroed_set(chunkselm, !unzeroed); - chunkselm = qr_next(chunkselm, cd_link); + chunkselm = qr_next(chunkselm, cc_link); } else { arena_chunk_t *chunk; size_t pageind, run_size, flag_unzeroed, i; @@ -1178,19 +1158,19 @@ arena_unstash_purged(arena_t *arena, /* Deallocate runs. 
*/ for (runselm = qr_next(purge_runs_sentinel, rd_link), - chunkselm = qr_next(purge_chunks_sentinel, cd_link); + chunkselm = qr_next(purge_chunks_sentinel, cc_link); runselm != purge_runs_sentinel; runselm = runselm_next) { runselm_next = qr_next(runselm, rd_link); if (runselm == &chunkselm->runs_dirty) { extent_node_t *chunkselm_next = qr_next(chunkselm, - cd_link); - bool dirty = !extent_node_zeroed_get(chunkselm); + cc_link); void *addr = extent_node_addr_get(chunkselm); size_t size = extent_node_size_get(chunkselm); - arena_chunk_dirty_remove(chunkselm); + bool zeroed = extent_node_zeroed_get(chunkselm); + extent_node_dirty_remove(chunkselm); arena_node_dalloc(arena, chunkselm); chunkselm = chunkselm_next; - chunk_unmap(arena, dirty, addr, size); + chunk_unmap(arena, addr, size, zeroed); } else { arena_run_t *run = &runselm->run; qr_remove(runselm, rd_link); @@ -2612,14 +2592,14 @@ arena_new(unsigned ind) arena_avail_tree_new(&arena->runs_avail); qr_new(&arena->runs_dirty, rd_link); - qr_new(&arena->chunks_dirty, cd_link); + qr_new(&arena->chunks_cache, cc_link); ql_new(&arena->huge); if (malloc_mutex_init(&arena->huge_mtx)) return (NULL); - extent_tree_szad_new(&arena->chunks_szad_dirty); - extent_tree_ad_new(&arena->chunks_ad_dirty); + extent_tree_szad_new(&arena->chunks_szad_cache); + extent_tree_ad_new(&arena->chunks_ad_cache); extent_tree_szad_new(&arena->chunks_szad_mmap); extent_tree_ad_new(&arena->chunks_ad_mmap); extent_tree_szad_new(&arena->chunks_szad_dss); diff --git a/src/chunk.c b/src/chunk.c index 774a978..264e4f2 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -64,7 +64,7 @@ chunk_deregister(const void *chunk, const extent_node_t *node) static void * chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, bool dirty, void *new_addr, size_t size, + extent_tree_t *chunks_ad, bool cache, void *new_addr, size_t size, size_t alignment, bool *zero) { void *ret; @@ -100,13 +100,13 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, /* Remove node from the tree. */ extent_tree_szad_remove(chunks_szad, node); extent_tree_ad_remove(chunks_ad, node); - arena_chunk_dirty_maybe_remove(arena, node, dirty); + arena_chunk_cache_maybe_remove(arena, node, cache); if (leadsize != 0) { /* Insert the leading space as a smaller chunk. */ extent_node_size_set(node, leadsize); extent_tree_szad_insert(chunks_szad, node); extent_tree_ad_insert(chunks_ad, node); - arena_chunk_dirty_maybe_insert(arena, node, dirty); + arena_chunk_cache_maybe_insert(arena, node, cache); node = NULL; } if (trailsize != 0) { @@ -116,7 +116,7 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, if (node == NULL) { malloc_mutex_unlock(&arena->chunks_mtx); chunk_record(arena, chunks_szad, chunks_ad, - dirty, ret, size); + cache, ret, size, zeroed); return (NULL); } } @@ -124,7 +124,7 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, trailsize, zeroed); extent_tree_szad_insert(chunks_szad, node); extent_tree_ad_insert(chunks_ad, node); - arena_chunk_dirty_maybe_insert(arena, node, dirty); + arena_chunk_cache_maybe_insert(arena, node, cache); node = NULL; } malloc_mutex_unlock(&arena->chunks_mtx); @@ -177,9 +177,9 @@ chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, assert(alignment != 0); assert((alignment & chunksize_mask) == 0); - /* dirty. */ - if ((ret = chunk_recycle(arena, &arena->chunks_szad_dirty, - &arena->chunks_ad_dirty, true, new_addr, size, alignment, zero)) != + /* cache. 
*/ + if ((ret = chunk_recycle(arena, &arena->chunks_szad_cache, + &arena->chunks_ad_cache, true, new_addr, size, alignment, zero)) != NULL) return (ret); /* "primary" dss. */ @@ -276,13 +276,14 @@ chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, void chunk_record(arena_t *arena, extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, bool dirty, void *chunk, size_t size) + extent_tree_t *chunks_ad, bool cache, void *chunk, size_t size, bool zeroed) { bool unzeroed; extent_node_t *node, *prev; extent_node_t key; - unzeroed = dirty ? true : pages_purge(chunk, size); + assert(!cache || !zeroed); + unzeroed = cache || !zeroed; JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); malloc_mutex_lock(&arena->chunks_mtx); @@ -298,13 +299,13 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, * remove/insert from/into chunks_szad. */ extent_tree_szad_remove(chunks_szad, node); - arena_chunk_dirty_maybe_remove(arena, node, dirty); + arena_chunk_cache_maybe_remove(arena, node, cache); extent_node_addr_set(node, chunk); extent_node_size_set(node, size + extent_node_size_get(node)); extent_node_zeroed_set(node, extent_node_zeroed_get(node) && !unzeroed); extent_tree_szad_insert(chunks_szad, node); - arena_chunk_dirty_maybe_insert(arena, node, dirty); + arena_chunk_cache_maybe_insert(arena, node, cache); } else { /* Coalescing forward failed, so insert a new node. */ node = arena_node_alloc(arena); @@ -315,14 +316,14 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, * pages have already been purged, so that this is only * a virtual memory leak. */ - if (dirty) + if (cache) pages_purge(chunk, size); goto label_return; } extent_node_init(node, arena, chunk, size, !unzeroed); extent_tree_ad_insert(chunks_ad, node); extent_tree_szad_insert(chunks_szad, node); - arena_chunk_dirty_maybe_insert(arena, node, dirty); + arena_chunk_cache_maybe_insert(arena, node, cache); } /* Try to coalesce backward. */ @@ -336,16 +337,16 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, */ extent_tree_szad_remove(chunks_szad, prev); extent_tree_ad_remove(chunks_ad, prev); - arena_chunk_dirty_maybe_remove(arena, prev, dirty); + arena_chunk_cache_maybe_remove(arena, prev, cache); extent_tree_szad_remove(chunks_szad, node); - arena_chunk_dirty_maybe_remove(arena, node, dirty); + arena_chunk_cache_maybe_remove(arena, node, cache); extent_node_addr_set(node, extent_node_addr_get(prev)); extent_node_size_set(node, extent_node_size_get(prev) + extent_node_size_get(node)); extent_node_zeroed_set(node, extent_node_zeroed_get(prev) && extent_node_zeroed_get(node)); extent_tree_szad_insert(chunks_szad, node); - arena_chunk_dirty_maybe_insert(arena, node, dirty); + arena_chunk_cache_maybe_insert(arena, node, cache); arena_node_dalloc(arena, prev); } @@ -363,8 +364,8 @@ chunk_cache(arena_t *arena, void *chunk, size_t size) assert(size != 0); assert((size & chunksize_mask) == 0); - chunk_record(arena, &arena->chunks_szad_dirty, &arena->chunks_ad_dirty, - true, chunk, size); + chunk_record(arena, &arena->chunks_szad_cache, &arena->chunks_ad_cache, + true, chunk, size, false); } /* Default arena chunk deallocation routine in the absence of user override. 
*/ @@ -377,7 +378,7 @@ chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) } void -chunk_unmap(arena_t *arena, bool dirty, void *chunk, size_t size) +chunk_unmap(arena_t *arena, void *chunk, size_t size, bool zeroed) { assert(chunk != NULL); @@ -387,10 +388,10 @@ chunk_unmap(arena_t *arena, bool dirty, void *chunk, size_t size) if (have_dss && chunk_in_dss(chunk)) { chunk_record(arena, &arena->chunks_szad_dss, - &arena->chunks_ad_dss, dirty, chunk, size); + &arena->chunks_ad_dss, false, chunk, size, zeroed); } else if (chunk_dalloc_mmap(chunk, size)) { chunk_record(arena, &arena->chunks_szad_mmap, - &arena->chunks_ad_mmap, dirty, chunk, size); + &arena->chunks_ad_mmap, false, chunk, size, zeroed); } } diff --git a/src/chunk_dss.c b/src/chunk_dss.c index c3c4848..6fbe31b 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -137,7 +137,7 @@ chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, size_t alignment, chunk_record(arena, &arena->chunks_szad_dss, &arena->chunks_ad_dss, false, cpad, - cpad_size); + cpad_size, false); } if (*zero) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( -- cgit v0.12 From 99bd94fb65a0b6423c4efcc3e3e501179b92a4db Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 18 Feb 2015 16:40:53 -0800 Subject: Fix chunk cache races. These regressions were introduced by ee41ad409a43d12900a5a3108f6c14f84e4eb0eb (Integrate whole chunks into unused dirty page purging machinery.). --- include/jemalloc/internal/arena.h | 1 + include/jemalloc/internal/chunk.h | 13 +- include/jemalloc/internal/private_symbols.txt | 8 +- src/arena.c | 256 ++++++++++++++++---------- src/chunk.c | 114 ++++++++---- 5 files changed, 258 insertions(+), 134 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 3d79c62..42086ca 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -399,6 +399,7 @@ void arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize, size_t usize); bool arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, size_t usize, bool *zero); +void arena_maybe_purge(arena_t *arena); void arena_purge_all(arena_t *arena); void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, index_t binind, uint64_t prof_accumbytes); diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index bf6acbd..1a968a5 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -39,16 +39,21 @@ extern size_t chunk_npages; bool chunk_register(const void *chunk, const extent_node_t *node); void chunk_deregister(const void *chunk, const extent_node_t *node); void *chunk_alloc_base(size_t size); -void *chunk_alloc_arena(chunk_alloc_t *chunk_alloc, - chunk_dalloc_t *chunk_dalloc, unsigned arena_ind, void *new_addr, - size_t size, size_t alignment, bool *zero); +void *chunk_alloc_cache(arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero, bool dalloc_node); void *chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind); +void *chunk_alloc_wrapper(arena_t *arena, chunk_alloc_t *chunk_alloc, + void *new_addr, size_t size, size_t alignment, bool *zero); void chunk_record(arena_t *arena, extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, void *chunk, size_t size, bool zeroed); +void chunk_dalloc_cache(arena_t *arena, void *chunk, size_t size); +void chunk_dalloc_arena(arena_t *arena, void *chunk, size_t size, + bool zeroed); bool 
chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind); -void chunk_unmap(arena_t *arena, void *chunk, size_t size, bool zeroed); +void chunk_dalloc_wrapper(arena_t *arena, chunk_dalloc_t *chunk_dalloc, + void *chunk, size_t size); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index dfe62ce..7c217c7 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -53,6 +53,7 @@ arena_mapbitsp_read arena_mapbitsp_write arena_maxclass arena_maxrun +arena_maybe_purge arena_metadata_allocated_add arena_metadata_allocated_get arena_metadata_allocated_sub @@ -124,14 +125,18 @@ bootstrap_free bootstrap_malloc bt_init buferror -chunk_alloc_arena +chunk_alloc_cache chunk_alloc_base chunk_alloc_default chunk_alloc_dss chunk_alloc_mmap +chunk_alloc_wrapper chunk_boot +chunk_dalloc_arena +chunk_dalloc_cache chunk_dalloc_default chunk_dalloc_mmap +chunk_dalloc_wrapper chunk_deregister chunk_dss_boot chunk_dss_postfork_child @@ -147,7 +152,6 @@ chunk_postfork_parent chunk_prefork chunk_record chunk_register -chunk_unmap chunks_rtree chunksize chunksize_mask diff --git a/src/arena.c b/src/arena.c index 762b818..78aa1ae 100644 --- a/src/arena.c +++ b/src/arena.c @@ -20,7 +20,6 @@ unsigned nhclasses; /* Number of huge size classes. */ * definition. */ -static void arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk); static void arena_purge(arena_t *arena, bool all); static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned); @@ -427,27 +426,53 @@ arena_chunk_init_spare(arena_t *arena) return (chunk); } +static bool +arena_chunk_register(arena_t *arena, arena_chunk_t *chunk, bool zero) +{ + + extent_node_init(&chunk->node, arena, chunk, chunksize, zero); + extent_node_achunk_set(&chunk->node, true); + return (chunk_register(chunk, &chunk->node)); +} + static arena_chunk_t * -arena_chunk_alloc_internal(arena_t *arena, bool *zero) +arena_chunk_alloc_internal_hard(arena_t *arena, bool *zero) { arena_chunk_t *chunk; - chunk_alloc_t *chunk_alloc; - chunk_dalloc_t *chunk_dalloc; + chunk_alloc_t *chunk_alloc = arena->chunk_alloc; + chunk_dalloc_t *chunk_dalloc = arena->chunk_dalloc; - chunk_alloc = arena->chunk_alloc; - chunk_dalloc = arena->chunk_dalloc; malloc_mutex_unlock(&arena->lock); - chunk = (arena_chunk_t *)chunk_alloc_arena(chunk_alloc, chunk_dalloc, - arena->ind, NULL, chunksize, chunksize, zero); - if (chunk != NULL) { - extent_node_init(&chunk->node, arena, chunk, chunksize, *zero); - extent_node_achunk_set(&chunk->node, true); - if (chunk_register(chunk, &chunk->node)) { - chunk_dalloc((void *)chunk, chunksize, arena->ind); - chunk = NULL; - } + chunk = (arena_chunk_t *)chunk_alloc_wrapper(arena, chunk_alloc, NULL, + chunksize, chunksize, zero); + if (chunk != NULL && arena_chunk_register(arena, chunk, *zero)) { + chunk_dalloc_wrapper(arena, chunk_dalloc, (void *)chunk, + chunksize); + chunk = NULL; } malloc_mutex_lock(&arena->lock); + + return (chunk); +} + +static arena_chunk_t * +arena_chunk_alloc_internal(arena_t *arena, bool *zero) +{ + arena_chunk_t *chunk; + + if (likely(arena->chunk_alloc == chunk_alloc_default)) { + chunk = chunk_alloc_cache(arena, NULL, chunksize, chunksize, + zero, true); + if (chunk != NULL && arena_chunk_register(arena, chunk, + *zero)) { + chunk_dalloc_cache(arena, chunk, chunksize); + return (NULL); + } + } else + chunk = NULL; + if 
(chunk == NULL) + chunk = arena_chunk_alloc_internal_hard(arena, zero); + if (config_stats && chunk != NULL) { arena->stats.mapped += chunksize; arena->stats.metadata_mapped += (map_bias << LG_PAGE); @@ -553,11 +578,19 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) arena_run_dirty_remove(arena, spare, map_bias, chunk_npages-map_bias); } - chunk_dalloc = arena->chunk_dalloc; - malloc_mutex_unlock(&arena->lock); + chunk_deregister(spare, &spare->node); - chunk_dalloc((void *)spare, chunksize, arena->ind); - malloc_mutex_lock(&arena->lock); + + chunk_dalloc = arena->chunk_dalloc; + if (likely(chunk_dalloc == chunk_dalloc_default)) + chunk_dalloc_cache(arena, (void *)spare, chunksize); + else { + malloc_mutex_unlock(&arena->lock); + chunk_dalloc_wrapper(arena, chunk_dalloc, (void *)spare, + chunksize); + malloc_mutex_lock(&arena->lock); + } + if (config_stats) { arena->stats.mapped -= chunksize; arena->stats.metadata_mapped -= (map_bias << LG_PAGE); @@ -661,43 +694,59 @@ arena_node_dalloc(arena_t *arena, extent_node_t *node) malloc_mutex_unlock(&arena->node_cache_mtx); } +static void * +arena_chunk_alloc_huge_hard(arena_t *arena, chunk_alloc_t *chunk_alloc, + size_t usize, size_t alignment, bool *zero, size_t csize) +{ + void *ret; + + ret = chunk_alloc_wrapper(arena, chunk_alloc, NULL, csize, alignment, + zero); + if (ret == NULL) { + /* Revert optimistic stats updates. */ + malloc_mutex_lock(&arena->lock); + if (config_stats) { + arena_huge_malloc_stats_update_undo(arena, usize); + arena->stats.mapped -= usize; + } + arena->nactive -= (usize >> LG_PAGE); + malloc_mutex_unlock(&arena->lock); + } + + return (ret); +} + void * arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, bool *zero) { void *ret; chunk_alloc_t *chunk_alloc; - chunk_dalloc_t *chunk_dalloc; size_t csize = CHUNK_CEILING(usize); malloc_mutex_lock(&arena->lock); - chunk_alloc = arena->chunk_alloc; - chunk_dalloc = arena->chunk_dalloc; + + /* Optimistically update stats. */ if (config_stats) { - /* Optimistically update stats prior to unlocking. */ arena_huge_malloc_stats_update(arena, usize); arena->stats.mapped += usize; } arena->nactive += (usize >> LG_PAGE); - malloc_mutex_unlock(&arena->lock); - ret = chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, NULL, - csize, alignment, zero); + chunk_alloc = arena->chunk_alloc; + if (likely(chunk_alloc == chunk_alloc_default)) { + ret = chunk_alloc_cache(arena, NULL, csize, alignment, zero, + true); + } else + ret = NULL; + malloc_mutex_unlock(&arena->lock); if (ret == NULL) { - /* Revert optimistic stats updates. 
*/ - malloc_mutex_lock(&arena->lock); - if (config_stats) { - arena_huge_malloc_stats_update_undo(arena, usize); - arena->stats.mapped -= usize; - } - arena->nactive -= (usize >> LG_PAGE); - malloc_mutex_unlock(&arena->lock); - return (NULL); + ret = arena_chunk_alloc_huge_hard(arena, chunk_alloc, usize, + alignment, zero, csize); } - if (config_stats) + if (config_stats && ret != NULL) stats_cactive_add(usize); - return (ret); } @@ -705,7 +754,9 @@ void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize) { chunk_dalloc_t *chunk_dalloc; + size_t csize; + csize = CHUNK_CEILING(usize); malloc_mutex_lock(&arena->lock); chunk_dalloc = arena->chunk_dalloc; if (config_stats) { @@ -714,8 +765,14 @@ arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize) stats_cactive_sub(usize); } arena->nactive -= (usize >> LG_PAGE); - malloc_mutex_unlock(&arena->lock); - chunk_dalloc(chunk, CHUNK_CEILING(usize), arena->ind); + + if (likely(chunk_dalloc == chunk_dalloc_default)) { + chunk_dalloc_cache(arena, chunk, csize); + malloc_mutex_unlock(&arena->lock); + } else { + malloc_mutex_unlock(&arena->lock); + chunk_dalloc_wrapper(arena, chunk_dalloc, chunk, csize); + } } void @@ -747,12 +804,10 @@ void arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize, size_t usize) { - chunk_dalloc_t *chunk_dalloc; size_t udiff = oldsize - usize; size_t cdiff = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize); malloc_mutex_lock(&arena->lock); - chunk_dalloc = arena->chunk_dalloc; if (config_stats) { arena_huge_ralloc_stats_update(arena, oldsize, usize); if (cdiff != 0) { @@ -761,52 +816,81 @@ arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize, } } arena->nactive -= udiff >> LG_PAGE; - malloc_mutex_unlock(&arena->lock); + if (cdiff != 0) { - chunk_dalloc((void *)((uintptr_t)chunk + CHUNK_CEILING(usize)), - cdiff, arena->ind); + chunk_dalloc_t *chunk_dalloc = arena->chunk_dalloc; + void *nchunk = (void *)((uintptr_t)chunk + + CHUNK_CEILING(usize)); + + if (likely(chunk_dalloc == chunk_dalloc_default)) { + chunk_dalloc_cache(arena, nchunk, cdiff); + malloc_mutex_unlock(&arena->lock); + } else { + malloc_mutex_unlock(&arena->lock); + chunk_dalloc_wrapper(arena, chunk_dalloc, nchunk, + cdiff); + } + } else + malloc_mutex_unlock(&arena->lock); +} + +bool +arena_chunk_ralloc_huge_expand_hard(arena_t *arena, chunk_alloc_t *chunk_alloc, + size_t oldsize, size_t usize, bool *zero, void *nchunk, size_t udiff, + size_t cdiff) +{ + bool err; + + err = (chunk_alloc_wrapper(arena, chunk_alloc, nchunk, cdiff, chunksize, + zero) == NULL); + if (err) { + /* Revert optimistic stats updates. */ + malloc_mutex_lock(&arena->lock); + if (config_stats) { + arena_huge_ralloc_stats_update_undo(arena, oldsize, + usize); + arena->stats.mapped -= cdiff; + } + arena->nactive -= (udiff >> LG_PAGE); + malloc_mutex_unlock(&arena->lock); } + return (err); } bool arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, size_t usize, bool *zero) { + bool err; chunk_alloc_t *chunk_alloc; - chunk_dalloc_t *chunk_dalloc; + void *nchunk = (void *)((uintptr_t)chunk + CHUNK_CEILING(oldsize)); size_t udiff = usize - oldsize; size_t cdiff = CHUNK_CEILING(usize) - CHUNK_CEILING(oldsize); malloc_mutex_lock(&arena->lock); - chunk_alloc = arena->chunk_alloc; - chunk_dalloc = arena->chunk_dalloc; + + /* Optimistically update stats. */ if (config_stats) { - /* Optimistically update stats prior to unlocking. 
*/ arena_huge_ralloc_stats_update(arena, oldsize, usize); arena->stats.mapped += cdiff; } arena->nactive += (udiff >> LG_PAGE); - malloc_mutex_unlock(&arena->lock); - if (chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, - (void *)((uintptr_t)chunk + CHUNK_CEILING(oldsize)), cdiff, - chunksize, zero) == NULL) { - /* Revert optimistic stats updates. */ - malloc_mutex_lock(&arena->lock); - if (config_stats) { - arena_huge_ralloc_stats_update_undo(arena, - oldsize, usize); - arena->stats.mapped -= cdiff; - } - arena->nactive -= (udiff >> LG_PAGE); - malloc_mutex_unlock(&arena->lock); - return (true); + chunk_alloc = arena->chunk_alloc; + if (likely(chunk_alloc == chunk_alloc_default)) { + err = (chunk_alloc_cache(arena, nchunk, cdiff, chunksize, zero, + true) == NULL); + } else + err = true; + malloc_mutex_unlock(&arena->lock); + if (err) { + err = arena_chunk_ralloc_huge_expand_hard(arena, chunk_alloc, + oldsize, usize, zero, nchunk, udiff, cdiff); } - if (config_stats) + if (config_stats && !err) stats_cactive_add(udiff); - - return (false); + return (err); } static arena_run_t * @@ -909,7 +993,7 @@ arena_run_alloc_small(arena_t *arena, size_t size, index_t binind) return (arena_run_alloc_small_helper(arena, size, binind)); } -JEMALLOC_INLINE_C void +void arena_maybe_purge(arena_t *arena) { size_t threshold; @@ -999,39 +1083,25 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, runselm_next = qr_next(runselm, rd_link); if (runselm == &chunkselm->runs_dirty) { - extent_node_t *chunkselm_next, *tnode; - void *addr; - size_t size; - bool zeroed, zero; + extent_node_t *chunkselm_next; + bool zero; UNUSED void *chunk; chunkselm_next = qr_next(chunkselm, cc_link); /* - * Cache contents of chunkselm prior to it being - * destroyed as a side effect of allocating the chunk. + * Allocate. chunkselm remains valid due to the + * dalloc_node=false argument to chunk_alloc_cache(). */ - addr = extent_node_addr_get(chunkselm); - size = extent_node_size_get(chunkselm); - zeroed = extent_node_zeroed_get(chunkselm); - /* Allocate. */ zero = false; - chunk = arena->chunk_alloc(addr, size, chunksize, &zero, - arena->ind); - assert(chunk == addr); - assert(zero == zeroed); - /* - * Create a temporary node to link into the ring of - * stashed allocations. OOM shouldn't be possible - * because chunk allocation just cached a node. - */ - tnode = arena_node_alloc(arena); - assert(tnode != NULL); - /* Stash. 
*/ - extent_node_init(tnode, arena, addr, size, zeroed); - extent_node_dirty_linkage_init(tnode); - extent_node_dirty_insert(tnode, purge_runs_sentinel, + chunk = chunk_alloc_cache(arena, + extent_node_addr_get(chunkselm), + extent_node_size_get(chunkselm), chunksize, &zero, + false); + assert(chunk == extent_node_addr_get(chunkselm)); + assert(zero == extent_node_zeroed_get(chunkselm)); + extent_node_dirty_insert(chunkselm, purge_runs_sentinel, purge_chunks_sentinel); - npages = size >> LG_PAGE; + npages = extent_node_size_get(chunkselm) >> LG_PAGE; chunkselm = chunkselm_next; } else { arena_chunk_t *chunk = @@ -1170,7 +1240,7 @@ arena_unstash_purged(arena_t *arena, extent_node_dirty_remove(chunkselm); arena_node_dalloc(arena, chunkselm); chunkselm = chunkselm_next; - chunk_unmap(arena, addr, size, zeroed); + chunk_dalloc_arena(arena, addr, size, zeroed); } else { arena_run_t *run = &runselm->run; qr_remove(runselm, rd_link); diff --git a/src/chunk.c b/src/chunk.c index 264e4f2..08f21f6 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -65,7 +65,7 @@ chunk_deregister(const void *chunk, const extent_node_t *node) static void * chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, void *new_addr, size_t size, - size_t alignment, bool *zero) + size_t alignment, bool *zero, bool dalloc_node) { void *ret; extent_node_t *node; @@ -74,6 +74,7 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, bool zeroed; assert(new_addr == NULL || alignment == chunksize); + assert(dalloc_node || new_addr != NULL); alloc_size = size + alignment - chunksize; /* Beware size_t wrap-around. */ @@ -129,7 +130,8 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, } malloc_mutex_unlock(&arena->chunks_mtx); - if (node != NULL) + assert(!dalloc_node || node != NULL); + if (dalloc_node && node != NULL) arena_node_dalloc(arena, node); if (*zero) { if (!zeroed) @@ -153,8 +155,8 @@ chunk_alloc_core_dss(arena_t *arena, void *new_addr, size_t size, void *ret; if ((ret = chunk_recycle(arena, &arena->chunks_szad_dss, - &arena->chunks_ad_dss, false, new_addr, size, alignment, zero)) != - NULL) + &arena->chunks_ad_dss, false, new_addr, size, alignment, zero, + true)) != NULL) return (ret); ret = chunk_alloc_dss(arena, new_addr, size, alignment, zero); return (ret); @@ -177,11 +179,6 @@ chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, assert(alignment != 0); assert((alignment & chunksize_mask) == 0); - /* cache. */ - if ((ret = chunk_recycle(arena, &arena->chunks_szad_cache, - &arena->chunks_ad_cache, true, new_addr, size, alignment, zero)) != - NULL) - return (ret); /* "primary" dss. */ if (have_dss && dss_prec == dss_prec_primary && (ret = chunk_alloc_core_dss(arena, new_addr, size, alignment, zero)) != @@ -190,7 +187,7 @@ chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, /* mmap. 
*/ if (!config_munmap && (ret = chunk_recycle(arena, &arena->chunks_szad_mmap, &arena->chunks_ad_mmap, false, new_addr, - size, alignment, zero)) != NULL) + size, alignment, zero, true)) != NULL) return (ret); /* * Requesting an address is not implemented for chunk_alloc_mmap(), so @@ -231,19 +228,18 @@ chunk_alloc_base(size_t size) } void * -chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc, - unsigned arena_ind, void *new_addr, size_t size, size_t alignment, - bool *zero) +chunk_alloc_cache(arena_t *arena, void *new_addr, size_t size, size_t alignment, + bool *zero, bool dalloc_node) { - void *ret; - ret = chunk_alloc(new_addr, size, alignment, zero, arena_ind); - if (ret == NULL) - return (NULL); - if (config_valgrind) - JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); + assert(size != 0); + assert((size & chunksize_mask) == 0); + assert(alignment != 0); + assert((alignment & chunksize_mask) == 0); - return (ret); + return (chunk_recycle(arena, &arena->chunks_szad_cache, + &arena->chunks_ad_cache, true, new_addr, size, alignment, zero, + dalloc_node)); } static arena_t * @@ -262,7 +258,27 @@ chunk_arena_get(unsigned arena_ind) return (arena); } -/* Default arena chunk allocation routine in the absence of user override. */ +static void * +chunk_alloc_arena(arena_t *arena, void *new_addr, size_t size, size_t alignment, + bool *zero) +{ + void *ret; + + ret = chunk_alloc_core(arena, new_addr, size, alignment, zero, + arena->dss_prec); + if (ret == NULL) + return (NULL); + if (config_valgrind) + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); + + return (ret); +} + +/* + * Default arena chunk allocation routine in the absence of user override. This + * function isn't actually used by jemalloc, but it does the right thing if the + * application passes calls through to it during chunk allocation. + */ void * chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind) @@ -270,8 +286,21 @@ chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, arena_t *arena; arena = chunk_arena_get(arena_ind); - return (chunk_alloc_core(arena, new_addr, size, alignment, zero, - arena->dss_prec)); + return (chunk_alloc_arena(arena, new_addr, size, alignment, zero)); +} + +void * +chunk_alloc_wrapper(arena_t *arena, chunk_alloc_t *chunk_alloc, void *new_addr, + size_t size, size_t alignment, bool *zero) +{ + void *ret; + + ret = chunk_alloc(new_addr, size, alignment, zero, arena->ind); + if (ret == NULL) + return (NULL); + if (config_valgrind && chunk_alloc != chunk_alloc_default) + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(chunk, chunksize); + return (ret); } void @@ -355,8 +384,8 @@ label_return: malloc_mutex_unlock(&arena->chunks_mtx); } -static void -chunk_cache(arena_t *arena, void *chunk, size_t size) +void +chunk_dalloc_cache(arena_t *arena, void *chunk, size_t size) { assert(chunk != NULL); @@ -366,19 +395,11 @@ chunk_cache(arena_t *arena, void *chunk, size_t size) chunk_record(arena, &arena->chunks_szad_cache, &arena->chunks_ad_cache, true, chunk, size, false); -} - -/* Default arena chunk deallocation routine in the absence of user override. 
*/ -bool -chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) -{ - - chunk_cache(chunk_arena_get(arena_ind), chunk, size); - return (false); + arena_maybe_purge(arena); } void -chunk_unmap(arena_t *arena, void *chunk, size_t size, bool zeroed) +chunk_dalloc_arena(arena_t *arena, void *chunk, size_t size, bool zeroed) { assert(chunk != NULL); @@ -395,6 +416,29 @@ chunk_unmap(arena_t *arena, void *chunk, size_t size, bool zeroed) } } +/* + * Default arena chunk deallocation routine in the absence of user override. + * This function isn't actually used by jemalloc, but it does the right thing if + * the application passes calls through to it during chunk deallocation. + */ +bool +chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) +{ + + chunk_dalloc_arena(chunk_arena_get(arena_ind), chunk, size, false); + return (false); +} + +void +chunk_dalloc_wrapper(arena_t *arena, chunk_dalloc_t *chunk_dalloc, void *chunk, + size_t size) +{ + + chunk_dalloc(chunk, size, arena->ind); + if (config_valgrind && chunk_dalloc != chunk_dalloc_default) + JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); +} + static rtree_node_elm_t * chunks_rtree_node_alloc(size_t nelms) { -- cgit v0.12 From 35e3fd9a63a9d24276eab24bf84edb3d9e856732 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 18 Feb 2015 16:51:51 -0800 Subject: Fix a compilation error and an incorrect assertion. --- src/chunk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/chunk.c b/src/chunk.c index 08f21f6..9474a15 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -130,7 +130,7 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, } malloc_mutex_unlock(&arena->chunks_mtx); - assert(!dalloc_node || node != NULL); + assert(dalloc_node || node != NULL); if (dalloc_node && node != NULL) arena_node_dalloc(arena, node); if (*zero) { @@ -299,7 +299,7 @@ chunk_alloc_wrapper(arena_t *arena, chunk_alloc_t *chunk_alloc, void *new_addr, if (ret == NULL) return (NULL); if (config_valgrind && chunk_alloc != chunk_alloc_default) - JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(chunk, chunksize); + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, chunksize); return (ret); } -- cgit v0.12 From 970fcfbca5dffda921394172c7298d274eebfd0e Mon Sep 17 00:00:00 2001 From: Dave Huseby Date: Mon, 9 Feb 2015 21:46:54 -0800 Subject: adding support for bitrig --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 7a694a2..4ac7ac8 100644 --- a/configure.ac +++ b/configure.ac @@ -283,7 +283,7 @@ case "${host}" in abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) ;; - *-*-openbsd*) + *-*-openbsd*|*-*-bitrig*) CFLAGS="$CFLAGS" abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) -- cgit v0.12 From 7c46fd59cce6afb14cdc6c819f662b6e81638f84 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 4 Mar 2015 21:48:01 +0900 Subject: Make --without-export actually work 9906660 added a --without-export configure option to avoid exporting jemalloc symbols, but the option didn't actually work. 
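For illustration (not part of the patch): the fix works by letting configure pre-define JEMALLOC_EXPORT (typically to nothing) and by guarding the default definitions in jemalloc_macros.h with #ifndef, so that a configure-time value takes precedence. A minimal sketch of the resulting behavior, assuming a GCC-like toolchain; the prefixed declaration je_malloc below is used only as an example:

#include <stddef.h>

/* Default build: public entry points carry default visibility. */
#ifndef JEMALLOC_EXPORT
#  define JEMALLOC_EXPORT __attribute__((visibility("default")))
#endif

/*
 * With --without-export, configure defines JEMALLOC_EXPORT (empty), the
 * #ifndef above is skipped, and the declaration expands without the
 * visibility attribute, so the symbol is no longer exported.
 */
JEMALLOC_EXPORT void	*je_malloc(size_t size);

The #ifndef guards are the essential part: they let the configure-time definition of JEMALLOC_EXPORT override the defaults shown in the headers below.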
--- include/jemalloc/internal/jemalloc_internal_defs.h.in | 6 ++++++ include/jemalloc/jemalloc_macros.h.in | 14 +++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 0f0db8a..191abc5 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -232,4 +232,10 @@ /* Adaptive mutex support in pthreads. */ #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +#undef JEMALLOC_EXPORT + #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 7d1dcf4..72f2a08 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -32,16 +32,20 @@ #ifdef JEMALLOC_HAVE_ATTR # define JEMALLOC_ATTR(s) __attribute__((s)) -# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +# ifndef JEMALLOC_EXPORT +# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +# endif # define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) # define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) # define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) #elif _MSC_VER # define JEMALLOC_ATTR(s) -# ifdef DLLEXPORT -# define JEMALLOC_EXPORT __declspec(dllexport) -# else -# define JEMALLOC_EXPORT __declspec(dllimport) +# ifndef JEMALLOC_EXPORT +# ifdef DLLEXPORT +# define JEMALLOC_EXPORT __declspec(dllexport) +# else +# define JEMALLOC_EXPORT __declspec(dllimport) +# endif # endif # define JEMALLOC_ALIGNED(s) __declspec(align(s)) # define JEMALLOC_SECTION(s) __declspec(allocate(s)) -- cgit v0.12 From 4d871f73af6b8310564dfcb63357dbfe8b1a1529 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Wed, 4 Mar 2015 10:54:10 +0900 Subject: Preserve LastError when calling TlsGetValue TlsGetValue has a semantic difference with pthread_getspecific, in that it can return a non-error NULL value, so it always sets the LastError. But allocator callers may not be expecting calling e.g. free() to change the value of the last error, so preserve it. 
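For illustration (not part of the patch): the change applies the standard Win32 idiom of capturing the thread's last-error code before a call that is documented to overwrite it, and restoring it afterwards. TlsGetValue() sets the last error even on success, because NULL is a legal stored value. A minimal standalone sketch of that idiom; the helper name is hypothetical:

#include <windows.h>

/* Read a TLS slot without disturbing the caller's GetLastError() value. */
static void *
tls_get_preserving_last_error(DWORD tls_index)
{
	DWORD error = GetLastError();
	void *value = TlsGetValue(tls_index);

	SetLastError(error);
	return (value);
}

In the patch the same capture/restore pair brackets the TlsGetValue() calls in both tsd_cleanup_wrapper() and tsd_wrapper_get(), so an allocator call such as free() cannot change what the application later observes from GetLastError().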
--- include/jemalloc/internal/tsd.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index dbb91a2..62a887e 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -277,9 +277,11 @@ a_name##tsd_set(a_type *val) \ a_attr bool \ a_name##tsd_cleanup_wrapper(void) \ { \ - a_name##tsd_wrapper_t *wrapper; \ + DWORD error = GetLastError(); \ + a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ + TlsGetValue(a_name##tsd_tsd); \ + SetLastError(error); \ \ - wrapper = (a_name##tsd_wrapper_t *)TlsGetValue(a_name##tsd_tsd);\ if (wrapper == NULL) \ return (false); \ if (a_cleanup != malloc_tsd_no_cleanup && \ @@ -307,8 +309,10 @@ a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \ a_attr a_name##tsd_wrapper_t * \ a_name##tsd_wrapper_get(void) \ { \ + DWORD error = GetLastError(); \ a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ TlsGetValue(a_name##tsd_tsd); \ + SetLastError(error); \ \ if (unlikely(wrapper == NULL)) { \ wrapper = (a_name##tsd_wrapper_t *) \ -- cgit v0.12 From f044bb219e9bfcc585f64f097e5ab0b5837c0451 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 6 Mar 2015 20:05:16 -0800 Subject: Change default chunk size from 4 MiB to 256 KiB. Recent changes have improved huge allocation scalability, which removes upward pressure to set the chunk size so large that huge allocations are rare. Smaller chunks are more likely to completely drain, so set the default to the smallest size that doesn't leave excessive unusable trailing space in chunk headers. --- doc/jemalloc.xml.in | 26 +++++++++++++------------- include/jemalloc/internal/chunk.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index b392fa9..747e03f 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -571,8 +571,8 @@ for (i = 0; i < nbins; i++) { both large or both huge. In such cases shrinkage always succeeds, but growth only succeeds if the trailing memory is currently available. - Assuming 4 MiB chunks, 4 KiB pages, and a 16-byte quantum on a 64-bit - system, the size classes in each category are as shown in Assuming 256 KiB chunks, 4 KiB pages, and a 16-byte quantum on a + 64-bit system, the size classes in each category are as shown in . @@ -627,7 +627,7 @@ for (i = 0; i < nbins; i++) { [10 KiB, 12 KiB, 14 KiB] - Large + Large 2 KiB [16 KiB] @@ -645,7 +645,12 @@ for (i = 0; i < nbins; i++) { 32 KiB - [160 KiB, 192 KiB, 224 KiB, 256 KiB] + [160 KiB, 192 KiB, 224 KiB] + + + Huge + 32 KiB + [256 KiB] 64 KiB @@ -653,20 +658,15 @@ for (i = 0; i < nbins; i++) { 128 KiB - [640 KiB, 768 KiB, 896 KiB, 1024 KiB] + [640 KiB, 768 KiB, 896 KiB, 1 MiB] 256 KiB - [1280 KiB, 1536 KiB, 1792 KiB, 2048 KiB] - - - 512 KiB - [2560 KiB, 3072 KiB, 3584 KiB] + [1280 KiB, 1536 KiB, 1792 KiB, 2 MiB] - Huge 512 KiB - [4 MiB] + [2560 KiB, 3 MiB, 3584 KiB, 4 MiB] 1 MiB @@ -907,7 +907,7 @@ for (i = 0; i < nbins; i++) { Virtual memory chunk size (log base 2). If a chunk size outside the supported size range is specified, the size is silently clipped to the minimum/maximum supported size. The default - chunk size is 4 MiB (2^22). + chunk size is 256 KiB (2^18). 
diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 1a968a5..1af5b24 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -5,7 +5,7 @@ * Size and alignment of memory chunks that are allocated by the OS's virtual * memory system. */ -#define LG_CHUNK_DEFAULT 22 +#define LG_CHUNK_DEFAULT 18 /* Return the chunk address for allocation address a. */ #define CHUNK_ADDR2BASE(a) \ -- cgit v0.12 From 5707d6f952c71baa2f19102479859012982ac821 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 6 Mar 2015 17:14:05 -0800 Subject: Quantize szad trees by size class. Treat sizes that round down to the same size class as size-equivalent in trees that are used to search for first best fit, so that there are only as many "firsts" as there are size classes. This comes closer to the ideal of first fit. --- src/arena.c | 36 +++++++++++++++++++++++++++--------- src/base.c | 5 +++-- src/chunk.c | 2 +- src/extent.c | 10 ++++++++-- 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/src/arena.c b/src/arena.c index 78aa1ae..34329a6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -59,21 +59,35 @@ JEMALLOC_INLINE_C int arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) { int ret; + uintptr_t a_miscelm = (uintptr_t)a; size_t a_size; size_t b_size = arena_miscelm_to_bits(b) & ~PAGE_MASK; - uintptr_t a_miscelm = (uintptr_t)a; - uintptr_t b_miscelm = (uintptr_t)b; + index_t a_index, b_index; - if (a_miscelm & CHUNK_MAP_KEY) + if (a_miscelm & CHUNK_MAP_KEY) { a_size = a_miscelm & ~PAGE_MASK; - else + assert(a_size == s2u(a_size)); + } else a_size = arena_miscelm_to_bits(a) & ~PAGE_MASK; - ret = (a_size > b_size) - (a_size < b_size); + /* + * Compute the index of the largest size class that the run can satisfy + * a request for. + */ + a_index = size2index(a_size + 1) - 1; + b_index = size2index(b_size + 1) - 1; + + /* + * Compare based on size class index rather than size, in order to + * sort equally useful runs only by address. + */ + ret = (a_index > b_index) - (a_index < b_index); if (ret == 0) { - if (!(a_miscelm & CHUNK_MAP_KEY)) + if (!(a_miscelm & CHUNK_MAP_KEY)) { + uintptr_t b_miscelm = (uintptr_t)b; + ret = (a_miscelm > b_miscelm) - (a_miscelm < b_miscelm); - else { + } else { /* * Treat keys as if they are lower than anything else. */ @@ -898,8 +912,10 @@ arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) { arena_chunk_map_misc_t *miscelm; arena_chunk_map_misc_t *key; + size_t usize; - key = (arena_chunk_map_misc_t *)(size | CHUNK_MAP_KEY); + usize = s2u(size); + key = (arena_chunk_map_misc_t *)(usize | CHUNK_MAP_KEY); miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (miscelm != NULL) { arena_run_t *run = &miscelm->run; @@ -949,7 +965,8 @@ arena_run_alloc_small_helper(arena_t *arena, size_t size, index_t binind) arena_chunk_map_misc_t *miscelm; arena_chunk_map_misc_t *key; - key = (arena_chunk_map_misc_t *)(size | CHUNK_MAP_KEY); + assert(size == s2u(size)); + key = (arena_chunk_map_misc_t *)(PAGE_CEILING(size) | CHUNK_MAP_KEY); miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (miscelm != NULL) { run = &miscelm->run; @@ -2778,6 +2795,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info) bin_info->reg_interval; } assert(actual_nregs > 0); + assert(actual_run_size == s2u(actual_run_size)); /* Copy final settings. 
*/ bin_info->run_size = actual_run_size; diff --git a/src/base.c b/src/base.c index 33e8b6f..01c62df 100644 --- a/src/base.c +++ b/src/base.c @@ -73,7 +73,7 @@ void * base_alloc(size_t size) { void *ret; - size_t csize; + size_t csize, usize; extent_node_t *node; extent_node_t key; @@ -83,7 +83,8 @@ base_alloc(size_t size) */ csize = CACHELINE_CEILING(size); - extent_node_init(&key, NULL, NULL, csize, false); + usize = s2u(csize); + extent_node_init(&key, NULL, NULL, usize, false); malloc_mutex_lock(&base_mtx); node = extent_tree_szad_nsearch(&base_avail_szad, &key); if (node != NULL) { diff --git a/src/chunk.c b/src/chunk.c index 9474a15..972fecd 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -76,7 +76,7 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, assert(new_addr == NULL || alignment == chunksize); assert(dalloc_node || new_addr != NULL); - alloc_size = size + alignment - chunksize; + alloc_size = CHUNK_CEILING(s2u(size + alignment - chunksize)); /* Beware size_t wrap-around. */ if (alloc_size < size) return (NULL); diff --git a/src/extent.c b/src/extent.c index f98e77e..e16f8f6 100644 --- a/src/extent.c +++ b/src/extent.c @@ -9,8 +9,14 @@ extent_szad_comp(extent_node_t *a, extent_node_t *b) int ret; size_t a_size = extent_node_size_get(a); size_t b_size = extent_node_size_get(b); - - ret = (a_size > b_size) - (a_size < b_size); + /* + * Compute the index of the largest size class that the chunk can + * satisfy a request for. + */ + size_t a_index = size2index(a_size + 1) - 1; + size_t b_index = size2index(b_size + 1) - 1; + + ret = (a_index > b_index) - (a_index < b_index); if (ret == 0) { uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a); uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b); -- cgit v0.12 From 97c04a93838c4001688fe31bf018972b4696efe2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 6 Mar 2015 19:57:36 -0800 Subject: Use first-fit rather than first-best-fit run/chunk allocation. This tends to more effectively pack active memory toward low addresses. However, additional tree searches are required in many cases, so whether this change stands the test of time will depend on real-world benchmarks. --- include/jemalloc/internal/arena.h | 2 +- src/arena.c | 72 +++++++++++++++++++++++++-------------- src/chunk.c | 43 ++++++++++++++++++++--- 3 files changed, 87 insertions(+), 30 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 42086ca..50b296e 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -312,7 +312,7 @@ struct arena_s { /* * Size/address-ordered tree of this arena's available runs. The tree - * is used for first-best-fit run allocation. + * is used for first-fit run allocation. */ arena_avail_tree_t runs_avail; diff --git a/src/arena.c b/src/arena.c index 34329a6..6f4197b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -907,23 +907,55 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, return (err); } +/* Do first-fit run selection. 
*/ static arena_run_t * -arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) +arena_run_first_fit(arena_t *arena, size_t size) { - arena_chunk_map_misc_t *miscelm; - arena_chunk_map_misc_t *key; - size_t usize; + arena_run_t *run; + index_t index, max_index; - usize = s2u(size); - key = (arena_chunk_map_misc_t *)(usize | CHUNK_MAP_KEY); - miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); - if (miscelm != NULL) { - arena_run_t *run = &miscelm->run; - arena_run_split_large(arena, &miscelm->run, size, zero); - return (run); + assert(size == s2u(size)); + assert(size == PAGE_CEILING(size)); + + /* + * Iterate over all size classes that are at least large enough to + * satisfy the request, search for the lowest run of each size class, + * and choose the lowest of the runs found. + */ + run = NULL; + for (index = size2index(size), max_index = size2index(arena_maxclass); + index <= max_index;) { + arena_run_t *currun; + arena_chunk_t *currun_chunk; + size_t currun_pageind, currun_size; + size_t usize = PAGE_CEILING(index2size(index)); + arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *)(usize | + CHUNK_MAP_KEY); + arena_chunk_map_misc_t *miscelm = + arena_avail_tree_nsearch(&arena->runs_avail, key); + if (miscelm == NULL) + break; + currun = &miscelm->run; + if (run == NULL || (uintptr_t)currun < (uintptr_t)run) + run = currun; + currun_chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(currun); + currun_pageind = arena_miscelm_to_pageind(miscelm); + currun_size = arena_mapbits_unallocated_size_get(currun_chunk, + currun_pageind); + assert(size2index(currun_size) + 1 > index); + index = size2index(currun_size) + 1; } - return (NULL); + return (run); +} + +static arena_run_t * +arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) +{ + arena_run_t *run = arena_run_first_fit(arena, s2u(size)); + if (run != NULL) + arena_run_split_large(arena, run, size, zero); + return (run); } static arena_run_t * @@ -961,20 +993,10 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) static arena_run_t * arena_run_alloc_small_helper(arena_t *arena, size_t size, index_t binind) { - arena_run_t *run; - arena_chunk_map_misc_t *miscelm; - arena_chunk_map_misc_t *key; - - assert(size == s2u(size)); - key = (arena_chunk_map_misc_t *)(PAGE_CEILING(size) | CHUNK_MAP_KEY); - miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); - if (miscelm != NULL) { - run = &miscelm->run; + arena_run_t *run = arena_run_first_fit(arena, PAGE_CEILING(size)); + if (run != NULL) arena_run_split_small(arena, run, size, binind); - return (run); - } - - return (NULL); + return (run); } static arena_run_t * diff --git a/src/chunk.c b/src/chunk.c index 972fecd..875fa4c 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -62,6 +62,39 @@ chunk_deregister(const void *chunk, const extent_node_t *node) } } +/* Do first-fit chunk selection. */ +static extent_node_t * +chunk_first_fit(arena_t *arena, extent_tree_t *chunks_szad, size_t size) +{ + extent_node_t *node; + index_t index; + + assert(size == CHUNK_CEILING(size)); + + /* + * Iterate over all size classes that are at least large enough to + * satisfy the request, search for the lowest chunk of each size class, + * and choose the lowest of the chunks found. 
+ */ + node = NULL; + for (index = size2index(size); index < NSIZES;) { + extent_node_t *curnode; + extent_node_t key; + extent_node_init(&key, arena, NULL, + CHUNK_CEILING(index2size(index)), false); + curnode = extent_tree_szad_nsearch(chunks_szad, &key); + if (curnode == NULL) + break; + if (node == NULL || (uintptr_t)extent_node_addr_get(curnode) < + (uintptr_t)extent_node_addr_get(node)) + node = curnode; + assert(size2index(extent_node_size_get(curnode)) + 1 > index); + index = size2index(extent_node_size_get(curnode)) + 1; + } + + return (node); +} + static void * chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, void *new_addr, size_t size, @@ -69,7 +102,6 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, { void *ret; extent_node_t *node; - extent_node_t key; size_t alloc_size, leadsize, trailsize; bool zeroed; @@ -80,10 +112,13 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, /* Beware size_t wrap-around. */ if (alloc_size < size) return (NULL); - extent_node_init(&key, arena, new_addr, alloc_size, false); malloc_mutex_lock(&arena->chunks_mtx); - node = (new_addr != NULL) ? extent_tree_ad_search(chunks_ad, &key) : - extent_tree_szad_nsearch(chunks_szad, &key); + if (new_addr != NULL || size == chunksize) { + extent_node_t key; + extent_node_init(&key, arena, new_addr, alloc_size, false); + node = extent_tree_ad_search(chunks_ad, &key); + } else + node = chunk_first_fit(arena, chunks_szad, alloc_size); if (node == NULL || (new_addr != NULL && extent_node_size_get(node) < size)) { malloc_mutex_unlock(&arena->chunks_mtx); -- cgit v0.12 From 04ca7580dbc409915de05cb1cee12a369e898590 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 6 Mar 2015 23:25:13 -0800 Subject: Fix a chunk_recycle() regression. This regression was introduced by 97c04a93838c4001688fe31bf018972b4696efe2 (Use first-fit rather than first-best-fit run/chunk allocation.). --- src/chunk.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/chunk.c b/src/chunk.c index 875fa4c..fb8cd41 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -64,13 +64,22 @@ chunk_deregister(const void *chunk, const extent_node_t *node) /* Do first-fit chunk selection. */ static extent_node_t * -chunk_first_fit(arena_t *arena, extent_tree_t *chunks_szad, size_t size) +chunk_first_fit(arena_t *arena, extent_tree_t *chunks_szad, + extent_tree_t *chunks_ad, size_t size) { extent_node_t *node; index_t index; assert(size == CHUNK_CEILING(size)); + if (size == chunksize) { + /* + * Any chunk will suffice, so simply select the one lowest in + * memory. 
+ */ + return (extent_tree_ad_first(chunks_ad)); + } + /* * Iterate over all size classes that are at least large enough to * satisfy the request, search for the lowest chunk of each size class, @@ -113,12 +122,14 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, if (alloc_size < size) return (NULL); malloc_mutex_lock(&arena->chunks_mtx); - if (new_addr != NULL || size == chunksize) { + if (new_addr != NULL) { extent_node_t key; extent_node_init(&key, arena, new_addr, alloc_size, false); node = extent_tree_ad_search(chunks_ad, &key); - } else - node = chunk_first_fit(arena, chunks_szad, alloc_size); + } else { + node = chunk_first_fit(arena, chunks_szad, chunks_ad, + alloc_size); + } if (node == NULL || (new_addr != NULL && extent_node_size_get(node) < size)) { malloc_mutex_unlock(&arena->chunks_mtx); -- cgit v0.12 From 54673fd8d719e081536fb531417cd9060de895f0 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 23 Feb 2015 22:28:43 -0800 Subject: Update ChangeLog. --- ChangeLog | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 150 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index d56ee99..ef7dbfd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -5,6 +5,155 @@ found in the git revision history: https://github.com/jemalloc/jemalloc +* 4.0.0 (XXX) See https://github.com/jemalloc/jemalloc/milestones/4.0.0 for + remaining work. + + This version contains many speed and space optimizations, both minor and + major. The major themes are generalization, unification, and simplification. + Although many of these optimizations cause no visible behavior change, their + cumulative effect is substantial. + + New features: + - Normalize size class spacing to be consistent across the complete size + range. By default there are four size classes per size doubling, but this + is now configurable via the --with-lg-size-class-group option. Also add the + --with-lg-page, --with-lg-page-sizes, --with-lg-quantum, and + --with-lg-tiny-min options, which can be used to tweak page and size class + settings. Impacts: + + Worst case performance for incrementally growing/shrinking reallocation + is improved because there are far fewer size classes, and therefore + copying happens less often. + + Internal fragmentation is limited to 20% for all but the smallest size + classes (those less than four times the quantum). (1B + 4 KiB) + and (1B + 4 MiB) previously suffered nearly 50% internal fragmentation. + + Chunk fragmentation tends to be lower because there are fewer distinct run + sizes to pack. + - Add support for explicit tcaches. The "tcache.create", "tcache.flush", and + "tcache.destroy" mallctls control tcache lifetime and flushing, and the + MALLOCX_TCACHE(tc) and MALLOCX_TCACHE_NONE flags to the *allocx() API + control which tcache is used for each operation. + - Implement per thread heap profiling, as well as the ability to + enable/disable heap profiling on a per thread basis. Add the "prof.reset", + "prof.lg_sample", "thread.prof.name", "thread.prof.active", + "opt.prof_thread_active_init", "prof.thread_active_init", and + "thread.prof.active" mallctls. + - Add support for per arena application-specified chunk allocators, configured + via the "arena.chunk.alloc" and "arena.chunk.dalloc" mallctls. + - Refactor huge allocation to be managed by arenas, so that arenas now + function as general purpose independent allocators. This is important in + the context of user-specified chunk allocators, aside from the scalability + benefits. 
Related new statistics: + + The "stats.arenas..huge.allocated", "stats.arenas..huge.nmalloc", + "stats.arenas..huge.ndalloc", and "stats.arenas..huge.nrequests" + mallctls provide high level per arena huge allocation statistics. + + The "arenas.nhchunks", "arenas.hchunks..size", + "stats.arenas..hchunks..nmalloc", + "stats.arenas..hchunks..ndalloc", + "stats.arenas..hchunks..nrequests", and + "stats.arenas..hchunks..curhchunks" mallctls provide per size class + statistics. + - Add the 'util' column to malloc_stats_print() output, which reports the + proportion of available regions that are currently in use for each small + size class. + - Add "alloc" and "free" modes for for junk filling (see the "opt.junk" + mallctl), so that it is possible to separately enable junk filling for + allocation versus deallocation. + - Add the jemalloc-config script, which provides information about how + jemalloc was configured, and how to integrate it into application builds. + - Add metadata statistics, which are accessible via the "stats.metadata", + "stats.arenas..metadata.mapped", and + "stats.arenas..metadata.allocated" mallctls. + - Add the "prof.gdump" mallctl, which makes it possible to toggle the gdump + feature on/off during program execution. + - Add sdallocx(), which implements sized deallocation. The primary + optimization over dallocx() is the removal of a metadata read, which often + suffers an L1 cache miss. + - Add missing header includes in jemalloc/jemalloc.h, so that applications + only have to #include . + - Add support for additional platforms: + + Bitrig + + Cygwin + + DragonFlyBSD + + iOS + + OpenBSD + + OpenRISC/or1k + + Optimizations: + - Switch run and chunk allocation from first-best-fit (among best-fit + candidates, choose the lowest in memory) to first-fit (among all candidates, + choose the lowest in memory). This tends to reduce chunk and virtual memory + fragmentation, respectively. + - Maintain dirty runs in per arena LRUs rather than in per arena trees of + dirty-run-containing chunks. In practice this change significantly reduces + dirty page purging volume. + - Integrate whole chunks into the unused dirty page purging machinery. This + reduces the cost of repeated huge allocation/deallocation, because it + effectively introduces a cache of chunks. + - Split the arena chunk map into two separate arrays, in order to increase + cache locality for the frequently accessed bits. + - Move small run metadata out of runs, into arena chunk headers. This reduces + run fragmentation, smaller runs reduce external fragmentation for small size + classes, and packed (less uniformly aligned) metadata layout improves CPU + cache set distribution. + - Micro-optimize the fast paths for the public API functions. + - Refactor thread-specific data to reside in a single structure. This assures + that only a single TLS read is necessary per call into the public API. + - Implement in-place huge allocation growing and shrinking. + - Refactor rtree (radix tree for chunk lookups) to be lock-free, and make + additional optimizations that reduce maximum lookup depth to one or two + levels. This resolves what was a concurrency bottleneck for per arena huge + allocation, because a global data structure is critical for determining + which arenas own which huge allocations. + + Incompatible changes: + - Replace --enable-cc-silence with --disable-cc-silence to suppress spurious + warnings by default. 
+ - Assure that the constness of malloc_usable_size()'s return type matches that + of the system implementation. + - Change the heap profile dump format to support per thread heap profiling, + and enhance pprof with the --thread= option. As a result, the bundled + pprof must now be used rather than the upstream (gperftools) pprof. + - Disable "opt.prof_final" by default, in order to avoid atexit(3), which can + internally deadlock on some platforms. + - Change the "arenas.nlruns" mallctl type from size_t to unsigned. + - Replace the "stats.arenas..bins..allocated" mallctl with + "stats.arenas..bins..curregs". + - Ignore MALLOC_CONF in set{uid,gid,cap} binaries. + - Ignore MALLOCX_ARENA(a) in dallocx(), in favor of using the + MALLOCX_TCACHE(tc) and MALLOCX_TCACHE_NONE flags to control tcache usage. + + Removed features: + - Remove the *allocm() API, which is superseded by the *allocx() API. + - Remove the --enable-dss options, and make dss non-optional on all platforms + which support sbrk(2). + - Remove the "arenas.purge" mallctl, which was obsoleted by the + "arena..purge" mallctl in 3.1.0. + - Remove the unnecessary "opt.valgrind" mallctl; jemalloc automatically + detects whether it is running inside Valgrind. + - Remove the "stats.huge.allocated", "stats.huge.nmalloc", and + "stats.huge.ndalloc" mallctls. + - Remove the --enable-mremap option. + - Remove the --enable-ivsalloc option, and merge its functionality into + --enable-debug. + - Remove the "stats.chunks.current", "stats.chunks.total", and + "stats.chunks.high" mallctls. + + Bug fixes: + - Fix the cactive statistic to decrease (rather than increase) when active + memory decreases. This regression was first released in 3.5.0. + - Fix OOM handling in memalign() and valloc(). A variant of this bug existed + in all releases since 2.0.0, which introduced these functions. + - Fix the "arena..dss" mallctl to return an error if "primary" or + "secondary" precedence is specified, but sbrk(2) is not supported. + - Fix fallback lg_floor() implementations to handle extremely large inputs. + - Ensure the default purgeable zone is after the default zone on OS X. + - Fix latent bugs in atomic_*(). + - Fix the "arena..dss" mallctl to handle read-only calls. + - Fix tls_model configuration to enable the initial-exec model when possible. + - Mark malloc_conf as a weak symbol so that the application can override it. + - Correctly detect glibc's adaptive pthread mutexes. + - Fix the --without-export configure option. + * 3.6.0 (March 31, 2014) This version contains a critical bug fix for a regression present in 3.5.0 and @@ -21,7 +170,7 @@ found in the git revision history: backtracing to be reliable. - Use dss allocation precedence for huge allocations as well as small/large allocations. - - Fix test assertion failure message formatting. This bug did not manifect on + - Fix test assertion failure message formatting. This bug did not manifest on x86_64 systems because of implementation subtleties in va_list. - Fix inconsequential test failures for hash and SFMT code. -- cgit v0.12 From 38e42d311c1844a66e8ced84551621de41e42b85 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 10 Mar 2015 18:15:40 -0700 Subject: Refactor dirty run linkage to reduce sizeof(extent_node_t). 
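The diff that follows shrinks extent_node_t by embedding only a bare ring link (arena_runs_dirty_link_t) and recovering the enclosing arena_chunk_map_misc_t from that link with offsetof(), via the new arena_rd_to_miscelm(). A self-contained sketch of that container-of idiom; the widget/link names are illustrative and not part of the patch:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Minimal link node meant to be embedded in a larger structure. */
typedef struct link_s {
	struct link_s	*next;
} link_t;

typedef struct widget_s {
	int	payload;
	link_t	rd;	/* Embedded linkage, analogous to miscelm->rd. */
} widget_t;

/* Recover the enclosing widget from a pointer to its embedded link. */
static widget_t *
widget_from_link(link_t *rd)
{

	return ((widget_t *)((uintptr_t)rd - offsetof(widget_t, rd)));
}

int
main(void)
{
	widget_t w = {42, {NULL}};

	assert(widget_from_link(&w.rd) == &w);
	assert(widget_from_link(&w.rd)->payload == 42);
	return (0);
}

Keeping only the link where a full arena_chunk_map_misc_t used to be embedded is what reduces sizeof(extent_node_t), at the cost of one subtraction whenever the purging code needs the enclosing miscelm back.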
--- include/jemalloc/internal/arena.h | 50 ++++++++++++--- include/jemalloc/internal/extent.h | 12 ++-- include/jemalloc/internal/private_symbols.txt | 1 + src/arena.c | 89 +++++++++++++++------------ 4 files changed, 95 insertions(+), 57 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 50b296e..de298e5 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -23,6 +23,7 @@ */ #define LG_DIRTY_MULT_DEFAULT 3 +typedef struct arena_runs_dirty_link_s arena_runs_dirty_link_t; typedef struct arena_run_s arena_run_t; typedef struct arena_chunk_map_bits_s arena_chunk_map_bits_t; typedef struct arena_chunk_map_misc_s arena_chunk_map_misc_t; @@ -120,6 +121,10 @@ struct arena_chunk_map_bits_s { #define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED }; +struct arena_runs_dirty_link_s { + qr(arena_runs_dirty_link_t) rd_link; +}; + /* * Each arena_chunk_map_misc_t corresponds to one page within the chunk, just * like arena_chunk_map_bits_t. Two separate arrays are stored within each @@ -131,13 +136,13 @@ struct arena_chunk_map_misc_s { * * 1) arena_t's runs_avail tree. * 2) arena_run_t conceptually uses this linkage for in-use non-full - * runs, rather than directly embedding linkage. + * runs, rather than directly embedding linkage. */ rb_node(arena_chunk_map_misc_t) rb_link; union { /* Linkage for list of dirty runs. */ - qr(arena_chunk_map_misc_t) rd_link; + arena_runs_dirty_link_t rd; /* Profile counters, used for large object runs. */ prof_tctx_t *prof_tctx; @@ -324,15 +329,27 @@ struct arena_s { * * LRU-----------------------------------------------------------MRU * - * ______________ ___ ___ - * ...-->|chunks_cache|<--------->|c|<-------------------->|c|<--... - * -------------- |h| |h| - * ____________ _____ |u| _____ _____ |u| - * ...-->|runs_dirty|<-->|run|<-->|n|<-->|run|<-->|run|<-->|n|<--... - * ------------ ----- |k| ----- ----- |k| - * --- --- + * /------------------\ + * | arena | + * | | + * | /------------\ | /-----------\ + * ...---->|chunks_cache|<---------------------->| chunk |<--... + * | \------------/ | | | + * | | | | + * | | /---\ /---\ | | + * | | |run| |run| | | + * | | | | | | | | + * | /----------\ | |---| |---| | /-----\ | + * ...----->|runs_dirty|<---->|rd |<---->|rd |<---->|rdelm|<-----... + * | \----------/ | |---| |---| | \-----/ | + * | | | | | | | | + * | | | | | | | | + * | | \---/ \---/ | | + * | | | | + * | | | | + * \------------------/ \-----------/ */ - arena_chunk_map_misc_t runs_dirty; + arena_runs_dirty_link_t runs_dirty; extent_node_t chunks_cache; /* Extant huge allocations. 
*/ @@ -465,6 +482,7 @@ arena_chunk_map_misc_t *arena_miscelm_get(arena_chunk_t *chunk, size_t pageind); size_t arena_miscelm_to_pageind(arena_chunk_map_misc_t *miscelm); void *arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm); +arena_chunk_map_misc_t *arena_rd_to_miscelm(arena_runs_dirty_link_t *rd); arena_chunk_map_misc_t *arena_run_to_miscelm(arena_run_t *run); size_t *arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbitsp_read(size_t *mapbitsp); @@ -557,6 +575,18 @@ arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm) } JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t * +arena_rd_to_miscelm(arena_runs_dirty_link_t *rd) +{ + arena_chunk_map_misc_t *miscelm = (arena_chunk_map_misc_t + *)((uintptr_t)rd - offsetof(arena_chunk_map_misc_t, rd)); + + assert(arena_miscelm_to_pageind(miscelm) >= map_bias); + assert(arena_miscelm_to_pageind(miscelm) < chunk_npages); + + return (miscelm); +} + +JEMALLOC_ALWAYS_INLINE arena_chunk_map_misc_t * arena_run_to_miscelm(arena_run_t *run) { arena_chunk_map_misc_t *miscelm = (arena_chunk_map_misc_t diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 81ff40b..5dbc04a 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -34,7 +34,7 @@ struct extent_node_s { prof_tctx_t *en_prof_tctx; /* Linkage for arena's runs_dirty and chunks_cache rings. */ - arena_chunk_map_misc_t runs_dirty; + arena_runs_dirty_link_t rdelm; qr(extent_node_t) cc_link; union { @@ -79,7 +79,7 @@ void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size, bool zeroed); void extent_node_dirty_linkage_init(extent_node_t *node); void extent_node_dirty_insert(extent_node_t *node, - arena_chunk_map_misc_t *runs_dirty, extent_node_t *chunks_dirty); + arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty); void extent_node_dirty_remove(extent_node_t *node); #endif @@ -186,16 +186,16 @@ JEMALLOC_INLINE void extent_node_dirty_linkage_init(extent_node_t *node) { - qr_new(&node->runs_dirty, rd_link); + qr_new(&node->rdelm, rd_link); qr_new(node, cc_link); } JEMALLOC_INLINE void extent_node_dirty_insert(extent_node_t *node, - arena_chunk_map_misc_t *runs_dirty, extent_node_t *chunks_dirty) + arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty) { - qr_meld(runs_dirty, &node->runs_dirty, rd_link); + qr_meld(runs_dirty, &node->rdelm, rd_link); qr_meld(chunks_dirty, node, cc_link); } @@ -203,7 +203,7 @@ JEMALLOC_INLINE void extent_node_dirty_remove(extent_node_t *node) { - qr_remove(&node->runs_dirty, rd_link); + qr_remove(&node->rdelm, rd_link); qr_remove(node, cc_link); } diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 7c217c7..d086db1 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -81,6 +81,7 @@ arena_quarantine_junk_small arena_ralloc arena_ralloc_junk_large arena_ralloc_no_move +arena_rd_to_miscelm arena_redzone_corruption arena_run_regind arena_run_to_miscelm diff --git a/src/arena.c b/src/arena.c index 6f4197b..5d792f9 100644 --- a/src/arena.c +++ b/src/arena.c @@ -136,8 +136,8 @@ arena_run_dirty_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, assert(arena_mapbits_dirty_get(chunk, pageind+npages-1) == CHUNK_MAP_DIRTY); - qr_new(miscelm, rd_link); - qr_meld(&arena->runs_dirty, miscelm, rd_link); + qr_new(&miscelm->rd, rd_link); + qr_meld(&arena->runs_dirty, &miscelm->rd, rd_link); arena->ndirty += 
npages; } @@ -153,7 +153,7 @@ arena_run_dirty_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, assert(arena_mapbits_dirty_get(chunk, pageind+npages-1) == CHUNK_MAP_DIRTY); - qr_remove(miscelm, rd_link); + qr_remove(&miscelm->rd, rd_link); assert(arena->ndirty >= npages); arena->ndirty -= npages; } @@ -1056,22 +1056,23 @@ static size_t arena_dirty_count(arena_t *arena) { size_t ndirty = 0; - arena_chunk_map_misc_t *runselm; + arena_runs_dirty_link_t *rdelm; extent_node_t *chunkselm; - for (runselm = qr_next(&arena->runs_dirty, rd_link), + for (rdelm = qr_next(&arena->runs_dirty, rd_link), chunkselm = qr_next(&arena->chunks_cache, cc_link); - runselm != &arena->runs_dirty; runselm = qr_next(runselm, - rd_link)) { + rdelm != &arena->runs_dirty; rdelm = qr_next(rdelm, rd_link)) { size_t npages; - if (runselm == &chunkselm->runs_dirty) { + if (rdelm == &chunkselm->rdelm) { npages = extent_node_size_get(chunkselm) >> LG_PAGE; chunkselm = qr_next(chunkselm, cc_link); } else { - arena_chunk_t *chunk = (arena_chunk_t - *)CHUNK_ADDR2BASE(runselm); - size_t pageind = arena_miscelm_to_pageind(runselm); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + rdelm); + arena_chunk_map_misc_t *miscelm = + arena_rd_to_miscelm(rdelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); assert(arena_mapbits_allocated_get(chunk, pageind) == 0); assert(arena_mapbits_large_get(chunk, pageind) == 0); @@ -1107,21 +1108,21 @@ arena_compute_npurge(arena_t *arena, bool all) static size_t arena_stash_dirty(arena_t *arena, bool all, size_t npurge, - arena_chunk_map_misc_t *purge_runs_sentinel, + arena_runs_dirty_link_t *purge_runs_sentinel, extent_node_t *purge_chunks_sentinel) { - arena_chunk_map_misc_t *runselm, *runselm_next; + arena_runs_dirty_link_t *rdelm, *rdelm_next; extent_node_t *chunkselm; size_t nstashed = 0; /* Stash at least npurge pages. */ - for (runselm = qr_next(&arena->runs_dirty, rd_link), + for (rdelm = qr_next(&arena->runs_dirty, rd_link), chunkselm = qr_next(&arena->chunks_cache, cc_link); - runselm != &arena->runs_dirty; runselm = runselm_next) { + rdelm != &arena->runs_dirty; rdelm = rdelm_next) { size_t npages; - runselm_next = qr_next(runselm, rd_link); + rdelm_next = qr_next(rdelm, rd_link); - if (runselm == &chunkselm->runs_dirty) { + if (rdelm == &chunkselm->rdelm) { extent_node_t *chunkselm_next; bool zero; UNUSED void *chunk; @@ -1144,9 +1145,11 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, chunkselm = chunkselm_next; } else { arena_chunk_t *chunk = - (arena_chunk_t *)CHUNK_ADDR2BASE(runselm); - size_t pageind = arena_miscelm_to_pageind(runselm); - arena_run_t *run = &runselm->run; + (arena_chunk_t *)CHUNK_ADDR2BASE(rdelm); + arena_chunk_map_misc_t *miscelm = + arena_rd_to_miscelm(rdelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); + arena_run_t *run = &miscelm->run; size_t run_size = arena_mapbits_unallocated_size_get(chunk, pageind); @@ -1167,12 +1170,12 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, arena_run_split_large(arena, run, run_size, false); /* Stash. */ if (false) - qr_new(runselm, rd_link); /* Redundant. */ + qr_new(rdelm, rd_link); /* Redundant. 
*/ else { - assert(qr_next(runselm, rd_link) == runselm); - assert(qr_prev(runselm, rd_link) == runselm); + assert(qr_next(rdelm, rd_link) == rdelm); + assert(qr_prev(rdelm, rd_link) == rdelm); } - qr_meld(purge_runs_sentinel, runselm, rd_link); + qr_meld(purge_runs_sentinel, rdelm, rd_link); } nstashed += npages; @@ -1184,11 +1187,12 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, } static size_t -arena_purge_stashed(arena_t *arena, arena_chunk_map_misc_t *purge_runs_sentinel, +arena_purge_stashed(arena_t *arena, + arena_runs_dirty_link_t *purge_runs_sentinel, extent_node_t *purge_chunks_sentinel) { size_t npurged, nmadvise; - arena_chunk_map_misc_t *runselm; + arena_runs_dirty_link_t *rdelm; extent_node_t *chunkselm; if (config_stats) @@ -1196,13 +1200,12 @@ arena_purge_stashed(arena_t *arena, arena_chunk_map_misc_t *purge_runs_sentinel, npurged = 0; malloc_mutex_unlock(&arena->lock); - for (runselm = qr_next(purge_runs_sentinel, rd_link), + for (rdelm = qr_next(purge_runs_sentinel, rd_link), chunkselm = qr_next(purge_chunks_sentinel, cc_link); - runselm != purge_runs_sentinel; runselm = qr_next(runselm, - rd_link)) { + rdelm != purge_runs_sentinel; rdelm = qr_next(rdelm, rd_link)) { size_t npages; - if (runselm == &chunkselm->runs_dirty) { + if (rdelm == &chunkselm->rdelm) { size_t size = extent_node_size_get(chunkselm); bool unzeroed; @@ -1216,8 +1219,10 @@ arena_purge_stashed(arena_t *arena, arena_chunk_map_misc_t *purge_runs_sentinel, size_t pageind, run_size, flag_unzeroed, i; bool unzeroed; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(runselm); - pageind = arena_miscelm_to_pageind(runselm); + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(rdelm); + arena_chunk_map_misc_t *miscelm = + arena_rd_to_miscelm(rdelm); + pageind = arena_miscelm_to_pageind(miscelm); run_size = arena_mapbits_large_size_get(chunk, pageind); npages = run_size >> LG_PAGE; @@ -1259,18 +1264,18 @@ arena_purge_stashed(arena_t *arena, arena_chunk_map_misc_t *purge_runs_sentinel, static void arena_unstash_purged(arena_t *arena, - arena_chunk_map_misc_t *purge_runs_sentinel, + arena_runs_dirty_link_t *purge_runs_sentinel, extent_node_t *purge_chunks_sentinel) { - arena_chunk_map_misc_t *runselm, *runselm_next; + arena_runs_dirty_link_t *rdelm, *rdelm_next; extent_node_t *chunkselm; /* Deallocate runs. 
*/ - for (runselm = qr_next(purge_runs_sentinel, rd_link), + for (rdelm = qr_next(purge_runs_sentinel, rd_link), chunkselm = qr_next(purge_chunks_sentinel, cc_link); - runselm != purge_runs_sentinel; runselm = runselm_next) { - runselm_next = qr_next(runselm, rd_link); - if (runselm == &chunkselm->runs_dirty) { + rdelm != purge_runs_sentinel; rdelm = rdelm_next) { + rdelm_next = qr_next(rdelm, rd_link); + if (rdelm == &chunkselm->rdelm) { extent_node_t *chunkselm_next = qr_next(chunkselm, cc_link); void *addr = extent_node_addr_get(chunkselm); @@ -1281,8 +1286,10 @@ arena_unstash_purged(arena_t *arena, chunkselm = chunkselm_next; chunk_dalloc_arena(arena, addr, size, zeroed); } else { - arena_run_t *run = &runselm->run; - qr_remove(runselm, rd_link); + arena_chunk_map_misc_t *miscelm = + arena_rd_to_miscelm(rdelm); + arena_run_t *run = &miscelm->run; + qr_remove(rdelm, rd_link); arena_run_dalloc(arena, run, false, true); } } @@ -1292,7 +1299,7 @@ void arena_purge(arena_t *arena, bool all) { size_t npurge, npurgeable, npurged; - arena_chunk_map_misc_t purge_runs_sentinel; + arena_runs_dirty_link_t purge_runs_sentinel; extent_node_t purge_chunks_sentinel; /* -- cgit v0.12 From f5c8f37259d7697c3f850ac1e5ef63b724cf7689 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 10 Mar 2015 18:29:49 -0700 Subject: Normalize rdelm/rd structure field naming. --- include/jemalloc/internal/arena.h | 38 +++++++++++++++++++------------------- include/jemalloc/internal/extent.h | 8 ++++---- src/arena.c | 8 ++++---- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index de298e5..9cbc591 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -329,25 +329,25 @@ struct arena_s { * * LRU-----------------------------------------------------------MRU * - * /------------------\ - * | arena | - * | | - * | /------------\ | /-----------\ - * ...---->|chunks_cache|<---------------------->| chunk |<--... - * | \------------/ | | | - * | | | | - * | | /---\ /---\ | | - * | | |run| |run| | | - * | | | | | | | | - * | /----------\ | |---| |---| | /-----\ | - * ...----->|runs_dirty|<---->|rd |<---->|rd |<---->|rdelm|<-----... - * | \----------/ | |---| |---| | \-----/ | - * | | | | | | | | - * | | | | | | | | - * | | \---/ \---/ | | - * | | | | - * | | | | - * \------------------/ \-----------/ + * /-- arena ---\ + * | | + * | | + * |------------| /- chunk -\ + * ...->|chunks_cache|<--------------------------->| /----\ |<--... + * |------------| | |node| | + * | | | | | | + * | | /- run -\ /- run -\ | | | | + * | | | | | | | | | | + * | | | | | | | | | | + * |------------| |-------| |-------| | |----| | + * ...->|runs_dirty |<-->|rd |<-->|rd |<---->|rd |<----... + * |------------| |-------| |-------| | |----| | + * | | | | | | | | | | + * | | | | | | | \----/ | + * | | \-------/ \-------/ | | + * | | | | + * | | | | + * \------------/ \---------/ */ arena_runs_dirty_link_t runs_dirty; extent_node_t chunks_cache; diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 5dbc04a..3751adc 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -34,7 +34,7 @@ struct extent_node_s { prof_tctx_t *en_prof_tctx; /* Linkage for arena's runs_dirty and chunks_cache rings. 
*/ - arena_runs_dirty_link_t rdelm; + arena_runs_dirty_link_t rd; qr(extent_node_t) cc_link; union { @@ -186,7 +186,7 @@ JEMALLOC_INLINE void extent_node_dirty_linkage_init(extent_node_t *node) { - qr_new(&node->rdelm, rd_link); + qr_new(&node->rd, rd_link); qr_new(node, cc_link); } @@ -195,7 +195,7 @@ extent_node_dirty_insert(extent_node_t *node, arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty) { - qr_meld(runs_dirty, &node->rdelm, rd_link); + qr_meld(runs_dirty, &node->rd, rd_link); qr_meld(chunks_dirty, node, cc_link); } @@ -203,7 +203,7 @@ JEMALLOC_INLINE void extent_node_dirty_remove(extent_node_t *node) { - qr_remove(&node->rdelm, rd_link); + qr_remove(&node->rd, rd_link); qr_remove(node, cc_link); } diff --git a/src/arena.c b/src/arena.c index 5d792f9..8af1a5d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1064,7 +1064,7 @@ arena_dirty_count(arena_t *arena) rdelm != &arena->runs_dirty; rdelm = qr_next(rdelm, rd_link)) { size_t npages; - if (rdelm == &chunkselm->rdelm) { + if (rdelm == &chunkselm->rd) { npages = extent_node_size_get(chunkselm) >> LG_PAGE; chunkselm = qr_next(chunkselm, cc_link); } else { @@ -1122,7 +1122,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, size_t npages; rdelm_next = qr_next(rdelm, rd_link); - if (rdelm == &chunkselm->rdelm) { + if (rdelm == &chunkselm->rd) { extent_node_t *chunkselm_next; bool zero; UNUSED void *chunk; @@ -1205,7 +1205,7 @@ arena_purge_stashed(arena_t *arena, rdelm != purge_runs_sentinel; rdelm = qr_next(rdelm, rd_link)) { size_t npages; - if (rdelm == &chunkselm->rdelm) { + if (rdelm == &chunkselm->rd) { size_t size = extent_node_size_get(chunkselm); bool unzeroed; @@ -1275,7 +1275,7 @@ arena_unstash_purged(arena_t *arena, chunkselm = qr_next(purge_chunks_sentinel, cc_link); rdelm != purge_runs_sentinel; rdelm = rdelm_next) { rdelm_next = qr_next(rdelm, rd_link); - if (rdelm == &chunkselm->rdelm) { + if (rdelm == &chunkselm->rd) { extent_node_t *chunkselm_next = qr_next(chunkselm, cc_link); void *addr = extent_node_addr_get(chunkselm); -- cgit v0.12 From bc45d41d23bac598dbd38e5aac5a85b43d24bc04 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 11 Mar 2015 16:50:40 -0700 Subject: Fix a declaration-after-statement regression. --- src/arena.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 8af1a5d..e36cb50 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1215,11 +1215,10 @@ arena_purge_stashed(arena_t *arena, extent_node_zeroed_set(chunkselm, !unzeroed); chunkselm = qr_next(chunkselm, cc_link); } else { - arena_chunk_t *chunk; size_t pageind, run_size, flag_unzeroed, i; bool unzeroed; - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(rdelm); + arena_chunk_t *chunk = (arena_chunk_t + *)CHUNK_ADDR2BASE(rdelm); arena_chunk_map_misc_t *miscelm = arena_rd_to_miscelm(rdelm); pageind = arena_miscelm_to_pageind(miscelm); -- cgit v0.12 From fbd8d773ad0230ffba4e2c296dac3edcac9ca27e Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 11 Mar 2015 23:14:50 -0700 Subject: Fix unsigned comparison underflow. These bugs only affected tests and debug builds. 
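Both hunks below rewrite a bound check of the form level < height - 1 as level + 1 < height. The operands are unsigned, so if height were 0 the subtraction would wrap to the maximum value and the old check would pass when it should fail. A minimal standalone illustration of that failure mode, not jemalloc code:

#include <stdio.h>

int
main(void)
{
	unsigned level = 0, height = 0;

	/* height - 1 wraps to UINT_MAX, so the old guard passes. */
	if (level < height - 1)
		printf("old form descends even though height == 0\n");

	/* The rewritten guard avoids the subtraction and fails correctly. */
	if (level + 1 < height)
		printf("never printed\n");
	return (0);
}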
--- include/jemalloc/internal/rtree.h | 2 +- src/rtree.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index 2eb726d..c1fb90c 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -260,7 +260,7 @@ rtree_set(rtree_t *rtree, uintptr_t key, const extent_node_t *val) rtree_val_write(rtree, &node[subkey], val); return (false); } - assert(i < rtree->height - 1); + assert(i + 1 < rtree->height); child = rtree_child_read(rtree, &node[subkey], i); if (child == NULL) return (true); diff --git a/src/rtree.c b/src/rtree.c index 47d9084..af0d97e 100644 --- a/src/rtree.c +++ b/src/rtree.c @@ -63,7 +63,7 @@ static void rtree_delete_subtree(rtree_t *rtree, rtree_node_elm_t *node, unsigned level) { - if (level < rtree->height - 1) { + if (level + 1 < rtree->height) { size_t nchildren, i; nchildren = ZU(1) << rtree->levels[level].bits; -- cgit v0.12 From d69964bd2d31387f79a5f0494de8fd255b693afb Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 12 Mar 2015 16:25:18 -0700 Subject: Fix a heap profiling regression. Fix prof_tctx_comp() to incorporate tctx state into the comparison. During a dump it is possible for both a purgatory tctx and an otherwise equivalent nominal tctx to reside in the tree at the same time. This regression was introduced by 602c8e0971160e4b85b08b16cf8a2375aa24bc04 (Implement per thread heap profiling.). --- src/prof.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/prof.c b/src/prof.c index 4f1580b..84fa5fd 100644 --- a/src/prof.c +++ b/src/prof.c @@ -137,8 +137,13 @@ prof_tctx_comp(const prof_tctx_t *a, const prof_tctx_t *b) { uint64_t a_uid = a->thr_uid; uint64_t b_uid = b->thr_uid; - - return ((a_uid > b_uid) - (a_uid < b_uid)); + int ret = (a_uid > b_uid) - (a_uid < b_uid); + if (ret == 0) { + prof_tctx_state_t a_state = a->state; + prof_tctx_state_t b_state = b->state; + ret = (a_state > b_state) - (a_state < b_state); + } + return (ret); } rb_gen(static UNUSED, tctx_tree_, prof_tctx_tree_t, prof_tctx_t, -- cgit v0.12 From f69e2f6fdab40c7612be5fd69960b8c7d40dba44 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Thu, 12 Mar 2015 08:51:05 +0900 Subject: Use the error code given to buferror on Windows a14bce85 made buferror not take an error code, and make the Windows code path for buferror use GetLastError, while the alternative code paths used errno. Then 2a83ed02 made buferror take an error code again, and while it changed the non-Windows code paths to use that error code, the Windows code path was not changed accordingly. --- src/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.c b/src/util.c index a964d70..a6ef5d5 100644 --- a/src/util.c +++ b/src/util.c @@ -81,7 +81,7 @@ buferror(int err, char *buf, size_t buflen) { #ifdef _WIN32 - FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), 0, + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0, (LPSTR)buf, buflen, NULL); return (0); #elif defined(__GLIBC__) && defined(_GNU_SOURCE) -- cgit v0.12 From d6384b09e137874d7cdf527e5bb50abba0ae5f95 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 16 Oct 2014 17:02:18 -0400 Subject: use CLOCK_MONOTONIC in the timer if it's available Linux sets _POSIX_MONOTONIC_CLOCK to 0 meaning it *might* be available, so a sysconf check is necessary at runtime with a fallback to the mandatory CLOCK_REALTIME clock. 
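The patch below gates the timer on clock_gettime() and picks the clock id at runtime, since the compile-time macro only promises that CLOCK_MONOTONIC might exist. A condensed sketch of that selection, independent of the test harness's timedelta_t:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Prefer CLOCK_MONOTONIC when sysconf() confirms it; else fall back. */
static clockid_t
choose_clock(void)
{
#if defined(_POSIX_MONOTONIC_CLOCK) && _POSIX_MONOTONIC_CLOCK >= 0
	if (sysconf(_SC_MONOTONIC_CLOCK) > 0)
		return (CLOCK_MONOTONIC);
#endif
	return (CLOCK_REALTIME);	/* Mandatory per POSIX, so always safe. */
}

int
main(void)
{
	struct timespec ts;

	if (clock_gettime(choose_clock(), &ts) == 0)
		printf("%ld.%09ld\n", (long)ts.tv_sec, (long)ts.tv_nsec);
	return (0);
}

The patch itself records the chosen id in timedelta_t during timer_start() so that timer_stop() reads the same clock.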
--- test/include/test/timer.h | 10 ++++++++++ test/src/timer.c | 17 +++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/test/include/test/timer.h b/test/include/test/timer.h index 496072a..9ffbaef 100644 --- a/test/include/test/timer.h +++ b/test/include/test/timer.h @@ -1,10 +1,20 @@ /* Simple timer, for use in benchmark reporting. */ +#include #include +#define JEMALLOC_CLOCK_GETTIME defined(_POSIX_MONOTONIC_CLOCK) \ + && _POSIX_MONOTONIC_CLOCK >= 0 + typedef struct { +#if JEMALLOC_CLOCK_GETTIME + struct timespec tv0; + struct timespec tv1; + int clock_id; +#else struct timeval tv0; struct timeval tv1; +#endif } timedelta_t; void timer_start(timedelta_t *timer); diff --git a/test/src/timer.c b/test/src/timer.c index 36fbedd..338a9ef 100644 --- a/test/src/timer.c +++ b/test/src/timer.c @@ -4,22 +4,39 @@ void timer_start(timedelta_t *timer) { +#if JEMALLOC_CLOCK_GETTIME + if (sysconf(_SC_MONOTONIC_CLOCK) <= 0) + timer->clock_id = CLOCK_REALTIME; + else + timer->clock_id = CLOCK_MONOTONIC; + clock_gettime(timer->clock_id, &timer->tv0); +#else gettimeofday(&timer->tv0, NULL); +#endif } void timer_stop(timedelta_t *timer) { +#if JEMALLOC_CLOCK_GETTIME + clock_gettime(timer->clock_id, &timer->tv1); +#else gettimeofday(&timer->tv1, NULL); +#endif } uint64_t timer_usec(const timedelta_t *timer) { +#if JEMALLOC_CLOCK_GETTIME + return (((timer->tv1.tv_sec - timer->tv0.tv_sec) * 1000000) + + (timer->tv1.tv_nsec - timer->tv0.tv_nsec) / 1000); +#else return (((timer->tv1.tv_sec - timer->tv0.tv_sec) * 1000000) + timer->tv1.tv_usec - timer->tv0.tv_usec); +#endif } void -- cgit v0.12 From 764b00023f2bc97f240c3a758ed23ce9c0ad8526 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 14 Mar 2015 14:01:35 -0700 Subject: Fix a heap profiling regression. Add the prof_tctx_state_destroying transitionary state to fix a race between a thread destroying a tctx and another thread creating a new equivalent tctx. This regression was introduced by 602c8e0971160e4b85b08b16cf8a2375aa24bc04 (Implement per thread heap profiling.). --- include/jemalloc/internal/prof.h | 1 + src/prof.c | 44 ++++++++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index f508243..8967333 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -81,6 +81,7 @@ struct prof_cnt_s { typedef enum { prof_tctx_state_initializing, prof_tctx_state_nominal, + prof_tctx_state_destroying, prof_tctx_state_dumping, prof_tctx_state_purgatory /* Dumper must finish destroying. */ } prof_tctx_state_t; diff --git a/src/prof.c b/src/prof.c index 84fa5fd..e86669c 100644 --- a/src/prof.c +++ b/src/prof.c @@ -642,10 +642,13 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) ckh_remove(tsd, &tdata->bt2tctx, &gctx->bt, NULL, NULL); destroy_tdata = prof_tdata_should_destroy(tdata, false); + if (tctx->state == prof_tctx_state_nominal) + tctx->state = prof_tctx_state_destroying; malloc_mutex_unlock(tdata->lock); malloc_mutex_lock(gctx->lock); - if (tctx->state != prof_tctx_state_dumping) { + switch (tctx->state) { + case prof_tctx_state_destroying: tctx_tree_remove(&gctx->tctxs, tctx); destroy_tctx = true; if (prof_gctx_should_destroy(gctx)) { @@ -667,7 +670,8 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) destroy_gctx = true; } else destroy_gctx = false; - } else { + break; + case prof_tctx_state_dumping: /* * A dumping thread needs tctx to remain valid until dumping * has finished. 
Change state such that the dumping thread will @@ -676,6 +680,9 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) tctx->state = prof_tctx_state_purgatory; destroy_tctx = false; destroy_gctx = false; + break; + default: + not_reached(); } malloc_mutex_unlock(gctx->lock); if (destroy_gctx) { @@ -1021,21 +1028,30 @@ prof_tctx_merge_tdata(prof_tctx_t *tctx, prof_tdata_t *tdata) { malloc_mutex_lock(tctx->gctx->lock); - if (tctx->state == prof_tctx_state_initializing) { + + switch (tctx->state) { + case prof_tctx_state_initializing: + case prof_tctx_state_destroying: malloc_mutex_unlock(tctx->gctx->lock); return; - } - assert(tctx->state == prof_tctx_state_nominal); - tctx->state = prof_tctx_state_dumping; - malloc_mutex_unlock(tctx->gctx->lock); + case prof_tctx_state_nominal: + tctx->state = prof_tctx_state_dumping; + malloc_mutex_unlock(tctx->gctx->lock); - memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); + memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); - tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; - tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes; - if (opt_prof_accum) { - tdata->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; - tdata->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + if (opt_prof_accum) { + tdata->cnt_summed.accumobjs += + tctx->dump_cnts.accumobjs; + tdata->cnt_summed.accumbytes += + tctx->dump_cnts.accumbytes; + } + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + not_reached(); } } @@ -1059,6 +1075,7 @@ prof_tctx_merge_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) switch (tctx->state) { case prof_tctx_state_nominal: + case prof_tctx_state_destroying: /* New since dumping started; ignore. */ break; case prof_tctx_state_dumping: @@ -1094,6 +1111,7 @@ prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) switch (tctx->state) { case prof_tctx_state_nominal: + case prof_tctx_state_destroying: /* New since dumping started; ignore. */ break; case prof_tctx_state_dumping: -- cgit v0.12 From 262146dfc4778f0671ab86458acd4ec531a80a34 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 14 Mar 2015 14:34:16 -0700 Subject: Eliminate innocuous compiler warnings. --- src/prof.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/prof.c b/src/prof.c index e86669c..e9daa6f 100644 --- a/src/prof.c +++ b/src/prof.c @@ -683,6 +683,8 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) break; default: not_reached(); + destroy_tctx = false; + destroy_gctx = false; } malloc_mutex_unlock(gctx->lock); if (destroy_gctx) { -- cgit v0.12 From 04211e226628c41da4b3804ba411b5dd4b3a02ab Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 16 Mar 2015 15:11:06 -0700 Subject: Fix heap profiling regressions. Remove the prof_tctx_state_destroying transitory state and instead add the tctx_uid field, so that the tuple uniquely identifies a tctx. This assures that tctx's are well ordered even when more than two with the same thr_uid coexist. A previous attempted fix based on prof_tctx_state_destroying was only sufficient for protecting against two coexisting tctx's, but it also introduced a new dumping race. These regressions were introduced by 602c8e0971160e4b85b08b16cf8a2375aa24bc04 (Implement per thread heap profiling.) and 764b00023f2bc97f240c3a758ed23ce9c0ad8526 (Fix a heap profiling regression.). 
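In the diff below, prof_tctx_comp() orders tctx's by thr_uid and then by the new tctx_uid, using the branchless (a > b) - (a < b) idiom for each key, so the (thr_uid, tctx_uid) pair yields a total order. A generic sketch of that two-key comparison with illustrative names, not the patch's types:

#include <assert.h>
#include <stdint.h>

typedef struct {
	uint64_t	thr_uid;	/* Primary key: owning thread. */
	uint64_t	tctx_uid;	/* Tie-breaker, unique within a thread. */
} id_pair_t;

/* Branchless three-way compare: returns -1, 0, or 1. */
static int
u64_cmp(uint64_t a, uint64_t b)
{

	return ((a > b) - (a < b));
}

/* Lexicographic order over (thr_uid, tctx_uid). */
static int
id_pair_cmp(const id_pair_t *a, const id_pair_t *b)
{
	int ret = u64_cmp(a->thr_uid, b->thr_uid);

	if (ret == 0)
		ret = u64_cmp(a->tctx_uid, b->tctx_uid);
	return (ret);
}

int
main(void)
{
	id_pair_t a = {1, 7}, b = {1, 9};

	assert(id_pair_cmp(&a, &b) < 0);
	assert(id_pair_cmp(&b, &a) > 0);
	assert(id_pair_cmp(&a, &a) == 0);
	return (0);
}

Because tctx_uid_next is only ever incremented by the thread that owns the tdata, the counter needs no locking while still guaranteeing that coexisting tctx's with the same thr_uid compare unequal.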
--- include/jemalloc/internal/prof.h | 23 ++++++++++++++++++++++- src/prof.c | 21 +++++++++------------ 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 8967333..2e22711 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -81,7 +81,6 @@ struct prof_cnt_s { typedef enum { prof_tctx_state_initializing, prof_tctx_state_nominal, - prof_tctx_state_destroying, prof_tctx_state_dumping, prof_tctx_state_purgatory /* Dumper must finish destroying. */ } prof_tctx_state_t; @@ -102,6 +101,21 @@ struct prof_tctx_s { /* Associated global context. */ prof_gctx_t *gctx; + /* + * UID that distinguishes multiple tctx's created by the same thread, + * but coexisting in gctx->tctxs. There are two ways that such + * coexistence can occur: + * - A dumper thread can cause a tctx to be retained in the purgatory + * state. + * - Although a single "producer" thread must create all tctx's which + * share the same thr_uid, multiple "consumers" can each concurrently + * execute portions of prof_tctx_destroy(). prof_tctx_destroy() only + * gets called once each time cnts.cur{objs,bytes} drop to 0, but this + * threshold can be hit again before the first consumer finishes + * executing prof_tctx_destroy(). + */ + uint64_t tctx_uid; + /* Linkage into gctx's tctxs. */ rb_node(prof_tctx_t) tctx_link; @@ -179,6 +193,13 @@ struct prof_tdata_s { rb_node(prof_tdata_t) tdata_link; /* + * Counter used to initialize prof_tctx_t's tctx_uid. No locking is + * necessary when incrementing this field, because only one thread ever + * does so. + */ + uint64_t tctx_uid_next; + + /* * Hash of (prof_bt_t *)-->(prof_tctx_t *). Each thread tracks * backtraces for which it has non-zero allocation/deallocation counters * associated with thread-specific prof_tctx_t objects. 
Other threads diff --git a/src/prof.c b/src/prof.c index e9daa6f..f2a3725 100644 --- a/src/prof.c +++ b/src/prof.c @@ -135,13 +135,13 @@ static char *prof_thread_name_alloc(tsd_t *tsd, const char *thread_name); JEMALLOC_INLINE_C int prof_tctx_comp(const prof_tctx_t *a, const prof_tctx_t *b) { - uint64_t a_uid = a->thr_uid; - uint64_t b_uid = b->thr_uid; - int ret = (a_uid > b_uid) - (a_uid < b_uid); + uint64_t a_thr_uid = a->thr_uid; + uint64_t b_thr_uid = b->thr_uid; + int ret = (a_thr_uid > b_thr_uid) - (a_thr_uid < b_thr_uid); if (ret == 0) { - prof_tctx_state_t a_state = a->state; - prof_tctx_state_t b_state = b->state; - ret = (a_state > b_state) - (a_state < b_state); + uint64_t a_tctx_uid = a->tctx_uid; + uint64_t b_tctx_uid = b->tctx_uid; + ret = (a_tctx_uid > b_tctx_uid) - (a_tctx_uid < b_tctx_uid); } return (ret); } @@ -642,13 +642,11 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) ckh_remove(tsd, &tdata->bt2tctx, &gctx->bt, NULL, NULL); destroy_tdata = prof_tdata_should_destroy(tdata, false); - if (tctx->state == prof_tctx_state_nominal) - tctx->state = prof_tctx_state_destroying; malloc_mutex_unlock(tdata->lock); malloc_mutex_lock(gctx->lock); switch (tctx->state) { - case prof_tctx_state_destroying: + case prof_tctx_state_nominal: tctx_tree_remove(&gctx->tctxs, tctx); destroy_tctx = true; if (prof_gctx_should_destroy(gctx)) { @@ -795,6 +793,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) ret.p->thr_uid = tdata->thr_uid; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); ret.p->gctx = gctx; + ret.p->tctx_uid = tdata->tctx_uid_next++; ret.p->prepared = true; ret.p->state = prof_tctx_state_initializing; malloc_mutex_lock(tdata->lock); @@ -1033,7 +1032,6 @@ prof_tctx_merge_tdata(prof_tctx_t *tctx, prof_tdata_t *tdata) switch (tctx->state) { case prof_tctx_state_initializing: - case prof_tctx_state_destroying: malloc_mutex_unlock(tctx->gctx->lock); return; case prof_tctx_state_nominal: @@ -1077,7 +1075,6 @@ prof_tctx_merge_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) switch (tctx->state) { case prof_tctx_state_nominal: - case prof_tctx_state_destroying: /* New since dumping started; ignore. */ break; case prof_tctx_state_dumping: @@ -1113,7 +1110,6 @@ prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) switch (tctx->state) { case prof_tctx_state_nominal: - case prof_tctx_state_destroying: /* New since dumping started; ignore. 
*/ break; case prof_tctx_state_dumping: @@ -1690,6 +1686,7 @@ prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, tdata->thread_name = thread_name; tdata->attached = true; tdata->expired = false; + tdata->tctx_uid_next = 0; if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) { -- cgit v0.12 From c9db461ffb608ad32aed0e34663ae58a992e1003 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Tue, 17 Mar 2015 12:09:30 +0900 Subject: Use InterlockedCompareExchange instead of non-existing InterlockedCompareExchange32 --- include/jemalloc/internal/atomic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index 0d33065..522dd2a 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -457,7 +457,7 @@ atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) { uint32_t o; - o = InterlockedCompareExchange32(p, s, c); + o = InterlockedCompareExchange(p, s, c); return (o != c); } -- cgit v0.12 From 8d6a3e8321a7767cb2ca0930b85d5d488a8cc659 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 18 Mar 2015 18:55:33 -0700 Subject: Implement dynamic per arena control over dirty page purging. Add mallctls: - arenas.lg_dirty_mult is initialized via opt.lg_dirty_mult, and can be modified to change the initial lg_dirty_mult setting for newly created arenas. - arena..lg_dirty_mult controls an individual arena's dirty page purging threshold, and synchronously triggers any purging that may be necessary to maintain the constraint. - arena..chunk.purge allows the per arena dirty page purging function to be replaced. This resolves #93. --- doc/jemalloc.xml.in | 88 +++++++++++++++++-- include/jemalloc/internal/arena.h | 16 +++- include/jemalloc/internal/chunk.h | 6 ++ include/jemalloc/internal/private_symbols.txt | 7 ++ include/jemalloc/jemalloc_typedefs.h.in | 1 + src/arena.c | 87 +++++++++++++++--- src/chunk.c | 37 +++++++- src/ctl.c | 121 ++++++++++++++++++-------- src/huge.c | 38 +++++--- src/stats.c | 10 +++ test/integration/chunk.c | 66 +++++++++++--- test/unit/mallctl.c | 66 ++++++++++++++ test/unit/rtree.c | 10 +-- 13 files changed, 457 insertions(+), 96 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 747e03f..01ac38c 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -937,7 +937,11 @@ for (i = 0; i < nbins; i++) { provides the kernel with sufficient information to recycle dirty pages if physical memory becomes scarce and the pages remain unused. The default minimum ratio is 8:1 (2^3:1); an option value of -1 will - disable dirty page purging. + disable dirty page purging. See arenas.lg_dirty_mult + and arena.<i>.lg_dirty_mult + for related dynamic control options. @@ -1151,7 +1155,7 @@ malloc_conf = "xmalloc:true";]]> opt.prof_active (bool) - rw + r- [] Profiling activated/deactivated. This is a secondary @@ -1489,6 +1493,20 @@ malloc_conf = "xmalloc:true";]]> settings. + + + arena.<i>.lg_dirty_mult + (ssize_t) + rw + + Current per-arena minimum ratio (log base 2) of active + to dirty pages for arena <i>. Each time this interface is set and + the ratio is increased, pages are synchronously purged as necessary to + impose the new ratio. See opt.lg_dirty_mult + for additional information. 
+ + arena.<i>.chunk.alloc @@ -1544,12 +1562,12 @@ malloc_conf = "xmalloc:true";]]> allocation for arenas created via arenas.extend such that all chunks originate from an application-supplied chunk allocator - (by setting custom chunk allocation/deallocation functions just after - arena creation), but the automatically created arenas may have already - created chunks prior to the application having an opportunity to take - over chunk allocation. + (by setting custom chunk allocation/deallocation/purge functions just + after arena creation), but the automatically created arenas may have + already created chunks prior to the application having an opportunity to + take over chunk allocation. - typedef void (chunk_dalloc_t) + typedef bool (chunk_dalloc_t) void *chunk size_t size unsigned arena_ind @@ -1557,7 +1575,47 @@ malloc_conf = "xmalloc:true";]]> A chunk deallocation function conforms to the chunk_dalloc_t type and deallocates a chunk of given size on - behalf of arena arena_ind. + behalf of arena arena_ind, returning false upon + success. + + + + + arena.<i>.chunk.purge + (chunk_purge_t *) + rw + + Get or set the chunk purge function for arena <i>. + A chunk purge function optionally discards physical pages associated + with pages in the chunk's virtual memory range but leaves the virtual + memory mapping intact, and indicates via its return value whether pages + in the virtual memory range will be zero-filled the next time they are + accessed. If setting, the chunk purge function must be capable of + purging all extant chunks associated with arena <i>, usually by + passing unknown chunks to the purge function that was replaced. In + practice, it is feasible to control allocation for arenas created via + arenas.extend + such that all chunks originate from an application-supplied chunk + allocator (by setting custom chunk allocation/deallocation/purge + functions just after arena creation), but the automatically created + arenas may have already created chunks prior to the application having + an opportunity to take over chunk allocation. + + typedef bool (chunk_purge_t) + void *chunk + size_t offset + size_t length + unsigned arena_ind + + A chunk purge function conforms to the chunk_purge_t type + and purges pages within chunk at + offset bytes, extending for + length on behalf of arena + arena_ind, returning false if pages within the + purged virtual memory range will be zero-filled the next time they are + accessed. Note that the memory range being purged may span multiple + contiguous chunks, e.g. when purging memory that backed a huge + allocation. @@ -1581,6 +1639,20 @@ malloc_conf = "xmalloc:true";]]> initialized. + + + arenas.lg_dirty_mult + (ssize_t) + rw + + Current default per-arena minimum ratio (log base 2) of + active to dirty pages, used to initialize arena.<i>.lg_dirty_mult + during arena creation. See opt.lg_dirty_mult + for additional information. + + arenas.quantum diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 9cbc591..56ee74a 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -16,10 +16,10 @@ /* * The minimum ratio of active:dirty pages per arena is computed as: * - * (nactive >> opt_lg_dirty_mult) >= ndirty + * (nactive >> lg_dirty_mult) >= ndirty * - * So, supposing that opt_lg_dirty_mult is 3, there can be no less than 8 times - * as many active pages as dirty pages. + * So, supposing that lg_dirty_mult is 3, there can be no less than 8 times as + * many active pages as dirty pages. 
*/ #define LG_DIRTY_MULT_DEFAULT 3 @@ -304,6 +304,9 @@ struct arena_s { */ arena_chunk_t *spare; + /* Minimum ratio (log base 2) of nactive:ndirty. */ + ssize_t lg_dirty_mult; + /* Number of pages in active runs and huge regions. */ size_t nactive; @@ -376,10 +379,11 @@ struct arena_s { malloc_mutex_t node_cache_mtx; /* - * User-configurable chunk allocation and deallocation functions. + * User-configurable chunk allocation/deallocation/purge functions. */ chunk_alloc_t *chunk_alloc; chunk_dalloc_t *chunk_dalloc; + chunk_purge_t *chunk_purge; /* bins is used to store trees of free regions. */ arena_bin_t bins[NBINS]; @@ -416,6 +420,8 @@ void arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize, size_t usize); bool arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, size_t usize, bool *zero); +ssize_t arena_lg_dirty_mult_get(arena_t *arena); +bool arena_lg_dirty_mult_set(arena_t *arena, ssize_t lg_dirty_mult); void arena_maybe_purge(arena_t *arena); void arena_purge_all(arena_t *arena); void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, @@ -462,6 +468,8 @@ void *arena_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t extra, size_t alignment, bool zero, tcache_t *tcache); dss_prec_t arena_dss_prec_get(arena_t *arena); bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); +ssize_t arena_lg_dirty_mult_default_get(void); +bool arena_lg_dirty_mult_default_set(ssize_t lg_dirty_mult); void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats); diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 1af5b24..8093814 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -54,6 +54,12 @@ void chunk_dalloc_arena(arena_t *arena, void *chunk, size_t size, bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind); void chunk_dalloc_wrapper(arena_t *arena, chunk_dalloc_t *chunk_dalloc, void *chunk, size_t size); +bool chunk_purge_arena(arena_t *arena, void *chunk, size_t offset, + size_t length); +bool chunk_purge_default(void *chunk, size_t offset, size_t length, + unsigned arena_ind); +bool chunk_purge_wrapper(arena_t *arena, chunk_purge_t *chunk_purge, + void *chunk, size_t offset, size_t length); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index d086db1..bc0f2a6 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -30,6 +30,10 @@ arena_dalloc_small arena_dss_prec_get arena_dss_prec_set arena_init +arena_lg_dirty_mult_default_get +arena_lg_dirty_mult_default_set +arena_lg_dirty_mult_get +arena_lg_dirty_mult_set arena_malloc arena_malloc_large arena_malloc_small @@ -151,6 +155,9 @@ chunk_npages chunk_postfork_child chunk_postfork_parent chunk_prefork +chunk_purge_arena +chunk_purge_default +chunk_purge_wrapper chunk_record chunk_register chunks_rtree diff --git a/include/jemalloc/jemalloc_typedefs.h.in b/include/jemalloc/jemalloc_typedefs.h.in index 8092f1b..d4b4690 100644 --- a/include/jemalloc/jemalloc_typedefs.h.in +++ b/include/jemalloc/jemalloc_typedefs.h.in @@ -1,2 +1,3 @@ typedef void *(chunk_alloc_t)(void *, size_t, size_t, bool *, unsigned); typedef bool 
(chunk_dalloc_t)(void *, size_t, unsigned); +typedef bool (chunk_purge_t)(void *, size_t, size_t, unsigned); diff --git a/src/arena.c b/src/arena.c index e36cb50..7272682 100644 --- a/src/arena.c +++ b/src/arena.c @@ -5,6 +5,7 @@ /* Data. */ ssize_t opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT; +static ssize_t lg_dirty_mult_default; arena_bin_info_t arena_bin_info[NBINS]; size_t map_bias; @@ -1032,15 +1033,49 @@ arena_run_alloc_small(arena_t *arena, size_t size, index_t binind) return (arena_run_alloc_small_helper(arena, size, binind)); } +static bool +arena_lg_dirty_mult_valid(ssize_t lg_dirty_mult) +{ + + return (lg_dirty_mult >= -1 && lg_dirty_mult < (sizeof(size_t) << 3)); +} + +ssize_t +arena_lg_dirty_mult_get(arena_t *arena) +{ + ssize_t lg_dirty_mult; + + malloc_mutex_lock(&arena->lock); + lg_dirty_mult = arena->lg_dirty_mult; + malloc_mutex_unlock(&arena->lock); + + return (lg_dirty_mult); +} + +bool +arena_lg_dirty_mult_set(arena_t *arena, ssize_t lg_dirty_mult) +{ + + if (!arena_lg_dirty_mult_valid(lg_dirty_mult)) + return (true); + + malloc_mutex_lock(&arena->lock); + arena->lg_dirty_mult = lg_dirty_mult; + arena_maybe_purge(arena); + malloc_mutex_unlock(&arena->lock); + + return (false); +} + void arena_maybe_purge(arena_t *arena) { size_t threshold; /* Don't purge if the option is disabled. */ - if (opt_lg_dirty_mult < 0) + if (arena->lg_dirty_mult < 0) return; - threshold = (arena->nactive >> opt_lg_dirty_mult); + threshold = (arena->nactive >> arena->lg_dirty_mult); threshold = threshold < chunk_npages ? chunk_npages : threshold; /* * Don't purge unless the number of purgeable pages exceeds the @@ -1096,7 +1131,7 @@ arena_compute_npurge(arena_t *arena, bool all) * purge. */ if (!all) { - size_t threshold = (arena->nactive >> opt_lg_dirty_mult); + size_t threshold = (arena->nactive >> arena->lg_dirty_mult); threshold = threshold < chunk_npages ? chunk_npages : threshold; npurge = arena->ndirty - threshold; @@ -1192,6 +1227,7 @@ arena_purge_stashed(arena_t *arena, extent_node_t *purge_chunks_sentinel) { size_t npurged, nmadvise; + chunk_purge_t *chunk_purge; arena_runs_dirty_link_t *rdelm; extent_node_t *chunkselm; @@ -1199,6 +1235,7 @@ arena_purge_stashed(arena_t *arena, nmadvise = 0; npurged = 0; + chunk_purge = arena->chunk_purge; malloc_mutex_unlock(&arena->lock); for (rdelm = qr_next(purge_runs_sentinel, rd_link), chunkselm = qr_next(purge_chunks_sentinel, cc_link); @@ -1207,11 +1244,16 @@ arena_purge_stashed(arena_t *arena, if (rdelm == &chunkselm->rd) { size_t size = extent_node_size_get(chunkselm); + void *addr, *chunk; + size_t offset; bool unzeroed; npages = size >> LG_PAGE; - unzeroed = pages_purge(extent_node_addr_get(chunkselm), - size); + addr = extent_node_addr_get(chunkselm); + chunk = CHUNK_ADDR2BASE(addr); + offset = CHUNK_ADDR2OFFSET(addr); + unzeroed = chunk_purge_wrapper(arena, chunk_purge, + chunk, offset, size); extent_node_zeroed_set(chunkselm, !unzeroed); chunkselm = qr_next(chunkselm, cc_link); } else { @@ -1226,15 +1268,15 @@ arena_purge_stashed(arena_t *arena, npages = run_size >> LG_PAGE; assert(pageind + npages <= chunk_npages); - unzeroed = pages_purge((void *)((uintptr_t)chunk + - (pageind << LG_PAGE)), run_size); + unzeroed = chunk_purge_wrapper(arena, chunk_purge, + chunk, pageind << LG_PAGE, run_size); flag_unzeroed = unzeroed ? CHUNK_MAP_UNZEROED : 0; /* * Set the unzeroed flag for all pages, now that - * pages_purge() has returned whether the pages were - * zeroed as a side effect of purging. 
This chunk map - * modification is safe even though the arena mutex + * chunk_purge_wrapper() has returned whether the pages + * were zeroed as a side effect of purging. This chunk + * map modification is safe even though the arena mutex * isn't currently owned by this thread, because the run * is marked as allocated, thus protecting it from being * modified by any other thread. As long as these @@ -1294,7 +1336,7 @@ arena_unstash_purged(arena_t *arena, } } -void +static void arena_purge(arena_t *arena, bool all) { size_t npurge, npurgeable, npurged; @@ -1309,7 +1351,7 @@ arena_purge(arena_t *arena, bool all) size_t ndirty = arena_dirty_count(arena); assert(ndirty == arena->ndirty); } - assert((arena->nactive >> opt_lg_dirty_mult) < arena->ndirty || all); + assert((arena->nactive >> arena->lg_dirty_mult) < arena->ndirty || all); if (config_stats) arena->stats.npurge++; @@ -2596,6 +2638,23 @@ arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec) return (false); } +ssize_t +arena_lg_dirty_mult_default_get(void) +{ + + return ((ssize_t)atomic_read_z((size_t *)&lg_dirty_mult_default)); +} + +bool +arena_lg_dirty_mult_default_set(ssize_t lg_dirty_mult) +{ + + if (!arena_lg_dirty_mult_valid(lg_dirty_mult)) + return (true); + atomic_write_z((size_t *)&lg_dirty_mult_default, (size_t)lg_dirty_mult); + return (false); +} + void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, @@ -2702,6 +2761,7 @@ arena_new(unsigned ind) arena->spare = NULL; + arena->lg_dirty_mult = arena_lg_dirty_mult_default_get(); arena->nactive = 0; arena->ndirty = 0; @@ -2727,6 +2787,7 @@ arena_new(unsigned ind) arena->chunk_alloc = chunk_alloc_default; arena->chunk_dalloc = chunk_dalloc_default; + arena->chunk_purge = chunk_purge_default; /* Initialize bins. */ for (i = 0; i < NBINS; i++) { @@ -2860,6 +2921,8 @@ arena_boot(void) size_t header_size; unsigned i; + arena_lg_dirty_mult_default_set(opt_lg_dirty_mult); + /* * Compute the header size such that it is large enough to contain the * page map. The page map is biased to omit entries for the header diff --git a/src/chunk.c b/src/chunk.c index fb8cd41..7063410 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -391,8 +391,10 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, * pages have already been purged, so that this is only * a virtual memory leak. 
*/ - if (cache) - pages_purge(chunk, size); + if (cache) { + chunk_purge_wrapper(arena, arena->chunk_purge, + chunk, 0, size); + } goto label_return; } extent_node_init(node, arena, chunk, size, !unzeroed); @@ -485,6 +487,37 @@ chunk_dalloc_wrapper(arena_t *arena, chunk_dalloc_t *chunk_dalloc, void *chunk, JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); } +bool +chunk_purge_arena(arena_t *arena, void *chunk, size_t offset, size_t length) +{ + + assert(chunk != NULL); + assert(CHUNK_ADDR2BASE(chunk) == chunk); + assert((offset & PAGE_MASK) == 0); + assert(length != 0); + assert((length & PAGE_MASK) == 0); + + return (pages_purge((void *)((uintptr_t)chunk + (uintptr_t)offset), + length)); +} + +bool +chunk_purge_default(void *chunk, size_t offset, size_t length, + unsigned arena_ind) +{ + + return (chunk_purge_arena(chunk_arena_get(arena_ind), chunk, offset, + length)); +} + +bool +chunk_purge_wrapper(arena_t *arena, chunk_purge_t *chunk_purge, void *chunk, + size_t offset, size_t length) +{ + + return (chunk_purge(chunk, offset, length, arena->ind)); +} + static rtree_node_elm_t * chunks_rtree_node_alloc(size_t nelms) { diff --git a/src/ctl.c b/src/ctl.c index cd7927f..447b877 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -116,8 +116,10 @@ CTL_PROTO(tcache_destroy) CTL_PROTO(arena_i_purge) static void arena_purge(unsigned arena_ind); CTL_PROTO(arena_i_dss) +CTL_PROTO(arena_i_lg_dirty_mult) CTL_PROTO(arena_i_chunk_alloc) CTL_PROTO(arena_i_chunk_dalloc) +CTL_PROTO(arena_i_chunk_purge) INDEX_PROTO(arena_i) CTL_PROTO(arenas_bin_i_size) CTL_PROTO(arenas_bin_i_nregs) @@ -129,6 +131,7 @@ CTL_PROTO(arenas_hchunk_i_size) INDEX_PROTO(arenas_hchunk_i) CTL_PROTO(arenas_narenas) CTL_PROTO(arenas_initialized) +CTL_PROTO(arenas_lg_dirty_mult) CTL_PROTO(arenas_quantum) CTL_PROTO(arenas_page) CTL_PROTO(arenas_tcache_max) @@ -283,12 +286,14 @@ static const ctl_named_node_t tcache_node[] = { static const ctl_named_node_t chunk_node[] = { {NAME("alloc"), CTL(arena_i_chunk_alloc)}, - {NAME("dalloc"), CTL(arena_i_chunk_dalloc)} + {NAME("dalloc"), CTL(arena_i_chunk_dalloc)}, + {NAME("purge"), CTL(arena_i_chunk_purge)} }; static const ctl_named_node_t arena_i_node[] = { {NAME("purge"), CTL(arena_i_purge)}, {NAME("dss"), CTL(arena_i_dss)}, + {NAME("lg_dirty_mult"), CTL(arena_i_lg_dirty_mult)}, {NAME("chunk"), CHILD(named, chunk)}, }; static const ctl_named_node_t super_arena_i_node[] = { @@ -337,6 +342,7 @@ static const ctl_indexed_node_t arenas_hchunk_node[] = { static const ctl_named_node_t arenas_node[] = { {NAME("narenas"), CTL(arenas_narenas)}, {NAME("initialized"), CTL(arenas_initialized)}, + {NAME("lg_dirty_mult"), CTL(arenas_lg_dirty_mult)}, {NAME("quantum"), CTL(arenas_quantum)}, {NAME("page"), CTL(arenas_page)}, {NAME("tcache_max"), CTL(arenas_tcache_max)}, @@ -1617,58 +1623,71 @@ label_return: } static int -arena_i_chunk_alloc_ctl(const size_t *mib, size_t miblen, void *oldp, +arena_i_lg_dirty_mult_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; unsigned arena_ind = mib[1]; arena_t *arena; - malloc_mutex_lock(&ctl_mtx); - if (arena_ind < narenas_total_get() && (arena = arena_get(tsd_fetch(), - arena_ind, false, true)) != NULL) { - malloc_mutex_lock(&arena->lock); - READ(arena->chunk_alloc, chunk_alloc_t *); - WRITE(arena->chunk_alloc, chunk_alloc_t *); - } else { + arena = arena_get(tsd_fetch(), arena_ind, false, (arena_ind == 0)); + if (arena == NULL) { ret = EFAULT; - goto label_outer_return; + goto label_return; } - ret = 0; -label_return: - 
malloc_mutex_unlock(&arena->lock); -label_outer_return: - malloc_mutex_unlock(&ctl_mtx); - return (ret); -} - -static int -arena_i_chunk_dalloc_ctl(const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) -{ - - int ret; - unsigned arena_ind = mib[1]; - arena_t *arena; - malloc_mutex_lock(&ctl_mtx); - if (arena_ind < narenas_total_get() && (arena = arena_get(tsd_fetch(), - arena_ind, false, true)) != NULL) { - malloc_mutex_lock(&arena->lock); - READ(arena->chunk_dalloc, chunk_dalloc_t *); - WRITE(arena->chunk_dalloc, chunk_dalloc_t *); - } else { - ret = EFAULT; - goto label_outer_return; + if (oldp != NULL && oldlenp != NULL) { + size_t oldval = arena_lg_dirty_mult_get(arena); + READ(oldval, ssize_t); } + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + if (arena_lg_dirty_mult_set(arena, *(ssize_t *)newp)) { + ret = EFAULT; + goto label_return; + } + } + ret = 0; label_return: - malloc_mutex_unlock(&arena->lock); -label_outer_return: - malloc_mutex_unlock(&ctl_mtx); return (ret); } +#define CHUNK_FUNC(n) \ +static int \ +arena_i_chunk_##n##_ctl(const size_t *mib, size_t miblen, void *oldp, \ + size_t *oldlenp, void *newp, size_t newlen) \ +{ \ + \ + int ret; \ + unsigned arena_ind = mib[1]; \ + arena_t *arena; \ + \ + malloc_mutex_lock(&ctl_mtx); \ + if (arena_ind < narenas_total_get() && (arena = \ + arena_get(tsd_fetch(), arena_ind, false, true)) != NULL) { \ + malloc_mutex_lock(&arena->lock); \ + READ(arena->chunk_##n, chunk_##n##_t *); \ + WRITE(arena->chunk_##n, chunk_##n##_t *); \ + } else { \ + ret = EFAULT; \ + goto label_outer_return; \ + } \ + ret = 0; \ +label_return: \ + malloc_mutex_unlock(&arena->lock); \ +label_outer_return: \ + malloc_mutex_unlock(&ctl_mtx); \ + return (ret); \ +} +CHUNK_FUNC(alloc) +CHUNK_FUNC(dalloc) +CHUNK_FUNC(purge) +#undef CHUNK_FUNC + static const ctl_named_node_t * arena_i_index(const size_t *mib, size_t miblen, size_t i) { @@ -1736,6 +1755,32 @@ label_return: return (ret); } +static int +arenas_lg_dirty_mult_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + + if (oldp != NULL && oldlenp != NULL) { + size_t oldval = arena_lg_dirty_mult_default_get(); + READ(oldval, ssize_t); + } + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + if (arena_lg_dirty_mult_default_set(*(ssize_t *)newp)) { + ret = EFAULT; + goto label_return; + } + } + + ret = 0; +label_return: + return (ret); +} + CTL_RO_NL_GEN(arenas_quantum, QUANTUM, size_t) CTL_RO_NL_GEN(arenas_page, PAGE, size_t) CTL_RO_NL_CGEN(config_tcache, arenas_tcache_max, tcache_maxclass, size_t) diff --git a/src/huge.c b/src/huge.c index 3092932..aa26f5d 100644 --- a/src/huge.c +++ b/src/huge.c @@ -124,9 +124,10 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, size_t size, size_t extra, bool zero) { size_t usize_next; - bool zeroed; extent_node_t *node; arena_t *arena; + chunk_purge_t *chunk_purge; + bool zeroed; /* Increase usize to incorporate extra. */ while (usize < s2u(size+extra) && (usize_next = s2u(usize+1)) < oldsize) @@ -135,11 +136,18 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, if (oldsize == usize) return; + node = huge_node_get(ptr); + arena = extent_node_arena_get(node); + + malloc_mutex_lock(&arena->lock); + chunk_purge = arena->chunk_purge; + malloc_mutex_unlock(&arena->lock); + /* Fill if necessary (shrinking). 
*/ if (oldsize > usize) { size_t sdiff = CHUNK_CEILING(usize) - usize; - zeroed = (sdiff != 0) ? !pages_purge((void *)((uintptr_t)ptr + - usize), sdiff) : true; + zeroed = (sdiff != 0) ? !chunk_purge_wrapper(arena, chunk_purge, + CHUNK_ADDR2BASE(ptr), CHUNK_ADDR2OFFSET(ptr), usize) : true; if (config_fill && unlikely(opt_junk_free)) { memset((void *)((uintptr_t)ptr + usize), 0x5a, oldsize - usize); @@ -148,8 +156,6 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, } else zeroed = true; - node = huge_node_get(ptr); - arena = extent_node_arena_get(node); malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ assert(extent_node_size_get(node) != usize); @@ -177,22 +183,29 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, static void huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) { - size_t sdiff; - bool zeroed; extent_node_t *node; arena_t *arena; + chunk_purge_t *chunk_purge; + size_t sdiff; + bool zeroed; + + node = huge_node_get(ptr); + arena = extent_node_arena_get(node); + + malloc_mutex_lock(&arena->lock); + chunk_purge = arena->chunk_purge; + malloc_mutex_unlock(&arena->lock); sdiff = CHUNK_CEILING(usize) - usize; - zeroed = (sdiff != 0) ? !pages_purge((void *)((uintptr_t)ptr + usize), - sdiff) : true; + zeroed = (sdiff != 0) ? !chunk_purge_wrapper(arena, chunk_purge, + CHUNK_ADDR2BASE((uintptr_t)ptr + usize), + CHUNK_ADDR2OFFSET((uintptr_t)ptr + usize), sdiff) : true; if (config_fill && unlikely(opt_junk_free)) { huge_dalloc_junk((void *)((uintptr_t)ptr + usize), oldsize - usize); zeroed = false; } - node = huge_node_get(ptr); - arena = extent_node_arena_get(node); malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ extent_node_size_set(node, usize); @@ -291,8 +304,7 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, } /* Attempt to expand the allocation in-place. 
*/ - if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra, - zero)) { + if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra, zero)) { if (extra == 0) return (true); diff --git a/src/stats.c b/src/stats.c index e0f7165..f246c8b 100644 --- a/src/stats.c +++ b/src/stats.c @@ -264,6 +264,7 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, { unsigned nthreads; const char *dss; + ssize_t lg_dirty_mult; size_t page, pactive, pdirty, mapped; size_t metadata_mapped, metadata_allocated; uint64_t npurge, nmadvise, purged; @@ -282,6 +283,15 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_I_GET("stats.arenas.0.dss", &dss, const char *); malloc_cprintf(write_cb, cbopaque, "dss allocation precedence: %s\n", dss); + CTL_I_GET("stats.arenas.0.lg_dirty_mult", &lg_dirty_mult, ssize_t); + if (lg_dirty_mult >= 0) { + malloc_cprintf(write_cb, cbopaque, + "Min active:dirty page ratio: %u:1\n", + (1U << lg_dirty_mult)); + } else { + malloc_cprintf(write_cb, cbopaque, + "Min active:dirty page ratio: N/A\n"); + } CTL_I_GET("stats.arenas.0.pactive", &pactive, size_t); CTL_I_GET("stats.arenas.0.pdirty", &pdirty, size_t); CTL_I_GET("stats.arenas.0.npurge", &npurge, uint64_t); diff --git a/test/integration/chunk.c b/test/integration/chunk.c index 8993850..de45bc5 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -2,6 +2,16 @@ chunk_alloc_t *old_alloc; chunk_dalloc_t *old_dalloc; +chunk_purge_t *old_purge; +bool purged; + +void * +chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero, + unsigned arena_ind) +{ + + return (old_alloc(new_addr, size, alignment, zero, arena_ind)); +} bool chunk_dalloc(void *chunk, size_t size, unsigned arena_ind) @@ -10,12 +20,12 @@ chunk_dalloc(void *chunk, size_t size, unsigned arena_ind) return (old_dalloc(chunk, size, arena_ind)); } -void * -chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero, - unsigned arena_ind) +bool +chunk_purge(void *chunk, size_t offset, size_t length, unsigned arena_ind) { - return (old_alloc(new_addr, size, alignment, zero, arena_ind)); + purged = true; + return (old_purge(chunk, offset, length, arena_ind)); } TEST_BEGIN(test_chunk) @@ -23,31 +33,59 @@ TEST_BEGIN(test_chunk) void *p; chunk_alloc_t *new_alloc; chunk_dalloc_t *new_dalloc; - size_t old_size, new_size; + chunk_purge_t *new_purge; + size_t old_size, new_size, huge0, huge1, huge2, sz; new_alloc = chunk_alloc; new_dalloc = chunk_dalloc; + new_purge = chunk_purge; old_size = sizeof(chunk_alloc_t *); new_size = sizeof(chunk_alloc_t *); - assert_d_eq(mallctl("arena.0.chunk.alloc", &old_alloc, - &old_size, &new_alloc, new_size), 0, - "Unexpected alloc error"); - assert_ptr_ne(old_alloc, new_alloc, - "Unexpected alloc error"); + assert_d_eq(mallctl("arena.0.chunk.alloc", &old_alloc, &old_size, + &new_alloc, new_size), 0, "Unexpected alloc error"); + assert_ptr_ne(old_alloc, new_alloc, "Unexpected alloc error"); + assert_d_eq(mallctl("arena.0.chunk.dalloc", &old_dalloc, &old_size, &new_dalloc, new_size), 0, "Unexpected dalloc error"); assert_ptr_ne(old_dalloc, new_dalloc, "Unexpected dalloc error"); + assert_d_eq(mallctl("arena.0.chunk.purge", &old_purge, &old_size, + &new_purge, new_size), 0, "Unexpected purge error"); + assert_ptr_ne(old_purge, new_purge, "Unexpected purge error"); + + sz = sizeof(size_t); + assert_d_eq(mallctl("arenas.hchunk.0.size", &huge0, &sz, NULL, 0), 0, + "Unexpected arenas.hchunk.0.size failure"); + assert_d_eq(mallctl("arenas.hchunk.1.size", &huge1, &sz, 
NULL, 0), 0, + "Unexpected arenas.hchunk.1.size failure"); + assert_d_eq(mallctl("arenas.hchunk.2.size", &huge2, &sz, NULL, 0), 0, + "Unexpected arenas.hchunk.2.size failure"); + if (huge0 * 2 > huge2) { + /* + * There are at least four size classes per doubling, so + * xallocx() from size=huge2 to size=huge1 is guaranteed to + * leave trailing purgeable memory. + */ + p = mallocx(huge2, 0); + assert_ptr_not_null(p, "Unexpected mallocx() error"); + purged = false; + assert_zu_eq(xallocx(p, huge1, 0, 0), huge1, + "Unexpected xallocx() failure"); + assert_true(purged, "Unexpected purge"); + dallocx(p, 0); + } + p = mallocx(42, 0); - assert_ptr_ne(p, NULL, "Unexpected alloc error"); + assert_ptr_not_null(p, "Unexpected mallocx() error"); free(p); - assert_d_eq(mallctl("arena.0.chunk.alloc", NULL, - NULL, &old_alloc, old_size), 0, - "Unexpected alloc error"); + assert_d_eq(mallctl("arena.0.chunk.alloc", NULL, NULL, &old_alloc, + old_size), 0, "Unexpected alloc error"); assert_d_eq(mallctl("arena.0.chunk.dalloc", NULL, NULL, &old_dalloc, old_size), 0, "Unexpected dalloc error"); + assert_d_eq(mallctl("arena.0.chunk.purge", NULL, NULL, &old_purge, + old_size), 0, "Unexpected purge error"); } TEST_END diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 5960496..31ada19 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -348,6 +348,38 @@ TEST_BEGIN(test_thread_arena) } TEST_END +TEST_BEGIN(test_arena_i_lg_dirty_mult) +{ + ssize_t lg_dirty_mult, orig_lg_dirty_mult, prev_lg_dirty_mult; + size_t sz = sizeof(ssize_t); + + assert_d_eq(mallctl("arena.0.lg_dirty_mult", &orig_lg_dirty_mult, &sz, + NULL, 0), 0, "Unexpected mallctl() failure"); + + lg_dirty_mult = -2; + assert_d_eq(mallctl("arena.0.lg_dirty_mult", NULL, NULL, + &lg_dirty_mult, sizeof(ssize_t)), EFAULT, + "Unexpected mallctl() success"); + + lg_dirty_mult = (sizeof(size_t) << 3); + assert_d_eq(mallctl("arena.0.lg_dirty_mult", NULL, NULL, + &lg_dirty_mult, sizeof(ssize_t)), EFAULT, + "Unexpected mallctl() success"); + + for (prev_lg_dirty_mult = orig_lg_dirty_mult, lg_dirty_mult = -1; + lg_dirty_mult < (sizeof(ssize_t) << 3); prev_lg_dirty_mult = + lg_dirty_mult, lg_dirty_mult++) { + ssize_t old_lg_dirty_mult; + + assert_d_eq(mallctl("arena.0.lg_dirty_mult", &old_lg_dirty_mult, + &sz, &lg_dirty_mult, sizeof(ssize_t)), 0, + "Unexpected mallctl() failure"); + assert_zd_eq(old_lg_dirty_mult, prev_lg_dirty_mult, + "Unexpected old arena.0.lg_dirty_mult"); + } +} +TEST_END + TEST_BEGIN(test_arena_i_purge) { unsigned narenas; @@ -427,6 +459,38 @@ TEST_BEGIN(test_arenas_initialized) } TEST_END +TEST_BEGIN(test_arenas_lg_dirty_mult) +{ + ssize_t lg_dirty_mult, orig_lg_dirty_mult, prev_lg_dirty_mult; + size_t sz = sizeof(ssize_t); + + assert_d_eq(mallctl("arenas.lg_dirty_mult", &orig_lg_dirty_mult, &sz, + NULL, 0), 0, "Unexpected mallctl() failure"); + + lg_dirty_mult = -2; + assert_d_eq(mallctl("arenas.lg_dirty_mult", NULL, NULL, + &lg_dirty_mult, sizeof(ssize_t)), EFAULT, + "Unexpected mallctl() success"); + + lg_dirty_mult = (sizeof(size_t) << 3); + assert_d_eq(mallctl("arenas.lg_dirty_mult", NULL, NULL, + &lg_dirty_mult, sizeof(ssize_t)), EFAULT, + "Unexpected mallctl() success"); + + for (prev_lg_dirty_mult = orig_lg_dirty_mult, lg_dirty_mult = -1; + lg_dirty_mult < (sizeof(ssize_t) << 3); prev_lg_dirty_mult = + lg_dirty_mult, lg_dirty_mult++) { + ssize_t old_lg_dirty_mult; + + assert_d_eq(mallctl("arenas.lg_dirty_mult", &old_lg_dirty_mult, + &sz, &lg_dirty_mult, sizeof(ssize_t)), 0, + "Unexpected mallctl() failure"); + 
assert_zd_eq(old_lg_dirty_mult, prev_lg_dirty_mult, + "Unexpected old arenas.lg_dirty_mult"); + } +} +TEST_END + TEST_BEGIN(test_arenas_constants) { @@ -554,9 +618,11 @@ main(void) test_tcache_none, test_tcache, test_thread_arena, + test_arena_i_lg_dirty_mult, test_arena_i_purge, test_arena_i_dss, test_arenas_initialized, + test_arenas_lg_dirty_mult, test_arenas_constants, test_arenas_bin_constants, test_arenas_lrun_constants, diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 556c4a8..496e03a 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -22,7 +22,7 @@ TEST_BEGIN(test_rtree_get_empty) rtree_t rtree; assert_false(rtree_new(&rtree, i, node_alloc, node_dalloc), "Unexpected rtree_new() failure"); - assert_ptr_eq(rtree_get(&rtree, 0), NULL, + assert_ptr_null(rtree_get(&rtree, 0), "rtree_get() should return NULL for empty tree"); rtree_delete(&rtree); } @@ -75,8 +75,8 @@ TEST_BEGIN(test_rtree_bits) "get key=%#"PRIxPTR, i, j, k, keys[j], keys[k]); } - assert_ptr_eq(rtree_get(&rtree, - (((uintptr_t)1) << (sizeof(uintptr_t)*8-i))), NULL, + assert_ptr_null(rtree_get(&rtree, + (((uintptr_t)1) << (sizeof(uintptr_t)*8-i))), "Only leftmost rtree leaf should be set; " "i=%u, j=%u", i, j); rtree_set(&rtree, keys[j], NULL); @@ -117,11 +117,11 @@ TEST_BEGIN(test_rtree_random) for (j = 0; j < NSET; j++) { rtree_set(&rtree, keys[j], NULL); - assert_ptr_eq(rtree_get(&rtree, keys[j]), NULL, + assert_ptr_null(rtree_get(&rtree, keys[j]), "rtree_get() should return previously set value"); } for (j = 0; j < NSET; j++) { - assert_ptr_eq(rtree_get(&rtree, keys[j]), NULL, + assert_ptr_null(rtree_get(&rtree, keys[j]), "rtree_get() should return previously set value"); } -- cgit v0.12 From e0a08a14962c8d6b09fd25ba9f3f6c57d5a4f844 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 18 Mar 2015 21:06:58 -0700 Subject: Restore --enable-ivsalloc. However, unlike before it was removed do not force --enable-ivsalloc when Darwin zone allocator integration is enabled, since the zone allocator code uses ivsalloc() regardless of whether malloc_usable_size() and sallocx() do. This resolves #211. --- ChangeLog | 2 -- INSTALL | 6 ++++++ configure.ac | 23 +++++++++++++++++++++- include/jemalloc/internal/jemalloc_internal.h.in | 7 +++++++ .../jemalloc/internal/jemalloc_internal_defs.h.in | 6 ++++++ src/jemalloc.c | 4 ++-- 6 files changed, 43 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index ef7dbfd..a462d02 100644 --- a/ChangeLog +++ b/ChangeLog @@ -133,8 +133,6 @@ found in the git revision history: - Remove the "stats.huge.allocated", "stats.huge.nmalloc", and "stats.huge.ndalloc" mallctls. - Remove the --enable-mremap option. - - Remove the --enable-ivsalloc option, and merge its functionality into - --enable-debug. - Remove the "stats.chunks.current", "stats.chunks.total", and "stats.chunks.high" mallctls. diff --git a/INSTALL b/INSTALL index 517fe02..cd760ca 100644 --- a/INSTALL +++ b/INSTALL @@ -92,6 +92,7 @@ any of the following arguments (not a definitive list) to 'configure': --enable-debug Enable assertions and validation code. This incurs a substantial performance hit, but is very useful during application development. + Implies --enable-ivsalloc. --enable-code-coverage Enable code coverage support, for use during jemalloc test development. @@ -110,6 +111,11 @@ any of the following arguments (not a definitive list) to 'configure': Disable statistics gathering functionality. See the "opt.stats_print" option documentation for usage details. 
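As a loose sketch of what the restored validation buys (the zero result for a pointer jemalloc does not own is an assumption about --enable-ivsalloc builds, not something stated in this patch):

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	void *p = mallocx(4096, 0);
	static char foreign[64];	/* Not allocated by jemalloc. */

	/* A jemalloc-owned pointer reports its usable size as usual. */
	printf("sallocx(p): %zu\n", sallocx(p, 0));

	/*
	 * With ivsalloc enabled, malloc_usable_size() first checks that the
	 * pointer lies within a jemalloc-owned chunk; a foreign pointer is
	 * expected to report 0 instead of being dereferenced blindly.
	 */
	printf("malloc_usable_size(foreign): %zu\n",
	    malloc_usable_size(foreign));

	dallocx(p, 0);
	return (0);
}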
+--enable-ivsalloc + Enable validation code, which verifies that pointers reside within + jemalloc-owned chunks before dereferencing them. This incurs a minor + performance hit. + --enable-prof Enable heap profiling and leak detection functionality. See the "opt.prof" option documentation for usage details. When enabled, there are several diff --git a/configure.ac b/configure.ac index 4ac7ac8..be49743 100644 --- a/configure.ac +++ b/configure.ac @@ -625,7 +625,8 @@ fi dnl Do not compile with debugging by default. AC_ARG_ENABLE([debug], - [AS_HELP_STRING([--enable-debug], [Build debugging code])], + [AS_HELP_STRING([--enable-debug], + [Build debugging code (implies --enable-ivsalloc)])], [if test "x$enable_debug" = "xno" ; then enable_debug="0" else @@ -637,8 +638,28 @@ fi if test "x$enable_debug" = "x1" ; then AC_DEFINE([JEMALLOC_DEBUG], [ ]) fi +if test "x$enable_debug" = "x1" ; then + AC_DEFINE([JEMALLOC_DEBUG], [ ]) + enable_ivsalloc="1" +fi AC_SUBST([enable_debug]) +dnl Do not validate pointers by default. +AC_ARG_ENABLE([ivsalloc], + [AS_HELP_STRING([--enable-ivsalloc], + [Validate pointers passed through the public API])], +[if test "x$enable_ivsalloc" = "xno" ; then + enable_ivsalloc="0" +else + enable_ivsalloc="1" +fi +], +[enable_ivsalloc="0"] +) +if test "x$enable_ivsalloc" = "x1" ; then + AC_DEFINE([JEMALLOC_IVSALLOC], [ ]) +fi + dnl Only optimize if not debugging. if test "x$enable_debug" = "x0" -a "x$no_CFLAGS" = "xyes" ; then dnl Make sure that an optimization flag was not specified in EXTRA_CFLAGS. diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 8ed69ce..b398f31 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -119,6 +119,13 @@ static const bool config_xmalloc = false #endif ; +static const bool config_ivsalloc = +#ifdef JEMALLOC_IVSALLOC + true +#else + false +#endif + ; #ifdef JEMALLOC_C11ATOMICS #include diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 191abc5..a943d23 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -187,6 +187,12 @@ #undef JEMALLOC_INTERNAL_FFS /* + * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside + * within jemalloc-owned chunks before dereferencing them. + */ +#undef JEMALLOC_IVSALLOC + +/* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ #undef JEMALLOC_ZONE diff --git a/src/jemalloc.c b/src/jemalloc.c index d511009..7e9f486 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2306,7 +2306,7 @@ je_sallocx(const void *ptr, int flags) assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); - if (config_debug) + if (config_ivsalloc) usize = ivsalloc(ptr, config_prof); else usize = isalloc(ptr, config_prof); @@ -2434,7 +2434,7 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) assert(malloc_initialized() || IS_INITIALIZER); malloc_thread_init(); - if (config_debug) + if (config_ivsalloc) ret = ivsalloc(ptr, config_prof); else ret = (ptr == NULL) ? 0 : isalloc(ptr, config_prof); -- cgit v0.12 From 7e336e7359ec50f06ec73f29033c7807148bf476 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 20 Mar 2015 18:08:10 -0700 Subject: Fix lg_dirty_mult-related stats printing. 
This regression was introduced by 8d6a3e8321a7767cb2ca0930b85d5d488a8cc659 (Implement dynamic per arena control over dirty page purging.). This resolves #215. --- src/stats.c | 148 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 82 insertions(+), 66 deletions(-) diff --git a/src/stats.c b/src/stats.c index f246c8b..ae74737 100644 --- a/src/stats.c +++ b/src/stats.c @@ -6,31 +6,31 @@ xmallctl(n, v, &sz, NULL, 0); \ } while (0) -#define CTL_I_GET(n, v, t) do { \ +#define CTL_M1_GET(n, i, v, t) do { \ size_t mib[6]; \ size_t miblen = sizeof(mib) / sizeof(size_t); \ size_t sz = sizeof(t); \ xmallctlnametomib(n, mib, &miblen); \ - mib[2] = i; \ + mib[1] = (i); \ xmallctlbymib(mib, miblen, v, &sz, NULL, 0); \ } while (0) -#define CTL_J_GET(n, v, t) do { \ +#define CTL_M2_GET(n, i, v, t) do { \ size_t mib[6]; \ size_t miblen = sizeof(mib) / sizeof(size_t); \ size_t sz = sizeof(t); \ xmallctlnametomib(n, mib, &miblen); \ - mib[2] = j; \ + mib[2] = (i); \ xmallctlbymib(mib, miblen, v, &sz, NULL, 0); \ } while (0) -#define CTL_IJ_GET(n, v, t) do { \ +#define CTL_M2_M4_GET(n, i, j, v, t) do { \ size_t mib[6]; \ size_t miblen = sizeof(mib) / sizeof(size_t); \ size_t sz = sizeof(t); \ xmallctlnametomib(n, mib, &miblen); \ - mib[2] = i; \ - mib[4] = j; \ + mib[2] = (i); \ + mib[4] = (j); \ xmallctlbymib(mib, miblen, v, &sz, NULL, 0); \ } while (0) @@ -82,7 +82,8 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, for (j = 0, in_gap = false; j < nbins; j++) { uint64_t nruns; - CTL_IJ_GET("stats.arenas.0.bins.0.nruns", &nruns, uint64_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.nruns", i, j, &nruns, + uint64_t); if (nruns == 0) in_gap = true; else { @@ -98,27 +99,28 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, " ---\n"); in_gap = false; } - CTL_J_GET("arenas.bin.0.size", ®_size, size_t); - CTL_J_GET("arenas.bin.0.nregs", &nregs, uint32_t); - CTL_J_GET("arenas.bin.0.run_size", &run_size, size_t); - CTL_IJ_GET("stats.arenas.0.bins.0.nmalloc", + CTL_M2_GET("arenas.bin.0.size", j, ®_size, size_t); + CTL_M2_GET("arenas.bin.0.nregs", j, &nregs, uint32_t); + CTL_M2_GET("arenas.bin.0.run_size", j, &run_size, + size_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.nmalloc", i, j, &nmalloc, uint64_t); - CTL_IJ_GET("stats.arenas.0.bins.0.ndalloc", + CTL_M2_M4_GET("stats.arenas.0.bins.0.ndalloc", i, j, &ndalloc, uint64_t); - CTL_IJ_GET("stats.arenas.0.bins.0.curregs", + CTL_M2_M4_GET("stats.arenas.0.bins.0.curregs", i, j, &curregs, size_t); - CTL_IJ_GET("stats.arenas.0.bins.0.nrequests", + CTL_M2_M4_GET("stats.arenas.0.bins.0.nrequests", i, j, &nrequests, uint64_t); if (config_tcache) { - CTL_IJ_GET("stats.arenas.0.bins.0.nfills", - &nfills, uint64_t); - CTL_IJ_GET("stats.arenas.0.bins.0.nflushes", - &nflushes, uint64_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.nfills", i, + j, &nfills, uint64_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.nflushes", + i, j, &nflushes, uint64_t); } - CTL_IJ_GET("stats.arenas.0.bins.0.nreruns", &reruns, - uint64_t); - CTL_IJ_GET("stats.arenas.0.bins.0.curruns", &curruns, - size_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.nreruns", i, j, + &reruns, uint64_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.curruns", i, j, + &curruns, size_t); availregs = nregs * curruns; milli = (availregs != 0) ? 
(1000 * curregs) / availregs @@ -179,18 +181,18 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, uint64_t nmalloc, ndalloc, nrequests; size_t run_size, curruns; - CTL_IJ_GET("stats.arenas.0.lruns.0.nmalloc", &nmalloc, - uint64_t); - CTL_IJ_GET("stats.arenas.0.lruns.0.ndalloc", &ndalloc, + CTL_M2_M4_GET("stats.arenas.0.lruns.0.nmalloc", i, j, &nmalloc, uint64_t); - CTL_IJ_GET("stats.arenas.0.lruns.0.nrequests", &nrequests, + CTL_M2_M4_GET("stats.arenas.0.lruns.0.ndalloc", i, j, &ndalloc, uint64_t); + CTL_M2_M4_GET("stats.arenas.0.lruns.0.nrequests", i, j, + &nrequests, uint64_t); if (nrequests == 0) in_gap = true; else { - CTL_J_GET("arenas.lrun.0.size", &run_size, size_t); - CTL_IJ_GET("stats.arenas.0.lruns.0.curruns", &curruns, - size_t); + CTL_M2_GET("arenas.lrun.0.size", j, &run_size, size_t); + CTL_M2_M4_GET("stats.arenas.0.lruns.0.curruns", i, j, + &curruns, size_t); if (in_gap) { malloc_cprintf(write_cb, cbopaque, " ---\n"); @@ -226,19 +228,19 @@ stats_arena_hchunks_print(void (*write_cb)(void *, const char *), uint64_t nmalloc, ndalloc, nrequests; size_t hchunk_size, curhchunks; - CTL_IJ_GET("stats.arenas.0.hchunks.0.nmalloc", &nmalloc, - uint64_t); - CTL_IJ_GET("stats.arenas.0.hchunks.0.ndalloc", &ndalloc, - uint64_t); - CTL_IJ_GET("stats.arenas.0.hchunks.0.nrequests", &nrequests, - uint64_t); + CTL_M2_M4_GET("stats.arenas.0.hchunks.0.nmalloc", i, j, + &nmalloc, uint64_t); + CTL_M2_M4_GET("stats.arenas.0.hchunks.0.ndalloc", i, j, + &ndalloc, uint64_t); + CTL_M2_M4_GET("stats.arenas.0.hchunks.0.nrequests", i, j, + &nrequests, uint64_t); if (nrequests == 0) in_gap = true; else { - CTL_J_GET("arenas.hchunk.0.size", &hchunk_size, + CTL_M2_GET("arenas.hchunk.0.size", j, &hchunk_size, size_t); - CTL_IJ_GET("stats.arenas.0.hchunks.0.curhchunks", - &curhchunks, size_t); + CTL_M2_M4_GET("stats.arenas.0.hchunks.0.curhchunks", i, + j, &curhchunks, size_t); if (in_gap) { malloc_cprintf(write_cb, cbopaque, " ---\n"); @@ -277,26 +279,26 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("arenas.page", &page, size_t); - CTL_I_GET("stats.arenas.0.nthreads", &nthreads, unsigned); + CTL_M2_GET("stats.arenas.0.nthreads", i, &nthreads, unsigned); malloc_cprintf(write_cb, cbopaque, "assigned threads: %u\n", nthreads); - CTL_I_GET("stats.arenas.0.dss", &dss, const char *); + CTL_M2_GET("stats.arenas.0.dss", i, &dss, const char *); malloc_cprintf(write_cb, cbopaque, "dss allocation precedence: %s\n", dss); - CTL_I_GET("stats.arenas.0.lg_dirty_mult", &lg_dirty_mult, ssize_t); + CTL_M1_GET("arena.0.lg_dirty_mult", i, &lg_dirty_mult, ssize_t); if (lg_dirty_mult >= 0) { malloc_cprintf(write_cb, cbopaque, - "Min active:dirty page ratio: %u:1\n", + "min active:dirty page ratio: %u:1\n", (1U << lg_dirty_mult)); } else { malloc_cprintf(write_cb, cbopaque, - "Min active:dirty page ratio: N/A\n"); + "min active:dirty page ratio: N/A\n"); } - CTL_I_GET("stats.arenas.0.pactive", &pactive, size_t); - CTL_I_GET("stats.arenas.0.pdirty", &pdirty, size_t); - CTL_I_GET("stats.arenas.0.npurge", &npurge, uint64_t); - CTL_I_GET("stats.arenas.0.nmadvise", &nmadvise, uint64_t); - CTL_I_GET("stats.arenas.0.purged", &purged, uint64_t); + CTL_M2_GET("stats.arenas.0.pactive", i, &pactive, size_t); + CTL_M2_GET("stats.arenas.0.pdirty", i, &pdirty, size_t); + CTL_M2_GET("stats.arenas.0.npurge", i, &npurge, uint64_t); + CTL_M2_GET("stats.arenas.0.nmadvise", i, &nmadvise, uint64_t); + CTL_M2_GET("stats.arenas.0.purged", i, &purged, uint64_t); malloc_cprintf(write_cb, 
cbopaque, "dirty pages: %zu:%zu active:dirty, %"PRIu64" sweep%s," " %"PRIu64" madvise%s, %"PRIu64" purged\n", @@ -306,26 +308,31 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, " allocated nmalloc ndalloc" " nrequests\n"); - CTL_I_GET("stats.arenas.0.small.allocated", &small_allocated, size_t); - CTL_I_GET("stats.arenas.0.small.nmalloc", &small_nmalloc, uint64_t); - CTL_I_GET("stats.arenas.0.small.ndalloc", &small_ndalloc, uint64_t); - CTL_I_GET("stats.arenas.0.small.nrequests", &small_nrequests, uint64_t); + CTL_M2_GET("stats.arenas.0.small.allocated", i, &small_allocated, + size_t); + CTL_M2_GET("stats.arenas.0.small.nmalloc", i, &small_nmalloc, uint64_t); + CTL_M2_GET("stats.arenas.0.small.ndalloc", i, &small_ndalloc, uint64_t); + CTL_M2_GET("stats.arenas.0.small.nrequests", i, &small_nrequests, + uint64_t); malloc_cprintf(write_cb, cbopaque, "small: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 "\n", small_allocated, small_nmalloc, small_ndalloc, small_nrequests); - CTL_I_GET("stats.arenas.0.large.allocated", &large_allocated, size_t); - CTL_I_GET("stats.arenas.0.large.nmalloc", &large_nmalloc, uint64_t); - CTL_I_GET("stats.arenas.0.large.ndalloc", &large_ndalloc, uint64_t); - CTL_I_GET("stats.arenas.0.large.nrequests", &large_nrequests, uint64_t); + CTL_M2_GET("stats.arenas.0.large.allocated", i, &large_allocated, + size_t); + CTL_M2_GET("stats.arenas.0.large.nmalloc", i, &large_nmalloc, uint64_t); + CTL_M2_GET("stats.arenas.0.large.ndalloc", i, &large_ndalloc, uint64_t); + CTL_M2_GET("stats.arenas.0.large.nrequests", i, &large_nrequests, + uint64_t); malloc_cprintf(write_cb, cbopaque, "large: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 "\n", large_allocated, large_nmalloc, large_ndalloc, large_nrequests); - CTL_I_GET("stats.arenas.0.huge.allocated", &huge_allocated, size_t); - CTL_I_GET("stats.arenas.0.huge.nmalloc", &huge_nmalloc, uint64_t); - CTL_I_GET("stats.arenas.0.huge.ndalloc", &huge_ndalloc, uint64_t); - CTL_I_GET("stats.arenas.0.huge.nrequests", &huge_nrequests, uint64_t); + CTL_M2_GET("stats.arenas.0.huge.allocated", i, &huge_allocated, size_t); + CTL_M2_GET("stats.arenas.0.huge.nmalloc", i, &huge_nmalloc, uint64_t); + CTL_M2_GET("stats.arenas.0.huge.ndalloc", i, &huge_ndalloc, uint64_t); + CTL_M2_GET("stats.arenas.0.huge.nrequests", i, &huge_nrequests, + uint64_t); malloc_cprintf(write_cb, cbopaque, "huge: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 "\n", @@ -339,11 +346,12 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, small_nrequests + large_nrequests + huge_nrequests); malloc_cprintf(write_cb, cbopaque, "active: %12zu\n", pactive * page); - CTL_I_GET("stats.arenas.0.mapped", &mapped, size_t); + CTL_M2_GET("stats.arenas.0.mapped", i, &mapped, size_t); malloc_cprintf(write_cb, cbopaque, "mapped: %12zu\n", mapped); - CTL_I_GET("stats.arenas.0.metadata.mapped", &metadata_mapped, size_t); - CTL_I_GET("stats.arenas.0.metadata.allocated", &metadata_allocated, + CTL_M2_GET("stats.arenas.0.metadata.mapped", i, &metadata_mapped, + size_t); + CTL_M2_GET("stats.arenas.0.metadata.allocated", i, &metadata_allocated, size_t); malloc_cprintf(write_cb, cbopaque, "metadata: mapped: %zu, allocated: %zu\n", metadata_mapped, @@ -464,6 +472,14 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %zd\n", ssv); \ } +#define OPT_WRITE_SSIZE_T_MUTABLE(n, m) \ + ssize_t ssv2; \ + if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0 && \ + 
je_mallctl(#m, &ssv2, &sssz, NULL, 0) == 0) { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": %zd ("#m": %zd)\n", ssv, \ + ssv2); \ + } #define OPT_WRITE_CHAR_P(n) \ if (je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ @@ -476,7 +492,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, OPT_WRITE_SIZE_T(lg_chunk) OPT_WRITE_CHAR_P(dss) OPT_WRITE_SIZE_T(narenas) - OPT_WRITE_SSIZE_T(lg_dirty_mult) + OPT_WRITE_SSIZE_T_MUTABLE(lg_dirty_mult, arenas.lg_dirty_mult) OPT_WRITE_BOOL(stats_print) OPT_WRITE_CHAR_P(junk) OPT_WRITE_SIZE_T(quarantine) @@ -519,7 +535,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("arenas.page", &sv, size_t); malloc_cprintf(write_cb, cbopaque, "Page size: %zu\n", sv); - CTL_GET("opt.lg_dirty_mult", &ssv, ssize_t); + CTL_GET("arenas.lg_dirty_mult", &ssv, ssize_t); if (ssv >= 0) { malloc_cprintf(write_cb, cbopaque, "Min active:dirty page ratio per arena: %u:1\n", -- cgit v0.12 From fd5901ce3083cd3277b87aa414884d7628e2d509 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Sat, 21 Mar 2015 10:18:39 -0700 Subject: Fix a compile error caused by mixed declarations and code. --- src/stats.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/stats.c b/src/stats.c index ae74737..b41b458 100644 --- a/src/stats.c +++ b/src/stats.c @@ -472,14 +472,15 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %zd\n", ssv); \ } -#define OPT_WRITE_SSIZE_T_MUTABLE(n, m) \ +#define OPT_WRITE_SSIZE_T_MUTABLE(n, m) { \ ssize_t ssv2; \ if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0 && \ je_mallctl(#m, &ssv2, &sssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %zd ("#m": %zd)\n", ssv, \ ssv2); \ - } + } \ +} #define OPT_WRITE_CHAR_P(n) \ if (je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ -- cgit v0.12 From 8ad6bf360f9ca5c6c9a1d8e5825ee473bb4697da Mon Sep 17 00:00:00 2001 From: Igor Podlesny Date: Sun, 22 Mar 2015 01:30:02 +0700 Subject: Fix indentation inconsistencies. --- include/jemalloc/internal/util.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index 5ad4933..001cd09 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -22,17 +22,17 @@ * uninitialized. */ #ifdef JEMALLOC_CC_SILENCE -# define JEMALLOC_CC_SILENCE_INIT(v) = v +# define JEMALLOC_CC_SILENCE_INIT(v) = v #else -# define JEMALLOC_CC_SILENCE_INIT(v) +# define JEMALLOC_CC_SILENCE_INIT(v) #endif #ifdef __GNUC__ -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) +# define likely(x) __builtin_expect(!!(x), 1) +# define unlikely(x) __builtin_expect(!!(x), 0) #else -#define likely(x) !!(x) -#define unlikely(x) !!(x) +# define likely(x) !!(x) +# define unlikely(x) !!(x) #endif /* -- cgit v0.12 From 4acd75a694173186e9e0399d2855f05ce8553008 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 23 Mar 2015 17:25:57 -0700 Subject: Add the "stats.allocated" mallctl. 
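Not part of the patch, but for orientation: the process-wide totals this series exposes ("stats.allocated", plus "stats.resident" and "stats.mapped" as documented in the diff below) are read with the usual pattern of refreshing the statistics epoch and then querying each field:

#include <stdint.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	uint64_t epoch = 1;
	size_t sz, allocated, resident, mapped;

	/* Statistics are cached; bump the epoch to refresh the snapshot. */
	sz = sizeof(epoch);
	mallctl("epoch", &epoch, &sz, &epoch, sz);

	sz = sizeof(size_t);
	mallctl("stats.allocated", &allocated, &sz, NULL, 0);
	mallctl("stats.resident", &resident, &sz, NULL, 0);
	mallctl("stats.mapped", &mapped, &sz, NULL, 0);

	printf("allocated: %zu, resident: %zu, mapped: %zu\n", allocated,
	    resident, mapped);
	return (0);
}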
--- ChangeLog | 2 ++ doc/jemalloc.xml.in | 23 ++++++++++++++++++--- include/jemalloc/internal/base.h | 2 +- include/jemalloc/internal/ctl.h | 1 + include/jemalloc/internal/private_symbols.txt | 2 +- src/base.c | 29 +++++++++++++++++++-------- src/ctl.c | 23 ++++++++++++++------- src/stats.c | 8 +++++--- 8 files changed, 67 insertions(+), 23 deletions(-) diff --git a/ChangeLog b/ChangeLog index a462d02..2607576 100644 --- a/ChangeLog +++ b/ChangeLog @@ -63,6 +63,8 @@ found in the git revision history: - Add metadata statistics, which are accessible via the "stats.metadata", "stats.arenas..metadata.mapped", and "stats.arenas..metadata.allocated" mallctls. + - Add the "stats.resident" mallctl, which reports the upper limit of + physically resident memory mapped by the allocator. - Add the "prof.gdump" mallctl, which makes it possible to toggle the gdump feature on/off during program execution. - Add sdallocx(), which implements sized deallocation. The primary diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 01ac38c..adff6a4 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1938,6 +1938,23 @@ malloc_conf = "xmalloc:true";]]> linkend="stats.arenas.i.metadata.allocated">stats.arenas.<i>.metadata.allocated). + + + stats.resident + (size_t) + r- + [] + + Maximum number of bytes in physically resident data + pages mapped by the allocator, comprising all pages dedicated to + allocator metadata, pages backing active allocations, and unused dirty + pages. This is a maximum rather than precise because pages may not + actually be physically resident if they correspond to demand-zeroed + virtual memory that has not yet been touched. This is a multiple of the + page size, and is larger than stats.active. + + stats.mapped @@ -1945,10 +1962,10 @@ malloc_conf = "xmalloc:true";]]> r- [] - Total number of bytes in chunks mapped on behalf of the - application. This is a multiple of the chunk size, and is at least as + Total number of bytes in active chunks mapped by the + allocator. This is a multiple of the chunk size, and is at least as large as stats.active. This + linkend="stats.resident">stats.resident. This does not include inactive chunks. diff --git a/include/jemalloc/internal/base.h b/include/jemalloc/internal/base.h index bec76b3..39e46ee 100644 --- a/include/jemalloc/internal/base.h +++ b/include/jemalloc/internal/base.h @@ -10,7 +10,7 @@ #ifdef JEMALLOC_H_EXTERNS void *base_alloc(size_t size); -size_t base_allocated_get(void); +void base_stats_get(size_t *allocated, size_t *resident, size_t *mapped); bool base_boot(void); void base_prefork(void); void base_postfork_parent(void); diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index ab9c986..7c2a4be 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -53,6 +53,7 @@ struct ctl_stats_s { size_t allocated; size_t active; size_t metadata; + size_t resident; size_t mapped; unsigned narenas; ctl_arena_stats_t *arenas; /* (narenas + 1) elements. 
*/ diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index bc0f2a6..aaf6978 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -111,11 +111,11 @@ atomic_sub_uint32 atomic_sub_uint64 atomic_sub_z base_alloc -base_allocated_get base_boot base_postfork_child base_postfork_parent base_prefork +base_stats_get bitmap_full bitmap_get bitmap_info_init diff --git a/src/base.c b/src/base.c index 01c62df..1a9b829 100644 --- a/src/base.c +++ b/src/base.c @@ -8,6 +8,8 @@ static malloc_mutex_t base_mtx; static extent_tree_t base_avail_szad; static extent_node_t *base_nodes; static size_t base_allocated; +static size_t base_resident; +static size_t base_mapped; /******************************************************************************/ @@ -54,11 +56,15 @@ base_chunk_alloc(size_t minsize) base_node_dalloc(node); return (NULL); } + base_mapped += csize; if (node == NULL) { + node = (extent_node_t *)addr; + addr = (void *)((uintptr_t)addr + nsize); csize -= nsize; - node = (extent_node_t *)((uintptr_t)addr + csize); - if (config_stats) + if (config_stats) { base_allocated += nsize; + base_resident += PAGE_CEILING(nsize); + } } extent_node_init(node, NULL, addr, csize, true); return (node); @@ -106,23 +112,30 @@ base_alloc(size_t size) extent_tree_szad_insert(&base_avail_szad, node); } else base_node_dalloc(node); - if (config_stats) + if (config_stats) { base_allocated += csize; + /* + * Add one PAGE to base_resident for every page boundary that is + * crossed by the new allocation. + */ + base_resident += PAGE_CEILING((uintptr_t)ret + csize) - + PAGE_CEILING((uintptr_t)ret); + } JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, csize); label_return: malloc_mutex_unlock(&base_mtx); return (ret); } -size_t -base_allocated_get(void) +void +base_stats_get(size_t *allocated, size_t *resident, size_t *mapped) { - size_t ret; malloc_mutex_lock(&base_mtx); - ret = base_allocated; + *allocated = base_allocated; + *resident = base_resident; + *mapped = base_mapped; malloc_mutex_unlock(&base_mtx); - return (ret); } bool diff --git a/src/ctl.c b/src/ctl.c index 447b877..0ed8ddd 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -194,6 +194,7 @@ CTL_PROTO(stats_cactive) CTL_PROTO(stats_allocated) CTL_PROTO(stats_active) CTL_PROTO(stats_metadata) +CTL_PROTO(stats_resident) CTL_PROTO(stats_mapped) /******************************************************************************/ @@ -469,6 +470,7 @@ static const ctl_named_node_t stats_node[] = { {NAME("allocated"), CTL(stats_allocated)}, {NAME("active"), CTL(stats_active)}, {NAME("metadata"), CTL(stats_metadata)}, + {NAME("resident"), CTL(stats_resident)}, {NAME("mapped"), CTL(stats_mapped)}, {NAME("arenas"), CHILD(indexed, stats_arenas)} }; @@ -711,17 +713,23 @@ ctl_refresh(void) } if (config_stats) { + size_t base_allocated, base_resident, base_mapped; + base_stats_get(&base_allocated, &base_resident, &base_mapped); ctl_stats.allocated = - ctl_stats.arenas[ctl_stats.narenas].allocated_small - + ctl_stats.arenas[ctl_stats.narenas].astats.allocated_large - + ctl_stats.arenas[ctl_stats.narenas].astats.allocated_huge; + ctl_stats.arenas[ctl_stats.narenas].allocated_small + + ctl_stats.arenas[ctl_stats.narenas].astats.allocated_large + + ctl_stats.arenas[ctl_stats.narenas].astats.allocated_huge; ctl_stats.active = (ctl_stats.arenas[ctl_stats.narenas].pactive << LG_PAGE); - ctl_stats.metadata = base_allocated_get() - + 
ctl_stats.arenas[ctl_stats.narenas].astats.metadata_mapped - + ctl_stats.arenas[ctl_stats.narenas].astats + ctl_stats.metadata = base_allocated + + ctl_stats.arenas[ctl_stats.narenas].astats.metadata_mapped + + ctl_stats.arenas[ctl_stats.narenas].astats .metadata_allocated; - ctl_stats.mapped = + ctl_stats.resident = base_resident + + ctl_stats.arenas[ctl_stats.narenas].astats.metadata_mapped + + ((ctl_stats.arenas[ctl_stats.narenas].pactive + + ctl_stats.arenas[ctl_stats.narenas].pdirty) << LG_PAGE); + ctl_stats.mapped = base_mapped + ctl_stats.arenas[ctl_stats.narenas].astats.mapped; } @@ -1976,6 +1984,7 @@ CTL_RO_CGEN(config_stats, stats_cactive, &stats_cactive, size_t *) CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats.allocated, size_t) CTL_RO_CGEN(config_stats, stats_active, ctl_stats.active, size_t) CTL_RO_CGEN(config_stats, stats_metadata, ctl_stats.metadata, size_t) +CTL_RO_CGEN(config_stats, stats_resident, ctl_stats.resident, size_t) CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t) CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *) diff --git a/src/stats.c b/src/stats.c index b41b458..c5cea5e 100644 --- a/src/stats.c +++ b/src/stats.c @@ -573,16 +573,18 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, if (config_stats) { size_t *cactive; - size_t allocated, active, metadata, mapped; + size_t allocated, active, metadata, resident, mapped; CTL_GET("stats.cactive", &cactive, size_t *); CTL_GET("stats.allocated", &allocated, size_t); CTL_GET("stats.active", &active, size_t); CTL_GET("stats.metadata", &metadata, size_t); + CTL_GET("stats.resident", &resident, size_t); CTL_GET("stats.mapped", &mapped, size_t); malloc_cprintf(write_cb, cbopaque, - "Allocated: %zu, active: %zu, metadata: %zu, mapped: %zu\n", - allocated, active, metadata, mapped); + "Allocated: %zu, active: %zu, metadata: %zu, resident: %zu," + " mapped: %zu\n", allocated, active, metadata, resident, + mapped); malloc_cprintf(write_cb, cbopaque, "Current active ceiling: %zu\n", atomic_read_z(cactive)); -- cgit v0.12 From ef0a0cc3283ea561a40b33f4325d54bbc351de21 Mon Sep 17 00:00:00 2001 From: Igor Podlesny Date: Sun, 22 Mar 2015 23:49:58 +0700 Subject: We have pages_unmap(ret, size) so we use it. --- src/chunk_mmap.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index 7e02c10..30ac10b 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -40,15 +40,7 @@ pages_map(void *addr, size_t size) /* * We succeeded in mapping memory, but not in the right place. */ - if (munmap(ret, size) == -1) { - char buf[BUFERROR_BUF]; - - buferror(get_errno(), buf, sizeof(buf)); - malloc_printf(" Date: Tue, 24 Mar 2015 12:33:12 -0700 Subject: Fix arena_get() usage. Fix arena_get() calls that specify refresh_if_missing=false. In ctl_refresh() and ctl.c's arena_purge(), these calls attempted to only refresh once, but did so in an unreliable way. arena_i_lg_dirty_mult_ctl() was simply wrong to pass refresh_if_missing=false. 
--- src/ctl.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/ctl.c b/src/ctl.c index 0ed8ddd..4493546 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -684,6 +684,7 @@ ctl_refresh(void) { tsd_t *tsd; unsigned i; + bool refreshed; VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas); /* @@ -694,8 +695,13 @@ ctl_refresh(void) ctl_arena_clear(&ctl_stats.arenas[ctl_stats.narenas]); tsd = tsd_fetch(); - for (i = 0; i < ctl_stats.narenas; i++) - tarenas[i] = arena_get(tsd, i, false, (i == 0)); + for (i = 0, refreshed = false; i < ctl_stats.narenas; i++) { + tarenas[i] = arena_get(tsd, i, false, false); + if (tarenas[i] == NULL && !refreshed) { + tarenas[i] = arena_get(tsd, i, false, true); + refreshed = true; + } + } for (i = 0; i < ctl_stats.narenas; i++) { if (tarenas[i] != NULL) @@ -1538,11 +1544,17 @@ arena_purge(unsigned arena_ind) { tsd_t *tsd; unsigned i; + bool refreshed; VARIABLE_ARRAY(arena_t *, tarenas, ctl_stats.narenas); tsd = tsd_fetch(); - for (i = 0; i < ctl_stats.narenas; i++) - tarenas[i] = arena_get(tsd, i, false, (i == 0)); + for (i = 0, refreshed = false; i < ctl_stats.narenas; i++) { + tarenas[i] = arena_get(tsd, i, false, false); + if (tarenas[i] == NULL && !refreshed) { + tarenas[i] = arena_get(tsd, i, false, true); + refreshed = true; + } + } if (arena_ind == ctl_stats.narenas) { unsigned i; @@ -1638,7 +1650,7 @@ arena_i_lg_dirty_mult_ctl(const size_t *mib, size_t miblen, void *oldp, unsigned arena_ind = mib[1]; arena_t *arena; - arena = arena_get(tsd_fetch(), arena_ind, false, (arena_ind == 0)); + arena = arena_get(tsd_fetch(), arena_ind, false, true); if (arena == NULL) { ret = EFAULT; goto label_return; -- cgit v0.12 From bd16ea49c3e36706a52ef9c8f560813c167fa085 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 24 Mar 2015 15:59:28 -0700 Subject: Fix signed/unsigned comparison in arena_lg_dirty_mult_valid(). --- src/arena.c | 3 ++- test/unit/mallctl.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/arena.c b/src/arena.c index 7272682..d38ffc6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1037,7 +1037,8 @@ static bool arena_lg_dirty_mult_valid(ssize_t lg_dirty_mult) { - return (lg_dirty_mult >= -1 && lg_dirty_mult < (sizeof(size_t) << 3)); + return (lg_dirty_mult >= -1 && lg_dirty_mult < (ssize_t)(sizeof(size_t) + << 3)); } ssize_t diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 31ada19..29823a6 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -367,8 +367,8 @@ TEST_BEGIN(test_arena_i_lg_dirty_mult) "Unexpected mallctl() success"); for (prev_lg_dirty_mult = orig_lg_dirty_mult, lg_dirty_mult = -1; - lg_dirty_mult < (sizeof(ssize_t) << 3); prev_lg_dirty_mult = - lg_dirty_mult, lg_dirty_mult++) { + lg_dirty_mult < (ssize_t)(sizeof(size_t) << 3); prev_lg_dirty_mult + = lg_dirty_mult, lg_dirty_mult++) { ssize_t old_lg_dirty_mult; assert_d_eq(mallctl("arena.0.lg_dirty_mult", &old_lg_dirty_mult, @@ -478,7 +478,7 @@ TEST_BEGIN(test_arenas_lg_dirty_mult) "Unexpected mallctl() success"); for (prev_lg_dirty_mult = orig_lg_dirty_mult, lg_dirty_mult = -1; - lg_dirty_mult < (sizeof(ssize_t) << 3); prev_lg_dirty_mult = + lg_dirty_mult < (ssize_t)(sizeof(size_t) << 3); prev_lg_dirty_mult = lg_dirty_mult, lg_dirty_mult++) { ssize_t old_lg_dirty_mult; -- cgit v0.12 From 562d266511053a51406e91c78eba640cb46ad9c8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 24 Mar 2015 16:36:12 -0700 Subject: Add the "stats.arenas..lg_dirty_mult" mallctl. 
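A hedged sketch of reading the new per-arena statistic from application code, using the same name-to-mib pattern that the CTL_M2_GET macro wraps internally; arena index 0 is arbitrary, and as with the totals above a write to the "epoch" mallctl should precede the read so the snapshot is current:

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	size_t mib[4];
	size_t miblen = sizeof(mib) / sizeof(size_t);
	ssize_t lg_dirty_mult;
	size_t sz = sizeof(lg_dirty_mult);

	/* Resolve the name once, then patch the arena index into the mib. */
	if (mallctlnametomib("stats.arenas.0.lg_dirty_mult", mib, &miblen)
	    != 0)
		return (1);
	mib[2] = 0;	/* Arena index. */
	if (mallctlbymib(mib, miblen, &lg_dirty_mult, &sz, NULL, 0) != 0)
		return (1);

	printf("stats.arenas.0.lg_dirty_mult: %zd\n", lg_dirty_mult);
	return (0);
}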
--- ChangeLog | 6 +++++- doc/jemalloc.xml.in | 12 ++++++++++++ include/jemalloc/internal/arena.h | 5 +++-- include/jemalloc/internal/ctl.h | 1 + src/arena.c | 8 +++++--- src/ctl.c | 11 ++++++++--- src/stats.c | 11 +---------- 7 files changed, 35 insertions(+), 19 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2607576..8cc214a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -38,7 +38,8 @@ found in the git revision history: "opt.prof_thread_active_init", "prof.thread_active_init", and "thread.prof.active" mallctls. - Add support for per arena application-specified chunk allocators, configured - via the "arena.chunk.alloc" and "arena.chunk.dalloc" mallctls. + via the "arena.chunk.alloc", "arena.chunk.dalloc", and + "arena..chunk.purge" mallctls. - Refactor huge allocation to be managed by arenas, so that arenas now function as general purpose independent allocators. This is important in the context of user-specified chunk allocators, aside from the scalability @@ -65,6 +66,9 @@ found in the git revision history: "stats.arenas..metadata.allocated" mallctls. - Add the "stats.resident" mallctl, which reports the upper limit of physically resident memory mapped by the allocator. + - Add per arena control over unused dirty page purging, via the + "arenas.lg_dirty_mult", "arena..lg_dirty_mult", and + "stats.arenas..lg_dirty_mult" mallctls. - Add the "prof.gdump" mallctl, which makes it possible to toggle the gdump feature on/off during program execution. - Add sdallocx(), which implements sized deallocation. The primary diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index adff6a4..d3f3616 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1983,6 +1983,18 @@ malloc_conf = "xmalloc:true";]]> + + + stats.arenas.<i>.lg_dirty_mult + (ssize_t) + r- + + Minimum ratio (log base 2) of active to dirty pages. + See opt.lg_dirty_mult + for details. 
+ + stats.arenas.<i>.nthreads diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 56ee74a..dff99fb 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -470,8 +470,9 @@ dss_prec_t arena_dss_prec_get(arena_t *arena); bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); ssize_t arena_lg_dirty_mult_default_get(void); bool arena_lg_dirty_mult_default_set(ssize_t lg_dirty_mult); -void arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, - size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, +void arena_stats_merge(arena_t *arena, const char **dss, + ssize_t *lg_dirty_mult, size_t *nactive, size_t *ndirty, + arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats); arena_t *arena_new(unsigned ind); void arena_boot(void); diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index 7c2a4be..751c14b 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -34,6 +34,7 @@ struct ctl_arena_stats_s { bool initialized; unsigned nthreads; const char *dss; + ssize_t lg_dirty_mult; size_t pactive; size_t pdirty; arena_stats_t astats; diff --git a/src/arena.c b/src/arena.c index d38ffc6..bc13d20 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2657,14 +2657,16 @@ arena_lg_dirty_mult_default_set(ssize_t lg_dirty_mult) } void -arena_stats_merge(arena_t *arena, const char **dss, size_t *nactive, - size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, - malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats) +arena_stats_merge(arena_t *arena, const char **dss, ssize_t *lg_dirty_mult, + size_t *nactive, size_t *ndirty, arena_stats_t *astats, + malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats, + malloc_huge_stats_t *hstats) { unsigned i; malloc_mutex_lock(&arena->lock); *dss = dss_prec_names[arena->dss_prec]; + *lg_dirty_mult = arena->lg_dirty_mult; *nactive += arena->nactive; *ndirty += arena->ndirty; diff --git a/src/ctl.c b/src/ctl.c index 4493546..d215b19 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -181,6 +181,7 @@ CTL_PROTO(stats_arenas_i_hchunks_j_curhchunks) INDEX_PROTO(stats_arenas_i_hchunks_j) CTL_PROTO(stats_arenas_i_nthreads) CTL_PROTO(stats_arenas_i_dss) +CTL_PROTO(stats_arenas_i_lg_dirty_mult) CTL_PROTO(stats_arenas_i_pactive) CTL_PROTO(stats_arenas_i_pdirty) CTL_PROTO(stats_arenas_i_mapped) @@ -443,6 +444,7 @@ static const ctl_indexed_node_t stats_arenas_i_hchunks_node[] = { static const ctl_named_node_t stats_arenas_i_node[] = { {NAME("nthreads"), CTL(stats_arenas_i_nthreads)}, {NAME("dss"), CTL(stats_arenas_i_dss)}, + {NAME("lg_dirty_mult"), CTL(stats_arenas_i_lg_dirty_mult)}, {NAME("pactive"), CTL(stats_arenas_i_pactive)}, {NAME("pdirty"), CTL(stats_arenas_i_pdirty)}, {NAME("mapped"), CTL(stats_arenas_i_mapped)}, @@ -524,6 +526,7 @@ ctl_arena_clear(ctl_arena_stats_t *astats) { astats->dss = dss_prec_names[dss_prec_limit]; + astats->lg_dirty_mult = -1; astats->pactive = 0; astats->pdirty = 0; if (config_stats) { @@ -545,9 +548,9 @@ ctl_arena_stats_amerge(ctl_arena_stats_t *cstats, arena_t *arena) { unsigned i; - arena_stats_merge(arena, &cstats->dss, &cstats->pactive, - &cstats->pdirty, &cstats->astats, cstats->bstats, cstats->lstats, - cstats->hstats); + arena_stats_merge(arena, &cstats->dss, &cstats->lg_dirty_mult, + &cstats->pactive, &cstats->pdirty, &cstats->astats, cstats->bstats, + cstats->lstats, cstats->hstats); for (i = 0; i < NBINS; i++) { 
cstats->allocated_small += cstats->bstats[i].curregs * @@ -2000,6 +2003,8 @@ CTL_RO_CGEN(config_stats, stats_resident, ctl_stats.resident, size_t) CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t) CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *) +CTL_RO_GEN(stats_arenas_i_lg_dirty_mult, ctl_stats.arenas[mib[2]].lg_dirty_mult, + ssize_t) CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned) CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t) CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t) diff --git a/src/stats.c b/src/stats.c index c5cea5e..6e1752e 100644 --- a/src/stats.c +++ b/src/stats.c @@ -6,15 +6,6 @@ xmallctl(n, v, &sz, NULL, 0); \ } while (0) -#define CTL_M1_GET(n, i, v, t) do { \ - size_t mib[6]; \ - size_t miblen = sizeof(mib) / sizeof(size_t); \ - size_t sz = sizeof(t); \ - xmallctlnametomib(n, mib, &miblen); \ - mib[1] = (i); \ - xmallctlbymib(mib, miblen, v, &sz, NULL, 0); \ -} while (0) - #define CTL_M2_GET(n, i, v, t) do { \ size_t mib[6]; \ size_t miblen = sizeof(mib) / sizeof(size_t); \ @@ -285,7 +276,7 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.dss", i, &dss, const char *); malloc_cprintf(write_cb, cbopaque, "dss allocation precedence: %s\n", dss); - CTL_M1_GET("arena.0.lg_dirty_mult", i, &lg_dirty_mult, ssize_t); + CTL_M2_GET("stats.arenas.0.lg_dirty_mult", i, &lg_dirty_mult, ssize_t); if (lg_dirty_mult >= 0) { malloc_cprintf(write_cb, cbopaque, "min active:dirty page ratio: %u:1\n", -- cgit v0.12 From 65db63cf3f0c5dd5126a1b3786756486eaf931ba Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 25 Mar 2015 18:56:55 -0700 Subject: Fix in-place shrinking huge reallocation purging bugs. Fix the shrinking case of huge_ralloc_no_move_similar() to purge the correct number of pages, at the correct offset. This regression was introduced by 8d6a3e8321a7767cb2ca0930b85d5d488a8cc659 (Implement dynamic per arena control over dirty page purging.). Fix huge_ralloc_no_move_shrink() to purge the correct number of pages. This bug was introduced by 9673983443a0782d975fbcb5d8457cfd411b8b56 (Purge/zero sub-chunk huge allocations as necessary.). --- src/arena.c | 7 +------ src/huge.c | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/src/arena.c b/src/arena.c index bc13d20..3041068 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1245,16 +1245,11 @@ arena_purge_stashed(arena_t *arena, if (rdelm == &chunkselm->rd) { size_t size = extent_node_size_get(chunkselm); - void *addr, *chunk; - size_t offset; bool unzeroed; npages = size >> LG_PAGE; - addr = extent_node_addr_get(chunkselm); - chunk = CHUNK_ADDR2BASE(addr); - offset = CHUNK_ADDR2OFFSET(addr); unzeroed = chunk_purge_wrapper(arena, chunk_purge, - chunk, offset, size); + extent_node_addr_get(chunkselm), 0, size); extent_node_zeroed_set(chunkselm, !unzeroed); chunkselm = qr_next(chunkselm, cc_link); } else { diff --git a/src/huge.c b/src/huge.c index aa26f5d..32af205 100644 --- a/src/huge.c +++ b/src/huge.c @@ -145,12 +145,11 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, /* Fill if necessary (shrinking). */ if (oldsize > usize) { - size_t sdiff = CHUNK_CEILING(usize) - usize; - zeroed = (sdiff != 0) ? 
!chunk_purge_wrapper(arena, chunk_purge, - CHUNK_ADDR2BASE(ptr), CHUNK_ADDR2OFFSET(ptr), usize) : true; + size_t sdiff = oldsize - usize; + zeroed = !chunk_purge_wrapper(arena, chunk_purge, ptr, usize, + sdiff); if (config_fill && unlikely(opt_junk_free)) { - memset((void *)((uintptr_t)ptr + usize), 0x5a, oldsize - - usize); + memset((void *)((uintptr_t)ptr + usize), 0x5a, sdiff); zeroed = false; } } else @@ -186,7 +185,6 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) extent_node_t *node; arena_t *arena; chunk_purge_t *chunk_purge; - size_t sdiff; bool zeroed; node = huge_node_get(ptr); @@ -196,15 +194,18 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) chunk_purge = arena->chunk_purge; malloc_mutex_unlock(&arena->lock); - sdiff = CHUNK_CEILING(usize) - usize; - zeroed = (sdiff != 0) ? !chunk_purge_wrapper(arena, chunk_purge, - CHUNK_ADDR2BASE((uintptr_t)ptr + usize), - CHUNK_ADDR2OFFSET((uintptr_t)ptr + usize), sdiff) : true; - if (config_fill && unlikely(opt_junk_free)) { - huge_dalloc_junk((void *)((uintptr_t)ptr + usize), oldsize - - usize); - zeroed = false; - } + if (oldsize > usize) { + size_t sdiff = oldsize - usize; + zeroed = !chunk_purge_wrapper(arena, chunk_purge, + CHUNK_ADDR2BASE((uintptr_t)ptr + usize), + CHUNK_ADDR2OFFSET((uintptr_t)ptr + usize), sdiff); + if (config_fill && unlikely(opt_junk_free)) { + huge_dalloc_junk((void *)((uintptr_t)ptr + usize), + sdiff); + zeroed = false; + } + } else + zeroed = true; malloc_mutex_lock(&arena->huge_mtx); /* Update the size of the huge allocation. */ -- cgit v0.12 From b80fbcbbdb7ea6ba5918db7c665c836baa8c0b2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Marie?= Date: Tue, 7 Apr 2015 12:21:19 +0200 Subject: OpenBSD don't support TLS under some compiler (gcc 4.8.4 in particular), the auto-detection of TLS don't work properly. force tls to be disabled. the testsuite pass under gcc (4.8.4) and gcc (4.2.1) --- configure.ac | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index be49743..bf2ac3a 100644 --- a/configure.ac +++ b/configure.ac @@ -283,7 +283,13 @@ case "${host}" in abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) ;; - *-*-openbsd*|*-*-bitrig*) + *-*-openbsd*) + CFLAGS="$CFLAGS" + abi="elf" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) + force_tls="0" + ;; + *-*-bitrig*) CFLAGS="$CFLAGS" abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) -- cgit v0.12 From 897503521ddb703a1388899f79112e048c328278 Mon Sep 17 00:00:00 2001 From: Qinfan Wu Date: Tue, 21 Apr 2015 16:57:42 -0700 Subject: Fix mallctl doc: arenas.hchunk..size --- ChangeLog | 2 +- doc/jemalloc.xml.in | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8cc214a..6f79cac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -47,7 +47,7 @@ found in the git revision history: + The "stats.arenas..huge.allocated", "stats.arenas..huge.nmalloc", "stats.arenas..huge.ndalloc", and "stats.arenas..huge.nrequests" mallctls provide high level per arena huge allocation statistics. - + The "arenas.nhchunks", "arenas.hchunks..size", + + The "arenas.nhchunks", "arenas.hchunk..size", "stats.arenas..hchunks..nmalloc", "stats.arenas..hchunks..ndalloc", "stats.arenas..hchunks..nrequests", and diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index d3f3616..c9ee997 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1756,9 +1756,9 @@ malloc_conf = "xmalloc:true";]]> Total number of huge size classes. 
- + - arenas.hchunks.<i>.size + arenas.hchunk.<i>.size (size_t) r- -- cgit v0.12 From 95e88de0aab257020dfc33248b86331cbfac28b1 Mon Sep 17 00:00:00 2001 From: Igor Podlesny Date: Tue, 24 Mar 2015 12:49:26 +0700 Subject: Concise JEMALLOC_HAVE_ISSETUGID case in secure_getenv(). --- src/jemalloc.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 7e9f486..a2d1c5c 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -709,24 +709,16 @@ stats_print_atexit(void) */ #ifndef JEMALLOC_HAVE_SECURE_GETENV -# ifdef JEMALLOC_HAVE_ISSETUGID static char * secure_getenv(const char *name) { - if (issetugid() == 0) - return (getenv(name)); - else +# ifdef JEMALLOC_HAVE_ISSETUGID + if (issetugid() != 0) return (NULL); -} -# else -static char * -secure_getenv(const char *name) -{ - +# endif return (getenv(name)); } -# endif #endif static unsigned -- cgit v0.12 From f1f2b4542902c5bc14788f6c2d4190b422e5901f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 1 May 2015 08:57:41 -0700 Subject: Embed full library install when running ld on OS X. This resolves #228. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index bf2ac3a..5f9bbd3 100644 --- a/configure.ac +++ b/configure.ac @@ -268,7 +268,7 @@ case "${host}" in so="dylib" importlib="${so}" force_tls="0" - DSO_LDFLAGS='-shared -Wl,-dylib_install_name,$(@F)' + DSO_LDFLAGS='-shared -Wl,-install_name,$(LIBDIR)/$(@F)' SOREV="${rev}.${so}" sbrk_deprecated="1" ;; -- cgit v0.12 From 8e33c21d2d03ee7f540e32c3d75b10c128eaea57 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 1 May 2015 09:03:20 -0700 Subject: Prefer /proc//task//maps over /proc//maps on Linux. This resolves #227. --- src/prof.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/prof.c b/src/prof.c index f2a3725..8453ea8 100644 --- a/src/prof.c +++ b/src/prof.c @@ -1338,21 +1338,40 @@ label_return: return (ret); } +JEMALLOC_ATTR(format(printf, 1, 2)) +static int +prof_open_maps(const char *format, ...) +{ + int mfd; + va_list ap; + char filename[PATH_MAX + 1]; + + va_start(ap, format); + malloc_vsnprintf(filename, sizeof(filename), format, ap); + va_end(ap); + mfd = open(filename, O_RDONLY); + + return (mfd); +} + static bool prof_dump_maps(bool propagate_err) { bool ret; int mfd; - char filename[PATH_MAX + 1]; cassert(config_prof); #ifdef __FreeBSD__ - malloc_snprintf(filename, sizeof(filename), "/proc/curproc/map"); + mfd = prof_open_maps("/proc/curproc/map"); #else - malloc_snprintf(filename, sizeof(filename), "/proc/%d/maps", - (int)getpid()); + { + int pid = getpid(); + + mfd = prof_open_maps("/proc/%d/task/%d/maps", pid, pid); + if (mfd == -1) + mfd = prof_open_maps("/proc/%d/maps", pid); + } #endif - mfd = open(filename, O_RDONLY); if (mfd != -1) { ssize_t nread; -- cgit v0.12 From 7041720ac208fa2f7f65e40d8133d4b291516847 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 1 May 2015 12:31:12 -0700 Subject: Rename pprof to jeprof. This rename avoids installation collisions with the upstream gperftools. Additionally, jemalloc's per thread heap profile functionality introduced an incompatible file format, so it's now worthwhile to clearly distinguish jemalloc's version of this script from the upstream version. This resolves #229. 
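[Editor's sketch] The prof_dump_maps() change above reduces to a small varargs helper plus a two-step fallback. A minimal sketch of that shape, assuming plain vsnprintf() in place of jemalloc's internal malloc_vsnprintf() and an arbitrary buffer size; an illustration only, not the library's code:

#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <unistd.h>

/* Build a /proc path from a format string and open it read-only. */
static int
open_maps(const char *format, ...)
{
	char filename[4096];
	va_list ap;

	va_start(ap, format);
	vsnprintf(filename, sizeof(filename), format, ap);
	va_end(ap);
	return (open(filename, O_RDONLY));
}

int
main(void)
{
	int pid = (int)getpid();
	/* Prefer the per-task maps file; fall back to the per-process one. */
	int fd = open_maps("/proc/%d/task/%d/maps", pid, pid);

	if (fd == -1)
		fd = open_maps("/proc/%d/maps", pid);
	if (fd != -1)
		close(fd);
	return (fd == -1);
}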
--- .gitignore | 1 + ChangeLog | 5 +- Makefile.in | 2 +- bin/jeprof.in | 5510 +++++++++++++++++++++++++++++++++++++++++++++++++++ bin/pprof | 5508 -------------------------------------------------- configure.ac | 2 +- doc/jemalloc.xml.in | 5 +- src/prof.c | 2 +- 8 files changed, 5520 insertions(+), 5515 deletions(-) create mode 100644 bin/jeprof.in delete mode 100755 bin/pprof diff --git a/.gitignore b/.gitignore index 5cd3e92..d0e3936 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /bin/jemalloc-config /bin/jemalloc.sh +/bin/jeprof /config.stamp /config.log diff --git a/ChangeLog b/ChangeLog index 6f79cac..33139f9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -117,8 +117,9 @@ found in the git revision history: - Assure that the constness of malloc_usable_size()'s return type matches that of the system implementation. - Change the heap profile dump format to support per thread heap profiling, - and enhance pprof with the --thread= option. As a result, the bundled - pprof must now be used rather than the upstream (gperftools) pprof. + rename pprof to jeprof, and enhance it with the --thread= option. As a + result, the bundled jeprof must now be used rather than the upstream + (gperftools) pprof. - Disable "opt.prof_final" by default, in order to avoid atexit(3), which can internally deadlock on some platforms. - Change the "arenas.nlruns" mallctl type from size_t to unsigned. diff --git a/Makefile.in b/Makefile.in index a105bb1..f539fad 100644 --- a/Makefile.in +++ b/Makefile.in @@ -73,7 +73,7 @@ endif LIBJEMALLOC := $(LIBPREFIX)jemalloc$(install_suffix) # Lists of files. -BINS := $(srcroot)bin/pprof $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh +BINS := $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh $(objroot)bin/jeprof C_HDRS := $(objroot)include/jemalloc/jemalloc$(install_suffix).h C_SRCS := $(srcroot)src/jemalloc.c $(srcroot)src/arena.c \ $(srcroot)src/atomic.c $(srcroot)src/base.c $(srcroot)src/bitmap.c \ diff --git a/bin/jeprof.in b/bin/jeprof.in new file mode 100644 index 0000000..e717807 --- /dev/null +++ b/bin/jeprof.in @@ -0,0 +1,5510 @@ +#! /usr/bin/env perl + +# Copyright (c) 1998-2007, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# --- +# Program for printing the profile generated by common/profiler.cc, +# or by the heap profiler (common/debugallocation.cc) +# +# The profile contains a sequence of entries of the form: +# +# This program parses the profile, and generates user-readable +# output. +# +# Examples: +# +# % tools/jeprof "program" "profile" +# Enters "interactive" mode +# +# % tools/jeprof --text "program" "profile" +# Generates one line per procedure +# +# % tools/jeprof --gv "program" "profile" +# Generates annotated call-graph and displays via "gv" +# +# % tools/jeprof --gv --focus=Mutex "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# +# % tools/jeprof --gv --focus=Mutex --ignore=string "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# and does not match "string" +# +# % tools/jeprof --list=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --list= pattern. The listing is +# annotated with the flat and cumulative sample counts at each line. +# +# % tools/jeprof --disasm=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --disasm= pattern. The listing is +# annotated with the flat and cumulative sample counts at each PC value. +# +# TODO: Use color to indicate files? + +use strict; +use warnings; +use Getopt::Long; + +my $JEPROF_VERSION = "@jemalloc_version@"; +my $PPROF_VERSION = "2.0"; + +# These are the object tools we use which can come from a +# user-specified location using --tools, from the JEPROF_TOOLS +# environment variable, or from the environment. +my %obj_tool_map = ( + "objdump" => "objdump", + "nm" => "nm", + "addr2line" => "addr2line", + "c++filt" => "c++filt", + ## ConfigureObjTools may add architecture-specific entries: + #"nm_pdb" => "nm-pdb", # for reading windows (PDB-format) executables + #"addr2line_pdb" => "addr2line-pdb", # ditto + #"otool" => "otool", # equivalent of objdump on OS X +); +# NOTE: these are lists, so you can put in commandline flags if you want. 
+my @DOT = ("dot"); # leave non-absolute, since it may be in /usr/local +my @GV = ("gv"); +my @EVINCE = ("evince"); # could also be xpdf or perhaps acroread +my @KCACHEGRIND = ("kcachegrind"); +my @PS2PDF = ("ps2pdf"); +# These are used for dynamic profiles +my @URL_FETCHER = ("curl", "-s"); + +# These are the web pages that servers need to support for dynamic profiles +my $HEAP_PAGE = "/pprof/heap"; +my $PROFILE_PAGE = "/pprof/profile"; # must support cgi-param "?seconds=#" +my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param + # ?seconds=#&event=x&period=n +my $GROWTH_PAGE = "/pprof/growth"; +my $CONTENTION_PAGE = "/pprof/contention"; +my $WALL_PAGE = "/pprof/wall(?:\\?.*)?"; # accepts options like namefilter +my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; +my $CENSUSPROFILE_PAGE = "/pprof/censusprofile(?:\\?.*)?"; # must support cgi-param + # "?seconds=#", + # "?tags_regexp=#" and + # "?type=#". +my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST +my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; + +# These are the web pages that can be named on the command line. +# All the alternatives must begin with /. +my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . + "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . + "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)"; + +# default binary name +my $UNKNOWN_BINARY = "(unknown)"; + +# There is a pervasive dependency on the length (in hex characters, +# i.e., nibbles) of an address, distinguishing between 32-bit and +# 64-bit profiles. To err on the safe size, default to 64-bit here: +my $address_length = 16; + +my $dev_null = "/dev/null"; +if (! -e $dev_null && $^O =~ /MSWin/) { # $^O is the OS perl was built for + $dev_null = "nul"; +} + +# A list of paths to search for shared object files +my @prefix_list = (); + +# Special routine name that should not have any symbols. +# Used as separator to parse "addr2line -i" output. +my $sep_symbol = '_fini'; +my $sep_address = undef; + +##### Argument parsing ##### + +sub usage_string { + return < + is a space separated list of profile names. +jeprof [options] + is a list of profile files where each file contains + the necessary symbol mappings as well as profile data (likely generated + with --raw). +jeprof [options] + is a remote form. Symbols are obtained from host:port$SYMBOL_PAGE + + Each name can be: + /path/to/profile - a path to a profile file + host:port[/] - a location of a service to get profile from + + The / can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile, + $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall, + $CENSUSPROFILE_PAGE, or /pprof/filteredprofile. + For instance: + jeprof http://myserver.com:80$HEAP_PAGE + If / is omitted, the service defaults to $PROFILE_PAGE (cpu profiling). +jeprof --symbols + Maps addresses to symbol names. In this mode, stdin should be a + list of library mappings, in the same format as is found in the heap- + and cpu-profile files (this loosely matches that of /proc/self/maps + on linux), followed by a list of hex addresses to map, one per line. 
+ + For more help with querying remote servers, including how to add the + necessary server-side support code, see this filename (or one like it): + + /usr/doc/gperftools-$PPROF_VERSION/pprof_remote_servers.html + +Options: + --cum Sort by cumulative data + --base= Subtract from before display + --interactive Run in interactive mode (interactive "help" gives help) [default] + --seconds= Length of time for dynamic profiles [default=30 secs] + --add_lib= Read additional symbols and line info from the given library + --lib_prefix= Comma separated list of library path prefixes + +Reporting Granularity: + --addresses Report at address level + --lines Report at source line level + --functions Report at function level [default] + --files Report at source file level + +Output type: + --text Generate text report + --callgrind Generate callgrind format to stdout + --gv Generate Postscript and display + --evince Generate PDF and display + --web Generate SVG and display + --list= Generate source listing of matching routines + --disasm= Generate disassembly of matching routines + --symbols Print demangled symbol names found at given addresses + --dot Generate DOT file to stdout + --ps Generate Postcript to stdout + --pdf Generate PDF to stdout + --svg Generate SVG to stdout + --gif Generate GIF to stdout + --raw Generate symbolized jeprof data (useful with remote fetch) + +Heap-Profile Options: + --inuse_space Display in-use (mega)bytes [default] + --inuse_objects Display in-use objects + --alloc_space Display allocated (mega)bytes + --alloc_objects Display allocated objects + --show_bytes Display space in bytes + --drop_negative Ignore negative differences + +Contention-profile options: + --total_delay Display total delay at each region [default] + --contentions Display number of delays at each region + --mean_delay Display mean delay at each region + +Call-graph Options: + --nodecount= Show at most so many nodes [default=80] + --nodefraction= Hide nodes below *total [default=.005] + --edgefraction= Hide edges below *total [default=.001] + --maxdegree= Max incoming/outgoing edges per node [default=8] + --focus= Focus on nodes matching + --thread= Show profile for thread + --ignore= Ignore nodes matching + --scale= Set GV scaling [default=0] + --heapcheck Make nodes with non-0 object counts + (i.e. direct leak generators) more visible + +Miscellaneous: + --tools=[,...] \$PATH for object tool pathnames + --test Run unit tests + --help This message + --version Version information + +Environment Variables: + JEPROF_TMPDIR Profiles directory. 
Defaults to \$HOME/jeprof + JEPROF_TOOLS Prefix for object tools pathnames + +Examples: + +jeprof /bin/ls ls.prof + Enters "interactive" mode +jeprof --text /bin/ls ls.prof + Outputs one line per procedure +jeprof --web /bin/ls ls.prof + Displays annotated call-graph in web browser +jeprof --gv /bin/ls ls.prof + Displays annotated call-graph via 'gv' +jeprof --gv --focus=Mutex /bin/ls ls.prof + Restricts to code paths including a .*Mutex.* entry +jeprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof + Code paths including Mutex but not string +jeprof --list=getdir /bin/ls ls.prof + (Per-line) annotated source listing for getdir() +jeprof --disasm=getdir /bin/ls ls.prof + (Per-PC) annotated disassembly for getdir() + +jeprof http://localhost:1234/ + Enters "interactive" mode +jeprof --text localhost:1234 + Outputs one line per procedure for localhost:1234 +jeprof --raw localhost:1234 > ./local.raw +jeprof --text ./local.raw + Fetches a remote profile for later analysis and then + analyzes it in text mode. +EOF +} + +sub version_string { + return < \$main::opt_help, + "version!" => \$main::opt_version, + "cum!" => \$main::opt_cum, + "base=s" => \$main::opt_base, + "seconds=i" => \$main::opt_seconds, + "add_lib=s" => \$main::opt_lib, + "lib_prefix=s" => \$main::opt_lib_prefix, + "functions!" => \$main::opt_functions, + "lines!" => \$main::opt_lines, + "addresses!" => \$main::opt_addresses, + "files!" => \$main::opt_files, + "text!" => \$main::opt_text, + "callgrind!" => \$main::opt_callgrind, + "list=s" => \$main::opt_list, + "disasm=s" => \$main::opt_disasm, + "symbols!" => \$main::opt_symbols, + "gv!" => \$main::opt_gv, + "evince!" => \$main::opt_evince, + "web!" => \$main::opt_web, + "dot!" => \$main::opt_dot, + "ps!" => \$main::opt_ps, + "pdf!" => \$main::opt_pdf, + "svg!" => \$main::opt_svg, + "gif!" => \$main::opt_gif, + "raw!" => \$main::opt_raw, + "interactive!" => \$main::opt_interactive, + "nodecount=i" => \$main::opt_nodecount, + "nodefraction=f" => \$main::opt_nodefraction, + "edgefraction=f" => \$main::opt_edgefraction, + "maxdegree=i" => \$main::opt_maxdegree, + "focus=s" => \$main::opt_focus, + "thread=s" => \$main::opt_thread, + "ignore=s" => \$main::opt_ignore, + "scale=i" => \$main::opt_scale, + "heapcheck" => \$main::opt_heapcheck, + "inuse_space!" => \$main::opt_inuse_space, + "inuse_objects!" => \$main::opt_inuse_objects, + "alloc_space!" => \$main::opt_alloc_space, + "alloc_objects!" => \$main::opt_alloc_objects, + "show_bytes!" => \$main::opt_show_bytes, + "drop_negative!" => \$main::opt_drop_negative, + "total_delay!" => \$main::opt_total_delay, + "contentions!" => \$main::opt_contentions, + "mean_delay!" => \$main::opt_mean_delay, + "tools=s" => \$main::opt_tools, + "test!" => \$main::opt_test, + "debug!" 
=> \$main::opt_debug, + # Undocumented flags used only by unittests: + "test_stride=i" => \$main::opt_test_stride, + ) || usage("Invalid option(s)"); + + # Deal with the standard --help and --version + if ($main::opt_help) { + print usage_string(); + exit(0); + } + + if ($main::opt_version) { + print version_string(); + exit(0); + } + + # Disassembly/listing/symbols mode requires address-level info + if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) { + $main::opt_functions = 0; + $main::opt_lines = 0; + $main::opt_addresses = 1; + $main::opt_files = 0; + } + + # Check heap-profiling flags + if ($main::opt_inuse_space + + $main::opt_inuse_objects + + $main::opt_alloc_space + + $main::opt_alloc_objects > 1) { + usage("Specify at most on of --inuse/--alloc options"); + } + + # Check output granularities + my $grains = + $main::opt_functions + + $main::opt_lines + + $main::opt_addresses + + $main::opt_files + + 0; + if ($grains > 1) { + usage("Only specify one output granularity option"); + } + if ($grains == 0) { + $main::opt_functions = 1; + } + + # Check output modes + my $modes = + $main::opt_text + + $main::opt_callgrind + + ($main::opt_list eq '' ? 0 : 1) + + ($main::opt_disasm eq '' ? 0 : 1) + + ($main::opt_symbols == 0 ? 0 : 1) + + $main::opt_gv + + $main::opt_evince + + $main::opt_web + + $main::opt_dot + + $main::opt_ps + + $main::opt_pdf + + $main::opt_svg + + $main::opt_gif + + $main::opt_raw + + $main::opt_interactive + + 0; + if ($modes > 1) { + usage("Only specify one output mode"); + } + if ($modes == 0) { + if (-t STDOUT) { # If STDOUT is a tty, activate interactive mode + $main::opt_interactive = 1; + } else { + $main::opt_text = 1; + } + } + + if ($main::opt_test) { + RunUnitTests(); + # Should not return + exit(1); + } + + # Binary name and profile arguments list + $main::prog = ""; + @main::pfile_args = (); + + # Remote profiling without a binary (using $SYMBOL_PAGE instead) + if (@ARGV > 0) { + if (IsProfileURL($ARGV[0])) { + $main::use_symbol_page = 1; + } elsif (IsSymbolizedProfileFile($ARGV[0])) { + $main::use_symbolized_profile = 1; + $main::prog = $UNKNOWN_BINARY; # will be set later from the profile file + } + } + + if ($main::use_symbol_page || $main::use_symbolized_profile) { + # We don't need a binary! + my %disabled = ('--lines' => $main::opt_lines, + '--disasm' => $main::opt_disasm); + for my $option (keys %disabled) { + usage("$option cannot be used without a binary") if $disabled{$option}; + } + # Set $main::prog later... + scalar(@ARGV) || usage("Did not specify profile file"); + } elsif ($main::opt_symbols) { + # --symbols needs a binary-name (to run nm on, etc) but not profiles + $main::prog = shift(@ARGV) || usage("Did not specify program"); + } else { + $main::prog = shift(@ARGV) || usage("Did not specify program"); + scalar(@ARGV) || usage("Did not specify profile file"); + } + + # Parse profile file/location arguments + foreach my $farg (@ARGV) { + if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) { + my $machine = $1; + my $num_machines = $2; + my $path = $3; + for (my $i = 0; $i < $num_machines; $i++) { + unshift(@main::pfile_args, "$i.$machine$path"); + } + } else { + unshift(@main::pfile_args, $farg); + } + } + + if ($main::use_symbol_page) { + unless (IsProfileURL($main::pfile_args[0])) { + error("The first profile should be a remote form to use $SYMBOL_PAGE\n"); + } + CheckSymbolPage(); + $main::prog = FetchProgramName(); + } elsif (!$main::use_symbolized_profile) { # may not need objtools! 
+ ConfigureObjTools($main::prog) + } + + # Break the opt_lib_prefix into the prefix_list array + @prefix_list = split (',', $main::opt_lib_prefix); + + # Remove trailing / from the prefixes, in the list to prevent + # searching things like /my/path//lib/mylib.so + foreach (@prefix_list) { + s|/+$||; + } +} + +sub FilterAndPrint { + my ($profile, $symbols, $libs, $thread) = @_; + + # Get total data in profile + my $total = TotalProfile($profile); + + # Remove uniniteresting stack items + $profile = RemoveUninterestingFrames($symbols, $profile); + + # Focus? + if ($main::opt_focus ne '') { + $profile = FocusProfile($symbols, $profile, $main::opt_focus); + } + + # Ignore? + if ($main::opt_ignore ne '') { + $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore); + } + + my $calls = ExtractCalls($symbols, $profile); + + # Reduce profiles to required output granularity, and also clean + # each stack trace so a given entry exists at most once. + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + # Print + if (!$main::opt_interactive) { + if ($main::opt_disasm) { + PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm); + } elsif ($main::opt_list) { + PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0); + } elsif ($main::opt_text) { + # Make sure the output is empty when have nothing to report + # (only matters when --heapcheck is given but we must be + # compatible with old branches that did not pass --heapcheck always): + if ($total != 0) { + printf("Total%s: %s %s\n", + (defined($thread) ? " (t$thread)" : ""), + Unparse($total), Units()); + } + PrintText($symbols, $flat, $cumulative, -1); + } elsif ($main::opt_raw) { + PrintSymbolizedProfile($symbols, $profile, $main::prog); + } elsif ($main::opt_callgrind) { + PrintCallgrind($calls); + } else { + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), ""); + } elsif ($main::opt_web) { + my $tmp = TempName($main::next_tmpfile, "svg"); + RunWeb($tmp); + # The command we run might hand the file name off + # to an already running browser instance and then exit. + # Normally, we'd remove $tmp on exit (right now), + # but fork a child to remove $tmp a little later, so that the + # browser has time to load it first. + delete $main::tempnames{$tmp}; + if (fork() == 0) { + sleep 5; + unlink($tmp); + exit(0); + } + } + } else { + cleanup(); + exit(1); + } + } + } else { + InteractiveMode($profile, $symbols, $libs, $total); + } +} + +sub Main() { + Init(); + $main::collected_profile = undef; + @main::profile_files = (); + $main::op_time = time(); + + # Printing symbols is special and requires a lot less info that most. + if ($main::opt_symbols) { + PrintSymbols(*STDIN); # Get /proc/maps and symbols output from stdin + return; + } + + # Fetch all profile data + FetchDynamicProfiles(); + + # this will hold symbols that we read from the profile files + my $symbol_map = {}; + + # Read one profile, pick the last item on the list + my $data = ReadProfile($main::prog, pop(@main::profile_files)); + my $profile = $data->{profile}; + my $pcs = $data->{pcs}; + my $libs = $data->{libs}; # Info about main program and shared libraries + $symbol_map = MergeSymbols($symbol_map, $data->{symbols}); + + # Add additional profiles, if available. 
+ if (scalar(@main::profile_files) > 0) { + foreach my $pname (@main::profile_files) { + my $data2 = ReadProfile($main::prog, $pname); + $profile = AddProfile($profile, $data2->{profile}); + $pcs = AddPcs($pcs, $data2->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $data2->{symbols}); + } + } + + # Subtract base from profile, if specified + if ($main::opt_base ne '') { + my $base = ReadProfile($main::prog, $main::opt_base); + $profile = SubtractProfile($profile, $base->{profile}); + $pcs = AddPcs($pcs, $base->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $base->{symbols}); + } + + # Collect symbols + my $symbols; + if ($main::use_symbolized_profile) { + $symbols = FetchSymbols($pcs, $symbol_map); + } elsif ($main::use_symbol_page) { + $symbols = FetchSymbols($pcs); + } else { + # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, + # which may differ from the data from subsequent profiles, especially + # if they were run on different machines. Use appropriate libs for + # each pc somehow. + $symbols = ExtractSymbols($libs, $pcs); + } + + if (!defined($main::opt_thread)) { + FilterAndPrint($profile, $symbols, $libs); + } + if (defined($data->{threads})) { + foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) { + if (defined($main::opt_thread) && + ($main::opt_thread eq '*' || $main::opt_thread == $thread)) { + my $thread_profile = $data->{threads}{$thread}; + FilterAndPrint($thread_profile, $symbols, $libs, $thread); + } + } + } + + cleanup(); + exit(0); +} + +##### Entry Point ##### + +Main(); + +# Temporary code to detect if we're running on a Goobuntu system. +# These systems don't have the right stuff installed for the special +# Readline libraries to work, so as a temporary workaround, we default +# to using the normal stdio code, rather than the fancier readline-based +# code +sub ReadlineMightFail { + if (-e '/lib/libtermcap.so.2') { + return 0; # libtermcap exists, so readline should be okay + } else { + return 1; + } +} + +sub RunGV { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + if (!system(ShellEscape(@GV, "--version") . " >$dev_null 2>&1")) { + # Options using double dash are supported by this gv version. + # Also, turn on noantialias to better handle bug in gv for + # postscript files with large dimensions. + # TODO: Maybe we should not pass the --noantialias flag + # if the gv version is known to work properly without the flag. + system(ShellEscape(@GV, "--scale=$main::opt_scale", "--noantialias", $fname) + . $bg); + } else { + # Old gv version - only supports options that use single dash. + print STDERR ShellEscape(@GV, "-scale", $main::opt_scale) . "\n"; + system(ShellEscape(@GV, "-scale", "$main::opt_scale", $fname) . $bg); + } +} + +sub RunEvince { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + system(ShellEscape(@EVINCE, $fname) . $bg); +} + +sub RunWeb { + my $fname = shift; + print STDERR "Loading web page file:///$fname\n"; + + if (`uname` =~ /Darwin/) { + # OS X: open will use standard preference for SVG files. + system("/usr/bin/open", $fname); + return; + } + + # Some kind of Unix; try generic symlinks, then specific browsers. + # (Stop once we find one.) + # Works best if the browser is already running. 
+ my @alt = ( + "/etc/alternatives/gnome-www-browser", + "/etc/alternatives/x-www-browser", + "google-chrome", + "firefox", + ); + foreach my $b (@alt) { + if (system($b, $fname) == 0) { + return; + } + } + + print STDERR "Could not load web browser.\n"; +} + +sub RunKcachegrind { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + print STDERR "Starting '@KCACHEGRIND " . $fname . $bg . "'\n"; + system(ShellEscape(@KCACHEGRIND, $fname) . $bg); +} + + +##### Interactive helper routines ##### + +sub InteractiveMode { + $| = 1; # Make output unbuffered for interactive mode + my ($orig_profile, $symbols, $libs, $total) = @_; + + print STDERR "Welcome to jeprof! For help, type 'help'.\n"; + + # Use ReadLine if it's installed and input comes from a console. + if ( -t STDIN && + !ReadlineMightFail() && + defined(eval {require Term::ReadLine}) ) { + my $term = new Term::ReadLine 'jeprof'; + while ( defined ($_ = $term->readline('(jeprof) '))) { + $term->addhistory($_) if /\S/; + if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) { + last; # exit when we get an interactive command to quit + } + } + } else { # don't have readline + while (1) { + print STDERR "(jeprof) "; + $_ = ; + last if ! defined $_ ; + s/\r//g; # turn windows-looking lines into unix-looking lines + + # Save some flags that might be reset by InteractiveCommand() + my $save_opt_lines = $main::opt_lines; + + if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) { + last; # exit when we get an interactive command to quit + } + + # Restore flags + $main::opt_lines = $save_opt_lines; + } + } +} + +# Takes two args: orig profile, and command to run. +# Returns 1 if we should keep going, or 0 if we were asked to quit +sub InteractiveCommand { + my($orig_profile, $symbols, $libs, $total, $command) = @_; + $_ = $command; # just to make future m//'s easier + if (!defined($_)) { + print STDERR "\n"; + return 0; + } + if (m/^\s*quit/) { + return 0; + } + if (m/^\s*help/) { + InteractiveHelpMessage(); + return 1; + } + # Clear all the mode options -- mode is controlled by "$command" + $main::opt_text = 0; + $main::opt_callgrind = 0; + $main::opt_disasm = 0; + $main::opt_list = 0; + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_cum = 0; + + if (m/^\s*(text|top)(\d*)\s*(.*)/) { + $main::opt_text = 1; + + my $line_limit = ($2 ne "") ? 
int($2) : 10; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($3); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintText($symbols, $flat, $cumulative, $line_limit); + return 1; + } + if (m/^\s*callgrind\s*([^ \n]*)/) { + $main::opt_callgrind = 1; + + # Get derived profiles + my $calls = ExtractCalls($symbols, $orig_profile); + my $filename = $1; + if ( $1 eq '' ) { + $filename = TempName($main::next_tmpfile, "callgrind"); + } + PrintCallgrind($calls, $filename); + if ( $1 eq '' ) { + RunKcachegrind($filename, " & "); + $main::next_tmpfile++; + } + + return 1; + } + if (m/^\s*(web)?list\s*(.+)/) { + my $html = (defined($1) && ($1 eq "web")); + $main::opt_list = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($2); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintListing($total, $libs, $flat, $cumulative, $routine, $html); + return 1; + } + if (m/^\s*disasm\s*(.+)/) { + $main::opt_disasm = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($1); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintDisassembly($libs, $flat, $cumulative, $routine); + return 1; + } + if (m/^\s*(gv|web|evince)\s*(.*)/) { + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_web = 0; + if ($1 eq "gv") { + $main::opt_gv = 1; + } elsif ($1 eq "evince") { + $main::opt_evince = 1; + } elsif ($1 eq "web") { + $main::opt_web = 1; + } + + my $focus; + my $ignore; + ($focus, $ignore) = ParseInteractiveArgs($2); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, + $focus, $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), " &"); + } elsif ($main::opt_web) { + RunWeb(TempName($main::next_tmpfile, "svg")); + } + $main::next_tmpfile++; + } + return 1; + } + if (m/^\s*$/) { + return 1; + } + print STDERR "Unknown command: try 'help'.\n"; + return 1; +} + + +sub ProcessProfile { + my $total_count = shift; + my $orig_profile = shift; + my $symbols = shift; + my $focus = shift; + my $ignore = shift; + + # Process current profile to account for various settings + my $profile = $orig_profile; + printf("Total: %s %s\n", Unparse($total_count), Units()); + if ($focus ne '') { + $profile = FocusProfile($symbols, $profile, $focus); + my $focus_count = TotalProfile($profile); + printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n", + $focus, + Unparse($focus_count), Units(), + Unparse($total_count), ($focus_count*100.0) / $total_count); + } + if ($ignore ne '') { + 
$profile = IgnoreProfile($symbols, $profile, $ignore); + my $ignore_count = TotalProfile($profile); + printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n", + $ignore, + Unparse($ignore_count), Units(), + Unparse($total_count), + ($ignore_count*100.0) / $total_count); + } + + return $profile; +} + +sub InteractiveHelpMessage { + print STDERR <{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + my $depth = $#addrs + 1; + # int(foo / 2**32) is the only reliable way to get rid of bottom + # 32 bits on both 32- and 64-bit systems. + print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32)); + print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32)); + + foreach my $full_addr (@addrs) { + my $addr = $full_addr; + $addr =~ s/0x0*//; # strip off leading 0x, zeroes + if (length($addr) > 16) { + print STDERR "Invalid address in profile: $full_addr\n"; + next; + } + my $low_addr = substr($addr, -8); # get last 8 hex chars + my $high_addr = substr($addr, -16, 8); # get up to 8 more hex chars + print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr)); + } + } + } +} + +# Print symbols and profile data +sub PrintSymbolizedProfile { + my $symbols = shift; + my $profile = shift; + my $prog = shift; + + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + + print '--- ', $symbol_marker, "\n"; + if (defined($prog)) { + print 'binary=', $prog, "\n"; + } + while (my ($pc, $name) = each(%{$symbols})) { + my $sep = ' '; + print '0x', $pc; + # We have a list of function names, which include the inlined + # calls. They are separated (and terminated) by --, which is + # illegal in function names. + for (my $j = 2; $j <= $#{$name}; $j += 3) { + print $sep, $name->[$j]; + $sep = '--'; + } + print "\n"; + } + print '---', "\n"; + + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $profile_marker = $&; + print '--- ', $profile_marker, "\n"; + if (defined($main::collected_profile)) { + # if used with remote fetch, simply dump the collected profile to output. + open(SRC, "<$main::collected_profile"); + while () { + print $_; + } + close(SRC); + } else { + # dump a cpu-format profile to standard out + PrintProfileData($profile); + } +} + +# Print text output +sub PrintText { + my $symbols = shift; + my $flat = shift; + my $cumulative = shift; + my $line_limit = shift; + + my $total = TotalProfile($flat); + + # Which profile to sort by? + my $s = $main::opt_cum ? $cumulative : $flat; + + my $running_sum = 0; + my $lines = 0; + foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b } + keys(%{$cumulative})) { + my $f = GetEntry($flat, $k); + my $c = GetEntry($cumulative, $k); + $running_sum += $f; + + my $sym = $k; + if (exists($symbols->{$k})) { + $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1]; + if ($main::opt_addresses) { + $sym = $k . " " . $sym; + } + } + + if ($f != 0 || $c != 0) { + printf("%8s %6s %6s %8s %6s %s\n", + Unparse($f), + Percent($f, $total), + Percent($running_sum, $total), + Unparse($c), + Percent($c, $total), + $sym); + } + $lines++; + last if ($line_limit >= 0 && $lines >= $line_limit); + } +} + +# Callgrind format has a compression for repeated function and file +# names. You show the name the first time, and just use its number +# subsequently. This can cut down the file to about a third or a +# quarter of its uncompressed size. $key and $val are the key/value +# pair that would normally be printed by callgrind; $map is a map from +# value to number. 
+sub CompressedCGName { + my($key, $val, $map) = @_; + my $idx = $map->{$val}; + # For very short keys, providing an index hurts rather than helps. + if (length($val) <= 3) { + return "$key=$val\n"; + } elsif (defined($idx)) { + return "$key=($idx)\n"; + } else { + # scalar(keys $map) gives the number of items in the map. + $idx = scalar(keys(%{$map})) + 1; + $map->{$val} = $idx; + return "$key=($idx) $val\n"; + } +} + +# Print the call graph in a way that's suiteable for callgrind. +sub PrintCallgrind { + my $calls = shift; + my $filename; + my %filename_to_index_map; + my %fnname_to_index_map; + + if ($main::opt_interactive) { + $filename = shift; + print STDERR "Writing callgrind file to '$filename'.\n" + } else { + $filename = "&STDOUT"; + } + open(CG, ">$filename"); + printf CG ("events: Hits\n\n"); + foreach my $call ( map { $_->[0] } + sort { $a->[1] cmp $b ->[1] || + $a->[2] <=> $b->[2] } + map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + [$_, $1, $2] } + keys %$calls ) { + my $count = int($calls->{$call}); + $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + my ( $caller_file, $caller_line, $caller_function, + $callee_file, $callee_line, $callee_function ) = + ( $1, $2, $3, $5, $6, $7 ); + + # TODO(csilvers): for better compression, collect all the + # caller/callee_files and functions first, before printing + # anything, and only compress those referenced more than once. + printf CG CompressedCGName("fl", $caller_file, \%filename_to_index_map); + printf CG CompressedCGName("fn", $caller_function, \%fnname_to_index_map); + if (defined $6) { + printf CG CompressedCGName("cfl", $callee_file, \%filename_to_index_map); + printf CG CompressedCGName("cfn", $callee_function, \%fnname_to_index_map); + printf CG ("calls=$count $callee_line\n"); + } + printf CG ("$caller_line $count\n\n"); + } +} + +# Print disassembly for all all routines that match $main::opt_disasm +sub PrintDisassembly { + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $disasm_opts = shift; + + my $total = TotalProfile($flat); + + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + # See if there are any samples in this routine + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + PrintDisassembledFunction($lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, $total); + last; + } + $addr = AddressInc($addr); + } + } + } +} + +# Return reference to array of tuples of the form: +# [start_address, filename, linenumber, instruction, limit_address] +# E.g., +# ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"] +sub Disassemble { + my $prog = shift; + my $offset = shift; + my $start_addr = shift; + my $end_addr = shift; + + my $objdump = $obj_tool_map{"objdump"}; + my $cmd = ShellEscape($objdump, "-C", "-d", "-l", "--no-show-raw-insn", + "--start-address=0x$start_addr", + "--stop-address=0x$end_addr", $prog); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + my @result = (); + my $filename = ""; + my $linenumber = -1; + my $last = ["", "", "", ""]; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + chop; + if (m|\s*([^:\s]+):(\d+)\s*$|) { + # 
Location line of the form: + # : + $filename = $1; + $linenumber = $2; + } elsif (m/^ +([0-9a-f]+):\s*(.*)/) { + # Disassembly line -- zero-extend address to full length + my $addr = HexExtend($1); + my $k = AddressAdd($addr, $offset); + $last->[4] = $k; # Store ending address for previous instruction + $last = [$k, $filename, $linenumber, $2, $end_addr]; + push(@result, $last); + } + } + close(OBJDUMP); + return @result; +} + +# The input file should contain lines of the form /proc/maps-like +# output (same format as expected from the profiles) or that looks +# like hex addresses (like "0xDEADBEEF"). We will parse all +# /proc/maps output, and for all the hex addresses, we will output +# "short" symbol names, one per line, in the same order as the input. +sub PrintSymbols { + my $maps_and_symbols_file = shift; + + # ParseLibraries expects pcs to be in a set. Fine by us... + my @pclist = (); # pcs in sorted order + my $pcs = {}; + my $map = ""; + foreach my $line (<$maps_and_symbols_file>) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /\b(0x[0-9a-f]+)\b/i) { + push(@pclist, HexExtend($1)); + $pcs->{$pclist[-1]} = 1; + } else { + $map .= $line; + } + } + + my $libs = ParseLibraries($main::prog, $map, $pcs); + my $symbols = ExtractSymbols($libs, $pcs); + + foreach my $pc (@pclist) { + # ->[0] is the shortname, ->[2] is the full name + print(($symbols->{$pc}->[0] || "??") . "\n"); + } +} + + +# For sorting functions by name +sub ByName { + return ShortFunctionName($a) cmp ShortFunctionName($b); +} + +# Print source-listing for all all routines that match $list_opts +sub PrintListing { + my $total = shift; + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $list_opts = shift; + my $html = shift; + + my $output = \*STDOUT; + my $fname = ""; + + if ($html) { + # Arrange to write the output to a temporary file + $fname = TempName($main::next_tmpfile, "html"); + $main::next_tmpfile++; + if (!open(TEMP, ">$fname")) { + print STDERR "$fname: $!\n"; + return; + } + $output = \*TEMP; + print $output HtmlListingHeader(); + printf $output ("
<div class=\"legend\">%s<br>Total: %s %s<br><br></div>
\n", + $main::prog, Unparse($total), Units()); + } + + my $listed = 0; + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + # Print if there are any samples in this routine + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + $listed += PrintSource( + $lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, + $html, + $output); + last; + } + $addr = AddressInc($addr); + } + } + } + + if ($html) { + if ($listed > 0) { + print $output HtmlListingFooter(); + close($output); + RunWeb($fname); + } else { + close($output); + unlink($fname); + } + } +} + +sub HtmlListingHeader { + return <<'EOF'; + + + +Pprof listing + + + + +EOF +} + +sub HtmlListingFooter { + return <<'EOF'; + + +EOF +} + +sub HtmlEscape { + my $text = shift; + $text =~ s/&/&/g; + $text =~ s//>/g; + return $text; +} + +# Returns the indentation of the line, if it has any non-whitespace +# characters. Otherwise, returns -1. +sub Indentation { + my $line = shift; + if (m/^(\s*)\S/) { + return length($1); + } else { + return -1; + } +} + +# If the symbol table contains inlining info, Disassemble() may tag an +# instruction with a location inside an inlined function. But for +# source listings, we prefer to use the location in the function we +# are listing. So use MapToSymbols() to fetch full location +# information for each instruction and then pick out the first +# location from a location list (location list contains callers before +# callees in case of inlining). 
+# +# After this routine has run, each entry in $instructions contains: +# [0] start address +# [1] filename for function we are listing +# [2] line number for function we are listing +# [3] disassembly +# [4] limit address +# [5] most specific filename (may be different from [1] due to inlining) +# [6] most specific line number (may be different from [2] due to inlining) +sub GetTopLevelLineNumbers { + my ($lib, $offset, $instructions) = @_; + my $pcs = []; + for (my $i = 0; $i <= $#{$instructions}; $i++) { + push(@{$pcs}, $instructions->[$i]->[0]); + } + my $symbols = {}; + MapToSymbols($lib, $offset, $pcs, $symbols); + for (my $i = 0; $i <= $#{$instructions}; $i++) { + my $e = $instructions->[$i]; + push(@{$e}, $e->[1]); + push(@{$e}, $e->[2]); + my $addr = $e->[0]; + my $sym = $symbols->{$addr}; + if (defined($sym)) { + if ($#{$sym} >= 2 && $sym->[1] =~ m/^(.*):(\d+)$/) { + $e->[1] = $1; # File name + $e->[2] = $2; # Line number + } + } + } +} + +# Print source-listing for one routine +sub PrintSource { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $html = shift; + my $output = shift; + + # Disassemble all instructions (just to get line numbers) + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + GetTopLevelLineNumbers($prog, $offset, \@instructions); + + # Hack 1: assume that the first source file encountered in the + # disassembly contains the routine + my $filename = undef; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[2] >= 0) { + $filename = $instructions[$i]->[1]; + last; + } + } + if (!defined($filename)) { + print STDERR "no filename found in $routine\n"; + return 0; + } + + # Hack 2: assume that the largest line number from $filename is the + # end of the procedure. This is typically safe since if P1 contains + # an inlined call to P2, then P2 usually occurs earlier in the + # source file. If this does not work, we might have to compute a + # density profile or just print all regions we find. + my $lastline = 0; + for (my $i = 0; $i <= $#instructions; $i++) { + my $f = $instructions[$i]->[1]; + my $l = $instructions[$i]->[2]; + if (($f eq $filename) && ($l > $lastline)) { + $lastline = $l; + } + } + + # Hack 3: assume the first source location from "filename" is the start of + # the source code. + my $firstline = 1; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[1] eq $filename) { + $firstline = $instructions[$i]->[2]; + last; + } + } + + # Hack 4: Extend last line forward until its indentation is less than + # the indentation we saw on $firstline + my $oldlastline = $lastline; + { + if (!open(FILE, "<$filename")) { + print STDERR "$filename: $!\n"; + return 0; + } + my $l = 0; + my $first_indentation = -1; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + $l++; + my $indent = Indentation($_); + if ($l >= $firstline) { + if ($first_indentation < 0 && $indent >= 0) { + $first_indentation = $indent; + last if ($first_indentation == 0); + } + } + if ($l >= $lastline && $indent >= 0) { + if ($indent >= $first_indentation) { + $lastline = $l+1; + } else { + last; + } + } + } + close(FILE); + } + + # Assign all samples to the range $firstline,$lastline, + # Hack 4: If an instruction does not occur in the range, its samples + # are moved to the next instruction that occurs in the range. 
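+  # For example (hypothetical counts): if an instruction attributed to an
+  # out-of-range line (say, an inlined header) carries 5 flat samples, those
+  # samples accumulate in $running1/$running2 below and are credited to the
+  # first later instruction whose line falls inside [$firstline, $lastline].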
+ my $samples1 = {}; # Map from line number to flat count + my $samples2 = {}; # Map from line number to cumulative count + my $running1 = 0; # Unassigned flat counts + my $running2 = 0; # Unassigned cumulative counts + my $total1 = 0; # Total flat counts + my $total2 = 0; # Total cumulative counts + my %disasm = (); # Map from line number to disassembly + my $running_disasm = ""; # Unassigned disassembly + my $skip_marker = "---\n"; + if ($html) { + $skip_marker = ""; + for (my $l = $firstline; $l <= $lastline; $l++) { + $disasm{$l} = ""; + } + } + my $last_dis_filename = ''; + my $last_dis_linenum = -1; + my $last_touched_line = -1; # To detect gaps in disassembly for a line + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + + if ($html) { + my $dis = sprintf(" %6s %6s \t\t%8s: %s ", + HtmlPrintNumber($c1), + HtmlPrintNumber($c2), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + + # Append the most specific source line associated with this instruction + if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) }; + $dis = HtmlEscape($dis); + my $f = $e->[5]; + my $l = $e->[6]; + if ($f ne $last_dis_filename) { + $dis .= sprintf("%s:%d", + HtmlEscape(CleanFileName($f)), $l); + } elsif ($l ne $last_dis_linenum) { + # De-emphasize the unchanged file name portion + $dis .= sprintf("%s" . + ":%d", + HtmlEscape(CleanFileName($f)), $l); + } else { + # De-emphasize the entire location + $dis .= sprintf("%s:%d", + HtmlEscape(CleanFileName($f)), $l); + } + $last_dis_filename = $f; + $last_dis_linenum = $l; + $running_disasm .= $dis; + $running_disasm .= "\n"; + } + + $running1 += $c1; + $running2 += $c2; + $total1 += $c1; + $total2 += $c2; + my $file = $e->[1]; + my $line = $e->[2]; + if (($file eq $filename) && + ($line >= $firstline) && + ($line <= $lastline)) { + # Assign all accumulated samples to this line + AddEntry($samples1, $line, $running1); + AddEntry($samples2, $line, $running2); + $running1 = 0; + $running2 = 0; + if ($html) { + if ($line != $last_touched_line && $disasm{$line} ne '') { + $disasm{$line} .= "\n"; + } + $disasm{$line} .= $running_disasm; + $running_disasm = ''; + $last_touched_line = $line; + } + } + } + + # Assign any leftover samples to $lastline + AddEntry($samples1, $lastline, $running1); + AddEntry($samples2, $lastline, $running2); + if ($html) { + if ($lastline != $last_touched_line && $disasm{$lastline} ne '') { + $disasm{$lastline} .= "\n"; + } + $disasm{$lastline} .= $running_disasm; + } + + if ($html) { + printf $output ( + "

<h1>%s</h1>%s\n<pre onClick=\"jeprof_toggle_asm()\">\n" .
+      "Total:%6s %6s (flat / cumulative %s)\n",
+      HtmlEscape(ShortFunctionName($routine)),
+      HtmlEscape(CleanFileName($filename)),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  } else {
+    printf $output (
+      "ROUTINE ====================== %s in %s\n" .
+      "%6s %6s Total %s (flat / cumulative)\n",
+      ShortFunctionName($routine),
+      CleanFileName($filename),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  }
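+  # For example (hypothetical routine and counts), the text branch above
+  # prints a header of the form:
+  #   ROUTINE ====================== Foo::Bar in src/foo.cc
+  #      120    340 Total samples (flat / cumulative)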
+  if (!open(FILE, "<$filename")) {
+    print STDERR "$filename: $!\n";
+    return 0;
+  }
+  my $l = 0;
+  while (<FILE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $l++;
+    if ($l >= $firstline - 5 &&
+        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
+      chop;
+      my $text = $_;
+      if ($l == $firstline) { print $output $skip_marker; }
+      my $n1 = GetEntry($samples1, $l);
+      my $n2 = GetEntry($samples2, $l);
+      if ($html) {
+        # Emit a span that has one of the following classes:
+        #    livesrc -- has samples
+        #    deadsrc -- has disassembly, but with no samples
+        #    nop     -- has no matching disassembly
+        # Also emit an optional span containing disassembly.
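+        # For example (hypothetical): source line 142 with 3 flat and 7
+        # cumulative samples is emitted roughly as
+        #   <span class="line">  142</span> <span class="livesrc">     3      7 x += y;</span>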
+        my $dis = $disasm{$l};
+        my $asm = "";
+        if (defined($dis) && $dis ne '') {
+          $asm = "" . $dis . "";
+        }
+        my $source_class = (($n1 + $n2 > 0)
+                            ? "livesrc"
+                            : (($asm ne "") ? "deadsrc" : "nop"));
+        printf $output (
+          "%5d " .
+          "%6s %6s %s%s\n",
+          $l, $source_class,
+          HtmlPrintNumber($n1),
+          HtmlPrintNumber($n2),
+          HtmlEscape($text),
+          $asm);
+      } else {
+        printf $output(
+          "%6s %6s %4d: %s\n",
+          UnparseAlt($n1),
+          UnparseAlt($n2),
+          $l,
+          $text);
+      }
+      if ($l == $lastline)  { print $output $skip_marker; }
+    };
+  }
+  close(FILE);
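+  # For example (hypothetical): in text mode, a source line with no flat
+  # samples but 12 cumulative samples is printed by the loop above as
+  #      .     12  142: foo(x);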
+  if ($html) {
+    print $output "
\n"; + } + return 1; +} + +# Return the source line for the specified file/linenumber. +# Returns undef if not found. +sub SourceLine { + my $file = shift; + my $line = shift; + + # Look in cache + if (!defined($main::source_cache{$file})) { + if (100 < scalar keys(%main::source_cache)) { + # Clear the cache when it gets too big + $main::source_cache = (); + } + + # Read all lines from the file + if (!open(FILE, "<$file")) { + print STDERR "$file: $!\n"; + $main::source_cache{$file} = []; # Cache the negative result + return undef; + } + my $lines = []; + push(@{$lines}, ""); # So we can use 1-based line numbers as indices + while () { + push(@{$lines}, $_); + } + close(FILE); + + # Save the lines in the cache + $main::source_cache{$file} = $lines; + } + + my $lines = $main::source_cache{$file}; + if (($line < 0) || ($line > $#{$lines})) { + return undef; + } else { + return $lines->[$line]; + } +} + +# Print disassembly for one routine with interspersed source if available +sub PrintDisassembledFunction { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $total = shift; + + # Disassemble all instructions + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + + # Make array of counts per instruction + my @flat_count = (); + my @cum_count = (); + my $flat_total = 0; + my $cum_total = 0; + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + push(@flat_count, $c1); + push(@cum_count, $c2); + $flat_total += $c1; + $cum_total += $c2; + } + + # Print header with total counts + printf("ROUTINE ====================== %s\n" . + "%6s %6s %s (flat, cumulative) %.1f%% of total\n", + ShortFunctionName($routine), + Unparse($flat_total), + Unparse($cum_total), + Units(), + ($cum_total * 100.0) / $total); + + # Process instructions in order + my $current_file = ""; + for (my $i = 0; $i <= $#instructions; ) { + my $e = $instructions[$i]; + + # Print the new file name whenever we switch files + if ($e->[1] ne $current_file) { + $current_file = $e->[1]; + my $fname = $current_file; + $fname =~ s|^\./||; # Trim leading "./" + + # Shorten long file names + if (length($fname) >= 58) { + $fname = "..." . substr($fname, -55); + } + printf("-------------------- %s\n", $fname); + } + + # TODO: Compute range of lines to print together to deal with + # small reorderings. 
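+    # For example (hypothetical): with the current single-line ranges, each
+    # source line is printed once with its summed flat/cumulative counts,
+    # immediately followed by the disassembly of the instructions attributed
+    # to it.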
+ my $first_line = $e->[2]; + my $last_line = $first_line; + my %flat_sum = (); + my %cum_sum = (); + for (my $l = $first_line; $l <= $last_line; $l++) { + $flat_sum{$l} = 0; + $cum_sum{$l} = 0; + } + + # Find run of instructions for this range of source lines + my $first_inst = $i; + while (($i <= $#instructions) && + ($instructions[$i]->[2] >= $first_line) && + ($instructions[$i]->[2] <= $last_line)) { + $e = $instructions[$i]; + $flat_sum{$e->[2]} += $flat_count[$i]; + $cum_sum{$e->[2]} += $cum_count[$i]; + $i++; + } + my $last_inst = $i - 1; + + # Print source lines + for (my $l = $first_line; $l <= $last_line; $l++) { + my $line = SourceLine($current_file, $l); + if (!defined($line)) { + $line = "?\n"; + next; + } else { + $line =~ s/^\s+//; + } + printf("%6s %6s %5d: %s", + UnparseAlt($flat_sum{$l}), + UnparseAlt($cum_sum{$l}), + $l, + $line); + } + + # Print disassembly + for (my $x = $first_inst; $x <= $last_inst; $x++) { + my $e = $instructions[$x]; + printf("%6s %6s %8s: %6s\n", + UnparseAlt($flat_count[$x]), + UnparseAlt($cum_count[$x]), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + } + } +} + +# Print DOT graph +sub PrintDot { + my $prog = shift; + my $symbols = shift; + my $raw = shift; + my $flat = shift; + my $cumulative = shift; + my $overall_total = shift; + + # Get total + my $local_total = TotalProfile($flat); + my $nodelimit = int($main::opt_nodefraction * $local_total); + my $edgelimit = int($main::opt_edgefraction * $local_total); + my $nodecount = $main::opt_nodecount; + + # Find nodes to include + my @list = (sort { abs(GetEntry($cumulative, $b)) <=> + abs(GetEntry($cumulative, $a)) + || $a cmp $b } + keys(%{$cumulative})); + my $last = $nodecount - 1; + if ($last > $#list) { + $last = $#list; + } + while (($last >= 0) && + (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) { + $last--; + } + if ($last < 0) { + print STDERR "No nodes to print\n"; + return 0; + } + + if ($nodelimit > 0 || $edgelimit > 0) { + printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n", + Unparse($nodelimit), Units(), + Unparse($edgelimit), Units()); + } + + # Open DOT output file + my $output; + my $escaped_dot = ShellEscape(@DOT); + my $escaped_ps2pdf = ShellEscape(@PS2PDF); + if ($main::opt_gv) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "ps")); + $output = "| $escaped_dot -Tps2 >$escaped_outfile"; + } elsif ($main::opt_evince) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "pdf")); + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - $escaped_outfile"; + } elsif ($main::opt_ps) { + $output = "| $escaped_dot -Tps2"; + } elsif ($main::opt_pdf) { + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - -"; + } elsif ($main::opt_web || $main::opt_svg) { + # We need to post-process the SVG, so write to a temporary file always. + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "svg")); + $output = "| $escaped_dot -Tsvg >$escaped_outfile"; + } elsif ($main::opt_gif) { + $output = "| $escaped_dot -Tgif"; + } else { + $output = ">&STDOUT"; + } + open(DOT, $output) || error("$output: $!\n"); + + # Title + printf DOT ("digraph \"%s; %s %s\" {\n", + $prog, + Unparse($overall_total), + Units()); + if ($main::opt_pdf) { + # The output is more printable if we set the page size for dot. + printf DOT ("size=\"8,11\"\n"); + } + printf DOT ("node [width=0.375,height=0.25];\n"); + + # Print legend + printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," . 
+ "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n", + $prog, + sprintf("Total %s: %s", Units(), Unparse($overall_total)), + sprintf("Focusing on: %s", Unparse($local_total)), + sprintf("Dropped nodes with <= %s abs(%s)", + Unparse($nodelimit), Units()), + sprintf("Dropped edges with <= %s %s", + Unparse($edgelimit), Units()) + ); + + # Print nodes + my %node = (); + my $nextnode = 1; + foreach my $a (@list[0..$last]) { + # Pick font size + my $f = GetEntry($flat, $a); + my $c = GetEntry($cumulative, $a); + + my $fs = 8; + if ($local_total > 0) { + $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total))); + } + + $node{$a} = $nextnode++; + my $sym = $a; + $sym =~ s/\s+/\\n/g; + $sym =~ s/::/\\n/g; + + # Extra cumulative info to print for non-leaves + my $extra = ""; + if ($f != $c) { + $extra = sprintf("\\rof %s (%s)", + Unparse($c), + Percent($c, $local_total)); + } + my $style = ""; + if ($main::opt_heapcheck) { + if ($f > 0) { + # make leak-causing nodes more visible (add a background) + $style = ",style=filled,fillcolor=gray" + } elsif ($f < 0) { + # make anti-leak-causing nodes (which almost never occur) + # stand out as well (triple border) + $style = ",peripheries=3" + } + } + + printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" . + "\",shape=box,fontsize=%.1f%s];\n", + $node{$a}, + $sym, + Unparse($f), + Percent($f, $local_total), + $extra, + $fs, + $style, + ); + } + + # Get edges and counts per edge + my %edge = (); + my $n; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$raw})) { + # TODO: omit low %age edges + $n = $raw->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + for (my $i = 1; $i <= $#translated; $i++) { + my $src = $translated[$i]; + my $dst = $translated[$i-1]; + #next if ($src eq $dst); # Avoid self-edges? + if (exists($node{$src}) && exists($node{$dst})) { + my $edge_label = "$src\001$dst"; + if (!exists($edge{$edge_label})) { + $edge{$edge_label} = 0; + } + $edge{$edge_label} += $n; + } + } + } + + # Print edges (process in order of decreasing counts) + my %indegree = (); # Number of incoming edges added per node so far + my %outdegree = (); # Number of outgoing edges added per node so far + foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) { + my @x = split(/\001/, $e); + $n = $edge{$e}; + + # Initialize degree of kept incoming and outgoing edges if necessary + my $src = $x[0]; + my $dst = $x[1]; + if (!exists($outdegree{$src})) { $outdegree{$src} = 0; } + if (!exists($indegree{$dst})) { $indegree{$dst} = 0; } + + my $keep; + if ($indegree{$dst} == 0) { + # Keep edge if needed for reachability + $keep = 1; + } elsif (abs($n) <= $edgelimit) { + # Drop if we are below --edgefraction + $keep = 0; + } elsif ($outdegree{$src} >= $main::opt_maxdegree || + $indegree{$dst} >= $main::opt_maxdegree) { + # Keep limited number of in/out edges per node + $keep = 0; + } else { + $keep = 1; + } + + if ($keep) { + $outdegree{$src}++; + $indegree{$dst}++; + + # Compute line width based on edge count + my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0); + if ($fraction > 1) { $fraction = 1; } + my $w = $fraction * 2; + if ($w < 1 && ($main::opt_web || $main::opt_svg)) { + # SVG output treats line widths < 1 poorly. 
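+        # For example (hypothetical): an edge carrying 10% of $local_total
+        # gets $w = min(3 * 0.10, 1) * 2 = 0.6, which is bumped up to 1 here.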
+ $w = 1; + } + + # Dot sometimes segfaults if given edge weights that are too large, so + # we cap the weights at a large value + my $edgeweight = abs($n) ** 0.7; + if ($edgeweight > 100000) { $edgeweight = 100000; } + $edgeweight = int($edgeweight); + + my $style = sprintf("setlinewidth(%f)", $w); + if ($x[1] =~ m/\(inline\)/) { + $style .= ",dashed"; + } + + # Use a slightly squashed function of the edge count as the weight + printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n", + $node{$x[0]}, + $node{$x[1]}, + Unparse($n), + $edgeweight, + $style); + } + } + + print DOT ("}\n"); + close(DOT); + + if ($main::opt_web || $main::opt_svg) { + # Rewrite SVG to be more usable inside web browser. + RewriteSvg(TempName($main::next_tmpfile, "svg")); + } + + return 1; +} + +sub RewriteSvg { + my $svgfile = shift; + + open(SVG, $svgfile) || die "open temp svg: $!"; + my @svg = ; + close(SVG); + unlink $svgfile; + my $svg = join('', @svg); + + # Dot's SVG output is + # + # + # + # ... + # + # + # + # Change it to + # + # + # $svg_javascript + # + # + # ... + # + # + # + + # Fix width, height; drop viewBox. + $svg =~ s/(?s) above first + my $svg_javascript = SvgJavascript(); + my $viewport = "\n"; + $svg =~ s/ above . + $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/; + $svg =~ s/$svgfile") || die "open $svgfile: $!"; + print SVG $svg; + close(SVG); + } +} + +sub SvgJavascript { + return <<'EOF'; + +EOF +} + +# Provides a map from fullname to shortname for cases where the +# shortname is ambiguous. The symlist has both the fullname and +# shortname for all symbols, which is usually fine, but sometimes -- +# such as overloaded functions -- two different fullnames can map to +# the same shortname. In that case, we use the address of the +# function to disambiguate the two. This function fills in a map that +# maps fullnames to modified shortnames in such cases. If a fullname +# is not present in the map, the 'normal' shortname provided by the +# symlist is the appropriate one to use. +sub FillFullnameToShortnameMap { + my $symbols = shift; + my $fullname_to_shortname_map = shift; + my $shortnames_seen_once = {}; + my $shortnames_seen_more_than_once = {}; + + foreach my $symlist (values(%{$symbols})) { + # TODO(csilvers): deal with inlined symbols too. + my $shortname = $symlist->[0]; + my $fullname = $symlist->[2]; + if ($fullname !~ /<[0-9a-fA-F]+>$/) { # fullname doesn't end in an address + next; # the only collisions we care about are when addresses differ + } + if (defined($shortnames_seen_once->{$shortname}) && + $shortnames_seen_once->{$shortname} ne $fullname) { + $shortnames_seen_more_than_once->{$shortname} = 1; + } else { + $shortnames_seen_once->{$shortname} = $fullname; + } + } + + foreach my $symlist (values(%{$symbols})) { + my $shortname = $symlist->[0]; + my $fullname = $symlist->[2]; + # TODO(csilvers): take in a list of addresses we care about, and only + # store in the map if $symlist->[1] is in that list. Saves space. + next if defined($fullname_to_shortname_map->{$fullname}); + if (defined($shortnames_seen_more_than_once->{$shortname})) { + if ($fullname =~ /<0*([^>]*)>$/) { # fullname has address at end of it + $fullname_to_shortname_map->{$fullname} = "$shortname\@$1"; + } + } + } +} + +# Return a small number that identifies the argument. +# Multiple calls with the same argument will return the same number. +# Calls with different arguments will return different numbers. 
+sub ShortIdFor { + my $key = shift; + my $id = $main::uniqueid{$key}; + if (!defined($id)) { + $id = keys(%main::uniqueid) + 1; + $main::uniqueid{$key} = $id; + } + return $id; +} + +# Translate a stack of addresses into a stack of symbols +sub TranslateStack { + my $symbols = shift; + my $fullname_to_shortname_map = shift; + my $k = shift; + + my @addrs = split(/\n/, $k); + my @result = (); + for (my $i = 0; $i <= $#addrs; $i++) { + my $a = $addrs[$i]; + + # Skip large addresses since they sometimes show up as fake entries on RH9 + if (length($a) > 8 && $a gt "7fffffffffffffff") { + next; + } + + if ($main::opt_disasm || $main::opt_list) { + # We want just the address for the key + push(@result, $a); + next; + } + + my $symlist = $symbols->{$a}; + if (!defined($symlist)) { + $symlist = [$a, "", $a]; + } + + # We can have a sequence of symbols for a particular entry + # (more than one symbol in the case of inlining). Callers + # come before callees in symlist, so walk backwards since + # the translated stack should contain callees before callers. + for (my $j = $#{$symlist}; $j >= 2; $j -= 3) { + my $func = $symlist->[$j-2]; + my $fileline = $symlist->[$j-1]; + my $fullfunc = $symlist->[$j]; + if (defined($fullname_to_shortname_map->{$fullfunc})) { + $func = $fullname_to_shortname_map->{$fullfunc}; + } + if ($j > 2) { + $func = "$func (inline)"; + } + + # Do not merge nodes corresponding to Callback::Run since that + # causes confusing cycles in dot display. Instead, we synthesize + # a unique name for this frame per caller. + if ($func =~ m/Callback.*::Run$/) { + my $caller = ($i > 0) ? $addrs[$i-1] : 0; + $func = "Run#" . ShortIdFor($caller); + } + + if ($main::opt_addresses) { + push(@result, "$a $func $fileline"); + } elsif ($main::opt_lines) { + if ($func eq '??' && $fileline eq '??:0') { + push(@result, "$a"); + } else { + push(@result, "$func $fileline"); + } + } elsif ($main::opt_functions) { + if ($func eq '??') { + push(@result, "$a"); + } else { + push(@result, $func); + } + } elsif ($main::opt_files) { + if ($fileline eq '??:0' || $fileline eq '') { + push(@result, "$a"); + } else { + my $f = $fileline; + $f =~ s/:\d+$//; + push(@result, $f); + } + } else { + push(@result, $a); + last; # Do not print inlined info + } + } + } + + # print join(",", @addrs), " => ", join(",", @result), "\n"; + return @result; +} + +# Generate percent string for a number and a total +sub Percent { + my $num = shift; + my $tot = shift; + if ($tot != 0) { + return sprintf("%.1f%%", $num * 100.0 / $tot); + } else { + return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf"); + } +} + +# Generate pretty-printed form of number +sub Unparse { + my $num = shift; + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + if ($main::opt_inuse_objects || $main::opt_alloc_objects) { + return sprintf("%d", $num); + } else { + if ($main::opt_show_bytes) { + return sprintf("%d", $num); + } else { + return sprintf("%.1f", $num / 1048576.0); + } + } + } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { + return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds + } else { + return sprintf("%d", $num); + } +} + +# Alternate pretty-printed form: 0 maps to "." 
+sub UnparseAlt { + my $num = shift; + if ($num == 0) { + return "."; + } else { + return Unparse($num); + } +} + +# Alternate pretty-printed form: 0 maps to "" +sub HtmlPrintNumber { + my $num = shift; + if ($num == 0) { + return ""; + } else { + return Unparse($num); + } +} + +# Return output units +sub Units { + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + if ($main::opt_inuse_objects || $main::opt_alloc_objects) { + return "objects"; + } else { + if ($main::opt_show_bytes) { + return "B"; + } else { + return "MB"; + } + } + } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { + return "seconds"; + } else { + return "samples"; + } +} + +##### Profile manipulation code ##### + +# Generate flattened profile: +# If count is charged to stack [a,b,c,d], in generated profile, +# it will be charged to [a] +sub FlatProfile { + my $profile = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + AddEntry($result, $addrs[0], $count); + } + } + return $result; +} + +# Generate cumulative profile: +# If count is charged to stack [a,b,c,d], in generated profile, +# it will be charged to [a], [b], [c], [d] +sub CumulativeProfile { + my $profile = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + foreach my $a (@addrs) { + AddEntry($result, $a, $count); + } + } + return $result; +} + +# If the second-youngest PC on the stack is always the same, returns +# that pc. Otherwise, returns undef. +sub IsSecondPcAlwaysTheSame { + my $profile = shift; + + my $second_pc = undef; + foreach my $k (keys(%{$profile})) { + my @addrs = split(/\n/, $k); + if ($#addrs < 1) { + return undef; + } + if (not defined $second_pc) { + $second_pc = $addrs[1]; + } else { + if ($second_pc ne $addrs[1]) { + return undef; + } + } + } + return $second_pc; +} + +sub ExtractSymbolLocation { + my $symbols = shift; + my $address = shift; + # 'addr2line' outputs "??:0" for unknown locations; we do the + # same to be consistent. + my $location = "??:0:unknown"; + if (exists $symbols->{$address}) { + my $file = $symbols->{$address}->[1]; + if ($file eq "?") { + $file = "??:0" + } + $location = $file . ":" . $symbols->{$address}->[0]; + } + return $location; +} + +# Extracts a graph of calls. 
+sub ExtractCalls { + my $symbols = shift; + my $profile = shift; + + my $calls = {}; + while( my ($stack_trace, $count) = each %$profile ) { + my @address = split(/\n/, $stack_trace); + my $destination = ExtractSymbolLocation($symbols, $address[0]); + AddEntry($calls, $destination, $count); + for (my $i = 1; $i <= $#address; $i++) { + my $source = ExtractSymbolLocation($symbols, $address[$i]); + my $call = "$source -> $destination"; + AddEntry($calls, $call, $count); + $destination = $source; + } + } + + return $calls; +} + +sub RemoveUninterestingFrames { + my $symbols = shift; + my $profile = shift; + + # List of function names to skip + my %skip = (); + my $skip_regexp = 'NOMATCH'; + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + foreach my $name ('calloc', + 'cfree', + 'malloc', + 'free', + 'memalign', + 'posix_memalign', + 'aligned_alloc', + 'pvalloc', + 'valloc', + 'realloc', + 'mallocx', # jemalloc + 'rallocx', # jemalloc + 'xallocx', # jemalloc + 'dallocx', # jemalloc + 'sdallocx', # jemalloc + 'tc_calloc', + 'tc_cfree', + 'tc_malloc', + 'tc_free', + 'tc_memalign', + 'tc_posix_memalign', + 'tc_pvalloc', + 'tc_valloc', + 'tc_realloc', + 'tc_new', + 'tc_delete', + 'tc_newarray', + 'tc_deletearray', + 'tc_new_nothrow', + 'tc_newarray_nothrow', + 'do_malloc', + '::do_malloc', # new name -- got moved to an unnamed ns + '::do_malloc_or_cpp_alloc', + 'DoSampledAllocation', + 'simple_alloc::allocate', + '__malloc_alloc_template::allocate', + '__builtin_delete', + '__builtin_new', + '__builtin_vec_delete', + '__builtin_vec_new', + 'operator new', + 'operator new[]', + # The entry to our memory-allocation routines on OS X + 'malloc_zone_malloc', + 'malloc_zone_calloc', + 'malloc_zone_valloc', + 'malloc_zone_realloc', + 'malloc_zone_memalign', + 'malloc_zone_free', + # These mark the beginning/end of our custom sections + '__start_google_malloc', + '__stop_google_malloc', + '__start_malloc_hook', + '__stop_malloc_hook') { + $skip{$name} = 1; + $skip{"_" . $name} = 1; # Mach (OS X) adds a _ prefix to everything + } + # TODO: Remove TCMalloc once everything has been + # moved into the tcmalloc:: namespace and we have flushed + # old code out of the system. + $skip_regexp = "TCMalloc|^tcmalloc::"; + } elsif ($main::profile_type eq 'contention') { + foreach my $vname ('base::RecordLockProfileData', + 'base::SubmitMutexProfileData', + 'base::SubmitSpinLockProfileData', + 'Mutex::Unlock', + 'Mutex::UnlockSlow', + 'Mutex::ReaderUnlock', + 'MutexLock::~MutexLock', + 'SpinLock::Unlock', + 'SpinLock::SlowUnlock', + 'SpinLockHolder::~SpinLockHolder') { + $skip{$vname} = 1; + } + } elsif ($main::profile_type eq 'cpu') { + # Drop signal handlers used for CPU profile collection + # TODO(dpeng): this should not be necessary; it's taken + # care of by the general 2nd-pc mechanism below. + foreach my $name ('ProfileData::Add', # historical + 'ProfileData::prof_handler', # historical + 'CpuProfiler::prof_handler', + '__FRAME_END__', + '__pthread_sighandler', + '__restore') { + $skip{$name} = 1; + } + } else { + # Nothing skipped for unknown types + } + + if ($main::profile_type eq 'cpu') { + # If all the second-youngest program counters are the same, + # this STRONGLY suggests that it is an artifact of measurement, + # i.e., stack frames pushed by the CPU profiler signal handler. + # Hence, we delete them. + # (The topmost PC is read from the signal structure, not from + # the stack, so it does not get involved.) 
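+    # For example (hypothetical): if every sampled stack reads
+    # "0x00400123 0x00401456 ...", with 0x00401456 always in second position
+    # (the profiler's signal-handler frame), that entry is spliced out of
+    # every trace by the loop below.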
+ while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) { + my $result = {}; + my $func = ''; + if (exists($symbols->{$second_pc})) { + $second_pc = $symbols->{$second_pc}->[0]; + } + print STDERR "Removing $second_pc from all stack traces.\n"; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + splice @addrs, 1, 1; + my $reduced_path = join("\n", @addrs); + AddEntry($result, $reduced_path, $count); + } + $profile = $result; + } + } + + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my @path = (); + foreach my $a (@addrs) { + if (exists($symbols->{$a})) { + my $func = $symbols->{$a}->[0]; + if ($skip{$func} || ($func =~ m/$skip_regexp/)) { + # Throw away the portion of the backtrace seen so far, under the + # assumption that previous frames were for functions internal to the + # allocator. + @path = (); + next; + } + } + push(@path, $a); + } + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + return $result; +} + +# Reduce profile to granularity given by user +sub ReduceProfile { + my $symbols = shift; + my $profile = shift; + my $result = {}; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + my @path = (); + my %seen = (); + $seen{''} = 1; # So that empty keys are skipped + foreach my $e (@translated) { + # To avoid double-counting due to recursion, skip a stack-trace + # entry if it has already been seen + if (!$seen{$e}) { + $seen{$e} = 1; + push(@path, $e); + } + } + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + return $result; +} + +# Does the specified symbol array match the regexp? 
+sub SymbolMatches { + my $sym = shift; + my $re = shift; + if (defined($sym)) { + for (my $i = 0; $i < $#{$sym}; $i += 3) { + if ($sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/) { + return 1; + } + } + } + return 0; +} + +# Focus only on paths involving specified regexps +sub FocusProfile { + my $symbols = shift; + my $profile = shift; + my $focus = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + foreach my $a (@addrs) { + # Reply if it matches either the address/shortname/fileline + if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) { + AddEntry($result, $k, $count); + last; + } + } + } + return $result; +} + +# Focus only on paths not involving specified regexps +sub IgnoreProfile { + my $symbols = shift; + my $profile = shift; + my $ignore = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my $matched = 0; + foreach my $a (@addrs) { + # Reply if it matches either the address/shortname/fileline + if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) { + $matched = 1; + last; + } + } + if (!$matched) { + AddEntry($result, $k, $count); + } + } + return $result; +} + +# Get total count in profile +sub TotalProfile { + my $profile = shift; + my $result = 0; + foreach my $k (keys(%{$profile})) { + $result += $profile->{$k}; + } + return $result; +} + +# Add A to B +sub AddProfile { + my $A = shift; + my $B = shift; + + my $R = {}; + # add all keys in A + foreach my $k (keys(%{$A})) { + my $v = $A->{$k}; + AddEntry($R, $k, $v); + } + # add all keys in B + foreach my $k (keys(%{$B})) { + my $v = $B->{$k}; + AddEntry($R, $k, $v); + } + return $R; +} + +# Merges symbol maps +sub MergeSymbols { + my $A = shift; + my $B = shift; + + my $R = {}; + foreach my $k (keys(%{$A})) { + $R->{$k} = $A->{$k}; + } + if (defined($B)) { + foreach my $k (keys(%{$B})) { + $R->{$k} = $B->{$k}; + } + } + return $R; +} + + +# Add A to B +sub AddPcs { + my $A = shift; + my $B = shift; + + my $R = {}; + # add all keys in A + foreach my $k (keys(%{$A})) { + $R->{$k} = 1 + } + # add all keys in B + foreach my $k (keys(%{$B})) { + $R->{$k} = 1 + } + return $R; +} + +# Subtract B from A +sub SubtractProfile { + my $A = shift; + my $B = shift; + + my $R = {}; + foreach my $k (keys(%{$A})) { + my $v = $A->{$k} - GetEntry($B, $k); + if ($v < 0 && $main::opt_drop_negative) { + $v = 0; + } + AddEntry($R, $k, $v); + } + if (!$main::opt_drop_negative) { + # Take care of when subtracted profile has more entries + foreach my $k (keys(%{$B})) { + if (!exists($A->{$k})) { + AddEntry($R, $k, 0 - $B->{$k}); + } + } + } + return $R; +} + +# Get entry from profile; zero if not present +sub GetEntry { + my $profile = shift; + my $k = shift; + if (exists($profile->{$k})) { + return $profile->{$k}; + } else { + return 0; + } +} + +# Add entry to specified profile +sub AddEntry { + my $profile = shift; + my $k = shift; + my $n = shift; + if (!exists($profile->{$k})) { + $profile->{$k} = 0; + } + $profile->{$k} += $n; +} + +# Add a stack of entries to specified profile, and add them to the $pcs +# list. 
+sub AddEntries { + my $profile = shift; + my $pcs = shift; + my $stack = shift; + my $count = shift; + my @k = (); + + foreach my $e (split(/\s+/, $stack)) { + my $pc = HexExtend($e); + $pcs->{$pc} = 1; + push @k, $pc; + } + AddEntry($profile, (join "\n", @k), $count); +} + +##### Code to profile a server dynamically ##### + +sub CheckSymbolPage { + my $url = SymbolPageURL(); + my $command = ShellEscape(@URL_FETCHER, $url); + open(SYMBOL, "$command |") or error($command); + my $line = ; + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + close(SYMBOL); + unless (defined($line)) { + error("$url doesn't exist\n"); + } + + if ($line =~ /^num_symbols:\s+(\d+)$/) { + if ($1 == 0) { + error("Stripped binary. No symbols available.\n"); + } + } else { + error("Failed to get the number of symbols from $url\n"); + } +} + +sub IsProfileURL { + my $profile_name = shift; + if (-f $profile_name) { + printf STDERR "Using local file $profile_name.\n"; + return 0; + } + return 1; +} + +sub ParseProfileURL { + my $profile_name = shift; + + if (!defined($profile_name) || $profile_name eq "") { + return (); + } + + # Split profile URL - matches all non-empty strings, so no test. + $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,; + + my $proto = $1 || "http://"; + my $hostport = $2; + my $prefix = $3; + my $profile = $4 || "/"; + + my $host = $hostport; + $host =~ s/:.*//; + + my $baseurl = "$proto$hostport$prefix"; + return ($host, $baseurl, $profile); +} + +# We fetch symbols from the first profile argument. +sub SymbolPageURL { + my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); + return "$baseURL$SYMBOL_PAGE"; +} + +sub FetchProgramName() { + my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); + my $url = "$baseURL$PROGRAM_NAME_PAGE"; + my $command_line = ShellEscape(@URL_FETCHER, $url); + open(CMDLINE, "$command_line |") or error($command_line); + my $cmdline = ; + $cmdline =~ s/\r//g; # turn windows-looking lines into unix-looking lines + close(CMDLINE); + error("Failed to get program name from $url\n") unless defined($cmdline); + $cmdline =~ s/\x00.+//; # Remove argv[1] and latters. + $cmdline =~ s!\n!!g; # Remove LFs. + return $cmdline; +} + +# Gee, curl's -L (--location) option isn't reliable at least +# with its 7.12.3 version. Curl will forget to post data if +# there is a redirection. This function is a workaround for +# curl. Redirection happens on borg hosts. +sub ResolveRedirectionForCurl { + my $url = shift; + my $command_line = ShellEscape(@URL_FETCHER, "--head", $url); + open(CMDLINE, "$command_line |") or error($command_line); + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (/^Location: (.*)/) { + $url = $1; + } + } + close(CMDLINE); + return $url; +} + +# Add a timeout flat to URL_FETCHER. Returns a new list. +sub AddFetchTimeout { + my $timeout = shift; + my @fetcher = shift; + if (defined($timeout)) { + if (join(" ", @fetcher) =~ m/\bcurl -s/) { + push(@fetcher, "--max-time", sprintf("%d", $timeout)); + } elsif (join(" ", @fetcher) =~ m/\brpcget\b/) { + push(@fetcher, sprintf("--deadline=%d", $timeout)); + } + } + return @fetcher; +} + +# Reads a symbol map from the file handle name given as $1, returning +# the resulting symbol map. Also processes variables relating to symbols. +# Currently, the only variable processed is 'binary=' which updates +# $main::prog to have the correct program name. 
+sub ReadSymbols { + my $in = shift; + my $map = {}; + while (<$in>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Removes all the leading zeroes from the symbols, see comment below. + if (m/^0x0*([0-9a-f]+)\s+(.+)/) { + $map->{$1} = $2; + } elsif (m/^---/) { + last; + } elsif (m/^([a-z][^=]*)=(.*)$/ ) { + my ($variable, $value) = ($1, $2); + for ($variable, $value) { + s/^\s+//; + s/\s+$//; + } + if ($variable eq "binary") { + if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) { + printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n", + $main::prog, $value); + } + $main::prog = $value; + } else { + printf STDERR ("Ignoring unknown variable in symbols list: " . + "'%s' = '%s'\n", $variable, $value); + } + } + } + return $map; +} + +# Fetches and processes symbols to prepare them for use in the profile output +# code. If the optional 'symbol_map' arg is not given, fetches symbols from +# $SYMBOL_PAGE for all PC values found in profile. Otherwise, the raw symbols +# are assumed to have already been fetched into 'symbol_map' and are simply +# extracted and processed. +sub FetchSymbols { + my $pcset = shift; + my $symbol_map = shift; + + my %seen = (); + my @pcs = grep { !$seen{$_}++ } keys(%$pcset); # uniq + + if (!defined($symbol_map)) { + my $post_data = join("+", sort((map {"0x" . "$_"} @pcs))); + + open(POSTFILE, ">$main::tmpfile_sym"); + print POSTFILE $post_data; + close(POSTFILE); + + my $url = SymbolPageURL(); + + my $command_line; + if (join(" ", @URL_FETCHER) =~ m/\bcurl -s/) { + $url = ResolveRedirectionForCurl($url); + $command_line = ShellEscape(@URL_FETCHER, "-d", "\@$main::tmpfile_sym", + $url); + } else { + $command_line = (ShellEscape(@URL_FETCHER, "--post", $url) + . " < " . ShellEscape($main::tmpfile_sym)); + } + # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols. + my $escaped_cppfilt = ShellEscape($obj_tool_map{"c++filt"}); + open(SYMBOL, "$command_line | $escaped_cppfilt |") or error($command_line); + $symbol_map = ReadSymbols(*SYMBOL{IO}); + close(SYMBOL); + } + + my $symbols = {}; + foreach my $pc (@pcs) { + my $fullname; + # For 64 bits binaries, symbols are extracted with 8 leading zeroes. + # Then /symbol reads the long symbols in as uint64, and outputs + # the result with a "0x%08llx" format which get rid of the zeroes. + # By removing all the leading zeroes in both $pc and the symbols from + # /symbol, the symbols match and are retrievable from the map. + my $shortpc = $pc; + $shortpc =~ s/^0*//; + # Each line may have a list of names, which includes the function + # and also other functions it has inlined. They are separated (in + # PrintSymbolizedProfile), by --, which is illegal in function names. + my $fullnames; + if (defined($symbol_map->{$shortpc})) { + $fullnames = $symbol_map->{$shortpc}; + } else { + $fullnames = "0x" . 
$pc; # Just use addresses + } + my $sym = []; + $symbols->{$pc} = $sym; + foreach my $fullname (split("--", $fullnames)) { + my $name = ShortFunctionName($fullname); + push(@{$sym}, $name, "?", $fullname); + } + } + return $symbols; +} + +sub BaseName { + my $file_name = shift; + $file_name =~ s!^.*/!!; # Remove directory name + return $file_name; +} + +sub MakeProfileBaseName { + my ($binary_name, $profile_name) = @_; + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); + my $binary_shortname = BaseName($binary_name); + return sprintf("%s.%s.%s", + $binary_shortname, $main::op_time, $host); +} + +sub FetchDynamicProfile { + my $binary_name = shift; + my $profile_name = shift; + my $fetch_name_only = shift; + my $encourage_patience = shift; + + if (!IsProfileURL($profile_name)) { + return $profile_name; + } else { + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); + if ($path eq "" || $path eq "/") { + # Missing type specifier defaults to cpu-profile + $path = $PROFILE_PAGE; + } + + my $profile_file = MakeProfileBaseName($binary_name, $profile_name); + + my $url = "$baseURL$path"; + my $fetch_timeout = undef; + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) { + if ($path =~ m/[?]/) { + $url .= "&"; + } else { + $url .= "?"; + } + $url .= sprintf("seconds=%d", $main::opt_seconds); + $fetch_timeout = $main::opt_seconds * 1.01 + 60; + } else { + # For non-CPU profiles, we add a type-extension to + # the target profile file name. + my $suffix = $path; + $suffix =~ s,/,.,g; + $profile_file .= $suffix; + } + + my $profile_dir = $ENV{"JEPROF_TMPDIR"} || ($ENV{HOME} . "/jeprof"); + if (! -d $profile_dir) { + mkdir($profile_dir) + || die("Unable to create profile directory $profile_dir: $!\n"); + } + my $tmp_profile = "$profile_dir/.tmp.$profile_file"; + my $real_profile = "$profile_dir/$profile_file"; + + if ($fetch_name_only > 0) { + return $real_profile; + } + + my @fetcher = AddFetchTimeout($fetch_timeout, @URL_FETCHER); + my $cmd = ShellEscape(@fetcher, $url) . " > " . 
ShellEscape($tmp_profile); + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){ + print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; + if ($encourage_patience) { + print STDERR "Be patient...\n"; + } + } else { + print STDERR "Fetching $path profile from $url to\n ${real_profile}\n"; + } + + (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n"); + (system("mv", $tmp_profile, $real_profile) == 0) || error("Unable to rename profile\n"); + print STDERR "Wrote profile to $real_profile\n"; + $main::collected_profile = $real_profile; + return $main::collected_profile; + } +} + +# Collect profiles in parallel +sub FetchDynamicProfiles { + my $items = scalar(@main::pfile_args); + my $levels = log($items) / log(2); + + if ($items == 1) { + $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1); + } else { + # math rounding issues + if ((2 ** $levels) < $items) { + $levels++; + } + my $count = scalar(@main::pfile_args); + for (my $i = 0; $i < $count; $i++) { + $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0); + } + print STDERR "Fetching $count profiles, Be patient...\n"; + FetchDynamicProfilesRecurse($levels, 0, 0); + $main::collected_profile = join(" \\\n ", @main::profile_files); + } +} + +# Recursively fork a process to get enough processes +# collecting profiles +sub FetchDynamicProfilesRecurse { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if (my $pid = fork()) { + $position = 0 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + wait; + } else { + $position = 1 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + cleanup(); + exit(0); + } +} + +# Collect a single profile +sub TryCollectProfile { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if ($level >= ($maxlevel - 1)) { + if ($position < scalar(@main::pfile_args)) { + FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0); + } + } else { + FetchDynamicProfilesRecurse($maxlevel, $level+1, $position); + } +} + +##### Parsing code ##### + +# Provide a small streaming-read module to handle very large +# cpu-profile files. Stream in chunks along a sliding window. +# Provides an interface to get one 'slot', correctly handling +# endian-ness differences. A slot is one 32-bit or 64-bit word +# (depending on the input profile). We tell endianness and bit-size +# for the profile by looking at the first 8 bytes: in cpu profiles, +# the second slot is always 3 (we'll accept anything that's not 0). +BEGIN { + package CpuProfileStream; + + sub new { + my ($class, $file, $fname) = @_; + my $self = { file => $file, + base => 0, + stride => 512 * 1024, # must be a multiple of bitsize/8 + slots => [], + unpack_code => "", # N for big-endian, V for little + perl_is_64bit => 1, # matters if profile is 64-bit + }; + bless $self, $class; + # Let unittests adjust the stride + if ($main::opt_test_stride > 0) { + $self->{stride} = $main::opt_test_stride; + } + # Read the first two slots to figure out bitsize and endianness. + my $slots = $self->{slots}; + my $str; + read($self->{file}, $str, 8); + # Set the global $address_length based on what we see here. + # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars). + $address_length = ($str eq (chr(0)x8)) ? 16 : 8; + if ($address_length == 8) { + if (substr($str, 6, 2) eq chr(0)x2) { + $self->{unpack_code} = 'V'; # Little-endian. 
+ } elsif (substr($str, 4, 2) eq chr(0)x2) { + $self->{unpack_code} = 'N'; # Big-endian + } else { + ::error("$fname: header size >= 2**16\n"); + } + @$slots = unpack($self->{unpack_code} . "*", $str); + } else { + # If we're a 64-bit profile, check if we're a 64-bit-capable + # perl. Otherwise, each slot will be represented as a float + # instead of an int64, losing precision and making all the + # 64-bit addresses wrong. We won't complain yet, but will + # later if we ever see a value that doesn't fit in 32 bits. + my $has_q = 0; + eval { $has_q = pack("Q", "1") ? 1 : 1; }; + if (!$has_q) { + $self->{perl_is_64bit} = 0; + } + read($self->{file}, $str, 8); + if (substr($str, 4, 4) eq chr(0)x4) { + # We'd love to use 'Q', but it's a) not universal, b) not endian-proof. + $self->{unpack_code} = 'V'; # Little-endian. + } elsif (substr($str, 0, 4) eq chr(0)x4) { + $self->{unpack_code} = 'N'; # Big-endian + } else { + ::error("$fname: header size >= 2**32\n"); + } + my @pair = unpack($self->{unpack_code} . "*", $str); + # Since we know one of the pair is 0, it's fine to just add them. + @$slots = (0, $pair[0] + $pair[1]); + } + return $self; + } + + # Load more data when we access slots->get(X) which is not yet in memory. + sub overflow { + my ($self) = @_; + my $slots = $self->{slots}; + $self->{base} += $#$slots + 1; # skip over data we're replacing + my $str; + read($self->{file}, $str, $self->{stride}); + if ($address_length == 8) { # the 32-bit case + # This is the easy case: unpack provides 32-bit unpacking primitives. + @$slots = unpack($self->{unpack_code} . "*", $str); + } else { + # We need to unpack 32 bits at a time and combine. + my @b32_values = unpack($self->{unpack_code} . "*", $str); + my @b64_values = (); + for (my $i = 0; $i < $#b32_values; $i += 2) { + # TODO(csilvers): if this is a 32-bit perl, the math below + # could end up in a too-large int, which perl will promote + # to a double, losing necessary precision. Deal with that. + # Right now, we just die. + my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]); + if ($self->{unpack_code} eq 'N') { # big-endian + ($lo, $hi) = ($hi, $lo); + } + my $value = $lo + $hi * (2**32); + if (!$self->{perl_is_64bit} && # check value is exactly represented + (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) { + ::error("Need a 64-bit perl to process this 64-bit profile.\n"); + } + push(@b64_values, $value); + } + @$slots = @b64_values; + } + } + + # Access the i-th long in the file (logically), or -1 at EOF. + sub get { + my ($self, $idx) = @_; + my $slots = $self->{slots}; + while ($#$slots >= 0) { + if ($idx < $self->{base}) { + # The only time we expect a reference to $slots[$i - something] + # after referencing $slots[$i] is reading the very first header. + # Since $stride > |header|, that shouldn't cause any lookback + # errors. And everything after the header is sequential. + print STDERR "Unexpected look-back reading CPU profile"; + return -1; # shrug, don't know what better to return + } elsif ($idx > $self->{base} + $#$slots) { + $self->overflow(); + } else { + return $slots->[$idx - $self->{base}]; + } + } + # If we get here, $slots is [], which means we've reached EOF + return -1; # unique since slots is supposed to hold unsigned numbers + } +} + +# Reads the top, 'header' section of a profile, and returns the last +# line of the header, commonly called a 'header line'. 
The header +# section of a profile consists of zero or more 'command' lines that +# are instructions to jeprof, which jeprof executes when reading the +# header. All 'command' lines start with a %. After the command +# lines is the 'header line', which is a profile-specific line that +# indicates what type of profile it is, and perhaps other global +# information about the profile. For instance, here's a header line +# for a heap profile: +# heap profile: 53: 38236 [ 5525: 1284029] @ heapprofile +# For historical reasons, the CPU profile does not contain a text- +# readable header line. If the profile looks like a CPU profile, +# this function returns "". If no header line could be found, this +# function returns undef. +# +# The following commands are recognized: +# %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:' +# +# The input file should be in binmode. +sub ReadProfileHeader { + local *PROFILE = shift; + my $firstchar = ""; + my $line = ""; + read(PROFILE, $firstchar, 1); + seek(PROFILE, -1, 1); # unread the firstchar + if ($firstchar !~ /[[:print:]]/) { # is not a text character + return ""; + } + while (defined($line = )) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /^%warn\s+(.*)/) { # 'warn' command + # Note this matches both '%warn blah\n' and '%warn\n'. + print STDERR "WARNING: $1\n"; # print the rest of the line + } elsif ($line =~ /^%/) { + print STDERR "Ignoring unknown command from profile header: $line"; + } else { + # End of commands, must be the header line. + return $line; + } + } + return undef; # got to EOF without seeing a header line +} + +sub IsSymbolizedProfileFile { + my $file_name = shift; + if (!(-e $file_name) || !(-r $file_name)) { + return 0; + } + # Check if the file contains a symbol-section marker. + open(TFILE, "<$file_name"); + binmode TFILE; + my $firstline = ReadProfileHeader(*TFILE); + close(TFILE); + if (!$firstline) { + return 0; + } + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + return $firstline =~ /^--- *$symbol_marker/; +} + +# Parse profile generated by common/profiler.cc and return a reference +# to a map: +# $result->{version} Version number of profile file +# $result->{period} Sampling period (in microseconds) +# $result->{profile} Profile object +# $result->{threads} Map of thread IDs to profile objects +# $result->{map} Memory map info from profile +# $result->{pcs} Hash of all PC values seen, key is hex address +sub ReadProfile { + my $prog = shift; + my $fname = shift; + my $result; # return value + + $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $contention_marker = $&; + $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $growth_marker = $&; + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $profile_marker = $&; + + # Look at first line to see if it is a heap or a CPU profile. + # CPU profile may start with no header at all, and just binary data + # (starting with \0\0\0\0) -- in that case, don't try to read the + # whole firstline, since it may be gigabytes(!) of data. 
+ open(PROFILE, "<$fname") || error("$fname: $!\n"); + binmode PROFILE; # New perls do UTF-8 processing + my $header = ReadProfileHeader(*PROFILE); + if (!defined($header)) { # means "at EOF" + error("Profile is empty.\n"); + } + + my $symbols; + if ($header =~ m/^--- *$symbol_marker/o) { + # Verify that the user asked for a symbolized profile + if (!$main::use_symbolized_profile) { + # we have both a binary and symbolized profiles, abort + error("FATAL ERROR: Symbolized profile\n $fname\ncannot be used with " . + "a binary arg. Try again without passing\n $prog\n"); + } + # Read the symbol section of the symbolized profile file. + $symbols = ReadSymbols(*PROFILE{IO}); + # Read the next line to get the header for the remaining profile. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + $main::profile_type = ''; + if ($header =~ m/^heap profile:.*$growth_marker/o) { + $main::profile_type = 'growth'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap profile:/) { + $main::profile_type = 'heap'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap/) { + $main::profile_type = 'heap'; + $result = ReadThreadedHeapProfile($prog, $fname, $header); + } elsif ($header =~ m/^--- *$contention_marker/o) { + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *Stacks:/) { + print STDERR + "Old format contention profile: mistakenly reports " . + "condition variable signals as lock contentions.\n"; + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *$profile_marker/) { + # the binary cpu profile data starts immediately after this line + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } else { + if (defined($symbols)) { + # a symbolized profile contains a format we don't recognize, bail out + error("$fname: Cannot recognize profile section after symbols.\n"); + } + # no ascii header present -- must be a CPU profile + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } + + close(PROFILE); + + # if we got symbols along with the profile, return those as well + if (defined($symbols)) { + $result->{symbols} = $symbols; + } + + return $result; +} + +# Subtract one from caller pc so we map back to call instr. +# However, don't do this if we're reading a symbolized profile +# file, in which case the subtract-one was done when the file +# was written. +# +# We apply the same logic to all readers, though ReadCPUProfile uses an +# independent implementation. +sub FixCallerAddresses { + my $stack = shift; + if ($main::use_symbolized_profile) { + return $stack; + } else { + $stack =~ /(\s)/; + my $delimiter = $1; + my @addrs = split(' ', $stack); + my @fixedaddrs; + $#fixedaddrs = $#addrs; + if ($#addrs >= 0) { + $fixedaddrs[0] = $addrs[0]; + } + for (my $i = 1; $i <= $#addrs; $i++) { + $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1"); + } + return join $delimiter, @fixedaddrs; + } +} + +# CPU profile reader +sub ReadCPUProfile { + my $prog = shift; + my $fname = shift; # just used for logging + local *PROFILE = shift; + my $version; + my $period; + my $i; + my $profile = {}; + my $pcs = {}; + + # Parse string into array of slots. + my $slots = CpuProfileStream->new(*PROFILE, $fname); + + # Read header. 
The current header version is a 5-element structure + # containing: + # 0: header count (always 0) + # 1: header "words" (after this one: 3) + # 2: format version (0) + # 3: sampling period (usec) + # 4: unused padding (always 0) + if ($slots->get(0) != 0 ) { + error("$fname: not a profile file, or old format profile file\n"); + } + $i = 2 + $slots->get(1); + $version = $slots->get(2); + $period = $slots->get(3); + # Do some sanity checking on these header values. + if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) { + error("$fname: not a profile file, or corrupted profile file\n"); + } + + # Parse profile + while ($slots->get($i) != -1) { + my $n = $slots->get($i++); + my $d = $slots->get($i++); + if ($d > (2**16)) { # TODO(csilvers): what's a reasonable max-stack-depth? + my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8)); + print STDERR "At index $i (address $addr):\n"; + error("$fname: stack trace depth >= 2**32\n"); + } + if ($slots->get($i) == 0) { + # End of profile data marker + $i += $d; + last; + } + + # Make key out of the stack entries + my @k = (); + for (my $j = 0; $j < $d; $j++) { + my $pc = $slots->get($i+$j); + # Subtract one from caller pc so we map back to call instr. + # However, don't do this if we're reading a symbolized profile + # file, in which case the subtract-one was done when the file + # was written. + if ($j > 0 && !$main::use_symbolized_profile) { + $pc--; + } + $pc = sprintf("%0*x", $address_length, $pc); + $pcs->{$pc} = 1; + push @k, $pc; + } + + AddEntry($profile, (join "\n", @k), $n); + $i += $d; + } + + # Parse map + my $map = ''; + seek(PROFILE, $i * 4, 0); + read(PROFILE, $map, (stat PROFILE)[7]); + + my $r = {}; + $r->{version} = $version; + $r->{period} = $period; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + + return $r; +} + +sub HeapProfileIndex { + my $index = 1; + if ($main::opt_inuse_space) { + $index = 1; + } elsif ($main::opt_inuse_objects) { + $index = 0; + } elsif ($main::opt_alloc_space) { + $index = 3; + } elsif ($main::opt_alloc_objects) { + $index = 2; + } + return $index; +} + +sub ReadMappedLibraries { + my $fh = shift; + my $map = ""; + # Read the /proc/self/maps data + while (<$fh>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + $map .= $_; + } + return $map; +} + +sub ReadMemoryMap { + my $fh = shift; + my $map = ""; + # Read /proc/self/maps data as formatted by DumpAddressMap() + my $buildvar = ""; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Parse "build=" specification if supplied + if (m/^\s*build=(.*)\n/) { + $buildvar = $1; + } + + # Expand "$build" variable if available + $_ =~ s/\$build\b/$buildvar/g; + + $map .= $_; + } + return $map; +} + +sub AdjustSamples { + my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_; + if ($sample_adjustment) { + if ($sampling_algorithm == 2) { + # Remote-heap version 2 + # The sampling frequency is the rate of a Poisson process. 
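A parenthetical worked example (not part of the patch, with invented numbers): if the sampling rate is 512 KiB and the sampled allocations average 64 KiB, each sample stands for roughly 8.5 real allocations, which is the scale factor the code just below applies.

    use strict;
    use warnings;

    my $rate    = 512 * 1024;              # assumed sampling rate Y, in bytes
    my ($n, $s) = (10, 10 * 64 * 1024);    # assumed sampled count and bytes
    my $ratio   = ($s / $n) / $rate;       # X/Y for the average sampled size
    my $scale   = 1 / (1 - exp(-$ratio));  # ~8.51
    printf "estimated true totals: %.1f objects, %.0f bytes\n",
           $n * $scale, $s * $scale;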
+ # This means that the probability of sampling an allocation of + # size X with sampling rate Y is 1 - exp(-X/Y) + if ($n1 != 0) { + my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n1 *= $scale_factor; + $s1 *= $scale_factor; + } + if ($n2 != 0) { + my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n2 *= $scale_factor; + $s2 *= $scale_factor; + } + } else { + # Remote-heap version 1 + my $ratio; + $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + if ($ratio < 1) { + $n1 /= $ratio; + $s1 /= $ratio; + } + $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + if ($ratio < 1) { + $n2 /= $ratio; + $s2 /= $ratio; + } + } + } + return ($n1, $s1, $n2, $s2); +} + +sub ReadHeapProfile { + my $prog = shift; + local *PROFILE = shift; + my $header = shift; + + my $index = HeapProfileIndex(); + + # Find the type of this profile. The header line looks like: + # heap profile: 1246: 8800744 [ 1246: 8800744] @ /266053 + # There are two pairs , the first inuse objects/space, and the + # second allocated objects/space. This is followed optionally by a profile + # type, and if that is present, optionally by a sampling frequency. + # For remote heap profiles (v1): + # The interpretation of the sampling frequency is that the profiler, for + # each sample, calculates a uniformly distributed random integer less than + # the given value, and records the next sample after that many bytes have + # been allocated. Therefore, the expected sample interval is half of the + # given frequency. By default, if not specified, the expected sample + # interval is 128KB. Only remote-heap-page profiles are adjusted for + # sample size. + # For remote heap profiles (v2): + # The sampling frequency is the rate of a Poisson process. This means that + # the probability of sampling an allocation of size X with sampling rate Y + # is 1 - exp(-X/Y) + # For version 2, a typical header line might look like this: + # heap profile: 1922: 127792360 [ 1922: 127792360] @ _v2/524288 + # the trailing number (524288) is the sampling rate. (Version 1 showed + # double the 'rate' here) + my $sampling_algorithm = 0; + my $sample_adjustment = 0; + chomp($header); + my $type = "unknown"; + if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") { + if (defined($6) && ($6 ne '')) { + $type = $6; + my $sample_period = $8; + # $type is "heapprofile" for profiles generated by the + # heap-profiler, and either "heap" or "heap_v2" for profiles + # generated by sampling directly within tcmalloc. It can also + # be "growth" for heap-growth profiles. The first is typically + # found for profiles generated locally, and the others for + # remote profiles. + if (($type eq "heapprofile") || ($type !~ /heap/) ) { + # No need to adjust for the sampling rate with heap-profiler-derived data + $sampling_algorithm = 0; + } elsif ($type =~ /_v2/) { + $sampling_algorithm = 2; # version 2 sampling + if (defined($sample_period) && ($sample_period ne '')) { + $sample_adjustment = int($sample_period); + } + } else { + $sampling_algorithm = 1; # version 1 sampling + if (defined($sample_period) && ($sample_period ne '')) { + $sample_adjustment = int($sample_period)/2; + } + } + } else { + # We detect whether or not this is a remote-heap profile by checking + # that the total-allocated stats ($n2,$s2) are exactly the + # same as the in-use stats ($n1,$s1). 
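Stepping back to the header regex at the top of this branch, here is a quick sketch (not part of the patch) of what it extracts from the v2 example line quoted earlier.

    use strict;
    use warnings;

    my $hdr = "heap profile:   1922: 127792360 [  1922: 127792360] \@ heap_v2/524288";
    if ($hdr =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*\@\s*([^/]*)(/(\d+))?)?") {
        print "in use:    $1 objects, $2 bytes\n";
        print "allocated: $3 objects, $4 bytes\n";
        print "type: $6, sample period: $8\n";   # "heap_v2" and 524288
    }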
It is remotely conceivable + # that a non-remote-heap profile may pass this check, but it is hard + # to imagine how that could happen. + # In this case it's so old it's guaranteed to be remote-heap version 1. + my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); + if (($n1 == $n2) && ($s1 == $s2)) { + # This is likely to be a remote-heap based sample profile + $sampling_algorithm = 1; + } + } + } + + if ($sampling_algorithm > 0) { + # For remote-heap generated profiles, adjust the counts and sizes to + # account for the sample rate (we sample once every 128KB by default). + if ($sample_adjustment == 0) { + # Turn on profile adjustment. + $sample_adjustment = 128*1024; + print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n"; + } else { + printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n", + $sample_adjustment); + } + if ($sampling_algorithm > 1) { + # We don't bother printing anything for the original version (version 1) + printf STDERR "Heap version $sampling_algorithm\n"; + } + } + + my $profile = {}; + my $pcs = {}; + my $map = ""; + + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (/^MAPPED_LIBRARIES:/) { + $map .= ReadMappedLibraries(*PROFILE); + last; + } + + if (/^--- Memory map:/) { + $map .= ReadMemoryMap(*PROFILE); + last; + } + + # Read entry of the form: + # : [: ] @ a1 a2 a3 ... an + s/^\s*//; + s/\s*$//; + if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) { + my $stack = $5; + my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); + my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, + $n1, $s1, $n2, $s2); + AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); + } + } + + my $r = {}; + $r->{version} = "heap"; + $r->{period} = 1; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +sub ReadThreadedHeapProfile { + my ($prog, $fname, $header) = @_; + + my $index = HeapProfileIndex(); + my $sampling_algorithm = 0; + my $sample_adjustment = 0; + chomp($header); + my $type = "unknown"; + # Assuming a very specific type of header for now. + if ($header =~ m"^heap_v2/(\d+)") { + $type = "_v2"; + $sampling_algorithm = 2; + $sample_adjustment = int($1); + } + if ($type ne "_v2" || !defined($sample_adjustment)) { + die "Threaded heap profiles require v2 sampling with a sample rate\n"; + } + + my $profile = {}; + my $thread_profiles = {}; + my $pcs = {}; + my $map = ""; + my $stack = ""; + + while () { + s/\r//g; + if (/^MAPPED_LIBRARIES:/) { + $map .= ReadMappedLibraries(*PROFILE); + last; + } + + if (/^--- Memory map:/) { + $map .= ReadMemoryMap(*PROFILE); + last; + } + + # Read entry of the form: + # @ a1 a2 ... an + # t*: : [: ] + # t1: : [: ] + # ... + # tn: : [: ] + s/^\s*//; + s/\s*$//; + if (m/^@\s+(.*)$/) { + $stack = $1; + } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) { + if ($stack eq "") { + # Still in the header, so this is just a per-thread summary. 
+ next; + } + my $thread = $2; + my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6); + my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, + $n1, $s1, $n2, $s2); + if ($thread eq "*") { + AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); + } else { + if (!exists($thread_profiles->{$thread})) { + $thread_profiles->{$thread} = {}; + } + AddEntries($thread_profiles->{$thread}, $pcs, + FixCallerAddresses($stack), $counts[$index]); + } + } + } + + my $r = {}; + $r->{version} = "heap"; + $r->{period} = 1; + $r->{profile} = $profile; + $r->{threads} = $thread_profiles; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +sub ReadSynchProfile { + my $prog = shift; + local *PROFILE = shift; + my $header = shift; + + my $map = ''; + my $profile = {}; + my $pcs = {}; + my $sampling_period = 1; + my $cyclespernanosec = 2.8; # Default assumption for old binaries + my $seen_clockrate = 0; + my $line; + + my $index = 0; + if ($main::opt_total_delay) { + $index = 0; + } elsif ($main::opt_contentions) { + $index = 1; + } elsif ($main::opt_mean_delay) { + $index = 2; + } + + while ( $line = ) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) { + my ($cycles, $count, $stack) = ($1, $2, $3); + + # Convert cycles to nanoseconds + $cycles /= $cyclespernanosec; + + # Adjust for sampling done by application + $cycles *= $sampling_period; + $count *= $sampling_period; + + my @values = ($cycles, $count, $cycles / $count); + AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]); + + } elsif ( $line =~ /^(slow release).*thread \d+ \@\s*(.*?)\s*$/ || + $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) { + my ($cycles, $stack) = ($1, $2); + if ($cycles !~ /^\d+$/) { + next; + } + + # Convert cycles to nanoseconds + $cycles /= $cyclespernanosec; + + # Adjust for sampling done by application + $cycles *= $sampling_period; + + AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles); + + } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) { + my ($variable, $value) = ($1,$2); + for ($variable, $value) { + s/^\s+//; + s/\s+$//; + } + if ($variable eq "cycles/second") { + $cyclespernanosec = $value / 1e9; + $seen_clockrate = 1; + } elsif ($variable eq "sampling period") { + $sampling_period = $value; + } elsif ($variable eq "ms since reset") { + # Currently nothing is done with this value in jeprof + # So we just silently ignore it for now + } elsif ($variable eq "discarded samples") { + # Currently nothing is done with this value in jeprof + # So we just silently ignore it for now + } else { + printf STDERR ("Ignoring unnknown variable in /contention output: " . + "'%s' = '%s'\n",$variable,$value); + } + } else { + # Memory map entry + $map .= $line; + } + } + + if (!$seen_clockrate) { + printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n", + $cyclespernanosec); + } + + my $r = {}; + $r->{version} = 0; + $r->{period} = $sampling_period; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +# Given a hex value in the form "0x1abcd" or "1abcd", return either +# "0001abcd" or "000000000001abcd", depending on the current (global) +# address length. 
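As a quick illustration of the padding just described (a sketch, not part of the patch; the address and length are arbitrary):

    use strict;
    use warnings;

    my $address_length = 16;     # assumed: analyzing a 64-bit binary
    my $addr = "0x1abcd";
    $addr =~ s/^(0x)?0*//;       # strip the "0x" prefix and leading zeros
    printf "%s\n", ("0" x ($address_length - length($addr))) . $addr;   # 000000000001abcd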
+sub HexExtend { + my $addr = shift; + + $addr =~ s/^(0x)?0*//; + my $zeros_needed = $address_length - length($addr); + if ($zeros_needed < 0) { + printf STDERR "Warning: address $addr is longer than address length $address_length\n"; + return $addr; + } + return ("0" x $zeros_needed) . $addr; +} + +##### Symbol extraction ##### + +# Aggressively search the lib_prefix values for the given library +# If all else fails, just return the name of the library unmodified. +# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so" +# it will search the following locations in this order, until it finds a file: +# /my/path/lib/dir/mylib.so +# /other/path/lib/dir/mylib.so +# /my/path/dir/mylib.so +# /other/path/dir/mylib.so +# /my/path/mylib.so +# /other/path/mylib.so +# /lib/dir/mylib.so (returned as last resort) +sub FindLibrary { + my $file = shift; + my $suffix = $file; + + # Search for the library as described above + do { + foreach my $prefix (@prefix_list) { + my $fullpath = $prefix . $suffix; + if (-e $fullpath) { + return $fullpath; + } + } + } while ($suffix =~ s|^/[^/]+/|/|); + return $file; +} + +# Return path to library with debugging symbols. +# For libc libraries, the copy in /usr/lib/debug contains debugging symbols +sub DebuggingLibrary { + my $file = shift; + if ($file =~ m|^/|) { + if (-f "/usr/lib/debug$file") { + return "/usr/lib/debug$file"; + } elsif (-f "/usr/lib/debug$file.debug") { + return "/usr/lib/debug$file.debug"; + } + } + return undef; +} + +# Parse text section header of a library using objdump +sub ParseTextSectionHeaderFromObjdump { + my $lib = shift; + + my $size = undef; + my $vma; + my $file_offset; + # Get objdump output from the library file to figure out how to + # map between mapped addresses and addresses in the library. + my $cmd = ShellEscape($obj_tool_map{"objdump"}, "-h", $lib); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Idx Name Size VMA LMA File off Algn + # 10 .text 00104b2c 420156f0 420156f0 000156f0 2**4 + # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file + # offset may still be 8. But AddressSub below will still handle that. + my @x = split; + if (($#x >= 6) && ($x[1] eq '.text')) { + $size = $x[2]; + $vma = $x[3]; + $file_offset = $x[5]; + last; + } + } + close(OBJDUMP); + + if (!defined($size)) { + return undef; + } + + my $r = {}; + $r->{size} = $size; + $r->{vma} = $vma; + $r->{file_offset} = $file_offset; + + return $r; +} + +# Parse text section header of a library using otool (on OS X) +sub ParseTextSectionHeaderFromOtool { + my $lib = shift; + + my $size = undef; + my $vma = undef; + my $file_offset = undef; + # Get otool output from the library file to figure out how to + # map between mapped addresses and addresses in the library. + my $command = ShellEscape($obj_tool_map{"otool"}, "-l", $lib); + open(OTOOL, "$command |") || error("$command: $!\n"); + my $cmd = ""; + my $sectname = ""; + my $segname = ""; + foreach my $line () { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + # Load command <#> + # cmd LC_SEGMENT + # [...] + # Section + # sectname __text + # segname __TEXT + # addr 0x000009f8 + # size 0x00018b9e + # offset 2552 + # align 2^2 (4) + # We will need to strip off the leading 0x from the hex addresses, + # and convert the offset into hex. 
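Before the otool parsing below, a small sketch (not part of the patch) of how the objdump-based variant above picks the .text fields out of a section-header row; the row is the example quoted in its comment.

    use strict;
    use warnings;

    my $row = " 10 .text         00104b2c  420156f0  420156f0  000156f0  2**4";
    my @x = split(' ', $row);
    if ($#x >= 6 && $x[1] eq '.text') {
        printf "size=%s vma=%s file_offset=%s\n", $x[2], $x[3], $x[5];
    }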
+ if ($line =~ /Load command/) { + $cmd = ""; + $sectname = ""; + $segname = ""; + } elsif ($line =~ /Section/) { + $sectname = ""; + $segname = ""; + } elsif ($line =~ /cmd (\w+)/) { + $cmd = $1; + } elsif ($line =~ /sectname (\w+)/) { + $sectname = $1; + } elsif ($line =~ /segname (\w+)/) { + $segname = $1; + } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") && + $sectname eq "__text" && + $segname eq "__TEXT")) { + next; + } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) { + $vma = $1; + } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) { + $size = $1; + } elsif ($line =~ /\boffset ([0-9]+)/) { + $file_offset = sprintf("%016x", $1); + } + if (defined($vma) && defined($size) && defined($file_offset)) { + last; + } + } + close(OTOOL); + + if (!defined($vma) || !defined($size) || !defined($file_offset)) { + return undef; + } + + my $r = {}; + $r->{size} = $size; + $r->{vma} = $vma; + $r->{file_offset} = $file_offset; + + return $r; +} + +sub ParseTextSectionHeader { + # obj_tool_map("otool") is only defined if we're in a Mach-O environment + if (defined($obj_tool_map{"otool"})) { + my $r = ParseTextSectionHeaderFromOtool(@_); + if (defined($r)){ + return $r; + } + } + # If otool doesn't work, or we don't have it, fall back to objdump + return ParseTextSectionHeaderFromObjdump(@_); +} + +# Split /proc/pid/maps dump into a list of libraries +sub ParseLibraries { + return if $main::use_symbol_page; # We don't need libraries info. + my $prog = shift; + my $map = shift; + my $pcs = shift; + + my $result = []; + my $h = "[a-f0-9]+"; + my $zero_offset = HexExtend("0"); + + my $buildvar = ""; + foreach my $l (split("\n", $map)) { + if ($l =~ m/^\s*build=(.*)$/) { + $buildvar = $1; + } + + my $start; + my $finish; + my $offset; + my $lib; + if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) { + # Full line from /proc/self/maps. Example: + # 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) { + # Cooked line from DumpAddressMap. Example: + # 40000000-40015000: /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = $3; + } + # FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in + # function procfs_doprocmap (sys/fs/procfs/procfs_map.c) + # + # Example: + # 0x800600000 0x80061a000 26 0 0xfffff800035a0000 r-x 75 33 0x1004 COW NC vnode /libexec/ld-elf.s + # o.1 NCH -1 + elsif ($l =~ /^(0x$h)\s(0x$h)\s\d+\s\d+\s0x$h\sr-x\s\d+\s\d+\s0x\d+\s(COW|NCO)\s(NC|NNC)\svnode\s(\S+\.so(\.\d+)*)/) { + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = FindLibrary($5); + + } else { + next; + } + + # Expand "$build" variable if available + $lib =~ s/\$build\b/$buildvar/g; + + $lib = FindLibrary($lib); + + # Check for pre-relocated libraries, which use pre-relocated symbol tables + # and thus require adjusting the offset that we'll use to translate + # VM addresses into symbol table addresses. + # Only do this if we're not going to fetch the symbol table from a + # debugging copy of the library. 
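As a brief aside, a standalone sketch (not part of the patch) of the full /proc/self/maps form matched earlier in this loop, using the example line from the comment.

    use strict;
    use warnings;

    my $h = "[a-f0-9]+";
    my $l = "40000000-40015000 r-xp 00000000 03:01 12845071   /lib/ld-2.3.2.so";
    if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) {
        printf "start=%s finish=%s offset=%s lib=%s\n", $1, $2, $3, $4;
    }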
+ if (!DebuggingLibrary($lib)) { + my $text = ParseTextSectionHeader($lib); + if (defined($text)) { + my $vma_offset = AddressSub($text->{vma}, $text->{file_offset}); + $offset = AddressAdd($offset, $vma_offset); + } + } + + if($main::opt_debug) { printf STDERR "$start:$finish ($offset) $lib\n"; } + push(@{$result}, [$lib, $start, $finish, $offset]); + } + + # Append special entry for additional library (not relocated) + if ($main::opt_lib ne "") { + my $text = ParseTextSectionHeader($main::opt_lib); + if (defined($text)) { + my $start = $text->{vma}; + my $finish = AddressAdd($start, $text->{size}); + + push(@{$result}, [$main::opt_lib, $start, $finish, $start]); + } + } + + # Append special entry for the main program. This covers + # 0..max_pc_value_seen, so that we assume pc values not found in one + # of the library ranges will be treated as coming from the main + # program binary. + my $min_pc = HexExtend("0"); + my $max_pc = $min_pc; # find the maximal PC value in any sample + foreach my $pc (keys(%{$pcs})) { + if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); } + } + push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]); + + return $result; +} + +# Add two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressAdd { + my $addr1 = shift; + my $addr2 = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. + + if ($main::opt_debug and $main::opt_test) { + print STDERR "AddressAdd $addr1 + $addr2 = "; + } + + my $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + my $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2); + my $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + my $r = sprintf("%07x", $sum); + + $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2) + $c; + $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + $r = sprintf("%07x", $sum) . $r; + + $sum = hex($addr1) + hex($addr2) + $c; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; } + + return $r; + } +} + + +# Subtract two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressSub { + my $addr1 = shift; + my $addr2 = shift; + my $diff; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $diff); + + } else { + # Do the addition in 7-nibble chunks to trivialize borrow handling. + # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; } + + my $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + my $a2 = hex(substr($addr2,-7)); + $addr2 = substr($addr2,0,-7); + my $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + my $r = sprintf("%07x", $diff); + + $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + $a2 = hex(substr($addr2,-7)) + $b; + $addr2 = substr($addr2,0,-7); + $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + $r = sprintf("%07x", $diff) . 
$r; + + $a1 = hex($addr1); + $a2 = hex($addr2) + $b; + if ($a2 > $a1) { $a1 += 0x100; } + $diff = $a1 - $a2; + $r = sprintf("%02x", $diff) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + + return $r; + } +} + +# Increment a hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressInc { + my $addr = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr)+1) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. + # We are always doing this to step through the addresses in a function, + # and will almost never overflow the first chunk, so we check for this + # case and exit early. + + # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; } + + my $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + my $r = sprintf("%07x", $sum); + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "0000000"; + } + + $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + $r = sprintf("%07x", $sum) . $r; + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "00000000000000"; + } + + $sum = hex($addr) + 1; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + return $r; + } +} + +# Extract symbols for all PC values found in profile +sub ExtractSymbols { + my $libs = shift; + my $pcset = shift; + + my $symbols = {}; + + # Map each PC value to the containing library. To make this faster, + # we sort libraries by their starting pc value (highest first), and + # advance through the libraries as we advance the pc. Sometimes the + # addresses of libraries may overlap with the addresses of the main + # binary, so to make sure the libraries 'win', we iterate over the + # libraries in reverse order (which assumes the binary doesn't start + # in the middle of a library, which seems a fair assumption). + my @pcs = (sort { $a cmp $b } keys(%{$pcset})); # pcset is 0-extended strings + foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { + my $libname = $lib->[0]; + my $start = $lib->[1]; + my $finish = $lib->[2]; + my $offset = $lib->[3]; + + # Use debug library if it exists + my $debug_libname = DebuggingLibrary($libname); + if ($debug_libname) { + $libname = $debug_libname; + } + + # Get list of pcs that belong in this library. + my $contained = []; + my ($start_pc_index, $finish_pc_index); + # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index]. + for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; + $finish_pc_index--) { + last if $pcs[$finish_pc_index - 1] le $finish; + } + # Find smallest start_pc_index such that $start <= $pc[$start_pc_index]. + for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; + $start_pc_index--) { + last if $pcs[$start_pc_index - 1] lt $start; + } + # This keeps PC values higher than $pc[$finish_pc_index] in @pcs, + # in case there are overlaps in libraries and the main binary. 
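A simplified worked sketch (not part of the patch) of the chunked carry handling used by AddressAdd/AddressSub above: every chunk stays small enough for exact native arithmetic, and the carry is propagated by hand. The real routines split 16 nibbles as 2+7+7; this sketch uses 9+7 for brevity, and the inputs are arbitrary.

    use strict;
    use warnings;

    my ($x, $y) = ("00000000ffffffff", "0000000000000001");   # assumed inputs
    my $lo    = hex(substr($x, -7)) + hex(substr($y, -7));     # low 7 nibbles
    my $carry = $lo > 0xfffffff ? 1 : 0;
    $lo      -= 0x10000000 if $carry;
    my $hi    = hex(substr($x, 0, 9)) + hex(substr($y, 0, 9)) + $carry;
    printf "%09x%07x\n", $hi, $lo;                             # 0000000100000000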
+ @{$contained} = splice(@pcs, $start_pc_index, + $finish_pc_index - $start_pc_index); + # Map to symbols + MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); + } + + return $symbols; +} + +# Map list of PC values to symbols for a given image +sub MapToSymbols { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + my $debug = 0; + + # Ignore empty binaries + if ($#{$pclist} < 0) { return; } + + # Figure out the addr2line command to use + my $addr2line = $obj_tool_map{"addr2line"}; + my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image); + if (exists $obj_tool_map{"addr2line_pdb"}) { + $addr2line = $obj_tool_map{"addr2line_pdb"}; + $cmd = ShellEscape($addr2line, "--demangle", "-f", "-C", "-e", $image); + } + + # If "addr2line" isn't installed on the system at all, just use + # nm to get what info we can (function names, but not line numbers). + if (system(ShellEscape($addr2line, "--help") . " >$dev_null 2>&1") != 0) { + MapSymbolsWithNM($image, $offset, $pclist, $symbols); + return; + } + + # "addr2line -i" can produce a variable number of lines per input + # address, with no separator that allows us to tell when data for + # the next address starts. So we find the address for a special + # symbol (_fini) and interleave this address between all real + # addresses passed to addr2line. The name of this special symbol + # can then be used as a separator. + $sep_address = undef; # May be filled in by MapSymbolsWithNM() + my $nm_symbols = {}; + MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols); + if (defined($sep_address)) { + # Only add " -i" to addr2line if the binary supports it. + # addr2line --help returns 0, but not if it sees an unknown flag first. + if (system("$cmd -i --help >$dev_null 2>&1") == 0) { + $cmd .= " -i"; + } else { + $sep_address = undef; # no need for sep_address if we don't support -i + } + } + + # Make file with all PC values with intervening 'sep_address' so + # that we can reliably detect the end of inlined function list + open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n"); + if ($debug) { print("---- $image ---\n"); } + for (my $i = 0; $i <= $#{$pclist}; $i++) { + # addr2line always reads hex addresses, and does not need '0x' prefix. + if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); } + printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset)); + if (defined($sep_address)) { + printf ADDRESSES ("%s\n", $sep_address); + } + } + close(ADDRESSES); + if ($debug) { + print("----\n"); + system("cat", $main::tmpfile_sym); + print("----\n"); + system("$cmd < " . ShellEscape($main::tmpfile_sym)); + print("----\n"); + } + + open(SYMBOLS, "$cmd <" . ShellEscape($main::tmpfile_sym) . " |") + || error("$cmd: $!\n"); + my $count = 0; # Index in pclist + while () { + # Read fullfunction and filelineinfo from next pair of lines + s/\r?\n$//g; + my $fullfunction = $_; + $_ = ; + s/\r?\n$//g; + my $filelinenum = $_; + + if (defined($sep_address) && $fullfunction eq $sep_symbol) { + # Terminating marker for data for this address + $count++; + next; + } + + $filelinenum =~ s|\\|/|g; # turn windows-style paths into unix-style paths + + my $pcstr = $pclist->[$count]; + my $function = ShortFunctionName($fullfunction); + my $nms = $nm_symbols->{$pcstr}; + if (defined($nms)) { + if ($fullfunction eq '??') { + # nm found a symbol for us. 
+ $function = $nms->[0]; + $fullfunction = $nms->[2]; + } else { + # MapSymbolsWithNM tags each routine with its starting address, + # useful in case the image has multiple occurrences of this + # routine. (It uses a syntax that resembles template paramters, + # that are automatically stripped out by ShortFunctionName().) + # addr2line does not provide the same information. So we check + # if nm disambiguated our symbol, and if so take the annotated + # (nm) version of the routine-name. TODO(csilvers): this won't + # catch overloaded, inlined symbols, which nm doesn't see. + # Better would be to do a check similar to nm's, in this fn. + if ($nms->[2] =~ m/^\Q$function\E/) { # sanity check it's the right fn + $function = $nms->[0]; + $fullfunction = $nms->[2]; + } + } + } + + # Prepend to accumulated symbols for pcstr + # (so that caller comes before callee) + my $sym = $symbols->{$pcstr}; + if (!defined($sym)) { + $sym = []; + $symbols->{$pcstr} = $sym; + } + unshift(@{$sym}, $function, $filelinenum, $fullfunction); + if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); } + if (!defined($sep_address)) { + # Inlining is off, so this entry ends immediately + $count++; + } + } + close(SYMBOLS); +} + +# Use nm to map the list of referenced PCs to symbols. Return true iff we +# are able to read procedure information via nm. +sub MapSymbolsWithNM { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + # Get nm output sorted by increasing address + my $symbol_table = GetProcedureBoundaries($image, "."); + if (!%{$symbol_table}) { + return 0; + } + # Start addresses are already the right length (8 or 16 hex digits). + my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] } + keys(%{$symbol_table}); + + if ($#names < 0) { + # No symbols: just use addresses + foreach my $pc (@{$pclist}) { + my $pcstr = "0x" . $pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; + } + return 0; + } + + # Sort addresses so we can do a join against nm output + my $index = 0; + my $fullname = $names[0]; + my $name = ShortFunctionName($fullname); + foreach my $pc (sort { $a cmp $b } @{$pclist}) { + # Adjust for mapped offset + my $mpc = AddressSub($pc, $offset); + while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){ + $index++; + $fullname = $names[$index]; + $name = ShortFunctionName($fullname); + } + if ($mpc lt $symbol_table->{$fullname}->[1]) { + $symbols->{$pc} = [$name, "?", $fullname]; + } else { + my $pcstr = "0x" . 
$pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; + } + } + return 1; +} + +sub ShortFunctionName { + my $function = shift; + while ($function =~ s/\([^()]*\)(\s*const)?//g) { } # Argument types + while ($function =~ s/<[^<>]*>//g) { } # Remove template arguments + $function =~ s/^.*\s+(\w+::)/$1/; # Remove leading type + return $function; +} + +# Trim overly long symbols found in disassembler output +sub CleanDisassembly { + my $d = shift; + while ($d =~ s/\([^()%]*\)(\s*const)?//g) { } # Argument types, not (%rax) + while ($d =~ s/(\w+)<[^<>]*>/$1/g) { } # Remove template arguments + return $d; +} + +# Clean file name for display +sub CleanFileName { + my ($f) = @_; + $f =~ s|^/proc/self/cwd/||; + $f =~ s|^\./||; + return $f; +} + +# Make address relative to section and clean up for display +sub UnparseAddress { + my ($offset, $address) = @_; + $address = AddressSub($address, $offset); + $address =~ s/^0x//; + $address =~ s/^0*//; + return $address; +} + +##### Miscellaneous ##### + +# Find the right versions of the above object tools to use. The +# argument is the program file being analyzed, and should be an ELF +# 32-bit or ELF 64-bit executable file. The location of the tools +# is determined by considering the following options in this order: +# 1) --tools option, if set +# 2) JEPROF_TOOLS environment variable, if set +# 3) the environment +sub ConfigureObjTools { + my $prog_file = shift; + + # Check for the existence of $prog_file because /usr/bin/file does not + # predictably return error status in prod. + (-e $prog_file) || error("$prog_file does not exist.\n"); + + my $file_type = undef; + if (-e "/usr/bin/file") { + # Follow symlinks (at least for systems where "file" supports that). + my $escaped_prog_file = ShellEscape($prog_file); + $file_type = `/usr/bin/file -L $escaped_prog_file 2>$dev_null || + /usr/bin/file $escaped_prog_file`; + } elsif ($^O == "MSWin32") { + $file_type = "MS Windows"; + } else { + print STDERR "WARNING: Can't determine the file type of $prog_file"; + } + + if ($file_type =~ /64-bit/) { + # Change $address_length to 16 if the program file is ELF 64-bit. + # We can't detect this from many (most?) heap or lock contention + # profiles, since the actual addresses referenced are generally in low + # memory even for 64-bit programs. + $address_length = 16; + } + + if ($file_type =~ /MS Windows/) { + # For windows, we provide a version of nm and addr2line as part of + # the opensource release, which is capable of parsing + # Windows-style PDB executables. It should live in the path, or + # in the same directory as jeprof. + $obj_tool_map{"nm_pdb"} = "nm-pdb"; + $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb"; + } + + if ($file_type =~ /Mach-O/) { + # OS X uses otool to examine Mach-O files, rather than objdump. + $obj_tool_map{"otool"} = "otool"; + $obj_tool_map{"addr2line"} = "false"; # no addr2line + $obj_tool_map{"objdump"} = "false"; # no objdump + } + + # Go fill in %obj_tool_map with the pathnames to use: + foreach my $tool (keys %obj_tool_map) { + $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool}); + } +} + +# Returns the path of a caller-specified object tool. If --tools or +# JEPROF_TOOLS are specified, then returns the full path to the tool +# with that prefix. Otherwise, returns the path unmodified (which +# means we will look for it on PATH). +sub ConfigureTool { + my $tool = shift; + my $path; + + # --tools (or $JEPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # :. 
First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"JEPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools ne '') { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } + } + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$JEPROF_TOOLS) '$tools'\n"); + } + } else { + # ... otherwise use the version that exists in the same directory as + # jeprof. If there's nothing there, use $PATH. + $0 =~ m,[^/]*$,; # this is everything after the last slash + my $dirname = $`; # this is everything up to and including the last slash + if (-x "$dirname$tool") { + $path = "$dirname$tool"; + } else { + $path = $tool; + } + } + if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; } + return $path; +} + +sub ShellEscape { + my @escaped_words = (); + foreach my $word (@_) { + my $escaped_word = $word; + if ($word =~ m![^a-zA-Z0-9/.,_=-]!) { # check for anything not in whitelist + $escaped_word =~ s/'/'\\''/; + $escaped_word = "'$escaped_word'"; + } + push(@escaped_words, $escaped_word); + } + return join(" ", @escaped_words); +} + +sub cleanup { + unlink($main::tmpfile_sym); + unlink(keys %main::tempnames); + + # We leave any collected profiles in $HOME/jeprof in case the user wants + # to look at them later. We print a message informing them of this. + if ((scalar(@main::profile_files) > 0) && + defined($main::collected_profile)) { + if (scalar(@main::profile_files) == 1) { + print STDERR "Dynamically gathered profile is in $main::collected_profile\n"; + } + print STDERR "If you want to investigate this profile further, you can do:\n"; + print STDERR "\n"; + print STDERR " jeprof \\\n"; + print STDERR " $main::prog \\\n"; + print STDERR " $main::collected_profile\n"; + print STDERR "\n"; + } +} + +sub sighandler { + cleanup(); + exit(1); +} + +sub error { + my $msg = shift; + print STDERR $msg; + cleanup(); + exit(1); +} + + +# Run $nm_command and get all the resulting procedure boundaries whose +# names match "$regexp" and returns them in a hashtable mapping from +# procedure name to a two-element vector of [start address, end address] +sub GetProcedureBoundariesViaNm { + my $escaped_nm_command = shift; # shell-escaped + my $regexp = shift; + + my $symbol_table = {}; + open(NM, "$escaped_nm_command |") || error("$escaped_nm_command: $!\n"); + my $last_start = "0"; + my $routine = ""; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (m/^\s*([0-9a-f]+) (.) (..*)/) { + my $start_val = $1; + my $type = $2; + my $this_routine = $3; + + # It's possible for two symbols to share the same address, if + # one is a zero-length variable (like __start_google_malloc) or + # one symbol is a weak alias to another (like __libc_malloc). + # In such cases, we want to ignore all values except for the + # actual symbol, which in nm-speak has type "T". The logic + # below does this, though it's a bit tricky: what happens when + # we have a series of lines with the same address, is the first + # one gets queued up to be processed. 
However, it won't + # *actually* be processed until later, when we read a line with + # a different address. That means that as long as we're reading + # lines with the same address, we have a chance to replace that + # item in the queue, which we do whenever we see a 'T' entry -- + # that is, a line with type 'T'. If we never see a 'T' entry, + # we'll just go ahead and process the first entry (which never + # got touched in the queue), and ignore the others. + if ($start_val eq $last_start && $type =~ /t/i) { + # We are the 'T' symbol at this address, replace previous symbol. + $routine = $this_routine; + next; + } elsif ($start_val eq $last_start) { + # We're not the 'T' symbol at this address, so ignore us. + next; + } + + if ($this_routine eq $sep_symbol) { + $sep_address = HexExtend($start_val); + } + + # Tag this routine with the starting address in case the image + # has multiple occurrences of this routine. We use a syntax + # that resembles template parameters that are automatically + # stripped out by ShortFunctionName() + $this_routine .= "<$start_val>"; + + if (defined($routine) && $routine =~ m/$regexp/) { + $symbol_table->{$routine} = [HexExtend($last_start), + HexExtend($start_val)]; + } + $last_start = $start_val; + $routine = $this_routine; + } elsif (m/^Loaded image name: (.+)/) { + # The win32 nm workalike emits information about the binary it is using. + if ($main::opt_debug) { print STDERR "Using Image $1\n"; } + } elsif (m/^PDB file name: (.+)/) { + # The win32 nm workalike emits information about the pdb it is using. + if ($main::opt_debug) { print STDERR "Using PDB $1\n"; } + } + } + close(NM); + # Handle the last line in the nm output. Unfortunately, we don't know + # how big this last symbol is, because we don't know how big the file + # is. For now, we just give it a size of 0. + # TODO(csilvers): do better here. + if (defined($routine) && $routine =~ m/$regexp/) { + $symbol_table->{$routine} = [HexExtend($last_start), + HexExtend($last_start)]; + } + return $symbol_table; +} + +# Gets the procedure boundaries for all routines in "$image" whose names +# match "$regexp" and returns them in a hashtable mapping from procedure +# name to a two-element vector of [start address, end address]. +# Will return an empty map if nm is not installed or not working properly. +sub GetProcedureBoundaries { + my $image = shift; + my $regexp = shift; + + # If $image doesn't start with /, then put ./ in front of it. This works + # around an obnoxious bug in our probing of nm -f behavior. + # "nm -f $image" is supposed to fail on GNU nm, but if: + # + # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND + # b. you have a.out in your current directory (a not uncommon occurence) + # + # then "nm -f $image" succeeds because -f only looks at the first letter of + # the argument, which looks valid because it's [BbSsPp], and then since + # there's no image provided, it looks for a.out and finds it. + # + # This regex makes sure that $image starts with . or /, forcing the -f + # parsing to fail since . and / are not valid formats. + $image =~ s#^[^/]#./$&#; + + # For libc libraries, the copy in /usr/lib/debug contains debugging symbols + my $debugging = DebuggingLibrary($image); + if ($debugging) { + $image = $debugging; + } + + my $nm = $obj_tool_map{"nm"}; + my $cppfilt = $obj_tool_map{"c++filt"}; + + # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm + # binary doesn't support --demangle. 
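Before the capability probing below, a simplified sketch (not part of the patch) of the boundary building GetProcedureBoundariesViaNm performs above; the nm rows are invented and the duplicate-address handling is omitted.

    use strict;
    use warnings;

    my @nm_rows = (
        "0000000000001000 T malloc",
        "0000000000001200 T free",
        "0000000000001400 T realloc",
    );
    my ($last_start, $routine) = ("0", "");
    foreach my $row (@nm_rows) {
        if ($row =~ m/^\s*([0-9a-f]+) (.) (..*)/) {
            print "$routine: [$last_start, $1)\n" if $routine ne "";
            ($last_start, $routine) = ($1, $3);
        }
    }
    print "$routine: [$last_start, $last_start)\n";   # last symbol gets size 0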
In addition, for OS X we need + # to use the -f flag to get 'flat' nm output (otherwise we don't sort + # properly and get incorrect results). Unfortunately, GNU nm uses -f + # in an incompatible way. So first we test whether our nm supports + # --demangle and -f. + my $demangle_flag = ""; + my $cppfilt_flag = ""; + my $to_devnull = ">$dev_null 2>&1"; + if (system(ShellEscape($nm, "--demangle", "image") . $to_devnull) == 0) { + # In this mode, we do "nm --demangle " + $demangle_flag = "--demangle"; + $cppfilt_flag = ""; + } elsif (system(ShellEscape($cppfilt, $image) . $to_devnull) == 0) { + # In this mode, we do "nm | c++filt" + $cppfilt_flag = " | " . ShellEscape($cppfilt); + }; + my $flatten_flag = ""; + if (system(ShellEscape($nm, "-f", $image) . $to_devnull) == 0) { + $flatten_flag = "-f"; + } + + # Finally, in the case $imagie isn't a debug library, we try again with + # -D to at least get *exported* symbols. If we can't use --demangle, + # we use c++filt instead, if it exists on this system. + my @nm_commands = (ShellEscape($nm, "-n", $flatten_flag, $demangle_flag, + $image) . " 2>$dev_null $cppfilt_flag", + ShellEscape($nm, "-D", "-n", $flatten_flag, $demangle_flag, + $image) . " 2>$dev_null $cppfilt_flag", + # 6nm is for Go binaries + ShellEscape("6nm", "$image") . " 2>$dev_null | sort", + ); + + # If the executable is an MS Windows PDB-format executable, we'll + # have set up obj_tool_map("nm_pdb"). In this case, we actually + # want to use both unix nm and windows-specific nm_pdb, since + # PDB-format executables can apparently include dwarf .o files. + if (exists $obj_tool_map{"nm_pdb"}) { + push(@nm_commands, + ShellEscape($obj_tool_map{"nm_pdb"}, "--demangle", $image) + . " 2>$dev_null"); + } + + foreach my $nm_command (@nm_commands) { + my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp); + return $symbol_table if (%{$symbol_table}); + } + my $symbol_table = {}; + return $symbol_table; +} + + +# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings. +# To make them more readable, we add underscores at interesting places. +# This routine removes the underscores, producing the canonical representation +# used by jeprof to represent addresses, particularly in the tested routines. +sub CanonicalHex { + my $arg = shift; + return join '', (split '_',$arg); +} + + +# Unit test for AddressAdd: +sub AddressAddUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd ($row->[0], $row->[1]); + if ($sum ne $row->[2]) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. 
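A small reading aid (not part of the patch): the underscores in the 16-nibble test vectors mark the 2+7+7 chunk boundaries, and CanonicalHex above simply removes them.

    use strict;
    use warnings;

    my $v = "00_000000f_afafafa";
    print join('', split('_', $v)), "\n";   # 00000000fafafafa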
+ $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + my $expected = join '', (split '_',$row->[2]); + if ($sum ne CanonicalHex($row->[2])) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressSub: +sub AddressSubUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub ($row->[0], $row->[1]); + if ($sum ne $row->[3]) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + if ($sum ne CanonicalHex($row->[3])) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressInc: +sub AddressIncUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc ($row->[0]); + if ($sum ne $row->[4]) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. 
+ $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc (CanonicalHex($row->[0])); + if ($sum ne CanonicalHex($row->[4])) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Driver for unit tests. +# Currently just the address add/subtract/increment routines for 64-bit. +sub RunUnitTests { + my $error_count = 0; + + # This is a list of tuples [a, b, a+b, a-b, a+1] + my $unit_test_data_8 = [ + [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)], + [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)], + [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)], + [qw(00000001 ffffffff 00000000 00000002 00000002)], + [qw(00000001 fffffff0 fffffff1 00000011 00000002)], + ]; + my $unit_test_data_16 = [ + # The implementation handles data in 7-nibble chunks, so those are the + # interesting boundaries. + [qw(aaaaaaaa 50505050 + 00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)], + [qw(50505050 aaaaaaaa + 00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)], + [qw(ffffffff aaaaaaaa + 00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)], + [qw(00000001 ffffffff + 00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)], + [qw(00000001 fffffff0 + 00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)], + + [qw(00_a00000a_aaaaaaa 50505050 + 00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)], + [qw(0f_fff0005_0505050 aaaaaaaa + 0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)], + [qw(00_000000f_fffffff 01_800000a_aaaaaaa + 01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)], + [qw(00_0000000_0000001 ff_fffffff_fffffff + 00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)], + [qw(00_0000000_0000001 ff_fffffff_ffffff0 + ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)], + ]; + + $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16); + if ($error_count > 0) { + print STDERR $error_count, " errors: FAILED\n"; + } else { + print STDERR "PASS\n"; + } + exit ($error_count); +} diff --git a/bin/pprof b/bin/pprof deleted file mode 100755 index df503ae..0000000 --- a/bin/pprof +++ /dev/null @@ -1,5508 +0,0 @@ -#! /usr/bin/env perl - -# Copyright (c) 1998-2007, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# --- -# Program for printing the profile generated by common/profiler.cc, -# or by the heap profiler (common/debugallocation.cc) -# -# The profile contains a sequence of entries of the form: -# -# This program parses the profile, and generates user-readable -# output. -# -# Examples: -# -# % tools/pprof "program" "profile" -# Enters "interactive" mode -# -# % tools/pprof --text "program" "profile" -# Generates one line per procedure -# -# % tools/pprof --gv "program" "profile" -# Generates annotated call-graph and displays via "gv" -# -# % tools/pprof --gv --focus=Mutex "program" "profile" -# Restrict to code paths that involve an entry that matches "Mutex" -# -# % tools/pprof --gv --focus=Mutex --ignore=string "program" "profile" -# Restrict to code paths that involve an entry that matches "Mutex" -# and does not match "string" -# -# % tools/pprof --list=IBF_CheckDocid "program" "profile" -# Generates disassembly listing of all routines with at least one -# sample that match the --list= pattern. The listing is -# annotated with the flat and cumulative sample counts at each line. -# -# % tools/pprof --disasm=IBF_CheckDocid "program" "profile" -# Generates disassembly listing of all routines with at least one -# sample that match the --disasm= pattern. The listing is -# annotated with the flat and cumulative sample counts at each PC value. -# -# TODO: Use color to indicate files? - -use strict; -use warnings; -use Getopt::Long; - -my $PPROF_VERSION = "2.0"; - -# These are the object tools we use which can come from a -# user-specified location using --tools, from the PPROF_TOOLS -# environment variable, or from the environment. -my %obj_tool_map = ( - "objdump" => "objdump", - "nm" => "nm", - "addr2line" => "addr2line", - "c++filt" => "c++filt", - ## ConfigureObjTools may add architecture-specific entries: - #"nm_pdb" => "nm-pdb", # for reading windows (PDB-format) executables - #"addr2line_pdb" => "addr2line-pdb", # ditto - #"otool" => "otool", # equivalent of objdump on OS X -); -# NOTE: these are lists, so you can put in commandline flags if you want. 
-my @DOT = ("dot"); # leave non-absolute, since it may be in /usr/local -my @GV = ("gv"); -my @EVINCE = ("evince"); # could also be xpdf or perhaps acroread -my @KCACHEGRIND = ("kcachegrind"); -my @PS2PDF = ("ps2pdf"); -# These are used for dynamic profiles -my @URL_FETCHER = ("curl", "-s"); - -# These are the web pages that servers need to support for dynamic profiles -my $HEAP_PAGE = "/pprof/heap"; -my $PROFILE_PAGE = "/pprof/profile"; # must support cgi-param "?seconds=#" -my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param - # ?seconds=#&event=x&period=n -my $GROWTH_PAGE = "/pprof/growth"; -my $CONTENTION_PAGE = "/pprof/contention"; -my $WALL_PAGE = "/pprof/wall(?:\\?.*)?"; # accepts options like namefilter -my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; -my $CENSUSPROFILE_PAGE = "/pprof/censusprofile(?:\\?.*)?"; # must support cgi-param - # "?seconds=#", - # "?tags_regexp=#" and - # "?type=#". -my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST -my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; - -# These are the web pages that can be named on the command line. -# All the alternatives must begin with /. -my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . - "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . - "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)"; - -# default binary name -my $UNKNOWN_BINARY = "(unknown)"; - -# There is a pervasive dependency on the length (in hex characters, -# i.e., nibbles) of an address, distinguishing between 32-bit and -# 64-bit profiles. To err on the safe size, default to 64-bit here: -my $address_length = 16; - -my $dev_null = "/dev/null"; -if (! -e $dev_null && $^O =~ /MSWin/) { # $^O is the OS perl was built for - $dev_null = "nul"; -} - -# A list of paths to search for shared object files -my @prefix_list = (); - -# Special routine name that should not have any symbols. -# Used as separator to parse "addr2line -i" output. -my $sep_symbol = '_fini'; -my $sep_address = undef; - -##### Argument parsing ##### - -sub usage_string { - return < - is a space separated list of profile names. -pprof [options] - is a list of profile files where each file contains - the necessary symbol mappings as well as profile data (likely generated - with --raw). -pprof [options] - is a remote form. Symbols are obtained from host:port$SYMBOL_PAGE - - Each name can be: - /path/to/profile - a path to a profile file - host:port[/] - a location of a service to get profile from - - The / can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile, - $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall, - $CENSUSPROFILE_PAGE, or /pprof/filteredprofile. - For instance: - pprof http://myserver.com:80$HEAP_PAGE - If / is omitted, the service defaults to $PROFILE_PAGE (cpu profiling). -pprof --symbols - Maps addresses to symbol names. In this mode, stdin should be a - list of library mappings, in the same format as is found in the heap- - and cpu-profile files (this loosely matches that of /proc/self/maps - on linux), followed by a list of hex addresses to map, one per line. 
- - For more help with querying remote servers, including how to add the - necessary server-side support code, see this filename (or one like it): - - /usr/doc/gperftools-$PPROF_VERSION/pprof_remote_servers.html - -Options: - --cum Sort by cumulative data - --base= Subtract from before display - --interactive Run in interactive mode (interactive "help" gives help) [default] - --seconds= Length of time for dynamic profiles [default=30 secs] - --add_lib= Read additional symbols and line info from the given library - --lib_prefix= Comma separated list of library path prefixes - -Reporting Granularity: - --addresses Report at address level - --lines Report at source line level - --functions Report at function level [default] - --files Report at source file level - -Output type: - --text Generate text report - --callgrind Generate callgrind format to stdout - --gv Generate Postscript and display - --evince Generate PDF and display - --web Generate SVG and display - --list= Generate source listing of matching routines - --disasm= Generate disassembly of matching routines - --symbols Print demangled symbol names found at given addresses - --dot Generate DOT file to stdout - --ps Generate Postcript to stdout - --pdf Generate PDF to stdout - --svg Generate SVG to stdout - --gif Generate GIF to stdout - --raw Generate symbolized pprof data (useful with remote fetch) - -Heap-Profile Options: - --inuse_space Display in-use (mega)bytes [default] - --inuse_objects Display in-use objects - --alloc_space Display allocated (mega)bytes - --alloc_objects Display allocated objects - --show_bytes Display space in bytes - --drop_negative Ignore negative differences - -Contention-profile options: - --total_delay Display total delay at each region [default] - --contentions Display number of delays at each region - --mean_delay Display mean delay at each region - -Call-graph Options: - --nodecount= Show at most so many nodes [default=80] - --nodefraction= Hide nodes below *total [default=.005] - --edgefraction= Hide edges below *total [default=.001] - --maxdegree= Max incoming/outgoing edges per node [default=8] - --focus= Focus on nodes matching - --thread= Show profile for thread - --ignore= Ignore nodes matching - --scale= Set GV scaling [default=0] - --heapcheck Make nodes with non-0 object counts - (i.e. direct leak generators) more visible - -Miscellaneous: - --tools=[,...] \$PATH for object tool pathnames - --test Run unit tests - --help This message - --version Version information - -Environment Variables: - PPROF_TMPDIR Profiles directory. 
Defaults to \$HOME/pprof - PPROF_TOOLS Prefix for object tools pathnames - -Examples: - -pprof /bin/ls ls.prof - Enters "interactive" mode -pprof --text /bin/ls ls.prof - Outputs one line per procedure -pprof --web /bin/ls ls.prof - Displays annotated call-graph in web browser -pprof --gv /bin/ls ls.prof - Displays annotated call-graph via 'gv' -pprof --gv --focus=Mutex /bin/ls ls.prof - Restricts to code paths including a .*Mutex.* entry -pprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof - Code paths including Mutex but not string -pprof --list=getdir /bin/ls ls.prof - (Per-line) annotated source listing for getdir() -pprof --disasm=getdir /bin/ls ls.prof - (Per-PC) annotated disassembly for getdir() - -pprof http://localhost:1234/ - Enters "interactive" mode -pprof --text localhost:1234 - Outputs one line per procedure for localhost:1234 -pprof --raw localhost:1234 > ./local.raw -pprof --text ./local.raw - Fetches a remote profile for later analysis and then - analyzes it in text mode. -EOF -} - -sub version_string { - return < \$main::opt_help, - "version!" => \$main::opt_version, - "cum!" => \$main::opt_cum, - "base=s" => \$main::opt_base, - "seconds=i" => \$main::opt_seconds, - "add_lib=s" => \$main::opt_lib, - "lib_prefix=s" => \$main::opt_lib_prefix, - "functions!" => \$main::opt_functions, - "lines!" => \$main::opt_lines, - "addresses!" => \$main::opt_addresses, - "files!" => \$main::opt_files, - "text!" => \$main::opt_text, - "callgrind!" => \$main::opt_callgrind, - "list=s" => \$main::opt_list, - "disasm=s" => \$main::opt_disasm, - "symbols!" => \$main::opt_symbols, - "gv!" => \$main::opt_gv, - "evince!" => \$main::opt_evince, - "web!" => \$main::opt_web, - "dot!" => \$main::opt_dot, - "ps!" => \$main::opt_ps, - "pdf!" => \$main::opt_pdf, - "svg!" => \$main::opt_svg, - "gif!" => \$main::opt_gif, - "raw!" => \$main::opt_raw, - "interactive!" => \$main::opt_interactive, - "nodecount=i" => \$main::opt_nodecount, - "nodefraction=f" => \$main::opt_nodefraction, - "edgefraction=f" => \$main::opt_edgefraction, - "maxdegree=i" => \$main::opt_maxdegree, - "focus=s" => \$main::opt_focus, - "thread=s" => \$main::opt_thread, - "ignore=s" => \$main::opt_ignore, - "scale=i" => \$main::opt_scale, - "heapcheck" => \$main::opt_heapcheck, - "inuse_space!" => \$main::opt_inuse_space, - "inuse_objects!" => \$main::opt_inuse_objects, - "alloc_space!" => \$main::opt_alloc_space, - "alloc_objects!" => \$main::opt_alloc_objects, - "show_bytes!" => \$main::opt_show_bytes, - "drop_negative!" => \$main::opt_drop_negative, - "total_delay!" => \$main::opt_total_delay, - "contentions!" => \$main::opt_contentions, - "mean_delay!" => \$main::opt_mean_delay, - "tools=s" => \$main::opt_tools, - "test!" => \$main::opt_test, - "debug!" 
=> \$main::opt_debug, - # Undocumented flags used only by unittests: - "test_stride=i" => \$main::opt_test_stride, - ) || usage("Invalid option(s)"); - - # Deal with the standard --help and --version - if ($main::opt_help) { - print usage_string(); - exit(0); - } - - if ($main::opt_version) { - print version_string(); - exit(0); - } - - # Disassembly/listing/symbols mode requires address-level info - if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) { - $main::opt_functions = 0; - $main::opt_lines = 0; - $main::opt_addresses = 1; - $main::opt_files = 0; - } - - # Check heap-profiling flags - if ($main::opt_inuse_space + - $main::opt_inuse_objects + - $main::opt_alloc_space + - $main::opt_alloc_objects > 1) { - usage("Specify at most on of --inuse/--alloc options"); - } - - # Check output granularities - my $grains = - $main::opt_functions + - $main::opt_lines + - $main::opt_addresses + - $main::opt_files + - 0; - if ($grains > 1) { - usage("Only specify one output granularity option"); - } - if ($grains == 0) { - $main::opt_functions = 1; - } - - # Check output modes - my $modes = - $main::opt_text + - $main::opt_callgrind + - ($main::opt_list eq '' ? 0 : 1) + - ($main::opt_disasm eq '' ? 0 : 1) + - ($main::opt_symbols == 0 ? 0 : 1) + - $main::opt_gv + - $main::opt_evince + - $main::opt_web + - $main::opt_dot + - $main::opt_ps + - $main::opt_pdf + - $main::opt_svg + - $main::opt_gif + - $main::opt_raw + - $main::opt_interactive + - 0; - if ($modes > 1) { - usage("Only specify one output mode"); - } - if ($modes == 0) { - if (-t STDOUT) { # If STDOUT is a tty, activate interactive mode - $main::opt_interactive = 1; - } else { - $main::opt_text = 1; - } - } - - if ($main::opt_test) { - RunUnitTests(); - # Should not return - exit(1); - } - - # Binary name and profile arguments list - $main::prog = ""; - @main::pfile_args = (); - - # Remote profiling without a binary (using $SYMBOL_PAGE instead) - if (@ARGV > 0) { - if (IsProfileURL($ARGV[0])) { - $main::use_symbol_page = 1; - } elsif (IsSymbolizedProfileFile($ARGV[0])) { - $main::use_symbolized_profile = 1; - $main::prog = $UNKNOWN_BINARY; # will be set later from the profile file - } - } - - if ($main::use_symbol_page || $main::use_symbolized_profile) { - # We don't need a binary! - my %disabled = ('--lines' => $main::opt_lines, - '--disasm' => $main::opt_disasm); - for my $option (keys %disabled) { - usage("$option cannot be used without a binary") if $disabled{$option}; - } - # Set $main::prog later... - scalar(@ARGV) || usage("Did not specify profile file"); - } elsif ($main::opt_symbols) { - # --symbols needs a binary-name (to run nm on, etc) but not profiles - $main::prog = shift(@ARGV) || usage("Did not specify program"); - } else { - $main::prog = shift(@ARGV) || usage("Did not specify program"); - scalar(@ARGV) || usage("Did not specify profile file"); - } - - # Parse profile file/location arguments - foreach my $farg (@ARGV) { - if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) { - my $machine = $1; - my $num_machines = $2; - my $path = $3; - for (my $i = 0; $i < $num_machines; $i++) { - unshift(@main::pfile_args, "$i.$machine$path"); - } - } else { - unshift(@main::pfile_args, $farg); - } - } - - if ($main::use_symbol_page) { - unless (IsProfileURL($main::pfile_args[0])) { - error("The first profile should be a remote form to use $SYMBOL_PAGE\n"); - } - CheckSymbolPage(); - $main::prog = FetchProgramName(); - } elsif (!$main::use_symbolized_profile) { # may not need objtools! 
- ConfigureObjTools($main::prog) - } - - # Break the opt_lib_prefix into the prefix_list array - @prefix_list = split (',', $main::opt_lib_prefix); - - # Remove trailing / from the prefixes, in the list to prevent - # searching things like /my/path//lib/mylib.so - foreach (@prefix_list) { - s|/+$||; - } -} - -sub FilterAndPrint { - my ($profile, $symbols, $libs, $thread) = @_; - - # Get total data in profile - my $total = TotalProfile($profile); - - # Remove uniniteresting stack items - $profile = RemoveUninterestingFrames($symbols, $profile); - - # Focus? - if ($main::opt_focus ne '') { - $profile = FocusProfile($symbols, $profile, $main::opt_focus); - } - - # Ignore? - if ($main::opt_ignore ne '') { - $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore); - } - - my $calls = ExtractCalls($symbols, $profile); - - # Reduce profiles to required output granularity, and also clean - # each stack trace so a given entry exists at most once. - my $reduced = ReduceProfile($symbols, $profile); - - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); - - # Print - if (!$main::opt_interactive) { - if ($main::opt_disasm) { - PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm); - } elsif ($main::opt_list) { - PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0); - } elsif ($main::opt_text) { - # Make sure the output is empty when have nothing to report - # (only matters when --heapcheck is given but we must be - # compatible with old branches that did not pass --heapcheck always): - if ($total != 0) { - printf("Total%s: %s %s\n", - (defined($thread) ? " (t$thread)" : ""), - Unparse($total), Units()); - } - PrintText($symbols, $flat, $cumulative, -1); - } elsif ($main::opt_raw) { - PrintSymbolizedProfile($symbols, $profile, $main::prog); - } elsif ($main::opt_callgrind) { - PrintCallgrind($calls); - } else { - if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { - if ($main::opt_gv) { - RunGV(TempName($main::next_tmpfile, "ps"), ""); - } elsif ($main::opt_evince) { - RunEvince(TempName($main::next_tmpfile, "pdf"), ""); - } elsif ($main::opt_web) { - my $tmp = TempName($main::next_tmpfile, "svg"); - RunWeb($tmp); - # The command we run might hand the file name off - # to an already running browser instance and then exit. - # Normally, we'd remove $tmp on exit (right now), - # but fork a child to remove $tmp a little later, so that the - # browser has time to load it first. - delete $main::tempnames{$tmp}; - if (fork() == 0) { - sleep 5; - unlink($tmp); - exit(0); - } - } - } else { - cleanup(); - exit(1); - } - } - } else { - InteractiveMode($profile, $symbols, $libs, $total); - } -} - -sub Main() { - Init(); - $main::collected_profile = undef; - @main::profile_files = (); - $main::op_time = time(); - - # Printing symbols is special and requires a lot less info that most. - if ($main::opt_symbols) { - PrintSymbols(*STDIN); # Get /proc/maps and symbols output from stdin - return; - } - - # Fetch all profile data - FetchDynamicProfiles(); - - # this will hold symbols that we read from the profile files - my $symbol_map = {}; - - # Read one profile, pick the last item on the list - my $data = ReadProfile($main::prog, pop(@main::profile_files)); - my $profile = $data->{profile}; - my $pcs = $data->{pcs}; - my $libs = $data->{libs}; # Info about main program and shared libraries - $symbol_map = MergeSymbols($symbol_map, $data->{symbols}); - - # Add additional profiles, if available. 
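The profile-adding step just below works because every profile is a hash from a newline-joined stack of hex addresses to a sample count, so merging is just summing counts per stack. A simplified stand-in for what the AddProfile step does; the stacks below are made up:

  use strict;
  use warnings;

  # Hedged sketch of profile merging over the script's representation:
  # hashref keyed by "leaf\ncaller\n..." address stacks, values are counts.
  sub merge_profiles_sketch {
      my ($x, $y) = @_;
      my %sum = %{$x};
      $sum{$_} = ($sum{$_} // 0) + $y->{$_} for keys %{$y};
      return \%sum;
  }

  my $p1 = { "0x400100\n0x400200" => 5 };
  my $p2 = { "0x400100\n0x400200" => 3, "0x400300" => 1 };
  my $merged = merge_profiles_sketch($p1, $p2);
  # $merged: { "0x400100\n0x400200" => 8, "0x400300" => 1 }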
- if (scalar(@main::profile_files) > 0) { - foreach my $pname (@main::profile_files) { - my $data2 = ReadProfile($main::prog, $pname); - $profile = AddProfile($profile, $data2->{profile}); - $pcs = AddPcs($pcs, $data2->{pcs}); - $symbol_map = MergeSymbols($symbol_map, $data2->{symbols}); - } - } - - # Subtract base from profile, if specified - if ($main::opt_base ne '') { - my $base = ReadProfile($main::prog, $main::opt_base); - $profile = SubtractProfile($profile, $base->{profile}); - $pcs = AddPcs($pcs, $base->{pcs}); - $symbol_map = MergeSymbols($symbol_map, $base->{symbols}); - } - - # Collect symbols - my $symbols; - if ($main::use_symbolized_profile) { - $symbols = FetchSymbols($pcs, $symbol_map); - } elsif ($main::use_symbol_page) { - $symbols = FetchSymbols($pcs); - } else { - # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, - # which may differ from the data from subsequent profiles, especially - # if they were run on different machines. Use appropriate libs for - # each pc somehow. - $symbols = ExtractSymbols($libs, $pcs); - } - - if (!defined($main::opt_thread)) { - FilterAndPrint($profile, $symbols, $libs); - } - if (defined($data->{threads})) { - foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) { - if (defined($main::opt_thread) && - ($main::opt_thread eq '*' || $main::opt_thread == $thread)) { - my $thread_profile = $data->{threads}{$thread}; - FilterAndPrint($thread_profile, $symbols, $libs, $thread); - } - } - } - - cleanup(); - exit(0); -} - -##### Entry Point ##### - -Main(); - -# Temporary code to detect if we're running on a Goobuntu system. -# These systems don't have the right stuff installed for the special -# Readline libraries to work, so as a temporary workaround, we default -# to using the normal stdio code, rather than the fancier readline-based -# code -sub ReadlineMightFail { - if (-e '/lib/libtermcap.so.2') { - return 0; # libtermcap exists, so readline should be okay - } else { - return 1; - } -} - -sub RunGV { - my $fname = shift; - my $bg = shift; # "" or " &" if we should run in background - if (!system(ShellEscape(@GV, "--version") . " >$dev_null 2>&1")) { - # Options using double dash are supported by this gv version. - # Also, turn on noantialias to better handle bug in gv for - # postscript files with large dimensions. - # TODO: Maybe we should not pass the --noantialias flag - # if the gv version is known to work properly without the flag. - system(ShellEscape(@GV, "--scale=$main::opt_scale", "--noantialias", $fname) - . $bg); - } else { - # Old gv version - only supports options that use single dash. - print STDERR ShellEscape(@GV, "-scale", $main::opt_scale) . "\n"; - system(ShellEscape(@GV, "-scale", "$main::opt_scale", $fname) . $bg); - } -} - -sub RunEvince { - my $fname = shift; - my $bg = shift; # "" or " &" if we should run in background - system(ShellEscape(@EVINCE, $fname) . $bg); -} - -sub RunWeb { - my $fname = shift; - print STDERR "Loading web page file:///$fname\n"; - - if (`uname` =~ /Darwin/) { - # OS X: open will use standard preference for SVG files. - system("/usr/bin/open", $fname); - return; - } - - # Some kind of Unix; try generic symlinks, then specific browsers. - # (Stop once we find one.) - # Works best if the browser is already running. 
- my @alt = ( - "/etc/alternatives/gnome-www-browser", - "/etc/alternatives/x-www-browser", - "google-chrome", - "firefox", - ); - foreach my $b (@alt) { - if (system($b, $fname) == 0) { - return; - } - } - - print STDERR "Could not load web browser.\n"; -} - -sub RunKcachegrind { - my $fname = shift; - my $bg = shift; # "" or " &" if we should run in background - print STDERR "Starting '@KCACHEGRIND " . $fname . $bg . "'\n"; - system(ShellEscape(@KCACHEGRIND, $fname) . $bg); -} - - -##### Interactive helper routines ##### - -sub InteractiveMode { - $| = 1; # Make output unbuffered for interactive mode - my ($orig_profile, $symbols, $libs, $total) = @_; - - print STDERR "Welcome to pprof! For help, type 'help'.\n"; - - # Use ReadLine if it's installed and input comes from a console. - if ( -t STDIN && - !ReadlineMightFail() && - defined(eval {require Term::ReadLine}) ) { - my $term = new Term::ReadLine 'pprof'; - while ( defined ($_ = $term->readline('(pprof) '))) { - $term->addhistory($_) if /\S/; - if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) { - last; # exit when we get an interactive command to quit - } - } - } else { # don't have readline - while (1) { - print STDERR "(pprof) "; - $_ = ; - last if ! defined $_ ; - s/\r//g; # turn windows-looking lines into unix-looking lines - - # Save some flags that might be reset by InteractiveCommand() - my $save_opt_lines = $main::opt_lines; - - if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) { - last; # exit when we get an interactive command to quit - } - - # Restore flags - $main::opt_lines = $save_opt_lines; - } - } -} - -# Takes two args: orig profile, and command to run. -# Returns 1 if we should keep going, or 0 if we were asked to quit -sub InteractiveCommand { - my($orig_profile, $symbols, $libs, $total, $command) = @_; - $_ = $command; # just to make future m//'s easier - if (!defined($_)) { - print STDERR "\n"; - return 0; - } - if (m/^\s*quit/) { - return 0; - } - if (m/^\s*help/) { - InteractiveHelpMessage(); - return 1; - } - # Clear all the mode options -- mode is controlled by "$command" - $main::opt_text = 0; - $main::opt_callgrind = 0; - $main::opt_disasm = 0; - $main::opt_list = 0; - $main::opt_gv = 0; - $main::opt_evince = 0; - $main::opt_cum = 0; - - if (m/^\s*(text|top)(\d*)\s*(.*)/) { - $main::opt_text = 1; - - my $line_limit = ($2 ne "") ? 
int($2) : 10; - - my $routine; - my $ignore; - ($routine, $ignore) = ParseInteractiveArgs($3); - - my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); - my $reduced = ReduceProfile($symbols, $profile); - - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); - - PrintText($symbols, $flat, $cumulative, $line_limit); - return 1; - } - if (m/^\s*callgrind\s*([^ \n]*)/) { - $main::opt_callgrind = 1; - - # Get derived profiles - my $calls = ExtractCalls($symbols, $orig_profile); - my $filename = $1; - if ( $1 eq '' ) { - $filename = TempName($main::next_tmpfile, "callgrind"); - } - PrintCallgrind($calls, $filename); - if ( $1 eq '' ) { - RunKcachegrind($filename, " & "); - $main::next_tmpfile++; - } - - return 1; - } - if (m/^\s*(web)?list\s*(.+)/) { - my $html = (defined($1) && ($1 eq "web")); - $main::opt_list = 1; - - my $routine; - my $ignore; - ($routine, $ignore) = ParseInteractiveArgs($2); - - my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); - my $reduced = ReduceProfile($symbols, $profile); - - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); - - PrintListing($total, $libs, $flat, $cumulative, $routine, $html); - return 1; - } - if (m/^\s*disasm\s*(.+)/) { - $main::opt_disasm = 1; - - my $routine; - my $ignore; - ($routine, $ignore) = ParseInteractiveArgs($1); - - # Process current profile to account for various settings - my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); - my $reduced = ReduceProfile($symbols, $profile); - - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); - - PrintDisassembly($libs, $flat, $cumulative, $routine); - return 1; - } - if (m/^\s*(gv|web|evince)\s*(.*)/) { - $main::opt_gv = 0; - $main::opt_evince = 0; - $main::opt_web = 0; - if ($1 eq "gv") { - $main::opt_gv = 1; - } elsif ($1 eq "evince") { - $main::opt_evince = 1; - } elsif ($1 eq "web") { - $main::opt_web = 1; - } - - my $focus; - my $ignore; - ($focus, $ignore) = ParseInteractiveArgs($2); - - # Process current profile to account for various settings - my $profile = ProcessProfile($total, $orig_profile, $symbols, - $focus, $ignore); - my $reduced = ReduceProfile($symbols, $profile); - - # Get derived profiles - my $flat = FlatProfile($reduced); - my $cumulative = CumulativeProfile($reduced); - - if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { - if ($main::opt_gv) { - RunGV(TempName($main::next_tmpfile, "ps"), " &"); - } elsif ($main::opt_evince) { - RunEvince(TempName($main::next_tmpfile, "pdf"), " &"); - } elsif ($main::opt_web) { - RunWeb(TempName($main::next_tmpfile, "svg")); - } - $main::next_tmpfile++; - } - return 1; - } - if (m/^\s*$/) { - return 1; - } - print STDERR "Unknown command: try 'help'.\n"; - return 1; -} - - -sub ProcessProfile { - my $total_count = shift; - my $orig_profile = shift; - my $symbols = shift; - my $focus = shift; - my $ignore = shift; - - # Process current profile to account for various settings - my $profile = $orig_profile; - printf("Total: %s %s\n", Unparse($total_count), Units()); - if ($focus ne '') { - $profile = FocusProfile($symbols, $profile, $focus); - my $focus_count = TotalProfile($profile); - printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n", - $focus, - Unparse($focus_count), Units(), - Unparse($total_count), ($focus_count*100.0) / $total_count); - } - if ($ignore ne '') { - 
$profile = IgnoreProfile($symbols, $profile, $ignore); - my $ignore_count = TotalProfile($profile); - printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n", - $ignore, - Unparse($ignore_count), Units(), - Unparse($total_count), - ($ignore_count*100.0) / $total_count); - } - - return $profile; -} - -sub InteractiveHelpMessage { - print STDERR <{$k}; - my @addrs = split(/\n/, $k); - if ($#addrs >= 0) { - my $depth = $#addrs + 1; - # int(foo / 2**32) is the only reliable way to get rid of bottom - # 32 bits on both 32- and 64-bit systems. - print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32)); - print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32)); - - foreach my $full_addr (@addrs) { - my $addr = $full_addr; - $addr =~ s/0x0*//; # strip off leading 0x, zeroes - if (length($addr) > 16) { - print STDERR "Invalid address in profile: $full_addr\n"; - next; - } - my $low_addr = substr($addr, -8); # get last 8 hex chars - my $high_addr = substr($addr, -16, 8); # get up to 8 more hex chars - print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr)); - } - } - } -} - -# Print symbols and profile data -sub PrintSymbolizedProfile { - my $symbols = shift; - my $profile = shift; - my $prog = shift; - - $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $symbol_marker = $&; - - print '--- ', $symbol_marker, "\n"; - if (defined($prog)) { - print 'binary=', $prog, "\n"; - } - while (my ($pc, $name) = each(%{$symbols})) { - my $sep = ' '; - print '0x', $pc; - # We have a list of function names, which include the inlined - # calls. They are separated (and terminated) by --, which is - # illegal in function names. - for (my $j = 2; $j <= $#{$name}; $j += 3) { - print $sep, $name->[$j]; - $sep = '--'; - } - print "\n"; - } - print '---', "\n"; - - $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $profile_marker = $&; - print '--- ', $profile_marker, "\n"; - if (defined($main::collected_profile)) { - # if used with remote fetch, simply dump the collected profile to output. - open(SRC, "<$main::collected_profile"); - while () { - print $_; - } - close(SRC); - } else { - # dump a cpu-format profile to standard out - PrintProfileData($profile); - } -} - -# Print text output -sub PrintText { - my $symbols = shift; - my $flat = shift; - my $cumulative = shift; - my $line_limit = shift; - - my $total = TotalProfile($flat); - - # Which profile to sort by? - my $s = $main::opt_cum ? $cumulative : $flat; - - my $running_sum = 0; - my $lines = 0; - foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b } - keys(%{$cumulative})) { - my $f = GetEntry($flat, $k); - my $c = GetEntry($cumulative, $k); - $running_sum += $f; - - my $sym = $k; - if (exists($symbols->{$k})) { - $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1]; - if ($main::opt_addresses) { - $sym = $k . " " . $sym; - } - } - - if ($f != 0 || $c != 0) { - printf("%8s %6s %6s %8s %6s %s\n", - Unparse($f), - Percent($f, $total), - Percent($running_sum, $total), - Unparse($c), - Percent($c, $total), - $sym); - } - $lines++; - last if ($line_limit >= 0 && $lines >= $line_limit); - } -} - -# Callgrind format has a compression for repeated function and file -# names. You show the name the first time, and just use its number -# subsequently. This can cut down the file to about a third or a -# quarter of its uncompressed size. $key and $val are the key/value -# pair that would normally be printed by callgrind; $map is a map from -# value to number. 
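To make the name compression concrete, here is a hedged usage sketch of the CompressedCGName helper defined just below; the file name is made up:

  my %filename_to_index_map;
  print CompressedCGName("fl", "/home/user/project/foo.cc", \%filename_to_index_map);
  # -> "fl=(1) /home/user/project/foo.cc\n"   (first use: index plus value)
  print CompressedCGName("fl", "/home/user/project/foo.cc", \%filename_to_index_map);
  # -> "fl=(1)\n"                             (later uses: index only)
  print CompressedCGName("fl", "a.c", \%filename_to_index_map);
  # -> "fl=a.c\n"                             (values of 3 chars or fewer are never indexed)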
-sub CompressedCGName { - my($key, $val, $map) = @_; - my $idx = $map->{$val}; - # For very short keys, providing an index hurts rather than helps. - if (length($val) <= 3) { - return "$key=$val\n"; - } elsif (defined($idx)) { - return "$key=($idx)\n"; - } else { - # scalar(keys $map) gives the number of items in the map. - $idx = scalar(keys(%{$map})) + 1; - $map->{$val} = $idx; - return "$key=($idx) $val\n"; - } -} - -# Print the call graph in a way that's suiteable for callgrind. -sub PrintCallgrind { - my $calls = shift; - my $filename; - my %filename_to_index_map; - my %fnname_to_index_map; - - if ($main::opt_interactive) { - $filename = shift; - print STDERR "Writing callgrind file to '$filename'.\n" - } else { - $filename = "&STDOUT"; - } - open(CG, ">$filename"); - printf CG ("events: Hits\n\n"); - foreach my $call ( map { $_->[0] } - sort { $a->[1] cmp $b ->[1] || - $a->[2] <=> $b->[2] } - map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; - [$_, $1, $2] } - keys %$calls ) { - my $count = int($calls->{$call}); - $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; - my ( $caller_file, $caller_line, $caller_function, - $callee_file, $callee_line, $callee_function ) = - ( $1, $2, $3, $5, $6, $7 ); - - # TODO(csilvers): for better compression, collect all the - # caller/callee_files and functions first, before printing - # anything, and only compress those referenced more than once. - printf CG CompressedCGName("fl", $caller_file, \%filename_to_index_map); - printf CG CompressedCGName("fn", $caller_function, \%fnname_to_index_map); - if (defined $6) { - printf CG CompressedCGName("cfl", $callee_file, \%filename_to_index_map); - printf CG CompressedCGName("cfn", $callee_function, \%fnname_to_index_map); - printf CG ("calls=$count $callee_line\n"); - } - printf CG ("$caller_line $count\n\n"); - } -} - -# Print disassembly for all all routines that match $main::opt_disasm -sub PrintDisassembly { - my $libs = shift; - my $flat = shift; - my $cumulative = shift; - my $disasm_opts = shift; - - my $total = TotalProfile($flat); - - foreach my $lib (@{$libs}) { - my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts); - my $offset = AddressSub($lib->[1], $lib->[3]); - foreach my $routine (sort ByName keys(%{$symbol_table})) { - my $start_addr = $symbol_table->{$routine}->[0]; - my $end_addr = $symbol_table->{$routine}->[1]; - # See if there are any samples in this routine - my $length = hex(AddressSub($end_addr, $start_addr)); - my $addr = AddressAdd($start_addr, $offset); - for (my $i = 0; $i < $length; $i++) { - if (defined($cumulative->{$addr})) { - PrintDisassembledFunction($lib->[0], $offset, - $routine, $flat, $cumulative, - $start_addr, $end_addr, $total); - last; - } - $addr = AddressInc($addr); - } - } - } -} - -# Return reference to array of tuples of the form: -# [start_address, filename, linenumber, instruction, limit_address] -# E.g., -# ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"] -sub Disassemble { - my $prog = shift; - my $offset = shift; - my $start_addr = shift; - my $end_addr = shift; - - my $objdump = $obj_tool_map{"objdump"}; - my $cmd = ShellEscape($objdump, "-C", "-d", "-l", "--no-show-raw-insn", - "--start-address=0x$start_addr", - "--stop-address=0x$end_addr", $prog); - open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); - my @result = (); - my $filename = ""; - my $linenumber = -1; - my $last = ["", "", "", ""]; - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - chop; - if (m|\s*([^:\s]+):(\d+)\s*$|) { - # 
Location line of the form: - # : - $filename = $1; - $linenumber = $2; - } elsif (m/^ +([0-9a-f]+):\s*(.*)/) { - # Disassembly line -- zero-extend address to full length - my $addr = HexExtend($1); - my $k = AddressAdd($addr, $offset); - $last->[4] = $k; # Store ending address for previous instruction - $last = [$k, $filename, $linenumber, $2, $end_addr]; - push(@result, $last); - } - } - close(OBJDUMP); - return @result; -} - -# The input file should contain lines of the form /proc/maps-like -# output (same format as expected from the profiles) or that looks -# like hex addresses (like "0xDEADBEEF"). We will parse all -# /proc/maps output, and for all the hex addresses, we will output -# "short" symbol names, one per line, in the same order as the input. -sub PrintSymbols { - my $maps_and_symbols_file = shift; - - # ParseLibraries expects pcs to be in a set. Fine by us... - my @pclist = (); # pcs in sorted order - my $pcs = {}; - my $map = ""; - foreach my $line (<$maps_and_symbols_file>) { - $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines - if ($line =~ /\b(0x[0-9a-f]+)\b/i) { - push(@pclist, HexExtend($1)); - $pcs->{$pclist[-1]} = 1; - } else { - $map .= $line; - } - } - - my $libs = ParseLibraries($main::prog, $map, $pcs); - my $symbols = ExtractSymbols($libs, $pcs); - - foreach my $pc (@pclist) { - # ->[0] is the shortname, ->[2] is the full name - print(($symbols->{$pc}->[0] || "??") . "\n"); - } -} - - -# For sorting functions by name -sub ByName { - return ShortFunctionName($a) cmp ShortFunctionName($b); -} - -# Print source-listing for all all routines that match $list_opts -sub PrintListing { - my $total = shift; - my $libs = shift; - my $flat = shift; - my $cumulative = shift; - my $list_opts = shift; - my $html = shift; - - my $output = \*STDOUT; - my $fname = ""; - - if ($html) { - # Arrange to write the output to a temporary file - $fname = TempName($main::next_tmpfile, "html"); - $main::next_tmpfile++; - if (!open(TEMP, ">$fname")) { - print STDERR "$fname: $!\n"; - return; - } - $output = \*TEMP; - print $output HtmlListingHeader(); - printf $output ("
<div class=\"legend\">%s<br>Total: %s %s</div>
\n", - $main::prog, Unparse($total), Units()); - } - - my $listed = 0; - foreach my $lib (@{$libs}) { - my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts); - my $offset = AddressSub($lib->[1], $lib->[3]); - foreach my $routine (sort ByName keys(%{$symbol_table})) { - # Print if there are any samples in this routine - my $start_addr = $symbol_table->{$routine}->[0]; - my $end_addr = $symbol_table->{$routine}->[1]; - my $length = hex(AddressSub($end_addr, $start_addr)); - my $addr = AddressAdd($start_addr, $offset); - for (my $i = 0; $i < $length; $i++) { - if (defined($cumulative->{$addr})) { - $listed += PrintSource( - $lib->[0], $offset, - $routine, $flat, $cumulative, - $start_addr, $end_addr, - $html, - $output); - last; - } - $addr = AddressInc($addr); - } - } - } - - if ($html) { - if ($listed > 0) { - print $output HtmlListingFooter(); - close($output); - RunWeb($fname); - } else { - close($output); - unlink($fname); - } - } -} - -sub HtmlListingHeader { - return <<'EOF'; - - - -Pprof listing - - - - -EOF -} - -sub HtmlListingFooter { - return <<'EOF'; - - -EOF -} - -sub HtmlEscape { - my $text = shift; - $text =~ s/&/&/g; - $text =~ s//>/g; - return $text; -} - -# Returns the indentation of the line, if it has any non-whitespace -# characters. Otherwise, returns -1. -sub Indentation { - my $line = shift; - if (m/^(\s*)\S/) { - return length($1); - } else { - return -1; - } -} - -# If the symbol table contains inlining info, Disassemble() may tag an -# instruction with a location inside an inlined function. But for -# source listings, we prefer to use the location in the function we -# are listing. So use MapToSymbols() to fetch full location -# information for each instruction and then pick out the first -# location from a location list (location list contains callers before -# callees in case of inlining). 
-# -# After this routine has run, each entry in $instructions contains: -# [0] start address -# [1] filename for function we are listing -# [2] line number for function we are listing -# [3] disassembly -# [4] limit address -# [5] most specific filename (may be different from [1] due to inlining) -# [6] most specific line number (may be different from [2] due to inlining) -sub GetTopLevelLineNumbers { - my ($lib, $offset, $instructions) = @_; - my $pcs = []; - for (my $i = 0; $i <= $#{$instructions}; $i++) { - push(@{$pcs}, $instructions->[$i]->[0]); - } - my $symbols = {}; - MapToSymbols($lib, $offset, $pcs, $symbols); - for (my $i = 0; $i <= $#{$instructions}; $i++) { - my $e = $instructions->[$i]; - push(@{$e}, $e->[1]); - push(@{$e}, $e->[2]); - my $addr = $e->[0]; - my $sym = $symbols->{$addr}; - if (defined($sym)) { - if ($#{$sym} >= 2 && $sym->[1] =~ m/^(.*):(\d+)$/) { - $e->[1] = $1; # File name - $e->[2] = $2; # Line number - } - } - } -} - -# Print source-listing for one routine -sub PrintSource { - my $prog = shift; - my $offset = shift; - my $routine = shift; - my $flat = shift; - my $cumulative = shift; - my $start_addr = shift; - my $end_addr = shift; - my $html = shift; - my $output = shift; - - # Disassemble all instructions (just to get line numbers) - my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); - GetTopLevelLineNumbers($prog, $offset, \@instructions); - - # Hack 1: assume that the first source file encountered in the - # disassembly contains the routine - my $filename = undef; - for (my $i = 0; $i <= $#instructions; $i++) { - if ($instructions[$i]->[2] >= 0) { - $filename = $instructions[$i]->[1]; - last; - } - } - if (!defined($filename)) { - print STDERR "no filename found in $routine\n"; - return 0; - } - - # Hack 2: assume that the largest line number from $filename is the - # end of the procedure. This is typically safe since if P1 contains - # an inlined call to P2, then P2 usually occurs earlier in the - # source file. If this does not work, we might have to compute a - # density profile or just print all regions we find. - my $lastline = 0; - for (my $i = 0; $i <= $#instructions; $i++) { - my $f = $instructions[$i]->[1]; - my $l = $instructions[$i]->[2]; - if (($f eq $filename) && ($l > $lastline)) { - $lastline = $l; - } - } - - # Hack 3: assume the first source location from "filename" is the start of - # the source code. - my $firstline = 1; - for (my $i = 0; $i <= $#instructions; $i++) { - if ($instructions[$i]->[1] eq $filename) { - $firstline = $instructions[$i]->[2]; - last; - } - } - - # Hack 4: Extend last line forward until its indentation is less than - # the indentation we saw on $firstline - my $oldlastline = $lastline; - { - if (!open(FILE, "<$filename")) { - print STDERR "$filename: $!\n"; - return 0; - } - my $l = 0; - my $first_indentation = -1; - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - $l++; - my $indent = Indentation($_); - if ($l >= $firstline) { - if ($first_indentation < 0 && $indent >= 0) { - $first_indentation = $indent; - last if ($first_indentation == 0); - } - } - if ($l >= $lastline && $indent >= 0) { - if ($indent >= $first_indentation) { - $lastline = $l+1; - } else { - last; - } - } - } - close(FILE); - } - - # Assign all samples to the range $firstline,$lastline, - # Hack 4: If an instruction does not occur in the range, its samples - # are moved to the next instruction that occurs in the range. 
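The carry-forward rule just described can be illustrated with a tiny hedged sketch (line numbers and counts are made up): samples of instructions whose source line falls outside [firstline, lastline] accumulate in a running total and are credited to the next in-range line.

  use strict;
  use warnings;

  my @inst = ([12, 2], [999, 5], [13, 1]);  # [source_line, count]; 999 is out of range
  my ($firstline, $lastline) = (10, 20);
  my %per_line;
  my $running = 0;
  for my $e (@inst) {
      my ($line, $count) = @{$e};
      $running += $count;
      next unless $line >= $firstline && $line <= $lastline;
      $per_line{$line} = ($per_line{$line} // 0) + $running;
      $running = 0;
  }
  # %per_line ends up as (12 => 2, 13 => 6): the 5 out-of-range samples were
  # carried forward onto line 13, as the comment above describes.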
- my $samples1 = {}; # Map from line number to flat count - my $samples2 = {}; # Map from line number to cumulative count - my $running1 = 0; # Unassigned flat counts - my $running2 = 0; # Unassigned cumulative counts - my $total1 = 0; # Total flat counts - my $total2 = 0; # Total cumulative counts - my %disasm = (); # Map from line number to disassembly - my $running_disasm = ""; # Unassigned disassembly - my $skip_marker = "---\n"; - if ($html) { - $skip_marker = ""; - for (my $l = $firstline; $l <= $lastline; $l++) { - $disasm{$l} = ""; - } - } - my $last_dis_filename = ''; - my $last_dis_linenum = -1; - my $last_touched_line = -1; # To detect gaps in disassembly for a line - foreach my $e (@instructions) { - # Add up counts for all address that fall inside this instruction - my $c1 = 0; - my $c2 = 0; - for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { - $c1 += GetEntry($flat, $a); - $c2 += GetEntry($cumulative, $a); - } - - if ($html) { - my $dis = sprintf(" %6s %6s \t\t%8s: %s ", - HtmlPrintNumber($c1), - HtmlPrintNumber($c2), - UnparseAddress($offset, $e->[0]), - CleanDisassembly($e->[3])); - - # Append the most specific source line associated with this instruction - if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) }; - $dis = HtmlEscape($dis); - my $f = $e->[5]; - my $l = $e->[6]; - if ($f ne $last_dis_filename) { - $dis .= sprintf("%s:%d", - HtmlEscape(CleanFileName($f)), $l); - } elsif ($l ne $last_dis_linenum) { - # De-emphasize the unchanged file name portion - $dis .= sprintf("%s" . - ":%d", - HtmlEscape(CleanFileName($f)), $l); - } else { - # De-emphasize the entire location - $dis .= sprintf("%s:%d", - HtmlEscape(CleanFileName($f)), $l); - } - $last_dis_filename = $f; - $last_dis_linenum = $l; - $running_disasm .= $dis; - $running_disasm .= "\n"; - } - - $running1 += $c1; - $running2 += $c2; - $total1 += $c1; - $total2 += $c2; - my $file = $e->[1]; - my $line = $e->[2]; - if (($file eq $filename) && - ($line >= $firstline) && - ($line <= $lastline)) { - # Assign all accumulated samples to this line - AddEntry($samples1, $line, $running1); - AddEntry($samples2, $line, $running2); - $running1 = 0; - $running2 = 0; - if ($html) { - if ($line != $last_touched_line && $disasm{$line} ne '') { - $disasm{$line} .= "\n"; - } - $disasm{$line} .= $running_disasm; - $running_disasm = ''; - $last_touched_line = $line; - } - } - } - - # Assign any leftover samples to $lastline - AddEntry($samples1, $lastline, $running1); - AddEntry($samples2, $lastline, $running2); - if ($html) { - if ($lastline != $last_touched_line && $disasm{$lastline} ne '') { - $disasm{$lastline} .= "\n"; - } - $disasm{$lastline} .= $running_disasm; - } - - if ($html) { - printf $output ( - "
<h1>%s</h1>%s\n<pre onClick=\"pprof_toggle_asm()\">\n" .
-      "Total:%6s %6s (flat / cumulative %s)\n",
-      HtmlEscape(ShortFunctionName($routine)),
-      HtmlEscape(CleanFileName($filename)),
-      Unparse($total1),
-      Unparse($total2),
-      Units());
-  } else {
-    printf $output (
-      "ROUTINE ====================== %s in %s\n" .
-      "%6s %6s Total %s (flat / cumulative)\n",
-      ShortFunctionName($routine),
-      CleanFileName($filename),
-      Unparse($total1),
-      Unparse($total2),
-      Units());
-  }
-  if (!open(FILE, "<$filename")) {
-    print STDERR "$filename: $!\n";
-    return 0;
-  }
-  my $l = 0;
-  while (<FILE>) {
-    s/\r//g;         # turn windows-looking lines into unix-looking lines
-    $l++;
-    if ($l >= $firstline - 5 &&
-        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
-      chop;
-      my $text = $_;
-      if ($l == $firstline) { print $output $skip_marker; }
-      my $n1 = GetEntry($samples1, $l);
-      my $n2 = GetEntry($samples2, $l);
-      if ($html) {
-        # Emit a span that has one of the following classes:
-        #    livesrc -- has samples
-        #    deadsrc -- has disassembly, but with no samples
-        #    nop     -- has no matching disassembly
-        # Also emit an optional span containing disassembly.
-        my $dis = $disasm{$l};
-        my $asm = "";
-        if (defined($dis) && $dis ne '') {
-          $asm = "" . $dis . "";
-        }
-        my $source_class = (($n1 + $n2 > 0)
-                            ? "livesrc"
-                            : (($asm ne "") ? "deadsrc" : "nop"));
-        printf $output (
-          "%5d " .
-          "%6s %6s %s%s\n",
-          $l, $source_class,
-          HtmlPrintNumber($n1),
-          HtmlPrintNumber($n2),
-          HtmlEscape($text),
-          $asm);
-      } else {
-        printf $output(
-          "%6s %6s %4d: %s\n",
-          UnparseAlt($n1),
-          UnparseAlt($n2),
-          $l,
-          $text);
-      }
-      if ($l == $lastline)  { print $output $skip_marker; }
-    };
-  }
-  close(FILE);
-  if ($html) {
-    print $output "
\n"; - } - return 1; -} - -# Return the source line for the specified file/linenumber. -# Returns undef if not found. -sub SourceLine { - my $file = shift; - my $line = shift; - - # Look in cache - if (!defined($main::source_cache{$file})) { - if (100 < scalar keys(%main::source_cache)) { - # Clear the cache when it gets too big - $main::source_cache = (); - } - - # Read all lines from the file - if (!open(FILE, "<$file")) { - print STDERR "$file: $!\n"; - $main::source_cache{$file} = []; # Cache the negative result - return undef; - } - my $lines = []; - push(@{$lines}, ""); # So we can use 1-based line numbers as indices - while () { - push(@{$lines}, $_); - } - close(FILE); - - # Save the lines in the cache - $main::source_cache{$file} = $lines; - } - - my $lines = $main::source_cache{$file}; - if (($line < 0) || ($line > $#{$lines})) { - return undef; - } else { - return $lines->[$line]; - } -} - -# Print disassembly for one routine with interspersed source if available -sub PrintDisassembledFunction { - my $prog = shift; - my $offset = shift; - my $routine = shift; - my $flat = shift; - my $cumulative = shift; - my $start_addr = shift; - my $end_addr = shift; - my $total = shift; - - # Disassemble all instructions - my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); - - # Make array of counts per instruction - my @flat_count = (); - my @cum_count = (); - my $flat_total = 0; - my $cum_total = 0; - foreach my $e (@instructions) { - # Add up counts for all address that fall inside this instruction - my $c1 = 0; - my $c2 = 0; - for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { - $c1 += GetEntry($flat, $a); - $c2 += GetEntry($cumulative, $a); - } - push(@flat_count, $c1); - push(@cum_count, $c2); - $flat_total += $c1; - $cum_total += $c2; - } - - # Print header with total counts - printf("ROUTINE ====================== %s\n" . - "%6s %6s %s (flat, cumulative) %.1f%% of total\n", - ShortFunctionName($routine), - Unparse($flat_total), - Unparse($cum_total), - Units(), - ($cum_total * 100.0) / $total); - - # Process instructions in order - my $current_file = ""; - for (my $i = 0; $i <= $#instructions; ) { - my $e = $instructions[$i]; - - # Print the new file name whenever we switch files - if ($e->[1] ne $current_file) { - $current_file = $e->[1]; - my $fname = $current_file; - $fname =~ s|^\./||; # Trim leading "./" - - # Shorten long file names - if (length($fname) >= 58) { - $fname = "..." . substr($fname, -55); - } - printf("-------------------- %s\n", $fname); - } - - # TODO: Compute range of lines to print together to deal with - # small reorderings. 
- my $first_line = $e->[2]; - my $last_line = $first_line; - my %flat_sum = (); - my %cum_sum = (); - for (my $l = $first_line; $l <= $last_line; $l++) { - $flat_sum{$l} = 0; - $cum_sum{$l} = 0; - } - - # Find run of instructions for this range of source lines - my $first_inst = $i; - while (($i <= $#instructions) && - ($instructions[$i]->[2] >= $first_line) && - ($instructions[$i]->[2] <= $last_line)) { - $e = $instructions[$i]; - $flat_sum{$e->[2]} += $flat_count[$i]; - $cum_sum{$e->[2]} += $cum_count[$i]; - $i++; - } - my $last_inst = $i - 1; - - # Print source lines - for (my $l = $first_line; $l <= $last_line; $l++) { - my $line = SourceLine($current_file, $l); - if (!defined($line)) { - $line = "?\n"; - next; - } else { - $line =~ s/^\s+//; - } - printf("%6s %6s %5d: %s", - UnparseAlt($flat_sum{$l}), - UnparseAlt($cum_sum{$l}), - $l, - $line); - } - - # Print disassembly - for (my $x = $first_inst; $x <= $last_inst; $x++) { - my $e = $instructions[$x]; - printf("%6s %6s %8s: %6s\n", - UnparseAlt($flat_count[$x]), - UnparseAlt($cum_count[$x]), - UnparseAddress($offset, $e->[0]), - CleanDisassembly($e->[3])); - } - } -} - -# Print DOT graph -sub PrintDot { - my $prog = shift; - my $symbols = shift; - my $raw = shift; - my $flat = shift; - my $cumulative = shift; - my $overall_total = shift; - - # Get total - my $local_total = TotalProfile($flat); - my $nodelimit = int($main::opt_nodefraction * $local_total); - my $edgelimit = int($main::opt_edgefraction * $local_total); - my $nodecount = $main::opt_nodecount; - - # Find nodes to include - my @list = (sort { abs(GetEntry($cumulative, $b)) <=> - abs(GetEntry($cumulative, $a)) - || $a cmp $b } - keys(%{$cumulative})); - my $last = $nodecount - 1; - if ($last > $#list) { - $last = $#list; - } - while (($last >= 0) && - (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) { - $last--; - } - if ($last < 0) { - print STDERR "No nodes to print\n"; - return 0; - } - - if ($nodelimit > 0 || $edgelimit > 0) { - printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n", - Unparse($nodelimit), Units(), - Unparse($edgelimit), Units()); - } - - # Open DOT output file - my $output; - my $escaped_dot = ShellEscape(@DOT); - my $escaped_ps2pdf = ShellEscape(@PS2PDF); - if ($main::opt_gv) { - my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "ps")); - $output = "| $escaped_dot -Tps2 >$escaped_outfile"; - } elsif ($main::opt_evince) { - my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "pdf")); - $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - $escaped_outfile"; - } elsif ($main::opt_ps) { - $output = "| $escaped_dot -Tps2"; - } elsif ($main::opt_pdf) { - $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - -"; - } elsif ($main::opt_web || $main::opt_svg) { - # We need to post-process the SVG, so write to a temporary file always. - my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "svg")); - $output = "| $escaped_dot -Tsvg >$escaped_outfile"; - } elsif ($main::opt_gif) { - $output = "| $escaped_dot -Tgif"; - } else { - $output = ">&STDOUT"; - } - open(DOT, $output) || error("$output: $!\n"); - - # Title - printf DOT ("digraph \"%s; %s %s\" {\n", - $prog, - Unparse($overall_total), - Units()); - if ($main::opt_pdf) { - # The output is more printable if we set the page size for dot. - printf DOT ("size=\"8,11\"\n"); - } - printf DOT ("node [width=0.375,height=0.25];\n"); - - # Print legend - printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," . 
- "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n", - $prog, - sprintf("Total %s: %s", Units(), Unparse($overall_total)), - sprintf("Focusing on: %s", Unparse($local_total)), - sprintf("Dropped nodes with <= %s abs(%s)", - Unparse($nodelimit), Units()), - sprintf("Dropped edges with <= %s %s", - Unparse($edgelimit), Units()) - ); - - # Print nodes - my %node = (); - my $nextnode = 1; - foreach my $a (@list[0..$last]) { - # Pick font size - my $f = GetEntry($flat, $a); - my $c = GetEntry($cumulative, $a); - - my $fs = 8; - if ($local_total > 0) { - $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total))); - } - - $node{$a} = $nextnode++; - my $sym = $a; - $sym =~ s/\s+/\\n/g; - $sym =~ s/::/\\n/g; - - # Extra cumulative info to print for non-leaves - my $extra = ""; - if ($f != $c) { - $extra = sprintf("\\rof %s (%s)", - Unparse($c), - Percent($c, $local_total)); - } - my $style = ""; - if ($main::opt_heapcheck) { - if ($f > 0) { - # make leak-causing nodes more visible (add a background) - $style = ",style=filled,fillcolor=gray" - } elsif ($f < 0) { - # make anti-leak-causing nodes (which almost never occur) - # stand out as well (triple border) - $style = ",peripheries=3" - } - } - - printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" . - "\",shape=box,fontsize=%.1f%s];\n", - $node{$a}, - $sym, - Unparse($f), - Percent($f, $local_total), - $extra, - $fs, - $style, - ); - } - - # Get edges and counts per edge - my %edge = (); - my $n; - my $fullname_to_shortname_map = {}; - FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); - foreach my $k (keys(%{$raw})) { - # TODO: omit low %age edges - $n = $raw->{$k}; - my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); - for (my $i = 1; $i <= $#translated; $i++) { - my $src = $translated[$i]; - my $dst = $translated[$i-1]; - #next if ($src eq $dst); # Avoid self-edges? - if (exists($node{$src}) && exists($node{$dst})) { - my $edge_label = "$src\001$dst"; - if (!exists($edge{$edge_label})) { - $edge{$edge_label} = 0; - } - $edge{$edge_label} += $n; - } - } - } - - # Print edges (process in order of decreasing counts) - my %indegree = (); # Number of incoming edges added per node so far - my %outdegree = (); # Number of outgoing edges added per node so far - foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) { - my @x = split(/\001/, $e); - $n = $edge{$e}; - - # Initialize degree of kept incoming and outgoing edges if necessary - my $src = $x[0]; - my $dst = $x[1]; - if (!exists($outdegree{$src})) { $outdegree{$src} = 0; } - if (!exists($indegree{$dst})) { $indegree{$dst} = 0; } - - my $keep; - if ($indegree{$dst} == 0) { - # Keep edge if needed for reachability - $keep = 1; - } elsif (abs($n) <= $edgelimit) { - # Drop if we are below --edgefraction - $keep = 0; - } elsif ($outdegree{$src} >= $main::opt_maxdegree || - $indegree{$dst} >= $main::opt_maxdegree) { - # Keep limited number of in/out edges per node - $keep = 0; - } else { - $keep = 1; - } - - if ($keep) { - $outdegree{$src}++; - $indegree{$dst}++; - - # Compute line width based on edge count - my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0); - if ($fraction > 1) { $fraction = 1; } - my $w = $fraction * 2; - if ($w < 1 && ($main::opt_web || $main::opt_svg)) { - # SVG output treats line widths < 1 poorly. 
- $w = 1; - } - - # Dot sometimes segfaults if given edge weights that are too large, so - # we cap the weights at a large value - my $edgeweight = abs($n) ** 0.7; - if ($edgeweight > 100000) { $edgeweight = 100000; } - $edgeweight = int($edgeweight); - - my $style = sprintf("setlinewidth(%f)", $w); - if ($x[1] =~ m/\(inline\)/) { - $style .= ",dashed"; - } - - # Use a slightly squashed function of the edge count as the weight - printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n", - $node{$x[0]}, - $node{$x[1]}, - Unparse($n), - $edgeweight, - $style); - } - } - - print DOT ("}\n"); - close(DOT); - - if ($main::opt_web || $main::opt_svg) { - # Rewrite SVG to be more usable inside web browser. - RewriteSvg(TempName($main::next_tmpfile, "svg")); - } - - return 1; -} - -sub RewriteSvg { - my $svgfile = shift; - - open(SVG, $svgfile) || die "open temp svg: $!"; - my @svg = ; - close(SVG); - unlink $svgfile; - my $svg = join('', @svg); - - # Dot's SVG output is - # - # - # - # ... - # - # - # - # Change it to - # - # - # $svg_javascript - # - # - # ... - # - # - # - - # Fix width, height; drop viewBox. - $svg =~ s/(?s) above first - my $svg_javascript = SvgJavascript(); - my $viewport = "\n"; - $svg =~ s/ above . - $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/; - $svg =~ s/$svgfile") || die "open $svgfile: $!"; - print SVG $svg; - close(SVG); - } -} - -sub SvgJavascript { - return <<'EOF'; - -EOF -} - -# Provides a map from fullname to shortname for cases where the -# shortname is ambiguous. The symlist has both the fullname and -# shortname for all symbols, which is usually fine, but sometimes -- -# such as overloaded functions -- two different fullnames can map to -# the same shortname. In that case, we use the address of the -# function to disambiguate the two. This function fills in a map that -# maps fullnames to modified shortnames in such cases. If a fullname -# is not present in the map, the 'normal' shortname provided by the -# symlist is the appropriate one to use. -sub FillFullnameToShortnameMap { - my $symbols = shift; - my $fullname_to_shortname_map = shift; - my $shortnames_seen_once = {}; - my $shortnames_seen_more_than_once = {}; - - foreach my $symlist (values(%{$symbols})) { - # TODO(csilvers): deal with inlined symbols too. - my $shortname = $symlist->[0]; - my $fullname = $symlist->[2]; - if ($fullname !~ /<[0-9a-fA-F]+>$/) { # fullname doesn't end in an address - next; # the only collisions we care about are when addresses differ - } - if (defined($shortnames_seen_once->{$shortname}) && - $shortnames_seen_once->{$shortname} ne $fullname) { - $shortnames_seen_more_than_once->{$shortname} = 1; - } else { - $shortnames_seen_once->{$shortname} = $fullname; - } - } - - foreach my $symlist (values(%{$symbols})) { - my $shortname = $symlist->[0]; - my $fullname = $symlist->[2]; - # TODO(csilvers): take in a list of addresses we care about, and only - # store in the map if $symlist->[1] is in that list. Saves space. - next if defined($fullname_to_shortname_map->{$fullname}); - if (defined($shortnames_seen_more_than_once->{$shortname})) { - if ($fullname =~ /<0*([^>]*)>$/) { # fullname has address at end of it - $fullname_to_shortname_map->{$fullname} = "$shortname\@$1"; - } - } - } -} - -# Return a small number that identifies the argument. -# Multiple calls with the same argument will return the same number. -# Calls with different arguments will return different numbers. 
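A short usage sketch of the ShortIdFor helper defined just below (keys are made up): equal arguments yield the same small id, new arguments get the next one.

  # Hedged sketch; assumes ShortIdFor as defined below and an initially
  # empty %main::uniqueid map.
  my $id_a  = ShortIdFor("0x400100");   # first key seen  -> 1
  my $id_b  = ShortIdFor("0x400200");   # new key         -> 2
  my $id_a2 = ShortIdFor("0x400100");   # repeat key      -> same id as $id_a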
-sub ShortIdFor { - my $key = shift; - my $id = $main::uniqueid{$key}; - if (!defined($id)) { - $id = keys(%main::uniqueid) + 1; - $main::uniqueid{$key} = $id; - } - return $id; -} - -# Translate a stack of addresses into a stack of symbols -sub TranslateStack { - my $symbols = shift; - my $fullname_to_shortname_map = shift; - my $k = shift; - - my @addrs = split(/\n/, $k); - my @result = (); - for (my $i = 0; $i <= $#addrs; $i++) { - my $a = $addrs[$i]; - - # Skip large addresses since they sometimes show up as fake entries on RH9 - if (length($a) > 8 && $a gt "7fffffffffffffff") { - next; - } - - if ($main::opt_disasm || $main::opt_list) { - # We want just the address for the key - push(@result, $a); - next; - } - - my $symlist = $symbols->{$a}; - if (!defined($symlist)) { - $symlist = [$a, "", $a]; - } - - # We can have a sequence of symbols for a particular entry - # (more than one symbol in the case of inlining). Callers - # come before callees in symlist, so walk backwards since - # the translated stack should contain callees before callers. - for (my $j = $#{$symlist}; $j >= 2; $j -= 3) { - my $func = $symlist->[$j-2]; - my $fileline = $symlist->[$j-1]; - my $fullfunc = $symlist->[$j]; - if (defined($fullname_to_shortname_map->{$fullfunc})) { - $func = $fullname_to_shortname_map->{$fullfunc}; - } - if ($j > 2) { - $func = "$func (inline)"; - } - - # Do not merge nodes corresponding to Callback::Run since that - # causes confusing cycles in dot display. Instead, we synthesize - # a unique name for this frame per caller. - if ($func =~ m/Callback.*::Run$/) { - my $caller = ($i > 0) ? $addrs[$i-1] : 0; - $func = "Run#" . ShortIdFor($caller); - } - - if ($main::opt_addresses) { - push(@result, "$a $func $fileline"); - } elsif ($main::opt_lines) { - if ($func eq '??' && $fileline eq '??:0') { - push(@result, "$a"); - } else { - push(@result, "$func $fileline"); - } - } elsif ($main::opt_functions) { - if ($func eq '??') { - push(@result, "$a"); - } else { - push(@result, $func); - } - } elsif ($main::opt_files) { - if ($fileline eq '??:0' || $fileline eq '') { - push(@result, "$a"); - } else { - my $f = $fileline; - $f =~ s/:\d+$//; - push(@result, $f); - } - } else { - push(@result, $a); - last; # Do not print inlined info - } - } - } - - # print join(",", @addrs), " => ", join(",", @result), "\n"; - return @result; -} - -# Generate percent string for a number and a total -sub Percent { - my $num = shift; - my $tot = shift; - if ($tot != 0) { - return sprintf("%.1f%%", $num * 100.0 / $tot); - } else { - return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf"); - } -} - -# Generate pretty-printed form of number -sub Unparse { - my $num = shift; - if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { - if ($main::opt_inuse_objects || $main::opt_alloc_objects) { - return sprintf("%d", $num); - } else { - if ($main::opt_show_bytes) { - return sprintf("%d", $num); - } else { - return sprintf("%.1f", $num / 1048576.0); - } - } - } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { - return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds - } else { - return sprintf("%d", $num); - } -} - -# Alternate pretty-printed form: 0 maps to "." 
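To make the unit handling in Unparse() and Percent() above concrete, a small hedged sketch; the globals are set only for the sake of the example:

  # Assumes the subs above; with a 'heap' profile and no object/byte flags,
  # Unparse() renders bytes as megabytes and Percent() formats a ratio.
  $main::profile_type      = 'heap';
  $main::opt_inuse_objects = 0;
  $main::opt_alloc_objects = 0;
  $main::opt_show_bytes    = 0;
  print Unparse(3 * 1048576), "\n";   # "3.0"  (MB)
  print Percent(25, 200), "\n";       # "12.5%"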
-sub UnparseAlt { - my $num = shift; - if ($num == 0) { - return "."; - } else { - return Unparse($num); - } -} - -# Alternate pretty-printed form: 0 maps to "" -sub HtmlPrintNumber { - my $num = shift; - if ($num == 0) { - return ""; - } else { - return Unparse($num); - } -} - -# Return output units -sub Units { - if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { - if ($main::opt_inuse_objects || $main::opt_alloc_objects) { - return "objects"; - } else { - if ($main::opt_show_bytes) { - return "B"; - } else { - return "MB"; - } - } - } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { - return "seconds"; - } else { - return "samples"; - } -} - -##### Profile manipulation code ##### - -# Generate flattened profile: -# If count is charged to stack [a,b,c,d], in generated profile, -# it will be charged to [a] -sub FlatProfile { - my $profile = shift; - my $result = {}; - foreach my $k (keys(%{$profile})) { - my $count = $profile->{$k}; - my @addrs = split(/\n/, $k); - if ($#addrs >= 0) { - AddEntry($result, $addrs[0], $count); - } - } - return $result; -} - -# Generate cumulative profile: -# If count is charged to stack [a,b,c,d], in generated profile, -# it will be charged to [a], [b], [c], [d] -sub CumulativeProfile { - my $profile = shift; - my $result = {}; - foreach my $k (keys(%{$profile})) { - my $count = $profile->{$k}; - my @addrs = split(/\n/, $k); - foreach my $a (@addrs) { - AddEntry($result, $a, $count); - } - } - return $result; -} - -# If the second-youngest PC on the stack is always the same, returns -# that pc. Otherwise, returns undef. -sub IsSecondPcAlwaysTheSame { - my $profile = shift; - - my $second_pc = undef; - foreach my $k (keys(%{$profile})) { - my @addrs = split(/\n/, $k); - if ($#addrs < 1) { - return undef; - } - if (not defined $second_pc) { - $second_pc = $addrs[1]; - } else { - if ($second_pc ne $addrs[1]) { - return undef; - } - } - } - return $second_pc; -} - -sub ExtractSymbolLocation { - my $symbols = shift; - my $address = shift; - # 'addr2line' outputs "??:0" for unknown locations; we do the - # same to be consistent. - my $location = "??:0:unknown"; - if (exists $symbols->{$address}) { - my $file = $symbols->{$address}->[1]; - if ($file eq "?") { - $file = "??:0" - } - $location = $file . ":" . $symbols->{$address}->[0]; - } - return $location; -} - -# Extracts a graph of calls. 
-sub ExtractCalls { - my $symbols = shift; - my $profile = shift; - - my $calls = {}; - while( my ($stack_trace, $count) = each %$profile ) { - my @address = split(/\n/, $stack_trace); - my $destination = ExtractSymbolLocation($symbols, $address[0]); - AddEntry($calls, $destination, $count); - for (my $i = 1; $i <= $#address; $i++) { - my $source = ExtractSymbolLocation($symbols, $address[$i]); - my $call = "$source -> $destination"; - AddEntry($calls, $call, $count); - $destination = $source; - } - } - - return $calls; -} - -sub RemoveUninterestingFrames { - my $symbols = shift; - my $profile = shift; - - # List of function names to skip - my %skip = (); - my $skip_regexp = 'NOMATCH'; - if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { - foreach my $name ('calloc', - 'cfree', - 'malloc', - 'free', - 'memalign', - 'posix_memalign', - 'aligned_alloc', - 'pvalloc', - 'valloc', - 'realloc', - 'mallocx', # jemalloc - 'rallocx', # jemalloc - 'xallocx', # jemalloc - 'dallocx', # jemalloc - 'sdallocx', # jemalloc - 'tc_calloc', - 'tc_cfree', - 'tc_malloc', - 'tc_free', - 'tc_memalign', - 'tc_posix_memalign', - 'tc_pvalloc', - 'tc_valloc', - 'tc_realloc', - 'tc_new', - 'tc_delete', - 'tc_newarray', - 'tc_deletearray', - 'tc_new_nothrow', - 'tc_newarray_nothrow', - 'do_malloc', - '::do_malloc', # new name -- got moved to an unnamed ns - '::do_malloc_or_cpp_alloc', - 'DoSampledAllocation', - 'simple_alloc::allocate', - '__malloc_alloc_template::allocate', - '__builtin_delete', - '__builtin_new', - '__builtin_vec_delete', - '__builtin_vec_new', - 'operator new', - 'operator new[]', - # The entry to our memory-allocation routines on OS X - 'malloc_zone_malloc', - 'malloc_zone_calloc', - 'malloc_zone_valloc', - 'malloc_zone_realloc', - 'malloc_zone_memalign', - 'malloc_zone_free', - # These mark the beginning/end of our custom sections - '__start_google_malloc', - '__stop_google_malloc', - '__start_malloc_hook', - '__stop_malloc_hook') { - $skip{$name} = 1; - $skip{"_" . $name} = 1; # Mach (OS X) adds a _ prefix to everything - } - # TODO: Remove TCMalloc once everything has been - # moved into the tcmalloc:: namespace and we have flushed - # old code out of the system. - $skip_regexp = "TCMalloc|^tcmalloc::"; - } elsif ($main::profile_type eq 'contention') { - foreach my $vname ('base::RecordLockProfileData', - 'base::SubmitMutexProfileData', - 'base::SubmitSpinLockProfileData', - 'Mutex::Unlock', - 'Mutex::UnlockSlow', - 'Mutex::ReaderUnlock', - 'MutexLock::~MutexLock', - 'SpinLock::Unlock', - 'SpinLock::SlowUnlock', - 'SpinLockHolder::~SpinLockHolder') { - $skip{$vname} = 1; - } - } elsif ($main::profile_type eq 'cpu') { - # Drop signal handlers used for CPU profile collection - # TODO(dpeng): this should not be necessary; it's taken - # care of by the general 2nd-pc mechanism below. - foreach my $name ('ProfileData::Add', # historical - 'ProfileData::prof_handler', # historical - 'CpuProfiler::prof_handler', - '__FRAME_END__', - '__pthread_sighandler', - '__restore') { - $skip{$name} = 1; - } - } else { - # Nothing skipped for unknown types - } - - if ($main::profile_type eq 'cpu') { - # If all the second-youngest program counters are the same, - # this STRONGLY suggests that it is an artifact of measurement, - # i.e., stack frames pushed by the CPU profiler signal handler. - # Hence, we delete them. - # (The topmost PC is read from the signal structure, not from - # the stack, so it does not get involved.) 
- while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) { - my $result = {}; - my $func = ''; - if (exists($symbols->{$second_pc})) { - $second_pc = $symbols->{$second_pc}->[0]; - } - print STDERR "Removing $second_pc from all stack traces.\n"; - foreach my $k (keys(%{$profile})) { - my $count = $profile->{$k}; - my @addrs = split(/\n/, $k); - splice @addrs, 1, 1; - my $reduced_path = join("\n", @addrs); - AddEntry($result, $reduced_path, $count); - } - $profile = $result; - } - } - - my $result = {}; - foreach my $k (keys(%{$profile})) { - my $count = $profile->{$k}; - my @addrs = split(/\n/, $k); - my @path = (); - foreach my $a (@addrs) { - if (exists($symbols->{$a})) { - my $func = $symbols->{$a}->[0]; - if ($skip{$func} || ($func =~ m/$skip_regexp/)) { - # Throw away the portion of the backtrace seen so far, under the - # assumption that previous frames were for functions internal to the - # allocator. - @path = (); - next; - } - } - push(@path, $a); - } - my $reduced_path = join("\n", @path); - AddEntry($result, $reduced_path, $count); - } - return $result; -} - -# Reduce profile to granularity given by user -sub ReduceProfile { - my $symbols = shift; - my $profile = shift; - my $result = {}; - my $fullname_to_shortname_map = {}; - FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); - foreach my $k (keys(%{$profile})) { - my $count = $profile->{$k}; - my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); - my @path = (); - my %seen = (); - $seen{''} = 1; # So that empty keys are skipped - foreach my $e (@translated) { - # To avoid double-counting due to recursion, skip a stack-trace - # entry if it has already been seen - if (!$seen{$e}) { - $seen{$e} = 1; - push(@path, $e); - } - } - my $reduced_path = join("\n", @path); - AddEntry($result, $reduced_path, $count); - } - return $result; -} - -# Does the specified symbol array match the regexp? 
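
The recursion guard in ReduceProfile above collapses repeated frames so a count is charged at most once per symbol; a toy translated stack (frame names invented) shows the effect:

  use strict; use warnings;
  my @translated = ('fib', 'fib', 'fib', 'main');
  my (%seen, @path);
  $seen{''} = 1;                    # skip empty entries, as ReduceProfile does
  for my $e (@translated) {
      push @path, $e unless $seen{$e}++;
  }
  print join(" ", @path), "\n";     # "fib main" -- one entry per symbol
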
-sub SymbolMatches { - my $sym = shift; - my $re = shift; - if (defined($sym)) { - for (my $i = 0; $i < $#{$sym}; $i += 3) { - if ($sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/) { - return 1; - } - } - } - return 0; -} - -# Focus only on paths involving specified regexps -sub FocusProfile { - my $symbols = shift; - my $profile = shift; - my $focus = shift; - my $result = {}; - foreach my $k (keys(%{$profile})) { - my $count = $profile->{$k}; - my @addrs = split(/\n/, $k); - foreach my $a (@addrs) { - # Reply if it matches either the address/shortname/fileline - if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) { - AddEntry($result, $k, $count); - last; - } - } - } - return $result; -} - -# Focus only on paths not involving specified regexps -sub IgnoreProfile { - my $symbols = shift; - my $profile = shift; - my $ignore = shift; - my $result = {}; - foreach my $k (keys(%{$profile})) { - my $count = $profile->{$k}; - my @addrs = split(/\n/, $k); - my $matched = 0; - foreach my $a (@addrs) { - # Reply if it matches either the address/shortname/fileline - if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) { - $matched = 1; - last; - } - } - if (!$matched) { - AddEntry($result, $k, $count); - } - } - return $result; -} - -# Get total count in profile -sub TotalProfile { - my $profile = shift; - my $result = 0; - foreach my $k (keys(%{$profile})) { - $result += $profile->{$k}; - } - return $result; -} - -# Add A to B -sub AddProfile { - my $A = shift; - my $B = shift; - - my $R = {}; - # add all keys in A - foreach my $k (keys(%{$A})) { - my $v = $A->{$k}; - AddEntry($R, $k, $v); - } - # add all keys in B - foreach my $k (keys(%{$B})) { - my $v = $B->{$k}; - AddEntry($R, $k, $v); - } - return $R; -} - -# Merges symbol maps -sub MergeSymbols { - my $A = shift; - my $B = shift; - - my $R = {}; - foreach my $k (keys(%{$A})) { - $R->{$k} = $A->{$k}; - } - if (defined($B)) { - foreach my $k (keys(%{$B})) { - $R->{$k} = $B->{$k}; - } - } - return $R; -} - - -# Add A to B -sub AddPcs { - my $A = shift; - my $B = shift; - - my $R = {}; - # add all keys in A - foreach my $k (keys(%{$A})) { - $R->{$k} = 1 - } - # add all keys in B - foreach my $k (keys(%{$B})) { - $R->{$k} = 1 - } - return $R; -} - -# Subtract B from A -sub SubtractProfile { - my $A = shift; - my $B = shift; - - my $R = {}; - foreach my $k (keys(%{$A})) { - my $v = $A->{$k} - GetEntry($B, $k); - if ($v < 0 && $main::opt_drop_negative) { - $v = 0; - } - AddEntry($R, $k, $v); - } - if (!$main::opt_drop_negative) { - # Take care of when subtracted profile has more entries - foreach my $k (keys(%{$B})) { - if (!exists($A->{$k})) { - AddEntry($R, $k, 0 - $B->{$k}); - } - } - } - return $R; -} - -# Get entry from profile; zero if not present -sub GetEntry { - my $profile = shift; - my $k = shift; - if (exists($profile->{$k})) { - return $profile->{$k}; - } else { - return 0; - } -} - -# Add entry to specified profile -sub AddEntry { - my $profile = shift; - my $k = shift; - my $n = shift; - if (!exists($profile->{$k})) { - $profile->{$k} = 0; - } - $profile->{$k} += $n; -} - -# Add a stack of entries to specified profile, and add them to the $pcs -# list. 
-sub AddEntries { - my $profile = shift; - my $pcs = shift; - my $stack = shift; - my $count = shift; - my @k = (); - - foreach my $e (split(/\s+/, $stack)) { - my $pc = HexExtend($e); - $pcs->{$pc} = 1; - push @k, $pc; - } - AddEntry($profile, (join "\n", @k), $count); -} - -##### Code to profile a server dynamically ##### - -sub CheckSymbolPage { - my $url = SymbolPageURL(); - my $command = ShellEscape(@URL_FETCHER, $url); - open(SYMBOL, "$command |") or error($command); - my $line = ; - $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines - close(SYMBOL); - unless (defined($line)) { - error("$url doesn't exist\n"); - } - - if ($line =~ /^num_symbols:\s+(\d+)$/) { - if ($1 == 0) { - error("Stripped binary. No symbols available.\n"); - } - } else { - error("Failed to get the number of symbols from $url\n"); - } -} - -sub IsProfileURL { - my $profile_name = shift; - if (-f $profile_name) { - printf STDERR "Using local file $profile_name.\n"; - return 0; - } - return 1; -} - -sub ParseProfileURL { - my $profile_name = shift; - - if (!defined($profile_name) || $profile_name eq "") { - return (); - } - - # Split profile URL - matches all non-empty strings, so no test. - $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,; - - my $proto = $1 || "http://"; - my $hostport = $2; - my $prefix = $3; - my $profile = $4 || "/"; - - my $host = $hostport; - $host =~ s/:.*//; - - my $baseurl = "$proto$hostport$prefix"; - return ($host, $baseurl, $profile); -} - -# We fetch symbols from the first profile argument. -sub SymbolPageURL { - my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); - return "$baseURL$SYMBOL_PAGE"; -} - -sub FetchProgramName() { - my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); - my $url = "$baseURL$PROGRAM_NAME_PAGE"; - my $command_line = ShellEscape(@URL_FETCHER, $url); - open(CMDLINE, "$command_line |") or error($command_line); - my $cmdline = ; - $cmdline =~ s/\r//g; # turn windows-looking lines into unix-looking lines - close(CMDLINE); - error("Failed to get program name from $url\n") unless defined($cmdline); - $cmdline =~ s/\x00.+//; # Remove argv[1] and latters. - $cmdline =~ s!\n!!g; # Remove LFs. - return $cmdline; -} - -# Gee, curl's -L (--location) option isn't reliable at least -# with its 7.12.3 version. Curl will forget to post data if -# there is a redirection. This function is a workaround for -# curl. Redirection happens on borg hosts. -sub ResolveRedirectionForCurl { - my $url = shift; - my $command_line = ShellEscape(@URL_FETCHER, "--head", $url); - open(CMDLINE, "$command_line |") or error($command_line); - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - if (/^Location: (.*)/) { - $url = $1; - } - } - close(CMDLINE); - return $url; -} - -# Add a timeout flat to URL_FETCHER. Returns a new list. -sub AddFetchTimeout { - my $timeout = shift; - my @fetcher = shift; - if (defined($timeout)) { - if (join(" ", @fetcher) =~ m/\bcurl -s/) { - push(@fetcher, "--max-time", sprintf("%d", $timeout)); - } elsif (join(" ", @fetcher) =~ m/\brpcget\b/) { - push(@fetcher, sprintf("--deadline=%d", $timeout)); - } - } - return @fetcher; -} - -# Reads a symbol map from the file handle name given as $1, returning -# the resulting symbol map. Also processes variables relating to symbols. -# Currently, the only variable processed is 'binary=' which updates -# $main::prog to have the correct program name. 
-sub ReadSymbols { - my $in = shift; - my $map = {}; - while (<$in>) { - s/\r//g; # turn windows-looking lines into unix-looking lines - # Removes all the leading zeroes from the symbols, see comment below. - if (m/^0x0*([0-9a-f]+)\s+(.+)/) { - $map->{$1} = $2; - } elsif (m/^---/) { - last; - } elsif (m/^([a-z][^=]*)=(.*)$/ ) { - my ($variable, $value) = ($1, $2); - for ($variable, $value) { - s/^\s+//; - s/\s+$//; - } - if ($variable eq "binary") { - if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) { - printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n", - $main::prog, $value); - } - $main::prog = $value; - } else { - printf STDERR ("Ignoring unknown variable in symbols list: " . - "'%s' = '%s'\n", $variable, $value); - } - } - } - return $map; -} - -# Fetches and processes symbols to prepare them for use in the profile output -# code. If the optional 'symbol_map' arg is not given, fetches symbols from -# $SYMBOL_PAGE for all PC values found in profile. Otherwise, the raw symbols -# are assumed to have already been fetched into 'symbol_map' and are simply -# extracted and processed. -sub FetchSymbols { - my $pcset = shift; - my $symbol_map = shift; - - my %seen = (); - my @pcs = grep { !$seen{$_}++ } keys(%$pcset); # uniq - - if (!defined($symbol_map)) { - my $post_data = join("+", sort((map {"0x" . "$_"} @pcs))); - - open(POSTFILE, ">$main::tmpfile_sym"); - print POSTFILE $post_data; - close(POSTFILE); - - my $url = SymbolPageURL(); - - my $command_line; - if (join(" ", @URL_FETCHER) =~ m/\bcurl -s/) { - $url = ResolveRedirectionForCurl($url); - $command_line = ShellEscape(@URL_FETCHER, "-d", "\@$main::tmpfile_sym", - $url); - } else { - $command_line = (ShellEscape(@URL_FETCHER, "--post", $url) - . " < " . ShellEscape($main::tmpfile_sym)); - } - # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols. - my $escaped_cppfilt = ShellEscape($obj_tool_map{"c++filt"}); - open(SYMBOL, "$command_line | $escaped_cppfilt |") or error($command_line); - $symbol_map = ReadSymbols(*SYMBOL{IO}); - close(SYMBOL); - } - - my $symbols = {}; - foreach my $pc (@pcs) { - my $fullname; - # For 64 bits binaries, symbols are extracted with 8 leading zeroes. - # Then /symbol reads the long symbols in as uint64, and outputs - # the result with a "0x%08llx" format which get rid of the zeroes. - # By removing all the leading zeroes in both $pc and the symbols from - # /symbol, the symbols match and are retrievable from the map. - my $shortpc = $pc; - $shortpc =~ s/^0*//; - # Each line may have a list of names, which includes the function - # and also other functions it has inlined. They are separated (in - # PrintSymbolizedProfile), by --, which is illegal in function names. - my $fullnames; - if (defined($symbol_map->{$shortpc})) { - $fullnames = $symbol_map->{$shortpc}; - } else { - $fullnames = "0x" . 
$pc; # Just use addresses - } - my $sym = []; - $symbols->{$pc} = $sym; - foreach my $fullname (split("--", $fullnames)) { - my $name = ShortFunctionName($fullname); - push(@{$sym}, $name, "?", $fullname); - } - } - return $symbols; -} - -sub BaseName { - my $file_name = shift; - $file_name =~ s!^.*/!!; # Remove directory name - return $file_name; -} - -sub MakeProfileBaseName { - my ($binary_name, $profile_name) = @_; - my ($host, $baseURL, $path) = ParseProfileURL($profile_name); - my $binary_shortname = BaseName($binary_name); - return sprintf("%s.%s.%s", - $binary_shortname, $main::op_time, $host); -} - -sub FetchDynamicProfile { - my $binary_name = shift; - my $profile_name = shift; - my $fetch_name_only = shift; - my $encourage_patience = shift; - - if (!IsProfileURL($profile_name)) { - return $profile_name; - } else { - my ($host, $baseURL, $path) = ParseProfileURL($profile_name); - if ($path eq "" || $path eq "/") { - # Missing type specifier defaults to cpu-profile - $path = $PROFILE_PAGE; - } - - my $profile_file = MakeProfileBaseName($binary_name, $profile_name); - - my $url = "$baseURL$path"; - my $fetch_timeout = undef; - if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) { - if ($path =~ m/[?]/) { - $url .= "&"; - } else { - $url .= "?"; - } - $url .= sprintf("seconds=%d", $main::opt_seconds); - $fetch_timeout = $main::opt_seconds * 1.01 + 60; - } else { - # For non-CPU profiles, we add a type-extension to - # the target profile file name. - my $suffix = $path; - $suffix =~ s,/,.,g; - $profile_file .= $suffix; - } - - my $profile_dir = $ENV{"PPROF_TMPDIR"} || ($ENV{HOME} . "/pprof"); - if (! -d $profile_dir) { - mkdir($profile_dir) - || die("Unable to create profile directory $profile_dir: $!\n"); - } - my $tmp_profile = "$profile_dir/.tmp.$profile_file"; - my $real_profile = "$profile_dir/$profile_file"; - - if ($fetch_name_only > 0) { - return $real_profile; - } - - my @fetcher = AddFetchTimeout($fetch_timeout, @URL_FETCHER); - my $cmd = ShellEscape(@fetcher, $url) . " > " . 
ShellEscape($tmp_profile); - if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){ - print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; - if ($encourage_patience) { - print STDERR "Be patient...\n"; - } - } else { - print STDERR "Fetching $path profile from $url to\n ${real_profile}\n"; - } - - (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n"); - (system("mv", $tmp_profile, $real_profile) == 0) || error("Unable to rename profile\n"); - print STDERR "Wrote profile to $real_profile\n"; - $main::collected_profile = $real_profile; - return $main::collected_profile; - } -} - -# Collect profiles in parallel -sub FetchDynamicProfiles { - my $items = scalar(@main::pfile_args); - my $levels = log($items) / log(2); - - if ($items == 1) { - $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1); - } else { - # math rounding issues - if ((2 ** $levels) < $items) { - $levels++; - } - my $count = scalar(@main::pfile_args); - for (my $i = 0; $i < $count; $i++) { - $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0); - } - print STDERR "Fetching $count profiles, Be patient...\n"; - FetchDynamicProfilesRecurse($levels, 0, 0); - $main::collected_profile = join(" \\\n ", @main::profile_files); - } -} - -# Recursively fork a process to get enough processes -# collecting profiles -sub FetchDynamicProfilesRecurse { - my $maxlevel = shift; - my $level = shift; - my $position = shift; - - if (my $pid = fork()) { - $position = 0 | ($position << 1); - TryCollectProfile($maxlevel, $level, $position); - wait; - } else { - $position = 1 | ($position << 1); - TryCollectProfile($maxlevel, $level, $position); - cleanup(); - exit(0); - } -} - -# Collect a single profile -sub TryCollectProfile { - my $maxlevel = shift; - my $level = shift; - my $position = shift; - - if ($level >= ($maxlevel - 1)) { - if ($position < scalar(@main::pfile_args)) { - FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0); - } - } else { - FetchDynamicProfilesRecurse($maxlevel, $level+1, $position); - } -} - -##### Parsing code ##### - -# Provide a small streaming-read module to handle very large -# cpu-profile files. Stream in chunks along a sliding window. -# Provides an interface to get one 'slot', correctly handling -# endian-ness differences. A slot is one 32-bit or 64-bit word -# (depending on the input profile). We tell endianness and bit-size -# for the profile by looking at the first 8 bytes: in cpu profiles, -# the second slot is always 3 (we'll accept anything that's not 0). -BEGIN { - package CpuProfileStream; - - sub new { - my ($class, $file, $fname) = @_; - my $self = { file => $file, - base => 0, - stride => 512 * 1024, # must be a multiple of bitsize/8 - slots => [], - unpack_code => "", # N for big-endian, V for little - perl_is_64bit => 1, # matters if profile is 64-bit - }; - bless $self, $class; - # Let unittests adjust the stride - if ($main::opt_test_stride > 0) { - $self->{stride} = $main::opt_test_stride; - } - # Read the first two slots to figure out bitsize and endianness. - my $slots = $self->{slots}; - my $str; - read($self->{file}, $str, 8); - # Set the global $address_length based on what we see here. - # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars). - $address_length = ($str eq (chr(0)x8)) ? 16 : 8; - if ($address_length == 8) { - if (substr($str, 6, 2) eq chr(0)x2) { - $self->{unpack_code} = 'V'; # Little-endian. 
- } elsif (substr($str, 4, 2) eq chr(0)x2) { - $self->{unpack_code} = 'N'; # Big-endian - } else { - ::error("$fname: header size >= 2**16\n"); - } - @$slots = unpack($self->{unpack_code} . "*", $str); - } else { - # If we're a 64-bit profile, check if we're a 64-bit-capable - # perl. Otherwise, each slot will be represented as a float - # instead of an int64, losing precision and making all the - # 64-bit addresses wrong. We won't complain yet, but will - # later if we ever see a value that doesn't fit in 32 bits. - my $has_q = 0; - eval { $has_q = pack("Q", "1") ? 1 : 1; }; - if (!$has_q) { - $self->{perl_is_64bit} = 0; - } - read($self->{file}, $str, 8); - if (substr($str, 4, 4) eq chr(0)x4) { - # We'd love to use 'Q', but it's a) not universal, b) not endian-proof. - $self->{unpack_code} = 'V'; # Little-endian. - } elsif (substr($str, 0, 4) eq chr(0)x4) { - $self->{unpack_code} = 'N'; # Big-endian - } else { - ::error("$fname: header size >= 2**32\n"); - } - my @pair = unpack($self->{unpack_code} . "*", $str); - # Since we know one of the pair is 0, it's fine to just add them. - @$slots = (0, $pair[0] + $pair[1]); - } - return $self; - } - - # Load more data when we access slots->get(X) which is not yet in memory. - sub overflow { - my ($self) = @_; - my $slots = $self->{slots}; - $self->{base} += $#$slots + 1; # skip over data we're replacing - my $str; - read($self->{file}, $str, $self->{stride}); - if ($address_length == 8) { # the 32-bit case - # This is the easy case: unpack provides 32-bit unpacking primitives. - @$slots = unpack($self->{unpack_code} . "*", $str); - } else { - # We need to unpack 32 bits at a time and combine. - my @b32_values = unpack($self->{unpack_code} . "*", $str); - my @b64_values = (); - for (my $i = 0; $i < $#b32_values; $i += 2) { - # TODO(csilvers): if this is a 32-bit perl, the math below - # could end up in a too-large int, which perl will promote - # to a double, losing necessary precision. Deal with that. - # Right now, we just die. - my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]); - if ($self->{unpack_code} eq 'N') { # big-endian - ($lo, $hi) = ($hi, $lo); - } - my $value = $lo + $hi * (2**32); - if (!$self->{perl_is_64bit} && # check value is exactly represented - (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) { - ::error("Need a 64-bit perl to process this 64-bit profile.\n"); - } - push(@b64_values, $value); - } - @$slots = @b64_values; - } - } - - # Access the i-th long in the file (logically), or -1 at EOF. - sub get { - my ($self, $idx) = @_; - my $slots = $self->{slots}; - while ($#$slots >= 0) { - if ($idx < $self->{base}) { - # The only time we expect a reference to $slots[$i - something] - # after referencing $slots[$i] is reading the very first header. - # Since $stride > |header|, that shouldn't cause any lookback - # errors. And everything after the header is sequential. - print STDERR "Unexpected look-back reading CPU profile"; - return -1; # shrug, don't know what better to return - } elsif ($idx > $self->{base} + $#$slots) { - $self->overflow(); - } else { - return $slots->[$idx - $self->{base}]; - } - } - # If we get here, $slots is [], which means we've reached EOF - return -1; # unique since slots is supposed to hold unsigned numbers - } -} - -# Reads the top, 'header' section of a profile, and returns the last -# line of the header, commonly called a 'header line'. 
The header -# section of a profile consists of zero or more 'command' lines that -# are instructions to pprof, which pprof executes when reading the -# header. All 'command' lines start with a %. After the command -# lines is the 'header line', which is a profile-specific line that -# indicates what type of profile it is, and perhaps other global -# information about the profile. For instance, here's a header line -# for a heap profile: -# heap profile: 53: 38236 [ 5525: 1284029] @ heapprofile -# For historical reasons, the CPU profile does not contain a text- -# readable header line. If the profile looks like a CPU profile, -# this function returns "". If no header line could be found, this -# function returns undef. -# -# The following commands are recognized: -# %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:' -# -# The input file should be in binmode. -sub ReadProfileHeader { - local *PROFILE = shift; - my $firstchar = ""; - my $line = ""; - read(PROFILE, $firstchar, 1); - seek(PROFILE, -1, 1); # unread the firstchar - if ($firstchar !~ /[[:print:]]/) { # is not a text character - return ""; - } - while (defined($line = )) { - $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines - if ($line =~ /^%warn\s+(.*)/) { # 'warn' command - # Note this matches both '%warn blah\n' and '%warn\n'. - print STDERR "WARNING: $1\n"; # print the rest of the line - } elsif ($line =~ /^%/) { - print STDERR "Ignoring unknown command from profile header: $line"; - } else { - # End of commands, must be the header line. - return $line; - } - } - return undef; # got to EOF without seeing a header line -} - -sub IsSymbolizedProfileFile { - my $file_name = shift; - if (!(-e $file_name) || !(-r $file_name)) { - return 0; - } - # Check if the file contains a symbol-section marker. - open(TFILE, "<$file_name"); - binmode TFILE; - my $firstline = ReadProfileHeader(*TFILE); - close(TFILE); - if (!$firstline) { - return 0; - } - $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $symbol_marker = $&; - return $firstline =~ /^--- *$symbol_marker/; -} - -# Parse profile generated by common/profiler.cc and return a reference -# to a map: -# $result->{version} Version number of profile file -# $result->{period} Sampling period (in microseconds) -# $result->{profile} Profile object -# $result->{threads} Map of thread IDs to profile objects -# $result->{map} Memory map info from profile -# $result->{pcs} Hash of all PC values seen, key is hex address -sub ReadProfile { - my $prog = shift; - my $fname = shift; - my $result; # return value - - $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $contention_marker = $&; - $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $growth_marker = $&; - $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $symbol_marker = $&; - $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash - my $profile_marker = $&; - - # Look at first line to see if it is a heap or a CPU profile. - # CPU profile may start with no header at all, and just binary data - # (starting with \0\0\0\0) -- in that case, don't try to read the - # whole firstline, since it may be gigabytes(!) of data. 
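
For example, the command handling in ReadProfileHeader above would treat a made-up profile head like this (only '%warn' is currently recognized; the heap header line is the one quoted in the comment above):

  use strict; use warnings;
  my @head = (
      '%warn profile taken while the server was idle',
      '%some-future-command 42',
      'heap profile: 53: 38236 [ 5525: 1284029] @ heapprofile',
  );
  for my $line (@head) {
      if ($line =~ /^%warn\s+(.*)/) {
          print STDERR "WARNING: $1\n";                  # surfaced to the user
      } elsif ($line =~ /^%/) {
          print STDERR "Ignoring unknown command: $line\n";
      } else {
          print "header line: $line\n";                  # parsing starts here
          last;
      }
  }
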
- open(PROFILE, "<$fname") || error("$fname: $!\n"); - binmode PROFILE; # New perls do UTF-8 processing - my $header = ReadProfileHeader(*PROFILE); - if (!defined($header)) { # means "at EOF" - error("Profile is empty.\n"); - } - - my $symbols; - if ($header =~ m/^--- *$symbol_marker/o) { - # Verify that the user asked for a symbolized profile - if (!$main::use_symbolized_profile) { - # we have both a binary and symbolized profiles, abort - error("FATAL ERROR: Symbolized profile\n $fname\ncannot be used with " . - "a binary arg. Try again without passing\n $prog\n"); - } - # Read the symbol section of the symbolized profile file. - $symbols = ReadSymbols(*PROFILE{IO}); - # Read the next line to get the header for the remaining profile. - $header = ReadProfileHeader(*PROFILE) || ""; - } - - $main::profile_type = ''; - if ($header =~ m/^heap profile:.*$growth_marker/o) { - $main::profile_type = 'growth'; - $result = ReadHeapProfile($prog, *PROFILE, $header); - } elsif ($header =~ m/^heap profile:/) { - $main::profile_type = 'heap'; - $result = ReadHeapProfile($prog, *PROFILE, $header); - } elsif ($header =~ m/^heap/) { - $main::profile_type = 'heap'; - $result = ReadThreadedHeapProfile($prog, $fname, $header); - } elsif ($header =~ m/^--- *$contention_marker/o) { - $main::profile_type = 'contention'; - $result = ReadSynchProfile($prog, *PROFILE); - } elsif ($header =~ m/^--- *Stacks:/) { - print STDERR - "Old format contention profile: mistakenly reports " . - "condition variable signals as lock contentions.\n"; - $main::profile_type = 'contention'; - $result = ReadSynchProfile($prog, *PROFILE); - } elsif ($header =~ m/^--- *$profile_marker/) { - # the binary cpu profile data starts immediately after this line - $main::profile_type = 'cpu'; - $result = ReadCPUProfile($prog, $fname, *PROFILE); - } else { - if (defined($symbols)) { - # a symbolized profile contains a format we don't recognize, bail out - error("$fname: Cannot recognize profile section after symbols.\n"); - } - # no ascii header present -- must be a CPU profile - $main::profile_type = 'cpu'; - $result = ReadCPUProfile($prog, $fname, *PROFILE); - } - - close(PROFILE); - - # if we got symbols along with the profile, return those as well - if (defined($symbols)) { - $result->{symbols} = $symbols; - } - - return $result; -} - -# Subtract one from caller pc so we map back to call instr. -# However, don't do this if we're reading a symbolized profile -# file, in which case the subtract-one was done when the file -# was written. -# -# We apply the same logic to all readers, though ReadCPUProfile uses an -# independent implementation. -sub FixCallerAddresses { - my $stack = shift; - if ($main::use_symbolized_profile) { - return $stack; - } else { - $stack =~ /(\s)/; - my $delimiter = $1; - my @addrs = split(' ', $stack); - my @fixedaddrs; - $#fixedaddrs = $#addrs; - if ($#addrs >= 0) { - $fixedaddrs[0] = $addrs[0]; - } - for (my $i = 1; $i <= $#addrs; $i++) { - $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1"); - } - return join $delimiter, @fixedaddrs; - } -} - -# CPU profile reader -sub ReadCPUProfile { - my $prog = shift; - my $fname = shift; # just used for logging - local *PROFILE = shift; - my $version; - my $period; - my $i; - my $profile = {}; - my $pcs = {}; - - # Parse string into array of slots. - my $slots = CpuProfileStream->new(*PROFILE, $fname); - - # Read header. 
The current header version is a 5-element structure - # containing: - # 0: header count (always 0) - # 1: header "words" (after this one: 3) - # 2: format version (0) - # 3: sampling period (usec) - # 4: unused padding (always 0) - if ($slots->get(0) != 0 ) { - error("$fname: not a profile file, or old format profile file\n"); - } - $i = 2 + $slots->get(1); - $version = $slots->get(2); - $period = $slots->get(3); - # Do some sanity checking on these header values. - if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) { - error("$fname: not a profile file, or corrupted profile file\n"); - } - - # Parse profile - while ($slots->get($i) != -1) { - my $n = $slots->get($i++); - my $d = $slots->get($i++); - if ($d > (2**16)) { # TODO(csilvers): what's a reasonable max-stack-depth? - my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8)); - print STDERR "At index $i (address $addr):\n"; - error("$fname: stack trace depth >= 2**32\n"); - } - if ($slots->get($i) == 0) { - # End of profile data marker - $i += $d; - last; - } - - # Make key out of the stack entries - my @k = (); - for (my $j = 0; $j < $d; $j++) { - my $pc = $slots->get($i+$j); - # Subtract one from caller pc so we map back to call instr. - # However, don't do this if we're reading a symbolized profile - # file, in which case the subtract-one was done when the file - # was written. - if ($j > 0 && !$main::use_symbolized_profile) { - $pc--; - } - $pc = sprintf("%0*x", $address_length, $pc); - $pcs->{$pc} = 1; - push @k, $pc; - } - - AddEntry($profile, (join "\n", @k), $n); - $i += $d; - } - - # Parse map - my $map = ''; - seek(PROFILE, $i * 4, 0); - read(PROFILE, $map, (stat PROFILE)[7]); - - my $r = {}; - $r->{version} = $version; - $r->{period} = $period; - $r->{profile} = $profile; - $r->{libs} = ParseLibraries($prog, $map, $pcs); - $r->{pcs} = $pcs; - - return $r; -} - -sub HeapProfileIndex { - my $index = 1; - if ($main::opt_inuse_space) { - $index = 1; - } elsif ($main::opt_inuse_objects) { - $index = 0; - } elsif ($main::opt_alloc_space) { - $index = 3; - } elsif ($main::opt_alloc_objects) { - $index = 2; - } - return $index; -} - -sub ReadMappedLibraries { - my $fh = shift; - my $map = ""; - # Read the /proc/self/maps data - while (<$fh>) { - s/\r//g; # turn windows-looking lines into unix-looking lines - $map .= $_; - } - return $map; -} - -sub ReadMemoryMap { - my $fh = shift; - my $map = ""; - # Read /proc/self/maps data as formatted by DumpAddressMap() - my $buildvar = ""; - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - # Parse "build=" specification if supplied - if (m/^\s*build=(.*)\n/) { - $buildvar = $1; - } - - # Expand "$build" variable if available - $_ =~ s/\$build\b/$buildvar/g; - - $map .= $_; - } - return $map; -} - -sub AdjustSamples { - my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_; - if ($sample_adjustment) { - if ($sampling_algorithm == 2) { - # Remote-heap version 2 - # The sampling frequency is the rate of a Poisson process. 
- # This means that the probability of sampling an allocation of - # size X with sampling rate Y is 1 - exp(-X/Y) - if ($n1 != 0) { - my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); - my $scale_factor = 1/(1 - exp(-$ratio)); - $n1 *= $scale_factor; - $s1 *= $scale_factor; - } - if ($n2 != 0) { - my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); - my $scale_factor = 1/(1 - exp(-$ratio)); - $n2 *= $scale_factor; - $s2 *= $scale_factor; - } - } else { - # Remote-heap version 1 - my $ratio; - $ratio = (($s1*1.0)/$n1)/($sample_adjustment); - if ($ratio < 1) { - $n1 /= $ratio; - $s1 /= $ratio; - } - $ratio = (($s2*1.0)/$n2)/($sample_adjustment); - if ($ratio < 1) { - $n2 /= $ratio; - $s2 /= $ratio; - } - } - } - return ($n1, $s1, $n2, $s2); -} - -sub ReadHeapProfile { - my $prog = shift; - local *PROFILE = shift; - my $header = shift; - - my $index = HeapProfileIndex(); - - # Find the type of this profile. The header line looks like: - # heap profile: 1246: 8800744 [ 1246: 8800744] @ /266053 - # There are two pairs , the first inuse objects/space, and the - # second allocated objects/space. This is followed optionally by a profile - # type, and if that is present, optionally by a sampling frequency. - # For remote heap profiles (v1): - # The interpretation of the sampling frequency is that the profiler, for - # each sample, calculates a uniformly distributed random integer less than - # the given value, and records the next sample after that many bytes have - # been allocated. Therefore, the expected sample interval is half of the - # given frequency. By default, if not specified, the expected sample - # interval is 128KB. Only remote-heap-page profiles are adjusted for - # sample size. - # For remote heap profiles (v2): - # The sampling frequency is the rate of a Poisson process. This means that - # the probability of sampling an allocation of size X with sampling rate Y - # is 1 - exp(-X/Y) - # For version 2, a typical header line might look like this: - # heap profile: 1922: 127792360 [ 1922: 127792360] @ _v2/524288 - # the trailing number (524288) is the sampling rate. (Version 1 showed - # double the 'rate' here) - my $sampling_algorithm = 0; - my $sample_adjustment = 0; - chomp($header); - my $type = "unknown"; - if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") { - if (defined($6) && ($6 ne '')) { - $type = $6; - my $sample_period = $8; - # $type is "heapprofile" for profiles generated by the - # heap-profiler, and either "heap" or "heap_v2" for profiles - # generated by sampling directly within tcmalloc. It can also - # be "growth" for heap-growth profiles. The first is typically - # found for profiles generated locally, and the others for - # remote profiles. - if (($type eq "heapprofile") || ($type !~ /heap/) ) { - # No need to adjust for the sampling rate with heap-profiler-derived data - $sampling_algorithm = 0; - } elsif ($type =~ /_v2/) { - $sampling_algorithm = 2; # version 2 sampling - if (defined($sample_period) && ($sample_period ne '')) { - $sample_adjustment = int($sample_period); - } - } else { - $sampling_algorithm = 1; # version 1 sampling - if (defined($sample_period) && ($sample_period ne '')) { - $sample_adjustment = int($sample_period)/2; - } - } - } else { - # We detect whether or not this is a remote-heap profile by checking - # that the total-allocated stats ($n2,$s2) are exactly the - # same as the in-use stats ($n1,$s1). 
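
To put numbers on the version-2 correction above (all values invented): 100 samples averaging 128 KiB against a 524288-byte sampling rate give a ratio of 0.25, so AdjustSamples() scales each sample by 1/(1 - exp(-0.25)), roughly 4.52:

  use strict; use warnings;
  my ($n, $s) = (100, 13_107_200);        # sampled allocations and bytes (made up)
  my $rate    = 524_288;                  # heap_v2 sampling rate from the header
  my $ratio   = ($s / $n) / $rate;        # average sample size relative to the rate
  my $scale   = 1 / (1 - exp(-$ratio));   # Poisson correction, as in AdjustSamples()
  printf "scale %.2f -> about %.0f allocations, %.0f bytes\n",
         $scale, $n * $scale, $s * $scale;
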
It is remotely conceivable - # that a non-remote-heap profile may pass this check, but it is hard - # to imagine how that could happen. - # In this case it's so old it's guaranteed to be remote-heap version 1. - my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); - if (($n1 == $n2) && ($s1 == $s2)) { - # This is likely to be a remote-heap based sample profile - $sampling_algorithm = 1; - } - } - } - - if ($sampling_algorithm > 0) { - # For remote-heap generated profiles, adjust the counts and sizes to - # account for the sample rate (we sample once every 128KB by default). - if ($sample_adjustment == 0) { - # Turn on profile adjustment. - $sample_adjustment = 128*1024; - print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n"; - } else { - printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n", - $sample_adjustment); - } - if ($sampling_algorithm > 1) { - # We don't bother printing anything for the original version (version 1) - printf STDERR "Heap version $sampling_algorithm\n"; - } - } - - my $profile = {}; - my $pcs = {}; - my $map = ""; - - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - if (/^MAPPED_LIBRARIES:/) { - $map .= ReadMappedLibraries(*PROFILE); - last; - } - - if (/^--- Memory map:/) { - $map .= ReadMemoryMap(*PROFILE); - last; - } - - # Read entry of the form: - # : [: ] @ a1 a2 a3 ... an - s/^\s*//; - s/\s*$//; - if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) { - my $stack = $5; - my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); - my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, - $n1, $s1, $n2, $s2); - AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); - } - } - - my $r = {}; - $r->{version} = "heap"; - $r->{period} = 1; - $r->{profile} = $profile; - $r->{libs} = ParseLibraries($prog, $map, $pcs); - $r->{pcs} = $pcs; - return $r; -} - -sub ReadThreadedHeapProfile { - my ($prog, $fname, $header) = @_; - - my $index = HeapProfileIndex(); - my $sampling_algorithm = 0; - my $sample_adjustment = 0; - chomp($header); - my $type = "unknown"; - # Assuming a very specific type of header for now. - if ($header =~ m"^heap_v2/(\d+)") { - $type = "_v2"; - $sampling_algorithm = 2; - $sample_adjustment = int($1); - } - if ($type ne "_v2" || !defined($sample_adjustment)) { - die "Threaded heap profiles require v2 sampling with a sample rate\n"; - } - - my $profile = {}; - my $thread_profiles = {}; - my $pcs = {}; - my $map = ""; - my $stack = ""; - - while () { - s/\r//g; - if (/^MAPPED_LIBRARIES:/) { - $map .= ReadMappedLibraries(*PROFILE); - last; - } - - if (/^--- Memory map:/) { - $map .= ReadMemoryMap(*PROFILE); - last; - } - - # Read entry of the form: - # @ a1 a2 ... an - # t*: : [: ] - # t1: : [: ] - # ... - # tn: : [: ] - s/^\s*//; - s/\s*$//; - if (m/^@\s+(.*)$/) { - $stack = $1; - } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) { - if ($stack eq "") { - # Still in the header, so this is just a per-thread summary. 
- next; - } - my $thread = $2; - my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6); - my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, - $n1, $s1, $n2, $s2); - if ($thread eq "*") { - AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); - } else { - if (!exists($thread_profiles->{$thread})) { - $thread_profiles->{$thread} = {}; - } - AddEntries($thread_profiles->{$thread}, $pcs, - FixCallerAddresses($stack), $counts[$index]); - } - } - } - - my $r = {}; - $r->{version} = "heap"; - $r->{period} = 1; - $r->{profile} = $profile; - $r->{threads} = $thread_profiles; - $r->{libs} = ParseLibraries($prog, $map, $pcs); - $r->{pcs} = $pcs; - return $r; -} - -sub ReadSynchProfile { - my $prog = shift; - local *PROFILE = shift; - my $header = shift; - - my $map = ''; - my $profile = {}; - my $pcs = {}; - my $sampling_period = 1; - my $cyclespernanosec = 2.8; # Default assumption for old binaries - my $seen_clockrate = 0; - my $line; - - my $index = 0; - if ($main::opt_total_delay) { - $index = 0; - } elsif ($main::opt_contentions) { - $index = 1; - } elsif ($main::opt_mean_delay) { - $index = 2; - } - - while ( $line = ) { - $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines - if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) { - my ($cycles, $count, $stack) = ($1, $2, $3); - - # Convert cycles to nanoseconds - $cycles /= $cyclespernanosec; - - # Adjust for sampling done by application - $cycles *= $sampling_period; - $count *= $sampling_period; - - my @values = ($cycles, $count, $cycles / $count); - AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]); - - } elsif ( $line =~ /^(slow release).*thread \d+ \@\s*(.*?)\s*$/ || - $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) { - my ($cycles, $stack) = ($1, $2); - if ($cycles !~ /^\d+$/) { - next; - } - - # Convert cycles to nanoseconds - $cycles /= $cyclespernanosec; - - # Adjust for sampling done by application - $cycles *= $sampling_period; - - AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles); - - } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) { - my ($variable, $value) = ($1,$2); - for ($variable, $value) { - s/^\s+//; - s/\s+$//; - } - if ($variable eq "cycles/second") { - $cyclespernanosec = $value / 1e9; - $seen_clockrate = 1; - } elsif ($variable eq "sampling period") { - $sampling_period = $value; - } elsif ($variable eq "ms since reset") { - # Currently nothing is done with this value in pprof - # So we just silently ignore it for now - } elsif ($variable eq "discarded samples") { - # Currently nothing is done with this value in pprof - # So we just silently ignore it for now - } else { - printf STDERR ("Ignoring unnknown variable in /contention output: " . - "'%s' = '%s'\n",$variable,$value); - } - } else { - # Memory map entry - $map .= $line; - } - } - - if (!$seen_clockrate) { - printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n", - $cyclespernanosec); - } - - my $r = {}; - $r->{version} = 0; - $r->{period} = $sampling_period; - $r->{profile} = $profile; - $r->{libs} = ParseLibraries($prog, $map, $pcs); - $r->{pcs} = $pcs; - return $r; -} - -# Given a hex value in the form "0x1abcd" or "1abcd", return either -# "0001abcd" or "000000000001abcd", depending on the current (global) -# address length. 
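
A standalone check of the zero-extension rule just described; the helper defined below pads to the global $address_length, which this sketch fixes locally at the two widths pprof uses:

  use strict; use warnings;
  for my $address_length (8, 16) {                 # 32-bit vs. 64-bit profiles
      (my $trimmed = '0x1abcd') =~ s/^(0x)?0*//;   # drop '0x' and leading zeroes
      printf "%2d -> %s\n", $address_length,
             ('0' x ($address_length - length($trimmed))) . $trimmed;
  }
  # prints "0001abcd" and "000000000001abcd", matching the description above
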
-sub HexExtend { - my $addr = shift; - - $addr =~ s/^(0x)?0*//; - my $zeros_needed = $address_length - length($addr); - if ($zeros_needed < 0) { - printf STDERR "Warning: address $addr is longer than address length $address_length\n"; - return $addr; - } - return ("0" x $zeros_needed) . $addr; -} - -##### Symbol extraction ##### - -# Aggressively search the lib_prefix values for the given library -# If all else fails, just return the name of the library unmodified. -# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so" -# it will search the following locations in this order, until it finds a file: -# /my/path/lib/dir/mylib.so -# /other/path/lib/dir/mylib.so -# /my/path/dir/mylib.so -# /other/path/dir/mylib.so -# /my/path/mylib.so -# /other/path/mylib.so -# /lib/dir/mylib.so (returned as last resort) -sub FindLibrary { - my $file = shift; - my $suffix = $file; - - # Search for the library as described above - do { - foreach my $prefix (@prefix_list) { - my $fullpath = $prefix . $suffix; - if (-e $fullpath) { - return $fullpath; - } - } - } while ($suffix =~ s|^/[^/]+/|/|); - return $file; -} - -# Return path to library with debugging symbols. -# For libc libraries, the copy in /usr/lib/debug contains debugging symbols -sub DebuggingLibrary { - my $file = shift; - if ($file =~ m|^/|) { - if (-f "/usr/lib/debug$file") { - return "/usr/lib/debug$file"; - } elsif (-f "/usr/lib/debug$file.debug") { - return "/usr/lib/debug$file.debug"; - } - } - return undef; -} - -# Parse text section header of a library using objdump -sub ParseTextSectionHeaderFromObjdump { - my $lib = shift; - - my $size = undef; - my $vma; - my $file_offset; - # Get objdump output from the library file to figure out how to - # map between mapped addresses and addresses in the library. - my $cmd = ShellEscape($obj_tool_map{"objdump"}, "-h", $lib); - open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - # Idx Name Size VMA LMA File off Algn - # 10 .text 00104b2c 420156f0 420156f0 000156f0 2**4 - # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file - # offset may still be 8. But AddressSub below will still handle that. - my @x = split; - if (($#x >= 6) && ($x[1] eq '.text')) { - $size = $x[2]; - $vma = $x[3]; - $file_offset = $x[5]; - last; - } - } - close(OBJDUMP); - - if (!defined($size)) { - return undef; - } - - my $r = {}; - $r->{size} = $size; - $r->{vma} = $vma; - $r->{file_offset} = $file_offset; - - return $r; -} - -# Parse text section header of a library using otool (on OS X) -sub ParseTextSectionHeaderFromOtool { - my $lib = shift; - - my $size = undef; - my $vma = undef; - my $file_offset = undef; - # Get otool output from the library file to figure out how to - # map between mapped addresses and addresses in the library. - my $command = ShellEscape($obj_tool_map{"otool"}, "-l", $lib); - open(OTOOL, "$command |") || error("$command: $!\n"); - my $cmd = ""; - my $sectname = ""; - my $segname = ""; - foreach my $line () { - $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines - # Load command <#> - # cmd LC_SEGMENT - # [...] - # Section - # sectname __text - # segname __TEXT - # addr 0x000009f8 - # size 0x00018b9e - # offset 2552 - # align 2^2 (4) - # We will need to strip off the leading 0x from the hex addresses, - # and convert the offset into hex. 
- if ($line =~ /Load command/) { - $cmd = ""; - $sectname = ""; - $segname = ""; - } elsif ($line =~ /Section/) { - $sectname = ""; - $segname = ""; - } elsif ($line =~ /cmd (\w+)/) { - $cmd = $1; - } elsif ($line =~ /sectname (\w+)/) { - $sectname = $1; - } elsif ($line =~ /segname (\w+)/) { - $segname = $1; - } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") && - $sectname eq "__text" && - $segname eq "__TEXT")) { - next; - } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) { - $vma = $1; - } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) { - $size = $1; - } elsif ($line =~ /\boffset ([0-9]+)/) { - $file_offset = sprintf("%016x", $1); - } - if (defined($vma) && defined($size) && defined($file_offset)) { - last; - } - } - close(OTOOL); - - if (!defined($vma) || !defined($size) || !defined($file_offset)) { - return undef; - } - - my $r = {}; - $r->{size} = $size; - $r->{vma} = $vma; - $r->{file_offset} = $file_offset; - - return $r; -} - -sub ParseTextSectionHeader { - # obj_tool_map("otool") is only defined if we're in a Mach-O environment - if (defined($obj_tool_map{"otool"})) { - my $r = ParseTextSectionHeaderFromOtool(@_); - if (defined($r)){ - return $r; - } - } - # If otool doesn't work, or we don't have it, fall back to objdump - return ParseTextSectionHeaderFromObjdump(@_); -} - -# Split /proc/pid/maps dump into a list of libraries -sub ParseLibraries { - return if $main::use_symbol_page; # We don't need libraries info. - my $prog = shift; - my $map = shift; - my $pcs = shift; - - my $result = []; - my $h = "[a-f0-9]+"; - my $zero_offset = HexExtend("0"); - - my $buildvar = ""; - foreach my $l (split("\n", $map)) { - if ($l =~ m/^\s*build=(.*)$/) { - $buildvar = $1; - } - - my $start; - my $finish; - my $offset; - my $lib; - if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) { - # Full line from /proc/self/maps. Example: - # 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so - $start = HexExtend($1); - $finish = HexExtend($2); - $offset = HexExtend($3); - $lib = $4; - $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths - } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) { - # Cooked line from DumpAddressMap. Example: - # 40000000-40015000: /lib/ld-2.3.2.so - $start = HexExtend($1); - $finish = HexExtend($2); - $offset = $zero_offset; - $lib = $3; - } - # FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in - # function procfs_doprocmap (sys/fs/procfs/procfs_map.c) - # - # Example: - # 0x800600000 0x80061a000 26 0 0xfffff800035a0000 r-x 75 33 0x1004 COW NC vnode /libexec/ld-elf.s - # o.1 NCH -1 - elsif ($l =~ /^(0x$h)\s(0x$h)\s\d+\s\d+\s0x$h\sr-x\s\d+\s\d+\s0x\d+\s(COW|NCO)\s(NC|NNC)\svnode\s(\S+\.so(\.\d+)*)/) { - $start = HexExtend($1); - $finish = HexExtend($2); - $offset = $zero_offset; - $lib = FindLibrary($5); - - } else { - next; - } - - # Expand "$build" variable if available - $lib =~ s/\$build\b/$buildvar/g; - - $lib = FindLibrary($lib); - - # Check for pre-relocated libraries, which use pre-relocated symbol tables - # and thus require adjusting the offset that we'll use to translate - # VM addresses into symbol table addresses. - # Only do this if we're not going to fetch the symbol table from a - # debugging copy of the library. 
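
Using the objdump sample quoted earlier (.text VMA 420156f0, file offset 000156f0), the adjustment applied just below turns a zero map offset into 0x42000000 so mapped addresses land in the pre-relocated symbol table. A plain-integer sketch of the same arithmetic (the script itself goes through AddressAdd/AddressSub so 64-bit values stay exact):

  use strict; use warnings;
  # Values taken from the objdump example shown earlier in this section.
  my ($vma, $file_offset, $map_offset) = (0x420156f0, 0x000156f0, 0x0);
  my $vma_offset = $vma - $file_offset;            # 0x42000000
  my $adjusted   = $map_offset + $vma_offset;      # offset used for symbol lookup
  printf "symbol-table offset becomes %08x\n", $adjusted;   # 42000000
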
- if (!DebuggingLibrary($lib)) { - my $text = ParseTextSectionHeader($lib); - if (defined($text)) { - my $vma_offset = AddressSub($text->{vma}, $text->{file_offset}); - $offset = AddressAdd($offset, $vma_offset); - } - } - - if($main::opt_debug) { printf STDERR "$start:$finish ($offset) $lib\n"; } - push(@{$result}, [$lib, $start, $finish, $offset]); - } - - # Append special entry for additional library (not relocated) - if ($main::opt_lib ne "") { - my $text = ParseTextSectionHeader($main::opt_lib); - if (defined($text)) { - my $start = $text->{vma}; - my $finish = AddressAdd($start, $text->{size}); - - push(@{$result}, [$main::opt_lib, $start, $finish, $start]); - } - } - - # Append special entry for the main program. This covers - # 0..max_pc_value_seen, so that we assume pc values not found in one - # of the library ranges will be treated as coming from the main - # program binary. - my $min_pc = HexExtend("0"); - my $max_pc = $min_pc; # find the maximal PC value in any sample - foreach my $pc (keys(%{$pcs})) { - if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); } - } - push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]); - - return $result; -} - -# Add two hex addresses of length $address_length. -# Run pprof --test for unit test if this is changed. -sub AddressAdd { - my $addr1 = shift; - my $addr2 = shift; - my $sum; - - if ($address_length == 8) { - # Perl doesn't cope with wraparound arithmetic, so do it explicitly: - $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16); - return sprintf("%08x", $sum); - - } else { - # Do the addition in 7-nibble chunks to trivialize carry handling. - - if ($main::opt_debug and $main::opt_test) { - print STDERR "AddressAdd $addr1 + $addr2 = "; - } - - my $a1 = substr($addr1,-7); - $addr1 = substr($addr1,0,-7); - my $a2 = substr($addr2,-7); - $addr2 = substr($addr2,0,-7); - $sum = hex($a1) + hex($a2); - my $c = 0; - if ($sum > 0xfffffff) { - $c = 1; - $sum -= 0x10000000; - } - my $r = sprintf("%07x", $sum); - - $a1 = substr($addr1,-7); - $addr1 = substr($addr1,0,-7); - $a2 = substr($addr2,-7); - $addr2 = substr($addr2,0,-7); - $sum = hex($a1) + hex($a2) + $c; - $c = 0; - if ($sum > 0xfffffff) { - $c = 1; - $sum -= 0x10000000; - } - $r = sprintf("%07x", $sum) . $r; - - $sum = hex($addr1) + hex($addr2) + $c; - if ($sum > 0xff) { $sum -= 0x100; } - $r = sprintf("%02x", $sum) . $r; - - if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; } - - return $r; - } -} - - -# Subtract two hex addresses of length $address_length. -# Run pprof --test for unit test if this is changed. -sub AddressSub { - my $addr1 = shift; - my $addr2 = shift; - my $diff; - - if ($address_length == 8) { - # Perl doesn't cope with wraparound arithmetic, so do it explicitly: - $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16); - return sprintf("%08x", $diff); - - } else { - # Do the addition in 7-nibble chunks to trivialize borrow handling. - # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; } - - my $a1 = hex(substr($addr1,-7)); - $addr1 = substr($addr1,0,-7); - my $a2 = hex(substr($addr2,-7)); - $addr2 = substr($addr2,0,-7); - my $b = 0; - if ($a2 > $a1) { - $b = 1; - $a1 += 0x10000000; - } - $diff = $a1 - $a2; - my $r = sprintf("%07x", $diff); - - $a1 = hex(substr($addr1,-7)); - $addr1 = substr($addr1,0,-7); - $a2 = hex(substr($addr2,-7)) + $b; - $addr2 = substr($addr2,0,-7); - $b = 0; - if ($a2 > $a1) { - $b = 1; - $a1 += 0x10000000; - } - $diff = $a1 - $a2; - $r = sprintf("%07x", $diff) . 
$r; - - $a1 = hex($addr1); - $a2 = hex($addr2) + $b; - if ($a2 > $a1) { $a1 += 0x100; } - $diff = $a1 - $a2; - $r = sprintf("%02x", $diff) . $r; - - # if ($main::opt_debug) { print STDERR "$r\n"; } - - return $r; - } -} - -# Increment a hex addresses of length $address_length. -# Run pprof --test for unit test if this is changed. -sub AddressInc { - my $addr = shift; - my $sum; - - if ($address_length == 8) { - # Perl doesn't cope with wraparound arithmetic, so do it explicitly: - $sum = (hex($addr)+1) % (0x10000000 * 16); - return sprintf("%08x", $sum); - - } else { - # Do the addition in 7-nibble chunks to trivialize carry handling. - # We are always doing this to step through the addresses in a function, - # and will almost never overflow the first chunk, so we check for this - # case and exit early. - - # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; } - - my $a1 = substr($addr,-7); - $addr = substr($addr,0,-7); - $sum = hex($a1) + 1; - my $r = sprintf("%07x", $sum); - if ($sum <= 0xfffffff) { - $r = $addr . $r; - # if ($main::opt_debug) { print STDERR "$r\n"; } - return HexExtend($r); - } else { - $r = "0000000"; - } - - $a1 = substr($addr,-7); - $addr = substr($addr,0,-7); - $sum = hex($a1) + 1; - $r = sprintf("%07x", $sum) . $r; - if ($sum <= 0xfffffff) { - $r = $addr . $r; - # if ($main::opt_debug) { print STDERR "$r\n"; } - return HexExtend($r); - } else { - $r = "00000000000000"; - } - - $sum = hex($addr) + 1; - if ($sum > 0xff) { $sum -= 0x100; } - $r = sprintf("%02x", $sum) . $r; - - # if ($main::opt_debug) { print STDERR "$r\n"; } - return $r; - } -} - -# Extract symbols for all PC values found in profile -sub ExtractSymbols { - my $libs = shift; - my $pcset = shift; - - my $symbols = {}; - - # Map each PC value to the containing library. To make this faster, - # we sort libraries by their starting pc value (highest first), and - # advance through the libraries as we advance the pc. Sometimes the - # addresses of libraries may overlap with the addresses of the main - # binary, so to make sure the libraries 'win', we iterate over the - # libraries in reverse order (which assumes the binary doesn't start - # in the middle of a library, which seems a fair assumption). - my @pcs = (sort { $a cmp $b } keys(%{$pcset})); # pcset is 0-extended strings - foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { - my $libname = $lib->[0]; - my $start = $lib->[1]; - my $finish = $lib->[2]; - my $offset = $lib->[3]; - - # Use debug library if it exists - my $debug_libname = DebuggingLibrary($libname); - if ($debug_libname) { - $libname = $debug_libname; - } - - # Get list of pcs that belong in this library. - my $contained = []; - my ($start_pc_index, $finish_pc_index); - # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index]. - for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; - $finish_pc_index--) { - last if $pcs[$finish_pc_index - 1] le $finish; - } - # Find smallest start_pc_index such that $start <= $pc[$start_pc_index]. - for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; - $start_pc_index--) { - last if $pcs[$start_pc_index - 1] lt $start; - } - # This keeps PC values higher than $pc[$finish_pc_index] in @pcs, - # in case there are overlaps in libraries and the main binary. 
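
The chunked arithmetic above exists because a full 16-digit address does not fit in a 32-bit Perl integer, while each 7-nibble (28-bit) chunk does. A compact standalone rendition of the same carry scheme for AddressAdd, checked against the core Math::BigInt module (the sample addresses are arbitrary):

  use strict; use warnings;
  use Math::BigInt;
  sub add16 {                                     # add two 16-digit hex strings
      my ($u, $v) = @_;
      my ($carry, $out) = (0, '');
      for (1, 2) {                                # the two low 7-nibble chunks
          my $sum = hex(substr($u, -7)) + hex(substr($v, -7)) + $carry;
          ($carry, $sum) = $sum > 0xfffffff ? (1, $sum - 0x10000000) : (0, $sum);
          $out = sprintf('%07x', $sum) . $out;
          ($u, $v) = (substr($u, 0, -7), substr($v, 0, -7));
      }
      my $top = hex($u) + hex($v) + $carry;       # top two nibbles wrap at 8 bits
      $top -= 0x100 if $top > 0xff;
      return sprintf('%02x', $top) . $out;
  }
  my ($lo_max, $one) = ('00000000ffffffff', '0000000000000001');
  print add16($lo_max, $one), "\n";                                       # 0000000100000000
  print Math::BigInt->new("0x$lo_max")->badd("0x$one")->as_hex(), "\n";   # 0x100000000
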
- @{$contained} = splice(@pcs, $start_pc_index, - $finish_pc_index - $start_pc_index); - # Map to symbols - MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); - } - - return $symbols; -} - -# Map list of PC values to symbols for a given image -sub MapToSymbols { - my $image = shift; - my $offset = shift; - my $pclist = shift; - my $symbols = shift; - - my $debug = 0; - - # Ignore empty binaries - if ($#{$pclist} < 0) { return; } - - # Figure out the addr2line command to use - my $addr2line = $obj_tool_map{"addr2line"}; - my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image); - if (exists $obj_tool_map{"addr2line_pdb"}) { - $addr2line = $obj_tool_map{"addr2line_pdb"}; - $cmd = ShellEscape($addr2line, "--demangle", "-f", "-C", "-e", $image); - } - - # If "addr2line" isn't installed on the system at all, just use - # nm to get what info we can (function names, but not line numbers). - if (system(ShellEscape($addr2line, "--help") . " >$dev_null 2>&1") != 0) { - MapSymbolsWithNM($image, $offset, $pclist, $symbols); - return; - } - - # "addr2line -i" can produce a variable number of lines per input - # address, with no separator that allows us to tell when data for - # the next address starts. So we find the address for a special - # symbol (_fini) and interleave this address between all real - # addresses passed to addr2line. The name of this special symbol - # can then be used as a separator. - $sep_address = undef; # May be filled in by MapSymbolsWithNM() - my $nm_symbols = {}; - MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols); - if (defined($sep_address)) { - # Only add " -i" to addr2line if the binary supports it. - # addr2line --help returns 0, but not if it sees an unknown flag first. - if (system("$cmd -i --help >$dev_null 2>&1") == 0) { - $cmd .= " -i"; - } else { - $sep_address = undef; # no need for sep_address if we don't support -i - } - } - - # Make file with all PC values with intervening 'sep_address' so - # that we can reliably detect the end of inlined function list - open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n"); - if ($debug) { print("---- $image ---\n"); } - for (my $i = 0; $i <= $#{$pclist}; $i++) { - # addr2line always reads hex addresses, and does not need '0x' prefix. - if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); } - printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset)); - if (defined($sep_address)) { - printf ADDRESSES ("%s\n", $sep_address); - } - } - close(ADDRESSES); - if ($debug) { - print("----\n"); - system("cat", $main::tmpfile_sym); - print("----\n"); - system("$cmd < " . ShellEscape($main::tmpfile_sym)); - print("----\n"); - } - - open(SYMBOLS, "$cmd <" . ShellEscape($main::tmpfile_sym) . " |") - || error("$cmd: $!\n"); - my $count = 0; # Index in pclist - while () { - # Read fullfunction and filelineinfo from next pair of lines - s/\r?\n$//g; - my $fullfunction = $_; - $_ = ; - s/\r?\n$//g; - my $filelinenum = $_; - - if (defined($sep_address) && $fullfunction eq $sep_symbol) { - # Terminating marker for data for this address - $count++; - next; - } - - $filelinenum =~ s|\\|/|g; # turn windows-style paths into unix-style paths - - my $pcstr = $pclist->[$count]; - my $function = ShortFunctionName($fullfunction); - my $nms = $nm_symbols->{$pcstr}; - if (defined($nms)) { - if ($fullfunction eq '??') { - # nm found a symbol for us. 
- $function = $nms->[0]; - $fullfunction = $nms->[2]; - } else { - # MapSymbolsWithNM tags each routine with its starting address, - # useful in case the image has multiple occurrences of this - # routine. (It uses a syntax that resembles template paramters, - # that are automatically stripped out by ShortFunctionName().) - # addr2line does not provide the same information. So we check - # if nm disambiguated our symbol, and if so take the annotated - # (nm) version of the routine-name. TODO(csilvers): this won't - # catch overloaded, inlined symbols, which nm doesn't see. - # Better would be to do a check similar to nm's, in this fn. - if ($nms->[2] =~ m/^\Q$function\E/) { # sanity check it's the right fn - $function = $nms->[0]; - $fullfunction = $nms->[2]; - } - } - } - - # Prepend to accumulated symbols for pcstr - # (so that caller comes before callee) - my $sym = $symbols->{$pcstr}; - if (!defined($sym)) { - $sym = []; - $symbols->{$pcstr} = $sym; - } - unshift(@{$sym}, $function, $filelinenum, $fullfunction); - if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); } - if (!defined($sep_address)) { - # Inlining is off, so this entry ends immediately - $count++; - } - } - close(SYMBOLS); -} - -# Use nm to map the list of referenced PCs to symbols. Return true iff we -# are able to read procedure information via nm. -sub MapSymbolsWithNM { - my $image = shift; - my $offset = shift; - my $pclist = shift; - my $symbols = shift; - - # Get nm output sorted by increasing address - my $symbol_table = GetProcedureBoundaries($image, "."); - if (!%{$symbol_table}) { - return 0; - } - # Start addresses are already the right length (8 or 16 hex digits). - my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] } - keys(%{$symbol_table}); - - if ($#names < 0) { - # No symbols: just use addresses - foreach my $pc (@{$pclist}) { - my $pcstr = "0x" . $pc; - $symbols->{$pc} = [$pcstr, "?", $pcstr]; - } - return 0; - } - - # Sort addresses so we can do a join against nm output - my $index = 0; - my $fullname = $names[0]; - my $name = ShortFunctionName($fullname); - foreach my $pc (sort { $a cmp $b } @{$pclist}) { - # Adjust for mapped offset - my $mpc = AddressSub($pc, $offset); - while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){ - $index++; - $fullname = $names[$index]; - $name = ShortFunctionName($fullname); - } - if ($mpc lt $symbol_table->{$fullname}->[1]) { - $symbols->{$pc} = [$name, "?", $fullname]; - } else { - my $pcstr = "0x" . 
$pc; - $symbols->{$pc} = [$pcstr, "?", $pcstr]; - } - } - return 1; -} - -sub ShortFunctionName { - my $function = shift; - while ($function =~ s/\([^()]*\)(\s*const)?//g) { } # Argument types - while ($function =~ s/<[^<>]*>//g) { } # Remove template arguments - $function =~ s/^.*\s+(\w+::)/$1/; # Remove leading type - return $function; -} - -# Trim overly long symbols found in disassembler output -sub CleanDisassembly { - my $d = shift; - while ($d =~ s/\([^()%]*\)(\s*const)?//g) { } # Argument types, not (%rax) - while ($d =~ s/(\w+)<[^<>]*>/$1/g) { } # Remove template arguments - return $d; -} - -# Clean file name for display -sub CleanFileName { - my ($f) = @_; - $f =~ s|^/proc/self/cwd/||; - $f =~ s|^\./||; - return $f; -} - -# Make address relative to section and clean up for display -sub UnparseAddress { - my ($offset, $address) = @_; - $address = AddressSub($address, $offset); - $address =~ s/^0x//; - $address =~ s/^0*//; - return $address; -} - -##### Miscellaneous ##### - -# Find the right versions of the above object tools to use. The -# argument is the program file being analyzed, and should be an ELF -# 32-bit or ELF 64-bit executable file. The location of the tools -# is determined by considering the following options in this order: -# 1) --tools option, if set -# 2) PPROF_TOOLS environment variable, if set -# 3) the environment -sub ConfigureObjTools { - my $prog_file = shift; - - # Check for the existence of $prog_file because /usr/bin/file does not - # predictably return error status in prod. - (-e $prog_file) || error("$prog_file does not exist.\n"); - - my $file_type = undef; - if (-e "/usr/bin/file") { - # Follow symlinks (at least for systems where "file" supports that). - my $escaped_prog_file = ShellEscape($prog_file); - $file_type = `/usr/bin/file -L $escaped_prog_file 2>$dev_null || - /usr/bin/file $escaped_prog_file`; - } elsif ($^O == "MSWin32") { - $file_type = "MS Windows"; - } else { - print STDERR "WARNING: Can't determine the file type of $prog_file"; - } - - if ($file_type =~ /64-bit/) { - # Change $address_length to 16 if the program file is ELF 64-bit. - # We can't detect this from many (most?) heap or lock contention - # profiles, since the actual addresses referenced are generally in low - # memory even for 64-bit programs. - $address_length = 16; - } - - if ($file_type =~ /MS Windows/) { - # For windows, we provide a version of nm and addr2line as part of - # the opensource release, which is capable of parsing - # Windows-style PDB executables. It should live in the path, or - # in the same directory as pprof. - $obj_tool_map{"nm_pdb"} = "nm-pdb"; - $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb"; - } - - if ($file_type =~ /Mach-O/) { - # OS X uses otool to examine Mach-O files, rather than objdump. - $obj_tool_map{"otool"} = "otool"; - $obj_tool_map{"addr2line"} = "false"; # no addr2line - $obj_tool_map{"objdump"} = "false"; # no objdump - } - - # Go fill in %obj_tool_map with the pathnames to use: - foreach my $tool (keys %obj_tool_map) { - $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool}); - } -} - -# Returns the path of a caller-specified object tool. If --tools or -# PPROF_TOOLS are specified, then returns the full path to the tool -# with that prefix. Otherwise, returns the path unmodified (which -# means we will look for it on PATH). -sub ConfigureTool { - my $tool = shift; - my $path; - - # --tools (or $PPROF_TOOLS) is a comma separated list, where each - # item is either a) a pathname prefix, or b) a map of the form - # :. 
First we look for an entry of type (b) for our - # tool. If one is found, we use it. Otherwise, we consider all the - # pathname prefixes in turn, until one yields an existing file. If - # none does, we use a default path. - my $tools = $main::opt_tools || $ENV{"PPROF_TOOLS"} || ""; - if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { - $path = $2; - # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. - } elsif ($tools ne '') { - foreach my $prefix (split(',', $tools)) { - next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list - if (-x $prefix . $tool) { - $path = $prefix . $tool; - last; - } - } - if (!$path) { - error("No '$tool' found with prefix specified by " . - "--tools (or \$PPROF_TOOLS) '$tools'\n"); - } - } else { - # ... otherwise use the version that exists in the same directory as - # pprof. If there's nothing there, use $PATH. - $0 =~ m,[^/]*$,; # this is everything after the last slash - my $dirname = $`; # this is everything up to and including the last slash - if (-x "$dirname$tool") { - $path = "$dirname$tool"; - } else { - $path = $tool; - } - } - if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; } - return $path; -} - -sub ShellEscape { - my @escaped_words = (); - foreach my $word (@_) { - my $escaped_word = $word; - if ($word =~ m![^a-zA-Z0-9/.,_=-]!) { # check for anything not in whitelist - $escaped_word =~ s/'/'\\''/; - $escaped_word = "'$escaped_word'"; - } - push(@escaped_words, $escaped_word); - } - return join(" ", @escaped_words); -} - -sub cleanup { - unlink($main::tmpfile_sym); - unlink(keys %main::tempnames); - - # We leave any collected profiles in $HOME/pprof in case the user wants - # to look at them later. We print a message informing them of this. - if ((scalar(@main::profile_files) > 0) && - defined($main::collected_profile)) { - if (scalar(@main::profile_files) == 1) { - print STDERR "Dynamically gathered profile is in $main::collected_profile\n"; - } - print STDERR "If you want to investigate this profile further, you can do:\n"; - print STDERR "\n"; - print STDERR " pprof \\\n"; - print STDERR " $main::prog \\\n"; - print STDERR " $main::collected_profile\n"; - print STDERR "\n"; - } -} - -sub sighandler { - cleanup(); - exit(1); -} - -sub error { - my $msg = shift; - print STDERR $msg; - cleanup(); - exit(1); -} - - -# Run $nm_command and get all the resulting procedure boundaries whose -# names match "$regexp" and returns them in a hashtable mapping from -# procedure name to a two-element vector of [start address, end address] -sub GetProcedureBoundariesViaNm { - my $escaped_nm_command = shift; # shell-escaped - my $regexp = shift; - - my $symbol_table = {}; - open(NM, "$escaped_nm_command |") || error("$escaped_nm_command: $!\n"); - my $last_start = "0"; - my $routine = ""; - while () { - s/\r//g; # turn windows-looking lines into unix-looking lines - if (m/^\s*([0-9a-f]+) (.) (..*)/) { - my $start_val = $1; - my $type = $2; - my $this_routine = $3; - - # It's possible for two symbols to share the same address, if - # one is a zero-length variable (like __start_google_malloc) or - # one symbol is a weak alias to another (like __libc_malloc). - # In such cases, we want to ignore all values except for the - # actual symbol, which in nm-speak has type "T". The logic - # below does this, though it's a bit tricky: what happens when - # we have a series of lines with the same address, is the first - # one gets queued up to be processed. 
However, it won't - # *actually* be processed until later, when we read a line with - # a different address. That means that as long as we're reading - # lines with the same address, we have a chance to replace that - # item in the queue, which we do whenever we see a 'T' entry -- - # that is, a line with type 'T'. If we never see a 'T' entry, - # we'll just go ahead and process the first entry (which never - # got touched in the queue), and ignore the others. - if ($start_val eq $last_start && $type =~ /t/i) { - # We are the 'T' symbol at this address, replace previous symbol. - $routine = $this_routine; - next; - } elsif ($start_val eq $last_start) { - # We're not the 'T' symbol at this address, so ignore us. - next; - } - - if ($this_routine eq $sep_symbol) { - $sep_address = HexExtend($start_val); - } - - # Tag this routine with the starting address in case the image - # has multiple occurrences of this routine. We use a syntax - # that resembles template parameters that are automatically - # stripped out by ShortFunctionName() - $this_routine .= "<$start_val>"; - - if (defined($routine) && $routine =~ m/$regexp/) { - $symbol_table->{$routine} = [HexExtend($last_start), - HexExtend($start_val)]; - } - $last_start = $start_val; - $routine = $this_routine; - } elsif (m/^Loaded image name: (.+)/) { - # The win32 nm workalike emits information about the binary it is using. - if ($main::opt_debug) { print STDERR "Using Image $1\n"; } - } elsif (m/^PDB file name: (.+)/) { - # The win32 nm workalike emits information about the pdb it is using. - if ($main::opt_debug) { print STDERR "Using PDB $1\n"; } - } - } - close(NM); - # Handle the last line in the nm output. Unfortunately, we don't know - # how big this last symbol is, because we don't know how big the file - # is. For now, we just give it a size of 0. - # TODO(csilvers): do better here. - if (defined($routine) && $routine =~ m/$regexp/) { - $symbol_table->{$routine} = [HexExtend($last_start), - HexExtend($last_start)]; - } - return $symbol_table; -} - -# Gets the procedure boundaries for all routines in "$image" whose names -# match "$regexp" and returns them in a hashtable mapping from procedure -# name to a two-element vector of [start address, end address]. -# Will return an empty map if nm is not installed or not working properly. -sub GetProcedureBoundaries { - my $image = shift; - my $regexp = shift; - - # If $image doesn't start with /, then put ./ in front of it. This works - # around an obnoxious bug in our probing of nm -f behavior. - # "nm -f $image" is supposed to fail on GNU nm, but if: - # - # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND - # b. you have a.out in your current directory (a not uncommon occurence) - # - # then "nm -f $image" succeeds because -f only looks at the first letter of - # the argument, which looks valid because it's [BbSsPp], and then since - # there's no image provided, it looks for a.out and finds it. - # - # This regex makes sure that $image starts with . or /, forcing the -f - # parsing to fail since . and / are not valid formats. - $image =~ s#^[^/]#./$&#; - - # For libc libraries, the copy in /usr/lib/debug contains debugging symbols - my $debugging = DebuggingLibrary($image); - if ($debugging) { - $image = $debugging; - } - - my $nm = $obj_tool_map{"nm"}; - my $cppfilt = $obj_tool_map{"c++filt"}; - - # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm - # binary doesn't support --demangle. 
In addition, for OS X we need - # to use the -f flag to get 'flat' nm output (otherwise we don't sort - # properly and get incorrect results). Unfortunately, GNU nm uses -f - # in an incompatible way. So first we test whether our nm supports - # --demangle and -f. - my $demangle_flag = ""; - my $cppfilt_flag = ""; - my $to_devnull = ">$dev_null 2>&1"; - if (system(ShellEscape($nm, "--demangle", "image") . $to_devnull) == 0) { - # In this mode, we do "nm --demangle " - $demangle_flag = "--demangle"; - $cppfilt_flag = ""; - } elsif (system(ShellEscape($cppfilt, $image) . $to_devnull) == 0) { - # In this mode, we do "nm | c++filt" - $cppfilt_flag = " | " . ShellEscape($cppfilt); - }; - my $flatten_flag = ""; - if (system(ShellEscape($nm, "-f", $image) . $to_devnull) == 0) { - $flatten_flag = "-f"; - } - - # Finally, in the case $imagie isn't a debug library, we try again with - # -D to at least get *exported* symbols. If we can't use --demangle, - # we use c++filt instead, if it exists on this system. - my @nm_commands = (ShellEscape($nm, "-n", $flatten_flag, $demangle_flag, - $image) . " 2>$dev_null $cppfilt_flag", - ShellEscape($nm, "-D", "-n", $flatten_flag, $demangle_flag, - $image) . " 2>$dev_null $cppfilt_flag", - # 6nm is for Go binaries - ShellEscape("6nm", "$image") . " 2>$dev_null | sort", - ); - - # If the executable is an MS Windows PDB-format executable, we'll - # have set up obj_tool_map("nm_pdb"). In this case, we actually - # want to use both unix nm and windows-specific nm_pdb, since - # PDB-format executables can apparently include dwarf .o files. - if (exists $obj_tool_map{"nm_pdb"}) { - push(@nm_commands, - ShellEscape($obj_tool_map{"nm_pdb"}, "--demangle", $image) - . " 2>$dev_null"); - } - - foreach my $nm_command (@nm_commands) { - my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp); - return $symbol_table if (%{$symbol_table}); - } - my $symbol_table = {}; - return $symbol_table; -} - - -# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings. -# To make them more readable, we add underscores at interesting places. -# This routine removes the underscores, producing the canonical representation -# used by pprof to represent addresses, particularly in the tested routines. -sub CanonicalHex { - my $arg = shift; - return join '', (split '_',$arg); -} - - -# Unit test for AddressAdd: -sub AddressAddUnitTest { - my $test_data_8 = shift; - my $test_data_16 = shift; - my $error_count = 0; - my $fail_count = 0; - my $pass_count = 0; - # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n"; - - # First a few 8-nibble addresses. Note that this implementation uses - # plain old arithmetic, so a quick sanity check along with verifying what - # happens to overflow (we want it to wrap): - $address_length = 8; - foreach my $row (@{$test_data_8}) { - if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } - my $sum = AddressAdd ($row->[0], $row->[1]); - if ($sum ne $row->[2]) { - printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, - $row->[0], $row->[1], $row->[2]; - ++$fail_count; - } else { - ++$pass_count; - } - } - printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n", - $pass_count, $fail_count; - $error_count = $fail_count; - $fail_count = 0; - $pass_count = 0; - - # Now 16-nibble addresses. 
- $address_length = 16; - foreach my $row (@{$test_data_16}) { - if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } - my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1])); - my $expected = join '', (split '_',$row->[2]); - if ($sum ne CanonicalHex($row->[2])) { - printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, - $row->[0], $row->[1], $row->[2]; - ++$fail_count; - } else { - ++$pass_count; - } - } - printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n", - $pass_count, $fail_count; - $error_count += $fail_count; - - return $error_count; -} - - -# Unit test for AddressSub: -sub AddressSubUnitTest { - my $test_data_8 = shift; - my $test_data_16 = shift; - my $error_count = 0; - my $fail_count = 0; - my $pass_count = 0; - # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n"; - - # First a few 8-nibble addresses. Note that this implementation uses - # plain old arithmetic, so a quick sanity check along with verifying what - # happens to overflow (we want it to wrap): - $address_length = 8; - foreach my $row (@{$test_data_8}) { - if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } - my $sum = AddressSub ($row->[0], $row->[1]); - if ($sum ne $row->[3]) { - printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, - $row->[0], $row->[1], $row->[3]; - ++$fail_count; - } else { - ++$pass_count; - } - } - printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n", - $pass_count, $fail_count; - $error_count = $fail_count; - $fail_count = 0; - $pass_count = 0; - - # Now 16-nibble addresses. - $address_length = 16; - foreach my $row (@{$test_data_16}) { - if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } - my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1])); - if ($sum ne CanonicalHex($row->[3])) { - printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, - $row->[0], $row->[1], $row->[3]; - ++$fail_count; - } else { - ++$pass_count; - } - } - printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n", - $pass_count, $fail_count; - $error_count += $fail_count; - - return $error_count; -} - - -# Unit test for AddressInc: -sub AddressIncUnitTest { - my $test_data_8 = shift; - my $test_data_16 = shift; - my $error_count = 0; - my $fail_count = 0; - my $pass_count = 0; - # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n"; - - # First a few 8-nibble addresses. Note that this implementation uses - # plain old arithmetic, so a quick sanity check along with verifying what - # happens to overflow (we want it to wrap): - $address_length = 8; - foreach my $row (@{$test_data_8}) { - if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } - my $sum = AddressInc ($row->[0]); - if ($sum ne $row->[4]) { - printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, - $row->[0], $row->[4]; - ++$fail_count; - } else { - ++$pass_count; - } - } - printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n", - $pass_count, $fail_count; - $error_count = $fail_count; - $fail_count = 0; - $pass_count = 0; - - # Now 16-nibble addresses. 
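The Address* routines above exist only because, as their comments note, Perl doesn't cope with wraparound arithmetic; the 7-nibble chunking keeps every intermediate value small enough that carries can be handled by hand. A minimal C sketch of the semantics being emulated, and exercised by the unit-test vectors in RunUnitTests further below (the helper names are illustrative):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Width-limited wraparound arithmetic: 8 nibbles wraps modulo 2^32,
 * 16 nibbles wraps modulo 2^64, matching $address_length above. */
static uint64_t
addr_mask(int address_length)
{
    return ((address_length == 8) ? UINT64_C(0xffffffff) : UINT64_MAX);
}

static uint64_t addr_add(uint64_t a, uint64_t b, int len) { return ((a + b) & addr_mask(len)); }
static uint64_t addr_sub(uint64_t a, uint64_t b, int len) { return ((a - b) & addr_mask(len)); }
static uint64_t addr_inc(uint64_t a, int len)             { return ((a + 1) & addr_mask(len)); }

int
main(void)
{
    /* Same cases as the first 8-nibble test tuples; wrapping is expected. */
    printf("%08" PRIx64 "\n", addr_add(0xaaaaaaaa, 0x50505050, 8)); /* fafafafa */
    printf("%08" PRIx64 "\n", addr_sub(0xaaaaaaaa, 0x50505050, 8)); /* 5a5a5a5a */
    printf("%08" PRIx64 "\n", addr_add(0xffffffff, 0xaaaaaaaa, 8)); /* aaaaaaa9 */
    printf("%08" PRIx64 "\n", addr_inc(0xffffffff, 8));             /* 00000000 */
    return (0);
}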
- $address_length = 16; - foreach my $row (@{$test_data_16}) { - if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } - my $sum = AddressInc (CanonicalHex($row->[0])); - if ($sum ne CanonicalHex($row->[4])) { - printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, - $row->[0], $row->[4]; - ++$fail_count; - } else { - ++$pass_count; - } - } - printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n", - $pass_count, $fail_count; - $error_count += $fail_count; - - return $error_count; -} - - -# Driver for unit tests. -# Currently just the address add/subtract/increment routines for 64-bit. -sub RunUnitTests { - my $error_count = 0; - - # This is a list of tuples [a, b, a+b, a-b, a+1] - my $unit_test_data_8 = [ - [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)], - [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)], - [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)], - [qw(00000001 ffffffff 00000000 00000002 00000002)], - [qw(00000001 fffffff0 fffffff1 00000011 00000002)], - ]; - my $unit_test_data_16 = [ - # The implementation handles data in 7-nibble chunks, so those are the - # interesting boundaries. - [qw(aaaaaaaa 50505050 - 00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)], - [qw(50505050 aaaaaaaa - 00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)], - [qw(ffffffff aaaaaaaa - 00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)], - [qw(00000001 ffffffff - 00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)], - [qw(00000001 fffffff0 - 00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)], - - [qw(00_a00000a_aaaaaaa 50505050 - 00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)], - [qw(0f_fff0005_0505050 aaaaaaaa - 0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)], - [qw(00_000000f_fffffff 01_800000a_aaaaaaa - 01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)], - [qw(00_0000000_0000001 ff_fffffff_fffffff - 00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)], - [qw(00_0000000_0000001 ff_fffffff_ffffff0 - ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)], - ]; - - $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16); - $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16); - $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16); - if ($error_count > 0) { - print STDERR $error_count, " errors: FAILED\n"; - } else { - print STDERR "PASS\n"; - } - exit ($error_count); -} diff --git a/configure.ac b/configure.ac index 5f9bbd3..05c0d56 100644 --- a/configure.ac +++ b/configure.ac @@ -1607,7 +1607,7 @@ AC_CONFIG_HEADERS([$cfghdrs_tup]) dnl ============================================================================ dnl Generate outputs. -AC_CONFIG_FILES([$cfgoutputs_tup config.stamp bin/jemalloc-config bin/jemalloc.sh]) +AC_CONFIG_FILES([$cfgoutputs_tup config.stamp bin/jemalloc-config bin/jemalloc.sh bin/jeprof]) AC_SUBST([cfgoutputs_in]) AC_SUBST([cfgoutputs_out]) AC_OUTPUT diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index c9ee997..123fb3a 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1132,8 +1132,9 @@ malloc_conf = "xmalloc:true";]]> option for information on high-water-triggered profile dumping, and the opt.prof_final option for final profile dumping. Profile output is compatible with - the included pprof Perl script, which originates - from the gperftools + the jeprof command, which is based on the + pprof that is developed as part of the gperftools package. 
diff --git a/src/prof.c b/src/prof.c index 8453ea8..d097749 100644 --- a/src/prof.c +++ b/src/prof.c @@ -1418,7 +1418,7 @@ prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_ngctx, cnt_all->curobjs, (cnt_all->curobjs != 1) ? "s" : "", leak_ngctx, (leak_ngctx != 1) ? "s" : ""); malloc_printf( - ": Run pprof on \"%s\" for leak detail\n", + ": Run jeprof on \"%s\" for leak detail\n", filename); } } -- cgit v0.12 From 6bb54cb9da9cb0f996c16c6ea3e1dda0755390f5 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 5 May 2015 15:43:34 -0700 Subject: Clean up bin/jeprof in distclean build target. --- Makefile.in | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.in b/Makefile.in index f539fad..7f5ac76 100644 --- a/Makefile.in +++ b/Makefile.in @@ -410,6 +410,7 @@ clean: distclean: clean rm -f $(objroot)bin/jemalloc-config rm -f $(objroot)bin/jemalloc.sh + rm -f $(objroot)bin/jeprof rm -f $(objroot)config.log rm -f $(objroot)config.status rm -f $(objroot)config.stamp -- cgit v0.12 From 8a03cf039cd06f9fa6972711195055d865673966 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 4 May 2015 09:58:36 -0700 Subject: Implement cache index randomization for large allocations. Extract szad size quantization into {extent,run}_quantize(), and . quantize szad run sizes to the union of valid small region run sizes and large run sizes. Refactor iteration in arena_run_first_fit() to use run_quantize{,_first,_next(), and add support for padded large runs. For large allocations that have no specified alignment constraints, compute a pseudo-random offset from the beginning of the first backing page that is a multiple of the cache line size. Under typical configurations with 4-KiB pages and 64-byte cache lines this results in a uniform distribution among 64 page boundary offsets. Add the --disable-cache-oblivious option, primarily intended for performance testing. This resolves #13. --- ChangeLog | 3 + INSTALL | 9 + configure.ac | 18 ++ include/jemalloc/internal/arena.h | 53 +++-- include/jemalloc/internal/jemalloc_internal.h.in | 7 + .../jemalloc/internal/jemalloc_internal_defs.h.in | 6 + include/jemalloc/internal/prng.h | 12 +- src/arena.c | 216 +++++++++++++++++---- src/extent.c | 25 ++- src/jemalloc.c | 3 +- 10 files changed, 279 insertions(+), 73 deletions(-) diff --git a/ChangeLog b/ChangeLog index 33139f9..b6fa366 100644 --- a/ChangeLog +++ b/ChangeLog @@ -101,6 +101,9 @@ found in the git revision history: run fragmentation, smaller runs reduce external fragmentation for small size classes, and packed (less uniformly aligned) metadata layout improves CPU cache set distribution. + - Randomly distribute large allocation base pointer alignment relative to page + boundaries in order to more uniformly utilize CPU cache sets. This can be + disabled via the --disable-cache-oblivious configure option. - Micro-optimize the fast paths for the public API functions. - Refactor thread-specific data to reside in a single structure. This assures that only a single TLS read is necessary per call into the public API. diff --git a/INSTALL b/INSTALL index cd760ca..8d39687 100644 --- a/INSTALL +++ b/INSTALL @@ -185,6 +185,15 @@ any of the following arguments (not a definitive list) to 'configure': thread-local variables via the __thread keyword. If TLS is available, jemalloc uses it for several purposes. +--disable-cache-oblivious + Disable cache-oblivious large allocation alignment for large allocation + requests with no alignment constraints. 
If this feature is disabled, all + large allocations are page-aligned as an implementation artifact, which can + severely harm CPU cache utilization. However, the cache-oblivious layout + comes at the cost of one extra page per large allocation, which in the + most extreme case increases physical memory usage for the 16 KiB size class + to 20 KiB. + --with-xslroot= Specify where to find DocBook XSL stylesheets when building the documentation. diff --git a/configure.ac b/configure.ac index 05c0d56..bb6f3a3 100644 --- a/configure.ac +++ b/configure.ac @@ -952,6 +952,23 @@ if test "x$enable_xmalloc" = "x1" ; then fi AC_SUBST([enable_xmalloc]) +dnl Support cache-oblivious allocation alignment by default. +AC_ARG_ENABLE([cache-oblivious], + [AS_HELP_STRING([--disable-cache-oblivious], + [Disable support for cache-oblivious allocation alignment])], +[if test "x$enable_cache_oblivious" = "xno" ; then + enable_cache_oblivious="0" +else + enable_cache_oblivious="1" +fi +], +[enable_cache_oblivious="1"] +) +if test "x$enable_cache_oblivious" = "x1" ; then + AC_DEFINE([JEMALLOC_CACHE_OBLIVIOUS], [ ]) +fi +AC_SUBST([enable_cache_oblivious]) + dnl ============================================================================ dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found. dnl One of those two functions should (theoretically) exist on all platforms @@ -1663,4 +1680,5 @@ AC_MSG_RESULT([xmalloc : ${enable_xmalloc}]) AC_MSG_RESULT([munmap : ${enable_munmap}]) AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}]) AC_MSG_RESULT([tls : ${enable_tls}]) +AC_MSG_RESULT([cache-oblivious : ${enable_cache_oblivious}]) AC_MSG_RESULT([===============================================================================]) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index dff99fb..fba1b81 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -290,6 +290,12 @@ struct arena_s { uint64_t prof_accumbytes; + /* + * PRNG state for cache index randomization of large allocation base + * pointers. 
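offset_state is consumed by the prng64() call added to arena_malloc_large() later in this patch. A compact sketch of that offset computation, specialized to the 4 KiB page / 64-byte cache line configuration the commit message describes (the seed value and function name here are stand-ins; the LCG constants are the ones used in the patch):

#include <stdint.h>
#include <stdio.h>

#define LG_PAGE      12  /* 4 KiB pages */
#define LG_CACHELINE 6   /* 64-byte cache lines */

/* Per-arena in the patch; seeded from the arena address (or from the arena
 * index in debug builds, for repeatability). */
static uint64_t offset_state = 1;

static uintptr_t
large_random_offset(void)
{
    uint64_t r;

    /* One step of the same LCG the prng64() macro applies. */
    offset_state = offset_state * UINT64_C(6364136223846793009) +
        UINT64_C(1442695040888963409);
    /* Keep the top LG_PAGE - LG_CACHELINE = 6 bits, so r is in [0, 64). */
    r = offset_state >> (64 - (LG_PAGE - LG_CACHELINE));
    /* A cacheline-multiple offset within the first backing page. */
    return ((uintptr_t)r << LG_CACHELINE);
}

int
main(void)
{
    for (int i = 0; i < 4; i++)
        printf("%#zx\n", (size_t)large_random_offset());
    return (0);
}

This offset is also why the patch pads every large run by large_pad (one page when cache-oblivious alignment is enabled): the returned pointer can land anywhere in the first page without the usable region spilling past the run.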
+ */ + uint64_t offset_state; + dss_prec_t dss_prec; /* @@ -394,7 +400,15 @@ struct arena_s { /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -extern ssize_t opt_lg_dirty_mult; +static const size_t large_pad = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + PAGE +#else + 0 +#endif + ; + +extern ssize_t opt_lg_dirty_mult; extern arena_bin_info_t arena_bin_info[NBINS]; @@ -475,7 +489,7 @@ void arena_stats_merge(arena_t *arena, const char **dss, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats); arena_t *arena_new(unsigned ind); -void arena_boot(void); +bool arena_boot(void); void arena_prefork(arena_t *arena); void arena_postfork_parent(arena_t *arena); void arena_postfork_child(arena_t *arena); @@ -721,7 +735,7 @@ arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size, { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); assert((flags & ~CHUNK_MAP_FLAGS_MASK) == 0); assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == flags); arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags); @@ -734,7 +748,7 @@ arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind, size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); size_t mapbits = arena_mapbitsp_read(mapbitsp); - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0); arena_mapbitsp_write(mapbitsp, size | (mapbits & PAGE_MASK)); } @@ -747,7 +761,7 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t mapbits = arena_mapbitsp_read(mapbitsp); size_t unzeroed; - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); assert((flags & CHUNK_MAP_DIRTY) == flags); unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */ arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags @@ -762,7 +776,8 @@ arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind, size_t mapbits = arena_mapbitsp_read(mapbitsp); assert(binind <= BININD_INVALID); - assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS); + assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS + + large_pad); arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_BININD_MASK) | (binind << CHUNK_MAP_BININD_SHIFT)); } @@ -1107,13 +1122,16 @@ arena_salloc(const void *ptr, bool demote) * end up looking at binind to determine that ptr is a * small allocation. 
*/ - assert(((uintptr_t)ptr & PAGE_MASK) == 0); - ret = arena_mapbits_large_size_get(chunk, pageind); + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); + ret = arena_mapbits_large_size_get(chunk, pageind) - + large_pad; assert(ret != 0); - assert(pageind + (ret>>LG_PAGE) <= chunk_npages); + assert(pageind + ((ret+large_pad)>>LG_PAGE) <= + chunk_npages); assert(arena_mapbits_dirty_get(chunk, pageind) == arena_mapbits_dirty_get(chunk, - pageind+(ret>>LG_PAGE)-1)); + pageind+((ret+large_pad)>>LG_PAGE)-1)); } else { /* * Small allocation (possibly promoted to a large @@ -1157,11 +1175,13 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) size_t size = arena_mapbits_large_size_get(chunk, pageind); - assert(((uintptr_t)ptr & PAGE_MASK) == 0); + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); - if (likely(tcache != NULL) && size <= tcache_maxclass) - tcache_dalloc_large(tsd, tcache, ptr, size); - else { + if (likely(tcache != NULL) && size <= tcache_maxclass) { + tcache_dalloc_large(tsd, tcache, ptr, size - + large_pad); + } else { arena_dalloc_large(extent_node_arena_get( &chunk->node), chunk, ptr); } @@ -1188,7 +1208,7 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) */ assert(((uintptr_t)ptr & PAGE_MASK) == 0); size = arena_mapbits_large_size_get(chunk, - pageind); + pageind) - large_pad; } } assert(s2u(size) == s2u(arena_salloc(ptr, false))); @@ -1205,7 +1225,8 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) &chunk->node), chunk, ptr, pageind); } } else { - assert(((uintptr_t)ptr & PAGE_MASK) == 0); + assert(config_cache_oblivious || ((uintptr_t)ptr & + PAGE_MASK) == 0); if (likely(tcache != NULL) && size <= tcache_maxclass) tcache_dalloc_large(tsd, tcache, ptr, size); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index b398f31..910ebf7 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -126,6 +126,13 @@ static const bool config_ivsalloc = false #endif ; +static const bool config_cache_oblivious = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + true +#else + false +#endif + ; #ifdef JEMALLOC_C11ATOMICS #include diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index a943d23..ed8347a 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -193,6 +193,12 @@ #undef JEMALLOC_IVSALLOC /* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#undef JEMALLOC_CACHE_OBLIVIOUS + +/* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ #undef JEMALLOC_ZONE diff --git a/include/jemalloc/internal/prng.h b/include/jemalloc/internal/prng.h index c6b1797..216d0ef 100644 --- a/include/jemalloc/internal/prng.h +++ b/include/jemalloc/internal/prng.h @@ -26,22 +26,22 @@ * const uint32_t a, c : See above discussion. */ #define prng32(r, lg_range, state, a, c) do { \ - assert(lg_range > 0); \ - assert(lg_range <= 32); \ + assert((lg_range) > 0); \ + assert((lg_range) <= 32); \ \ r = (state * (a)) + (c); \ state = r; \ - r >>= (32 - lg_range); \ + r >>= (32 - (lg_range)); \ } while (false) /* Same as prng32(), but 64 bits of pseudo-randomness, using uint64_t. 
*/ #define prng64(r, lg_range, state, a, c) do { \ - assert(lg_range > 0); \ - assert(lg_range <= 64); \ + assert((lg_range) > 0); \ + assert((lg_range) <= 64); \ \ r = (state * (a)) + (c); \ state = r; \ - r >>= (64 - lg_range); \ + r >>= (64 - (lg_range)); \ } while (false) #endif /* JEMALLOC_H_TYPES */ diff --git a/src/arena.c b/src/arena.c index 3041068..a053adf 100644 --- a/src/arena.c +++ b/src/arena.c @@ -12,6 +12,8 @@ size_t map_bias; size_t map_misc_offset; size_t arena_maxrun; /* Max run size for arenas. */ size_t arena_maxclass; /* Max size class for arenas. */ +static size_t small_maxrun; /* Max run size used for small size classes. */ +static bool *small_run_tab; /* Valid small run page multiples. */ unsigned nlclasses; /* Number of large size classes. */ unsigned nhclasses; /* Number of huge size classes. */ @@ -56,33 +58,102 @@ arena_run_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) rb_gen(static UNUSED, arena_run_tree_, arena_run_tree_t, arena_chunk_map_misc_t, rb_link, arena_run_comp) +static size_t +run_quantize(size_t size) +{ + size_t qsize; + + assert(size != 0); + assert(size == PAGE_CEILING(size)); + + /* Don't change sizes that are valid small run sizes. */ + if (size <= small_maxrun && small_run_tab[size >> LG_PAGE]) + return (size); + + /* + * Round down to the nearest run size that can actually be requested + * during normal large allocation. Add large_pad so that cache index + * randomization can offset the allocation from the page boundary. + */ + qsize = index2size(size2index(size - large_pad + 1) - 1) + large_pad; + if (qsize <= SMALL_MAXCLASS + large_pad) + return (run_quantize(size - large_pad)); + assert(qsize <= size); + return (qsize); +} + +static size_t +run_quantize_next(size_t size) +{ + size_t large_run_size_next; + + assert(size != 0); + assert(size == PAGE_CEILING(size)); + + /* + * Return the next quantized size greater than the input size. + * Quantized sizes comprise the union of run sizes that back small + * region runs, and run sizes that back large regions with no explicit + * alignment constraints. + */ + + if (size > SMALL_MAXCLASS) { + large_run_size_next = PAGE_CEILING(index2size(size2index(size - + large_pad) + 1) + large_pad); + } else + large_run_size_next = SIZE_T_MAX; + if (size >= small_maxrun) + return (large_run_size_next); + + while (true) { + size += PAGE; + assert(size <= small_maxrun); + if (small_run_tab[size >> LG_PAGE]) { + if (large_run_size_next < size) + return (large_run_size_next); + return (size); + } + } +} + +static size_t +run_quantize_first(size_t size) +{ + size_t qsize = run_quantize(size); + + if (qsize < size) { + /* + * Skip a quantization that may have an adequately large run, + * because under-sized runs may be mixed in. This only happens + * when an unusual size is requested, i.e. for aligned + * allocation, and is just one of several places where linear + * search would potentially find sufficiently aligned available + * memory somewhere lower. 
+ */ + qsize = run_quantize_next(size); + } + return (qsize); +} + JEMALLOC_INLINE_C int arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) { int ret; uintptr_t a_miscelm = (uintptr_t)a; - size_t a_size; - size_t b_size = arena_miscelm_to_bits(b) & ~PAGE_MASK; - index_t a_index, b_index; + size_t a_qsize; + size_t b_qsize = run_quantize(arena_miscelm_to_bits(b) & ~PAGE_MASK); if (a_miscelm & CHUNK_MAP_KEY) { - a_size = a_miscelm & ~PAGE_MASK; - assert(a_size == s2u(a_size)); + size_t a_size = a_miscelm & ~PAGE_MASK; + a_qsize = run_quantize(a_size); } else - a_size = arena_miscelm_to_bits(a) & ~PAGE_MASK; + a_qsize = run_quantize(arena_miscelm_to_bits(a) & ~PAGE_MASK); /* - * Compute the index of the largest size class that the run can satisfy - * a request for. + * Compare based on quantized size rather than size, in order to sort + * equally useful runs only by address. */ - a_index = size2index(a_size + 1) - 1; - b_index = size2index(b_size + 1) - 1; - - /* - * Compare based on size class index rather than size, in order to - * sort equally useful runs only by address. - */ - ret = (a_index > b_index) - (a_index < b_index); + ret = (a_qsize > b_qsize) - (a_qsize < b_qsize); if (ret == 0) { if (!(a_miscelm & CHUNK_MAP_KEY)) { uintptr_t b_miscelm = (uintptr_t)b; @@ -913,7 +984,7 @@ static arena_run_t * arena_run_first_fit(arena_t *arena, size_t size) { arena_run_t *run; - index_t index, max_index; + size_t search_size, max_size; assert(size == s2u(size)); assert(size == PAGE_CEILING(size)); @@ -924,14 +995,14 @@ arena_run_first_fit(arena_t *arena, size_t size) * and choose the lowest of the runs found. */ run = NULL; - for (index = size2index(size), max_index = size2index(arena_maxclass); - index <= max_index;) { + for (search_size = run_quantize_first(size), max_size = + run_quantize(arena_maxclass + large_pad); search_size <= max_size; + search_size = run_quantize_next(search_size)) { arena_run_t *currun; arena_chunk_t *currun_chunk; size_t currun_pageind, currun_size; - size_t usize = PAGE_CEILING(index2size(index)); - arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *)(usize | - CHUNK_MAP_KEY); + arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *) + (search_size | CHUNK_MAP_KEY); arena_chunk_map_misc_t *miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (miscelm == NULL) @@ -939,12 +1010,13 @@ arena_run_first_fit(arena_t *arena, size_t size) currun = &miscelm->run; if (run == NULL || (uintptr_t)currun < (uintptr_t)run) run = currun; + /* Skip iteration(s) if run is larger than the search size. */ currun_chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(currun); currun_pageind = arena_miscelm_to_pageind(miscelm); currun_size = arena_mapbits_unallocated_size_get(currun_chunk, currun_pageind); - assert(size2index(currun_size) + 1 > index); - index = size2index(currun_size) + 1; + assert(currun_size >= search_size); + search_size = currun_size; } return (run); @@ -966,7 +1038,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) arena_run_t *run; assert(size <= arena_maxrun); - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); /* Search the arena's chunks for the lowest best fit. 
*/ run = arena_run_alloc_large_helper(arena, size, zero); @@ -994,7 +1066,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) static arena_run_t * arena_run_alloc_small_helper(arena_t *arena, size_t size, index_t binind) { - arena_run_t *run = arena_run_first_fit(arena, PAGE_CEILING(size)); + arena_run_t *run = arena_run_first_fit(arena, size); if (run != NULL) arena_run_split_small(arena, run, size, binind); return (run); @@ -1007,7 +1079,7 @@ arena_run_alloc_small(arena_t *arena, size_t size, index_t binind) arena_run_t *run; assert(size <= arena_maxrun); - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); assert(binind != BININD_INVALID); /* Search the arena's chunks for the lowest best fit. */ @@ -1965,6 +2037,8 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) { void *ret; size_t usize; + uint64_t r; + uintptr_t random_offset; arena_run_t *run; arena_chunk_map_misc_t *miscelm; UNUSED bool idump; @@ -1972,13 +2046,25 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) /* Large allocation. */ usize = s2u(size); malloc_mutex_lock(&arena->lock); - run = arena_run_alloc_large(arena, usize, zero); + if (config_cache_oblivious) { + /* + * Compute a uniformly distributed offset within the first page + * that is a multiple of the cacheline size, e.g. [0 .. 63) * 64 + * for 4 KiB pages and 64-byte cachelines. + */ + prng64(r, LG_PAGE - LG_CACHELINE, arena->offset_state, + UINT64_C(6364136223846793009), UINT64_C(1442695040888963409)); + random_offset = ((uintptr_t)r) << LG_CACHELINE; + } else + random_offset = 0; + run = arena_run_alloc_large(arena, usize + large_pad, zero); if (run == NULL) { malloc_mutex_unlock(&arena->lock); return (NULL); } miscelm = arena_run_to_miscelm(run); - ret = arena_miscelm_to_rpages(miscelm); + ret = (void *)((uintptr_t)arena_miscelm_to_rpages(miscelm) + + random_offset); if (config_stats) { index_t index = size2index(usize) - NBINS; @@ -2019,14 +2105,14 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, arena_chunk_map_misc_t *miscelm; void *rpages; - assert((size & PAGE_MASK) == 0); + assert(size == PAGE_CEILING(size)); arena = arena_choose(tsd, arena); if (unlikely(arena == NULL)) return (NULL); alignment = PAGE_CEILING(alignment); - alloc_size = size + alignment - PAGE; + alloc_size = size + large_pad + alignment - PAGE; malloc_mutex_lock(&arena->lock); run = arena_run_alloc_large(arena, alloc_size, false); @@ -2041,7 +2127,7 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, leadsize = ALIGNMENT_CEILING((uintptr_t)rpages, alignment) - (uintptr_t)rpages; assert(alloc_size >= leadsize + size); - trailsize = alloc_size - leadsize - size; + trailsize = alloc_size - leadsize - size - large_pad; if (leadsize != 0) { arena_chunk_map_misc_t *head_miscelm = miscelm; arena_run_t *head_run = run; @@ -2055,10 +2141,10 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, alloc_size - leadsize); } if (trailsize != 0) { - arena_run_trim_tail(arena, chunk, run, size + trailsize, size, - false); + arena_run_trim_tail(arena, chunk, run, size + large_pad + + trailsize, size + large_pad, false); } - arena_run_init_large(arena, run, size, zero); + arena_run_init_large(arena, run, size + large_pad, zero); ret = arena_miscelm_to_rpages(miscelm); if (config_stats) { @@ -2088,7 +2174,8 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, { void *ret; - if (usize <= SMALL_MAXCLASS && alignment < PAGE) + if (usize <= 
SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE + && (usize & PAGE_MASK) == 0))) ret = arena_malloc(tsd, arena, usize, zero, tcache); else { if (likely(usize <= arena_maxclass)) { @@ -2292,7 +2379,8 @@ arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run = &miscelm->run; if (config_fill || config_stats) { - size_t usize = arena_mapbits_large_size_get(chunk, pageind); + size_t usize = arena_mapbits_large_size_get(chunk, pageind) - + large_pad; if (!junked) arena_dalloc_junk_large(ptr, usize); @@ -2341,7 +2429,8 @@ arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr, * allocations. */ malloc_mutex_lock(&arena->lock); - arena_run_trim_tail(arena, chunk, run, oldsize, size, true); + arena_run_trim_tail(arena, chunk, run, oldsize + large_pad, size + + large_pad, true); if (config_stats) { index_t oldindex = size2index(oldsize) - NBINS; index_t index = size2index(size) - NBINS; @@ -2370,7 +2459,8 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t followsize; size_t usize_min = s2u(size); - assert(oldsize == arena_mapbits_large_size_get(chunk, pageind)); + assert(oldsize == arena_mapbits_large_size_get(chunk, pageind) - + large_pad); /* Try to extend the run. */ assert(usize_min > oldsize); @@ -2391,7 +2481,7 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, while (oldsize + followsize < usize) usize = index2size(size2index(usize)-1); assert(usize >= usize_min); - splitsize = usize - oldsize; + splitsize = usize - oldsize + large_pad; run = &arena_miscelm_get(chunk, pageind+npages)->run; arena_run_split_large(arena, run, splitsize, zero); @@ -2755,6 +2845,18 @@ arena_new(unsigned ind) if (config_prof) arena->prof_accumbytes = 0; + if (config_cache_oblivious) { + /* + * A nondeterministic seed based on the address of arena reduces + * the likelihood of lockstep non-uniform cache index + * utilization among identical concurrent processes, but at the + * cost of test repeatability. For debug builds, instead use a + * deterministic seed. + */ + arena->offset_state = config_debug ? 
ind : + (uint64_t)(uintptr_t)arena; + } + arena->dss_prec = chunk_dss_prec_get(); arena->spare = NULL; @@ -2890,6 +2992,9 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info) bin_info->reg0_offset = actual_run_size - (actual_nregs * bin_info->reg_interval) - pad_size + bin_info->redzone_size; + if (actual_run_size > small_maxrun) + small_maxrun = actual_run_size; + assert(bin_info->reg0_offset - bin_info->redzone_size + (bin_info->nregs * bin_info->reg_interval) + pad_size == bin_info->run_size); } @@ -2899,7 +3004,7 @@ bin_info_init(void) { arena_bin_info_t *bin_info; -#define BIN_INFO_INIT_bin_yes(index, size) \ +#define BIN_INFO_INIT_bin_yes(index, size) \ bin_info = &arena_bin_info[index]; \ bin_info->reg_size = size; \ bin_info_run_size_calc(bin_info); \ @@ -2913,7 +3018,33 @@ bin_info_init(void) #undef SC } -void +static bool +small_run_size_init(void) +{ + + assert(small_maxrun != 0); + + small_run_tab = (bool *)base_alloc(sizeof(bool) * (small_maxrun >> + LG_PAGE)); + if (small_run_tab == NULL) + return (true); + +#define TAB_INIT_bin_yes(index, size) { \ + arena_bin_info_t *bin_info = &arena_bin_info[index]; \ + small_run_tab[bin_info->run_size >> LG_PAGE] = true; \ + } +#define TAB_INIT_bin_no(index, size) +#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ + TAB_INIT_bin_##bin(index, (ZU(1)< b_index) - (a_index < b_index); + ret = (a_qsize > b_qsize) - (a_qsize < b_qsize); if (ret == 0) { uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a); uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b); diff --git a/src/jemalloc.c b/src/jemalloc.c index a2d1c5c..7f26652 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1182,7 +1182,8 @@ malloc_init_hard_a0_locked(void) return (true); if (config_prof) prof_boot1(); - arena_boot(); + if (arena_boot()) + return (true); if (config_tcache && tcache_boot()) return (true); if (malloc_mutex_init(&arenas_lock)) -- cgit v0.12 From c451831264885b84f54a05e0894ad88bb30bd5df Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 7 May 2015 22:35:40 -0700 Subject: Fix type punning in calls to atomic operation functions. --- include/jemalloc/internal/arena.h | 9 ++++++--- include/jemalloc/internal/rtree.h | 14 +++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index fba1b81..5860195 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -145,7 +145,10 @@ struct arena_chunk_map_misc_s { arena_runs_dirty_link_t rd; /* Profile counters, used for large object runs. */ - prof_tctx_t *prof_tctx; + union { + void *prof_tctx_pun; + prof_tctx_t *prof_tctx; + }; /* Small region run metadata. 
*/ arena_run_t run; @@ -1025,7 +1028,7 @@ arena_prof_tctx_get(const void *ptr) else { arena_chunk_map_misc_t *elm = arena_miscelm_get(chunk, pageind); - ret = atomic_read_p((void **)&elm->prof_tctx); + ret = atomic_read_p(&elm->prof_tctx_pun); } } else ret = huge_prof_tctx_get(ptr); @@ -1049,7 +1052,7 @@ arena_prof_tctx_set(const void *ptr, prof_tctx_t *tctx) if (unlikely(arena_mapbits_large_get(chunk, pageind) != 0)) { arena_chunk_map_misc_t *elm = arena_miscelm_get(chunk, pageind); - atomic_write_p((void **)&elm->prof_tctx, tctx); + atomic_write_p(&elm->prof_tctx_pun, tctx); } } else huge_prof_tctx_set(ptr, tctx); diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index c1fb90c..7a8ebfd 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -36,6 +36,7 @@ typedef void (rtree_node_dalloc_t)(rtree_node_elm_t *); struct rtree_node_elm_s { union { + void *pun; rtree_node_elm_t *child; extent_node_t *val; }; @@ -64,7 +65,10 @@ struct rtree_level_s { * lower 47 bits of virtual address space in userland, thus leaving * subtrees[0] unused and avoiding a level of tree traversal. */ - rtree_node_elm_t *subtree; + union { + void *subtree_pun; + rtree_node_elm_t *subtree; + }; /* Number of key bits distinguished by this level. */ unsigned bits; /* @@ -159,7 +163,7 @@ rtree_child_tryread(rtree_node_elm_t *elm) /* Double-checked read (first read may be stale. */ child = elm->child; if (!rtree_node_valid(child)) - child = atomic_read_p((void **)&elm->child); + child = atomic_read_p(&elm->pun); return (child); } @@ -178,14 +182,14 @@ JEMALLOC_INLINE extent_node_t * rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm) { - return (atomic_read_p((void **)&elm->val)); + return (atomic_read_p(&elm->pun)); } JEMALLOC_INLINE void rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, const extent_node_t *val) { - atomic_write_p((void **)&elm->val, val); + atomic_write_p(&elm->pun, val); } JEMALLOC_INLINE rtree_node_elm_t * @@ -196,7 +200,7 @@ rtree_subtree_tryread(rtree_t *rtree, unsigned level) /* Double-checked read (first read may be stale. */ subtree = rtree->levels[level].subtree; if (!rtree_node_valid(subtree)) - subtree = atomic_read_p((void **)&rtree->levels[level].subtree); + subtree = atomic_read_p(&rtree->levels[level].subtree_pun); return (subtree); } -- cgit v0.12 From fd5f9e43c35b39740e218fececbb70d929546bb0 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 15 May 2015 17:02:30 -0700 Subject: Avoid atomic operations for dependent rtree reads. 
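The type-punning fix in the preceding hunks boils down to giving atomic_read_p()/atomic_write_p() a genuine (void *) object to operate on, instead of a typed pointer cast through (void **). A reduced sketch of the pattern (the atomic helpers are replaced here by plain, non-atomic stand-ins of roughly the same shape, purely for illustration):

#include <stddef.h>

typedef struct prof_tctx_s prof_tctx_t;

/* Non-atomic stand-ins for jemalloc's atomic_read_p()/atomic_write_p(). */
static void *atomic_read_p_stub(void **p) { return (*p); }
static void atomic_write_p_stub(void **p, const void *x) { *p = (void *)x; }

typedef struct {
    union {
        void        *prof_tctx_pun;  /* what the atomic helpers touch */
        prof_tctx_t *prof_tctx;      /* what the rest of the code uses */
    };
} miscelm_t;

static prof_tctx_t *
tctx_get(miscelm_t *elm)
{
    /* Before the fix: atomic_read_p((void **)&elm->prof_tctx), which reads a
     * prof_tctx_t * object through a void ** lvalue. After: read the union's
     * void * member, which shares storage with the typed pointer. */
    return ((prof_tctx_t *)atomic_read_p_stub(&elm->prof_tctx_pun));
}

static void
tctx_set(miscelm_t *elm, prof_tctx_t *tctx)
{
    atomic_write_p_stub(&elm->prof_tctx_pun, tctx);
}

The following commit builds on the same unions: when an rtree lookup is dependent on an existing, valid pointer it reads the typed member directly, and it keeps atomic_read_p(&elm->pun) only for independent lookups such as ivsalloc().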
--- include/jemalloc/internal/chunk.h | 6 ++--- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- include/jemalloc/internal/rtree.h | 31 ++++++++++++++++++------ src/huge.c | 2 +- test/unit/rtree.c | 28 ++++++++++----------- 5 files changed, 43 insertions(+), 26 deletions(-) diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 8093814..c253cdc 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -70,15 +70,15 @@ void chunk_postfork_child(void); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -extent_node_t *chunk_lookup(const void *chunk); +extent_node_t *chunk_lookup(const void *chunk, bool dependent); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_CHUNK_C_)) JEMALLOC_INLINE extent_node_t * -chunk_lookup(const void *chunk) +chunk_lookup(const void *ptr, bool dependent) { - return (rtree_get(&chunks_rtree, (uintptr_t)chunk)); + return (rtree_get(&chunks_rtree, (uintptr_t)ptr, dependent)); } #endif diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 910ebf7..0268245 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -948,7 +948,7 @@ ivsalloc(const void *ptr, bool demote) extent_node_t *node; /* Return 0 if ptr is not within a chunk managed by jemalloc. */ - node = chunk_lookup(CHUNK_ADDR2BASE(ptr)); + node = chunk_lookup(ptr, false); if (node == NULL) return (0); /* Only arena chunks should be looked up via interior pointers. */ diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index 7a8ebfd..28ae9d1 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -114,13 +114,14 @@ bool rtree_node_valid(rtree_node_elm_t *node); rtree_node_elm_t *rtree_child_tryread(rtree_node_elm_t *elm); rtree_node_elm_t *rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level); -extent_node_t *rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm); +extent_node_t *rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm, + bool dependent); void rtree_val_write(rtree_t *rtree, rtree_node_elm_t *elm, const extent_node_t *val); rtree_node_elm_t *rtree_subtree_tryread(rtree_t *rtree, unsigned level); rtree_node_elm_t *rtree_subtree_read(rtree_t *rtree, unsigned level); -extent_node_t *rtree_get(rtree_t *rtree, uintptr_t key); +extent_node_t *rtree_get(rtree_t *rtree, uintptr_t key, bool dependent); bool rtree_set(rtree_t *rtree, uintptr_t key, const extent_node_t *val); #endif @@ -179,10 +180,25 @@ rtree_child_read(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level) } JEMALLOC_INLINE extent_node_t * -rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm) +rtree_val_read(rtree_t *rtree, rtree_node_elm_t *elm, bool dependent) { - return (atomic_read_p(&elm->pun)); + if (dependent) { + /* + * Reading a val on behalf of a pointer to a valid allocation is + * guaranteed to be a clean read even without synchronization, + * because the rtree update became visible in memory before the + * pointer came into existence. + */ + return (elm->val); + } else { + /* + * An arbitrary read, e.g. on behalf of ivsalloc(), may not be + * dependent on a previous rtree write, which means a stale read + * could result if synchronization were omitted here. 
+ */ + return (atomic_read_p(&elm->pun)); + } } JEMALLOC_INLINE void @@ -216,7 +232,7 @@ rtree_subtree_read(rtree_t *rtree, unsigned level) } JEMALLOC_INLINE extent_node_t * -rtree_get(rtree_t *rtree, uintptr_t key) +rtree_get(rtree_t *rtree, uintptr_t key, bool dependent) { uintptr_t subkey; unsigned i, start_level; @@ -226,7 +242,7 @@ rtree_get(rtree_t *rtree, uintptr_t key) for (i = start_level, node = rtree_subtree_tryread(rtree, start_level); /**/; i++, node = child) { - if (unlikely(!rtree_node_valid(node))) + if (!dependent && unlikely(!rtree_node_valid(node))) return (NULL); subkey = rtree_subkey(rtree, key, i); if (i == rtree->height - 1) { @@ -234,7 +250,8 @@ rtree_get(rtree_t *rtree, uintptr_t key) * node is a leaf, so it contains values rather than * child pointers. */ - return (rtree_val_read(rtree, &node[subkey])); + return (rtree_val_read(rtree, &node[subkey], + dependent)); } assert(i < rtree->height - 1); child = rtree_child_tryread(&node[subkey]); diff --git a/src/huge.c b/src/huge.c index 32af205..6e6824d 100644 --- a/src/huge.c +++ b/src/huge.c @@ -8,7 +8,7 @@ huge_node_get(const void *ptr) { extent_node_t *node; - node = chunk_lookup(ptr); + node = chunk_lookup(ptr, true); assert(!extent_node_achunk_get(node)); return (node); diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 496e03a..3f95554 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -22,7 +22,7 @@ TEST_BEGIN(test_rtree_get_empty) rtree_t rtree; assert_false(rtree_new(&rtree, i, node_alloc, node_dalloc), "Unexpected rtree_new() failure"); - assert_ptr_null(rtree_get(&rtree, 0), + assert_ptr_null(rtree_get(&rtree, 0, false), "rtree_get() should return NULL for empty tree"); rtree_delete(&rtree); } @@ -40,11 +40,11 @@ TEST_BEGIN(test_rtree_extrema) "Unexpected rtree_new() failure"); rtree_set(&rtree, 0, &node_a); - assert_ptr_eq(rtree_get(&rtree, 0), &node_a, + assert_ptr_eq(rtree_get(&rtree, 0, true), &node_a, "rtree_get() should return previously set value"); rtree_set(&rtree, ~((uintptr_t)0), &node_b); - assert_ptr_eq(rtree_get(&rtree, ~((uintptr_t)0)), &node_b, + assert_ptr_eq(rtree_get(&rtree, ~((uintptr_t)0), true), &node_b, "rtree_get() should return previously set value"); rtree_delete(&rtree); @@ -68,15 +68,15 @@ TEST_BEGIN(test_rtree_bits) for (j = 0; j < sizeof(keys)/sizeof(uintptr_t); j++) { rtree_set(&rtree, keys[j], &node); for (k = 0; k < sizeof(keys)/sizeof(uintptr_t); k++) { - assert_ptr_eq(rtree_get(&rtree, keys[k]), &node, - "rtree_get() should return previously set " - "value and ignore insignificant key bits; " - "i=%u, j=%u, k=%u, set key=%#"PRIxPTR", " - "get key=%#"PRIxPTR, i, j, k, keys[j], - keys[k]); + assert_ptr_eq(rtree_get(&rtree, keys[k], true), + &node, "rtree_get() should return " + "previously set value and ignore " + "insignificant key bits; i=%u, j=%u, k=%u, " + "set key=%#"PRIxPTR", get key=%#"PRIxPTR, i, + j, k, keys[j], keys[k]); } assert_ptr_null(rtree_get(&rtree, - (((uintptr_t)1) << (sizeof(uintptr_t)*8-i))), + (((uintptr_t)1) << (sizeof(uintptr_t)*8-i)), false), "Only leftmost rtree leaf should be set; " "i=%u, j=%u", i, j); rtree_set(&rtree, keys[j], NULL); @@ -107,21 +107,21 @@ TEST_BEGIN(test_rtree_random) for (j = 0; j < NSET; j++) { keys[j] = (uintptr_t)gen_rand64(sfmt); rtree_set(&rtree, keys[j], &node); - assert_ptr_eq(rtree_get(&rtree, keys[j]), &node, + assert_ptr_eq(rtree_get(&rtree, keys[j], true), &node, "rtree_get() should return previously set value"); } for (j = 0; j < NSET; j++) { - assert_ptr_eq(rtree_get(&rtree, keys[j]), &node, + 
assert_ptr_eq(rtree_get(&rtree, keys[j], true), &node, "rtree_get() should return previously set value"); } for (j = 0; j < NSET; j++) { rtree_set(&rtree, keys[j], NULL); - assert_ptr_null(rtree_get(&rtree, keys[j]), + assert_ptr_null(rtree_get(&rtree, keys[j], true), "rtree_get() should return previously set value"); } for (j = 0; j < NSET; j++) { - assert_ptr_null(rtree_get(&rtree, keys[j]), + assert_ptr_null(rtree_get(&rtree, keys[j], true), "rtree_get() should return previously set value"); } -- cgit v0.12 From 5aa50a2834fb09c5338f0e7b9db49cc0edd1a38a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 19 May 2015 17:40:37 -0700 Subject: Fix nhbins calculation. This regression was introduced by 155bfa7da18cab0d21d87aa2dce4554166836f5d (Normalize size classes.). --- src/tcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tcache.c b/src/tcache.c index 8d0a6fa..83e7e36 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -506,7 +506,7 @@ tcache_boot(void) else tcache_maxclass = (1U << opt_lg_tcache_max); - nhbins = NBINS + (tcache_maxclass >> LG_PAGE); + nhbins = size2index(tcache_maxclass) + 1; /* Initialize tcache_bin_info. */ tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins * -- cgit v0.12 From 5154175cf1e6e7b1a2ed0295c232e60384944b3f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 19 May 2015 17:42:31 -0700 Subject: Fix performance regression in arena_palloc(). Pass large allocation requests to arena_malloc() when possible. This regression was introduced by 155bfa7da18cab0d21d87aa2dce4554166836f5d (Normalize size classes.). --- src/arena.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index a053adf..a3f36b3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2175,9 +2175,20 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, void *ret; if (usize <= SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE - && (usize & PAGE_MASK) == 0))) + && (usize & PAGE_MASK) == 0))) { + /* Small; alignment doesn't require special run placement. */ ret = arena_malloc(tsd, arena, usize, zero, tcache); - else { + } else if (usize <= arena_maxclass && alignment <= PAGE) { + /* + * Large; alignment doesn't require special run placement. + * However, the cached pointer may be at a random offset from + * the base of the run, so do some bit manipulation to retrieve + * the base. + */ + ret = arena_malloc(tsd, arena, usize, zero, tcache); + if (config_cache_oblivious) + ret = (void *)((uintptr_t)ret & ~PAGE_MASK); + } else { if (likely(usize <= arena_maxclass)) { ret = arena_palloc_large(tsd, arena, usize, alignment, zero); -- cgit v0.12 From 6591ff09d80e11f36603a75b32dc6a9b81fb3d47 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 19 May 2015 17:44:45 -0700 Subject: Fix arena_dalloc() performance regression. Take into account large_pad when computing whether to pass the deallocation request to tcache_dalloc_large(), so that the largest cacheable size makes it back to tcache. This regression was introduced by 8a03cf039cd06f9fa6972711195055d865673966 (Implement cache index randomization for large allocations.). 
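A worked example of the corrected predicate, assuming 4 KiB pages, cache index
randomization enabled (so large_pad == PAGE), and the default 32 KiB
tcache_maxclass:

    /*
     * usize = 32 KiB (largest tcache-able class)
     * mapbits size = usize + large_pad = 36 KiB
     *
     * old check: 36 KiB <= 32 KiB          -> false, tcache bypassed
     * new check: 36 KiB - 4 KiB <= 32 KiB  -> true, freed via
     *                                         tcache_dalloc_large()
     */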
--- include/jemalloc/internal/arena.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 5860195..6f3c77c 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -1181,7 +1181,8 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache) assert(config_cache_oblivious || ((uintptr_t)ptr & PAGE_MASK) == 0); - if (likely(tcache != NULL) && size <= tcache_maxclass) { + if (likely(tcache != NULL) && size - large_pad <= + tcache_maxclass) { tcache_dalloc_large(tsd, tcache, ptr, size - large_pad); } else { -- cgit v0.12 From 836bbe9951a903b2d76af53dfb3ad53ad186f8b9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 19 May 2015 17:47:16 -0700 Subject: Impose a minimum tcache count for small size classes. Now that small allocation runs have fewer regions due to run metadata residing in chunk headers, an explicit minimum tcache count is needed to make sure that tcache adequately amortizes synchronization overhead. --- include/jemalloc/internal/tcache.h | 5 +++++ src/tcache.c | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index d2443b1..493f457 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -17,6 +17,11 @@ typedef struct tcaches_s tcaches_t; #define TCACHE_STATE_MAX TCACHE_STATE_PURGATORY /* + * Absolute minimum number of cache slots for each small bin. + */ +#define TCACHE_NSLOTS_SMALL_MIN 20 + +/* * Absolute maximum number of cache slots for each small bin in the thread * cache. This is an additional constraint beyond that imposed as: twice the * number of regions per run for this size class. diff --git a/src/tcache.c b/src/tcache.c index 83e7e36..3814365 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -515,7 +515,11 @@ tcache_boot(void) return (true); stack_nelms = 0; for (i = 0; i < NBINS; i++) { - if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { + if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) { + tcache_bin_info[i].ncached_max = + TCACHE_NSLOTS_SMALL_MIN; + } else if ((arena_bin_info[i].nregs << 1) <= + TCACHE_NSLOTS_SMALL_MAX) { tcache_bin_info[i].ncached_max = (arena_bin_info[i].nregs << 1); } else { -- cgit v0.12 From c073f8167a96a9bec98c61df4d2461811c3c10a4 Mon Sep 17 00:00:00 2001 From: Chi-hung Hsieh Date: Wed, 27 May 2015 20:31:51 -0700 Subject: Fix type errors in C11 versions of atomic_*() functions. 
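The underlying issue, sketched outside jemalloc's macro scaffolding: C11's
generic atomic operations are only defined for _Atomic-qualified objects, so
the plain uint64_t/uint32_t pointers taken by jemalloc's wrappers must be
reinterpreted first. A standalone approximation of the patched
atomic_write_uint64():

    #include <stdatomic.h>
    #include <stdint.h>

    static inline void
    atomic_write_u64_sketch(uint64_t *p, uint64_t x)
    {
    	/* atomic_store() requires an _Atomic object; cast the plain pointer
    	 * to the matching least-width atomic type, as the patch does. */
    	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;

    	atomic_store(a, x);
    }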
--- include/jemalloc/internal/atomic.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index 522dd2a..a9aad35 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -143,15 +143,15 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) JEMALLOC_INLINE bool atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s) { - - return (!atomic_compare_exchange_strong(p, &c, s)); + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + return (!atomic_compare_exchange_strong(a, &c, s)); } JEMALLOC_INLINE void atomic_write_uint64(uint64_t *p, uint64_t x) { - - atomic_store(p, x); + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + atomic_store(a, x); } # elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint64_t @@ -367,15 +367,15 @@ atomic_sub_uint32(uint32_t *p, uint32_t x) JEMALLOC_INLINE bool atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s) { - - return (!atomic_compare_exchange_strong(p, &c, s)); + volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; + return (!atomic_compare_exchange_strong(a, &c, s)); } JEMALLOC_INLINE void atomic_write_uint32(uint32_t *p, uint32_t x) { - - atomic_store(p, x); + volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p; + atomic_store(a, x); } #elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint32_t -- cgit v0.12 From 09983d2f540fec53a59ac58cba5d86181bf7838a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 29 May 2015 17:49:18 -0700 Subject: Bypass tcache when draining quarantined allocations. This avoids the potential surprise of deallocating an object with one tcache specified, and having the object cached in a different tcache once it drains from the quarantine. --- src/quarantine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/quarantine.c b/src/quarantine.c index adc7305..6c43dfc 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -98,7 +98,7 @@ quarantine_drain_one(tsd_t *tsd, quarantine_t *quarantine) { quarantine_obj_t *obj = &quarantine->objs[quarantine->first]; assert(obj->usize == isalloc(obj->ptr, config_prof)); - idalloc(tsd, obj->ptr); + idalloctm(tsd, obj->ptr, NULL, false); quarantine->curbytes -= obj->usize; quarantine->curobjs--; quarantine->first = (quarantine->first + 1) & ((ZU(1) << @@ -123,7 +123,7 @@ quarantine(tsd_t *tsd, void *ptr) assert(opt_quarantine); if ((quarantine = tsd_quarantine_get(tsd)) == NULL) { - idalloc(tsd, ptr); + idalloctm(tsd, ptr, NULL, false); return; } /* @@ -162,7 +162,7 @@ quarantine(tsd_t *tsd, void *ptr) } } else { assert(quarantine->curbytes == 0); - idalloc(tsd, ptr); + idalloctm(tsd, ptr, NULL, false); } } -- cgit v0.12 From 56048baeb4ec3ff82a6acd24a4279db2bebbc9ae Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 28 May 2015 15:03:58 -0700 Subject: Clarify relationship between stats.resident and stats.mapped. --- doc/jemalloc.xml.in | 10 ++++++---- src/base.c | 2 ++ test/unit/stats.c | 10 +++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 123fb3a..bb15ae4 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1964,10 +1964,12 @@ malloc_conf = "xmalloc:true";]]> [] Total number of bytes in active chunks mapped by the - allocator. This is a multiple of the chunk size, and is at least as - large as stats.resident. This - does not include inactive chunks. + allocator. 
This is a multiple of the chunk size, and is larger than + stats.active. + This does not include inactive chunks, even those that contain unused + dirty pages, which means that there is no strict ordering between this + and stats.resident. diff --git a/src/base.c b/src/base.c index 1a9b829..3ab46ec 100644 --- a/src/base.c +++ b/src/base.c @@ -132,6 +132,8 @@ base_stats_get(size_t *allocated, size_t *resident, size_t *mapped) { malloc_mutex_lock(&base_mtx); + assert(base_allocated <= base_resident); + assert(base_resident <= base_mapped); *allocated = base_allocated; *resident = base_resident; *mapped = base_mapped; diff --git a/test/unit/stats.c b/test/unit/stats.c index 1099967..81ef0b7 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -3,7 +3,7 @@ TEST_BEGIN(test_stats_summary) { size_t *cactive; - size_t sz, allocated, active, mapped; + size_t sz, allocated, active, resident, mapped; int expected = config_stats ? 0 : ENOENT; sz = sizeof(cactive); @@ -15,6 +15,8 @@ TEST_BEGIN(test_stats_summary) expected, "Unexpected mallctl() result"); assert_d_eq(mallctl("stats.active", &active, &sz, NULL, 0), expected, "Unexpected mallctl() result"); + assert_d_eq(mallctl("stats.resident", &resident, &sz, NULL, 0), + expected, "Unexpected mallctl() result"); assert_d_eq(mallctl("stats.mapped", &mapped, &sz, NULL, 0), expected, "Unexpected mallctl() result"); @@ -23,8 +25,10 @@ TEST_BEGIN(test_stats_summary) "active should be no larger than cactive"); assert_zu_le(allocated, active, "allocated should be no larger than active"); - assert_zu_le(active, mapped, - "active should be no larger than mapped"); + assert_zu_lt(active, resident, + "active should be less than resident"); + assert_zu_lt(active, mapped, + "active should be less than mapped"); } } TEST_END -- cgit v0.12 From 713b844bfffe7ce7454259da6ae24638794b1967 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 15 Jun 2015 12:01:05 -0700 Subject: Update a comment. --- include/jemalloc/internal/size_classes.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 38020dc..1c2d681 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -198,7 +198,8 @@ cat < Date: Mon, 22 Jun 2015 14:38:06 -0700 Subject: Fix two valgrind integration regressions. The regressions were never merged into the master branch. 
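For context, the two annotations involved are wrappers around memcheck client
requests with different semantics (per the Valgrind manual), which is why
base_alloc() and chunk_alloc_cache() need different ones:

    /*
     * JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(p, n)   - mark [p, p+n) addressable
     *   and initialized; base_alloc() returns metadata that jemalloc reads
     *   directly, so such reads must not be flagged.
     * JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(p, n) - mark [p, p+n) addressable
     *   but uninitialized; a chunk recycled from the cache still holds stale
     *   contents, so reads that precede a write should be flagged.
     */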
--- src/base.c | 2 +- src/chunk.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/base.c b/src/base.c index 3ab46ec..df3ddb6 100644 --- a/src/base.c +++ b/src/base.c @@ -121,7 +121,7 @@ base_alloc(size_t size) base_resident += PAGE_CEILING((uintptr_t)ret + csize) - PAGE_CEILING((uintptr_t)ret); } - JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, csize); + JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ret, csize); label_return: malloc_mutex_unlock(&base_mtx); return (ret); diff --git a/src/chunk.c b/src/chunk.c index 7063410..1279965 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -277,15 +277,21 @@ void * chunk_alloc_cache(arena_t *arena, void *new_addr, size_t size, size_t alignment, bool *zero, bool dalloc_node) { + void *ret; assert(size != 0); assert((size & chunksize_mask) == 0); assert(alignment != 0); assert((alignment & chunksize_mask) == 0); - return (chunk_recycle(arena, &arena->chunks_szad_cache, + ret = chunk_recycle(arena, &arena->chunks_szad_cache, &arena->chunks_ad_cache, true, new_addr, size, alignment, zero, - dalloc_node)); + dalloc_node); + if (ret == NULL) + return (NULL); + if (config_valgrind) + JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); + return (ret); } static arena_t * -- cgit v0.12 From dc0610a714c1ff207bf87ba907506ae0b111e092 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 22 Jun 2015 18:48:58 -0700 Subject: Add alignment assertions to public aligned allocation functions. --- src/jemalloc.c | 61 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 7f26652..094a247 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1472,38 +1472,38 @@ imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) if (unlikely(malloc_init())) { result = NULL; goto label_oom; - } else { - tsd = tsd_fetch(); - if (size == 0) - size = 1; - - /* Make sure that alignment is a large enough power of 2. */ - if (unlikely(((alignment - 1) & alignment) != 0 - || (alignment < min_alignment))) { - if (config_xmalloc && unlikely(opt_xmalloc)) { - malloc_write(": Error allocating " - "aligned memory: invalid alignment\n"); - abort(); - } - result = NULL; - ret = EINVAL; - goto label_return; - } + } + tsd = tsd_fetch(); + if (size == 0) + size = 1; - usize = sa2u(size, alignment); - if (unlikely(usize == 0)) { - result = NULL; - goto label_oom; + /* Make sure that alignment is a large enough power of 2. 
*/ + if (unlikely(((alignment - 1) & alignment) != 0 + || (alignment < min_alignment))) { + if (config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(": Error allocating " + "aligned memory: invalid alignment\n"); + abort(); } + result = NULL; + ret = EINVAL; + goto label_return; + } - if (config_prof && opt_prof) - result = imemalign_prof(tsd, alignment, usize); - else - result = ipalloc(tsd, usize, alignment, false); - if (unlikely(result == NULL)) - goto label_oom; + usize = sa2u(size, alignment); + if (unlikely(usize == 0)) { + result = NULL; + goto label_oom; } + if (config_prof && opt_prof) + result = imemalign_prof(tsd, alignment, usize); + else + result = ipalloc(tsd, usize, alignment, false); + if (unlikely(result == NULL)) + goto label_oom; + assert(((uintptr_t)result & (alignment - 1)) == ZU(0)); + *memptr = result; ret = 0; label_return: @@ -1986,12 +1986,14 @@ imallocx_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) } prof_malloc(p, *usize, tctx); + assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); return (p); } JEMALLOC_ALWAYS_INLINE_C void * imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) { + void *p; size_t alignment; bool zero; tcache_t *tcache; @@ -2006,7 +2008,9 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) if (unlikely(imallocx_flags_decode_hard(tsd, size, flags, usize, &alignment, &zero, &tcache, &arena))) return (NULL); - return (imallocx_flags(tsd, *usize, alignment, zero, tcache, arena)); + p = imallocx_flags(tsd, *usize, alignment, zero, tcache, arena); + assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); + return (p); } void * @@ -2160,6 +2164,7 @@ je_rallocx(void *ptr, size_t size, int flags) if (config_stats || (config_valgrind && unlikely(in_valgrind))) usize = isalloc(p, config_prof); } + assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); if (config_stats) { *tsd_thread_allocatedp_get(tsd) += usize; -- cgit v0.12 From 0a9f9a4d511e0c3343ff26e04d9592fefd96c2bc Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 22 Jun 2015 18:50:32 -0700 Subject: Convert arena_maybe_purge() recursion to iteration. This resolves #235. --- include/jemalloc/internal/arena.h | 3 +++ src/arena.c | 34 ++++++++++++++++++++++++---------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 6f3c77c..58d87cb 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -316,6 +316,9 @@ struct arena_s { /* Minimum ratio (log base 2) of nactive:ndirty. */ ssize_t lg_dirty_mult; + /* True if a thread is currently executing arena_purge(). */ + bool purging; + /* Number of pages in active runs and huge regions. */ size_t nactive; diff --git a/src/arena.c b/src/arena.c index a3f36b3..fa37e30 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1143,21 +1143,29 @@ arena_lg_dirty_mult_set(arena_t *arena, ssize_t lg_dirty_mult) void arena_maybe_purge(arena_t *arena) { - size_t threshold; /* Don't purge if the option is disabled. */ if (arena->lg_dirty_mult < 0) return; - threshold = (arena->nactive >> arena->lg_dirty_mult); - threshold = threshold < chunk_npages ? chunk_npages : threshold; + /* Don't recursively purge. */ + if (arena->purging) + return; /* - * Don't purge unless the number of purgeable pages exceeds the - * threshold. + * Iterate, since preventing recursive purging could otherwise leave too + * many dirty pages. 
*/ - if (arena->ndirty <= threshold) - return; - - arena_purge(arena, false); + while (true) { + size_t threshold = (arena->nactive >> arena->lg_dirty_mult); + if (threshold < chunk_npages) + threshold = chunk_npages; + /* + * Don't purge unless the number of purgeable pages exceeds the + * threshold. + */ + if (arena->ndirty <= threshold) + return; + arena_purge(arena, false); + } } static size_t @@ -1411,6 +1419,8 @@ arena_purge(arena_t *arena, bool all) arena_runs_dirty_link_t purge_runs_sentinel; extent_node_t purge_chunks_sentinel; + arena->purging = true; + /* * Calls to arena_dirty_count() are disabled even for debug builds * because overhead grows nonlinearly as memory usage increases. @@ -1436,6 +1446,8 @@ arena_purge(arena_t *arena, bool all) assert(npurged == npurgeable); arena_unstash_purged(arena, &purge_runs_sentinel, &purge_chunks_sentinel); + + arena->purging = false; } void @@ -2053,7 +2065,8 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) * for 4 KiB pages and 64-byte cachelines. */ prng64(r, LG_PAGE - LG_CACHELINE, arena->offset_state, - UINT64_C(6364136223846793009), UINT64_C(1442695040888963409)); + UINT64_C(6364136223846793009), + UINT64_C(1442695040888963409)); random_offset = ((uintptr_t)r) << LG_CACHELINE; } else random_offset = 0; @@ -2873,6 +2886,7 @@ arena_new(unsigned ind) arena->spare = NULL; arena->lg_dirty_mult = arena_lg_dirty_mult_default_get(); + arena->purging = false; arena->nactive = 0; arena->ndirty = 0; -- cgit v0.12 From 241abc601b947c5e0e56791bd73a924ce872b4a1 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 23 Jun 2015 18:47:07 -0700 Subject: Fix size class overflow handling when profiling is enabled. Fix size class overflow handling for malloc(), posix_memalign(), memalign(), calloc(), and realloc() when profiling is enabled. Remove an assertion that erroneously caused arena_sdalloc() to fail when profiling was enabled. This resolves #232. --- ChangeLog | 2 + Makefile.in | 9 ++++- configure.ac | 1 + include/jemalloc/internal/arena.h | 1 - include/jemalloc/internal/jemalloc_internal.h.in | 11 ++---- src/jemalloc.c | 16 ++++++-- test/include/test/jemalloc_test.h.in | 4 ++ test/integration/mallocx.c | 11 +++--- test/integration/overflow.c | 49 ++++++++++++++++++++++++ 9 files changed, 86 insertions(+), 18 deletions(-) create mode 100644 test/integration/overflow.c diff --git a/ChangeLog b/ChangeLog index b6fa366..156d3c0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -151,6 +151,8 @@ found in the git revision history: memory decreases. This regression was first released in 3.5.0. - Fix OOM handling in memalign() and valloc(). A variant of this bug existed in all releases since 2.0.0, which introduced these functions. + - Fix size class overflow handling for malloc(), posix_memalign(), memalign(), + calloc(), and realloc() when profiling is enabled. - Fix the "arena..dss" mallctl to return an error if "primary" or "secondary" precedence is specified, but sbrk(2) is not supported. - Fix fallback lg_floor() implementations to handle extremely large inputs. 
diff --git a/Makefile.in b/Makefile.in index 7f5ac76..0dcdb5f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -48,8 +48,10 @@ cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@) cfgoutputs_out := @cfgoutputs_out@ enable_autogen := @enable_autogen@ enable_code_coverage := @enable_code_coverage@ +enable_prof := @enable_prof@ enable_valgrind := @enable_valgrind@ enable_zone_allocator := @enable_zone_allocator@ +MALLOC_CONF := @JEMALLOC_CPREFIX@MALLOC_CONF DSO_LDFLAGS = @DSO_LDFLAGS@ SOREV = @SOREV@ PIC_CFLAGS = @PIC_CFLAGS@ @@ -146,6 +148,7 @@ TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/sdallocx.c \ $(srcroot)test/integration/mallocx.c \ $(srcroot)test/integration/MALLOCX_ARENA.c \ + $(srcroot)test/integration/overflow.c \ $(srcroot)test/integration/posix_memalign.c \ $(srcroot)test/integration/rallocx.c \ $(srcroot)test/integration/thread_arena.c \ @@ -344,11 +347,15 @@ check_dir: check_unit_dir check_integration_dir check_stress_dir check_unit: tests_unit check_unit_dir $(SHELL) $(objroot)test/test.sh $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%) +check_integration_prof: tests_integration check_integration_dir +ifeq ($(enable_prof), 1) + $(MALLOC_CONF)="prof:true" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) +endif check_integration: tests_integration check_integration_dir $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) check_stress: tests_stress check_stress_dir $(SHELL) $(objroot)test/test.sh $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%) -check: tests check_dir +check: tests check_dir check_integration_prof $(SHELL) $(objroot)test/test.sh $(TESTS:$(srcroot)%.c=$(objroot)%) ifeq ($(enable_code_coverage), 1) diff --git a/configure.ac b/configure.ac index bb6f3a3..61adc2a 100644 --- a/configure.ac +++ b/configure.ac @@ -513,6 +513,7 @@ if test "x$JEMALLOC_PREFIX" != "x" ; then AC_DEFINE_UNQUOTED([JEMALLOC_PREFIX], ["$JEMALLOC_PREFIX"]) AC_DEFINE_UNQUOTED([JEMALLOC_CPREFIX], ["$JEMALLOC_CPREFIX"]) fi +AC_SUBST([JEMALLOC_CPREFIX]) AC_ARG_WITH([export], [AS_HELP_STRING([--without-export], [disable exporting jemalloc public APIs])], diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 58d87cb..9990e45 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -1213,7 +1213,6 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache) * Make sure to use promoted size, not request * size. */ - assert(((uintptr_t)ptr & PAGE_MASK) == 0); size = arena_mapbits_large_size_get(chunk, pageind) - large_pad; } diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 0268245..ff9412a 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -525,7 +525,7 @@ size2index_compute(size_t size) size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; size_t lg_ceil = lg_floor(pow2_ceil(size)); return (lg_ceil < lg_tmin ? 
0 : lg_ceil - lg_tmin); - } else + } #endif { size_t x = lg_floor((size<<1)-1); @@ -565,8 +565,7 @@ size2index(size_t size) assert(size > 0); if (likely(size <= LOOKUP_MAXCLASS)) return (size2index_lookup(size)); - else - return (size2index_compute(size)); + return (size2index_compute(size)); } JEMALLOC_INLINE size_t @@ -576,7 +575,6 @@ index2size_compute(index_t index) #if (NTBINS > 0) if (index < NTBINS) return (ZU(1) << (LG_TINY_MAXCLASS - NTBINS + 1 + index)); - else #endif { size_t reduced_index = index - NTBINS; @@ -623,7 +621,7 @@ s2u_compute(size_t size) size_t lg_ceil = lg_floor(pow2_ceil(size)); return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : (ZU(1) << lg_ceil)); - } else + } #endif { size_t x = lg_floor((size<<1)-1); @@ -656,8 +654,7 @@ s2u(size_t size) assert(size > 0); if (likely(size <= LOOKUP_MAXCLASS)) return (s2u_lookup(size)); - else - return (s2u_compute(size)); + return (s2u_compute(size)); } /* diff --git a/src/jemalloc.c b/src/jemalloc.c index 094a247..01cb394 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1382,6 +1382,8 @@ imalloc_body(size_t size, tsd_t **tsd, size_t *usize) if (config_prof && opt_prof) { *usize = s2u(size); + if (unlikely(*usize == 0)) + return (NULL); return (imalloc_prof(*tsd, *usize)); } @@ -1428,7 +1430,7 @@ imemalign_prof_sample(tsd_t *tsd, size_t alignment, size_t usize, return (NULL); if (usize <= SMALL_MAXCLASS) { assert(sa2u(LARGE_MINCLASS, alignment) == LARGE_MINCLASS); - p = imalloc(tsd, LARGE_MINCLASS); + p = ipalloc(tsd, LARGE_MINCLASS, alignment, false); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); @@ -1623,6 +1625,10 @@ je_calloc(size_t num, size_t size) if (config_prof && opt_prof) { usize = s2u(num_size); + if (unlikely(usize == 0)) { + ret = NULL; + goto label_return; + } ret = icalloc_prof(tsd, usize); } else { if (config_stats || (config_valgrind && unlikely(in_valgrind))) @@ -1757,7 +1763,8 @@ je_realloc(void *ptr, size_t size) if (config_prof && opt_prof) { usize = s2u(size); - ret = irealloc_prof(tsd, ptr, old_usize, usize); + ret = unlikely(usize == 0) ? NULL : irealloc_prof(tsd, + ptr, old_usize, usize); } else { if (config_stats || (config_valgrind && unlikely(in_valgrind))) @@ -1903,7 +1910,7 @@ imallocx_flags_decode(tsd_t *tsd, size_t size, int flags, size_t *usize, if (likely(flags == 0)) { *usize = s2u(size); - assert(usize != 0); + assert(*usize != 0); *alignment = 0; *zero = false; *tcache = tcache_get(tsd, true); @@ -1946,7 +1953,8 @@ imallocx_prof_sample(tsd_t *tsd, size_t size, int flags, size_t usize, if (usize <= SMALL_MAXCLASS) { assert(((alignment == 0) ? 
s2u(LARGE_MINCLASS) : sa2u(LARGE_MINCLASS, alignment)) == LARGE_MINCLASS); - p = imalloct(tsd, LARGE_MINCLASS, tcache, arena); + p = imallocx_maybe_flags(tsd, LARGE_MINCLASS, flags, + LARGE_MINCLASS, alignment, zero, tcache, arena); if (p == NULL) return (NULL); arena_prof_promoted(p, usize); diff --git a/test/include/test/jemalloc_test.h.in b/test/include/test/jemalloc_test.h.in index 6018e58..c72d09f 100644 --- a/test/include/test/jemalloc_test.h.in +++ b/test/include/test/jemalloc_test.h.in @@ -1,3 +1,7 @@ +#include +#ifndef SIZE_T_MAX +# define SIZE_T_MAX SIZE_MAX +#endif #include #include #include diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c index 23129c2..4b0e33f 100644 --- a/test/integration/mallocx.c +++ b/test/integration/mallocx.c @@ -1,12 +1,8 @@ #include "test/jemalloc_test.h" -#define CHUNK 0x400000 -#define MAXALIGN (((size_t)1) << 25) -#define MAXSZ (((size_t)1) << 26) -#define NITER 4 - TEST_BEGIN(test_basic) { +#define MAXSZ (((size_t)1) << 26) size_t sz; for (sz = 1; sz < MAXSZ; sz = nallocx(sz, 0) + 1) { @@ -33,11 +29,14 @@ TEST_BEGIN(test_basic) assert_zu_eq(nsz, rsz, "nallocx()/sallocx() rsize mismatch"); dallocx(p, 0); } +#undef MAXSZ } TEST_END TEST_BEGIN(test_alignment_and_size) { +#define MAXALIGN (((size_t)1) << 25) +#define NITER 4 size_t nsz, rsz, sz, alignment, total; unsigned i; void *ps[NITER]; @@ -87,6 +86,8 @@ TEST_BEGIN(test_alignment_and_size) } } } +#undef MAXALIGN +#undef NITER } TEST_END diff --git a/test/integration/overflow.c b/test/integration/overflow.c new file mode 100644 index 0000000..303d9b2 --- /dev/null +++ b/test/integration/overflow.c @@ -0,0 +1,49 @@ +#include "test/jemalloc_test.h" + +TEST_BEGIN(test_overflow) +{ + unsigned nhchunks; + size_t mib[4]; + size_t sz, miblen, max_size_class; + void *p; + + sz = sizeof(unsigned); + assert_d_eq(mallctl("arenas.nhchunks", &nhchunks, &sz, NULL, 0), 0, + "Unexpected mallctl() error"); + + miblen = sizeof(mib) / sizeof(size_t); + assert_d_eq(mallctlnametomib("arenas.hchunk.0.size", mib, &miblen), 0, + "Unexpected mallctlnametomib() error"); + mib[2] = nhchunks - 1; + + sz = sizeof(size_t); + assert_d_eq(mallctlbymib(mib, miblen, &max_size_class, &sz, NULL, 0), 0, + "Unexpected mallctlbymib() error"); + + assert_ptr_null(malloc(max_size_class + 1), + "Expected OOM due to over-sized allocation request"); + assert_ptr_null(malloc(SIZE_T_MAX), + "Expected OOM due to over-sized allocation request"); + + assert_ptr_null(calloc(1, max_size_class + 1), + "Expected OOM due to over-sized allocation request"); + assert_ptr_null(calloc(1, SIZE_T_MAX), + "Expected OOM due to over-sized allocation request"); + + p = malloc(1); + assert_ptr_not_null(p, "Unexpected malloc() OOM"); + assert_ptr_null(realloc(p, max_size_class + 1), + "Expected OOM due to over-sized allocation request"); + assert_ptr_null(realloc(p, SIZE_T_MAX), + "Expected OOM due to over-sized allocation request"); + free(p); +} +TEST_END + +int +main(void) +{ + + return (test( + test_overflow)); +} -- cgit v0.12 From a1aaf949a5d3b639f03dd7e33ffe1f0849b7f8df Mon Sep 17 00:00:00 2001 From: Matthijs Date: Thu, 25 Jun 2015 22:53:58 +0200 Subject: Optimizations for Windows - Set opt_lg_chunk based on run-time OS setting - Verify LG_PAGE is compatible with run-time OS setting - When targeting Windows Vista or newer, use SRWLOCK instead of CRITICAL_SECTION - When targeting Windows Vista or newer, statically initialize init_lock --- include/jemalloc/internal/mutex.h | 12 ++++++++++++ src/chunk.c | 17 ++++++++++++++++- 
src/jemalloc.c | 5 ++++- src/mutex.c | 4 ++++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h index 8a03d82..f051f29 100644 --- a/include/jemalloc/internal/mutex.h +++ b/include/jemalloc/internal/mutex.h @@ -26,7 +26,11 @@ typedef struct malloc_mutex_s malloc_mutex_t; struct malloc_mutex_s { #ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + SRWLOCK lock; +# else CRITICAL_SECTION lock; +# endif #elif (defined(JEMALLOC_OSSPIN)) OSSpinLock lock; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) @@ -70,7 +74,11 @@ malloc_mutex_lock(malloc_mutex_t *mutex) if (isthreaded) { #ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + AcquireSRWLockExclusive(&mutex->lock); +# else EnterCriticalSection(&mutex->lock); +# endif #elif (defined(JEMALLOC_OSSPIN)) OSSpinLockLock(&mutex->lock); #else @@ -85,7 +93,11 @@ malloc_mutex_unlock(malloc_mutex_t *mutex) if (isthreaded) { #ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + ReleaseSRWLockExclusive(&mutex->lock); +# else LeaveCriticalSection(&mutex->lock); +# endif #elif (defined(JEMALLOC_OSSPIN)) OSSpinLockUnlock(&mutex->lock); #else diff --git a/src/chunk.c b/src/chunk.c index 1279965..b600aba 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -5,7 +5,7 @@ /* Data. */ const char *opt_dss = DSS_DEFAULT; -size_t opt_lg_chunk = LG_CHUNK_DEFAULT; +size_t opt_lg_chunk = 0; /* Used exclusively for gdump triggering. */ static size_t curchunks; @@ -535,6 +535,21 @@ chunks_rtree_node_alloc(size_t nelms) bool chunk_boot(void) { +#ifdef _WIN32 + SYSTEM_INFO info; + GetSystemInfo(&info); + + /* Verify actual page size is equal to or an integral multiple of configured page size */ + if (info.dwPageSize & ((1U << LG_PAGE) - 1)) + return (true); + + /* Configure chunksize (if not set) to match granularity (usually 64K), so pages_map will always take fast path */ + if (!opt_lg_chunk) + opt_lg_chunk = ffs((int)info.dwAllocationGranularity) - 1; +#else + if (!opt_lg_chunk) + opt_lg_chunk = LG_CHUNK_DEFAULT; +#endif /* Set variables according to the value of opt_lg_chunk. */ chunksize = (ZU(1) << opt_lg_chunk); diff --git a/src/jemalloc.c b/src/jemalloc.c index 01cb394..43c4c81 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -175,6 +175,9 @@ static bool malloc_initializer = NO_INITIALIZER; /* Used to avoid initialization races. */ #ifdef _WIN32 +#if _WIN32_WINNT >= 0x0600 +static malloc_mutex_t init_lock = SRWLOCK_INIT; +#else static malloc_mutex_t init_lock; JEMALLOC_ATTR(constructor) @@ -190,7 +193,7 @@ _init_init_lock(void) JEMALLOC_SECTION(".CRT$XCU") JEMALLOC_ATTR(used) static const void (WINAPI *init_init_lock)(void) = _init_init_lock; #endif - +#endif #else static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; #endif diff --git a/src/mutex.c b/src/mutex.c index d86887e..2d47af9 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -73,9 +73,13 @@ malloc_mutex_init(malloc_mutex_t *mutex) { #ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + InitializeSRWLock(&mutex->lock); +# else if (!InitializeCriticalSectionAndSpinCount(&mutex->lock, _CRT_SPINCOUNT)) return (true); +# endif #elif (defined(JEMALLOC_OSSPIN)) mutex->lock = 0; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) -- cgit v0.12 From bce61d61bbe1b1b4ea15d1cbd3e24252b7e79c47 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 7 Jul 2015 09:32:05 -0700 Subject: Move a variable declaration closer to its use. 
--- src/arena.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index fa37e30..51e3b25 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2049,7 +2049,6 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) { void *ret; size_t usize; - uint64_t r; uintptr_t random_offset; arena_run_t *run; arena_chunk_map_misc_t *miscelm; @@ -2059,6 +2058,8 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) usize = s2u(size); malloc_mutex_lock(&arena->lock); if (config_cache_oblivious) { + uint64_t r; + /* * Compute a uniformly distributed offset within the first page * that is a multiple of the cacheline size, e.g. [0 .. 63) * 64 -- cgit v0.12 From be09b8145967842f3931c979918a730af81976ce Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 7 Jul 2015 09:33:22 -0700 Subject: Minor ChangeLog edit. --- ChangeLog | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 156d3c0..43a7d7b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,6 @@ Following are change highlights associated with official releases. Important -bug fixes are all mentioned, but internal enhancements are omitted here for -brevity (even though they are more fun to write about). Much more detail can be -found in the git revision history: +bug fixes are all mentioned, but some internal enhancements are omitted here for +brevity. Much more detail can be found in the git revision history: https://github.com/jemalloc/jemalloc -- cgit v0.12 From ad6800fec32668e34d41321dcc73eda135ef0f84 Mon Sep 17 00:00:00 2001 From: charsyam Date: Sat, 4 Jul 2015 01:06:06 +0900 Subject: Fix typos ChangeLog Fix typos ChangeLog --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 43a7d7b..49faa13 100644 --- a/ChangeLog +++ b/ChangeLog @@ -674,7 +674,7 @@ brevity. Much more detail can be found in the git revision history: - Make it possible for the application to manually flush a thread's cache, via the "tcache.flush" mallctl. - Base maximum dirty page count on proportion of active memory. - - Compute various addtional run-time statistics, including per size class + - Compute various additional run-time statistics, including per size class statistics for large objects. - Expose malloc_stats_print(), which can be called repeatedly by the application. -- cgit v0.12 From 0dd3ad3841fba31312ab6e2c9ddbc453dd14f6a5 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 7 Jul 2015 13:09:45 -0700 Subject: Fix an assignment type warning for tls_callback. --- src/tsd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tsd.c b/src/tsd.c index 3b59acf..b8886da 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -151,8 +151,8 @@ _tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) # pragma section(".CRT$XLY",long,read) #endif JEMALLOC_SECTION(".CRT$XLY") JEMALLOC_ATTR(used) -static const BOOL (WINAPI *tls_callback)(HINSTANCE hinstDLL, - DWORD fdwReason, LPVOID lpvReserved) = _tls_callback; +static BOOL (WINAPI *const tls_callback)(HINSTANCE hinstDLL, + DWORD fdwRerason, LPVOID lpvReserved) = _tls_callback; #endif #if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \ -- cgit v0.12 From 0313607e663294cd335da2545f10e949ee546fbc Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 7 Jul 2015 13:12:05 -0700 Subject: Fix MinGW build warnings. Conditionally define ENOENT, EINVAL, etc. (was unconditional). Add/use PRIzu, PRIzd, and PRIzx for use in malloc_printf() calls. 
gcc issued (harmless) warnings since e.g. "%zu" should be "%Iu" on Windows, and the alternative to this workaround would have been to disable the function attributes which cause gcc to look for type mismatches in formatted printing function calls. --- .../jemalloc/internal/jemalloc_internal_decls.h | 38 +++++++-- src/arena.c | 4 +- src/prof.c | 2 +- src/stats.c | 95 +++++++++++----------- 4 files changed, 82 insertions(+), 57 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h index b10561c..bf13c57 100644 --- a/include/jemalloc/internal/jemalloc_internal_decls.h +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -4,14 +4,27 @@ #include #ifdef _WIN32 # include -# define ENOENT ERROR_PATH_NOT_FOUND -# define EINVAL ERROR_BAD_ARGUMENTS -# define EAGAIN ERROR_OUTOFMEMORY -# define EPERM ERROR_WRITE_FAULT -# define EFAULT ERROR_INVALID_ADDRESS -# define ENOMEM ERROR_NOT_ENOUGH_MEMORY -# undef ERANGE -# define ERANGE ERROR_INVALID_DATA +# ifndef ENOENT +# define ENOENT ERROR_PATH_NOT_FOUND +# endif +# ifndef EINVAL +# define EINVAL ERROR_BAD_ARGUMENTS +# endif +# ifndef EAGAIN +# define EAGAIN ERROR_OUTOFMEMORY +# endif +# ifndef EPERM +# define EPERM ERROR_WRITE_FAULT +# endif +# ifndef EFAULT +# define EFAULT ERROR_INVALID_ADDRESS +# endif +# ifndef ENOMEM +# define ENOMEM ERROR_NOT_ENOUGH_MEMORY +# endif +# ifndef ERANGE +# define ERANGE ERROR_INVALID_DATA +# endif #else # include # include @@ -41,6 +54,15 @@ # define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) #endif #include +#ifdef _WIN32 +# define PRIzu "Iu" +# define PRIzd "Id" +# define PRIzx "Ix" +#else +# define PRIzu "zu" +# define PRIzd "zd" +# define PRIzx "zx" +#endif #include #include #include diff --git a/src/arena.c b/src/arena.c index 51e3b25..a8fae11 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1911,8 +1911,8 @@ arena_redzone_corruption(void *ptr, size_t usize, bool after, size_t offset, uint8_t byte) { - malloc_printf(": Corrupt redzone %zu byte%s %s %p " - "(size %zu), byte=%#x\n", offset, (offset == 1) ? "" : "s", + malloc_printf(": Corrupt redzone %"PRIzu" byte%s %s %p " + "(size %"PRIzu"), byte=%#x\n", offset, (offset == 1) ? "" : "s", after ? "after" : "before", ptr, usize, byte); } #ifdef JEMALLOC_JET diff --git a/src/prof.c b/src/prof.c index d097749..b24996a 100644 --- a/src/prof.c +++ b/src/prof.c @@ -1413,7 +1413,7 @@ prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_ngctx, if (cnt_all->curbytes != 0) { malloc_printf(": Leak summary: %"PRIu64" byte%s, %" - PRIu64" object%s, %zu context%s\n", + PRIu64" object%s, %"PRIzu" context%s\n", cnt_all->curbytes, (cnt_all->curbytes != 1) ? "s" : "", cnt_all->curobjs, (cnt_all->curobjs != 1) ? "s" : "", leak_ngctx, (leak_ngctx != 1) ? 
"s" : ""); diff --git a/src/stats.c b/src/stats.c index 6e1752e..57fd650 100644 --- a/src/stats.c +++ b/src/stats.c @@ -118,32 +118,33 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, : 1000; assert(milli <= 1000); if (milli < 10) { - malloc_snprintf(util, sizeof(util), "0.00%zu", - milli); + malloc_snprintf(util, sizeof(util), + "0.00%"PRIzu, milli); } else if (milli < 100) { - malloc_snprintf(util, sizeof(util), "0.0%zu", + malloc_snprintf(util, sizeof(util), "0.0%"PRIzu, milli); } else if (milli < 1000) { - malloc_snprintf(util, sizeof(util), "0.%zu", + malloc_snprintf(util, sizeof(util), "0.%"PRIzu, milli); } else malloc_snprintf(util, sizeof(util), "1"); if (config_tcache) { malloc_cprintf(write_cb, cbopaque, - "%20zu %3u %12zu %12"PRIu64" %12"PRIu64 - " %12"PRIu64" %12zu %12zu %4u %3zu %-5s" - " %12"PRIu64" %12"PRIu64" %12"PRIu64 - " %12"PRIu64"\n", + "%20"PRIzu" %3u %12"PRIzu" %12"PRIu64 + " %12"PRIu64" %12"PRIu64" %12"PRIzu + " %12"PRIzu" %4u %3"PRIzu" %-5s %12"PRIu64 + " %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", reg_size, j, curregs * reg_size, nmalloc, ndalloc, nrequests, curregs, curruns, nregs, run_size / page, util, nfills, nflushes, nruns, reruns); } else { malloc_cprintf(write_cb, cbopaque, - "%20zu %3u %12zu %12"PRIu64" %12"PRIu64 - " %12"PRIu64" %12zu %12zu %4u %3zu %-5s" - " %12"PRIu64" %12"PRIu64"\n", + "%20"PRIzu" %3u %12"PRIzu" %12"PRIu64 + " %12"PRIu64" %12"PRIu64" %12"PRIzu + " %12"PRIzu" %4u %3"PRIzu" %-5s %12"PRIu64 + " %12"PRIu64"\n", reg_size, j, curregs * reg_size, nmalloc, ndalloc, nrequests, curregs, curruns, nregs, run_size / page, util, nruns, reruns); @@ -190,8 +191,8 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, in_gap = false; } malloc_cprintf(write_cb, cbopaque, - "%20zu %3u %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 - " %12zu\n", + "%20"PRIzu" %3u %12"PRIzu" %12"PRIu64" %12"PRIu64 + " %12"PRIu64" %12"PRIzu"\n", run_size, nbins + j, curruns * run_size, nmalloc, ndalloc, nrequests, curruns); } @@ -238,8 +239,8 @@ stats_arena_hchunks_print(void (*write_cb)(void *, const char *), in_gap = false; } malloc_cprintf(write_cb, cbopaque, - "%20zu %3u %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 - " %12zu\n", + "%20"PRIzu" %3u %12"PRIzu" %12"PRIu64" %12"PRIu64 + " %12"PRIu64" %12"PRIzu"\n", hchunk_size, nbins + nlruns + j, curhchunks * hchunk_size, nmalloc, ndalloc, nrequests, curhchunks); @@ -291,7 +292,7 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.nmadvise", i, &nmadvise, uint64_t); CTL_M2_GET("stats.arenas.0.purged", i, &purged, uint64_t); malloc_cprintf(write_cb, cbopaque, - "dirty pages: %zu:%zu active:dirty, %"PRIu64" sweep%s," + "dirty pages: %"PRIzu":%"PRIzu" active:dirty, %"PRIu64" sweep%s," " %"PRIu64" madvise%s, %"PRIu64" purged\n", pactive, pdirty, npurge, npurge == 1 ? "" : "s", nmadvise, nmadvise == 1 ? 
"" : "s", purged); @@ -306,8 +307,8 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.small.nrequests", i, &small_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "small: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 - "\n", + "small: %12"PRIzu" %12"PRIu64" %12"PRIu64 + " %12"PRIu64"\n", small_allocated, small_nmalloc, small_ndalloc, small_nrequests); CTL_M2_GET("stats.arenas.0.large.allocated", i, &large_allocated, size_t); @@ -316,8 +317,8 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.large.nrequests", i, &large_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "large: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 - "\n", + "large: %12"PRIzu" %12"PRIu64" %12"PRIu64 + " %12"PRIu64"\n", large_allocated, large_nmalloc, large_ndalloc, large_nrequests); CTL_M2_GET("stats.arenas.0.huge.allocated", i, &huge_allocated, size_t); CTL_M2_GET("stats.arenas.0.huge.nmalloc", i, &huge_nmalloc, uint64_t); @@ -325,28 +326,28 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.huge.nrequests", i, &huge_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "huge: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 - "\n", + "huge: %12"PRIzu" %12"PRIu64" %12"PRIu64 + " %12"PRIu64"\n", huge_allocated, huge_nmalloc, huge_ndalloc, huge_nrequests); malloc_cprintf(write_cb, cbopaque, - "total: %12zu %12"PRIu64" %12"PRIu64" %12"PRIu64 - "\n", + "total: %12"PRIzu" %12"PRIu64" %12"PRIu64 + " %12"PRIu64"\n", small_allocated + large_allocated + huge_allocated, small_nmalloc + large_nmalloc + huge_nmalloc, small_ndalloc + large_ndalloc + huge_ndalloc, small_nrequests + large_nrequests + huge_nrequests); - malloc_cprintf(write_cb, cbopaque, "active: %12zu\n", - pactive * page); + malloc_cprintf(write_cb, cbopaque, + "active: %12"PRIzu"\n", pactive * page); CTL_M2_GET("stats.arenas.0.mapped", i, &mapped, size_t); - malloc_cprintf(write_cb, cbopaque, "mapped: %12zu\n", - mapped); + malloc_cprintf(write_cb, cbopaque, + "mapped: %12"PRIzu"\n", mapped); CTL_M2_GET("stats.arenas.0.metadata.mapped", i, &metadata_mapped, size_t); CTL_M2_GET("stats.arenas.0.metadata.allocated", i, &metadata_allocated, size_t); malloc_cprintf(write_cb, cbopaque, - "metadata: mapped: %zu, allocated: %zu\n", metadata_mapped, - metadata_allocated); + "metadata: mapped: %"PRIzu", allocated: %"PRIzu"\n", + metadata_mapped, metadata_allocated); if (bins) stats_arena_bins_print(write_cb, cbopaque, i); @@ -456,20 +457,20 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, #define OPT_WRITE_SIZE_T(n) \ if (je_mallctl("opt."#n, &sv, &ssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %zu\n", sv); \ + " opt."#n": %"PRIzu"\n", sv); \ } #define OPT_WRITE_SSIZE_T(n) \ if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %zd\n", ssv); \ + " opt."#n": %"PRIzd"\n", ssv); \ } #define OPT_WRITE_SSIZE_T_MUTABLE(n, m) { \ ssize_t ssv2; \ if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0 && \ je_mallctl(#m, &ssv2, &sssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %zd ("#m": %zd)\n", ssv, \ - ssv2); \ + " opt."#n": %"PRIzd" ("#m": %"PRIzd")\n", \ + ssv, ssv2); \ } \ } #define OPT_WRITE_CHAR_P(n) \ @@ -518,14 +519,15 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("arenas.narenas", &uv, unsigned); malloc_cprintf(write_cb, cbopaque, "Arenas: %u\n", 
uv); - malloc_cprintf(write_cb, cbopaque, "Pointer size: %zu\n", + malloc_cprintf(write_cb, cbopaque, "Pointer size: %"PRIzu"\n", sizeof(void *)); CTL_GET("arenas.quantum", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, "Quantum size: %zu\n", sv); + malloc_cprintf(write_cb, cbopaque, "Quantum size: %"PRIzu"\n", + sv); CTL_GET("arenas.page", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, "Page size: %zu\n", sv); + malloc_cprintf(write_cb, cbopaque, "Page size: %"PRIzu"\n", sv); CTL_GET("arenas.lg_dirty_mult", &ssv, ssize_t); if (ssv >= 0) { @@ -538,19 +540,19 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } if (je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0) == 0) { malloc_cprintf(write_cb, cbopaque, - "Maximum thread-cached size class: %zu\n", sv); + "Maximum thread-cached size class: %"PRIzu"\n", sv); } if (je_mallctl("opt.prof", &bv, &bsz, NULL, 0) == 0 && bv) { CTL_GET("prof.lg_sample", &sv, size_t); malloc_cprintf(write_cb, cbopaque, "Average profile sample interval: %"PRIu64 - " (2^%zu)\n", (((uint64_t)1U) << sv), sv); + " (2^%"PRIzu")\n", (((uint64_t)1U) << sv), sv); CTL_GET("opt.lg_prof_interval", &ssv, ssize_t); if (ssv >= 0) { malloc_cprintf(write_cb, cbopaque, "Average profile dump interval: %"PRIu64 - " (2^%zd)\n", + " (2^%"PRIzd")\n", (((uint64_t)1U) << ssv), ssv); } else { malloc_cprintf(write_cb, cbopaque, @@ -558,8 +560,8 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } } CTL_GET("opt.lg_chunk", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, "Chunk size: %zu (2^%zu)\n", - (ZU(1) << sv), sv); + malloc_cprintf(write_cb, cbopaque, + "Chunk size: %"PRIzu" (2^%"PRIzu")\n", (ZU(1) << sv), sv); } if (config_stats) { @@ -573,11 +575,12 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("stats.resident", &resident, size_t); CTL_GET("stats.mapped", &mapped, size_t); malloc_cprintf(write_cb, cbopaque, - "Allocated: %zu, active: %zu, metadata: %zu, resident: %zu," - " mapped: %zu\n", allocated, active, metadata, resident, - mapped); + "Allocated: %"PRIzu", active: %"PRIzu", metadata: %"PRIzu"," + " resident: %"PRIzu", mapped: %"PRIzu"\n", + allocated, active, metadata, resident, mapped); malloc_cprintf(write_cb, cbopaque, - "Current active ceiling: %zu\n", atomic_read_z(cactive)); + "Current active ceiling: %"PRIzu"\n", + atomic_read_z(cactive)); if (merged) { unsigned narenas; -- cgit v0.12 From b946086b08fd9f1d989b6c26db3d734d7cfe0be4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 7 Jul 2015 20:16:25 -0700 Subject: Use jemalloc_ffs() rather than ffs(). --- src/chunk.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/chunk.c b/src/chunk.c index b600aba..9a7bd45 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -539,13 +539,21 @@ chunk_boot(void) SYSTEM_INFO info; GetSystemInfo(&info); - /* Verify actual page size is equal to or an integral multiple of configured page size */ + /* + * Verify actual page size is equal to or an integral multiple of + * configured page size. + */ if (info.dwPageSize & ((1U << LG_PAGE) - 1)) return (true); - /* Configure chunksize (if not set) to match granularity (usually 64K), so pages_map will always take fast path */ - if (!opt_lg_chunk) - opt_lg_chunk = ffs((int)info.dwAllocationGranularity) - 1; + /* + * Configure chunksize (if not set) to match granularity (usually 64K), + * so pages_map will always take fast path. 
+ */ + if (!opt_lg_chunk) { + opt_lg_chunk = jemalloc_ffs((int)info.dwAllocationGranularity) + - 1; + } #else if (!opt_lg_chunk) opt_lg_chunk = LG_CHUNK_DEFAULT; -- cgit v0.12 From d508ec71ebc7985b02851f79765f025a555d7061 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 7 Jul 2015 20:28:22 -0700 Subject: Fix a variable declaration typo. --- src/tsd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tsd.c b/src/tsd.c index b8886da..2100833 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -152,7 +152,7 @@ _tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) #endif JEMALLOC_SECTION(".CRT$XLY") JEMALLOC_ATTR(used) static BOOL (WINAPI *const tls_callback)(HINSTANCE hinstDLL, - DWORD fdwRerason, LPVOID lpvReserved) = _tls_callback; + DWORD fdwReason, LPVOID lpvReserved) = _tls_callback; #endif #if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \ -- cgit v0.12 From 32dca1191c297cff0844daf66cb1f26554fe969b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 9 Jul 2015 11:34:13 -0700 Subject: Add a missing ChangeLog entry. --- ChangeLog | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ChangeLog b/ChangeLog index 49faa13..950c656 100644 --- a/ChangeLog +++ b/ChangeLog @@ -150,6 +150,9 @@ brevity. Much more detail can be found in the git revision history: memory decreases. This regression was first released in 3.5.0. - Fix OOM handling in memalign() and valloc(). A variant of this bug existed in all releases since 2.0.0, which introduced these functions. + - Fix an OOM-related regression in arena_tcache_fill_small(), which could + cause cache corruption on OOM. This regression was present in all releases + from 2.2.0 through 3.6.0. - Fix size class overflow handling for malloc(), posix_memalign(), memalign(), calloc(), and realloc() when profiling is enabled. - Fix the "arena..dss" mallctl to return an error if "primary" or -- cgit v0.12 From 7ae1239177c2e2d4e57d03468539f522bb38406a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 9 Jul 2015 16:32:49 -0700 Subject: Fix indentation. --- test/unit/lg_chunk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/lg_chunk.c b/test/unit/lg_chunk.c index 7f0b31c..7e5df38 100644 --- a/test/unit/lg_chunk.c +++ b/test/unit/lg_chunk.c @@ -22,5 +22,5 @@ main(void) { return (test( - test_lg_chunk_clamp)); + test_lg_chunk_clamp)); } -- cgit v0.12 From dde067264db6b801f7ffae9616a35dba5d2d9ad4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 9 Jul 2015 21:36:33 -0700 Subject: Fix an integer overflow bug in {size2index,s2u}_compute(). This {bug,regression} was introduced by 155bfa7da18cab0d21d87aa2dce4554166836f5d (Normalize size classes.). This resolves #241. 
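A 64-bit illustration of the overflow (LG_SIZEOF_PTR == 3; values chosen for
clarity):

    /*
     * size = 2^63 + 1:
     *   (size << 1) wraps to 2, so (size << 1) - 1 == 1 and lg_floor() == 0;
     *   pre-patch, size2index_compute() then produced a small bin index for
     *   an impossibly large request instead of signalling overflow.
     * The fix pins x to 64 (or 63 when size == 2^63) whenever the sign bit of
     * size is set, so the computed class reflects an over-sized request
     * rather than a small one.
     */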
--- Makefile.in | 1 + include/jemalloc/internal/jemalloc_internal.h.in | 8 ++- test/unit/size_classes.c | 89 ++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 test/unit/size_classes.c diff --git a/Makefile.in b/Makefile.in index 0dcdb5f..02f4424 100644 --- a/Makefile.in +++ b/Makefile.in @@ -139,6 +139,7 @@ TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/rb.c \ $(srcroot)test/unit/rtree.c \ $(srcroot)test/unit/SFMT.c \ + $(srcroot)test/unit/size_classes.c \ $(srcroot)test/unit/stats.c \ $(srcroot)test/unit/tsd.c \ $(srcroot)test/unit/util.c \ diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index ff9412a..706c198 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -528,7 +528,9 @@ size2index_compute(size_t size) } #endif { - size_t x = lg_floor((size<<1)-1); + size_t x = unlikely(ZI(size) < 0) ? ((size<<1) ? + (ZU(1)<<(LG_SIZEOF_PTR+3)) : ((ZU(1)<<(LG_SIZEOF_PTR+3))-1)) + : lg_floor((size<<1)-1); size_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 0 : x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM); size_t grp = shift << LG_SIZE_CLASS_GROUP; @@ -624,7 +626,9 @@ s2u_compute(size_t size) } #endif { - size_t x = lg_floor((size<<1)-1); + size_t x = unlikely(ZI(size) < 0) ? ((size<<1) ? + (ZU(1)<<(LG_SIZEOF_PTR+3)) : ((ZU(1)<<(LG_SIZEOF_PTR+3))-1)) + : lg_floor((size<<1)-1); size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; size_t delta = ZU(1) << lg_delta; diff --git a/test/unit/size_classes.c b/test/unit/size_classes.c new file mode 100644 index 0000000..d791834 --- /dev/null +++ b/test/unit/size_classes.c @@ -0,0 +1,89 @@ +#include "test/jemalloc_test.h" + +static size_t +get_max_size_class(void) +{ + unsigned nhchunks; + size_t mib[4]; + size_t sz, miblen, max_size_class; + + sz = sizeof(unsigned); + assert_d_eq(mallctl("arenas.nhchunks", &nhchunks, &sz, NULL, 0), 0, + "Unexpected mallctl() error"); + + miblen = sizeof(mib) / sizeof(size_t); + assert_d_eq(mallctlnametomib("arenas.hchunk.0.size", mib, &miblen), 0, + "Unexpected mallctlnametomib() error"); + mib[2] = nhchunks - 1; + + sz = sizeof(size_t); + assert_d_eq(mallctlbymib(mib, miblen, &max_size_class, &sz, NULL, 0), 0, + "Unexpected mallctlbymib() error"); + + return (max_size_class); +} + +TEST_BEGIN(test_size_classes) +{ + size_t size_class, max_size_class; + index_t index, max_index; + + max_size_class = get_max_size_class(); + max_index = size2index(max_size_class); + + for (index = 0, size_class = index2size(index); index < max_index || + size_class < max_size_class; index++, size_class = + index2size(index)) { + assert_true(index < max_index, + "Loop conditionals should be equivalent; index=%u, " + "size_class=%zu (%#zx)", index, size_class, size_class); + assert_true(size_class < max_size_class, + "Loop conditionals should be equivalent; index=%u, " + "size_class=%zu (%#zx)", index, size_class, size_class); + + assert_u_eq(index, size2index(size_class), + "size2index() does not reverse index2size(): index=%u -->" + " size_class=%zu --> index=%u --> size_class=%zu", index, + size_class, size2index(size_class), + index2size(size2index(size_class))); + assert_zu_eq(size_class, index2size(size2index(size_class)), + "index2size() does not reverse size2index(): index=%u -->" + " size_class=%zu --> index=%u --> size_class=%zu", index, + size_class, size2index(size_class), + 
index2size(size2index(size_class))); + + assert_u_eq(index+1, size2index(size_class+1), + "Next size_class does not round up properly"); + + assert_zu_eq(size_class, (index > 0) ? + s2u(index2size(index-1)+1) : s2u(1), + "s2u() does not round up to size class"); + assert_zu_eq(size_class, s2u(size_class-1), + "s2u() does not round up to size class"); + assert_zu_eq(size_class, s2u(size_class), + "s2u() does not compute same size class"); + assert_zu_eq(s2u(size_class+1), index2size(index+1), + "s2u() does not round up to next size class"); + } + + assert_u_eq(index, size2index(index2size(index)), + "size2index() does not reverse index2size()"); + assert_zu_eq(max_size_class, index2size(size2index(max_size_class)), + "index2size() does not reverse size2index()"); + + assert_zu_eq(size_class, s2u(index2size(index-1)+1), + "s2u() does not round up to size class"); + assert_zu_eq(size_class, s2u(size_class-1), + "s2u() does not round up to size class"); + assert_zu_eq(size_class, s2u(size_class), + "s2u() does not compute same size class"); +} +TEST_END + +int +main(void) +{ + + return (test( + test_size_classes)); +} -- cgit v0.12 From ae93d6bf364e9db9f9ee69c3e5f9df110d8685a4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 10 Jul 2015 14:33:00 -0700 Subject: Avoid function prototype incompatibilities. Add various function attributes to the exported functions to give the compiler more information to work with during optimization, and also specify throw() when compiling with C++ on Linux, in order to adequately match what __THROW does in glibc. This resolves #237. --- configure.ac | 1 + include/jemalloc/jemalloc_defs.h.in | 7 ++++ include/jemalloc/jemalloc_macros.h.in | 21 ++++++++++-- include/jemalloc/jemalloc_protos.h.in | 50 +++++++++++++++++------------ src/jemalloc.c | 60 +++++++++++++++++++++++------------ test/unit/bitmap.c | 8 ++--- test/unit/rtree.c | 2 +- 7 files changed, 100 insertions(+), 49 deletions(-) diff --git a/configure.ac b/configure.ac index 61adc2a..97aa2ad 100644 --- a/configure.ac +++ b/configure.ac @@ -301,6 +301,7 @@ case "${host}" in AC_DEFINE([JEMALLOC_HAS_ALLOCA_H]) AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED], [ ]) AC_DEFINE([JEMALLOC_THREADED_INIT], [ ]) + AC_DEFINE([JEMALLOC_USE_CXX_THROW], [ ]) default_munmap="0" ;; *-*-netbsd*) diff --git a/include/jemalloc/jemalloc_defs.h.in b/include/jemalloc/jemalloc_defs.h.in index ce6c698..4c0335e 100644 --- a/include/jemalloc/jemalloc_defs.h.in +++ b/include/jemalloc/jemalloc_defs.h.in @@ -17,5 +17,12 @@ */ #undef JEMALLOC_USABLE_SIZE_CONST +/* + * If defined, specify throw() for the public function prototypes when compiling + * with C++. The only justification for this is to match the prototypes that + * glibc defines. + */ +#undef JEMALLOC_USE_CXX_THROW + /* sizeof(void *) == 2^LG_SIZEOF_PTR. 
*/ #undef LG_SIZEOF_PTR diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 72f2a08..2183a13 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -30,14 +30,23 @@ */ # define MALLOCX_ARENA(a) ((int)(((a)+1) << 20)) +#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) +# define JEMALLOC_CXX_THROW throw() +#else +# define JEMALLOC_CXX_THROW +#endif + #ifdef JEMALLOC_HAVE_ATTR # define JEMALLOC_ATTR(s) __attribute__((s)) # ifndef JEMALLOC_EXPORT # define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) # endif # define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) -# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) # define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) +# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) #elif _MSC_VER # define JEMALLOC_ATTR(s) # ifndef JEMALLOC_EXPORT @@ -48,12 +57,18 @@ # endif # endif # define JEMALLOC_ALIGNED(s) __declspec(align(s)) -# define JEMALLOC_SECTION(s) __declspec(allocate(s)) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) # define JEMALLOC_NOINLINE __declspec(noinline) +# define JEMALLOC_NOTHROW __declspec(nothrow) +# define JEMALLOC_SECTION(s) __declspec(allocate(s)) #else # define JEMALLOC_ATTR(s) # define JEMALLOC_EXPORT # define JEMALLOC_ALIGNED(s) -# define JEMALLOC_SECTION(s) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) # define JEMALLOC_NOINLINE +# define JEMALLOC_NOTHROW +# define JEMALLOC_SECTION(s) #endif diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index f81adc1..e77bd28 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -7,44 +7,52 @@ extern JEMALLOC_EXPORT const char *@je_@malloc_conf; extern JEMALLOC_EXPORT void (*@je_@malloc_message)(void *cbopaque, const char *s); -JEMALLOC_EXPORT void *@je_@malloc(size_t size) JEMALLOC_ATTR(malloc); -JEMALLOC_EXPORT void *@je_@calloc(size_t num, size_t size) - JEMALLOC_ATTR(malloc); +JEMALLOC_EXPORT void *@je_@malloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) JEMALLOC_NOTHROW; +JEMALLOC_EXPORT void *@je_@calloc(size_t num, size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) JEMALLOC_NOTHROW; JEMALLOC_EXPORT int @je_@posix_memalign(void **memptr, size_t alignment, - size_t size) JEMALLOC_ATTR(nonnull(1)); + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)) + JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; JEMALLOC_EXPORT void *@je_@aligned_alloc(size_t alignment, size_t size) - JEMALLOC_ATTR(malloc); -JEMALLOC_EXPORT void *@je_@realloc(void *ptr, size_t size); -JEMALLOC_EXPORT void @je_@free(void *ptr); + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) + JEMALLOC_NOTHROW; +JEMALLOC_EXPORT void *@je_@realloc(void *ptr, size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; +JEMALLOC_EXPORT void @je_@free(void *ptr) JEMALLOC_CXX_THROW + JEMALLOC_NOTHROW; JEMALLOC_EXPORT void *@je_@mallocx(size_t size, int flags) - JEMALLOC_ATTR(malloc); -JEMALLOC_EXPORT void *@je_@rallocx(void *ptr, size_t size, int flags); + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) JEMALLOC_NOTHROW; +JEMALLOC_EXPORT void *@je_@rallocx(void *ptr, size_t size, int flags) + JEMALLOC_ALLOC_SIZE(2) 
JEMALLOC_NOTHROW; JEMALLOC_EXPORT size_t @je_@xallocx(void *ptr, size_t size, size_t extra, - int flags); + int flags) JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; JEMALLOC_EXPORT size_t @je_@sallocx(const void *ptr, int flags) - JEMALLOC_ATTR(pure); -JEMALLOC_EXPORT void @je_@dallocx(void *ptr, int flags); -JEMALLOC_EXPORT void @je_@sdallocx(void *ptr, size_t size, int flags); + JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW; +JEMALLOC_EXPORT void @je_@dallocx(void *ptr, int flags) JEMALLOC_NOTHROW; +JEMALLOC_EXPORT void @je_@sdallocx(void *ptr, size_t size, int flags) + JEMALLOC_NOTHROW; JEMALLOC_EXPORT size_t @je_@nallocx(size_t size, int flags) - JEMALLOC_ATTR(pure); + JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW; JEMALLOC_EXPORT int @je_@mallctl(const char *name, void *oldp, - size_t *oldlenp, void *newp, size_t newlen); + size_t *oldlenp, void *newp, size_t newlen) JEMALLOC_NOTHROW; JEMALLOC_EXPORT int @je_@mallctlnametomib(const char *name, size_t *mibp, - size_t *miblenp); + size_t *miblenp) JEMALLOC_NOTHROW; JEMALLOC_EXPORT int @je_@mallctlbymib(const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen); + void *oldp, size_t *oldlenp, void *newp, size_t newlen) JEMALLOC_NOTHROW; JEMALLOC_EXPORT void @je_@malloc_stats_print(void (*write_cb)(void *, - const char *), void *@je_@cbopaque, const char *opts); + const char *), void *@je_@cbopaque, const char *opts) JEMALLOC_NOTHROW; JEMALLOC_EXPORT size_t @je_@malloc_usable_size( - JEMALLOC_USABLE_SIZE_CONST void *ptr); + JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW JEMALLOC_NOTHROW; #ifdef JEMALLOC_OVERRIDE_MEMALIGN -JEMALLOC_EXPORT void * @je_@memalign(size_t alignment, size_t size) +JEMALLOC_EXPORT void *@je_@memalign(size_t alignment, size_t size) JEMALLOC_ATTR(malloc); #endif #ifdef JEMALLOC_OVERRIDE_VALLOC -JEMALLOC_EXPORT void * @je_@valloc(size_t size) JEMALLOC_ATTR(malloc); +JEMALLOC_EXPORT void *@je_@valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); #endif diff --git a/src/jemalloc.c b/src/jemalloc.c index 43c4c81..fc223da 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1395,7 +1395,8 @@ imalloc_body(size_t size, tsd_t **tsd, size_t *usize) return (imalloc(*tsd, size)); } -void * +JEMALLOC_EXPORT void * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) JEMALLOC_NOTHROW je_malloc(size_t size) { void *ret; @@ -1529,7 +1530,8 @@ label_oom: goto label_return; } -int +JEMALLOC_EXPORT int +JEMALLOC_ATTR(nonnull(1)) JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW je_posix_memalign(void **memptr, size_t alignment, size_t size) { int ret = imemalign(memptr, alignment, size, sizeof(void *)); @@ -1538,7 +1540,8 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) return (ret); } -void * +JEMALLOC_EXPORT void * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW je_aligned_alloc(size_t alignment, size_t size) { void *ret; @@ -1591,7 +1594,8 @@ icalloc_prof(tsd_t *tsd, size_t usize) return (p); } -void * +JEMALLOC_EXPORT void * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) JEMALLOC_NOTHROW je_calloc(size_t num, size_t size) { void *ret; @@ -1735,7 +1739,8 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache) JEMALLOC_VALGRIND_FREE(ptr, rzsize); } -void * +JEMALLOC_EXPORT void * +JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW je_realloc(void *ptr, size_t size) { void *ret; @@ -1798,7 +1803,8 @@ je_realloc(void *ptr, size_t size) return (ret); } -void +JEMALLOC_EXPORT void +JEMALLOC_NOTHROW je_free(void *ptr) { @@ -1818,7 +1824,8 @@ je_free(void *ptr) */ #ifdef 
JEMALLOC_OVERRIDE_MEMALIGN -void * +JEMALLOC_EXPORT void * +JEMALLOC_ATTR(malloc) je_memalign(size_t alignment, size_t size) { void *ret JEMALLOC_CC_SILENCE_INIT(NULL); @@ -1830,7 +1837,8 @@ je_memalign(size_t alignment, size_t size) #endif #ifdef JEMALLOC_OVERRIDE_VALLOC -void * +JEMALLOC_EXPORT void * +JEMALLOC_ATTR(malloc) je_valloc(size_t size) { void *ret JEMALLOC_CC_SILENCE_INIT(NULL); @@ -2024,7 +2032,8 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) return (p); } -void * +JEMALLOC_EXPORT void * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) JEMALLOC_NOTHROW je_mallocx(size_t size, int flags) { tsd_t *tsd; @@ -2121,7 +2130,8 @@ irallocx_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, return (p); } -void * +JEMALLOC_EXPORT void * +JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW je_rallocx(void *ptr, size_t size, int flags) { void *p; @@ -2266,7 +2276,8 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, return (usize); } -size_t +JEMALLOC_EXPORT size_t +JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, size_t extra, int flags) { tsd_t *tsd; @@ -2307,7 +2318,8 @@ label_not_resized: return (usize); } -size_t +JEMALLOC_EXPORT size_t +JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW je_sallocx(const void *ptr, int flags) { size_t usize; @@ -2323,7 +2335,8 @@ je_sallocx(const void *ptr, int flags) return (usize); } -void +JEMALLOC_EXPORT void +JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags) { tsd_t *tsd; @@ -2358,7 +2371,8 @@ inallocx(size_t size, int flags) return (usize); } -void +JEMALLOC_EXPORT void +JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, int flags) { tsd_t *tsd; @@ -2383,7 +2397,8 @@ je_sdallocx(void *ptr, size_t size, int flags) isfree(tsd, ptr, usize, tcache); } -size_t +JEMALLOC_EXPORT size_t +JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) { @@ -2395,7 +2410,8 @@ je_nallocx(size_t size, int flags) return (inallocx(size, flags)); } -int +JEMALLOC_EXPORT int +JEMALLOC_NOTHROW je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { @@ -2406,7 +2422,8 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, return (ctl_byname(name, oldp, oldlenp, newp, newlen)); } -int +JEMALLOC_EXPORT int +JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) { @@ -2416,7 +2433,8 @@ je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) return (ctl_nametomib(name, mibp, miblenp)); } -int +JEMALLOC_EXPORT int +JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { @@ -2427,7 +2445,8 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen)); } -void +JEMALLOC_EXPORT void +JEMALLOC_NOTHROW je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, const char *opts) { @@ -2435,7 +2454,8 @@ je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, stats_print(write_cb, cbopaque, opts); } -size_t +JEMALLOC_EXPORT size_t +JEMALLOC_NOTHROW je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) { size_t ret; diff --git a/test/unit/bitmap.c b/test/unit/bitmap.c index 4ea94f8..7da583d 100644 --- a/test/unit/bitmap.c +++ b/test/unit/bitmap.c @@ -23,7 +23,7 @@ TEST_BEGIN(test_bitmap_init) bitmap_info_init(&binfo, i); { size_t j; - bitmap_t *bitmap = malloc(sizeof(bitmap_t) * + bitmap_t *bitmap = (bitmap_t 
*)malloc(sizeof(bitmap_t) * bitmap_info_ngroups(&binfo)); bitmap_init(bitmap, &binfo); @@ -46,7 +46,7 @@ TEST_BEGIN(test_bitmap_set) bitmap_info_init(&binfo, i); { size_t j; - bitmap_t *bitmap = malloc(sizeof(bitmap_t) * + bitmap_t *bitmap = (bitmap_t *)malloc(sizeof(bitmap_t) * bitmap_info_ngroups(&binfo)); bitmap_init(bitmap, &binfo); @@ -69,7 +69,7 @@ TEST_BEGIN(test_bitmap_unset) bitmap_info_init(&binfo, i); { size_t j; - bitmap_t *bitmap = malloc(sizeof(bitmap_t) * + bitmap_t *bitmap = (bitmap_t *)malloc(sizeof(bitmap_t) * bitmap_info_ngroups(&binfo)); bitmap_init(bitmap, &binfo); @@ -98,7 +98,7 @@ TEST_BEGIN(test_bitmap_sfu) bitmap_info_init(&binfo, i); { ssize_t j; - bitmap_t *bitmap = malloc(sizeof(bitmap_t) * + bitmap_t *bitmap = (bitmap_t *)malloc(sizeof(bitmap_t) * bitmap_info_ngroups(&binfo)); bitmap_init(bitmap, &binfo); diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 3f95554..3d75bd0 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -4,7 +4,7 @@ static rtree_node_elm_t * node_alloc(size_t nelms) { - return (calloc(nelms, sizeof(rtree_node_elm_t))); + return ((rtree_node_elm_t *)calloc(nelms, sizeof(rtree_node_elm_t))); } static void -- cgit v0.12 From 0b8f0bc0a41ad6db469097bf257d85a44f839f5f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 10 Jul 2015 16:41:12 -0700 Subject: Add configure test for alloc_size attribute. --- configure.ac | 11 +++++++++++ include/jemalloc/jemalloc_defs.h.in | 3 +++ include/jemalloc/jemalloc_macros.h.in | 9 +++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 97aa2ad..1363ee9 100644 --- a/configure.ac +++ b/configure.ac @@ -428,6 +428,17 @@ if test "x${je_cv_tls_model}" = "xyes" ; then else AC_DEFINE([JEMALLOC_TLS_MODEL], [ ]) fi +dnl Check for alloc_size attribute support. +SAVED_CFLAGS="${CFLAGS}" +JE_CFLAGS_APPEND([-Werror]) +JE_COMPILABLE([alloc_size attribute], [], + [#include + static void *foo(size_t size) __attribute__((alloc_size(1)));], + [je_cv_alloc_size]) +CFLAGS="${SAVED_CFLAGS}" +if test "x${je_cv_alloc_size}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_ALLOC_SIZE], [ ]) +fi dnl Support optional additions to rpath. AC_ARG_WITH([rpath], diff --git a/include/jemalloc/jemalloc_defs.h.in b/include/jemalloc/jemalloc_defs.h.in index 4c0335e..1f47c3e 100644 --- a/include/jemalloc/jemalloc_defs.h.in +++ b/include/jemalloc/jemalloc_defs.h.in @@ -1,6 +1,9 @@ /* Defined if __attribute__((...)) syntax is supported. */ #undef JEMALLOC_HAVE_ATTR +/* Defined if alloc_size attribute is supported. */ +#undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE + /* * Define overrides for non-standard allocator-related functions if they are * present on the system. 
diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 2183a13..6ba8f9a 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -42,8 +42,13 @@ # define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) # endif # define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) -# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +# else +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# endif # define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) # define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) # define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) -- cgit v0.12 From 92d72eeef0d7e08da9de1094546330d5facba11d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 10 Jul 2015 16:45:32 -0700 Subject: Fix alloc_size configure test. --- configure.ac | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 1363ee9..c6388f7 100644 --- a/configure.ac +++ b/configure.ac @@ -431,9 +431,8 @@ fi dnl Check for alloc_size attribute support. SAVED_CFLAGS="${CFLAGS}" JE_CFLAGS_APPEND([-Werror]) -JE_COMPILABLE([alloc_size attribute], [], - [#include - static void *foo(size_t size) __attribute__((alloc_size(1)));], +JE_COMPILABLE([alloc_size attribute], [#include ], + [void *foo(size_t size) __attribute__((alloc_size(1)));], [je_cv_alloc_size]) CFLAGS="${SAVED_CFLAGS}" if test "x${je_cv_alloc_size}" = "xyes" ; then -- cgit v0.12 From 8693a9ea05931e69b30d57767405436d36ed709c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 13 Jul 2015 14:35:15 -0700 Subject: Add timer support for Windows. 
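The Windows branch added below reads the clock with GetSystemTimeAsFileTime() and later converts the difference to microseconds. The conversion works because FILETIME counts 100-nanosecond ticks split across two 32-bit fields; a sketch of that arithmetic, assuming a Windows build environment (helper names are made up for illustration):

#include <windows.h>
#include <stdint.h>

/* Combine the two 32-bit halves of a FILETIME into a 64-bit tick count. */
static uint64_t
filetime_to_ticks(const FILETIME *ft)
{
	return ((((uint64_t)ft->dwHighDateTime) << 32) | ft->dwLowDateTime);
}

/* One tick is 100 ns, so dividing the tick delta by 10 yields microseconds. */
static uint64_t
filetime_delta_usec(const FILETIME *ft0, const FILETIME *ft1)
{
	return ((filetime_to_ticks(ft1) - filetime_to_ticks(ft0)) / 10);
}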
--- test/include/test/timer.h | 9 ++++++--- test/src/timer.c | 25 ++++++++++++++++++------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/test/include/test/timer.h b/test/include/test/timer.h index 9ffbaef..a7fefdf 100644 --- a/test/include/test/timer.h +++ b/test/include/test/timer.h @@ -7,9 +7,12 @@ && _POSIX_MONOTONIC_CLOCK >= 0 typedef struct { -#if JEMALLOC_CLOCK_GETTIME - struct timespec tv0; - struct timespec tv1; +#ifdef _WIN32 + FILETIME ft0; + FILETIME ft1; +#elif JEMALLOC_CLOCK_GETTIME + struct timespec ts0; + struct timespec ts1; int clock_id; #else struct timeval tv0; diff --git a/test/src/timer.c b/test/src/timer.c index 338a9ef..66b8070 100644 --- a/test/src/timer.c +++ b/test/src/timer.c @@ -4,12 +4,14 @@ void timer_start(timedelta_t *timer) { -#if JEMALLOC_CLOCK_GETTIME +#ifdef _WIN32 + GetSystemTimeAsFileTime(&timer->ft0); +#elif JEMALLOC_CLOCK_GETTIME if (sysconf(_SC_MONOTONIC_CLOCK) <= 0) timer->clock_id = CLOCK_REALTIME; else timer->clock_id = CLOCK_MONOTONIC; - clock_gettime(timer->clock_id, &timer->tv0); + clock_gettime(timer->clock_id, &timer->ts0); #else gettimeofday(&timer->tv0, NULL); #endif @@ -19,8 +21,10 @@ void timer_stop(timedelta_t *timer) { -#if JEMALLOC_CLOCK_GETTIME - clock_gettime(timer->clock_id, &timer->tv1); +#ifdef _WIN32 + GetSystemTimeAsFileTime(&timer->ft0); +#elif JEMALLOC_CLOCK_GETTIME + clock_gettime(timer->clock_id, &timer->ts1); #else gettimeofday(&timer->tv1, NULL); #endif @@ -30,9 +34,16 @@ uint64_t timer_usec(const timedelta_t *timer) { -#if JEMALLOC_CLOCK_GETTIME - return (((timer->tv1.tv_sec - timer->tv0.tv_sec) * 1000000) + - (timer->tv1.tv_nsec - timer->tv0.tv_nsec) / 1000); +#ifdef _WIN32 + uint64_t t0, t1; + t0 = (((uint64_t)timer->ft0.dwHighDateTime) << 32) | + timer->ft0.dwLowDateTime; + t1 = (((uint64_t)timer->ft1.dwHighDateTime) << 32) | + timer->ft1.dwLowDateTime; + return ((t1 - t0) / 10); +#elif JEMALLOC_CLOCK_GETTIME + return (((timer->ts1.tv_sec - timer->ts0.tv_sec) * 1000000) + + (timer->ts1.tv_nsec - timer->ts0.tv_nsec) / 1000); #else return (((timer->tv1.tv_sec - timer->tv0.tv_sec) * 1000000) + timer->tv1.tv_usec - timer->tv0.tv_usec); -- cgit v0.12 From aa2826621e1793db9faea31e803690ccbe36f14c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 15 Jul 2015 16:02:21 -0700 Subject: Revert to first-best-fit run/chunk allocation. This effectively reverts 97c04a93838c4001688fe31bf018972b4696efe2 (Use first-fit rather than first-best-fit run/chunk allocation.). In some pathological cases, first-fit search dominates allocation time, and it also tends not to converge as readily on a steady state of memory layout, since precise allocation order has a bigger effect than for first-best-fit. --- include/jemalloc/internal/arena.h | 2 +- src/arena.c | 59 +++++++++++---------------------------- src/chunk.c | 44 ++++++----------------------- 3 files changed, 27 insertions(+), 78 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 9990e45..8811f2e 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -332,7 +332,7 @@ struct arena_s { /* * Size/address-ordered tree of this arena's available runs. The tree - * is used for first-fit run allocation. + * is used for first-best-fit run allocation. 
*/ arena_avail_tree_t runs_avail; diff --git a/src/arena.c b/src/arena.c index a8fae11..65aad20 100644 --- a/src/arena.c +++ b/src/arena.c @@ -979,53 +979,28 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, return (err); } -/* Do first-fit run selection. */ +/* + * Do first-best-fit run selection, i.e. select the lowest run that best fits. + * Run sizes are quantized, so not all candidate runs are necessarily exactly + * the same size. + */ static arena_run_t * -arena_run_first_fit(arena_t *arena, size_t size) -{ - arena_run_t *run; - size_t search_size, max_size; - - assert(size == s2u(size)); - assert(size == PAGE_CEILING(size)); - - /* - * Iterate over all size classes that are at least large enough to - * satisfy the request, search for the lowest run of each size class, - * and choose the lowest of the runs found. - */ - run = NULL; - for (search_size = run_quantize_first(size), max_size = - run_quantize(arena_maxclass + large_pad); search_size <= max_size; - search_size = run_quantize_next(search_size)) { - arena_run_t *currun; - arena_chunk_t *currun_chunk; - size_t currun_pageind, currun_size; - arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *) - (search_size | CHUNK_MAP_KEY); - arena_chunk_map_misc_t *miscelm = - arena_avail_tree_nsearch(&arena->runs_avail, key); - if (miscelm == NULL) - break; - currun = &miscelm->run; - if (run == NULL || (uintptr_t)currun < (uintptr_t)run) - run = currun; - /* Skip iteration(s) if run is larger than the search size. */ - currun_chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(currun); - currun_pageind = arena_miscelm_to_pageind(miscelm); - currun_size = arena_mapbits_unallocated_size_get(currun_chunk, - currun_pageind); - assert(currun_size >= search_size); - search_size = currun_size; - } - - return (run); +arena_run_first_best_fit(arena_t *arena, size_t size) +{ + size_t search_size = run_quantize_first(size); + arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *) + (search_size | CHUNK_MAP_KEY); + arena_chunk_map_misc_t *miscelm = + arena_avail_tree_nsearch(&arena->runs_avail, key); + if (miscelm == NULL) + return (NULL); + return (&miscelm->run); } static arena_run_t * arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) { - arena_run_t *run = arena_run_first_fit(arena, s2u(size)); + arena_run_t *run = arena_run_first_best_fit(arena, s2u(size)); if (run != NULL) arena_run_split_large(arena, run, size, zero); return (run); @@ -1066,7 +1041,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) static arena_run_t * arena_run_alloc_small_helper(arena_t *arena, size_t size, index_t binind) { - arena_run_t *run = arena_run_first_fit(arena, size); + arena_run_t *run = arena_run_first_best_fit(arena, size); if (run != NULL) arena_run_split_small(arena, run, size, binind); return (run); diff --git a/src/chunk.c b/src/chunk.c index 9a7bd45..5945482 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -62,46 +62,20 @@ chunk_deregister(const void *chunk, const extent_node_t *node) } } -/* Do first-fit chunk selection. */ +/* + * Do first-best-fit chunk selection, i.e. select the lowest chunk that best + * fits. 
+ */ static extent_node_t * -chunk_first_fit(arena_t *arena, extent_tree_t *chunks_szad, +chunk_first_best_fit(arena_t *arena, extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size) { - extent_node_t *node; - index_t index; + extent_node_t key; assert(size == CHUNK_CEILING(size)); - if (size == chunksize) { - /* - * Any chunk will suffice, so simply select the one lowest in - * memory. - */ - return (extent_tree_ad_first(chunks_ad)); - } - - /* - * Iterate over all size classes that are at least large enough to - * satisfy the request, search for the lowest chunk of each size class, - * and choose the lowest of the chunks found. - */ - node = NULL; - for (index = size2index(size); index < NSIZES;) { - extent_node_t *curnode; - extent_node_t key; - extent_node_init(&key, arena, NULL, - CHUNK_CEILING(index2size(index)), false); - curnode = extent_tree_szad_nsearch(chunks_szad, &key); - if (curnode == NULL) - break; - if (node == NULL || (uintptr_t)extent_node_addr_get(curnode) < - (uintptr_t)extent_node_addr_get(node)) - node = curnode; - assert(size2index(extent_node_size_get(curnode)) + 1 > index); - index = size2index(extent_node_size_get(curnode)) + 1; - } - - return (node); + extent_node_init(&key, arena, NULL, size, false); + return (extent_tree_szad_nsearch(chunks_szad, &key)); } static void * @@ -127,7 +101,7 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, extent_node_init(&key, arena, new_addr, alloc_size, false); node = extent_tree_ad_search(chunks_ad, &key); } else { - node = chunk_first_fit(arena, chunks_szad, chunks_ad, + node = chunk_first_best_fit(arena, chunks_szad, chunks_ad, alloc_size); } if (node == NULL || (new_addr != NULL && extent_node_size_get(node) < -- cgit v0.12 From 5bd879646cdb303045096fc2b9d75f565313afa8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 15 Jul 2015 17:15:26 -0700 Subject: Change default chunk size from 256 KiB to 2 MiB. This change improves interaction with transparent huge pages, e.g. reduced page faults (at least in the absence of unused dirty page purging). --- doc/jemalloc.xml.in | 2 +- include/jemalloc/internal/chunk.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index bb15ae4..eb677ad 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -907,7 +907,7 @@ for (i = 0; i < nbins; i++) { Virtual memory chunk size (log base 2). If a chunk size outside the supported size range is specified, the size is silently clipped to the minimum/maximum supported size. The default - chunk size is 256 KiB (2^18). + chunk size is 2 MiB (2^21). diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index c253cdc..91aefad 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -5,7 +5,7 @@ * Size and alignment of memory chunks that are allocated by the OS's virtual * memory system. */ -#define LG_CHUNK_DEFAULT 18 +#define LG_CHUNK_DEFAULT 21 /* Return the chunk address for allocation address a. 
*/ #define CHUNK_ADDR2BASE(a) \ -- cgit v0.12 From 37fd1115c38accc319a82f17c0e9262091844cac Mon Sep 17 00:00:00 2001 From: Dave Rigby Date: Thu, 16 Jul 2015 11:36:00 +0100 Subject: Remove extraneous ';' on closing 'extern "C"' Fixes warning with newer GCCs: include/jemalloc/jemalloc.h:229:2: warning: extra ';' [-Wpedantic] }; ^ --- include/jemalloc/jemalloc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/jemalloc.sh b/include/jemalloc/jemalloc.sh index 7e1c8be..c085814 100755 --- a/include/jemalloc/jemalloc.sh +++ b/include/jemalloc/jemalloc.sh @@ -22,7 +22,7 @@ done cat < Date: Fri, 17 Jul 2015 16:38:25 -0700 Subject: Add the config.cache_oblivious mallctl. --- ChangeLog | 3 ++- doc/jemalloc.xml.in | 10 ++++++++++ src/ctl.c | 3 +++ test/unit/mallctl.c | 1 + 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 950c656..fe62e52 100644 --- a/ChangeLog +++ b/ChangeLog @@ -102,7 +102,8 @@ brevity. Much more detail can be found in the git revision history: cache set distribution. - Randomly distribute large allocation base pointer alignment relative to page boundaries in order to more uniformly utilize CPU cache sets. This can be - disabled via the --disable-cache-oblivious configure option. + disabled via the --disable-cache-oblivious configure option, and queried via + the "config.cache_oblivious" mallctl. - Micro-optimize the fast paths for the public API functions. - Refactor thread-specific data to reside in a single structure. This assures that only a single TLS read is necessary per call into the public API. diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index eb677ad..dbbe837 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -731,6 +731,16 @@ for (i = 0; i < nbins; i++) { detecting whether another thread caused a refresh. + + + config.cache_oblivious + (bool) + r- + + was specified + during build configuration. + + config.debug diff --git a/src/ctl.c b/src/ctl.c index d215b19..1988aee 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -73,6 +73,7 @@ CTL_PROTO(thread_allocated) CTL_PROTO(thread_allocatedp) CTL_PROTO(thread_deallocated) CTL_PROTO(thread_deallocatedp) +CTL_PROTO(config_cache_oblivious) CTL_PROTO(config_debug) CTL_PROTO(config_fill) CTL_PROTO(config_lazy_lock) @@ -238,6 +239,7 @@ static const ctl_named_node_t thread_node[] = { }; static const ctl_named_node_t config_node[] = { + {NAME("cache_oblivious"), CTL(config_cache_oblivious)}, {NAME("debug"), CTL(config_debug)}, {NAME("fill"), CTL(config_fill)}, {NAME("lazy_lock"), CTL(config_lazy_lock)}, @@ -1247,6 +1249,7 @@ label_return: /******************************************************************************/ +CTL_RO_BOOL_CONFIG_GEN(config_cache_oblivious) CTL_RO_BOOL_CONFIG_GEN(config_debug) CTL_RO_BOOL_CONFIG_GEN(config_fill) CTL_RO_BOOL_CONFIG_GEN(config_lazy_lock) diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 29823a6..31e354c 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -126,6 +126,7 @@ TEST_BEGIN(test_mallctl_config) assert_zu_eq(sz, sizeof(oldval), "Unexpected output size"); \ } while (0) + TEST_MALLCTL_CONFIG(cache_oblivious); TEST_MALLCTL_CONFIG(debug); TEST_MALLCTL_CONFIG(fill); TEST_MALLCTL_CONFIG(lazy_lock); -- cgit v0.12 From 218b15cc299ccb8114e52df3eb0f7a9dc810a4b1 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 17 Jul 2015 18:12:44 -0700 Subject: Fix more MinGW build warnings. 
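The diff below swaps the C99 "%zu"/"%zd" length modifiers for PRIzu-style macros because MinGW builds route printf through the Microsoft C runtime, which does not understand "z". The macro names come from the diff; the definitions below are only a plausible sketch of how such macros map to a modifier each runtime accepts (the real definitions live elsewhere in the tree):

#include <stdio.h>
#include <stddef.h>

#ifdef _WIN32
#  define PRIzu	"Iu"	/* MSVC runtime spelling for size_t. */
#  define PRIzd	"Id"
#else
#  define PRIzu	"zu"	/* C99 spelling. */
#  define PRIzd	"zd"
#endif

int
main(void)
{
	size_t n = 42;

	/* Adjacent string literals splice the modifier into the format. */
	printf("count=%"PRIzu"\n", n);
	return (0);
}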
--- test/include/test/test.h | 24 ++++++++++++------------ test/unit/atomic.c | 2 +- test/unit/ckh.c | 37 ++++++++++++++++++++----------------- test/unit/junk.c | 26 +++++++++++++------------- 4 files changed, 46 insertions(+), 43 deletions(-) diff --git a/test/include/test/test.h b/test/include/test/test.h index f55bafc..3aa0835 100644 --- a/test/include/test/test.h +++ b/test/include/test/test.h @@ -133,30 +133,30 @@ <=, "ju", __VA_ARGS__) #define assert_zd_eq(a, b, ...) assert_cmp(ssize_t, a, b, ==, \ - !=, "zd", __VA_ARGS__) + !=, PRIzd, __VA_ARGS__) #define assert_zd_ne(a, b, ...) assert_cmp(ssize_t, a, b, !=, \ - ==, "zd", __VA_ARGS__) + ==, PRIzd, __VA_ARGS__) #define assert_zd_lt(a, b, ...) assert_cmp(ssize_t, a, b, <, \ - >=, "zd", __VA_ARGS__) + >=, PRIzd, __VA_ARGS__) #define assert_zd_le(a, b, ...) assert_cmp(ssize_t, a, b, <=, \ - >, "zd", __VA_ARGS__) + >, PRIzd, __VA_ARGS__) #define assert_zd_ge(a, b, ...) assert_cmp(ssize_t, a, b, >=, \ - <, "zd", __VA_ARGS__) + <, PRIzd, __VA_ARGS__) #define assert_zd_gt(a, b, ...) assert_cmp(ssize_t, a, b, >, \ - <=, "zd", __VA_ARGS__) + <=, PRIzd, __VA_ARGS__) #define assert_zu_eq(a, b, ...) assert_cmp(size_t, a, b, ==, \ - !=, "zu", __VA_ARGS__) + !=, PRIzu, __VA_ARGS__) #define assert_zu_ne(a, b, ...) assert_cmp(size_t, a, b, !=, \ - ==, "zu", __VA_ARGS__) + ==, PRIzu, __VA_ARGS__) #define assert_zu_lt(a, b, ...) assert_cmp(size_t, a, b, <, \ - >=, "zu", __VA_ARGS__) + >=, PRIzu, __VA_ARGS__) #define assert_zu_le(a, b, ...) assert_cmp(size_t, a, b, <=, \ - >, "zu", __VA_ARGS__) + >, PRIzu, __VA_ARGS__) #define assert_zu_ge(a, b, ...) assert_cmp(size_t, a, b, >=, \ - <, "zu", __VA_ARGS__) + <, PRIzu, __VA_ARGS__) #define assert_zu_gt(a, b, ...) assert_cmp(size_t, a, b, >, \ - <=, "zu", __VA_ARGS__) + <=, PRIzu, __VA_ARGS__) #define assert_d32_eq(a, b, ...) assert_cmp(int32_t, a, b, ==, \ !=, PRId32, __VA_ARGS__) diff --git a/test/unit/atomic.c b/test/unit/atomic.c index a774836..9217ca9 100644 --- a/test/unit/atomic.c +++ b/test/unit/atomic.c @@ -97,7 +97,7 @@ TEST_STRUCT(z, size_t) TEST_BEGIN(test_atomic_z) { - TEST_BODY(z, size_t, size_t, zu, "#zx"); + TEST_BODY(z, size_t, size_t, zu, "#"PRIzx); } TEST_END diff --git a/test/unit/ckh.c b/test/unit/ckh.c index c212648..1f22baf 100644 --- a/test/unit/ckh.c +++ b/test/unit/ckh.c @@ -35,15 +35,15 @@ TEST_BEGIN(test_count_insert_search_remove) assert_false(ckh_new(tsd, &ckh, 2, ckh_string_hash, ckh_string_keycomp), "Unexpected ckh_new() error"); assert_zu_eq(ckh_count(&ckh), 0, - "ckh_count() should return %zu, but it returned %zu", ZU(0), + "ckh_count() should return %"PRIzu", but it returned %"PRIzu, ZU(0), ckh_count(&ckh)); /* Insert. */ for (i = 0; i < sizeof(strs)/sizeof(const char *); i++) { ckh_insert(tsd, &ckh, strs[i], strs[i]); assert_zu_eq(ckh_count(&ckh), i+1, - "ckh_count() should return %zu, but it returned %zu", i+1, - ckh_count(&ckh)); + "ckh_count() should return %"PRIzu", but it returned " + "%"PRIzu, i+1, ckh_count(&ckh)); } /* Search. */ @@ -65,9 +65,9 @@ TEST_BEGIN(test_count_insert_search_remove) ks = (i & 1) ? strs[i] : (const char *)NULL; vs = (i & 2) ? strs[i] : (const char *)NULL; assert_ptr_eq((void *)ks, (void *)k.s, - "Key mismatch, i=%zu", i); + "Key mismatch, i=%"PRIzu, i); assert_ptr_eq((void *)vs, (void *)v.s, - "Value mismatch, i=%zu", i); + "Value mismatch, i=%"PRIzu, i); } assert_true(ckh_search(&ckh, missing, NULL, NULL), "Unexpected ckh_search() success"); @@ -91,13 +91,13 @@ TEST_BEGIN(test_count_insert_search_remove) ks = (i & 1) ? 
strs[i] : (const char *)NULL; vs = (i & 2) ? strs[i] : (const char *)NULL; assert_ptr_eq((void *)ks, (void *)k.s, - "Key mismatch, i=%zu", i); + "Key mismatch, i=%"PRIzu, i); assert_ptr_eq((void *)vs, (void *)v.s, - "Value mismatch, i=%zu", i); + "Value mismatch, i=%"PRIzu, i); assert_zu_eq(ckh_count(&ckh), sizeof(strs)/sizeof(const char *) - i - 1, - "ckh_count() should return %zu, but it returned %zu", - sizeof(strs)/sizeof(const char *) - i - 1, + "ckh_count() should return %"PRIzu", but it returned " + "%"PRIzu, sizeof(strs)/sizeof(const char *) - i - 1, ckh_count(&ckh)); } @@ -137,8 +137,8 @@ TEST_BEGIN(test_insert_iter_remove) } assert_zu_eq(ckh_count(&ckh), NITEMS, - "ckh_count() should return %zu, but it returned %zu", - NITEMS, ckh_count(&ckh)); + "ckh_count() should return %"PRIzu", but it returned " + "%"PRIzu, NITEMS, ckh_count(&ckh)); for (j = i + 1; j < NITEMS; j++) { assert_false(ckh_search(&ckh, p[j], NULL, NULL), @@ -167,17 +167,20 @@ TEST_BEGIN(test_insert_iter_remove) for (k = 0; k < NITEMS; k++) { if (p[k] == q) { assert_false(seen[k], - "Item %zu already seen", k); + "Item %"PRIzu" already " + "seen", k); seen[k] = true; break; } } } - for (j = 0; j < i + 1; j++) - assert_true(seen[j], "Item %zu not seen", j); + for (j = 0; j < i + 1; j++) { + assert_true(seen[j], "Item %"PRIzu" not seen", + j); + } for (; j < NITEMS; j++) - assert_false(seen[j], "Item %zu seen", j); + assert_false(seen[j], "Item %"PRIzu" seen", j); } } @@ -196,8 +199,8 @@ TEST_BEGIN(test_insert_iter_remove) } assert_zu_eq(ckh_count(&ckh), 0, - "ckh_count() should return %zu, but it returned %zu", ZU(0), - ckh_count(&ckh)); + "ckh_count() should return %"PRIzu", but it returned %"PRIzu, + ZU(0), ckh_count(&ckh)); ckh_delete(tsd, &ckh); #undef NITEMS } diff --git a/test/unit/junk.c b/test/unit/junk.c index 733f661..8499d06 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -30,8 +30,8 @@ arena_dalloc_junk_small_intercept(void *ptr, arena_bin_info_t *bin_info) arena_dalloc_junk_small_orig(ptr, bin_info); for (i = 0; i < bin_info->reg_size; i++) { assert_c_eq(((char *)ptr)[i], 0x5a, - "Missing junk fill for byte %zu/%zu of deallocated region", - i, bin_info->reg_size); + "Missing junk fill for byte %"PRIzu"/%"PRIzu" of " + "deallocated region", i, bin_info->reg_size); } if (ptr == watch_for_junking) saw_junking = true; @@ -45,8 +45,8 @@ arena_dalloc_junk_large_intercept(void *ptr, size_t usize) arena_dalloc_junk_large_orig(ptr, usize); for (i = 0; i < usize; i++) { assert_c_eq(((char *)ptr)[i], 0x5a, - "Missing junk fill for byte %zu/%zu of deallocated region", - i, usize); + "Missing junk fill for byte %"PRIzu"/%"PRIzu" of " + "deallocated region", i, usize); } if (ptr == watch_for_junking) saw_junking = true; @@ -89,18 +89,18 @@ test_junk(size_t sz_min, size_t sz_max) sz_prev = sz, sz = sallocx(s, 0)) { if (sz_prev > 0) { assert_c_eq(s[0], 'a', - "Previously allocated byte %zu/%zu is corrupted", - ZU(0), sz_prev); + "Previously allocated byte %"PRIzu"/%"PRIzu" is " + "corrupted", ZU(0), sz_prev); assert_c_eq(s[sz_prev-1], 'a', - "Previously allocated byte %zu/%zu is corrupted", - sz_prev-1, sz_prev); + "Previously allocated byte %"PRIzu"/%"PRIzu" is " + "corrupted", sz_prev-1, sz_prev); } for (i = sz_prev; i < sz; i++) { if (opt_junk_alloc) { assert_c_eq(s[i], 0xa5, - "Newly allocated byte %zu/%zu isn't " - "junk-filled", i, sz); + "Newly allocated byte %"PRIzu"/%"PRIzu + " isn't junk-filled", i, sz); } s[i] = 'a'; } @@ -111,15 +111,15 @@ test_junk(size_t sz_min, size_t sz_max) 
assert_ptr_not_null((void *)s, "Unexpected rallocx() failure"); assert_true(!opt_junk_free || saw_junking, - "Expected region of size %zu to be junk-filled", - sz); + "Expected region of size %"PRIzu" to be " + "junk-filled", sz); } } watch_junking(s); dallocx(s, 0); assert_true(!opt_junk_free || saw_junking, - "Expected region of size %zu to be junk-filled", sz); + "Expected region of size %"PRIzu" to be junk-filled", sz); if (opt_junk_free) { arena_dalloc_junk_small = arena_dalloc_junk_small_orig; -- cgit v0.12 From 50cd636eedfdc14d68f3917055fe2cc3fc72e853 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Tue, 21 Jul 2015 09:13:57 +0900 Subject: Remove JEMALLOC_ALLOC_SIZE annotations on functions not returning pointers As per gcc documentation: The alloc_size attribute is used to tell the compiler that the function return value points to memory (...) This resolves #245. --- include/jemalloc/jemalloc_protos.h.in | 4 ++-- src/jemalloc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index e77bd28..ef4b35e 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -13,7 +13,7 @@ JEMALLOC_EXPORT void *@je_@calloc(size_t num, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) JEMALLOC_NOTHROW; JEMALLOC_EXPORT int @je_@posix_memalign(void **memptr, size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)) - JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; + JEMALLOC_NOTHROW; JEMALLOC_EXPORT void *@je_@aligned_alloc(size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; @@ -27,7 +27,7 @@ JEMALLOC_EXPORT void *@je_@mallocx(size_t size, int flags) JEMALLOC_EXPORT void *@je_@rallocx(void *ptr, size_t size, int flags) JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; JEMALLOC_EXPORT size_t @je_@xallocx(void *ptr, size_t size, size_t extra, - int flags) JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; + int flags) JEMALLOC_NOTHROW; JEMALLOC_EXPORT size_t @je_@sallocx(const void *ptr, int flags) JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW; JEMALLOC_EXPORT void @je_@dallocx(void *ptr, int flags) JEMALLOC_NOTHROW; diff --git a/src/jemalloc.c b/src/jemalloc.c index fc223da..cf6b78f 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1531,7 +1531,7 @@ label_oom: } JEMALLOC_EXPORT int -JEMALLOC_ATTR(nonnull(1)) JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW +JEMALLOC_ATTR(nonnull(1)) JEMALLOC_NOTHROW je_posix_memalign(void **memptr, size_t alignment, size_t size) { int ret = imemalign(memptr, alignment, size, sizeof(void *)); @@ -2277,7 +2277,7 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, } JEMALLOC_EXPORT size_t -JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW +JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, size_t extra, int flags) { tsd_t *tsd; -- cgit v0.12 From 00632609dfdd28e8de5afdd3e838f3975566e5d9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 21 Jul 2015 08:10:38 -0700 Subject: Move JEMALLOC_NOTHROW just after return type. Only use __declspec(nothrow) in C++ mode. This resolves #244. 
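As background for the alloc_size cleanup above: gcc documents alloc_size as describing the object addressed by the returned pointer, so it carries no meaning on functions such as posix_memalign() or xallocx() whose return value is an error code or a size. A sketch of the distinction with generic names (not jemalloc prototypes):

#include <stddef.h>

/*
 * Meaningful: the returned pointer addresses size (or num*size) bytes, which
 * lets the compiler feed __builtin_object_size() and related diagnostics at
 * call sites.
 */
void	*xmalloc(size_t size)
	    __attribute__((malloc, alloc_size(1)));
void	*xcalloc(size_t num, size_t size)
	    __attribute__((malloc, alloc_size(1, 2)));

/*
 * Not meaningful: the allocation is returned through *memptr and the return
 * value is an error code, so the attribute is dropped from prototypes of this
 * shape.
 */
int	 xmemalign(void **memptr, size_t alignment, size_t size);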
--- include/jemalloc/jemalloc_macros.h.in | 6 ++- include/jemalloc/jemalloc_protos.h.in | 74 +++++++++++++++++------------------ src/jemalloc.c | 63 +++++++++++++---------------- 3 files changed, 69 insertions(+), 74 deletions(-) diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 6ba8f9a..0d827d0 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -65,7 +65,11 @@ # define JEMALLOC_ALLOC_SIZE(s) # define JEMALLOC_ALLOC_SIZE2(s1, s2) # define JEMALLOC_NOINLINE __declspec(noinline) -# define JEMALLOC_NOTHROW __declspec(nothrow) +# ifdef __cplusplus +# define JEMALLOC_NOTHROW __declspec(nothrow) +# else +# define JEMALLOC_NOTHROW +# endif # define JEMALLOC_SECTION(s) __declspec(allocate(s)) #else # define JEMALLOC_ATTR(s) diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index ef4b35e..d37ce05 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -7,45 +7,45 @@ extern JEMALLOC_EXPORT const char *@je_@malloc_conf; extern JEMALLOC_EXPORT void (*@je_@malloc_message)(void *cbopaque, const char *s); -JEMALLOC_EXPORT void *@je_@malloc(size_t size) JEMALLOC_CXX_THROW - JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT void *@je_@calloc(size_t num, size_t size) JEMALLOC_CXX_THROW - JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT int @je_@posix_memalign(void **memptr, size_t alignment, - size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)) - JEMALLOC_NOTHROW; -JEMALLOC_EXPORT void *@je_@aligned_alloc(size_t alignment, size_t size) - JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) - JEMALLOC_NOTHROW; -JEMALLOC_EXPORT void *@je_@realloc(void *ptr, size_t size) JEMALLOC_CXX_THROW - JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT void @je_@free(void *ptr) JEMALLOC_CXX_THROW - JEMALLOC_NOTHROW; +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW @je_@posix_memalign(void **memptr, + size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@realloc(void *ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW @je_@free(void *ptr) + JEMALLOC_CXX_THROW; -JEMALLOC_EXPORT void *@je_@mallocx(size_t size, int flags) - JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT void *@je_@rallocx(void *ptr, size_t size, int flags) - JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT size_t @je_@xallocx(void *ptr, size_t size, size_t extra, - int flags) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT size_t @je_@sallocx(const void *ptr, int flags) - JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT void @je_@dallocx(void *ptr, int flags) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT void @je_@sdallocx(void *ptr, size_t size, int flags) - JEMALLOC_NOTHROW; -JEMALLOC_EXPORT size_t @je_@nallocx(size_t size, int flags) - JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW; +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@mallocx(size_t 
size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@rallocx(void *ptr, size_t size, + int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW @je_@xallocx(void *ptr, size_t size, + size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW @je_@sallocx(const void *ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW @je_@dallocx(void *ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW @je_@sdallocx(void *ptr, size_t size, + int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW @je_@nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); -JEMALLOC_EXPORT int @je_@mallctl(const char *name, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT int @je_@mallctlnametomib(const char *name, size_t *mibp, - size_t *miblenp) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT int @je_@mallctlbymib(const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT void @je_@malloc_stats_print(void (*write_cb)(void *, - const char *), void *@je_@cbopaque, const char *opts) JEMALLOC_NOTHROW; -JEMALLOC_EXPORT size_t @je_@malloc_usable_size( - JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW JEMALLOC_NOTHROW; +JEMALLOC_EXPORT int JEMALLOC_NOTHROW @je_@mallctl(const char *name, + void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW @je_@mallctlnametomib(const char *name, + size_t *mibp, size_t *miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW @je_@mallctlbymib(const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW @je_@malloc_stats_print( + void (*write_cb)(void *, const char *), void *@je_@cbopaque, + const char *opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW @je_@malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; #ifdef JEMALLOC_OVERRIDE_MEMALIGN JEMALLOC_EXPORT void *@je_@memalign(size_t alignment, size_t size) diff --git a/src/jemalloc.c b/src/jemalloc.c index cf6b78f..1d02318 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1395,8 +1395,8 @@ imalloc_body(size_t size, tsd_t **tsd, size_t *usize) return (imalloc(*tsd, size)); } -JEMALLOC_EXPORT void * -JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) JEMALLOC_NOTHROW +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) je_malloc(size_t size) { void *ret; @@ -1530,8 +1530,8 @@ label_oom: goto label_return; } -JEMALLOC_EXPORT int -JEMALLOC_ATTR(nonnull(1)) JEMALLOC_NOTHROW +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +JEMALLOC_ATTR(nonnull(1)) je_posix_memalign(void **memptr, size_t alignment, size_t size) { int ret = imemalign(memptr, alignment, size, sizeof(void *)); @@ -1540,8 +1540,8 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) return (ret); } -JEMALLOC_EXPORT void * -JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) je_aligned_alloc(size_t alignment, size_t size) { void *ret; @@ -1594,8 +1594,8 @@ icalloc_prof(tsd_t *tsd, size_t usize) return (p); } -JEMALLOC_EXPORT void * -JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) JEMALLOC_NOTHROW +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) je_calloc(size_t num, size_t size) { void *ret; @@ -1739,8 +1739,8 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache) 
JEMALLOC_VALGRIND_FREE(ptr, rzsize); } -JEMALLOC_EXPORT void * -JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_ALLOC_SIZE(2) je_realloc(void *ptr, size_t size) { void *ret; @@ -1803,8 +1803,7 @@ je_realloc(void *ptr, size_t size) return (ret); } -JEMALLOC_EXPORT void -JEMALLOC_NOTHROW +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free(void *ptr) { @@ -2032,8 +2031,8 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) return (p); } -JEMALLOC_EXPORT void * -JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) JEMALLOC_NOTHROW +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) je_mallocx(size_t size, int flags) { tsd_t *tsd; @@ -2130,8 +2129,8 @@ irallocx_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, return (p); } -JEMALLOC_EXPORT void * -JEMALLOC_ALLOC_SIZE(2) JEMALLOC_NOTHROW +JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_ALLOC_SIZE(2) je_rallocx(void *ptr, size_t size, int flags) { void *p; @@ -2276,8 +2275,7 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, return (usize); } -JEMALLOC_EXPORT size_t -JEMALLOC_NOTHROW +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, size_t extra, int flags) { tsd_t *tsd; @@ -2318,8 +2316,8 @@ label_not_resized: return (usize); } -JEMALLOC_EXPORT size_t -JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +JEMALLOC_ATTR(pure) je_sallocx(const void *ptr, int flags) { size_t usize; @@ -2335,8 +2333,7 @@ je_sallocx(const void *ptr, int flags) return (usize); } -JEMALLOC_EXPORT void -JEMALLOC_NOTHROW +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags) { tsd_t *tsd; @@ -2371,8 +2368,7 @@ inallocx(size_t size, int flags) return (usize); } -JEMALLOC_EXPORT void -JEMALLOC_NOTHROW +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, int flags) { tsd_t *tsd; @@ -2397,8 +2393,8 @@ je_sdallocx(void *ptr, size_t size, int flags) isfree(tsd, ptr, usize, tcache); } -JEMALLOC_EXPORT size_t -JEMALLOC_ATTR(pure) JEMALLOC_NOTHROW +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +JEMALLOC_ATTR(pure) je_nallocx(size_t size, int flags) { @@ -2410,8 +2406,7 @@ je_nallocx(size_t size, int flags) return (inallocx(size, flags)); } -JEMALLOC_EXPORT int -JEMALLOC_NOTHROW +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { @@ -2422,8 +2417,7 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, return (ctl_byname(name, oldp, oldlenp, newp, newlen)); } -JEMALLOC_EXPORT int -JEMALLOC_NOTHROW +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) { @@ -2433,8 +2427,7 @@ je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) return (ctl_nametomib(name, mibp, miblenp)); } -JEMALLOC_EXPORT int -JEMALLOC_NOTHROW +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { @@ -2445,8 +2438,7 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen)); } -JEMALLOC_EXPORT void -JEMALLOC_NOTHROW +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, const char *opts) { @@ -2454,8 +2446,7 @@ je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, stats_print(write_cb, cbopaque, opts); } 
-JEMALLOC_EXPORT size_t -JEMALLOC_NOTHROW +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) { size_t ret; -- cgit v0.12 From 1b0e4abbfdbcc1c1a71d1f617adb19951109bfce Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 21 Jul 2015 16:45:35 -0700 Subject: Port mq_get() to MinGW. --- Makefile.in | 6 +++--- test/include/test/mq.h | 19 +++++++++---------- test/src/mq.c | 27 +++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 13 deletions(-) create mode 100644 test/src/mq.c diff --git a/Makefile.in b/Makefile.in index 02f4424..25c2d5a 100644 --- a/Makefile.in +++ b/Makefile.in @@ -111,9 +111,9 @@ DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.3) DOCS := $(DOCS_HTML) $(DOCS_MAN3) C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \ $(srcroot)test/src/btalloc_1.c $(srcroot)test/src/math.c \ - $(srcroot)test/src/mtx.c $(srcroot)test/src/SFMT.c \ - $(srcroot)test/src/test.c $(srcroot)test/src/thd.c \ - $(srcroot)test/src/timer.c + $(srcroot)test/src/mtx.c $(srcroot)test/src/mq.c \ + $(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \ + $(srcroot)test/src/thd.c $(srcroot)test/src/timer.c C_UTIL_INTEGRATION_SRCS := $(srcroot)src/util.c TESTS_UNIT := $(srcroot)test/unit/atomic.c \ $(srcroot)test/unit/bitmap.c \ diff --git a/test/include/test/mq.h b/test/include/test/mq.h index 1118865..7c4df49 100644 --- a/test/include/test/mq.h +++ b/test/include/test/mq.h @@ -1,3 +1,5 @@ +void mq_nanosleep(unsigned ns); + /* * Simple templated message queue implementation that relies on only mutexes for * synchronization (which reduces portability issues). Given the following @@ -75,26 +77,23 @@ a_attr a_mq_msg_type * \ a_prefix##get(a_mq_type *mq) \ { \ a_mq_msg_type *msg; \ - struct timespec timeout; \ + unsigned ns; \ \ msg = a_prefix##tryget(mq); \ if (msg != NULL) \ return (msg); \ \ - timeout.tv_sec = 0; \ - timeout.tv_nsec = 1; \ + ns = 1; \ while (true) { \ - nanosleep(&timeout, NULL); \ + mq_nanosleep(ns); \ msg = a_prefix##tryget(mq); \ if (msg != NULL) \ return (msg); \ - if (timeout.tv_sec == 0) { \ + if (ns < 1000*1000*1000) { \ /* Double sleep time, up to max 1 second. */ \ - timeout.tv_nsec <<= 1; \ - if (timeout.tv_nsec >= 1000*1000*1000) { \ - timeout.tv_sec = 1; \ - timeout.tv_nsec = 0; \ - } \ + ns <<= 1; \ + if (ns > 1000*1000*1000) \ + ns = 1000*1000*1000; \ } \ } \ } \ diff --git a/test/src/mq.c b/test/src/mq.c new file mode 100644 index 0000000..d337724 --- /dev/null +++ b/test/src/mq.c @@ -0,0 +1,27 @@ +#include "test/jemalloc_test.h" + +/* + * Sleep for approximately ns nanoseconds. No lower *nor* upper bound on sleep + * time is guaranteed. + */ +void +mq_nanosleep(unsigned ns) +{ + + assert(ns <= 1000*1000*1000); + +#ifdef _WIN32 + Sleep(ns / 1000); +#else + struct timespec timeout; + + if (ns < 1000*1000*1000) { + timeout.tv_sec = 0; + timeout.tv_nsec = ns; + } else { + timeout.tv_sec = 1; + timeout.tv_nsec = 0; + } + nanosleep(&timeout, NULL); +#endif +} -- cgit v0.12 From e42c309eba6c5084dc0abda9b211e91e2c548fdf Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 22 Jul 2015 15:44:47 -0700 Subject: Add JEMALLOC_FORMAT_PRINTF(). Replace JEMALLOC_ATTR(format(printf, ...). with JEMALLOC_FORMAT_PRINTF(), so that configuration feature tests can omit the attribute if it would cause extraneous compilation warnings. 
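A note on why the message above stresses that configure may need to omit the attribute: on MinGW targets, gcc's plain format(printf, ...) archetype checks arguments against the Microsoft runtime's printf, which rejects formats such as "%zu" that the code relies on, so the build prefers format(gnu_printf, ...) where the compiler supports it and otherwise drops the annotation. A sketch of the resulting macro shape (FORMAT_PRINTF and log_message are generic stand-ins; the JEMALLOC_HAVE_ATTR_* conditionals are the ones the configure hunks below define):

#if defined(JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF)
#  define FORMAT_PRINTF(f, a)	__attribute__((format(gnu_printf, f, a)))
#elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF)
#  define FORMAT_PRINTF(f, a)	__attribute__((format(printf, f, a)))
#else
#  define FORMAT_PRINTF(f, a)
#endif

/* Callers' format strings and arguments are now type-checked by gcc. */
void	log_message(const char *format, ...) FORMAT_PRINTF(1, 2);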
--- configure.ac | 20 ++++++++++++++++++++ include/jemalloc/internal/util.h | 7 +++---- include/jemalloc/jemalloc_defs.h.in | 6 ++++++ include/jemalloc/jemalloc_macros.h.in | 23 ++++++++++++++++------- src/prof.c | 4 ++-- src/util.c | 6 +++--- test/include/test/test.h | 4 ++-- test/src/test.c | 4 ++-- 8 files changed, 54 insertions(+), 20 deletions(-) diff --git a/configure.ac b/configure.ac index c6388f7..f7e6d08 100644 --- a/configure.ac +++ b/configure.ac @@ -438,6 +438,26 @@ CFLAGS="${SAVED_CFLAGS}" if test "x${je_cv_alloc_size}" = "xyes" ; then AC_DEFINE([JEMALLOC_HAVE_ATTR_ALLOC_SIZE], [ ]) fi +dnl Check for format(gnu_printf, ...) attribute support. +SAVED_CFLAGS="${CFLAGS}" +JE_CFLAGS_APPEND([-Werror]) +JE_COMPILABLE([format(gnu_printf, ...) attribute], [#include ], + [void *foo(const char *format, ...) __attribute__((format(gnu_printf, 1, 2)));], + [je_cv_format_gnu_printf]) +CFLAGS="${SAVED_CFLAGS}" +if test "x${je_cv_format_gnu_printf}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF], [ ]) +fi +dnl Check for format(printf, ...) attribute support. +SAVED_CFLAGS="${CFLAGS}" +JE_CFLAGS_APPEND([-Werror]) +JE_COMPILABLE([format(printf, ...) attribute], [#include ], + [void *foo(const char *format, ...) __attribute__((format(printf, 1, 2)));], + [je_cv_format_printf]) +CFLAGS="${SAVED_CFLAGS}" +if test "x${je_cv_format_printf}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_PRINTF], [ ]) +fi dnl Support optional additions to rpath. AC_ARG_WITH([rpath], diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index 001cd09..f6e271f 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -104,13 +104,12 @@ void malloc_write(const char *s); int malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap); int malloc_snprintf(char *str, size_t size, const char *format, ...) - JEMALLOC_ATTR(format(printf, 3, 4)); + JEMALLOC_FORMAT_PRINTF(3, 4); void malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque, const char *format, va_list ap); void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque, - const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4)); -void malloc_printf(const char *format, ...) - JEMALLOC_ATTR(format(printf, 1, 2)); + const char *format, ...) JEMALLOC_FORMAT_PRINTF(3, 4); +void malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/include/jemalloc/jemalloc_defs.h.in b/include/jemalloc/jemalloc_defs.h.in index 1f47c3e..ab13c37 100644 --- a/include/jemalloc/jemalloc_defs.h.in +++ b/include/jemalloc/jemalloc_defs.h.in @@ -4,6 +4,12 @@ /* Defined if alloc_size attribute is supported. */ #undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +/* Defined if format(gnu_printf, ...) attribute is supported. */ +#undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF + +/* Defined if format(printf, ...) attribute is supported. */ +#undef JEMALLOC_HAVE_ATTR_FORMAT_PRINTF + /* * Define overrides for non-standard allocator-related functions if they are * present on the system. 
diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 0d827d0..2bde6b7 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -38,9 +38,6 @@ #ifdef JEMALLOC_HAVE_ATTR # define JEMALLOC_ATTR(s) __attribute__((s)) -# ifndef JEMALLOC_EXPORT -# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) -# endif # define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) # ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE # define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) @@ -49,11 +46,24 @@ # define JEMALLOC_ALLOC_SIZE(s) # define JEMALLOC_ALLOC_SIZE2(s1, s2) # endif +# ifndef JEMALLOC_EXPORT +# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) +# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) +# else +# define JEMALLOC_FORMAT_PRINTF(s, i) +# endif # define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) # define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) # define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) #elif _MSC_VER # define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) __declspec(align(s)) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) # ifndef JEMALLOC_EXPORT # ifdef DLLEXPORT # define JEMALLOC_EXPORT __declspec(dllexport) @@ -61,9 +71,7 @@ # define JEMALLOC_EXPORT __declspec(dllimport) # endif # endif -# define JEMALLOC_ALIGNED(s) __declspec(align(s)) -# define JEMALLOC_ALLOC_SIZE(s) -# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# define JEMALLOC_FORMAT_PRINTF(s, i) # define JEMALLOC_NOINLINE __declspec(noinline) # ifdef __cplusplus # define JEMALLOC_NOTHROW __declspec(nothrow) @@ -73,10 +81,11 @@ # define JEMALLOC_SECTION(s) __declspec(allocate(s)) #else # define JEMALLOC_ATTR(s) -# define JEMALLOC_EXPORT # define JEMALLOC_ALIGNED(s) # define JEMALLOC_ALLOC_SIZE(s) # define JEMALLOC_ALLOC_SIZE2(s1, s2) +# define JEMALLOC_EXPORT +# define JEMALLOC_FORMAT_PRINTF(s, i) # define JEMALLOC_NOINLINE # define JEMALLOC_NOTHROW # define JEMALLOC_SECTION(s) diff --git a/src/prof.c b/src/prof.c index b24996a..babdbd6 100644 --- a/src/prof.c +++ b/src/prof.c @@ -1007,7 +1007,7 @@ prof_dump_write(bool propagate_err, const char *s) return (false); } -JEMALLOC_ATTR(format(printf, 2, 3)) +JEMALLOC_FORMAT_PRINTF(2, 3) static bool prof_dump_printf(bool propagate_err, const char *format, ...) { @@ -1338,7 +1338,7 @@ label_return: return (ret); } -JEMALLOC_ATTR(format(printf, 1, 2)) +JEMALLOC_FORMAT_PRINTF(1, 2) static int prof_open_maps(const char *format, ...) { diff --git a/src/util.c b/src/util.c index a6ef5d5..4cb0d6c 100644 --- a/src/util.c +++ b/src/util.c @@ -586,7 +586,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) return (ret); } -JEMALLOC_ATTR(format(printf, 3, 4)) +JEMALLOC_FORMAT_PRINTF(3, 4) int malloc_snprintf(char *str, size_t size, const char *format, ...) { @@ -625,7 +625,7 @@ malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque, * Print to a callback function in such a way as to (hopefully) avoid memory * allocation. */ -JEMALLOC_ATTR(format(printf, 3, 4)) +JEMALLOC_FORMAT_PRINTF(3, 4) void malloc_cprintf(void (*write_cb)(void *, const char *), void *cbopaque, const char *format, ...) 
@@ -638,7 +638,7 @@ malloc_cprintf(void (*write_cb)(void *, const char *), void *cbopaque, } /* Print to stderr in such a way as to avoid memory allocation. */ -JEMALLOC_ATTR(format(printf, 1, 2)) +JEMALLOC_FORMAT_PRINTF(1, 2) void malloc_printf(const char *format, ...) { diff --git a/test/include/test/test.h b/test/include/test/test.h index 3aa0835..7a163ac 100644 --- a/test/include/test/test.h +++ b/test/include/test/test.h @@ -319,8 +319,8 @@ label_test_end: \ } \ } while (0) -void test_skip(const char *format, ...) JEMALLOC_ATTR(format(printf, 1, 2)); -void test_fail(const char *format, ...) JEMALLOC_ATTR(format(printf, 1, 2)); +void test_skip(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2); +void test_fail(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2); /* For private use by macros. */ test_status_t p_test(test_t *t, ...); diff --git a/test/src/test.c b/test/src/test.c index 0f8bd49..8173614 100644 --- a/test/src/test.c +++ b/test/src/test.c @@ -5,7 +5,7 @@ static test_status_t test_counts[test_status_count] = {0, 0, 0}; static test_status_t test_status = test_status_pass; static const char * test_name = ""; -JEMALLOC_ATTR(format(printf, 1, 2)) +JEMALLOC_FORMAT_PRINTF(1, 2) void test_skip(const char *format, ...) { @@ -18,7 +18,7 @@ test_skip(const char *format, ...) test_status = test_status_skip; } -JEMALLOC_ATTR(format(printf, 1, 2)) +JEMALLOC_FORMAT_PRINTF(1, 2) void test_fail(const char *format, ...) { -- cgit v0.12 From e475ff16004d9a7c76a01a71e6a52323e6bf1485 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 22 Jul 2015 15:49:34 -0700 Subject: Fix a compilation error. This regression was introduced by 1b0e4abbfdbcc1c1a71d1f617adb19951109bfce (Port mq_get() to MinGW.). --- test/src/mq.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/test/src/mq.c b/test/src/mq.c index d337724..40b31c1 100644 --- a/test/src/mq.c +++ b/test/src/mq.c @@ -13,15 +13,17 @@ mq_nanosleep(unsigned ns) #ifdef _WIN32 Sleep(ns / 1000); #else - struct timespec timeout; + { + struct timespec timeout; - if (ns < 1000*1000*1000) { - timeout.tv_sec = 0; - timeout.tv_nsec = ns; - } else { - timeout.tv_sec = 1; - timeout.tv_nsec = 0; + if (ns < 1000*1000*1000) { + timeout.tv_sec = 0; + timeout.tv_nsec = ns; + } else { + timeout.tv_sec = 1; + timeout.tv_nsec = 0; + } + nanosleep(&timeout, NULL); } - nanosleep(&timeout, NULL); #endif } -- cgit v0.12 From 5fae7dc1b316d0e93aa20cc3aaf050f509aec705 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 23 Jul 2015 13:56:25 -0700 Subject: Fix MinGW-related portability issues. Create and use FMT* macros that are equivalent to the PRI* macros that inttypes.h defines. This allows uniform use of the Unix-specific format specifiers, e.g. "%zu", as well as avoiding Windows-specific definitions of e.g. PRIu64. Add ffs()/ffsl() support for compiling with gcc. Extract compatibility definitions of ENOENT, EINVAL, EAGAIN, EPERM, ENOMEM, and ENORANGE into include/msvc_compat/windows_extra.h and use the file for tests as well as for core jemalloc code. 
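A condensed sketch of the new scheme (mirroring the util.h hunk below; print_count() is a hypothetical caller): on Windows the FMT* specifiers are spelled out directly, elsewhere they fall back to the PRI* macros from <inttypes.h>, and size_t values simply use "%zu".

#include <stdint.h>

#ifdef _WIN32
#  define FMTu64	"llu"		/* FMT64_PREFIX "u" in the real hunk */
#else
#  include <inttypes.h>
#  define FMTu64	PRIu64
#endif

/* jemalloc's own formatter; declared in internal/util.h. */
void	malloc_printf(const char *format, ...);

static void
print_count(uint64_t nmalloc)
{

	/* One format string for MinGW and Unix alike. */
	malloc_printf("allocations: %"FMTu64"\n", nmalloc);
}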
--- .../jemalloc/internal/jemalloc_internal_decls.h | 33 +-- include/jemalloc/internal/util.h | 41 +++ include/msvc_compat/C99/inttypes.h | 313 --------------------- include/msvc_compat/strings.h | 10 +- include/msvc_compat/windows_extra.h | 26 ++ src/arena.c | 4 +- src/ckh.c | 6 +- src/prof.c | 20 +- src/stats.c | 89 +++--- test/include/test/jemalloc_test.h.in | 5 +- test/include/test/test.h | 72 ++--- test/src/timer.c | 4 +- test/stress/microbench.c | 4 +- test/unit/SFMT.c | 8 +- test/unit/atomic.c | 12 +- test/unit/ckh.c | 43 ++- test/unit/junk.c | 26 +- test/unit/rtree.c | 2 +- 18 files changed, 224 insertions(+), 494 deletions(-) delete mode 100644 include/msvc_compat/C99/inttypes.h create mode 100644 include/msvc_compat/windows_extra.h diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h index bf13c57..5d42f47 100644 --- a/include/jemalloc/internal/jemalloc_internal_decls.h +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -4,27 +4,8 @@ #include #ifdef _WIN32 # include -# ifndef ENOENT -# define ENOENT ERROR_PATH_NOT_FOUND -# endif -# ifndef EINVAL -# define EINVAL ERROR_BAD_ARGUMENTS -# endif -# ifndef EAGAIN -# define EAGAIN ERROR_OUTOFMEMORY -# endif -# ifndef EPERM -# define EPERM ERROR_WRITE_FAULT -# endif -# ifndef EFAULT -# define EFAULT ERROR_INVALID_ADDRESS -# endif -# ifndef ENOMEM -# define ENOMEM ERROR_NOT_ENOUGH_MEMORY -# endif -# ifndef ERANGE -# define ERANGE ERROR_INVALID_DATA -# endif +# include "msvc_compat/windows_extra.h" + #else # include # include @@ -53,16 +34,6 @@ #ifndef offsetof # define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) #endif -#include -#ifdef _WIN32 -# define PRIzu "Iu" -# define PRIzd "Id" -# define PRIzx "Ix" -#else -# define PRIzu "zu" -# define PRIzd "zd" -# define PRIzx "zx" -#endif #include #include #include diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index f6e271f..ba42df7 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -1,6 +1,47 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +#ifdef _WIN32 +# ifdef _WIN64 +# define FMT64_PREFIX "ll" +# define FMTPTR_PREFIX "ll" +# else +# define FMT64_PREFIX "ll" +# define FMTPTR_PREFIX "" +# endif +# define FMTd32 "d" +# define FMTu32 "u" +# define FMTx32 "x" +# define FMTd64 FMT64_PREFIX "d" +# define FMTu64 FMT64_PREFIX "u" +# define FMTx64 FMT64_PREFIX "x" +# define FMTdPTR FMTPTR_PREFIX "d" +# define FMTuPTR FMTPTR_PREFIX "u" +# define FMTxPTR FMTPTR_PREFIX "x" +#else +# include +# define FMTd32 PRId32 +# define FMTu32 PRIu32 +# define FMTx32 PRIx32 +# define FMTd64 PRId64 +# define FMTu64 PRIu64 +# define FMTx64 PRIx64 +# define FMTdPTR PRIdPTR +# define FMTuPTR PRIuPTR +# define FMTxPTR PRIxPTR + +/* Prevent PRI* macros from accidentally being used. */ +# undef PRId32 +# undef PRIu32 +# undef PRIx32 +# undef PRId64 +# undef PRIu64 +# undef PRIx64 +# undef PRIdPTR +# undef PRIuPTR +# undef PRIxPTR +#endif + /* Size of stack-allocated buffer passed to buferror(). 
*/ #define BUFERROR_BUF 64 diff --git a/include/msvc_compat/C99/inttypes.h b/include/msvc_compat/C99/inttypes.h deleted file mode 100644 index a4e6b75..0000000 --- a/include/msvc_compat/C99/inttypes.h +++ /dev/null @@ -1,313 +0,0 @@ -// ISO C9x compliant inttypes.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. The name of the author may be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" 
-#endif // _MSC_VER ] - -#ifndef _MSC_INTTYPES_H_ // [ -#define _MSC_INTTYPES_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include "stdint.h" - -// 7.8 Format conversion of integer types - -typedef struct { - intmax_t quot; - intmax_t rem; -} imaxdiv_t; - -// 7.8.1 Macros for format specifiers - -#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 - -#ifdef _WIN64 -# define __PRI64_PREFIX "l" -# define __PRIPTR_PREFIX "l" -#else -# define __PRI64_PREFIX "ll" -# define __PRIPTR_PREFIX -#endif - -// The fprintf macros for signed integers are: -#define PRId8 "d" -#define PRIi8 "i" -#define PRIdLEAST8 "d" -#define PRIiLEAST8 "i" -#define PRIdFAST8 "d" -#define PRIiFAST8 "i" - -#define PRId16 "hd" -#define PRIi16 "hi" -#define PRIdLEAST16 "hd" -#define PRIiLEAST16 "hi" -#define PRIdFAST16 "hd" -#define PRIiFAST16 "hi" - -#define PRId32 "d" -#define PRIi32 "i" -#define PRIdLEAST32 "d" -#define PRIiLEAST32 "i" -#define PRIdFAST32 "d" -#define PRIiFAST32 "i" - -#define PRId64 __PRI64_PREFIX "d" -#define PRIi64 __PRI64_PREFIX "i" -#define PRIdLEAST64 __PRI64_PREFIX "d" -#define PRIiLEAST64 __PRI64_PREFIX "i" -#define PRIdFAST64 __PRI64_PREFIX "d" -#define PRIiFAST64 __PRI64_PREFIX "i" - -#define PRIdMAX __PRI64_PREFIX "d" -#define PRIiMAX __PRI64_PREFIX "i" - -#define PRIdPTR __PRIPTR_PREFIX "d" -#define PRIiPTR __PRIPTR_PREFIX "i" - -// The fprintf macros for unsigned integers are: -#define PRIo8 "o" -#define PRIu8 "u" -#define PRIx8 "x" -#define PRIX8 "X" -#define PRIoLEAST8 "o" -#define PRIuLEAST8 "u" -#define PRIxLEAST8 "x" -#define PRIXLEAST8 "X" -#define PRIoFAST8 "o" -#define PRIuFAST8 "u" -#define PRIxFAST8 "x" -#define PRIXFAST8 "X" - -#define PRIo16 "ho" -#define PRIu16 "hu" -#define PRIx16 "hx" -#define PRIX16 "hX" -#define PRIoLEAST16 "ho" -#define PRIuLEAST16 "hu" -#define PRIxLEAST16 "hx" -#define PRIXLEAST16 "hX" -#define PRIoFAST16 "ho" -#define PRIuFAST16 "hu" -#define PRIxFAST16 "hx" -#define PRIXFAST16 "hX" - -#define PRIo32 "o" -#define PRIu32 "u" -#define PRIx32 "x" -#define PRIX32 "X" -#define PRIoLEAST32 "o" -#define PRIuLEAST32 "u" -#define PRIxLEAST32 "x" -#define PRIXLEAST32 "X" -#define PRIoFAST32 "o" -#define PRIuFAST32 "u" -#define PRIxFAST32 "x" -#define PRIXFAST32 "X" - -#define PRIo64 __PRI64_PREFIX "o" -#define PRIu64 __PRI64_PREFIX "u" -#define PRIx64 __PRI64_PREFIX "x" -#define PRIX64 __PRI64_PREFIX "X" -#define PRIoLEAST64 __PRI64_PREFIX "o" -#define PRIuLEAST64 __PRI64_PREFIX "u" -#define PRIxLEAST64 __PRI64_PREFIX "x" -#define PRIXLEAST64 __PRI64_PREFIX "X" -#define PRIoFAST64 __PRI64_PREFIX "o" -#define PRIuFAST64 __PRI64_PREFIX "u" -#define PRIxFAST64 __PRI64_PREFIX "x" -#define PRIXFAST64 __PRI64_PREFIX "X" - -#define PRIoMAX __PRI64_PREFIX "o" -#define PRIuMAX __PRI64_PREFIX "u" -#define PRIxMAX __PRI64_PREFIX "x" -#define PRIXMAX __PRI64_PREFIX "X" - -#define PRIoPTR __PRIPTR_PREFIX "o" -#define PRIuPTR __PRIPTR_PREFIX "u" -#define PRIxPTR __PRIPTR_PREFIX "x" -#define PRIXPTR __PRIPTR_PREFIX "X" - -// The fscanf macros for signed integers are: -#define SCNd8 "d" -#define SCNi8 "i" -#define SCNdLEAST8 "d" -#define SCNiLEAST8 "i" -#define SCNdFAST8 "d" -#define SCNiFAST8 "i" - -#define SCNd16 "hd" -#define SCNi16 "hi" -#define SCNdLEAST16 "hd" -#define SCNiLEAST16 "hi" -#define SCNdFAST16 "hd" -#define SCNiFAST16 "hi" - -#define SCNd32 "ld" -#define SCNi32 "li" -#define SCNdLEAST32 "ld" -#define SCNiLEAST32 "li" -#define SCNdFAST32 "ld" -#define SCNiFAST32 "li" - -#define SCNd64 "I64d" -#define SCNi64 "I64i" 
-#define SCNdLEAST64 "I64d" -#define SCNiLEAST64 "I64i" -#define SCNdFAST64 "I64d" -#define SCNiFAST64 "I64i" - -#define SCNdMAX "I64d" -#define SCNiMAX "I64i" - -#ifdef _WIN64 // [ -# define SCNdPTR "I64d" -# define SCNiPTR "I64i" -#else // _WIN64 ][ -# define SCNdPTR "ld" -# define SCNiPTR "li" -#endif // _WIN64 ] - -// The fscanf macros for unsigned integers are: -#define SCNo8 "o" -#define SCNu8 "u" -#define SCNx8 "x" -#define SCNX8 "X" -#define SCNoLEAST8 "o" -#define SCNuLEAST8 "u" -#define SCNxLEAST8 "x" -#define SCNXLEAST8 "X" -#define SCNoFAST8 "o" -#define SCNuFAST8 "u" -#define SCNxFAST8 "x" -#define SCNXFAST8 "X" - -#define SCNo16 "ho" -#define SCNu16 "hu" -#define SCNx16 "hx" -#define SCNX16 "hX" -#define SCNoLEAST16 "ho" -#define SCNuLEAST16 "hu" -#define SCNxLEAST16 "hx" -#define SCNXLEAST16 "hX" -#define SCNoFAST16 "ho" -#define SCNuFAST16 "hu" -#define SCNxFAST16 "hx" -#define SCNXFAST16 "hX" - -#define SCNo32 "lo" -#define SCNu32 "lu" -#define SCNx32 "lx" -#define SCNX32 "lX" -#define SCNoLEAST32 "lo" -#define SCNuLEAST32 "lu" -#define SCNxLEAST32 "lx" -#define SCNXLEAST32 "lX" -#define SCNoFAST32 "lo" -#define SCNuFAST32 "lu" -#define SCNxFAST32 "lx" -#define SCNXFAST32 "lX" - -#define SCNo64 "I64o" -#define SCNu64 "I64u" -#define SCNx64 "I64x" -#define SCNX64 "I64X" -#define SCNoLEAST64 "I64o" -#define SCNuLEAST64 "I64u" -#define SCNxLEAST64 "I64x" -#define SCNXLEAST64 "I64X" -#define SCNoFAST64 "I64o" -#define SCNuFAST64 "I64u" -#define SCNxFAST64 "I64x" -#define SCNXFAST64 "I64X" - -#define SCNoMAX "I64o" -#define SCNuMAX "I64u" -#define SCNxMAX "I64x" -#define SCNXMAX "I64X" - -#ifdef _WIN64 // [ -# define SCNoPTR "I64o" -# define SCNuPTR "I64u" -# define SCNxPTR "I64x" -# define SCNXPTR "I64X" -#else // _WIN64 ][ -# define SCNoPTR "lo" -# define SCNuPTR "lu" -# define SCNxPTR "lx" -# define SCNXPTR "lX" -#endif // _WIN64 ] - -#endif // __STDC_FORMAT_MACROS ] - -// 7.8.2 Functions for greatest-width integer types - -// 7.8.2.1 The imaxabs function -#define imaxabs _abs64 - -// 7.8.2.2 The imaxdiv function - -// This is modified version of div() function from Microsoft's div.c found -// in %MSVC.NET%\crt\src\div.c -#ifdef STATIC_IMAXDIV // [ -static -#else // STATIC_IMAXDIV ][ -_inline -#endif // STATIC_IMAXDIV ] -imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) -{ - imaxdiv_t result; - - result.quot = numer / denom; - result.rem = numer % denom; - - if (numer < 0 && result.rem > 0) { - // did division wrong; must fix up - ++result.quot; - result.rem -= denom; - } - - return result; -} - -// 7.8.2.3 The strtoimax and strtoumax functions -#define strtoimax _strtoi64 -#define strtoumax _strtoui64 - -// 7.8.2.4 The wcstoimax and wcstoumax functions -#define wcstoimax _wcstoi64 -#define wcstoumax _wcstoui64 - - -#endif // _MSC_INTTYPES_H_ ] diff --git a/include/msvc_compat/strings.h b/include/msvc_compat/strings.h index c84975b..f01ffdd 100644 --- a/include/msvc_compat/strings.h +++ b/include/msvc_compat/strings.h @@ -3,8 +3,9 @@ /* MSVC doesn't define ffs/ffsl. 
This dummy strings.h header is provided * for both */ -#include -#pragma intrinsic(_BitScanForward) +#ifdef _MSC_VER +# include +# pragma intrinsic(_BitScanForward) static __forceinline int ffsl(long x) { unsigned long i; @@ -20,4 +21,9 @@ static __forceinline int ffs(int x) return (ffsl(x)); } +#else +# define ffsl(x) __builtin_ffsl(x) +# define ffs(x) __builtin_ffs(x) #endif + +#endif /* strings_h */ diff --git a/include/msvc_compat/windows_extra.h b/include/msvc_compat/windows_extra.h new file mode 100644 index 0000000..0c5e323 --- /dev/null +++ b/include/msvc_compat/windows_extra.h @@ -0,0 +1,26 @@ +#ifndef MSVC_COMPAT_WINDOWS_EXTRA_H +#define MSVC_COMPAT_WINDOWS_EXTRA_H + +#ifndef ENOENT +# define ENOENT ERROR_PATH_NOT_FOUND +#endif +#ifndef EINVAL +# define EINVAL ERROR_BAD_ARGUMENTS +#endif +#ifndef EAGAIN +# define EAGAIN ERROR_OUTOFMEMORY +#endif +#ifndef EPERM +# define EPERM ERROR_WRITE_FAULT +#endif +#ifndef EFAULT +# define EFAULT ERROR_INVALID_ADDRESS +#endif +#ifndef ENOMEM +# define ENOMEM ERROR_NOT_ENOUGH_MEMORY +#endif +#ifndef ERANGE +# define ERANGE ERROR_INVALID_DATA +#endif + +#endif /* MSVC_COMPAT_WINDOWS_EXTRA_H */ diff --git a/src/arena.c b/src/arena.c index 65aad20..10cd0d2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1886,8 +1886,8 @@ arena_redzone_corruption(void *ptr, size_t usize, bool after, size_t offset, uint8_t byte) { - malloc_printf(": Corrupt redzone %"PRIzu" byte%s %s %p " - "(size %"PRIzu"), byte=%#x\n", offset, (offset == 1) ? "" : "s", + malloc_printf(": Corrupt redzone %zu byte%s %s %p " + "(size %zu), byte=%#x\n", offset, (offset == 1) ? "" : "s", after ? "after" : "before", ptr, usize, byte); } #ifdef JEMALLOC_JET diff --git a/src/ckh.c b/src/ckh.c index da78d1b..53a1c1e 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -411,9 +411,9 @@ ckh_delete(tsd_t *tsd, ckh_t *ckh) #ifdef CKH_VERBOSE malloc_printf( - "%s(%p): ngrows: %"PRIu64", nshrinks: %"PRIu64"," - " nshrinkfails: %"PRIu64", ninserts: %"PRIu64"," - " nrelocs: %"PRIu64"\n", __func__, ckh, + "%s(%p): ngrows: %"FMTu64", nshrinks: %"FMTu64"," + " nshrinkfails: %"FMTu64", ninserts: %"FMTu64"," + " nrelocs: %"FMTu64"\n", __func__, ckh, (unsigned long long)ckh->ngrows, (unsigned long long)ckh->nshrinks, (unsigned long long)ckh->nshrinkfails, diff --git a/src/prof.c b/src/prof.c index babdbd6..a05792f 100644 --- a/src/prof.c +++ b/src/prof.c @@ -1095,7 +1095,7 @@ prof_tctx_dump_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) bool propagate_err = *(bool *)arg; if (prof_dump_printf(propagate_err, - " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + " t%"FMTu64": %"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]\n", tctx->thr_uid, tctx->dump_cnts.curobjs, tctx->dump_cnts.curbytes, tctx->dump_cnts.accumobjs, tctx->dump_cnts.accumbytes)) return (tctx); @@ -1247,7 +1247,7 @@ prof_tdata_dump_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) return (NULL); if (prof_dump_printf(propagate_err, - " t%"PRIu64": %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]%s%s\n", + " t%"FMTu64": %"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]%s%s\n", tdata->thr_uid, tdata->cnt_summed.curobjs, tdata->cnt_summed.curbytes, tdata->cnt_summed.accumobjs, tdata->cnt_summed.accumbytes, @@ -1267,8 +1267,8 @@ prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) bool ret; if (prof_dump_printf(propagate_err, - "heap_v2/%"PRIu64"\n" - " t*: %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + "heap_v2/%"FMTu64"\n" + " t*: %"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]\n", ((uint64_t)1U << lg_prof_sample), 
cnt_all->curobjs, cnt_all->curbytes, cnt_all->accumobjs, cnt_all->accumbytes)) return (true); @@ -1311,7 +1311,7 @@ prof_dump_gctx(bool propagate_err, prof_gctx_t *gctx, const prof_bt_t *bt, goto label_return; } for (i = 0; i < bt->len; i++) { - if (prof_dump_printf(propagate_err, " %#"PRIxPTR, + if (prof_dump_printf(propagate_err, " %#"FMTxPTR, (uintptr_t)bt->vec[i])) { ret = true; goto label_return; @@ -1320,7 +1320,7 @@ prof_dump_gctx(bool propagate_err, prof_gctx_t *gctx, const prof_bt_t *bt, if (prof_dump_printf(propagate_err, "\n" - " t*: %"PRIu64": %"PRIu64" [%"PRIu64": %"PRIu64"]\n", + " t*: %"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]\n", gctx->cnt_summed.curobjs, gctx->cnt_summed.curbytes, gctx->cnt_summed.accumobjs, gctx->cnt_summed.accumbytes)) { ret = true; @@ -1412,8 +1412,8 @@ prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_ngctx, { if (cnt_all->curbytes != 0) { - malloc_printf(": Leak summary: %"PRIu64" byte%s, %" - PRIu64" object%s, %"PRIzu" context%s\n", + malloc_printf(": Leak summary: %"FMTu64" byte%s, %" + FMTu64" object%s, %zu context%s\n", cnt_all->curbytes, (cnt_all->curbytes != 1) ? "s" : "", cnt_all->curobjs, (cnt_all->curobjs != 1) ? "s" : "", leak_ngctx, (leak_ngctx != 1) ? "s" : ""); @@ -1533,12 +1533,12 @@ prof_dump_filename(char *filename, char v, uint64_t vseq) if (vseq != VSEQ_INVALID) { /* "...v.heap" */ malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, - "%s.%d.%"PRIu64".%c%"PRIu64".heap", + "%s.%d.%"FMTu64".%c%"FMTu64".heap", opt_prof_prefix, (int)getpid(), prof_dump_seq, v, vseq); } else { /* "....heap" */ malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, - "%s.%d.%"PRIu64".%c.heap", + "%s.%d.%"FMTu64".%c.heap", opt_prof_prefix, (int)getpid(), prof_dump_seq, v); } prof_dump_seq++; diff --git a/src/stats.c b/src/stats.c index 57fd650..154c3e7 100644 --- a/src/stats.c +++ b/src/stats.c @@ -119,32 +119,32 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, assert(milli <= 1000); if (milli < 10) { malloc_snprintf(util, sizeof(util), - "0.00%"PRIzu, milli); + "0.00%zu", milli); } else if (milli < 100) { - malloc_snprintf(util, sizeof(util), "0.0%"PRIzu, + malloc_snprintf(util, sizeof(util), "0.0%zu", milli); } else if (milli < 1000) { - malloc_snprintf(util, sizeof(util), "0.%"PRIzu, + malloc_snprintf(util, sizeof(util), "0.%zu", milli); } else malloc_snprintf(util, sizeof(util), "1"); if (config_tcache) { malloc_cprintf(write_cb, cbopaque, - "%20"PRIzu" %3u %12"PRIzu" %12"PRIu64 - " %12"PRIu64" %12"PRIu64" %12"PRIzu - " %12"PRIzu" %4u %3"PRIzu" %-5s %12"PRIu64 - " %12"PRIu64" %12"PRIu64" %12"PRIu64"\n", + "%20zu %3u %12zu %12"FMTu64 + " %12"FMTu64" %12"FMTu64" %12zu" + " %12zu %4u %3zu %-5s %12"FMTu64 + " %12"FMTu64" %12"FMTu64" %12"FMTu64"\n", reg_size, j, curregs * reg_size, nmalloc, ndalloc, nrequests, curregs, curruns, nregs, run_size / page, util, nfills, nflushes, nruns, reruns); } else { malloc_cprintf(write_cb, cbopaque, - "%20"PRIzu" %3u %12"PRIzu" %12"PRIu64 - " %12"PRIu64" %12"PRIu64" %12"PRIzu - " %12"PRIzu" %4u %3"PRIzu" %-5s %12"PRIu64 - " %12"PRIu64"\n", + "%20zu %3u %12zu %12"FMTu64 + " %12"FMTu64" %12"FMTu64" %12zu" + " %12zu %4u %3zu %-5s %12"FMTu64 + " %12"FMTu64"\n", reg_size, j, curregs * reg_size, nmalloc, ndalloc, nrequests, curregs, curruns, nregs, run_size / page, util, nruns, reruns); @@ -191,8 +191,8 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, in_gap = false; } malloc_cprintf(write_cb, cbopaque, - "%20"PRIzu" %3u %12"PRIzu" %12"PRIu64" %12"PRIu64 - " 
%12"PRIu64" %12"PRIzu"\n", + "%20zu %3u %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64" %12zu\n", run_size, nbins + j, curruns * run_size, nmalloc, ndalloc, nrequests, curruns); } @@ -239,8 +239,8 @@ stats_arena_hchunks_print(void (*write_cb)(void *, const char *), in_gap = false; } malloc_cprintf(write_cb, cbopaque, - "%20"PRIzu" %3u %12"PRIzu" %12"PRIu64" %12"PRIu64 - " %12"PRIu64" %12"PRIzu"\n", + "%20zu %3u %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64" %12zu\n", hchunk_size, nbins + nlruns + j, curhchunks * hchunk_size, nmalloc, ndalloc, nrequests, curhchunks); @@ -292,10 +292,9 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.nmadvise", i, &nmadvise, uint64_t); CTL_M2_GET("stats.arenas.0.purged", i, &purged, uint64_t); malloc_cprintf(write_cb, cbopaque, - "dirty pages: %"PRIzu":%"PRIzu" active:dirty, %"PRIu64" sweep%s," - " %"PRIu64" madvise%s, %"PRIu64" purged\n", - pactive, pdirty, npurge, npurge == 1 ? "" : "s", - nmadvise, nmadvise == 1 ? "" : "s", purged); + "dirty pages: %zu:%zu active:dirty, %"FMTu64" sweep%s, %"FMTu64 + " madvise%s, %"FMTu64" purged\n", pactive, pdirty, npurge, npurge == + 1 ? "" : "s", nmadvise, nmadvise == 1 ? "" : "s", purged); malloc_cprintf(write_cb, cbopaque, " allocated nmalloc ndalloc" @@ -307,8 +306,8 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.small.nrequests", i, &small_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "small: %12"PRIzu" %12"PRIu64" %12"PRIu64 - " %12"PRIu64"\n", + "small: %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64"\n", small_allocated, small_nmalloc, small_ndalloc, small_nrequests); CTL_M2_GET("stats.arenas.0.large.allocated", i, &large_allocated, size_t); @@ -317,8 +316,8 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.large.nrequests", i, &large_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "large: %12"PRIzu" %12"PRIu64" %12"PRIu64 - " %12"PRIu64"\n", + "large: %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64"\n", large_allocated, large_nmalloc, large_ndalloc, large_nrequests); CTL_M2_GET("stats.arenas.0.huge.allocated", i, &huge_allocated, size_t); CTL_M2_GET("stats.arenas.0.huge.nmalloc", i, &huge_nmalloc, uint64_t); @@ -326,27 +325,27 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_M2_GET("stats.arenas.0.huge.nrequests", i, &huge_nrequests, uint64_t); malloc_cprintf(write_cb, cbopaque, - "huge: %12"PRIzu" %12"PRIu64" %12"PRIu64 - " %12"PRIu64"\n", + "huge: %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64"\n", huge_allocated, huge_nmalloc, huge_ndalloc, huge_nrequests); malloc_cprintf(write_cb, cbopaque, - "total: %12"PRIzu" %12"PRIu64" %12"PRIu64 - " %12"PRIu64"\n", + "total: %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64"\n", small_allocated + large_allocated + huge_allocated, small_nmalloc + large_nmalloc + huge_nmalloc, small_ndalloc + large_ndalloc + huge_ndalloc, small_nrequests + large_nrequests + huge_nrequests); malloc_cprintf(write_cb, cbopaque, - "active: %12"PRIzu"\n", pactive * page); + "active: %12zu\n", pactive * page); CTL_M2_GET("stats.arenas.0.mapped", i, &mapped, size_t); malloc_cprintf(write_cb, cbopaque, - "mapped: %12"PRIzu"\n", mapped); + "mapped: %12zu\n", mapped); CTL_M2_GET("stats.arenas.0.metadata.mapped", i, &metadata_mapped, size_t); CTL_M2_GET("stats.arenas.0.metadata.allocated", i, &metadata_allocated, size_t); malloc_cprintf(write_cb, cbopaque, - "metadata: mapped: %"PRIzu", allocated: 
%"PRIzu"\n", + "metadata: mapped: %zu, allocated: %zu\n", metadata_mapped, metadata_allocated); if (bins) @@ -457,19 +456,19 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, #define OPT_WRITE_SIZE_T(n) \ if (je_mallctl("opt."#n, &sv, &ssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %"PRIzu"\n", sv); \ + " opt."#n": %zu\n", sv); \ } #define OPT_WRITE_SSIZE_T(n) \ if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %"PRIzd"\n", ssv); \ + " opt."#n": %zd\n", ssv); \ } #define OPT_WRITE_SSIZE_T_MUTABLE(n, m) { \ ssize_t ssv2; \ if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0 && \ je_mallctl(#m, &ssv2, &sssz, NULL, 0) == 0) { \ malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %"PRIzd" ("#m": %"PRIzd")\n", \ + " opt."#n": %zd ("#m": %zd)\n", \ ssv, ssv2); \ } \ } @@ -519,15 +518,15 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("arenas.narenas", &uv, unsigned); malloc_cprintf(write_cb, cbopaque, "Arenas: %u\n", uv); - malloc_cprintf(write_cb, cbopaque, "Pointer size: %"PRIzu"\n", + malloc_cprintf(write_cb, cbopaque, "Pointer size: %zu\n", sizeof(void *)); CTL_GET("arenas.quantum", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, "Quantum size: %"PRIzu"\n", + malloc_cprintf(write_cb, cbopaque, "Quantum size: %zu\n", sv); CTL_GET("arenas.page", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, "Page size: %"PRIzu"\n", sv); + malloc_cprintf(write_cb, cbopaque, "Page size: %zu\n", sv); CTL_GET("arenas.lg_dirty_mult", &ssv, ssize_t); if (ssv >= 0) { @@ -540,19 +539,19 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } if (je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0) == 0) { malloc_cprintf(write_cb, cbopaque, - "Maximum thread-cached size class: %"PRIzu"\n", sv); + "Maximum thread-cached size class: %zu\n", sv); } if (je_mallctl("opt.prof", &bv, &bsz, NULL, 0) == 0 && bv) { CTL_GET("prof.lg_sample", &sv, size_t); malloc_cprintf(write_cb, cbopaque, - "Average profile sample interval: %"PRIu64 - " (2^%"PRIzu")\n", (((uint64_t)1U) << sv), sv); + "Average profile sample interval: %"FMTu64 + " (2^%zu)\n", (((uint64_t)1U) << sv), sv); CTL_GET("opt.lg_prof_interval", &ssv, ssize_t); if (ssv >= 0) { malloc_cprintf(write_cb, cbopaque, - "Average profile dump interval: %"PRIu64 - " (2^%"PRIzd")\n", + "Average profile dump interval: %"FMTu64 + " (2^%zd)\n", (((uint64_t)1U) << ssv), ssv); } else { malloc_cprintf(write_cb, cbopaque, @@ -561,7 +560,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } CTL_GET("opt.lg_chunk", &sv, size_t); malloc_cprintf(write_cb, cbopaque, - "Chunk size: %"PRIzu" (2^%"PRIzu")\n", (ZU(1) << sv), sv); + "Chunk size: %zu (2^%zu)\n", (ZU(1) << sv), sv); } if (config_stats) { @@ -575,11 +574,11 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("stats.resident", &resident, size_t); CTL_GET("stats.mapped", &mapped, size_t); malloc_cprintf(write_cb, cbopaque, - "Allocated: %"PRIzu", active: %"PRIzu", metadata: %"PRIzu"," - " resident: %"PRIzu", mapped: %"PRIzu"\n", + "Allocated: %zu, active: %zu, metadata: %zu," + " resident: %zu, mapped: %zu\n", allocated, active, metadata, resident, mapped); malloc_cprintf(write_cb, cbopaque, - "Current active ceiling: %"PRIzu"\n", + "Current active ceiling: %zu\n", atomic_read_z(cactive)); if (merged) { diff --git a/test/include/test/jemalloc_test.h.in b/test/include/test/jemalloc_test.h.in index c72d09f..455569d 100644 --- 
a/test/include/test/jemalloc_test.h.in +++ b/test/include/test/jemalloc_test.h.in @@ -6,13 +6,16 @@ #include #include #include -#include #include #include +#ifdef _WIN32 +# include "msvc_compat/strings.h" +#endif #include #ifdef _WIN32 # include +# include "msvc_compat/windows_extra.h" #else # include #endif diff --git a/test/include/test/test.h b/test/include/test/test.h index 7a163ac..3cf901f 100644 --- a/test/include/test/test.h +++ b/test/include/test/test.h @@ -133,82 +133,82 @@ <=, "ju", __VA_ARGS__) #define assert_zd_eq(a, b, ...) assert_cmp(ssize_t, a, b, ==, \ - !=, PRIzd, __VA_ARGS__) + !=, "zd", __VA_ARGS__) #define assert_zd_ne(a, b, ...) assert_cmp(ssize_t, a, b, !=, \ - ==, PRIzd, __VA_ARGS__) + ==, "zd", __VA_ARGS__) #define assert_zd_lt(a, b, ...) assert_cmp(ssize_t, a, b, <, \ - >=, PRIzd, __VA_ARGS__) + >=, "zd", __VA_ARGS__) #define assert_zd_le(a, b, ...) assert_cmp(ssize_t, a, b, <=, \ - >, PRIzd, __VA_ARGS__) + >, "zd", __VA_ARGS__) #define assert_zd_ge(a, b, ...) assert_cmp(ssize_t, a, b, >=, \ - <, PRIzd, __VA_ARGS__) + <, "zd", __VA_ARGS__) #define assert_zd_gt(a, b, ...) assert_cmp(ssize_t, a, b, >, \ - <=, PRIzd, __VA_ARGS__) + <=, "zd", __VA_ARGS__) #define assert_zu_eq(a, b, ...) assert_cmp(size_t, a, b, ==, \ - !=, PRIzu, __VA_ARGS__) + !=, "zu", __VA_ARGS__) #define assert_zu_ne(a, b, ...) assert_cmp(size_t, a, b, !=, \ - ==, PRIzu, __VA_ARGS__) + ==, "zu", __VA_ARGS__) #define assert_zu_lt(a, b, ...) assert_cmp(size_t, a, b, <, \ - >=, PRIzu, __VA_ARGS__) + >=, "zu", __VA_ARGS__) #define assert_zu_le(a, b, ...) assert_cmp(size_t, a, b, <=, \ - >, PRIzu, __VA_ARGS__) + >, "zu", __VA_ARGS__) #define assert_zu_ge(a, b, ...) assert_cmp(size_t, a, b, >=, \ - <, PRIzu, __VA_ARGS__) + <, "zu", __VA_ARGS__) #define assert_zu_gt(a, b, ...) assert_cmp(size_t, a, b, >, \ - <=, PRIzu, __VA_ARGS__) + <=, "zu", __VA_ARGS__) #define assert_d32_eq(a, b, ...) assert_cmp(int32_t, a, b, ==, \ - !=, PRId32, __VA_ARGS__) + !=, FMTd32, __VA_ARGS__) #define assert_d32_ne(a, b, ...) assert_cmp(int32_t, a, b, !=, \ - ==, PRId32, __VA_ARGS__) + ==, FMTd32, __VA_ARGS__) #define assert_d32_lt(a, b, ...) assert_cmp(int32_t, a, b, <, \ - >=, PRId32, __VA_ARGS__) + >=, FMTd32, __VA_ARGS__) #define assert_d32_le(a, b, ...) assert_cmp(int32_t, a, b, <=, \ - >, PRId32, __VA_ARGS__) + >, FMTd32, __VA_ARGS__) #define assert_d32_ge(a, b, ...) assert_cmp(int32_t, a, b, >=, \ - <, PRId32, __VA_ARGS__) + <, FMTd32, __VA_ARGS__) #define assert_d32_gt(a, b, ...) assert_cmp(int32_t, a, b, >, \ - <=, PRId32, __VA_ARGS__) + <=, FMTd32, __VA_ARGS__) #define assert_u32_eq(a, b, ...) assert_cmp(uint32_t, a, b, ==, \ - !=, PRIu32, __VA_ARGS__) + !=, FMTu32, __VA_ARGS__) #define assert_u32_ne(a, b, ...) assert_cmp(uint32_t, a, b, !=, \ - ==, PRIu32, __VA_ARGS__) + ==, FMTu32, __VA_ARGS__) #define assert_u32_lt(a, b, ...) assert_cmp(uint32_t, a, b, <, \ - >=, PRIu32, __VA_ARGS__) + >=, FMTu32, __VA_ARGS__) #define assert_u32_le(a, b, ...) assert_cmp(uint32_t, a, b, <=, \ - >, PRIu32, __VA_ARGS__) + >, FMTu32, __VA_ARGS__) #define assert_u32_ge(a, b, ...) assert_cmp(uint32_t, a, b, >=, \ - <, PRIu32, __VA_ARGS__) + <, FMTu32, __VA_ARGS__) #define assert_u32_gt(a, b, ...) assert_cmp(uint32_t, a, b, >, \ - <=, PRIu32, __VA_ARGS__) + <=, FMTu32, __VA_ARGS__) #define assert_d64_eq(a, b, ...) assert_cmp(int64_t, a, b, ==, \ - !=, PRId64, __VA_ARGS__) + !=, FMTd64, __VA_ARGS__) #define assert_d64_ne(a, b, ...) 
assert_cmp(int64_t, a, b, !=, \ - ==, PRId64, __VA_ARGS__) + ==, FMTd64, __VA_ARGS__) #define assert_d64_lt(a, b, ...) assert_cmp(int64_t, a, b, <, \ - >=, PRId64, __VA_ARGS__) + >=, FMTd64, __VA_ARGS__) #define assert_d64_le(a, b, ...) assert_cmp(int64_t, a, b, <=, \ - >, PRId64, __VA_ARGS__) + >, FMTd64, __VA_ARGS__) #define assert_d64_ge(a, b, ...) assert_cmp(int64_t, a, b, >=, \ - <, PRId64, __VA_ARGS__) + <, FMTd64, __VA_ARGS__) #define assert_d64_gt(a, b, ...) assert_cmp(int64_t, a, b, >, \ - <=, PRId64, __VA_ARGS__) + <=, FMTd64, __VA_ARGS__) #define assert_u64_eq(a, b, ...) assert_cmp(uint64_t, a, b, ==, \ - !=, PRIu64, __VA_ARGS__) + !=, FMTu64, __VA_ARGS__) #define assert_u64_ne(a, b, ...) assert_cmp(uint64_t, a, b, !=, \ - ==, PRIu64, __VA_ARGS__) + ==, FMTu64, __VA_ARGS__) #define assert_u64_lt(a, b, ...) assert_cmp(uint64_t, a, b, <, \ - >=, PRIu64, __VA_ARGS__) + >=, FMTu64, __VA_ARGS__) #define assert_u64_le(a, b, ...) assert_cmp(uint64_t, a, b, <=, \ - >, PRIu64, __VA_ARGS__) + >, FMTu64, __VA_ARGS__) #define assert_u64_ge(a, b, ...) assert_cmp(uint64_t, a, b, >=, \ - <, PRIu64, __VA_ARGS__) + <, FMTu64, __VA_ARGS__) #define assert_u64_gt(a, b, ...) assert_cmp(uint64_t, a, b, >, \ - <=, PRIu64, __VA_ARGS__) + <=, FMTu64, __VA_ARGS__) #define assert_b_eq(a, b, ...) do { \ bool a_ = (a); \ diff --git a/test/src/timer.c b/test/src/timer.c index 66b8070..0c93aba 100644 --- a/test/src/timer.c +++ b/test/src/timer.c @@ -61,7 +61,7 @@ timer_ratio(timedelta_t *a, timedelta_t *b, char *buf, size_t buflen) int n; /* Whole. */ - n = malloc_snprintf(&buf[i], buflen-i, "%"PRIu64, t0 / t1); + n = malloc_snprintf(&buf[i], buflen-i, "%"FMTu64, t0 / t1); i += n; if (i >= buflen) return; @@ -78,7 +78,7 @@ timer_ratio(timedelta_t *a, timedelta_t *b, char *buf, size_t buflen) uint64_t round = (i+1 == buflen-1 && ((t0 * mult * 10 / t1) % 10 >= 5)) ? 
1 : 0; n = malloc_snprintf(&buf[i], buflen-i, - "%"PRIu64, (t0 * mult / t1) % 10 + round); + "%"FMTu64, (t0 * mult / t1) % 10 + round); i += n; mult *= 10; } diff --git a/test/stress/microbench.c b/test/stress/microbench.c index aefbe6a..ee39fea 100644 --- a/test/stress/microbench.c +++ b/test/stress/microbench.c @@ -31,8 +31,8 @@ compare_funcs(uint64_t nwarmup, uint64_t niter, const char *name_a, time_func(&timer_b, nwarmup, niter, func_b); timer_ratio(&timer_a, &timer_b, ratio_buf, sizeof(ratio_buf)); - malloc_printf("%"PRIu64" iterations, %s=%"PRIu64"us, " - "%s=%"PRIu64"us, ratio=1:%s\n", + malloc_printf("%"FMTu64" iterations, %s=%"FMTu64"us, " + "%s=%"FMTu64"us, ratio=1:%s\n", niter, name_a, timer_usec(&timer_a), name_b, timer_usec(&timer_b), ratio_buf); diff --git a/test/unit/SFMT.c b/test/unit/SFMT.c index 88b31f6..ba4be87 100644 --- a/test/unit/SFMT.c +++ b/test/unit/SFMT.c @@ -1543,13 +1543,13 @@ TEST_BEGIN(test_gen_rand_64) } r = gen_rand64(ctx); assert_u64_eq(r, array64[i], - "Mismatch at array64[%d]=%"PRIx64", gen=%"PRIx64, i, + "Mismatch at array64[%d]=%"FMTx64", gen=%"FMTx64, i, array64[i], r); } for (i = 0; i < COUNT_2; i++) { r = gen_rand64(ctx); assert_u64_eq(r, array64_2[i], - "Mismatch at array64_2[%d]=%"PRIx64" gen=%"PRIx64"", i, + "Mismatch at array64_2[%d]=%"FMTx64" gen=%"FMTx64"", i, array64_2[i], r); } fini_gen_rand(ctx); @@ -1580,13 +1580,13 @@ TEST_BEGIN(test_by_array_64) } r = gen_rand64(ctx); assert_u64_eq(r, array64[i], - "Mismatch at array64[%d]=%"PRIx64" gen=%"PRIx64, i, + "Mismatch at array64[%d]=%"FMTx64" gen=%"FMTx64, i, array64[i], r); } for (i = 0; i < COUNT_2; i++) { r = gen_rand64(ctx); assert_u64_eq(r, array64_2[i], - "Mismatch at array64_2[%d]=%"PRIx64" gen=%"PRIx64, i, + "Mismatch at array64_2[%d]=%"FMTx64" gen=%"FMTx64, i, array64_2[i], r); } fini_gen_rand(ctx); diff --git a/test/unit/atomic.c b/test/unit/atomic.c index 9217ca9..bdd74f6 100644 --- a/test/unit/atomic.c +++ b/test/unit/atomic.c @@ -8,7 +8,7 @@ struct p##_test_s { \ }; \ typedef struct p##_test_s p##_test_t; -#define TEST_BODY(p, t, tc, ta, PRI) do { \ +#define TEST_BODY(p, t, tc, ta, FMT) do { \ const p##_test_t tests[] = { \ {(t)-1, (t)-1, (t)-2}, \ {(t)-1, (t) 0, (t)-2}, \ @@ -38,7 +38,7 @@ typedef struct p##_test_s p##_test_t; \ assert_##ta##_eq(atomic_add_##p(&accum, tests[i].x), \ (t)((tc)tests[i].accum0 + (tc)tests[i].x), \ - "i=%u, accum=%"PRI", x=%"PRI, \ + "i=%u, accum=%"FMT", x=%"FMT, \ i, tests[i].accum0, tests[i].x); \ assert_##ta##_eq(atomic_read_##p(&accum), accum, \ "Erroneous add, i=%u", i); \ @@ -46,7 +46,7 @@ typedef struct p##_test_s p##_test_t; accum = tests[i].accum0; \ assert_##ta##_eq(atomic_sub_##p(&accum, tests[i].x), \ (t)((tc)tests[i].accum0 - (tc)tests[i].x), \ - "i=%u, accum=%"PRI", x=%"PRI, \ + "i=%u, accum=%"FMT", x=%"FMT, \ i, tests[i].accum0, tests[i].x); \ assert_##ta##_eq(atomic_read_##p(&accum), accum, \ "Erroneous sub, i=%u", i); \ @@ -72,7 +72,7 @@ TEST_BEGIN(test_atomic_uint64) #if !(LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3) test_skip("64-bit atomic operations not supported"); #else - TEST_BODY(uint64, uint64_t, uint64_t, u64, PRIx64); + TEST_BODY(uint64, uint64_t, uint64_t, u64, FMTx64); #endif } TEST_END @@ -81,7 +81,7 @@ TEST_STRUCT(uint32, uint32_t) TEST_BEGIN(test_atomic_uint32) { - TEST_BODY(uint32, uint32_t, uint32_t, u32, "#"PRIx32); + TEST_BODY(uint32, uint32_t, uint32_t, u32, "#"FMTx32); } TEST_END @@ -97,7 +97,7 @@ TEST_STRUCT(z, size_t) TEST_BEGIN(test_atomic_z) { - TEST_BODY(z, size_t, size_t, zu, "#"PRIzx); + TEST_BODY(z, size_t, 
size_t, zu, "#zx"); } TEST_END diff --git a/test/unit/ckh.c b/test/unit/ckh.c index 1f22baf..b117595 100644 --- a/test/unit/ckh.c +++ b/test/unit/ckh.c @@ -35,15 +35,15 @@ TEST_BEGIN(test_count_insert_search_remove) assert_false(ckh_new(tsd, &ckh, 2, ckh_string_hash, ckh_string_keycomp), "Unexpected ckh_new() error"); assert_zu_eq(ckh_count(&ckh), 0, - "ckh_count() should return %"PRIzu", but it returned %"PRIzu, ZU(0), + "ckh_count() should return %zu, but it returned %zu", ZU(0), ckh_count(&ckh)); /* Insert. */ for (i = 0; i < sizeof(strs)/sizeof(const char *); i++) { ckh_insert(tsd, &ckh, strs[i], strs[i]); assert_zu_eq(ckh_count(&ckh), i+1, - "ckh_count() should return %"PRIzu", but it returned " - "%"PRIzu, i+1, ckh_count(&ckh)); + "ckh_count() should return %zu, but it returned %zu", i+1, + ckh_count(&ckh)); } /* Search. */ @@ -64,10 +64,10 @@ TEST_BEGIN(test_count_insert_search_remove) ks = (i & 1) ? strs[i] : (const char *)NULL; vs = (i & 2) ? strs[i] : (const char *)NULL; - assert_ptr_eq((void *)ks, (void *)k.s, - "Key mismatch, i=%"PRIzu, i); - assert_ptr_eq((void *)vs, (void *)v.s, - "Value mismatch, i=%"PRIzu, i); + assert_ptr_eq((void *)ks, (void *)k.s, "Key mismatch, i=%zu", + i); + assert_ptr_eq((void *)vs, (void *)v.s, "Value mismatch, i=%zu", + i); } assert_true(ckh_search(&ckh, missing, NULL, NULL), "Unexpected ckh_search() success"); @@ -90,14 +90,14 @@ TEST_BEGIN(test_count_insert_search_remove) ks = (i & 1) ? strs[i] : (const char *)NULL; vs = (i & 2) ? strs[i] : (const char *)NULL; - assert_ptr_eq((void *)ks, (void *)k.s, - "Key mismatch, i=%"PRIzu, i); - assert_ptr_eq((void *)vs, (void *)v.s, - "Value mismatch, i=%"PRIzu, i); + assert_ptr_eq((void *)ks, (void *)k.s, "Key mismatch, i=%zu", + i); + assert_ptr_eq((void *)vs, (void *)v.s, "Value mismatch, i=%zu", + i); assert_zu_eq(ckh_count(&ckh), sizeof(strs)/sizeof(const char *) - i - 1, - "ckh_count() should return %"PRIzu", but it returned " - "%"PRIzu, sizeof(strs)/sizeof(const char *) - i - 1, + "ckh_count() should return %zu, but it returned %zu", + sizeof(strs)/sizeof(const char *) - i - 1, ckh_count(&ckh)); } @@ -137,8 +137,8 @@ TEST_BEGIN(test_insert_iter_remove) } assert_zu_eq(ckh_count(&ckh), NITEMS, - "ckh_count() should return %"PRIzu", but it returned " - "%"PRIzu, NITEMS, ckh_count(&ckh)); + "ckh_count() should return %zu, but it returned %zu", + NITEMS, ckh_count(&ckh)); for (j = i + 1; j < NITEMS; j++) { assert_false(ckh_search(&ckh, p[j], NULL, NULL), @@ -167,20 +167,17 @@ TEST_BEGIN(test_insert_iter_remove) for (k = 0; k < NITEMS; k++) { if (p[k] == q) { assert_false(seen[k], - "Item %"PRIzu" already " - "seen", k); + "Item %zu already seen", k); seen[k] = true; break; } } } - for (j = 0; j < i + 1; j++) { - assert_true(seen[j], "Item %"PRIzu" not seen", - j); - } + for (j = 0; j < i + 1; j++) + assert_true(seen[j], "Item %zu not seen", j); for (; j < NITEMS; j++) - assert_false(seen[j], "Item %"PRIzu" seen", j); + assert_false(seen[j], "Item %zu seen", j); } } @@ -199,7 +196,7 @@ TEST_BEGIN(test_insert_iter_remove) } assert_zu_eq(ckh_count(&ckh), 0, - "ckh_count() should return %"PRIzu", but it returned %"PRIzu, + "ckh_count() should return %zu, but it returned %zu", ZU(0), ckh_count(&ckh)); ckh_delete(tsd, &ckh); #undef NITEMS diff --git a/test/unit/junk.c b/test/unit/junk.c index 8499d06..733f661 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -30,8 +30,8 @@ arena_dalloc_junk_small_intercept(void *ptr, arena_bin_info_t *bin_info) arena_dalloc_junk_small_orig(ptr, bin_info); for (i = 0; i < 
bin_info->reg_size; i++) { assert_c_eq(((char *)ptr)[i], 0x5a, - "Missing junk fill for byte %"PRIzu"/%"PRIzu" of " - "deallocated region", i, bin_info->reg_size); + "Missing junk fill for byte %zu/%zu of deallocated region", + i, bin_info->reg_size); } if (ptr == watch_for_junking) saw_junking = true; @@ -45,8 +45,8 @@ arena_dalloc_junk_large_intercept(void *ptr, size_t usize) arena_dalloc_junk_large_orig(ptr, usize); for (i = 0; i < usize; i++) { assert_c_eq(((char *)ptr)[i], 0x5a, - "Missing junk fill for byte %"PRIzu"/%"PRIzu" of " - "deallocated region", i, usize); + "Missing junk fill for byte %zu/%zu of deallocated region", + i, usize); } if (ptr == watch_for_junking) saw_junking = true; @@ -89,18 +89,18 @@ test_junk(size_t sz_min, size_t sz_max) sz_prev = sz, sz = sallocx(s, 0)) { if (sz_prev > 0) { assert_c_eq(s[0], 'a', - "Previously allocated byte %"PRIzu"/%"PRIzu" is " - "corrupted", ZU(0), sz_prev); + "Previously allocated byte %zu/%zu is corrupted", + ZU(0), sz_prev); assert_c_eq(s[sz_prev-1], 'a', - "Previously allocated byte %"PRIzu"/%"PRIzu" is " - "corrupted", sz_prev-1, sz_prev); + "Previously allocated byte %zu/%zu is corrupted", + sz_prev-1, sz_prev); } for (i = sz_prev; i < sz; i++) { if (opt_junk_alloc) { assert_c_eq(s[i], 0xa5, - "Newly allocated byte %"PRIzu"/%"PRIzu - " isn't junk-filled", i, sz); + "Newly allocated byte %zu/%zu isn't " + "junk-filled", i, sz); } s[i] = 'a'; } @@ -111,15 +111,15 @@ test_junk(size_t sz_min, size_t sz_max) assert_ptr_not_null((void *)s, "Unexpected rallocx() failure"); assert_true(!opt_junk_free || saw_junking, - "Expected region of size %"PRIzu" to be " - "junk-filled", sz); + "Expected region of size %zu to be junk-filled", + sz); } } watch_junking(s); dallocx(s, 0); assert_true(!opt_junk_free || saw_junking, - "Expected region of size %"PRIzu" to be junk-filled", sz); + "Expected region of size %zu to be junk-filled", sz); if (opt_junk_free) { arena_dalloc_junk_small = arena_dalloc_junk_small_orig; diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 3d75bd0..305c08a 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -72,7 +72,7 @@ TEST_BEGIN(test_rtree_bits) &node, "rtree_get() should return " "previously set value and ignore " "insignificant key bits; i=%u, j=%u, k=%u, " - "set key=%#"PRIxPTR", get key=%#"PRIxPTR, i, + "set key=%#"FMTxPTR", get key=%#"FMTxPTR, i, j, k, keys[j], keys[k]); } assert_ptr_null(rtree_get(&rtree, -- cgit v0.12 From 13473c7c66a81a4dc1cf11a97e9c8b1dbb785b64 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 23 Jul 2015 14:08:49 -0700 Subject: Force lazy_lock on MinGW. This resolves #83. --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index f7e6d08..0497eaf 100644 --- a/configure.ac +++ b/configure.ac @@ -340,6 +340,7 @@ case "${host}" in *-*-mingw* | *-*-cygwin*) abi="pecoff" force_tls="0" + force_lazy_lock="1" RPATH="" so="dll" if test "x$je_cv_msvc" = "xyes" ; then -- cgit v0.12 From 71cd2f08ff1775c1265d0b4a7967f10da867bd83 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 23 Jul 2015 15:50:09 -0700 Subject: Leave PRI* macros defined after using them to define FMT*. Macro expansion happens too late for the #undef directives to work as a mechanism for preventing accidental direct use of the PRI* macros. 
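The failure mode is easy to reproduce in isolation; a contrived sketch (not jemalloc code) of why the #undef approach breaks compilation:

#include <inttypes.h>
#include <stdio.h>

#define FMTu64	PRIu64		/* stores the *token* PRIu64, unexpanded */
#undef PRIu64			/* intended to block direct PRIu64 use... */

void
print_it(uint64_t v)
{

	/*
	 * ...but FMTu64 only expands here, to the now-undefined PRIu64, so
	 * this line no longer compiles.  Hence the #undefs have to go.
	 */
	printf("%"FMTu64"\n", v);
}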
--- include/jemalloc/internal/util.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index ba42df7..fac2a17 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -29,17 +29,6 @@ # define FMTdPTR PRIdPTR # define FMTuPTR PRIuPTR # define FMTxPTR PRIxPTR - -/* Prevent PRI* macros from accidentally being used. */ -# undef PRId32 -# undef PRIu32 -# undef PRIx32 -# undef PRId64 -# undef PRIu64 -# undef PRIx64 -# undef PRIdPTR -# undef PRIuPTR -# undef PRIxPTR #endif /* Size of stack-allocated buffer passed to buferror(). */ -- cgit v0.12 From 50883deb6eb532e5a16529a1ca009fb2ad4a0dc3 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 23 Jul 2015 17:13:18 -0700 Subject: Change arena_palloc_large() parameter from size to usize. This change merely documents that arena_palloc_large() always receives usize as its argument. --- src/arena.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/arena.c b/src/arena.c index 10cd0d2..e2f49bd 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2084,7 +2084,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) /* Only handles large allocations that require more than page alignment. */ static void * -arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, +arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, bool zero) { void *ret; @@ -2094,14 +2094,14 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, arena_chunk_map_misc_t *miscelm; void *rpages; - assert(size == PAGE_CEILING(size)); + assert(usize == PAGE_CEILING(usize)); arena = arena_choose(tsd, arena); if (unlikely(arena == NULL)) return (NULL); alignment = PAGE_CEILING(alignment); - alloc_size = size + large_pad + alignment - PAGE; + alloc_size = usize + large_pad + alignment - PAGE; malloc_mutex_lock(&arena->lock); run = arena_run_alloc_large(arena, alloc_size, false); @@ -2115,8 +2115,8 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, leadsize = ALIGNMENT_CEILING((uintptr_t)rpages, alignment) - (uintptr_t)rpages; - assert(alloc_size >= leadsize + size); - trailsize = alloc_size - leadsize - size - large_pad; + assert(alloc_size >= leadsize + usize); + trailsize = alloc_size - leadsize - usize - large_pad; if (leadsize != 0) { arena_chunk_map_misc_t *head_miscelm = miscelm; arena_run_t *head_run = run; @@ -2130,18 +2130,18 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, alloc_size - leadsize); } if (trailsize != 0) { - arena_run_trim_tail(arena, chunk, run, size + large_pad + - trailsize, size + large_pad, false); + arena_run_trim_tail(arena, chunk, run, usize + large_pad + + trailsize, usize + large_pad, false); } - arena_run_init_large(arena, run, size + large_pad, zero); + arena_run_init_large(arena, run, usize + large_pad, zero); ret = arena_miscelm_to_rpages(miscelm); if (config_stats) { - index_t index = size2index(size) - NBINS; + index_t index = size2index(usize) - NBINS; arena->stats.nmalloc_large++; arena->stats.nrequests_large++; - arena->stats.allocated_large += size; + arena->stats.allocated_large += usize; arena->stats.lstats[index].nmalloc++; arena->stats.lstats[index].nrequests++; arena->stats.lstats[index].curruns++; @@ -2150,9 +2150,9 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, if (config_fill && !zero) { if (unlikely(opt_junk_alloc)) - 
memset(ret, 0xa5, size); + memset(ret, 0xa5, usize); else if (unlikely(opt_zero)) - memset(ret, 0, size); + memset(ret, 0, usize); } return (ret); } -- cgit v0.12 From 4becdf21dceeeaaeec37043ce59d654c214363e2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 23 Jul 2015 17:14:11 -0700 Subject: Fix sa2u() regression. Take large_pad into account when determining whether an aligned allocation can be satisfied by a large size class. This regression was introduced by 8a03cf039cd06f9fa6972711195055d865673966 (Implement cache index randomization for large allocations.). --- include/jemalloc/internal/jemalloc_internal.h.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 706c198..29aa802 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -708,7 +708,7 @@ sa2u(size_t size, size_t alignment) * Calculate the size of the over-size run that arena_palloc() * would need to allocate in order to guarantee the alignment. */ - if (usize + alignment - PAGE <= arena_maxrun) + if (usize + large_pad + alignment - PAGE <= arena_maxrun) return (usize); } -- cgit v0.12 From 87ccb5554769d915338b9a980d36359a5c6ec3fa Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 23 Jul 2015 17:16:32 -0700 Subject: Fix huge_palloc() to handle size rather than usize input. huge_ralloc() passes a size that may not be precisely a size class, so make huge_palloc() handle the more general case of a size input rather than usize. This regression appears to have been introduced by the addition of in-place huge reallocation; as such it was never incorporated into a release. --- include/jemalloc/internal/huge.h | 2 +- src/huge.c | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index c478d16..8b6c6ce 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -11,7 +11,7 @@ void *huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, tcache_t *tcache); -void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, +void *huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, bool zero, tcache_t *tcache); bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); diff --git a/src/huge.c b/src/huge.c index 6e6824d..d1a9586 100644 --- a/src/huge.c +++ b/src/huge.c @@ -46,15 +46,21 @@ huge_malloc(tsd_t *tsd, arena_t *arena, size_t size, bool zero, } void * -huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, +huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, bool zero, tcache_t *tcache) { void *ret; + size_t usize; extent_node_t *node; bool is_zeroed; /* Allocate one or more contiguous chunks for this request. */ + usize = sa2u(size, alignment); + if (unlikely(usize == 0)) + return (NULL); + assert(usize >= chunksize); + /* Allocate an extent node with which to track the chunk. 
*/ node = ipallocztm(tsd, CACHELINE_CEILING(sizeof(extent_node_t)), CACHELINE, false, tcache, true, arena); @@ -68,15 +74,15 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, is_zeroed = zero; arena = arena_choose(tsd, arena); if (unlikely(arena == NULL) || (ret = arena_chunk_alloc_huge(arena, - usize, alignment, &is_zeroed)) == NULL) { + size, alignment, &is_zeroed)) == NULL) { idalloctm(tsd, node, tcache, true); return (NULL); } - extent_node_init(node, arena, ret, usize, is_zeroed); + extent_node_init(node, arena, ret, size, is_zeroed); if (huge_node_set(ret, node)) { - arena_chunk_dalloc_huge(arena, ret, usize); + arena_chunk_dalloc_huge(arena, ret, size); idalloctm(tsd, node, tcache, true); return (NULL); } @@ -89,9 +95,9 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, if (zero || (config_fill && unlikely(opt_zero))) { if (!is_zeroed) - memset(ret, 0, usize); + memset(ret, 0, size); } else if (config_fill && unlikely(opt_junk_alloc)) - memset(ret, 0xa5, usize); + memset(ret, 0xa5, size); return (ret); } -- cgit v0.12 From 40cbd30d508b0d4e6462f5c36ffdbf6c1f29da22 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 24 Jul 2015 18:18:03 -0700 Subject: Fix huge_ralloc_no_move() to succeed more often. Fix huge_ralloc_no_move() to succeed if an allocation request results in the same usable size as the existing allocation, even if the request size is smaller than the usable size. This bug did not cause correctness issues, but it could cause unnecessary moves during reallocation. --- src/huge.c | 2 +- test/integration/rallocx.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/huge.c b/src/huge.c index d1a9586..a7993f8 100644 --- a/src/huge.c +++ b/src/huge.c @@ -298,7 +298,7 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, * the new size. */ if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize) - && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { + && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(s2u(size+extra))) { huge_ralloc_no_move_similar(ptr, oldsize, usize, size, extra, zero); return (false); diff --git a/test/integration/rallocx.c b/test/integration/rallocx.c index b698072..8b6cde3 100644 --- a/test/integration/rallocx.c +++ b/test/integration/rallocx.c @@ -55,8 +55,9 @@ validate_fill(const void *p, uint8_t c, size_t offset, size_t len) for (i = 0; i < len; i++) { uint8_t b = buf[offset+i]; if (b != c) { - test_fail("Allocation at %p contains %#x rather than " - "%#x at offset %zu", p, b, c, offset+i); + test_fail("Allocation at %p (len=%zu) contains %#x " + "rather than %#x at offset %zu", p, len, b, c, + offset+i); ret = true; } } -- cgit v0.12 From d059b9d6a1ac3e7f834260ba001bf0d1599fb0bf Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 24 Jul 2015 18:21:42 -0700 Subject: Implement support for non-coalescing maps on MinGW. - Do not reallocate huge objects in place if the number of backing chunks would change. - Do not cache multi-chunk mappings. This resolves #213. 
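The underlying constraint, sketched outside of jemalloc for clarity (error handling omitted; the 2 MiB size is arbitrary): VirtualFree(..., 0, MEM_RELEASE) must be given the exact base address of an entire prior VirtualAlloc() reservation, whereas munmap() can unmap any page-aligned subrange, so multi-chunk Windows mappings cannot later be split or coalesced the way the chunk cache assumes.

#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#endif

#define SZ	(2 * 1024 * 1024)	/* two hypothetical 1 MiB "chunks" */

void
map_and_release_half(void)
{
#ifdef _WIN32
	void *p = VirtualAlloc(NULL, SZ, MEM_RESERVE | MEM_COMMIT,
	    PAGE_READWRITE);

	/* The whole reservation must be released at its original base;
	 * there is no way to hand back only the second half. */
	VirtualFree(p, 0, MEM_RELEASE);
#else
	void *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);

	/* munmap() can unmap an arbitrary page-aligned subrange. */
	munmap((char *)p + SZ / 2, SZ / 2);
#endif
}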
--- INSTALL | 5 ++++- configure.ac | 12 ++++++++++++ include/jemalloc/internal/jemalloc_internal.h.in | 7 +++++++ include/jemalloc/internal/jemalloc_internal_defs.h.in | 9 +++++++++ src/chunk.c | 6 ++++++ src/huge.c | 3 +++ test/integration/chunk.c | 6 +++--- 7 files changed, 44 insertions(+), 4 deletions(-) diff --git a/INSTALL b/INSTALL index 8d39687..5413ae8 100644 --- a/INSTALL +++ b/INSTALL @@ -150,7 +150,10 @@ any of the following arguments (not a definitive list) to 'configure': the virtual memory for later use. munmap() is disabled by default (i.e. --disable-munmap is implied) on Linux, which has a quirk in its virtual memory allocation algorithm that causes semi-permanent VM map holes under - normal jemalloc operation. + normal jemalloc operation. Conversely, munmap() (actually VirtualFree()) is + forcefully enabled on MinGW because virtual memory mappings do not + automatically coalesce (nor fragment on demand), and extra bookkeeping + would be required to track mapping boundaries. --disable-fill Disable support for junk/zero filling of memory, quarantine, and redzones. diff --git a/configure.ac b/configure.ac index 0497eaf..502dd39 100644 --- a/configure.ac +++ b/configure.ac @@ -258,6 +258,7 @@ dnl Define cpp macros in CPPFLAGS, rather than doing AC_DEFINE(macro), since the dnl definitions need to be seen before any headers are included, which is a pain dnl to make happen otherwise. default_munmap="1" +maps_coalesce="1" case "${host}" in *-*-darwin* | *-*-ios*) CFLAGS="$CFLAGS" @@ -341,6 +342,7 @@ case "${host}" in abi="pecoff" force_tls="0" force_lazy_lock="1" + maps_coalesce="0" RPATH="" so="dll" if test "x$je_cv_msvc" = "xyes" ; then @@ -862,6 +864,12 @@ if test "x$enable_tcache" = "x1" ; then fi AC_SUBST([enable_tcache]) +dnl Indicate whether adjacent virtual memory mappings automatically coalesce +dnl (and fragment on demand). +if test "x${maps_coalesce}" = "x1" ; then + AC_DEFINE([JEMALLOC_MAPS_COALESCE], [ ]) +fi + dnl Enable VM deallocation via munmap() by default. AC_ARG_ENABLE([munmap], [AS_HELP_STRING([--disable-munmap], [Disable VM deallocation via munmap(2)])], @@ -873,6 +881,10 @@ fi ], [enable_munmap="${default_munmap}"] ) +if test "x$enable_munmap" = "x0" -a "x${maps_coalesce}" = "x0" ; then + AC_MSG_RESULT([Forcing munmap to avoid non-coalescing map issues]) + enable_munmap="1" +fi if test "x$enable_munmap" = "x1" ; then AC_DEFINE([JEMALLOC_MUNMAP], [ ]) fi diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 29aa802..496997d 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -70,6 +70,13 @@ static const bool config_prof_libunwind = false #endif ; +static const bool maps_coalesce = +#ifdef JEMALLOC_MAPS_COALESCE + true +#else + false +#endif + ; static const bool config_munmap = #ifdef JEMALLOC_MUNMAP true diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index ed8347a..b0f8caa 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -170,6 +170,15 @@ #undef LG_PAGE /* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. 
+ * mappings do *not* coalesce/fragment. + */ +#undef JEMALLOC_MAPS_COALESCE + +/* * If defined, use munmap() to unmap freed chunks, rather than storing them for * later reuse. This is disabled by default on Linux because common sequences * of mmap()/munmap() calls will cause virtual memory map holes. diff --git a/src/chunk.c b/src/chunk.c index 5945482..7a4ede8 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -337,6 +337,7 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, extent_node_t *node, *prev; extent_node_t key; + assert(maps_coalesce || size == chunksize); assert(!cache || !zeroed); unzeroed = cache || !zeroed; JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); @@ -421,6 +422,11 @@ chunk_dalloc_cache(arena_t *arena, void *chunk, size_t size) assert(size != 0); assert((size & chunksize_mask) == 0); + if (!maps_coalesce && size != chunksize) { + chunk_dalloc_arena(arena, chunk, size, false); + return; + } + chunk_record(arena, &arena->chunks_szad_cache, &arena->chunks_ad_cache, true, chunk, size, false); arena_maybe_purge(arena); diff --git a/src/huge.c b/src/huge.c index a7993f8..7cd0d7d 100644 --- a/src/huge.c +++ b/src/huge.c @@ -304,6 +304,9 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, return (false); } + if (!maps_coalesce) + return (true); + /* Shrink the allocation in-place. */ if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize)) { huge_ralloc_no_move_shrink(ptr, oldsize, usize); diff --git a/test/integration/chunk.c b/test/integration/chunk.c index de45bc5..c94b2d4 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -63,9 +63,9 @@ TEST_BEGIN(test_chunk) "Unexpected arenas.hchunk.2.size failure"); if (huge0 * 2 > huge2) { /* - * There are at least four size classes per doubling, so - * xallocx() from size=huge2 to size=huge1 is guaranteed to - * leave trailing purgeable memory. + * There are at least four size classes per doubling, so a + * successful xallocx() from size=huge2 to size=huge1 is + * guaranteed to leave trailing purgeable memory. */ p = mallocx(huge2, 0); assert_ptr_not_null(p, "Unexpected mallocx() error"); -- cgit v0.12 From b49a334a645b854dbb1649f15c38d646fee66738 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 28 Jul 2015 11:28:19 -0400 Subject: Generalize chunk management hooks. Add the "arena.<i>.chunk_hooks" mallctl, which replaces and expands on the "arena.<i>.chunk.{alloc,dalloc,purge}" mallctls. The chunk hooks allow control over chunk allocation/deallocation, decommit/commit, purging, and splitting/merging, such that the application can rely on jemalloc's internal chunk caching and retaining functionality, yet implement a variety of chunk management mechanisms and policies. Merge the chunks_[sz]ad_{mmap,dss} red-black trees into chunks_[sz]ad_retained. This slightly reduces how hard jemalloc tries to honor the dss precedence setting; prior to this change the precedence setting was also consulted when recycling chunks. Fix chunk purging. Don't purge chunks in arena_purge_stashed(); instead deallocate them in arena_unstash_purged(), so that the dirty memory linkage remains valid until after the last time it is used. This resolves #176 and #201.
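As a usage sketch (not part of the patch), the program below installs logging chunk hooks for arena 0 through the new "arena.<i>.chunk_hooks" mallctl. It first reads the hooks currently installed so that the wrappers can delegate to them, which is how the documentation below suggests handling chunks the replacement hooks do not recognize. The hook signatures follow the typedefs this patch adds to jemalloc_typedefs.h.in; the wrapper names, the choice of arena 0, and the 8 MiB test allocation are illustrative assumptions rather than part of jemalloc's API.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

/* Hooks being replaced; the wrappers delegate to these. */
static chunk_hooks_t orig_hooks;

/* Matches chunk_alloc_t: void *(void *, size_t, size_t, bool *, unsigned). */
static void *
log_chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero,
    unsigned arena_ind)
{
	void *ret = orig_hooks.alloc(new_addr, size, alignment, zero,
	    arena_ind);

	fprintf(stderr, "chunk alloc: size=%zu -> %p (arena %u)\n", size, ret,
	    arena_ind);
	return (ret);
}

/* Matches chunk_dalloc_t: bool (void *, size_t, unsigned). */
static bool
log_chunk_dalloc(void *chunk, size_t size, unsigned arena_ind)
{

	fprintf(stderr, "chunk dalloc: %p size=%zu (arena %u)\n", chunk, size,
	    arena_ind);
	/* Pass chunks through to the replaced hook. */
	return (orig_hooks.dalloc(chunk, size, arena_ind));
}

int
main(void)
{
	chunk_hooks_t new_hooks;
	size_t len = sizeof(orig_hooks);

	/* Read the current hooks so the wrappers can delegate to them. */
	if (mallctl("arena.0.chunk_hooks", &orig_hooks, &len, NULL, 0) != 0)
		return (1);

	new_hooks = orig_hooks;
	new_hooks.alloc = log_chunk_alloc;
	new_hooks.dalloc = log_chunk_dalloc;
	if (mallctl("arena.0.chunk_hooks", NULL, NULL, &new_hooks,
	    sizeof(new_hooks)) != 0)
		return (1);

	/* A multi-megabyte allocation is large enough to exercise the hooks. */
	free(malloc(8 * 1024 * 1024));
	return (0);
}

Because the wrappers forward every request to the saved hooks, chunks created before the switch (or handed back by other code paths) are still handled correctly, which is the pattern the manual text below recommends.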
--- ChangeLog | 3 +- Makefile.in | 7 +- doc/jemalloc.xml.in | 201 ++++++++----- include/jemalloc/internal/arena.h | 19 +- include/jemalloc/internal/chunk.h | 44 +-- include/jemalloc/internal/chunk_mmap.h | 2 - include/jemalloc/internal/extent.h | 28 +- include/jemalloc/internal/jemalloc_internal.h.in | 4 + include/jemalloc/internal/pages.h | 26 ++ include/jemalloc/internal/private_symbols.txt | 12 +- include/jemalloc/jemalloc_typedefs.h.in | 54 +++- src/arena.c | 184 ++++++------ src/base.c | 4 +- src/chunk.c | 346 ++++++++++++++++------- src/chunk_dss.c | 8 +- src/chunk_mmap.c | 131 --------- src/ctl.c | 75 +++-- src/huge.c | 44 ++- src/pages.c | 167 +++++++++++ test/integration/chunk.c | 216 +++++++++++--- 20 files changed, 1022 insertions(+), 553 deletions(-) create mode 100644 include/jemalloc/internal/pages.h create mode 100644 src/pages.c diff --git a/ChangeLog b/ChangeLog index fe62e52..ed5777d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -37,8 +37,7 @@ brevity. Much more detail can be found in the git revision history: "opt.prof_thread_active_init", "prof.thread_active_init", and "thread.prof.active" mallctls. - Add support for per arena application-specified chunk allocators, configured - via the "arena.chunk.alloc", "arena.chunk.dalloc", and - "arena..chunk.purge" mallctls. + via the "arena..chunk_hooks" mallctl. - Refactor huge allocation to be managed by arenas, so that arenas now function as general purpose independent allocators. This is important in the context of user-specified chunk allocators, aside from the scalability diff --git a/Makefile.in b/Makefile.in index 25c2d5a..5084b1a 100644 --- a/Makefile.in +++ b/Makefile.in @@ -82,9 +82,10 @@ C_SRCS := $(srcroot)src/jemalloc.c $(srcroot)src/arena.c \ $(srcroot)src/chunk.c $(srcroot)src/chunk_dss.c \ $(srcroot)src/chunk_mmap.c $(srcroot)src/ckh.c $(srcroot)src/ctl.c \ $(srcroot)src/extent.c $(srcroot)src/hash.c $(srcroot)src/huge.c \ - $(srcroot)src/mb.c $(srcroot)src/mutex.c $(srcroot)src/prof.c \ - $(srcroot)src/quarantine.c $(srcroot)src/rtree.c $(srcroot)src/stats.c \ - $(srcroot)src/tcache.c $(srcroot)src/util.c $(srcroot)src/tsd.c + $(srcroot)src/mb.c $(srcroot)src/mutex.c $(srcroot)src/pages.c \ + $(srcroot)src/prof.c $(srcroot)src/quarantine.c $(srcroot)src/rtree.c \ + $(srcroot)src/stats.c $(srcroot)src/tcache.c $(srcroot)src/util.c \ + $(srcroot)src/tsd.c ifeq ($(enable_valgrind), 1) C_SRCS += $(srcroot)src/valgrind.c endif diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index dbbe837..4cb74a0 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1518,18 +1518,48 @@ malloc_conf = "xmalloc:true";]]> for additional information. - + - arena.<i>.chunk.alloc - (chunk_alloc_t *) + arena.<i>.chunk_hooks + (chunk_hooks_t) rw - Get or set the chunk allocation function for arena - <i>. If setting, the chunk deallocation function should - also be set via - arena.<i>.chunk.dalloc to a companion - function that knows how to deallocate the chunks. - + Get or set the chunk management hook functions for arena + <i>. The functions must be capable of operating on all extant + chunks associated with arena <i>, usually by passing unknown + chunks to the replaced functions. 
In practice, it is feasible to + control allocation for arenas created via arenas.extend such + that all chunks originate from an application-supplied chunk allocator + (by setting custom chunk hook functions just after arena creation), but + the automatically created arenas may have already created chunks prior + to the application having an opportunity to take over chunk + allocation. + + + The chunk_hooks_t structure comprises function pointers + which are described individually below. jemalloc uses these + functions to manage chunk lifetime, which starts off with allocation of + mapped committed memory, in the simplest case followed by deallocation. + However, there are performance and platform reasons to retain chunks for + later reuse. Cleanup attempts cascade from deallocation to decommit to + purging, which gives the chunk management functions opportunities to + reject the most permanent cleanup operations in favor of less permanent + (and often less costly) operations. The chunk splitting and merging + operations can also be opted out of, but this is mainly intended to + support platforms on which virtual memory mappings provided by the + operating system kernel do not automatically coalesce and split. + + typedef void *(chunk_alloc_t) void *chunk size_t size @@ -1539,9 +1569,9 @@ malloc_conf = "xmalloc:true";]]> A chunk allocation function conforms to the chunk_alloc_t type and upon success returns a pointer to size - bytes of memory on behalf of arena arena_ind such - that the chunk's base address is a multiple of - alignment, as well as setting + bytes of mapped committed memory on behalf of arena + arena_ind such that the chunk's base address is a + multiple of alignment, as well as setting *zero to indicate whether the chunk is zeroed. Upon error the function returns NULL and leaves *zero unmodified. The @@ -1550,34 +1580,16 @@ malloc_conf = "xmalloc:true";]]> of two at least as large as the chunk size. Zeroing is mandatory if *zero is true upon function entry. If chunk is not NULL, the - returned pointer must be chunk or - NULL if it could not be allocated. - - Note that replacing the default chunk allocation function makes - the arena's chunk on success or + NULL on error. Committed memory may be committed + in absolute terms as on a system that does not overcommit, or in + implicit terms as on a system that overcommits and satisfies physical + memory needs on demand via soft page faults. Note that replacing the + default chunk allocation function makes the arena's arena.<i>.dss - setting irrelevant. - + setting irrelevant. - - - arena.<i>.chunk.dalloc - (chunk_dalloc_t *) - rw - - Get or set the chunk deallocation function for arena - <i>. If setting, the chunk deallocation function must - be capable of deallocating all extant chunks associated with arena - <i>, usually by passing unknown chunks to the deallocation - function that was replaced. In practice, it is feasible to control - allocation for arenas created via arenas.extend such - that all chunks originate from an application-supplied chunk allocator - (by setting custom chunk allocation/deallocation/purge functions just - after arena creation), but the automatically created arenas may have - already created chunks prior to the application having an opportunity to - take over chunk allocation. 
- + typedef bool (chunk_dalloc_t) void *chunk size_t size @@ -1587,46 +1599,99 @@ malloc_conf = "xmalloc:true";]]> chunk_dalloc_t type and deallocates a chunk of given size on behalf of arena arena_ind, returning false upon - success. - + success. If the function returns true, this indicates opt-out from + deallocation; the virtual memory mapping associated with the chunk + remains mapped, committed, and available for future use, in which case + it will be automatically retained for later reuse. - - - arena.<i>.chunk.purge - (chunk_purge_t *) - rw - - Get or set the chunk purge function for arena <i>. - A chunk purge function optionally discards physical pages associated - with pages in the chunk's virtual memory range but leaves the virtual - memory mapping intact, and indicates via its return value whether pages - in the virtual memory range will be zero-filled the next time they are - accessed. If setting, the chunk purge function must be capable of - purging all extant chunks associated with arena <i>, usually by - passing unknown chunks to the purge function that was replaced. In - practice, it is feasible to control allocation for arenas created via - arenas.extend - such that all chunks originate from an application-supplied chunk - allocator (by setting custom chunk allocation/deallocation/purge - functions just after arena creation), but the automatically created - arenas may have already created chunks prior to the application having - an opportunity to take over chunk allocation. - + + typedef bool (chunk_commit_t) + void *chunk + size_t size + unsigned arena_ind + + A chunk commit function conforms to the chunk_commit_t type + and commits zeroed physical memory to back a + chunk of given size on + behalf of arena arena_ind, returning false upon + success. Committed memory may be committed in absolute terms as on a + system that does not overcommit, or in implicit terms as on a system + that overcommits and satisfies physical memory needs on demand via soft + page faults. If the function returns true, this indicates insufficient + physical memory to satisfy the request. + + + typedef bool (chunk_decommit_t) + void *chunk + size_t size + unsigned arena_ind + + A chunk decommit function conforms to the chunk_decommit_t + type and decommits any physical memory that is backing a + chunk of given size on + behalf of arena arena_ind, returning false upon + success, in which case the chunk will be committed via the chunk commit + function before being reused. If the function returns true, this + indicates opt-out from decommit; the memory remains committed and + available for future use, in which case it will be automatically + retained for later reuse. + + typedef bool (chunk_purge_t) void *chunk + size_tsize size_t offset size_t length unsigned arena_ind A chunk purge function conforms to the chunk_purge_t type - and purges pages within chunk at - offset bytes, extending for - length on behalf of arena + and optionally discards physical pages within the virtual memory mapping + associated with chunk of given + size at offset bytes, + extending for length on behalf of arena arena_ind, returning false if pages within the purged virtual memory range will be zero-filled the next time they are - accessed. Note that the memory range being purged may span multiple - contiguous chunks, e.g. when purging memory that backed a huge - allocation. + accessed. 
+ + + typedef bool (chunk_split_t) + void *chunk + size_t size + size_t size_a + size_t size_b + bool committed + unsigned arena_ind + + A chunk split function conforms to the chunk_split_t type + and optionally splits chunk of given + size into two adjacent chunks, the first of + size_a bytes, and the second of + size_b bytes, operating on + committed/decommitted memory as indicated, on + behalf of arena arena_ind, returning false upon + success. If the function returns true, this indicates that the chunk + remains unsplit and therefore should continue to be operated on as a + whole. + + + typedef bool (chunk_merge_t) + void *chunk_a + size_t size_a + void *chunk_b + size_t size_b + bool committed + unsigned arena_ind + + A chunk merge function conforms to the chunk_merge_t type + and optionally merges adjacent chunks, chunk_a of + given size_a and chunk_b + of given size_b into one contiguous chunk, + operating on committed/decommitted memory as + indicated, on behalf of arena arena_ind, + returning false upon success. If the function returns true, this + indicates that the chunks remain distinct mappings and therefore should + continue to be operated on independently. + diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 8811f2e..29f73e7 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -379,23 +379,18 @@ struct arena_s { * orderings are needed, which is why there are two trees with the same * contents. */ - extent_tree_t chunks_szad_cache; - extent_tree_t chunks_ad_cache; - extent_tree_t chunks_szad_mmap; - extent_tree_t chunks_ad_mmap; - extent_tree_t chunks_szad_dss; - extent_tree_t chunks_ad_dss; + extent_tree_t chunks_szad_cached; + extent_tree_t chunks_ad_cached; + extent_tree_t chunks_szad_retained; + extent_tree_t chunks_ad_retained; + malloc_mutex_t chunks_mtx; /* Cache of nodes that were allocated via base_alloc(). */ ql_head(extent_node_t) node_cache; malloc_mutex_t node_cache_mtx; - /* - * User-configurable chunk allocation/deallocation/purge functions. - */ - chunk_alloc_t *chunk_alloc; - chunk_dalloc_t *chunk_dalloc; - chunk_purge_t *chunk_purge; + /* User-configurable chunk hook functions. */ + chunk_hooks_t chunk_hooks; /* bins is used to store trees of free regions. */ arena_bin_t bins[NBINS]; diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 91aefad..8e51134 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -19,6 +19,16 @@ #define CHUNK_CEILING(s) \ (((s) + chunksize_mask) & ~chunksize_mask) +#define CHUNK_HOOKS_INITIALIZER { \ + NULL, \ + NULL, \ + NULL, \ + NULL, \ + NULL, \ + NULL, \ + NULL \ +} + #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS @@ -36,30 +46,30 @@ extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). 
*/ extern size_t chunk_npages; +extern const chunk_hooks_t chunk_hooks_default; + +chunk_hooks_t chunk_hooks_get(arena_t *arena); +chunk_hooks_t chunk_hooks_set(arena_t *arena, + const chunk_hooks_t *chunk_hooks); + bool chunk_register(const void *chunk, const extent_node_t *node); void chunk_deregister(const void *chunk, const extent_node_t *node); void *chunk_alloc_base(size_t size); -void *chunk_alloc_cache(arena_t *arena, void *new_addr, size_t size, - size_t alignment, bool *zero, bool dalloc_node); -void *chunk_alloc_default(void *new_addr, size_t size, size_t alignment, - bool *zero, unsigned arena_ind); -void *chunk_alloc_wrapper(arena_t *arena, chunk_alloc_t *chunk_alloc, +void *chunk_alloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *new_addr, size_t size, size_t alignment, bool *zero, + bool dalloc_node); +void *chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment, bool *zero); -void chunk_record(arena_t *arena, extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, bool cache, void *chunk, size_t size, - bool zeroed); -void chunk_dalloc_cache(arena_t *arena, void *chunk, size_t size); -void chunk_dalloc_arena(arena_t *arena, void *chunk, size_t size, - bool zeroed); -bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind); -void chunk_dalloc_wrapper(arena_t *arena, chunk_dalloc_t *chunk_dalloc, +void chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *chunk, size_t size); +void chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *chunk, size_t size, bool zeroed); +void chunk_dalloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, size_t size); bool chunk_purge_arena(arena_t *arena, void *chunk, size_t offset, size_t length); -bool chunk_purge_default(void *chunk, size_t offset, size_t length, - unsigned arena_ind); -bool chunk_purge_wrapper(arena_t *arena, chunk_purge_t *chunk_purge, - void *chunk, size_t offset, size_t length); +bool chunk_purge_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *chunk, size_t size, size_t offset, size_t length); bool chunk_boot(void); void chunk_prefork(void); void chunk_postfork_parent(void); diff --git a/include/jemalloc/internal/chunk_mmap.h b/include/jemalloc/internal/chunk_mmap.h index c5d5c6c..e81dc3a 100644 --- a/include/jemalloc/internal/chunk_mmap.h +++ b/include/jemalloc/internal/chunk_mmap.h @@ -9,8 +9,6 @@ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -bool pages_purge(void *addr, size_t length); - void *chunk_alloc_mmap(size_t size, size_t alignment, bool *zero); bool chunk_dalloc_mmap(void *chunk, size_t size); diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 3751adc..b2ac2b6 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -19,6 +19,13 @@ struct extent_node_s { size_t en_size; /* + * True if physical memory is committed to the extent, whether + * explicitly or implicitly as on a system that overcommits and + * satisfies physical memory needs on demand via soft page faults. + */ + bool en_committed; + + /* * The zeroed flag is used by chunk recycling code to track whether * memory is zero-filled.
*/ @@ -66,17 +73,19 @@ rb_proto(, extent_tree_ad_, extent_tree_t, extent_node_t) arena_t *extent_node_arena_get(const extent_node_t *node); void *extent_node_addr_get(const extent_node_t *node); size_t extent_node_size_get(const extent_node_t *node); +bool extent_node_committed_get(const extent_node_t *node); bool extent_node_zeroed_get(const extent_node_t *node); bool extent_node_achunk_get(const extent_node_t *node); prof_tctx_t *extent_node_prof_tctx_get(const extent_node_t *node); void extent_node_arena_set(extent_node_t *node, arena_t *arena); void extent_node_addr_set(extent_node_t *node, void *addr); void extent_node_size_set(extent_node_t *node, size_t size); +void extent_node_committed_set(extent_node_t *node, bool committed); void extent_node_zeroed_set(extent_node_t *node, bool zeroed); void extent_node_achunk_set(extent_node_t *node, bool achunk); void extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx); void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, - size_t size, bool zeroed); + size_t size, bool committed, bool zeroed); void extent_node_dirty_linkage_init(extent_node_t *node); void extent_node_dirty_insert(extent_node_t *node, arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty); @@ -106,6 +115,13 @@ extent_node_size_get(const extent_node_t *node) } JEMALLOC_INLINE bool +extent_node_committed_get(const extent_node_t *node) +{ + + return (node->en_committed); +} + +JEMALLOC_INLINE bool extent_node_zeroed_get(const extent_node_t *node) { @@ -148,6 +164,13 @@ extent_node_size_set(extent_node_t *node, size_t size) } JEMALLOC_INLINE void +extent_node_committed_set(extent_node_t *node, bool committed) +{ + + node->en_committed = committed; +} + +JEMALLOC_INLINE void extent_node_zeroed_set(extent_node_t *node, bool zeroed) { @@ -170,12 +193,13 @@ extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx) JEMALLOC_INLINE void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size, - bool zeroed) + bool committed, bool zeroed) { extent_node_arena_set(node, arena); extent_node_addr_set(node, addr); extent_node_size_set(node, size); + extent_node_committed_set(node, committed); extent_node_zeroed_set(node, zeroed); extent_node_achunk_set(node, false); if (config_prof) diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 496997d..7a137b6 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -367,6 +367,7 @@ typedef unsigned index_t; #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/base.h" #include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/pages.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" #include "jemalloc/internal/tcache.h" @@ -398,6 +399,7 @@ typedef unsigned index_t; #undef JEMALLOC_ARENA_STRUCTS_B #include "jemalloc/internal/base.h" #include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/pages.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" #include "jemalloc/internal/tcache.h" @@ -477,6 +479,7 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" #include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/pages.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" #include "jemalloc/internal/tcache.h" @@ -503,6 +506,7 @@ void jemalloc_postfork_child(void); #include 
"jemalloc/internal/extent.h" #include "jemalloc/internal/base.h" #include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/pages.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" diff --git a/include/jemalloc/internal/pages.h b/include/jemalloc/internal/pages.h new file mode 100644 index 0000000..da7eb96 --- /dev/null +++ b/include/jemalloc/internal/pages.h @@ -0,0 +1,26 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +void *pages_map(void *addr, size_t size); +void pages_unmap(void *addr, size_t size); +void *pages_trim(void *addr, size_t alloc_size, size_t leadsize, + size_t size); +bool pages_commit(void *addr, size_t size); +bool pages_decommit(void *addr, size_t size); +bool pages_purge(void *addr, size_t size); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ + diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index aaf6978..0e6216f 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -132,14 +132,12 @@ bt_init buferror chunk_alloc_cache chunk_alloc_base -chunk_alloc_default chunk_alloc_dss chunk_alloc_mmap chunk_alloc_wrapper chunk_boot chunk_dalloc_arena chunk_dalloc_cache -chunk_dalloc_default chunk_dalloc_mmap chunk_dalloc_wrapper chunk_deregister @@ -149,6 +147,9 @@ chunk_dss_postfork_parent chunk_dss_prec_get chunk_dss_prec_set chunk_dss_prefork +chunk_hooks_default +chunk_hooks_get +chunk_hooks_set chunk_in_dss chunk_lookup chunk_npages @@ -156,9 +157,7 @@ chunk_postfork_child chunk_postfork_parent chunk_prefork chunk_purge_arena -chunk_purge_default chunk_purge_wrapper -chunk_record chunk_register chunks_rtree chunksize @@ -347,7 +346,12 @@ opt_utrace opt_xmalloc opt_zero p2rz +pages_commit +pages_decommit +pages_map pages_purge +pages_trim +pages_unmap pow2_ceil prof_active_get prof_active_get_unlocked diff --git a/include/jemalloc/jemalloc_typedefs.h.in b/include/jemalloc/jemalloc_typedefs.h.in index d4b4690..26eb9ad 100644 --- a/include/jemalloc/jemalloc_typedefs.h.in +++ b/include/jemalloc/jemalloc_typedefs.h.in @@ -1,3 +1,55 @@ +/* + * void * + * chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero, + * unsigned arena_ind); + */ typedef void *(chunk_alloc_t)(void *, size_t, size_t, bool *, unsigned); + +/* + * bool + * chunk_dalloc(void *chunk, size_t size, unsigned arena_ind); + */ typedef bool (chunk_dalloc_t)(void *, size_t, unsigned); -typedef bool (chunk_purge_t)(void *, size_t, size_t, unsigned); + +/* + * bool + * chunk_commit(void *chunk, size_t size, unsigned arena_ind); + */ +typedef bool (chunk_commit_t)(void *, size_t, unsigned); + +/* + * bool + * chunk_decommit(void *chunk, size_t size, unsigned arena_ind); + */ +typedef bool (chunk_decommit_t)(void *, size_t, unsigned); + +/* + * bool + * chunk_purge(void *chunk, size_t size, size_t offset, size_t length, + * unsigned arena_ind); + */ +typedef bool (chunk_purge_t)(void *, size_t, size_t, 
size_t, unsigned); + +/* + * bool + * chunk_split(void *chunk, size_t size, size_t size_a, size_t size_b, + * bool committed, unsigned arena_ind); + */ +typedef bool (chunk_split_t)(void *, size_t, size_t, size_t, bool, unsigned); + +/* + * bool + * chunk_merge(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b, + * bool committed, unsigned arena_ind); + */ +typedef bool (chunk_merge_t)(void *, size_t, void *, size_t, bool, unsigned); + +typedef struct { + chunk_alloc_t *alloc; + chunk_dalloc_t *dalloc; + chunk_commit_t *commit; + chunk_decommit_t *decommit; + chunk_purge_t *purge; + chunk_split_t *split; + chunk_merge_t *merge; +} chunk_hooks_t; diff --git a/src/arena.c b/src/arena.c index e2f49bd..ceeef81 100644 --- a/src/arena.c +++ b/src/arena.c @@ -516,23 +516,23 @@ static bool arena_chunk_register(arena_t *arena, arena_chunk_t *chunk, bool zero) { - extent_node_init(&chunk->node, arena, chunk, chunksize, zero); + extent_node_init(&chunk->node, arena, chunk, chunksize, true, zero); extent_node_achunk_set(&chunk->node, true); return (chunk_register(chunk, &chunk->node)); } static arena_chunk_t * -arena_chunk_alloc_internal_hard(arena_t *arena, bool *zero) +arena_chunk_alloc_internal_hard(arena_t *arena, chunk_hooks_t *chunk_hooks, + bool *zero) { arena_chunk_t *chunk; - chunk_alloc_t *chunk_alloc = arena->chunk_alloc; - chunk_dalloc_t *chunk_dalloc = arena->chunk_dalloc; malloc_mutex_unlock(&arena->lock); - chunk = (arena_chunk_t *)chunk_alloc_wrapper(arena, chunk_alloc, NULL, + + chunk = (arena_chunk_t *)chunk_alloc_wrapper(arena, chunk_hooks, NULL, chunksize, chunksize, zero); if (chunk != NULL && arena_chunk_register(arena, chunk, *zero)) { - chunk_dalloc_wrapper(arena, chunk_dalloc, (void *)chunk, + chunk_dalloc_wrapper(arena, chunk_hooks, (void *)chunk, chunksize); chunk = NULL; } @@ -545,19 +545,18 @@ static arena_chunk_t * arena_chunk_alloc_internal(arena_t *arena, bool *zero) { arena_chunk_t *chunk; + chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; - if (likely(arena->chunk_alloc == chunk_alloc_default)) { - chunk = chunk_alloc_cache(arena, NULL, chunksize, chunksize, - zero, true); - if (chunk != NULL && arena_chunk_register(arena, chunk, - *zero)) { - chunk_dalloc_cache(arena, chunk, chunksize); - return (NULL); - } - } else - chunk = NULL; - if (chunk == NULL) - chunk = arena_chunk_alloc_internal_hard(arena, zero); + chunk = chunk_alloc_cache(arena, &chunk_hooks, NULL, chunksize, + chunksize, zero, true); + if (chunk != NULL && arena_chunk_register(arena, chunk, *zero)) { + chunk_dalloc_cache(arena, &chunk_hooks, chunk, chunksize); + return (NULL); + } + if (chunk == NULL) { + chunk = arena_chunk_alloc_internal_hard(arena, &chunk_hooks, + zero); + } if (config_stats && chunk != NULL) { arena->stats.mapped += chunksize; @@ -657,7 +656,7 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) if (arena->spare != NULL) { arena_chunk_t *spare = arena->spare; - chunk_dalloc_t *chunk_dalloc; + chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; arena->spare = chunk; if (arena_mapbits_dirty_get(spare, map_bias) != 0) { @@ -667,15 +666,8 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) chunk_deregister(spare, &spare->node); - chunk_dalloc = arena->chunk_dalloc; - if (likely(chunk_dalloc == chunk_dalloc_default)) - chunk_dalloc_cache(arena, (void *)spare, chunksize); - else { - malloc_mutex_unlock(&arena->lock); - chunk_dalloc_wrapper(arena, chunk_dalloc, (void *)spare, - chunksize); - malloc_mutex_lock(&arena->lock); - } + chunk_dalloc_cache(arena, 
&chunk_hooks, (void *)spare, + chunksize); if (config_stats) { arena->stats.mapped -= chunksize; @@ -781,12 +773,12 @@ arena_node_dalloc(arena_t *arena, extent_node_t *node) } static void * -arena_chunk_alloc_huge_hard(arena_t *arena, chunk_alloc_t *chunk_alloc, +arena_chunk_alloc_huge_hard(arena_t *arena, chunk_hooks_t *chunk_hooks, size_t usize, size_t alignment, bool *zero, size_t csize) { void *ret; - ret = chunk_alloc_wrapper(arena, chunk_alloc, NULL, csize, alignment, + ret = chunk_alloc_wrapper(arena, chunk_hooks, NULL, csize, alignment, zero); if (ret == NULL) { /* Revert optimistic stats updates. */ @@ -807,7 +799,7 @@ arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, bool *zero) { void *ret; - chunk_alloc_t *chunk_alloc; + chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; size_t csize = CHUNK_CEILING(usize); malloc_mutex_lock(&arena->lock); @@ -819,15 +811,11 @@ arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, } arena->nactive += (usize >> LG_PAGE); - chunk_alloc = arena->chunk_alloc; - if (likely(chunk_alloc == chunk_alloc_default)) { - ret = chunk_alloc_cache(arena, NULL, csize, alignment, zero, - true); - } else - ret = NULL; + ret = chunk_alloc_cache(arena, &chunk_hooks, NULL, csize, alignment, + zero, true); malloc_mutex_unlock(&arena->lock); if (ret == NULL) { - ret = arena_chunk_alloc_huge_hard(arena, chunk_alloc, usize, + ret = arena_chunk_alloc_huge_hard(arena, &chunk_hooks, usize, alignment, zero, csize); } @@ -839,12 +827,11 @@ arena_chunk_alloc_huge(arena_t *arena, size_t usize, size_t alignment, void arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize) { - chunk_dalloc_t *chunk_dalloc; + chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; size_t csize; csize = CHUNK_CEILING(usize); malloc_mutex_lock(&arena->lock); - chunk_dalloc = arena->chunk_dalloc; if (config_stats) { arena_huge_dalloc_stats_update(arena, usize); arena->stats.mapped -= usize; @@ -852,13 +839,8 @@ arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize) } arena->nactive -= (usize >> LG_PAGE); - if (likely(chunk_dalloc == chunk_dalloc_default)) { - chunk_dalloc_cache(arena, chunk, csize); - malloc_mutex_unlock(&arena->lock); - } else { - malloc_mutex_unlock(&arena->lock); - chunk_dalloc_wrapper(arena, chunk_dalloc, chunk, csize); - } + chunk_dalloc_cache(arena, &chunk_hooks, chunk, csize); + malloc_mutex_unlock(&arena->lock); } void @@ -904,30 +886,23 @@ arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize, arena->nactive -= udiff >> LG_PAGE; if (cdiff != 0) { - chunk_dalloc_t *chunk_dalloc = arena->chunk_dalloc; + chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; void *nchunk = (void *)((uintptr_t)chunk + CHUNK_CEILING(usize)); - if (likely(chunk_dalloc == chunk_dalloc_default)) { - chunk_dalloc_cache(arena, nchunk, cdiff); - malloc_mutex_unlock(&arena->lock); - } else { - malloc_mutex_unlock(&arena->lock); - chunk_dalloc_wrapper(arena, chunk_dalloc, nchunk, - cdiff); - } - } else - malloc_mutex_unlock(&arena->lock); + chunk_dalloc_cache(arena, &chunk_hooks, nchunk, cdiff); + } + malloc_mutex_unlock(&arena->lock); } -bool -arena_chunk_ralloc_huge_expand_hard(arena_t *arena, chunk_alloc_t *chunk_alloc, - size_t oldsize, size_t usize, bool *zero, void *nchunk, size_t udiff, - size_t cdiff) +static bool +arena_chunk_ralloc_huge_expand_hard(arena_t *arena, chunk_hooks_t *chunk_hooks, + void *chunk, size_t oldsize, size_t usize, bool *zero, void *nchunk, + size_t udiff, size_t cdiff) { bool err; - err = 
(chunk_alloc_wrapper(arena, chunk_alloc, nchunk, cdiff, chunksize, + err = (chunk_alloc_wrapper(arena, chunk_hooks, nchunk, cdiff, chunksize, zero) == NULL); if (err) { /* Revert optimistic stats updates. */ @@ -939,6 +914,10 @@ arena_chunk_ralloc_huge_expand_hard(arena_t *arena, chunk_alloc_t *chunk_alloc, } arena->nactive -= (udiff >> LG_PAGE); malloc_mutex_unlock(&arena->lock); + } else if (chunk_hooks->merge(chunk, CHUNK_CEILING(oldsize), nchunk, + cdiff, true, arena->ind)) { + chunk_dalloc_arena(arena, chunk_hooks, nchunk, cdiff, *zero); + err = true; } return (err); } @@ -948,11 +927,13 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, size_t usize, bool *zero) { bool err; - chunk_alloc_t *chunk_alloc; + chunk_hooks_t chunk_hooks; void *nchunk = (void *)((uintptr_t)chunk + CHUNK_CEILING(oldsize)); size_t udiff = usize - oldsize; size_t cdiff = CHUNK_CEILING(usize) - CHUNK_CEILING(oldsize); + chunk_hooks = chunk_hooks_get(arena); + malloc_mutex_lock(&arena->lock); /* Optimistically update stats. */ @@ -962,16 +943,17 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, } arena->nactive += (udiff >> LG_PAGE); - chunk_alloc = arena->chunk_alloc; - if (likely(chunk_alloc == chunk_alloc_default)) { - err = (chunk_alloc_cache(arena, nchunk, cdiff, chunksize, zero, - true) == NULL); - } else - err = true; + err = (chunk_alloc_cache(arena, &arena->chunk_hooks, nchunk, cdiff, + chunksize, zero, true) == NULL); malloc_mutex_unlock(&arena->lock); if (err) { - err = arena_chunk_ralloc_huge_expand_hard(arena, chunk_alloc, - oldsize, usize, zero, nchunk, udiff, cdiff); + err = arena_chunk_ralloc_huge_expand_hard(arena, &chunk_hooks, + chunk, oldsize, usize, zero, nchunk, udiff, + cdiff); + } else if (chunk_hooks.merge(chunk, CHUNK_CEILING(oldsize), nchunk, + cdiff, true, arena->ind)) { + chunk_dalloc_arena(arena, &chunk_hooks, nchunk, cdiff, *zero); + err = true; } if (config_stats && !err) @@ -1198,8 +1180,8 @@ arena_compute_npurge(arena_t *arena, bool all) } static size_t -arena_stash_dirty(arena_t *arena, bool all, size_t npurge, - arena_runs_dirty_link_t *purge_runs_sentinel, +arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks, bool all, + size_t npurge, arena_runs_dirty_link_t *purge_runs_sentinel, extent_node_t *purge_chunks_sentinel) { arena_runs_dirty_link_t *rdelm, *rdelm_next; @@ -1224,7 +1206,7 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, * dalloc_node=false argument to chunk_alloc_cache(). 
*/ zero = false; - chunk = chunk_alloc_cache(arena, + chunk = chunk_alloc_cache(arena, chunk_hooks, extent_node_addr_get(chunkselm), extent_node_size_get(chunkselm), chunksize, &zero, false); @@ -1278,12 +1260,11 @@ arena_stash_dirty(arena_t *arena, bool all, size_t npurge, } static size_t -arena_purge_stashed(arena_t *arena, +arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks, arena_runs_dirty_link_t *purge_runs_sentinel, extent_node_t *purge_chunks_sentinel) { size_t npurged, nmadvise; - chunk_purge_t *chunk_purge; arena_runs_dirty_link_t *rdelm; extent_node_t *chunkselm; @@ -1291,7 +1272,6 @@ arena_purge_stashed(arena_t *arena, nmadvise = 0; npurged = 0; - chunk_purge = arena->chunk_purge; malloc_mutex_unlock(&arena->lock); for (rdelm = qr_next(purge_runs_sentinel, rd_link), chunkselm = qr_next(purge_chunks_sentinel, cc_link); @@ -1299,13 +1279,16 @@ arena_purge_stashed(arena_t *arena, size_t npages; if (rdelm == &chunkselm->rd) { + /* + * Don't actually purge the chunk here because 1) + * chunkselm is embedded in the chunk and must remain + * valid, and 2) we deallocate the chunk in + * arena_unstash_purged(), where it is destroyed, + * decommitted, or purged, depending on chunk + * deallocation policy. + */ size_t size = extent_node_size_get(chunkselm); - bool unzeroed; - npages = size >> LG_PAGE; - unzeroed = chunk_purge_wrapper(arena, chunk_purge, - extent_node_addr_get(chunkselm), 0, size); - extent_node_zeroed_set(chunkselm, !unzeroed); chunkselm = qr_next(chunkselm, cc_link); } else { size_t pageind, run_size, flag_unzeroed, i; @@ -1319,8 +1302,9 @@ arena_purge_stashed(arena_t *arena, npages = run_size >> LG_PAGE; assert(pageind + npages <= chunk_npages); - unzeroed = chunk_purge_wrapper(arena, chunk_purge, - chunk, pageind << LG_PAGE, run_size); + unzeroed = chunk_purge_wrapper(arena, + chunk_hooks, chunk, chunksize, pageind << LG_PAGE, + run_size); flag_unzeroed = unzeroed ? CHUNK_MAP_UNZEROED : 0; /* @@ -1355,14 +1339,14 @@ arena_purge_stashed(arena_t *arena, } static void -arena_unstash_purged(arena_t *arena, +arena_unstash_purged(arena_t *arena, chunk_hooks_t *chunk_hooks, arena_runs_dirty_link_t *purge_runs_sentinel, extent_node_t *purge_chunks_sentinel) { arena_runs_dirty_link_t *rdelm, *rdelm_next; extent_node_t *chunkselm; - /* Deallocate runs. */ + /* Deallocate chunks/runs. 
*/ for (rdelm = qr_next(purge_runs_sentinel, rd_link), chunkselm = qr_next(purge_chunks_sentinel, cc_link); rdelm != purge_runs_sentinel; rdelm = rdelm_next) { @@ -1376,7 +1360,8 @@ arena_unstash_purged(arena_t *arena, extent_node_dirty_remove(chunkselm); arena_node_dalloc(arena, chunkselm); chunkselm = chunkselm_next; - chunk_dalloc_arena(arena, addr, size, zeroed); + chunk_dalloc_arena(arena, chunk_hooks, addr, size, + zeroed); } else { arena_chunk_map_misc_t *miscelm = arena_rd_to_miscelm(rdelm); @@ -1390,6 +1375,7 @@ arena_unstash_purged(arena_t *arena, static void arena_purge(arena_t *arena, bool all) { + chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; size_t npurge, npurgeable, npurged; arena_runs_dirty_link_t purge_runs_sentinel; extent_node_t purge_chunks_sentinel; @@ -1413,13 +1399,13 @@ arena_purge(arena_t *arena, bool all) qr_new(&purge_runs_sentinel, rd_link); extent_node_dirty_linkage_init(&purge_chunks_sentinel); - npurgeable = arena_stash_dirty(arena, all, npurge, &purge_runs_sentinel, - &purge_chunks_sentinel); + npurgeable = arena_stash_dirty(arena, &chunk_hooks, all, npurge, + &purge_runs_sentinel, &purge_chunks_sentinel); assert(npurgeable >= npurge); - npurged = arena_purge_stashed(arena, &purge_runs_sentinel, + npurged = arena_purge_stashed(arena, &chunk_hooks, &purge_runs_sentinel, &purge_chunks_sentinel); assert(npurged == npurgeable); - arena_unstash_purged(arena, &purge_runs_sentinel, + arena_unstash_purged(arena, &chunk_hooks, &purge_runs_sentinel, &purge_chunks_sentinel); arena->purging = false; @@ -2874,21 +2860,17 @@ arena_new(unsigned ind) if (malloc_mutex_init(&arena->huge_mtx)) return (NULL); - extent_tree_szad_new(&arena->chunks_szad_cache); - extent_tree_ad_new(&arena->chunks_ad_cache); - extent_tree_szad_new(&arena->chunks_szad_mmap); - extent_tree_ad_new(&arena->chunks_ad_mmap); - extent_tree_szad_new(&arena->chunks_szad_dss); - extent_tree_ad_new(&arena->chunks_ad_dss); + extent_tree_szad_new(&arena->chunks_szad_cached); + extent_tree_ad_new(&arena->chunks_ad_cached); + extent_tree_szad_new(&arena->chunks_szad_retained); + extent_tree_ad_new(&arena->chunks_ad_retained); if (malloc_mutex_init(&arena->chunks_mtx)) return (NULL); ql_new(&arena->node_cache); if (malloc_mutex_init(&arena->node_cache_mtx)) return (NULL); - arena->chunk_alloc = chunk_alloc_default; - arena->chunk_dalloc = chunk_dalloc_default; - arena->chunk_purge = chunk_purge_default; + arena->chunk_hooks = chunk_hooks_default; /* Initialize bins. */ for (i = 0; i < NBINS; i++) { diff --git a/src/base.c b/src/base.c index df3ddb6..5493d0f 100644 --- a/src/base.c +++ b/src/base.c @@ -66,7 +66,7 @@ base_chunk_alloc(size_t minsize) base_resident += PAGE_CEILING(nsize); } } - extent_node_init(node, NULL, addr, csize, true); + extent_node_init(node, NULL, addr, csize, true, true); return (node); } @@ -90,7 +90,7 @@ base_alloc(size_t size) csize = CACHELINE_CEILING(size); usize = s2u(csize); - extent_node_init(&key, NULL, NULL, usize, false); + extent_node_init(&key, NULL, NULL, usize, true, false); malloc_mutex_lock(&base_mtx); node = extent_tree_szad_nsearch(&base_avail_szad, &key); if (node != NULL) { diff --git a/src/chunk.c b/src/chunk.c index 7a4ede8..cdd5311 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -18,7 +18,103 @@ size_t chunksize; size_t chunksize_mask; /* (chunksize - 1). 
*/ size_t chunk_npages; +static void *chunk_alloc_default(void *new_addr, size_t size, + size_t alignment, bool *zero, unsigned arena_ind); +static bool chunk_dalloc_default(void *chunk, size_t size, + unsigned arena_ind); +static bool chunk_commit_default(void *chunk, size_t size, + unsigned arena_ind); +static bool chunk_decommit_default(void *chunk, size_t size, + unsigned arena_ind); +static bool chunk_purge_default(void *chunk, size_t size, size_t offset, + size_t length, unsigned arena_ind); +static bool chunk_split_default(void *chunk, size_t size, size_t size_a, + size_t size_b, bool committed, unsigned arena_ind); +static bool chunk_merge_default(void *chunk_a, size_t size_a, void *chunk_b, + size_t size_b, bool committed, unsigned arena_ind); + +const chunk_hooks_t chunk_hooks_default = { + chunk_alloc_default, + chunk_dalloc_default, + chunk_commit_default, + chunk_decommit_default, + chunk_purge_default, + chunk_split_default, + chunk_merge_default +}; + /******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static void chunk_record(arena_t *arena, chunk_hooks_t *chunk_hooks, + extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, + void *chunk, size_t size, bool committed, bool zeroed); + +/******************************************************************************/ + +static chunk_hooks_t +chunk_hooks_get_locked(arena_t *arena) +{ + + return (arena->chunk_hooks); +} + +chunk_hooks_t +chunk_hooks_get(arena_t *arena) +{ + chunk_hooks_t chunk_hooks; + + malloc_mutex_lock(&arena->chunks_mtx); + chunk_hooks = chunk_hooks_get_locked(arena); + malloc_mutex_unlock(&arena->chunks_mtx); + + return (chunk_hooks); +} + +chunk_hooks_t +chunk_hooks_set(arena_t *arena, const chunk_hooks_t *chunk_hooks) +{ + chunk_hooks_t old_chunk_hooks; + + malloc_mutex_lock(&arena->chunks_mtx); + old_chunk_hooks = arena->chunk_hooks; + arena->chunk_hooks = *chunk_hooks; + malloc_mutex_unlock(&arena->chunks_mtx); + + return (old_chunk_hooks); +} + +static void +chunk_hooks_assure_initialized_impl(arena_t *arena, chunk_hooks_t *chunk_hooks, + bool locked) +{ + static const chunk_hooks_t uninitialized_hooks = + CHUNK_HOOKS_INITIALIZER; + + if (memcmp(chunk_hooks, &uninitialized_hooks, sizeof(chunk_hooks_t)) == + 0) { + *chunk_hooks = locked ? 
chunk_hooks_get_locked(arena) : + chunk_hooks_get(arena); + } +} + +static void +chunk_hooks_assure_initialized_locked(arena_t *arena, + chunk_hooks_t *chunk_hooks) +{ + + chunk_hooks_assure_initialized_impl(arena, chunk_hooks, true); +} + +static void +chunk_hooks_assure_initialized(arena_t *arena, chunk_hooks_t *chunk_hooks) +{ + + chunk_hooks_assure_initialized_impl(arena, chunk_hooks, false); +} bool chunk_register(const void *chunk, const extent_node_t *node) @@ -74,21 +170,26 @@ chunk_first_best_fit(arena_t *arena, extent_tree_t *chunks_szad, assert(size == CHUNK_CEILING(size)); - extent_node_init(&key, arena, NULL, size, false); + extent_node_init(&key, arena, NULL, size, false, false); return (extent_tree_szad_nsearch(chunks_szad, &key)); } static void * -chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, bool cache, void *new_addr, size_t size, - size_t alignment, bool *zero, bool dalloc_node) +chunk_recycle(arena_t *arena, chunk_hooks_t *chunk_hooks, + extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, + void *new_addr, size_t size, size_t alignment, bool *zero, bool dalloc_node) { void *ret; extent_node_t *node; size_t alloc_size, leadsize, trailsize; - bool zeroed; + bool committed, zeroed; assert(new_addr == NULL || alignment == chunksize); + /* + * Cached chunks use the node linkage embedded in their headers, in + * which case dalloc_node is true, and new_addr is non-NULL because + * we're operating on a specific chunk. + */ assert(dalloc_node || new_addr != NULL); alloc_size = CHUNK_CEILING(s2u(size + alignment - chunksize)); @@ -96,9 +197,11 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, if (alloc_size < size) return (NULL); malloc_mutex_lock(&arena->chunks_mtx); + chunk_hooks_assure_initialized_locked(arena, chunk_hooks); if (new_addr != NULL) { extent_node_t key; - extent_node_init(&key, arena, new_addr, alloc_size, false); + extent_node_init(&key, arena, new_addr, alloc_size, false, + false); node = extent_tree_ad_search(chunks_ad, &key); } else { node = chunk_first_best_fit(arena, chunks_szad, chunks_ad, @@ -115,9 +218,17 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, assert(extent_node_size_get(node) >= leadsize + size); trailsize = extent_node_size_get(node) - leadsize - size; ret = (void *)((uintptr_t)extent_node_addr_get(node) + leadsize); + committed = extent_node_committed_get(node); zeroed = extent_node_zeroed_get(node); if (zeroed) *zero = true; + /* Split the lead. */ + if (leadsize != 0 && + chunk_hooks->split(extent_node_addr_get(node), + extent_node_size_get(node), leadsize, size, false, arena->ind)) { + malloc_mutex_unlock(&arena->chunks_mtx); + return (NULL); + } /* Remove node from the tree. */ extent_tree_szad_remove(chunks_szad, node); extent_tree_ad_remove(chunks_ad, node); @@ -131,23 +242,40 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, node = NULL; } if (trailsize != 0) { + /* Split the trail. */ + if (chunk_hooks->split(ret, size + trailsize, size, + trailsize, false, arena->ind)) { + if (dalloc_node && node != NULL) + arena_node_dalloc(arena, node); + malloc_mutex_unlock(&arena->chunks_mtx); + chunk_record(arena, chunk_hooks, chunks_szad, chunks_ad, + cache, ret, size + trailsize, committed, zeroed); + return (NULL); + } /* Insert the trailing space as a smaller chunk. 
*/ if (node == NULL) { node = arena_node_alloc(arena); if (node == NULL) { malloc_mutex_unlock(&arena->chunks_mtx); - chunk_record(arena, chunks_szad, chunks_ad, - cache, ret, size, zeroed); + chunk_record(arena, chunk_hooks, chunks_szad, + chunks_ad, cache, ret, size + trailsize, + committed, zeroed); return (NULL); } } extent_node_init(node, arena, (void *)((uintptr_t)(ret) + size), - trailsize, zeroed); + trailsize, committed, zeroed); extent_tree_szad_insert(chunks_szad, node); extent_tree_ad_insert(chunks_ad, node); arena_chunk_cache_maybe_insert(arena, node, cache); node = NULL; } + if (!committed && chunk_hooks->commit(ret, size, arena->ind)) { + malloc_mutex_unlock(&arena->chunks_mtx); + chunk_record(arena, chunk_hooks, chunks_szad, chunks_ad, cache, + ret, size, committed, zeroed); + return (NULL); + } malloc_mutex_unlock(&arena->chunks_mtx); assert(dalloc_node || node != NULL); @@ -168,20 +296,6 @@ chunk_recycle(arena_t *arena, extent_tree_t *chunks_szad, return (ret); } -static void * -chunk_alloc_core_dss(arena_t *arena, void *new_addr, size_t size, - size_t alignment, bool *zero) -{ - void *ret; - - if ((ret = chunk_recycle(arena, &arena->chunks_szad_dss, - &arena->chunks_ad_dss, false, new_addr, size, alignment, zero, - true)) != NULL) - return (ret); - ret = chunk_alloc_dss(arena, new_addr, size, alignment, zero); - return (ret); -} - /* * If the caller specifies (!*zero), it is still possible to receive zeroed * memory, in which case *zero is toggled to true. arena_chunk_alloc() takes @@ -193,33 +307,33 @@ chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, bool *zero, dss_prec_t dss_prec) { void *ret; + chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; assert(size != 0); assert((size & chunksize_mask) == 0); assert(alignment != 0); assert((alignment & chunksize_mask) == 0); + /* Retained. */ + if ((ret = chunk_recycle(arena, &chunk_hooks, + &arena->chunks_szad_retained, &arena->chunks_ad_retained, false, + new_addr, size, alignment, zero, true)) != NULL) + return (ret); + /* "primary" dss. */ if (have_dss && dss_prec == dss_prec_primary && (ret = - chunk_alloc_core_dss(arena, new_addr, size, alignment, zero)) != - NULL) - return (ret); - /* mmap. */ - if (!config_munmap && (ret = chunk_recycle(arena, - &arena->chunks_szad_mmap, &arena->chunks_ad_mmap, false, new_addr, - size, alignment, zero, true)) != NULL) + chunk_alloc_dss(arena, new_addr, size, alignment, zero)) != NULL) return (ret); /* - * Requesting an address is not implemented for chunk_alloc_mmap(), so - * only call it if (new_addr == NULL). + * mmap. Requesting an address is not implemented for + * chunk_alloc_mmap(), so only call it if (new_addr == NULL). */ if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) return (ret); /* "secondary" dss. */ if (have_dss && dss_prec == dss_prec_secondary && (ret = - chunk_alloc_core_dss(arena, new_addr, size, alignment, zero)) != - NULL) + chunk_alloc_dss(arena, new_addr, size, alignment, zero)) != NULL) return (ret); /* All strategies for allocation failed. 
*/ @@ -248,8 +362,8 @@ chunk_alloc_base(size_t size) } void * -chunk_alloc_cache(arena_t *arena, void *new_addr, size_t size, size_t alignment, - bool *zero, bool dalloc_node) +chunk_alloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, + size_t size, size_t alignment, bool *zero, bool dalloc_node) { void *ret; @@ -258,8 +372,8 @@ chunk_alloc_cache(arena_t *arena, void *new_addr, size_t size, size_t alignment, assert(alignment != 0); assert((alignment & chunksize_mask) == 0); - ret = chunk_recycle(arena, &arena->chunks_szad_cache, - &arena->chunks_ad_cache, true, new_addr, size, alignment, zero, + ret = chunk_recycle(arena, chunk_hooks, &arena->chunks_szad_cached, + &arena->chunks_ad_cached, true, new_addr, size, alignment, zero, dalloc_node); if (ret == NULL) return (NULL); @@ -285,11 +399,13 @@ chunk_arena_get(unsigned arena_ind) } static void * -chunk_alloc_arena(arena_t *arena, void *new_addr, size_t size, size_t alignment, - bool *zero) +chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, + unsigned arena_ind) { void *ret; + arena_t *arena; + arena = chunk_arena_get(arena_ind); ret = chunk_alloc_core(arena, new_addr, size, alignment, zero, arena->dss_prec); if (ret == NULL) @@ -300,55 +416,45 @@ chunk_alloc_arena(arena_t *arena, void *new_addr, size_t size, size_t alignment, return (ret); } -/* - * Default arena chunk allocation routine in the absence of user override. This - * function isn't actually used by jemalloc, but it does the right thing if the - * application passes calls through to it during chunk allocation. - */ void * -chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, - unsigned arena_ind) -{ - arena_t *arena; - - arena = chunk_arena_get(arena_ind); - return (chunk_alloc_arena(arena, new_addr, size, alignment, zero)); -} - -void * -chunk_alloc_wrapper(arena_t *arena, chunk_alloc_t *chunk_alloc, void *new_addr, +chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment, bool *zero) { void *ret; - ret = chunk_alloc(new_addr, size, alignment, zero, arena->ind); + chunk_hooks_assure_initialized(arena, chunk_hooks); + ret = chunk_hooks->alloc(new_addr, size, alignment, zero, arena->ind); if (ret == NULL) return (NULL); - if (config_valgrind && chunk_alloc != chunk_alloc_default) + if (config_valgrind && chunk_hooks->alloc != chunk_alloc_default) JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, chunksize); return (ret); } -void -chunk_record(arena_t *arena, extent_tree_t *chunks_szad, - extent_tree_t *chunks_ad, bool cache, void *chunk, size_t size, bool zeroed) +static void +chunk_record(arena_t *arena, chunk_hooks_t *chunk_hooks, + extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, + void *chunk, size_t size, bool committed, bool zeroed) { bool unzeroed; extent_node_t *node, *prev; extent_node_t key; - assert(maps_coalesce || size == chunksize); assert(!cache || !zeroed); unzeroed = cache || !zeroed; JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); malloc_mutex_lock(&arena->chunks_mtx); + chunk_hooks_assure_initialized_locked(arena, chunk_hooks); extent_node_init(&key, arena, (void *)((uintptr_t)chunk + size), 0, - false); + false, false); node = extent_tree_ad_nsearch(chunks_ad, &key); /* Try to coalesce forward. 
*/ if (node != NULL && extent_node_addr_get(node) == - extent_node_addr_get(&key)) { + extent_node_addr_get(&key) && extent_node_committed_get(node) == + committed && !chunk_hooks->merge(chunk, size, + extent_node_addr_get(node), extent_node_size_get(node), false, + arena->ind)) { /* * Coalesce chunk with the following address range. This does * not change the position within chunks_ad, so only @@ -373,12 +479,13 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, * a virtual memory leak. */ if (cache) { - chunk_purge_wrapper(arena, arena->chunk_purge, - chunk, 0, size); + chunk_purge_wrapper(arena, chunk_hooks, chunk, + size, 0, size); } goto label_return; } - extent_node_init(node, arena, chunk, size, !unzeroed); + extent_node_init(node, arena, chunk, size, committed, + !unzeroed); extent_tree_ad_insert(chunks_ad, node); extent_tree_szad_insert(chunks_szad, node); arena_chunk_cache_maybe_insert(arena, node, cache); @@ -387,7 +494,10 @@ chunk_record(arena_t *arena, extent_tree_t *chunks_szad, /* Try to coalesce backward. */ prev = extent_tree_ad_prev(chunks_ad, node); if (prev != NULL && (void *)((uintptr_t)extent_node_addr_get(prev) + - extent_node_size_get(prev)) == chunk) { + extent_node_size_get(prev)) == chunk && + extent_node_committed_get(prev) == committed && + !chunk_hooks->merge(extent_node_addr_get(prev), + extent_node_size_get(prev), chunk, size, false, arena->ind)) { /* * Coalesce chunk with the previous address range. This does * not change the position within chunks_ad, so only @@ -414,7 +524,8 @@ label_return: } void -chunk_dalloc_cache(arena_t *arena, void *chunk, size_t size) +chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, + size_t size) { assert(chunk != NULL); @@ -422,57 +533,68 @@ chunk_dalloc_cache(arena_t *arena, void *chunk, size_t size) assert(size != 0); assert((size & chunksize_mask) == 0); - if (!maps_coalesce && size != chunksize) { - chunk_dalloc_arena(arena, chunk, size, false); - return; - } - - chunk_record(arena, &arena->chunks_szad_cache, &arena->chunks_ad_cache, - true, chunk, size, false); + chunk_record(arena, chunk_hooks, &arena->chunks_szad_cached, + &arena->chunks_ad_cached, true, chunk, size, true, false); arena_maybe_purge(arena); } void -chunk_dalloc_arena(arena_t *arena, void *chunk, size_t size, bool zeroed) +chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, + size_t size, bool zeroed) { + bool committed; assert(chunk != NULL); assert(CHUNK_ADDR2BASE(chunk) == chunk); assert(size != 0); assert((size & chunksize_mask) == 0); - if (have_dss && chunk_in_dss(chunk)) { - chunk_record(arena, &arena->chunks_szad_dss, - &arena->chunks_ad_dss, false, chunk, size, zeroed); - } else if (chunk_dalloc_mmap(chunk, size)) { - chunk_record(arena, &arena->chunks_szad_mmap, - &arena->chunks_ad_mmap, false, chunk, size, zeroed); - } + chunk_hooks_assure_initialized(arena, chunk_hooks); + /* Try to deallocate. */ + if (!chunk_hooks->dalloc(chunk, size, arena->ind)) + return; + /* Try to decommit; purge if that fails. */ + committed = chunk_hooks->decommit(chunk, size, arena->ind); + zeroed = !committed || chunk_hooks->purge(chunk, size, 0, size, + arena->ind); + chunk_record(arena, chunk_hooks, &arena->chunks_szad_retained, + &arena->chunks_ad_retained, false, chunk, size, committed, zeroed); } -/* - * Default arena chunk deallocation routine in the absence of user override. 
- * This function isn't actually used by jemalloc, but it does the right thing if - * the application passes calls through to it during chunk deallocation. - */ -bool +static bool chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) { - chunk_dalloc_arena(chunk_arena_get(arena_ind), chunk, size, false); - return (false); + if (!have_dss || !chunk_in_dss(chunk)) + return (chunk_dalloc_mmap(chunk, size)); + return (true); } void -chunk_dalloc_wrapper(arena_t *arena, chunk_dalloc_t *chunk_dalloc, void *chunk, +chunk_dalloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, size_t size) { - chunk_dalloc(chunk, size, arena->ind); - if (config_valgrind && chunk_dalloc != chunk_dalloc_default) + chunk_hooks_assure_initialized(arena, chunk_hooks); + chunk_hooks->dalloc(chunk, size, arena->ind); + if (config_valgrind && chunk_hooks->dalloc != chunk_dalloc_default) JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); } +static bool +chunk_commit_default(void *chunk, size_t size, unsigned arena_ind) +{ + + return (pages_commit(chunk, size)); +} + +static bool +chunk_decommit_default(void *chunk, size_t size, unsigned arena_ind) +{ + + return (pages_decommit(chunk, size)); +} + bool chunk_purge_arena(arena_t *arena, void *chunk, size_t offset, size_t length) { @@ -487,8 +609,8 @@ chunk_purge_arena(arena_t *arena, void *chunk, size_t offset, size_t length) length)); } -bool -chunk_purge_default(void *chunk, size_t offset, size_t length, +static bool +chunk_purge_default(void *chunk, size_t size, size_t offset, size_t length, unsigned arena_ind) { @@ -497,11 +619,35 @@ chunk_purge_default(void *chunk, size_t offset, size_t length, } bool -chunk_purge_wrapper(arena_t *arena, chunk_purge_t *chunk_purge, void *chunk, - size_t offset, size_t length) +chunk_purge_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, + size_t size, size_t offset, size_t length) +{ + + chunk_hooks_assure_initialized(arena, chunk_hooks); + return (chunk_hooks->purge(chunk, size, offset, length, arena->ind)); +} + +static bool +chunk_split_default(void *chunk, size_t size, size_t size_a, size_t size_b, + bool committed, unsigned arena_ind) { - return (chunk_purge(chunk, offset, length, arena->ind)); + if (!maps_coalesce) + return (true); + return (false); +} + +static bool +chunk_merge_default(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b, + bool committed, unsigned arena_ind) +{ + + if (!maps_coalesce) + return (true); + if (have_dss && chunk_in_dss(chunk_a) != chunk_in_dss(chunk_b)) + return (true); + + return (false); } static rtree_node_elm_t * diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 6fbe31b..2c115e0 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -134,10 +134,10 @@ chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, size_t alignment, dss_max = dss_next; malloc_mutex_unlock(&dss_mtx); if (cpad_size != 0) { - chunk_record(arena, - &arena->chunks_szad_dss, - &arena->chunks_ad_dss, false, cpad, - cpad_size, false); + chunk_hooks_t chunk_hooks = + CHUNK_HOOKS_INITIALIZER; + chunk_dalloc_wrapper(arena, + &chunk_hooks, cpad, cpad_size); } if (*zero) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index 30ac10b..f243615 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -2,137 +2,6 @@ #include "jemalloc/internal/jemalloc_internal.h" /******************************************************************************/ -/* Function prototypes for non-inline static functions. 
*/ - -static void *pages_map(void *addr, size_t size); -static void pages_unmap(void *addr, size_t size); -static void *chunk_alloc_mmap_slow(size_t size, size_t alignment, - bool *zero); - -/******************************************************************************/ - -static void * -pages_map(void *addr, size_t size) -{ - void *ret; - - assert(size != 0); - -#ifdef _WIN32 - /* - * If VirtualAlloc can't allocate at the given address when one is - * given, it fails and returns NULL. - */ - ret = VirtualAlloc(addr, size, MEM_COMMIT | MEM_RESERVE, - PAGE_READWRITE); -#else - /* - * We don't use MAP_FIXED here, because it can cause the *replacement* - * of existing mappings, and we only want to create new mappings. - */ - ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, - -1, 0); - assert(ret != NULL); - - if (ret == MAP_FAILED) - ret = NULL; - else if (addr != NULL && ret != addr) { - /* - * We succeeded in mapping memory, but not in the right place. - */ - pages_unmap(ret, size); - ret = NULL; - } -#endif - assert(ret == NULL || (addr == NULL && ret != addr) - || (addr != NULL && ret == addr)); - return (ret); -} - -static void -pages_unmap(void *addr, size_t size) -{ - -#ifdef _WIN32 - if (VirtualFree(addr, 0, MEM_RELEASE) == 0) -#else - if (munmap(addr, size) == -1) -#endif - { - char buf[BUFERROR_BUF]; - - buferror(get_errno(), buf, sizeof(buf)); - malloc_printf(": Error in " -#ifdef _WIN32 - "VirtualFree" -#else - "munmap" -#endif - "(): %s\n", buf); - if (opt_abort) - abort(); - } -} - -static void * -pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size) -{ - void *ret = (void *)((uintptr_t)addr + leadsize); - - assert(alloc_size >= leadsize + size); -#ifdef _WIN32 - { - void *new_addr; - - pages_unmap(addr, alloc_size); - new_addr = pages_map(ret, size); - if (new_addr == ret) - return (ret); - if (new_addr) - pages_unmap(new_addr, size); - return (NULL); - } -#else - { - size_t trailsize = alloc_size - leadsize - size; - - if (leadsize != 0) - pages_unmap(addr, leadsize); - if (trailsize != 0) - pages_unmap((void *)((uintptr_t)ret + size), trailsize); - return (ret); - } -#endif -} - -bool -pages_purge(void *addr, size_t length) -{ - bool unzeroed; - -#ifdef _WIN32 - VirtualAlloc(addr, length, MEM_RESET, PAGE_READWRITE); - unzeroed = true; -#elif defined(JEMALLOC_HAVE_MADVISE) -# ifdef JEMALLOC_PURGE_MADVISE_DONTNEED -# define JEMALLOC_MADV_PURGE MADV_DONTNEED -# define JEMALLOC_MADV_ZEROS true -# elif defined(JEMALLOC_PURGE_MADVISE_FREE) -# define JEMALLOC_MADV_PURGE MADV_FREE -# define JEMALLOC_MADV_ZEROS false -# else -# error "No madvise(2) flag defined for purging unused dirty pages." -# endif - int err = madvise(addr, length, JEMALLOC_MADV_PURGE); - unzeroed = (!JEMALLOC_MADV_ZEROS || err != 0); -# undef JEMALLOC_MADV_PURGE -# undef JEMALLOC_MADV_ZEROS -#else - /* Last resort no-op. 
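pages_purge(), shown above as it is removed from chunk_mmap.c (it reappears in the new src/pages.c later in this patch), reports whether purged pages may still be nonzero; madvise(MADV_DONTNEED) counts as zeroing, while MADV_FREE does not. A standalone, Linux-only check of the MADV_DONTNEED behavior, independent of jemalloc and assuming 4 KiB pages:

#define _DEFAULT_SOURCE
#include <assert.h>
#include <string.h>
#include <sys/mman.h>

int
main(void)
{
	size_t size = 4096;	/* Assume 4 KiB pages. */
	char *p;

	p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
	    -1, 0);
	assert(p != MAP_FAILED);
	memset(p, 0x5a, size);

	/* Discard the contents; the next read faults in a fresh zero page. */
	assert(madvise(p, size, MADV_DONTNEED) == 0);
	assert(p[0] == 0);

	munmap(p, size);
	return (0);
}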
*/ - unzeroed = true; -#endif - return (unzeroed); -} static void * chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero) diff --git a/src/ctl.c b/src/ctl.c index 1988aee..3de8e60 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -118,9 +118,7 @@ CTL_PROTO(arena_i_purge) static void arena_purge(unsigned arena_ind); CTL_PROTO(arena_i_dss) CTL_PROTO(arena_i_lg_dirty_mult) -CTL_PROTO(arena_i_chunk_alloc) -CTL_PROTO(arena_i_chunk_dalloc) -CTL_PROTO(arena_i_chunk_purge) +CTL_PROTO(arena_i_chunk_hooks) INDEX_PROTO(arena_i) CTL_PROTO(arenas_bin_i_size) CTL_PROTO(arenas_bin_i_nregs) @@ -288,17 +286,11 @@ static const ctl_named_node_t tcache_node[] = { {NAME("destroy"), CTL(tcache_destroy)} }; -static const ctl_named_node_t chunk_node[] = { - {NAME("alloc"), CTL(arena_i_chunk_alloc)}, - {NAME("dalloc"), CTL(arena_i_chunk_dalloc)}, - {NAME("purge"), CTL(arena_i_chunk_purge)} -}; - static const ctl_named_node_t arena_i_node[] = { {NAME("purge"), CTL(arena_i_purge)}, {NAME("dss"), CTL(arena_i_dss)}, {NAME("lg_dirty_mult"), CTL(arena_i_lg_dirty_mult)}, - {NAME("chunk"), CHILD(named, chunk)}, + {NAME("chunk_hooks"), CTL(arena_i_chunk_hooks)} }; static const ctl_named_node_t super_arena_i_node[] = { {NAME(""), CHILD(named, arena_i)} @@ -1064,8 +1056,8 @@ ctl_postfork_child(void) memcpy(oldp, (void *)&(v), copylen); \ ret = EINVAL; \ goto label_return; \ - } else \ - *(t *)oldp = (v); \ + } \ + *(t *)oldp = (v); \ } \ } while (0) @@ -1682,37 +1674,36 @@ label_return: return (ret); } -#define CHUNK_FUNC(n) \ -static int \ -arena_i_chunk_##n##_ctl(const size_t *mib, size_t miblen, void *oldp, \ - size_t *oldlenp, void *newp, size_t newlen) \ -{ \ - \ - int ret; \ - unsigned arena_ind = mib[1]; \ - arena_t *arena; \ - \ - malloc_mutex_lock(&ctl_mtx); \ - if (arena_ind < narenas_total_get() && (arena = \ - arena_get(tsd_fetch(), arena_ind, false, true)) != NULL) { \ - malloc_mutex_lock(&arena->lock); \ - READ(arena->chunk_##n, chunk_##n##_t *); \ - WRITE(arena->chunk_##n, chunk_##n##_t *); \ - } else { \ - ret = EFAULT; \ - goto label_outer_return; \ - } \ - ret = 0; \ -label_return: \ - malloc_mutex_unlock(&arena->lock); \ -label_outer_return: \ - malloc_mutex_unlock(&ctl_mtx); \ - return (ret); \ +static int +arena_i_chunk_hooks_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + unsigned arena_ind = mib[1]; + arena_t *arena; + + malloc_mutex_lock(&ctl_mtx); + if (arena_ind < narenas_total_get() && (arena = + arena_get(tsd_fetch(), arena_ind, false, true)) != NULL) { + if (newp != NULL) { + chunk_hooks_t old_chunk_hooks, new_chunk_hooks; + WRITE(new_chunk_hooks, chunk_hooks_t); + old_chunk_hooks = chunk_hooks_set(arena, + &new_chunk_hooks); + READ(old_chunk_hooks, chunk_hooks_t); + } else { + chunk_hooks_t old_chunk_hooks = chunk_hooks_get(arena); + READ(old_chunk_hooks, chunk_hooks_t); + } + } else { + ret = EFAULT; + goto label_return; + } + ret = 0; +label_return: + malloc_mutex_unlock(&ctl_mtx); + return (ret); } -CHUNK_FUNC(alloc) -CHUNK_FUNC(dalloc) -CHUNK_FUNC(purge) -#undef CHUNK_FUNC static const ctl_named_node_t * arena_i_index(const size_t *mib, size_t miblen, size_t i) diff --git a/src/huge.c b/src/huge.c index 7cd0d7d..4aa7a97 100644 --- a/src/huge.c +++ b/src/huge.c @@ -79,7 +79,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, return (NULL); } - extent_node_init(node, arena, ret, size, is_zeroed); + extent_node_init(node, arena, ret, size, true, is_zeroed); if (huge_node_set(ret, node)) { 
arena_chunk_dalloc_huge(arena, ret, size); @@ -132,7 +132,7 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, size_t usize_next; extent_node_t *node; arena_t *arena; - chunk_purge_t *chunk_purge; + chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; bool zeroed; /* Increase usize to incorporate extra. */ @@ -145,15 +145,11 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, node = huge_node_get(ptr); arena = extent_node_arena_get(node); - malloc_mutex_lock(&arena->lock); - chunk_purge = arena->chunk_purge; - malloc_mutex_unlock(&arena->lock); - /* Fill if necessary (shrinking). */ if (oldsize > usize) { size_t sdiff = oldsize - usize; - zeroed = !chunk_purge_wrapper(arena, chunk_purge, ptr, usize, - sdiff); + zeroed = !chunk_purge_wrapper(arena, &chunk_hooks, ptr, + CHUNK_CEILING(usize), usize, sdiff); if (config_fill && unlikely(opt_junk_free)) { memset((void *)((uintptr_t)ptr + usize), 0x5a, sdiff); zeroed = false; @@ -185,26 +181,31 @@ huge_ralloc_no_move_similar(void *ptr, size_t oldsize, size_t usize, } } -static void +static bool huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) { extent_node_t *node; arena_t *arena; - chunk_purge_t *chunk_purge; + chunk_hooks_t chunk_hooks; + size_t cdiff; bool zeroed; node = huge_node_get(ptr); arena = extent_node_arena_get(node); + chunk_hooks = chunk_hooks_get(arena); - malloc_mutex_lock(&arena->lock); - chunk_purge = arena->chunk_purge; - malloc_mutex_unlock(&arena->lock); + /* Split excess chunks. */ + cdiff = CHUNK_CEILING(oldsize) - CHUNK_CEILING(usize); + if (cdiff != 0 && chunk_hooks.split(ptr, CHUNK_CEILING(oldsize), + CHUNK_CEILING(usize), cdiff, true, arena->ind)) + return (true); if (oldsize > usize) { size_t sdiff = oldsize - usize; - zeroed = !chunk_purge_wrapper(arena, chunk_purge, + zeroed = !chunk_purge_wrapper(arena, &chunk_hooks, CHUNK_ADDR2BASE((uintptr_t)ptr + usize), - CHUNK_ADDR2OFFSET((uintptr_t)ptr + usize), sdiff); + CHUNK_CEILING(usize), CHUNK_ADDR2OFFSET((uintptr_t)ptr + + usize), sdiff); if (config_fill && unlikely(opt_junk_free)) { huge_dalloc_junk((void *)((uintptr_t)ptr + usize), sdiff); @@ -222,6 +223,8 @@ huge_ralloc_no_move_shrink(void *ptr, size_t oldsize, size_t usize) /* Zap the excess chunks. */ arena_chunk_ralloc_huge_shrink(arena, ptr, oldsize, usize); + + return (false); } static bool @@ -304,14 +307,9 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, return (false); } - if (!maps_coalesce) - return (true); - - /* Shrink the allocation in-place. */ - if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize)) { - huge_ralloc_no_move_shrink(ptr, oldsize, usize); - return (false); - } + /* Attempt to shrink the allocation in-place. */ + if (CHUNK_CEILING(oldsize) >= CHUNK_CEILING(usize)) + return (huge_ralloc_no_move_shrink(ptr, oldsize, usize)); /* Attempt to expand the allocation in-place. */ if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra, zero)) { diff --git a/src/pages.c b/src/pages.c new file mode 100644 index 0000000..6f775dc --- /dev/null +++ b/src/pages.c @@ -0,0 +1,167 @@ +#define JEMALLOC_PAGES_C_ +#include "jemalloc/internal/jemalloc_internal.h" + +/******************************************************************************/ + +void * +pages_map(void *addr, size_t size) +{ + void *ret; + + assert(size != 0); + +#ifdef _WIN32 + /* + * If VirtualAlloc can't allocate at the given address when one is + * given, it fails and returns NULL. 
+ */ + ret = VirtualAlloc(addr, size, MEM_COMMIT | MEM_RESERVE, + PAGE_READWRITE); +#else + /* + * We don't use MAP_FIXED here, because it can cause the *replacement* + * of existing mappings, and we only want to create new mappings. + */ + ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, + -1, 0); + assert(ret != NULL); + + if (ret == MAP_FAILED) + ret = NULL; + else if (addr != NULL && ret != addr) { + /* + * We succeeded in mapping memory, but not in the right place. + */ + pages_unmap(ret, size); + ret = NULL; + } +#endif + assert(ret == NULL || (addr == NULL && ret != addr) + || (addr != NULL && ret == addr)); + return (ret); +} + +void +pages_unmap(void *addr, size_t size) +{ + +#ifdef _WIN32 + if (VirtualFree(addr, 0, MEM_RELEASE) == 0) +#else + if (munmap(addr, size) == -1) +#endif + { + char buf[BUFERROR_BUF]; + + buferror(get_errno(), buf, sizeof(buf)); + malloc_printf(": Error in " +#ifdef _WIN32 + "VirtualFree" +#else + "munmap" +#endif + "(): %s\n", buf); + if (opt_abort) + abort(); + } +} + +void * +pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size) +{ + void *ret = (void *)((uintptr_t)addr + leadsize); + + assert(alloc_size >= leadsize + size); +#ifdef _WIN32 + { + void *new_addr; + + pages_unmap(addr, alloc_size); + new_addr = pages_map(ret, size); + if (new_addr == ret) + return (ret); + if (new_addr) + pages_unmap(new_addr, size); + return (NULL); + } +#else + { + size_t trailsize = alloc_size - leadsize - size; + + if (leadsize != 0) + pages_unmap(addr, leadsize); + if (trailsize != 0) + pages_unmap((void *)((uintptr_t)ret + size), trailsize); + return (ret); + } +#endif +} + +static bool +pages_commit_impl(void *addr, size_t size, bool commit) +{ + +#ifndef _WIN32 + if (config_debug) { + int prot = commit ? (PROT_READ | PROT_WRITE) : PROT_NONE; + void *result = mmap(addr, size, prot, MAP_PRIVATE | MAP_ANON | + MAP_FIXED, -1, 0); + if (result == MAP_FAILED) + return (true); + if (result != addr) { + /* + * We succeeded in mapping memory, but not in the right + * place. + */ + pages_unmap(result, size); + return (true); + } + return (false); + } +#endif + return (true); +} + +bool +pages_commit(void *addr, size_t size) +{ + + return (pages_commit_impl(addr, size, true)); +} + +bool +pages_decommit(void *addr, size_t size) +{ + + return (pages_commit_impl(addr, size, false)); +} + +bool +pages_purge(void *addr, size_t size) +{ + bool unzeroed; + +#ifdef _WIN32 + VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE); + unzeroed = true; +#elif defined(JEMALLOC_HAVE_MADVISE) +# ifdef JEMALLOC_PURGE_MADVISE_DONTNEED +# define JEMALLOC_MADV_PURGE MADV_DONTNEED +# define JEMALLOC_MADV_ZEROS true +# elif defined(JEMALLOC_PURGE_MADVISE_FREE) +# define JEMALLOC_MADV_PURGE MADV_FREE +# define JEMALLOC_MADV_ZEROS false +# else +# error "No madvise(2) flag defined for purging unused dirty pages." +# endif + int err = madvise(addr, size, JEMALLOC_MADV_PURGE); + unzeroed = (!JEMALLOC_MADV_ZEROS || err != 0); +# undef JEMALLOC_MADV_PURGE +# undef JEMALLOC_MADV_ZEROS +#else + /* Last resort no-op. 
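pages_commit_impl() above toggles a range between committed and decommitted by overlaying the existing mapping in place (MAP_FIXED) with either PROT_READ|PROT_WRITE or PROT_NONE; in this commit it only does so for non-Windows debug builds and otherwise opts out by returning true. The following standalone POSIX program, independent of jemalloc, demonstrates the same overlay technique:

#define _DEFAULT_SOURCE
#include <assert.h>
#include <string.h>
#include <sys/mman.h>

int
main(void)
{
	size_t size = 1 << 20;
	void *addr, *r;

	addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	assert(addr != MAP_FAILED);
	memset(addr, 0x5a, size);

	/*
	 * "Decommit": overlay the range in place with an inaccessible
	 * mapping.  The address range stays reserved, but its physical
	 * pages can be reclaimed.
	 */
	r = mmap(addr, size, PROT_NONE, MAP_PRIVATE | MAP_ANON | MAP_FIXED,
	    -1, 0);
	assert(r == addr);

	/* "Commit": make it accessible again; contents come back zeroed. */
	r = mmap(addr, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
	assert(r == addr);
	assert(((char *)addr)[0] == 0);

	munmap(addr, size);
	return (0);
}

Because the overlay replaces the old pages, a recommit always yields zero-filled memory, which lines up with the chunk_commit_t contract above that committed memory be zeroed.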
*/ + unzeroed = true; +#endif + return (unzeroed); +} + diff --git a/test/integration/chunk.c b/test/integration/chunk.c index c94b2d4..62d00ba 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -1,59 +1,140 @@ #include "test/jemalloc_test.h" -chunk_alloc_t *old_alloc; -chunk_dalloc_t *old_dalloc; -chunk_purge_t *old_purge; -bool purged; +static chunk_hooks_t orig_hooks; +static chunk_hooks_t old_hooks; + +static bool do_dalloc = true; +static bool do_decommit; + +static bool did_alloc; +static bool did_dalloc; +static bool did_commit; +static bool did_decommit; +static bool did_purge; +static bool did_split; +static bool did_merge; + +#if 0 +# define TRACE_HOOK(fmt, ...) malloc_printf(fmt, __VA_ARGS__) +#else +# define TRACE_HOOK(fmt, ...) +#endif void * chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind) { - return (old_alloc(new_addr, size, alignment, zero, arena_ind)); + TRACE_HOOK("%s(new_addr=%p, size=%zu, alignment=%zu, *zero=%s, " + "arena_ind=%u)\n", __func__, new_addr, size, alignment, *zero ? + "true" : "false", arena_ind); + did_alloc = true; + return (old_hooks.alloc(new_addr, size, alignment, zero, arena_ind)); } bool chunk_dalloc(void *chunk, size_t size, unsigned arena_ind) { - return (old_dalloc(chunk, size, arena_ind)); + TRACE_HOOK("%s(chunk=%p, size=%zu, arena_ind=%u)\n", __func__, chunk, + size, arena_ind); + did_dalloc = true; + if (!do_dalloc) + return (true); + return (old_hooks.dalloc(chunk, size, arena_ind)); } bool -chunk_purge(void *chunk, size_t offset, size_t length, unsigned arena_ind) +chunk_commit(void *chunk, size_t size, unsigned arena_ind) { - purged = true; - return (old_purge(chunk, offset, length, arena_ind)); + TRACE_HOOK("%s(chunk=%p, size=%zu, arena_ind=%u)\n", __func__, chunk, + size, arena_ind); + did_commit = true; + memset(chunk, 0, size); + return (false); } -TEST_BEGIN(test_chunk) +bool +chunk_decommit(void *chunk, size_t size, unsigned arena_ind) { - void *p; - chunk_alloc_t *new_alloc; - chunk_dalloc_t *new_dalloc; - chunk_purge_t *new_purge; - size_t old_size, new_size, huge0, huge1, huge2, sz; - new_alloc = chunk_alloc; - new_dalloc = chunk_dalloc; - new_purge = chunk_purge; - old_size = sizeof(chunk_alloc_t *); - new_size = sizeof(chunk_alloc_t *); + TRACE_HOOK("%s(chunk=%p, size=%zu, arena_ind=%u)\n", __func__, chunk, + size, arena_ind); + did_decommit = true; + return (!do_decommit); +} - assert_d_eq(mallctl("arena.0.chunk.alloc", &old_alloc, &old_size, - &new_alloc, new_size), 0, "Unexpected alloc error"); - assert_ptr_ne(old_alloc, new_alloc, "Unexpected alloc error"); +bool +chunk_purge(void *chunk, size_t size, size_t offset, size_t length, + unsigned arena_ind) +{ + + TRACE_HOOK("%s(chunk=%p, size=%zu, offset=%zu, length=%zu " + "arena_ind=%u)\n", __func__, chunk, size, offset, length, + arena_ind); + did_purge = true; + return (old_hooks.purge(chunk, size, offset, length, arena_ind)); +} - assert_d_eq(mallctl("arena.0.chunk.dalloc", &old_dalloc, &old_size, - &new_dalloc, new_size), 0, "Unexpected dalloc error"); - assert_ptr_ne(old_dalloc, new_dalloc, "Unexpected dalloc error"); +bool +chunk_split(void *chunk, size_t size, size_t size_a, size_t size_b, + bool committed, unsigned arena_ind) +{ - assert_d_eq(mallctl("arena.0.chunk.purge", &old_purge, &old_size, - &new_purge, new_size), 0, "Unexpected purge error"); - assert_ptr_ne(old_purge, new_purge, "Unexpected purge error"); + TRACE_HOOK("%s(chunk=%p, size=%zu, size_a=%zu, size_b=%zu, " + "committed=%s, 
arena_ind=%u)\n", __func__, chunk, size, size_a, + size_b, committed ? "true" : "false", arena_ind); + did_split = true; + return (old_hooks.split(chunk, size, size_a, size_b, committed, + arena_ind)); +} +bool +chunk_merge(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b, + bool committed, unsigned arena_ind) +{ + + TRACE_HOOK("%s(chunk_a=%p, size_a=%zu, chunk_b=%p size_b=%zu, " + "committed=%s, arena_ind=%u)\n", __func__, chunk_a, size_a, chunk_b, + size_b, committed ? "true" : "false", arena_ind); + did_merge = true; + return (old_hooks.merge(chunk_a, size_a, chunk_b, size_b, + committed, arena_ind)); +} + +TEST_BEGIN(test_chunk) +{ + void *p; + size_t old_size, new_size, huge0, huge1, huge2, sz; + chunk_hooks_t new_hooks = { + chunk_alloc, + chunk_dalloc, + chunk_commit, + chunk_decommit, + chunk_purge, + chunk_split, + chunk_merge + }; + + /* Install custom chunk hooks. */ + old_size = sizeof(chunk_hooks_t); + new_size = sizeof(chunk_hooks_t); + assert_d_eq(mallctl("arena.0.chunk_hooks", &old_hooks, &old_size, + &new_hooks, new_size), 0, "Unexpected chunk_hooks error"); + orig_hooks = old_hooks; + assert_ptr_ne(old_hooks.alloc, chunk_alloc, "Unexpected alloc error"); + assert_ptr_ne(old_hooks.dalloc, chunk_dalloc, + "Unexpected dalloc error"); + assert_ptr_ne(old_hooks.commit, chunk_commit, + "Unexpected commit error"); + assert_ptr_ne(old_hooks.decommit, chunk_decommit, + "Unexpected decommit error"); + assert_ptr_ne(old_hooks.purge, chunk_purge, "Unexpected purge error"); + assert_ptr_ne(old_hooks.split, chunk_split, "Unexpected split error"); + assert_ptr_ne(old_hooks.merge, chunk_merge, "Unexpected merge error"); + + /* Get huge size classes. */ sz = sizeof(size_t); assert_d_eq(mallctl("arenas.hchunk.0.size", &huge0, &sz, NULL, 0), 0, "Unexpected arenas.hchunk.0.size failure"); @@ -61,6 +142,49 @@ TEST_BEGIN(test_chunk) "Unexpected arenas.hchunk.1.size failure"); assert_d_eq(mallctl("arenas.hchunk.2.size", &huge2, &sz, NULL, 0), 0, "Unexpected arenas.hchunk.2.size failure"); + + /* Test dalloc/decommit/purge cascade. */ + do_dalloc = false; + do_decommit = false; + p = mallocx(huge0 * 2, 0); + assert_ptr_not_null(p, "Unexpected mallocx() error"); + did_dalloc = false; + did_decommit = false; + did_purge = false; + assert_zu_eq(xallocx(p, huge0, 0, 0), huge0, + "Unexpected xallocx() failure"); + assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, + "Unexpected arena.0.purge error"); + assert_true(did_dalloc, "Expected dalloc"); + assert_true(did_decommit, "Expected decommit"); + assert_true(did_purge, "Expected purge"); + dallocx(p, 0); + do_dalloc = true; + + /* Test decommit/commit and observe split/merge. */ + do_dalloc = false; + do_decommit = true; + p = mallocx(huge0 * 2, 0); + assert_ptr_not_null(p, "Unexpected mallocx() error"); + did_decommit = false; + did_commit = false; + did_split = false; + did_merge = false; + assert_zu_eq(xallocx(p, huge0, 0, 0), huge0, + "Unexpected xallocx() failure"); + assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, + "Unexpected arena.0.purge error"); + assert_true(did_decommit, "Expected decommit"); + assert_true(did_split, "Expected split"); + assert_zu_eq(xallocx(p, huge0 * 2, 0, 0), huge0 * 2, + "Unexpected xallocx() failure"); + assert_true(did_commit, "Expected commit"); + assert_true(did_commit, "Expected merge"); + dallocx(p, 0); + do_dalloc = true; + do_decommit = false; + + /* Test purge for partial-chunk huge allocations. 
*/ if (huge0 * 2 > huge2) { /* * There are at least four size classes per doubling, so a @@ -69,23 +193,37 @@ TEST_BEGIN(test_chunk) */ p = mallocx(huge2, 0); assert_ptr_not_null(p, "Unexpected mallocx() error"); - purged = false; + did_purge = false; assert_zu_eq(xallocx(p, huge1, 0, 0), huge1, "Unexpected xallocx() failure"); - assert_true(purged, "Unexpected purge"); + assert_true(did_purge, "Unexpected purge"); dallocx(p, 0); } + /* Make sure non-huge allocation succeeds. */ p = mallocx(42, 0); assert_ptr_not_null(p, "Unexpected mallocx() error"); - free(p); - - assert_d_eq(mallctl("arena.0.chunk.alloc", NULL, NULL, &old_alloc, - old_size), 0, "Unexpected alloc error"); - assert_d_eq(mallctl("arena.0.chunk.dalloc", NULL, NULL, &old_dalloc, - old_size), 0, "Unexpected dalloc error"); - assert_d_eq(mallctl("arena.0.chunk.purge", NULL, NULL, &old_purge, - old_size), 0, "Unexpected purge error"); + dallocx(p, 0); + + /* Restore chunk hooks. */ + assert_d_eq(mallctl("arena.0.chunk_hooks", NULL, NULL, &old_hooks, + new_size), 0, "Unexpected chunk_hooks error"); + assert_d_eq(mallctl("arena.0.chunk_hooks", &old_hooks, &old_size, + NULL, 0), 0, "Unexpected chunk_hooks error"); + assert_ptr_eq(old_hooks.alloc, orig_hooks.alloc, + "Unexpected alloc error"); + assert_ptr_eq(old_hooks.dalloc, orig_hooks.dalloc, + "Unexpected dalloc error"); + assert_ptr_eq(old_hooks.commit, orig_hooks.commit, + "Unexpected commit error"); + assert_ptr_eq(old_hooks.decommit, orig_hooks.decommit, + "Unexpected decommit error"); + assert_ptr_eq(old_hooks.purge, orig_hooks.purge, + "Unexpected purge error"); + assert_ptr_eq(old_hooks.split, orig_hooks.split, + "Unexpected split error"); + assert_ptr_eq(old_hooks.merge, orig_hooks.merge, + "Unexpected merge error"); } TEST_END -- cgit v0.12 From c1a6a51e401eb888caff5de142280754e7d99ba3 Mon Sep 17 00:00:00 2001 From: Matthijs Date: Mon, 27 Jul 2015 22:48:27 +0200 Subject: MSVC compatibility changes - Decorate public function with __declspec(allocator) and __declspec(restrict), just like MSVC 1900 - Support JEMALLOC_HAS_RESTRICT by defining the restrict keyword - Move __declspec(nothrow) between 'void' and '*' so it compiles once more --- .../jemalloc/internal/jemalloc_internal_decls.h | 3 +++ include/jemalloc/jemalloc_macros.h.in | 10 +++++++++ include/jemalloc/jemalloc_protos.h.in | 24 ++++++++++++++-------- src/jemalloc.c | 24 ++++++++++++++-------- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h index 5d42f47..a601d6e 100644 --- a/include/jemalloc/internal/jemalloc_internal_decls.h +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -43,6 +43,9 @@ typedef intptr_t ssize_t; # define PATH_MAX 1024 # define STDERR_FILENO 2 # define __func__ __FUNCTION__ +# ifdef JEMALLOC_HAS_RESTRICT +# define restrict __restrict +# endif /* Disable warnings about deprecated system functions. 
*/ # pragma warning(disable: 4996) #if _MSC_VER < 1800 diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index 2bde6b7..7f64d9f 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -59,6 +59,8 @@ # define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) # define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) # define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR #elif _MSC_VER # define JEMALLOC_ATTR(s) # define JEMALLOC_ALIGNED(s) __declspec(align(s)) @@ -79,6 +81,12 @@ # define JEMALLOC_NOTHROW # endif # define JEMALLOC_SECTION(s) __declspec(allocate(s)) +# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) +# if _MSC_VER >= 1900 && !defined(__EDG__) +# define JEMALLOC_ALLOCATOR __declspec(allocator) +# else +# define JEMALLOC_ALLOCATOR +# endif #else # define JEMALLOC_ATTR(s) # define JEMALLOC_ALIGNED(s) @@ -89,4 +97,6 @@ # define JEMALLOC_NOINLINE # define JEMALLOC_NOTHROW # define JEMALLOC_SECTION(s) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR #endif diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index d37ce05..317ffdb 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -7,23 +7,29 @@ extern JEMALLOC_EXPORT const char *@je_@malloc_conf; extern JEMALLOC_EXPORT void (*@je_@malloc_message)(void *cbopaque, const char *s); -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@malloc(size_t size) +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *@je_@malloc(size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@calloc(size_t num, size_t size) +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *@je_@calloc(size_t num, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); JEMALLOC_EXPORT int JEMALLOC_NOTHROW @je_@posix_memalign(void **memptr, size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(nonnull(1)); -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@aligned_alloc(size_t alignment, +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *@je_@aligned_alloc(size_t alignment, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2); -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@realloc(void *ptr, size_t size) +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *@je_@realloc(void *ptr, size_t size) JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); JEMALLOC_EXPORT void JEMALLOC_NOTHROW @je_@free(void *ptr) JEMALLOC_CXX_THROW; -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@mallocx(size_t size, int flags) +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *@je_@mallocx(size_t size, int flags) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW @je_@rallocx(void *ptr, size_t size, +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *@je_@rallocx(void *ptr, size_t size, int flags) JEMALLOC_ALLOC_SIZE(2); JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW @je_@xallocx(void *ptr, size_t size, size_t extra, int flags); @@ -48,11 +54,13 @@ JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW @je_@malloc_usable_size( JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; #ifdef JEMALLOC_OVERRIDE_MEMALIGN -JEMALLOC_EXPORT void *@je_@memalign(size_t 
alignment, size_t size) +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *@je_@memalign(size_t alignment, size_t size) JEMALLOC_ATTR(malloc); #endif #ifdef JEMALLOC_OVERRIDE_VALLOC -JEMALLOC_EXPORT void *@je_@valloc(size_t size) JEMALLOC_CXX_THROW +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *@je_@valloc(size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); #endif diff --git a/src/jemalloc.c b/src/jemalloc.c index 1d02318..ed7863b 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1395,7 +1395,8 @@ imalloc_body(size_t size, tsd_t **tsd, size_t *usize) return (imalloc(*tsd, size)); } -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) je_malloc(size_t size) { @@ -1540,7 +1541,8 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) return (ret); } -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) je_aligned_alloc(size_t alignment, size_t size) { @@ -1594,7 +1596,8 @@ icalloc_prof(tsd_t *tsd, size_t usize) return (p); } -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) je_calloc(size_t num, size_t size) { @@ -1739,7 +1742,8 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache) JEMALLOC_VALGRIND_FREE(ptr, rzsize); } -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * JEMALLOC_ALLOC_SIZE(2) je_realloc(void *ptr, size_t size) { @@ -1823,7 +1827,8 @@ je_free(void *ptr) */ #ifdef JEMALLOC_OVERRIDE_MEMALIGN -JEMALLOC_EXPORT void * +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) je_memalign(size_t alignment, size_t size) { @@ -1836,7 +1841,8 @@ je_memalign(size_t alignment, size_t size) #endif #ifdef JEMALLOC_OVERRIDE_VALLOC -JEMALLOC_EXPORT void * +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) je_valloc(size_t size) { @@ -2031,7 +2037,8 @@ imallocx_no_prof(tsd_t *tsd, size_t size, int flags, size_t *usize) return (p); } -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) je_mallocx(size_t size, int flags) { @@ -2129,7 +2136,8 @@ irallocx_prof(tsd_t *tsd, void *oldptr, size_t old_usize, size_t size, return (p); } -JEMALLOC_EXPORT void *JEMALLOC_NOTHROW +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * JEMALLOC_ALLOC_SIZE(2) je_rallocx(void *ptr, size_t size, int flags) { -- cgit v0.12 From 67c46a9e5366b3461d9f1e733129c792628c337b Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sat, 1 Aug 2015 15:06:12 -0400 Subject: work around _FORTIFY_SOURCE false positive In builds with profiling disabled (default), the opt_prof_prefix array has a one byte length as a micro-optimization. This will cause the usage of write in the unused profiling code to be statically detected as a buffer overflow by Bionic's _FORTIFY_SOURCE implementation as it tries to detect read overflows in addition to write overflows. 
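For illustration, a standalone sketch of the pattern the util.h hunk below adds; it is not jemalloc's actual macro, and the helper index_of() is made up, but it shows how __builtin_unreachable() after an abort-style handler tells the compiler a branch can never be taken, so static checks and dead-code warnings stay quiet in release builds.

#include <stdio.h>
#include <stdlib.h>

#ifdef __GNUC__
#  define unreachable()	__builtin_unreachable()
#else
#  define unreachable()
#endif

#define not_reached() do {						\
	fprintf(stderr, "%s:%d: Unreachable code reached\n",		\
	    __FILE__, __LINE__);					\
	abort();							\
	unreachable();							\
} while (0)

/* The caller guarantees v is 1, 2, or 3. */
static int
index_of(unsigned v)
{

	switch (v) {
	case 1:	return (0);
	case 2:	return (1);
	case 3:	return (2);
	default:
		not_reached();
	}
}

int
main(void)
{

	return (index_of(1));
}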
This works around the problem by informing the compiler that not_reached() means code in unreachable in release builds. --- include/jemalloc/internal/util.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index fac2a17..c977aea 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -60,9 +60,11 @@ #ifdef __GNUC__ # define likely(x) __builtin_expect(!!(x), 1) # define unlikely(x) __builtin_expect(!!(x), 0) +# define unreachable() __builtin_unreachable() #else # define likely(x) !!(x) # define unlikely(x) !!(x) +# define unreachable() #endif /* @@ -88,6 +90,7 @@ __FILE__, __LINE__); \ abort(); \ } \ + unreachable(); \ } while (0) #endif -- cgit v0.12 From 5716d97f7575708453ca477651eff6f1ac653dd1 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 6 Aug 2015 23:34:12 -0700 Subject: Fix an in-place growing large reallocation regression. Fix arena_ralloc_large_grow() to properly account for large_pad, so that in-place large reallocation succeeds when possible, rather than always failing. This regression was introduced by 8a03cf039cd06f9fa6972711195055d865673966 (Implement cache index randomization for large allocations.) --- src/arena.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/arena.c b/src/arena.c index ceeef81..34ac2ae 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2441,7 +2441,7 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t oldsize, size_t size, size_t extra, bool zero) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - size_t npages = oldsize >> LG_PAGE; + size_t npages = (oldsize + large_pad) >> LG_PAGE; size_t followsize; size_t usize_min = s2u(size); @@ -2451,7 +2451,7 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, /* Try to extend the run. */ assert(usize_min > oldsize); malloc_mutex_lock(&arena->lock); - if (pageind + npages < chunk_npages && + if (pageind+npages < chunk_npages && arena_mapbits_allocated_get(chunk, pageind+npages) == 0 && (followsize = arena_mapbits_unallocated_size_get(chunk, pageind+npages)) >= usize_min - oldsize) { @@ -2467,13 +2467,13 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, while (oldsize + followsize < usize) usize = index2size(size2index(usize)-1); assert(usize >= usize_min); - splitsize = usize - oldsize + large_pad; + splitsize = usize - oldsize; run = &arena_miscelm_get(chunk, pageind+npages)->run; arena_run_split_large(arena, run, splitsize, zero); size = oldsize + splitsize; - npages = size >> LG_PAGE; + npages = (size + large_pad) >> LG_PAGE; /* * Mark the extended run as dirty if either portion of the run @@ -2485,7 +2485,8 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, */ flag_dirty = arena_mapbits_dirty_get(chunk, pageind) | arena_mapbits_dirty_get(chunk, pageind+npages-1); - arena_mapbits_large_set(chunk, pageind, size, flag_dirty); + arena_mapbits_large_set(chunk, pageind, size + large_pad, + flag_dirty); arena_mapbits_large_set(chunk, pageind+npages-1, 0, flag_dirty); if (config_stats) { -- cgit v0.12 From 8fadb1a8c2d0219aded566bc5fac7d29cff9bb67 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 4 Aug 2015 10:49:46 -0700 Subject: Implement chunk hook support for page run commit/decommit. Cascade from decommit to purge when purging unused dirty pages, so that it is possible to decommit cleaned memory rather than just purging. 
For non-Windows debug builds, decommit runs rather than purging them, since this causes access of deallocated runs to segfault. This resolves #251. --- doc/jemalloc.xml.in | 61 +++-- include/jemalloc/internal/arena.h | 116 ++++++--- include/jemalloc/internal/chunk.h | 6 +- include/jemalloc/internal/chunk_dss.h | 2 +- include/jemalloc/internal/chunk_mmap.h | 3 +- include/jemalloc/internal/extent.h | 38 +-- include/jemalloc/internal/private_symbols.txt | 1 + include/jemalloc/jemalloc_typedefs.h.in | 18 +- src/arena.c | 357 +++++++++++++++++++------- src/base.c | 2 +- src/chunk.c | 126 +++++---- src/chunk_dss.c | 6 +- src/chunk_mmap.c | 8 +- src/huge.c | 2 +- test/integration/chunk.c | 66 +++-- 15 files changed, 545 insertions(+), 267 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 4cb74a0..39f6a34 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1565,21 +1565,25 @@ typedef struct { size_t size size_t alignment bool *zero + bool *commit unsigned arena_ind A chunk allocation function conforms to the chunk_alloc_t type and upon success returns a pointer to size - bytes of mapped committed memory on behalf of arena + bytes of mapped memory on behalf of arena arena_ind such that the chunk's base address is a multiple of alignment, as well as setting - *zero to indicate whether the chunk is zeroed. - Upon error the function returns NULL and leaves - *zero unmodified. The + *zero to indicate whether the chunk is zeroed and + *commit to indicate whether the chunk is + committed. Upon error the function returns NULL + and leaves *zero and + *commit unmodified. The size parameter is always a multiple of the chunk size. The alignment parameter is always a power of two at least as large as the chunk size. Zeroing is mandatory if - *zero is true upon function entry. If - chunk is not NULL, the + *zero is true upon function entry. Committing is + mandatory if *commit is true upon function entry. + If chunk is not NULL, the returned pointer must be chunk on success or NULL on error. Committed memory may be committed in absolute terms as on a system that does not overcommit, or in @@ -1593,48 +1597,57 @@ typedef struct { typedef bool (chunk_dalloc_t) void *chunk size_t size + bool committed unsigned arena_ind A chunk deallocation function conforms to the chunk_dalloc_t type and deallocates a - chunk of given size on + chunk of given size with + committed/decommited memory as indicated, on behalf of arena arena_ind, returning false upon success. If the function returns true, this indicates opt-out from deallocation; the virtual memory mapping associated with the chunk - remains mapped, committed, and available for future use, in which case - it will be automatically retained for later reuse. + remains mapped, in the same commit state, and available for future use, + in which case it will be automatically retained for later reuse. typedef bool (chunk_commit_t) void *chunk size_t size + size_t offset + size_t length unsigned arena_ind A chunk commit function conforms to the chunk_commit_t type - and commits zeroed physical memory to back a - chunk of given size on - behalf of arena arena_ind, returning false upon - success. Committed memory may be committed in absolute terms as on a - system that does not overcommit, or in implicit terms as on a system - that overcommits and satisfies physical memory needs on demand via soft - page faults. 
If the function returns true, this indicates insufficient + and commits zeroed physical memory to back pages within a + chunk of given size at + offset bytes, extending for + length on behalf of arena + arena_ind, returning false upon success. + Committed memory may be committed in absolute terms as on a system that + does not overcommit, or in implicit terms as on a system that + overcommits and satisfies physical memory needs on demand via soft page + faults. If the function returns true, this indicates insufficient physical memory to satisfy the request. typedef bool (chunk_decommit_t) void *chunk size_t size + size_t offset + size_t length unsigned arena_ind A chunk decommit function conforms to the chunk_decommit_t - type and decommits any physical memory that is backing a - chunk of given size on - behalf of arena arena_ind, returning false upon - success, in which case the chunk will be committed via the chunk commit - function before being reused. If the function returns true, this - indicates opt-out from decommit; the memory remains committed and - available for future use, in which case it will be automatically - retained for later reuse. + type and decommits any physical memory that is backing pages within a + chunk of given size at + offset bytes, extending for + length on behalf of arena + arena_ind, returning false upon success, in which + case the pages will be committed via the chunk commit function before + being reused. If the function returns true, this indicates opt-out from + decommit; the memory remains committed and available for future use, in + which case it will be automatically retained for later reuse. typedef bool (chunk_purge_t) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 29f73e7..b2afb17 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -54,15 +54,16 @@ struct arena_chunk_map_bits_s { * Run address (or size) and various flags are stored together. The bit * layout looks like (assuming 32-bit system): * - * ???????? ???????? ????nnnn nnnndula + * ???????? ???????? ???nnnnn nnndulma * * ? : Unallocated: Run address for first/last pages, unset for internal * pages. * Small: Run page offset. - * Large: Run size for first page, unset for trailing pages. + * Large: Run page count for first page, unset for trailing pages. * n : binind for small size class, BININD_INVALID for large size class. * d : dirty? * u : unzeroed? + * m : decommitted? * l : large? * a : allocated? 
* @@ -74,51 +75,58 @@ struct arena_chunk_map_bits_s { * x : don't care * - : 0 * + : 1 - * [DULA] : bit set - * [dula] : bit unset + * [DUMLA] : bit set + * [dumla] : bit unset * * Unallocated (clean): - * ssssssss ssssssss ssss++++ ++++du-a - * xxxxxxxx xxxxxxxx xxxxxxxx xxxx-Uxx - * ssssssss ssssssss ssss++++ ++++dU-a + * ssssssss ssssssss sss+++++ +++dum-a + * xxxxxxxx xxxxxxxx xxxxxxxx xxx-Uxxx + * ssssssss ssssssss sss+++++ +++dUm-a * * Unallocated (dirty): - * ssssssss ssssssss ssss++++ ++++D--a + * ssssssss ssssssss sss+++++ +++D-m-a * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx - * ssssssss ssssssss ssss++++ ++++D--a + * ssssssss ssssssss sss+++++ +++D-m-a * * Small: - * pppppppp pppppppp ppppnnnn nnnnd--A - * pppppppp pppppppp ppppnnnn nnnn---A - * pppppppp pppppppp ppppnnnn nnnnd--A + * pppppppp pppppppp pppnnnnn nnnd---A + * pppppppp pppppppp pppnnnnn nnn----A + * pppppppp pppppppp pppnnnnn nnnd---A * * Large: - * ssssssss ssssssss ssss++++ ++++D-LA + * ssssssss ssssssss sss+++++ +++D--LA * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx - * -------- -------- ----++++ ++++D-LA + * -------- -------- ---+++++ +++D--LA * * Large (sampled, size <= LARGE_MINCLASS): - * ssssssss ssssssss ssssnnnn nnnnD-LA + * ssssssss ssssssss sssnnnnn nnnD--LA * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx - * -------- -------- ----++++ ++++D-LA + * -------- -------- ---+++++ +++D--LA * * Large (not sampled, size == LARGE_MINCLASS): - * ssssssss ssssssss ssss++++ ++++D-LA + * ssssssss ssssssss sss+++++ +++D--LA * xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx - * -------- -------- ----++++ ++++D-LA + * -------- -------- ---+++++ +++D--LA */ size_t bits; -#define CHUNK_MAP_BININD_SHIFT 4 +#define CHUNK_MAP_ALLOCATED ((size_t)0x01U) +#define CHUNK_MAP_LARGE ((size_t)0x02U) +#define CHUNK_MAP_STATE_MASK ((size_t)0x3U) + +#define CHUNK_MAP_DECOMMITTED ((size_t)0x04U) +#define CHUNK_MAP_UNZEROED ((size_t)0x08U) +#define CHUNK_MAP_DIRTY ((size_t)0x10U) +#define CHUNK_MAP_FLAGS_MASK ((size_t)0x1cU) + +#define CHUNK_MAP_BININD_SHIFT 5 #define BININD_INVALID ((size_t)0xffU) -/* CHUNK_MAP_BININD_MASK == (BININD_INVALID << CHUNK_MAP_BININD_SHIFT) */ -#define CHUNK_MAP_BININD_MASK ((size_t)0xff0U) +#define CHUNK_MAP_BININD_MASK (BININD_INVALID << CHUNK_MAP_BININD_SHIFT) #define CHUNK_MAP_BININD_INVALID CHUNK_MAP_BININD_MASK -#define CHUNK_MAP_FLAGS_MASK ((size_t)0xcU) -#define CHUNK_MAP_DIRTY ((size_t)0x8U) -#define CHUNK_MAP_UNZEROED ((size_t)0x4U) -#define CHUNK_MAP_LARGE ((size_t)0x2U) -#define CHUNK_MAP_ALLOCATED ((size_t)0x1U) -#define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED + +#define CHUNK_MAP_RUNIND_SHIFT (CHUNK_MAP_BININD_SHIFT + 8) +#define CHUNK_MAP_SIZE_SHIFT (CHUNK_MAP_RUNIND_SHIFT - LG_PAGE) +#define CHUNK_MAP_SIZE_MASK \ + (~(CHUNK_MAP_BININD_MASK | CHUNK_MAP_FLAGS_MASK | CHUNK_MAP_STATE_MASK)) }; struct arena_runs_dirty_link_s { @@ -518,6 +526,7 @@ size_t arena_mapbits_small_runind_get(arena_chunk_t *chunk, size_t pageind); index_t arena_mapbits_binind_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_dirty_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_unzeroed_get(arena_chunk_t *chunk, size_t pageind); +size_t arena_mapbits_decommitted_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_large_get(arena_chunk_t *chunk, size_t pageind); size_t arena_mapbits_allocated_get(arena_chunk_t *chunk, size_t pageind); void arena_mapbitsp_write(size_t *mapbitsp, size_t mapbits); @@ -650,7 +659,7 @@ arena_mapbits_unallocated_size_get(arena_chunk_t *chunk, size_t pageind) mapbits = arena_mapbits_get(chunk, 
pageind); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0); - return (mapbits & ~PAGE_MASK); + return ((mapbits & CHUNK_MAP_SIZE_MASK) >> CHUNK_MAP_SIZE_SHIFT); } JEMALLOC_ALWAYS_INLINE size_t @@ -661,7 +670,7 @@ arena_mapbits_large_size_get(arena_chunk_t *chunk, size_t pageind) mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)); - return (mapbits & ~PAGE_MASK); + return ((mapbits & CHUNK_MAP_SIZE_MASK) >> CHUNK_MAP_SIZE_SHIFT); } JEMALLOC_ALWAYS_INLINE size_t @@ -672,7 +681,7 @@ arena_mapbits_small_runind_get(arena_chunk_t *chunk, size_t pageind) mapbits = arena_mapbits_get(chunk, pageind); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == CHUNK_MAP_ALLOCATED); - return (mapbits >> LG_PAGE); + return (mapbits >> CHUNK_MAP_RUNIND_SHIFT); } JEMALLOC_ALWAYS_INLINE index_t @@ -693,6 +702,8 @@ arena_mapbits_dirty_get(arena_chunk_t *chunk, size_t pageind) size_t mapbits; mapbits = arena_mapbits_get(chunk, pageind); + assert((mapbits & CHUNK_MAP_DECOMMITTED) == 0 || (mapbits & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); return (mapbits & CHUNK_MAP_DIRTY); } @@ -702,10 +713,23 @@ arena_mapbits_unzeroed_get(arena_chunk_t *chunk, size_t pageind) size_t mapbits; mapbits = arena_mapbits_get(chunk, pageind); + assert((mapbits & CHUNK_MAP_DECOMMITTED) == 0 || (mapbits & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); return (mapbits & CHUNK_MAP_UNZEROED); } JEMALLOC_ALWAYS_INLINE size_t +arena_mapbits_decommitted_get(arena_chunk_t *chunk, size_t pageind) +{ + size_t mapbits; + + mapbits = arena_mapbits_get(chunk, pageind); + assert((mapbits & CHUNK_MAP_DECOMMITTED) == 0 || (mapbits & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); + return (mapbits & CHUNK_MAP_DECOMMITTED); +} + +JEMALLOC_ALWAYS_INLINE size_t arena_mapbits_large_get(arena_chunk_t *chunk, size_t pageind) { size_t mapbits; @@ -736,10 +760,13 @@ arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size, { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - assert(size == PAGE_CEILING(size)); - assert((flags & ~CHUNK_MAP_FLAGS_MASK) == 0); - assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == flags); - arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags); + assert((size & PAGE_MASK) == 0); + assert(((size << CHUNK_MAP_SIZE_SHIFT) & ~CHUNK_MAP_SIZE_MASK) == 0); + assert((flags & CHUNK_MAP_FLAGS_MASK) == flags); + assert((flags & CHUNK_MAP_DECOMMITTED) == 0 || (flags & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); + arena_mapbitsp_write(mapbitsp, (size << CHUNK_MAP_SIZE_SHIFT) | + CHUNK_MAP_BININD_INVALID | flags); } JEMALLOC_ALWAYS_INLINE void @@ -749,9 +776,11 @@ arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind, size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); size_t mapbits = arena_mapbitsp_read(mapbitsp); - assert(size == PAGE_CEILING(size)); + assert((size & PAGE_MASK) == 0); + assert(((size << CHUNK_MAP_SIZE_SHIFT) & ~CHUNK_MAP_SIZE_MASK) == 0); assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0); - arena_mapbitsp_write(mapbitsp, size | (mapbits & PAGE_MASK)); + arena_mapbitsp_write(mapbitsp, (size << CHUNK_MAP_SIZE_SHIFT) | (mapbits + & ~CHUNK_MAP_SIZE_MASK)); } JEMALLOC_ALWAYS_INLINE void @@ -762,11 +791,13 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t mapbits = arena_mapbitsp_read(mapbitsp); size_t unzeroed; - assert(size == PAGE_CEILING(size)); - assert((flags & CHUNK_MAP_DIRTY) == 
flags); + assert((size & PAGE_MASK) == 0); + assert(((size << CHUNK_MAP_SIZE_SHIFT) & ~CHUNK_MAP_SIZE_MASK) == 0); + assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_DECOMMITTED)) == flags); unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */ - arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags - | unzeroed | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED); + arena_mapbitsp_write(mapbitsp, (size << CHUNK_MAP_SIZE_SHIFT) | + CHUNK_MAP_BININD_INVALID | flags | unzeroed | CHUNK_MAP_LARGE | + CHUNK_MAP_ALLOCATED); } JEMALLOC_ALWAYS_INLINE void @@ -795,8 +826,9 @@ arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind, assert(pageind - runind >= map_bias); assert((flags & CHUNK_MAP_DIRTY) == flags); unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */ - arena_mapbitsp_write(mapbitsp, (runind << LG_PAGE) | (binind << - CHUNK_MAP_BININD_SHIFT) | flags | unzeroed | CHUNK_MAP_ALLOCATED); + arena_mapbitsp_write(mapbitsp, (runind << CHUNK_MAP_RUNIND_SHIFT) | + (binind << CHUNK_MAP_BININD_SHIFT) | flags | unzeroed | + CHUNK_MAP_ALLOCATED); } JEMALLOC_ALWAYS_INLINE void diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 8e51134..51cd8ce 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -59,13 +59,13 @@ void *chunk_alloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment, bool *zero, bool dalloc_node); void *chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, - void *new_addr, size_t size, size_t alignment, bool *zero); + void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit); void chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, size_t size); void chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, - void *chunk, size_t size, bool zeroed); + void *chunk, size_t size, bool zeroed, bool committed); void chunk_dalloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, - void *chunk, size_t size); + void *chunk, size_t size, bool committed); bool chunk_purge_arena(arena_t *arena, void *chunk, size_t offset, size_t length); bool chunk_purge_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, diff --git a/include/jemalloc/internal/chunk_dss.h b/include/jemalloc/internal/chunk_dss.h index 87366a2..388f46b 100644 --- a/include/jemalloc/internal/chunk_dss.h +++ b/include/jemalloc/internal/chunk_dss.h @@ -24,7 +24,7 @@ extern const char *dss_prec_names[]; dss_prec_t chunk_dss_prec_get(void); bool chunk_dss_prec_set(dss_prec_t dss_prec); void *chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, - size_t alignment, bool *zero); + size_t alignment, bool *zero, bool *commit); bool chunk_in_dss(void *chunk); bool chunk_dss_boot(void); void chunk_dss_prefork(void); diff --git a/include/jemalloc/internal/chunk_mmap.h b/include/jemalloc/internal/chunk_mmap.h index e81dc3a..7d8014c 100644 --- a/include/jemalloc/internal/chunk_mmap.h +++ b/include/jemalloc/internal/chunk_mmap.h @@ -9,7 +9,8 @@ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -void *chunk_alloc_mmap(size_t size, size_t alignment, bool *zero); +void *chunk_alloc_mmap(size_t size, size_t alignment, bool *zero, + bool *commit); bool chunk_dalloc_mmap(void *chunk, size_t size); #endif /* JEMALLOC_H_EXTERNS */ diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index b2ac2b6..f8436e5 100644 --- 
a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -19,6 +19,12 @@ struct extent_node_s { size_t en_size; /* + * The zeroed flag is used by chunk recycling code to track whether + * memory is zero-filled. + */ + bool en_zeroed; + + /* * True if physical memory is committed to the extent, whether * explicitly or implicitly as on a system that overcommits and * satisfies physical mamory needs on demand via soft page faults. @@ -26,12 +32,6 @@ struct extent_node_s { bool en_committed; /* - * The zeroed flag is used by chunk recycling code to track whether - * memory is zero-filled. - */ - bool en_zeroed; - - /* * The achunk flag is used to validate that huge allocation lookups * don't return arena chunks. */ @@ -73,19 +73,19 @@ rb_proto(, extent_tree_ad_, extent_tree_t, extent_node_t) arena_t *extent_node_arena_get(const extent_node_t *node); void *extent_node_addr_get(const extent_node_t *node); size_t extent_node_size_get(const extent_node_t *node); -bool extent_node_committed_get(const extent_node_t *node); bool extent_node_zeroed_get(const extent_node_t *node); +bool extent_node_committed_get(const extent_node_t *node); bool extent_node_achunk_get(const extent_node_t *node); prof_tctx_t *extent_node_prof_tctx_get(const extent_node_t *node); void extent_node_arena_set(extent_node_t *node, arena_t *arena); void extent_node_addr_set(extent_node_t *node, void *addr); void extent_node_size_set(extent_node_t *node, size_t size); -void extent_node_committed_set(extent_node_t *node, bool committed); void extent_node_zeroed_set(extent_node_t *node, bool zeroed); +void extent_node_committed_set(extent_node_t *node, bool committed); void extent_node_achunk_set(extent_node_t *node, bool achunk); void extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx); void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, - size_t size, bool committed, bool zeroed); + size_t size, bool zeroed, bool committed); void extent_node_dirty_linkage_init(extent_node_t *node); void extent_node_dirty_insert(extent_node_t *node, arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty); @@ -115,17 +115,17 @@ extent_node_size_get(const extent_node_t *node) } JEMALLOC_INLINE bool -extent_node_committed_get(const extent_node_t *node) +extent_node_zeroed_get(const extent_node_t *node) { - return (node->en_committed); + return (node->en_zeroed); } JEMALLOC_INLINE bool -extent_node_zeroed_get(const extent_node_t *node) +extent_node_committed_get(const extent_node_t *node) { - return (node->en_zeroed); + return (node->en_committed); } JEMALLOC_INLINE bool @@ -164,17 +164,17 @@ extent_node_size_set(extent_node_t *node, size_t size) } JEMALLOC_INLINE void -extent_node_committed_set(extent_node_t *node, bool committed) +extent_node_zeroed_set(extent_node_t *node, bool zeroed) { - node->en_committed = committed; + node->en_zeroed = zeroed; } JEMALLOC_INLINE void -extent_node_zeroed_set(extent_node_t *node, bool zeroed) +extent_node_committed_set(extent_node_t *node, bool committed) { - node->en_zeroed = zeroed; + node->en_committed = committed; } JEMALLOC_INLINE void @@ -193,14 +193,14 @@ extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx) JEMALLOC_INLINE void extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size, - bool committed, bool zeroed) + bool zeroed, bool committed) { extent_node_arena_set(node, arena); extent_node_addr_set(node, addr); extent_node_size_set(node, size); - extent_node_committed_set(node, committed); 
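	/*
	 * Illustrative aside, not part of the patch: after the reordering in
	 * this hunk, extent_node_init() takes zeroed before committed,
	 * matching the reordered fields and accessors.  A minimal call-site
	 * sketch (node, arena, addr and size are assumed to be in scope):
	 */
	extent_node_init(&node, arena, addr, size,
	    false,	/* zeroed: contents not known to be zero-filled. */
	    true);	/* committed: backed by physical memory. */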
extent_node_zeroed_set(node, zeroed); + extent_node_committed_set(node, committed); extent_node_achunk_set(node, false); if (config_prof) extent_node_prof_tctx_set(node, NULL); diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 0e6216f..2228520 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -39,6 +39,7 @@ arena_malloc_large arena_malloc_small arena_mapbits_allocated_get arena_mapbits_binind_get +arena_mapbits_decommitted_get arena_mapbits_dirty_get arena_mapbits_get arena_mapbits_large_binind_set diff --git a/include/jemalloc/jemalloc_typedefs.h.in b/include/jemalloc/jemalloc_typedefs.h.in index 26eb9ad..fa7b350 100644 --- a/include/jemalloc/jemalloc_typedefs.h.in +++ b/include/jemalloc/jemalloc_typedefs.h.in @@ -1,27 +1,29 @@ /* * void * * chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero, - * unsigned arena_ind); + * bool *commit, unsigned arena_ind); */ -typedef void *(chunk_alloc_t)(void *, size_t, size_t, bool *, unsigned); +typedef void *(chunk_alloc_t)(void *, size_t, size_t, bool *, bool *, unsigned); /* * bool - * chunk_dalloc(void *chunk, size_t size, unsigned arena_ind); + * chunk_dalloc(void *chunk, size_t size, bool committed, unsigned arena_ind); */ -typedef bool (chunk_dalloc_t)(void *, size_t, unsigned); +typedef bool (chunk_dalloc_t)(void *, size_t, bool, unsigned); /* * bool - * chunk_commit(void *chunk, size_t size, unsigned arena_ind); + * chunk_commit(void *chunk, size_t size, size_t offset, size_t length, + * unsigned arena_ind); */ -typedef bool (chunk_commit_t)(void *, size_t, unsigned); +typedef bool (chunk_commit_t)(void *, size_t, size_t, size_t, unsigned); /* * bool - * chunk_decommit(void *chunk, size_t size, unsigned arena_ind); + * chunk_decommit(void *chunk, size_t size, size_t offset, size_t length, + * unsigned arena_ind); */ -typedef bool (chunk_decommit_t)(void *, size_t, unsigned); +typedef bool (chunk_decommit_t)(void *, size_t, size_t, size_t, unsigned); /* * bool diff --git a/src/arena.c b/src/arena.c index 34ac2ae..84ccf11 100644 --- a/src/arena.c +++ b/src/arena.c @@ -25,7 +25,7 @@ unsigned nhclasses; /* Number of huge size classes. 
*/ static void arena_purge(arena_t *arena, bool all); static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, - bool cleaned); + bool cleaned, bool decommitted); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, @@ -33,13 +33,47 @@ static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, /******************************************************************************/ +#define CHUNK_MAP_KEY ((uintptr_t)0x1U) + +JEMALLOC_INLINE_C arena_chunk_map_misc_t * +arena_miscelm_key_create(size_t size) +{ + + return ((arena_chunk_map_misc_t *)((size << CHUNK_MAP_SIZE_SHIFT) | + CHUNK_MAP_KEY)); +} + +JEMALLOC_INLINE_C bool +arena_miscelm_is_key(const arena_chunk_map_misc_t *miscelm) +{ + + return (((uintptr_t)miscelm & CHUNK_MAP_KEY) != 0); +} + +#undef CHUNK_MAP_KEY + JEMALLOC_INLINE_C size_t -arena_miscelm_to_bits(arena_chunk_map_misc_t *miscelm) +arena_miscelm_key_size_get(const arena_chunk_map_misc_t *miscelm) { - arena_chunk_t *chunk = CHUNK_ADDR2BASE(miscelm); - size_t pageind = arena_miscelm_to_pageind(miscelm); - return (arena_mapbits_get(chunk, pageind)); + assert(arena_miscelm_is_key(miscelm)); + + return (((uintptr_t)miscelm & CHUNK_MAP_SIZE_MASK) >> + CHUNK_MAP_SIZE_SHIFT); +} + +JEMALLOC_INLINE_C size_t +arena_miscelm_size_get(arena_chunk_map_misc_t *miscelm) +{ + arena_chunk_t *chunk; + size_t pageind, mapbits; + + assert(!arena_miscelm_is_key(miscelm)); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm); + pageind = arena_miscelm_to_pageind(miscelm); + mapbits = arena_mapbits_get(chunk, pageind); + return ((mapbits & CHUNK_MAP_SIZE_MASK) >> CHUNK_MAP_SIZE_SHIFT); } JEMALLOC_INLINE_C int @@ -140,14 +174,9 @@ arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) { int ret; uintptr_t a_miscelm = (uintptr_t)a; - size_t a_qsize; - size_t b_qsize = run_quantize(arena_miscelm_to_bits(b) & ~PAGE_MASK); - - if (a_miscelm & CHUNK_MAP_KEY) { - size_t a_size = a_miscelm & ~PAGE_MASK; - a_qsize = run_quantize(a_size); - } else - a_qsize = run_quantize(arena_miscelm_to_bits(a) & ~PAGE_MASK); + size_t a_qsize = run_quantize(arena_miscelm_is_key(a) ? + arena_miscelm_key_size_get(a) : arena_miscelm_size_get(a)); + size_t b_qsize = run_quantize(arena_miscelm_size_get(b)); /* * Compare based on quantized size rather than size, in order to sort @@ -155,7 +184,7 @@ arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b) */ ret = (a_qsize > b_qsize) - (a_qsize < b_qsize); if (ret == 0) { - if (!(a_miscelm & CHUNK_MAP_KEY)) { + if (!arena_miscelm_is_key(a)) { uintptr_t b_miscelm = (uintptr_t)b; ret = (a_miscelm > b_miscelm) - (a_miscelm < b_miscelm); @@ -350,10 +379,12 @@ arena_cactive_update(arena_t *arena, size_t add_pages, size_t sub_pages) static void arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, - size_t flag_dirty, size_t need_pages) + size_t flag_dirty, size_t flag_decommitted, size_t need_pages) { size_t total_pages, rem_pages; + assert(flag_dirty == 0 || flag_decommitted == 0); + total_pages = arena_mapbits_unallocated_size_get(chunk, run_ind) >> LG_PAGE; assert(arena_mapbits_dirty_get(chunk, run_ind+total_pages-1) == @@ -369,15 +400,18 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, /* Keep track of trailing unused pages for later use. 
*/ if (rem_pages > 0) { - if (flag_dirty != 0) { + size_t flags = flag_dirty | flag_decommitted; + + if (flags != 0) { arena_mapbits_unallocated_set(chunk, - run_ind+need_pages, (rem_pages << LG_PAGE), - flag_dirty); + run_ind+need_pages, (rem_pages << LG_PAGE), flags); arena_mapbits_unallocated_set(chunk, run_ind+total_pages-1, (rem_pages << LG_PAGE), - flag_dirty); - arena_run_dirty_insert(arena, chunk, run_ind+need_pages, - rem_pages); + flags); + if (flag_dirty != 0) { + arena_run_dirty_insert(arena, chunk, + run_ind+need_pages, rem_pages); + } } else { arena_mapbits_unallocated_set(chunk, run_ind+need_pages, (rem_pages << LG_PAGE), @@ -392,24 +426,30 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, } } -static void +static bool arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, bool remove, bool zero) { arena_chunk_t *chunk; arena_chunk_map_misc_t *miscelm; - size_t flag_dirty, run_ind, need_pages, i; + size_t flag_dirty, flag_decommitted, run_ind, need_pages, i; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); miscelm = arena_run_to_miscelm(run); run_ind = arena_miscelm_to_pageind(miscelm); flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); + flag_decommitted = arena_mapbits_decommitted_get(chunk, run_ind); need_pages = (size >> LG_PAGE); assert(need_pages > 0); + if (flag_decommitted != 0 && arena->chunk_hooks.commit(chunk, + chunksize, (run_ind << LG_PAGE), (need_pages << LG_PAGE), + arena->ind)) + return (true); + if (remove) { arena_run_split_remove(arena, chunk, run_ind, flag_dirty, - need_pages); + flag_decommitted, need_pages); } if (zero) { @@ -445,29 +485,30 @@ arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, */ arena_mapbits_large_set(chunk, run_ind+need_pages-1, 0, flag_dirty); arena_mapbits_large_set(chunk, run_ind, size, flag_dirty); + return (false); } -static void +static bool arena_run_split_large(arena_t *arena, arena_run_t *run, size_t size, bool zero) { - arena_run_split_large_helper(arena, run, size, true, zero); + return (arena_run_split_large_helper(arena, run, size, true, zero)); } -static void +static bool arena_run_init_large(arena_t *arena, arena_run_t *run, size_t size, bool zero) { - arena_run_split_large_helper(arena, run, size, false, zero); + return (arena_run_split_large_helper(arena, run, size, false, zero)); } -static void +static bool arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, index_t binind) { arena_chunk_t *chunk; arena_chunk_map_misc_t *miscelm; - size_t flag_dirty, run_ind, need_pages, i; + size_t flag_dirty, flag_decommitted, run_ind, need_pages, i; assert(binind != BININD_INVALID); @@ -475,10 +516,16 @@ arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, miscelm = arena_run_to_miscelm(run); run_ind = arena_miscelm_to_pageind(miscelm); flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); + flag_decommitted = arena_mapbits_decommitted_get(chunk, run_ind); need_pages = (size >> LG_PAGE); assert(need_pages > 0); - arena_run_split_remove(arena, chunk, run_ind, flag_dirty, need_pages); + if (flag_decommitted != 0 && arena->chunk_hooks.commit(chunk, chunksize, + run_ind << LG_PAGE, size, arena->ind)) + return (true); + + arena_run_split_remove(arena, chunk, run_ind, flag_dirty, + flag_decommitted, need_pages); for (i = 0; i < need_pages; i++) { arena_mapbits_small_set(chunk, run_ind+i, i, binind, 0); @@ -488,6 +535,7 @@ arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, } 
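	/*
	 * Illustrative aside, not part of the patch: the run-split helpers in
	 * the hunks above now return bool and commit decommitted pages on
	 * demand before splitting.  The shared pattern, sketched with the
	 * variables those helpers already have in scope, is roughly:
	 */
	if (flag_decommitted != 0 && arena->chunk_hooks.commit(chunk, chunksize,
	    run_ind << LG_PAGE, need_pages << LG_PAGE, arena->ind))
		return (true);	/* Commit hook failed; caller treats as OOM. */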
JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + (run_ind << LG_PAGE)), (need_pages << LG_PAGE)); + return (false); } static arena_chunk_t * @@ -516,46 +564,70 @@ static bool arena_chunk_register(arena_t *arena, arena_chunk_t *chunk, bool zero) { - extent_node_init(&chunk->node, arena, chunk, chunksize, true, zero); + /* + * The extent node notion of "committed" doesn't directly apply to + * arena chunks. Arbitrarily mark them as committed (after all they are + * always at least partially committed). The commit state of runs is + * tracked individually. + */ + extent_node_init(&chunk->node, arena, chunk, chunksize, zero, true); extent_node_achunk_set(&chunk->node, true); return (chunk_register(chunk, &chunk->node)); } static arena_chunk_t * arena_chunk_alloc_internal_hard(arena_t *arena, chunk_hooks_t *chunk_hooks, - bool *zero) + bool *zero, bool *commit) { arena_chunk_t *chunk; malloc_mutex_unlock(&arena->lock); chunk = (arena_chunk_t *)chunk_alloc_wrapper(arena, chunk_hooks, NULL, - chunksize, chunksize, zero); + chunksize, chunksize, zero, commit); + if (chunk != NULL && !*commit) { + /* Commit header. */ + if (chunk_hooks->commit(chunk, chunksize, 0, map_bias << + LG_PAGE, arena->ind)) { + chunk_dalloc_wrapper(arena, chunk_hooks, + (void *)chunk, chunksize, *commit); + chunk = NULL; + } + } if (chunk != NULL && arena_chunk_register(arena, chunk, *zero)) { + if (!*commit) { + /* Undo commit of header. */ + chunk_hooks->decommit(chunk, chunksize, 0, map_bias << + LG_PAGE, arena->ind); + } chunk_dalloc_wrapper(arena, chunk_hooks, (void *)chunk, - chunksize); + chunksize, *commit); chunk = NULL; } - malloc_mutex_lock(&arena->lock); + malloc_mutex_lock(&arena->lock); return (chunk); } static arena_chunk_t * -arena_chunk_alloc_internal(arena_t *arena, bool *zero) +arena_chunk_alloc_internal(arena_t *arena, bool *zero, bool *commit) { arena_chunk_t *chunk; chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; chunk = chunk_alloc_cache(arena, &chunk_hooks, NULL, chunksize, chunksize, zero, true); - if (chunk != NULL && arena_chunk_register(arena, chunk, *zero)) { - chunk_dalloc_cache(arena, &chunk_hooks, chunk, chunksize); - return (NULL); + if (chunk != NULL) { + if (arena_chunk_register(arena, chunk, *zero)) { + chunk_dalloc_cache(arena, &chunk_hooks, chunk, + chunksize); + return (NULL); + } + *commit = true; } if (chunk == NULL) { chunk = arena_chunk_alloc_internal_hard(arena, &chunk_hooks, - zero); + zero, commit); } if (config_stats && chunk != NULL) { @@ -570,22 +642,26 @@ static arena_chunk_t * arena_chunk_init_hard(arena_t *arena) { arena_chunk_t *chunk; - bool zero; - size_t unzeroed, i; + bool zero, commit; + size_t unzeroed, decommitted, i; assert(arena->spare == NULL); zero = false; - chunk = arena_chunk_alloc_internal(arena, &zero); + commit = false; + chunk = arena_chunk_alloc_internal(arena, &zero, &commit); if (chunk == NULL) return (NULL); /* * Initialize the map to contain one maximal free untouched run. Mark - * the pages as zeroed iff chunk_alloc() returned a zeroed chunk. + * the pages as zeroed if chunk_alloc() returned a zeroed or decommitted + * chunk. */ - unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED; - arena_mapbits_unallocated_set(chunk, map_bias, arena_maxrun, unzeroed); + unzeroed = (zero || !commit) ? 0 : CHUNK_MAP_UNZEROED; + decommitted = commit ? 
0 : CHUNK_MAP_DECOMMITTED; + arena_mapbits_unallocated_set(chunk, map_bias, arena_maxrun, unzeroed | + decommitted); /* * There is no need to initialize the internal page map entries unless * the chunk is not zeroed. @@ -777,9 +853,10 @@ arena_chunk_alloc_huge_hard(arena_t *arena, chunk_hooks_t *chunk_hooks, size_t usize, size_t alignment, bool *zero, size_t csize) { void *ret; + bool commit = true; ret = chunk_alloc_wrapper(arena, chunk_hooks, NULL, csize, alignment, - zero); + zero, &commit); if (ret == NULL) { /* Revert optimistic stats updates. */ malloc_mutex_lock(&arena->lock); @@ -901,9 +978,10 @@ arena_chunk_ralloc_huge_expand_hard(arena_t *arena, chunk_hooks_t *chunk_hooks, size_t udiff, size_t cdiff) { bool err; + bool commit = true; err = (chunk_alloc_wrapper(arena, chunk_hooks, nchunk, cdiff, chunksize, - zero) == NULL); + zero, &commit) == NULL); if (err) { /* Revert optimistic stats updates. */ malloc_mutex_lock(&arena->lock); @@ -916,7 +994,8 @@ arena_chunk_ralloc_huge_expand_hard(arena_t *arena, chunk_hooks_t *chunk_hooks, malloc_mutex_unlock(&arena->lock); } else if (chunk_hooks->merge(chunk, CHUNK_CEILING(oldsize), nchunk, cdiff, true, arena->ind)) { - chunk_dalloc_arena(arena, chunk_hooks, nchunk, cdiff, *zero); + chunk_dalloc_arena(arena, chunk_hooks, nchunk, cdiff, *zero, + true); err = true; } return (err); @@ -927,13 +1006,11 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, size_t usize, bool *zero) { bool err; - chunk_hooks_t chunk_hooks; + chunk_hooks_t chunk_hooks = chunk_hooks_get(arena); void *nchunk = (void *)((uintptr_t)chunk + CHUNK_CEILING(oldsize)); size_t udiff = usize - oldsize; size_t cdiff = CHUNK_CEILING(usize) - CHUNK_CEILING(oldsize); - chunk_hooks = chunk_hooks_get(arena); - malloc_mutex_lock(&arena->lock); /* Optimistically update stats. 
*/ @@ -952,7 +1029,8 @@ arena_chunk_ralloc_huge_expand(arena_t *arena, void *chunk, size_t oldsize, cdiff); } else if (chunk_hooks.merge(chunk, CHUNK_CEILING(oldsize), nchunk, cdiff, true, arena->ind)) { - chunk_dalloc_arena(arena, &chunk_hooks, nchunk, cdiff, *zero); + chunk_dalloc_arena(arena, &chunk_hooks, nchunk, cdiff, *zero, + true); err = true; } @@ -970,8 +1048,7 @@ static arena_run_t * arena_run_first_best_fit(arena_t *arena, size_t size) { size_t search_size = run_quantize_first(size); - arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *) - (search_size | CHUNK_MAP_KEY); + arena_chunk_map_misc_t *key = arena_miscelm_key_create(search_size); arena_chunk_map_misc_t *miscelm = arena_avail_tree_nsearch(&arena->runs_avail, key); if (miscelm == NULL) @@ -983,8 +1060,10 @@ static arena_run_t * arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) { arena_run_t *run = arena_run_first_best_fit(arena, s2u(size)); - if (run != NULL) - arena_run_split_large(arena, run, size, zero); + if (run != NULL) { + if (arena_run_split_large(arena, run, size, zero)) + run = NULL; + } return (run); } @@ -1008,7 +1087,8 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero) chunk = arena_chunk_alloc(arena); if (chunk != NULL) { run = &arena_miscelm_get(chunk, map_bias)->run; - arena_run_split_large(arena, run, size, zero); + if (arena_run_split_large(arena, run, size, zero)) + run = NULL; return (run); } @@ -1024,8 +1104,10 @@ static arena_run_t * arena_run_alloc_small_helper(arena_t *arena, size_t size, index_t binind) { arena_run_t *run = arena_run_first_best_fit(arena, size); - if (run != NULL) - arena_run_split_small(arena, run, size, binind); + if (run != NULL) { + if (arena_run_split_small(arena, run, size, binind)) + run = NULL; + } return (run); } @@ -1050,7 +1132,8 @@ arena_run_alloc_small(arena_t *arena, size_t size, index_t binind) chunk = arena_chunk_alloc(arena); if (chunk != NULL) { run = &arena_miscelm_get(chunk, map_bias)->run; - arena_run_split_small(arena, run, size, binind); + if (arena_run_split_small(arena, run, size, binind)) + run = NULL; return (run); } @@ -1292,9 +1375,9 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks, chunkselm = qr_next(chunkselm, cc_link); } else { size_t pageind, run_size, flag_unzeroed, i; - bool unzeroed; - arena_chunk_t *chunk = (arena_chunk_t - *)CHUNK_ADDR2BASE(rdelm); + bool unzeroed, decommitted; + arena_chunk_t *chunk = + (arena_chunk_t *)CHUNK_ADDR2BASE(rdelm); arena_chunk_map_misc_t *miscelm = arena_rd_to_miscelm(rdelm); pageind = arena_miscelm_to_pageind(miscelm); @@ -1302,9 +1385,19 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks, npages = run_size >> LG_PAGE; assert(pageind + npages <= chunk_npages); - unzeroed = chunk_purge_wrapper(arena, - chunk_hooks, chunk, chunksize, pageind << LG_PAGE, - run_size); + decommitted = !chunk_hooks->decommit(chunk, chunksize, + pageind << LG_PAGE, npages << LG_PAGE, arena->ind); + if (decommitted) { + arena_mapbits_large_set(chunk, pageind+npages-1, + 0, CHUNK_MAP_DECOMMITTED); + arena_mapbits_large_set(chunk, pageind, + run_size, CHUNK_MAP_DECOMMITTED); + unzeroed = false; + } else { + unzeroed = chunk_purge_wrapper(arena, + chunk_hooks, chunk, chunksize, pageind << + LG_PAGE, run_size); + } flag_unzeroed = unzeroed ? 
CHUNK_MAP_UNZEROED : 0; /* @@ -1361,13 +1454,18 @@ arena_unstash_purged(arena_t *arena, chunk_hooks_t *chunk_hooks, arena_node_dalloc(arena, chunkselm); chunkselm = chunkselm_next; chunk_dalloc_arena(arena, chunk_hooks, addr, size, - zeroed); + zeroed, true); } else { + arena_chunk_t *chunk = + (arena_chunk_t *)CHUNK_ADDR2BASE(rdelm); arena_chunk_map_misc_t *miscelm = arena_rd_to_miscelm(rdelm); + size_t pageind = arena_miscelm_to_pageind(miscelm); + bool decommitted = (arena_mapbits_decommitted_get(chunk, + pageind) != 0); arena_run_t *run = &miscelm->run; qr_remove(rdelm, rd_link); - arena_run_dalloc(arena, run, false, true); + arena_run_dalloc(arena, run, false, true, decommitted); } } } @@ -1375,7 +1473,7 @@ arena_unstash_purged(arena_t *arena, chunk_hooks_t *chunk_hooks, static void arena_purge(arena_t *arena, bool all) { - chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; + chunk_hooks_t chunk_hooks = chunk_hooks_get(arena); size_t npurge, npurgeable, npurged; arena_runs_dirty_link_t purge_runs_sentinel; extent_node_t purge_chunks_sentinel; @@ -1422,7 +1520,8 @@ arena_purge_all(arena_t *arena) static void arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, - size_t *p_run_ind, size_t *p_run_pages, size_t flag_dirty) + size_t *p_run_ind, size_t *p_run_pages, size_t flag_dirty, + size_t flag_decommitted) { size_t size = *p_size; size_t run_ind = *p_run_ind; @@ -1431,7 +1530,9 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, /* Try to coalesce forward. */ if (run_ind + run_pages < chunk_npages && arena_mapbits_allocated_get(chunk, run_ind+run_pages) == 0 && - arena_mapbits_dirty_get(chunk, run_ind+run_pages) == flag_dirty) { + arena_mapbits_dirty_get(chunk, run_ind+run_pages) == flag_dirty && + arena_mapbits_decommitted_get(chunk, run_ind+run_pages) == + flag_decommitted) { size_t nrun_size = arena_mapbits_unallocated_size_get(chunk, run_ind+run_pages); size_t nrun_pages = nrun_size >> LG_PAGE; @@ -1444,6 +1545,8 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, run_ind+run_pages+nrun_pages-1) == nrun_size); assert(arena_mapbits_dirty_get(chunk, run_ind+run_pages+nrun_pages-1) == flag_dirty); + assert(arena_mapbits_decommitted_get(chunk, + run_ind+run_pages+nrun_pages-1) == flag_decommitted); arena_avail_remove(arena, chunk, run_ind+run_pages, nrun_pages); /* @@ -1466,7 +1569,8 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, /* Try to coalesce backward. 
*/ if (run_ind > map_bias && arena_mapbits_allocated_get(chunk, run_ind-1) == 0 && arena_mapbits_dirty_get(chunk, run_ind-1) == - flag_dirty) { + flag_dirty && arena_mapbits_decommitted_get(chunk, run_ind-1) == + flag_decommitted) { size_t prun_size = arena_mapbits_unallocated_size_get(chunk, run_ind-1); size_t prun_pages = prun_size >> LG_PAGE; @@ -1480,6 +1584,8 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, assert(arena_mapbits_unallocated_size_get(chunk, run_ind) == prun_size); assert(arena_mapbits_dirty_get(chunk, run_ind) == flag_dirty); + assert(arena_mapbits_decommitted_get(chunk, run_ind) == + flag_decommitted); arena_avail_remove(arena, chunk, run_ind, prun_pages); /* @@ -1504,27 +1610,53 @@ arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, *p_run_pages = run_pages; } -static void -arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) +static size_t +arena_run_size_get(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, + size_t run_ind) { - arena_chunk_t *chunk; - arena_chunk_map_misc_t *miscelm; - size_t size, run_ind, run_pages, flag_dirty; + size_t size; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - miscelm = arena_run_to_miscelm(run); - run_ind = arena_miscelm_to_pageind(miscelm); assert(run_ind >= map_bias); assert(run_ind < chunk_npages); + if (arena_mapbits_large_get(chunk, run_ind) != 0) { size = arena_mapbits_large_size_get(chunk, run_ind); - assert(size == PAGE || - arena_mapbits_large_size_get(chunk, + assert(size == PAGE || arena_mapbits_large_size_get(chunk, run_ind+(size>>LG_PAGE)-1) == 0); } else { arena_bin_info_t *bin_info = &arena_bin_info[run->binind]; size = bin_info->run_size; } + + return (size); +} + +static bool +arena_run_decommit(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run) +{ + arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run); + size_t run_ind = arena_miscelm_to_pageind(miscelm); + size_t offset = run_ind << LG_PAGE; + size_t length = arena_run_size_get(arena, chunk, run, run_ind); + + return (arena->chunk_hooks.decommit(chunk, chunksize, offset, length, + arena->ind)); +} + +static void +arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned, + bool decommitted) +{ + arena_chunk_t *chunk; + arena_chunk_map_misc_t *miscelm; + size_t size, run_ind, run_pages, flag_dirty, flag_decommitted; + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + miscelm = arena_run_to_miscelm(run); + run_ind = arena_miscelm_to_pageind(miscelm); + assert(run_ind >= map_bias); + assert(run_ind < chunk_npages); + size = arena_run_size_get(arena, chunk, run, run_ind); run_pages = (size >> LG_PAGE); arena_cactive_update(arena, 0, run_pages); arena->nactive -= run_pages; @@ -1536,16 +1668,18 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) */ assert(arena_mapbits_dirty_get(chunk, run_ind) == arena_mapbits_dirty_get(chunk, run_ind+run_pages-1)); - if (!cleaned && arena_mapbits_dirty_get(chunk, run_ind) != 0) + if (!cleaned && !decommitted && arena_mapbits_dirty_get(chunk, run_ind) + != 0) dirty = true; flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0; + flag_decommitted = decommitted ? CHUNK_MAP_DECOMMITTED : 0; /* Mark pages as unallocated in the chunk map. 
*/ - if (dirty) { - arena_mapbits_unallocated_set(chunk, run_ind, size, - CHUNK_MAP_DIRTY); + if (dirty || decommitted) { + size_t flags = flag_dirty | flag_decommitted; + arena_mapbits_unallocated_set(chunk, run_ind, size, flags); arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size, - CHUNK_MAP_DIRTY); + flags); } else { arena_mapbits_unallocated_set(chunk, run_ind, size, arena_mapbits_unzeroed_get(chunk, run_ind)); @@ -1553,13 +1687,16 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) arena_mapbits_unzeroed_get(chunk, run_ind+run_pages-1)); } - arena_run_coalesce(arena, chunk, &size, &run_ind, &run_pages, flag_dirty); + arena_run_coalesce(arena, chunk, &size, &run_ind, &run_pages, + flag_dirty, flag_decommitted); /* Insert into runs_avail, now that coalescing is complete. */ assert(arena_mapbits_unallocated_size_get(chunk, run_ind) == arena_mapbits_unallocated_size_get(chunk, run_ind+run_pages-1)); assert(arena_mapbits_dirty_get(chunk, run_ind) == arena_mapbits_dirty_get(chunk, run_ind+run_pages-1)); + assert(arena_mapbits_decommitted_get(chunk, run_ind) == + arena_mapbits_decommitted_get(chunk, run_ind+run_pages-1)); arena_avail_insert(arena, chunk, run_ind, run_pages); if (dirty) @@ -1591,6 +1728,7 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t pageind = arena_miscelm_to_pageind(miscelm); size_t head_npages = (oldsize - newsize) >> LG_PAGE; size_t flag_dirty = arena_mapbits_dirty_get(chunk, pageind); + bool decommitted = (arena_mapbits_decommitted_get(chunk, pageind) != 0); assert(oldsize > newsize); @@ -1613,7 +1751,7 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_mapbits_large_set(chunk, pageind+head_npages, newsize, flag_dirty); - arena_run_dalloc(arena, run, false, false); + arena_run_dalloc(arena, run, false, false, decommitted); } static void @@ -1624,6 +1762,7 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t pageind = arena_miscelm_to_pageind(miscelm); size_t head_npages = newsize >> LG_PAGE; size_t flag_dirty = arena_mapbits_dirty_get(chunk, pageind); + bool decommitted = arena_mapbits_decommitted_get(chunk, pageind) != 0; arena_chunk_map_misc_t *tail_miscelm; arena_run_t *tail_run; @@ -1650,7 +1789,7 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, tail_miscelm = arena_miscelm_get(chunk, pageind + head_npages); tail_run = &tail_miscelm->run; - arena_run_dalloc(arena, tail_run, dirty, false); + arena_run_dalloc(arena, tail_run, dirty, false, decommitted); } static arena_run_t * @@ -1896,7 +2035,8 @@ arena_redzones_validate(void *ptr, arena_bin_info_t *bin_info, bool reset) uint8_t *byte = (uint8_t *)((uintptr_t)ptr - i); if (*byte != 0xa5) { error = true; - arena_redzone_corruption(ptr, size, false, i, *byte); + arena_redzone_corruption(ptr, size, false, i, + *byte); if (reset) *byte = 0xa5; } @@ -1905,7 +2045,8 @@ arena_redzones_validate(void *ptr, arena_bin_info_t *bin_info, bool reset) uint8_t *byte = (uint8_t *)((uintptr_t)ptr + size + i); if (*byte != 0xa5) { error = true; - arena_redzone_corruption(ptr, size, true, i, *byte); + arena_redzone_corruption(ptr, size, true, i, + *byte); if (reset) *byte = 0xa5; } @@ -2119,7 +2260,19 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, arena_run_trim_tail(arena, chunk, run, usize + large_pad + trailsize, usize + large_pad, false); } - arena_run_init_large(arena, run, usize + large_pad, zero); + if 
(arena_run_init_large(arena, run, usize + large_pad, zero)) { + size_t run_ind = + arena_miscelm_to_pageind(arena_run_to_miscelm(run)); + size_t flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); + size_t flag_decommitted = arena_mapbits_decommitted_get(chunk, + run_ind); + + assert(flag_decommitted != 0); /* Cause of OOM. */ + arena_run_dalloc(arena, run, (flag_dirty != 0), false, + (flag_decommitted != 0)); + malloc_mutex_unlock(&arena->lock); + return (NULL); + } ret = arena_miscelm_to_rpages(miscelm); if (config_stats) { @@ -2237,7 +2390,10 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, malloc_mutex_unlock(&bin->lock); /******************************/ malloc_mutex_lock(&arena->lock); - arena_run_dalloc(arena, run, true, false); + { + bool committed = arena_run_decommit(arena, chunk, run); + arena_run_dalloc(arena, run, committed, false, !committed); + } malloc_mutex_unlock(&arena->lock); /****************************/ malloc_mutex_lock(&bin->lock); @@ -2363,6 +2519,7 @@ arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk, size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); arena_run_t *run = &miscelm->run; + bool committed; if (config_fill || config_stats) { size_t usize = arena_mapbits_large_size_get(chunk, pageind) - @@ -2380,7 +2537,8 @@ arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk, } } - arena_run_dalloc(arena, run, true, false); + committed = arena_run_decommit(arena, chunk, run); + arena_run_dalloc(arena, run, committed, false, !committed); } void @@ -2470,7 +2628,10 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, splitsize = usize - oldsize; run = &arena_miscelm_get(chunk, pageind+npages)->run; - arena_run_split_large(arena, run, splitsize, zero); + if (arena_run_split_large(arena, run, splitsize, zero)) { + malloc_mutex_unlock(&arena->lock); + return (true); + } size = oldsize + splitsize; npages = (size + large_pad) >> LG_PAGE; diff --git a/src/base.c b/src/base.c index 5493d0f..7cdcfed 100644 --- a/src/base.c +++ b/src/base.c @@ -90,7 +90,7 @@ base_alloc(size_t size) csize = CACHELINE_CEILING(size); usize = s2u(csize); - extent_node_init(&key, NULL, NULL, usize, true, false); + extent_node_init(&key, NULL, NULL, usize, false, false); malloc_mutex_lock(&base_mtx); node = extent_tree_szad_nsearch(&base_avail_szad, &key); if (node != NULL) { diff --git a/src/chunk.c b/src/chunk.c index cdd5311..770a5bb 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -19,13 +19,13 @@ size_t chunksize_mask; /* (chunksize - 1). 
*/ size_t chunk_npages; static void *chunk_alloc_default(void *new_addr, size_t size, - size_t alignment, bool *zero, unsigned arena_ind); -static bool chunk_dalloc_default(void *chunk, size_t size, - unsigned arena_ind); -static bool chunk_commit_default(void *chunk, size_t size, - unsigned arena_ind); -static bool chunk_decommit_default(void *chunk, size_t size, + size_t alignment, bool *zero, bool *commit, unsigned arena_ind); +static bool chunk_dalloc_default(void *chunk, size_t size, bool committed, unsigned arena_ind); +static bool chunk_commit_default(void *chunk, size_t size, size_t offset, + size_t length, unsigned arena_ind); +static bool chunk_decommit_default(void *chunk, size_t size, size_t offset, + size_t length, unsigned arena_ind); static bool chunk_purge_default(void *chunk, size_t size, size_t offset, size_t length, unsigned arena_ind); static bool chunk_split_default(void *chunk, size_t size, size_t size_a, @@ -51,7 +51,7 @@ const chunk_hooks_t chunk_hooks_default = { static void chunk_record(arena_t *arena, chunk_hooks_t *chunk_hooks, extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, - void *chunk, size_t size, bool committed, bool zeroed); + void *chunk, size_t size, bool zeroed, bool committed); /******************************************************************************/ @@ -81,7 +81,24 @@ chunk_hooks_set(arena_t *arena, const chunk_hooks_t *chunk_hooks) malloc_mutex_lock(&arena->chunks_mtx); old_chunk_hooks = arena->chunk_hooks; - arena->chunk_hooks = *chunk_hooks; + /* + * Copy each field atomically so that it is impossible for readers to + * see partially updated pointers. There are places where readers only + * need one hook function pointer (therefore no need to copy the + * entirety of arena->chunk_hooks), and stale reads do not affect + * correctness, so they perform unlocked reads. + */ +#define ATOMIC_COPY_HOOK(n) do { \ + atomic_write_p((void **)&arena->chunk_hooks.n, chunk_hooks->n); \ +} while (0) + ATOMIC_COPY_HOOK(alloc); + ATOMIC_COPY_HOOK(dalloc); + ATOMIC_COPY_HOOK(commit); + ATOMIC_COPY_HOOK(decommit); + ATOMIC_COPY_HOOK(purge); + ATOMIC_COPY_HOOK(split); + ATOMIC_COPY_HOOK(merge); +#undef ATOMIC_COPY_HOOK malloc_mutex_unlock(&arena->chunks_mtx); return (old_chunk_hooks); @@ -177,12 +194,13 @@ chunk_first_best_fit(arena_t *arena, extent_tree_t *chunks_szad, static void * chunk_recycle(arena_t *arena, chunk_hooks_t *chunk_hooks, extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, - void *new_addr, size_t size, size_t alignment, bool *zero, bool dalloc_node) + void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit, + bool dalloc_node) { void *ret; extent_node_t *node; size_t alloc_size, leadsize, trailsize; - bool committed, zeroed; + bool zeroed, committed; assert(new_addr == NULL || alignment == chunksize); /* @@ -218,10 +236,12 @@ chunk_recycle(arena_t *arena, chunk_hooks_t *chunk_hooks, assert(extent_node_size_get(node) >= leadsize + size); trailsize = extent_node_size_get(node) - leadsize - size; ret = (void *)((uintptr_t)extent_node_addr_get(node) + leadsize); - committed = extent_node_committed_get(node); zeroed = extent_node_zeroed_get(node); if (zeroed) - *zero = true; + *zero = true; + committed = extent_node_committed_get(node); + if (committed) + *commit = true; /* Split the lead. 
*/ if (leadsize != 0 && chunk_hooks->split(extent_node_addr_get(node), @@ -249,7 +269,7 @@ chunk_recycle(arena_t *arena, chunk_hooks_t *chunk_hooks, arena_node_dalloc(arena, node); malloc_mutex_unlock(&arena->chunks_mtx); chunk_record(arena, chunk_hooks, chunks_szad, chunks_ad, - cache, ret, size + trailsize, committed, zeroed); + cache, ret, size + trailsize, zeroed, committed); return (NULL); } /* Insert the trailing space as a smaller chunk. */ @@ -259,21 +279,21 @@ chunk_recycle(arena_t *arena, chunk_hooks_t *chunk_hooks, malloc_mutex_unlock(&arena->chunks_mtx); chunk_record(arena, chunk_hooks, chunks_szad, chunks_ad, cache, ret, size + trailsize, - committed, zeroed); + zeroed, committed); return (NULL); } } extent_node_init(node, arena, (void *)((uintptr_t)(ret) + size), - trailsize, committed, zeroed); + trailsize, zeroed, committed); extent_tree_szad_insert(chunks_szad, node); extent_tree_ad_insert(chunks_ad, node); arena_chunk_cache_maybe_insert(arena, node, cache); node = NULL; } - if (!committed && chunk_hooks->commit(ret, size, arena->ind)) { + if (!committed && chunk_hooks->commit(ret, size, 0, size, arena->ind)) { malloc_mutex_unlock(&arena->chunks_mtx); chunk_record(arena, chunk_hooks, chunks_szad, chunks_ad, cache, - ret, size, committed, zeroed); + ret, size, zeroed, committed); return (NULL); } malloc_mutex_unlock(&arena->chunks_mtx); @@ -304,7 +324,7 @@ chunk_recycle(arena_t *arena, chunk_hooks_t *chunk_hooks, */ static void * chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, - bool *zero, dss_prec_t dss_prec) + bool *zero, bool *commit, dss_prec_t dss_prec) { void *ret; chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; @@ -317,23 +337,25 @@ chunk_alloc_core(arena_t *arena, void *new_addr, size_t size, size_t alignment, /* Retained. */ if ((ret = chunk_recycle(arena, &chunk_hooks, &arena->chunks_szad_retained, &arena->chunks_ad_retained, false, - new_addr, size, alignment, zero, true)) != NULL) + new_addr, size, alignment, zero, commit, true)) != NULL) return (ret); /* "primary" dss. */ if (have_dss && dss_prec == dss_prec_primary && (ret = - chunk_alloc_dss(arena, new_addr, size, alignment, zero)) != NULL) + chunk_alloc_dss(arena, new_addr, size, alignment, zero, commit)) != + NULL) return (ret); /* * mmap. Requesting an address is not implemented for * chunk_alloc_mmap(), so only call it if (new_addr == NULL). */ - if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment, zero)) - != NULL) + if (new_addr == NULL && (ret = chunk_alloc_mmap(size, alignment, zero, + commit)) != NULL) return (ret); /* "secondary" dss. */ if (have_dss && dss_prec == dss_prec_secondary && (ret = - chunk_alloc_dss(arena, new_addr, size, alignment, zero)) != NULL) + chunk_alloc_dss(arena, new_addr, size, alignment, zero, commit)) != + NULL) return (ret); /* All strategies for allocation failed. */ @@ -344,7 +366,7 @@ void * chunk_alloc_base(size_t size) { void *ret; - bool zero; + bool zero, commit; /* * Directly call chunk_alloc_mmap() rather than chunk_alloc_core() @@ -352,7 +374,8 @@ chunk_alloc_base(size_t size) * demand-zeroed virtual memory. 
*/ zero = true; - ret = chunk_alloc_mmap(size, chunksize, &zero); + commit = true; + ret = chunk_alloc_mmap(size, chunksize, &zero, &commit); if (ret == NULL) return (NULL); if (config_valgrind) @@ -366,17 +389,20 @@ chunk_alloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment, bool *zero, bool dalloc_node) { void *ret; + bool commit; assert(size != 0); assert((size & chunksize_mask) == 0); assert(alignment != 0); assert((alignment & chunksize_mask) == 0); + commit = true; ret = chunk_recycle(arena, chunk_hooks, &arena->chunks_szad_cached, &arena->chunks_ad_cached, true, new_addr, size, alignment, zero, - dalloc_node); + &commit, dalloc_node); if (ret == NULL) return (NULL); + assert(commit); if (config_valgrind) JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); return (ret); @@ -400,14 +426,14 @@ chunk_arena_get(unsigned arena_ind) static void * chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, - unsigned arena_ind) + bool *commit, unsigned arena_ind) { void *ret; arena_t *arena; arena = chunk_arena_get(arena_ind); ret = chunk_alloc_core(arena, new_addr, size, alignment, zero, - arena->dss_prec); + commit, arena->dss_prec); if (ret == NULL) return (NULL); if (config_valgrind) @@ -418,12 +444,13 @@ chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, void * chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, - size_t size, size_t alignment, bool *zero) + size_t size, size_t alignment, bool *zero, bool *commit) { void *ret; chunk_hooks_assure_initialized(arena, chunk_hooks); - ret = chunk_hooks->alloc(new_addr, size, alignment, zero, arena->ind); + ret = chunk_hooks->alloc(new_addr, size, alignment, zero, commit, + arena->ind); if (ret == NULL) return (NULL); if (config_valgrind && chunk_hooks->alloc != chunk_alloc_default) @@ -434,7 +461,7 @@ chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, static void chunk_record(arena_t *arena, chunk_hooks_t *chunk_hooks, extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, bool cache, - void *chunk, size_t size, bool committed, bool zeroed) + void *chunk, size_t size, bool zeroed, bool committed) { bool unzeroed; extent_node_t *node, *prev; @@ -484,8 +511,8 @@ chunk_record(arena_t *arena, chunk_hooks_t *chunk_hooks, } goto label_return; } - extent_node_init(node, arena, chunk, size, committed, - !unzeroed); + extent_node_init(node, arena, chunk, size, !unzeroed, + committed); extent_tree_ad_insert(chunks_ad, node); extent_tree_szad_insert(chunks_szad, node); arena_chunk_cache_maybe_insert(arena, node, cache); @@ -534,15 +561,14 @@ chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, assert((size & chunksize_mask) == 0); chunk_record(arena, chunk_hooks, &arena->chunks_szad_cached, - &arena->chunks_ad_cached, true, chunk, size, true, false); + &arena->chunks_ad_cached, true, chunk, size, false, true); arena_maybe_purge(arena); } void chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, - size_t size, bool zeroed) + size_t size, bool zeroed, bool committed) { - bool committed; assert(chunk != NULL); assert(CHUNK_ADDR2BASE(chunk) == chunk); @@ -551,18 +577,22 @@ chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, chunk_hooks_assure_initialized(arena, chunk_hooks); /* Try to deallocate. 
*/ - if (!chunk_hooks->dalloc(chunk, size, arena->ind)) + if (!chunk_hooks->dalloc(chunk, size, committed, arena->ind)) return; /* Try to decommit; purge if that fails. */ - committed = chunk_hooks->decommit(chunk, size, arena->ind); + if (committed) { + committed = chunk_hooks->decommit(chunk, size, 0, size, + arena->ind); + } zeroed = !committed || chunk_hooks->purge(chunk, size, 0, size, arena->ind); chunk_record(arena, chunk_hooks, &arena->chunks_szad_retained, - &arena->chunks_ad_retained, false, chunk, size, committed, zeroed); + &arena->chunks_ad_retained, false, chunk, size, zeroed, committed); } static bool -chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) +chunk_dalloc_default(void *chunk, size_t size, bool committed, + unsigned arena_ind) { if (!have_dss || !chunk_in_dss(chunk)) @@ -572,27 +602,31 @@ chunk_dalloc_default(void *chunk, size_t size, unsigned arena_ind) void chunk_dalloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, - size_t size) + size_t size, bool committed) { chunk_hooks_assure_initialized(arena, chunk_hooks); - chunk_hooks->dalloc(chunk, size, arena->ind); + chunk_hooks->dalloc(chunk, size, committed, arena->ind); if (config_valgrind && chunk_hooks->dalloc != chunk_dalloc_default) JEMALLOC_VALGRIND_MAKE_MEM_NOACCESS(chunk, size); } static bool -chunk_commit_default(void *chunk, size_t size, unsigned arena_ind) +chunk_commit_default(void *chunk, size_t size, size_t offset, size_t length, + unsigned arena_ind) { - return (pages_commit(chunk, size)); + return (pages_commit((void *)((uintptr_t)chunk + (uintptr_t)offset), + length)); } static bool -chunk_decommit_default(void *chunk, size_t size, unsigned arena_ind) +chunk_decommit_default(void *chunk, size_t size, size_t offset, size_t length, + unsigned arena_ind) { - return (pages_decommit(chunk, size)); + return (pages_decommit((void *)((uintptr_t)chunk + (uintptr_t)offset), + length)); } bool diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 2c115e0..1035581 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -67,7 +67,7 @@ chunk_dss_prec_set(dss_prec_t dss_prec) void * chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, size_t alignment, - bool *zero) + bool *zero, bool *commit) { void *ret; @@ -137,13 +137,15 @@ chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, size_t alignment, chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; chunk_dalloc_wrapper(arena, - &chunk_hooks, cpad, cpad_size); + &chunk_hooks, cpad, cpad_size, + true); } if (*zero) { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED( ret, size); memset(ret, 0, size); } + *commit = true; return (ret); } } while (dss_prev != (void *)-1); diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index f243615..a91a14c 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -4,7 +4,7 @@ /******************************************************************************/ static void * -chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero) +chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero, bool *commit) { void *ret, *pages; size_t alloc_size, leadsize; @@ -24,11 +24,12 @@ chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero) assert(ret != NULL); *zero = true; + *commit = true; return (ret); } void * -chunk_alloc_mmap(size_t size, size_t alignment, bool *zero) +chunk_alloc_mmap(size_t size, size_t alignment, bool *zero, bool *commit) { void *ret; size_t offset; @@ -55,11 +56,12 @@ chunk_alloc_mmap(size_t size, size_t alignment, bool *zero) offset = ALIGNMENT_ADDR2OFFSET(ret, 
alignment); if (offset != 0) { pages_unmap(ret, size); - return (chunk_alloc_mmap_slow(size, alignment, zero)); + return (chunk_alloc_mmap_slow(size, alignment, zero, commit)); } assert(ret != NULL); *zero = true; + *commit = true; return (ret); } diff --git a/src/huge.c b/src/huge.c index 4aa7a97..54c2114 100644 --- a/src/huge.c +++ b/src/huge.c @@ -79,7 +79,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, return (NULL); } - extent_node_init(node, arena, ret, size, true, is_zeroed); + extent_node_init(node, arena, ret, size, is_zeroed, true); if (huge_node_set(ret, node)) { arena_chunk_dalloc_huge(arena, ret, size); diff --git a/test/integration/chunk.c b/test/integration/chunk.c index 62d00ba..b49afa5 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -22,45 +22,50 @@ static bool did_merge; void * chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero, - unsigned arena_ind) + bool *commit, unsigned arena_ind) { TRACE_HOOK("%s(new_addr=%p, size=%zu, alignment=%zu, *zero=%s, " - "arena_ind=%u)\n", __func__, new_addr, size, alignment, *zero ? - "true" : "false", arena_ind); + "*commit=%s, arena_ind=%u)\n", __func__, new_addr, size, alignment, + *zero ? "true" : "false", *commit ? "true" : "false", arena_ind); did_alloc = true; - return (old_hooks.alloc(new_addr, size, alignment, zero, arena_ind)); + return (old_hooks.alloc(new_addr, size, alignment, zero, commit, + arena_ind)); } bool -chunk_dalloc(void *chunk, size_t size, unsigned arena_ind) +chunk_dalloc(void *chunk, size_t size, bool committed, unsigned arena_ind) { - TRACE_HOOK("%s(chunk=%p, size=%zu, arena_ind=%u)\n", __func__, chunk, - size, arena_ind); + TRACE_HOOK("%s(chunk=%p, size=%zu, committed=%s, arena_ind=%u)\n", + __func__, chunk, size, committed ? "true" : "false", arena_ind); did_dalloc = true; if (!do_dalloc) return (true); - return (old_hooks.dalloc(chunk, size, arena_ind)); + return (old_hooks.dalloc(chunk, size, committed, arena_ind)); } bool -chunk_commit(void *chunk, size_t size, unsigned arena_ind) +chunk_commit(void *chunk, size_t size, size_t offset, size_t length, + unsigned arena_ind) { - TRACE_HOOK("%s(chunk=%p, size=%zu, arena_ind=%u)\n", __func__, chunk, - size, arena_ind); + TRACE_HOOK("%s(chunk=%p, size=%zu, offset=%zu, length=%zu, " + "arena_ind=%u)\n", __func__, chunk, size, offset, length, + arena_ind); did_commit = true; - memset(chunk, 0, size); + memset((void *)((uintptr_t)chunk + offset), 0, length); return (false); } bool -chunk_decommit(void *chunk, size_t size, unsigned arena_ind) +chunk_decommit(void *chunk, size_t size, size_t offset, size_t length, + unsigned arena_ind) { - TRACE_HOOK("%s(chunk=%p, size=%zu, arena_ind=%u)\n", __func__, chunk, - size, arena_ind); + TRACE_HOOK("%s(chunk=%p, size=%zu, offset=%zu, length=%zu, " + "arena_ind=%u)\n", __func__, chunk, size, offset, length, + arena_ind); did_decommit = true; return (!do_decommit); } @@ -106,7 +111,7 @@ chunk_merge(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b, TEST_BEGIN(test_chunk) { void *p; - size_t old_size, new_size, huge0, huge1, huge2, sz; + size_t old_size, new_size, large0, large1, huge0, huge1, huge2, sz; chunk_hooks_t new_hooks = { chunk_alloc, chunk_dalloc, @@ -134,8 +139,14 @@ TEST_BEGIN(test_chunk) assert_ptr_ne(old_hooks.split, chunk_split, "Unexpected split error"); assert_ptr_ne(old_hooks.merge, chunk_merge, "Unexpected merge error"); - /* Get huge size classes. */ + /* Get large size classes. 
*/ sz = sizeof(size_t); + assert_d_eq(mallctl("arenas.lrun.0.size", &large0, &sz, NULL, 0), 0, + "Unexpected arenas.lrun.0.size failure"); + assert_d_eq(mallctl("arenas.lrun.1.size", &large1, &sz, NULL, 0), 0, + "Unexpected arenas.lrun.1.size failure"); + + /* Get huge size classes. */ assert_d_eq(mallctl("arenas.hchunk.0.size", &huge0, &sz, NULL, 0), 0, "Unexpected arenas.hchunk.0.size failure"); assert_d_eq(mallctl("arenas.hchunk.1.size", &huge1, &sz, NULL, 0), 0, @@ -196,10 +207,29 @@ TEST_BEGIN(test_chunk) did_purge = false; assert_zu_eq(xallocx(p, huge1, 0, 0), huge1, "Unexpected xallocx() failure"); - assert_true(did_purge, "Unexpected purge"); + assert_true(did_purge, "Expected purge"); dallocx(p, 0); } + /* Test decommit for large allocations. */ + do_decommit = true; + p = mallocx(large1, 0); + assert_ptr_not_null(p, "Unexpected mallocx() error"); + assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, + "Unexpected arena.0.purge error"); + did_decommit = false; + assert_zu_eq(xallocx(p, large0, 0, 0), large0, + "Unexpected xallocx() failure"); + assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, + "Unexpected arena.0.purge error"); + assert_true(did_decommit, "Expected decommit"); + did_commit = false; + assert_zu_eq(xallocx(p, large1, 0, 0), large1, + "Unexpected xallocx() failure"); + assert_true(did_commit, "Expected commit"); + dallocx(p, 0); + do_decommit = false; + /* Make sure non-huge allocation succeeds. */ p = mallocx(42, 0); assert_ptr_not_null(p, "Unexpected mallocx() error"); -- cgit v0.12 From 4be0c3ca4258574b8a2abec639dca2b77a25ff7b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 7 Aug 2015 00:51:11 -0700 Subject: Add no-OOM assertions to test. --- test/unit/rtree.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 305c08a..b54b3e8 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -39,11 +39,13 @@ TEST_BEGIN(test_rtree_extrema) assert_false(rtree_new(&rtree, i, node_alloc, node_dalloc), "Unexpected rtree_new() failure"); - rtree_set(&rtree, 0, &node_a); + assert_false(rtree_set(&rtree, 0, &node_a), + "Unexpected rtree_set() failure"); assert_ptr_eq(rtree_get(&rtree, 0, true), &node_a, "rtree_get() should return previously set value"); - rtree_set(&rtree, ~((uintptr_t)0), &node_b); + assert_false(rtree_set(&rtree, ~((uintptr_t)0), &node_b), + "Unexpected rtree_set() failure"); assert_ptr_eq(rtree_get(&rtree, ~((uintptr_t)0), true), &node_b, "rtree_get() should return previously set value"); @@ -66,7 +68,8 @@ TEST_BEGIN(test_rtree_bits) "Unexpected rtree_new() failure"); for (j = 0; j < sizeof(keys)/sizeof(uintptr_t); j++) { - rtree_set(&rtree, keys[j], &node); + assert_false(rtree_set(&rtree, keys[j], &node), + "Unexpected rtree_set() failure"); for (k = 0; k < sizeof(keys)/sizeof(uintptr_t); k++) { assert_ptr_eq(rtree_get(&rtree, keys[k], true), &node, "rtree_get() should return " @@ -79,7 +82,8 @@ TEST_BEGIN(test_rtree_bits) (((uintptr_t)1) << (sizeof(uintptr_t)*8-i)), false), "Only leftmost rtree leaf should be set; " "i=%u, j=%u", i, j); - rtree_set(&rtree, keys[j], NULL); + assert_false(rtree_set(&rtree, keys[j], NULL), + "Unexpected rtree_set() failure"); } rtree_delete(&rtree); @@ -106,7 +110,8 @@ TEST_BEGIN(test_rtree_random) for (j = 0; j < NSET; j++) { keys[j] = (uintptr_t)gen_rand64(sfmt); - rtree_set(&rtree, keys[j], &node); + assert_false(rtree_set(&rtree, keys[j], &node), + "Unexpected rtree_set() failure"); assert_ptr_eq(rtree_get(&rtree, 
keys[j], true), &node, "rtree_get() should return previously set value"); } @@ -116,7 +121,8 @@ TEST_BEGIN(test_rtree_random) } for (j = 0; j < NSET; j++) { - rtree_set(&rtree, keys[j], NULL); + assert_false(rtree_set(&rtree, keys[j], NULL), + "Unexpected rtree_set() failure"); assert_ptr_null(rtree_get(&rtree, keys[j], true), "rtree_get() should return previously set value"); } -- cgit v0.12 From de249c8679a188065949f2560b1f0015ea6534b4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 9 Aug 2015 16:47:27 -0700 Subject: Arena chunk decommit cleanups and fixes. Decommit arena chunk header during chunk deallocation if the rest of the chunk is decommitted. --- include/jemalloc/internal/arena.h | 1 + include/jemalloc/internal/chunk.h | 2 +- include/jemalloc/internal/extent.h | 3 +- src/arena.c | 74 +++++++++++++++++++++++++------------- src/chunk.c | 4 +-- 5 files changed, 55 insertions(+), 29 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index b2afb17..102390d 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -838,6 +838,7 @@ arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind, size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); size_t mapbits = arena_mapbitsp_read(mapbitsp); + assert((mapbits & CHUNK_MAP_DECOMMITTED) == 0 || !unzeroed); arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_UNZEROED) | unzeroed); } diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index 51cd8ce..5d19383 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -61,7 +61,7 @@ void *chunk_alloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk_alloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit); void chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, - void *chunk, size_t size); + void *chunk, size_t size, bool committed); void chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, size_t size, bool zeroed, bool committed); void chunk_dalloc_wrapper(arena_t *arena, chunk_hooks_t *chunk_hooks, diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index f8436e5..969c786 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -27,7 +27,7 @@ struct extent_node_s { /* * True if physical memory is committed to the extent, whether * explicitly or implicitly as on a system that overcommits and - * satisfies physical mamory needs on demand via soft page faults. + * satisfies physical memory needs on demand via soft page faults. 
*/ bool en_committed; @@ -125,6 +125,7 @@ JEMALLOC_INLINE bool extent_node_committed_get(const extent_node_t *node) { + assert(!extent_node_achunk_get(node)); return (node->en_committed); } diff --git a/src/arena.c b/src/arena.c index 84ccf11..befc4e3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -442,9 +442,8 @@ arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, need_pages = (size >> LG_PAGE); assert(need_pages > 0); - if (flag_decommitted != 0 && arena->chunk_hooks.commit(chunk, - chunksize, (run_ind << LG_PAGE), (need_pages << LG_PAGE), - arena->ind)) + if (flag_decommitted != 0 && arena->chunk_hooks.commit(chunk, chunksize, + run_ind << LG_PAGE, size, arena->ind)) return (true); if (remove) { @@ -566,9 +565,9 @@ arena_chunk_register(arena_t *arena, arena_chunk_t *chunk, bool zero) /* * The extent node notion of "committed" doesn't directly apply to - * arena chunks. Arbitrarily mark them as committed (after all they are - * always at least partially committed). The commit state of runs is - * tracked individually. + * arena chunks. Arbitrarily mark them as committed. The commit state + * of runs is tracked individually, and upon chunk deallocation the + * entire chunk is in a consistent commit state. */ extent_node_init(&chunk->node, arena, chunk, chunksize, zero, true); extent_node_achunk_set(&chunk->node, true); @@ -620,7 +619,7 @@ arena_chunk_alloc_internal(arena_t *arena, bool *zero, bool *commit) if (chunk != NULL) { if (arena_chunk_register(arena, chunk, *zero)) { chunk_dalloc_cache(arena, &chunk_hooks, chunk, - chunksize); + chunksize, true); return (NULL); } *commit = true; @@ -723,6 +722,8 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) arena_maxrun); assert(arena_mapbits_dirty_get(chunk, map_bias) == arena_mapbits_dirty_get(chunk, chunk_npages-1)); + assert(arena_mapbits_decommitted_get(chunk, map_bias) == + arena_mapbits_decommitted_get(chunk, chunk_npages-1)); /* * Remove run from the runs_avail tree, so that the arena does not use @@ -733,6 +734,7 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) if (arena->spare != NULL) { arena_chunk_t *spare = arena->spare; chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; + bool committed; arena->spare = chunk; if (arena_mapbits_dirty_get(spare, map_bias) != 0) { @@ -742,8 +744,23 @@ arena_chunk_dalloc(arena_t *arena, arena_chunk_t *chunk) chunk_deregister(spare, &spare->node); + committed = (arena_mapbits_decommitted_get(spare, map_bias) == + 0); + if (!committed) { + /* + * Decommit the header. Mark the chunk as decommitted + * even if header decommit fails, since treating a + * partially committed chunk as committed has a high + * potential for causing later access of decommitted + * memory. 
+ */ + chunk_hooks = chunk_hooks_get(arena); + chunk_hooks.decommit(spare, chunksize, 0, map_bias << + LG_PAGE, arena->ind); + } + chunk_dalloc_cache(arena, &chunk_hooks, (void *)spare, - chunksize); + chunksize, committed); if (config_stats) { arena->stats.mapped -= chunksize; @@ -916,7 +933,7 @@ arena_chunk_dalloc_huge(arena_t *arena, void *chunk, size_t usize) } arena->nactive -= (usize >> LG_PAGE); - chunk_dalloc_cache(arena, &chunk_hooks, chunk, csize); + chunk_dalloc_cache(arena, &chunk_hooks, chunk, csize, true); malloc_mutex_unlock(&arena->lock); } @@ -967,7 +984,7 @@ arena_chunk_ralloc_huge_shrink(arena_t *arena, void *chunk, size_t oldsize, void *nchunk = (void *)((uintptr_t)chunk + CHUNK_CEILING(usize)); - chunk_dalloc_cache(arena, &chunk_hooks, nchunk, cdiff); + chunk_dalloc_cache(arena, &chunk_hooks, nchunk, cdiff, true); } malloc_mutex_unlock(&arena->lock); } @@ -1385,6 +1402,9 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks, npages = run_size >> LG_PAGE; assert(pageind + npages <= chunk_npages); + assert(!arena_mapbits_decommitted_get(chunk, pageind)); + assert(!arena_mapbits_decommitted_get(chunk, + pageind+npages-1)); decommitted = !chunk_hooks->decommit(chunk, chunksize, pageind << LG_PAGE, npages << LG_PAGE, arena->ind); if (decommitted) { @@ -1450,11 +1470,12 @@ arena_unstash_purged(arena_t *arena, chunk_hooks_t *chunk_hooks, void *addr = extent_node_addr_get(chunkselm); size_t size = extent_node_size_get(chunkselm); bool zeroed = extent_node_zeroed_get(chunkselm); + bool committed = extent_node_committed_get(chunkselm); extent_node_dirty_remove(chunkselm); arena_node_dalloc(arena, chunkselm); chunkselm = chunkselm_next; chunk_dalloc_arena(arena, chunk_hooks, addr, size, - zeroed, true); + zeroed, committed); } else { arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(rdelm); @@ -1721,6 +1742,15 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned, } static void +arena_run_dalloc_decommit(arena_t *arena, arena_chunk_t *chunk, + arena_run_t *run) +{ + bool committed = arena_run_decommit(arena, chunk, run); + + arena_run_dalloc(arena, run, committed, false, !committed); +} + +static void arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize) { @@ -1762,7 +1792,7 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t pageind = arena_miscelm_to_pageind(miscelm); size_t head_npages = newsize >> LG_PAGE; size_t flag_dirty = arena_mapbits_dirty_get(chunk, pageind); - bool decommitted = arena_mapbits_decommitted_get(chunk, pageind) != 0; + bool decommitted = (arena_mapbits_decommitted_get(chunk, pageind) != 0); arena_chunk_map_misc_t *tail_miscelm; arena_run_t *tail_run; @@ -2263,13 +2293,12 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment, if (arena_run_init_large(arena, run, usize + large_pad, zero)) { size_t run_ind = arena_miscelm_to_pageind(arena_run_to_miscelm(run)); - size_t flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); - size_t flag_decommitted = arena_mapbits_decommitted_get(chunk, - run_ind); + bool dirty = (arena_mapbits_dirty_get(chunk, run_ind) != 0); + bool decommitted = (arena_mapbits_decommitted_get(chunk, + run_ind) != 0); - assert(flag_decommitted != 0); /* Cause of OOM. */ - arena_run_dalloc(arena, run, (flag_dirty != 0), false, - (flag_decommitted != 0)); + assert(decommitted); /* Cause of OOM. 
*/ + arena_run_dalloc(arena, run, dirty, false, decommitted); malloc_mutex_unlock(&arena->lock); return (NULL); } @@ -2390,10 +2419,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, malloc_mutex_unlock(&bin->lock); /******************************/ malloc_mutex_lock(&arena->lock); - { - bool committed = arena_run_decommit(arena, chunk, run); - arena_run_dalloc(arena, run, committed, false, !committed); - } + arena_run_dalloc_decommit(arena, chunk, run); malloc_mutex_unlock(&arena->lock); /****************************/ malloc_mutex_lock(&bin->lock); @@ -2519,7 +2545,6 @@ arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk, size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; arena_chunk_map_misc_t *miscelm = arena_miscelm_get(chunk, pageind); arena_run_t *run = &miscelm->run; - bool committed; if (config_fill || config_stats) { size_t usize = arena_mapbits_large_size_get(chunk, pageind) - @@ -2537,8 +2562,7 @@ arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk, } } - committed = arena_run_decommit(arena, chunk, run); - arena_run_dalloc(arena, run, committed, false, !committed); + arena_run_dalloc_decommit(arena, chunk, run); } void diff --git a/src/chunk.c b/src/chunk.c index 770a5bb..5ad0281 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -552,7 +552,7 @@ label_return: void chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, - size_t size) + size_t size, bool committed) { assert(chunk != NULL); @@ -561,7 +561,7 @@ chunk_dalloc_cache(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, assert((size & chunksize_mask) == 0); chunk_record(arena, chunk_hooks, &arena->chunks_szad_cached, - &arena->chunks_ad_cached, true, chunk, size, false, true); + &arena->chunks_ad_cached, true, chunk, size, false, committed); arena_maybe_purge(arena); } -- cgit v0.12 From 45186f0c074a5fba345d04ac1df1b77b60bb3eb6 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 10 Aug 2015 23:03:34 -0700 Subject: Refactor arena_mapbits unzeroed flag management. Only set the unzeroed flag when initializing the entire mapbits entry, rather than mutating just the unzeroed bit. This simplifies the possible mapbits state transitions. 
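To make the state-space point concrete, here is a small standalone sketch (simplified flag names and a bare size_t word, not jemalloc's real CHUNK_MAP_* encoding or helpers): the removed read-modify-write style preserves whatever other flags happen to be set, while the new whole-word write starts from a caller-supplied value, so fewer flag combinations can ever arise.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define MAP_UNZEROED    ((size_t)0x08)  /* stand-ins, not the real */
#define MAP_DECOMMITTED ((size_t)0x10)  /* CHUNK_MAP_* bit values  */

/* Old style: read-modify-write that preserves every other bit. */
static void
mapbits_unzeroed_set(size_t *mapbitsp, size_t unzeroed)
{
        size_t mapbits = *mapbitsp;

        *mapbitsp = (mapbits & ~MAP_UNZEROED) | unzeroed;
}

/* New style: the caller supplies the complete word for an internal page. */
static void
mapbits_internal_set(size_t *mapbitsp, size_t flags)
{
        assert((flags & MAP_UNZEROED) == flags);
        *mapbitsp = flags;
}

int
main(void)
{
        size_t mapbits = MAP_DECOMMITTED;

        mapbits_unzeroed_set(&mapbits, MAP_UNZEROED);
        printf("read-modify-write keeps stale bits: %#zx\n", mapbits);
        mapbits_internal_set(&mapbits, MAP_UNZEROED);
        printf("whole-word write starts clean:      %#zx\n", mapbits);
        return (0);
}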
--- include/jemalloc/internal/arena.h | 25 +++++++--------- include/jemalloc/internal/private_symbols.txt | 2 +- src/arena.c | 43 ++++++++++++++------------- src/pages.c | 2 +- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 102390d..7b49a49 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -534,14 +534,14 @@ void arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags); void arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind, size_t size); +void arena_mapbits_internal_set(arena_chunk_t *chunk, size_t pageind, + size_t flags); void arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags); void arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind, index_t binind); void arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind, index_t binind, size_t flags); -void arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind, - size_t unzeroed); void arena_metadata_allocated_add(arena_t *arena, size_t size); void arena_metadata_allocated_sub(arena_t *arena, size_t size); size_t arena_metadata_allocated_get(arena_t *arena); @@ -784,6 +784,15 @@ arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind, } JEMALLOC_ALWAYS_INLINE void +arena_mapbits_internal_set(arena_chunk_t *chunk, size_t pageind, size_t flags) +{ + size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); + + assert((flags & CHUNK_MAP_UNZEROED) == flags); + arena_mapbitsp_write(mapbitsp, flags); +} + +JEMALLOC_ALWAYS_INLINE void arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags) { @@ -831,18 +840,6 @@ arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind, CHUNK_MAP_ALLOCATED); } -JEMALLOC_ALWAYS_INLINE void -arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind, - size_t unzeroed) -{ - size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - size_t mapbits = arena_mapbitsp_read(mapbitsp); - - assert((mapbits & CHUNK_MAP_DECOMMITTED) == 0 || !unzeroed); - arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_UNZEROED) | - unzeroed); -} - JEMALLOC_INLINE void arena_metadata_allocated_add(arena_t *arena, size_t size) { diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index 2228520..bd4fe8f 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -42,6 +42,7 @@ arena_mapbits_binind_get arena_mapbits_decommitted_get arena_mapbits_dirty_get arena_mapbits_get +arena_mapbits_internal_set arena_mapbits_large_binind_set arena_mapbits_large_get arena_mapbits_large_set @@ -52,7 +53,6 @@ arena_mapbits_unallocated_set arena_mapbits_unallocated_size_get arena_mapbits_unallocated_size_set arena_mapbits_unzeroed_get -arena_mapbits_unzeroed_set arena_mapbitsp_get arena_mapbitsp_read arena_mapbitsp_write diff --git a/src/arena.c b/src/arena.c index befc4e3..a72fea2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -642,7 +642,7 @@ arena_chunk_init_hard(arena_t *arena) { arena_chunk_t *chunk; bool zero, commit; - size_t unzeroed, decommitted, i; + size_t flag_unzeroed, flag_decommitted, i; assert(arena->spare == NULL); @@ -657,10 +657,10 @@ arena_chunk_init_hard(arena_t *arena) * the pages as zeroed if chunk_alloc() returned a zeroed or decommitted * chunk. */ - unzeroed = (zero || !commit) ? 
0 : CHUNK_MAP_UNZEROED; - decommitted = commit ? 0 : CHUNK_MAP_DECOMMITTED; - arena_mapbits_unallocated_set(chunk, map_bias, arena_maxrun, unzeroed | - decommitted); + flag_unzeroed = (zero || !commit) ? 0 : CHUNK_MAP_UNZEROED; + flag_decommitted = commit ? 0 : CHUNK_MAP_DECOMMITTED; + arena_mapbits_unallocated_set(chunk, map_bias, arena_maxrun, + flag_unzeroed | flag_decommitted); /* * There is no need to initialize the internal page map entries unless * the chunk is not zeroed. @@ -672,7 +672,7 @@ arena_chunk_init_hard(arena_t *arena) chunk_npages-1) - (uintptr_t)arena_bitselm_get(chunk, map_bias+1))); for (i = map_bias+1; i < chunk_npages-1; i++) - arena_mapbits_unzeroed_set(chunk, i, unzeroed); + arena_mapbits_internal_set(chunk, i, flag_unzeroed); } else { JEMALLOC_VALGRIND_MAKE_MEM_DEFINED((void *)arena_bitselm_get(chunk, map_bias+1), (size_t)((uintptr_t) @@ -681,12 +681,12 @@ arena_chunk_init_hard(arena_t *arena) if (config_debug) { for (i = map_bias+1; i < chunk_npages-1; i++) { assert(arena_mapbits_unzeroed_get(chunk, i) == - unzeroed); + flag_unzeroed); } } } arena_mapbits_unallocated_set(chunk, chunk_npages-1, arena_maxrun, - unzeroed); + flag_unzeroed); return (chunk); } @@ -1391,8 +1391,8 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks, npages = size >> LG_PAGE; chunkselm = qr_next(chunkselm, cc_link); } else { - size_t pageind, run_size, flag_unzeroed, i; - bool unzeroed, decommitted; + size_t pageind, run_size, flag_unzeroed, flags, i; + bool decommitted; arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(rdelm); arena_chunk_map_misc_t *miscelm = @@ -1408,20 +1408,21 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks, decommitted = !chunk_hooks->decommit(chunk, chunksize, pageind << LG_PAGE, npages << LG_PAGE, arena->ind); if (decommitted) { - arena_mapbits_large_set(chunk, pageind+npages-1, - 0, CHUNK_MAP_DECOMMITTED); - arena_mapbits_large_set(chunk, pageind, - run_size, CHUNK_MAP_DECOMMITTED); - unzeroed = false; + flag_unzeroed = 0; + flags = CHUNK_MAP_DECOMMITTED; } else { - unzeroed = chunk_purge_wrapper(arena, + flag_unzeroed = chunk_purge_wrapper(arena, chunk_hooks, chunk, chunksize, pageind << - LG_PAGE, run_size); + LG_PAGE, run_size) ? CHUNK_MAP_UNZEROED : 0; + flags = flag_unzeroed; } - flag_unzeroed = unzeroed ? CHUNK_MAP_UNZEROED : 0; + arena_mapbits_large_set(chunk, pageind+npages-1, 0, + flags); + arena_mapbits_large_set(chunk, pageind, run_size, + flags); /* - * Set the unzeroed flag for all pages, now that + * Set the unzeroed flag for internal pages, now that * chunk_purge_wrapper() has returned whether the pages * were zeroed as a side effect of purging. This chunk * map modification is safe even though the arena mutex @@ -1431,8 +1432,8 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks, * writes don't perturb the first and last elements' * CHUNK_MAP_ALLOCATED bits, behavior is well defined. */ - for (i = 0; i < npages; i++) { - arena_mapbits_unzeroed_set(chunk, pageind+i, + for (i = 1; i < npages-1; i++) { + arena_mapbits_internal_set(chunk, pageind+i, flag_unzeroed); } } diff --git a/src/pages.c b/src/pages.c index 6f775dc..3202901 100644 --- a/src/pages.c +++ b/src/pages.c @@ -102,7 +102,7 @@ pages_commit_impl(void *addr, size_t size, bool commit) { #ifndef _WIN32 - if (config_debug) { + if (false &&/*XXX*/ config_debug) { int prot = commit ? 
(PROT_READ | PROT_WRITE) : PROT_NONE; void *result = mmap(addr, size, prot, MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); -- cgit v0.12 From ac5db02034c01357a4ce90504886046a58117921 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Tue, 11 Aug 2015 14:01:59 +0900 Subject: Make --enable-tls and --enable-lazy-lock take precedence over configure.ac-hardcoded defaults --- configure.ac | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/configure.ac b/configure.ac index 502dd39..6f44b6c 100644 --- a/configure.ac +++ b/configure.ac @@ -1246,9 +1246,9 @@ else enable_lazy_lock="1" fi ], -[enable_lazy_lock="0"] +[enable_lazy_lock=""] ) -if test "x$enable_lazy_lock" = "x0" -a "x${force_lazy_lock}" = "x1" ; then +if test "x$enable_lazy_lock" = "x" -a "x${force_lazy_lock}" = "x1" ; then AC_MSG_RESULT([Forcing lazy-lock to avoid allocator/threading bootstrap issues]) enable_lazy_lock="1" fi @@ -1261,6 +1261,8 @@ if test "x$enable_lazy_lock" = "x1" ; then ]) fi AC_DEFINE([JEMALLOC_LAZY_LOCK], [ ]) +else + enable_lazy_lock="0" fi AC_SUBST([enable_lazy_lock]) @@ -1272,13 +1274,13 @@ else enable_tls="1" fi , -enable_tls="1" +enable_tls="" ) -if test "x${enable_tls}" = "x0" -a "x${force_tls}" = "x1" ; then +if test "x${enable_tls}" = "x" -a "x${force_tls}" = "x1" ; then AC_MSG_RESULT([Forcing TLS to avoid allocator/threading bootstrap issues]) enable_tls="1" fi -if test "x${enable_tls}" = "x1" -a "x${force_tls}" = "x0" ; then +if test "x${enable_tls}" = "x" -a "x${force_tls}" = "x0" ; then AC_MSG_RESULT([Forcing no TLS to avoid allocator/threading bootstrap issues]) enable_tls="0" fi @@ -1295,6 +1297,8 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM( AC_MSG_RESULT([yes]), AC_MSG_RESULT([no]) enable_tls="0") +else + enable_tls="0" fi AC_SUBST([enable_tls]) if test "x${enable_tls}" = "x1" ; then -- cgit v0.12 From 6bdeddb6976a9e372caafa6c5b270007b07c41ae Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 10 Aug 2015 23:42:33 -0700 Subject: Fix build failure. This regression was introduced by de249c8679a188065949f2560b1f0015ea6534b4 (Arena chunk decommit cleanups and fixes.). This resolves #254. --- include/jemalloc/internal/extent.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 969c786..386d50e 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -125,7 +125,7 @@ JEMALLOC_INLINE bool extent_node_committed_get(const extent_node_t *node) { - assert(!extent_node_achunk_get(node)); + assert(!node->en_achunk); return (node->en_committed); } -- cgit v0.12 From 1f27abc1b1f3583d9c6f999374613dc5319aeb12 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 11 Aug 2015 12:42:33 -0700 Subject: Refactor arena_mapbits_{small,large}_set() to not preserve unzeroed. Fix arena_run_split_large_helper() to treat newly committed memory as zeroed. 
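The hunks below repeatedly use a flag_unzeroed_mask to decide whether the pre-existing unzeroed bit is still meaningful now that the setters no longer preserve it: only clean, committed runs carry it forward, while dirty or decommitted runs take their zeroed state from the flags themselves. A minimal standalone sketch of that idiom follows (made-up constants, not the real CHUNK_MAP_* values):

#include <stddef.h>
#include <stdio.h>

#define MAP_UNZEROED    ((size_t)0x08)
#define MAP_DECOMMITTED ((size_t)0x10)
#define MAP_DIRTY       ((size_t)0x20)

/* Return the unzeroed bit to merge into the new mapbits, if any. */
static size_t
carried_unzeroed(size_t old_mapbits, size_t flag_dirty, size_t flag_decommitted)
{
        size_t flag_unzeroed_mask = ((flag_dirty | flag_decommitted) == 0) ?
            MAP_UNZEROED : 0;

        return (old_mapbits & flag_unzeroed_mask);
}

int
main(void)
{
        printf("clean, committed run carries unzeroed: %#zx\n",
            carried_unzeroed(MAP_UNZEROED, 0, 0));
        printf("dirty run drops it:                    %#zx\n",
            carried_unzeroed(MAP_UNZEROED, MAP_DIRTY, 0));
        return (0);
}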
--- include/jemalloc/internal/arena.h | 17 +++--- src/arena.c | 108 +++++++++++++++++++++++--------------- src/pages.c | 2 +- 3 files changed, 73 insertions(+), 54 deletions(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index 7b49a49..f972e8b 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -797,15 +797,14 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size, size_t flags) { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - size_t mapbits = arena_mapbitsp_read(mapbitsp); - size_t unzeroed; assert((size & PAGE_MASK) == 0); assert(((size << CHUNK_MAP_SIZE_SHIFT) & ~CHUNK_MAP_SIZE_MASK) == 0); - assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_DECOMMITTED)) == flags); - unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */ + assert((flags & CHUNK_MAP_FLAGS_MASK) == flags); + assert((flags & CHUNK_MAP_DECOMMITTED) == 0 || (flags & + (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == 0); arena_mapbitsp_write(mapbitsp, (size << CHUNK_MAP_SIZE_SHIFT) | - CHUNK_MAP_BININD_INVALID | flags | unzeroed | CHUNK_MAP_LARGE | + CHUNK_MAP_BININD_INVALID | flags | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED); } @@ -828,16 +827,12 @@ arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind, index_t binind, size_t flags) { size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind); - size_t mapbits = arena_mapbitsp_read(mapbitsp); - size_t unzeroed; assert(binind < BININD_INVALID); assert(pageind - runind >= map_bias); - assert((flags & CHUNK_MAP_DIRTY) == flags); - unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */ + assert((flags & CHUNK_MAP_UNZEROED) == flags); arena_mapbitsp_write(mapbitsp, (runind << CHUNK_MAP_RUNIND_SHIFT) | - (binind << CHUNK_MAP_BININD_SHIFT) | flags | unzeroed | - CHUNK_MAP_ALLOCATED); + (binind << CHUNK_MAP_BININD_SHIFT) | flags | CHUNK_MAP_ALLOCATED); } JEMALLOC_INLINE void diff --git a/src/arena.c b/src/arena.c index a72fea2..af48b39 100644 --- a/src/arena.c +++ b/src/arena.c @@ -401,26 +401,20 @@ arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, /* Keep track of trailing unused pages for later use. */ if (rem_pages > 0) { size_t flags = flag_dirty | flag_decommitted; - - if (flags != 0) { - arena_mapbits_unallocated_set(chunk, - run_ind+need_pages, (rem_pages << LG_PAGE), flags); - arena_mapbits_unallocated_set(chunk, - run_ind+total_pages-1, (rem_pages << LG_PAGE), - flags); - if (flag_dirty != 0) { - arena_run_dirty_insert(arena, chunk, - run_ind+need_pages, rem_pages); - } - } else { - arena_mapbits_unallocated_set(chunk, run_ind+need_pages, - (rem_pages << LG_PAGE), - arena_mapbits_unzeroed_get(chunk, - run_ind+need_pages)); - arena_mapbits_unallocated_set(chunk, - run_ind+total_pages-1, (rem_pages << LG_PAGE), - arena_mapbits_unzeroed_get(chunk, - run_ind+total_pages-1)); + size_t flag_unzeroed_mask = (flags == 0) ? 
CHUNK_MAP_UNZEROED : + 0; + + arena_mapbits_unallocated_set(chunk, run_ind+need_pages, + (rem_pages << LG_PAGE), flags | + (arena_mapbits_unzeroed_get(chunk, run_ind+need_pages) & + flag_unzeroed_mask)); + arena_mapbits_unallocated_set(chunk, run_ind+total_pages-1, + (rem_pages << LG_PAGE), flags | + (arena_mapbits_unzeroed_get(chunk, run_ind+total_pages-1) & + flag_unzeroed_mask)); + if (flag_dirty != 0) { + arena_run_dirty_insert(arena, chunk, run_ind+need_pages, + rem_pages); } arena_avail_insert(arena, chunk, run_ind+need_pages, rem_pages); } @@ -433,6 +427,7 @@ arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, arena_chunk_t *chunk; arena_chunk_map_misc_t *miscelm; size_t flag_dirty, flag_decommitted, run_ind, need_pages, i; + size_t flag_unzeroed_mask; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); miscelm = arena_run_to_miscelm(run); @@ -452,7 +447,15 @@ arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, } if (zero) { - if (flag_dirty == 0) { + if (flag_decommitted != 0) { + /* The run is untouched, and therefore zeroed. */ + JEMALLOC_VALGRIND_MAKE_MEM_DEFINED((void + *)((uintptr_t)chunk + (run_ind << LG_PAGE)), + (need_pages << LG_PAGE)); + } else if (flag_dirty != 0) { + /* The run is dirty, so all pages must be zeroed. */ + arena_run_zero(chunk, run_ind, need_pages); + } else { /* * The run is clean, so some pages may be zeroed (i.e. * never before touched). @@ -469,9 +472,6 @@ arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, run_ind+i); } } - } else { - /* The run is dirty, so all pages must be zeroed. */ - arena_run_zero(chunk, run_ind, need_pages); } } else { JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + @@ -482,8 +482,13 @@ arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, * Set the last element first, in case the run only contains one page * (i.e. both statements set the same element). */ - arena_mapbits_large_set(chunk, run_ind+need_pages-1, 0, flag_dirty); - arena_mapbits_large_set(chunk, run_ind, size, flag_dirty); + flag_unzeroed_mask = (flag_dirty | flag_decommitted) == 0 ? 
+ CHUNK_MAP_UNZEROED : 0; + arena_mapbits_large_set(chunk, run_ind+need_pages-1, 0, flag_dirty | + (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, + run_ind+need_pages-1))); + arena_mapbits_large_set(chunk, run_ind, size, flag_dirty | + (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, run_ind))); return (false); } @@ -527,9 +532,11 @@ arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, flag_decommitted, need_pages); for (i = 0; i < need_pages; i++) { - arena_mapbits_small_set(chunk, run_ind+i, i, binind, 0); - if (config_debug && flag_dirty == 0 && - arena_mapbits_unzeroed_get(chunk, run_ind+i) == 0) + size_t flag_unzeroed = arena_mapbits_unzeroed_get(chunk, + run_ind+i); + arena_mapbits_small_set(chunk, run_ind+i, i, binind, + flag_unzeroed); + if (config_debug && flag_dirty == 0 && flag_unzeroed == 0) arena_run_page_validate_zeroed(chunk, run_ind+i); } JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + @@ -1759,7 +1766,9 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t pageind = arena_miscelm_to_pageind(miscelm); size_t head_npages = (oldsize - newsize) >> LG_PAGE; size_t flag_dirty = arena_mapbits_dirty_get(chunk, pageind); - bool decommitted = (arena_mapbits_decommitted_get(chunk, pageind) != 0); + size_t flag_decommitted = arena_mapbits_decommitted_get(chunk, pageind); + size_t flag_unzeroed_mask = (flag_dirty | flag_decommitted) == 0 ? + CHUNK_MAP_UNZEROED : 0; assert(oldsize > newsize); @@ -1769,8 +1778,11 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, * run first, in case of single-page runs. */ assert(arena_mapbits_large_size_get(chunk, pageind) == oldsize); - arena_mapbits_large_set(chunk, pageind+head_npages-1, 0, flag_dirty); - arena_mapbits_large_set(chunk, pageind, oldsize-newsize, flag_dirty); + arena_mapbits_large_set(chunk, pageind+head_npages-1, 0, flag_dirty | + (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, + pageind+head_npages-1))); + arena_mapbits_large_set(chunk, pageind, oldsize-newsize, flag_dirty | + (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, pageind))); if (config_debug) { UNUSED size_t tail_npages = newsize >> LG_PAGE; @@ -1780,9 +1792,10 @@ arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, pageind+head_npages+tail_npages-1) == flag_dirty); } arena_mapbits_large_set(chunk, pageind+head_npages, newsize, - flag_dirty); + flag_dirty | (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, + pageind+head_npages))); - arena_run_dalloc(arena, run, false, false, decommitted); + arena_run_dalloc(arena, run, false, false, (flag_decommitted != 0)); } static void @@ -1793,7 +1806,9 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t pageind = arena_miscelm_to_pageind(miscelm); size_t head_npages = newsize >> LG_PAGE; size_t flag_dirty = arena_mapbits_dirty_get(chunk, pageind); - bool decommitted = (arena_mapbits_decommitted_get(chunk, pageind) != 0); + size_t flag_decommitted = arena_mapbits_decommitted_get(chunk, pageind); + size_t flag_unzeroed_mask = (flag_dirty | flag_decommitted) == 0 ? + CHUNK_MAP_UNZEROED : 0; arena_chunk_map_misc_t *tail_miscelm; arena_run_t *tail_run; @@ -1805,8 +1820,11 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, * run first, in case of single-page runs. 
*/ assert(arena_mapbits_large_size_get(chunk, pageind) == oldsize); - arena_mapbits_large_set(chunk, pageind+head_npages-1, 0, flag_dirty); - arena_mapbits_large_set(chunk, pageind, newsize, flag_dirty); + arena_mapbits_large_set(chunk, pageind+head_npages-1, 0, flag_dirty | + (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, + pageind+head_npages-1))); + arena_mapbits_large_set(chunk, pageind, newsize, flag_dirty | + (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, pageind))); if (config_debug) { UNUSED size_t tail_npages = (oldsize - newsize) >> LG_PAGE; @@ -1816,11 +1834,13 @@ arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, pageind+head_npages+tail_npages-1) == flag_dirty); } arena_mapbits_large_set(chunk, pageind+head_npages, oldsize-newsize, - flag_dirty); + flag_dirty | (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, + pageind+head_npages))); tail_miscelm = arena_miscelm_get(chunk, pageind + head_npages); tail_run = &tail_miscelm->run; - arena_run_dalloc(arena, tail_run, dirty, false, decommitted); + arena_run_dalloc(arena, tail_run, dirty, false, (flag_decommitted != + 0)); } static arena_run_t * @@ -2644,7 +2664,7 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, * allocation. */ arena_run_t *run; - size_t flag_dirty, splitsize, usize; + size_t flag_dirty, flag_unzeroed_mask, splitsize, usize; usize = s2u(size + extra); while (oldsize + followsize < usize) @@ -2671,9 +2691,13 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, */ flag_dirty = arena_mapbits_dirty_get(chunk, pageind) | arena_mapbits_dirty_get(chunk, pageind+npages-1); + flag_unzeroed_mask = flag_dirty == 0 ? CHUNK_MAP_UNZEROED : 0; arena_mapbits_large_set(chunk, pageind, size + large_pad, - flag_dirty); - arena_mapbits_large_set(chunk, pageind+npages-1, 0, flag_dirty); + flag_dirty | (flag_unzeroed_mask & + arena_mapbits_unzeroed_get(chunk, pageind))); + arena_mapbits_large_set(chunk, pageind+npages-1, 0, flag_dirty | + (flag_unzeroed_mask & arena_mapbits_unzeroed_get(chunk, + pageind+npages-1))); if (config_stats) { index_t oldindex = size2index(oldsize) - NBINS; diff --git a/src/pages.c b/src/pages.c index 3202901..6f775dc 100644 --- a/src/pages.c +++ b/src/pages.c @@ -102,7 +102,7 @@ pages_commit_impl(void *addr, size_t size, bool commit) { #ifndef _WIN32 - if (false &&/*XXX*/ config_debug) { + if (config_debug) { int prot = commit ? (PROT_READ | PROT_WRITE) : PROT_NONE; void *result = mmap(addr, size, prot, MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); -- cgit v0.12 From 03bf5b67be92db3a49f81816dccb5c18c0f2a0c0 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 10:26:54 -0700 Subject: Try to decommit new chunks. Always leave decommit disabled on non-Windows systems. 
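The chunk_dss.c and chunk_mmap.c hunks below share one pattern: when the caller did not ask for committed memory, the allocator now attempts to decommit the fresh mapping, and since the decommit helper follows the usual convention of returning false on success, a failed decommit simply leaves the chunk reported as committed. A standalone sketch of that fallback (decommit_stub() and chunk_alloc_sketch() are hypothetical stand-ins, not jemalloc functions):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for pages_decommit(); returns true on failure. */
static bool
decommit_stub(void *addr, size_t size)
{
        (void)addr;
        (void)size;
        return (true);  /* Pretend this platform cannot decommit. */
}

static void *
chunk_alloc_sketch(size_t size, bool *commit)
{
        void *ret = malloc(size);       /* Stand-in for the real mapping. */

        if (ret == NULL)
                return (NULL);
        /* Only try to decommit when the caller did not request commit. */
        if (!*commit)
                *commit = decommit_stub(ret, size);
        return (ret);
}

int
main(void)
{
        bool commit = false;
        void *chunk = chunk_alloc_sketch(4096, &commit);

        printf("chunk %p reported %s\n", chunk,
            commit ? "committed" : "decommitted");
        free(chunk);
        return (0);
}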
--- src/chunk_dss.c | 3 ++- src/chunk_mmap.c | 6 ++++-- src/pages.c | 8 +++++++- test/integration/chunk.c | 25 ++++++++++++++----------- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 1035581..de0546d 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -145,7 +145,8 @@ chunk_alloc_dss(arena_t *arena, void *new_addr, size_t size, size_t alignment, ret, size); memset(ret, 0, size); } - *commit = true; + if (!*commit) + *commit = pages_decommit(ret, size); return (ret); } } while (dss_prev != (void *)-1); diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index a91a14c..36eb075 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -24,7 +24,8 @@ chunk_alloc_mmap_slow(size_t size, size_t alignment, bool *zero, bool *commit) assert(ret != NULL); *zero = true; - *commit = true; + if (!*commit) + *commit = pages_decommit(ret, size); return (ret); } @@ -61,7 +62,8 @@ chunk_alloc_mmap(size_t size, size_t alignment, bool *zero, bool *commit) assert(ret != NULL); *zero = true; - *commit = true; + if (!*commit) + *commit = pages_decommit(ret, size); return (ret); } diff --git a/src/pages.c b/src/pages.c index 6f775dc..83a167f 100644 --- a/src/pages.c +++ b/src/pages.c @@ -102,7 +102,13 @@ pages_commit_impl(void *addr, size_t size, bool commit) { #ifndef _WIN32 - if (config_debug) { + /* + * The following decommit/commit implementation is functional, but + * always disabled because it doesn't add value beyong improved + * debugging (at the cost of extra system calls) on systems that + * overcommit. + */ + if (false) { int prot = commit ? (PROT_READ | PROT_WRITE) : PROT_NONE; void *result = mmap(addr, size, prot, MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); diff --git a/test/integration/chunk.c b/test/integration/chunk.c index b49afa5..8679f7b 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -49,25 +49,30 @@ bool chunk_commit(void *chunk, size_t size, size_t offset, size_t length, unsigned arena_ind) { + bool err; TRACE_HOOK("%s(chunk=%p, size=%zu, offset=%zu, length=%zu, " "arena_ind=%u)\n", __func__, chunk, size, offset, length, arena_ind); - did_commit = true; - memset((void *)((uintptr_t)chunk + offset), 0, length); - return (false); + err = old_hooks.commit(chunk, size, offset, length, arena_ind); + did_commit = !err; + return (err); } bool chunk_decommit(void *chunk, size_t size, size_t offset, size_t length, unsigned arena_ind) { + bool err; TRACE_HOOK("%s(chunk=%p, size=%zu, offset=%zu, length=%zu, " "arena_ind=%u)\n", __func__, chunk, size, offset, length, arena_ind); - did_decommit = true; - return (!do_decommit); + if (!do_decommit) + return (true); + err = old_hooks.decommit(chunk, size, offset, length, arena_ind); + did_decommit = !err; + return (err); } bool @@ -167,7 +172,7 @@ TEST_BEGIN(test_chunk) assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, "Unexpected arena.0.purge error"); assert_true(did_dalloc, "Expected dalloc"); - assert_true(did_decommit, "Expected decommit"); + assert_false(did_decommit, "Unexpected decommit"); assert_true(did_purge, "Expected purge"); dallocx(p, 0); do_dalloc = true; @@ -185,12 +190,11 @@ TEST_BEGIN(test_chunk) "Unexpected xallocx() failure"); assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, "Unexpected arena.0.purge error"); - assert_true(did_decommit, "Expected decommit"); assert_true(did_split, "Expected split"); assert_zu_eq(xallocx(p, huge0 * 2, 0, 0), huge0 * 2, "Unexpected xallocx() failure"); - assert_true(did_commit, "Expected commit"); - 
assert_true(did_commit, "Expected merge"); + assert_b_eq(did_decommit, did_commit, "Expected decommit/commit match"); + assert_true(did_merge, "Expected merge"); dallocx(p, 0); do_dalloc = true; do_decommit = false; @@ -222,11 +226,10 @@ TEST_BEGIN(test_chunk) "Unexpected xallocx() failure"); assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, "Unexpected arena.0.purge error"); - assert_true(did_decommit, "Expected decommit"); did_commit = false; assert_zu_eq(xallocx(p, large1, 0, 0), large1, "Unexpected xallocx() failure"); - assert_true(did_commit, "Expected commit"); + assert_b_eq(did_decommit, did_commit, "Expected decommit/commit match"); dallocx(p, 0); do_decommit = false; -- cgit v0.12 From 2662ba54494e37074bbc0fc7e4b3eafdd793e98e Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 11:10:42 -0700 Subject: Stop forcing --enable-munmap on MinGW. This is no longer necessary because of the more general chunk merge/split approach to dealing with map coalescing. --- INSTALL | 5 +---- configure.ac | 4 ---- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/INSTALL b/INSTALL index 5413ae8..8d39687 100644 --- a/INSTALL +++ b/INSTALL @@ -150,10 +150,7 @@ any of the following arguments (not a definitive list) to 'configure': the virtual memory for later use. munmap() is disabled by default (i.e. --disable-munmap is implied) on Linux, which has a quirk in its virtual memory allocation algorithm that causes semi-permanent VM map holes under - normal jemalloc operation. Conversely, munmap() (actually VirtualFree()) is - forcefully enabled on MinGW because virtual memory mappings do not - automatically coalesce (nor fragment on demand), and extra bookkeeping - would be required to track mapping boundaries. + normal jemalloc operation. --disable-fill Disable support for junk/zero filling of memory, quarantine, and redzones. diff --git a/configure.ac b/configure.ac index 6f44b6c..f7c7f3c 100644 --- a/configure.ac +++ b/configure.ac @@ -881,10 +881,6 @@ fi ], [enable_munmap="${default_munmap}"] ) -if test "x$enable_munmap" = "x0" -a "x${maps_coalesce}" = "x0" ; then - AC_MSG_RESULT([Forcing munmap to avoid non-coalescing map issues]) - enable_munmap="1" -fi if test "x$enable_munmap" = "x1" ; then AC_DEFINE([JEMALLOC_MUNMAP], [ ]) fi -- cgit v0.12 From e1ed698973f2465c15173ad7aecb720c57161052 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 11:30:59 -0700 Subject: Remove obsolete entry. --- ChangeLog | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index ed5777d..6f1d2c6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -83,10 +83,6 @@ brevity. Much more detail can be found in the git revision history: + OpenRISC/or1k Optimizations: - - Switch run and chunk allocation from first-best-fit (among best-fit - candidates, choose the lowest in memory) to first-fit (among all candidates, - choose the lowest in memory). This tends to reduce chunk and virtual memory - fragmentation, respectively. - Maintain dirty runs in per arena LRUs rather than in per arena trees of dirty-run-containing chunks. In practice this change significantly reduces dirty page purging volume. -- cgit v0.12 From 38d8f50144d229daf4449112d412613bbaa43502 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 12:06:38 -0700 Subject: Fix assertion in test. 
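The one-line change below guards the junk test's sanity assertion with config_fill: in a build without fill support, opt_junk_alloc and opt_junk_free are necessarily both false, so the unguarded assertion would abort spuriously. A tiny sketch of the guarded form, with constants standing in for jemalloc's config/opt globals:

#include <assert.h>
#include <stdbool.h>

static const bool config_fill = false;          /* e.g. --disable-fill build */
static const bool opt_junk_alloc = false;
static const bool opt_junk_free = false;

int
main(void)
{
        /* Without the !config_fill guard, this aborts on no-fill builds. */
        assert(!config_fill || opt_junk_alloc || opt_junk_free);
        return (0);
}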
--- test/unit/junk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/junk.c b/test/unit/junk.c index 733f661..01d314b 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -244,7 +244,7 @@ int main(void) { - assert(opt_junk_alloc || opt_junk_free); + assert(!config_fill || opt_junk_alloc || opt_junk_free); return (test( test_junk_small, test_junk_large, -- cgit v0.12 From 694d0829c06ac61e0f8e411492a13b04b5c6eec1 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 13:03:43 -0700 Subject: Update list of private symbols. --- include/jemalloc/internal/private_symbols.txt | 39 ++++++++++----------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index bd4fe8f..dbf6aa7 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -2,8 +2,6 @@ a0dalloc a0get a0malloc arena_aalloc -arena_get -arena_get_hard arena_alloc_junk_small arena_bin_index arena_bin_info @@ -29,6 +27,8 @@ arena_dalloc_large_junked_locked arena_dalloc_small arena_dss_prec_get arena_dss_prec_set +arena_get +arena_get_hard arena_init arena_lg_dirty_mult_default_get arena_lg_dirty_mult_default_set @@ -47,15 +47,15 @@ arena_mapbits_large_binind_set arena_mapbits_large_get arena_mapbits_large_set arena_mapbits_large_size_get +arena_mapbitsp_get +arena_mapbitsp_read +arena_mapbitsp_write arena_mapbits_small_runind_get arena_mapbits_small_set arena_mapbits_unallocated_set arena_mapbits_unallocated_size_get arena_mapbits_unallocated_size_set arena_mapbits_unzeroed_get -arena_mapbitsp_get -arena_mapbitsp_read -arena_mapbitsp_write arena_maxclass arena_maxrun arena_maybe_purge @@ -91,11 +91,11 @@ arena_redzone_corruption arena_run_regind arena_run_to_miscelm arena_salloc +arenas_cache_bypass_cleanup +arenas_cache_cleanup arena_sdalloc arena_stats_merge arena_tcache_fill_small -arenas_cache_bypass_cleanup -arenas_cache_cleanup atomic_add_p atomic_add_u atomic_add_uint32 @@ -131,8 +131,8 @@ bootstrap_free bootstrap_malloc bt_init buferror -chunk_alloc_cache chunk_alloc_base +chunk_alloc_cache chunk_alloc_dss chunk_alloc_mmap chunk_alloc_wrapper @@ -160,26 +160,20 @@ chunk_prefork chunk_purge_arena chunk_purge_wrapper chunk_register -chunks_rtree chunksize chunksize_mask -ckh_bucket_search +chunks_rtree ckh_count ckh_delete -ckh_evict_reloc_insert ckh_insert -ckh_isearch ckh_iter ckh_new ckh_pointer_hash ckh_pointer_keycomp -ckh_rebuild ckh_remove ckh_search ckh_string_hash ckh_string_keycomp -ckh_try_bucket_insert -ckh_try_insert ctl_boot ctl_bymib ctl_byname @@ -250,12 +244,9 @@ hash_x64_128 hash_x86_128 hash_x86_32 huge_aalloc -huge_allocated huge_dalloc huge_dalloc_junk huge_malloc -huge_ndalloc -huge_nmalloc huge_palloc huge_prof_tctx_get huge_prof_tctx_set @@ -271,11 +262,11 @@ idalloct idalloctm imalloc imalloct -in_valgrind index2size index2size_compute index2size_lookup index2size_tab +in_valgrind ipalloc ipalloct ipallocztm @@ -285,8 +276,8 @@ iralloct iralloct_realign isalloc isdalloct -isthreaded isqalloc +isthreaded ivsalloc ixalloc jemalloc_postfork_child @@ -363,7 +354,6 @@ prof_backtrace prof_boot0 prof_boot1 prof_boot2 -prof_bt_count prof_dump_header prof_dump_open prof_free @@ -431,7 +421,6 @@ stats_cactive stats_cactive_add stats_cactive_get stats_cactive_sub -stats_chunks stats_print tcache_alloc_easy tcache_alloc_large @@ -457,21 +446,21 @@ tcache_flush tcache_get tcache_get_hard tcache_maxclass -tcache_salloc 
-tcache_stats_merge tcaches +tcache_salloc tcaches_create tcaches_destroy tcaches_flush tcaches_get +tcache_stats_merge thread_allocated_cleanup thread_deallocated_cleanup -tsd_booted tsd_arena_get tsd_arena_set tsd_boot tsd_boot0 tsd_boot1 +tsd_booted tsd_cleanup tsd_cleanup_wrapper tsd_fetch -- cgit v0.12 From 6ed18cb348c1a1ea84fa8f600db55f8b5fa0f2f9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 15:20:34 -0700 Subject: Fix chunk_dalloc_arena() re: zeroing due to purge. --- src/chunk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/chunk.c b/src/chunk.c index 5ad0281..b5b8f0d 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -584,7 +584,7 @@ chunk_dalloc_arena(arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, committed = chunk_hooks->decommit(chunk, size, 0, size, arena->ind); } - zeroed = !committed || chunk_hooks->purge(chunk, size, 0, size, + zeroed = !committed || !chunk_hooks->purge(chunk, size, 0, size, arena->ind); chunk_record(arena, chunk_hooks, &arena->chunks_szad_retained, &arena->chunks_ad_retained, false, chunk, size, zeroed, committed); -- cgit v0.12 From 828d919b5ec9d8f4438f80d160a6c8c9a871d1e1 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 15:21:07 -0700 Subject: Fix test for MinGW. --- test/integration/chunk.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/test/integration/chunk.c b/test/integration/chunk.c index 8679f7b..7eb1b6d 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -126,6 +126,7 @@ TEST_BEGIN(test_chunk) chunk_split, chunk_merge }; + bool xallocx_success_a, xallocx_success_b, xallocx_success_c; /* Install custom chunk hooks. */ old_size = sizeof(chunk_hooks_t); @@ -167,13 +168,16 @@ TEST_BEGIN(test_chunk) did_dalloc = false; did_decommit = false; did_purge = false; - assert_zu_eq(xallocx(p, huge0, 0, 0), huge0, - "Unexpected xallocx() failure"); + did_split = false; + xallocx_success_a = (xallocx(p, huge0, 0, 0) == huge0); assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, "Unexpected arena.0.purge error"); - assert_true(did_dalloc, "Expected dalloc"); - assert_false(did_decommit, "Unexpected decommit"); - assert_true(did_purge, "Expected purge"); + if (xallocx_success_a) { + assert_true(did_dalloc, "Expected dalloc"); + assert_false(did_decommit, "Unexpected decommit"); + assert_true(did_purge, "Expected purge"); + } + assert_true(did_split, "Expected split"); dallocx(p, 0); do_dalloc = true; @@ -186,15 +190,15 @@ TEST_BEGIN(test_chunk) did_commit = false; did_split = false; did_merge = false; - assert_zu_eq(xallocx(p, huge0, 0, 0), huge0, - "Unexpected xallocx() failure"); + xallocx_success_b = (xallocx(p, huge0, 0, 0) == huge0); assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, "Unexpected arena.0.purge error"); - assert_true(did_split, "Expected split"); - assert_zu_eq(xallocx(p, huge0 * 2, 0, 0), huge0 * 2, - "Unexpected xallocx() failure"); + if (xallocx_success_b) + assert_true(did_split, "Expected split"); + xallocx_success_c = (xallocx(p, huge0 * 2, 0, 0) == huge0 * 2); assert_b_eq(did_decommit, did_commit, "Expected decommit/commit match"); - assert_true(did_merge, "Expected merge"); + if (xallocx_success_b && xallocx_success_c) + assert_true(did_merge, "Expected merge"); dallocx(p, 0); do_dalloc = true; do_decommit = false; -- cgit v0.12 From 56af64dc1920c50d5cf7aee329b2bb1fe1d23519 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 16:38:20 -0700 Subject: Fix a strict aliasing 
violation. --- src/chunk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/chunk.c b/src/chunk.c index b5b8f0d..6ba1ca7 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -89,7 +89,12 @@ chunk_hooks_set(arena_t *arena, const chunk_hooks_t *chunk_hooks) * correctness, so they perform unlocked reads. */ #define ATOMIC_COPY_HOOK(n) do { \ - atomic_write_p((void **)&arena->chunk_hooks.n, chunk_hooks->n); \ + union { \ + chunk_##n##_t **n; \ + void **v; \ + } u; \ + u.n = &arena->chunk_hooks.n; \ + atomic_write_p(u.v, chunk_hooks->n); \ } while (0) ATOMIC_COPY_HOOK(alloc); ATOMIC_COPY_HOOK(dalloc); -- cgit v0.12 From 7928f62273e43bf4e22c664916fb1486329bae48 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 16:38:39 -0700 Subject: Check whether gcc version supports __builtin_unreachable(). --- include/jemalloc/internal/util.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index c977aea..c23e4ac 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -57,10 +57,21 @@ # define JEMALLOC_CC_SILENCE_INIT(v) #endif +#define JEMALLOC_GNUC_PREREQ(major, minor) \ + (!defined(__clang__) && \ + (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) +#define JEMALLOC_CLANG_HAS_BUILTIN(builtin) \ + (defined(__clang__) && __has_builtin(builtin)) + #ifdef __GNUC__ # define likely(x) __builtin_expect(!!(x), 1) # define unlikely(x) __builtin_expect(!!(x), 0) +# if JEMALLOC_GNUC_PREREQ(4, 6) || \ + JEMALLOC_CLANG_HAS_BUILTIN(__builtin_unreachable) # define unreachable() __builtin_unreachable() +# else +# define unreachable() +# endif #else # define likely(x) !!(x) # define unlikely(x) !!(x) -- cgit v0.12 From fead75fd52acec0c7a4b6a6e39a25d3de8fbedc2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 12 Aug 2015 16:46:09 -0700 Subject: Fix gcc build failure (define __has_builtin). --- include/jemalloc/internal/util.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index c23e4ac..b2ea740 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -60,6 +60,9 @@ #define JEMALLOC_GNUC_PREREQ(major, minor) \ (!defined(__clang__) && \ (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) +#ifndef __has_builtin +# define __has_builtin(builtin) (0) +#endif #define JEMALLOC_CLANG_HAS_BUILTIN(builtin) \ (defined(__clang__) && __has_builtin(builtin)) -- cgit v0.12 From 85ae064e96387347915973dcc1ac15e553259bc9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Thu, 13 Aug 2015 14:54:06 -0700 Subject: Fix a comment. --- include/jemalloc/internal/arena.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index f972e8b..cb015ee 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -54,7 +54,7 @@ struct arena_chunk_map_bits_s { * Run address (or size) and various flags are stored together. The bit * layout looks like (assuming 32-bit system): * - * ???????? ???????? ???nnnnn nnndulma + * ???????? ???????? ???nnnnn nnndumla * * ? : Unallocated: Run address for first/last pages, unset for internal * pages. -- cgit v0.12 From a5dbaef08f583ca5e89c8dd20726430452212469 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 14 Aug 2015 00:35:11 -0700 Subject: Update large/huge size class cutoff documentation. 
--- doc/jemalloc.xml.in | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 39f6a34..dd9387f 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -571,7 +571,7 @@ for (i = 0; i < nbins; i++) { both large or both huge. In such cases shrinkage always succeeds, but growth only succeeds if the trailing memory is currently available. - Assuming 256 KiB chunks, 4 KiB pages, and a 16-byte quantum on a + Assuming 2 MiB chunks, 4 KiB pages, and a 16-byte quantum on a 64-bit system, the size classes in each category are as shown in . @@ -627,7 +627,7 @@ for (i = 0; i < nbins; i++) { [10 KiB, 12 KiB, 14 KiB] - Large + Large 2 KiB [16 KiB] @@ -645,12 +645,7 @@ for (i = 0; i < nbins; i++) { 32 KiB - [160 KiB, 192 KiB, 224 KiB] - - - Huge - 32 KiB - [256 KiB] + [160 KiB, 192 KiB, 224 KiB, 256 KiB] 64 KiB @@ -662,7 +657,12 @@ for (i = 0; i < nbins; i++) { 256 KiB - [1280 KiB, 1536 KiB, 1792 KiB, 2 MiB] + [1280 KiB, 1536 KiB, 1792 KiB] + + + Huge + 256 KiB + [2 MiB] 512 KiB -- cgit v0.12 From 38f864947bc63ff2728cf23905682612418404fb Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 14 Aug 2015 00:55:44 -0700 Subject: Update in-place reallocation documentation. --- doc/jemalloc.xml.in | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index dd9387f..e12abb0 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -568,8 +568,13 @@ for (i = 0; i < nbins; i++) { up to the same size class. No other API guarantees are made regarding in-place resizing, but the current implementation also tries to resize large and huge allocations in place, as long as the pre-size and post-size are - both large or both huge. In such cases shrinkage always succeeds, but - growth only succeeds if the trailing memory is currently available. + both large or both huge. In such cases shrinkage always succeeds for large + size classes, but for huge size classes the chunk allocator must support + splitting (see arena.<i>.chunk_hooks). + Growth only succeeds if the trailing memory is currently available, and + additionally for huge size classes the chunk allocator must support + merging. Assuming 2 MiB chunks, 4 KiB pages, and a 16-byte quantum on a 64-bit system, the size classes in each category are as shown in + operating system kernel do not automatically coalesce and split, e.g. + Windows. typedef void *(chunk_alloc_t) -- cgit v0.12 From 92e96e3bfc5e02c6d59f0db931f7acb29c3b82ff Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 14 Aug 2015 13:46:08 -0700 Subject: Improve arena..chunk_hooks documentation formatting. --- doc/jemalloc.xml.in | 83 +++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index e12abb0..8fc774b 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1541,7 +1541,7 @@ malloc_conf = "xmalloc:true";]]> to the application having an opportunity to take over chunk allocation. - - The chunk_hooks_t structure comprises function pointers - which are described individually below. jemalloc uses these + The chunk_hooks_t structure comprises function + pointers which are described individually below. jemalloc uses these functions to manage chunk lifetime, which starts off with allocation of mapped committed memory, in the simplest case followed by deallocation. 
However, there are performance and platform reasons to retain chunks for @@ -1565,7 +1565,7 @@ typedef struct { operating system kernel do not automatically coalesce and split, e.g. Windows. - + typedef void *(chunk_alloc_t) void *chunk size_t size @@ -1574,9 +1574,10 @@ typedef struct { bool *commit unsigned arena_ind - A chunk allocation function conforms to the chunk_alloc_t - type and upon success returns a pointer to size - bytes of mapped memory on behalf of arena + + A chunk allocation function conforms to the + chunk_alloc_t type and upon success returns a pointer to + size bytes of mapped memory on behalf of arena arena_ind such that the chunk's base address is a multiple of alignment, as well as setting *zero to indicate whether the chunk is zeroed and @@ -1599,13 +1600,15 @@ typedef struct { linkend="arena.i.dss">arena.<i>.dss setting irrelevant. - + typedef bool (chunk_dalloc_t) void *chunk size_t size bool committed unsigned arena_ind + + A chunk deallocation function conforms to the chunk_dalloc_t type and deallocates a chunk of given size with @@ -1616,7 +1619,7 @@ typedef struct { remains mapped, in the same commit state, and available for future use, in which case it will be automatically retained for later reuse. - + typedef bool (chunk_commit_t) void *chunk size_t size @@ -1624,11 +1627,12 @@ typedef struct { size_t length unsigned arena_ind - A chunk commit function conforms to the chunk_commit_t type - and commits zeroed physical memory to back pages within a - chunk of given size at - offset bytes, extending for - length on behalf of arena + + A chunk commit function conforms to the + chunk_commit_t type and commits zeroed physical memory to + back pages within a chunk of given + size at offset bytes, + extending for length on behalf of arena arena_ind, returning false upon success. Committed memory may be committed in absolute terms as on a system that does not overcommit, or in implicit terms as on a system that @@ -1636,7 +1640,7 @@ typedef struct { faults. If the function returns true, this indicates insufficient physical memory to satisfy the request. - + typedef bool (chunk_decommit_t) void *chunk size_t size @@ -1644,18 +1648,19 @@ typedef struct { size_t length unsigned arena_ind - A chunk decommit function conforms to the chunk_decommit_t - type and decommits any physical memory that is backing pages within a - chunk of given size at - offset bytes, extending for - length on behalf of arena + + A chunk decommit function conforms to the + chunk_decommit_t type and decommits any physical memory + that is backing pages within a chunk of given + size at offset bytes, + extending for length on behalf of arena arena_ind, returning false upon success, in which case the pages will be committed via the chunk commit function before being reused. If the function returns true, this indicates opt-out from decommit; the memory remains committed and available for future use, in which case it will be automatically retained for later reuse. 
- + typedef bool (chunk_purge_t) void *chunk size_tsize @@ -1663,16 +1668,17 @@ typedef struct { size_t length unsigned arena_ind - A chunk purge function conforms to the chunk_purge_t type - and optionally discards physical pages within the virtual memory mapping - associated with chunk of given + + A chunk purge function conforms to the chunk_purge_t + type and optionally discards physical pages within the virtual memory + mapping associated with chunk of given size at offset bytes, extending for length on behalf of arena arena_ind, returning false if pages within the purged virtual memory range will be zero-filled the next time they are accessed. - + typedef bool (chunk_split_t) void *chunk size_t size @@ -1681,8 +1687,9 @@ typedef struct { bool committed unsigned arena_ind - A chunk split function conforms to the chunk_split_t type - and optionally splits chunk of given + + A chunk split function conforms to the chunk_split_t + type and optionally splits chunk of given size into two adjacent chunks, the first of size_a bytes, and the second of size_b bytes, operating on @@ -1692,7 +1699,7 @@ typedef struct { remains unsplit and therefore should continue to be operated on as a whole. - + typedef bool (chunk_merge_t) void *chunk_a size_t size_a @@ -1701,15 +1708,17 @@ typedef struct { bool committed unsigned arena_ind - A chunk merge function conforms to the chunk_merge_t type - and optionally merges adjacent chunks, chunk_a of - given size_a and chunk_b - of given size_b into one contiguous chunk, - operating on committed/decommitted memory as - indicated, on behalf of arena arena_ind, - returning false upon success. If the function returns true, this - indicates that the chunks remain distinct mappings and therefore should - continue to be operated on independently. + + A chunk merge function conforms to the chunk_merge_t + type and optionally merges adjacent chunks, + chunk_a of given size_a + and chunk_b of given + size_b into one contiguous chunk, operating on + committed/decommitted memory as indicated, on + behalf of arena arena_ind, returning false upon + success. If the function returns true, this indicates that the chunks + remain distinct mappings and therefore should continue to be operated on + independently. -- cgit v0.12 From 9b68f67223b94f4711cbf88722579b2d6dcd013d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 17 Aug 2015 13:21:08 -0700 Subject: Update ChangeLog for 4.0.0. --- ChangeLog | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6f1d2c6..0cf887c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,8 +4,7 @@ brevity. Much more detail can be found in the git revision history: https://github.com/jemalloc/jemalloc -* 4.0.0 (XXX) See https://github.com/jemalloc/jemalloc/milestones/4.0.0 for - remaining work. +* 4.0.0 (August 17, 2015) This version contains many speed and space optimizations, both minor and major. The major themes are generalization, unification, and simplification. -- cgit v0.12