Diffstat (limited to 'src')
-rw-r--r-- | src/arena.c | 376
-rw-r--r-- | src/background_thread.c | 14
-rw-r--r-- | src/base.c | 151
-rw-r--r-- | src/bin.c | 50
-rw-r--r-- | src/ctl.c | 152
-rw-r--r-- | src/div.c | 55
-rw-r--r-- | src/extent.c | 470
-rw-r--r-- | src/extent_dss.c | 5
-rw-r--r-- | src/jemalloc.c | 160
-rw-r--r-- | src/jemalloc_cpp.cpp | 23
-rw-r--r-- | src/log.c | 78
-rw-r--r-- | src/malloc_io.c | 2
-rw-r--r-- | src/mutex.c | 3
-rw-r--r-- | src/pages.c | 157
-rw-r--r-- | src/prof.c | 8
-rw-r--r-- | src/spin.c | 4
-rw-r--r-- | src/stats.c | 112
-rw-r--r-- | src/sz.c | 3
-rw-r--r-- | src/tcache.c | 59
-rw-r--r-- | src/tsd.c | 10
-rw-r--r-- | src/zone.c | 30
21 files changed, 1368 insertions, 554 deletions
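Two additions below are worth noting before the per-file hunks: the new src/div.c lets arena_slab_regind() (src/arena.c) divide a slab offset by a variable region size using a precomputed 32-bit reciprocal instead of the old per-size-class switch, and src/base.c gains optional transparent-huge-page backing for metadata (opt_metadata_thp and the n_thp statistic). A minimal standalone sketch of the reciprocal-division trick, following the k = 32 derivation in the div.c comment (function names here are illustrative, not the patch's API):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/*
 * magic = ceil(2^32 / d) for a fixed divisor d > 1; d == 1 is excluded, just
 * as in div_init(), because the magic value would not fit in 32 bits.
 */
static uint32_t
div_magic_init(size_t d) {
	assert(d > 1);
	uint64_t two_to_k = (uint64_t)1 << 32;
	uint32_t magic = (uint32_t)(two_to_k / d);
	if (two_to_k % d != 0) {
		magic++;	/* Round up unless d is a power of two. */
	}
	return magic;
}

/*
 * n / d without a divide instruction; exact whenever d divides n and
 * n < 2^32, which holds for region offsets within a slab.
 */
static size_t
div_magic_compute(uint32_t magic, size_t n) {
	return (size_t)(((uint64_t)magic * n) >> 32);
}

int
main(void) {
	/* E.g. a 48-byte size class: offset 7 * 48 maps back to region 7. */
	uint32_t magic = div_magic_init(48);
	assert(div_magic_compute(magic, 7 * 48) == 7);
	return 0;
}

In the patch, arena_boot() fills arena_binind_div_info[] with one such magic value per small size class, so the deallocation path computes the region index with a multiply and shift rather than the removed switch over size classes.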
diff --git a/src/arena.c b/src/arena.c index 632fce5..40ef143 100644 --- a/src/arena.c +++ b/src/arena.c @@ -3,6 +3,7 @@ #include "jemalloc/internal/jemalloc_internal_includes.h" #include "jemalloc/internal/assert.h" +#include "jemalloc/internal/div.h" #include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/extent_mmap.h" #include "jemalloc/internal/mutex.h" @@ -32,21 +33,6 @@ ssize_t opt_muzzy_decay_ms = MUZZY_DECAY_MS_DEFAULT; static atomic_zd_t dirty_decay_ms_default; static atomic_zd_t muzzy_decay_ms_default; -const arena_bin_info_t arena_bin_info[NBINS] = { -#define BIN_INFO_bin_yes(reg_size, slab_size, nregs) \ - {reg_size, slab_size, nregs, BITMAP_INFO_INITIALIZER(nregs)}, -#define BIN_INFO_bin_no(reg_size, slab_size, nregs) -#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \ - lg_delta_lookup) \ - BIN_INFO_bin_##bin((1U<<lg_grp) + (ndelta<<lg_delta), \ - (pgs << LG_PAGE), (pgs << LG_PAGE) / ((1U<<lg_grp) + \ - (ndelta<<lg_delta))) - SIZE_CLASSES -#undef BIN_INFO_bin_yes -#undef BIN_INFO_bin_no -#undef SC -}; - const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { #define STEP(step, h, x, y) \ h, @@ -54,6 +40,8 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { #undef STEP }; +static div_info_t arena_binind_div_info[NBINS]; + /******************************************************************************/ /* * Function prototypes for static functions that are referenced prior to @@ -62,155 +50,16 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { static void arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay, extents_t *extents, bool all, size_t npages_limit, - bool is_background_thread); + size_t npages_decay_max, bool is_background_thread); static bool arena_decay_dirty(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all); static void arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, - arena_bin_t *bin); + bin_t *bin); static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, - arena_bin_t *bin); + bin_t *bin); /******************************************************************************/ -static bool -arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) { - if (config_debug) { - for (size_t i = 0; i < sizeof(arena_stats_t); i++) { - assert(((char *)arena_stats)[i] == 0); - } - } -#ifndef JEMALLOC_ATOMIC_U64 - if (malloc_mutex_init(&arena_stats->mtx, "arena_stats", - WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) { - return true; - } -#endif - /* Memory is zeroed, so there is no need to clear stats. 
*/ - return false; -} - -static void -arena_stats_lock(tsdn_t *tsdn, arena_stats_t *arena_stats) { -#ifndef JEMALLOC_ATOMIC_U64 - malloc_mutex_lock(tsdn, &arena_stats->mtx); -#endif -} - -static void -arena_stats_unlock(tsdn_t *tsdn, arena_stats_t *arena_stats) { -#ifndef JEMALLOC_ATOMIC_U64 - malloc_mutex_unlock(tsdn, &arena_stats->mtx); -#endif -} - -static uint64_t -arena_stats_read_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, - arena_stats_u64_t *p) { -#ifdef JEMALLOC_ATOMIC_U64 - return atomic_load_u64(p, ATOMIC_RELAXED); -#else - malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); - return *p; -#endif -} - -static void -arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, - arena_stats_u64_t *p, uint64_t x) { -#ifdef JEMALLOC_ATOMIC_U64 - atomic_fetch_add_u64(p, x, ATOMIC_RELAXED); -#else - malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); - *p += x; -#endif -} - -UNUSED static void -arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, - arena_stats_u64_t *p, uint64_t x) { -#ifdef JEMALLOC_ATOMIC_U64 - UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED); - assert(r - x <= r); -#else - malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); - *p -= x; - assert(*p + x >= *p); -#endif -} - -/* - * Non-atomically sets *dst += src. *dst needs external synchronization. - * This lets us avoid the cost of a fetch_add when its unnecessary (note that - * the types here are atomic). - */ -static void -arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) { -#ifdef JEMALLOC_ATOMIC_U64 - uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED); - atomic_store_u64(dst, src + cur_dst, ATOMIC_RELAXED); -#else - *dst += src; -#endif -} - -static size_t -arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) { -#ifdef JEMALLOC_ATOMIC_U64 - return atomic_load_zu(p, ATOMIC_RELAXED); -#else - malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); - return atomic_load_zu(p, ATOMIC_RELAXED); -#endif -} - -static void -arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p, - size_t x) { -#ifdef JEMALLOC_ATOMIC_U64 - atomic_fetch_add_zu(p, x, ATOMIC_RELAXED); -#else - malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); - size_t cur = atomic_load_zu(p, ATOMIC_RELAXED); - atomic_store_zu(p, cur + x, ATOMIC_RELAXED); -#endif -} - -static void -arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p, - size_t x) { -#ifdef JEMALLOC_ATOMIC_U64 - UNUSED size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED); - assert(r - x <= r); -#else - malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); - size_t cur = atomic_load_zu(p, ATOMIC_RELAXED); - atomic_store_zu(p, cur - x, ATOMIC_RELAXED); -#endif -} - -/* Like the _u64 variant, needs an externally synchronized *dst. 
*/ -static void -arena_stats_accum_zu(atomic_zu_t *dst, size_t src) { - size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED); - atomic_store_zu(dst, src + cur_dst, ATOMIC_RELAXED); -} - -void -arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats, - szind_t szind, uint64_t nrequests) { - arena_stats_lock(tsdn, arena_stats); - arena_stats_add_u64(tsdn, arena_stats, &arena_stats->lstats[szind - - NBINS].nrequests, nrequests); - arena_stats_unlock(tsdn, arena_stats); -} - -void -arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t size) { - arena_stats_lock(tsdn, arena_stats); - arena_stats_add_zu(tsdn, arena_stats, &arena_stats->mapped, size); - arena_stats_unlock(tsdn, arena_stats); -} - void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, @@ -228,15 +77,15 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, - malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats) { + bin_stats_t *bstats, arena_stats_large_t *lstats) { cassert(config_stats); arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms, muzzy_decay_ms, nactive, ndirty, nmuzzy); - size_t base_allocated, base_resident, base_mapped; + size_t base_allocated, base_resident, base_mapped, metadata_thp; base_stats_get(tsdn, arena->base, &base_allocated, &base_resident, - &base_mapped); + &base_mapped, &metadata_thp); arena_stats_lock(tsdn, &arena->stats); @@ -267,6 +116,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, arena_stats_accum_zu(&astats->base, base_allocated); arena_stats_accum_zu(&astats->internal, arena_internal_get(arena)); + arena_stats_accum_zu(&astats->metadata_thp, metadata_thp); arena_stats_accum_zu(&astats->resident, base_resident + (((atomic_load_zu(&arena->nactive, ATOMIC_RELAXED) + extents_npages_get(&arena->extents_dirty) + @@ -303,16 +153,16 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, /* tcache_bytes counts currently cached bytes. 
*/ atomic_store_zu(&astats->tcache_bytes, 0, ATOMIC_RELAXED); malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); - tcache_t *tcache; - ql_foreach(tcache, &arena->tcache_ql, link) { + cache_bin_array_descriptor_t *descriptor; + ql_foreach(descriptor, &arena->cache_bin_array_descriptor_ql, link) { szind_t i = 0; for (; i < NBINS; i++) { - tcache_bin_t *tbin = tcache_small_bin_get(tcache, i); + cache_bin_t *tbin = &descriptor->bins_small[i]; arena_stats_accum_zu(&astats->tcache_bytes, tbin->ncached * sz_index2size(i)); } for (; i < nhbins; i++) { - tcache_bin_t *tbin = tcache_large_bin_get(tcache, i); + cache_bin_t *tbin = &descriptor->bins_large[i]; arena_stats_accum_zu(&astats->tcache_bytes, tbin->ncached * sz_index2size(i)); } @@ -351,20 +201,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, nstime_subtract(&astats->uptime, &arena->create_time); for (szind_t i = 0; i < NBINS; i++) { - arena_bin_t *bin = &arena->bins[i]; - - malloc_mutex_lock(tsdn, &bin->lock); - malloc_mutex_prof_read(tsdn, &bstats[i].mutex_data, &bin->lock); - bstats[i].nmalloc += bin->stats.nmalloc; - bstats[i].ndalloc += bin->stats.ndalloc; - bstats[i].nrequests += bin->stats.nrequests; - bstats[i].curregs += bin->stats.curregs; - bstats[i].nfills += bin->stats.nfills; - bstats[i].nflushes += bin->stats.nflushes; - bstats[i].nslabs += bin->stats.nslabs; - bstats[i].reslabs += bin->stats.reslabs; - bstats[i].curslabs += bin->stats.curslabs; - malloc_mutex_unlock(tsdn, &bin->lock); + bin_stats_merge(tsdn, &bstats[i], &arena->bins[i]); } } @@ -384,8 +221,7 @@ arena_extents_dirty_dalloc(tsdn_t *tsdn, arena_t *arena, } static void * -arena_slab_reg_alloc(tsdn_t *tsdn, extent_t *slab, - const arena_bin_info_t *bin_info) { +arena_slab_reg_alloc(tsdn_t *tsdn, extent_t *slab, const bin_info_t *bin_info) { void *ret; arena_slab_data_t *slab_data = extent_slab_data_get(slab); size_t regind; @@ -412,28 +248,14 @@ arena_slab_regind(extent_t *slab, szind_t binind, const void *ptr) { assert((uintptr_t)ptr < (uintptr_t)extent_past_get(slab)); /* Freeing an interior pointer can cause assertion failure. */ assert(((uintptr_t)ptr - (uintptr_t)extent_addr_get(slab)) % - (uintptr_t)arena_bin_info[binind].reg_size == 0); + (uintptr_t)bin_infos[binind].reg_size == 0); - /* Avoid doing division with a variable divisor. */ diff = (size_t)((uintptr_t)ptr - (uintptr_t)extent_addr_get(slab)); - switch (binind) { -#define REGIND_bin_yes(index, reg_size) \ - case index: \ - regind = diff / (reg_size); \ - assert(diff == regind * (reg_size)); \ - break; -#define REGIND_bin_no(index, reg_size) -#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \ - lg_delta_lookup) \ - REGIND_bin_##bin(index, (1U<<lg_grp) + (ndelta<<lg_delta)) - SIZE_CLASSES -#undef REGIND_bin_yes -#undef REGIND_bin_no -#undef SC - default: not_reached(); - } - assert(regind < arena_bin_info[binind].nregs); + /* Avoid doing division with a variable divisor. 
*/ + regind = div_compute(&arena_binind_div_info[binind], diff); + + assert(regind < bin_infos[binind].nregs); return regind; } @@ -442,7 +264,7 @@ static void arena_slab_reg_dalloc(tsdn_t *tsdn, extent_t *slab, arena_slab_data_t *slab_data, void *ptr) { szind_t binind = extent_szind_get(slab); - const arena_bin_info_t *bin_info = &arena_bin_info[binind]; + const bin_info_t *bin_info = &bin_infos[binind]; size_t regind = arena_slab_regind(slab, binind, ptr); assert(extent_nfree_get(slab) < bin_info->nregs); @@ -692,7 +514,8 @@ arena_decay_try_purge(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay, bool is_background_thread) { if (current_npages > npages_limit) { arena_decay_to_limit(tsdn, arena, decay, extents, false, - npages_limit, is_background_thread); + npages_limit, current_npages - npages_limit, + is_background_thread); } } @@ -756,7 +579,7 @@ arena_decay_reinit(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms) { static bool arena_decay_init(arena_decay_t *decay, extents_t *extents, ssize_t decay_ms, - decay_stats_t *stats) { + arena_stats_decay_t *stats) { if (config_debug) { for (size_t i = 0; i < sizeof(arena_decay_t); i++) { assert(((char *)decay)[i] == 0); @@ -798,7 +621,8 @@ arena_maybe_decay(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay, if (decay_ms <= 0) { if (decay_ms == 0) { arena_decay_to_limit(tsdn, arena, decay, extents, false, - 0, is_background_thread); + 0, extents_npages_get(extents), + is_background_thread); } return false; } @@ -900,14 +724,15 @@ arena_muzzy_decay_ms_set(tsdn_t *tsdn, arena_t *arena, static size_t arena_stash_decayed(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extents_t *extents, size_t npages_limit, - extent_list_t *decay_extents) { + size_t npages_decay_max, extent_list_t *decay_extents) { witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); /* Stash extents according to npages_limit. */ size_t nstashed = 0; extent_t *extent; - while ((extent = extents_evict(tsdn, arena, r_extent_hooks, extents, + while (nstashed < npages_decay_max && + (extent = extents_evict(tsdn, arena, r_extent_hooks, extents, npages_limit)) != NULL) { extent_list_append(decay_extents, extent); nstashed += extent_size_get(extent) >> LG_PAGE; @@ -982,12 +807,15 @@ arena_decay_stashed(tsdn_t *tsdn, arena_t *arena, } /* - * npages_limit: Decay as many dirty extents as possible without violating the - * invariant: (extents_npages_get(extents) >= npages_limit) + * npages_limit: Decay at most npages_decay_max pages without violating the + * invariant: (extents_npages_get(extents) >= npages_limit). We need an upper + * bound on number of pages in order to prevent unbounded growth (namely in + * stashed), otherwise unbounded new pages could be added to extents during the + * current decay run, so that the purging thread never finishes. 
*/ static void arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay, - extents_t *extents, bool all, size_t npages_limit, + extents_t *extents, bool all, size_t npages_limit, size_t npages_decay_max, bool is_background_thread) { witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 1); @@ -1005,7 +833,7 @@ arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay, extent_list_init(&decay_extents); size_t npurge = arena_stash_decayed(tsdn, arena, &extent_hooks, extents, - npages_limit, &decay_extents); + npages_limit, npages_decay_max, &decay_extents); if (npurge != 0) { UNUSED size_t npurged = arena_decay_stashed(tsdn, arena, &extent_hooks, decay, extents, all, &decay_extents, @@ -1023,7 +851,7 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay, if (all) { malloc_mutex_lock(tsdn, &decay->mtx); arena_decay_to_limit(tsdn, arena, decay, extents, all, 0, - is_background_thread); + extents_npages_get(extents), is_background_thread); malloc_mutex_unlock(tsdn, &decay->mtx); return false; @@ -1082,18 +910,18 @@ arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, extent_t *slab) { } static void -arena_bin_slabs_nonfull_insert(arena_bin_t *bin, extent_t *slab) { +arena_bin_slabs_nonfull_insert(bin_t *bin, extent_t *slab) { assert(extent_nfree_get(slab) > 0); extent_heap_insert(&bin->slabs_nonfull, slab); } static void -arena_bin_slabs_nonfull_remove(arena_bin_t *bin, extent_t *slab) { +arena_bin_slabs_nonfull_remove(bin_t *bin, extent_t *slab) { extent_heap_remove(&bin->slabs_nonfull, slab); } static extent_t * -arena_bin_slabs_nonfull_tryget(arena_bin_t *bin) { +arena_bin_slabs_nonfull_tryget(bin_t *bin) { extent_t *slab = extent_heap_remove_first(&bin->slabs_nonfull); if (slab == NULL) { return NULL; @@ -1105,7 +933,7 @@ arena_bin_slabs_nonfull_tryget(arena_bin_t *bin) { } static void -arena_bin_slabs_full_insert(arena_t *arena, arena_bin_t *bin, extent_t *slab) { +arena_bin_slabs_full_insert(arena_t *arena, bin_t *bin, extent_t *slab) { assert(extent_nfree_get(slab) == 0); /* * Tracking extents is required by arena_reset, which is not allowed @@ -1119,7 +947,7 @@ arena_bin_slabs_full_insert(arena_t *arena, arena_bin_t *bin, extent_t *slab) { } static void -arena_bin_slabs_full_remove(arena_t *arena, arena_bin_t *bin, extent_t *slab) { +arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, extent_t *slab) { if (arena_is_auto(arena)) { return; } @@ -1173,7 +1001,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) { /* Bins. 
*/ for (unsigned i = 0; i < NBINS; i++) { extent_t *slab; - arena_bin_t *bin = &arena->bins[i]; + bin_t *bin = &arena->bins[i]; malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); if (bin->slabcur != NULL) { slab = bin->slabcur; @@ -1262,7 +1090,7 @@ arena_destroy(tsd_t *tsd, arena_t *arena) { static extent_t * arena_slab_alloc_hard(tsdn_t *tsdn, arena_t *arena, - extent_hooks_t **r_extent_hooks, const arena_bin_info_t *bin_info, + extent_hooks_t **r_extent_hooks, const bin_info_t *bin_info, szind_t szind) { extent_t *slab; bool zero, commit; @@ -1285,7 +1113,7 @@ arena_slab_alloc_hard(tsdn_t *tsdn, arena_t *arena, static extent_t * arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, - const arena_bin_info_t *bin_info) { + const bin_info_t *bin_info) { witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); @@ -1321,10 +1149,10 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, } static extent_t * -arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin, +arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin, szind_t binind) { extent_t *slab; - const arena_bin_info_t *bin_info; + const bin_info_t *bin_info; /* Look for a usable slab. */ slab = arena_bin_slabs_nonfull_tryget(bin); @@ -1333,7 +1161,7 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin, } /* No existing slabs have any space available. */ - bin_info = &arena_bin_info[binind]; + bin_info = &bin_infos[binind]; /* Allocate a new slab. */ malloc_mutex_unlock(tsdn, &bin->lock); @@ -1364,12 +1192,12 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin, /* Re-fill bin->slabcur, then call arena_slab_reg_alloc(). */ static void * -arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin, +arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin, szind_t binind) { - const arena_bin_info_t *bin_info; + const bin_info_t *bin_info; extent_t *slab; - bin_info = &arena_bin_info[binind]; + bin_info = &bin_infos[binind]; if (!arena_is_auto(arena) && bin->slabcur != NULL) { arena_bin_slabs_full_insert(arena, bin, bin->slabcur); bin->slabcur = NULL; @@ -1420,9 +1248,9 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin, void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, - tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) { + cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) { unsigned i, nfill; - arena_bin_t *bin; + bin_t *bin; assert(tbin->ncached == 0); @@ -1438,7 +1266,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) > 0) { ptr = arena_slab_reg_alloc(tsdn, slab, - &arena_bin_info[binind]); + &bin_infos[binind]); } else { ptr = arena_bin_malloc_hard(tsdn, arena, bin, binind); } @@ -1455,8 +1283,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, break; } if (config_fill && unlikely(opt_junk_alloc)) { - arena_alloc_junk_small(ptr, &arena_bin_info[binind], - true); + arena_alloc_junk_small(ptr, &bin_infos[binind], true); } /* Insert such that low regions get used first. 
*/ *(tbin->avail - nfill + i) = ptr; @@ -1474,14 +1301,14 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, } void -arena_alloc_junk_small(void *ptr, const arena_bin_info_t *bin_info, bool zero) { +arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info, bool zero) { if (!zero) { memset(ptr, JEMALLOC_ALLOC_JUNK, bin_info->reg_size); } } static void -arena_dalloc_junk_small_impl(void *ptr, const arena_bin_info_t *bin_info) { +arena_dalloc_junk_small_impl(void *ptr, const bin_info_t *bin_info) { memset(ptr, JEMALLOC_FREE_JUNK, bin_info->reg_size); } arena_dalloc_junk_small_t *JET_MUTABLE arena_dalloc_junk_small = @@ -1490,7 +1317,7 @@ arena_dalloc_junk_small_t *JET_MUTABLE arena_dalloc_junk_small = static void * arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) { void *ret; - arena_bin_t *bin; + bin_t *bin; size_t usize; extent_t *slab; @@ -1500,7 +1327,7 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) { malloc_mutex_lock(tsdn, &bin->lock); if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) > 0) { - ret = arena_slab_reg_alloc(tsdn, slab, &arena_bin_info[binind]); + ret = arena_slab_reg_alloc(tsdn, slab, &bin_infos[binind]); } else { ret = arena_bin_malloc_hard(tsdn, arena, bin, binind); } @@ -1524,14 +1351,14 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) { if (config_fill) { if (unlikely(opt_junk_alloc)) { arena_alloc_junk_small(ret, - &arena_bin_info[binind], false); + &bin_infos[binind], false); } else if (unlikely(opt_zero)) { memset(ret, 0, usize); } } } else { if (config_fill && unlikely(opt_junk_alloc)) { - arena_alloc_junk_small(ret, &arena_bin_info[binind], + arena_alloc_junk_small(ret, &bin_infos[binind], true); } memset(ret, 0, usize); @@ -1636,13 +1463,13 @@ arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache, } static void -arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, arena_bin_t *bin) { +arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, bin_t *bin) { /* Dissociate slab from bin. 
*/ if (slab == bin->slabcur) { bin->slabcur = NULL; } else { szind_t binind = extent_szind_get(slab); - const arena_bin_info_t *bin_info = &arena_bin_info[binind]; + const bin_info_t *bin_info = &bin_infos[binind]; /* * The following block's conditional is necessary because if the @@ -1659,7 +1486,7 @@ arena_dissociate_bin_slab(arena_t *arena, extent_t *slab, arena_bin_t *bin) { static void arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, - arena_bin_t *bin) { + bin_t *bin) { assert(slab != bin->slabcur); malloc_mutex_unlock(tsdn, &bin->lock); @@ -1673,8 +1500,7 @@ arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, } static void -arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, - arena_bin_t *bin) { +arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, bin_t *bin) { assert(extent_nfree_get(slab) > 0); /* @@ -1704,8 +1530,8 @@ arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, extent_t *slab, void *ptr, bool junked) { arena_slab_data_t *slab_data = extent_slab_data_get(slab); szind_t binind = extent_szind_get(slab); - arena_bin_t *bin = &arena->bins[binind]; - const arena_bin_info_t *bin_info = &arena_bin_info[binind]; + bin_t *bin = &arena->bins[binind]; + const bin_info_t *bin_info = &bin_infos[binind]; if (!junked && config_fill && unlikely(opt_junk_free)) { arena_dalloc_junk_small(ptr, bin_info); @@ -1736,7 +1562,7 @@ arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, extent_t *extent, static void arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, extent_t *extent, void *ptr) { szind_t binind = extent_szind_get(extent); - arena_bin_t *bin = &arena->bins[binind]; + bin_t *bin = &arena->bins[binind]; malloc_mutex_lock(tsdn, &bin->lock); arena_dalloc_bin_locked_impl(tsdn, arena, extent, ptr, false); @@ -1770,7 +1596,7 @@ arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, * Avoid moving the allocation if the size class can be left the * same. */ - assert(arena_bin_info[sz_size2index(oldsize)].reg_size == + assert(bin_infos[sz_size2index(oldsize)].reg_size == oldsize); if ((usize_max > SMALL_MAXCLASS || sz_size2index(usize_max) != sz_size2index(oldsize)) && (size > oldsize || usize_max < @@ -1885,6 +1711,33 @@ arena_muzzy_decay_ms_default_set(ssize_t decay_ms) { return false; } +bool +arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, size_t *old_limit, + size_t *new_limit) { + assert(opt_retain); + + pszind_t new_ind JEMALLOC_CC_SILENCE_INIT(0); + if (new_limit != NULL) { + size_t limit = *new_limit; + /* Grow no more than the new limit. 
*/ + if ((new_ind = sz_psz2ind(limit + 1) - 1) > + EXTENT_GROW_MAX_PIND) { + return true; + } + } + + malloc_mutex_lock(tsd_tsdn(tsd), &arena->extent_grow_mtx); + if (old_limit != NULL) { + *old_limit = sz_pind2sz(arena->retain_grow_limit); + } + if (new_limit != NULL) { + arena->retain_grow_limit = new_ind; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &arena->extent_grow_mtx); + + return false; +} + unsigned arena_nthreads_get(arena_t *arena, bool internal) { return atomic_load_u(&arena->nthreads[internal], ATOMIC_RELAXED); @@ -1935,6 +1788,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } ql_new(&arena->tcache_ql); + ql_new(&arena->cache_bin_array_descriptor_ql); if (malloc_mutex_init(&arena->tcache_ql_mtx, "tcache_ql", WITNESS_RANK_TCACHE_QL, malloc_mutex_rank_exclusive)) { goto label_error; @@ -2011,6 +1865,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } arena->extent_grow_next = sz_psz2ind(HUGEPAGE); + arena->retain_grow_limit = EXTENT_GROW_MAX_PIND; if (malloc_mutex_init(&arena->extent_grow_mtx, "extent_grow", WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) { goto label_error; @@ -2024,17 +1879,10 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { /* Initialize bins. */ for (i = 0; i < NBINS; i++) { - arena_bin_t *bin = &arena->bins[i]; - if (malloc_mutex_init(&bin->lock, "arena_bin", - WITNESS_RANK_ARENA_BIN, malloc_mutex_rank_exclusive)) { + bool err = bin_init(&arena->bins[i]); + if (err) { goto label_error; } - bin->slabcur = NULL; - extent_heap_new(&bin->slabs_nonfull); - extent_list_init(&bin->slabs_full); - if (config_stats) { - memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); - } } arena->base = base; @@ -2070,6 +1918,16 @@ void arena_boot(void) { arena_dirty_decay_ms_default_set(opt_dirty_decay_ms); arena_muzzy_decay_ms_default_set(opt_muzzy_decay_ms); +#define REGIND_bin_yes(index, reg_size) \ + div_init(&arena_binind_div_info[(index)], (reg_size)); +#define REGIND_bin_no(index, reg_size) +#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \ + lg_delta_lookup) \ + REGIND_bin_##bin(index, (1U<<lg_grp) + (ndelta << lg_delta)) + SIZE_CLASSES +#undef REGIND_bin_yes +#undef REGIND_bin_no +#undef SC } void @@ -2115,7 +1973,7 @@ arena_prefork6(tsdn_t *tsdn, arena_t *arena) { void arena_prefork7(tsdn_t *tsdn, arena_t *arena) { for (unsigned i = 0; i < NBINS; i++) { - malloc_mutex_prefork(tsdn, &arena->bins[i].lock); + bin_prefork(tsdn, &arena->bins[i]); } } @@ -2124,7 +1982,7 @@ arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) { unsigned i; for (i = 0; i < NBINS; i++) { - malloc_mutex_postfork_parent(tsdn, &arena->bins[i].lock); + bin_postfork_parent(tsdn, &arena->bins[i]); } malloc_mutex_postfork_parent(tsdn, &arena->large_mtx); base_postfork_parent(tsdn, arena->base); @@ -2154,15 +2012,21 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) { } if (config_stats) { ql_new(&arena->tcache_ql); + ql_new(&arena->cache_bin_array_descriptor_ql); tcache_t *tcache = tcache_get(tsdn_tsd(tsdn)); if (tcache != NULL && tcache->arena == arena) { ql_elm_new(tcache, link); ql_tail_insert(&arena->tcache_ql, tcache, link); + cache_bin_array_descriptor_init( + &tcache->cache_bin_array_descriptor, + tcache->bins_small, tcache->bins_large); + ql_tail_insert(&arena->cache_bin_array_descriptor_ql, + &tcache->cache_bin_array_descriptor, link); } } for (i = 0; i < NBINS; i++) { - malloc_mutex_postfork_child(tsdn, &arena->bins[i].lock); + bin_postfork_child(tsdn, &arena->bins[i]); } 
malloc_mutex_postfork_child(tsdn, &arena->large_mtx); base_postfork_child(tsdn, arena->base); diff --git a/src/background_thread.c b/src/background_thread.c index eb30eb5..6baff22 100644 --- a/src/background_thread.c +++ b/src/background_thread.c @@ -30,19 +30,20 @@ bool can_enable_background_thread; static int (*pthread_create_fptr)(pthread_t *__restrict, const pthread_attr_t *, void *(*)(void *), void *__restrict); -static pthread_once_t once_control = PTHREAD_ONCE_INIT; static void -pthread_create_wrapper_once(void) { +pthread_create_wrapper_init(void) { #ifdef JEMALLOC_LAZY_LOCK - isthreaded = true; + if (!isthreaded) { + isthreaded = true; + } #endif } int pthread_create_wrapper(pthread_t *__restrict thread, const pthread_attr_t *attr, void *(*start_routine)(void *), void *__restrict arg) { - pthread_once(&once_control, pthread_create_wrapper_once); + pthread_create_wrapper_init(); return pthread_create_fptr(thread, attr, start_routine, arg); } @@ -805,7 +806,7 @@ void background_thread_ctl_init(tsdn_t *tsdn) { malloc_mutex_assert_not_owner(tsdn, &background_thread_lock); #ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER - pthread_once(&once_control, pthread_create_wrapper_once); + pthread_create_wrapper_init(); #endif } @@ -848,9 +849,6 @@ background_thread_boot1(tsdn_t *tsdn) { malloc_mutex_rank_exclusive)) { return true; } - if (opt_background_thread) { - background_thread_ctl_init(tsdn); - } background_thread_info = (background_thread_info_t *)base_alloc(tsdn, b0get(), ncpus * sizeof(background_thread_info_t), CACHELINE); @@ -10,25 +10,39 @@ /******************************************************************************/ /* Data. */ -static base_t *b0; +static base_t *b0; + +metadata_thp_mode_t opt_metadata_thp = METADATA_THP_DEFAULT; + +const char *metadata_thp_mode_names[] = { + "disabled", + "auto", + "always" +}; /******************************************************************************/ +static inline bool +metadata_thp_madvise(void) { + return (metadata_thp_enabled() && thp_state_madvise); +} + static void * base_map(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, size_t size) { void *addr; bool zero = true; bool commit = true; + /* Use huge page sizes and alignment regardless of opt_metadata_thp. */ assert(size == HUGEPAGE_CEILING(size)); - + size_t alignment = HUGEPAGE; if (extent_hooks == &extent_hooks_default) { - addr = extent_alloc_mmap(NULL, size, PAGE, &zero, &commit); + addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit); } else { /* No arena context as we are creating new arenas. */ tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn); pre_reentrancy(tsd, NULL); - addr = extent_hooks->alloc(extent_hooks, NULL, size, PAGE, + addr = extent_hooks->alloc(extent_hooks, NULL, size, alignment, &zero, &commit, ind); post_reentrancy(tsd); } @@ -51,16 +65,16 @@ base_unmap(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, void *addr, */ if (extent_hooks == &extent_hooks_default) { if (!extent_dalloc_mmap(addr, size)) { - return; + goto label_done; } if (!pages_decommit(addr, size)) { - return; + goto label_done; } if (!pages_purge_forced(addr, size)) { - return; + goto label_done; } if (!pages_purge_lazy(addr, size)) { - return; + goto label_done; } /* Nothing worked. This should never happen. 
*/ not_reached(); @@ -70,27 +84,33 @@ base_unmap(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, void *addr, if (extent_hooks->dalloc != NULL && !extent_hooks->dalloc(extent_hooks, addr, size, true, ind)) { - goto label_done; + goto label_post_reentrancy; } if (extent_hooks->decommit != NULL && !extent_hooks->decommit(extent_hooks, addr, size, 0, size, ind)) { - goto label_done; + goto label_post_reentrancy; } if (extent_hooks->purge_forced != NULL && !extent_hooks->purge_forced(extent_hooks, addr, size, 0, size, ind)) { - goto label_done; + goto label_post_reentrancy; } if (extent_hooks->purge_lazy != NULL && !extent_hooks->purge_lazy(extent_hooks, addr, size, 0, size, ind)) { - goto label_done; + goto label_post_reentrancy; } /* Nothing worked. That's the application's problem. */ - label_done: + label_post_reentrancy: post_reentrancy(tsd); - return; + } +label_done: + if (metadata_thp_madvise()) { + /* Set NOHUGEPAGE after unmap to avoid kernel defrag. */ + assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && + (size & HUGEPAGE_MASK) == 0); + pages_nohuge(addr, size); } } @@ -105,6 +125,56 @@ base_extent_init(size_t *extent_sn_next, extent_t *extent, void *addr, extent_binit(extent, addr, size, sn); } +static size_t +base_get_num_blocks(base_t *base, bool with_new_block) { + base_block_t *b = base->blocks; + assert(b != NULL); + + size_t n_blocks = with_new_block ? 2 : 1; + while (b->next != NULL) { + n_blocks++; + b = b->next; + } + + return n_blocks; +} + +static void +base_auto_thp_switch(tsdn_t *tsdn, base_t *base) { + assert(opt_metadata_thp == metadata_thp_auto); + malloc_mutex_assert_owner(tsdn, &base->mtx); + if (base->auto_thp_switched) { + return; + } + /* Called when adding a new block. */ + bool should_switch; + if (base_ind_get(base) != 0) { + should_switch = (base_get_num_blocks(base, true) == + BASE_AUTO_THP_THRESHOLD); + } else { + should_switch = (base_get_num_blocks(base, true) == + BASE_AUTO_THP_THRESHOLD_A0); + } + if (!should_switch) { + return; + } + + base->auto_thp_switched = true; + assert(!config_stats || base->n_thp == 0); + /* Make the initial blocks THP lazily. */ + base_block_t *block = base->blocks; + while (block != NULL) { + assert((block->size & HUGEPAGE_MASK) == 0); + pages_huge(block, block->size); + if (config_stats) { + base->n_thp += HUGEPAGE_CEILING(block->size - + extent_bsize_get(&block->extent)) >> LG_HUGEPAGE; + } + block = block->next; + assert(block == NULL || (base_ind_get(base) == 0)); + } +} + static void * base_extent_bump_alloc_helper(extent_t *extent, size_t *gap_size, size_t size, size_t alignment) { @@ -140,12 +210,20 @@ base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, extent_t *extent, base->allocated += size; /* * Add one PAGE to base_resident for every page boundary that is - * crossed by the new allocation. + * crossed by the new allocation. Adjust n_thp similarly when + * metadata_thp is enabled. 
*/ base->resident += PAGE_CEILING((uintptr_t)addr + size) - PAGE_CEILING((uintptr_t)addr - gap_size); assert(base->allocated <= base->resident); assert(base->resident <= base->mapped); + if (metadata_thp_madvise() && (opt_metadata_thp == + metadata_thp_always || base->auto_thp_switched)) { + base->n_thp += (HUGEPAGE_CEILING((uintptr_t)addr + size) + - HUGEPAGE_CEILING((uintptr_t)addr - gap_size)) >> + LG_HUGEPAGE; + assert(base->mapped >= base->n_thp << LG_HUGEPAGE); + } } } @@ -166,8 +244,8 @@ base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, extent_t *extent, * On success a pointer to the initialized base_block_t header is returned. */ static base_block_t * -base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, - pszind_t *pind_last, size_t *extent_sn_next, size_t size, +base_block_alloc(tsdn_t *tsdn, base_t *base, extent_hooks_t *extent_hooks, + unsigned ind, pszind_t *pind_last, size_t *extent_sn_next, size_t size, size_t alignment) { alignment = ALIGNMENT_CEILING(alignment, QUANTUM); size_t usize = ALIGNMENT_CEILING(size, alignment); @@ -193,6 +271,25 @@ base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, if (block == NULL) { return NULL; } + + if (metadata_thp_madvise()) { + void *addr = (void *)block; + assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && + (block_size & HUGEPAGE_MASK) == 0); + if (opt_metadata_thp == metadata_thp_always) { + pages_huge(addr, block_size); + } else if (opt_metadata_thp == metadata_thp_auto && + base != NULL) { + /* base != NULL indicates this is not a new base. */ + malloc_mutex_lock(tsdn, &base->mtx); + base_auto_thp_switch(tsdn, base); + if (base->auto_thp_switched) { + pages_huge(addr, block_size); + } + malloc_mutex_unlock(tsdn, &base->mtx); + } + } + *pind_last = sz_psz2ind(block_size); block->size = block_size; block->next = NULL; @@ -216,7 +313,7 @@ base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) { * called. 
*/ malloc_mutex_unlock(tsdn, &base->mtx); - base_block_t *block = base_block_alloc(tsdn, extent_hooks, + base_block_t *block = base_block_alloc(tsdn, base, extent_hooks, base_ind_get(base), &base->pind_last, &base->extent_sn_next, size, alignment); malloc_mutex_lock(tsdn, &base->mtx); @@ -229,8 +326,16 @@ base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) { base->allocated += sizeof(base_block_t); base->resident += PAGE_CEILING(sizeof(base_block_t)); base->mapped += block->size; + if (metadata_thp_madvise() && + !(opt_metadata_thp == metadata_thp_auto + && !base->auto_thp_switched)) { + assert(base->n_thp > 0); + base->n_thp += HUGEPAGE_CEILING(sizeof(base_block_t)) >> + LG_HUGEPAGE; + } assert(base->allocated <= base->resident); assert(base->resident <= base->mapped); + assert(base->n_thp << LG_HUGEPAGE <= base->mapped); } return &block->extent; } @@ -244,7 +349,7 @@ base_t * base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { pszind_t pind_last = 0; size_t extent_sn_next = 0; - base_block_t *block = base_block_alloc(tsdn, extent_hooks, ind, + base_block_t *block = base_block_alloc(tsdn, NULL, extent_hooks, ind, &pind_last, &extent_sn_next, sizeof(base_t), QUANTUM); if (block == NULL) { return NULL; @@ -265,6 +370,7 @@ base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { base->pind_last = pind_last; base->extent_sn_next = extent_sn_next; base->blocks = block; + base->auto_thp_switched = false; for (szind_t i = 0; i < NSIZES; i++) { extent_heap_new(&base->avail[i]); } @@ -272,8 +378,12 @@ base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { base->allocated = sizeof(base_block_t); base->resident = PAGE_CEILING(sizeof(base_block_t)); base->mapped = block->size; + base->n_thp = (opt_metadata_thp == metadata_thp_always) && + metadata_thp_madvise() ? 
HUGEPAGE_CEILING(sizeof(base_block_t)) + >> LG_HUGEPAGE : 0; assert(base->allocated <= base->resident); assert(base->resident <= base->mapped); + assert(base->n_thp << LG_HUGEPAGE <= base->mapped); } base_extent_bump_alloc_post(tsdn, base, &block->extent, gap_size, base, base_size); @@ -368,7 +478,7 @@ base_alloc_extent(tsdn_t *tsdn, base_t *base) { void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident, - size_t *mapped) { + size_t *mapped, size_t *n_thp) { cassert(config_stats); malloc_mutex_lock(tsdn, &base->mtx); @@ -377,6 +487,7 @@ base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident, *allocated = base->allocated; *resident = base->resident; *mapped = base->mapped; + *n_thp = base->n_thp; malloc_mutex_unlock(tsdn, &base->mtx); } diff --git a/src/bin.c b/src/bin.c new file mode 100644 index 0000000..0886bc4 --- /dev/null +++ b/src/bin.c @@ -0,0 +1,50 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/bin.h" +#include "jemalloc/internal/witness.h" + +const bin_info_t bin_infos[NBINS] = { +#define BIN_INFO_bin_yes(reg_size, slab_size, nregs) \ + {reg_size, slab_size, nregs, BITMAP_INFO_INITIALIZER(nregs)}, +#define BIN_INFO_bin_no(reg_size, slab_size, nregs) +#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \ + lg_delta_lookup) \ + BIN_INFO_bin_##bin((1U<<lg_grp) + (ndelta<<lg_delta), \ + (pgs << LG_PAGE), (pgs << LG_PAGE) / ((1U<<lg_grp) + \ + (ndelta<<lg_delta))) + SIZE_CLASSES +#undef BIN_INFO_bin_yes +#undef BIN_INFO_bin_no +#undef SC +}; + +bool +bin_init(bin_t *bin) { + if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN, + malloc_mutex_rank_exclusive)) { + return true; + } + bin->slabcur = NULL; + extent_heap_new(&bin->slabs_nonfull); + extent_list_init(&bin->slabs_full); + if (config_stats) { + memset(&bin->stats, 0, sizeof(bin_stats_t)); + } + return false; +} + +void +bin_prefork(tsdn_t *tsdn, bin_t *bin) { + malloc_mutex_prefork(tsdn, &bin->lock); +} + +void +bin_postfork_parent(tsdn_t *tsdn, bin_t *bin) { + malloc_mutex_postfork_parent(tsdn, &bin->lock); +} + +void +bin_postfork_child(tsdn_t *tsdn, bin_t *bin) { + malloc_mutex_postfork_child(tsdn, &bin->lock); +} @@ -80,6 +80,7 @@ CTL_PROTO(config_utrace) CTL_PROTO(config_xmalloc) CTL_PROTO(opt_abort) CTL_PROTO(opt_abort_conf) +CTL_PROTO(opt_metadata_thp) CTL_PROTO(opt_retain) CTL_PROTO(opt_dss) CTL_PROTO(opt_narenas) @@ -94,6 +95,7 @@ CTL_PROTO(opt_zero) CTL_PROTO(opt_utrace) CTL_PROTO(opt_xmalloc) CTL_PROTO(opt_tcache) +CTL_PROTO(opt_lg_extent_max_active_fit) CTL_PROTO(opt_lg_tcache_max) CTL_PROTO(opt_prof) CTL_PROTO(opt_prof_prefix) @@ -117,6 +119,7 @@ CTL_PROTO(arena_i_dss) CTL_PROTO(arena_i_dirty_decay_ms) CTL_PROTO(arena_i_muzzy_decay_ms) CTL_PROTO(arena_i_extent_hooks) +CTL_PROTO(arena_i_retain_grow_limit) INDEX_PROTO(arena_i) CTL_PROTO(arenas_bin_i_size) CTL_PROTO(arenas_bin_i_nregs) @@ -182,6 +185,7 @@ CTL_PROTO(stats_arenas_i_muzzy_nmadvise) CTL_PROTO(stats_arenas_i_muzzy_purged) CTL_PROTO(stats_arenas_i_base) CTL_PROTO(stats_arenas_i_internal) +CTL_PROTO(stats_arenas_i_metadata_thp) CTL_PROTO(stats_arenas_i_tcache_bytes) CTL_PROTO(stats_arenas_i_resident) INDEX_PROTO(stats_arenas_i) @@ -191,6 +195,7 @@ CTL_PROTO(stats_background_thread_num_threads) CTL_PROTO(stats_background_thread_num_runs) CTL_PROTO(stats_background_thread_run_interval) CTL_PROTO(stats_metadata) +CTL_PROTO(stats_metadata_thp) CTL_PROTO(stats_resident) CTL_PROTO(stats_mapped) 
CTL_PROTO(stats_retained) @@ -274,6 +279,7 @@ static const ctl_named_node_t config_node[] = { static const ctl_named_node_t opt_node[] = { {NAME("abort"), CTL(opt_abort)}, {NAME("abort_conf"), CTL(opt_abort_conf)}, + {NAME("metadata_thp"), CTL(opt_metadata_thp)}, {NAME("retain"), CTL(opt_retain)}, {NAME("dss"), CTL(opt_dss)}, {NAME("narenas"), CTL(opt_narenas)}, @@ -288,6 +294,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("utrace"), CTL(opt_utrace)}, {NAME("xmalloc"), CTL(opt_xmalloc)}, {NAME("tcache"), CTL(opt_tcache)}, + {NAME("lg_extent_max_active_fit"), CTL(opt_lg_extent_max_active_fit)}, {NAME("lg_tcache_max"), CTL(opt_lg_tcache_max)}, {NAME("prof"), CTL(opt_prof)}, {NAME("prof_prefix"), CTL(opt_prof_prefix)}, @@ -316,7 +323,8 @@ static const ctl_named_node_t arena_i_node[] = { {NAME("dss"), CTL(arena_i_dss)}, {NAME("dirty_decay_ms"), CTL(arena_i_dirty_decay_ms)}, {NAME("muzzy_decay_ms"), CTL(arena_i_muzzy_decay_ms)}, - {NAME("extent_hooks"), CTL(arena_i_extent_hooks)} + {NAME("extent_hooks"), CTL(arena_i_extent_hooks)}, + {NAME("retain_grow_limit"), CTL(arena_i_retain_grow_limit)} }; static const ctl_named_node_t super_arena_i_node[] = { {NAME(""), CHILD(named, arena_i)} @@ -474,6 +482,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = { {NAME("muzzy_purged"), CTL(stats_arenas_i_muzzy_purged)}, {NAME("base"), CTL(stats_arenas_i_base)}, {NAME("internal"), CTL(stats_arenas_i_internal)}, + {NAME("metadata_thp"), CTL(stats_arenas_i_metadata_thp)}, {NAME("tcache_bytes"), CTL(stats_arenas_i_tcache_bytes)}, {NAME("resident"), CTL(stats_arenas_i_resident)}, {NAME("small"), CHILD(named, stats_arenas_i_small)}, @@ -512,6 +521,7 @@ static const ctl_named_node_t stats_node[] = { {NAME("allocated"), CTL(stats_allocated)}, {NAME("active"), CTL(stats_active)}, {NAME("metadata"), CTL(stats_metadata)}, + {NAME("metadata_thp"), CTL(stats_metadata_thp)}, {NAME("resident"), CTL(stats_resident)}, {NAME("mapped"), CTL(stats_mapped)}, {NAME("retained"), CTL(stats_retained)}, @@ -550,7 +560,7 @@ static const ctl_named_node_t super_root_node[] = { * synchronized by the ctl mutex. */ static void -accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) { +ctl_accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) { #ifdef JEMALLOC_ATOMIC_U64 uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED); uint64_t cur_src = atomic_load_u64(src, ATOMIC_RELAXED); @@ -562,7 +572,7 @@ accum_arena_stats_u64(arena_stats_u64_t *dst, arena_stats_u64_t *src) { /* Likewise: with ctl mutex synchronization, reading is simple. 
*/ static uint64_t -arena_stats_read_u64(arena_stats_u64_t *p) { +ctl_arena_stats_read_u64(arena_stats_u64_t *p) { #ifdef JEMALLOC_ATOMIC_U64 return atomic_load_u64(p, ATOMIC_RELAXED); #else @@ -570,7 +580,8 @@ arena_stats_read_u64(arena_stats_u64_t *p) { #endif } -static void accum_atomic_zu(atomic_zu_t *dst, atomic_zu_t *src) { +static void +accum_atomic_zu(atomic_zu_t *dst, atomic_zu_t *src) { size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED); size_t cur_src = atomic_load_zu(src, ATOMIC_RELAXED); atomic_store_zu(dst, cur_dst + cur_src, ATOMIC_RELAXED); @@ -680,9 +691,9 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) { ctl_arena->astats->ndalloc_small = 0; ctl_arena->astats->nrequests_small = 0; memset(ctl_arena->astats->bstats, 0, NBINS * - sizeof(malloc_bin_stats_t)); + sizeof(bin_stats_t)); memset(ctl_arena->astats->lstats, 0, (NSIZES - NBINS) * - sizeof(malloc_large_stats_t)); + sizeof(arena_stats_large_t)); } } @@ -745,18 +756,18 @@ ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena, &astats->astats.retained); } - accum_arena_stats_u64(&sdstats->astats.decay_dirty.npurge, + ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.npurge, &astats->astats.decay_dirty.npurge); - accum_arena_stats_u64(&sdstats->astats.decay_dirty.nmadvise, + ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.nmadvise, &astats->astats.decay_dirty.nmadvise); - accum_arena_stats_u64(&sdstats->astats.decay_dirty.purged, + ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.purged, &astats->astats.decay_dirty.purged); - accum_arena_stats_u64(&sdstats->astats.decay_muzzy.npurge, + ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.npurge, &astats->astats.decay_muzzy.npurge); - accum_arena_stats_u64(&sdstats->astats.decay_muzzy.nmadvise, + ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.nmadvise, &astats->astats.decay_muzzy.nmadvise); - accum_arena_stats_u64(&sdstats->astats.decay_muzzy.purged, + ctl_accum_arena_stats_u64(&sdstats->astats.decay_muzzy.purged, &astats->astats.decay_muzzy.purged); #define OP(mtx) malloc_mutex_prof_merge( \ @@ -773,6 +784,8 @@ MUTEX_PROF_ARENA_MUTEXES &astats->astats.internal); accum_atomic_zu(&sdstats->astats.resident, &astats->astats.resident); + accum_atomic_zu(&sdstats->astats.metadata_thp, + &astats->astats.metadata_thp); } else { assert(atomic_load_zu( &astats->astats.internal, ATOMIC_RELAXED) == 0); @@ -794,11 +807,11 @@ MUTEX_PROF_ARENA_MUTEXES assert(atomic_load_zu(&astats->astats.allocated_large, ATOMIC_RELAXED) == 0); } - accum_arena_stats_u64(&sdstats->astats.nmalloc_large, + ctl_accum_arena_stats_u64(&sdstats->astats.nmalloc_large, &astats->astats.nmalloc_large); - accum_arena_stats_u64(&sdstats->astats.ndalloc_large, + ctl_accum_arena_stats_u64(&sdstats->astats.ndalloc_large, &astats->astats.ndalloc_large); - accum_arena_stats_u64(&sdstats->astats.nrequests_large, + ctl_accum_arena_stats_u64(&sdstats->astats.nrequests_large, &astats->astats.nrequests_large); accum_atomic_zu(&sdstats->astats.tcache_bytes, @@ -835,11 +848,11 @@ MUTEX_PROF_ARENA_MUTEXES } for (i = 0; i < NSIZES - NBINS; i++) { - accum_arena_stats_u64(&sdstats->lstats[i].nmalloc, + ctl_accum_arena_stats_u64(&sdstats->lstats[i].nmalloc, &astats->lstats[i].nmalloc); - accum_arena_stats_u64(&sdstats->lstats[i].ndalloc, + ctl_accum_arena_stats_u64(&sdstats->lstats[i].ndalloc, &astats->lstats[i].ndalloc); - accum_arena_stats_u64(&sdstats->lstats[i].nrequests, + ctl_accum_arena_stats_u64(&sdstats->lstats[i].nrequests, &astats->lstats[i].nrequests); if (!destroyed) { 
sdstats->lstats[i].curlextents += @@ -938,6 +951,8 @@ ctl_refresh(tsdn_t *tsdn) { &ctl_sarena->astats->astats.base, ATOMIC_RELAXED) + atomic_load_zu(&ctl_sarena->astats->astats.internal, ATOMIC_RELAXED); + ctl_stats->metadata_thp = atomic_load_zu( + &ctl_sarena->astats->astats.metadata_thp, ATOMIC_RELAXED); ctl_stats->resident = atomic_load_zu( &ctl_sarena->astats->astats.resident, ATOMIC_RELAXED); ctl_stats->mapped = atomic_load_zu( @@ -1568,6 +1583,8 @@ CTL_RO_CONFIG_GEN(config_xmalloc, bool) CTL_RO_NL_GEN(opt_abort, opt_abort, bool) CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool) +CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp], + const char *) CTL_RO_NL_GEN(opt_retain, opt_retain, bool) CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned) @@ -1583,6 +1600,8 @@ CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool) CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool) CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool) CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool) +CTL_RO_NL_GEN(opt_lg_extent_max_active_fit, opt_lg_extent_max_active_fit, + size_t) CTL_RO_NL_GEN(opt_lg_tcache_max, opt_lg_tcache_max, ssize_t) CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool) CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *) @@ -2187,6 +2206,42 @@ label_return: return ret; } +static int +arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + + if (!opt_retain) { + /* Only relevant when retain is enabled. */ + return ENOENT; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + MIB_UNSIGNED(arena_ind, 1); + if (arena_ind < narenas_total_get() && (arena = + arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) { + size_t old_limit, new_limit; + if (newp != NULL) { + WRITE(new_limit, size_t); + } + bool err = arena_retain_grow_limit_get_set(tsd, arena, + &old_limit, newp != NULL ? &new_limit : NULL); + if (!err) { + READ(old_limit, size_t); + ret = 0; + } else { + ret = EFAULT; + } + } else { + ret = EFAULT; + } +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + static const ctl_named_node_t * arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { const ctl_named_node_t *ret; @@ -2248,7 +2303,7 @@ arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen, ret = EINVAL; goto label_return; } - if (dirty ? arena_dirty_decay_ms_default_set(*(ssize_t *)newp) + if (dirty ? 
arena_dirty_decay_ms_default_set(*(ssize_t *)newp) : arena_muzzy_decay_ms_default_set(*(ssize_t *)newp)) { ret = EFAULT; goto label_return; @@ -2279,9 +2334,9 @@ CTL_RO_NL_GEN(arenas_page, PAGE, size_t) CTL_RO_NL_GEN(arenas_tcache_max, tcache_maxclass, size_t) CTL_RO_NL_GEN(arenas_nbins, NBINS, unsigned) CTL_RO_NL_GEN(arenas_nhbins, nhbins, unsigned) -CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t) -CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t) -CTL_RO_NL_GEN(arenas_bin_i_slab_size, arena_bin_info[mib[2]].slab_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t) +CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t) static const ctl_named_node_t * arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { if (i > NBINS) { @@ -2460,6 +2515,7 @@ CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t) CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats->allocated, size_t) CTL_RO_CGEN(config_stats, stats_active, ctl_stats->active, size_t) CTL_RO_CGEN(config_stats, stats_metadata, ctl_stats->metadata, size_t) +CTL_RO_CGEN(config_stats, stats_metadata_thp, ctl_stats->metadata_thp, size_t) CTL_RO_CGEN(config_stats, stats_resident, ctl_stats->resident, size_t) CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats->mapped, size_t) CTL_RO_CGEN(config_stats, stats_retained, ctl_stats->retained, size_t) @@ -2490,24 +2546,24 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_retained, size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_npurge, - arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_dirty.npurge), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->astats.decay_dirty.npurge), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_nmadvise, - arena_stats_read_u64( + ctl_arena_stats_read_u64( &arenas_i(mib[2])->astats->astats.decay_dirty.nmadvise), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_purged, - arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_dirty.purged), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->astats.decay_dirty.purged), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_npurge, - arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_muzzy.npurge), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->astats.decay_muzzy.npurge), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_nmadvise, - arena_stats_read_u64( + ctl_arena_stats_read_u64( &arenas_i(mib[2])->astats->astats.decay_muzzy.nmadvise), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_purged, - arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.decay_muzzy.purged), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->astats.decay_muzzy.purged), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_base, atomic_load_zu(&arenas_i(mib[2])->astats->astats.base, ATOMIC_RELAXED), @@ -2515,6 +2571,9 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_base, CTL_RO_CGEN(config_stats, stats_arenas_i_internal, atomic_load_zu(&arenas_i(mib[2])->astats->astats.internal, ATOMIC_RELAXED), size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_thp, + atomic_load_zu(&arenas_i(mib[2])->astats->astats.metadata_thp, + ATOMIC_RELAXED), size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_tcache_bytes, atomic_load_zu(&arenas_i(mib[2])->astats->astats.tcache_bytes, ATOMIC_RELAXED), size_t) @@ -2534,14 +2593,17 @@ 
CTL_RO_CGEN(config_stats, stats_arenas_i_large_allocated, atomic_load_zu(&arenas_i(mib[2])->astats->astats.allocated_large, ATOMIC_RELAXED), size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_large_nmalloc, - arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmalloc_large), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->astats.nmalloc_large), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_large_ndalloc, - arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.ndalloc_large), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->astats.ndalloc_large), uint64_t) +/* + * Note: "nmalloc" here instead of "nrequests" in the read. This is intentional. + */ CTL_RO_CGEN(config_stats, stats_arenas_i_large_nrequests, - arena_stats_read_u64(&arenas_i(mib[2])->astats->astats.nmalloc_large), - uint64_t) /* Intentional. */ + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->astats.nmalloc_large), uint64_t) /* Intentional. */ /* Lock profiling related APIs below. */ #define RO_MUTEX_CTL_GEN(n, l) \ @@ -2622,7 +2684,7 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, MUTEX_PROF_RESET(arena->base->mtx); for (szind_t i = 0; i < NBINS; i++) { - arena_bin_t *bin = &arena->bins[i]; + bin_t *bin = &arena->bins[i]; MUTEX_PROF_RESET(bin->lock); } } @@ -2659,14 +2721,14 @@ stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, } CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nmalloc, - arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_ndalloc, - arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nrequests, - arena_stats_read_u64(&arenas_i(mib[2])->astats->lstats[mib[4]].nrequests), - uint64_t) + ctl_arena_stats_read_u64( + &arenas_i(mib[2])->astats->lstats[mib[4]].nrequests), uint64_t) CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents, arenas_i(mib[2])->astats->lstats[mib[4]].curlextents, size_t) diff --git a/src/div.c b/src/div.c new file mode 100644 index 0000000..808892a --- /dev/null +++ b/src/div.c @@ -0,0 +1,55 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/div.h" + +#include "jemalloc/internal/assert.h" + +/* + * Suppose we have n = q * d, all integers. We know n and d, and want q = n / d. + * + * For any k, we have (here, all division is exact; not C-style rounding): + * floor(ceil(2^k / d) * n / 2^k) = floor((2^k + r) / d * n / 2^k), where + * r = (-2^k) mod d. + * + * Expanding this out: + * ... = floor(2^k / d * n / 2^k + r / d * n / 2^k) + * = floor(n / d + (r / d) * (n / 2^k)). + * + * The fractional part of n / d is 0 (because of the assumption that d divides n + * exactly), so we have: + * ... = n / d + floor((r / d) * (n / 2^k)) + * + * So that our initial expression is equal to the quantity we seek, so long as + * (r / d) * (n / 2^k) < 1. + * + * r is a remainder mod d, so r < d and r / d < 1 always. We can make + * n / 2 ^ k < 1 by setting k = 32. This gets us a value of magic that works. + */ + +void +div_init(div_info_t *div_info, size_t d) { + /* Nonsensical. */ + assert(d != 0); + /* + * This would make the value of magic too high to fit into a uint32_t + * (we would want magic = 2^32 exactly). 
This would mess with code gen + * on 32-bit machines. + */ + assert(d != 1); + + uint64_t two_to_k = ((uint64_t)1 << 32); + uint32_t magic = (uint32_t)(two_to_k / d); + + /* + * We want magic = ceil(2^k / d), but C gives us floor. We have to + * increment it unless the result was exact (i.e. unless d is a power of + * two). + */ + if (two_to_k % d != 0) { + magic++; + } + div_info->magic = magic; +#ifdef JEMALLOC_DEBUG + div_info->d = d; +#endif +} diff --git a/src/extent.c b/src/extent.c index fa45c84..517780e 100644 --- a/src/extent.c +++ b/src/extent.c @@ -17,6 +17,8 @@ rtree_t extents_rtree; /* Keyed by the address of the extent_t being protected. */ mutex_pool_t extent_mutex_pool; +size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT; + static const bitmap_info_t extents_bitmap_info = BITMAP_INFO_INITIALIZER(NPSIZES+1); @@ -117,7 +119,7 @@ static void extent_record(tsdn_t *tsdn, arena_t *arena, /******************************************************************************/ -rb_gen(UNUSED, extent_avail_, extent_tree_t, extent_t, rb_link, +ph_gen(UNUSED, extent_avail_, extent_tree_t, extent_t, ph_link, extent_esnead_comp) typedef enum { @@ -361,6 +363,43 @@ extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent, cur_extents_npages - (size >> LG_PAGE), ATOMIC_RELAXED); } +/* + * Find an extent with size [min_size, max_size) to satisfy the alignment + * requirement. For each size, try only the first extent in the heap. + */ +static extent_t * +extents_fit_alignment(extents_t *extents, size_t min_size, size_t max_size, + size_t alignment) { + pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(min_size)); + pszind_t pind_max = sz_psz2ind(extent_size_quantize_ceil(max_size)); + + for (pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, + &extents_bitmap_info, (size_t)pind); i < pind_max; i = + (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, + (size_t)i+1)) { + assert(i < NPSIZES); + assert(!extent_heap_empty(&extents->heaps[i])); + extent_t *extent = extent_heap_first(&extents->heaps[i]); + uintptr_t base = (uintptr_t)extent_base_get(extent); + size_t candidate_size = extent_size_get(extent); + assert(candidate_size >= min_size); + + uintptr_t next_align = ALIGNMENT_CEILING((uintptr_t)base, + PAGE_CEILING(alignment)); + if (base > next_align || base + candidate_size <= next_align) { + /* Overflow or not crossing the next alignment. */ + continue; + } + + size_t leadsize = next_align - base; + if (candidate_size - leadsize >= min_size) { + return extent; + } + } + + return NULL; +} + /* Do any-best-fit extent selection, i.e. select any extent that best fits. */ static extent_t * extents_best_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, @@ -369,8 +408,15 @@ extents_best_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, (size_t)pind); if (i < NPSIZES+1) { + /* + * In order to reduce fragmentation, avoid reusing and splitting + * large extents for much smaller sizes. 
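+ * That is, an active extent is reused only when its size class is roughly no
+ * more than 2^lg_extent_max_active_fit times the requested size.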
+ */ + if ((sz_pind2sz(i) >> opt_lg_extent_max_active_fit) > size) { + return NULL; + } assert(!extent_heap_empty(&extents->heaps[i])); - extent_t *extent = extent_heap_any(&extents->heaps[i]); + extent_t *extent = extent_heap_first(&extents->heaps[i]); assert(extent_size_get(extent) >= size); return extent; } @@ -415,12 +461,30 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, */ static extent_t * extents_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, - size_t size) { + size_t esize, size_t alignment) { malloc_mutex_assert_owner(tsdn, &extents->mtx); - return extents->delay_coalesce ? extents_best_fit_locked(tsdn, arena, - extents, size) : extents_first_fit_locked(tsdn, arena, extents, - size); + size_t max_size = esize + PAGE_CEILING(alignment) - PAGE; + /* Beware size_t wrap-around. */ + if (max_size < esize) { + return NULL; + } + + extent_t *extent = extents->delay_coalesce ? + extents_best_fit_locked(tsdn, arena, extents, max_size) : + extents_first_fit_locked(tsdn, arena, extents, max_size); + + if (alignment > PAGE && extent == NULL) { + /* + * max_size guarantees the alignment requirement but is rather + * pessimistic. Next we try to satisfy the aligned allocation + * with sizes in [esize, max_size). + */ + extent = extents_fit_alignment(extents, esize, max_size, + alignment); + } + + return extent; } static bool @@ -449,8 +513,10 @@ extents_alloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); - return extent_recycle(tsdn, arena, r_extent_hooks, extents, new_addr, - size, pad, alignment, slab, szind, zero, commit, false); + extent_t *extent = extent_recycle(tsdn, arena, r_extent_hooks, extents, + new_addr, size, pad, alignment, slab, szind, zero, commit, false); + assert(extent == NULL || extent_dumpable_get(extent)); + return extent; } void @@ -458,6 +524,7 @@ extents_dalloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extents_t *extents, extent_t *extent) { assert(extent_base_get(extent) != NULL); assert(extent_size_get(extent) != 0); + assert(extent_dumpable_get(extent)); witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); @@ -487,10 +554,9 @@ extents_evict(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, goto label_return; } /* Check the eviction limit. */ - size_t npages = extent_size_get(extent) >> LG_PAGE; size_t extents_npages = atomic_load_zu(&extents->npages, ATOMIC_RELAXED); - if (extents_npages - npages < npages_min) { + if (extents_npages <= npages_min) { extent = NULL; goto label_return; } @@ -723,6 +789,13 @@ extent_reregister(tsdn_t *tsdn, extent_t *extent) { assert(!err); } +/* + * Removes all pointers to the given extent from the global rtree indices for + * its interior. This is relevant for slab extents, for which we need to do + * metadata lookups at places other than the head of the extent. We deregister + * on the interior, then, when an extent moves from being an active slab to an + * inactive state. + */ static void extent_interior_deregister(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, extent_t *extent) { @@ -737,8 +810,11 @@ extent_interior_deregister(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, } } +/* + * Removes all pointers to the given extent from the global rtree. 
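+ * The gdump parameter controls whether extent_gdump_sub() is also called to
+ * update the prof gdump accounting (see extent_deregister_no_gdump_sub below).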
+ */ static void -extent_deregister(tsdn_t *tsdn, extent_t *extent) { +extent_deregister_impl(tsdn_t *tsdn, extent_t *extent, bool gdump) { rtree_ctx_t rtree_ctx_fallback; rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); rtree_leaf_elm_t *elm_a, *elm_b; @@ -755,16 +831,30 @@ extent_deregister(tsdn_t *tsdn, extent_t *extent) { extent_unlock(tsdn, extent); - if (config_prof) { + if (config_prof && gdump) { extent_gdump_sub(tsdn, extent); } } +static void +extent_deregister(tsdn_t *tsdn, extent_t *extent) { + extent_deregister_impl(tsdn, extent, true); +} + +static void +extent_deregister_no_gdump_sub(tsdn_t *tsdn, extent_t *extent) { + extent_deregister_impl(tsdn, extent, false); +} + +/* + * Tries to find and remove an extent from extents that can be used for the + * given allocation request. + */ static extent_t * extent_recycle_extract(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents, void *new_addr, size_t size, size_t pad, size_t alignment, bool slab, - bool *zero, bool *commit, bool growing_retained) { + bool growing_retained) { witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, growing_retained ? 1 : 0); assert(alignment > 0); @@ -786,11 +876,6 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena, } size_t esize = size + pad; - size_t alloc_size = esize + PAGE_CEILING(alignment) - PAGE; - /* Beware size_t wrap-around. */ - if (alloc_size < esize) { - return NULL; - } malloc_mutex_lock(tsdn, &extents->mtx); extent_hooks_assure_initialized(arena, r_extent_hooks); extent_t *extent; @@ -812,7 +897,8 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena, extent_unlock(tsdn, unlock_extent); } } else { - extent = extents_fit_locked(tsdn, arena, extents, alloc_size); + extent = extents_fit_locked(tsdn, arena, extents, esize, + alignment); } if (extent == NULL) { malloc_mutex_unlock(tsdn, &extents->mtx); @@ -822,76 +908,161 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena, extent_activate_locked(tsdn, arena, extents, extent, false); malloc_mutex_unlock(tsdn, &extents->mtx); - if (extent_zeroed_get(extent)) { - *zero = true; - } - if (extent_committed_get(extent)) { - *commit = true; - } - return extent; } -static extent_t * -extent_recycle_split(tsdn_t *tsdn, arena_t *arena, - extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents, +/* + * Given an allocation request and an extent guaranteed to be able to satisfy + * it, this splits off lead and trail extents, leaving extent pointing to an + * extent satisfying the allocation. + * This function doesn't put lead or trail into any extents_t; it's the caller's + * job to ensure that they can be reused. + */ +typedef enum { + /* + * Split successfully. lead, extent, and trail, are modified to extents + * describing the ranges before, in, and after the given allocation. + */ + extent_split_interior_ok, + /* + * The extent can't satisfy the given allocation request. None of the + * input extent_t *s are touched. + */ + extent_split_interior_cant_alloc, + /* + * In a potentially invalid state. Must leak (if *to_leak is non-NULL), + * and salvage what's still salvageable (if *to_salvage is non-NULL). + * None of lead, extent, or trail are valid. + */ + extent_split_interior_error +} extent_split_interior_result_t; + +static extent_split_interior_result_t +extent_split_interior(tsdn_t *tsdn, arena_t *arena, + extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, + /* The result of splitting, in case of success. 
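lead and/or trail are left NULL when nothing needs to be split off on that side.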
*/ + extent_t **extent, extent_t **lead, extent_t **trail, + /* The mess to clean up, in case of error. */ + extent_t **to_leak, extent_t **to_salvage, void *new_addr, size_t size, size_t pad, size_t alignment, bool slab, - szind_t szind, extent_t *extent, bool growing_retained) { + szind_t szind, bool growing_retained) { size_t esize = size + pad; - size_t leadsize = ALIGNMENT_CEILING((uintptr_t)extent_base_get(extent), - PAGE_CEILING(alignment)) - (uintptr_t)extent_base_get(extent); + size_t leadsize = ALIGNMENT_CEILING((uintptr_t)extent_base_get(*extent), + PAGE_CEILING(alignment)) - (uintptr_t)extent_base_get(*extent); assert(new_addr == NULL || leadsize == 0); - assert(extent_size_get(extent) >= leadsize + esize); - size_t trailsize = extent_size_get(extent) - leadsize - esize; + if (extent_size_get(*extent) < leadsize + esize) { + return extent_split_interior_cant_alloc; + } + size_t trailsize = extent_size_get(*extent) - leadsize - esize; + + *lead = NULL; + *trail = NULL; + *to_leak = NULL; + *to_salvage = NULL; /* Split the lead. */ if (leadsize != 0) { - extent_t *lead = extent; - extent = extent_split_impl(tsdn, arena, r_extent_hooks, - lead, leadsize, NSIZES, false, esize + trailsize, szind, + *lead = *extent; + *extent = extent_split_impl(tsdn, arena, r_extent_hooks, + *lead, leadsize, NSIZES, false, esize + trailsize, szind, slab, growing_retained); - if (extent == NULL) { - extent_deregister(tsdn, lead); - extents_leak(tsdn, arena, r_extent_hooks, extents, - lead, growing_retained); - return NULL; + if (*extent == NULL) { + *to_leak = *lead; + *lead = NULL; + return extent_split_interior_error; } - extent_deactivate(tsdn, arena, extents, lead, false); } /* Split the trail. */ if (trailsize != 0) { - extent_t *trail = extent_split_impl(tsdn, arena, - r_extent_hooks, extent, esize, szind, slab, trailsize, - NSIZES, false, growing_retained); - if (trail == NULL) { - extent_deregister(tsdn, extent); - extents_leak(tsdn, arena, r_extent_hooks, extents, - extent, growing_retained); - return NULL; + *trail = extent_split_impl(tsdn, arena, r_extent_hooks, *extent, + esize, szind, slab, trailsize, NSIZES, false, + growing_retained); + if (*trail == NULL) { + *to_leak = *extent; + *to_salvage = *lead; + *lead = NULL; + *extent = NULL; + return extent_split_interior_error; } - extent_deactivate(tsdn, arena, extents, trail, false); - } else if (leadsize == 0) { + } + + if (leadsize == 0 && trailsize == 0) { /* * Splitting causes szind to be set as a side effect, but no * splitting occurred. */ - extent_szind_set(extent, szind); + extent_szind_set(*extent, szind); if (szind != NSIZES) { rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, - (uintptr_t)extent_addr_get(extent), szind, slab); - if (slab && extent_size_get(extent) > PAGE) { + (uintptr_t)extent_addr_get(*extent), szind, slab); + if (slab && extent_size_get(*extent) > PAGE) { rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, - (uintptr_t)extent_past_get(extent) - + (uintptr_t)extent_past_get(*extent) - (uintptr_t)PAGE, szind, slab); } } } - return extent; + return extent_split_interior_ok; } +/* + * This fulfills the indicated allocation request out of the given extent (which + * the caller should have ensured was big enough). If there's any unused space + * before or after the resulting allocation, that space is given its own extent + * and put back into extents. 
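+ * On failure, any fragments produced by a partial split are salvaged or leaked
+ * as appropriate and NULL is returned.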
+ */ +static extent_t * +extent_recycle_split(tsdn_t *tsdn, arena_t *arena, + extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents, + void *new_addr, size_t size, size_t pad, size_t alignment, bool slab, + szind_t szind, extent_t *extent, bool growing_retained) { + extent_t *lead; + extent_t *trail; + extent_t *to_leak; + extent_t *to_salvage; + + extent_split_interior_result_t result = extent_split_interior( + tsdn, arena, r_extent_hooks, rtree_ctx, &extent, &lead, &trail, + &to_leak, &to_salvage, new_addr, size, pad, alignment, slab, szind, + growing_retained); + + if (result == extent_split_interior_ok) { + if (lead != NULL) { + extent_deactivate(tsdn, arena, extents, lead, false); + } + if (trail != NULL) { + extent_deactivate(tsdn, arena, extents, trail, false); + } + return extent; + } else { + /* + * We should have picked an extent that was large enough to + * fulfill our allocation request. + */ + assert(result == extent_split_interior_error); + if (to_salvage != NULL) { + extent_deregister(tsdn, to_salvage); + } + if (to_leak != NULL) { + void *leak = extent_base_get(to_leak); + extent_deregister_no_gdump_sub(tsdn, to_leak); + extents_leak(tsdn, arena, r_extent_hooks, extents, + to_leak, growing_retained); + assert(extent_lock_from_addr(tsdn, rtree_ctx, leak) + == NULL); + } + return NULL; + } + unreachable(); +} + +/* + * Tries to satisfy the given allocation request by reusing one of the extents + * in the given extents_t. + */ static extent_t * extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extents_t *extents, void *new_addr, size_t size, size_t pad, @@ -906,16 +1077,12 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, rtree_ctx_t rtree_ctx_fallback; rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); - bool committed = false; extent_t *extent = extent_recycle_extract(tsdn, arena, r_extent_hooks, - rtree_ctx, extents, new_addr, size, pad, alignment, slab, zero, - &committed, growing_retained); + rtree_ctx, extents, new_addr, size, pad, alignment, slab, + growing_retained); if (extent == NULL) { return NULL; } - if (committed) { - *commit = true; - } extent = extent_recycle_split(tsdn, arena, r_extent_hooks, rtree_ctx, extents, new_addr, size, pad, alignment, slab, szind, extent, @@ -934,6 +1101,13 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extent_zeroed_set(extent, true); } + if (extent_committed_get(extent)) { + *commit = true; + } + if (extent_zeroed_get(extent)) { + *zero = true; + } + if (pad != 0) { extent_addr_randomize(tsdn, extent, alignment); } @@ -1028,7 +1202,18 @@ extent_alloc_default(extent_hooks_t *extent_hooks, void *new_addr, size_t size, static void extent_hook_pre_reentrancy(tsdn_t *tsdn, arena_t *arena) { tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn); - pre_reentrancy(tsd, arena); + if (arena == arena_get(tsd_tsdn(tsd), 0, false)) { + /* + * The only legitimate case of customized extent hooks for a0 is + * hooks with no allocation activities. One such example is to + * place metadata on pre-allocated resources such as huge pages. + * In that case, rely on reentrancy_level checks to catch + * infinite recursions. 
+ */ + pre_reentrancy(tsd, NULL); + } else { + pre_reentrancy(tsd, arena); + } } static void @@ -1094,21 +1279,18 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena, extent_init(extent, arena, ptr, alloc_size, false, NSIZES, arena_extent_sn_next(arena), extent_state_active, zeroed, - committed); + committed, true); if (ptr == NULL) { extent_dalloc(tsdn, arena, extent); goto label_err; } + if (extent_register_no_gdump_add(tsdn, extent)) { extents_leak(tsdn, arena, r_extent_hooks, &arena->extents_retained, extent, true); goto label_err; } - size_t leadsize = ALIGNMENT_CEILING((uintptr_t)ptr, - PAGE_CEILING(alignment)) - (uintptr_t)ptr; - assert(alloc_size >= leadsize + esize); - size_t trailsize = alloc_size - leadsize - esize; if (extent_zeroed_get(extent) && extent_committed_get(extent)) { *zero = true; } @@ -1116,54 +1298,48 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena, *commit = true; } - /* Split the lead. */ - if (leadsize != 0) { - extent_t *lead = extent; - extent = extent_split_impl(tsdn, arena, r_extent_hooks, lead, - leadsize, NSIZES, false, esize + trailsize, szind, slab, - true); - if (extent == NULL) { - extent_deregister(tsdn, lead); - extents_leak(tsdn, arena, r_extent_hooks, + rtree_ctx_t rtree_ctx_fallback; + rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); + + extent_t *lead; + extent_t *trail; + extent_t *to_leak; + extent_t *to_salvage; + extent_split_interior_result_t result = extent_split_interior( + tsdn, arena, r_extent_hooks, rtree_ctx, &extent, &lead, &trail, + &to_leak, &to_salvage, NULL, size, pad, alignment, slab, szind, + true); + + if (result == extent_split_interior_ok) { + if (lead != NULL) { + extent_record(tsdn, arena, r_extent_hooks, &arena->extents_retained, lead, true); - goto label_err; } - extent_record(tsdn, arena, r_extent_hooks, - &arena->extents_retained, lead, true); - } - - /* Split the trail. */ - if (trailsize != 0) { - extent_t *trail = extent_split_impl(tsdn, arena, r_extent_hooks, - extent, esize, szind, slab, trailsize, NSIZES, false, true); - if (trail == NULL) { - extent_deregister(tsdn, extent); + if (trail != NULL) { + extent_record(tsdn, arena, r_extent_hooks, + &arena->extents_retained, trail, true); + } + } else { + /* + * We should have allocated a sufficiently large extent; the + * cant_alloc case should not occur. + */ + assert(result == extent_split_interior_error); + if (to_leak != NULL) { + extent_deregister_no_gdump_sub(tsdn, to_leak); extents_leak(tsdn, arena, r_extent_hooks, - &arena->extents_retained, extent, true); + &arena->extents_retained, to_leak, true); goto label_err; } - extent_record(tsdn, arena, r_extent_hooks, - &arena->extents_retained, trail, true); - } else if (leadsize == 0) { /* - * Splitting causes szind to be set as a side effect, but no - * splitting occurred. + * Note: we don't handle the non-NULL to_salvage case at all. + * This maintains the behavior that was present when the + * refactor pulling extent_split_interior into a helper function + * was added. I think this is actually a bug (we leak both the + * memory and the extent_t in that case), but since this code is + * getting deleted very shortly (in a subsequent commit), + * ensuring correctness down this path isn't worth the effort. 
*/ - rtree_ctx_t rtree_ctx_fallback; - rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, - &rtree_ctx_fallback); - - extent_szind_set(extent, szind); - if (szind != NSIZES) { - rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, - (uintptr_t)extent_addr_get(extent), szind, slab); - if (slab && extent_size_get(extent) > PAGE) { - rtree_szind_slab_update(tsdn, &extents_rtree, - rtree_ctx, - (uintptr_t)extent_past_get(extent) - - (uintptr_t)PAGE, szind, slab); - } - } } if (*commit && !extent_committed_get(extent)) { @@ -1177,13 +1353,14 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena, } /* - * Increment extent_grow_next if doing so wouldn't exceed the legal + * Increment extent_grow_next if doing so wouldn't exceed the allowed * range. */ - if (arena->extent_grow_next + egn_skip + 1 < NPSIZES) { + if (arena->extent_grow_next + egn_skip + 1 <= + arena->retain_grow_limit) { arena->extent_grow_next += egn_skip + 1; } else { - arena->extent_grow_next = NPSIZES - 1; + arena->extent_grow_next = arena->retain_grow_limit; } /* All opportunities for failure are past. */ malloc_mutex_unlock(tsdn, &arena->extent_grow_mtx); @@ -1271,7 +1448,8 @@ extent_alloc_wrapper_hard(tsdn_t *tsdn, arena_t *arena, return NULL; } extent_init(extent, arena, addr, esize, slab, szind, - arena_extent_sn_next(arena), extent_state_active, zero, commit); + arena_extent_sn_next(arena), extent_state_active, zero, commit, + true); if (pad != 0) { extent_addr_randomize(tsdn, extent, alignment); } @@ -1296,10 +1474,20 @@ extent_alloc_wrapper(tsdn_t *tsdn, arena_t *arena, extent_t *extent = extent_alloc_retained(tsdn, arena, r_extent_hooks, new_addr, size, pad, alignment, slab, szind, zero, commit); if (extent == NULL) { + if (opt_retain && new_addr != NULL) { + /* + * When retain is enabled and new_addr is set, we do not + * attempt extent_alloc_wrapper_hard which does mmap + * that is very unlikely to succeed (unless it happens + * to be at the end). + */ + return NULL; + } extent = extent_alloc_wrapper_hard(tsdn, arena, r_extent_hooks, new_addr, size, pad, alignment, slab, szind, zero, commit); } + assert(extent == NULL || extent_dumpable_get(extent)); return extent; } @@ -1329,13 +1517,12 @@ extent_coalesce(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, bool growing_retained) { assert(extent_can_coalesce(arena, extents, inner, outer)); - if (forward && extents->delay_coalesce) { + if (extents->delay_coalesce) { /* - * The extent that remains after coalescing must occupy the - * outer extent's position in the LRU. For forward coalescing, - * swap the inner extent into the LRU. + * Remove outer from the LRU list so that it won't be show up in + * decay through extents_evict. */ - extent_list_replace(&extents->lru, outer, inner); + extent_list_remove(&extents->lru, outer); } extent_activate_locked(tsdn, arena, extents, outer, extents->delay_coalesce); @@ -1345,9 +1532,16 @@ extent_coalesce(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, forward ? inner : outer, forward ? 
outer : inner, growing_retained); malloc_mutex_lock(tsdn, &extents->mtx); + if (!err && extents->delay_coalesce) { + if (forward) { + extent_list_prepend(&extents->lru, inner); + } else { + extent_list_prepend(&extents->lru, outer); + } + } if (err) { - if (forward && extents->delay_coalesce) { - extent_list_replace(&extents->lru, inner, outer); + if (extents->delay_coalesce) { + extent_list_prepend(&extents->lru, outer); } extent_deactivate_locked(tsdn, arena, extents, outer, extents->delay_coalesce); @@ -1422,6 +1616,10 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena, return extent; } +/* + * Does the metadata management portions of putting an unused extent into the + * given extents_t (coalesces, deregisters slab interiors, the heap operations). + */ static void extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extents_t *extents, extent_t *extent, bool growing_retained) { @@ -1447,8 +1645,22 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, if (!extents->delay_coalesce) { extent = extent_try_coalesce(tsdn, arena, r_extent_hooks, rtree_ctx, extents, extent, NULL, growing_retained); + } else if (extent_size_get(extent) >= LARGE_MINCLASS) { + /* Always coalesce large extents eagerly. */ + bool coalesced; + size_t prev_size; + do { + prev_size = extent_size_get(extent); + assert(extent_state_get(extent) == extent_state_active); + extent = extent_try_coalesce(tsdn, arena, + r_extent_hooks, rtree_ctx, extents, extent, + &coalesced, growing_retained); + if (coalesced) { + extent_list_remove(&extents->lru, extent); + } + } while (coalesced && + extent_size_get(extent) >= prev_size + LARGE_MINCLASS); } - extent_deactivate_locked(tsdn, arena, extents, extent, false); malloc_mutex_unlock(tsdn, &extents->mtx); @@ -1520,6 +1732,7 @@ extent_dalloc_wrapper_try(tsdn_t *tsdn, arena_t *arena, void extent_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extent_t *extent) { + assert(extent_dumpable_get(extent)); witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); @@ -1780,6 +1993,13 @@ extent_split_default(extent_hooks_t *extent_hooks, void *addr, size_t size, } #endif +/* + * Accepts the extent to split, and the characteristics of each side of the + * split. The 'a' parameters go with the 'lead' of the resulting pair of + * extents (the lower addressed portion of the split), and the 'b' parameters go + * with the trail (the higher addressed portion). This makes 'extent' the lead, + * and returns the trail (except in case of error). 
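+ * Both resulting extents inherit the original extent's arena, serial number,
+ * state, zeroed, committed and dumpable flags.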
+ */ static extent_t * extent_split_impl(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extent_t *extent, size_t size_a, @@ -1803,7 +2023,7 @@ extent_split_impl(tsdn_t *tsdn, arena_t *arena, extent_init(trail, arena, (void *)((uintptr_t)extent_base_get(extent) + size_a), size_b, slab_b, szind_b, extent_sn_get(extent), extent_state_get(extent), extent_zeroed_get(extent), - extent_committed_get(extent)); + extent_committed_get(extent), extent_dumpable_get(extent)); rtree_ctx_t rtree_ctx_fallback; rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); @@ -1814,7 +2034,7 @@ extent_split_impl(tsdn_t *tsdn, arena_t *arena, extent_init(&lead, arena, extent_addr_get(extent), size_a, slab_a, szind_a, extent_sn_get(extent), extent_state_get(extent), extent_zeroed_get(extent), - extent_committed_get(extent)); + extent_committed_get(extent), extent_dumpable_get(extent)); extent_rtree_leaf_elms_lookup(tsdn, rtree_ctx, &lead, false, true, &lead_elm_a, &lead_elm_b); diff --git a/src/extent_dss.c b/src/extent_dss.c index e72da95..2b1ea9c 100644 --- a/src/extent_dss.c +++ b/src/extent_dss.c @@ -156,7 +156,7 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, extent_init(gap, arena, gap_addr_page, gap_size_page, false, NSIZES, arena_extent_sn_next(arena), - extent_state_active, false, true); + extent_state_active, false, true, true); } /* * Compute the address just past the end of the desired @@ -199,7 +199,8 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, extent_init(&extent, arena, ret, size, size, false, NSIZES, - extent_state_active, false, true); + extent_state_active, false, true, + true); if (extent_purge_forced_wrapper(tsdn, arena, &extent_hooks, &extent, 0, size)) { diff --git a/src/jemalloc.c b/src/jemalloc.c index 0ee8ad4..f4fd805 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -8,6 +8,7 @@ #include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/extent_mmap.h" #include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/log.h" #include "jemalloc/internal/malloc_io.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/rtree.h" @@ -1054,6 +1055,23 @@ malloc_conf_init(void) { if (opt_abort_conf && had_conf_error) { malloc_abort_invalid_conf(); } + if (strncmp("metadata_thp", k, klen) == 0) { + int i; + bool match = false; + for (i = 0; i < metadata_thp_mode_limit; i++) { + if (strncmp(metadata_thp_mode_names[i], + v, vlen) == 0) { + opt_metadata_thp = i; + match = true; + break; + } + } + if (!match) { + malloc_conf_error("Invalid conf value", + k, klen, v, vlen); + } + continue; + } CONF_HANDLE_BOOL(opt_retain, "retain") if (strncmp("dss", k, klen) == 0) { int i; @@ -1128,6 +1146,9 @@ malloc_conf_init(void) { CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc") } CONF_HANDLE_BOOL(opt_tcache, "tcache") + CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, + "lg_extent_max_active_fit", 0, + (sizeof(size_t) << 3), yes, yes, false) CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max", -1, (sizeof(size_t) << 3) - 1) if (strncmp("percpu_arena", k, klen) == 0) { @@ -1173,6 +1194,16 @@ malloc_conf_init(void) { CONF_HANDLE_BOOL(opt_prof_final, "prof_final") CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak") } + if (config_log) { + if (CONF_MATCH("log")) { + size_t cpylen = ( + vlen <= sizeof(log_var_names) ? 
+ vlen : sizeof(log_var_names) - 1); + strncpy(log_var_names, v, cpylen); + log_var_names[cpylen] = '\0'; + continue; + } + } malloc_conf_error("Invalid conf pair", k, klen, v, vlen); #undef CONF_MATCH @@ -1189,6 +1220,7 @@ malloc_conf_init(void) { #undef CONF_HANDLE_CHAR_P } } + atomic_store_b(&log_init_done, true, ATOMIC_RELEASE); } static bool @@ -1493,6 +1525,8 @@ malloc_init_hard(void) { post_reentrancy(tsd); malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock); + witness_assert_lockless(witness_tsd_tsdn( + tsd_witness_tsdp_get_unsafe(tsd))); malloc_tsd_boot1(); /* Update TSD after tsd_boot1. */ tsd = tsd_fetch(); @@ -1500,8 +1534,11 @@ malloc_init_hard(void) { assert(have_background_thread); /* * Need to finish init & unlock first before creating background - * threads (pthread_create depends on malloc). + * threads (pthread_create depends on malloc). ctl_init (which + * sets isthreaded) needs to be called without holding any lock. */ + background_thread_ctl_init(tsd_tsdn(tsd)); + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); bool err = background_thread_create(tsd, 0); malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); @@ -1701,7 +1738,7 @@ compute_size_with_overflow(bool may_overflow, dynamic_opts_t *dopts, } /* A size_t with its high-half bits all set to 1. */ - const static size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2); + static const size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2); *size = dopts->item_size * dopts->num_items; @@ -1962,6 +1999,8 @@ je_malloc(size_t size) { static_opts_t sopts; dynamic_opts_t dopts; + LOG("core.malloc.entry", "size: %zu", size); + static_opts_init(&sopts); dynamic_opts_init(&dopts); @@ -1976,6 +2015,8 @@ je_malloc(size_t size) { imalloc(&sopts, &dopts); + LOG("core.malloc.exit", "result: %p", ret); + return ret; } @@ -1986,6 +2027,9 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) { static_opts_t sopts; dynamic_opts_t dopts; + LOG("core.posix_memalign.entry", "mem ptr: %p, alignment: %zu, " + "size: %zu", memptr, alignment, size); + static_opts_init(&sopts); dynamic_opts_init(&dopts); @@ -2002,6 +2046,10 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) { dopts.alignment = alignment; ret = imalloc(&sopts, &dopts); + + LOG("core.posix_memalign.exit", "result: %d, alloc ptr: %p", ret, + *memptr); + return ret; } @@ -2014,6 +2062,9 @@ je_aligned_alloc(size_t alignment, size_t size) { static_opts_t sopts; dynamic_opts_t dopts; + LOG("core.aligned_alloc.entry", "alignment: %zu, size: %zu\n", + alignment, size); + static_opts_init(&sopts); dynamic_opts_init(&dopts); @@ -2032,6 +2083,9 @@ je_aligned_alloc(size_t alignment, size_t size) { dopts.alignment = alignment; imalloc(&sopts, &dopts); + + LOG("core.aligned_alloc.exit", "result: %p", ret); + return ret; } @@ -2043,6 +2097,8 @@ je_calloc(size_t num, size_t size) { static_opts_t sopts; dynamic_opts_t dopts; + LOG("core.calloc.entry", "num: %zu, size: %zu\n", num, size); + static_opts_init(&sopts); dynamic_opts_init(&dopts); @@ -2059,6 +2115,8 @@ je_calloc(size_t num, size_t size) { imalloc(&sopts, &dopts); + LOG("core.calloc.exit", "result: %p", ret); + return ret; } @@ -2161,17 +2219,37 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) { assert(malloc_initialized() || IS_INITIALIZER); alloc_ctx_t alloc_ctx, *ctx; - if (config_prof && opt_prof) { + if (!config_cache_oblivious && ((uintptr_t)ptr & PAGE_MASK) != 0) { + /* + * When cache_oblivious is disabled and ptr is not page aligned, + * the 
allocation was not sampled -- usize can be used to + * determine szind directly. + */ + alloc_ctx.szind = sz_size2index(usize); + alloc_ctx.slab = true; + ctx = &alloc_ctx; + if (config_debug) { + alloc_ctx_t dbg_ctx; + rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); + rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, + rtree_ctx, (uintptr_t)ptr, true, &dbg_ctx.szind, + &dbg_ctx.slab); + assert(dbg_ctx.szind == alloc_ctx.szind); + assert(dbg_ctx.slab == alloc_ctx.slab); + } + } else if (config_prof && opt_prof) { rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); assert(alloc_ctx.szind == sz_size2index(usize)); ctx = &alloc_ctx; - prof_free(tsd, ptr, usize, ctx); } else { ctx = NULL; } + if (config_prof && opt_prof) { + prof_free(tsd, ptr, usize, ctx); + } if (config_stats) { *tsd_thread_deallocatedp_get(tsd) += usize; } @@ -2192,6 +2270,8 @@ je_realloc(void *ptr, size_t size) { size_t usize JEMALLOC_CC_SILENCE_INIT(0); size_t old_usize = 0; + LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size); + if (unlikely(size == 0)) { if (ptr != NULL) { /* realloc(ptr, 0) is equivalent to free(ptr). */ @@ -2204,6 +2284,8 @@ je_realloc(void *ptr, size_t size) { tcache = NULL; } ifree(tsd, ptr, tcache, true); + + LOG("core.realloc.exit", "result: %p", NULL); return NULL; } size = 1; @@ -2236,7 +2318,9 @@ je_realloc(void *ptr, size_t size) { tsdn = tsd_tsdn(tsd); } else { /* realloc(NULL, size) is equivalent to malloc(size). */ - return je_malloc(size); + void *ret = je_malloc(size); + LOG("core.realloc.exit", "result: %p", ret); + return ret; } if (unlikely(ret == NULL)) { @@ -2257,11 +2341,15 @@ je_realloc(void *ptr, size_t size) { } UTRACE(ptr, size, ret); check_entry_exit_locking(tsdn); + + LOG("core.realloc.exit", "result: %p", ret); return ret; } JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free(void *ptr) { + LOG("core.free.entry", "ptr: %p", ptr); + UTRACE(ptr, 0, 0); if (likely(ptr != NULL)) { /* @@ -2291,6 +2379,7 @@ je_free(void *ptr) { } check_entry_exit_locking(tsd_tsdn(tsd)); } + LOG("core.free.exit", ""); } /* @@ -2310,6 +2399,9 @@ je_memalign(size_t alignment, size_t size) { static_opts_t sopts; dynamic_opts_t dopts; + LOG("core.memalign.entry", "alignment: %zu, size: %zu\n", alignment, + size); + static_opts_init(&sopts); dynamic_opts_init(&dopts); @@ -2327,6 +2419,8 @@ je_memalign(size_t alignment, size_t size) { dopts.alignment = alignment; imalloc(&sopts, &dopts); + + LOG("core.memalign.exit", "result: %p", ret); return ret; } #endif @@ -2341,6 +2435,8 @@ je_valloc(size_t size) { static_opts_t sopts; dynamic_opts_t dopts; + LOG("core.valloc.entry", "size: %zu\n", size); + static_opts_init(&sopts); dynamic_opts_init(&dopts); @@ -2359,6 +2455,7 @@ je_valloc(size_t size) { imalloc(&sopts, &dopts); + LOG("core.valloc.exit", "result: %p\n", ret); return ret; } #endif @@ -2432,6 +2529,8 @@ je_mallocx(size_t size, int flags) { static_opts_t sopts; dynamic_opts_t dopts; + LOG("core.mallocx.entry", "size: %zu, flags: %d", size, flags); + static_opts_init(&sopts); dynamic_opts_init(&dopts); @@ -2465,6 +2564,8 @@ je_mallocx(size_t size, int flags) { } imalloc(&sopts, &dopts); + + LOG("core.mallocx.exit", "result: %p", ret); return ret; } @@ -2545,6 +2646,10 @@ je_rallocx(void *ptr, size_t size, int flags) { arena_t *arena; tcache_t *tcache; + LOG("core.rallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, + size, flags); + + assert(ptr != NULL); assert(size != 0); 
assert(malloc_initialized() || IS_INITIALIZER); @@ -2607,6 +2712,8 @@ je_rallocx(void *ptr, size_t size, int flags) { } UTRACE(ptr, size, p); check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.rallocx.exit", "result: %p", p); return p; label_oom: if (config_xmalloc && unlikely(opt_xmalloc)) { @@ -2615,6 +2722,8 @@ label_oom: } UTRACE(ptr, size, 0); check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.rallocx.exit", "result: %p", NULL); return NULL; } @@ -2701,6 +2810,9 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; + LOG("core.xallocx.entry", "ptr: %p, size: %zu, extra: %zu, " + "flags: %d", ptr, size, extra, flags); + assert(ptr != NULL); assert(size != 0); assert(SIZE_T_MAX - size >= extra); @@ -2750,6 +2862,8 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { label_not_resized: UTRACE(ptr, size, ptr); check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.xallocx.exit", "result: %zu", usize); return usize; } @@ -2759,6 +2873,8 @@ je_sallocx(const void *ptr, int flags) { size_t usize; tsdn_t *tsdn; + LOG("core.sallocx.entry", "ptr: %p, flags: %d", ptr, flags); + assert(malloc_initialized() || IS_INITIALIZER); assert(ptr != NULL); @@ -2773,11 +2889,15 @@ je_sallocx(const void *ptr, int flags) { } check_entry_exit_locking(tsdn); + + LOG("core.sallocx.exit", "result: %zu", usize); return usize; } JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags) { + LOG("core.dallocx.entry", "ptr: %p, flags: %d", ptr, flags); + assert(ptr != NULL); assert(malloc_initialized() || IS_INITIALIZER); @@ -2815,6 +2935,8 @@ je_dallocx(void *ptr, int flags) { ifree(tsd, ptr, tcache, true); } check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.dallocx.exit", ""); } JEMALLOC_ALWAYS_INLINE size_t @@ -2836,6 +2958,9 @@ je_sdallocx(void *ptr, size_t size, int flags) { assert(ptr != NULL); assert(malloc_initialized() || IS_INITIALIZER); + LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, + size, flags); + tsd_t *tsd = tsd_fetch(); bool fast = tsd_fast(tsd); size_t usize = inallocx(tsd_tsdn(tsd), size, flags); @@ -2872,6 +2997,8 @@ je_sdallocx(void *ptr, size_t size, int flags) { isfree(tsd, ptr, usize, tcache, true); } check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.sdallocx.exit", ""); } JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW @@ -2883,6 +3010,7 @@ je_nallocx(size_t size, int flags) { assert(size != 0); if (unlikely(malloc_init())) { + LOG("core.nallocx.exit", "result: %zu", ZU(0)); return 0; } @@ -2891,10 +3019,12 @@ je_nallocx(size_t size, int flags) { usize = inallocx(tsdn, size, flags); if (unlikely(usize > LARGE_MAXCLASS)) { + LOG("core.nallocx.exit", "result: %zu", ZU(0)); return 0; } check_entry_exit_locking(tsdn); + LOG("core.nallocx.exit", "result: %zu", usize); return usize; } @@ -2904,7 +3034,10 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, int ret; tsd_t *tsd; + LOG("core.mallctl.entry", "name: %s", name); + if (unlikely(malloc_init())) { + LOG("core.mallctl.exit", "result: %d", EAGAIN); return EAGAIN; } @@ -2912,6 +3045,8 @@ je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, check_entry_exit_locking(tsd_tsdn(tsd)); ret = ctl_byname(tsd, name, oldp, oldlenp, newp, newlen); check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.mallctl.exit", "result: %d", ret); return ret; } @@ -2919,7 +3054,10 @@ JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) { 
int ret; + LOG("core.mallctlnametomib.entry", "name: %s", name); + if (unlikely(malloc_init())) { + LOG("core.mallctlnametomib.exit", "result: %d", EAGAIN); return EAGAIN; } @@ -2927,6 +3065,8 @@ je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) { check_entry_exit_locking(tsd_tsdn(tsd)); ret = ctl_nametomib(tsd, name, mibp, miblenp); check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.mallctlnametomib.exit", "result: %d", ret); return ret; } @@ -2936,7 +3076,10 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, int ret; tsd_t *tsd; + LOG("core.mallctlbymib.entry", ""); + if (unlikely(malloc_init())) { + LOG("core.mallctlbymib.exit", "result: %d", EAGAIN); return EAGAIN; } @@ -2944,6 +3087,7 @@ je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, check_entry_exit_locking(tsd_tsdn(tsd)); ret = ctl_bymib(tsd, mib, miblen, oldp, oldlenp, newp, newlen); check_entry_exit_locking(tsd_tsdn(tsd)); + LOG("core.mallctlbymib.exit", "result: %d", ret); return ret; } @@ -2952,10 +3096,13 @@ je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, const char *opts) { tsdn_t *tsdn; + LOG("core.malloc_stats_print.entry", ""); + tsdn = tsdn_fetch(); check_entry_exit_locking(tsdn); stats_print(write_cb, cbopaque, opts); check_entry_exit_locking(tsdn); + LOG("core.malloc_stats_print.exit", ""); } JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW @@ -2963,6 +3110,8 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) { size_t ret; tsdn_t *tsdn; + LOG("core.malloc_usable_size.entry", "ptr: %p", ptr); + assert(malloc_initialized() || IS_INITIALIZER); tsdn = tsdn_fetch(); @@ -2980,6 +3129,7 @@ je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) { } check_entry_exit_locking(tsdn); + LOG("core.malloc_usable_size.exit", "result: %zu", ret); return ret; } diff --git a/src/jemalloc_cpp.cpp b/src/jemalloc_cpp.cpp index 844ab39..f0cedda 100644 --- a/src/jemalloc_cpp.cpp +++ b/src/jemalloc_cpp.cpp @@ -39,12 +39,10 @@ void operator delete(void *ptr, std::size_t size) noexcept; void operator delete[](void *ptr, std::size_t size) noexcept; #endif -template <bool IsNoExcept> -void * -newImpl(std::size_t size) noexcept(IsNoExcept) { - void *ptr = je_malloc(size); - if (likely(ptr != nullptr)) - return ptr; +JEMALLOC_NOINLINE +static void * +handleOOM(std::size_t size, bool nothrow) { + void *ptr = nullptr; while (ptr == nullptr) { std::new_handler handler; @@ -68,11 +66,22 @@ newImpl(std::size_t size) noexcept(IsNoExcept) { ptr = je_malloc(size); } - if (ptr == nullptr && !IsNoExcept) + if (ptr == nullptr && !nothrow) std::__throw_bad_alloc(); return ptr; } +template <bool IsNoExcept> +JEMALLOC_ALWAYS_INLINE +void * +newImpl(std::size_t size) noexcept(IsNoExcept) { + void *ptr = je_malloc(size); + if (likely(ptr != nullptr)) + return ptr; + + return handleOOM(size, IsNoExcept); +} + void * operator new(std::size_t size) { return newImpl<false>(size); diff --git a/src/log.c b/src/log.c new file mode 100644 index 0000000..778902f --- /dev/null +++ b/src/log.c @@ -0,0 +1,78 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/log.h" + +char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE]; +atomic_b_t log_init_done = ATOMIC_INIT(false); + +/* + * Returns true if we were able to pick out a segment. Fills in r_segment_end + * with a pointer to the first character after the end of the string. 
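+ * (As implemented below, the segment end is returned directly rather than
+ * written through an r_segment_end out parameter.)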
+ */ +static const char * +log_var_extract_segment(const char* segment_begin) { + const char *end; + for (end = segment_begin; *end != '\0' && *end != '|'; end++) { + } + return end; +} + +static bool +log_var_matches_segment(const char *segment_begin, const char *segment_end, + const char *log_var_begin, const char *log_var_end) { + assert(segment_begin <= segment_end); + assert(log_var_begin < log_var_end); + + ptrdiff_t segment_len = segment_end - segment_begin; + ptrdiff_t log_var_len = log_var_end - log_var_begin; + /* The special '.' segment matches everything. */ + if (segment_len == 1 && *segment_begin == '.') { + return true; + } + if (segment_len == log_var_len) { + return strncmp(segment_begin, log_var_begin, segment_len) == 0; + } else if (segment_len < log_var_len) { + return strncmp(segment_begin, log_var_begin, segment_len) == 0 + && log_var_begin[segment_len] == '.'; + } else { + return false; + } +} + +unsigned +log_var_update_state(log_var_t *log_var) { + const char *log_var_begin = log_var->name; + const char *log_var_end = log_var->name + strlen(log_var->name); + + /* Pointer to one before the beginning of the current segment. */ + const char *segment_begin = log_var_names; + + /* + * If log_init done is false, we haven't parsed the malloc conf yet. To + * avoid log-spew, we default to not displaying anything. + */ + if (!atomic_load_b(&log_init_done, ATOMIC_ACQUIRE)) { + return LOG_INITIALIZED_NOT_ENABLED; + } + + while (true) { + const char *segment_end = log_var_extract_segment( + segment_begin); + assert(segment_end < log_var_names + JEMALLOC_LOG_VAR_BUFSIZE); + if (log_var_matches_segment(segment_begin, segment_end, + log_var_begin, log_var_end)) { + atomic_store_u(&log_var->state, LOG_ENABLED, + ATOMIC_RELAXED); + return LOG_ENABLED; + } + if (*segment_end == '\0') { + /* Hit the end of the segment string with no match. */ + atomic_store_u(&log_var->state, + LOG_INITIALIZED_NOT_ENABLED, ATOMIC_RELAXED); + return LOG_INITIALIZED_NOT_ENABLED; + } + /* Otherwise, skip the delimiter and continue. 
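For example, with "log=core.free|core.malloc", the variable "core.malloc.exit" fails to match the first segment and is then matched against the second.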
*/ + segment_begin = segment_end + 1; + } +} diff --git a/src/malloc_io.c b/src/malloc_io.c index 6b99afc..fd27bd1 100644 --- a/src/malloc_io.c +++ b/src/malloc_io.c @@ -111,7 +111,7 @@ buferror(int err, char *buf, size_t buflen) { FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0, (LPSTR)buf, (DWORD)buflen, NULL); return 0; -#elif defined(__GLIBC__) && defined(_GNU_SOURCE) +#elif defined(JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE) && defined(_GNU_SOURCE) char *b = strerror_r(err, buf, buflen); if (b != buf) { strncpy(buf, b, buflen); diff --git a/src/mutex.c b/src/mutex.c index a528ef0..3de7f44 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -4,6 +4,7 @@ #include "jemalloc/internal/assert.h" #include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/spin.h" #ifndef _CRT_SPINCOUNT #define _CRT_SPINCOUNT 4000 @@ -53,7 +54,7 @@ malloc_mutex_lock_slow(malloc_mutex_t *mutex) { int cnt = 0, max_cnt = MALLOC_MUTEX_MAX_SPIN; do { - CPU_SPINWAIT; + spin_cpu_spinwait(); if (!malloc_mutex_trylock_final(mutex)) { data->n_spin_acquired++; return; diff --git a/src/pages.c b/src/pages.c index fec64dd..c839471 100644 --- a/src/pages.c +++ b/src/pages.c @@ -10,6 +10,9 @@ #ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT #include <sys/sysctl.h> +#ifdef __FreeBSD__ +#include <vm/vm_param.h> +#endif #endif /******************************************************************************/ @@ -25,6 +28,11 @@ static int mmap_flags; #endif static bool os_overcommits; +bool thp_state_madvise; + +/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */ +static bool pages_can_purge_lazy_runtime = true; + /******************************************************************************/ /* * Function prototypes for static functions that are referenced prior to @@ -252,12 +260,25 @@ pages_purge_lazy(void *addr, size_t size) { if (!pages_can_purge_lazy) { return true; } + if (!pages_can_purge_lazy_runtime) { + /* + * Built with lazy purge enabled, but detected it was not + * supported on the current system. 
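+ * (E.g. built against headers providing MADV_FREE, but running on a kernel
+ * that rejects it; see the runtime probe in pages_boot.)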
+ */ + return true; + } #ifdef _WIN32 VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE); return false; #elif defined(JEMALLOC_PURGE_MADVISE_FREE) - return (madvise(addr, size, MADV_FREE) != 0); + return (madvise(addr, size, +# ifdef MADV_FREE + MADV_FREE +# else + JEMALLOC_MADV_FREE +# endif + ) != 0); #elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \ !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS) return (madvise(addr, size, MADV_DONTNEED) != 0); @@ -291,7 +312,7 @@ pages_huge(void *addr, size_t size) { assert(HUGEPAGE_ADDR2BASE(addr) == addr); assert(HUGEPAGE_CEILING(size) == size); -#ifdef JEMALLOC_THP +#ifdef JEMALLOC_HAVE_MADVISE_HUGE return (madvise(addr, size, MADV_HUGEPAGE) != 0); #else return true; @@ -303,19 +324,44 @@ pages_nohuge(void *addr, size_t size) { assert(HUGEPAGE_ADDR2BASE(addr) == addr); assert(HUGEPAGE_CEILING(size) == size); -#ifdef JEMALLOC_THP +#ifdef JEMALLOC_HAVE_MADVISE_HUGE return (madvise(addr, size, MADV_NOHUGEPAGE) != 0); #else return false; #endif } +bool +pages_dontdump(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); +#ifdef JEMALLOC_MADVISE_DONTDUMP + return madvise(addr, size, MADV_DONTDUMP) != 0; +#else + return false; +#endif +} + +bool +pages_dodump(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); +#ifdef JEMALLOC_MADVISE_DONTDUMP + return madvise(addr, size, MADV_DODUMP) != 0; +#else + return false; +#endif +} + + static size_t os_page_detect(void) { #ifdef _WIN32 SYSTEM_INFO si; GetSystemInfo(&si); return si.dwPageSize; +#elif defined(__FreeBSD__) + return getpagesize(); #else long result = sysconf(_SC_PAGESIZE); if (result == -1) { @@ -332,9 +378,19 @@ os_overcommits_sysctl(void) { size_t sz; sz = sizeof(vm_overcommit); +#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT) + int mib[2]; + + mib[0] = CTL_VM; + mib[1] = VM_OVERCOMMIT; + if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) { + return false; /* Error. */ + } +#else if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) { return false; /* Error. */ } +#endif return ((vm_overcommit & 0x3) == 0); } @@ -353,14 +409,37 @@ os_overcommits_proc(void) { ssize_t nread; #if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open) - fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY | - O_CLOEXEC); + #if defined(O_CLOEXEC) + fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY | + O_CLOEXEC); + #else + fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif #elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat) - fd = (int)syscall(SYS_openat, - AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC); + #if defined(O_CLOEXEC) + fd = (int)syscall(SYS_openat, + AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC); + #else + fd = (int)syscall(SYS_openat, + AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif #else - fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC); + #if defined(O_CLOEXEC) + fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC); + #else + fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif #endif + if (fd == -1) { return false; /* Error. 
*/ } @@ -390,6 +469,52 @@ os_overcommits_proc(void) { } #endif +static void +init_thp_state(void) { + if (!have_madvise_huge) { + if (metadata_thp_enabled() && opt_abort) { + malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n"); + abort(); + } + goto label_error; + } + + static const char madvise_state[] = "always [madvise] never\n"; + char buf[sizeof(madvise_state)]; + +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open) + int fd = (int)syscall(SYS_open, + "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); +#else + int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); +#endif + if (fd == -1) { + goto label_error; + } + +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read) + ssize_t nread = (ssize_t)syscall(SYS_read, fd, &buf, sizeof(buf)); +#else + ssize_t nread = read(fd, &buf, sizeof(buf)); +#endif + +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close) + syscall(SYS_close, fd); +#else + close(fd); +#endif + + if (nread < 1) { + goto label_error; + } + if (strncmp(buf, madvise_state, (size_t)nread) == 0) { + thp_state_madvise = true; + return; + } +label_error: + thp_state_madvise = false; +} + bool pages_boot(void) { os_page = os_page_detect(); @@ -418,5 +543,21 @@ pages_boot(void) { os_overcommits = false; #endif + init_thp_state(); + + /* Detect lazy purge runtime support. */ + if (pages_can_purge_lazy) { + bool committed = false; + void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed); + if (madv_free_page == NULL) { + return true; + } + assert(pages_can_purge_lazy_runtime); + if (pages_purge_lazy(madv_free_page, PAGE)) { + pages_can_purge_lazy_runtime = false; + } + os_pages_unmap(madv_free_page, PAGE); + } + return false; } @@ -1409,7 +1409,15 @@ prof_open_maps(const char *format, ...) { va_start(ap, format); malloc_vsnprintf(filename, sizeof(filename), format, ap); va_end(ap); + +#if defined(O_CLOEXEC) mfd = open(filename, O_RDONLY | O_CLOEXEC); +#else + mfd = open(filename, O_RDONLY); + if (mfd != -1) { + fcntl(mfd, F_SETFD, fcntl(mfd, F_GETFD) | FD_CLOEXEC); + } +#endif return mfd; } diff --git a/src/spin.c b/src/spin.c deleted file mode 100644 index 24372c2..0000000 --- a/src/spin.c +++ /dev/null @@ -1,4 +0,0 @@ -#define JEMALLOC_SPIN_C_ -#include "jemalloc/internal/jemalloc_preamble.h" - -#include "jemalloc/internal/spin.h" diff --git a/src/stats.c b/src/stats.c index 087df76..0a89b4b 100644 --- a/src/stats.c +++ b/src/stats.c @@ -85,34 +85,38 @@ gen_mutex_ctl_str(char *str, size_t buf_len, const char *prefix, static void read_arena_bin_mutex_stats(unsigned arena_ind, unsigned bin_ind, - uint64_t results[mutex_prof_num_counters]) { + uint64_t results_uint64_t[mutex_prof_num_uint64_t_counters], + uint32_t results_uint32_t[mutex_prof_num_uint32_t_counters]) { char cmd[MUTEX_CTL_STR_MAX_LENGTH]; #define OP(c, t) \ gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH, \ "arenas.0.bins.0","mutex", #c); \ CTL_M2_M4_GET(cmd, arena_ind, bin_ind, \ - (t *)&results[mutex_counter_##c], t); -MUTEX_PROF_COUNTERS + (t *)&results_##t[mutex_counter_##c], t); + MUTEX_PROF_COUNTERS #undef OP } static void mutex_stats_output_json(void (*write_cb)(void *, const char *), void *cbopaque, - const char *name, uint64_t stats[mutex_prof_num_counters], + const char *name, uint64_t stats_uint64_t[mutex_prof_num_uint64_t_counters], + uint32_t stats_uint32_t[mutex_prof_num_uint32_t_counters], const char *json_indent, bool last) { malloc_cprintf(write_cb, cbopaque, "%s\"%s\": {\n", json_indent, name); - mutex_prof_counter_ind_t k = 0; + 
mutex_prof_uint64_t_counter_ind_t k_uint64_t = 0; + mutex_prof_uint32_t_counter_ind_t k_uint32_t = 0; char *fmt_str[2] = {"%s\t\"%s\": %"FMTu32"%s\n", "%s\t\"%s\": %"FMTu64"%s\n"}; #define OP(c, t) \ malloc_cprintf(write_cb, cbopaque, \ fmt_str[sizeof(t) / sizeof(uint32_t) - 1], \ - json_indent, #c, (t)stats[mutex_counter_##c], \ - (++k == mutex_prof_num_counters) ? "" : ","); -MUTEX_PROF_COUNTERS + json_indent, #c, (t)stats_##t[mutex_counter_##c], \ + (++k_##t && k_uint32_t == mutex_prof_num_uint32_t_counters) ? "" : ","); + MUTEX_PROF_COUNTERS #undef OP - malloc_cprintf(write_cb, cbopaque, "%s}%s\n", json_indent, + +malloc_cprintf(write_cb, cbopaque, "%s}%s\n", json_indent, last ? "" : ","); } @@ -131,7 +135,8 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, "\t\t\t\t\"bins\": [\n"); } else { char *mutex_counters = " n_lock_ops n_waiting" - " n_spin_acq total_wait_ns max_wait_ns\n"; + " n_spin_acq n_owner_switch total_wait_ns" + " max_wait_ns max_n_thds\n"; malloc_cprintf(write_cb, cbopaque, "bins: size ind allocated nmalloc" " ndalloc nrequests curregs curslabs regs" @@ -191,10 +196,11 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, nmalloc, ndalloc, curregs, nrequests, nfills, nflushes, nreslabs, curslabs, mutex ? "," : ""); if (mutex) { - uint64_t mutex_stats[mutex_prof_num_counters]; - read_arena_bin_mutex_stats(i, j, mutex_stats); + uint64_t mutex_stats_64[mutex_prof_num_uint64_t_counters]; + uint32_t mutex_stats_32[mutex_prof_num_uint32_t_counters]; + read_arena_bin_mutex_stats(i, j, mutex_stats_64, mutex_stats_32); mutex_stats_output_json(write_cb, cbopaque, - "mutex", mutex_stats, "\t\t\t\t\t\t", true); + "mutex", mutex_stats_64, mutex_stats_32, "\t\t\t\t\t\t", true); } malloc_cprintf(write_cb, cbopaque, "\t\t\t\t\t}%s\n", @@ -221,9 +227,10 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, not_reached(); } } - uint64_t mutex_stats[mutex_prof_num_counters]; + uint64_t mutex_stats_64[mutex_prof_num_uint64_t_counters]; + uint32_t mutex_stats_32[mutex_prof_num_uint32_t_counters]; if (mutex) { - read_arena_bin_mutex_stats(i, j, mutex_stats); + read_arena_bin_mutex_stats(i, j, mutex_stats_64, mutex_stats_32); } malloc_cprintf(write_cb, cbopaque, "%20zu %3u %12zu %12" @@ -234,16 +241,18 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, nregs, slab_size / page, util, nfills, nflushes, nslabs, nreslabs); - /* Output less info for bin mutexes to save space. 
*/ if (mutex) { malloc_cprintf(write_cb, cbopaque, " %12"FMTu64" %12"FMTu64" %12"FMTu64 - " %14"FMTu64" %12"FMTu64"\n", - mutex_stats[mutex_counter_num_ops], - mutex_stats[mutex_counter_num_wait], - mutex_stats[mutex_counter_num_spin_acq], - mutex_stats[mutex_counter_total_wait_time], - mutex_stats[mutex_counter_max_wait_time]); + " %14"FMTu64" %14"FMTu64" %12"FMTu64 + " %10"FMTu32"\n", + mutex_stats_64[mutex_counter_num_ops], + mutex_stats_64[mutex_counter_num_wait], + mutex_stats_64[mutex_counter_num_spin_acq], + mutex_stats_64[mutex_counter_num_owner_switch], + mutex_stats_64[mutex_counter_total_wait_time], + mutex_stats_64[mutex_counter_max_wait_time], + mutex_stats_32[mutex_counter_max_num_thds]); } else { malloc_cprintf(write_cb, cbopaque, "\n"); } @@ -326,7 +335,8 @@ stats_arena_lextents_print(void (*write_cb)(void *, const char *), static void read_arena_mutex_stats(unsigned arena_ind, - uint64_t results[mutex_prof_num_arena_mutexes][mutex_prof_num_counters]) { + uint64_t results_uint64_t[mutex_prof_num_arena_mutexes][mutex_prof_num_uint64_t_counters], + uint32_t results_uint32_t[mutex_prof_num_arena_mutexes][mutex_prof_num_uint32_t_counters]) { char cmd[MUTEX_CTL_STR_MAX_LENGTH]; mutex_prof_arena_ind_t i; @@ -335,7 +345,7 @@ read_arena_mutex_stats(unsigned arena_ind, gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH, \ "arenas.0.mutexes", arena_mutex_names[i], #c); \ CTL_M2_GET(cmd, arena_ind, \ - (t *)&results[i][mutex_counter_##c], t); + (t *)&results_##t[i][mutex_counter_##c], t); MUTEX_PROF_COUNTERS #undef OP } @@ -343,7 +353,8 @@ MUTEX_PROF_COUNTERS static void mutex_stats_output(void (*write_cb)(void *, const char *), void *cbopaque, - const char *name, uint64_t stats[mutex_prof_num_counters], + const char *name, uint64_t stats_uint64_t[mutex_prof_num_uint64_t_counters], + uint32_t stats_uint32_t[mutex_prof_num_uint32_t_counters], bool first_mutex) { if (first_mutex) { /* Print title. */ @@ -361,7 +372,7 @@ mutex_stats_output(void (*write_cb)(void *, const char *), void *cbopaque, #define OP(c, t) \ malloc_cprintf(write_cb, cbopaque, \ fmt_str[sizeof(t) / sizeof(uint32_t) - 1], \ - (t)stats[mutex_counter_##c]); + (t)stats_##t[mutex_counter_##c]); MUTEX_PROF_COUNTERS #undef OP malloc_cprintf(write_cb, cbopaque, "\n"); @@ -370,8 +381,9 @@ MUTEX_PROF_COUNTERS static void stats_arena_mutexes_print(void (*write_cb)(void *, const char *), void *cbopaque, bool json, bool json_end, unsigned arena_ind) { - uint64_t mutex_stats[mutex_prof_num_arena_mutexes][mutex_prof_num_counters]; - read_arena_mutex_stats(arena_ind, mutex_stats); + uint64_t mutex_stats_64[mutex_prof_num_arena_mutexes][mutex_prof_num_uint64_t_counters]; + uint32_t mutex_stats_32[mutex_prof_num_arena_mutexes][mutex_prof_num_uint32_t_counters]; + read_arena_mutex_stats(arena_ind, mutex_stats_64, mutex_stats_32); /* Output mutex stats. 
 	if (json) {
@@ -380,7 +392,7 @@ stats_arena_mutexes_print(void (*write_cb)(void *, const char *),
 		last_mutex = mutex_prof_num_arena_mutexes - 1;
 		for (i = 0; i < mutex_prof_num_arena_mutexes; i++) {
 			mutex_stats_output_json(write_cb, cbopaque,
-			    arena_mutex_names[i], mutex_stats[i],
+			    arena_mutex_names[i], mutex_stats_64[i], mutex_stats_32[i],
 			    "\t\t\t\t\t", (i == last_mutex));
 		}
 		malloc_cprintf(write_cb, cbopaque, "\t\t\t\t}%s\n",
@@ -389,7 +401,7 @@ stats_arena_mutexes_print(void (*write_cb)(void *, const char *),
 		mutex_prof_arena_ind_t i;
 		for (i = 0; i < mutex_prof_num_arena_mutexes; i++) {
 			mutex_stats_output(write_cb, cbopaque,
-			    arena_mutex_names[i], mutex_stats[i], i == 0);
+			    arena_mutex_names[i], mutex_stats_64[i], mutex_stats_32[i], i == 0);
 		}
 	}
 }
@@ -401,7 +413,7 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	const char *dss;
 	ssize_t dirty_decay_ms, muzzy_decay_ms;
 	size_t page, pactive, pdirty, pmuzzy, mapped, retained;
-	size_t base, internal, resident;
+	size_t base, internal, resident, metadata_thp;
 	uint64_t dirty_npurge, dirty_nmadvise, dirty_purged;
 	uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged;
 	size_t small_allocated;
@@ -613,6 +625,15 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque,
 		    "internal: %12zu\n", internal);
 	}
 
+	CTL_M2_GET("stats.arenas.0.metadata_thp", i, &metadata_thp, size_t);
+	if (json) {
+		malloc_cprintf(write_cb, cbopaque,
+		    "\t\t\t\t\"metadata_thp\": %zu,\n", metadata_thp);
+	} else {
+		malloc_cprintf(write_cb, cbopaque,
+		    "metadata_thp: %12zu\n", metadata_thp);
+	}
+
 	CTL_M2_GET("stats.arenas.0.tcache_bytes", i, &tcache_bytes, size_t);
 	if (json) {
 		malloc_cprintf(write_cb, cbopaque,
@@ -806,9 +827,11 @@ stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	OPT_WRITE_CHAR_P(dss, ",")
 	OPT_WRITE_UNSIGNED(narenas, ",")
 	OPT_WRITE_CHAR_P(percpu_arena, ",")
+	OPT_WRITE_CHAR_P(metadata_thp, ",")
 	OPT_WRITE_BOOL_MUTABLE(background_thread, background_thread, ",")
 	OPT_WRITE_SSIZE_T_MUTABLE(dirty_decay_ms, arenas.dirty_decay_ms, ",")
 	OPT_WRITE_SSIZE_T_MUTABLE(muzzy_decay_ms, arenas.muzzy_decay_ms, ",")
+	OPT_WRITE_UNSIGNED(lg_extent_max_active_fit, ",")
 	OPT_WRITE_CHAR_P(junk, ",")
 	OPT_WRITE_BOOL(zero, ",")
 	OPT_WRITE_BOOL(utrace, ",")
@@ -843,7 +866,9 @@ stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque,
 
 #undef OPT_WRITE_BOOL
 #undef OPT_WRITE_BOOL_MUTABLE
+#undef OPT_WRITE_UNSIGNED
 #undef OPT_WRITE_SSIZE_T
+#undef OPT_WRITE_SSIZE_T_MUTABLE
 #undef OPT_WRITE_CHAR_P
 
 	/* arenas. */
@@ -988,7 +1013,8 @@ stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque,
 
 static void
 read_global_mutex_stats(
-    uint64_t results[mutex_prof_num_global_mutexes][mutex_prof_num_counters]) {
+    uint64_t results_uint64_t[mutex_prof_num_global_mutexes][mutex_prof_num_uint64_t_counters],
+    uint32_t results_uint32_t[mutex_prof_num_global_mutexes][mutex_prof_num_uint32_t_counters]) {
 	char cmd[MUTEX_CTL_STR_MAX_LENGTH];
 
 	mutex_prof_global_ind_t i;
@@ -996,7 +1022,7 @@ read_global_mutex_stats(
 #define OP(c, t)							\
 	gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH,		\
 	    "mutexes", global_mutex_names[i], #c);			\
-	CTL_GET(cmd, (t *)&results[i][mutex_counter_##c], t);
+	CTL_GET(cmd, (t *)&results_##t[i][mutex_counter_##c], t);
 MUTEX_PROF_COUNTERS
 #undef OP
 }
@@ -1006,20 +1032,23 @@ static void
 stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque,
     bool json, bool merged, bool destroyed, bool unmerged, bool bins,
     bool large, bool mutex) {
-	size_t allocated, active, metadata, resident, mapped, retained;
+	size_t allocated, active, metadata, metadata_thp, resident, mapped,
+	    retained;
 	size_t num_background_threads;
 	uint64_t background_thread_num_runs, background_thread_run_interval;
 
 	CTL_GET("stats.allocated", &allocated, size_t);
 	CTL_GET("stats.active", &active, size_t);
 	CTL_GET("stats.metadata", &metadata, size_t);
+	CTL_GET("stats.metadata_thp", &metadata_thp, size_t);
 	CTL_GET("stats.resident", &resident, size_t);
 	CTL_GET("stats.mapped", &mapped, size_t);
 	CTL_GET("stats.retained", &retained, size_t);
 
-	uint64_t mutex_stats[mutex_prof_num_global_mutexes][mutex_prof_num_counters];
+	uint64_t mutex_stats_uint64_t[mutex_prof_num_global_mutexes][mutex_prof_num_uint64_t_counters];
+	uint32_t mutex_stats_uint32_t[mutex_prof_num_global_mutexes][mutex_prof_num_uint32_t_counters];
 	if (mutex) {
-		read_global_mutex_stats(mutex_stats);
+		read_global_mutex_stats(mutex_stats_uint64_t, mutex_stats_uint32_t);
 	}
 
 	if (have_background_thread) {
@@ -1046,6 +1075,8 @@ stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque,
 		malloc_cprintf(write_cb, cbopaque,
 		    "\t\t\t\"metadata\": %zu,\n", metadata);
 		malloc_cprintf(write_cb, cbopaque,
+		    "\t\t\t\"metadata_thp\": %zu,\n", metadata_thp);
+		malloc_cprintf(write_cb, cbopaque,
 		    "\t\t\t\"resident\": %zu,\n", resident);
 		malloc_cprintf(write_cb, cbopaque,
 		    "\t\t\t\"mapped\": %zu,\n", mapped);
@@ -1071,7 +1102,7 @@ stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque,
 			mutex_prof_global_ind_t i;
 			for (i = 0; i < mutex_prof_num_global_mutexes; i++) {
 				mutex_stats_output_json(write_cb, cbopaque,
-				    global_mutex_names[i], mutex_stats[i],
+				    global_mutex_names[i], mutex_stats_uint64_t[i], mutex_stats_uint32_t[i],
 				    "\t\t\t\t",
 				    i == mutex_prof_num_global_mutexes - 1);
 			}
 
@@ -1081,9 +1112,10 @@ stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque,
 		    "\t\t}%s\n", (merged || unmerged || destroyed) ?
"," : ""); } else { malloc_cprintf(write_cb, cbopaque, - "Allocated: %zu, active: %zu, metadata: %zu," + "Allocated: %zu, active: %zu, metadata: %zu (n_thp %zu)," " resident: %zu, mapped: %zu, retained: %zu\n", - allocated, active, metadata, resident, mapped, retained); + allocated, active, metadata, metadata_thp, resident, mapped, + retained); if (have_background_thread && num_background_threads > 0) { malloc_cprintf(write_cb, cbopaque, @@ -1097,7 +1129,7 @@ stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque, mutex_prof_global_ind_t i; for (i = 0; i < mutex_prof_num_global_mutexes; i++) { mutex_stats_output(write_cb, cbopaque, - global_mutex_names[i], mutex_stats[i], + global_mutex_names[i], mutex_stats_uint64_t[i], mutex_stats_uint32_t[i], i == 0); } } @@ -26,7 +26,8 @@ const size_t sz_index2size_tab[NSIZES] = { JEMALLOC_ALIGNED(CACHELINE) const uint8_t sz_size2index_tab[] = { #if LG_TINY_MIN == 0 -#warning "Dangerous LG_TINY_MIN" +/* The div module doesn't support division by 1. */ +#error "Unsupported LG_TINY_MIN" #define S2B_0(i) i, #elif LG_TINY_MIN == 1 #warning "Dangerous LG_TINY_MIN" diff --git a/src/tcache.c b/src/tcache.c index 936ef31..a769a6b 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -12,7 +12,7 @@ bool opt_tcache = true; ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; -tcache_bin_info_t *tcache_bin_info; +cache_bin_info_t *tcache_bin_info; static unsigned stack_nelms; /* Total stack elms per tcache. */ unsigned nhbins; @@ -40,7 +40,7 @@ void tcache_event_hard(tsd_t *tsd, tcache_t *tcache) { szind_t binind = tcache->next_gc_bin; - tcache_bin_t *tbin; + cache_bin_t *tbin; if (binind < NBINS) { tbin = tcache_small_bin_get(tcache, binind); } else { @@ -58,7 +58,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) { * Reduce fill count by 2X. Limit lg_fill_div such that * the fill count is always at least 1. */ - tcache_bin_info_t *tbin_info = &tcache_bin_info[binind]; + cache_bin_info_t *tbin_info = &tcache_bin_info[binind]; if ((tbin_info->ncached_max >> (tcache->lg_fill_div[binind] + 1)) >= 1) { tcache->lg_fill_div[binind]++; @@ -86,7 +86,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) { void * tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, - tcache_bin_t *tbin, szind_t binind, bool *tcache_success) { + cache_bin_t *tbin, szind_t binind, bool *tcache_success) { void *ret; assert(tcache->arena != NULL); @@ -95,18 +95,18 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, if (config_prof) { tcache->prof_accumbytes = 0; } - ret = tcache_alloc_easy(tbin, tcache_success); + ret = cache_bin_alloc_easy(tbin, tcache_success); return ret; } void -tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, +tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t binind, unsigned rem) { bool merged_stats = false; assert(binind < NBINS); - assert(rem <= tbin->ncached); + assert((cache_bin_sz_t)rem <= tbin->ncached); arena_t *arena = tcache->arena; assert(arena != NULL); @@ -121,7 +121,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, /* Lock the arena bin associated with the first object. 
 		extent_t *extent = item_extent[0];
 		arena_t *bin_arena = extent_arena_get(extent);
-		arena_bin_t *bin = &bin_arena->bins[binind];
+		bin_t *bin = &bin_arena->bins[binind];
 
 		if (config_prof && bin_arena == arena) {
 			if (arena_prof_accum(tsd_tsdn(tsd), arena,
@@ -169,7 +169,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 		 * The flush loop didn't happen to flush to this thread's
 		 * arena, so the stats didn't get merged.  Manually do so now.
 		 */
-		arena_bin_t *bin = &arena->bins[binind];
+		bin_t *bin = &arena->bins[binind];
 		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
 		bin->stats.nflushes++;
 		bin->stats.nrequests += tbin->tstats.nrequests;
@@ -180,18 +180,18 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
 	    sizeof(void *));
 	tbin->ncached = rem;
-	if ((low_water_t)tbin->ncached < tbin->low_water) {
+	if (tbin->ncached < tbin->low_water) {
 		tbin->low_water = tbin->ncached;
 	}
 }
 
 void
-tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
+tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind,
     unsigned rem, tcache_t *tcache) {
 	bool merged_stats = false;
 
 	assert(binind < nhbins);
-	assert(rem <= tbin->ncached);
+	assert((cache_bin_sz_t)rem <= tbin->ncached);
 
 	arena_t *arena = tcache->arena;
 	assert(arena != NULL);
@@ -278,7 +278,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
 	    sizeof(void *));
 	tbin->ncached = rem;
-	if ((low_water_t)tbin->ncached < tbin->low_water) {
+	if (tbin->ncached < tbin->low_water) {
 		tbin->low_water = tbin->ncached;
 	}
 }
@@ -291,8 +291,15 @@ tcache_arena_associate(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
 	if (config_stats) {
 		/* Link into list of extant tcaches. */
 		malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
+		ql_elm_new(tcache, link);
 		ql_tail_insert(&arena->tcache_ql, tcache, link);
+		cache_bin_array_descriptor_init(
+		    &tcache->cache_bin_array_descriptor, tcache->bins_small,
+		    tcache->bins_large);
+		ql_tail_insert(&arena->cache_bin_array_descriptor_ql,
+		    &tcache->cache_bin_array_descriptor, link);
+
 		malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx);
 	}
 }
 
@@ -316,6 +323,8 @@ tcache_arena_dissociate(tsdn_t *tsdn, tcache_t *tcache) {
 			assert(in_ql);
 		}
 		ql_remove(&arena->tcache_ql, tcache, link);
+		ql_remove(&arena->cache_bin_array_descriptor_ql,
+		    &tcache->cache_bin_array_descriptor, link);
 		tcache_stats_merge(tsdn, tcache, arena);
 		malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx);
 	}
@@ -354,8 +363,8 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
 	size_t stack_offset = 0;
 
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
-	memset(tcache->tbins_small, 0, sizeof(tcache_bin_t) * NBINS);
-	memset(tcache->tbins_large, 0, sizeof(tcache_bin_t) * (nhbins - NBINS));
+	memset(tcache->bins_small, 0, sizeof(cache_bin_t) * NBINS);
+	memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - NBINS));
 	unsigned i = 0;
 	for (; i < NBINS; i++) {
 		tcache->lg_fill_div[i] = 1;
@@ -450,7 +459,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
 	assert(tcache->arena != NULL);
 
 	for (unsigned i = 0; i < NBINS; i++) {
-		tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
+		cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
 		tcache_bin_flush_small(tsd, tcache, tbin, i, 0);
 
 		if (config_stats) {
@@ -458,7 +467,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
 		}
 	}
 	for (unsigned i = NBINS; i < nhbins; i++) {
-		tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
+		cache_bin_t *tbin = tcache_large_bin_get(tcache, i);
 		tcache_bin_flush_large(tsd, tbin, i, 0, tcache);
 
 		if (config_stats) {
@@ -524,8 +533,8 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
 
 	/* Merge and reset tcache stats. */
 	for (i = 0; i < NBINS; i++) {
-		arena_bin_t *bin = &arena->bins[i];
-		tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
+		bin_t *bin = &arena->bins[i];
+		cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
 		malloc_mutex_lock(tsdn, &bin->lock);
 		bin->stats.nrequests += tbin->tstats.nrequests;
 		malloc_mutex_unlock(tsdn, &bin->lock);
@@ -533,7 +542,7 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
 	}
 
 	for (; i < nhbins; i++) {
-		tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
+		cache_bin_t *tbin = tcache_large_bin_get(tcache, i);
 		arena_stats_large_nrequests_add(tsdn, &arena->stats, i,
 		    tbin->tstats.nrequests);
 		tbin->tstats.nrequests = 0;
@@ -657,21 +666,21 @@ tcache_boot(tsdn_t *tsdn) {
 	nhbins = sz_size2index(tcache_maxclass) + 1;
 
 	/* Initialize tcache_bin_info. */
-	tcache_bin_info = (tcache_bin_info_t *)base_alloc(tsdn, b0get(), nhbins
-	    * sizeof(tcache_bin_info_t), CACHELINE);
+	tcache_bin_info = (cache_bin_info_t *)base_alloc(tsdn, b0get(), nhbins
+	    * sizeof(cache_bin_info_t), CACHELINE);
 	if (tcache_bin_info == NULL) {
 		return true;
 	}
 	stack_nelms = 0;
 	unsigned i;
 	for (i = 0; i < NBINS; i++) {
-		if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
+		if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
 			tcache_bin_info[i].ncached_max =
 			    TCACHE_NSLOTS_SMALL_MIN;
-		} else if ((arena_bin_info[i].nregs << 1) <=
+		} else if ((bin_infos[i].nregs << 1) <=
 		    TCACHE_NSLOTS_SMALL_MAX) {
 			tcache_bin_info[i].ncached_max =
-			    (arena_bin_info[i].nregs << 1);
+			    (bin_infos[i].nregs << 1);
 		} else {
 			tcache_bin_info[i].ncached_max =
 			    TCACHE_NSLOTS_SMALL_MAX;
diff --git a/src/tsd.c b/src/tsd.c
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -71,6 +71,16 @@ tsd_data_init(tsd_t *tsd) {
 	 */
 	rtree_ctx_data_init(tsd_rtree_ctxp_get_unsafe(tsd));
 
+	/*
+	 * A nondeterministic seed based on the address of tsd reduces
+	 * the likelihood of lockstep non-uniform cache index
+	 * utilization among identical concurrent processes, but at the
+	 * cost of test repeatability.  For debug builds, instead use a
+	 * deterministic seed.
+	 */
+	*tsd_offset_statep_get(tsd) = config_debug ? 0 :
+	    (uint64_t)(uintptr_t)tsd;
+
 	return tsd_tcache_enabled_data_init(tsd);
 }
 
diff --git a/src/zone.c b/src/zone.c
--- a/src/zone.c
+++ b/src/zone.c
@@ -89,6 +89,7 @@ JEMALLOC_ATTR(weak_import);
 static malloc_zone_t *default_zone, *purgeable_zone;
 static malloc_zone_t jemalloc_zone;
 static struct malloc_introspection_t jemalloc_zone_introspect;
+static pid_t zone_force_lock_pid = -1;
 
 /******************************************************************************/
 /* Function prototypes for non-inline static functions. */
@@ -270,6 +271,12 @@ zone_log(malloc_zone_t *zone, void *address) {
 static void
 zone_force_lock(malloc_zone_t *zone) {
 	if (isthreaded) {
+		/*
+		 * See the note in zone_force_unlock, below, to see why we need
+		 * this.
+		 */
+		assert(zone_force_lock_pid == -1);
+		zone_force_lock_pid = getpid();
 		jemalloc_prefork();
 	}
 }
@@ -277,14 +284,25 @@ zone_force_lock(malloc_zone_t *zone) {
 static void
 zone_force_unlock(malloc_zone_t *zone) {
 	/*
-	 * Call jemalloc_postfork_child() rather than
-	 * jemalloc_postfork_parent(), because this function is executed by both
-	 * parent and child.  The parent can tolerate having state
-	 * reinitialized, but the child cannot unlock mutexes that were locked
-	 * by the parent.
+	 * zone_force_lock and zone_force_unlock are the entry points to the
+	 * forking machinery on OS X.  The tricky thing is, the child is not
+	 * allowed to unlock mutexes locked in the parent, even if owned by the
+	 * forking thread (and the mutex type we use in OS X will fail an assert
+	 * if we try).  In the child, we can get away with reinitializing all
+	 * the mutexes, which has the effect of unlocking them.  In the parent,
+	 * doing this would mean we wouldn't wake any waiters blocked on the
+	 * mutexes we unlock.  So, we record the pid of the current thread in
+	 * zone_force_lock, and use that to detect if we're in the parent or
+	 * child here, to decide which unlock logic we need.
	 */
 	if (isthreaded) {
-		jemalloc_postfork_child();
+		assert(zone_force_lock_pid != -1);
+		if (getpid() == zone_force_lock_pid) {
+			jemalloc_postfork_parent();
+		} else {
+			jemalloc_postfork_child();
+		}
+		zone_force_lock_pid = -1;
 	}
 }
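
The OP(c, t) changes in the stats hunks above rely on token pasting: stats_##t and k_##t expand to a per-width array and cursor, and fmt_str[sizeof(t) / sizeof(uint32_t) - 1] selects the %FMTu32 or %FMTu64 format string to match the counter's width. A minimal, self-contained sketch of that pattern, using a made-up COUNTERS list and array contents rather than jemalloc's MUTEX_PROF_COUNTERS:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical counter list: each entry names a counter and its width. */
#define COUNTERS(OP)			\
    OP(num_ops, uint64_t)		\
    OP(total_wait_time, uint64_t)	\
    OP(max_num_thds, uint32_t)

int
main(void) {
	/* One array per width, as in the patched stats code. */
	uint64_t stats_uint64_t[] = {123, 456};
	uint32_t stats_uint32_t[] = {7};

	/* Per-width cursors, mirroring k_uint64_t / k_uint32_t above. */
	unsigned k_uint64_t = 0, k_uint32_t = 0;

	/*
	 * stats_##t and k_##t paste the width into the identifier, so each
	 * counter reads from (and advances) the array matching its type.
	 */
#define OP(c, t)							\
	printf("%s = %llu\n", #c,					\
	    (unsigned long long)stats_##t[k_##t++]);
	COUNTERS(OP)
#undef OP
	return 0;
}

Keeping one cursor per width is what lets narrow counters such as max_num_thds live in a separate uint32_t array instead of widening everything to uint64_t.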
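
The zone_force_lock / zone_force_unlock change above distinguishes parent from child after fork by comparing getpid() against the pid recorded when the locks were taken, because the child may not unlock mutexes the parent locked. A standalone sketch of the same idea using plain pthread_atfork hooks and a single mutex (hypothetical names; reinitializing a still-locked mutex in the child is the pragmatic escape hatch the new comment describes, not strictly portable POSIX):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static pid_t force_lock_pid = -1;

/* Prefork hook: remember which process took the lock, then take it. */
static void
prefork(void) {
	assert(force_lock_pid == -1);
	force_lock_pid = getpid();
	pthread_mutex_lock(&big_lock);
}

/* Post-fork hook, registered for both parent and child. */
static void
postfork(void) {
	assert(force_lock_pid != -1);
	if (getpid() == force_lock_pid) {
		/* Parent: a normal unlock wakes any blocked waiters. */
		pthread_mutex_unlock(&big_lock);
	} else {
		/*
		 * Child: it may not unlock a mutex locked by the parent, so
		 * reinitialize it instead, which effectively unlocks it.
		 */
		pthread_mutex_init(&big_lock, NULL);
	}
	force_lock_pid = -1;
}

int
main(void) {
	pthread_atfork(prefork, postfork, postfork);
	pid_t pid = fork();
	if (pid == 0) {
		/* Child can take the reinitialized lock. */
		pthread_mutex_lock(&big_lock);
		pthread_mutex_unlock(&big_lock);
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	pthread_mutex_lock(&big_lock);
	puts("parent and child both reacquired the lock");
	pthread_mutex_unlock(&big_lock);
	return 0;
}

The pid comparison is what lets a single post-fork routine serve both sides: the parent path preserves waiter wakeups, while the child path avoids the unlock that its pthread implementation would reject.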