Diffstat (limited to 'include/jemalloc'): 42 files changed, 1069 insertions, 439 deletions
diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index af16d15..4b3732b 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -1,6 +1,7 @@ #ifndef JEMALLOC_INTERNAL_ARENA_EXTERNS_H #define JEMALLOC_INTERNAL_ARENA_EXTERNS_H +#include "jemalloc/internal/bin.h" #include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/pages.h" #include "jemalloc/internal/size_classes.h" @@ -9,25 +10,19 @@ extern ssize_t opt_dirty_decay_ms; extern ssize_t opt_muzzy_decay_ms; -extern const arena_bin_info_t arena_bin_info[NBINS]; - extern percpu_arena_mode_t opt_percpu_arena; extern const char *percpu_arena_mode_names[]; extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS]; extern malloc_mutex_t arenas_lock; -void arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats, - szind_t szind, uint64_t nrequests); -void arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, - size_t size); void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy); void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, - malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats); + bin_stats_t *bstats, arena_stats_large_t *lstats); void arena_extents_dirty_dalloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extent_t *extent); #ifdef JEMALLOC_JET @@ -50,11 +45,11 @@ void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, void arena_reset(tsd_t *tsd, arena_t *arena); void arena_destroy(tsd_t *tsd, arena_t *arena); void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, - tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes); -void arena_alloc_junk_small(void *ptr, const arena_bin_info_t *bin_info, + cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes); +void arena_alloc_junk_small(void *ptr, const bin_info_t *bin_info, bool zero); -typedef void (arena_dalloc_junk_small_t)(void *, const arena_bin_info_t *); +typedef void (arena_dalloc_junk_small_t)(void *, const bin_info_t *); extern arena_dalloc_junk_small_t *JET_MUTABLE arena_dalloc_junk_small; void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, @@ -77,6 +72,8 @@ ssize_t arena_dirty_decay_ms_default_get(void); bool arena_dirty_decay_ms_default_set(ssize_t decay_ms); ssize_t arena_muzzy_decay_ms_default_get(void); bool arena_muzzy_decay_ms_default_set(ssize_t decay_ms); +bool arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, + size_t *old_limit, size_t *new_limit); unsigned arena_nthreads_get(arena_t *arena, bool internal); void arena_nthreads_inc(arena_t *arena, bool internal); void arena_nthreads_dec(arena_t *arena, bool internal); diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 003abe1..7b10d9e 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -8,13 +8,6 @@ #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" -static inline szind_t -arena_bin_index(arena_t *arena, arena_bin_t *bin) { - szind_t binind = (szind_t)(bin - arena->bins); - assert(binind < NBINS); - return binind; -} - JEMALLOC_ALWAYS_INLINE prof_tctx_t * 
arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) { cassert(config_prof); diff --git a/include/jemalloc/internal/arena_stats.h b/include/jemalloc/internal/arena_stats.h new file mode 100644 index 0000000..837d4eb --- /dev/null +++ b/include/jemalloc/internal/arena_stats.h @@ -0,0 +1,237 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_STATS_H +#define JEMALLOC_INTERNAL_ARENA_STATS_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/mutex_prof.h" +#include "jemalloc/internal/size_classes.h" + +/* + * In those architectures that support 64-bit atomics, we use atomic updates for + * our 64-bit values. Otherwise, we use a plain uint64_t and synchronize + * externally. + */ +#ifdef JEMALLOC_ATOMIC_U64 +typedef atomic_u64_t arena_stats_u64_t; +#else +/* Must hold the arena stats mutex while reading atomically. */ +typedef uint64_t arena_stats_u64_t; +#endif + +typedef struct arena_stats_large_s arena_stats_large_t; +struct arena_stats_large_s { + /* + * Total number of allocation/deallocation requests served directly by + * the arena. + */ + arena_stats_u64_t nmalloc; + arena_stats_u64_t ndalloc; + + /* + * Number of allocation requests that correspond to this size class. + * This includes requests served by tcache, though tcache only + * periodically merges into this counter. + */ + arena_stats_u64_t nrequests; /* Partially derived. */ + + /* Current number of allocations of this size class. */ + size_t curlextents; /* Derived. */ +}; + +typedef struct arena_stats_decay_s arena_stats_decay_t; +struct arena_stats_decay_s { + /* Total number of purge sweeps. */ + arena_stats_u64_t npurge; + /* Total number of madvise calls made. */ + arena_stats_u64_t nmadvise; + /* Total number of pages purged. */ + arena_stats_u64_t purged; +}; + +/* + * Arena stats. Note that fields marked "derived" are not directly maintained + * within the arena code; rather their values are derived during stats merge + * requests. + */ +typedef struct arena_stats_s arena_stats_t; +struct arena_stats_s { +#ifndef JEMALLOC_ATOMIC_U64 + malloc_mutex_t mtx; +#endif + + /* Number of bytes currently mapped, excluding retained memory. */ + atomic_zu_t mapped; /* Partially derived. */ + + /* + * Number of unused virtual memory bytes currently retained. Retained + * bytes are technically mapped (though always decommitted or purged), + * but they are excluded from the mapped statistic (above). + */ + atomic_zu_t retained; /* Derived. */ + + arena_stats_decay_t decay_dirty; + arena_stats_decay_t decay_muzzy; + + atomic_zu_t base; /* Derived. */ + atomic_zu_t internal; + atomic_zu_t resident; /* Derived. */ + atomic_zu_t metadata_thp; + + atomic_zu_t allocated_large; /* Derived. */ + arena_stats_u64_t nmalloc_large; /* Derived. */ + arena_stats_u64_t ndalloc_large; /* Derived. */ + arena_stats_u64_t nrequests_large; /* Derived. */ + + /* Number of bytes cached in tcache associated with this arena. */ + atomic_zu_t tcache_bytes; /* Derived. */ + + mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]; + + /* One element for each large size class. */ + arena_stats_large_t lstats[NSIZES - NBINS]; + + /* Arena uptime. 
*/ + nstime_t uptime; +}; + +static inline bool +arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) { + if (config_debug) { + for (size_t i = 0; i < sizeof(arena_stats_t); i++) { + assert(((char *)arena_stats)[i] == 0); + } + } +#ifndef JEMALLOC_ATOMIC_U64 + if (malloc_mutex_init(&arena_stats->mtx, "arena_stats", + WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) { + return true; + } +#endif + /* Memory is zeroed, so there is no need to clear stats. */ + return false; +} + +static inline void +arena_stats_lock(tsdn_t *tsdn, arena_stats_t *arena_stats) { +#ifndef JEMALLOC_ATOMIC_U64 + malloc_mutex_lock(tsdn, &arena_stats->mtx); +#endif +} + +static inline void +arena_stats_unlock(tsdn_t *tsdn, arena_stats_t *arena_stats) { +#ifndef JEMALLOC_ATOMIC_U64 + malloc_mutex_unlock(tsdn, &arena_stats->mtx); +#endif +} + +static inline uint64_t +arena_stats_read_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, + arena_stats_u64_t *p) { +#ifdef JEMALLOC_ATOMIC_U64 + return atomic_load_u64(p, ATOMIC_RELAXED); +#else + malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); + return *p; +#endif +} + +static inline void +arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, + arena_stats_u64_t *p, uint64_t x) { +#ifdef JEMALLOC_ATOMIC_U64 + atomic_fetch_add_u64(p, x, ATOMIC_RELAXED); +#else + malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); + *p += x; +#endif +} + +UNUSED static inline void +arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, + arena_stats_u64_t *p, uint64_t x) { +#ifdef JEMALLOC_ATOMIC_U64 + UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED); + assert(r - x <= r); +#else + malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); + *p -= x; + assert(*p + x >= *p); +#endif +} + +/* + * Non-atomically sets *dst += src. *dst needs external synchronization. + * This lets us avoid the cost of a fetch_add when its unnecessary (note that + * the types here are atomic). + */ +static inline void +arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) { +#ifdef JEMALLOC_ATOMIC_U64 + uint64_t cur_dst = atomic_load_u64(dst, ATOMIC_RELAXED); + atomic_store_u64(dst, src + cur_dst, ATOMIC_RELAXED); +#else + *dst += src; +#endif +} + +static inline size_t +arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) { +#ifdef JEMALLOC_ATOMIC_U64 + return atomic_load_zu(p, ATOMIC_RELAXED); +#else + malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); + return atomic_load_zu(p, ATOMIC_RELAXED); +#endif +} + +static inline void +arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p, + size_t x) { +#ifdef JEMALLOC_ATOMIC_U64 + atomic_fetch_add_zu(p, x, ATOMIC_RELAXED); +#else + malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); + size_t cur = atomic_load_zu(p, ATOMIC_RELAXED); + atomic_store_zu(p, cur + x, ATOMIC_RELAXED); +#endif +} + +static inline void +arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p, + size_t x) { +#ifdef JEMALLOC_ATOMIC_U64 + UNUSED size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED); + assert(r - x <= r); +#else + malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); + size_t cur = atomic_load_zu(p, ATOMIC_RELAXED); + atomic_store_zu(p, cur - x, ATOMIC_RELAXED); +#endif +} + +/* Like the _u64 variant, needs an externally synchronized *dst. 
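Editorial note (not part of the patch): the helpers above hide the JEMALLOC_ATOMIC_U64 split from callers; without 64-bit atomics the stats mutex must be held, and with them arena_stats_lock()/arena_stats_unlock() are empty. A minimal sketch of the caller-side pattern, using a hypothetical reader function (lextent_nrequests_read is not a jemalloc symbol) and assuming it is compiled inside the jemalloc tree:

#include "jemalloc/internal/arena_stats.h"

/* szind must name a large size class (szind >= NBINS). */
static uint64_t
lextent_nrequests_read(tsdn_t *tsdn, arena_stats_t *astats, szind_t szind) {
	/* No-op when JEMALLOC_ATOMIC_U64 is defined. */
	arena_stats_lock(tsdn, astats);
	uint64_t n = arena_stats_read_u64(tsdn, astats,
	    &astats->lstats[szind - NBINS].nrequests);
	arena_stats_unlock(tsdn, astats);
	return n;
}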
*/ +static inline void +arena_stats_accum_zu(atomic_zu_t *dst, size_t src) { + size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED); + atomic_store_zu(dst, src + cur_dst, ATOMIC_RELAXED); +} + +static inline void +arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats, + szind_t szind, uint64_t nrequests) { + arena_stats_lock(tsdn, arena_stats); + arena_stats_add_u64(tsdn, arena_stats, &arena_stats->lstats[szind - + NBINS].nrequests, nrequests); + arena_stats_unlock(tsdn, arena_stats); +} + +static inline void +arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t size) { + arena_stats_lock(tsdn, arena_stats); + arena_stats_add_zu(tsdn, arena_stats, &arena_stats->mapped, size); + arena_stats_unlock(tsdn, arena_stats); +} + + +#endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */ diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h index d1fffec..38bc959 100644 --- a/include/jemalloc/internal/arena_structs_b.h +++ b/include/jemalloc/internal/arena_structs_b.h @@ -1,7 +1,9 @@ #ifndef JEMALLOC_INTERNAL_ARENA_STRUCTS_B_H #define JEMALLOC_INTERNAL_ARENA_STRUCTS_B_H +#include "jemalloc/internal/arena_stats.h" #include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/bin.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/jemalloc_internal_types.h" @@ -10,45 +12,8 @@ #include "jemalloc/internal/ql.h" #include "jemalloc/internal/size_classes.h" #include "jemalloc/internal/smoothstep.h" -#include "jemalloc/internal/stats.h" #include "jemalloc/internal/ticker.h" -/* - * Read-only information associated with each element of arena_t's bins array - * is stored separately, partly to reduce memory usage (only one copy, rather - * than one per arena), but mainly to avoid false cacheline sharing. - * - * Each slab has the following layout: - * - * /--------------------\ - * | region 0 | - * |--------------------| - * | region 1 | - * |--------------------| - * | ... | - * | ... | - * | ... | - * |--------------------| - * | region nregs-1 | - * \--------------------/ - */ -struct arena_bin_info_s { - /* Size of regions in a slab for this bin's size class. */ - size_t reg_size; - - /* Total size of a slab for this bin's size class. */ - size_t slab_size; - - /* Total number of regions in a slab for this bin's size class. */ - uint32_t nregs; - - /* - * Metadata used to manipulate bitmaps for slabs associated with this - * bin. - */ - bitmap_info_t bitmap_info; -}; - struct arena_decay_s { /* Synchronizes all non-atomic fields. */ malloc_mutex_t mtx; @@ -104,37 +69,11 @@ struct arena_decay_s { * arena and ctl code. * * Synchronization: Same as associated arena's stats field. */ - decay_stats_t *stats; + arena_stats_decay_t *stats; /* Peak number of pages in associated extents. Used for debug only. */ uint64_t ceil_npages; }; -struct arena_bin_s { - /* All operations on arena_bin_t fields require lock ownership. */ - malloc_mutex_t lock; - - /* - * Current slab being used to service allocations of this bin's size - * class. slabcur is independent of slabs_{nonfull,full}; whenever - * slabcur is reassigned, the previous slab must be deallocated or - * inserted into slabs_{nonfull,full}. - */ - extent_t *slabcur; - - /* - * Heap of non-full slabs. This heap is used to assure that new - * allocations come from the non-full slab that is oldest/lowest in - * memory. - */ - extent_heap_t slabs_nonfull; - - /* List used to track full slabs. 
*/ - extent_list_t slabs_full; - - /* Bin statistics. */ - malloc_bin_stats_t stats; -}; - struct arena_s { /* * Number of threads currently assigned to this arena. Each thread has @@ -162,14 +101,15 @@ struct arena_s { arena_stats_t stats; /* - * List of tcaches for extant threads associated with this arena. - * Stats from these are merged incrementally, and at exit if - * opt_stats_print is enabled. + * Lists of tcaches and cache_bin_array_descriptors for extant threads + * associated with this arena. Stats from these are merged + * incrementally, and at exit if opt_stats_print is enabled. * * Synchronization: tcache_ql_mtx. */ - ql_head(tcache_t) tcache_ql; - malloc_mutex_t tcache_ql_mtx; + ql_head(tcache_t) tcache_ql; + ql_head(cache_bin_array_descriptor_t) cache_bin_array_descriptor_ql; + malloc_mutex_t tcache_ql_mtx; /* Synchronization: internal. */ prof_accum_t prof_accum; @@ -239,9 +179,14 @@ struct arena_s { * be effective even if multiple arenas' extent allocation requests are * highly interleaved. * + * retain_grow_limit is the max allowed size ind to expand (unless the + * required size is greater). Default is no limit, and controlled + * through mallctl only. + * * Synchronization: extent_grow_mtx */ pszind_t extent_grow_next; + pszind_t retain_grow_limit; malloc_mutex_t extent_grow_mtx; /* @@ -258,7 +203,7 @@ struct arena_s { * * Synchronization: internal. */ - arena_bin_t bins[NBINS]; + bin_t bins[NBINS]; /* * Base allocator, from which arena metadata are allocated. diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h index a691bd8..70001b5 100644 --- a/include/jemalloc/internal/arena_types.h +++ b/include/jemalloc/internal/arena_types.h @@ -12,9 +12,7 @@ #define DECAY_NTICKS_PER_UPDATE 1000 typedef struct arena_slab_data_s arena_slab_data_t; -typedef struct arena_bin_info_s arena_bin_info_t; typedef struct arena_decay_s arena_decay_t; -typedef struct arena_bin_s arena_bin_t; typedef struct arena_s arena_t; typedef struct arena_tdata_s arena_tdata_t; typedef struct alloc_ctx_s alloc_ctx_t; diff --git a/include/jemalloc/internal/base_externs.h b/include/jemalloc/internal/base_externs.h index a4fd5ac..7b705c9 100644 --- a/include/jemalloc/internal/base_externs.h +++ b/include/jemalloc/internal/base_externs.h @@ -1,6 +1,9 @@ #ifndef JEMALLOC_INTERNAL_BASE_EXTERNS_H #define JEMALLOC_INTERNAL_BASE_EXTERNS_H +extern metadata_thp_mode_t opt_metadata_thp; +extern const char *metadata_thp_mode_names[]; + base_t *b0get(void); base_t *base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks); void base_delete(tsdn_t *tsdn, base_t *base); @@ -10,7 +13,7 @@ extent_hooks_t *base_extent_hooks_set(base_t *base, void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment); extent_t *base_alloc_extent(tsdn_t *tsdn, base_t *base); void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, - size_t *resident, size_t *mapped); + size_t *resident, size_t *mapped, size_t *n_thp); void base_prefork(tsdn_t *tsdn, base_t *base); void base_postfork_parent(tsdn_t *tsdn, base_t *base); void base_postfork_child(tsdn_t *tsdn, base_t *base); diff --git a/include/jemalloc/internal/base_inlines.h b/include/jemalloc/internal/base_inlines.h index 931560b..aec0e2e 100644 --- a/include/jemalloc/internal/base_inlines.h +++ b/include/jemalloc/internal/base_inlines.h @@ -6,4 +6,8 @@ base_ind_get(const base_t *base) { return base->ind; } +static inline bool +metadata_thp_enabled(void) { + return (opt_metadata_thp != metadata_thp_disabled); +} 
#endif /* JEMALLOC_INTERNAL_BASE_INLINES_H */ diff --git a/include/jemalloc/internal/base_structs.h b/include/jemalloc/internal/base_structs.h index 18e227b..2102247 100644 --- a/include/jemalloc/internal/base_structs.h +++ b/include/jemalloc/internal/base_structs.h @@ -30,6 +30,8 @@ struct base_s { /* Protects base_alloc() and base_stats_get() operations. */ malloc_mutex_t mtx; + /* Using THP when true (metadata_thp auto mode). */ + bool auto_thp_switched; /* * Most recent size class in the series of increasingly large base * extents. Logarithmic spacing between subsequent allocations ensures @@ -50,6 +52,8 @@ struct base_s { size_t allocated; size_t resident; size_t mapped; + /* Number of THP regions touched. */ + size_t n_thp; }; #endif /* JEMALLOC_INTERNAL_BASE_STRUCTS_H */ diff --git a/include/jemalloc/internal/base_types.h b/include/jemalloc/internal/base_types.h index be7ee82..b6db77d 100644 --- a/include/jemalloc/internal/base_types.h +++ b/include/jemalloc/internal/base_types.h @@ -4,4 +4,30 @@ typedef struct base_block_s base_block_t; typedef struct base_s base_t; +#define METADATA_THP_DEFAULT metadata_thp_disabled + +/* + * In auto mode, arenas switch to huge pages for the base allocator on the + * second base block. a0 switches to thp on the 5th block (after 20 megabytes + * of metadata), since more metadata (e.g. rtree nodes) come from a0's base. + */ + +#define BASE_AUTO_THP_THRESHOLD 2 +#define BASE_AUTO_THP_THRESHOLD_A0 5 + +typedef enum { + metadata_thp_disabled = 0, + /* + * Lazily enable hugepage for metadata. To avoid high RSS caused by THP + * + low usage arena (i.e. THP becomes a significant percentage), the + * "auto" option only starts using THP after a base allocator used up + * the first THP region. Starting from the second hugepage (in a single + * arena), "auto" behaves the same as "always", i.e. madvise hugepage + * right away. + */ + metadata_thp_auto = 1, + metadata_thp_always = 2, + metadata_thp_mode_limit = 3 +} metadata_thp_mode_t; + #endif /* JEMALLOC_INTERNAL_BASE_TYPES_H */ diff --git a/include/jemalloc/internal/bin.h b/include/jemalloc/internal/bin.h new file mode 100644 index 0000000..9b416ad --- /dev/null +++ b/include/jemalloc/internal/bin.h @@ -0,0 +1,106 @@ +#ifndef JEMALLOC_INTERNAL_BIN_H +#define JEMALLOC_INTERNAL_BIN_H + +#include "jemalloc/internal/extent_types.h" +#include "jemalloc/internal/extent_structs.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/bin_stats.h" + +/* + * A bin contains a set of extents that are currently being used for slab + * allocations. + */ + +/* + * Read-only information associated with each element of arena_t's bins array + * is stored separately, partly to reduce memory usage (only one copy, rather + * than one per arena), but mainly to avoid false cacheline sharing. + * + * Each slab has the following layout: + * + * /--------------------\ + * | region 0 | + * |--------------------| + * | region 1 | + * |--------------------| + * | ... | + * | ... | + * | ... | + * |--------------------| + * | region nregs-1 | + * \--------------------/ + */ +typedef struct bin_info_s bin_info_t; +struct bin_info_s { + /* Size of regions in a slab for this bin's size class. */ + size_t reg_size; + + /* Total size of a slab for this bin's size class. */ + size_t slab_size; + + /* Total number of regions in a slab for this bin's size class. */ + uint32_t nregs; + + /* + * Metadata used to manipulate bitmaps for slabs associated with this + * bin. 
+ */ + bitmap_info_t bitmap_info; +}; + +extern const bin_info_t bin_infos[NBINS]; + + +typedef struct bin_s bin_t; +struct bin_s { + /* All operations on bin_t fields require lock ownership. */ + malloc_mutex_t lock; + + /* + * Current slab being used to service allocations of this bin's size + * class. slabcur is independent of slabs_{nonfull,full}; whenever + * slabcur is reassigned, the previous slab must be deallocated or + * inserted into slabs_{nonfull,full}. + */ + extent_t *slabcur; + + /* + * Heap of non-full slabs. This heap is used to assure that new + * allocations come from the non-full slab that is oldest/lowest in + * memory. + */ + extent_heap_t slabs_nonfull; + + /* List used to track full slabs. */ + extent_list_t slabs_full; + + /* Bin statistics. */ + bin_stats_t stats; +}; + +/* Initializes a bin to empty. Returns true on error. */ +bool bin_init(bin_t *bin); + +/* Forking. */ +void bin_prefork(tsdn_t *tsdn, bin_t *bin); +void bin_postfork_parent(tsdn_t *tsdn, bin_t *bin); +void bin_postfork_child(tsdn_t *tsdn, bin_t *bin); + +/* Stats. */ +static inline void +bin_stats_merge(tsdn_t *tsdn, bin_stats_t *dst_bin_stats, bin_t *bin) { + malloc_mutex_lock(tsdn, &bin->lock); + malloc_mutex_prof_read(tsdn, &dst_bin_stats->mutex_data, &bin->lock); + dst_bin_stats->nmalloc += bin->stats.nmalloc; + dst_bin_stats->ndalloc += bin->stats.ndalloc; + dst_bin_stats->nrequests += bin->stats.nrequests; + dst_bin_stats->curregs += bin->stats.curregs; + dst_bin_stats->nfills += bin->stats.nfills; + dst_bin_stats->nflushes += bin->stats.nflushes; + dst_bin_stats->nslabs += bin->stats.nslabs; + dst_bin_stats->reslabs += bin->stats.reslabs; + dst_bin_stats->curslabs += bin->stats.curslabs; + malloc_mutex_unlock(tsdn, &bin->lock); +} + +#endif /* JEMALLOC_INTERNAL_BIN_H */ diff --git a/include/jemalloc/internal/bin_stats.h b/include/jemalloc/internal/bin_stats.h new file mode 100644 index 0000000..86e673e --- /dev/null +++ b/include/jemalloc/internal/bin_stats.h @@ -0,0 +1,51 @@ +#ifndef JEMALLOC_INTERNAL_BIN_STATS_H +#define JEMALLOC_INTERNAL_BIN_STATS_H + +#include "jemalloc/internal/mutex_prof.h" + +typedef struct bin_stats_s bin_stats_t; +struct bin_stats_s { + /* + * Total number of allocation/deallocation requests served directly by + * the bin. Note that tcache may allocate an object, then recycle it + * many times, resulting many increments to nrequests, but only one + * each to nmalloc and ndalloc. + */ + uint64_t nmalloc; + uint64_t ndalloc; + + /* + * Number of allocation requests that correspond to the size of this + * bin. This includes requests served by tcache, though tcache only + * periodically merges into this counter. + */ + uint64_t nrequests; + + /* + * Current number of regions of this size class, including regions + * currently cached by tcache. + */ + size_t curregs; + + /* Number of tcache fills from this bin. */ + uint64_t nfills; + + /* Number of tcache flushes to this bin. */ + uint64_t nflushes; + + /* Total number of slabs created for this bin's size class. */ + uint64_t nslabs; + + /* + * Total number of slabs reused by extracting them from the slabs heap + * for this bin's size class. + */ + uint64_t reslabs; + + /* Current number of slabs in this bin. 
*/ + size_t curslabs; + + mutex_prof_data_t mutex_data; +}; + +#endif /* JEMALLOC_INTERNAL_BIN_STATS_H */ diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h new file mode 100644 index 0000000..12f3ef2 --- /dev/null +++ b/include/jemalloc/internal/cache_bin.h @@ -0,0 +1,114 @@ +#ifndef JEMALLOC_INTERNAL_CACHE_BIN_H +#define JEMALLOC_INTERNAL_CACHE_BIN_H + +#include "jemalloc/internal/ql.h" + +/* + * The cache_bins are the mechanism that the tcache and the arena use to + * communicate. The tcache fills from and flushes to the arena by passing a + * cache_bin_t to fill/flush. When the arena needs to pull stats from the + * tcaches associated with it, it does so by iterating over its + * cache_bin_array_descriptor_t objects and reading out per-bin stats it + * contains. This makes it so that the arena need not know about the existence + * of the tcache at all. + */ + + +/* + * The count of the number of cached allocations in a bin. We make this signed + * so that negative numbers can encode "invalid" states (e.g. a low water mark + * of -1 for a cache that has been depleted). + */ +typedef int32_t cache_bin_sz_t; + +typedef struct cache_bin_stats_s cache_bin_stats_t; +struct cache_bin_stats_s { + /* + * Number of allocation requests that corresponded to the size of this + * bin. + */ + uint64_t nrequests; +}; + +/* + * Read-only information associated with each element of tcache_t's tbins array + * is stored separately, mainly to reduce memory usage. + */ +typedef struct cache_bin_info_s cache_bin_info_t; +struct cache_bin_info_s { + /* Upper limit on ncached. */ + cache_bin_sz_t ncached_max; +}; + +typedef struct cache_bin_s cache_bin_t; +struct cache_bin_s { + /* Min # cached since last GC. */ + cache_bin_sz_t low_water; + /* # of cached objects. */ + cache_bin_sz_t ncached; + /* + * ncached and stats are both modified frequently. Let's keep them + * close so that they have a higher chance of being on the same + * cacheline, thus less write-backs. + */ + cache_bin_stats_t tstats; + /* + * Stack of available objects. + * + * To make use of adjacent cacheline prefetch, the items in the avail + * stack goes to higher address for newer allocations. avail points + * just above the available space, which means that + * avail[-ncached, ... -1] are available items and the lowest item will + * be allocated first. + */ + void **avail; +}; + +typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t; +struct cache_bin_array_descriptor_s { + /* + * The arena keeps a list of the cache bins associated with it, for + * stats collection. + */ + ql_elm(cache_bin_array_descriptor_t) link; + /* Pointers to the tcache bins. */ + cache_bin_t *bins_small; + cache_bin_t *bins_large; +}; + +static inline void +cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor, + cache_bin_t *bins_small, cache_bin_t *bins_large) { + ql_elm_new(descriptor, link); + descriptor->bins_small = bins_small; + descriptor->bins_large = bins_large; +} + +JEMALLOC_ALWAYS_INLINE void * +cache_bin_alloc_easy(cache_bin_t *bin, bool *success) { + void *ret; + + if (unlikely(bin->ncached == 0)) { + bin->low_water = -1; + *success = false; + return NULL; + } + /* + * success (instead of ret) should be checked upon the return of this + * function. We avoid checking (ret == NULL) because there is never a + * null stored on the avail stack (which is unknown to the compiler), + * and eagerly checking ret would cause pipeline stall (waiting for the + * cacheline). 
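Editorial note (not part of the patch): the avail-stack indexing described in cache_bin_s and cache_bin_alloc_easy() above is easy to misread, so here is a self-contained toy in plain C (no jemalloc types; array size and item count made up) showing that avail sits just above the storage, the cached items live at avail[-ncached .. -1], and a pop returns the lowest-addressed one first:

#include <assert.h>

int
main(void) {
	int items[3];
	void *storage[4] = {0, &items[0], &items[1], &items[2]};
	void **avail = storage + 4;	/* points just above the space */
	int ncached = 3;		/* avail[-3 .. -1] are the cached items */

	/* The pop in cache_bin_alloc_easy(): lowest-addressed item first. */
	void *ret = *(avail - ncached);
	ncached--;
	assert(ret == &items[0]);	/* i.e. storage[1], a.k.a. avail[-3] */
	return 0;
}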
+ */ + *success = true; + ret = *(bin->avail - bin->ncached); + bin->ncached--; + + if (unlikely(bin->ncached < bin->low_water)) { + bin->low_water = bin->ncached; + } + + return ret; +} + +#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */ diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index a91c4cf..d927d94 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -40,14 +40,15 @@ typedef struct ctl_arena_stats_s { uint64_t ndalloc_small; uint64_t nrequests_small; - malloc_bin_stats_t bstats[NBINS]; - malloc_large_stats_t lstats[NSIZES - NBINS]; + bin_stats_t bstats[NBINS]; + arena_stats_large_t lstats[NSIZES - NBINS]; } ctl_arena_stats_t; typedef struct ctl_stats_s { size_t allocated; size_t active; size_t metadata; + size_t metadata_thp; size_t resident; size_t mapped; size_t retained; diff --git a/include/jemalloc/internal/div.h b/include/jemalloc/internal/div.h new file mode 100644 index 0000000..aebae93 --- /dev/null +++ b/include/jemalloc/internal/div.h @@ -0,0 +1,41 @@ +#ifndef JEMALLOC_INTERNAL_DIV_H +#define JEMALLOC_INTERNAL_DIV_H + +#include "jemalloc/internal/assert.h" + +/* + * This module does the division that computes the index of a region in a slab, + * given its offset relative to the base. + * That is, given a divisor d, an n = i * d (all integers), we'll return i. + * We do some pre-computation to do this more quickly than a CPU division + * instruction. + * We bound n < 2^32, and don't support dividing by one. + */ + +typedef struct div_info_s div_info_t; +struct div_info_s { + uint32_t magic; +#ifdef JEMALLOC_DEBUG + size_t d; +#endif +}; + +void div_init(div_info_t *div_info, size_t divisor); + +static inline size_t +div_compute(div_info_t *div_info, size_t n) { + assert(n <= (uint32_t)-1); + /* + * This generates, e.g. mov; imul; shr on x86-64. On a 32-bit machine, + * the compilers I tried were all smart enough to turn this into the + * appropriate "get the high 32 bits of the result of a multiply" (e.g. + * mul; mov edx eax; on x86, umull on arm, etc.). 
+ */ + size_t i = ((uint64_t)n * (uint64_t)div_info->magic) >> 32; +#ifdef JEMALLOC_DEBUG + assert(i * div_info->d == n); +#endif + return i; +} + +#endif /* JEMALLOC_INTERNAL_DIV_H */ diff --git a/include/jemalloc/internal/extent_externs.h b/include/jemalloc/internal/extent_externs.h index 489a813..b8a4d02 100644 --- a/include/jemalloc/internal/extent_externs.h +++ b/include/jemalloc/internal/extent_externs.h @@ -4,12 +4,13 @@ #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mutex_pool.h" #include "jemalloc/internal/ph.h" -#include "jemalloc/internal/rb.h" #include "jemalloc/internal/rtree.h" -extern rtree_t extents_rtree; -extern const extent_hooks_t extent_hooks_default; -extern mutex_pool_t extent_mutex_pool; +extern size_t opt_lg_extent_max_active_fit; + +extern rtree_t extents_rtree; +extern const extent_hooks_t extent_hooks_default; +extern mutex_pool_t extent_mutex_pool; extent_t *extent_alloc(tsdn_t *tsdn, arena_t *arena); void extent_dalloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent); diff --git a/include/jemalloc/internal/extent_inlines.h b/include/jemalloc/internal/extent_inlines.h index bb2bd69..9b8ddc2 100644 --- a/include/jemalloc/internal/extent_inlines.h +++ b/include/jemalloc/internal/extent_inlines.h @@ -94,6 +94,12 @@ extent_committed_get(const extent_t *extent) { } static inline bool +extent_dumpable_get(const extent_t *extent) { + return (bool)((extent->e_bits & EXTENT_BITS_DUMPABLE_MASK) >> + EXTENT_BITS_DUMPABLE_SHIFT); +} + +static inline bool extent_slab_get(const extent_t *extent) { return (bool)((extent->e_bits & EXTENT_BITS_SLAB_MASK) >> EXTENT_BITS_SLAB_SHIFT); @@ -190,9 +196,16 @@ extent_addr_randomize(tsdn_t *tsdn, extent_t *extent, size_t alignment) { if (alignment < PAGE) { unsigned lg_range = LG_PAGE - lg_floor(CACHELINE_CEILING(alignment)); - size_t r = - prng_lg_range_zu(&extent_arena_get(extent)->offset_state, - lg_range, true); + size_t r; + if (!tsdn_null(tsdn)) { + tsd_t *tsd = tsdn_tsd(tsdn); + r = (size_t)prng_lg_range_u64( + tsd_offset_statep_get(tsd), lg_range); + } else { + r = prng_lg_range_zu( + &extent_arena_get(extent)->offset_state, + lg_range, true); + } uintptr_t random_offset = ((uintptr_t)r) << (LG_PAGE - lg_range); extent->e_addr = (void *)((uintptr_t)extent->e_addr + @@ -270,6 +283,12 @@ extent_committed_set(extent_t *extent, bool committed) { } static inline void +extent_dumpable_set(extent_t *extent, bool dumpable) { + extent->e_bits = (extent->e_bits & ~EXTENT_BITS_DUMPABLE_MASK) | + ((uint64_t)dumpable << EXTENT_BITS_DUMPABLE_SHIFT); +} + +static inline void extent_slab_set(extent_t *extent, bool slab) { extent->e_bits = (extent->e_bits & ~EXTENT_BITS_SLAB_MASK) | ((uint64_t)slab << EXTENT_BITS_SLAB_SHIFT); @@ -283,7 +302,7 @@ extent_prof_tctx_set(extent_t *extent, prof_tctx_t *tctx) { static inline void extent_init(extent_t *extent, arena_t *arena, void *addr, size_t size, bool slab, szind_t szind, size_t sn, extent_state_t state, bool zeroed, - bool committed) { + bool committed, bool dumpable) { assert(addr == PAGE_ADDR2BASE(addr) || !slab); extent_arena_set(extent, arena); @@ -295,6 +314,7 @@ extent_init(extent_t *extent, arena_t *arena, void *addr, size_t size, extent_state_set(extent, state); extent_zeroed_set(extent, zeroed); extent_committed_set(extent, committed); + extent_dumpable_set(extent, dumpable); ql_elm_new(extent, ql_link); if (config_prof) { extent_prof_tctx_set(extent, NULL); @@ -312,6 +332,7 @@ extent_binit(extent_t *extent, void *addr, size_t bsize, size_t sn) { extent_state_set(extent, 
extent_state_active); extent_zeroed_set(extent, true); extent_committed_set(extent, true); + extent_dumpable_set(extent, true); } static inline void @@ -335,6 +356,11 @@ extent_list_append(extent_list_t *list, extent_t *extent) { } static inline void +extent_list_prepend(extent_list_t *list, extent_t *extent) { + ql_head_insert(list, extent, ql_link); +} + +static inline void extent_list_replace(extent_list_t *list, extent_t *to_remove, extent_t *to_insert) { ql_after_insert(to_remove, to_insert, ql_link); diff --git a/include/jemalloc/internal/extent_structs.h b/include/jemalloc/internal/extent_structs.h index d297950..4873b9e 100644 --- a/include/jemalloc/internal/extent_structs.h +++ b/include/jemalloc/internal/extent_structs.h @@ -5,7 +5,6 @@ #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/ql.h" -#include "jemalloc/internal/rb.h" #include "jemalloc/internal/ph.h" #include "jemalloc/internal/size_classes.h" @@ -24,13 +23,14 @@ struct extent_s { * a: arena_ind * b: slab * c: committed + * d: dumpable * z: zeroed * t: state * i: szind * f: nfree * n: sn * - * nnnnnnnn ... nnnnnfff fffffffi iiiiiiit tzcbaaaa aaaaaaaa + * nnnnnnnn ... nnnnffff ffffffii iiiiiitt zdcbaaaa aaaaaaaa * * arena_ind: Arena from which this extent came, or all 1 bits if * unassociated. @@ -45,6 +45,23 @@ struct extent_s { * as on a system that overcommits and satisfies physical * memory needs on demand via soft page faults. * + * dumpable: The dumpable flag indicates whether or not we've set the + * memory in question to be dumpable. Note that this + * interacts somewhat subtly with user-specified extent hooks, + * since we don't know if *they* are fiddling with + * dumpability (in which case, we don't want to undo whatever + * they're doing). To deal with this scenario, we: + * - Make dumpable false only for memory allocated with the + * default hooks. + * - Only allow memory to go from non-dumpable to dumpable, + * and only once. + * - Never make the OS call to allow dumping when the + * dumpable bit is already set. + * These three constraints mean that we will never + * accidentally dump user memory that the user meant to set + * nondumpable with their extent hooks. + * + * * zeroed: The zeroed flag is used by extent recycling code to track * whether memory is zero-filled. * @@ -69,38 +86,42 @@ struct extent_s { * serial number to both resulting adjacent extents. 
*/ uint64_t e_bits; -#define EXTENT_BITS_ARENA_SHIFT 0 -#define EXTENT_BITS_ARENA_MASK \ - (((uint64_t)(1U << MALLOCX_ARENA_BITS) - 1) << EXTENT_BITS_ARENA_SHIFT) +#define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT) ((((((uint64_t)0x1U) << (CURRENT_FIELD_WIDTH)) - 1)) << (CURRENT_FIELD_SHIFT)) + +#define EXTENT_BITS_ARENA_WIDTH MALLOCX_ARENA_BITS +#define EXTENT_BITS_ARENA_SHIFT 0 +#define EXTENT_BITS_ARENA_MASK MASK(EXTENT_BITS_ARENA_WIDTH, EXTENT_BITS_ARENA_SHIFT) -#define EXTENT_BITS_SLAB_SHIFT MALLOCX_ARENA_BITS -#define EXTENT_BITS_SLAB_MASK \ - ((uint64_t)0x1U << EXTENT_BITS_SLAB_SHIFT) +#define EXTENT_BITS_SLAB_WIDTH 1 +#define EXTENT_BITS_SLAB_SHIFT (EXTENT_BITS_ARENA_WIDTH + EXTENT_BITS_ARENA_SHIFT) +#define EXTENT_BITS_SLAB_MASK MASK(EXTENT_BITS_SLAB_WIDTH, EXTENT_BITS_SLAB_SHIFT) -#define EXTENT_BITS_COMMITTED_SHIFT (MALLOCX_ARENA_BITS + 1) -#define EXTENT_BITS_COMMITTED_MASK \ - ((uint64_t)0x1U << EXTENT_BITS_COMMITTED_SHIFT) +#define EXTENT_BITS_COMMITTED_WIDTH 1 +#define EXTENT_BITS_COMMITTED_SHIFT (EXTENT_BITS_SLAB_WIDTH + EXTENT_BITS_SLAB_SHIFT) +#define EXTENT_BITS_COMMITTED_MASK MASK(EXTENT_BITS_COMMITTED_WIDTH, EXTENT_BITS_COMMITTED_SHIFT) -#define EXTENT_BITS_ZEROED_SHIFT (MALLOCX_ARENA_BITS + 2) -#define EXTENT_BITS_ZEROED_MASK \ - ((uint64_t)0x1U << EXTENT_BITS_ZEROED_SHIFT) +#define EXTENT_BITS_DUMPABLE_WIDTH 1 +#define EXTENT_BITS_DUMPABLE_SHIFT (EXTENT_BITS_COMMITTED_WIDTH + EXTENT_BITS_COMMITTED_SHIFT) +#define EXTENT_BITS_DUMPABLE_MASK MASK(EXTENT_BITS_DUMPABLE_WIDTH, EXTENT_BITS_DUMPABLE_SHIFT) -#define EXTENT_BITS_STATE_SHIFT (MALLOCX_ARENA_BITS + 3) -#define EXTENT_BITS_STATE_MASK \ - ((uint64_t)0x3U << EXTENT_BITS_STATE_SHIFT) +#define EXTENT_BITS_ZEROED_WIDTH 1 +#define EXTENT_BITS_ZEROED_SHIFT (EXTENT_BITS_DUMPABLE_WIDTH + EXTENT_BITS_DUMPABLE_SHIFT) +#define EXTENT_BITS_ZEROED_MASK MASK(EXTENT_BITS_ZEROED_WIDTH, EXTENT_BITS_ZEROED_SHIFT) -#define EXTENT_BITS_SZIND_SHIFT (MALLOCX_ARENA_BITS + 5) -#define EXTENT_BITS_SZIND_MASK \ - (((uint64_t)(1U << LG_CEIL_NSIZES) - 1) << EXTENT_BITS_SZIND_SHIFT) +#define EXTENT_BITS_STATE_WIDTH 2 +#define EXTENT_BITS_STATE_SHIFT (EXTENT_BITS_ZEROED_WIDTH + EXTENT_BITS_ZEROED_SHIFT) +#define EXTENT_BITS_STATE_MASK MASK(EXTENT_BITS_STATE_WIDTH, EXTENT_BITS_STATE_SHIFT) -#define EXTENT_BITS_NFREE_SHIFT \ - (MALLOCX_ARENA_BITS + 5 + LG_CEIL_NSIZES) -#define EXTENT_BITS_NFREE_MASK \ - ((uint64_t)((1U << (LG_SLAB_MAXREGS + 1)) - 1) << EXTENT_BITS_NFREE_SHIFT) +#define EXTENT_BITS_SZIND_WIDTH LG_CEIL_NSIZES +#define EXTENT_BITS_SZIND_SHIFT (EXTENT_BITS_STATE_WIDTH + EXTENT_BITS_STATE_SHIFT) +#define EXTENT_BITS_SZIND_MASK MASK(EXTENT_BITS_SZIND_WIDTH, EXTENT_BITS_SZIND_SHIFT) -#define EXTENT_BITS_SN_SHIFT \ - (MALLOCX_ARENA_BITS + 5 + LG_CEIL_NSIZES + (LG_SLAB_MAXREGS + 1)) -#define EXTENT_BITS_SN_MASK (UINT64_MAX << EXTENT_BITS_SN_SHIFT) +#define EXTENT_BITS_NFREE_WIDTH (LG_SLAB_MAXREGS + 1) +#define EXTENT_BITS_NFREE_SHIFT (EXTENT_BITS_SZIND_WIDTH + EXTENT_BITS_SZIND_SHIFT) +#define EXTENT_BITS_NFREE_MASK MASK(EXTENT_BITS_NFREE_WIDTH, EXTENT_BITS_NFREE_SHIFT) + +#define EXTENT_BITS_SN_SHIFT (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT) +#define EXTENT_BITS_SN_MASK (UINT64_MAX << EXTENT_BITS_SN_SHIFT) /* Pointer to the extent that this structure is responsible for. 
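Editorial note (not part of the patch): the new WIDTH/SHIFT/MASK scheme above chains each field's shift off the previous field's width and shift, which is what makes wedging in the new dumpable bit a local change. Below is a standalone check of that chaining with made-up field widths (the real ones depend on MALLOCX_ARENA_BITS, LG_CEIL_NSIZES and LG_SLAB_MAXREGS); MASK() matches the patch's definition up to parenthesization:

#include <assert.h>
#include <stdint.h>

#define MASK(WIDTH, SHIFT) (((((uint64_t)0x1U) << (WIDTH)) - 1) << (SHIFT))

/* Hypothetical widths, for illustration only. */
#define F_A_WIDTH 12
#define F_A_SHIFT 0
#define F_A_MASK  MASK(F_A_WIDTH, F_A_SHIFT)

#define F_B_WIDTH 1
#define F_B_SHIFT (F_A_WIDTH + F_A_SHIFT)
#define F_B_MASK  MASK(F_B_WIDTH, F_B_SHIFT)

#define F_C_WIDTH 2
#define F_C_SHIFT (F_B_WIDTH + F_B_SHIFT)
#define F_C_MASK  MASK(F_C_WIDTH, F_C_SHIFT)

int
main(void) {
	/* Fields are contiguous and non-overlapping. */
	assert((F_A_MASK & F_B_MASK) == 0);
	assert((F_B_MASK & F_C_MASK) == 0);
	assert((F_A_MASK | F_B_MASK | F_C_MASK) ==
	    MASK(F_A_WIDTH + F_B_WIDTH + F_C_WIDTH, 0));

	/* Packing/unpacking a field, as extent_dumpable_set()/_get() do. */
	uint64_t bits = 0;
	bits = (bits & ~F_B_MASK) | ((uint64_t)1 << F_B_SHIFT);
	assert(((bits & F_B_MASK) >> F_B_SHIFT) == 1);
	return 0;
}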
*/ void *e_addr; @@ -120,20 +141,19 @@ struct extent_s { size_t e_bsize; }; - union { - /* - * List linkage, used by a variety of lists: - * - arena_bin_t's slabs_full - * - extents_t's LRU - * - stashed dirty extents - * - arena's large allocations - */ - ql_elm(extent_t) ql_link; - /* Red-black tree linkage, used by arena's extent_avail. */ - rb_node(extent_t) rb_link; - }; + /* + * List linkage, used by a variety of lists: + * - bin_t's slabs_full + * - extents_t's LRU + * - stashed dirty extents + * - arena's large allocations + */ + ql_elm(extent_t) ql_link; - /* Linkage for per size class sn/address-ordered heaps. */ + /* + * Linkage for per size class sn/address-ordered heaps, and + * for extent_avail + */ phn(extent_t) ph_link; union { @@ -148,7 +168,7 @@ struct extent_s { }; }; typedef ql_head(extent_t) extent_list_t; -typedef rb_tree(extent_t) extent_tree_t; +typedef ph(extent_t) extent_tree_t; typedef ph(extent_t) extent_heap_t; /* Quantized collection of extents, with built-in LRU queue. */ diff --git a/include/jemalloc/internal/extent_types.h b/include/jemalloc/internal/extent_types.h index b6905ce..c0561d9 100644 --- a/include/jemalloc/internal/extent_types.h +++ b/include/jemalloc/internal/extent_types.h @@ -6,4 +6,12 @@ typedef struct extents_s extents_t; #define EXTENT_HOOKS_INITIALIZER NULL +#define EXTENT_GROW_MAX_PIND (NPSIZES - 1) + +/* + * When reuse (and split) an active extent, (1U << opt_lg_extent_max_active_fit) + * is the max ratio between the size of the active extent and the new extent. + */ +#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6 + #endif /* JEMALLOC_INTERNAL_EXTENT_TYPES_H */ diff --git a/include/jemalloc/internal/hash.h b/include/jemalloc/internal/hash.h index 188296c..dcfc992 100644 --- a/include/jemalloc/internal/hash.h +++ b/include/jemalloc/internal/hash.h @@ -260,22 +260,22 @@ hash_x64_128(const void *key, const int len, const uint32_t seed, uint64_t k2 = 0; switch (len & 15) { - case 15: k2 ^= ((uint64_t)(tail[14])) << 48; - case 14: k2 ^= ((uint64_t)(tail[13])) << 40; - case 13: k2 ^= ((uint64_t)(tail[12])) << 32; - case 12: k2 ^= ((uint64_t)(tail[11])) << 24; - case 11: k2 ^= ((uint64_t)(tail[10])) << 16; - case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; + case 15: k2 ^= ((uint64_t)(tail[14])) << 48; /* falls through */ + case 14: k2 ^= ((uint64_t)(tail[13])) << 40; /* falls through */ + case 13: k2 ^= ((uint64_t)(tail[12])) << 32; /* falls through */ + case 12: k2 ^= ((uint64_t)(tail[11])) << 24; /* falls through */ + case 11: k2 ^= ((uint64_t)(tail[10])) << 16; /* falls through */ + case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; /* falls through */ case 9: k2 ^= ((uint64_t)(tail[ 8])) << 0; k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; - case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; - case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; - case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; - case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; - case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; - case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; + /* falls through */ + case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; /* falls through */ + case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; /* falls through */ + case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; /* falls through */ + case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; /* falls through */ + case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; /* falls through */ + case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; /* falls through */ + case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; /* falls through */ case 1: k1 ^= 
((uint64_t)(tail[ 0])) << 0; k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1; } diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index c0f834f..8dad9a1 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -33,6 +33,8 @@ * order to yield to another virtual CPU. */ #undef CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#undef HAVE_CPU_SPINWAIT /* * Number of significant bits in virtual addresses. This may be less than the @@ -238,6 +240,12 @@ #undef JEMALLOC_CACHE_OBLIVIOUS /* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +#undef JEMALLOC_LOG + +/* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ #undef JEMALLOC_ZONE @@ -255,6 +263,12 @@ #undef JEMALLOC_HAVE_MADVISE /* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#undef JEMALLOC_HAVE_MADVISE_HUGE + +/* * Methods for purging unused pages differ between operating systems. * * madvise(..., MADV_FREE) : This marks pages as being unused, such that they @@ -271,6 +285,14 @@ #undef JEMALLOC_PURGE_MADVISE_DONTNEED #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +#undef JEMALLOC_DEFINE_MADVISE_FREE + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#undef JEMALLOC_MADVISE_DONTDUMP + /* * Defined if transparent huge pages (THPs) are supported via the * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. @@ -336,4 +358,9 @@ /* If defined, jemalloc takes the malloc/free/etc. symbol names. */ #undef JEMALLOC_IS_MALLOC +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/include/jemalloc/internal/jemalloc_internal_inlines_a.h index 24ea416..c6a1f7e 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_a.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_a.h @@ -106,16 +106,16 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) { return &tdata->decay_ticker; } -JEMALLOC_ALWAYS_INLINE tcache_bin_t * +JEMALLOC_ALWAYS_INLINE cache_bin_t * tcache_small_bin_get(tcache_t *tcache, szind_t binind) { assert(binind < NBINS); - return &tcache->tbins_small[binind]; + return &tcache->bins_small[binind]; } -JEMALLOC_ALWAYS_INLINE tcache_bin_t * +JEMALLOC_ALWAYS_INLINE cache_bin_t * tcache_large_bin_get(tcache_t *tcache, szind_t binind) { assert(binind >= NBINS &&binind < nhbins); - return &tcache->tbins_large[binind - NBINS]; + return &tcache->bins_large[binind - NBINS]; } JEMALLOC_ALWAYS_INLINE bool @@ -151,6 +151,7 @@ pre_reentrancy(tsd_t *tsd, arena_t *arena) { assert(arena != arena_get(tsd_tsdn(tsd), 0, false)); bool fast = tsd_fast(tsd); + assert(tsd_reentrancy_level_get(tsd) < INT8_MAX); ++*tsd_reentrancy_levelp_get(tsd); if (fast) { /* Prepare slow path for reentrancy. 
*/ diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 7ffce6f..c829ac6 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -5,6 +5,24 @@ #include "jemalloc/internal/sz.h" #include "jemalloc/internal/witness.h" +/* + * Translating the names of the 'i' functions: + * Abbreviations used in the first part of the function name (before + * alloc/dalloc) describe what that function accomplishes: + * a: arena (query) + * s: size (query, or sized deallocation) + * e: extent (query) + * p: aligned (allocates) + * vs: size (query, without knowing that the pointer is into the heap) + * r: rallocx implementation + * x: xallocx implementation + * Abbreviations used in the second part of the function name (after + * alloc/dalloc) describe the arguments it takes + * z: whether to return zeroed memory + * t: accepts a tcache_t * parameter + * m: accepts an arena_t * parameter + */ + JEMALLOC_ALWAYS_INLINE arena_t * iaalloc(tsdn_t *tsdn, const void *ptr) { assert(ptr != NULL); @@ -27,8 +45,10 @@ iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache, assert(size != 0); assert(!is_internal || tcache == NULL); assert(!is_internal || arena == NULL || arena_is_auto(arena)); - witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), - WITNESS_RANK_CORE, 0); + if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + } ret = arena_malloc(tsdn, arena, size, ind, zero, tcache, slow_path); if (config_stats && is_internal && likely(ret != NULL)) { @@ -91,7 +111,8 @@ idalloctm(tsdn_t *tsdn, void *ptr, tcache_t *tcache, alloc_ctx_t *alloc_ctx, if (config_stats && is_internal) { arena_internal_sub(iaalloc(tsdn, ptr), isalloc(tsdn, ptr)); } - if (!is_internal && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) { + if (!is_internal && !tsdn_null(tsdn) && + tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) { assert(tcache == NULL); } arena_dalloc(tsdn, ptr, tcache, alloc_ctx, slow_path); diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h index 4571895..ed75d37 100644 --- a/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/include/jemalloc/internal/jemalloc_internal_macros.h @@ -37,4 +37,7 @@ # define JET_MUTABLE const #endif +#define JEMALLOC_VA_ARGS_HEAD(head, ...) head +#define JEMALLOC_VA_ARGS_TAIL(head, ...) 
__VA_ARGS__ + #endif /* JEMALLOC_INTERNAL_MACROS_H */ diff --git a/include/jemalloc/internal/jemalloc_internal_types.h b/include/jemalloc/internal/jemalloc_internal_types.h index 50f9d00..1b750b1 100644 --- a/include/jemalloc/internal/jemalloc_internal_types.h +++ b/include/jemalloc/internal/jemalloc_internal_types.h @@ -79,22 +79,29 @@ typedef int malloc_cpuid_t; # ifdef __hppa__ # define LG_QUANTUM 4 # endif +# ifdef __m68k__ +# define LG_QUANTUM 3 +# endif # ifdef __mips__ # define LG_QUANTUM 3 # endif +# ifdef __nios2__ +# define LG_QUANTUM 3 +# endif # ifdef __or1k__ # define LG_QUANTUM 3 # endif # ifdef __powerpc__ # define LG_QUANTUM 4 # endif -# ifdef __riscv__ +# if defined(__riscv) || defined(__riscv__) # define LG_QUANTUM 4 # endif # ifdef __s390__ # define LG_QUANTUM 4 # endif -# ifdef __SH4__ +# if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \ + defined(__SH4_SINGLE_ONLY__)) # define LG_QUANTUM 4 # endif # ifdef __tile__ diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in index 18539a0..f81f3a4 100644 --- a/include/jemalloc/internal/jemalloc_preamble.h.in +++ b/include/jemalloc/internal/jemalloc_preamble.h.in @@ -47,6 +47,10 @@ #endif #include "jemalloc/internal/hooks.h" +#ifdef JEMALLOC_DEFINE_MADVISE_FREE +# define JEMALLOC_MADV_FREE 8 +#endif + static const bool config_debug = #ifdef JEMALLOC_DEBUG true @@ -61,6 +65,13 @@ static const bool have_dss = false #endif ; +static const bool have_madvise_huge = +#ifdef JEMALLOC_HAVE_MADVISE_HUGE + true +#else + false +#endif + ; static const bool config_fill = #ifdef JEMALLOC_FILL true @@ -146,6 +157,17 @@ static const bool config_cache_oblivious = false #endif ; +/* + * Undocumented, for jemalloc development use only at the moment. See the note + * in jemalloc/internal/log.h. + */ +static const bool config_log = +#ifdef JEMALLOC_LOG + true +#else + false +#endif + ; #ifdef JEMALLOC_HAVE_SCHED_GETCPU /* Currently percpu_arena depends on sched_getcpu. */ #define JEMALLOC_PERCPU_ARENA diff --git a/include/jemalloc/internal/log.h b/include/jemalloc/internal/log.h new file mode 100644 index 0000000..6420858 --- /dev/null +++ b/include/jemalloc/internal/log.h @@ -0,0 +1,115 @@ +#ifndef JEMALLOC_INTERNAL_LOG_H +#define JEMALLOC_INTERNAL_LOG_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex.h" + +#ifdef JEMALLOC_LOG +# define JEMALLOC_LOG_VAR_BUFSIZE 1000 +#else +# define JEMALLOC_LOG_VAR_BUFSIZE 1 +#endif + +#define JEMALLOC_LOG_BUFSIZE 4096 + +/* + * The log malloc_conf option is a '|'-delimited list of log_var name segments + * which should be logged. The names are themselves hierarchical, with '.' as + * the delimiter (a "segment" is just a prefix in the log namespace). So, if + * you have: + * + * log("arena", "log msg for arena"); // 1 + * log("arena.a", "log msg for arena.a"); // 2 + * log("arena.b", "log msg for arena.b"); // 3 + * log("arena.a.a", "log msg for arena.a.a"); // 4 + * log("extent.a", "log msg for extent.a"); // 5 + * log("extent.b", "log msg for extent.b"); // 6 + * + * And your malloc_conf option is "log=arena.a|extent", then lines 2, 4, 5, and + * 6 will print at runtime. You can enable logging from all log vars by + * writing "log=.". + * + * None of this should be regarded as a stable API for right now. It's intended + * as a debugging interface, to let us keep around some of our printf-debugging + * statements. 
+ */ + +extern char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE]; +extern atomic_b_t log_init_done; + +typedef struct log_var_s log_var_t; +struct log_var_s { + /* + * Lowest bit is "inited", second lowest is "enabled". Putting them in + * a single word lets us avoid any fences on weak architectures. + */ + atomic_u_t state; + const char *name; +}; + +#define LOG_NOT_INITIALIZED 0U +#define LOG_INITIALIZED_NOT_ENABLED 1U +#define LOG_ENABLED 2U + +#define LOG_VAR_INIT(name_str) {ATOMIC_INIT(LOG_NOT_INITIALIZED), name_str} + +/* + * Returns the value we should assume for state (which is not necessarily + * accurate; if logging is done before logging has finished initializing, then + * we default to doing the safe thing by logging everything). + */ +unsigned log_var_update_state(log_var_t *log_var); + +/* We factor out the metadata management to allow us to test more easily. */ +#define log_do_begin(log_var) \ +if (config_log) { \ + unsigned log_state = atomic_load_u(&(log_var).state, \ + ATOMIC_RELAXED); \ + if (unlikely(log_state == LOG_NOT_INITIALIZED)) { \ + log_state = log_var_update_state(&(log_var)); \ + assert(log_state != LOG_NOT_INITIALIZED); \ + } \ + if (log_state == LOG_ENABLED) { \ + { + /* User code executes here. */ +#define log_do_end(log_var) \ + } \ + } \ +} + +/* + * MSVC has some preprocessor bugs in its expansion of __VA_ARGS__ during + * preprocessing. To work around this, we take all potential extra arguments in + * a var-args functions. Since a varargs macro needs at least one argument in + * the "...", we accept the format string there, and require that the first + * argument in this "..." is a const char *. + */ +static inline void +log_impl_varargs(const char *name, ...) { + char buf[JEMALLOC_LOG_BUFSIZE]; + va_list ap; + + va_start(ap, name); + const char *format = va_arg(ap, const char *); + size_t dst_offset = 0; + dst_offset += malloc_snprintf(buf, JEMALLOC_LOG_BUFSIZE, "%s: ", name); + dst_offset += malloc_vsnprintf(buf + dst_offset, + JEMALLOC_LOG_BUFSIZE - dst_offset, format, ap); + dst_offset += malloc_snprintf(buf + dst_offset, + JEMALLOC_LOG_BUFSIZE - dst_offset, "\n"); + va_end(ap); + + malloc_write(buf); +} + +/* Call as log("log.var.str", "format_string %d", arg_for_format_string); */ +#define LOG(log_var_str, ...) \ +do { \ + static log_var_t log_var = LOG_VAR_INIT(log_var_str); \ + log_do_begin(log_var) \ + log_impl_varargs((log_var).name, __VA_ARGS__); \ + log_do_end(log_var) \ +} while (0) + +#endif /* JEMALLOC_INTERNAL_LOG_H */ diff --git a/include/jemalloc/internal/malloc_io.h b/include/jemalloc/internal/malloc_io.h index 47ae58e..4992d1d 100644 --- a/include/jemalloc/internal/malloc_io.h +++ b/include/jemalloc/internal/malloc_io.h @@ -53,6 +53,10 @@ size_t malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap); size_t malloc_snprintf(char *str, size_t size, const char *format, ...) JEMALLOC_FORMAT_PRINTF(3, 4); +/* + * The caller can set write_cb and cbopaque to null to choose to print with the + * je_malloc_message hook. 
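Editorial note (not part of the patch): a hypothetical call site for the LOG() macro defined above; the log var name "core.example" and the surrounding function are made up for illustration. Per the comment, it is switched on at run time with the log malloc_conf option, e.g. log=core or log=core.example (or log=. for everything), and config_log keeps the whole thing constant-folded away unless JEMALLOC_LOG was defined at configure time.

#include "jemalloc/internal/log.h"

static void
example_point(unsigned arena_ind, size_t usize) {
	/* Writes "core.example: arena ...\n" only when the var is enabled. */
	LOG("core.example", "arena %u: allocated %zu bytes", arena_ind,
	    usize);
}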
+ */ void malloc_vcprintf(void (*write_cb)(void *, const char *), void *cbopaque, const char *format, va_list ap); void malloc_cprintf(void (*write_cb)(void *, const char *), void *cbopaque, diff --git a/include/jemalloc/internal/mutex_prof.h b/include/jemalloc/internal/mutex_prof.h index 3358bcf..735c0ad 100644 --- a/include/jemalloc/internal/mutex_prof.h +++ b/include/jemalloc/internal/mutex_prof.h @@ -35,21 +35,34 @@ typedef enum { mutex_prof_num_arena_mutexes } mutex_prof_arena_ind_t; -#define MUTEX_PROF_COUNTERS \ +#define MUTEX_PROF_UINT64_COUNTERS \ OP(num_ops, uint64_t) \ OP(num_wait, uint64_t) \ - OP(num_spin_acq, uint64_t) \ - OP(num_owner_switch, uint64_t) \ - OP(total_wait_time, uint64_t) \ - OP(max_wait_time, uint64_t) \ + OP(num_spin_acq, uint64_t) \ + OP(num_owner_switch, uint64_t) \ + OP(total_wait_time, uint64_t) \ + OP(max_wait_time, uint64_t) + +#define MUTEX_PROF_UINT32_COUNTERS \ OP(max_num_thds, uint32_t) -typedef enum { +#define MUTEX_PROF_COUNTERS \ + MUTEX_PROF_UINT64_COUNTERS \ + MUTEX_PROF_UINT32_COUNTERS + #define OP(counter, type) mutex_counter_##counter, - MUTEX_PROF_COUNTERS + +#define COUNTER_ENUM(counter_list, t) \ + typedef enum { \ + counter_list \ + mutex_prof_num_##t##_counters \ + } mutex_prof_##t##_counter_ind_t; + +COUNTER_ENUM(MUTEX_PROF_UINT64_COUNTERS, uint64_t) +COUNTER_ENUM(MUTEX_PROF_UINT32_COUNTERS, uint32_t) + +#undef COUNTER_ENUM #undef OP - mutex_prof_num_counters -} mutex_prof_counter_ind_t; typedef struct { /* diff --git a/include/jemalloc/internal/pages.h b/include/jemalloc/internal/pages.h index 28383b7..dff2051 100644 --- a/include/jemalloc/internal/pages.h +++ b/include/jemalloc/internal/pages.h @@ -58,6 +58,9 @@ static const bool pages_can_purge_forced = #endif ; +/* Whether transparent huge page state is "madvise". */ +extern bool thp_state_madvise; + void *pages_map(void *addr, size_t size, size_t alignment, bool *commit); void pages_unmap(void *addr, size_t size); bool pages_commit(void *addr, size_t size); @@ -66,6 +69,8 @@ bool pages_purge_lazy(void *addr, size_t size); bool pages_purge_forced(void *addr, size_t size); bool pages_huge(void *addr, size_t size); bool pages_nohuge(void *addr, size_t size); +bool pages_dontdump(void *addr, size_t size); +bool pages_dodump(void *addr, size_t size); bool pages_boot(void); #endif /* JEMALLOC_INTERNAL_PAGES_EXTERNS_H */ diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index b5d4db3..4563db2 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -178,9 +178,21 @@ rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, JEMALLOC_ALWAYS_INLINE extent_t * rtree_leaf_elm_bits_extent_get(uintptr_t bits) { +# ifdef __aarch64__ + /* + * aarch64 doesn't sign extend the highest virtual address bit to set + * the higher ones. Instead, the high bits gets zeroed. + */ + uintptr_t high_bit_mask = ((uintptr_t)1 << LG_VADDR) - 1; + /* Mask off the slab bit. */ + uintptr_t low_bit_mask = ~(uintptr_t)1; + uintptr_t mask = high_bit_mask & low_bit_mask; + return (extent_t *)(bits & mask); +# else /* Restore sign-extended high bits, mask slab bit. 
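Editorial note (not part of the patch): for readers tracing the mutex_prof.h x-macro rework above, expanding COUNTER_ENUM(MUTEX_PROF_UINT64_COUNTERS, uint64_t) with the OP() definition shown in that hunk produces, modulo whitespace:

typedef enum {
	mutex_counter_num_ops,
	mutex_counter_num_wait,
	mutex_counter_num_spin_acq,
	mutex_counter_num_owner_switch,
	mutex_counter_total_wait_time,
	mutex_counter_max_wait_time,
	mutex_prof_num_uint64_t_counters
} mutex_prof_uint64_t_counter_ind_t;

and likewise for the uint32_t list, so the existing mutex_counter_* names keep working while each counter type gets its own count of counters.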
*/ return (extent_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB) >> RTREE_NHIB) & ~((uintptr_t)0x1)); +# endif } JEMALLOC_ALWAYS_INLINE szind_t diff --git a/include/jemalloc/internal/rtree_tsd.h b/include/jemalloc/internal/rtree_tsd.h index 3cdc862..93a7517 100644 --- a/include/jemalloc/internal/rtree_tsd.h +++ b/include/jemalloc/internal/rtree_tsd.h @@ -26,7 +26,7 @@ * Zero initializer required for tsd initialization only. Proper initialization * done via rtree_ctx_data_init(). */ -#define RTREE_CTX_ZERO_INITIALIZER {{{0}}} +#define RTREE_CTX_ZERO_INITIALIZER {{{0}}, {{0}}} typedef struct rtree_leaf_elm_s rtree_leaf_elm_t; diff --git a/include/jemalloc/internal/spin.h b/include/jemalloc/internal/spin.h index e2afc98..22804c6 100644 --- a/include/jemalloc/internal/spin.h +++ b/include/jemalloc/internal/spin.h @@ -1,25 +1,29 @@ #ifndef JEMALLOC_INTERNAL_SPIN_H #define JEMALLOC_INTERNAL_SPIN_H -#ifdef JEMALLOC_SPIN_C_ -# define SPIN_INLINE extern inline -#else -# define SPIN_INLINE inline -#endif - #define SPIN_INITIALIZER {0U} typedef struct { unsigned iteration; } spin_t; -SPIN_INLINE void +static inline void +spin_cpu_spinwait() { +# if HAVE_CPU_SPINWAIT + CPU_SPINWAIT; +# else + volatile int x = 0; + x = x; +# endif +} + +static inline void spin_adaptive(spin_t *spin) { volatile uint32_t i; if (spin->iteration < 5) { for (i = 0; i < (1U << spin->iteration); i++) { - CPU_SPINWAIT; + spin_cpu_spinwait(); } spin->iteration++; } else { diff --git a/include/jemalloc/internal/stats.h b/include/jemalloc/internal/stats.h index 1198779..852e342 100644 --- a/include/jemalloc/internal/stats.h +++ b/include/jemalloc/internal/stats.h @@ -1,12 +1,6 @@ #ifndef JEMALLOC_INTERNAL_STATS_H #define JEMALLOC_INTERNAL_STATS_H -#include "jemalloc/internal/atomic.h" -#include "jemalloc/internal/mutex_prof.h" -#include "jemalloc/internal/mutex.h" -#include "jemalloc/internal/size_classes.h" -#include "jemalloc/internal/stats_tsd.h" - /* OPTION(opt, var_name, default, set_value_to) */ #define STATS_PRINT_OPTIONS \ OPTION('J', json, false, true) \ @@ -33,132 +27,4 @@ extern char opt_stats_print_opts[stats_print_tot_num_options+1]; void stats_print(void (*write_cb)(void *, const char *), void *cbopaque, const char *opts); -/* - * In those architectures that support 64-bit atomics, we use atomic updates for - * our 64-bit values. Otherwise, we use a plain uint64_t and synchronize - * externally. - */ -#ifdef JEMALLOC_ATOMIC_U64 -typedef atomic_u64_t arena_stats_u64_t; -#else -/* Must hold the arena stats mutex while reading atomically. */ -typedef uint64_t arena_stats_u64_t; -#endif - -typedef struct malloc_bin_stats_s { - /* - * Total number of allocation/deallocation requests served directly by - * the bin. Note that tcache may allocate an object, then recycle it - * many times, resulting many increments to nrequests, but only one - * each to nmalloc and ndalloc. - */ - uint64_t nmalloc; - uint64_t ndalloc; - - /* - * Number of allocation requests that correspond to the size of this - * bin. This includes requests served by tcache, though tcache only - * periodically merges into this counter. - */ - uint64_t nrequests; - - /* - * Current number of regions of this size class, including regions - * currently cached by tcache. - */ - size_t curregs; - - /* Number of tcache fills from this bin. */ - uint64_t nfills; - - /* Number of tcache flushes to this bin. */ - uint64_t nflushes; - - /* Total number of slabs created for this bin's size class. 
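/*
 * Worked example for the aarch64 branch in the rtree.h hunk above (LG_VADDR
 * is configure-detected; 48 is only an assumption for illustration):
 *
 *     high_bit_mask == 0x0000ffffffffffff
 *     low_bit_mask  == 0xfffffffffffffffe
 *     mask          == 0x0000fffffffffffe
 *
 * so "bits & mask" clears both the slab bit (bit 0) and the metadata packed
 * above bit LG_VADDR-1, without the sign-extending shift pair used on other
 * architectures.  That pair would wrongly set the high bits on aarch64,
 * where the hardware zeroes rather than sign-extends them.
 */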
*/ - uint64_t nslabs; - - /* - * Total number of slabs reused by extracting them from the slabs heap - * for this bin's size class. - */ - uint64_t reslabs; - - /* Current number of slabs in this bin. */ - size_t curslabs; - - mutex_prof_data_t mutex_data; -} malloc_bin_stats_t; - -typedef struct malloc_large_stats_s { - /* - * Total number of allocation/deallocation requests served directly by - * the arena. - */ - arena_stats_u64_t nmalloc; - arena_stats_u64_t ndalloc; - - /* - * Number of allocation requests that correspond to this size class. - * This includes requests served by tcache, though tcache only - * periodically merges into this counter. - */ - arena_stats_u64_t nrequests; /* Partially derived. */ - - /* Current number of allocations of this size class. */ - size_t curlextents; /* Derived. */ -} malloc_large_stats_t; - -typedef struct decay_stats_s { - /* Total number of purge sweeps. */ - arena_stats_u64_t npurge; - /* Total number of madvise calls made. */ - arena_stats_u64_t nmadvise; - /* Total number of pages purged. */ - arena_stats_u64_t purged; -} decay_stats_t; - -/* - * Arena stats. Note that fields marked "derived" are not directly maintained - * within the arena code; rather their values are derived during stats merge - * requests. - */ -typedef struct arena_stats_s { -#ifndef JEMALLOC_ATOMIC_U64 - malloc_mutex_t mtx; -#endif - - /* Number of bytes currently mapped, excluding retained memory. */ - atomic_zu_t mapped; /* Partially derived. */ - - /* - * Number of unused virtual memory bytes currently retained. Retained - * bytes are technically mapped (though always decommitted or purged), - * but they are excluded from the mapped statistic (above). - */ - atomic_zu_t retained; /* Derived. */ - - decay_stats_t decay_dirty; - decay_stats_t decay_muzzy; - - atomic_zu_t base; /* Derived. */ - atomic_zu_t internal; - atomic_zu_t resident; /* Derived. */ - - atomic_zu_t allocated_large; /* Derived. */ - arena_stats_u64_t nmalloc_large; /* Derived. */ - arena_stats_u64_t ndalloc_large; /* Derived. */ - arena_stats_u64_t nrequests_large; /* Derived. */ - - /* Number of bytes cached in tcache associated with this arena. */ - atomic_zu_t tcache_bytes; /* Derived. */ - - mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]; - - /* One element for each large size class. */ - malloc_large_stats_t lstats[NSIZES - NBINS]; - - /* Arena uptime. */ - nstime_t uptime; -} arena_stats_t; - #endif /* JEMALLOC_INTERNAL_STATS_H */ diff --git a/include/jemalloc/internal/stats_tsd.h b/include/jemalloc/internal/stats_tsd.h deleted file mode 100644 index d0c3bbe..0000000 --- a/include/jemalloc/internal/stats_tsd.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef JEMALLOC_INTERNAL_STATS_TSD_H -#define JEMALLOC_INTERNAL_STATS_TSD_H - -typedef struct tcache_bin_stats_s { - /* - * Number of allocation requests that corresponded to the size of this - * bin. - */ - uint64_t nrequests; -} tcache_bin_stats_t; - -#endif /* JEMALLOC_INTERNAL_STATS_TSD_H */ diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h index 7f640d5..9794628 100644 --- a/include/jemalloc/internal/sz.h +++ b/include/jemalloc/internal/sz.h @@ -61,7 +61,7 @@ sz_psz2ind(size_t psz) { pszind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ? 
LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1; - size_t delta_inverse_mask = ZD(-1) << lg_delta; + size_t delta_inverse_mask = ZU(-1) << lg_delta; pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) & ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); @@ -142,7 +142,7 @@ sz_size2index_compute(size_t size) { szind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; - size_t delta_inverse_mask = ZD(-1) << lg_delta; + size_t delta_inverse_mask = ZU(-1) << lg_delta; szind_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); diff --git a/include/jemalloc/internal/tcache_externs.h b/include/jemalloc/internal/tcache_externs.h index db3e9c7..790367b 100644 --- a/include/jemalloc/internal/tcache_externs.h +++ b/include/jemalloc/internal/tcache_externs.h @@ -6,7 +6,7 @@ extern bool opt_tcache; extern ssize_t opt_lg_tcache_max; -extern tcache_bin_info_t *tcache_bin_info; +extern cache_bin_info_t *tcache_bin_info; /* * Number of tcache bins. There are NBINS small-object bins, plus 0 or more @@ -30,10 +30,10 @@ extern tcaches_t *tcaches; size_t tcache_salloc(tsdn_t *tsdn, const void *ptr); void tcache_event_hard(tsd_t *tsd, tcache_t *tcache); void *tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, - tcache_bin_t *tbin, szind_t binind, bool *tcache_success); -void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin, + cache_bin_t *tbin, szind_t binind, bool *tcache_success); +void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t binind, unsigned rem); -void tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind, +void tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, unsigned rem, tcache_t *tcache); void tcache_arena_reassociate(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena); diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h index c55bcd2..0a6feb5 100644 --- a/include/jemalloc/internal/tcache_inlines.h +++ b/include/jemalloc/internal/tcache_inlines.h @@ -1,6 +1,7 @@ #ifndef JEMALLOC_INTERNAL_TCACHE_INLINES_H #define JEMALLOC_INTERNAL_TCACHE_INLINES_H +#include "jemalloc/internal/bin.h" #include "jemalloc/internal/jemalloc_internal_types.h" #include "jemalloc/internal/size_classes.h" #include "jemalloc/internal/sz.h" @@ -38,43 +39,16 @@ tcache_event(tsd_t *tsd, tcache_t *tcache) { } JEMALLOC_ALWAYS_INLINE void * -tcache_alloc_easy(tcache_bin_t *tbin, bool *tcache_success) { - void *ret; - - if (unlikely(tbin->ncached == 0)) { - tbin->low_water = -1; - *tcache_success = false; - return NULL; - } - /* - * tcache_success (instead of ret) should be checked upon the return of - * this function. We avoid checking (ret == NULL) because there is - * never a null stored on the avail stack (which is unknown to the - * compiler), and eagerly checking ret would cause pipeline stall - * (waiting for the cacheline). 
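/*
 * Sketch of the sz.h change above (the concrete lg_delta value is
 * illustrative): with lg_delta == 4 and a 64-bit size_t,
 *
 *     ZU(-1) << 4 == 0xfffffffffffffff0
 *
 * i.e. the intended "clear the low lg_delta bits" mask.  The previous ZD(-1)
 * spelling left-shifted a negative signed value, which is undefined behavior
 * in C even though it typically produced the same bit pattern.
 */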
- */ - *tcache_success = true; - ret = *(tbin->avail - tbin->ncached); - tbin->ncached--; - - if (unlikely((low_water_t)tbin->ncached < tbin->low_water)) { - tbin->low_water = tbin->ncached; - } - - return ret; -} - -JEMALLOC_ALWAYS_INLINE void * tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, szind_t binind, bool zero, bool slow_path) { void *ret; - tcache_bin_t *tbin; + cache_bin_t *bin; bool tcache_success; size_t usize JEMALLOC_CC_SILENCE_INIT(0); assert(binind < NBINS); - tbin = tcache_small_bin_get(tcache, binind); - ret = tcache_alloc_easy(tbin, &tcache_success); + bin = tcache_small_bin_get(tcache, binind); + ret = cache_bin_alloc_easy(bin, &tcache_success); assert(tcache_success == (ret != NULL)); if (unlikely(!tcache_success)) { bool tcache_hard_success; @@ -84,7 +58,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, } ret = tcache_alloc_small_hard(tsd_tsdn(tsd), arena, tcache, - tbin, binind, &tcache_hard_success); + bin, binind, &tcache_hard_success); if (tcache_hard_success == false) { return NULL; } @@ -103,22 +77,21 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, if (likely(!zero)) { if (slow_path && config_fill) { if (unlikely(opt_junk_alloc)) { - arena_alloc_junk_small(ret, - &arena_bin_info[binind], false); + arena_alloc_junk_small(ret, &bin_infos[binind], + false); } else if (unlikely(opt_zero)) { memset(ret, 0, usize); } } } else { if (slow_path && config_fill && unlikely(opt_junk_alloc)) { - arena_alloc_junk_small(ret, &arena_bin_info[binind], - true); + arena_alloc_junk_small(ret, &bin_infos[binind], true); } memset(ret, 0, usize); } if (config_stats) { - tbin->tstats.nrequests++; + bin->tstats.nrequests++; } if (config_prof) { tcache->prof_accumbytes += usize; @@ -131,12 +104,12 @@ JEMALLOC_ALWAYS_INLINE void * tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, szind_t binind, bool zero, bool slow_path) { void *ret; - tcache_bin_t *tbin; + cache_bin_t *bin; bool tcache_success; assert(binind >= NBINS &&binind < nhbins); - tbin = tcache_large_bin_get(tcache, binind); - ret = tcache_alloc_easy(tbin, &tcache_success); + bin = tcache_large_bin_get(tcache, binind); + ret = cache_bin_alloc_easy(bin, &tcache_success); assert(tcache_success == (ret != NULL)); if (unlikely(!tcache_success)) { /* @@ -176,7 +149,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, } if (config_stats) { - tbin->tstats.nrequests++; + bin->tstats.nrequests++; } if (config_prof) { tcache->prof_accumbytes += usize; @@ -190,24 +163,24 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, JEMALLOC_ALWAYS_INLINE void tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, bool slow_path) { - tcache_bin_t *tbin; - tcache_bin_info_t *tbin_info; + cache_bin_t *bin; + cache_bin_info_t *bin_info; assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SMALL_MAXCLASS); if (slow_path && config_fill && unlikely(opt_junk_free)) { - arena_dalloc_junk_small(ptr, &arena_bin_info[binind]); + arena_dalloc_junk_small(ptr, &bin_infos[binind]); } - tbin = tcache_small_bin_get(tcache, binind); - tbin_info = &tcache_bin_info[binind]; - if (unlikely(tbin->ncached == tbin_info->ncached_max)) { - tcache_bin_flush_small(tsd, tcache, tbin, binind, - (tbin_info->ncached_max >> 1)); + bin = tcache_small_bin_get(tcache, binind); + bin_info = &tcache_bin_info[binind]; + if (unlikely(bin->ncached == bin_info->ncached_max)) { + 
tcache_bin_flush_small(tsd, tcache, bin, binind, + (bin_info->ncached_max >> 1)); } - assert(tbin->ncached < tbin_info->ncached_max); - tbin->ncached++; - *(tbin->avail - tbin->ncached) = ptr; + assert(bin->ncached < bin_info->ncached_max); + bin->ncached++; + *(bin->avail - bin->ncached) = ptr; tcache_event(tsd, tcache); } @@ -215,8 +188,8 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, JEMALLOC_ALWAYS_INLINE void tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, bool slow_path) { - tcache_bin_t *tbin; - tcache_bin_info_t *tbin_info; + cache_bin_t *bin; + cache_bin_info_t *bin_info; assert(tcache_salloc(tsd_tsdn(tsd), ptr) > SMALL_MAXCLASS); assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= tcache_maxclass); @@ -225,15 +198,15 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, large_dalloc_junk(ptr, sz_index2size(binind)); } - tbin = tcache_large_bin_get(tcache, binind); - tbin_info = &tcache_bin_info[binind]; - if (unlikely(tbin->ncached == tbin_info->ncached_max)) { - tcache_bin_flush_large(tsd, tbin, binind, - (tbin_info->ncached_max >> 1), tcache); + bin = tcache_large_bin_get(tcache, binind); + bin_info = &tcache_bin_info[binind]; + if (unlikely(bin->ncached == bin_info->ncached_max)) { + tcache_bin_flush_large(tsd, bin, binind, + (bin_info->ncached_max >> 1), tcache); } - assert(tbin->ncached < tbin_info->ncached_max); - tbin->ncached++; - *(tbin->avail - tbin->ncached) = ptr; + assert(bin->ncached < bin_info->ncached_max); + bin->ncached++; + *(bin->avail - bin->ncached) = ptr; tcache_event(tsd, tcache); } diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h index 7eb516f..07b7387 100644 --- a/include/jemalloc/internal/tcache_structs.h +++ b/include/jemalloc/internal/tcache_structs.h @@ -3,54 +3,51 @@ #include "jemalloc/internal/ql.h" #include "jemalloc/internal/size_classes.h" -#include "jemalloc/internal/stats_tsd.h" +#include "jemalloc/internal/cache_bin.h" #include "jemalloc/internal/ticker.h" -/* - * Read-only information associated with each element of tcache_t's tbins array - * is stored separately, mainly to reduce memory usage. - */ -struct tcache_bin_info_s { - unsigned ncached_max; /* Upper limit on ncached. */ -}; - -struct tcache_bin_s { - low_water_t low_water; /* Min # cached since last GC. */ - uint32_t ncached; /* # of cached objects. */ +struct tcache_s { /* - * ncached and stats are both modified frequently. Let's keep them - * close so that they have a higher chance of being on the same - * cacheline, thus less write-backs. + * To minimize our cache-footprint, we put the frequently accessed data + * together at the start of this struct. */ - tcache_bin_stats_t tstats; + + /* Cleared after arena_prof_accum(). */ + uint64_t prof_accumbytes; + /* Drives incremental GC. */ + ticker_t gc_ticker; /* - * To make use of adjacent cacheline prefetch, the items in the avail - * stack goes to higher address for newer allocations. avail points - * just above the available space, which means that - * avail[-ncached, ... -1] are available items and the lowest item will - * be allocated first. + * The pointer stacks associated with bins follow as a contiguous array. + * During tcache initialization, the avail pointer in each element of + * tbins is initialized to point to the proper offset within this array. */ - void **avail; /* Stack of available objects. 
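/*
 * Layout sketch for the avail stack described above (unchanged in spirit by
 * the rename to cache_bin_t); with ncached == 3:
 *
 *     ... | avail[-3] | avail[-2] | avail[-1] |  <- avail points here
 *           ^ next item popped
 *
 * A pop (cache_bin_alloc_easy()) returns *(bin->avail - bin->ncached) and
 * then decrements ncached; a push (tcache_dalloc_small()/_large() above)
 * increments ncached and stores into the same expression.  The most recently
 * freed object is therefore reused first, and the stack grows toward lower
 * addresses as the cache fills.
 */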
*/ -}; + cache_bin_t bins_small[NBINS]; -struct tcache_s { - /* Data accessed frequently first: prof, ticker and small bins. */ - uint64_t prof_accumbytes;/* Cleared after arena_prof_accum(). */ - ticker_t gc_ticker; /* Drives incremental GC. */ /* - * The pointer stacks associated with tbins follow as a contiguous - * array. During tcache initialization, the avail pointer in each - * element of tbins is initialized to point to the proper offset within - * this array. + * This data is less hot; we can be a little less careful with our + * footprint here. */ - tcache_bin_t tbins_small[NBINS]; - /* Data accessed less often below. */ - ql_elm(tcache_t) link; /* Used for aggregating stats. */ - arena_t *arena; /* Associated arena. */ - szind_t next_gc_bin; /* Next bin to GC. */ + /* Lets us track all the tcaches in an arena. */ + ql_elm(tcache_t) link; + /* + * The descriptor lets the arena find our cache bins without seeing the + * tcache definition. This enables arenas to aggregate stats across + * tcaches without having a tcache dependency. + */ + cache_bin_array_descriptor_t cache_bin_array_descriptor; + + /* The arena this tcache is associated with. */ + arena_t *arena; + /* Next bin to GC. */ + szind_t next_gc_bin; /* For small bins, fill (ncached_max >> lg_fill_div). */ uint8_t lg_fill_div[NBINS]; - tcache_bin_t tbins_large[NSIZES-NBINS]; + /* + * We put the cache bins for large size classes at the end of the + * struct, since some of them might not get used. This might end up + * letting us avoid touching an extra page if we don't have to. + */ + cache_bin_t bins_large[NSIZES-NBINS]; }; /* Linkage for list of available (previously used) explicit tcache IDs. */ diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h index 1155d62..e49bc9d 100644 --- a/include/jemalloc/internal/tcache_types.h +++ b/include/jemalloc/internal/tcache_types.h @@ -3,14 +3,9 @@ #include "jemalloc/internal/size_classes.h" -typedef struct tcache_bin_info_s tcache_bin_info_t; -typedef struct tcache_bin_s tcache_bin_t; typedef struct tcache_s tcache_t; typedef struct tcaches_s tcaches_t; -/* ncached is cast to this type for comparison. 
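/*
 * The cache_bin_array_descriptor referenced in the tcache_structs.h hunk
 * above is defined in the new cache_bin.h, which is not part of this hunk;
 * the idea, sketched here with assumed field names, is a small struct that
 * each tcache registers with its arena:
 *
 *     struct cache_bin_array_descriptor_s {
 *         ql_elm(cache_bin_array_descriptor_t) link;
 *         cache_bin_t *bins_small;
 *         cache_bin_t *bins_large;
 *     };
 *
 * The arena keeps a list of these descriptors, so stats merging can walk
 * every thread's cache bins without a dependency on tcache_structs.h.
 */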
*/ -typedef int32_t low_water_t; - /* * tcache pointers close to NULL are used to encode state information that is * used for two purposes: preventing thread caching on a per thread basis and diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 155a2ec..0b9841a 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -65,6 +65,7 @@ typedef void (*test_callback_t)(int *); O(arenas_tdata_bypass, bool, bool) \ O(reentrancy_level, int8_t, int8_t) \ O(narenas_tdata, uint32_t, uint32_t) \ + O(offset_state, uint64_t, uint64_t) \ O(thread_allocated, uint64_t, uint64_t) \ O(thread_deallocated, uint64_t, uint64_t) \ O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ @@ -84,6 +85,7 @@ typedef void (*test_callback_t)(int *); 0, \ 0, \ 0, \ + 0, \ NULL, \ RTREE_CTX_ZERO_INITIALIZER, \ NULL, \ diff --git a/include/jemalloc/internal/witness.h b/include/jemalloc/internal/witness.h index 33be666..7ace8ae 100644 --- a/include/jemalloc/internal/witness.h +++ b/include/jemalloc/internal/witness.h @@ -51,7 +51,7 @@ #define WITNESS_RANK_ARENA_LARGE 19U #define WITNESS_RANK_LEAF 0xffffffffU -#define WITNESS_RANK_ARENA_BIN WITNESS_RANK_LEAF +#define WITNESS_RANK_BIN WITNESS_RANK_LEAF #define WITNESS_RANK_ARENA_STATS WITNESS_RANK_LEAF #define WITNESS_RANK_DSS WITNESS_RANK_LEAF #define WITNESS_RANK_PROF_ACTIVE WITNESS_RANK_LEAF diff --git a/include/jemalloc/jemalloc_mangle.sh b/include/jemalloc/jemalloc_mangle.sh index df328b7..c675bb4 100755 --- a/include/jemalloc/jemalloc_mangle.sh +++ b/include/jemalloc/jemalloc_mangle.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/sh -eu public_symbols_txt=$1 symbol_prefix=$2