From e8a63b87c36ac814272d73b503658431d2000055 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Wed, 9 May 2018 15:03:38 -0700 Subject: Fix an incorrect assertion. When configured with --with-lg-page, it's possible for the configured page size to be greater than the system page size, in which case the page address may only be aligned with the system page size. --- src/pages.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pages.c b/src/pages.c index 2600269..cc967fc 100644 --- a/src/pages.c +++ b/src/pages.c @@ -261,7 +261,7 @@ pages_decommit(void *addr, size_t size) { bool pages_purge_lazy(void *addr, size_t size) { - assert(PAGE_ADDR2BASE(addr) == addr); + assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr); assert(PAGE_CEILING(size) == size); if (!pages_can_purge_lazy) { -- cgit v0.12 From 312352faa89a39ff1e690d709d7d6f852f89d61d Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 11 May 2018 16:32:29 -0700 Subject: Fix background thread index issues with max_background_threads. --- include/jemalloc/internal/background_thread_inlines.h | 7 ++++++- src/ctl.c | 6 ++---- test/unit/arena_reset.c | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/jemalloc/internal/background_thread_inlines.h b/include/jemalloc/internal/background_thread_inlines.h index ef50231..f85e86f 100644 --- a/include/jemalloc/internal/background_thread_inlines.h +++ b/include/jemalloc/internal/background_thread_inlines.h @@ -15,7 +15,12 @@ background_thread_enabled_set(tsdn_t *tsdn, bool state) { JEMALLOC_ALWAYS_INLINE background_thread_info_t * arena_background_thread_info_get(arena_t *arena) { unsigned arena_ind = arena_ind_get(arena); - return &background_thread_info[arena_ind % ncpus]; + return &background_thread_info[arena_ind % max_background_threads]; +} + +JEMALLOC_ALWAYS_INLINE background_thread_info_t * +background_thread_info_get(size_t ind) { + return &background_thread_info[ind % max_background_threads]; } JEMALLOC_ALWAYS_INLINE uint64_t diff --git a/src/ctl.c b/src/ctl.c index 1e713a3..0eb8de1 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -2044,9 +2044,8 @@ arena_reset_prepare_background_thread(tsd_t *tsd, unsigned arena_ind) { if (have_background_thread) { malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); if (background_thread_enabled()) { - unsigned ind = arena_ind % ncpus; background_thread_info_t *info = - &background_thread_info[ind]; + background_thread_info_get(arena_ind); assert(info->state == background_thread_started); malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); info->state = background_thread_paused; @@ -2059,9 +2058,8 @@ static void arena_reset_finish_background_thread(tsd_t *tsd, unsigned arena_ind) { if (have_background_thread) { if (background_thread_enabled()) { - unsigned ind = arena_ind % ncpus; background_thread_info_t *info = - &background_thread_info[ind]; + background_thread_info_get(arena_ind); assert(info->state == background_thread_paused); malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); info->state = background_thread_started; diff --git a/test/unit/arena_reset.c b/test/unit/arena_reset.c index f5fb24d..c1ccb09 100644 --- a/test/unit/arena_reset.c +++ b/test/unit/arena_reset.c @@ -142,7 +142,7 @@ do_arena_reset_post(void **ptrs, unsigned nptrs, unsigned arena_ind) { if (have_background_thread) { malloc_mutex_lock(tsdn, - &background_thread_info[arena_ind % ncpus].mtx); + &background_thread_info_get(arena_ind)->mtx); } /* Verify allocations no longer exist. 
 */
	for (i = 0; i < nptrs; i++) {
@@ -151,7 +151,7 @@ do_arena_reset_post(void **ptrs, unsigned nptrs, unsigned arena_ind) {
 	}
 	if (have_background_thread) {
 		malloc_mutex_unlock(tsdn,
-		    &background_thread_info[arena_ind % ncpus].mtx);
+		    &background_thread_info_get(arena_ind)->mtx);
 	}

 	free(ptrs);
--
cgit v0.12


From b293a3eb86a32b9c242ac39d88312c0a9d317b8b Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Wed, 9 May 2018 16:17:37 -0700
Subject: Fix the max_background_thread test.

We may set the number of background threads separately, e.g. through
--with-malloc-conf, so avoid assuming the default number in the test.
---
 test/unit/background_thread_enable.c | 52 +++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/test/unit/background_thread_enable.c b/test/unit/background_thread_enable.c
index ff95e67..d894e93 100644
--- a/test/unit/background_thread_enable.c
+++ b/test/unit/background_thread_enable.c
@@ -33,20 +33,19 @@ TEST_END

 TEST_BEGIN(test_max_background_threads) {
 	test_skip_if(!have_background_thread);

-	size_t maxt;
-	size_t opt_maxt;
-	size_t sz_m = sizeof(maxt);
+	size_t max_n_thds;
+	size_t opt_max_n_thds;
+	size_t sz_m = sizeof(max_n_thds);
 	assert_d_eq(mallctl("opt.max_background_threads",
-	    &opt_maxt, &sz_m, NULL, 0), 0,
-	    "Failed to get opt.max_background_threads");
-	assert_d_eq(mallctl("max_background_threads", &maxt, &sz_m, NULL, 0), 0,
-	    "Failed to get max background threads");
-	assert_zu_eq(20, maxt, "should be ncpus");
-	assert_zu_eq(opt_maxt, maxt,
-	    "max_background_threads and "
-	    "opt.max_background_threads should match");
-	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &maxt, sz_m),
-	    0, "Failed to set max background threads");
+	    &opt_max_n_thds, &sz_m, NULL, 0), 0,
+	    "Failed to get opt.max_background_threads");
+	assert_d_eq(mallctl("max_background_threads", &max_n_thds, &sz_m, NULL,
+	    0), 0, "Failed to get max background threads");
+	assert_zu_eq(opt_max_n_thds, max_n_thds,
+	    "max_background_threads and "
+	    "opt.max_background_threads should match");
+	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &max_n_thds,
+	    sz_m), 0, "Failed to set max background threads");

 	unsigned id;
 	size_t sz_u = sizeof(unsigned);
@@ -60,18 +59,21 @@ TEST_BEGIN(test_max_background_threads) {
 	size_t sz_b = sizeof(bool);
 	assert_d_eq(mallctl("background_thread", NULL, NULL, &enable, sz_b), 0,
 	    "Failed to enable background threads");
-	assert_zu_eq(n_background_threads, maxt,
-	    "Number of background threads should be 3.\n");
-	maxt = 10;
-	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &maxt, sz_m),
-	    0, "Failed to set max background threads");
-	assert_zu_eq(n_background_threads, maxt,
-	    "Number of background threads should be 10.\n");
-	maxt = 3;
-	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &maxt, sz_m),
-	    0, "Failed to set max background threads");
-	assert_zu_eq(n_background_threads, maxt,
-	    "Number of background threads should be 3.\n");
+	assert_zu_eq(n_background_threads, max_n_thds,
+	    "Number of background threads should not change.\n");
+	size_t new_max_thds = max_n_thds - 1;
+	if (new_max_thds > 0) {
+		assert_d_eq(mallctl("max_background_threads", NULL, NULL,
+		    &new_max_thds, sz_m), 0,
+		    "Failed to set max background threads");
+		assert_zu_eq(n_background_threads, new_max_thds,
+		    "Number of background threads should decrease by 1.\n");
+	}
+	new_max_thds = 1;
+	assert_d_eq(mallctl("max_background_threads", NULL, NULL, &new_max_thds,
+	    sz_m), 0, "Failed to set max background threads");
+	assert_zu_eq(n_background_threads, new_max_thds,
+	    "Number of background threads should be 1.\n");
 }
 TEST_END
--
cgit v0.12


From 09edea3f5c98dae3f298b7ac9f5adad13e528bc9 Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Thu, 17 May 2018 10:53:54 -0700
Subject: Tweak the format of the per arena summary section.

Increase the width to ensure enough space for long-running programs.
---
 src/stats.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/stats.c b/src/stats.c
index 08b9507..7411745 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -696,35 +696,35 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
 	emitter_col_t alloc_count_title;
 	emitter_col_init(&alloc_count_title, &alloc_count_row);
 	alloc_count_title.justify = emitter_justify_left;
-	alloc_count_title.width = 25;
+	alloc_count_title.width = 21;
 	alloc_count_title.type = emitter_type_title;
 	alloc_count_title.str_val = "";

 	emitter_col_t alloc_count_allocated;
 	emitter_col_init(&alloc_count_allocated, &alloc_count_row);
 	alloc_count_allocated.justify = emitter_justify_right;
-	alloc_count_allocated.width = 12;
+	alloc_count_allocated.width = 16;
 	alloc_count_allocated.type = emitter_type_title;
 	alloc_count_allocated.str_val = "allocated";

 	emitter_col_t alloc_count_nmalloc;
 	emitter_col_init(&alloc_count_nmalloc, &alloc_count_row);
 	alloc_count_nmalloc.justify = emitter_justify_right;
-	alloc_count_nmalloc.width = 12;
+	alloc_count_nmalloc.width = 16;
 	alloc_count_nmalloc.type = emitter_type_title;
 	alloc_count_nmalloc.str_val = "nmalloc";

 	emitter_col_t alloc_count_ndalloc;
 	emitter_col_init(&alloc_count_ndalloc, &alloc_count_row);
 	alloc_count_ndalloc.justify = emitter_justify_right;
-	alloc_count_ndalloc.width = 12;
+	alloc_count_ndalloc.width = 16;
 	alloc_count_ndalloc.type = emitter_type_title;
 	alloc_count_ndalloc.str_val = "ndalloc";

 	emitter_col_t alloc_count_nrequests;
 	emitter_col_init(&alloc_count_nrequests, &alloc_count_row);
 	alloc_count_nrequests.justify = emitter_justify_right;
-	alloc_count_nrequests.width = 12;
+	alloc_count_nrequests.width = 16;
 	alloc_count_nrequests.type = emitter_type_title;
 	alloc_count_nrequests.str_val = "nrequests";

@@ -776,14 +776,14 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
 	emitter_col_t mem_count_title;
 	emitter_col_init(&mem_count_title, &mem_count_row);
 	mem_count_title.justify = emitter_justify_left;
-	mem_count_title.width = 25;
+	mem_count_title.width = 21;
 	mem_count_title.type = emitter_type_title;
 	mem_count_title.str_val = "";

 	emitter_col_t mem_count_val;
 	emitter_col_init(&mem_count_val, &mem_count_row);
 	mem_count_val.justify = emitter_justify_right;
-	mem_count_val.width = 12;
+	mem_count_val.width = 16;
 	mem_count_val.type = emitter_type_title;
 	mem_count_val.str_val = "";
--
cgit v0.12


From e74a1a37c82fa3a44cee1002d9d8957bcc8274a7 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Thu, 8 Mar 2018 16:15:00 -0800
Subject: Atomics: Add atomic_u8_t, force-inline operations.

We're about to need an atomic uint8_t for state operations.
Unfortunately, we're at the point where things won't get inlined into
the key methods unless they're force-inlined. This is embarrassing and
we should do something about it, but in the meantime we'll force-inline
a little more when we need to.
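
As a quick sketch of what the generated u8 API looks like in use (not part
of the patch): JEMALLOC_GENERATE_INT_ATOMICS(uint8_t, u8, 0), added in the
next change below, produces the usual atomic_load_u8/atomic_store_u8/
atomic_exchange_u8 family, following the existing u32/u64 naming scheme.
The example_state variable and helper here are hypothetical.

    #include "jemalloc/internal/atomic.h"

    /* Hypothetical one-byte state word; tsd adopts this pattern shortly. */
    static atomic_u8_t example_state = ATOMIC_INIT(0);

    /* Atomically publish a new state, returning the previous one. */
    JEMALLOC_ALWAYS_INLINE uint8_t
    example_state_swap(uint8_t new_state) {
    	/* Force-inlined now that ATOMIC_INLINE is JEMALLOC_ALWAYS_INLINE. */
    	return atomic_exchange_u8(&example_state, new_state, ATOMIC_ACQUIRE);
    }
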
--- include/jemalloc/internal/atomic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index adadb1a..a184e46 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -1,7 +1,7 @@ #ifndef JEMALLOC_INTERNAL_ATOMIC_H #define JEMALLOC_INTERNAL_ATOMIC_H -#define ATOMIC_INLINE static inline +#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE #if defined(JEMALLOC_GCC_ATOMIC_ATOMICS) # include "jemalloc/internal/atomic_gcc_atomic.h" -- cgit v0.12 From 982c10de3566f38628770e57c62d1a6cdc5a09f9 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 8 Mar 2018 16:34:17 -0800 Subject: TSD: Make all state access happen through a function. Shortly, tsd state will be atomic and have some complicated enough logic down the state-setting path that we should be aware of it. --- include/jemalloc/internal/atomic.h | 2 ++ .../internal/jemalloc_internal_inlines_a.h | 2 +- include/jemalloc/internal/tsd.h | 37 ++++++++++++++-------- src/tsd.c | 30 +++++++++--------- test/unit/tsd.c | 4 +-- 5 files changed, 43 insertions(+), 32 deletions(-) diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index a184e46..bb751cf 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -66,6 +66,8 @@ JEMALLOC_GENERATE_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR) JEMALLOC_GENERATE_INT_ATOMICS(ssize_t, zd, LG_SIZEOF_PTR) +JEMALLOC_GENERATE_INT_ATOMICS(uint8_t, u8, 0) + JEMALLOC_GENERATE_INT_ATOMICS(uint32_t, u32, 2) #ifdef JEMALLOC_ATOMIC_U64 diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/include/jemalloc/internal/jemalloc_internal_inlines_a.h index c6a1f7e..6577a4f 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_a.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_a.h @@ -156,7 +156,7 @@ pre_reentrancy(tsd_t *tsd, arena_t *arena) { if (fast) { /* Prepare slow path for reentrancy. */ tsd_slow_update(tsd); - assert(tsd->state == tsd_state_nominal_slow); + assert(tsd_state_get(tsd) == tsd_state_nominal_slow); } } diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 0b9841a..aa64d93 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -107,9 +107,6 @@ enum { tsd_state_uninitialized = 5 }; -/* Manually limit tsd_state_t to a single byte. */ -typedef uint8_t tsd_state_t; - /* The actual tsd. */ struct tsd_s { /* @@ -117,13 +114,25 @@ struct tsd_s { * module. Access any thread-local state through the getters and * setters below. */ - tsd_state_t state; + + /* We manually limit the state to just a single byte. 
*/ + uint8_t state; #define O(n, t, nt) \ t use_a_getter_or_setter_instead_##n; MALLOC_TSD #undef O }; +JEMALLOC_ALWAYS_INLINE uint8_t +tsd_state_get(tsd_t *tsd) { + return tsd->state; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_state_set(tsd_t *tsd, uint8_t state) { + tsd->state = state; +} + /* * Wrapper around tsd_t that makes it possible to avoid implicit conversion * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be @@ -191,10 +200,10 @@ MALLOC_TSD #define O(n, t, nt) \ JEMALLOC_ALWAYS_INLINE t * \ tsd_##n##p_get(tsd_t *tsd) { \ - assert(tsd->state == tsd_state_nominal || \ - tsd->state == tsd_state_nominal_slow || \ - tsd->state == tsd_state_reincarnated || \ - tsd->state == tsd_state_minimal_initialized); \ + assert(tsd_state_get(tsd) == tsd_state_nominal || \ + tsd_state_get(tsd) == tsd_state_nominal_slow || \ + tsd_state_get(tsd) == tsd_state_reincarnated || \ + tsd_state_get(tsd) == tsd_state_minimal_initialized); \ return tsd_##n##p_get_unsafe(tsd); \ } MALLOC_TSD @@ -229,8 +238,8 @@ MALLOC_TSD #define O(n, t, nt) \ JEMALLOC_ALWAYS_INLINE void \ tsd_##n##_set(tsd_t *tsd, t val) { \ - assert(tsd->state != tsd_state_reincarnated && \ - tsd->state != tsd_state_minimal_initialized); \ + assert(tsd_state_get(tsd) != tsd_state_reincarnated && \ + tsd_state_get(tsd) != tsd_state_minimal_initialized); \ *tsd_##n##p_get(tsd) = val; \ } MALLOC_TSD @@ -244,7 +253,7 @@ tsd_assert_fast(tsd_t *tsd) { JEMALLOC_ALWAYS_INLINE bool tsd_fast(tsd_t *tsd) { - bool fast = (tsd->state == tsd_state_nominal); + bool fast = (tsd_state_get(tsd) == tsd_state_nominal); if (fast) { tsd_assert_fast(tsd); } @@ -261,7 +270,7 @@ tsd_fetch_impl(bool init, bool minimal) { } assert(tsd != NULL); - if (unlikely(tsd->state != tsd_state_nominal)) { + if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) { return tsd_fetch_slow(tsd, minimal); } assert(tsd_fast(tsd)); @@ -281,7 +290,7 @@ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_internal_fetch(void) { tsd_t *tsd = tsd_fetch_min(); /* Use reincarnated state to prevent full initialization. */ - tsd->state = tsd_state_reincarnated; + tsd_state_set(tsd, tsd_state_reincarnated); return tsd; } @@ -293,7 +302,7 @@ tsd_fetch(void) { static inline bool tsd_nominal(tsd_t *tsd) { - return (tsd->state <= tsd_state_nominal_max); + return (tsd_state_get(tsd) <= tsd_state_nominal_max); } JEMALLOC_ALWAYS_INLINE tsdn_t * diff --git a/src/tsd.c b/src/tsd.c index c143068..f3320eb 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -56,9 +56,9 @@ tsd_slow_update(tsd_t *tsd) { if (tsd_nominal(tsd)) { if (malloc_slow || !tsd_tcache_enabled_get(tsd) || tsd_reentrancy_level_get(tsd) > 0) { - tsd->state = tsd_state_nominal_slow; + tsd_state_set(tsd, tsd_state_nominal_slow); } else { - tsd->state = tsd_state_nominal; + tsd_state_set(tsd, tsd_state_nominal); } } } @@ -97,8 +97,8 @@ assert_tsd_data_cleanup_done(tsd_t *tsd) { static bool tsd_data_init_nocleanup(tsd_t *tsd) { - assert(tsd->state == tsd_state_reincarnated || - tsd->state == tsd_state_minimal_initialized); + assert(tsd_state_get(tsd) == tsd_state_reincarnated || + tsd_state_get(tsd) == tsd_state_minimal_initialized); /* * During reincarnation, there is no guarantee that the cleanup function * will be called (deallocation may happen after all tsd destructors). @@ -117,27 +117,27 @@ tsd_t * tsd_fetch_slow(tsd_t *tsd, bool minimal) { assert(!tsd_fast(tsd)); - if (tsd->state == tsd_state_nominal_slow) { + if (tsd_state_get(tsd) == tsd_state_nominal_slow) { /* On slow path but no work needed. 
*/ assert(malloc_slow || !tsd_tcache_enabled_get(tsd) || tsd_reentrancy_level_get(tsd) > 0 || *tsd_arenas_tdata_bypassp_get(tsd)); - } else if (tsd->state == tsd_state_uninitialized) { + } else if (tsd_state_get(tsd) == tsd_state_uninitialized) { if (!minimal) { - tsd->state = tsd_state_nominal; + tsd_state_set(tsd, tsd_state_nominal); tsd_slow_update(tsd); /* Trigger cleanup handler registration. */ tsd_set(tsd); tsd_data_init(tsd); } else { - tsd->state = tsd_state_minimal_initialized; + tsd_state_set(tsd, tsd_state_minimal_initialized); tsd_set(tsd); tsd_data_init_nocleanup(tsd); } - } else if (tsd->state == tsd_state_minimal_initialized) { + } else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) { if (!minimal) { /* Switch to fully initialized. */ - tsd->state = tsd_state_nominal; + tsd_state_set(tsd, tsd_state_nominal); assert(*tsd_reentrancy_levelp_get(tsd) >= 1); (*tsd_reentrancy_levelp_get(tsd))--; tsd_slow_update(tsd); @@ -145,12 +145,12 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) { } else { assert_tsd_data_cleanup_done(tsd); } - } else if (tsd->state == tsd_state_purgatory) { - tsd->state = tsd_state_reincarnated; + } else if (tsd_state_get(tsd) == tsd_state_purgatory) { + tsd_state_set(tsd, tsd_state_reincarnated); tsd_set(tsd); tsd_data_init_nocleanup(tsd); } else { - assert(tsd->state == tsd_state_reincarnated); + assert(tsd_state_get(tsd) == tsd_state_reincarnated); } return tsd; @@ -214,7 +214,7 @@ void tsd_cleanup(void *arg) { tsd_t *tsd = (tsd_t *)arg; - switch (tsd->state) { + switch (tsd_state_get(tsd)) { case tsd_state_uninitialized: /* Do nothing. */ break; @@ -232,7 +232,7 @@ tsd_cleanup(void *arg) { case tsd_state_nominal: case tsd_state_nominal_slow: tsd_do_data_cleanup(tsd); - tsd->state = tsd_state_purgatory; + tsd_state_set(tsd, tsd_state_purgatory); tsd_set(tsd); break; case tsd_state_purgatory: diff --git a/test/unit/tsd.c b/test/unit/tsd.c index 6c47913..3379891 100644 --- a/test/unit/tsd.c +++ b/test/unit/tsd.c @@ -98,11 +98,11 @@ thd_start_reincarnated(void *arg) { tsd_cleanup((void *)tsd); assert_ptr_null(*tsd_arenap_get_unsafe(tsd), "TSD arena should have been cleared."); - assert_u_eq(tsd->state, tsd_state_purgatory, + assert_u_eq(tsd_state_get(tsd), tsd_state_purgatory, "TSD state should be purgatory\n"); free(p); - assert_u_eq(tsd->state, tsd_state_reincarnated, + assert_u_eq(tsd_state_get(tsd), tsd_state_reincarnated, "TSD state should be reincarnated\n"); p = mallocx(1, MALLOCX_TCACHE_NONE); assert_ptr_not_null(p, "Unexpected malloc() failure"); -- cgit v0.12 From 39d6420c0c39619176af3477b827e8a92442b768 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 8 Mar 2018 16:51:07 -0800 Subject: TSD: Make state atomic. This will let us change the state of another thread remotely, eventually. --- include/jemalloc/internal/tsd.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index aa64d93..53ac741 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -78,7 +78,7 @@ typedef void (*test_callback_t)(int *); MALLOC_TEST_TSD #define TSD_INITIALIZER { \ - tsd_state_uninitialized, \ + ATOMIC_INIT(tsd_state_uninitialized), \ TCACHE_ENABLED_ZERO_INITIALIZER, \ false, \ 0, \ @@ -116,7 +116,7 @@ struct tsd_s { */ /* We manually limit the state to just a single byte. 
*/ - uint8_t state; + atomic_u8_t state; #define O(n, t, nt) \ t use_a_getter_or_setter_instead_##n; MALLOC_TSD @@ -125,12 +125,18 @@ MALLOC_TSD JEMALLOC_ALWAYS_INLINE uint8_t tsd_state_get(tsd_t *tsd) { - return tsd->state; + /* + * This should be atomic. Unfortunately, compilers right now can't tell + * that this can be done as a memory comparison, and forces a load into + * a register that hurts fast-path performance. + */ + /* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */ + return *(uint8_t *)&tsd->state; } JEMALLOC_ALWAYS_INLINE void tsd_state_set(tsd_t *tsd, uint8_t state) { - tsd->state = state; + atomic_store_u8(&tsd->state, state, ATOMIC_RELAXED); } /* -- cgit v0.12 From feff510b9f938ae1b4e2f43815bc7b10f70fac12 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Fri, 11 May 2018 15:18:52 -0700 Subject: TSD: Pull name mangling into a macro. --- include/jemalloc/internal/tsd.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 53ac741..c4faba5 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -107,6 +107,13 @@ enum { tsd_state_uninitialized = 5 }; +/* + * Some TSD accesses can only be done in a nominal state. To enforce this, we + * wrap TSD member access in a function that asserts on TSD state, and mangle + * field names to prevent touching them accidentally. + */ +#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n + /* The actual tsd. */ struct tsd_s { /* @@ -118,7 +125,7 @@ struct tsd_s { /* We manually limit the state to just a single byte. */ atomic_u8_t state; #define O(n, t, nt) \ - t use_a_getter_or_setter_instead_##n; + t TSD_MANGLE(n); MALLOC_TSD #undef O }; @@ -197,7 +204,7 @@ void tsd_slow_update(tsd_t *tsd); #define O(n, t, nt) \ JEMALLOC_ALWAYS_INLINE t * \ tsd_##n##p_get_unsafe(tsd_t *tsd) { \ - return &tsd->use_a_getter_or_setter_instead_##n; \ + return &tsd->TSD_MANGLE(n); \ } MALLOC_TSD #undef O -- cgit v0.12 From e870829e645bfd6d54e4a2d4cacce39478216a1e Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Mon, 2 Apr 2018 19:16:41 -0700 Subject: TSD: Add the ability to enter a global slow path. This gives any thread the ability to send other threads down slow paths the next time they fetch tsd. --- include/jemalloc/internal/tsd.h | 100 +++++++++++++++++------ src/tsd.c | 171 +++++++++++++++++++++++++++++++++++++--- test/unit/tsd.c | 130 +++++++++++++++++++++++++++++- 3 files changed, 365 insertions(+), 36 deletions(-) diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index c4faba5..251f565 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -59,6 +59,9 @@ typedef void (*test_callback_t)(int *); # define MALLOC_TEST_TSD_INITIALIZER #endif +/* Various uses of this struct need it to be a named type. 
 */
+typedef ql_elm(tsd_t) tsd_link_t;
+
 /*  O(name,			type,			nullable type */
 #define MALLOC_TSD							\
 	O(tcache_enabled,	bool,			bool)		\
 	O(arenas_tdata_bypass,	bool,			bool)		\
 	O(reentrancy_level,	int8_t,			int8_t)		\
 	O(narenas_tdata,	uint32_t,		uint32_t)	\
 	O(offset_state,		uint64_t,		uint64_t)	\
 	O(thread_allocated,	uint64_t,		uint64_t)	\
 	O(thread_deallocated,	uint64_t,		uint64_t)	\
 	O(prof_tdata,		prof_tdata_t *,		prof_tdata_t *)	\
 	O(rtree_ctx,		rtree_ctx_t,		rtree_ctx_t)	\
 	O(iarena,		arena_t *,		arena_t *)	\
 	O(arena,		arena_t *,		arena_t *)	\
 	O(arenas_tdata,		arena_tdata_t *,	arena_tdata_t *)\
+	O(link,			tsd_link_t,		tsd_link_t)	\
 	O(tcache,		tcache_t,		tcache_t)	\
 	O(witness_tsd,		witness_tsd_t,		witness_tsdn_t)	\
 	MALLOC_TEST_TSD
@@ -91,20 +95,67 @@ typedef void (*test_callback_t)(int *);
 	NULL, \
 	NULL, \
 	NULL, \
+	{NULL}, \
 	TCACHE_ZERO_INITIALIZER, \
 	WITNESS_TSD_INITIALIZER \
 	MALLOC_TEST_TSD_INITIALIZER \
 }

+void *malloc_tsd_malloc(size_t size);
+void malloc_tsd_dalloc(void *wrapper);
+void malloc_tsd_cleanup_register(bool (*f)(void));
+tsd_t *malloc_tsd_boot0(void);
+void malloc_tsd_boot1(void);
+void tsd_cleanup(void *arg);
+tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
+void tsd_state_set(tsd_t *tsd, uint8_t new_state);
+void tsd_slow_update(tsd_t *tsd);
+
+/*
+ * Call ..._inc when your module wants to take all threads down the slow paths,
+ * and ..._dec when it no longer needs to.
+ */
+void tsd_global_slow_inc(tsdn_t *tsdn);
+void tsd_global_slow_dec(tsdn_t *tsdn);
+
 enum {
-	tsd_state_nominal = 0, /* Common case --> jnz. */
-	tsd_state_nominal_slow = 1, /* Initialized but on slow path. */
-	/* the above 2 nominal states should be lower values. */
-	tsd_state_nominal_max = 1, /* used for comparison only. */
-	tsd_state_minimal_initialized = 2,
-	tsd_state_purgatory = 3,
-	tsd_state_reincarnated = 4,
-	tsd_state_uninitialized = 5
+	/* Common case --> jnz. */
+	tsd_state_nominal = 0,
+	/* Initialized but on slow path. */
+	tsd_state_nominal_slow = 1,
+	/*
+	 * Some thread has changed global state in such a way that all nominal
+	 * threads need to recompute their fast / slow status the next time they
+	 * get a chance.
+	 *
+	 * Any thread can change another thread's status *to* recompute, but
+	 * threads are the only ones who can change their status *from*
+	 * recompute.
+	 */
+	tsd_state_nominal_recompute = 2,
+	/*
+	 * The above nominal states should be lower values. We use
+	 * tsd_nominal_max to separate nominal states from threads in the
+	 * process of being born / dying.
+	 */
+	tsd_state_nominal_max = 2,
+
+	/*
+	 * A thread might free() during its death as its only allocator action;
+	 * in such scenarios, we need tsd, but set up in such a way that no
+	 * cleanup is necessary.
+	 */
+	tsd_state_minimal_initialized = 3,
+	/* States during which we know we're in thread death. */
+	tsd_state_purgatory = 4,
+	tsd_state_reincarnated = 5,
+	/*
+	 * What it says on the tin; tsd that hasn't been initialized. Note
+	 * that even when the tsd struct lives in TLS, we need to keep track
+	 * of stuff like whether or not our pthread destructors have been
+	 * scheduled, so this really truly is different than the nominal state.
+ */ + tsd_state_uninitialized = 6 }; /* @@ -141,11 +192,6 @@ tsd_state_get(tsd_t *tsd) { return *(uint8_t *)&tsd->state; } -JEMALLOC_ALWAYS_INLINE void -tsd_state_set(tsd_t *tsd, uint8_t state) { - atomic_store_u8(&tsd->state, state, ATOMIC_RELAXED); -} - /* * Wrapper around tsd_t that makes it possible to avoid implicit conversion * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be @@ -172,15 +218,6 @@ tsdn_tsd(tsdn_t *tsdn) { return &tsdn->tsd; } -void *malloc_tsd_malloc(size_t size); -void malloc_tsd_dalloc(void *wrapper); -void malloc_tsd_cleanup_register(bool (*f)(void)); -tsd_t *malloc_tsd_boot0(void); -void malloc_tsd_boot1(void); -void tsd_cleanup(void *arg); -tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal); -void tsd_slow_update(tsd_t *tsd); - /* * We put the platform-specific data declarations and inlines into their own * header files to avoid cluttering this file. They define tsd_boot0, @@ -213,10 +250,16 @@ MALLOC_TSD #define O(n, t, nt) \ JEMALLOC_ALWAYS_INLINE t * \ tsd_##n##p_get(tsd_t *tsd) { \ - assert(tsd_state_get(tsd) == tsd_state_nominal || \ - tsd_state_get(tsd) == tsd_state_nominal_slow || \ - tsd_state_get(tsd) == tsd_state_reincarnated || \ - tsd_state_get(tsd) == tsd_state_minimal_initialized); \ + /* \ + * Because the state might change asynchronously if it's \ + * nominal, we need to make sure that we only read it once. \ + */ \ + uint8_t state = tsd_state_get(tsd); \ + assert(state == tsd_state_nominal || \ + state == tsd_state_nominal_slow || \ + state == tsd_state_nominal_recompute || \ + state == tsd_state_reincarnated || \ + state == tsd_state_minimal_initialized); \ return tsd_##n##p_get_unsafe(tsd); \ } MALLOC_TSD @@ -260,6 +303,11 @@ MALLOC_TSD JEMALLOC_ALWAYS_INLINE void tsd_assert_fast(tsd_t *tsd) { + /* + * Note that our fastness assertion does *not* include global slowness + * counters; it's not in general possible to ensure that they won't + * change asynchronously from underneath us. + */ assert(!malloc_slow && tsd_tcache_enabled_get(tsd) && tsd_reentrancy_level_get(tsd) == 0); } diff --git a/src/tsd.c b/src/tsd.c index f3320eb..c92cd22 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -51,14 +51,159 @@ bool tsd_booted = false; /******************************************************************************/ +/* A list of all the tsds in the nominal state. */ +typedef ql_head(tsd_t) tsd_list_t; +static tsd_list_t tsd_nominal_tsds = ql_head_initializer(tsd_nominal_tsds); +static malloc_mutex_t tsd_nominal_tsds_lock; + +/* How many slow-path-enabling features are turned on. */ +static atomic_u32_t tsd_global_slow_count = ATOMIC_INIT(0); + +static bool +tsd_in_nominal_list(tsd_t *tsd) { + tsd_t *tsd_list; + bool found = false; + /* + * We don't know that tsd is nominal; it might not be safe to get data + * out of it here. 
+ */ + malloc_mutex_lock(TSDN_NULL, &tsd_nominal_tsds_lock); + ql_foreach(tsd_list, &tsd_nominal_tsds, TSD_MANGLE(link)) { + if (tsd == tsd_list) { + found = true; + break; + } + } + malloc_mutex_unlock(TSDN_NULL, &tsd_nominal_tsds_lock); + return found; +} + +static void +tsd_add_nominal(tsd_t *tsd) { + assert(!tsd_in_nominal_list(tsd)); + assert(tsd_state_get(tsd) <= tsd_state_nominal_max); + ql_elm_new(tsd, TSD_MANGLE(link)); + malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_tail_insert(&tsd_nominal_tsds, tsd, TSD_MANGLE(link)); + malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +static void +tsd_remove_nominal(tsd_t *tsd) { + assert(tsd_in_nominal_list(tsd)); + assert(tsd_state_get(tsd) <= tsd_state_nominal_max); + malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_remove(&tsd_nominal_tsds, tsd, TSD_MANGLE(link)); + malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +static void +tsd_force_recompute(tsdn_t *tsdn) { + /* + * The stores to tsd->state here need to synchronize with the exchange + * in tsd_slow_update. + */ + atomic_fence(ATOMIC_RELEASE); + malloc_mutex_lock(tsdn, &tsd_nominal_tsds_lock); + tsd_t *remote_tsd; + ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(link)) { + assert(atomic_load_u8(&remote_tsd->state, ATOMIC_RELAXED) + <= tsd_state_nominal_max); + atomic_store_u8(&remote_tsd->state, tsd_state_nominal_recompute, + ATOMIC_RELAXED); + } + malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock); +} + +void +tsd_global_slow_inc(tsdn_t *tsdn) { + atomic_fetch_add_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED); + /* + * We unconditionally force a recompute, even if the global slow count + * was already positive. If we didn't, then it would be possible for us + * to return to the user, have the user synchronize externally with some + * other thread, and then have that other thread not have picked up the + * update yet (since the original incrementing thread might still be + * making its way through the tsd list). + */ + tsd_force_recompute(tsdn); +} + +void tsd_global_slow_dec(tsdn_t *tsdn) { + atomic_fetch_sub_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED); + /* See the note in ..._inc(). */ + tsd_force_recompute(tsdn); +} + +static bool +tsd_local_slow(tsd_t *tsd) { + return !tsd_tcache_enabled_get(tsd) + || tsd_reentrancy_level_get(tsd) > 0; +} + +static bool +tsd_global_slow() { + return atomic_load_u32(&tsd_global_slow_count, ATOMIC_RELAXED) > 0; +} + +/******************************************************************************/ + +static uint8_t +tsd_state_compute(tsd_t *tsd) { + if (!tsd_nominal(tsd)) { + return tsd_state_get(tsd); + } + /* We're in *a* nominal state; but which one? */ + if (malloc_slow || tsd_local_slow(tsd) || tsd_global_slow()) { + return tsd_state_nominal_slow; + } else { + return tsd_state_nominal; + } +} + void tsd_slow_update(tsd_t *tsd) { - if (tsd_nominal(tsd)) { - if (malloc_slow || !tsd_tcache_enabled_get(tsd) || - tsd_reentrancy_level_get(tsd) > 0) { - tsd_state_set(tsd, tsd_state_nominal_slow); + uint8_t old_state; + do { + uint8_t new_state = tsd_state_compute(tsd); + old_state = atomic_exchange_u8(&tsd->state, new_state, + ATOMIC_ACQUIRE); + } while (old_state == tsd_state_nominal_recompute); +} + +void +tsd_state_set(tsd_t *tsd, uint8_t new_state) { + /* Only the tsd module can change the state *to* recompute. 
 */
+	assert(new_state != tsd_state_nominal_recompute);
+	uint8_t old_state = atomic_load_u8(&tsd->state, ATOMIC_RELAXED);
+	if (old_state > tsd_state_nominal_max) {
+		/*
+		 * Not currently in the nominal list, but it might need to be
+		 * inserted there.
+		 */
+		assert(!tsd_in_nominal_list(tsd));
+		atomic_store_u8(&tsd->state, new_state, ATOMIC_RELAXED);
+		if (new_state <= tsd_state_nominal_max) {
+			tsd_add_nominal(tsd);
+		}
+	} else {
+		/*
+		 * We're currently nominal. If the new state is non-nominal,
+		 * great; we take ourselves off the list and just enter the new
+		 * state.
+		 */
+		assert(tsd_in_nominal_list(tsd));
+		if (new_state > tsd_state_nominal_max) {
+			tsd_remove_nominal(tsd);
+			atomic_store_u8(&tsd->state, new_state, ATOMIC_RELAXED);
 		} else {
-			tsd_state_set(tsd, tsd_state_nominal);
+			/*
+			 * This is the tricky case. We're transitioning from
+			 * one nominal state to another. The caller can't know
+			 * about any races that are occurring at the same time,
+			 * so we always have to recompute no matter what.
+			 */
+			tsd_slow_update(tsd);
 		}
 	}
 }
@@ -118,10 +263,14 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 	assert(!tsd_fast(tsd));

 	if (tsd_state_get(tsd) == tsd_state_nominal_slow) {
-		/* On slow path but no work needed. */
-		assert(malloc_slow || !tsd_tcache_enabled_get(tsd) ||
-		    tsd_reentrancy_level_get(tsd) > 0 ||
-		    *tsd_arenas_tdata_bypassp_get(tsd));
+		/*
+		 * On slow path but no work needed. Note that we can't
+		 * necessarily *assert* that we're slow, because we might be
+		 * slow because of an asynchronous modification to global state,
+		 * which might be asynchronously modified *back*.
+		 */
+	} else if (tsd_state_get(tsd) == tsd_state_nominal_recompute) {
+		tsd_slow_update(tsd);
 	} else if (tsd_state_get(tsd) == tsd_state_uninitialized) {
 		if (!minimal) {
 			tsd_state_set(tsd, tsd_state_nominal);
@@ -260,6 +409,10 @@ malloc_tsd_boot0(void) {
 	tsd_t *tsd;

 	ncleanups = 0;
+	if (malloc_mutex_init(&tsd_nominal_tsds_lock, "tsd_nominal_tsds_lock",
+	    WITNESS_RANK_OMIT, malloc_mutex_rank_exclusive)) {
+		return NULL;
+	}
 	if (tsd_boot0()) {
 		return NULL;
 	}
diff --git a/test/unit/tsd.c b/test/unit/tsd.c
index 3379891..917884d 100644
--- a/test/unit/tsd.c
+++ b/test/unit/tsd.c
@@ -1,5 +1,10 @@
 #include "test/jemalloc_test.h"

+/*
+ * If we're e.g. in debug mode, we *never* enter the fast path, and so shouldn't
+ * be asserting that we're on one.
+ */
+static bool originally_fast;
 static int data_cleanup_count;

 void
@@ -124,6 +129,128 @@ TEST_END

+typedef struct {
+	atomic_u32_t phase;
+	atomic_b_t error;
+} global_slow_data_t;
+
+static void *
+thd_start_global_slow(void *arg) {
+	/* PHASE 0 */
+	global_slow_data_t *data = (global_slow_data_t *)arg;
+	free(mallocx(1, 0));
+
+	tsd_t *tsd = tsd_fetch();
+	/*
+	 * No global slowness has happened yet; there was an error if we were
+	 * originally fast but aren't now.
+ */ + atomic_store_b(&data->error, originally_fast && !tsd_fast(tsd), + ATOMIC_SEQ_CST); + atomic_store_u32(&data->phase, 1, ATOMIC_SEQ_CST); + + /* PHASE 2 */ + while (atomic_load_u32(&data->phase, ATOMIC_SEQ_CST) != 2) { + } + free(mallocx(1, 0)); + atomic_store_b(&data->error, tsd_fast(tsd), ATOMIC_SEQ_CST); + atomic_store_u32(&data->phase, 3, ATOMIC_SEQ_CST); + + /* PHASE 4 */ + while (atomic_load_u32(&data->phase, ATOMIC_SEQ_CST) != 4) { + } + free(mallocx(1, 0)); + atomic_store_b(&data->error, tsd_fast(tsd), ATOMIC_SEQ_CST); + atomic_store_u32(&data->phase, 5, ATOMIC_SEQ_CST); + + /* PHASE 6 */ + while (atomic_load_u32(&data->phase, ATOMIC_SEQ_CST) != 6) { + } + free(mallocx(1, 0)); + /* Only one decrement so far. */ + atomic_store_b(&data->error, tsd_fast(tsd), ATOMIC_SEQ_CST); + atomic_store_u32(&data->phase, 7, ATOMIC_SEQ_CST); + + /* PHASE 8 */ + while (atomic_load_u32(&data->phase, ATOMIC_SEQ_CST) != 8) { + } + free(mallocx(1, 0)); + /* + * Both decrements happened; we should be fast again (if we ever + * were) + */ + atomic_store_b(&data->error, originally_fast && !tsd_fast(tsd), + ATOMIC_SEQ_CST); + atomic_store_u32(&data->phase, 9, ATOMIC_SEQ_CST); + + return NULL; +} + +TEST_BEGIN(test_tsd_global_slow) { + global_slow_data_t data = {ATOMIC_INIT(0), ATOMIC_INIT(false)}; + /* + * Note that the "mallocx" here (vs. malloc) is important, since the + * compiler is allowed to optimize away free(malloc(1)) but not + * free(mallocx(1)). + */ + free(mallocx(1, 0)); + tsd_t *tsd = tsd_fetch(); + originally_fast = tsd_fast(tsd); + + thd_t thd; + thd_create(&thd, thd_start_global_slow, (void *)&data.phase); + /* PHASE 1 */ + while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 1) { + /* + * We don't have a portable condvar/semaphore mechanism. + * Spin-wait. + */ + } + assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), ""); + tsd_global_slow_inc(tsd_tsdn(tsd)); + free(mallocx(1, 0)); + assert_false(tsd_fast(tsd), ""); + atomic_store_u32(&data.phase, 2, ATOMIC_SEQ_CST); + + /* PHASE 3 */ + while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 3) { + } + assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), ""); + /* Increase again, so that we can test multiple fast/slow changes. */ + tsd_global_slow_inc(tsd_tsdn(tsd)); + atomic_store_u32(&data.phase, 4, ATOMIC_SEQ_CST); + free(mallocx(1, 0)); + assert_false(tsd_fast(tsd), ""); + + /* PHASE 5 */ + while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 5) { + } + assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), ""); + tsd_global_slow_dec(tsd_tsdn(tsd)); + atomic_store_u32(&data.phase, 6, ATOMIC_SEQ_CST); + /* We only decreased once; things should still be slow. */ + free(mallocx(1, 0)); + assert_false(tsd_fast(tsd), ""); + + /* PHASE 7 */ + while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 7) { + } + assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), ""); + tsd_global_slow_dec(tsd_tsdn(tsd)); + atomic_store_u32(&data.phase, 8, ATOMIC_SEQ_CST); + /* We incremented and then decremented twice; we should be fast now. */ + free(mallocx(1, 0)); + assert_true(!originally_fast || tsd_fast(tsd), ""); + + /* PHASE 9 */ + while (atomic_load_u32(&data.phase, ATOMIC_SEQ_CST) != 9) { + } + assert_false(atomic_load_b(&data.error, ATOMIC_SEQ_CST), ""); + + thd_join(thd, NULL); +} +TEST_END + int main(void) { /* Ensure tsd bootstrapped. 
 */
@@ -135,5 +262,6 @@ main(void) {
 	return test_no_reentrancy(
 	    test_tsd_main_thread,
 	    test_tsd_sub_thread,
-	    test_tsd_reincarnation);
+	    test_tsd_reincarnation,
+	    test_tsd_global_slow);
 }
--
cgit v0.12


From c7a87e0e0bd02cf278760f3c22615d3129dc1ae2 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Mon, 9 Apr 2018 18:09:34 -0700
Subject: Rename hooks module to test_hooks.

"Hooks" is really the best name for the module that will contain the
publicly exposed hooks. So let's rename the current "hooks" module (that
hooks external dependencies, for reentrancy testing) to "test_hooks".
---
 Makefile.in | 4 +--
 include/jemalloc/internal/hooks.h | 19 ------------
 include/jemalloc/internal/jemalloc_preamble.h.in | 2 +-
 include/jemalloc/internal/test_hooks.h | 19 ++++++++++++
 src/arena.c | 4 +--
 src/hooks.c | 12 --------
 src/prof.c | 2 +-
 src/test_hooks.c | 12 ++++++++
 test/include/test/jemalloc_test.h.in | 2 +-
 test/src/test.c | 14 ++++-----
 test/unit/hooks.c | 38 ------------------------
 test/unit/test_hooks.c | 38 ++++++++++++++++++++++++
 12 files changed, 83 insertions(+), 83 deletions(-)
 delete mode 100644 include/jemalloc/internal/hooks.h
 create mode 100644 include/jemalloc/internal/test_hooks.h
 delete mode 100644 src/hooks.c
 create mode 100644 src/test_hooks.c
 delete mode 100644 test/unit/hooks.c
 create mode 100644 test/unit/test_hooks.c

diff --git a/Makefile.in b/Makefile.in
index 9b9347f..c4f006b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -102,7 +102,6 @@ C_SRCS := $(srcroot)src/jemalloc.c \
 	$(srcroot)src/extent_dss.c \
 	$(srcroot)src/extent_mmap.c \
 	$(srcroot)src/hash.c \
-	$(srcroot)src/hooks.c \
 	$(srcroot)src/large.c \
 	$(srcroot)src/log.c \
 	$(srcroot)src/malloc_io.c \
@@ -116,6 +115,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \
 	$(srcroot)src/stats.c \
 	$(srcroot)src/sz.c \
 	$(srcroot)src/tcache.c \
+	$(srcroot)src/test_hooks.c \
 	$(srcroot)src/ticker.c \
 	$(srcroot)src/tsd.c \
 	$(srcroot)src/witness.c
@@ -172,7 +172,6 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/extent_quantize.c \
 	$(srcroot)test/unit/fork.c \
 	$(srcroot)test/unit/hash.c \
-	$(srcroot)test/unit/hooks.c \
 	$(srcroot)test/unit/junk.c \
 	$(srcroot)test/unit/junk_alloc.c \
 	$(srcroot)test/unit/junk_free.c \
@@ -205,6 +204,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/spin.c \
 	$(srcroot)test/unit/stats.c \
 	$(srcroot)test/unit/stats_print.c \
+	$(srcroot)test/unit/test_hooks.c \
 	$(srcroot)test/unit/ticker.c \
 	$(srcroot)test/unit/nstime.c \
 	$(srcroot)test/unit/tsd.c \
diff --git a/include/jemalloc/internal/hooks.h b/include/jemalloc/internal/hooks.h
deleted file mode 100644
index cd49afc..0000000
--- a/include/jemalloc/internal/hooks.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef JEMALLOC_INTERNAL_HOOKS_H
-#define JEMALLOC_INTERNAL_HOOKS_H
-
-extern JEMALLOC_EXPORT void (*hooks_arena_new_hook)();
-extern JEMALLOC_EXPORT void (*hooks_libc_hook)();
-
-#define JEMALLOC_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn)
-
-#define open JEMALLOC_HOOK(open, hooks_libc_hook)
-#define read JEMALLOC_HOOK(read, hooks_libc_hook)
-#define write JEMALLOC_HOOK(write, hooks_libc_hook)
-#define readlink JEMALLOC_HOOK(readlink, hooks_libc_hook)
-#define close JEMALLOC_HOOK(close, hooks_libc_hook)
-#define creat JEMALLOC_HOOK(creat, hooks_libc_hook)
-#define secure_getenv JEMALLOC_HOOK(secure_getenv, hooks_libc_hook)
-/* Note that this is undef'd and re-define'd in src/prof.c.
*/ -#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, hooks_libc_hook) - -#endif /* JEMALLOC_INTERNAL_HOOKS_H */ diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in index e621fbc..1b12aee 100644 --- a/include/jemalloc/internal/jemalloc_preamble.h.in +++ b/include/jemalloc/internal/jemalloc_preamble.h.in @@ -45,7 +45,7 @@ # include "jemalloc/internal/private_namespace_jet.h" # endif #endif -#include "jemalloc/internal/hooks.h" +#include "jemalloc/internal/test_hooks.h" #ifdef JEMALLOC_DEFINE_MADVISE_FREE # define JEMALLOC_MADV_FREE 8 diff --git a/include/jemalloc/internal/test_hooks.h b/include/jemalloc/internal/test_hooks.h new file mode 100644 index 0000000..a6351e5 --- /dev/null +++ b/include/jemalloc/internal/test_hooks.h @@ -0,0 +1,19 @@ +#ifndef JEMALLOC_INTERNAL_TEST_HOOKS_H +#define JEMALLOC_INTERNAL_TEST_HOOKS_H + +extern JEMALLOC_EXPORT void (*test_hooks_arena_new_hook)(); +extern JEMALLOC_EXPORT void (*test_hooks_libc_hook)(); + +#define JEMALLOC_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn) + +#define open JEMALLOC_HOOK(open, test_hooks_libc_hook) +#define read JEMALLOC_HOOK(read, test_hooks_libc_hook) +#define write JEMALLOC_HOOK(write, test_hooks_libc_hook) +#define readlink JEMALLOC_HOOK(readlink, test_hooks_libc_hook) +#define close JEMALLOC_HOOK(close, test_hooks_libc_hook) +#define creat JEMALLOC_HOOK(creat, test_hooks_libc_hook) +#define secure_getenv JEMALLOC_HOOK(secure_getenv, test_hooks_libc_hook) +/* Note that this is undef'd and re-define'd in src/prof.c. */ +#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, test_hooks_libc_hook) + +#endif /* JEMALLOC_INTERNAL_TEST_HOOKS_H */ diff --git a/src/arena.c b/src/arena.c index 5d55bf1..311943f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1900,8 +1900,8 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { */ assert(!tsdn_null(tsdn)); pre_reentrancy(tsdn_tsd(tsdn), arena); - if (hooks_arena_new_hook) { - hooks_arena_new_hook(); + if (test_hooks_arena_new_hook) { + test_hooks_arena_new_hook(); } post_reentrancy(tsdn_tsd(tsdn)); } diff --git a/src/hooks.c b/src/hooks.c deleted file mode 100644 index 6266ecd..0000000 --- a/src/hooks.c +++ /dev/null @@ -1,12 +0,0 @@ -#include "jemalloc/internal/jemalloc_preamble.h" - -/* - * The hooks are a little bit screwy -- they're not genuinely exported in the - * sense that we want them available to end-users, but we do want them visible - * from outside the generated library, so that we can use them in test code. - */ -JEMALLOC_EXPORT -void (*hooks_arena_new_hook)() = NULL; - -JEMALLOC_EXPORT -void (*hooks_libc_hook)() = NULL; diff --git a/src/prof.c b/src/prof.c index 13df641..405de4b 100644 --- a/src/prof.c +++ b/src/prof.c @@ -23,7 +23,7 @@ */ #undef _Unwind_Backtrace #include -#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, hooks_libc_hook) +#define _Unwind_Backtrace JEMALLOC_HOOK(_Unwind_Backtrace, test_hooks_libc_hook) #endif /******************************************************************************/ diff --git a/src/test_hooks.c b/src/test_hooks.c new file mode 100644 index 0000000..ace00d9 --- /dev/null +++ b/src/test_hooks.c @@ -0,0 +1,12 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +/* + * The hooks are a little bit screwy -- they're not genuinely exported in the + * sense that we want them available to end-users, but we do want them visible + * from outside the generated library, so that we can use them in test code. 
+ */ +JEMALLOC_EXPORT +void (*test_hooks_arena_new_hook)() = NULL; + +JEMALLOC_EXPORT +void (*test_hooks_libc_hook)() = NULL; diff --git a/test/include/test/jemalloc_test.h.in b/test/include/test/jemalloc_test.h.in index 67caa86..0209aea 100644 --- a/test/include/test/jemalloc_test.h.in +++ b/test/include/test/jemalloc_test.h.in @@ -69,7 +69,7 @@ static const bool config_debug = # define JEMALLOC_N(n) @private_namespace@##n # include "jemalloc/internal/private_namespace.h" -# include "jemalloc/internal/hooks.h" +# include "jemalloc/internal/test_hooks.h" /* Hermetic headers. */ # include "jemalloc/internal/assert.h" diff --git a/test/src/test.c b/test/src/test.c index 01a4d73..9c754e3 100644 --- a/test/src/test.c +++ b/test/src/test.c @@ -48,12 +48,12 @@ do_hook(bool *hook_ran, void (**hook)()) { static void libc_reentrancy_hook() { - do_hook(&libc_hook_ran, &hooks_libc_hook); + do_hook(&libc_hook_ran, &test_hooks_libc_hook); } static void arena_new_reentrancy_hook() { - do_hook(&arena_new_hook_ran, &hooks_arena_new_hook); + do_hook(&arena_new_hook_ran, &test_hooks_arena_new_hook); } /* Actual test infrastructure. */ @@ -131,7 +131,7 @@ p_test_impl(bool do_malloc_init, bool do_reentrant, test_t *t, va_list ap) { for (; t != NULL; t = va_arg(ap, test_t *)) { /* Non-reentrant run. */ reentrancy = non_reentrant; - hooks_arena_new_hook = hooks_libc_hook = NULL; + test_hooks_arena_new_hook = test_hooks_libc_hook = NULL; t(); if (test_status > ret) { ret = test_status; @@ -139,16 +139,16 @@ p_test_impl(bool do_malloc_init, bool do_reentrant, test_t *t, va_list ap) { /* Reentrant run. */ if (do_reentrant) { reentrancy = libc_reentrant; - hooks_arena_new_hook = NULL; - hooks_libc_hook = &libc_reentrancy_hook; + test_hooks_arena_new_hook = NULL; + test_hooks_libc_hook = &libc_reentrancy_hook; t(); if (test_status > ret) { ret = test_status; } reentrancy = arena_new_reentrant; - hooks_libc_hook = NULL; - hooks_arena_new_hook = &arena_new_reentrancy_hook; + test_hooks_libc_hook = NULL; + test_hooks_arena_new_hook = &arena_new_reentrancy_hook; t(); if (test_status > ret) { ret = test_status; diff --git a/test/unit/hooks.c b/test/unit/hooks.c deleted file mode 100644 index b70172e..0000000 --- a/test/unit/hooks.c +++ /dev/null @@ -1,38 +0,0 @@ -#include "test/jemalloc_test.h" - -static bool hook_called = false; - -static void -hook() { - hook_called = true; -} - -static int -func_to_hook(int arg1, int arg2) { - return arg1 + arg2; -} - -#define func_to_hook JEMALLOC_HOOK(func_to_hook, hooks_libc_hook) - -TEST_BEGIN(unhooked_call) { - hooks_libc_hook = NULL; - hook_called = false; - assert_d_eq(3, func_to_hook(1, 2), "Hooking changed return value."); - assert_false(hook_called, "Nulling out hook didn't take."); -} -TEST_END - -TEST_BEGIN(hooked_call) { - hooks_libc_hook = &hook; - hook_called = false; - assert_d_eq(3, func_to_hook(1, 2), "Hooking changed return value."); - assert_true(hook_called, "Hook should have executed."); -} -TEST_END - -int -main(void) { - return test( - unhooked_call, - hooked_call); -} diff --git a/test/unit/test_hooks.c b/test/unit/test_hooks.c new file mode 100644 index 0000000..ded8698 --- /dev/null +++ b/test/unit/test_hooks.c @@ -0,0 +1,38 @@ +#include "test/jemalloc_test.h" + +static bool hook_called = false; + +static void +hook() { + hook_called = true; +} + +static int +func_to_hook(int arg1, int arg2) { + return arg1 + arg2; +} + +#define func_to_hook JEMALLOC_HOOK(func_to_hook, test_hooks_libc_hook) + +TEST_BEGIN(unhooked_call) { + test_hooks_libc_hook = NULL; + 
hook_called = false;
+	assert_d_eq(3, func_to_hook(1, 2), "Hooking changed return value.");
+	assert_false(hook_called, "Nulling out hook didn't take.");
+}
+TEST_END
+
+TEST_BEGIN(hooked_call) {
+	test_hooks_libc_hook = &hook;
+	hook_called = false;
+	assert_d_eq(3, func_to_hook(1, 2), "Hooking changed return value.");
+	assert_true(hook_called, "Hook should have executed.");
+}
+TEST_END
+
+int
+main(void) {
+	return test(
+	    unhooked_call,
+	    hooked_call);
+}
--
cgit v0.12


From 06a8c40b36403e902748d3f2a14e6dd43488ae89 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Fri, 13 Apr 2018 15:56:59 -0700
Subject: Add the Seq module, a simple seqlock implementation.

This allows fast reader-writer concurrency in cases where writers are
rare. The immediate use case is for the hooking implementation.
---
 Makefile.in | 1 +
 include/jemalloc/internal/seq.h | 55 ++++++++++++++++++++++++
 test/unit/seq.c | 95 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+)
 create mode 100644 include/jemalloc/internal/seq.h
 create mode 100644 test/unit/seq.c

diff --git a/Makefile.in b/Makefile.in
index c4f006b..074fdd4 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -197,6 +197,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/rb.c \
 	$(srcroot)test/unit/retained.c \
 	$(srcroot)test/unit/rtree.c \
+	$(srcroot)test/unit/seq.c \
 	$(srcroot)test/unit/SFMT.c \
 	$(srcroot)test/unit/size_classes.c \
 	$(srcroot)test/unit/slab.c \
diff --git a/include/jemalloc/internal/seq.h b/include/jemalloc/internal/seq.h
new file mode 100644
index 0000000..ef2df4c
--- /dev/null
+++ b/include/jemalloc/internal/seq.h
@@ -0,0 +1,55 @@
+#ifndef JEMALLOC_INTERNAL_SEQ_H
+#define JEMALLOC_INTERNAL_SEQ_H
+
+#include "jemalloc/internal/atomic.h"
+
+/*
+ * A simple seqlock implementation.
+ */
+
+#define seq_define(type, short_type)					\
+typedef struct {							\
+	atomic_zu_t seq;						\
+	atomic_zu_t data[						\
+	    (sizeof(type) + sizeof(size_t) - 1) / sizeof(size_t)];	\
+} seq_##short_type##_t;							\
+									\
+/*									\
+ * No internal synchronization -- the caller must ensure that there's	\
+ * only a single writer at a time.					\
+ */									\
+static inline void							\
+seq_store_##short_type(seq_##short_type##_t *dst, type *src) {		\
+	size_t buf[sizeof(dst->data) / sizeof(size_t)];			\
+	buf[sizeof(buf) / sizeof(size_t) - 1] = 0;			\
+	memcpy(buf, src, sizeof(type));					\
+	size_t old_seq = atomic_load_zu(&dst->seq, ATOMIC_RELAXED);	\
+	atomic_store_zu(&dst->seq, old_seq + 1, ATOMIC_RELAXED);	\
+	atomic_fence(ATOMIC_RELEASE);					\
+	for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) {	\
+		atomic_store_zu(&dst->data[i], buf[i], ATOMIC_RELAXED);	\
+	}								\
+	atomic_store_zu(&dst->seq, old_seq + 2, ATOMIC_RELEASE);	\
+}									\
+									\
+/* Returns whether or not the read was consistent.
*/ \ +static inline bool \ +seq_try_load_##short_type(type *dst, seq_##short_type##_t *src) { \ + size_t buf[sizeof(src->data) / sizeof(size_t)]; \ + size_t seq1 = atomic_load_zu(&src->seq, ATOMIC_ACQUIRE); \ + if (seq1 % 2 != 0) { \ + return false; \ + } \ + for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) { \ + buf[i] = atomic_load_zu(&src->data[i], ATOMIC_RELAXED); \ + } \ + atomic_fence(ATOMIC_ACQUIRE); \ + size_t seq2 = atomic_load_zu(&src->seq, ATOMIC_RELAXED); \ + if (seq1 != seq2) { \ + return false; \ + } \ + memcpy(dst, buf, sizeof(type)); \ + return true; \ +} + +#endif /* JEMALLOC_INTERNAL_SEQ_H */ diff --git a/test/unit/seq.c b/test/unit/seq.c new file mode 100644 index 0000000..19613b0 --- /dev/null +++ b/test/unit/seq.c @@ -0,0 +1,95 @@ +#include "test/jemalloc_test.h" + +#include "jemalloc/internal/seq.h" + +typedef struct data_s data_t; +struct data_s { + int arr[10]; +}; + +static void +set_data(data_t *data, int num) { + for (int i = 0; i < 10; i++) { + data->arr[i] = num; + } +} + +static void +assert_data(data_t *data) { + int num = data->arr[0]; + for (int i = 0; i < 10; i++) { + assert_d_eq(num, data->arr[i], "Data consistency error"); + } +} + +seq_define(data_t, data) + +typedef struct thd_data_s thd_data_t; +struct thd_data_s { + seq_data_t data; +}; + +static void * +seq_reader_thd(void *arg) { + thd_data_t *thd_data = (thd_data_t *)arg; + int iter = 0; + data_t local_data; + while (iter < 1000 * 1000 - 1) { + bool success = seq_try_load_data(&local_data, &thd_data->data); + if (success) { + assert_data(&local_data); + assert_d_le(iter, local_data.arr[0], + "Seq read went back in time."); + iter = local_data.arr[0]; + } + } + return NULL; +} + +static void * +seq_writer_thd(void *arg) { + thd_data_t *thd_data = (thd_data_t *)arg; + data_t local_data; + memset(&local_data, 0, sizeof(local_data)); + for (int i = 0; i < 1000 * 1000; i++) { + set_data(&local_data, i); + seq_store_data(&thd_data->data, &local_data); + } + return NULL; +} + +TEST_BEGIN(test_seq_threaded) { + thd_data_t thd_data; + memset(&thd_data, 0, sizeof(thd_data)); + + thd_t reader; + thd_t writer; + + thd_create(&reader, seq_reader_thd, &thd_data); + thd_create(&writer, seq_writer_thd, &thd_data); + + thd_join(reader, NULL); + thd_join(writer, NULL); +} +TEST_END + +TEST_BEGIN(test_seq_simple) { + data_t data; + seq_data_t seq; + memset(&seq, 0, sizeof(seq)); + for (int i = 0; i < 1000 * 1000; i++) { + set_data(&data, i); + seq_store_data(&seq, &data); + set_data(&data, 0); + bool success = seq_try_load_data(&data, &seq); + assert_b_eq(success, true, "Failed non-racing read"); + assert_data(&data); + } +} +TEST_END + +int main(void) { + return test_no_reentrancy( + test_seq_simple, + test_seq_threaded); +} -- cgit v0.12 From 5ae6e7cbfa6d6788340cc87d7717548f4d7960fe Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Mon, 9 Apr 2018 19:11:46 -0700 Subject: Add "hook" module. The hook module allows a low-reader-overhead way of finding hooks to invoke and calling them. For now, none of the allocation pathways are tied into the hooks; this will come later. 
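
As a usage sketch (not part of the patch), a client of this module would
fill in a hooks_t and register it roughly as follows, using the
hook_install/hook_remove API declared in hook.h below. The names
my_alloc_hook, my_alloc_count, and example are hypothetical.

    #include "jemalloc/internal/hook.h"

    /* Hypothetical counter bumped on every allocation. */
    static atomic_zu_t my_alloc_count = ATOMIC_INIT(0);

    /* Alloc hooks run after the allocation; they must tolerate reentrancy. */
    static void
    my_alloc_hook(void *extra, hook_alloc_t type, void *result,
        uintptr_t result_raw, uintptr_t args_raw[3]) {
    	atomic_fetch_add_zu(&my_alloc_count, 1, ATOMIC_RELAXED);
    }

    static hooks_t my_hooks = {
    	&my_alloc_hook,
    	NULL,	/* dalloc_hook */
    	NULL	/* expand_hook */
    };

    static void
    example(tsdn_t *tsdn) {
    	void *handle = hook_install(tsdn, &my_hooks, NULL);
    	if (handle != NULL) {
    		/* Allocations on any thread now invoke my_alloc_hook. */
    		hook_remove(tsdn, handle);
    	}
    }

Installation takes the hooks mutex and bumps the global TSD slow count, so
other threads observe the new hook the next time they fetch tsd.
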
--- Makefile.in | 2 + include/jemalloc/internal/hook.h | 125 +++++++++++++++++++++++++ include/jemalloc/internal/witness.h | 1 + src/hook.c | 133 ++++++++++++++++++++++++++ src/jemalloc.c | 2 + test/unit/hook.c | 180 ++++++++++++++++++++++++++++++++++++ 6 files changed, 443 insertions(+) create mode 100644 include/jemalloc/internal/hook.h create mode 100644 src/hook.c create mode 100644 test/unit/hook.c diff --git a/Makefile.in b/Makefile.in index 074fdd4..3b3191f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -102,6 +102,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \ $(srcroot)src/extent_dss.c \ $(srcroot)src/extent_mmap.c \ $(srcroot)src/hash.c \ + $(srcroot)src/hook.c \ $(srcroot)src/large.c \ $(srcroot)src/log.c \ $(srcroot)src/malloc_io.c \ @@ -172,6 +173,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/extent_quantize.c \ $(srcroot)test/unit/fork.c \ $(srcroot)test/unit/hash.c \ + $(srcroot)test/unit/hook.c \ $(srcroot)test/unit/junk.c \ $(srcroot)test/unit/junk_alloc.c \ $(srcroot)test/unit/junk_free.c \ diff --git a/include/jemalloc/internal/hook.h b/include/jemalloc/internal/hook.h new file mode 100644 index 0000000..847c91b --- /dev/null +++ b/include/jemalloc/internal/hook.h @@ -0,0 +1,125 @@ +#ifndef JEMALLOC_INTERNAL_HOOK_H +#define JEMALLOC_INTERNAL_HOOK_H + +#include "jemalloc/internal/tsd.h" + +/* + * This API is *extremely* experimental, and may get ripped out, changed in API- + * and ABI-incompatible ways, be insufficiently or incorrectly documented, etc. + * + * It allows hooking the stateful parts of the API to see changes as they + * happen. + * + * Allocation hooks are called after the allocation is done, free hooks are + * called before the free is done, and expand hooks are called after the + * allocation is expanded. + * + * For realloc and rallocx, if the expansion happens in place, the expansion + * hook is called. If it is moved, then the alloc hook is called on the new + * location, and then the free hook is called on the old location. + * + * (We omit no-ops, like free(NULL), etc.). + * + * Reentrancy: + * Is not protected against. If your hooks allocate, then the hooks will be + * called again. Note that you can guard against this with a thread-local + * "in_hook" bool. + * Threading: + * The installation of a hook synchronizes with all its uses. If you can + * prove the installation of a hook happens-before a jemalloc entry point, + * then the hook will get invoked (unless there's a racing removal). + * + * Hook insertion appears to be atomic at a per-thread level (i.e. if a thread + * allocates and has the alloc hook invoked, then a subsequent free on the + * same thread will also have the free hook invoked). + * + * The *removal* of a hook does *not* block until all threads are done with + * the hook. Hook authors have to be resilient to this, and need some + * out-of-band mechanism for cleaning up any dynamically allocated memory + * associated with their hook. + * Ordering: + * Order of hook execution is unspecified, and may be different than insertion + * order. + */ + +enum hook_alloc_e { + hook_alloc_malloc, + hook_alloc_posix_memalign, + hook_alloc_aligned_alloc, + hook_alloc_calloc, + hook_alloc_memalign, + hook_alloc_valloc, + hook_alloc_mallocx, + + /* The reallocating functions have both alloc and dalloc variants */ + hook_alloc_realloc, + hook_alloc_rallocx, +}; +/* + * We put the enum typedef after the enum, since this file may get included by + * jemalloc_cpp.cpp, and C++ disallows enum forward declarations. 
+ */ +typedef enum hook_alloc_e hook_alloc_t; + +enum hook_dalloc_e { + hook_dalloc_free, + hook_dalloc_dallocx, + hook_dalloc_sdallocx, + + /* + * The dalloc halves of reallocation (not called if in-place expansion + * happens). + */ + hook_dalloc_realloc, + hook_dalloc_rallocx, +}; +typedef enum hook_dalloc_e hook_dalloc_t; + + +enum hook_expand_e { + hook_expand_realloc, + hook_expand_rallocx, + hook_expand_xallocx, +}; +typedef enum hook_expand_e hook_expand_t; + +typedef void (*hook_alloc)( + void *extra, hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]); + +typedef void (*hook_dalloc)( + void *extra, hook_dalloc_t type, void *address, uintptr_t args_raw[3]); + +typedef void (*hook_expand)( + void *extra, hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]); + +typedef struct hooks_s hooks_t; +struct hooks_s { + hook_alloc alloc_hook; + hook_dalloc dalloc_hook; + hook_expand expand_hook; +}; + +/* + * Returns an opaque handle to be used when removing the hook. NULL means that + * we couldn't install the hook. + */ +bool hook_boot(); + +void *hook_install(tsdn_t *tsdn, hooks_t *hooks, void *extra); +/* Uninstalls the hook with the handle previously returned from hook_install. */ +void hook_remove(tsdn_t *tsdn, void *opaque); + +/* Hooks */ + +void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]); + +void hook_invoke_dalloc(hook_dalloc_t type, void *address, + uintptr_t args_raw[3]); + +void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]); + +#endif /* JEMALLOC_INTERNAL_HOOK_H */ diff --git a/include/jemalloc/internal/witness.h b/include/jemalloc/internal/witness.h index 7ace8ae..80ea70c 100644 --- a/include/jemalloc/internal/witness.h +++ b/include/jemalloc/internal/witness.h @@ -49,6 +49,7 @@ #define WITNESS_RANK_RTREE 17U #define WITNESS_RANK_BASE 18U #define WITNESS_RANK_ARENA_LARGE 19U +#define WITNESS_RANK_HOOK 20U #define WITNESS_RANK_LEAF 0xffffffffU #define WITNESS_RANK_BIN WITNESS_RANK_LEAF diff --git a/src/hook.c b/src/hook.c new file mode 100644 index 0000000..6b154bd --- /dev/null +++ b/src/hook.c @@ -0,0 +1,133 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/hook.h" + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/seq.h" + +typedef struct hooks_internal_s hooks_internal_t; +struct hooks_internal_s { + hooks_t hooks; + void *extra; + bool in_use; +}; + +seq_define(hooks_internal_t, hooks) + +#define HOOKS_MAX 4 +static seq_hooks_t hooks[HOOKS_MAX]; +static malloc_mutex_t hooks_mu; + +bool +hook_boot() { + return malloc_mutex_init(&hooks_mu, "hooks", WITNESS_RANK_HOOK, + malloc_mutex_rank_exclusive); +} + +static void * +hook_install_locked(hooks_t *to_install, void *extra) { + hooks_internal_t hooks_internal; + for (int i = 0; i < HOOKS_MAX; i++) { + bool success = seq_try_load_hooks(&hooks_internal, &hooks[i]); + /* We hold mu; no concurrent access. 
*/ + assert(success); + if (!hooks_internal.in_use) { + hooks_internal.hooks = *to_install; + hooks_internal.extra = extra; + hooks_internal.in_use = true; + seq_store_hooks(&hooks[i], &hooks_internal); + return &hooks[i]; + } + } + return NULL; +} + +void * +hook_install(tsdn_t *tsdn, hooks_t *to_install, void *extra) { + malloc_mutex_lock(tsdn, &hooks_mu); + void *ret = hook_install_locked(to_install, extra); + if (ret != NULL) { + tsd_global_slow_inc(tsdn); + } + malloc_mutex_unlock(tsdn, &hooks_mu); + return ret; +} + +static void +hook_remove_locked(seq_hooks_t *to_remove) { + hooks_internal_t hooks_internal; + bool success = seq_try_load_hooks(&hooks_internal, to_remove); + /* We hold mu; no concurrent access. */ + assert(success); + /* Should only remove hooks that were added. */ + assert(hooks_internal.in_use); + hooks_internal.in_use = false; + seq_store_hooks(to_remove, &hooks_internal); +} + +void +hook_remove(tsdn_t *tsdn, void *opaque) { + if (config_debug) { + char *hooks_begin = (char *)&hooks[0]; + char *hooks_end = (char *)&hooks[HOOKS_MAX]; + char *hook = (char *)opaque; + assert(hooks_begin <= hook && hook < hooks_end + && (hook - hooks_begin) % sizeof(seq_hooks_t) == 0); + } + malloc_mutex_lock(tsdn, &hooks_mu); + hook_remove_locked((seq_hooks_t *)opaque); + tsd_global_slow_dec(tsdn); + malloc_mutex_unlock(tsdn, &hooks_mu); +} + +#define FOR_EACH_HOOK_BEGIN(hooks_internal_ptr) \ +for (int for_each_hook_counter = 0; \ + for_each_hook_counter < HOOKS_MAX; \ + for_each_hook_counter++) { \ + bool for_each_hook_success = seq_try_load_hooks( \ + (hooks_internal_ptr), &hooks[for_each_hook_counter]); \ + if (!for_each_hook_success) { \ + continue; \ + } \ + if (!(hooks_internal_ptr)->in_use) { \ + continue; \ + } +#define FOR_EACH_HOOK_END \ +} + +void +hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]) { + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_alloc h = hook.hooks.alloc_hook; + if (h != NULL) { + h(hook.extra, type, result, result_raw, args_raw); + } + FOR_EACH_HOOK_END +} + +void +hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) { + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_dalloc h = hook.hooks.dalloc_hook; + if (h != NULL) { + h(hook.extra, type, address, args_raw); + } + FOR_EACH_HOOK_END +} + +void +hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]) { + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_expand h = hook.hooks.expand_hook; + if (h != NULL) { + h(hook.extra, type, address, old_usize, new_usize, + result_raw, args_raw); + } + FOR_EACH_HOOK_END +} diff --git a/src/jemalloc.c b/src/jemalloc.c index f93c16f..f837e6b 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -7,6 +7,7 @@ #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/hook.h" #include "jemalloc/internal/jemalloc_internal_types.h" #include "jemalloc/internal/log.h" #include "jemalloc/internal/malloc_io.h" @@ -1311,6 +1312,7 @@ malloc_init_hard_a0_locked() { malloc_mutex_rank_exclusive)) { return true; } + hook_boot(); /* * Create enough scaffolding to allow recursive allocation in * malloc_ncpus(). 
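Hand-expanded, the FOR_EACH_HOOK_BEGIN/FOR_EACH_HOOK_END pair above generates roughly this loop inside hook_invoke_dalloc (a sketch, with the macro's generated counter renamed to i):

hooks_internal_t hook;
for (int i = 0; i < HOOKS_MAX; i++) {
	/* A racing writer makes the seqlock snapshot fail; skip the slot. */
	if (!seq_try_load_hooks(&hook, &hooks[i])) {
		continue;
	}
	if (!hook.in_use) {
		continue;
	}
	hook_dalloc h = hook.hooks.dalloc_hook;
	if (h != NULL) {
		h(hook.extra, type, address, args_raw);
	}
}

Note that the invoke paths never take hooks_mu; only hook_install and hook_remove lock it, which is what keeps reader overhead low.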
diff --git a/test/unit/hook.c b/test/unit/hook.c new file mode 100644 index 0000000..a959096 --- /dev/null +++ b/test/unit/hook.c @@ -0,0 +1,180 @@ +#include "test/jemalloc_test.h" + +#include "jemalloc/internal/hook.h" + +static void *arg_extra; +static int arg_type; +static void *arg_result; +static void *arg_address; +static size_t arg_old_usize; +static size_t arg_new_usize; +static uintptr_t arg_result_raw; +static uintptr_t arg_args_raw[4]; + +static int call_count = 0; + +static void +reset_args() { + arg_extra = NULL; + arg_type = 12345; + arg_result = NULL; + arg_address = NULL; + arg_old_usize = 0; + arg_new_usize = 0; + arg_result_raw = 0; + memset(arg_args_raw, 77, sizeof(arg_args_raw)); +} + +static void +set_args_raw(uintptr_t *args_raw, int nargs) { + memcpy(arg_args_raw, args_raw, sizeof(uintptr_t) * nargs); +} + +static void +assert_args_raw(uintptr_t *args_raw_expected, int nargs) { + int cmp = memcmp(args_raw_expected, arg_args_raw, + sizeof(uintptr_t) * nargs); + assert_d_eq(cmp, 0, "Raw args mismatch"); +} + +static void +test_alloc_hook(void *extra, hook_alloc_t type, void *result, + uintptr_t result_raw, uintptr_t args_raw[3]) { + call_count++; + arg_extra = extra; + arg_type = (int)type; + arg_result = result; + arg_result_raw = result_raw; + set_args_raw(args_raw, 3); +} + +static void +test_dalloc_hook(void *extra, hook_dalloc_t type, void *address, + uintptr_t args_raw[3]) { + call_count++; + arg_extra = extra; + arg_type = (int)type; + arg_address = address; + set_args_raw(args_raw, 3); +} + +static void +test_expand_hook(void *extra, hook_expand_t type, void *address, + size_t old_usize, size_t new_usize, uintptr_t result_raw, + uintptr_t args_raw[4]) { + call_count++; + arg_extra = extra; + arg_type = (int)type; + arg_address = address; + arg_old_usize = old_usize; + arg_new_usize = new_usize; + arg_result_raw = result_raw; + set_args_raw(args_raw, 4); +} + +TEST_BEGIN(test_hooks_basic) { + /* Just verify that the hooks record their arguments correctly.
*/ + hooks_t hooks = { + &test_alloc_hook, &test_dalloc_hook, &test_expand_hook}; + void *handle = hook_install(TSDN_NULL, &hooks, (void *)111); + uintptr_t args_raw[4] = {10, 20, 30, 40}; + + /* Alloc */ + reset_args(); + hook_invoke_alloc(hook_alloc_posix_memalign, (void *)222, 333, + args_raw); + assert_ptr_eq(arg_extra, (void *)111, "Passed wrong user pointer"); + assert_d_eq((int)hook_alloc_posix_memalign, arg_type, + "Passed wrong alloc type"); + assert_ptr_eq((void *)222, arg_result, "Passed wrong result address"); + assert_u64_eq(333, arg_result_raw, "Passed wrong result"); + assert_args_raw(args_raw, 3); + + /* Dalloc */ + reset_args(); + hook_invoke_dalloc(hook_dalloc_sdallocx, (void *)222, args_raw); + assert_d_eq((int)hook_dalloc_sdallocx, arg_type, + "Passed wrong dalloc type"); + assert_ptr_eq((void *)111, arg_extra, "Passed wrong user pointer"); + assert_ptr_eq((void *)222, arg_address, "Passed wrong address"); + assert_args_raw(args_raw, 3); + + /* Expand */ + reset_args(); + hook_invoke_expand(hook_expand_xallocx, (void *)222, 333, 444, 555, + args_raw); + assert_d_eq((int)hook_expand_xallocx, arg_type, + "Passed wrong expand type"); + assert_ptr_eq((void *)111, arg_extra, "Passed wrong user pointer"); + assert_ptr_eq((void *)222, arg_address, "Passed wrong address"); + assert_zu_eq(333, arg_old_usize, "Passed wrong old usize"); + assert_zu_eq(444, arg_new_usize, "Passed wrong new usize"); + assert_zu_eq(555, arg_result_raw, "Passed wrong result"); + assert_args_raw(args_raw, 4); + + hook_remove(TSDN_NULL, handle); +} +TEST_END + +TEST_BEGIN(test_hooks_null) { + /* Null hooks should be ignored, not crash. */ + hooks_t hooks1 = {NULL, NULL, NULL}; + hooks_t hooks2 = {&test_alloc_hook, NULL, NULL}; + hooks_t hooks3 = {NULL, &test_dalloc_hook, NULL}; + hooks_t hooks4 = {NULL, NULL, &test_expand_hook}; + + void *handle1 = hook_install(TSDN_NULL, &hooks1, NULL); + void *handle2 = hook_install(TSDN_NULL, &hooks2, NULL); + void *handle3 = hook_install(TSDN_NULL, &hooks3, NULL); + void *handle4 = hook_install(TSDN_NULL, &hooks4, NULL); + + assert_ptr_ne(handle1, NULL, "Hook installation failed"); + assert_ptr_ne(handle2, NULL, "Hook installation failed"); + assert_ptr_ne(handle3, NULL, "Hook installation failed"); + assert_ptr_ne(handle4, NULL, "Hook installation failed"); + + uintptr_t args_raw[4] = {10, 20, 30, 40}; + + call_count = 0; + hook_invoke_alloc(hook_alloc_malloc, NULL, 0, args_raw); + assert_d_eq(call_count, 1, "Called wrong number of times"); + + call_count = 0; + hook_invoke_dalloc(hook_dalloc_free, NULL, args_raw); + assert_d_eq(call_count, 1, "Called wrong number of times"); + + call_count = 0; + hook_invoke_expand(hook_expand_realloc, NULL, 0, 0, 0, args_raw); + assert_d_eq(call_count, 1, "Called wrong number of times"); + + hook_remove(TSDN_NULL, handle1); + hook_remove(TSDN_NULL, handle2); + hook_remove(TSDN_NULL, handle3); + hook_remove(TSDN_NULL, handle4); +} +TEST_END + +TEST_BEGIN(test_hooks_remove) { + hooks_t hooks = {&test_alloc_hook, NULL, NULL}; + void *handle = hook_install(TSDN_NULL, &hooks, NULL); + assert_ptr_ne(handle, NULL, "Hook installation failed"); + call_count = 0; + uintptr_t args_raw[4] = {10, 20, 30, 40}; + hook_invoke_alloc(hook_alloc_malloc, NULL, 0, args_raw); + assert_d_eq(call_count, 1, "Hook not invoked"); + + call_count = 0; + hook_remove(TSDN_NULL, handle); + hook_invoke_alloc(hook_alloc_malloc, NULL, 0, NULL); + assert_d_eq(call_count, 0, "Hook invoked after removal"); + +} +TEST_END + +int +main(void) { + return test( + 
test_hooks_basic, + test_hooks_null, + test_hooks_remove); +} -- cgit v0.12 From fe0e39938593b5fb16dc09fcdbe29d6ad7b3cf05 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 19 Apr 2018 11:35:33 -0700 Subject: Hooks: add an early-exit path for the common no-hook case. --- src/hook.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/hook.c b/src/hook.c index 6b154bd..5c6818f 100644 --- a/src/hook.c +++ b/src/hook.c @@ -16,6 +16,7 @@ struct hooks_internal_s { seq_define(hooks_internal_t, hooks) #define HOOKS_MAX 4 +static atomic_u_t nhooks = ATOMIC_INIT(0); static seq_hooks_t hooks[HOOKS_MAX]; static malloc_mutex_t hooks_mu; @@ -37,6 +38,9 @@ hook_install_locked(hooks_t *to_install, void *extra) { hooks_internal.extra = extra; hooks_internal.in_use = true; seq_store_hooks(&hooks[i], &hooks_internal); + atomic_store_u(&nhooks, + atomic_load_u(&nhooks, ATOMIC_RELAXED) + 1, + ATOMIC_RELAXED); return &hooks[i]; } } @@ -64,6 +68,8 @@ hook_remove_locked(seq_hooks_t *to_remove) { assert(hooks_internal.in_use); hooks_internal.in_use = false; seq_store_hooks(to_remove, &hooks_internal); + atomic_store_u(&nhooks, atomic_load_u(&nhooks, ATOMIC_RELAXED) - 1, + ATOMIC_RELAXED); } void @@ -99,6 +105,9 @@ for (int for_each_hook_counter = 0; \ void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, uintptr_t args_raw[3]) { + if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { + return; + } hooks_internal_t hook; FOR_EACH_HOOK_BEGIN(&hook) hook_alloc h = hook.hooks.alloc_hook; @@ -110,6 +119,9 @@ hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, void hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) { + if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { + return; + } hooks_internal_t hook; FOR_EACH_HOOK_BEGIN(&hook) hook_dalloc h = hook.hooks.dalloc_hook; @@ -122,6 +134,9 @@ hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) { void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]) { + if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { + return; + } hooks_internal_t hook; FOR_EACH_HOOK_BEGIN(&hook) hook_expand h = hook.hooks.expand_hook; -- cgit v0.12 From 226327cf66f6e1fb1aed24ed3e2e9c291d1843b7 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 19 Apr 2018 13:14:22 -0700 Subject: Hooks: hook the pure-allocation functions. --- include/jemalloc/internal/hook.h | 2 +- src/jemalloc.c | 66 +++++++++++++++++++-- test/unit/hook.c | 124 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 184 insertions(+), 8 deletions(-) diff --git a/include/jemalloc/internal/hook.h b/include/jemalloc/internal/hook.h index 847c91b..fbf3a07 100644 --- a/include/jemalloc/internal/hook.h +++ b/include/jemalloc/internal/hook.h @@ -18,7 +18,7 @@ * hook is called. If it is moved, then the alloc hook is called on the new * location, and then the free hook is called on the old location. * - * (We omit no-ops, like free(NULL), etc.). + * If we return NULL from OOM, then usize might not be trustworthy. * * Reentrancy: * Is not protected against. 
If your hooks allocate, then the hooks will be diff --git a/src/jemalloc.c b/src/jemalloc.c index f837e6b..df59f69 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2038,6 +2038,14 @@ je_malloc(size_t size) { dopts.item_size = size; imalloc(&sopts, &dopts); + /* + * Note that this branch gets optimized away -- it immediately follows + * the check on tsd_fast that sets sopts.slow. + */ + if (sopts.slow) { + uintptr_t args[3] = {size}; + hook_invoke_alloc(hook_alloc_malloc, ret, (uintptr_t)ret, args); + } LOG("core.malloc.exit", "result: %p", ret); @@ -2070,6 +2078,12 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) { dopts.alignment = alignment; ret = imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)memptr, (uintptr_t)alignment, + (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_posix_memalign, *memptr, + (uintptr_t)ret, args); + } LOG("core.posix_memalign.exit", "result: %d, alloc ptr: %p", ret, *memptr); @@ -2107,6 +2121,11 @@ je_aligned_alloc(size_t alignment, size_t size) { dopts.alignment = alignment; imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)alignment, (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_aligned_alloc, ret, + (uintptr_t)ret, args); + } LOG("core.aligned_alloc.exit", "result: %p", ret); @@ -2138,6 +2157,10 @@ je_calloc(size_t num, size_t size) { dopts.zero = true; imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)num, (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_calloc, ret, (uintptr_t)ret, args); + } LOG("core.calloc.exit", "result: %p", ret); @@ -2307,6 +2330,7 @@ je_realloc(void *ptr, size_t size) { } else { tcache = NULL; } + ifree(tsd, ptr, tcache, true); LOG("core.realloc.exit", "result: %p", NULL); @@ -2330,9 +2354,12 @@ je_realloc(void *ptr, size_t size) { assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); if (config_prof && opt_prof) { usize = sz_s2u(size); - ret = unlikely(usize == 0 || usize > LARGE_MAXCLASS) ? - NULL : irealloc_prof(tsd, ptr, old_usize, usize, - &alloc_ctx); + if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + ret = NULL; + } else { + ret = irealloc_prof(tsd, ptr, old_usize, usize, + &alloc_ctx); + } } else { if (config_stats) { usize = sz_s2u(size); @@ -2342,8 +2369,23 @@ je_realloc(void *ptr, size_t size) { tsdn = tsd_tsdn(tsd); } else { /* realloc(NULL, size) is equivalent to malloc(size). 
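 * (Presumably the reason the je_malloc() call below is replaced with an
 * inlined imalloc() path: je_malloc() would now report this allocation
 * through the malloc hook, whereas a later patch in this series reports
 * it through hook_alloc_realloc instead.)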
*/ - void *ret = je_malloc(size); - LOG("core.realloc.exit", "result: %p", ret); + static_opts_t sopts; + dynamic_opts_t dopts; + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.bump_empty_alloc = true; + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.oom_string = + ": Error in realloc(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + + imalloc(&sopts, &dopts); return ret; } @@ -2443,6 +2485,11 @@ je_memalign(size_t alignment, size_t size) { dopts.alignment = alignment; imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {alignment, size}; + hook_invoke_alloc(hook_alloc_memalign, ret, (uintptr_t)ret, + args); + } LOG("core.memalign.exit", "result: %p", ret); return ret; @@ -2478,6 +2525,10 @@ je_valloc(size_t size) { dopts.alignment = PAGE; imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {size}; + hook_invoke_alloc(hook_alloc_valloc, ret, (uintptr_t)ret, args); + } LOG("core.valloc.exit", "result: %p\n", ret); return ret; @@ -2588,6 +2639,11 @@ je_mallocx(size_t size, int flags) { } imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {size, flags}; + hook_invoke_alloc(hook_alloc_mallocx, ret, (uintptr_t)ret, + args); + } LOG("core.mallocx.exit", "result: %p", ret); return ret; diff --git a/test/unit/hook.c b/test/unit/hook.c index a959096..06d4b82 100644 --- a/test/unit/hook.c +++ b/test/unit/hook.c @@ -38,6 +38,12 @@ assert_args_raw(uintptr_t *args_raw_expected, int nargs) { } static void +reset() { + call_count = 0; + reset_args(); +} + +static void test_alloc_hook(void *extra, hook_alloc_t type, void *result, uintptr_t result_raw, uintptr_t args_raw[3]) { call_count++; @@ -171,10 +177,124 @@ TEST_BEGIN(test_hooks_remove) { } TEST_END +TEST_BEGIN(test_hooks_alloc_simple) { + /* "Simple" in the sense that we're not in a realloc variant. */ + + hooks_t hooks = {&test_alloc_hook, NULL, NULL}; + void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + assert_ptr_ne(handle, NULL, "Hook installation failed"); + + /* Stop malloc from being optimized away. 
*/ + volatile int err; + void *volatile ptr; + + /* malloc */ + reset(); + ptr = malloc(1); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_malloc, "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)1, arg_args_raw[0], "Wrong argument"); + free(ptr); + + /* posix_memalign */ + reset(); + err = posix_memalign((void **)&ptr, 1024, 1); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_posix_memalign, + "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)err, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)&ptr, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)1024, arg_args_raw[1], "Wrong argument"); + assert_u64_eq((uintptr_t)1, arg_args_raw[2], "Wrong argument"); + free(ptr); + + /* aligned_alloc */ + reset(); + ptr = aligned_alloc(1024, 1); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_aligned_alloc, + "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)1024, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)1, arg_args_raw[1], "Wrong argument"); + free(ptr); + + /* calloc */ + reset(); + ptr = calloc(11, 13); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_calloc, "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)11, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)13, arg_args_raw[1], "Wrong argument"); + free(ptr); + + /* memalign */ +#ifdef JEMALLOC_OVERRIDE_MEMALIGN + reset(); + ptr = memalign(1024, 1); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_memalign, "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)1024, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)1, arg_args_raw[1], "Wrong argument"); + free(ptr); +#endif /* JEMALLOC_OVERRIDE_MEMALIGN */ + + /* valloc */ +#ifdef JEMALLOC_OVERRIDE_VALLOC + reset(); + ptr = valloc(1); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_valloc, "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)1, arg_args_raw[0], "Wrong argument"); + free(ptr); +#endif /* JEMALLOC_OVERRIDE_VALLOC */ + + /* mallocx */ + reset(); + ptr = mallocx(1, MALLOCX_LG_ALIGN(10)); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_mallocx, "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, 
(uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)1, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)MALLOCX_LG_ALIGN(10), arg_args_raw[1], + "Wrong flags"); + free(ptr); + + hook_remove(TSDN_NULL, handle); +} +TEST_END + int main(void) { - return test( + /* We assert on call counts. */ + return test_no_reentrancy( test_hooks_basic, test_hooks_null, - test_hooks_remove); + test_hooks_remove, + test_hooks_alloc_simple); } -- cgit v0.12 From c154f5881b72c52a131e88ade6108d663ac03700 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 19 Apr 2018 15:02:53 -0700 Subject: Hooks: hook the pure-deallocation functions. --- src/jemalloc.c | 6 ++++++ test/unit/hook.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index df59f69..42502ab 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2441,6 +2441,8 @@ je_free(void *ptr) { } else { tcache = NULL; } + uintptr_t args_raw[3] = {(uintptr_t)ptr}; + hook_invoke_dalloc(hook_dalloc_free, ptr, args_raw); ifree(tsd, ptr, tcache, true); } check_entry_exit_locking(tsd_tsdn(tsd)); @@ -3012,6 +3014,8 @@ je_dallocx(void *ptr, int flags) { tsd_assert_fast(tsd); ifree(tsd, ptr, tcache, false); } else { + uintptr_t args_raw[3] = {(uintptr_t)ptr, flags}; + hook_invoke_dalloc(hook_dalloc_dallocx, ptr, args_raw); ifree(tsd, ptr, tcache, true); } check_entry_exit_locking(tsd_tsdn(tsd)); @@ -3074,6 +3078,8 @@ je_sdallocx(void *ptr, size_t size, int flags) { tsd_assert_fast(tsd); isfree(tsd, ptr, usize, tcache, false); } else { + uintptr_t args_raw[3] = {(uintptr_t)ptr, size, flags}; + hook_invoke_dalloc(hook_dalloc_sdallocx, ptr, args_raw); isfree(tsd, ptr, usize, tcache, true); } check_entry_exit_locking(tsd_tsdn(tsd)); diff --git a/test/unit/hook.c b/test/unit/hook.c index 06d4b82..2b61201 100644 --- a/test/unit/hook.c +++ b/test/unit/hook.c @@ -289,6 +289,53 @@ TEST_BEGIN(test_hooks_alloc_simple) { } TEST_END +TEST_BEGIN(test_hooks_dalloc_simple) { + /* "Simple" in the sense that we're not in a realloc variant. 
*/ + hooks_t hooks = {NULL, &test_dalloc_hook, NULL}; + void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + assert_ptr_ne(handle, NULL, "Hook installation failed"); + + void *volatile ptr; + + /* free() */ + reset(); + ptr = malloc(1); + free(ptr); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_dalloc_free, "Wrong hook type"); + assert_ptr_eq(ptr, arg_address, "Wrong pointer freed"); + assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong raw arg"); + + /* dallocx() */ + reset(); + ptr = malloc(1); + dallocx(ptr, MALLOCX_TCACHE_NONE); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_dalloc_dallocx, "Wrong hook type"); + assert_ptr_eq(ptr, arg_address, "Wrong pointer freed"); + assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong raw arg"); + assert_u64_eq((uintptr_t)MALLOCX_TCACHE_NONE, arg_args_raw[1], + "Wrong raw arg"); + + /* sdallocx() */ + reset(); + ptr = malloc(1); + sdallocx(ptr, 1, MALLOCX_TCACHE_NONE); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_dalloc_sdallocx, "Wrong hook type"); + assert_ptr_eq(ptr, arg_address, "Wrong pointer freed"); + assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong raw arg"); + assert_u64_eq((uintptr_t)1, arg_args_raw[1], "Wrong raw arg"); + assert_u64_eq((uintptr_t)MALLOCX_TCACHE_NONE, arg_args_raw[2], + "Wrong raw arg"); + + hook_remove(TSDN_NULL, handle); +} +TEST_END + int main(void) { /* We assert on call counts. */ @@ -296,5 +343,6 @@ main(void) { test_hooks_basic, test_hooks_null, test_hooks_remove, - test_hooks_alloc_simple); + test_hooks_alloc_simple, + test_hooks_dalloc_simple); } -- cgit v0.12 From 83e516154cfacfc1e010a03f2f420bf79913944a Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 19 Apr 2018 16:19:38 -0700 Subject: Hooks: hook the pure-expand function. --- src/jemalloc.c | 6 ++++++ test/unit/hook.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 42502ab..1a62180 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2942,6 +2942,12 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { *tsd_thread_deallocatedp_get(tsd) += old_usize; } label_not_resized: + if (unlikely(!tsd_fast(tsd))) { + uintptr_t args[4] = {(uintptr_t)ptr, size, extra, flags}; + hook_invoke_expand(hook_expand_xallocx, ptr, old_usize, + usize, (uintptr_t)usize, args); + } + UTRACE(ptr, size, ptr); check_entry_exit_locking(tsd_tsdn(tsd)); diff --git a/test/unit/hook.c b/test/unit/hook.c index 2b61201..f923f72 100644 --- a/test/unit/hook.c +++ b/test/unit/hook.c @@ -336,6 +336,35 @@ TEST_BEGIN(test_hooks_dalloc_simple) { } TEST_END +TEST_BEGIN(test_hooks_expand_simple) { + /* "Simple" in the sense that we're not in a realloc variant. 
*/ + hooks_t hooks = {NULL, NULL, &test_expand_hook}; + void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + assert_ptr_ne(handle, NULL, "Hook installation failed"); + + void *volatile ptr; + + /* xallocx() */ + reset(); + ptr = malloc(1); + size_t new_usize = xallocx(ptr, 100, 200, MALLOCX_TCACHE_NONE); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_expand_xallocx, "Wrong hook type"); + assert_ptr_eq(ptr, arg_address, "Wrong pointer expanded"); + assert_u64_eq(arg_old_usize, nallocx(1, 0), "Wrong old usize"); + assert_u64_eq(arg_new_usize, sallocx(ptr, 0), "Wrong new usize"); + assert_u64_eq(new_usize, arg_result_raw, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong arg"); + assert_u64_eq(100, arg_args_raw[1], "Wrong arg"); + assert_u64_eq(200, arg_args_raw[2], "Wrong arg"); + assert_u64_eq(MALLOCX_TCACHE_NONE, arg_args_raw[3], "Wrong arg"); + + hook_remove(TSDN_NULL, handle); +} +TEST_END + + int main(void) { /* We assert on call counts. */ @@ -344,5 +373,6 @@ main(void) { test_hooks_null, test_hooks_remove, test_hooks_alloc_simple, - test_hooks_dalloc_simple); + test_hooks_dalloc_simple, + test_hooks_expand_simple); } -- cgit v0.12 From 67270040a56d8658ce6aec81b15d78571e0e9198 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 19 Apr 2018 16:44:25 -0700 Subject: Hooks: hook the realloc paths that act as pure malloc/free. --- include/jemalloc/internal/hook.h | 8 +++++-- src/jemalloc.c | 12 +++++++++- test/unit/hook.c | 52 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/include/jemalloc/internal/hook.h b/include/jemalloc/internal/hook.h index fbf3a07..ac1bcdb 100644 --- a/include/jemalloc/internal/hook.h +++ b/include/jemalloc/internal/hook.h @@ -16,9 +16,13 @@ * * For realloc and rallocx, if the expansion happens in place, the expansion * hook is called. If it is moved, then the alloc hook is called on the new - * location, and then the free hook is called on the old location. + * location, and then the free hook is called on the old location (i.e. both + * hooks are invoked in between the alloc and the dalloc). * - * If we return NULL from OOM, then usize might not be trustworthy. + * If we return NULL from OOM, then usize might not be trustworthy. Calling + * realloc(NULL, size) only calls the alloc hook, and calling realloc(ptr, 0) + * only calls the free hook. (Calling realloc(NULL, 0) is treated as malloc(0), + * and only calls the alloc hook). * * Reentrancy: * Is not protected against. 
If your hooks allocate, then the hooks will be diff --git a/src/jemalloc.c b/src/jemalloc.c index 1a62180..57c2019 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2311,11 +2311,12 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) { JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ALLOC_SIZE(2) -je_realloc(void *ptr, size_t size) { +je_realloc(void *ptr, size_t arg_size) { void *ret; tsdn_t *tsdn JEMALLOC_CC_SILENCE_INIT(NULL); size_t usize JEMALLOC_CC_SILENCE_INIT(0); size_t old_usize = 0; + size_t size = arg_size; LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size); @@ -2331,6 +2332,9 @@ je_realloc(void *ptr, size_t size) { tcache = NULL; } + uintptr_t args[3] = {(uintptr_t)ptr, size}; + hook_invoke_dalloc(hook_dalloc_realloc, ptr, args); + ifree(tsd, ptr, tcache, true); LOG("core.realloc.exit", "result: %p", NULL); @@ -2386,6 +2390,12 @@ je_realloc(void *ptr, size_t size) { dopts.item_size = size; imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)ptr, arg_size}; + hook_invoke_alloc(hook_alloc_realloc, ret, + (uintptr_t)ret, args); + } + return ret; } diff --git a/test/unit/hook.c b/test/unit/hook.c index f923f72..8c9d680 100644 --- a/test/unit/hook.c +++ b/test/unit/hook.c @@ -179,7 +179,6 @@ TEST_END TEST_BEGIN(test_hooks_alloc_simple) { /* "Simple" in the sense that we're not in a realloc variant. */ - hooks_t hooks = {&test_alloc_hook, NULL, NULL}; void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); assert_ptr_ne(handle, NULL, "Hook installation failed"); @@ -364,6 +363,54 @@ TEST_BEGIN(test_hooks_expand_simple) { } TEST_END +TEST_BEGIN(test_hooks_realloc_as_malloc_or_free) { + hooks_t hooks = {&test_alloc_hook, &test_dalloc_hook, + &test_expand_hook}; + void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + assert_ptr_ne(handle, NULL, "Hook installation failed"); + + void *volatile ptr; + + /* realloc(NULL, size) as malloc */ + reset(); + ptr = realloc(NULL, 1); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_realloc, "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)NULL, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)1, arg_args_raw[1], "Wrong argument"); + free(ptr); + + /* realloc(ptr, 0) as free */ + ptr = malloc(1); + reset(); + realloc(ptr, 0); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_dalloc_realloc, "Wrong hook type"); + assert_ptr_eq(ptr, arg_address, "Wrong pointer freed"); + assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong raw arg"); + assert_u64_eq((uintptr_t)0, arg_args_raw[1], "Wrong raw arg"); + + /* realloc(NULL, 0) as malloc(0) */ + reset(); + ptr = realloc(NULL, 0); + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, (int)hook_alloc_realloc, "Wrong hook type"); + assert_ptr_eq(ptr, arg_result, "Wrong result"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)NULL, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)0, arg_args_raw[1], "Wrong argument"); + free(ptr); + + hook_remove(TSDN_NULL, handle); +} +TEST_END int main(void) { @@ -374,5 +421,6 @@ 
main(void) { /* We assert on call counts. */ return test_no_reentrancy( test_hooks_basic, test_hooks_null, test_hooks_remove, test_hooks_alloc_simple, test_hooks_dalloc_simple, - test_hooks_expand_simple); + test_hooks_expand_simple, + test_hooks_realloc_as_malloc_or_free); } -- cgit v0.12 From cb0707c0fc948875876b93514938646455650e2b Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Mon, 23 Apr 2018 18:07:40 -0700 Subject: Hooks: hook the realloc pathways that move/expand. --- include/jemalloc/internal/arena_externs.h | 4 +- include/jemalloc/internal/hook.h | 31 ++++++ .../internal/jemalloc_internal_inlines_c.h | 45 ++++---- include/jemalloc/internal/large_externs.h | 7 +- src/arena.c | 18 +++- src/jemalloc.c | 44 +++++--- src/large.c | 17 +++- test/unit/hook.c | 113 ++++++++++++++++++++- 8 files changed, 231 insertions(+), 48 deletions(-) diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index 4b3732b..f4edcc7 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -3,6 +3,7 @@ #include "jemalloc/internal/bin.h" #include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/hook.h" #include "jemalloc/internal/pages.h" #include "jemalloc/internal/size_classes.h" #include "jemalloc/internal/stats.h" @@ -65,7 +66,8 @@ void arena_dalloc_small(tsdn_t *tsdn, void *ptr); bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, - size_t size, size_t alignment, bool zero, tcache_t *tcache); + size_t size, size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args); dss_prec_t arena_dss_prec_get(arena_t *arena); bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); ssize_t arena_dirty_decay_ms_default_get(void); diff --git a/include/jemalloc/internal/hook.h b/include/jemalloc/internal/hook.h index ac1bcdb..fc61e9b 100644 --- a/include/jemalloc/internal/hook.h +++ b/include/jemalloc/internal/hook.h @@ -106,6 +106,37 @@ struct hooks_s { }; /* + * Begin implementation details; everything above this point might one day live + * in a public API. Everything below this point never will. + */ + +/* + * The realloc pathways haven't gotten any refactoring love in a while, and it's + * fairly difficult to pass information from the entry point to the hooks. We + * put the information the hooks will need into a struct to encapsulate + * everything. + * + * Many of these pathways are force-inlined, so that the compiler can avoid + * materializing this struct until we hit an extern arena function. For fairly + * goofy reasons, *many* of the realloc paths hit an extern arena function. + * These paths are cold enough that it doesn't matter; eventually, we should + * rewrite the realloc code to make the expand-in-place and the + * free-then-realloc paths more orthogonal, at which point we don't need to + * spread the hook logic all over the place. + */ +typedef struct hook_ralloc_args_s hook_ralloc_args_t; +struct hook_ralloc_args_s { + /* I.e. as opposed to rallocx. */ + bool is_realloc; + /* + * The expand hook takes 4 arguments, even if only 3 are actually used; + * we add an extra one in case the user decides to memcpy without + * looking too closely at the hooked function. + */ + uintptr_t args[4]; +}; + +/* * Returns an opaque handle to be used when removing the hook. NULL means that * we couldn't install the hook.
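 * (The slot table backing this is fixed-size; src/hook.c sizes it with
 * HOOKS_MAX, currently 4, so installation fails once every slot is in
 * use.)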
*/ diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h index c829ac6..1b5c11e 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -1,6 +1,7 @@ #ifndef JEMALLOC_INTERNAL_INLINES_C_H #define JEMALLOC_INTERNAL_INLINES_C_H +#include "jemalloc/internal/hook.h" #include "jemalloc/internal/jemalloc_internal_types.h" #include "jemalloc/internal/sz.h" #include "jemalloc/internal/witness.h" @@ -133,31 +134,20 @@ isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, JEMALLOC_ALWAYS_INLINE void * iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, - size_t extra, size_t alignment, bool zero, tcache_t *tcache, - arena_t *arena) { + size_t alignment, bool zero, tcache_t *tcache, arena_t *arena, + hook_ralloc_args_t *hook_args) { witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); void *p; size_t usize, copysize; - usize = sz_sa2u(size + extra, alignment); + usize = sz_sa2u(size, alignment); if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { return NULL; } p = ipalloct(tsdn, usize, alignment, zero, tcache, arena); if (p == NULL) { - if (extra == 0) { - return NULL; - } - /* Try again, without extra this time. */ - usize = sz_sa2u(size, alignment); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { - return NULL; - } - p = ipalloct(tsdn, usize, alignment, zero, tcache, arena); - if (p == NULL) { - return NULL; - } + return NULL; } /* * Copy at most size bytes (not size+extra), since the caller has no @@ -165,13 +155,26 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, */ copysize = (size < oldsize) ? size : oldsize; memcpy(p, ptr, copysize); + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, p, (uintptr_t)p, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); isdalloct(tsdn, ptr, oldsize, tcache, NULL, true); return p; } +/* + * is_realloc threads through the knowledge of whether or not this call comes + * from je_realloc (as opposed to je_rallocx); this ensures that we pass the + * correct entry point into any hooks. + * Note that these functions are all force-inlined, so no actual bool gets + * passed-around anywhere. + */ JEMALLOC_ALWAYS_INLINE void * iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment, - bool zero, tcache_t *tcache, arena_t *arena) { + bool zero, tcache_t *tcache, arena_t *arena, hook_ralloc_args_t *hook_args) +{ assert(ptr != NULL); assert(size != 0); witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), @@ -183,19 +186,19 @@ iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment, * Existing object alignment is inadequate; allocate new space * and copy. 
*/ - return iralloct_realign(tsdn, ptr, oldsize, size, 0, alignment, - zero, tcache, arena); + return iralloct_realign(tsdn, ptr, oldsize, size, alignment, + zero, tcache, arena, hook_args); } return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero, - tcache); + tcache, hook_args); } JEMALLOC_ALWAYS_INLINE void * iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, - bool zero) { + bool zero, hook_ralloc_args_t *hook_args) { return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, zero, - tcache_get(tsd), NULL); + tcache_get(tsd), NULL, hook_args); } JEMALLOC_ALWAYS_INLINE bool diff --git a/include/jemalloc/internal/large_externs.h b/include/jemalloc/internal/large_externs.h index 3f36282..88682ea 100644 --- a/include/jemalloc/internal/large_externs.h +++ b/include/jemalloc/internal/large_externs.h @@ -1,13 +1,16 @@ #ifndef JEMALLOC_INTERNAL_LARGE_EXTERNS_H #define JEMALLOC_INTERNAL_LARGE_EXTERNS_H +#include "jemalloc/internal/hook.h" + void *large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero); void *large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero); bool large_ralloc_no_move(tsdn_t *tsdn, extent_t *extent, size_t usize_min, size_t usize_max, bool zero); -void *large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize, - size_t alignment, bool zero, tcache_t *tcache); +void *large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize, + size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args); typedef void (large_dalloc_junk_t)(void *, size_t); extern large_dalloc_junk_t *JET_MUTABLE large_dalloc_junk; diff --git a/src/arena.c b/src/arena.c index 311943f..b76be5f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1630,7 +1630,8 @@ arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize, void * arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, - size_t size, size_t alignment, bool zero, tcache_t *tcache) { + size_t size, size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args) { size_t usize = sz_s2u(size); if (unlikely(usize == 0 || size > LARGE_MAXCLASS)) { return NULL; @@ -1639,13 +1640,17 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, if (likely(usize <= SMALL_MAXCLASS)) { /* Try to avoid moving the allocation. */ if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero)) { + hook_invoke_expand(hook_args->is_realloc + ? hook_expand_realloc : hook_expand_rallocx, + ptr, oldsize, usize, (uintptr_t)ptr, + hook_args->args); return ptr; } } if (oldsize >= LARGE_MINCLASS && usize >= LARGE_MINCLASS) { - return large_ralloc(tsdn, arena, iealloc(tsdn, ptr), usize, - alignment, zero, tcache); + return large_ralloc(tsdn, arena, ptr, usize, + alignment, zero, tcache, hook_args); } /* @@ -1658,11 +1663,16 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, return NULL; } + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); + /* * Junk/zero-filling were already done by * ipalloc()/arena_malloc(). */ - size_t copysize = (usize < oldsize) ? 
usize : oldsize; memcpy(ret, ptr, copysize); isdalloct(tsdn, ptr, oldsize, tcache, NULL, true); diff --git a/src/jemalloc.c b/src/jemalloc.c index 57c2019..264408f 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2169,20 +2169,22 @@ je_calloc(size_t num, size_t size) { static void * irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize, - prof_tctx_t *tctx) { + prof_tctx_t *tctx, hook_ralloc_args_t *hook_args) { void *p; if (tctx == NULL) { return NULL; } if (usize <= SMALL_MAXCLASS) { - p = iralloc(tsd, old_ptr, old_usize, LARGE_MINCLASS, 0, false); + p = iralloc(tsd, old_ptr, old_usize, LARGE_MINCLASS, 0, false, + hook_args); if (p == NULL) { return NULL; } arena_prof_promote(tsd_tsdn(tsd), p, usize); } else { - p = iralloc(tsd, old_ptr, old_usize, usize, 0, false); + p = iralloc(tsd, old_ptr, old_usize, usize, 0, false, + hook_args); } return p; @@ -2190,7 +2192,7 @@ irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize, JEMALLOC_ALWAYS_INLINE void * irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize, - alloc_ctx_t *alloc_ctx) { + alloc_ctx_t *alloc_ctx, hook_ralloc_args_t *hook_args) { void *p; bool prof_active; prof_tctx_t *old_tctx, *tctx; @@ -2199,9 +2201,11 @@ irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize, old_tctx = prof_tctx_get(tsd_tsdn(tsd), old_ptr, alloc_ctx); tctx = prof_alloc_prep(tsd, usize, prof_active, true); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { - p = irealloc_prof_sample(tsd, old_ptr, old_usize, usize, tctx); + p = irealloc_prof_sample(tsd, old_ptr, old_usize, usize, tctx, + hook_args); } else { - p = iralloc(tsd, old_ptr, old_usize, usize, 0, false); + p = iralloc(tsd, old_ptr, old_usize, usize, 0, false, + hook_args); } if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, true); @@ -2349,6 +2353,10 @@ je_realloc(void *ptr, size_t arg_size) { check_entry_exit_locking(tsd_tsdn(tsd)); + + hook_ralloc_args_t hook_args = {true, {(uintptr_t)ptr, + (uintptr_t)arg_size, 0, 0}}; + alloc_ctx_t alloc_ctx; rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, @@ -2362,13 +2370,14 @@ je_realloc(void *ptr, size_t arg_size) { ret = NULL; } else { ret = irealloc_prof(tsd, ptr, old_usize, usize, - &alloc_ctx); + &alloc_ctx, &hook_args); } } else { if (config_stats) { usize = sz_s2u(size); } - ret = iralloc(tsd, ptr, old_usize, size, 0, false); + ret = iralloc(tsd, ptr, old_usize, size, 0, false, + &hook_args); } tsdn = tsd_tsdn(tsd); } else { @@ -2664,7 +2673,7 @@ je_mallocx(size_t size, int flags) { static void * irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize, size_t usize, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena, - prof_tctx_t *tctx) { + prof_tctx_t *tctx, hook_ralloc_args_t *hook_args) { void *p; if (tctx == NULL) { @@ -2672,14 +2681,14 @@ irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize, } if (usize <= SMALL_MAXCLASS) { p = iralloct(tsdn, old_ptr, old_usize, LARGE_MINCLASS, - alignment, zero, tcache, arena); + alignment, zero, tcache, arena, hook_args); if (p == NULL) { return NULL; } arena_prof_promote(tsdn, p, usize); } else { p = iralloct(tsdn, old_ptr, old_usize, usize, alignment, zero, - tcache, arena); + tcache, arena, hook_args); } return p; @@ -2688,7 +2697,7 @@ irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize, JEMALLOC_ALWAYS_INLINE void * irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, size_t 
alignment, size_t *usize, bool zero, tcache_t *tcache, - arena_t *arena, alloc_ctx_t *alloc_ctx) { + arena_t *arena, alloc_ctx_t *alloc_ctx, hook_ralloc_args_t *hook_args) { void *p; bool prof_active; prof_tctx_t *old_tctx, *tctx; @@ -2698,10 +2707,10 @@ irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, tctx = prof_alloc_prep(tsd, *usize, prof_active, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { p = irallocx_prof_sample(tsd_tsdn(tsd), old_ptr, old_usize, - *usize, alignment, zero, tcache, arena, tctx); + *usize, alignment, zero, tcache, arena, tctx, hook_args); } else { p = iralloct(tsd_tsdn(tsd), old_ptr, old_usize, size, alignment, - zero, tcache, arena); + zero, tcache, arena, hook_args); } if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, false); @@ -2775,6 +2784,9 @@ je_rallocx(void *ptr, size_t size, int flags) { assert(alloc_ctx.szind != NSIZES); old_usize = sz_index2size(alloc_ctx.szind); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); + + hook_ralloc_args_t hook_args = {false, {(uintptr_t)ptr, size, flags, + 0}}; if (config_prof && opt_prof) { usize = (alignment == 0) ? sz_s2u(size) : sz_sa2u(size, alignment); @@ -2782,13 +2794,13 @@ je_rallocx(void *ptr, size_t size, int flags) { goto label_oom; } p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, - zero, tcache, arena, &alloc_ctx); + zero, tcache, arena, &alloc_ctx, &hook_args); if (unlikely(p == NULL)) { goto label_oom; } } else { p = iralloct(tsd_tsdn(tsd), ptr, old_usize, size, alignment, - zero, tcache, arena); + zero, tcache, arena, &hook_args); if (unlikely(p == NULL)) { goto label_oom; } diff --git a/src/large.c b/src/large.c index 27a2c67..fdf183e 100644 --- a/src/large.c +++ b/src/large.c @@ -270,10 +270,12 @@ large_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize, } void * -large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize, - size_t alignment, bool zero, tcache_t *tcache) { - size_t oldusize = extent_usize_get(extent); +large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize, + size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args) { + extent_t *extent = iealloc(tsdn, ptr); + size_t oldusize = extent_usize_get(extent); /* The following should have been caught by callers. */ assert(usize > 0 && usize <= LARGE_MAXCLASS); /* Both allocation sizes must be large to avoid a move. */ @@ -281,6 +283,9 @@ large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize, /* Try to avoid moving the allocation. */ if (!large_ralloc_no_move(tsdn, extent, usize, usize, zero)) { + hook_invoke_expand(hook_args->is_realloc + ? hook_expand_realloc : hook_expand_rallocx, ptr, oldusize, + usize, (uintptr_t)ptr, hook_args->args); return extent_addr_get(extent); } @@ -295,6 +300,12 @@ large_ralloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent, size_t usize, return NULL; } + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); + size_t copysize = (usize < oldusize) ? 
usize : oldusize; memcpy(ret, extent_addr_get(extent), copysize); isdalloct(tsdn, extent_addr_get(extent), oldusize, tcache, NULL, true); diff --git a/test/unit/hook.c b/test/unit/hook.c index 8c9d680..693cb23 100644 --- a/test/unit/hook.c +++ b/test/unit/hook.c @@ -412,6 +412,115 @@ TEST_BEGIN(test_hooks_realloc_as_malloc_or_free) { } TEST_END +static void +do_realloc_test(void *(*ralloc)(void *, size_t, int), int flags, + int expand_type, int dalloc_type) { + hooks_t hooks = {&test_alloc_hook, &test_dalloc_hook, + &test_expand_hook}; + void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + assert_ptr_ne(handle, NULL, "Hook installation failed"); + + void *volatile ptr; + void *volatile ptr2; + + /* Realloc in-place, small. */ + ptr = malloc(129); + reset(); + ptr2 = ralloc(ptr, 130, flags); + assert_ptr_eq(ptr, ptr2, "Small realloc moved"); + + assert_d_eq(call_count, 1, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, expand_type, "Wrong hook type"); + assert_ptr_eq(ptr, arg_address, "Wrong address"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)130, arg_args_raw[1], "Wrong argument"); + free(ptr); + + /* + * Realloc in-place, large. Since we can't guarantee the large case + * across all platforms, we stay resilient to moving results. + */ + ptr = malloc(2 * 1024 * 1024); + free(ptr); + ptr2 = malloc(1 * 1024 * 1024); + reset(); + ptr = ralloc(ptr2, 2 * 1024 * 1024, flags); + /* ptr is the new address, ptr2 is the old address. */ + if (ptr == ptr2) { + assert_d_eq(call_count, 1, "Hook not called"); + assert_d_eq(arg_type, expand_type, "Wrong hook type"); + } else { + assert_d_eq(call_count, 2, "Wrong hooks called"); + assert_ptr_eq(ptr, arg_result, "Wrong address"); + assert_d_eq(arg_type, dalloc_type, "Wrong hook type"); + } + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_ptr_eq(ptr2, arg_address, "Wrong address"); + assert_u64_eq((uintptr_t)ptr, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)ptr2, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)2 * 1024 * 1024, arg_args_raw[1], + "Wrong argument"); + free(ptr); + + /* Realloc with move, small. */ + ptr = malloc(8); + reset(); + ptr2 = ralloc(ptr, 128, flags); + assert_ptr_ne(ptr, ptr2, "Small realloc didn't move"); + + assert_d_eq(call_count, 2, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, dalloc_type, "Wrong hook type"); + assert_ptr_eq(ptr, arg_address, "Wrong address"); + assert_ptr_eq(ptr2, arg_result, "Wrong address"); + assert_u64_eq((uintptr_t)ptr2, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)128, arg_args_raw[1], "Wrong argument"); + free(ptr2); + + /* Realloc with move, large. 
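 * (Growing from 1 byte, a small size class, to 2 MB, a large one,
 * cannot happen in place; see the arena_ralloc hunk above.)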
*/ + ptr = malloc(1); + reset(); + ptr2 = ralloc(ptr, 2 * 1024 * 1024, flags); + assert_ptr_ne(ptr, ptr2, "Large realloc didn't move"); + + assert_d_eq(call_count, 2, "Hook not called"); + assert_ptr_eq(arg_extra, (void *)123, "Wrong extra"); + assert_d_eq(arg_type, dalloc_type, "Wrong hook type"); + assert_ptr_eq(ptr, arg_address, "Wrong address"); + assert_ptr_eq(ptr2, arg_result, "Wrong address"); + assert_u64_eq((uintptr_t)ptr2, (uintptr_t)arg_result_raw, + "Wrong raw result"); + assert_u64_eq((uintptr_t)ptr, arg_args_raw[0], "Wrong argument"); + assert_u64_eq((uintptr_t)2 * 1024 * 1024, arg_args_raw[1], + "Wrong argument"); + free(ptr2); + + hook_remove(TSDN_NULL, handle); +} + +static void * +realloc_wrapper(void *ptr, size_t size, UNUSED int flags) { + return realloc(ptr, size); +} + +TEST_BEGIN(test_hooks_realloc) { + do_realloc_test(&realloc_wrapper, 0, hook_expand_realloc, + hook_dalloc_realloc); +} +TEST_END + +TEST_BEGIN(test_hooks_rallocx) { + do_realloc_test(&rallocx, MALLOCX_TCACHE_NONE, hook_expand_rallocx, + hook_dalloc_rallocx); +} +TEST_END + int main(void) { /* We assert on call counts. */ @@ -422,5 +531,7 @@ main(void) { test_hooks_alloc_simple, test_hooks_dalloc_simple, test_hooks_expand_simple, - test_hooks_realloc_as_malloc_or_free); + test_hooks_realloc_as_malloc_or_free, + test_hooks_realloc, + test_hooks_rallocx); } -- cgit v0.12 From 126e9a84a5a793fb0d53ca4656a91889b3ae40e8 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 26 Apr 2018 15:46:08 -0700 Subject: Hooks: move the "extra" pointer into the hook_t itself. This simplifies the mallctl call to install a hook, which should only take a single argument. --- include/jemalloc/internal/hook.h | 3 ++- src/hook.c | 14 ++++++------- test/unit/hook.c | 45 ++++++++++++++++++++-------------------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/include/jemalloc/internal/hook.h b/include/jemalloc/internal/hook.h index fc61e9b..678c601 100644 --- a/include/jemalloc/internal/hook.h +++ b/include/jemalloc/internal/hook.h @@ -103,6 +103,7 @@ struct hooks_s { hook_alloc alloc_hook; hook_dalloc dalloc_hook; hook_expand expand_hook; + void *extra; }; /* @@ -142,7 +143,7 @@ struct hook_ralloc_args_s { */ bool hook_boot(); -void *hook_install(tsdn_t *tsdn, hooks_t *hooks, void *extra); +void *hook_install(tsdn_t *tsdn, hooks_t *hooks); /* Uninstalls the hook with the handle previously returned from hook_install. 
*/ void hook_remove(tsdn_t *tsdn, void *opaque); diff --git a/src/hook.c b/src/hook.c index 5c6818f..564c2a0 100644 --- a/src/hook.c +++ b/src/hook.c @@ -9,7 +9,6 @@ typedef struct hooks_internal_s hooks_internal_t; struct hooks_internal_s { hooks_t hooks; - void *extra; bool in_use; }; @@ -27,7 +26,7 @@ hook_boot() { } static void * -hook_install_locked(hooks_t *to_install, void *extra) { +hook_install_locked(hooks_t *to_install) { hooks_internal_t hooks_internal; for (int i = 0; i < HOOKS_MAX; i++) { bool success = seq_try_load_hooks(&hooks_internal, &hooks[i]); @@ -35,7 +34,6 @@ hook_install_locked(hooks_t *to_install, void *extra) { assert(success); if (!hooks_internal.in_use) { hooks_internal.hooks = *to_install; - hooks_internal.extra = extra; hooks_internal.in_use = true; seq_store_hooks(&hooks[i], &hooks_internal); atomic_store_u(&nhooks, @@ -48,9 +46,9 @@ hook_install_locked(hooks_t *to_install, void *extra) { } void * -hook_install(tsdn_t *tsdn, hooks_t *to_install, void *extra) { +hook_install(tsdn_t *tsdn, hooks_t *to_install) { malloc_mutex_lock(tsdn, &hooks_mu); - void *ret = hook_install_locked(to_install, extra); + void *ret = hook_install_locked(to_install); if (ret != NULL) { tsd_global_slow_inc(tsdn); } @@ -112,7 +110,7 @@ hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, FOR_EACH_HOOK_BEGIN(&hook) hook_alloc h = hook.hooks.alloc_hook; if (h != NULL) { - h(hook.extra, type, result, result_raw, args_raw); + h(hook.hooks.extra, type, result, result_raw, args_raw); } FOR_EACH_HOOK_END } @@ -126,7 +124,7 @@ hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) { FOR_EACH_HOOK_BEGIN(&hook) hook_dalloc h = hook.hooks.dalloc_hook; if (h != NULL) { - h(hook.extra, type, address, args_raw); + h(hook.hooks.extra, type, address, args_raw); } FOR_EACH_HOOK_END } @@ -141,7 +139,7 @@ hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, FOR_EACH_HOOK_BEGIN(&hook) hook_expand h = hook.hooks.expand_hook; if (h != NULL) { - h(hook.extra, type, address, old_usize, new_usize, + h(hook.hooks.extra, type, address, old_usize, new_usize, result_raw, args_raw); } FOR_EACH_HOOK_END diff --git a/test/unit/hook.c b/test/unit/hook.c index 693cb23..3f85ff1 100644 --- a/test/unit/hook.c +++ b/test/unit/hook.c @@ -81,8 +81,9 @@ test_expand_hook(void *extra, hook_expand_t type, void *address, TEST_BEGIN(test_hooks_basic) { /* Just verify that the record their arguments correctly. */ hooks_t hooks = { - &test_alloc_hook, &test_dalloc_hook, &test_expand_hook}; - void *handle = hook_install(TSDN_NULL, &hooks, (void *)111); + &test_alloc_hook, &test_dalloc_hook, &test_expand_hook, + (void *)111}; + void *handle = hook_install(TSDN_NULL, &hooks); uintptr_t args_raw[4] = {10, 20, 30, 40}; /* Alloc */ @@ -124,15 +125,15 @@ TEST_END TEST_BEGIN(test_hooks_null) { /* Null hooks should be ignored, not crash. 
*/ - hooks_t hooks1 = {NULL, NULL, NULL}; - hooks_t hooks2 = {&test_alloc_hook, NULL, NULL}; - hooks_t hooks3 = {NULL, &test_dalloc_hook, NULL}; - hooks_t hooks4 = {NULL, NULL, &test_expand_hook}; + hooks_t hooks1 = {NULL, NULL, NULL, NULL}; + hooks_t hooks2 = {&test_alloc_hook, NULL, NULL, NULL}; + hooks_t hooks3 = {NULL, &test_dalloc_hook, NULL, NULL}; + hooks_t hooks4 = {NULL, NULL, &test_expand_hook, NULL}; - void *handle1 = hook_install(TSDN_NULL, &hooks1, NULL); - void *handle2 = hook_install(TSDN_NULL, &hooks2, NULL); - void *handle3 = hook_install(TSDN_NULL, &hooks3, NULL); - void *handle4 = hook_install(TSDN_NULL, &hooks4, NULL); + void *handle1 = hook_install(TSDN_NULL, &hooks1); + void *handle2 = hook_install(TSDN_NULL, &hooks2); + void *handle3 = hook_install(TSDN_NULL, &hooks3); + void *handle4 = hook_install(TSDN_NULL, &hooks4); assert_ptr_ne(handle1, NULL, "Hook installation failed"); assert_ptr_ne(handle2, NULL, "Hook installation failed"); @@ -161,8 +162,8 @@ TEST_BEGIN(test_hooks_null) { TEST_END TEST_BEGIN(test_hooks_remove) { - hooks_t hooks = {&test_alloc_hook, NULL, NULL}; - void *handle = hook_install(TSDN_NULL, &hooks, NULL); + hooks_t hooks = {&test_alloc_hook, NULL, NULL, NULL}; + void *handle = hook_install(TSDN_NULL, &hooks); assert_ptr_ne(handle, NULL, "Hook installation failed"); call_count = 0; uintptr_t args_raw[4] = {10, 20, 30, 40}; @@ -179,8 +180,8 @@ TEST_END TEST_BEGIN(test_hooks_alloc_simple) { /* "Simple" in the sense that we're not in a realloc variant. */ - hooks_t hooks = {&test_alloc_hook, NULL, NULL}; - void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + hooks_t hooks = {&test_alloc_hook, NULL, NULL, (void *)123}; + void *handle = hook_install(TSDN_NULL, &hooks); assert_ptr_ne(handle, NULL, "Hook installation failed"); /* Stop malloc from being optimized away. */ @@ -290,8 +291,8 @@ TEST_END TEST_BEGIN(test_hooks_dalloc_simple) { /* "Simple" in the sense that we're not in a realloc variant. */ - hooks_t hooks = {NULL, &test_dalloc_hook, NULL}; - void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + hooks_t hooks = {NULL, &test_dalloc_hook, NULL, (void *)123}; + void *handle = hook_install(TSDN_NULL, &hooks); assert_ptr_ne(handle, NULL, "Hook installation failed"); void *volatile ptr; @@ -337,8 +338,8 @@ TEST_END TEST_BEGIN(test_hooks_expand_simple) { /* "Simple" in the sense that we're not in a realloc variant. 
*/ - hooks_t hooks = {NULL, NULL, &test_expand_hook}; - void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + hooks_t hooks = {NULL, NULL, &test_expand_hook, (void *)123}; + void *handle = hook_install(TSDN_NULL, &hooks); assert_ptr_ne(handle, NULL, "Hook installation failed"); void *volatile ptr; @@ -365,8 +366,8 @@ TEST_END TEST_BEGIN(test_hooks_realloc_as_malloc_or_free) { hooks_t hooks = {&test_alloc_hook, &test_dalloc_hook, - &test_expand_hook}; - void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + &test_expand_hook, (void *)123}; + void *handle = hook_install(TSDN_NULL, &hooks); assert_ptr_ne(handle, NULL, "Hook installation failed"); void *volatile ptr; @@ -416,8 +417,8 @@ static void do_realloc_test(void *(*ralloc)(void *, size_t, int), int flags, int expand_type, int dalloc_type) { hooks_t hooks = {&test_alloc_hook, &test_dalloc_hook, - &test_expand_hook}; - void *handle = hook_install(TSDN_NULL, &hooks, (void *)123); + &test_expand_hook, (void *)123}; + void *handle = hook_install(TSDN_NULL, &hooks); assert_ptr_ne(handle, NULL, "Hook installation failed"); void *volatile ptr; -- cgit v0.12 From bb071db92ee8368fb6e64ef328d49fae6ba48089 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Tue, 24 Apr 2018 14:45:41 -0700 Subject: Mallctl: Add experimental.hooks.[install|remove]. --- src/ctl.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++- test/unit/mallctl.c | 40 +++++++++++++++++++++++++++++++++++- 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/src/ctl.c b/src/ctl.c index 0eb8de1..ef3eca4 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -202,6 +202,8 @@ CTL_PROTO(stats_metadata_thp) CTL_PROTO(stats_resident) CTL_PROTO(stats_mapped) CTL_PROTO(stats_retained) +CTL_PROTO(experimental_hooks_install) +CTL_PROTO(experimental_hooks_remove) #define MUTEX_STATS_CTL_PROTO_GEN(n) \ CTL_PROTO(stats_##n##_num_ops) \ @@ -536,6 +538,15 @@ static const ctl_named_node_t stats_node[] = { {NAME("arenas"), CHILD(indexed, stats_arenas)} }; +static const ctl_named_node_t hooks_node[] = { + {NAME("install"), CTL(experimental_hooks_install)}, + {NAME("remove"), CTL(experimental_hooks_remove)}, +}; + +static const ctl_named_node_t experimental_node[] = { + {NAME("hooks"), CHILD(named, hooks)} +}; + static const ctl_named_node_t root_node[] = { {NAME("version"), CTL(version)}, {NAME("epoch"), CTL(epoch)}, @@ -548,7 +559,8 @@ static const ctl_named_node_t root_node[] = { {NAME("arena"), CHILD(indexed, arena)}, {NAME("arenas"), CHILD(named, arenas)}, {NAME("prof"), CHILD(named, prof)}, - {NAME("stats"), CHILD(named, stats)} + {NAME("stats"), CHILD(named, stats)}, + {NAME("experimental"), CHILD(named, experimental)} }; static const ctl_named_node_t super_root_node[] = { {NAME(""), CHILD(named, root)} }; @@ -2879,3 +2891,48 @@ label_return: malloc_mutex_unlock(tsdn, &ctl_mtx); return ret; } + +static int +experimental_hooks_install_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + if (oldp == NULL || oldlenp == NULL || newp == NULL) { + ret = EINVAL; + goto label_return; + } + /* + * Note: this is a *private* struct. This is an experimental interface; + * forcing the user to know the jemalloc internals well enough to + * extract the ABI hopefully ensures nobody gets too comfortable with + * this API, which can change at a moment's notice.
+ */ + hooks_t hooks; + WRITE(hooks, hooks_t); + void *handle = hook_install(tsd_tsdn(tsd), &hooks); + if (handle == NULL) { + ret = EAGAIN; + goto label_return; + } + READ(handle, void *); + + ret = 0; +label_return: + return ret; +} + +static int +experimental_hooks_remove_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + WRITEONLY(); + void *handle = NULL; + WRITE(handle, void *); + if (handle == NULL) { + ret = EINVAL; + goto label_return; + } + hook_remove(tsd_tsdn(tsd), handle); + ret = 0; +label_return: + return ret; +} diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 1ecbab0..34a4d67 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -773,6 +773,43 @@ TEST_BEGIN(test_stats_arenas) { } TEST_END +static void +alloc_hook(void *extra, UNUSED hook_alloc_t type, UNUSED void *result, + UNUSED uintptr_t result_raw, UNUSED uintptr_t args_raw[3]) { + *(bool *)extra = true; +} + +static void +dalloc_hook(void *extra, UNUSED hook_dalloc_t type, + UNUSED void *address, UNUSED uintptr_t args_raw[3]) { + *(bool *)extra = true; +} + +TEST_BEGIN(test_hooks) { + bool hook_called = false; + hooks_t hooks = {&alloc_hook, &dalloc_hook, NULL, &hook_called}; + void *handle = NULL; + size_t sz = sizeof(handle); + int err = mallctl("experimental.hooks.install", &handle, &sz, &hooks, + sizeof(hooks)); + assert_d_eq(err, 0, "Hook installation failed"); + assert_ptr_ne(handle, NULL, "Hook installation gave null handle"); + void *ptr = mallocx(1, 0); + assert_true(hook_called, "Alloc hook not called"); + hook_called = false; + free(ptr); + assert_true(hook_called, "Free hook not called"); + + err = mallctl("experimental.hooks.remove", NULL, NULL, &handle, + sizeof(handle)); + assert_d_eq(err, 0, "Hook removal failed"); + hook_called = false; + ptr = mallocx(1, 0); + free(ptr); + assert_false(hook_called, "Hook called after removal"); +} +TEST_END + int main(void) { return test( @@ -801,5 +838,6 @@ main(void) { test_arenas_lextent_constants, test_arenas_create, test_arenas_lookup, - test_stats_arenas); + test_stats_arenas, + test_hooks); } -- cgit v0.12 From 59e371f46331a3f4b688d6622a0af7ccc4f96be6 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Mon, 30 Apr 2018 16:24:36 -0700 Subject: Hooks: Add a hook exhaustion test. When we run out of space in which to store hooks, we should return EAGAIN from the mallctl, but not otherwise misbehave. --- include/jemalloc/internal/hook.h | 2 ++ src/hook.c | 9 ++++----- test/unit/mallctl.c | 40 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/include/jemalloc/internal/hook.h b/include/jemalloc/internal/hook.h index 678c601..9ea9c6f 100644 --- a/include/jemalloc/internal/hook.h +++ b/include/jemalloc/internal/hook.h @@ -46,6 +46,8 @@ * order. 
*/ +#define HOOK_MAX 4 + enum hook_alloc_e { hook_alloc_malloc, hook_alloc_posix_memalign, diff --git a/src/hook.c b/src/hook.c index 564c2a0..24afe99 100644 --- a/src/hook.c +++ b/src/hook.c @@ -14,9 +14,8 @@ struct hooks_internal_s { seq_define(hooks_internal_t, hooks) -#define HOOKS_MAX 4 static atomic_u_t nhooks = ATOMIC_INIT(0); -static seq_hooks_t hooks[HOOKS_MAX]; +static seq_hooks_t hooks[HOOK_MAX]; static malloc_mutex_t hooks_mu; bool @@ -28,7 +27,7 @@ hook_boot() { } static void * hook_install_locked(hooks_t *to_install) { hooks_internal_t hooks_internal; - for (int i = 0; i < HOOKS_MAX; i++) { + for (int i = 0; i < HOOK_MAX; i++) { bool success = seq_try_load_hooks(&hooks_internal, &hooks[i]); /* We hold mu; no concurrent access. */ assert(success); @@ -74,7 +73,7 @@ void hook_remove(tsdn_t *tsdn, void *opaque) { if (config_debug) { char *hooks_begin = (char *)&hooks[0]; - char *hooks_end = (char *)&hooks[HOOKS_MAX]; + char *hooks_end = (char *)&hooks[HOOK_MAX]; char *hook = (char *)opaque; assert(hooks_begin <= hook && hook < hooks_end && (hook - hooks_begin) % sizeof(seq_hooks_t) == 0); @@ -87,7 +86,7 @@ hook_remove(tsdn_t *tsdn, void *opaque) { #define FOR_EACH_HOOK_BEGIN(hooks_internal_ptr) \ for (int for_each_hook_counter = 0; \ - for_each_hook_counter < HOOKS_MAX; \ + for_each_hook_counter < HOOK_MAX; \ for_each_hook_counter++) { \ bool for_each_hook_success = seq_try_load_hooks( \ (hooks_internal_ptr), &hooks[for_each_hook_counter]); \ diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 34a4d67..8a36c0a 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -1,5 +1,6 @@ #include "test/jemalloc_test.h" +#include "jemalloc/internal/hook.h" #include "jemalloc/internal/util.h" TEST_BEGIN(test_mallctl_errors) { @@ -810,6 +811,42 @@ TEST_BEGIN(test_hooks) { } TEST_END +TEST_BEGIN(test_hooks_exhaustion) { + bool hook_called = false; + hooks_t hooks = {&alloc_hook, &dalloc_hook, NULL, &hook_called}; + + void *handle; + void *handles[HOOK_MAX]; + size_t sz = sizeof(handle); + int err; + for (int i = 0; i < HOOK_MAX; i++) { + handle = NULL; + err = mallctl("experimental.hooks.install", &handle, &sz, + &hooks, sizeof(hooks)); + assert_d_eq(err, 0, "Error installing hooks"); + assert_ptr_ne(handle, NULL, "Got NULL handle"); + handles[i] = handle; + } + err = mallctl("experimental.hooks.install", &handle, &sz, &hooks, + sizeof(hooks)); + assert_d_eq(err, EAGAIN, "Should have failed hook installation"); + for (int i = 0; i < HOOK_MAX; i++) { + err = mallctl("experimental.hooks.remove", NULL, NULL, + &handles[i], sizeof(handles[i])); + assert_d_eq(err, 0, "Hook removal failed"); + } + /* Insertion failed, but then we removed some; it should work now. */ + handle = NULL; + err = mallctl("experimental.hooks.install", &handle, &sz, &hooks, + sizeof(hooks)); + assert_d_eq(err, 0, "Hook insertion failed"); + assert_ptr_ne(handle, NULL, "Got NULL handle"); + err = mallctl("experimental.hooks.remove", NULL, NULL, &handle, + sizeof(handle)); + assert_d_eq(err, 0, "Hook removal failed"); +} +TEST_END + int main(void) { return test( @@ -839,5 +876,6 @@ main(void) { test_arenas_create, test_arenas_lookup, test_stats_arenas, - test_hooks); + test_hooks, + test_hooks_exhaustion); } -- cgit v0.12 From 0379235f47585ac8f583ba85aab9d294abfa44b5 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Fri, 11 May 2018 16:43:43 -0700 Subject: Tests: Shouldn't be able to change global slowness. This can help ensure that we don't leave slowness changes behind in case of resource exhaustion.
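To make the leak concrete, a minimal sketch (an editorial illustration, not code from this patch; assertion_failed stands in for any early exit): hook_install() increments the global slow count and hook_remove() decrements it, so a test that bails out between the two leaves the count elevated:

    hooks_t hooks = {&test_alloc_hook, NULL, NULL, NULL};
    void *handle = hook_install(TSDN_NULL, &hooks); /* tsd_global_slow_inc() */
    if (assertion_failed) {
        return; /* hook_remove() never runs; global slowness leaks. */
    }
    hook_remove(TSDN_NULL, handle);                 /* tsd_global_slow_dec() */

The harness change below therefore checks tsd_global_slow() after each test run and fails the test if the count stayed elevated.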
--- include/jemalloc/internal/tsd.h | 1 + src/tsd.c | 2 +- test/src/test.c | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 251f565..845a3f0 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -117,6 +117,7 @@ void tsd_slow_update(tsd_t *tsd); */ void tsd_global_slow_inc(tsdn_t *tsdn); void tsd_global_slow_dec(tsdn_t *tsdn); +bool tsd_global_slow(); enum { /* Common case --> jnz. */ diff --git a/src/tsd.c b/src/tsd.c index c92cd22..91a964a 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -141,7 +141,7 @@ tsd_local_slow(tsd_t *tsd) { || tsd_reentrancy_level_get(tsd) > 0; } -static bool +bool tsd_global_slow() { return atomic_load_u32(&tsd_global_slow_count, ATOMIC_RELAXED) > 0; } diff --git a/test/src/test.c b/test/src/test.c index 9c754e3..f97ce4d 100644 --- a/test/src/test.c +++ b/test/src/test.c @@ -110,6 +110,20 @@ p_test_fini(void) { test_status_string(test_status)); } +static void +check_global_slow(test_status_t *status) { +#ifdef JEMALLOC_UNIT_TEST + /* + * This check needs to peek into tsd internals, which is why it's only + * exposed in unit tests. + */ + if (tsd_global_slow()) { + malloc_printf("Testing increased global slow count\n"); + *status = test_status_fail; + } +#endif +} + static test_status_t p_test_impl(bool do_malloc_init, bool do_reentrant, test_t *t, va_list ap) { test_status_t ret; @@ -136,6 +150,7 @@ p_test_impl(bool do_malloc_init, bool do_reentrant, test_t *t, va_list ap) { if (test_status > ret) { ret = test_status; } + check_global_slow(&ret); /* Reentrant run. */ if (do_reentrant) { reentrancy = libc_reentrant; @@ -145,6 +160,7 @@ p_test_impl(bool do_malloc_init, bool do_reentrant, test_t *t, va_list ap) { if (test_status > ret) { ret = test_status; } + check_global_slow(&ret); reentrancy = arena_new_reentrant; test_hooks_libc_hook = NULL; @@ -153,6 +169,7 @@ if (test_status > ret) { ret = test_status; } + check_global_slow(&ret); } } -- cgit v0.12 From a7f749c9af0d5ca51b5b5eaf35c2c2913d8a77e1 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Tue, 15 May 2018 14:15:43 -0700 Subject: Hooks: Protect against reentrancy. Previously, we made the user deal with this themselves, but that's not good enough; if hooks may allocate, we should test the allocation pathways reachable from within hooks. If we're doing that, we might as well actually implement the protection for the user. --- include/jemalloc/internal/hook.h | 6 ++-- include/jemalloc/internal/tsd.h | 2 ++ src/hook.c | 68 ++++++++++++++++++++++++++++++++++------ test/unit/hook.c | 42 +++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 12 deletions(-) diff --git a/include/jemalloc/internal/hook.h b/include/jemalloc/internal/hook.h index 9ea9c6f..ee246b1 100644 --- a/include/jemalloc/internal/hook.h +++ b/include/jemalloc/internal/hook.h @@ -25,9 +25,9 @@ * and only calls the alloc hook). * * Reentrancy: - * Is not protected against. If your hooks allocate, then the hooks will be - * called again. Note that you can guard against this with a thread-local - * "in_hook" bool. + * Reentrancy is guarded against from within the hook implementation. If you + * call allocator functions from within a hook, the hooks will not be invoked + * again. * Threading: * The installation of a hook synchronizes with all its uses.
If you can * prove the installation of a hook happens-before a jemalloc entry point, diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 845a3f0..3097ce0 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -66,6 +66,7 @@ typedef ql_elm(tsd_t) tsd_link_t; #define MALLOC_TSD \ O(tcache_enabled, bool, bool) \ O(arenas_tdata_bypass, bool, bool) \ + O(in_hook, bool, bool) \ O(reentrancy_level, int8_t, int8_t) \ O(narenas_tdata, uint32_t, uint32_t) \ O(offset_state, uint64_t, uint64_t) \ @@ -85,6 +86,7 @@ typedef ql_elm(tsd_t) tsd_link_t; ATOMIC_INIT(tsd_state_uninitialized), \ TCACHE_ENABLED_ZERO_INITIALIZER, \ false, \ + false, \ 0, \ 0, \ 0, \ diff --git a/src/hook.c b/src/hook.c index 24afe99..f66d423 100644 --- a/src/hook.c +++ b/src/hook.c @@ -99,12 +99,62 @@ for (int for_each_hook_counter = 0; \ #define FOR_EACH_HOOK_END \ } +static bool * +hook_reentrantp() { + /* + * We prevent user reentrancy within hooks. This is basically just a + * thread-local bool that triggers an early-exit. + * + * We don't fold in_hook into reentrancy. There are two reasons for + * this: + * - Right now, we turn on reentrancy during things like extent hook + * execution. Allocating during extent hooks is not officially + * supported, but we don't want to break it for the time being. These + * sorts of allocations should probably still be hooked, though. + * - If a hook allocates, we may want it to be relatively fast (after + * all, it executes on every allocator operation). Turning on + * reentrancy is a fairly heavyweight mode (disabling tcache, + * redirecting to arena 0, etc.). It's possible we may one day want + * to turn on reentrant mode here, if it proves too difficult to keep + * this working. But that's fairly easy for us to see; OTOH, people + * not using hooks because they're too slow is easy for us to miss. + * + * The tricky part is + * that this code might get invoked even if we don't have access to tsd. + * This function mimics getting a pointer to thread-local data, except + * that it might secretly return a pointer to some global data if we + * know that the caller will take the early-exit path. + * If we return a bool that indicates that we are reentrant, then the + * caller will go down the early exit path, leaving the global + * untouched. 
+ */ + static bool in_hook_global = true; + tsdn_t *tsdn = tsdn_fetch(); + bool *in_hook = tsdn_in_hookp_get(tsdn); + if (in_hook != NULL) { + return in_hook; + } + return &in_hook_global; +} + +#define HOOK_PROLOGUE \ + if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { \ + return; \ + } \ + bool *in_hook = hook_reentrantp(); \ + if (*in_hook) { \ + return; \ + } \ + *in_hook = true; + +#define HOOK_EPILOGUE \ + *in_hook = false; + void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, uintptr_t args_raw[3]) { - if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { - return; - } + HOOK_PROLOGUE + hooks_internal_t hook; FOR_EACH_HOOK_BEGIN(&hook) hook_alloc h = hook.hooks.alloc_hook; @@ -112,13 +162,13 @@ hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, h(hook.hooks.extra, type, result, result_raw, args_raw); } FOR_EACH_HOOK_END + + HOOK_EPILOGUE } void hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) { - if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { - return; - } + HOOK_PROLOGUE hooks_internal_t hook; FOR_EACH_HOOK_BEGIN(&hook) hook_dalloc h = hook.hooks.dalloc_hook; @@ -126,14 +176,13 @@ hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) { h(hook.hooks.extra, type, address, args_raw); } FOR_EACH_HOOK_END + HOOK_EPILOGUE } void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]) { - if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { - return; - } + HOOK_PROLOGUE hooks_internal_t hook; FOR_EACH_HOOK_BEGIN(&hook) hook_expand h = hook.hooks.expand_hook; @@ -142,4 +191,5 @@ hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, result_raw, args_raw); } FOR_EACH_HOOK_END + HOOK_EPILOGUE } diff --git a/test/unit/hook.c b/test/unit/hook.c index 3f85ff1..72fcc43 100644 --- a/test/unit/hook.c +++ b/test/unit/hook.c @@ -26,6 +26,45 @@ reset_args() { } static void +alloc_free_size(size_t sz) { + void *ptr = mallocx(sz, 0); + free(ptr); + ptr = mallocx(sz, 0); + free(ptr); + ptr = mallocx(sz, MALLOCX_TCACHE_NONE); + dallocx(ptr, MALLOCX_TCACHE_NONE); +} + +/* + * We want to support a degree of user reentrancy. This tests a variety of + * allocation scenarios. + */ +static void +be_reentrant() { + /* Let's make sure the tcache is non-empty if enabled. */ + alloc_free_size(1); + alloc_free_size(1024); + alloc_free_size(64 * 1024); + alloc_free_size(256 * 1024); + alloc_free_size(1024 * 1024); + + /* Some reallocation.
*/ + void *ptr = mallocx(129, 0); + ptr = rallocx(ptr, 130, 0); + free(ptr); + + ptr = mallocx(2 * 1024 * 1024, 0); + free(ptr); + ptr = mallocx(1 * 1024 * 1024, 0); + ptr = rallocx(ptr, 2 * 1024 * 1024, 0); + free(ptr); + + ptr = mallocx(1, 0); + ptr = rallocx(ptr, 1000, 0); + free(ptr); +} + +static void set_args_raw(uintptr_t *args_raw, int nargs) { memcpy(arg_args_raw, args_raw, sizeof(uintptr_t) * nargs); } @@ -52,6 +91,7 @@ test_alloc_hook(void *extra, hook_alloc_t type, void *result, arg_result = result; arg_result_raw = result_raw; set_args_raw(args_raw, 3); + be_reentrant(); } static void @@ -62,6 +102,7 @@ test_dalloc_hook(void *extra, hook_dalloc_t type, void *address, arg_type = (int)type; arg_address = address; set_args_raw(args_raw, 3); + be_reentrant(); } static void @@ -76,6 +117,7 @@ test_expand_hook(void *extra, hook_expand_t type, void *address, arg_new_usize = new_usize; arg_result_raw = result_raw; set_args_raw(args_raw, 4); + be_reentrant(); } TEST_BEGIN(test_hooks_basic) { -- cgit v0.12 From d22e150320801c114b3694e860195254bad1ef0f Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 24 May 2018 12:18:54 -0700 Subject: Avoid taking extents_muzzy mutex when muzzy is disabled. When muzzy decay is disabled, no need to allocate from extents_muzzy. This saves us a couple of mutex operations down the extents_alloc path. --- src/arena.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index b76be5f..1cecce7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -324,6 +324,11 @@ arena_large_ralloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t oldusize, arena_large_malloc_stats_update(tsdn, arena, usize); } +static bool +arena_may_have_muzzy(arena_t *arena) { + return (pages_can_purge_lazy && (arena_muzzy_decay_ms_get(arena) != 0)); +} + extent_t * arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool *zero) { @@ -338,7 +343,7 @@ arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, size_t usize, extent_t *extent = extents_alloc(tsdn, arena, &extent_hooks, &arena->extents_dirty, NULL, usize, sz_large_pad, alignment, false, szind, zero, &commit); - if (extent == NULL) { + if (extent == NULL && arena_may_have_muzzy(arena)) { extent = extents_alloc(tsdn, arena, &extent_hooks, &arena->extents_muzzy, NULL, usize, sz_large_pad, alignment, false, szind, zero, &commit); @@ -1124,7 +1129,7 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, extent_t *slab = extents_alloc(tsdn, arena, &extent_hooks, &arena->extents_dirty, NULL, bin_info->slab_size, 0, PAGE, true, binind, &zero, &commit); - if (slab == NULL) { + if (slab == NULL && arena_may_have_muzzy(arena)) { slab = extents_alloc(tsdn, arena, &extent_hooks, &arena->extents_muzzy, NULL, bin_info->slab_size, 0, PAGE, true, binind, &zero, &commit); -- cgit v0.12 From 9bd8deb26044b7a3f056f8995aae95ffe86d19ed Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 4 Jun 2018 11:06:23 -0700 Subject: Fix stats output for opt.lg_extent_max_active_fit. 
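opt.lg_extent_max_active_fit is a size_t, but stats_general_print() emitted it via OPT_WRITE_UNSIGNED, i.e. through an unsigned-sized buffer; since mallctl() rejects a mismatched *oldlenp, the option silently vanished from stats output. A condensed sketch of the mismatch (a hedged reconstruction of what the OPT_WRITE macros below boil down to):

    unsigned uv;
    size_t usz = sizeof(uv);  /* Too small for a size_t: EINVAL, line skipped. */
    je_mallctl("opt.lg_extent_max_active_fit", &uv, &usz, NULL, 0);

    size_t sv;
    size_t ssz = sizeof(sv);  /* Correct width: accepted, line printed. */
    je_mallctl("opt.lg_extent_max_active_fit", &sv, &ssz, NULL, 0);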
--- src/stats.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stats.c b/src/stats.c index 7411745..85e68a7 100644 --- a/src/stats.c +++ b/src/stats.c @@ -891,6 +891,8 @@ stats_general_print(emitter_t *emitter) { #define OPT_WRITE_UNSIGNED(name) \ OPT_WRITE(name, uv, usz, emitter_type_unsigned) +#define OPT_WRITE_SIZE_T(name) \ + OPT_WRITE(name, sv, ssz, emitter_type_size) #define OPT_WRITE_SSIZE_T(name) \ OPT_WRITE(name, ssv, sssz, emitter_type_ssize) #define OPT_WRITE_SSIZE_T_MUTABLE(name, altname) \ @@ -912,7 +914,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread") OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms") OPT_WRITE_SSIZE_T_MUTABLE("muzzy_decay_ms", "arenas.muzzy_decay_ms") - OPT_WRITE_UNSIGNED("lg_extent_max_active_fit") + OPT_WRITE_SIZE_T("lg_extent_max_active_fit") OPT_WRITE_CHAR_P("junk") OPT_WRITE_BOOL("zero") OPT_WRITE_BOOL("utrace") -- cgit v0.12 From c834912aa9503d470c3dae2b2b7840607f0d6e34 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 29 May 2018 15:55:04 -0700 Subject: Avoid taking large_mtx for auto arenas. On tcache flush path, we can avoid touching the large_mtx for auto arenas, since it was only needed for manual arenas where arena_reset is allowed. --- src/large.c | 3 ++- src/tcache.c | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/large.c b/src/large.c index fdf183e..4951f3e 100644 --- a/src/large.c +++ b/src/large.c @@ -329,8 +329,9 @@ large_dalloc_prep_impl(tsdn_t *tsdn, arena_t *arena, extent_t *extent, large_dalloc_maybe_junk(extent_addr_get(extent), extent_usize_get(extent)); } else { - malloc_mutex_assert_owner(tsdn, &arena->large_mtx); + /* Only hold the large_mtx if necessary. */ if (!arena_is_auto(arena)) { + malloc_mutex_assert_owner(tsdn, &arena->large_mtx); extent_list_remove(&arena->large, extent); } } diff --git a/src/tcache.c b/src/tcache.c index a769a6b..e249766 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -212,7 +212,10 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, idump = false; } - malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx); + bool lock_large = !arena_is_auto(arena); + if (lock_large) { + malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx); + } for (unsigned i = 0; i < nflush; i++) { void *ptr = *(tbin->avail - 1 - i); assert(ptr != NULL); @@ -236,7 +239,9 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, tbin->tstats.nrequests = 0; } } - malloc_mutex_unlock(tsd_tsdn(tsd), &locked_arena->large_mtx); + if (lock_large) { + malloc_mutex_unlock(tsd_tsdn(tsd), &locked_arena->large_mtx); + } unsigned ndeferred = 0; for (unsigned i = 0; i < nflush; i++) { -- cgit v0.12 From 0ff7ff3ec7b322881fff3bd6d4861fda6e9331d9 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 4 Jun 2018 13:36:06 -0700 Subject: Optimize ixalloc by avoiding a size lookup. 
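The caller-side effect, condensed from the jemalloc.c hunk below: on a successful in-place resize, the extra isalloc() metadata lookup disappears because arena_ralloc_no_move() already holds the extent and can report the new usable size through an out parameter:

    /* Before: a second lookup just to recover the new usable size. */
    if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero)) {
        return old_usize;
    }
    return isalloc(tsdn, ptr);

    /* After: the callee fills in newsize; no extra lookup. */
    size_t newsize;
    if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero, &newsize)) {
        return old_usize;
    }
    return newsize;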
--- include/jemalloc/internal/arena_externs.h | 2 +- .../internal/jemalloc_internal_inlines_c.h | 6 +++-- src/arena.c | 26 +++++++++++++++------- src/jemalloc.c | 8 +++---- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index f4edcc7..f46820f 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -64,7 +64,7 @@ void arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, extent_t *extent, void *ptr); void arena_dalloc_small(tsdn_t *tsdn, void *ptr); bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, - size_t extra, bool zero); + size_t extra, bool zero, size_t *newsize); void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t alignment, bool zero, tcache_t *tcache, hook_ralloc_args_t *hook_args); diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 1b5c11e..2b0d4f4 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -203,7 +203,7 @@ iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, JEMALLOC_ALWAYS_INLINE bool ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra, - size_t alignment, bool zero) { + size_t alignment, bool zero, size_t *newsize) { assert(ptr != NULL); assert(size != 0); witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), @@ -212,10 +212,12 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra, if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) != 0) { /* Existing object alignment is inadequate. */ + *newsize = oldsize; return true; } - return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero); + return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero, + newsize); } #endif /* JEMALLOC_INTERNAL_INLINES_C_H */ diff --git a/src/arena.c b/src/arena.c index 1cecce7..b5c3dbe 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1585,15 +1585,17 @@ arena_dalloc_small(tsdn_t *tsdn, void *ptr) { bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, - size_t extra, bool zero) { + size_t extra, bool zero, size_t *newsize) { + bool ret; /* Calls with non-zero extra had to clamp extra. 
*/ assert(extra == 0 || size + extra <= LARGE_MAXCLASS); + extent_t *extent = iealloc(tsdn, ptr); if (unlikely(size > LARGE_MAXCLASS)) { - return true; + ret = true; + goto done; } - extent_t *extent = iealloc(tsdn, ptr); size_t usize_min = sz_s2u(size); size_t usize_max = sz_s2u(size + extra); if (likely(oldsize <= SMALL_MAXCLASS && usize_min <= SMALL_MAXCLASS)) { @@ -1606,17 +1608,23 @@ arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, if ((usize_max > SMALL_MAXCLASS || sz_size2index(usize_max) != sz_size2index(oldsize)) && (size > oldsize || usize_max < oldsize)) { - return true; + ret = true; + goto done; } arena_decay_tick(tsdn, extent_arena_get(extent)); - return false; + ret = false; } else if (oldsize >= LARGE_MINCLASS && usize_max >= LARGE_MINCLASS) { - return large_ralloc_no_move(tsdn, extent, usize_min, usize_max, + ret = large_ralloc_no_move(tsdn, extent, usize_min, usize_max, zero); + } else { + ret = true; } +done: + assert(extent == iealloc(tsdn, ptr)); + *newsize = extent_usize_get(extent); - return true; + return ret; } static void * @@ -1644,7 +1652,9 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, if (likely(usize <= SMALL_MAXCLASS)) { /* Try to avoid moving the allocation. */ - if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero)) { + UNUSED size_t newsize; + if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero, + &newsize)) { hook_invoke_expand(hook_args->is_realloc ? hook_expand_realloc : hook_expand_rallocx, ptr, oldsize, usize, (uintptr_t)ptr, diff --git a/src/jemalloc.c b/src/jemalloc.c index 264408f..300e897 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2834,14 +2834,14 @@ label_oom: JEMALLOC_ALWAYS_INLINE size_t ixallocx_helper(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, bool zero) { - size_t usize; + size_t newsize; - if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero)) { + if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero, + &newsize)) { return old_usize; } - usize = isalloc(tsdn, ptr); - return usize; + return newsize; } static size_t -- cgit v0.12 From fec1ef7c91b5368ad0d6f0c84bc77fa71d9dc949 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 26 Jun 2018 11:40:53 -0700 Subject: Fix arena locking in tcache_bin_flush_large(). This regression was introduced in c834912 (incorrect arena used). --- src/tcache.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/tcache.c b/src/tcache.c index e249766..af75754 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -193,8 +193,8 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, assert(binind < nhbins); assert((cache_bin_sz_t)rem <= tbin->ncached); - arena_t *arena = tcache->arena; - assert(arena != NULL); + arena_t *tcache_arena = tcache->arena; + assert(tcache_arena != NULL); unsigned nflush = tbin->ncached - rem; VARIABLE_ARRAY(extent_t *, item_extent, nflush); /* Look up extent once per item. 
*/ @@ -212,7 +212,7 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, idump = false; } - bool lock_large = !arena_is_auto(arena); + bool lock_large = !arena_is_auto(locked_arena); if (lock_large) { malloc_mutex_lock(tsd_tsdn(tsd), &locked_arena->large_mtx); } @@ -225,16 +225,17 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, extent); } } - if ((config_prof || config_stats) && locked_arena == arena) { + if ((config_prof || config_stats) && + (locked_arena == tcache_arena)) { if (config_prof) { - idump = arena_prof_accum(tsd_tsdn(tsd), arena, - tcache->prof_accumbytes); + idump = arena_prof_accum(tsd_tsdn(tsd), + tcache_arena, tcache->prof_accumbytes); tcache->prof_accumbytes = 0; } if (config_stats) { merged_stats = true; arena_stats_large_nrequests_add(tsd_tsdn(tsd), - &arena->stats, binind, + &tcache_arena->stats, binind, tbin->tstats.nrequests); tbin->tstats.nrequests = 0; } @@ -275,8 +276,8 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, * The flush loop didn't happen to flush to this thread's * arena, so the stats didn't get merged. Manually do so now. */ - arena_stats_large_nrequests_add(tsd_tsdn(tsd), &arena->stats, - binind, tbin->tstats.nrequests); + arena_stats_large_nrequests_add(tsd_tsdn(tsd), + &tcache_arena->stats, binind, tbin->tstats.nrequests); tbin->tstats.nrequests = 0; } -- cgit v0.12 From 50820010fef8f40e1221360ef745d9bb5fa93364 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 26 Jun 2018 13:27:44 -0700 Subject: Add test for remote deallocation. --- test/integration/mallocx.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c index fd960f3..9fe3ad5 100644 --- a/test/integration/mallocx.c +++ b/test/integration/mallocx.c @@ -71,6 +71,38 @@ TEST_BEGIN(test_overflow) { } TEST_END +static void * +remote_alloc(void *arg) { + unsigned arena; + size_t sz = sizeof(unsigned); + assert_d_eq(mallctl("arenas.create", (void *)&arena, &sz, NULL, 0), 0, + "Unexpected mallctl() failure"); + size_t large_sz; + sz = sizeof(size_t); + assert_d_eq(mallctl("arenas.lextent.0.size", (void *)&large_sz, &sz, + NULL, 0), 0, "Unexpected mallctl failure"); + + void *ptr = mallocx(large_sz, MALLOCX_ARENA(arena) + | MALLOCX_TCACHE_NONE); + void **ret = (void **)arg; + *ret = ptr; + + return NULL; +} + +TEST_BEGIN(test_remote_free) { + thd_t thd; + void *ret; + thd_create(&thd, remote_alloc, (void *)&ret); + thd_join(thd, NULL); + assert_ptr_not_null(ret, "Unexpected mallocx failure"); + + /* Avoid TCACHE_NONE to explicitly test tcache_flush(). */ + dallocx(ret, 0); + mallctl("thread.tcache.flush", NULL, NULL, NULL, 0); +} +TEST_END + TEST_BEGIN(test_oom) { size_t largemax; bool oom; @@ -223,6 +255,7 @@ main(void) { return test( test_overflow, test_oom, + test_remote_free, test_basic, test_alignment_and_size); } -- cgit v0.12 From d1e11d48d4c706e17ef3508e2ddb910f109b779f Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 21 Jun 2018 13:02:49 -0700 Subject: Move tsd link and in_hook after tcache. This can lead to better cache utilization down the common paths where we don't touch the link. 
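An abridged sketch of the resulting layout (field order as in the hunks below; the hot/cold annotations are editorial):

    struct tcache_s {
        /* Hot: frequently accessed cache-bin state comes first. */
        ...
        /* Cold: linkage and flags the common paths never touch. */
        ql_elm(tcache_t) link;   /* arena's list of tcaches */
        ql_elm(tsd_t) tsd_link;  /* logically tsd state, placed here for layout */
        bool in_hook;            /* hook reentrancy guard */
        ...
    };

With the rarely used tsd_link and in_hook tucked behind the hot cache bins, the malloc/free fast paths touch fewer cache lines of tsd.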
--- include/jemalloc/internal/tcache_structs.h | 9 +++++++++ include/jemalloc/internal/tsd.h | 7 ------- src/hook.c | 6 +++--- src/tsd.c | 10 +++++----- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h index 07b7387..b3cd4e5 100644 --- a/include/jemalloc/internal/tcache_structs.h +++ b/include/jemalloc/internal/tcache_structs.h @@ -6,6 +6,10 @@ #include "jemalloc/internal/cache_bin.h" #include "jemalloc/internal/ticker.h" +/* Various uses of this struct need it to be a named type. */ +typedef struct tsd_s tsd_t; +typedef ql_elm(tsd_t) tsd_link_t; + struct tcache_s { /* * To minimize our cache-footprint, we put the frequently accessed data @@ -29,6 +33,11 @@ struct tcache_s { */ /* Lets us track all the tcaches in an arena. */ ql_elm(tcache_t) link; + + /* Logically scoped to tsd, but put here for cache layout reasons. */ + ql_elm(tsd_t) tsd_link; + bool in_hook; + /* * The descriptor lets the arena find our cache bins without seeing the * tcache definition. This enables arenas to aggregate stats across diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 3097ce0..e5e82f4 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -59,14 +59,10 @@ typedef void (*test_callback_t)(int *); # define MALLOC_TEST_TSD_INITIALIZER #endif -/* Various uses of this struct need it to be a named type. */ -typedef ql_elm(tsd_t) tsd_link_t; - /* O(name, type, nullable type */ #define MALLOC_TSD \ O(tcache_enabled, bool, bool) \ O(arenas_tdata_bypass, bool, bool) \ - O(in_hook, bool, bool) \ O(reentrancy_level, int8_t, int8_t) \ O(narenas_tdata, uint32_t, uint32_t) \ O(offset_state, uint64_t, uint64_t) \ @@ -77,7 +73,6 @@ typedef ql_elm(tsd_t) tsd_link_t; O(iarena, arena_t *, arena_t *) \ O(arena, arena_t *, arena_t *) \ O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\ - O(link, tsd_link_t, tsd_link_t) \ O(tcache, tcache_t, tcache_t) \ O(witness_tsd, witness_tsd_t, witness_tsdn_t) \ MALLOC_TEST_TSD @@ -86,7 +81,6 @@ typedef ql_elm(tsd_t) tsd_link_t; ATOMIC_INIT(tsd_state_uninitialized), \ TCACHE_ENABLED_ZERO_INITIALIZER, \ false, \ - false, \ 0, \ 0, \ 0, \ @@ -97,7 +91,6 @@ typedef ql_elm(tsd_t) tsd_link_t; NULL, \ NULL, \ NULL, \ - {NULL}, \ TCACHE_ZERO_INITIALIZER, \ WITNESS_TSD_INITIALIZER \ MALLOC_TEST_TSD_INITIALIZER \ diff --git a/src/hook.c b/src/hook.c index f66d423..9ac703c 100644 --- a/src/hook.c +++ b/src/hook.c @@ -130,9 +130,9 @@ hook_reentrantp() { */ static bool in_hook_global = true; tsdn_t *tsdn = tsdn_fetch(); - bool *in_hook = tsdn_in_hookp_get(tsdn); - if (in_hook != NULL) { - return in_hook; + tcache_t *tcache = tsdn_tcachep_get(tsdn); + if (tcache != NULL) { + return &tcache->in_hook; } return &in_hook_global; } diff --git a/src/tsd.c b/src/tsd.c index 91a964a..4eceee7 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -68,7 +68,7 @@ tsd_in_nominal_list(tsd_t *tsd) { * out of it here. 
*/ malloc_mutex_lock(TSDN_NULL, &tsd_nominal_tsds_lock); - ql_foreach(tsd_list, &tsd_nominal_tsds, TSD_MANGLE(link)) { + ql_foreach(tsd_list, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) { if (tsd == tsd_list) { found = true; break; } @@ -82,9 +82,9 @@ static void tsd_add_nominal(tsd_t *tsd) { assert(!tsd_in_nominal_list(tsd)); assert(tsd_state_get(tsd) <= tsd_state_nominal_max); - ql_elm_new(tsd, TSD_MANGLE(link)); + ql_elm_new(tsd, TSD_MANGLE(tcache).tsd_link); malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); - ql_tail_insert(&tsd_nominal_tsds, tsd, TSD_MANGLE(link)); + ql_tail_insert(&tsd_nominal_tsds, tsd, TSD_MANGLE(tcache).tsd_link); malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); } @@ -93,7 +93,7 @@ tsd_remove_nominal(tsd_t *tsd) { assert(tsd_in_nominal_list(tsd)); assert(tsd_state_get(tsd) <= tsd_state_nominal_max); malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); - ql_remove(&tsd_nominal_tsds, tsd, TSD_MANGLE(link)); + ql_remove(&tsd_nominal_tsds, tsd, TSD_MANGLE(tcache).tsd_link); malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); } @@ -106,7 +106,7 @@ tsd_force_recompute(tsdn_t *tsdn) { atomic_fence(ATOMIC_RELEASE); malloc_mutex_lock(tsdn, &tsd_nominal_tsds_lock); tsd_t *remote_tsd; - ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(link)) { + ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) { assert(atomic_load_u8(&remote_tsd->state, ATOMIC_RELAXED) <= tsd_state_nominal_max); atomic_store_u8(&remote_tsd->state, tsd_state_nominal_recompute, -- cgit v0.12 From 77a71ef2b76c2e858c81e10349f28534307f1c91 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Wed, 6 Jun 2018 15:52:52 -0700 Subject: Fall back to the default pthread_create if RTLD_NEXT fails. --- include/jemalloc/internal/background_thread_externs.h | 1 - src/background_thread.c | 16 +++++++++------- src/ctl.c | 14 -------------- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/include/jemalloc/internal/background_thread_externs.h b/include/jemalloc/internal/background_thread_externs.h index 3209aa4..0f997e1 100644 --- a/include/jemalloc/internal/background_thread_externs.h +++ b/include/jemalloc/internal/background_thread_externs.h @@ -8,7 +8,6 @@ extern atomic_b_t background_thread_enabled_state; extern size_t n_background_threads; extern size_t max_background_threads; extern background_thread_info_t *background_thread_info; -extern bool can_enable_background_thread; bool background_thread_create(tsd_t *tsd, unsigned arena_ind); bool background_threads_enable(tsd_t *tsd); diff --git a/src/background_thread.c b/src/background_thread.c index 3517a3b..4613537 100644 --- a/src/background_thread.c +++ b/src/background_thread.c @@ -22,9 +22,6 @@ size_t max_background_threads; /* Thread info per-index. */ background_thread_info_t *background_thread_info; -/* False if no necessary runtime support. */ -bool can_enable_background_thread; - /******************************************************************************/ #ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER @@ -812,16 +809,21 @@ pthread_create_fptr_init(void) { if (pthread_create_fptr != NULL) { return false; } + /* + * Try the next symbol first, because 1) when lazy_lock is in use we + * have a wrapper for pthread_create; and 2) the application may define + * its own wrapper as well (and can call malloc within the wrapper).
+ */ pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create"); if (pthread_create_fptr == NULL) { - can_enable_background_thread = false; - if (config_lazy_lock || opt_background_thread) { + if (config_lazy_lock) { malloc_write("<jemalloc>: Error in dlsym(RTLD_NEXT, " "\"pthread_create\")\n"); abort(); + } else { + /* Fall back to the default symbol. */ + pthread_create_fptr = pthread_create; } - } else { - can_enable_background_thread = true; } return false; diff --git a/src/ctl.c b/src/ctl.c index ef3eca4..9ea2bb3 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1556,13 +1556,6 @@ background_thread_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, background_thread_enabled_set(tsd_tsdn(tsd), newval); if (newval) { - if (!can_enable_background_thread) { - malloc_printf("<jemalloc>: Error in dlsym(" - "RTLD_NEXT, \"pthread_create\"). Cannot " - "enable background_thread\n"); - ret = EFAULT; - goto label_return; - } if (background_threads_enable(tsd)) { ret = EFAULT; goto label_return; } @@ -1617,13 +1610,6 @@ max_background_threads_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, } if (background_thread_enabled()) { - if (!can_enable_background_thread) { - malloc_printf("<jemalloc>: Error in dlsym(" - "RTLD_NEXT, \"pthread_create\"). Cannot " - "enable background_thread\n"); - ret = EFAULT; - goto label_return; - } background_thread_enabled_set(tsd_tsdn(tsd), false); if (background_threads_disable(tsd)) { ret = EFAULT; -- cgit v0.12 From 94a88c26f4d9cffd884a349201e7605f13495f3f Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 21 May 2018 13:33:48 -0700 Subject: Implement huge arena: opt.huge_threshold. The feature allows using a dedicated arena for huge allocations. We want the additional arena to separate huge allocations because: 1) mixing small extents with huge ones causes fragmentation over the long run (this feature reduces VM size significantly); 2) with many arenas, huge extents rarely get reused across threads; and 3) huge allocations happen way less frequently, so lock contention is not a concern.
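A usage sketch (mirroring the unit test added later in this series; the sizes are illustrative):

    /* Route auto-arena allocations >= 2 MiB to the dedicated huge arena;
     * 0 (the default) leaves the feature disabled. */
    const char *malloc_conf = "huge_threshold:2097152";

    void *big = malloc(4 << 20);  /* >= threshold: served by the huge arena */
    void *small = malloc(64);     /* < threshold: normal auto arena selection */

Explicit MALLOCX_ARENA() requests and threads bound to manual arenas are not redirected; only automatic arena selection consults the threshold.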
--- include/jemalloc/internal/arena_externs.h | 5 ++ include/jemalloc/internal/arena_inlines_b.h | 21 ++++++++ include/jemalloc/internal/arena_types.h | 6 +++ .../internal/jemalloc_internal_inlines_b.h | 4 +- src/arena.c | 58 +++++++++++++++++++++- src/jemalloc.c | 13 +++-- src/large.c | 2 +- test/unit/mallctl.c | 3 ++ 8 files changed, 106 insertions(+), 6 deletions(-) diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index f46820f..c145c91 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -17,6 +17,9 @@ extern const char *percpu_arena_mode_names[]; extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS]; extern malloc_mutex_t arenas_lock; +extern size_t opt_huge_threshold; +extern size_t huge_threshold; + void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy); @@ -81,6 +84,8 @@ void arena_nthreads_inc(arena_t *arena, bool internal); void arena_nthreads_dec(arena_t *arena, bool internal); size_t arena_extent_sn_next(arena_t *arena); arena_t *arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks); +bool arena_init_huge(void); +arena_t *arena_choose_huge(tsd_t *tsd); void arena_boot(void); void arena_prefork0(tsdn_t *tsdn, arena_t *arena); void arena_prefork1(tsdn_t *tsdn, arena_t *arena); diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 2b7e77e..401be75 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -8,6 +8,27 @@ #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" +JEMALLOC_ALWAYS_INLINE arena_t * +arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) { + if (arena != NULL) { + return arena; + } + + /* + * For huge allocations, use the dedicated huge arena if both are true: + * 1) the allocation uses auto arena selection (i.e. arena == NULL), + * and 2) the thread is not assigned to a manual arena. + */ + if (unlikely(size >= huge_threshold)) { + arena_t *tsd_arena = tsd_arena_get(tsd); + if (tsd_arena == NULL || arena_is_auto(tsd_arena)) { + return arena_choose_huge(tsd); + } + } + + return arena_choose(tsd, NULL); +} + JEMALLOC_ALWAYS_INLINE prof_tctx_t * arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) { cassert(config_prof); diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h index 70001b5..759713c 100644 --- a/include/jemalloc/internal/arena_types.h +++ b/include/jemalloc/internal/arena_types.h @@ -40,4 +40,10 @@ typedef enum { #define PERCPU_ARENA_ENABLED(m) ((m) >= percpu_arena_mode_enabled_base) #define PERCPU_ARENA_DEFAULT percpu_arena_disabled +/* + * When allocation_size >= huge_threshold, use the dedicated huge arena (unless + * an arena index has been explicitly specified). 0 disables the feature.
+ */ +#define HUGE_THRESHOLD_DEFAULT 0 + #endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */ diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_b.h b/include/jemalloc/internal/jemalloc_internal_inlines_b.h index 2e76e5d..8b0ac46 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_b.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_b.h @@ -71,7 +71,9 @@ arena_ichoose(tsd_t *tsd, arena_t *arena) { static inline bool arena_is_auto(arena_t *arena) { assert(narenas_auto > 0); - return (arena_ind_get(arena) < narenas_auto); + unsigned offset = (opt_huge_threshold != 0) ? 1 : 0; + + return (arena_ind_get(arena) < narenas_auto + offset); } JEMALLOC_ALWAYS_INLINE extent_t * diff --git a/src/arena.c b/src/arena.c index b5c3dbe..49d86d2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,6 +42,10 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { static div_info_t arena_binind_div_info[NBINS]; +size_t opt_huge_threshold = HUGE_THRESHOLD_DEFAULT; +size_t huge_threshold = HUGE_THRESHOLD_DEFAULT; +static unsigned huge_arena_ind; + /******************************************************************************/ /* * Function prototypes for static functions that are referenced prior to @@ -1378,7 +1382,7 @@ arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, assert(!tsdn_null(tsdn) || arena != NULL); if (likely(!tsdn_null(tsdn))) { - arena = arena_choose(tsdn_tsd(tsdn), arena); + arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, size); } if (unlikely(arena == NULL)) { return NULL; } @@ -1939,6 +1943,58 @@ label_error: return NULL; } +arena_t * +arena_choose_huge(tsd_t *tsd) { + /* huge_arena_ind can be 0 during init (will use a0). */ + if (huge_arena_ind == 0) { + assert(!malloc_initialized()); + } + + arena_t *huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, false); + if (huge_arena == NULL) { + /* Create the huge arena on demand. */ + assert(huge_arena_ind != 0); + huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, true); + if (huge_arena == NULL) { + return NULL; + } + /* + * Purge eagerly for huge allocations, because: 1) the number + * of huge allocations is usually small, which means + * ticker-based decay is not reliable; and 2) less immediate + * reuse is expected for huge allocations. + */ + if (arena_dirty_decay_ms_default_get() > 0) { + arena_dirty_decay_ms_set(tsd_tsdn(tsd), huge_arena, 0); + } + if (arena_muzzy_decay_ms_default_get() > 0) { + arena_muzzy_decay_ms_set(tsd_tsdn(tsd), huge_arena, 0); + } + } + + return huge_arena; +} + +bool +arena_init_huge(void) { + bool huge_enabled; + + /* The threshold should be a large size class. */ + if (opt_huge_threshold > LARGE_MAXCLASS || + opt_huge_threshold < LARGE_MINCLASS) { + opt_huge_threshold = 0; + huge_threshold = LARGE_MAXCLASS + PAGE; + huge_enabled = false; + } else { + /* Reserve the index for the huge arena.
*/ + huge_arena_ind = narenas_total_get(); + huge_threshold = opt_huge_threshold; + huge_enabled = true; + } + + return huge_enabled; +} + void arena_boot(void) { arena_dirty_decay_ms_default_set(opt_dirty_decay_ms); diff --git a/src/jemalloc.c b/src/jemalloc.c index 300e897..594669c 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -327,7 +327,7 @@ arena_init_locked(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { */ arena = arena_get(tsdn, ind, false); if (arena != NULL) { - assert(ind < narenas_auto); + assert(arena_is_auto(arena)); return arena; } @@ -1142,11 +1142,15 @@ malloc_conf_init(void) { CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc") } CONF_HANDLE_BOOL(opt_tcache, "tcache") + CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max", + -1, (sizeof(size_t) << 3) - 1) + + CONF_HANDLE_SIZE_T(opt_huge_threshold, "huge_threshold", + LARGE_MINCLASS, LARGE_MAXCLASS, yes, yes, false) CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, (sizeof(size_t) << 3), yes, yes, false) - CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max", - -1, (sizeof(size_t) << 3) - 1) + if (strncmp("percpu_arena", k, klen) == 0) { bool match = false; for (int i = percpu_arena_mode_names_base; i < @@ -1465,6 +1469,9 @@ malloc_init_narenas(void) { narenas_auto); } narenas_total_set(narenas_auto); + if (arena_init_huge()) { + narenas_total_inc(); + } return false; } diff --git a/src/large.c b/src/large.c index 4951f3e..03eecfa 100644 --- a/src/large.c +++ b/src/large.c @@ -42,7 +42,7 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, */ is_zeroed = zero; if (likely(!tsdn_null(tsdn))) { - arena = arena_choose(tsdn_tsd(tsdn), arena); + arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, usize); } if (unlikely(arena == NULL) || (extent = arena_extent_alloc_large(tsdn, arena, usize, alignment, &is_zeroed)) == NULL) { diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 8a36c0a..4ecf5bd 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -341,6 +341,9 @@ TEST_BEGIN(test_thread_arena) { sz = sizeof(unsigned); assert_d_eq(mallctl("arenas.narenas", (void *)&narenas, &sz, NULL, 0), 0, "Unexpected mallctl() failure"); + if (opt_huge_threshold != 0) { + narenas--; + } assert_u_eq(narenas, opt_narenas, "Number of arenas incorrect"); if (strcmp(opa, "disabled") == 0) { -- cgit v0.12 From 79522b2fc225f709a4ca7503c00f56df5d667160 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 1 Jun 2018 15:06:36 -0700 Subject: Refactor arena_is_auto. --- include/jemalloc/internal/jemalloc_internal_externs.h | 3 +++ include/jemalloc/internal/jemalloc_internal_inlines_b.h | 3 +-- src/jemalloc.c | 8 ++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h index e10fb27..5beebc0 100644 --- a/include/jemalloc/internal/jemalloc_internal_externs.h +++ b/include/jemalloc/internal/jemalloc_internal_externs.h @@ -25,6 +25,9 @@ extern unsigned ncpus; /* Number of arenas used for automatic multiplexing of threads and arenas. */ extern unsigned narenas_auto; +/* Base index for manual arenas. */ +extern unsigned manual_arena_base; + /* * Arenas that are used to service external requests. Not all elements of the * arenas array are necessarily used; arenas are created lazily as needed. 
diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_b.h b/include/jemalloc/internal/jemalloc_internal_inlines_b.h index 8b0ac46..70d6e57 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_b.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_b.h @@ -71,9 +71,8 @@ arena_ichoose(tsd_t *tsd, arena_t *arena) { static inline bool arena_is_auto(arena_t *arena) { assert(narenas_auto > 0); - unsigned offset = (opt_huge_threshold != 0) ? 1 : 0; - return (arena_ind_get(arena) < narenas_auto + offset); + return (arena_ind_get(arena) < manual_arena_base); } JEMALLOC_ALWAYS_INLINE extent_t * diff --git a/src/jemalloc.c b/src/jemalloc.c index 594669c..aded139 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -86,8 +86,10 @@ malloc_mutex_t arenas_lock; JEMALLOC_ALIGNED(CACHELINE) atomic_p_t arenas[MALLOCX_ARENA_LIMIT]; static atomic_u_t narenas_total; /* Use narenas_total_*(). */ -static arena_t *a0; /* arenas[0]; read-only after initialization. */ -unsigned narenas_auto; /* Read-only after initialization. */ +/* The three variables below are read-only after initialization. */ +static arena_t *a0; /* arenas[0]. */ +unsigned narenas_auto; +unsigned manual_arena_base; typedef enum { malloc_init_uninitialized = 3, @@ -1322,6 +1324,7 @@ malloc_init_hard_a0_locked() { * malloc_ncpus(). */ narenas_auto = 1; + manual_arena_base = narenas_auto + 1; memset(arenas, 0, sizeof(arena_t *) * narenas_auto); /* * Initialize one arena here. The rest are lazily created in @@ -1472,6 +1475,7 @@ malloc_init_narenas(void) { if (arena_init_huge()) { narenas_total_inc(); } + manual_arena_base = narenas_total_get(); return false; } -- cgit v0.12 From 1302af4c43e031304b422e36fcbb9e159804e0ac Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 1 Jun 2018 14:45:19 -0700 Subject: Add ctl and stats for opt.huge_threshold.
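Read-side sketch for the new read-only ctl (a minimal example, in the same style as the other opt.* entries):

    size_t thresh;
    size_t sz = sizeof(thresh);
    if (mallctl("opt.huge_threshold", &thresh, &sz, NULL, 0) == 0) {
        /* thresh == 0 means the feature is disabled. */
    }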
--- src/ctl.c | 3 +++ src/stats.c | 1 + test/unit/mallctl.c | 1 + 3 files changed, 5 insertions(+) diff --git a/src/ctl.c b/src/ctl.c index 9ea2bb3..6d0bb92 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -85,6 +85,7 @@ CTL_PROTO(opt_retain) CTL_PROTO(opt_dss) CTL_PROTO(opt_narenas) CTL_PROTO(opt_percpu_arena) +CTL_PROTO(opt_huge_threshold) CTL_PROTO(opt_background_thread) CTL_PROTO(opt_max_background_threads) CTL_PROTO(opt_dirty_decay_ms) @@ -288,6 +289,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("dss"), CTL(opt_dss)}, {NAME("narenas"), CTL(opt_narenas)}, {NAME("percpu_arena"), CTL(opt_percpu_arena)}, + {NAME("huge_threshold"), CTL(opt_huge_threshold)}, {NAME("background_thread"), CTL(opt_background_thread)}, {NAME("max_background_threads"), CTL(opt_max_background_threads)}, {NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)}, @@ -1658,6 +1660,7 @@ CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned) CTL_RO_NL_GEN(opt_percpu_arena, percpu_arena_mode_names[opt_percpu_arena], const char *) +CTL_RO_NL_GEN(opt_huge_threshold, opt_huge_threshold, size_t) CTL_RO_NL_GEN(opt_background_thread, opt_background_thread, bool) CTL_RO_NL_GEN(opt_max_background_threads, opt_max_background_threads, size_t) CTL_RO_NL_GEN(opt_dirty_decay_ms, opt_dirty_decay_ms, ssize_t) diff --git a/src/stats.c b/src/stats.c index 85e68a7..9cfc850 100644 --- a/src/stats.c +++ b/src/stats.c @@ -910,6 +910,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_CHAR_P("dss") OPT_WRITE_UNSIGNED("narenas") OPT_WRITE_CHAR_P("percpu_arena") + OPT_WRITE_SIZE_T("huge_threshold") OPT_WRITE_CHAR_P("metadata_thp") OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread") OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms") OPT_WRITE_SSIZE_T_MUTABLE("muzzy_decay_ms", "arenas.muzzy_decay_ms") diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 8a36c0a..b4e01af 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -164,6 +164,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(const char *, dss, always); TEST_MALLCTL_OPT(unsigned, narenas, always); TEST_MALLCTL_OPT(const char *, percpu_arena, always); + TEST_MALLCTL_OPT(size_t, huge_threshold, always); TEST_MALLCTL_OPT(bool, background_thread, always); TEST_MALLCTL_OPT(ssize_t, dirty_decay_ms, always); TEST_MALLCTL_OPT(ssize_t, muzzy_decay_ms, always); -- cgit v0.12 From ff622eeab51325979226d5430c68a08d3e00b26b Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 1 Jun 2018 15:58:31 -0700 Subject: Add unit test for opt.huge_threshold. --- Makefile.in | 1 + test/unit/huge.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 test/unit/huge.c diff --git a/Makefile.in b/Makefile.in index 3b3191f..81f899f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -174,6 +174,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/fork.c \ $(srcroot)test/unit/hash.c \ $(srcroot)test/unit/hook.c \ + $(srcroot)test/unit/huge.c \ $(srcroot)test/unit/junk.c \ $(srcroot)test/unit/junk_alloc.c \ $(srcroot)test/unit/junk_free.c \ diff --git a/test/unit/huge.c b/test/unit/huge.c new file mode 100644 index 0000000..7e54d07 --- /dev/null +++ b/test/unit/huge.c @@ -0,0 +1,108 @@ +#include "test/jemalloc_test.h" + +/* Threshold: 2 << 20 = 2097152. */ +const char *malloc_conf = "huge_threshold:2097152"; + +#define HUGE_SZ (2 << 20) +#define SMALL_SZ (8) + +TEST_BEGIN(huge_bind_thread) { + unsigned arena1, arena2; + size_t sz = sizeof(unsigned); + + /* Bind to a manual arena.
*/ + assert_d_eq(mallctl("arenas.create", &arena1, &sz, NULL, 0), 0, + "Failed to create arena"); + assert_d_eq(mallctl("thread.arena", NULL, NULL, &arena1, + sizeof(arena1)), 0, "Fail to bind thread"); + + void *ptr = mallocx(HUGE_SZ, 0); + assert_ptr_not_null(ptr, "Fail to allocate huge size"); + assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr, + sizeof(ptr)), 0, "Unexpected mallctl() failure"); + assert_u_eq(arena1, arena2, "Wrong arena used after binding"); + dallocx(ptr, 0); + + /* Switch back to arena 0. */ + test_skip_if(have_percpu_arena && + PERCPU_ARENA_ENABLED(opt_percpu_arena)); + arena2 = 0; + assert_d_eq(mallctl("thread.arena", NULL, NULL, &arena2, + sizeof(arena2)), 0, "Fail to bind thread"); + ptr = mallocx(SMALL_SZ, MALLOCX_TCACHE_NONE); + assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr, + sizeof(ptr)), 0, "Unexpected mallctl() failure"); + assert_u_eq(arena2, 0, "Wrong arena used after binding"); + dallocx(ptr, MALLOCX_TCACHE_NONE); + + /* Then huge allocation should use the huge arena. */ + ptr = mallocx(HUGE_SZ, 0); + assert_ptr_not_null(ptr, "Fail to allocate huge size"); + assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr, + sizeof(ptr)), 0, "Unexpected mallctl() failure"); + assert_u_ne(arena2, 0, "Wrong arena used after binding"); + assert_u_ne(arena1, arena2, "Wrong arena used after binding"); + dallocx(ptr, 0); +} +TEST_END + +TEST_BEGIN(huge_mallocx) { + unsigned arena1, arena2; + size_t sz = sizeof(unsigned); + + assert_d_eq(mallctl("arenas.create", &arena1, &sz, NULL, 0), 0, + "Failed to create arena"); + void *huge = mallocx(HUGE_SZ, MALLOCX_ARENA(arena1)); + assert_ptr_not_null(huge, "Fail to allocate huge size"); + assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &huge, + sizeof(huge)), 0, "Unexpected mallctl() failure"); + assert_u_eq(arena1, arena2, "Wrong arena used for mallocx"); + dallocx(huge, MALLOCX_ARENA(arena1)); + + void *huge2 = mallocx(HUGE_SZ, 0); + assert_ptr_not_null(huge2, "Fail to allocate huge size"); + assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &huge2, + sizeof(huge2)), 0, "Unexpected mallctl() failure"); + assert_u_ne(arena1, arena2, + "Huge allocation should not come from the manual arena."); + assert_u_ne(arena2, 0, + "Huge allocation should not come from arena 0."); + dallocx(huge2, 0); +} +TEST_END + +TEST_BEGIN(huge_allocation) { + unsigned arena1, arena2; + + void *ptr = mallocx(HUGE_SZ, 0); + assert_ptr_not_null(ptr, "Fail to allocate huge size"); + size_t sz = sizeof(unsigned); + assert_d_eq(mallctl("arenas.lookup", &arena1, &sz, &ptr, sizeof(ptr)), + 0, "Unexpected mallctl() failure"); + assert_u_gt(arena1, 0, "Huge allocation should not come from arena 0"); + dallocx(ptr, 0); + + ptr = mallocx(HUGE_SZ >> 1, 0); + assert_ptr_not_null(ptr, "Fail to allocate half huge size"); + assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr, + sizeof(ptr)), 0, "Unexpected mallctl() failure"); + assert_u_ne(arena1, arena2, "Wrong arena used for half huge"); + dallocx(ptr, 0); + + ptr = mallocx(SMALL_SZ, MALLOCX_TCACHE_NONE); + assert_ptr_not_null(ptr, "Fail to allocate small size"); + assert_d_eq(mallctl("arenas.lookup", &arena2, &sz, &ptr, + sizeof(ptr)), 0, "Unexpected mallctl() failure"); + assert_u_ne(arena1, arena2, + "Huge and small should be from different arenas"); + dallocx(ptr, 0); +} +TEST_END + +int +main(void) { + return test( + huge_allocation, + huge_mallocx, + huge_bind_thread); +} -- cgit v0.12 From cdf15b458a1c348722fa43cb1813ac3a93fdc634 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon,
4 Jun 2018 11:04:29 -0700 Subject: Rename huge_threshold to experimental, and tweak documentation. --- doc/jemalloc.xml.in | 13 ++++++++----- src/ctl.c | 2 +- src/jemalloc.c | 4 +++- src/stats.c | 2 +- test/unit/huge.c | 2 +- test/unit/mallctl.c | 2 +- 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 1e12fd3..0dcfb98 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1055,7 +1055,9 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", linkend="arena.i.dirty_decay_ms">arena.<i>.dirty_decay_ms for related dynamic control options. See opt.muzzy_decay_ms - for a description of muzzy pages. + for a description of muzzy pages. Note that when the huge_threshold + feature is enabled, the special auto arenas may use its own decay + settings. @@ -1763,10 +1765,11 @@ malloc_conf = "xmalloc:true";]]> to control allocation for arenas explicitly created via arenas.create such that all extents originate from an application-supplied extent allocator - (by specifying the custom extent hook functions during arena creation), - but the automatically created arenas will have already created extents - prior to the application having an opportunity to take over extent - allocation. + (by specifying the custom extent hook functions during arena creation). + However, the API guarantees for the automatically created arenas may be + relaxed -- hooks set there may be called in a "best effort" fashion; in + addition there may be extents created prior to the application having an + opportunity to take over extent allocation. Date: Thu, 31 May 2018 19:28:06 +0300 Subject: Fix MSVC build --- msvc/projects/vc2015/jemalloc/jemalloc.vcxproj | 2 +- msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters | 2 +- msvc/projects/vc2017/jemalloc/jemalloc.vcxproj | 3 ++- msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters | 5 ++++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj index f7b175b..be252d7 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj @@ -47,7 +47,7 @@ - + diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters index 11cfcd0..00d0960 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters @@ -37,7 +37,7 @@ Source Files - + Source Files diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj index ed71de8..599cc42 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj @@ -47,7 +47,7 @@ - + @@ -62,6 +62,7 @@ + diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters index 11cfcd0..b352721 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters @@ -37,7 +37,7 @@ Source Files - + Source Files @@ -97,5 +97,8 @@ Source Files + + Source Files + \ No newline at end of file -- cgit v0.12 From 3d29d11ac2c1583b9959f73c0548545018d31c8a Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Thu, 3 May 2018 11:40:53 +0200 Subject: Clean compilation -Wextra Before this commit jemalloc produced many warnings when compiled with -Wextra with both Clang and GCC. 
This commit fixes the issues raised by these warnings, or suppresses them when they were spurious, at least for the Clang and GCC versions covered by CI. This commit: * adds `JEMALLOC_DIAGNOSTIC` macros: `JEMALLOC_DIAGNOSTIC_{PUSH,POP}` are used to modify the stack of enabled diagnostics. The `JEMALLOC_DIAGNOSTIC_IGNORE_...` macros are used to ignore a concrete diagnostic. * adds a `JEMALLOC_FALLTHROUGH` macro to explicitly state that falling through `case` labels in a `switch` statement is intended * removes all UNUSED annotations on function parameters. The warning -Wunused-parameter is now disabled globally in `jemalloc_internal_macros.h` for all translation units that include that header. It is never re-enabled since that header cannot be included by users. * locally suppresses some -Wextra diagnostics: * `-Wmissing-field-initializers` is buggy in older Clang and GCC versions, which do not understand that, in C, `= {0}` is a common idiom to initialize a struct to zero * `-Wtype-limits` is suppressed in a particular situation where a generic macro, used in multiple different places, checks whether an unsigned integer is smaller than zero, which is always false. * `-Walloc-size-larger-than=` diagnostics warn when an allocation function is called with a size that is too large (out-of-range). These are suppressed in the parts of the tests where `jemalloc` explicitly does this to test that the allocation functions fail properly. * adds a new CI build bot that runs the log unit test on CI. Closes #1196. --- .travis.yml | 7 +- configure.ac | 2 + include/jemalloc/internal/arena_inlines_b.h | 6 +- include/jemalloc/internal/arena_stats.h | 22 ++-- include/jemalloc/internal/atomic_gcc_sync.h | 14 ++- include/jemalloc/internal/extent_inlines.h | 2 +- include/jemalloc/internal/hash.h | 65 +++++----- .../jemalloc/internal/jemalloc_internal_macros.h | 58 +++++++++ include/jemalloc/internal/mutex.h | 18 ++- include/jemalloc/internal/prof_inlines_a.h | 6 +- include/jemalloc/internal/rtree.h | 20 +-- include/jemalloc/internal/rtree_tsd.h | 2 +- include/jemalloc/internal/tcache_inlines.h | 2 +- include/jemalloc/internal/tsd_generic.h | 6 + include/jemalloc/internal/tsd_tls.h | 2 +- src/arena.c | 12 +- src/background_thread.c | 4 +- src/ctl.c | 136 ++++++++++++--------- src/extent.c | 6 +- src/jemalloc.c | 12 +- src/mutex.c | 2 +- src/rtree.c | 4 +- src/tcache.c | 2 +- src/tsd.c | 6 + test/integration/aligned_alloc.c | 14 +++ test/integration/mallocx.c | 13 ++ test/integration/overflow.c | 13 ++ test/integration/rallocx.c | 13 ++ test/unit/emitter.c | 6 +- 29 files changed, 328 insertions(+), 147 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4cc116e..7d93ead 100644 --- a/.travis.yml +++ b/.travis.yml @@ -143,7 +143,12 @@ matrix: env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=dss:primary,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - + - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + script: + - make check + - make -j test/unit/log + - test/unit/log before_script: - autoconf diff --git a/configure.ac b/configure.ac index a6a08db..1c20911 100644 --- a/configure.ac +++ b/configure.ac @@ -242,6 +242,7 @@ if test "x$GCC" = "xyes" ; then fi fi
JE_CFLAGS_ADD([-Wall]) + JE_CFLAGS_ADD([-Wextra]) JE_CFLAGS_ADD([-Wshorten-64-to-32]) JE_CFLAGS_ADD([-Wsign-compare]) JE_CFLAGS_ADD([-Wundef]) @@ -289,6 +290,7 @@ if test "x$enable_cxx" = "x1" ; then AX_CXX_COMPILE_STDCXX([14], [noext], [optional]) if test "x${HAVE_CXX14}" = "x1" ; then JE_CXXFLAGS_ADD([-Wall]) + JE_CXXFLAGS_ADD([-Wextra]) JE_CXXFLAGS_ADD([-g3]) SAVED_LIBS="${LIBS}" diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 401be75..d388cae 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -49,7 +49,7 @@ arena_prof_tctx_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) { } JEMALLOC_ALWAYS_INLINE void -arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, UNUSED size_t usize, +arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, size_t usize, alloc_ctx_t *alloc_ctx, prof_tctx_t *tctx) { cassert(config_prof); assert(ptr != NULL); @@ -68,7 +68,7 @@ arena_prof_tctx_set(tsdn_t *tsdn, const void *ptr, UNUSED size_t usize, } static inline void -arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, UNUSED prof_tctx_t *tctx) { +arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx) { cassert(config_prof); assert(ptr != NULL); @@ -318,7 +318,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, szind_t szind; bool slab; - UNUSED alloc_ctx_t local_ctx; + alloc_ctx_t local_ctx; if (config_prof && opt_prof) { if (alloc_ctx == NULL) { /* Uncommon case and should be a static check. */ diff --git a/include/jemalloc/internal/arena_stats.h b/include/jemalloc/internal/arena_stats.h index 5f3dca8..39b7262 100644 --- a/include/jemalloc/internal/arena_stats.h +++ b/include/jemalloc/internal/arena_stats.h @@ -6,6 +6,8 @@ #include "jemalloc/internal/mutex_prof.h" #include "jemalloc/internal/size_classes.h" +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + /* * In those architectures that support 64-bit atomics, we use atomic updates for * our 64-bit values. 
Otherwise, we use a plain uint64_t and synchronize @@ -95,7 +97,7 @@ struct arena_stats_s { }; static inline bool -arena_stats_init(UNUSED tsdn_t *tsdn, arena_stats_t *arena_stats) { +arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) { if (config_debug) { for (size_t i = 0; i < sizeof(arena_stats_t); i++) { assert(((char *)arena_stats)[i] == 0); @@ -147,11 +149,11 @@ arena_stats_add_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, #endif } -UNUSED static inline void +static inline void arena_stats_sub_u64(tsdn_t *tsdn, arena_stats_t *arena_stats, arena_stats_u64_t *p, uint64_t x) { #ifdef JEMALLOC_ATOMIC_U64 - UNUSED uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED); + uint64_t r = atomic_fetch_sub_u64(p, x, ATOMIC_RELAXED); assert(r - x <= r); #else malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); @@ -176,7 +178,8 @@ arena_stats_accum_u64(arena_stats_u64_t *dst, uint64_t src) { } static inline size_t -arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) { +arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, + atomic_zu_t *p) { #ifdef JEMALLOC_ATOMIC_U64 return atomic_load_zu(p, ATOMIC_RELAXED); #else @@ -186,8 +189,8 @@ arena_stats_read_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p) { } static inline void -arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p, - size_t x) { +arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, + atomic_zu_t *p, size_t x) { #ifdef JEMALLOC_ATOMIC_U64 atomic_fetch_add_zu(p, x, ATOMIC_RELAXED); #else @@ -198,10 +201,10 @@ arena_stats_add_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p, } static inline void -arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, atomic_zu_t *p, - size_t x) { +arena_stats_sub_zu(tsdn_t *tsdn, arena_stats_t *arena_stats, + atomic_zu_t *p, size_t x) { #ifdef JEMALLOC_ATOMIC_U64 - UNUSED size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED); + size_t r = atomic_fetch_sub_zu(p, x, ATOMIC_RELAXED); assert(r - x <= r); #else malloc_mutex_assert_owner(tsdn, &arena_stats->mtx); @@ -233,5 +236,4 @@ arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats, size_t size) { arena_stats_unlock(tsdn, arena_stats); } - #endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */ diff --git a/include/jemalloc/internal/atomic_gcc_sync.h b/include/jemalloc/internal/atomic_gcc_sync.h index 30846e4..06a0acf 100644 --- a/include/jemalloc/internal/atomic_gcc_sync.h +++ b/include/jemalloc/internal/atomic_gcc_sync.h @@ -113,8 +113,8 @@ atomic_store_##short_type(atomic_##short_type##_t *a, \ } \ \ ATOMIC_INLINE type \ -atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ - atomic_memory_order_t mo) { \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ /* \ * Because of FreeBSD, we care about gcc 4.2, which doesn't have\ * an atomic exchange builtin. We fake it with a CAS loop. 
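/*
 * Aside, not part of the patch: the "fake an exchange with a CAS loop"
 * idiom that the comment above describes, written out for a plain int
 * using the same __sync builtin family. The helper name is illustrative
 * only.
 */
static inline int
exchange_int_sketch(int *p, int val) {
	for (;;) {
		int old = *p;
		if (__sync_bool_compare_and_swap(p, old, val)) {
			return old;
		}
	}
}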
\ @@ -129,8 +129,9 @@ atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ \ ATOMIC_INLINE bool \ atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ - type *expected, type desired, atomic_memory_order_t success_mo, \ - atomic_memory_order_t failure_mo) { \ + type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ type prev = __sync_val_compare_and_swap(&a->repr, *expected, \ desired); \ if (prev == *expected) { \ @@ -142,8 +143,9 @@ atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ } \ ATOMIC_INLINE bool \ atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ - type *expected, type desired, atomic_memory_order_t success_mo, \ - atomic_memory_order_t failure_mo) { \ + type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ type prev = __sync_val_compare_and_swap(&a->repr, *expected, \ desired); \ if (prev == *expected) { \ diff --git a/include/jemalloc/internal/extent_inlines.h b/include/jemalloc/internal/extent_inlines.h index 77181df..9b8ddc2 100644 --- a/include/jemalloc/internal/extent_inlines.h +++ b/include/jemalloc/internal/extent_inlines.h @@ -190,7 +190,7 @@ extent_addr_set(extent_t *extent, void *addr) { } static inline void -extent_addr_randomize(UNUSED tsdn_t *tsdn, extent_t *extent, size_t alignment) { +extent_addr_randomize(tsdn_t *tsdn, extent_t *extent, size_t alignment) { assert(extent_base_get(extent) == extent_addr_get(extent)); if (alignment < PAGE) { diff --git a/include/jemalloc/internal/hash.h b/include/jemalloc/internal/hash.h index dcfc992..0270034 100644 --- a/include/jemalloc/internal/hash.h +++ b/include/jemalloc/internal/hash.h @@ -104,8 +104,8 @@ hash_x86_32(const void *key, int len, uint32_t seed) { uint32_t k1 = 0; switch (len & 3) { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; + case 3: k1 ^= tail[2] << 16; JEMALLOC_FALLTHROUGH + case 2: k1 ^= tail[1] << 8; JEMALLOC_FALLTHROUGH case 1: k1 ^= tail[0]; k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1; } @@ -119,7 +119,7 @@ hash_x86_32(const void *key, int len, uint32_t seed) { return h1; } -UNUSED static inline void +static inline void hash_x86_128(const void *key, const int len, uint32_t seed, uint64_t r_out[2]) { const uint8_t * data = (const uint8_t *) key; @@ -177,28 +177,29 @@ hash_x86_128(const void *key, const int len, uint32_t seed, uint32_t k4 = 0; switch (len & 15) { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; + case 15: k4 ^= tail[14] << 16; JEMALLOC_FALLTHROUGH + case 14: k4 ^= tail[13] << 8; JEMALLOC_FALLTHROUGH case 13: k4 ^= tail[12] << 0; k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4; - - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: k3 ^= tail[ 9] << 8; + JEMALLOC_FALLTHROUGH + case 12: k3 ^= tail[11] << 24; JEMALLOC_FALLTHROUGH + case 11: k3 ^= tail[10] << 16; JEMALLOC_FALLTHROUGH + case 10: k3 ^= tail[ 9] << 8; JEMALLOC_FALLTHROUGH case 9: k3 ^= tail[ 8] << 0; k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3; - - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; + JEMALLOC_FALLTHROUGH + case 8: k2 ^= tail[ 7] << 24; JEMALLOC_FALLTHROUGH + case 7: k2 ^= tail[ 6] << 16; JEMALLOC_FALLTHROUGH + case 6: k2 ^= tail[ 5] << 8; JEMALLOC_FALLTHROUGH case 5: k2 ^= tail[ 4] << 0; k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2; - - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= 
tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; + JEMALLOC_FALLTHROUGH + case 4: k1 ^= tail[ 3] << 24; JEMALLOC_FALLTHROUGH + case 3: k1 ^= tail[ 2] << 16; JEMALLOC_FALLTHROUGH + case 2: k1 ^= tail[ 1] << 8; JEMALLOC_FALLTHROUGH case 1: k1 ^= tail[ 0] << 0; k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1; + JEMALLOC_FALLTHROUGH } } @@ -220,7 +221,7 @@ hash_x86_128(const void *key, const int len, uint32_t seed, r_out[1] = (((uint64_t) h4) << 32) | h3; } -UNUSED static inline void +static inline void hash_x64_128(const void *key, const int len, const uint32_t seed, uint64_t r_out[2]) { const uint8_t *data = (const uint8_t *) key; @@ -260,22 +261,22 @@ hash_x64_128(const void *key, const int len, const uint32_t seed, uint64_t k2 = 0; switch (len & 15) { - case 15: k2 ^= ((uint64_t)(tail[14])) << 48; /* falls through */ - case 14: k2 ^= ((uint64_t)(tail[13])) << 40; /* falls through */ - case 13: k2 ^= ((uint64_t)(tail[12])) << 32; /* falls through */ - case 12: k2 ^= ((uint64_t)(tail[11])) << 24; /* falls through */ - case 11: k2 ^= ((uint64_t)(tail[10])) << 16; /* falls through */ - case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; /* falls through */ + case 15: k2 ^= ((uint64_t)(tail[14])) << 48; JEMALLOC_FALLTHROUGH + case 14: k2 ^= ((uint64_t)(tail[13])) << 40; JEMALLOC_FALLTHROUGH + case 13: k2 ^= ((uint64_t)(tail[12])) << 32; JEMALLOC_FALLTHROUGH + case 12: k2 ^= ((uint64_t)(tail[11])) << 24; JEMALLOC_FALLTHROUGH + case 11: k2 ^= ((uint64_t)(tail[10])) << 16; JEMALLOC_FALLTHROUGH + case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; JEMALLOC_FALLTHROUGH case 9: k2 ^= ((uint64_t)(tail[ 8])) << 0; k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2; - /* falls through */ - case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; /* falls through */ - case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; /* falls through */ - case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; /* falls through */ - case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; /* falls through */ - case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; /* falls through */ - case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; /* falls through */ - case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; /* falls through */ + JEMALLOC_FALLTHROUGH + case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; JEMALLOC_FALLTHROUGH + case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; JEMALLOC_FALLTHROUGH + case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; JEMALLOC_FALLTHROUGH + case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; JEMALLOC_FALLTHROUGH + case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; JEMALLOC_FALLTHROUGH + case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; JEMALLOC_FALLTHROUGH + case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; JEMALLOC_FALLTHROUGH case 1: k1 ^= ((uint64_t)(tail[ 0])) << 0; k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1; } diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h index ed75d37..a1a761b 100644 --- a/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/include/jemalloc/internal/jemalloc_internal_macros.h @@ -40,4 +40,62 @@ #define JEMALLOC_VA_ARGS_HEAD(head, ...) head #define JEMALLOC_VA_ARGS_TAIL(head, ...) 
__VA_ARGS__ +#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__clang__) \ + && defined(JEMALLOC_HAVE_ATTR) && (__GNUC__ >= 7) +#define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough); +#else +#define JEMALLOC_FALLTHROUGH /* falls through */ +#endif + + +/* Diagnostic suppression macros */ +#if defined(_MSC_VER) && !defined(__clang__) +# define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push)) +# define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop)) +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable:W)) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS +#elif defined(__GNUC__) || defined(__clang__) +/* + * The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang + * diagnostic suppression macros and should not be used anywhere else. + */ +# define JEMALLOC_PRAGMA__(X) _Pragma(#X) +# define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push) +# define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop) +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) JEMALLOC_PRAGMA__(GCC diagnostic ignored W) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers") +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits") +# define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter") +# if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7) +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=") +# else +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# endif +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \ + JEMALLOC_DIAGNOSTIC_PUSH \ + JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER +#else +# define JEMALLOC_DIAGNOSTIC_PUSH +# define JEMALLOC_DIAGNOSTIC_POP +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS +#endif + +/* + * Disables spurious diagnostics for all headers + * Since these headers are not included by users directly, + * it does not affect their diagnostic settings. 
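+ * (The matching JEMALLOC_DIAGNOSTIC_POP is intentionally omitted: the
+ * suppression is meant to stay in effect for every internal translation
+ * unit that includes this header.)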
+ */ +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + #endif /* JEMALLOC_INTERNAL_MACROS_H */ diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h index 6520c25..651ce5f 100644 --- a/include/jemalloc/internal/mutex.h +++ b/include/jemalloc/internal/mutex.h @@ -101,9 +101,15 @@ struct malloc_mutex_s { #ifdef _WIN32 # define MALLOC_MUTEX_INITIALIZER #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) -# define MALLOC_MUTEX_INITIALIZER \ +# if defined(JEMALLOC_DEBUG) +# define MALLOC_MUTEX_INITIALIZER \ {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif #elif (defined(JEMALLOC_OSSPIN)) # define MALLOC_MUTEX_INITIALIZER \ {{{LOCK_PROF_DATA_INITIALIZER, 0}}, \ @@ -111,12 +117,18 @@ struct malloc_mutex_s { #elif (defined(JEMALLOC_MUTEX_INIT_CB)) # define MALLOC_MUTEX_INITIALIZER \ {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \ - WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} #else # define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT +# if defined(JEMALLOC_DEBUG) # define MALLOC_MUTEX_INITIALIZER \ {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \ - WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif #endif #ifdef JEMALLOC_LAZY_LOCK diff --git a/include/jemalloc/internal/prof_inlines_a.h b/include/jemalloc/internal/prof_inlines_a.h index a6efb48..c39bc3d 100644 --- a/include/jemalloc/internal/prof_inlines_a.h +++ b/include/jemalloc/internal/prof_inlines_a.h @@ -4,7 +4,8 @@ #include "jemalloc/internal/mutex.h" static inline bool -prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum, uint64_t accumbytes) { +prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum, + uint64_t accumbytes) { cassert(config_prof); bool overflow; @@ -42,7 +43,8 @@ prof_accum_add(tsdn_t *tsdn, prof_accum_t *prof_accum, uint64_t accumbytes) { } static inline void -prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, size_t usize) { +prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, + size_t usize) { cassert(config_prof); /* diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index b59d33a..dd452f1 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -170,8 +170,8 @@ rtree_subkey(uintptr_t key, unsigned level) { */ # ifdef RTREE_LEAF_COMPACT JEMALLOC_ALWAYS_INLINE uintptr_t -rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, - bool dependent) { +rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm, bool dependent) { return (uintptr_t)atomic_load_p(&elm->le_bits, dependent ? 
ATOMIC_RELAXED : ATOMIC_ACQUIRE); } @@ -208,7 +208,7 @@ rtree_leaf_elm_bits_slab_get(uintptr_t bits) { # endif JEMALLOC_ALWAYS_INLINE extent_t * -rtree_leaf_elm_extent_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, +rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, bool dependent) { #ifdef RTREE_LEAF_COMPACT uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent); @@ -221,7 +221,7 @@ rtree_leaf_elm_extent_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, } JEMALLOC_ALWAYS_INLINE szind_t -rtree_leaf_elm_szind_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, +rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, bool dependent) { #ifdef RTREE_LEAF_COMPACT uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent); @@ -233,7 +233,7 @@ rtree_leaf_elm_szind_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, } JEMALLOC_ALWAYS_INLINE bool -rtree_leaf_elm_slab_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, +rtree_leaf_elm_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, bool dependent) { #ifdef RTREE_LEAF_COMPACT uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent); @@ -245,7 +245,7 @@ rtree_leaf_elm_slab_read(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, } static inline void -rtree_leaf_elm_extent_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, +rtree_leaf_elm_extent_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, extent_t *extent) { #ifdef RTREE_LEAF_COMPACT uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, true); @@ -259,7 +259,7 @@ rtree_leaf_elm_extent_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, } static inline void -rtree_leaf_elm_szind_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, +rtree_leaf_elm_szind_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, szind_t szind) { assert(szind <= NSIZES); @@ -277,7 +277,7 @@ rtree_leaf_elm_szind_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, } static inline void -rtree_leaf_elm_slab_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, +rtree_leaf_elm_slab_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, bool slab) { #ifdef RTREE_LEAF_COMPACT uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, @@ -292,8 +292,8 @@ rtree_leaf_elm_slab_write(UNUSED tsdn_t *tsdn, UNUSED rtree_t *rtree, } static inline void -rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, - extent_t *extent, szind_t szind, bool slab) { +rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm, extent_t *extent, szind_t szind, bool slab) { #ifdef RTREE_LEAF_COMPACT uintptr_t bits = ((uintptr_t)szind << LG_VADDR) | ((uintptr_t)extent & (((uintptr_t)0x1 << LG_VADDR) - 1)) | diff --git a/include/jemalloc/internal/rtree_tsd.h b/include/jemalloc/internal/rtree_tsd.h index 93a7517..562e292 100644 --- a/include/jemalloc/internal/rtree_tsd.h +++ b/include/jemalloc/internal/rtree_tsd.h @@ -26,7 +26,7 @@ * Zero initializer required for tsd initialization only. Proper initialization * done via rtree_ctx_data_init(). 
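 * (The initializer below now spells out both fields of each cache
 * element, which keeps the buggy -Wmissing-field-initializers
 * diagnostic quiet.)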
*/ -#define RTREE_CTX_ZERO_INITIALIZER {{{0}}, {{0}}} +#define RTREE_CTX_ZERO_INITIALIZER {{{0, 0}}, {{0, 0}}} typedef struct rtree_leaf_elm_s rtree_leaf_elm_t; diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h index 0f6ab8c..c426c56 100644 --- a/include/jemalloc/internal/tcache_inlines.h +++ b/include/jemalloc/internal/tcache_inlines.h @@ -40,7 +40,7 @@ tcache_event(tsd_t *tsd, tcache_t *tcache) { JEMALLOC_ALWAYS_INLINE void * tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, - UNUSED size_t size, szind_t binind, bool zero, bool slow_path) { + size_t size, szind_t binind, bool zero, bool slow_path) { void *ret; cache_bin_t *bin; bool tcache_success; diff --git a/include/jemalloc/internal/tsd_generic.h b/include/jemalloc/internal/tsd_generic.h index 1e52ef7..cf73c0c 100644 --- a/include/jemalloc/internal/tsd_generic.h +++ b/include/jemalloc/internal/tsd_generic.h @@ -77,7 +77,10 @@ tsd_wrapper_get(bool init) { abort(); } else { wrapper->initialized = false; + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS tsd_t initializer = TSD_INITIALIZER; + JEMALLOC_DIAGNOSTIC_POP wrapper->val = initializer; } tsd_wrapper_set(wrapper); @@ -107,7 +110,10 @@ tsd_boot1(void) { tsd_boot_wrapper.initialized = false; tsd_cleanup(&tsd_boot_wrapper.val); wrapper->initialized = false; + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS tsd_t initializer = TSD_INITIALIZER; + JEMALLOC_DIAGNOSTIC_POP wrapper->val = initializer; tsd_wrapper_set(wrapper); } diff --git a/include/jemalloc/internal/tsd_tls.h b/include/jemalloc/internal/tsd_tls.h index 0de64b7..757aaa0 100644 --- a/include/jemalloc/internal/tsd_tls.h +++ b/include/jemalloc/internal/tsd_tls.h @@ -39,7 +39,7 @@ tsd_get_allocates(void) { /* Get/set. */ JEMALLOC_ALWAYS_INLINE tsd_t * -tsd_get(UNUSED bool init) { +tsd_get(bool init) { assert(tsd_booted); return &tsd_tls; } diff --git a/src/arena.c b/src/arena.c index 49d86d2..eefea0d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -11,6 +11,8 @@ #include "jemalloc/internal/size_classes.h" #include "jemalloc/internal/util.h" +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + /******************************************************************************/ /* Data. 
*/ @@ -65,7 +67,7 @@ static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, /******************************************************************************/ void -arena_basic_stats_merge(UNUSED tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, +arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy) { *nthreads += arena_nthreads_get(arena, false); @@ -752,7 +754,7 @@ static size_t arena_decay_stashed(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, arena_decay_t *decay, extents_t *extents, bool all, extent_list_t *decay_extents, bool is_background_thread) { - UNUSED size_t nmadvise, nunmapped; + size_t nmadvise, nunmapped; size_t npurged; if (config_stats) { @@ -843,7 +845,7 @@ arena_decay_to_limit(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay, size_t npurge = arena_stash_decayed(tsdn, arena, &extent_hooks, extents, npages_limit, npages_decay_max, &decay_extents); if (npurge != 0) { - UNUSED size_t npurged = arena_decay_stashed(tsdn, arena, + size_t npurged = arena_decay_stashed(tsdn, arena, &extent_hooks, decay, extents, all, &decay_extents, is_background_thread); assert(npurged == npurge); @@ -872,7 +874,7 @@ arena_decay_impl(tsdn_t *tsdn, arena_t *arena, arena_decay_t *decay, bool epoch_advanced = arena_maybe_decay(tsdn, arena, decay, extents, is_background_thread); - UNUSED size_t npages_new; + size_t npages_new; if (epoch_advanced) { /* Backlog is updated on epoch advance. */ npages_new = decay->backlog[SMOOTHSTEP_NSTEPS-1]; @@ -1508,7 +1510,7 @@ arena_dalloc_bin_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, } static void -arena_bin_lower_slab(UNUSED tsdn_t *tsdn, arena_t *arena, extent_t *slab, +arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, bin_t *bin) { assert(extent_nfree_get(slab) > 0); diff --git a/src/background_thread.c b/src/background_thread.c index 4613537..feed856 100644 --- a/src/background_thread.c +++ b/src/background_thread.c @@ -4,6 +4,8 @@ #include "jemalloc/internal/assert.h" +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + /******************************************************************************/ /* Data. 
*/ @@ -78,7 +80,7 @@ background_thread_info_init(tsdn_t *tsdn, background_thread_info_t *info) { } static inline bool -set_current_thread_affinity(UNUSED int cpu) { +set_current_thread_affinity(int cpu) { #if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) cpu_set_t cpuset; CPU_ZERO(&cpuset); diff --git a/src/ctl.c b/src/ctl.c index 5c94cdb..3f7dea1 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1392,8 +1392,8 @@ label_return: \ #define CTL_RO_CGEN(c, n, v, t) \ static int \ -n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, \ - size_t *oldlenp, void *newp, size_t newlen) { \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ int ret; \ t oldval; \ \ @@ -1435,8 +1435,8 @@ label_return: \ */ #define CTL_RO_NL_CGEN(c, n, v, t) \ static int \ -n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, \ - size_t *oldlenp, void *newp, size_t newlen) { \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ int ret; \ t oldval; \ \ @@ -1454,8 +1454,8 @@ label_return: \ #define CTL_RO_NL_GEN(n, v, t) \ static int \ -n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, \ - size_t *oldlenp, void *newp, size_t newlen) { \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ int ret; \ t oldval; \ \ @@ -1489,8 +1489,8 @@ label_return: \ #define CTL_RO_CONFIG_GEN(n, t) \ static int \ -n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, \ - size_t *oldlenp, void *newp, size_t newlen) { \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ int ret; \ t oldval; \ \ @@ -1508,8 +1508,8 @@ label_return: \ CTL_RO_NL_GEN(version, JEMALLOC_VERSION, const char *) static int -epoch_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +epoch_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; UNUSED uint64_t newval; @@ -1527,8 +1527,9 @@ label_return: } static int -background_thread_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen) { +background_thread_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { int ret; bool oldval; @@ -1578,8 +1579,9 @@ label_return: } static int -max_background_threads_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen) { +max_background_threads_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { int ret; size_t oldval; @@ -1691,8 +1693,8 @@ CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool) /******************************************************************************/ static int -thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; arena_t *oldarena; unsigned newind, oldind; @@ -1756,8 +1758,9 @@ CTL_TSD_RO_NL_CGEN(config_stats, thread_deallocatedp, tsd_thread_deallocatedp_get, uint64_t *) static int -thread_tcache_enabled_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, 
size_t newlen) { +thread_tcache_enabled_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { int ret; bool oldval; @@ -1777,8 +1780,9 @@ label_return: } static int -thread_tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen) { +thread_tcache_flush_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { int ret; if (!tcache_available(tsd)) { @@ -1797,8 +1801,9 @@ label_return: } static int -thread_prof_name_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +thread_prof_name_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { int ret; if (!config_prof) { @@ -1828,8 +1833,9 @@ label_return: } static int -thread_prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +thread_prof_active_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { int ret; bool oldval; @@ -1858,8 +1864,8 @@ label_return: /******************************************************************************/ static int -tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; unsigned tcache_ind; @@ -1876,8 +1882,8 @@ label_return: } static int -tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; unsigned tcache_ind; @@ -1896,8 +1902,8 @@ label_return: } static int -tcache_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +tcache_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; unsigned tcache_ind; @@ -2299,8 +2305,9 @@ label_return: } static int -arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen) { +arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { int ret; unsigned arena_ind; arena_t *arena; @@ -2335,7 +2342,8 @@ label_return: } static const ctl_named_node_t * -arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { +arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, + size_t i) { const ctl_named_node_t *ret; malloc_mutex_lock(tsdn, &ctl_mtx); @@ -2360,8 +2368,8 @@ label_return: /******************************************************************************/ static int -arenas_narenas_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +arenas_narenas_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; unsigned narenas; @@ -2381,8 +2389,9 @@ label_return: } static int -arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen, bool dirty) { +arenas_decay_ms_ctl_impl(tsd_t *tsd, 
const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen, bool dirty) { int ret; if (oldp != NULL && oldlenp != NULL) { @@ -2430,7 +2439,8 @@ CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t) CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t) CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t) static const ctl_named_node_t * -arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { +arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { if (i > NBINS) { return NULL; } @@ -2441,8 +2451,8 @@ CTL_RO_NL_GEN(arenas_nlextents, NSIZES - NBINS, unsigned) CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(NBINS+(szind_t)mib[2]), size_t) static const ctl_named_node_t * -arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, - size_t i) { +arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { if (i > NSIZES - NBINS) { return NULL; } @@ -2450,8 +2460,8 @@ arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, } static int -arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; extent_hooks_t *extent_hooks; unsigned arena_ind; @@ -2473,8 +2483,9 @@ label_return: } static int -arenas_lookup_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +arenas_lookup_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { int ret; unsigned arena_ind; void *ptr; @@ -2505,8 +2516,9 @@ label_return: /******************************************************************************/ static int -prof_thread_active_init_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen) { +prof_thread_active_init_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { int ret; bool oldval; @@ -2532,8 +2544,8 @@ label_return: } static int -prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; bool oldval; @@ -2558,8 +2570,8 @@ label_return: } static int -prof_dump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +prof_dump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; const char *filename = NULL; @@ -2581,8 +2593,8 @@ label_return: } static int -prof_gdump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +prof_gdump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; bool oldval; @@ -2607,8 +2619,8 @@ label_return: } static int -prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) { +prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; size_t lg_sample = lg_prof_sample; @@ -2764,8 +2776,9 @@ 
RO_MUTEX_CTL_GEN(arenas_i_bins_j_mutex, /* Resets all mutex stats, including global, arena and bin mutexes. */ static int -stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, - void *oldp, size_t *oldlenp, void *newp, size_t newlen) { +stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { if (!config_stats) { return ENOENT; } @@ -2834,8 +2847,8 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs, arenas_i(mib[2])->astats->bstats[mib[4]].curslabs, size_t) static const ctl_named_node_t * -stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, - size_t j) { +stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { if (j > NBINS) { return NULL; } @@ -2855,8 +2868,8 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents, arenas_i(mib[2])->astats->lstats[mib[4]].curlextents, size_t) static const ctl_named_node_t * -stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, - size_t j) { +stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { if (j > NSIZES - NBINS) { return NULL; } @@ -2864,7 +2877,8 @@ stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, } static const ctl_named_node_t * -stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { +stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { const ctl_named_node_t *ret; size_t a; diff --git a/src/extent.c b/src/extent.c index 09d6d77..4b1a6df 100644 --- a/src/extent.c +++ b/src/extent.c @@ -119,9 +119,13 @@ static void extent_record(tsdn_t *tsdn, arena_t *arena, /******************************************************************************/ -ph_gen(UNUSED, extent_avail_, extent_tree_t, extent_t, ph_link, +#define ATTR_NONE /* does nothing */ + +ph_gen(ATTR_NONE, extent_avail_, extent_tree_t, extent_t, ph_link, extent_esnead_comp) +#undef ATTR_NONE + typedef enum { lock_result_success, lock_result_failure, diff --git a/src/jemalloc.c b/src/jemalloc.c index 28d1344..82c0887 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -970,6 +970,14 @@ malloc_conf_init(void) { } \ continue; \ } + /* + * One of the CONF_MIN macros below expands, in one of the use points, + * to "unsigned integer < 0", which is always false, triggering the + * GCC -Wtype-limits warning, which we disable here and re-enable below. 
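+ * Concretely, CONF_MIN_yes(um, min) expands to ((um) < (min)); with an
+ * unsigned um and a min of 0 that is an always-false comparison.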
+ */ + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS + #define CONF_MIN_no(um, min) false #define CONF_MIN_yes(um, min) ((um) < (min)) #define CONF_MAX_no(um, max) false @@ -1246,6 +1254,8 @@ malloc_conf_init(void) { #undef CONF_HANDLE_SIZE_T #undef CONF_HANDLE_SSIZE_T #undef CONF_HANDLE_CHAR_P + /* Re-enable diagnostic "-Wtype-limits" */ + JEMALLOC_DIAGNOSTIC_POP } if (opt_abort_conf && had_conf_error) { malloc_abort_invalid_conf(); @@ -2992,7 +3002,7 @@ label_not_resized: JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW JEMALLOC_ATTR(pure) -je_sallocx(const void *ptr, UNUSED int flags) { +je_sallocx(const void *ptr, int flags) { size_t usize; tsdn_t *tsdn; diff --git a/src/mutex.c b/src/mutex.c index 30222b3..55e37ad 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -46,7 +46,7 @@ JEMALLOC_EXPORT int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void malloc_mutex_lock_slow(malloc_mutex_t *mutex) { mutex_prof_data_t *data = &mutex->prof_data; - UNUSED nstime_t before = NSTIME_ZERO_INITIALIZER; + nstime_t before = NSTIME_ZERO_INITIALIZER; if (ncpus == 1) { goto label_spin_done; diff --git a/src/rtree.c b/src/rtree.c index 53702cf..4ae41fe 100644 --- a/src/rtree.c +++ b/src/rtree.c @@ -39,7 +39,7 @@ rtree_node_dalloc_impl(tsdn_t *tsdn, rtree_t *rtree, rtree_node_elm_t *node) { /* Nodes are never deleted during normal operation. */ not_reached(); } -UNUSED rtree_node_dalloc_t *JET_MUTABLE rtree_node_dalloc = +rtree_node_dalloc_t *JET_MUTABLE rtree_node_dalloc = rtree_node_dalloc_impl; static rtree_leaf_elm_t * @@ -54,7 +54,7 @@ rtree_leaf_dalloc_impl(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *leaf) { /* Leaves are never deleted during normal operation. */ not_reached(); } -UNUSED rtree_leaf_dalloc_t *JET_MUTABLE rtree_leaf_dalloc = +rtree_leaf_dalloc_t *JET_MUTABLE rtree_leaf_dalloc = rtree_leaf_dalloc_impl; #ifdef JEMALLOC_JET diff --git a/src/tcache.c b/src/tcache.c index af75754..d624d92 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -206,7 +206,7 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, /* Lock the arena associated with the first object. 
*/ extent_t *extent = item_extent[0]; arena_t *locked_arena = extent_arena_get(extent); - UNUSED bool idump; + bool idump; if (config_prof) { idump = false; diff --git a/src/tsd.c b/src/tsd.c index 4eceee7..f2b601d 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -12,6 +12,10 @@ static unsigned ncleanups; static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX]; +/* TSD_INITIALIZER triggers "-Wmissing-field-initializer" */ +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP __thread tsd_t JEMALLOC_TLS_MODEL tsd_tls = TSD_INITIALIZER; __thread bool JEMALLOC_TLS_MODEL tsd_initialized = false; @@ -41,6 +45,7 @@ tsd_init_head_t tsd_init_head = { ql_head_initializer(blocks), MALLOC_MUTEX_INITIALIZER }; + tsd_wrapper_t tsd_boot_wrapper = { false, TSD_INITIALIZER @@ -48,6 +53,7 @@ tsd_wrapper_t tsd_boot_wrapper = { bool tsd_booted = false; #endif +JEMALLOC_DIAGNOSTIC_POP /******************************************************************************/ diff --git a/test/integration/aligned_alloc.c b/test/integration/aligned_alloc.c index 536b67e..cfe1df9 100644 --- a/test/integration/aligned_alloc.c +++ b/test/integration/aligned_alloc.c @@ -34,6 +34,17 @@ TEST_BEGIN(test_alignment_errors) { } TEST_END + +/* + * GCC "-Walloc-size-larger-than" warning detects when one of the memory + * allocation functions is called with a size larger than the maximum size that + * they support. Here we want to explicitly test that the allocation functions + * do indeed fail properly when this is the case, which triggers the warning. + * Therefore we disable the warning for these tests. + */ +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN + TEST_BEGIN(test_oom_errors) { size_t alignment, size; void *p; @@ -78,6 +89,9 @@ TEST_BEGIN(test_oom_errors) { } TEST_END +/* Re-enable the "-Walloc-size-larger-than=" warning */ +JEMALLOC_DIAGNOSTIC_POP + TEST_BEGIN(test_alignment_and_size) { #define NITER 4 size_t alignment, size, total; diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c index 9fe3ad5..ce5069a 100644 --- a/test/integration/mallocx.c +++ b/test/integration/mallocx.c @@ -51,6 +51,16 @@ purge(void) { "Unexpected mallctl error"); } +/* + * GCC "-Walloc-size-larger-than" warning detects when one of the memory + * allocation functions is called with a size larger than the maximum size that + * they support. Here we want to explicitly test that the allocation functions + * do indeed fail properly when this is the case, which triggers the warning. + * Therefore we disable the warning for these tests. + */ +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN + TEST_BEGIN(test_overflow) { size_t largemax; @@ -145,6 +155,9 @@ TEST_BEGIN(test_oom) { } TEST_END +/* Re-enable the "-Walloc-size-larger-than=" warning */ +JEMALLOC_DIAGNOSTIC_POP + TEST_BEGIN(test_basic) { #define MAXSZ (((size_t)1) << 23) size_t sz; diff --git a/test/integration/overflow.c b/test/integration/overflow.c index 6a9785b..748ebb6 100644 --- a/test/integration/overflow.c +++ b/test/integration/overflow.c @@ -1,5 +1,15 @@ #include "test/jemalloc_test.h" +/* + * GCC "-Walloc-size-larger-than" warning detects when one of the memory + * allocation functions is called with a size larger than the maximum size that + * they support. Here we want to explicitly test that the allocation functions + * do indeed fail properly when this is the case, which triggers the warning. 
+ * Therefore we disable the warning for these tests. + */ +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN + TEST_BEGIN(test_overflow) { unsigned nlextents; size_t mib[4]; @@ -39,6 +49,9 @@ TEST_BEGIN(test_overflow) { } TEST_END +/* Re-enable the "-Walloc-size-larger-than=" warning */ +JEMALLOC_DIAGNOSTIC_POP + int main(void) { return test( diff --git a/test/integration/rallocx.c b/test/integration/rallocx.c index 7821ca5..08ed08d 100644 --- a/test/integration/rallocx.c +++ b/test/integration/rallocx.c @@ -208,6 +208,16 @@ TEST_BEGIN(test_lg_align_and_zero) { } TEST_END +/* + * GCC "-Walloc-size-larger-than" warning detects when one of the memory + * allocation functions is called with a size larger than the maximum size that + * they support. Here we want to explicitly test that the allocation functions + * do indeed fail properly when this is the case, which triggers the warning. + * Therefore we disable the warning for these tests. + */ +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN + TEST_BEGIN(test_overflow) { size_t largemax; void *p; @@ -234,6 +244,9 @@ TEST_BEGIN(test_overflow) { } TEST_END +/* Re-enable the "-Walloc-size-larger-than=" warning */ +JEMALLOC_DIAGNOSTIC_POP + int main(void) { return test( diff --git a/test/unit/emitter.c b/test/unit/emitter.c index 535c7cf..6ffd1c3 100644 --- a/test/unit/emitter.c +++ b/test/unit/emitter.c @@ -347,11 +347,11 @@ static void emit_table_row(emitter_t *emitter) { emitter_begin(emitter); emitter_row_t row; - emitter_col_t abc = {emitter_justify_left, 10, emitter_type_title}; + emitter_col_t abc = {emitter_justify_left, 10, emitter_type_title, {0}, {0, 0}}; abc.str_val = "ABC title"; - emitter_col_t def = {emitter_justify_right, 15, emitter_type_title}; + emitter_col_t def = {emitter_justify_right, 15, emitter_type_title, {0}, {0, 0}}; def.str_val = "DEF title"; - emitter_col_t ghi = {emitter_justify_right, 5, emitter_type_title}; + emitter_col_t ghi = {emitter_justify_right, 5, emitter_type_title, {0}, {0, 0}}; ghi.str_val = "GHI"; emitter_row_init(&row); -- cgit v0.12 From fb924dd7bf5e765ffcb273b6b88a515fea54fea8 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Tue, 10 Jul 2018 14:48:18 +0200 Subject: Suppress -Wmissing-field-initializer warning only for compilers with buggy implementation --- include/jemalloc/internal/jemalloc_internal_macros.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h index a1a761b..ec8782e 100644 --- a/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/include/jemalloc/internal/jemalloc_internal_macros.h @@ -66,8 +66,19 @@ # define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push) # define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop) # define JEMALLOC_DIAGNOSTIC_IGNORE(W) JEMALLOC_PRAGMA__(GCC diagnostic ignored W) -# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \ - JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers") + +/* + * The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and + * all clang versions up to version 7 (currently trunk, unreleased). + * This macro suppresses the warning for the affected compiler versions only. 
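+ * On compilers not affected by the bug, the macro expands to nothing
+ * and the warning stays enabled.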
+ */ +# if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || defined(__clang__) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers") +# else +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# endif + # define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS \ JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits") # define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \ -- cgit v0.12 From e904f813b40b4286e10172163c880fd9e1d0608a Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 14 Dec 2017 12:46:39 -0800 Subject: Hide size class computation behind a layer of indirection. This class removes almost all the dependencies on size_classes.h, accessing the data there only via the new module sc.h, which does not depend on any configuration options. In a subsequent commit, we'll remove the configure-time size class computations, doing them at boot time, instead. --- Makefile.in | 1 + include/jemalloc/internal/arena_externs.h | 1 - include/jemalloc/internal/arena_inlines_b.h | 28 +- include/jemalloc/internal/arena_stats.h | 6 +- include/jemalloc/internal/arena_structs_b.h | 4 +- include/jemalloc/internal/arena_types.h | 4 +- include/jemalloc/internal/base_structs.h | 4 +- include/jemalloc/internal/bin.h | 8 +- include/jemalloc/internal/bit_util.h | 68 +++++ include/jemalloc/internal/bitmap.h | 6 +- include/jemalloc/internal/ctl.h | 6 +- include/jemalloc/internal/extent_inlines.h | 9 +- include/jemalloc/internal/extent_structs.h | 9 +- include/jemalloc/internal/extent_types.h | 2 - .../jemalloc/internal/jemalloc_internal_externs.h | 1 - .../internal/jemalloc_internal_inlines_a.h | 8 +- .../internal/jemalloc_internal_inlines_c.h | 2 +- include/jemalloc/internal/prof_inlines_a.h | 8 +- include/jemalloc/internal/rtree.h | 14 +- include/jemalloc/internal/sc.h | 302 +++++++++++++++++++++ include/jemalloc/internal/sz.h | 165 ++++++----- include/jemalloc/internal/tcache_externs.h | 4 +- include/jemalloc/internal/tcache_inlines.h | 12 +- include/jemalloc/internal/tcache_structs.h | 10 +- include/jemalloc/internal/tcache_types.h | 4 +- src/arena.c | 108 ++++---- src/base.c | 8 +- src/bin.c | 39 ++- src/ckh.c | 7 +- src/ctl.c | 28 +- src/extent.c | 51 ++-- src/extent_dss.c | 4 +- src/jemalloc.c | 78 +++--- src/large.c | 20 +- src/sc.c | 62 +++++ src/sz.c | 152 ++++------- src/tcache.c | 28 +- test/unit/arena_reset.c | 2 +- test/unit/junk.c | 5 +- test/unit/mallctl.c | 9 +- test/unit/prof_gdump.c | 8 +- test/unit/rtree.c | 17 +- test/unit/size_classes.c | 15 +- test/unit/slab.c | 2 +- test/unit/stats.c | 11 +- test/unit/zero.c | 5 +- 46 files changed, 886 insertions(+), 459 deletions(-) create mode 100644 include/jemalloc/internal/sc.h create mode 100644 src/sc.c diff --git a/Makefile.in b/Makefile.in index 81f899f..619aae7 100644 --- a/Makefile.in +++ b/Makefile.in @@ -114,6 +114,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \ $(srcroot)src/prof.c \ $(srcroot)src/rtree.c \ $(srcroot)src/stats.c \ + $(srcroot)src/sc.c \ $(srcroot)src/sz.c \ $(srcroot)src/tcache.c \ $(srcroot)src/test_hooks.c \ diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index c145c91..7a46946 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -5,7 +5,6 @@ #include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/hook.h" #include "jemalloc/internal/pages.h" -#include "jemalloc/internal/size_classes.h" #include 
"jemalloc/internal/stats.h" extern ssize_t opt_dirty_decay_ms; diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index d388cae..8960396 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -4,7 +4,7 @@ #include "jemalloc/internal/jemalloc_internal_types.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/rtree.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" @@ -111,7 +111,7 @@ arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero, assert(size != 0); if (likely(tcache != NULL)) { - if (likely(size <= SMALL_MAXCLASS)) { + if (likely(size <= sc_data_global.small_maxclass)) { return tcache_alloc_small(tsdn_tsd(tsdn), arena, tcache, size, ind, zero, slow_path); } @@ -140,7 +140,7 @@ arena_salloc(tsdn_t *tsdn, const void *ptr) { szind_t szind = rtree_szind_read(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr, true); - assert(szind != NSIZES); + assert(szind != SC_NSIZES); return sz_index2size(szind); } @@ -173,7 +173,7 @@ arena_vsalloc(tsdn_t *tsdn, const void *ptr) { /* Only slab members should be looked up via interior pointers. */ assert(extent_addr_get(extent) == ptr || extent_slab_get(extent)); - assert(szind != NSIZES); + assert(szind != SC_NSIZES); return sz_index2size(szind); } @@ -194,7 +194,7 @@ arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) { extent_t *extent = rtree_extent_read(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr, true); assert(szind == extent_szind_get(extent)); - assert(szind < NSIZES); + assert(szind < SC_NSIZES); assert(slab == extent_slab_get(extent)); } @@ -224,7 +224,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache, if (alloc_ctx != NULL) { szind = alloc_ctx->szind; slab = alloc_ctx->slab; - assert(szind != NSIZES); + assert(szind != SC_NSIZES); } else { rtree_ctx = tsd_rtree_ctx(tsdn_tsd(tsdn)); rtree_szind_slab_read(tsdn, &extents_rtree, rtree_ctx, @@ -236,7 +236,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache, extent_t *extent = rtree_extent_read(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr, true); assert(szind == extent_szind_get(extent)); - assert(szind < NSIZES); + assert(szind < SC_NSIZES); assert(slab == extent_slab_get(extent)); } @@ -246,7 +246,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache, slow_path); } else { if (szind < nhbins) { - if (config_prof && unlikely(szind < NBINS)) { + if (config_prof && unlikely(szind < SC_NBINS)) { arena_dalloc_promoted(tsdn, ptr, tcache, slow_path); } else { @@ -263,7 +263,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache, static inline void arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) { assert(ptr != NULL); - assert(size <= LARGE_MAXCLASS); + assert(size <= sc_data_global.large_maxclass); szind_t szind; bool slab; @@ -273,7 +273,7 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) { * object, so base szind and slab on the given size. 
*/ szind = sz_size2index(size); - slab = (szind < NBINS); + slab = (szind < SC_NBINS); } if ((config_prof && opt_prof) || config_debug) { @@ -285,7 +285,7 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) { (uintptr_t)ptr, true, &szind, &slab); assert(szind == sz_size2index(size)); - assert((config_prof && opt_prof) || slab == (szind < NBINS)); + assert((config_prof && opt_prof) || slab == (szind < SC_NBINS)); if (config_debug) { extent_t *extent = rtree_extent_read(tsdn, @@ -309,7 +309,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, alloc_ctx_t *alloc_ctx, bool slow_path) { assert(!tsdn_null(tsdn) || tcache == NULL); assert(ptr != NULL); - assert(size <= LARGE_MAXCLASS); + assert(size <= sc_data_global.large_maxclass); if (unlikely(tcache == NULL)) { arena_sdalloc_no_tcache(tsdn, ptr, size); @@ -339,7 +339,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, * object, so base szind and slab on the given size. */ szind = sz_size2index(size); - slab = (szind < NBINS); + slab = (szind < SC_NBINS); } if (config_debug) { @@ -358,7 +358,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, slow_path); } else { if (szind < nhbins) { - if (config_prof && unlikely(szind < NBINS)) { + if (config_prof && unlikely(szind < SC_NBINS)) { arena_dalloc_promoted(tsdn, ptr, tcache, slow_path); } else { diff --git a/include/jemalloc/internal/arena_stats.h b/include/jemalloc/internal/arena_stats.h index 39b7262..6dacf74 100644 --- a/include/jemalloc/internal/arena_stats.h +++ b/include/jemalloc/internal/arena_stats.h @@ -4,7 +4,7 @@ #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mutex_prof.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS @@ -90,7 +90,7 @@ struct arena_stats_s { mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]; /* One element for each large size class. */ - arena_stats_large_t lstats[NSIZES - NBINS]; + arena_stats_large_t lstats[SC_NSIZES - SC_NBINS]; /* Arena uptime. */ nstime_t uptime; @@ -225,7 +225,7 @@ arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats, szind_t szind, uint64_t nrequests) { arena_stats_lock(tsdn, arena_stats); arena_stats_add_u64(tsdn, arena_stats, &arena_stats->lstats[szind - - NBINS].nrequests, nrequests); + SC_NBINS].nrequests, nrequests); arena_stats_unlock(tsdn, arena_stats); } diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h index 38bc959..96f25f8 100644 --- a/include/jemalloc/internal/arena_structs_b.h +++ b/include/jemalloc/internal/arena_structs_b.h @@ -10,7 +10,7 @@ #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/nstime.h" #include "jemalloc/internal/ql.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/smoothstep.h" #include "jemalloc/internal/ticker.h" @@ -203,7 +203,7 @@ struct arena_s { * * Synchronization: internal. */ - bin_t bins[NBINS]; + bin_t bins[SC_NBINS]; /* * Base allocator, from which arena metadata are allocated. 
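A note on the pattern above: bounds that must be known at compile time (such
as the bins[SC_NBINS] array) use the new SC_ constants, while exact byte
boundaries move into the runtime-initialized sc_data_global. A minimal sketch
of the two forms, using only names visible in this patch (is_small_request is
a hypothetical helper, not part of the change):

/* Compile-time bound: usable as an array size, as in arena_s above. */
static bin_t example_bins[SC_NBINS];

/* Runtime boundary: read from sc_data_global on the allocation fast path. */
static inline bool
is_small_request(size_t size) {
        return size <= sc_data_global.small_maxclass;
}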
diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h index 759713c..c40ae6f 100644 --- a/include/jemalloc/internal/arena_types.h +++ b/include/jemalloc/internal/arena_types.h @@ -1,8 +1,10 @@ #ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H #define JEMALLOC_INTERNAL_ARENA_TYPES_H +#include "jemalloc/internal/sc.h" + /* Maximum number of regions in one slab. */ -#define LG_SLAB_MAXREGS (LG_PAGE - LG_TINY_MIN) +#define LG_SLAB_MAXREGS (LG_PAGE - SC_LG_TINY_MIN) #define SLAB_MAXREGS (1U << LG_SLAB_MAXREGS) /* Default decay times in milliseconds. */ diff --git a/include/jemalloc/internal/base_structs.h b/include/jemalloc/internal/base_structs.h index 2102247..07f214e 100644 --- a/include/jemalloc/internal/base_structs.h +++ b/include/jemalloc/internal/base_structs.h @@ -3,7 +3,7 @@ #include "jemalloc/internal/jemalloc_internal_types.h" #include "jemalloc/internal/mutex.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" /* Embedded at the beginning of every block of base-managed virtual memory. */ struct base_block_s { @@ -46,7 +46,7 @@ struct base_s { base_block_t *blocks; /* Heap of extents that track unused trailing space within blocks. */ - extent_heap_t avail[NSIZES]; + extent_heap_t avail[SC_NSIZES]; /* Stats, only maintained if config_stats. */ size_t allocated; diff --git a/include/jemalloc/internal/bin.h b/include/jemalloc/internal/bin.h index 9b416ad..e04b6c6 100644 --- a/include/jemalloc/internal/bin.h +++ b/include/jemalloc/internal/bin.h @@ -1,10 +1,11 @@ #ifndef JEMALLOC_INTERNAL_BIN_H #define JEMALLOC_INTERNAL_BIN_H +#include "jemalloc/internal/bin_stats.h" #include "jemalloc/internal/extent_types.h" #include "jemalloc/internal/extent_structs.h" #include "jemalloc/internal/mutex.h" -#include "jemalloc/internal/bin_stats.h" +#include "jemalloc/internal/sc.h" /* * A bin contains a set of extents that are currently being used for slab @@ -48,7 +49,7 @@ struct bin_info_s { bitmap_info_t bitmap_info; }; -extern const bin_info_t bin_infos[NBINS]; +extern bin_info_t bin_infos[SC_NBINS]; typedef struct bin_s bin_t; @@ -78,6 +79,9 @@ struct bin_s { bin_stats_t stats; }; +void bin_infos_init(sc_data_t *sc_data, bin_info_t bin_infos[SC_NBINS]); +void bin_boot(); + /* Initializes a bin to empty. Returns true on error. */ bool bin_init(bin_t *bin); diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h index 8d078a8..435b497 100644 --- a/include/jemalloc/internal/bit_util.h +++ b/include/jemalloc/internal/bit_util.h @@ -162,4 +162,72 @@ lg_floor(size_t x) { #undef BIT_UTIL_INLINE +/* A compile-time version of lg_ceil */ +#define LG_CEIL(x) ( \ + (x) <= (1ULL << 0ULL) ? 0 : \ + (x) <= (1ULL << 1ULL) ? 1 : \ + (x) <= (1ULL << 2ULL) ? 2 : \ + (x) <= (1ULL << 3ULL) ? 3 : \ + (x) <= (1ULL << 4ULL) ? 4 : \ + (x) <= (1ULL << 5ULL) ? 5 : \ + (x) <= (1ULL << 6ULL) ? 6 : \ + (x) <= (1ULL << 7ULL) ? 7 : \ + (x) <= (1ULL << 8ULL) ? 8 : \ + (x) <= (1ULL << 9ULL) ? 9 : \ + (x) <= (1ULL << 10ULL) ? 10 : \ + (x) <= (1ULL << 11ULL) ? 11 : \ + (x) <= (1ULL << 12ULL) ? 12 : \ + (x) <= (1ULL << 13ULL) ? 13 : \ + (x) <= (1ULL << 14ULL) ? 14 : \ + (x) <= (1ULL << 15ULL) ? 15 : \ + (x) <= (1ULL << 16ULL) ? 16 : \ + (x) <= (1ULL << 17ULL) ? 17 : \ + (x) <= (1ULL << 18ULL) ? 18 : \ + (x) <= (1ULL << 19ULL) ? 19 : \ + (x) <= (1ULL << 20ULL) ? 20 : \ + (x) <= (1ULL << 21ULL) ? 21 : \ + (x) <= (1ULL << 22ULL) ? 22 : \ + (x) <= (1ULL << 23ULL) ? 23 : \ + (x) <= (1ULL << 24ULL) ? 24 : \ + (x) <= (1ULL << 25ULL) ? 
25 : \ + (x) <= (1ULL << 26ULL) ? 26 : \ + (x) <= (1ULL << 27ULL) ? 27 : \ + (x) <= (1ULL << 28ULL) ? 28 : \ + (x) <= (1ULL << 29ULL) ? 29 : \ + (x) <= (1ULL << 30ULL) ? 30 : \ + (x) <= (1ULL << 31ULL) ? 31 : \ + (x) <= (1ULL << 32ULL) ? 32 : \ + (x) <= (1ULL << 33ULL) ? 33 : \ + (x) <= (1ULL << 34ULL) ? 34 : \ + (x) <= (1ULL << 35ULL) ? 35 : \ + (x) <= (1ULL << 36ULL) ? 36 : \ + (x) <= (1ULL << 37ULL) ? 37 : \ + (x) <= (1ULL << 38ULL) ? 38 : \ + (x) <= (1ULL << 39ULL) ? 39 : \ + (x) <= (1ULL << 40ULL) ? 40 : \ + (x) <= (1ULL << 41ULL) ? 41 : \ + (x) <= (1ULL << 42ULL) ? 42 : \ + (x) <= (1ULL << 43ULL) ? 43 : \ + (x) <= (1ULL << 44ULL) ? 44 : \ + (x) <= (1ULL << 45ULL) ? 45 : \ + (x) <= (1ULL << 46ULL) ? 46 : \ + (x) <= (1ULL << 47ULL) ? 47 : \ + (x) <= (1ULL << 48ULL) ? 48 : \ + (x) <= (1ULL << 49ULL) ? 49 : \ + (x) <= (1ULL << 50ULL) ? 50 : \ + (x) <= (1ULL << 51ULL) ? 51 : \ + (x) <= (1ULL << 52ULL) ? 52 : \ + (x) <= (1ULL << 53ULL) ? 53 : \ + (x) <= (1ULL << 54ULL) ? 54 : \ + (x) <= (1ULL << 55ULL) ? 55 : \ + (x) <= (1ULL << 56ULL) ? 56 : \ + (x) <= (1ULL << 57ULL) ? 57 : \ + (x) <= (1ULL << 58ULL) ? 58 : \ + (x) <= (1ULL << 59ULL) ? 59 : \ + (x) <= (1ULL << 60ULL) ? 60 : \ + (x) <= (1ULL << 61ULL) ? 61 : \ + (x) <= (1ULL << 62ULL) ? 62 : \ + (x) <= (1ULL << 63ULL) ? 63 : \ + 64) + #endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */ diff --git a/include/jemalloc/internal/bitmap.h b/include/jemalloc/internal/bitmap.h index ac99029..c3f9cb4 100644 --- a/include/jemalloc/internal/bitmap.h +++ b/include/jemalloc/internal/bitmap.h @@ -3,18 +3,18 @@ #include "jemalloc/internal/arena_types.h" #include "jemalloc/internal/bit_util.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" typedef unsigned long bitmap_t; #define LG_SIZEOF_BITMAP LG_SIZEOF_LONG /* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ -#if LG_SLAB_MAXREGS > LG_CEIL_NSIZES +#if LG_SLAB_MAXREGS > LG_CEIL(SC_NSIZES) /* Maximum bitmap bit count is determined by maximum regions per slab. */ # define LG_BITMAP_MAXBITS LG_SLAB_MAXREGS #else /* Maximum bitmap bit count is determined by number of extent size classes. */ -# define LG_BITMAP_MAXBITS LG_CEIL_NSIZES +# define LG_BITMAP_MAXBITS LG_CEIL(SC_NSIZES) #endif #define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS) diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index d927d94..5576310 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -5,7 +5,7 @@ #include "jemalloc/internal/malloc_io.h" #include "jemalloc/internal/mutex_prof.h" #include "jemalloc/internal/ql.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/stats.h" /* Maximum ctl tree depth. 
*/ @@ -40,8 +40,8 @@ typedef struct ctl_arena_stats_s { uint64_t ndalloc_small; uint64_t nrequests_small; - bin_stats_t bstats[NBINS]; - arena_stats_large_t lstats[NSIZES - NBINS]; + bin_stats_t bstats[SC_NBINS]; + arena_stats_large_t lstats[SC_NSIZES - SC_NBINS]; } ctl_arena_stats_t; typedef struct ctl_stats_s { diff --git a/include/jemalloc/internal/extent_inlines.h b/include/jemalloc/internal/extent_inlines.h index 9b8ddc2..a43d00d 100644 --- a/include/jemalloc/internal/extent_inlines.h +++ b/include/jemalloc/internal/extent_inlines.h @@ -6,6 +6,7 @@ #include "jemalloc/internal/pages.h" #include "jemalloc/internal/prng.h" #include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/sz.h" static inline void @@ -53,14 +54,14 @@ static inline szind_t extent_szind_get_maybe_invalid(const extent_t *extent) { szind_t szind = (szind_t)((extent->e_bits & EXTENT_BITS_SZIND_MASK) >> EXTENT_BITS_SZIND_SHIFT); - assert(szind <= NSIZES); + assert(szind <= SC_NSIZES); return szind; } static inline szind_t extent_szind_get(const extent_t *extent) { szind_t szind = extent_szind_get_maybe_invalid(extent); - assert(szind < NSIZES); /* Never call when "invalid". */ + assert(szind < SC_NSIZES); /* Never call when "invalid". */ return szind; } @@ -234,7 +235,7 @@ extent_bsize_set(extent_t *extent, size_t bsize) { static inline void extent_szind_set(extent_t *extent, szind_t szind) { - assert(szind <= NSIZES); /* NSIZES means "invalid". */ + assert(szind <= SC_NSIZES); /* SC_NSIZES means "invalid". */ extent->e_bits = (extent->e_bits & ~EXTENT_BITS_SZIND_MASK) | ((uint64_t)szind << EXTENT_BITS_SZIND_SHIFT); } @@ -327,7 +328,7 @@ extent_binit(extent_t *extent, void *addr, size_t bsize, size_t sn) { extent_addr_set(extent, addr); extent_bsize_set(extent, bsize); extent_slab_set(extent, false); - extent_szind_set(extent, NSIZES); + extent_szind_set(extent, SC_NSIZES); extent_sn_set(extent, sn); extent_state_set(extent, extent_state_active); extent_zeroed_set(extent, true); diff --git a/include/jemalloc/internal/extent_structs.h b/include/jemalloc/internal/extent_structs.h index 4873b9e..c6c1e23 100644 --- a/include/jemalloc/internal/extent_structs.h +++ b/include/jemalloc/internal/extent_structs.h @@ -2,11 +2,12 @@ #define JEMALLOC_INTERNAL_EXTENT_STRUCTS_H #include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/bit_util.h" #include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/ql.h" #include "jemalloc/internal/ph.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" typedef enum { extent_state_active = 0, @@ -112,7 +113,7 @@ struct extent_s { #define EXTENT_BITS_STATE_SHIFT (EXTENT_BITS_ZEROED_WIDTH + EXTENT_BITS_ZEROED_SHIFT) #define EXTENT_BITS_STATE_MASK MASK(EXTENT_BITS_STATE_WIDTH, EXTENT_BITS_STATE_SHIFT) -#define EXTENT_BITS_SZIND_WIDTH LG_CEIL_NSIZES +#define EXTENT_BITS_SZIND_WIDTH LG_CEIL(SC_NSIZES) #define EXTENT_BITS_SZIND_SHIFT (EXTENT_BITS_STATE_WIDTH + EXTENT_BITS_STATE_SHIFT) #define EXTENT_BITS_SZIND_MASK MASK(EXTENT_BITS_SZIND_WIDTH, EXTENT_BITS_SZIND_SHIFT) @@ -180,14 +181,14 @@ struct extents_s { * * Synchronization: mtx. */ - extent_heap_t heaps[NPSIZES+1]; + extent_heap_t heaps[SC_NPSIZES_MAX + 1]; /* * Bitmap for which set bits correspond to non-empty heaps. * * Synchronization: mtx. */ - bitmap_t bitmap[BITMAP_GROUPS(NPSIZES+1)]; + bitmap_t bitmap[BITMAP_GROUPS(SC_NPSIZES_MAX + 1)]; /* * LRU of all extents in heaps. 
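Note the sizing choice above: the heaps array and its bitmap are dimensioned
with the compile-time upper bound SC_NPSIZES_MAX + 1, while the number of page
size classes actually in use becomes the runtime value sc_data_global.npsizes
(consulted later in this patch, e.g. around sz_psz2ind). A hedged sketch of
the resulting idiom; the loop body is illustrative only:

/* Only the first npsizes + 1 heaps can ever be live; the rest are slack. */
for (pszind_t i = 0; i < (pszind_t)sc_data_global.npsizes + 1; i++) {
        extent_heap_t *heap = &extents->heaps[i];
        /* ... inspect heap contents ... */
}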
diff --git a/include/jemalloc/internal/extent_types.h b/include/jemalloc/internal/extent_types.h index c0561d9..acbcf27 100644 --- a/include/jemalloc/internal/extent_types.h +++ b/include/jemalloc/internal/extent_types.h @@ -6,8 +6,6 @@ typedef struct extents_s extents_t; #define EXTENT_HOOKS_INITIALIZER NULL -#define EXTENT_GROW_MAX_PIND (NPSIZES - 1) - /* * When reuse (and split) an active extent, (1U << opt_lg_extent_max_active_fit) * is the max ratio between the size of the active extent and the new extent. diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h index 5beebc0..b784362 100644 --- a/include/jemalloc/internal/jemalloc_internal_externs.h +++ b/include/jemalloc/internal/jemalloc_internal_externs.h @@ -2,7 +2,6 @@ #define JEMALLOC_INTERNAL_EXTERNS_H #include "jemalloc/internal/atomic.h" -#include "jemalloc/internal/size_classes.h" #include "jemalloc/internal/tsd_types.h" /* TSD checks this to set thread local slow state accordingly. */ diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/include/jemalloc/internal/jemalloc_internal_inlines_a.h index 6577a4f..8adc02a 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_a.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_a.h @@ -4,7 +4,7 @@ #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/bit_util.h" #include "jemalloc/internal/jemalloc_internal_types.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/ticker.h" JEMALLOC_ALWAYS_INLINE malloc_cpuid_t @@ -108,14 +108,14 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) { JEMALLOC_ALWAYS_INLINE cache_bin_t * tcache_small_bin_get(tcache_t *tcache, szind_t binind) { - assert(binind < NBINS); + assert(binind < SC_NBINS); return &tcache->bins_small[binind]; } JEMALLOC_ALWAYS_INLINE cache_bin_t * tcache_large_bin_get(tcache_t *tcache, szind_t binind) { - assert(binind >= NBINS &&binind < nhbins); - return &tcache->bins_large[binind - NBINS]; + assert(binind >= SC_NBINS &&binind < nhbins); + return &tcache->bins_large[binind - SC_NBINS]; } JEMALLOC_ALWAYS_INLINE bool diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 2b0d4f4..83ad10f 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -142,7 +142,7 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t usize, copysize; usize = sz_sa2u(size, alignment); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 || usize > sc_data_global.large_maxclass)) { return NULL; } p = ipalloct(tsdn, usize, alignment, zero, tcache, arena); diff --git a/include/jemalloc/internal/prof_inlines_a.h b/include/jemalloc/internal/prof_inlines_a.h index c39bc3d..07bfd9f 100644 --- a/include/jemalloc/internal/prof_inlines_a.h +++ b/include/jemalloc/internal/prof_inlines_a.h @@ -57,15 +57,15 @@ prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, #ifdef JEMALLOC_ATOMIC_U64 a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED); do { - a1 = (a0 >= LARGE_MINCLASS - usize) ? a0 - (LARGE_MINCLASS - - usize) : 0; + a1 = (a0 >= sc_data_global.large_minclass - usize) + ? 
a0 - (sc_data_global.large_minclass - usize) : 0; } while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0, a1, ATOMIC_RELAXED, ATOMIC_RELAXED)); #else malloc_mutex_lock(tsdn, &prof_accum->mtx); a0 = prof_accum->accumbytes; - a1 = (a0 >= LARGE_MINCLASS - usize) ? a0 - (LARGE_MINCLASS - usize) : - 0; + a1 = (a0 >= sc_data_global.large_minclass - usize) + ? a0 - (sc_data_global.large_minclass - usize) : 0; prof_accum->accumbytes = a1; malloc_mutex_unlock(tsdn, &prof_accum->mtx); #endif diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index dd452f1..8564965 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -4,7 +4,7 @@ #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/rtree_tsd.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/tsd.h" /* @@ -31,7 +31,7 @@ # error Unsupported number of significant virtual address bits #endif /* Use compact leaf representation if virtual address encoding allows. */ -#if RTREE_NHIB >= LG_CEIL_NSIZES +#if RTREE_NHIB >= LG_CEIL(SC_NSIZES) # define RTREE_LEAF_COMPACT #endif @@ -261,7 +261,7 @@ rtree_leaf_elm_extent_write(tsdn_t *tsdn, rtree_t *rtree, static inline void rtree_leaf_elm_szind_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, szind_t szind) { - assert(szind <= NSIZES); + assert(szind <= SC_NSIZES); #ifdef RTREE_LEAF_COMPACT uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, @@ -313,7 +313,7 @@ rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, static inline void rtree_leaf_elm_szind_slab_update(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, szind_t szind, bool slab) { - assert(!slab || szind < NBINS); + assert(!slab || szind < SC_NBINS); /* * The caller implicitly assures that it is the only writer to the szind @@ -429,7 +429,7 @@ rtree_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key, dependent); if (!dependent && elm == NULL) { - return NSIZES; + return SC_NSIZES; } return rtree_leaf_elm_szind_read(tsdn, rtree, elm, dependent); } @@ -474,7 +474,7 @@ rtree_szind_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, static inline void rtree_szind_slab_update(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key, szind_t szind, bool slab) { - assert(!slab || szind < NBINS); + assert(!slab || szind < SC_NBINS); rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key, true); rtree_leaf_elm_szind_slab_update(tsdn, rtree, elm, szind, slab); @@ -486,7 +486,7 @@ rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, rtree_leaf_elm_t *elm = rtree_read(tsdn, rtree, rtree_ctx, key, true); assert(rtree_leaf_elm_extent_read(tsdn, rtree, elm, false) != NULL); - rtree_leaf_elm_write(tsdn, rtree, elm, NULL, NSIZES, false); + rtree_leaf_elm_write(tsdn, rtree, elm, NULL, SC_NSIZES, false); } #endif /* JEMALLOC_INTERNAL_RTREE_H */ diff --git a/include/jemalloc/internal/sc.h b/include/jemalloc/internal/sc.h new file mode 100644 index 0000000..df295bc --- /dev/null +++ b/include/jemalloc/internal/sc.h @@ -0,0 +1,302 @@ +#ifndef JEMALLOC_INTERNAL_SC_H +#define JEMALLOC_INTERNAL_SC_H + +#include "jemalloc/internal/jemalloc_internal_types.h" + +/* + * Size class computations: + * + * These are a little tricky; we'll first start by describing how things + * generally work, and then describe some of the details. 
+ *
+ * Ignore the first few size classes for a moment. We can then split all the
+ * remaining size classes into groups. The size classes in a group are spaced
+ * such that they cover allocation request sizes in a power-of-2 range. The
+ * power of two is called the base of the group, and the size classes in it
+ * satisfy allocations in the half-open range (base, base * 2]. There are
+ * SC_NGROUP size classes in each group, equally spaced in the range, so that
+ * each one covers allocations for base / SC_NGROUP possible allocation sizes.
+ * We call that value (base / SC_NGROUP) the delta of the group. Each size
+ * class is delta larger than the one before it (including the initial size
+ * class in a group, which is delta larger than base, the largest size class
+ * in the previous group).
+ * To make the math all work out nicely, we require that SC_NGROUP is a power
+ * of two, and define it in terms of SC_LG_NGROUP. We'll often talk in terms
+ * of lg_base and lg_delta. For each of these groups then, we have that
+ * lg_delta == lg_base - SC_LG_NGROUP.
+ * The size classes in a group with a given lg_base and lg_delta (which,
+ * recall, can be computed from lg_base for these groups) are therefore:
+ *   base + 1 * delta
+ *     which covers allocations in (base, base + 1 * delta].
+ *   base + 2 * delta
+ *     which covers allocations in (base + 1 * delta, base + 2 * delta].
+ *   base + 3 * delta
+ *     which covers allocations in (base + 2 * delta, base + 3 * delta].
+ *   ...
+ *   base + SC_NGROUP * delta ( == 2 * base)
+ *     which covers allocations in (base + (SC_NGROUP - 1) * delta, 2 * base].
+ * (Note that currently SC_NGROUP is always 4, so the "..." is empty in
+ * practice.)
+ * Note that the last size class in the group is the next power of two (after
+ * base), so that we've set up the induction correctly for the next group's
+ * selection of delta.
+ *
+ * Now, let's start considering the first few size classes. Two extra
+ * constants come into play here: LG_QUANTUM and SC_LG_TINY_MIN. LG_QUANTUM
+ * ensures correct platform alignment; all objects of size (1 << LG_QUANTUM)
+ * or larger are at least (1 << LG_QUANTUM) aligned; this can be used to
+ * ensure that we never return improperly aligned memory, by making
+ * (1 << LG_QUANTUM) equal the highest required alignment of a platform. For
+ * allocation sizes smaller than (1 << LG_QUANTUM) though, we can be more
+ * relaxed (since we don't support platforms with types with alignment larger
+ * than their size). To allow such allocations (without wasting space
+ * unnecessarily), we introduce tiny size classes; one per power of two, up
+ * until we hit the quantum size. There are therefore
+ * LG_QUANTUM - SC_LG_TINY_MIN such size classes.
+ *
+ * Next, we have a size class of size (1 << LG_QUANTUM). This can't be the
+ * start of a group in the sense we described above (covering a power of two
+ * range) since, if we divided into it to pick a value of delta, we'd get a
+ * delta smaller than (1 << LG_QUANTUM) for sizes >= (1 << LG_QUANTUM), which
+ * is against the rules.
+ *
+ * The first base we can divide by SC_NGROUP while still being at least
+ * (1 << LG_QUANTUM) is SC_NGROUP * (1 << LG_QUANTUM). We can get there by
+ * having SC_NGROUP size classes, spaced (1 << LG_QUANTUM) apart. These size
+ * classes are:
+ *   1 * (1 << LG_QUANTUM)
+ *   2 * (1 << LG_QUANTUM)
+ *   3 * (1 << LG_QUANTUM)
+ *   ... (although, as above, this "..." is empty in practice)
+ *   SC_NGROUP * (1 << LG_QUANTUM).
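+ * (A worked example, assuming LG_QUANTUM == 4, i.e. a 16-byte quantum: with
+ * SC_LG_TINY_MIN == 3 there is a single tiny class of size 8, and the
+ * quantum-spaced classes above are 16, 32, 48, and 64.)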
+ *
+ * There are SC_NGROUP of these size classes, so we can regard it as a sort of
+ * pseudo-group, even though it spans multiple powers of 2, is divided
+ * differently, and both starts and ends on a power of 2 (as opposed to just
+ * ending). SC_NGROUP is itself a power of two, so the first group after the
+ * pseudo-group has the power-of-two base SC_NGROUP * (1 << LG_QUANTUM), for a
+ * lg_base of LG_QUANTUM + SC_LG_NGROUP. We can divide this base into
+ * SC_NGROUP sizes without violating our LG_QUANTUM requirements, so we can
+ * safely set lg_delta = lg_base - SC_LG_NGROUP (== LG_QUANTUM).
+ *
+ * So, in order, the size classes are:
+ *
+ * Tiny size classes:
+ * - Count: LG_QUANTUM - SC_LG_TINY_MIN.
+ * - Sizes:
+ *     1 << SC_LG_TINY_MIN
+ *     1 << (SC_LG_TINY_MIN + 1)
+ *     1 << (SC_LG_TINY_MIN + 2)
+ *     ...
+ *     1 << (LG_QUANTUM - 1)
+ *
+ * Initial pseudo-group:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ *     1 * (1 << LG_QUANTUM)
+ *     2 * (1 << LG_QUANTUM)
+ *     3 * (1 << LG_QUANTUM)
+ *     ...
+ *     SC_NGROUP * (1 << LG_QUANTUM)
+ *
+ * Regular group 0:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ *   (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP and lg_delta of
+ *   lg_base - SC_LG_NGROUP)
+ *     (1 << lg_base) + 1 * (1 << lg_delta)
+ *     (1 << lg_base) + 2 * (1 << lg_delta)
+ *     (1 << lg_base) + 3 * (1 << lg_delta)
+ *     ...
+ *     (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ * Regular group 1:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ *   (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + 1 and lg_delta of
+ *   lg_base - SC_LG_NGROUP)
+ *     (1 << lg_base) + 1 * (1 << lg_delta)
+ *     (1 << lg_base) + 2 * (1 << lg_delta)
+ *     (1 << lg_base) + 3 * (1 << lg_delta)
+ *     ...
+ *     (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ * ...
+ *
+ * Regular group N:
+ * - Count: SC_NGROUP
+ * - Sizes:
+ *   (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + N and lg_delta of
+ *   lg_base - SC_LG_NGROUP)
+ *     (1 << lg_base) + 1 * (1 << lg_delta)
+ *     (1 << lg_base) + 2 * (1 << lg_delta)
+ *     (1 << lg_base) + 3 * (1 << lg_delta)
+ *     ...
+ *     (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ]
+ *
+ *
+ * Representation of metadata:
+ * To make the math easy, we'll mostly work in lg quantities. We record
+ * lg_base, lg_delta, and ndelta (i.e. number of deltas above the base) on a
+ * per-size-class basis, and maintain the invariant that, across all size
+ * classes, size == (1 << lg_base) + ndelta * (1 << lg_delta).
+ *
+ * For regular groups (i.e. those with lg_base >= LG_QUANTUM + SC_LG_NGROUP),
+ * lg_delta is lg_base - SC_LG_NGROUP, and ndelta goes from 1 to SC_NGROUP.
+ *
+ * For the initial tiny size classes (if any), lg_base is lg(size class size).
+ * lg_delta is lg_base for the first size class, and lg_base - 1 for all
+ * subsequent ones. ndelta is always 0.
+ *
+ * For the pseudo-group, if there are no tiny size classes, then we set
+ * lg_base == LG_QUANTUM, lg_delta == LG_QUANTUM, and have ndelta range from 0
+ * to SC_NGROUP - 1. (Note that delta == base, so base + (SC_NGROUP - 1) * delta
+ * is just SC_NGROUP * base, or (1 << (SC_LG_NGROUP + LG_QUANTUM)), so we do
+ * indeed get a power of two that way). If there *are* tiny size classes, then
+ * the first size class needs to have lg_delta relative to the largest tiny
+ * size class. We therefore set lg_base == LG_QUANTUM - 1,
+ * lg_delta == LG_QUANTUM - 1, and ndelta == 1, keeping the rest of the
+ * pseudo-group the same.
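+ * (A concrete instance of the invariant: with SC_LG_NGROUP == 2, the regular
+ * size class of 160 bytes is recorded as lg_base == 7, lg_delta == 5,
+ * ndelta == 1, and indeed (1 << 7) + 1 * (1 << 5) == 160.)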
+ * + * + * Other terminology: + * "Small" size classes mean those that are allocated out of bins, which is the + * same as those that are slab allocated. + * "Large" size classes are those that are not small. The cutoff for counting as + * large is page size * group size. + */ + +/* + * Size class N + (1 << SC_LG_NGROUP) twice the size of size class N. + */ +#define SC_LG_NGROUP 2 +#define SC_LG_TINY_MIN 3 + +#if SC_LG_TINY_MIN == 0 +/* The div module doesn't support division by 1, which this would require. */ +#error "Unsupported LG_TINY_MIN" +#endif + +/* + * The definitions below are all determined by the above settings and system + * characteristics. + */ +#define SC_NGROUP (1ULL << SC_LG_NGROUP) +#define SC_PTR_BITS ((1ULL << LG_SIZEOF_PTR) * 8) +#define SC_NTINY (LG_QUANTUM - SC_LG_TINY_MIN) +#define SC_NPSEUDO SC_NGROUP +#define SC_LG_FIRST_REGULAR_BASE (LG_QUANTUM + SC_LG_NGROUP) +/* + * We cap allocations to be less than 2 ** (ptr_bits - 1), so the highest base + * we need is 2 ** (ptr_bits - 2). (This also means that the last group is 1 + * size class shorter than the others). + * We could probably save some space in arenas by capping this at LG_VADDR size. + */ +#define SC_LG_BASE_MAX (SC_PTR_BITS - 2) +#define SC_NREGULAR (SC_NGROUP * \ + (SC_LG_BASE_MAX - SC_LG_FIRST_REGULAR_BASE + 1) - 1) +#define SC_NSIZES (SC_NTINY + SC_NPSEUDO + SC_NREGULAR) + +/* + * The number of size classes that are at least a page in size. Note that + * because delta may be smaller than a page, this is not the same as the number + * of size classes that are *multiples* of the page size. + */ +#define SC_NPSIZES_MAX ( \ + /* Start with all the size classes. */ \ + SC_NSIZES \ + /* Subtract out those groups with too small a base. */ \ + - (LG_PAGE - 1 - SC_LG_FIRST_REGULAR_BASE) * SC_NGROUP \ + /* And the pseudo-group. */ \ + - SC_NPSEUDO \ + /* And the tiny group. */ \ + - SC_NTINY \ + /* \ + * In the lg_base == lg_page - 1 group, only the last sc is big \ + * enough to make it to lg_page. \ + */ \ + - (SC_NGROUP - 1)) + +/* + * We declare a size class is binnable if size < page size * group. Or, in other + * words, lg(size) < lg(page size) + lg(group size). + */ +#define SC_NBINS ( \ + /* Sub-regular size classes. */ \ + SC_NTINY + SC_NPSEUDO \ + /* Groups with lg_regular_min_base <= lg_base <= lg_base_max */ \ + + SC_NGROUP * (LG_PAGE + SC_LG_NGROUP - SC_LG_FIRST_REGULAR_BASE) \ + /* Last SC of the last group hits the bound exactly; exclude it. */ \ + - 1) + +/* + * The size2index_tab lookup table uses uint8_t to encode each bin index, so we + * cannot support more than 256 small size classes. + */ +#if (SC_NBINS > 256) +# error "Too many small size classes" +#endif + +/* The largest size class in the lookup table. */ +#define SC_LOOKUP_MAXCLASS ((size_t)1 << 12) + +typedef struct sc_s sc_t; +struct sc_s { + /* Size class index, or -1 if not a valid size class. */ + int index; + /* Lg group base size (no deltas added). */ + int lg_base; + /* Lg delta to previous size class. */ + int lg_delta; + /* Delta multiplier. size == 1< LARGE_MAXCLASS)) { - return NPSIZES; + if (unlikely(psz > sc_data_global.large_maxclass)) { + return sc_data_global.npsizes; } - { - pszind_t x = lg_floor((psz<<1)-1); - pszind_t shift = (x < LG_SIZE_CLASS_GROUP + LG_PAGE) ? 0 : x - - (LG_SIZE_CLASS_GROUP + LG_PAGE); - pszind_t grp = shift << LG_SIZE_CLASS_GROUP; + pszind_t x = lg_floor((psz<<1)-1); + pszind_t shift = (x < SC_LG_NGROUP + LG_PAGE) ? 
+ 0 : x - (SC_LG_NGROUP + LG_PAGE); + pszind_t grp = shift << SC_LG_NGROUP; - pszind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ? - LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1; + pszind_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ? + LG_PAGE : x - SC_LG_NGROUP - 1; - size_t delta_inverse_mask = ZU(-1) << lg_delta; - pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) & - ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); + size_t delta_inverse_mask = ZU(-1) << lg_delta; + pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) & + ((ZU(1) << SC_LG_NGROUP) - 1); - pszind_t ind = grp + mod; - return ind; - } + pszind_t ind = grp + mod; + return ind; } static inline size_t sz_pind2sz_compute(pszind_t pind) { - if (unlikely(pind == NPSIZES)) { - return LARGE_MAXCLASS + PAGE; + if (unlikely(pind == sc_data_global.npsizes)) { + return sc_data_global.large_maxclass + PAGE; } - { - size_t grp = pind >> LG_SIZE_CLASS_GROUP; - size_t mod = pind & ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); + size_t grp = pind >> SC_LG_NGROUP; + size_t mod = pind & ((ZU(1) << SC_LG_NGROUP) - 1); - size_t grp_size_mask = ~((!!grp)-1); - size_t grp_size = ((ZU(1) << (LG_PAGE + - (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask; + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_PAGE + (SC_LG_NGROUP-1))) << grp) + & grp_size_mask; - size_t shift = (grp == 0) ? 1 : grp; - size_t lg_delta = shift + (LG_PAGE-1); - size_t mod_size = (mod+1) << lg_delta; + size_t shift = (grp == 0) ? 1 : grp; + size_t lg_delta = shift + (LG_PAGE-1); + size_t mod_size = (mod+1) << lg_delta; - size_t sz = grp_size + mod_size; - return sz; - } + size_t sz = grp_size + mod_size; + return sz; } static inline size_t @@ -101,70 +99,67 @@ sz_pind2sz_lookup(pszind_t pind) { static inline size_t sz_pind2sz(pszind_t pind) { - assert(pind < NPSIZES+1); + assert(pind < sc_data_global.npsizes + 1); return sz_pind2sz_lookup(pind); } static inline size_t sz_psz2u(size_t psz) { - if (unlikely(psz > LARGE_MAXCLASS)) { - return LARGE_MAXCLASS + PAGE; - } - { - size_t x = lg_floor((psz<<1)-1); - size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ? - LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1; - size_t delta = ZU(1) << lg_delta; - size_t delta_mask = delta - 1; - size_t usize = (psz + delta_mask) & ~delta_mask; - return usize; + if (unlikely(psz > sc_data_global.large_maxclass)) { + return sc_data_global.large_maxclass + PAGE; } + size_t x = lg_floor((psz<<1)-1); + size_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ? + LG_PAGE : x - SC_LG_NGROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (psz + delta_mask) & ~delta_mask; + return usize; } static inline szind_t sz_size2index_compute(size_t size) { - if (unlikely(size > LARGE_MAXCLASS)) { - return NSIZES; + if (unlikely(size > sc_data_global.large_maxclass)) { + return SC_NSIZES; } -#if (NTBINS != 0) - if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { - szind_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; +#if (SC_NTINY != 0) + if (size <= (ZU(1) << sc_data_global.lg_tiny_maxclass)) { + szind_t lg_tmin = sc_data_global.lg_tiny_maxclass + - sc_data_global.ntiny + 1; szind_t lg_ceil = lg_floor(pow2_ceil_zu(size)); return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin); } #endif { szind_t x = lg_floor((size<<1)-1); - szind_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 0 : - x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM); - szind_t grp = shift << LG_SIZE_CLASS_GROUP; + szind_t shift = (x < SC_LG_NGROUP + LG_QUANTUM) ? 
0 : + x - (SC_LG_NGROUP + LG_QUANTUM); + szind_t grp = shift << SC_LG_NGROUP; - szind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) - ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; + szind_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - SC_LG_NGROUP - 1; size_t delta_inverse_mask = ZU(-1) << lg_delta; szind_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & - ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); + ((ZU(1) << SC_LG_NGROUP) - 1); - szind_t index = NTBINS + grp + mod; + szind_t index = sc_data_global.ntiny + grp + mod; return index; } } JEMALLOC_ALWAYS_INLINE szind_t sz_size2index_lookup(size_t size) { - assert(size <= LOOKUP_MAXCLASS); - { - szind_t ret = (sz_size2index_tab[(size-1) >> LG_TINY_MIN]); - assert(ret == sz_size2index_compute(size)); - return ret; - } + assert(size <= SC_LOOKUP_MAXCLASS); + szind_t ret = (sz_size2index_tab[(size-1) >> SC_LG_TINY_MIN]); + assert(ret == sz_size2index_compute(size)); + return ret; } JEMALLOC_ALWAYS_INLINE szind_t sz_size2index(size_t size) { assert(size > 0); - if (likely(size <= LOOKUP_MAXCLASS)) { + if (likely(size <= SC_LOOKUP_MAXCLASS)) { return sz_size2index_lookup(size); } return sz_size2index_compute(size); @@ -172,20 +167,21 @@ sz_size2index(size_t size) { static inline size_t sz_index2size_compute(szind_t index) { -#if (NTBINS > 0) - if (index < NTBINS) { - return (ZU(1) << (LG_TINY_MAXCLASS - NTBINS + 1 + index)); +#if (SC_NTINY > 0) + if (index < sc_data_global.ntiny) { + return (ZU(1) << (sc_data_global.lg_tiny_maxclass + - sc_data_global.ntiny + 1 + index)); } #endif { - size_t reduced_index = index - NTBINS; - size_t grp = reduced_index >> LG_SIZE_CLASS_GROUP; - size_t mod = reduced_index & ((ZU(1) << LG_SIZE_CLASS_GROUP) - + size_t reduced_index = index - sc_data_global.ntiny; + size_t grp = reduced_index >> SC_LG_NGROUP; + size_t mod = reduced_index & ((ZU(1) << SC_LG_NGROUP) - 1); size_t grp_size_mask = ~((!!grp)-1); size_t grp_size = ((ZU(1) << (LG_QUANTUM + - (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask; + (SC_LG_NGROUP-1))) << grp) & grp_size_mask; size_t shift = (grp == 0) ? 1 : grp; size_t lg_delta = shift + (LG_QUANTUM-1); @@ -205,18 +201,19 @@ sz_index2size_lookup(szind_t index) { JEMALLOC_ALWAYS_INLINE size_t sz_index2size(szind_t index) { - assert(index < NSIZES); + assert(index < SC_NSIZES); return sz_index2size_lookup(index); } JEMALLOC_ALWAYS_INLINE size_t sz_s2u_compute(size_t size) { - if (unlikely(size > LARGE_MAXCLASS)) { + if (unlikely(size > sc_data_global.large_maxclass)) { return 0; } -#if (NTBINS > 0) - if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { - size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; +#if (SC_NTINY > 0) + if (size <= (ZU(1) << sc_data_global.lg_tiny_maxclass)) { + size_t lg_tmin = sc_data_global.lg_tiny_maxclass + - sc_data_global.ntiny + 1; size_t lg_ceil = lg_floor(pow2_ceil_zu(size)); return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : (ZU(1) << lg_ceil)); @@ -224,8 +221,8 @@ sz_s2u_compute(size_t size) { #endif { size_t x = lg_floor((size<<1)-1); - size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) - ? LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; + size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) + ? 
LG_QUANTUM : x - SC_LG_NGROUP - 1; size_t delta = ZU(1) << lg_delta; size_t delta_mask = delta - 1; size_t usize = (size + delta_mask) & ~delta_mask; @@ -248,7 +245,7 @@ sz_s2u_lookup(size_t size) { JEMALLOC_ALWAYS_INLINE size_t sz_s2u(size_t size) { assert(size > 0); - if (likely(size <= LOOKUP_MAXCLASS)) { + if (likely(size <= SC_LOOKUP_MAXCLASS)) { return sz_s2u_lookup(size); } return sz_s2u_compute(size); @@ -265,7 +262,7 @@ sz_sa2u(size_t size, size_t alignment) { assert(alignment != 0 && ((alignment - 1) & alignment) == 0); /* Try for a small size class. */ - if (size <= SMALL_MAXCLASS && alignment < PAGE) { + if (size <= sc_data_global.small_maxclass && alignment < PAGE) { /* * Round size up to the nearest multiple of alignment. * @@ -281,20 +278,20 @@ sz_sa2u(size_t size, size_t alignment) { * 192 | 11000000 | 64 */ usize = sz_s2u(ALIGNMENT_CEILING(size, alignment)); - if (usize < LARGE_MINCLASS) { + if (usize < sc_data_global.large_minclass) { return usize; } } /* Large size class. Beware of overflow. */ - if (unlikely(alignment > LARGE_MAXCLASS)) { + if (unlikely(alignment > sc_data_global.large_maxclass)) { return 0; } /* Make sure result is a large size class. */ - if (size <= LARGE_MINCLASS) { - usize = LARGE_MINCLASS; + if (size <= sc_data_global.large_minclass) { + usize = sc_data_global.large_minclass; } else { usize = sz_s2u(size); if (usize < size) { diff --git a/include/jemalloc/internal/tcache_externs.h b/include/jemalloc/internal/tcache_externs.h index 790367b..d63eafd 100644 --- a/include/jemalloc/internal/tcache_externs.h +++ b/include/jemalloc/internal/tcache_externs.h @@ -1,15 +1,13 @@ #ifndef JEMALLOC_INTERNAL_TCACHE_EXTERNS_H #define JEMALLOC_INTERNAL_TCACHE_EXTERNS_H -#include "jemalloc/internal/size_classes.h" - extern bool opt_tcache; extern ssize_t opt_lg_tcache_max; extern cache_bin_info_t *tcache_bin_info; /* - * Number of tcache bins. There are NBINS small-object bins, plus 0 or more + * Number of tcache bins. There are SC_NBINS small-object bins, plus 0 or more * large-object bins. 
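 * (There are nhbins - SC_NBINS of the latter; nhbins itself is derived from
 * opt_lg_tcache_max when the tcache subsystem boots.)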
*/ extern unsigned nhbins; diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h index c426c56..b060043 100644 --- a/include/jemalloc/internal/tcache_inlines.h +++ b/include/jemalloc/internal/tcache_inlines.h @@ -3,7 +3,7 @@ #include "jemalloc/internal/bin.h" #include "jemalloc/internal/jemalloc_internal_types.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" #include "jemalloc/internal/util.h" @@ -46,7 +46,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, bool tcache_success; size_t usize JEMALLOC_CC_SILENCE_INIT(0); - assert(binind < NBINS); + assert(binind < SC_NBINS); bin = tcache_small_bin_get(tcache, binind); ret = cache_bin_alloc_easy(bin, &tcache_success); assert(tcache_success == (ret != NULL)); @@ -107,7 +107,7 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, cache_bin_t *bin; bool tcache_success; - assert(binind >= NBINS &&binind < nhbins); + assert(binind >= SC_NBINS &&binind < nhbins); bin = tcache_large_bin_get(tcache, binind); ret = cache_bin_alloc_easy(bin, &tcache_success); assert(tcache_success == (ret != NULL)); @@ -166,7 +166,8 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, cache_bin_t *bin; cache_bin_info_t *bin_info; - assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SMALL_MAXCLASS); + assert(tcache_salloc(tsd_tsdn(tsd), ptr) + <= sc_data_global.small_maxclass); if (slow_path && config_fill && unlikely(opt_junk_free)) { arena_dalloc_junk_small(ptr, &bin_infos[binind]); @@ -191,7 +192,8 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, cache_bin_t *bin; cache_bin_info_t *bin_info; - assert(tcache_salloc(tsd_tsdn(tsd), ptr) > SMALL_MAXCLASS); + assert(tcache_salloc(tsd_tsdn(tsd), ptr) + > sc_data_global.small_maxclass); assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= tcache_maxclass); if (slow_path && config_fill && unlikely(opt_junk_free)) { diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h index b3cd4e5..2708703 100644 --- a/include/jemalloc/internal/tcache_structs.h +++ b/include/jemalloc/internal/tcache_structs.h @@ -1,9 +1,9 @@ #ifndef JEMALLOC_INTERNAL_TCACHE_STRUCTS_H #define JEMALLOC_INTERNAL_TCACHE_STRUCTS_H -#include "jemalloc/internal/ql.h" -#include "jemalloc/internal/size_classes.h" #include "jemalloc/internal/cache_bin.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/ticker.h" /* Various uses of this struct need it to be a named type. */ @@ -25,7 +25,7 @@ struct tcache_s { * During tcache initialization, the avail pointer in each element of * tbins is initialized to point to the proper offset within this array. */ - cache_bin_t bins_small[NBINS]; + cache_bin_t bins_small[SC_NBINS]; /* * This data is less hot; we can be a little less careful with our @@ -50,13 +50,13 @@ struct tcache_s { /* Next bin to GC. */ szind_t next_gc_bin; /* For small bins, fill (ncached_max >> lg_fill_div). */ - uint8_t lg_fill_div[NBINS]; + uint8_t lg_fill_div[SC_NBINS]; /* * We put the cache bins for large size classes at the end of the * struct, since some of them might not get used. This might end up * letting us avoid touching an extra page if we don't have to. */ - cache_bin_t bins_large[NSIZES-NBINS]; + cache_bin_t bins_large[SC_NSIZES-SC_NBINS]; }; /* Linkage for list of available (previously used) explicit tcache IDs. 
*/ diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h index e49bc9d..f953b8c 100644 --- a/include/jemalloc/internal/tcache_types.h +++ b/include/jemalloc/internal/tcache_types.h @@ -1,7 +1,7 @@ #ifndef JEMALLOC_INTERNAL_TCACHE_TYPES_H #define JEMALLOC_INTERNAL_TCACHE_TYPES_H -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" typedef struct tcache_s tcache_t; typedef struct tcaches_s tcaches_t; @@ -45,7 +45,7 @@ typedef struct tcaches_s tcaches_t; /* Number of tcache allocation/deallocation events between incremental GCs. */ #define TCACHE_GC_INCR \ - ((TCACHE_GC_SWEEP / NBINS) + ((TCACHE_GC_SWEEP / NBINS == 0) ? 0 : 1)) + ((TCACHE_GC_SWEEP / SC_NBINS) + ((TCACHE_GC_SWEEP / SC_NBINS == 0) ? 0 : 1)) /* Used in TSD static initializer only. Real init in tcache_data_init(). */ #define TCACHE_ZERO_INITIALIZER {0} diff --git a/src/arena.c b/src/arena.c index eefea0d..07d9103 100644 --- a/src/arena.c +++ b/src/arena.c @@ -8,7 +8,6 @@ #include "jemalloc/internal/extent_mmap.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/rtree.h" -#include "jemalloc/internal/size_classes.h" #include "jemalloc/internal/util.h" JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS @@ -42,7 +41,7 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { #undef STEP }; -static div_info_t arena_binind_div_info[NBINS]; +static div_info_t arena_binind_div_info[SC_NBINS]; size_t opt_huge_threshold = HUGE_THRESHOLD_DEFAULT; size_t huge_threshold = HUGE_THRESHOLD_DEFAULT; @@ -128,7 +127,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, extents_npages_get(&arena->extents_dirty) + extents_npages_get(&arena->extents_muzzy)) << LG_PAGE))); - for (szind_t i = 0; i < NSIZES - NBINS; i++) { + for (szind_t i = 0; i < SC_NSIZES - SC_NBINS; i++) { uint64_t nmalloc = arena_stats_read_u64(tsdn, &arena->stats, &arena->stats.lstats[i].nmalloc); arena_stats_accum_u64(&lstats[i].nmalloc, nmalloc); @@ -151,7 +150,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, size_t curlextents = (size_t)(nmalloc - ndalloc); lstats[i].curlextents += curlextents; arena_stats_accum_zu(&astats->allocated_large, - curlextents * sz_index2size(NBINS + i)); + curlextents * sz_index2size(SC_NBINS + i)); } arena_stats_unlock(tsdn, &arena->stats); @@ -162,7 +161,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, cache_bin_array_descriptor_t *descriptor; ql_foreach(descriptor, &arena->cache_bin_array_descriptor_ql, link) { szind_t i = 0; - for (; i < NBINS; i++) { + for (; i < SC_NBINS; i++) { cache_bin_t *tbin = &descriptor->bins_small[i]; arena_stats_accum_zu(&astats->tcache_bytes, tbin->ncached * sz_index2size(i)); @@ -206,7 +205,7 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, nstime_update(&astats->uptime); nstime_subtract(&astats->uptime, &arena->create_time); - for (szind_t i = 0; i < NBINS; i++) { + for (szind_t i = 0; i < SC_NBINS; i++) { bin_stats_merge(tsdn, &bstats[i], &arena->bins[i]); } } @@ -297,11 +296,11 @@ arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { cassert(config_stats); - if (usize < LARGE_MINCLASS) { - usize = LARGE_MINCLASS; + if (usize < sc_data_global.large_minclass) { + usize = sc_data_global.large_minclass; } index = sz_size2index(usize); - hindex = (index >= NBINS) ? index - NBINS : 0; + hindex = (index >= SC_NBINS) ? 
index - SC_NBINS : 0; arena_stats_add_u64(tsdn, &arena->stats, &arena->stats.lstats[hindex].nmalloc, 1); @@ -313,11 +312,11 @@ arena_large_dalloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { cassert(config_stats); - if (usize < LARGE_MINCLASS) { - usize = LARGE_MINCLASS; + if (usize < sc_data_global.large_minclass) { + usize = sc_data_global.large_minclass; } index = sz_size2index(usize); - hindex = (index >= NBINS) ? index - NBINS : 0; + hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0; arena_stats_add_u64(tsdn, &arena->stats, &arena->stats.lstats[hindex].ndalloc, 1); @@ -994,7 +993,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) { rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); - assert(alloc_ctx.szind != NSIZES); + assert(alloc_ctx.szind != SC_NSIZES); if (config_stats || (config_prof && opt_prof)) { usize = sz_index2size(alloc_ctx.szind); @@ -1010,7 +1009,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) { malloc_mutex_unlock(tsd_tsdn(tsd), &arena->large_mtx); /* Bins. */ - for (unsigned i = 0; i < NBINS; i++) { + for (unsigned i = 0; i < SC_NBINS; i++) { extent_t *slab; bin_t *bin = &arena->bins[i]; malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); @@ -1331,7 +1330,7 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) { size_t usize; extent_t *slab; - assert(binind < NBINS); + assert(binind < SC_NBINS); bin = &arena->bins[binind]; usize = sz_index2size(binind); @@ -1390,7 +1389,7 @@ arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, return NULL; } - if (likely(size <= SMALL_MAXCLASS)) { + if (likely(size <= sc_data_global.small_maxclass)) { return arena_malloc_small(tsdn, arena, ind, zero); } return large_malloc(tsdn, arena, sz_index2size(ind), zero); @@ -1401,8 +1400,9 @@ arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero, tcache_t *tcache) { void *ret; - if (usize <= SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE - && (usize & PAGE_MASK) == 0))) { + if (usize <= sc_data_global.small_maxclass + && (alignment < PAGE + || (alignment == PAGE && (usize & PAGE_MASK) == 0))) { /* Small; alignment doesn't require special slab placement. 
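 * (Slabs are page-aligned and regions sit at multiples of their size from the
 * slab base, so a size class that is a page multiple yields page-aligned
 * regions for free.)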
*/ ret = arena_malloc(tsdn, arena, usize, sz_size2index(usize), zero, tcache, true); @@ -1420,8 +1420,8 @@ void arena_prof_promote(tsdn_t *tsdn, const void *ptr, size_t usize) { cassert(config_prof); assert(ptr != NULL); - assert(isalloc(tsdn, ptr) == LARGE_MINCLASS); - assert(usize <= SMALL_MAXCLASS); + assert(isalloc(tsdn, ptr) == sc_data_global.large_minclass); + assert(usize <= sc_data_global.small_maxclass); rtree_ctx_t rtree_ctx_fallback; rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); @@ -1445,15 +1445,15 @@ arena_prof_demote(tsdn_t *tsdn, extent_t *extent, const void *ptr) { cassert(config_prof); assert(ptr != NULL); - extent_szind_set(extent, NBINS); + extent_szind_set(extent, SC_NBINS); rtree_ctx_t rtree_ctx_fallback; rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr, - NBINS, false); + SC_NBINS, false); - assert(isalloc(tsdn, ptr) == LARGE_MINCLASS); + assert(isalloc(tsdn, ptr) == sc_data_global.large_minclass); - return LARGE_MINCLASS; + return sc_data_global.large_minclass; } void @@ -1594,33 +1594,35 @@ arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra, bool zero, size_t *newsize) { bool ret; /* Calls with non-zero extra had to clamp extra. */ - assert(extra == 0 || size + extra <= LARGE_MAXCLASS); + assert(extra == 0 || size + extra <= sc_data_global.large_maxclass); extent_t *extent = iealloc(tsdn, ptr); - if (unlikely(size > LARGE_MAXCLASS)) { + if (unlikely(size > sc_data_global.large_maxclass)) { ret = true; goto done; } size_t usize_min = sz_s2u(size); size_t usize_max = sz_s2u(size + extra); - if (likely(oldsize <= SMALL_MAXCLASS && usize_min <= SMALL_MAXCLASS)) { + if (likely(oldsize <= sc_data_global.small_maxclass && usize_min + <= sc_data_global.small_maxclass)) { /* * Avoid moving the allocation if the size class can be left the * same. */ assert(bin_infos[sz_size2index(oldsize)].reg_size == oldsize); - if ((usize_max > SMALL_MAXCLASS || sz_size2index(usize_max) != - sz_size2index(oldsize)) && (size > oldsize || usize_max < - oldsize)) { + if ((usize_max > sc_data_global.small_maxclass + || sz_size2index(usize_max) != sz_size2index(oldsize)) + && (size > oldsize || usize_max < oldsize)) { ret = true; goto done; } arena_decay_tick(tsdn, extent_arena_get(extent)); ret = false; - } else if (oldsize >= LARGE_MINCLASS && usize_max >= LARGE_MINCLASS) { + } else if (oldsize >= sc_data_global.large_minclass + && usize_max >= sc_data_global.large_minclass) { ret = large_ralloc_no_move(tsdn, extent, usize_min, usize_max, zero); } else { @@ -1641,7 +1643,7 @@ arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize, zero, tcache, true); } usize = sz_sa2u(usize, alignment); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 || usize > sc_data_global.large_maxclass)) { return NULL; } return ipalloct(tsdn, usize, alignment, zero, tcache, arena); @@ -1652,11 +1654,11 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t alignment, bool zero, tcache_t *tcache, hook_ralloc_args_t *hook_args) { size_t usize = sz_s2u(size); - if (unlikely(usize == 0 || size > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 || size > sc_data_global.large_maxclass)) { return NULL; } - if (likely(usize <= SMALL_MAXCLASS)) { + if (likely(usize <= sc_data_global.small_maxclass)) { /* Try to avoid moving the allocation. 
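 * (arena_ralloc_no_move, above, keeps the allocation in place when the old
 * and new requests map to the same size class.)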
*/ UNUSED size_t newsize; if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero, @@ -1669,7 +1671,8 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, } } - if (oldsize >= LARGE_MINCLASS && usize >= LARGE_MINCLASS) { + if (oldsize >= sc_data_global.large_minclass + && usize >= sc_data_global.large_minclass) { return large_ralloc(tsdn, arena, ptr, usize, alignment, zero, tcache, hook_args); } @@ -1751,8 +1754,8 @@ arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, size_t *old_limit, if (new_limit != NULL) { size_t limit = *new_limit; /* Grow no more than the new limit. */ - if ((new_ind = sz_psz2ind(limit + 1) - 1) > - EXTENT_GROW_MAX_PIND) { + if ((new_ind = sz_psz2ind(limit + 1) - 1) + >= sc_data_global.npsizes) { return true; } } @@ -1896,7 +1899,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } arena->extent_grow_next = sz_psz2ind(HUGEPAGE); - arena->retain_grow_limit = EXTENT_GROW_MAX_PIND; + arena->retain_grow_limit = sc_data_global.npsizes - 1; if (malloc_mutex_init(&arena->extent_grow_mtx, "extent_grow", WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) { goto label_error; @@ -1909,7 +1912,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } /* Initialize bins. */ - for (i = 0; i < NBINS; i++) { + for (i = 0; i < SC_NBINS; i++) { bool err = bin_init(&arena->bins[i]); if (err) { goto label_error; @@ -1982,10 +1985,10 @@ arena_init_huge(void) { bool huge_enabled; /* The threshold should be large size class. */ - if (opt_huge_threshold > LARGE_MAXCLASS || - opt_huge_threshold < LARGE_MINCLASS) { + if (opt_huge_threshold > sc_data_global.large_maxclass || + opt_huge_threshold < sc_data_global.large_minclass) { opt_huge_threshold = 0; - huge_threshold = LARGE_MAXCLASS + PAGE; + huge_threshold = sc_data_global.large_maxclass + PAGE; huge_enabled = false; } else { /* Reserve the index for the huge arena. */ @@ -2001,16 +2004,11 @@ void arena_boot(void) { arena_dirty_decay_ms_default_set(opt_dirty_decay_ms); arena_muzzy_decay_ms_default_set(opt_muzzy_decay_ms); -#define REGIND_bin_yes(index, reg_size) \ - div_init(&arena_binind_div_info[(index)], (reg_size)); -#define REGIND_bin_no(index, reg_size) -#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \ - lg_delta_lookup) \ - REGIND_bin_##bin(index, (1U<lg_base) + (sc->ndelta << sc->lg_delta)); + } } void @@ -2055,7 +2053,7 @@ arena_prefork6(tsdn_t *tsdn, arena_t *arena) { void arena_prefork7(tsdn_t *tsdn, arena_t *arena) { - for (unsigned i = 0; i < NBINS; i++) { + for (unsigned i = 0; i < SC_NBINS; i++) { bin_prefork(tsdn, &arena->bins[i]); } } @@ -2064,7 +2062,7 @@ void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) { unsigned i; - for (i = 0; i < NBINS; i++) { + for (i = 0; i < SC_NBINS; i++) { bin_postfork_parent(tsdn, &arena->bins[i]); } malloc_mutex_postfork_parent(tsdn, &arena->large_mtx); @@ -2108,7 +2106,7 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) { } } - for (i = 0; i < NBINS; i++) { + for (i = 0; i < SC_NBINS; i++) { bin_postfork_child(tsdn, &arena->bins[i]); } malloc_mutex_postfork_child(tsdn, &arena->large_mtx); diff --git a/src/base.c b/src/base.c index b0324b5..cabf66c 100644 --- a/src/base.c +++ b/src/base.c @@ -262,8 +262,8 @@ base_block_alloc(tsdn_t *tsdn, base_t *base, extent_hooks_t *extent_hooks, */ size_t min_block_size = HUGEPAGE_CEILING(sz_psz2u(header_size + gap_size + usize)); - pszind_t pind_next = (*pind_last + 1 < NPSIZES) ? 
*pind_last + 1 : - *pind_last; + pszind_t pind_next = (*pind_last + 1 < sc_data_global.npsizes) ? + *pind_last + 1 : *pind_last; size_t next_block_size = HUGEPAGE_CEILING(sz_pind2sz(pind_next)); size_t block_size = (min_block_size > next_block_size) ? min_block_size : next_block_size; @@ -372,7 +372,7 @@ base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { base->extent_sn_next = extent_sn_next; base->blocks = block; base->auto_thp_switched = false; - for (szind_t i = 0; i < NSIZES; i++) { + for (szind_t i = 0; i < SC_NSIZES; i++) { extent_heap_new(&base->avail[i]); } if (config_stats) { @@ -426,7 +426,7 @@ base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment, extent_t *extent = NULL; malloc_mutex_lock(tsdn, &base->mtx); - for (szind_t i = sz_size2index(asize); i < NSIZES; i++) { + for (szind_t i = sz_size2index(asize); i < SC_NSIZES; i++) { extent = extent_heap_remove_first(&base->avail[i]); if (extent != NULL) { /* Use existing space. */ diff --git a/src/bin.c b/src/bin.c index 0886bc4..e62babd 100644 --- a/src/bin.c +++ b/src/bin.c @@ -1,23 +1,34 @@ #include "jemalloc/internal/jemalloc_preamble.h" #include "jemalloc/internal/jemalloc_internal_includes.h" +#include "jemalloc/internal/assert.h" #include "jemalloc/internal/bin.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/witness.h" -const bin_info_t bin_infos[NBINS] = { -#define BIN_INFO_bin_yes(reg_size, slab_size, nregs) \ - {reg_size, slab_size, nregs, BITMAP_INFO_INITIALIZER(nregs)}, -#define BIN_INFO_bin_no(reg_size, slab_size, nregs) -#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, \ - lg_delta_lookup) \ - BIN_INFO_bin_##bin((1U<sc[i]; + bin_info->reg_size = ((size_t)1U << sc->lg_base) + + ((size_t)sc->ndelta << sc->lg_delta); + bin_info->slab_size = (sc->pgs << LG_PAGE); + bin_info->nregs = + (uint32_t)(bin_info->slab_size / bin_info->reg_size); + bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER( + bin_info->nregs); + bin_info->bitmap_info = bitmap_info; + } +} + +void +bin_boot(sc_data_t *sc_data) { + assert(sc_data->initialized); + bin_infos_init(sc_data, bin_infos); +} bool bin_init(bin_t *bin) { diff --git a/src/ckh.c b/src/ckh.c index e95e0a3..94c4fe6 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -275,7 +275,8 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) { lg_curcells++; usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 + || usize > sc_data_global.large_maxclass)) { ret = true; goto label_return; } @@ -320,7 +321,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) { lg_prevbuckets = ckh->lg_curbuckets; lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1; usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 || usize > sc_data_global.large_maxclass)) { return; } tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, NULL, @@ -396,7 +397,7 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh->keycomp = keycomp; usize = sz_sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 || usize > sc_data_global.large_maxclass)) { ret = true; goto label_return; } diff --git a/src/ctl.c b/src/ctl.c index 3f7dea1..38529d0 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -8,7 +8,7 @@ #include "jemalloc/internal/extent_mmap.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/nstime.h" -#include 
"jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/util.h" /******************************************************************************/ @@ -710,9 +710,9 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) { ctl_arena->astats->nmalloc_small = 0; ctl_arena->astats->ndalloc_small = 0; ctl_arena->astats->nrequests_small = 0; - memset(ctl_arena->astats->bstats, 0, NBINS * + memset(ctl_arena->astats->bstats, 0, SC_NBINS * sizeof(bin_stats_t)); - memset(ctl_arena->astats->lstats, 0, (NSIZES - NBINS) * + memset(ctl_arena->astats->lstats, 0, (SC_NSIZES - SC_NBINS) * sizeof(arena_stats_large_t)); } } @@ -729,7 +729,7 @@ ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) { &ctl_arena->astats->astats, ctl_arena->astats->bstats, ctl_arena->astats->lstats); - for (i = 0; i < NBINS; i++) { + for (i = 0; i < SC_NBINS; i++) { ctl_arena->astats->allocated_small += ctl_arena->astats->bstats[i].curregs * sz_index2size(i); @@ -841,7 +841,7 @@ MUTEX_PROF_ARENA_MUTEXES sdstats->astats.uptime = astats->astats.uptime; } - for (i = 0; i < NBINS; i++) { + for (i = 0; i < SC_NBINS; i++) { sdstats->bstats[i].nmalloc += astats->bstats[i].nmalloc; sdstats->bstats[i].ndalloc += astats->bstats[i].ndalloc; sdstats->bstats[i].nrequests += @@ -867,7 +867,7 @@ MUTEX_PROF_ARENA_MUTEXES &astats->bstats[i].mutex_data); } - for (i = 0; i < NSIZES - NBINS; i++) { + for (i = 0; i < SC_NSIZES - SC_NBINS; i++) { ctl_accum_arena_stats_u64(&sdstats->lstats[i].nmalloc, &astats->lstats[i].nmalloc); ctl_accum_arena_stats_u64(&sdstats->lstats[i].ndalloc, @@ -2433,7 +2433,7 @@ arenas_muzzy_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, CTL_RO_NL_GEN(arenas_quantum, QUANTUM, size_t) CTL_RO_NL_GEN(arenas_page, PAGE, size_t) CTL_RO_NL_GEN(arenas_tcache_max, tcache_maxclass, size_t) -CTL_RO_NL_GEN(arenas_nbins, NBINS, unsigned) +CTL_RO_NL_GEN(arenas_nbins, SC_NBINS, unsigned) CTL_RO_NL_GEN(arenas_nhbins, nhbins, unsigned) CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t) CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t) @@ -2441,19 +2441,19 @@ CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t) static const ctl_named_node_t * arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { - if (i > NBINS) { + if (i > SC_NBINS) { return NULL; } return super_arenas_bin_i_node; } -CTL_RO_NL_GEN(arenas_nlextents, NSIZES - NBINS, unsigned) -CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(NBINS+(szind_t)mib[2]), +CTL_RO_NL_GEN(arenas_nlextents, SC_NSIZES - SC_NBINS, unsigned) +CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(SC_NBINS+(szind_t)mib[2]), size_t) static const ctl_named_node_t * arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { - if (i > NSIZES - NBINS) { + if (i > SC_NSIZES - SC_NBINS) { return NULL; } return super_arenas_lextent_i_node; @@ -2818,7 +2818,7 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, MUTEX_PROF_RESET(arena->tcache_ql_mtx); MUTEX_PROF_RESET(arena->base->mtx); - for (szind_t i = 0; i < NBINS; i++) { + for (szind_t i = 0; i < SC_NBINS; i++) { bin_t *bin = &arena->bins[i]; MUTEX_PROF_RESET(bin->lock); } @@ -2849,7 +2849,7 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs, static const ctl_named_node_t * stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t j) { - if (j > NBINS) { + if (j > SC_NBINS) { return NULL; } return super_stats_arenas_i_bins_j_node; @@ -2870,7 +2870,7 @@ 
CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents, static const ctl_named_node_t * stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t j) { - if (j > NSIZES - NBINS) { + if (j > SC_NSIZES - SC_NBINS) { return NULL; } return super_stats_arenas_i_lextents_j_node; diff --git a/src/extent.c b/src/extent.c index 4b1a6df..0953940 100644 --- a/src/extent.c +++ b/src/extent.c @@ -20,7 +20,7 @@ mutex_pool_t extent_mutex_pool; size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT; static const bitmap_info_t extents_bitmap_info = - BITMAP_INFO_INITIALIZER(NPSIZES+1); + BITMAP_INFO_INITIALIZER(SC_NPSIZES_MAX+1); static void *extent_alloc_default(extent_hooks_t *extent_hooks, void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit, @@ -259,7 +259,7 @@ extent_size_quantize_ceil(size_t size) { size_t ret; assert(size > 0); - assert(size - sz_large_pad <= LARGE_MAXCLASS); + assert(size - sz_large_pad <= sc_data_global.large_maxclass); assert((size & PAGE_MASK) == 0); ret = extent_size_quantize_floor(size); @@ -288,7 +288,7 @@ extents_init(tsdn_t *tsdn, extents_t *extents, extent_state_t state, malloc_mutex_rank_exclusive)) { return true; } - for (unsigned i = 0; i < NPSIZES+1; i++) { + for (unsigned i = 0; i < sc_data_global.npsizes + 1; i++) { extent_heap_new(&extents->heaps[i]); } bitmap_init(extents->bitmap, &extents_bitmap_info, true); @@ -375,7 +375,7 @@ extents_fit_alignment(extents_t *extents, size_t min_size, size_t max_size, &extents_bitmap_info, (size_t)pind); i < pind_max; i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, (size_t)i+1)) { - assert(i < NPSIZES); + assert(i < sc_data_global.npsizes); assert(!extent_heap_empty(&extents->heaps[i])); extent_t *extent = extent_heap_first(&extents->heaps[i]); uintptr_t base = (uintptr_t)extent_base_get(extent); @@ -405,7 +405,7 @@ extents_best_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(size)); pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, (size_t)pind); - if (i < NPSIZES+1) { + if (i < sc_data_global.npsizes + 1) { /* * In order to reduce fragmentation, avoid reusing and splitting * large extents for much smaller sizes. 
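 * (Illustrative note: an extent whose size, shifted right by
 * opt_lg_extent_max_active_fit, still exceeds the request is treated
 * as too large to reuse here.)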
@@ -433,8 +433,9 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(size)); for (pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, - &extents_bitmap_info, (size_t)pind); i < NPSIZES+1; i = - (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, + &extents_bitmap_info, (size_t)pind); + i < sc_data_global.npsizes + 1; + i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, (size_t)i+1)) { assert(!extent_heap_empty(&extents->heaps[i])); extent_t *extent = extent_heap_first(&extents->heaps[i]); @@ -442,10 +443,10 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, if (ret == NULL || extent_snad_comp(extent, ret) < 0) { ret = extent; } - if (i == NPSIZES) { + if (i == sc_data_global.npsizes) { break; } - assert(i < NPSIZES); + assert(i < sc_data_global.npsizes); } return ret; @@ -821,7 +822,7 @@ extent_deregister_impl(tsdn_t *tsdn, extent_t *extent, bool gdump) { extent_lock(tsdn, extent); - extent_rtree_write_acquired(tsdn, elm_a, elm_b, NULL, NSIZES, false); + extent_rtree_write_acquired(tsdn, elm_a, elm_b, NULL, SC_NSIZES, false); if (extent_slab_get(extent)) { extent_interior_deregister(tsdn, rtree_ctx, extent); extent_slab_set(extent, false); @@ -962,7 +963,7 @@ extent_split_interior(tsdn_t *tsdn, arena_t *arena, if (leadsize != 0) { *lead = *extent; *extent = extent_split_impl(tsdn, arena, r_extent_hooks, - *lead, leadsize, NSIZES, false, esize + trailsize, szind, + *lead, leadsize, SC_NSIZES, false, esize + trailsize, szind, slab, growing_retained); if (*extent == NULL) { *to_leak = *lead; @@ -974,7 +975,7 @@ extent_split_interior(tsdn_t *tsdn, arena_t *arena, /* Split the trail. */ if (trailsize != 0) { *trail = extent_split_impl(tsdn, arena, r_extent_hooks, *extent, - esize, szind, slab, trailsize, NSIZES, false, + esize, szind, slab, trailsize, SC_NSIZES, false, growing_retained); if (*trail == NULL) { *to_leak = *extent; @@ -991,7 +992,7 @@ extent_split_interior(tsdn_t *tsdn, arena_t *arena, * splitting occurred. */ extent_szind_set(*extent, szind); - if (szind != NSIZES) { + if (szind != SC_NSIZES) { rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)extent_addr_get(*extent), szind, slab); if (slab && extent_size_get(*extent) > PAGE) { @@ -1248,11 +1249,13 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena, size_t alloc_size = sz_pind2sz(arena->extent_grow_next + egn_skip); while (alloc_size < alloc_size_min) { egn_skip++; - if (arena->extent_grow_next + egn_skip == NPSIZES) { + if (arena->extent_grow_next + egn_skip == + sc_data_global.npsizes) { /* Outside legal range. 
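 * (Reaching sc_data_global.npsizes means every page-size class has
 * been tried and the retained-grow request cannot be satisfied.)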
*/ goto label_err; } - assert(arena->extent_grow_next + egn_skip < NPSIZES); + assert(arena->extent_grow_next + egn_skip + < sc_data_global.npsizes); alloc_size = sz_pind2sz(arena->extent_grow_next + egn_skip); } @@ -1275,7 +1278,7 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena, extent_hook_post_reentrancy(tsdn); } - extent_init(extent, arena, ptr, alloc_size, false, NSIZES, + extent_init(extent, arena, ptr, alloc_size, false, SC_NSIZES, arena_extent_sn_next(arena), extent_state_active, zeroed, committed, true); if (ptr == NULL) { @@ -1610,7 +1613,7 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, malloc_mutex_lock(tsdn, &extents->mtx); extent_hooks_assure_initialized(arena, r_extent_hooks); - extent_szind_set(extent, NSIZES); + extent_szind_set(extent, SC_NSIZES); if (extent_slab_get(extent)) { extent_interior_deregister(tsdn, rtree_ctx, extent); extent_slab_set(extent, false); @@ -1622,7 +1625,7 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, if (!extents->delay_coalesce) { extent = extent_try_coalesce(tsdn, arena, r_extent_hooks, rtree_ctx, extents, extent, NULL, growing_retained); - } else if (extent_size_get(extent) >= LARGE_MINCLASS) { + } else if (extent_size_get(extent) >= sc_data_global.large_minclass) { /* Always coalesce large extents eagerly. */ bool coalesced; size_t prev_size; @@ -1633,7 +1636,8 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, r_extent_hooks, rtree_ctx, extents, extent, &coalesced, growing_retained); } while (coalesced && - extent_size_get(extent) >= prev_size + LARGE_MINCLASS); + extent_size_get(extent) + >= prev_size + sc_data_global.large_minclass); } extent_deactivate_locked(tsdn, arena, extents, extent); @@ -2132,22 +2136,23 @@ extent_merge_impl(tsdn_t *tsdn, arena_t *arena, if (a_elm_b != NULL) { rtree_leaf_elm_write(tsdn, &extents_rtree, a_elm_b, NULL, - NSIZES, false); + SC_NSIZES, false); } if (b_elm_b != NULL) { rtree_leaf_elm_write(tsdn, &extents_rtree, b_elm_a, NULL, - NSIZES, false); + SC_NSIZES, false); } else { b_elm_b = b_elm_a; } extent_size_set(a, extent_size_get(a) + extent_size_get(b)); - extent_szind_set(a, NSIZES); + extent_szind_set(a, SC_NSIZES); extent_sn_set(a, (extent_sn_get(a) < extent_sn_get(b)) ? 
extent_sn_get(a) : extent_sn_get(b)); extent_zeroed_set(a, extent_zeroed_get(a) && extent_zeroed_get(b)); - extent_rtree_write_acquired(tsdn, a_elm_a, b_elm_b, a, NSIZES, false); + extent_rtree_write_acquired(tsdn, a_elm_a, b_elm_b, a, SC_NSIZES, + false); extent_unlock2(tsdn, a, b); diff --git a/src/extent_dss.c b/src/extent_dss.c index 2b1ea9c..6c56cf6 100644 --- a/src/extent_dss.c +++ b/src/extent_dss.c @@ -154,7 +154,7 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, (uintptr_t)gap_addr_page; if (gap_size_page != 0) { extent_init(gap, arena, gap_addr_page, - gap_size_page, false, NSIZES, + gap_size_page, false, SC_NSIZES, arena_extent_sn_next(arena), extent_state_active, false, true, true); } @@ -198,7 +198,7 @@ extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, extent_t extent; extent_init(&extent, arena, ret, size, - size, false, NSIZES, + size, false, SC_NSIZES, extent_state_active, false, true, true); if (extent_purge_forced_wrapper(tsdn, diff --git a/src/jemalloc.c b/src/jemalloc.c index 82c0887..664c5f8 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -13,7 +13,7 @@ #include "jemalloc/internal/malloc_io.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/rtree.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" #include "jemalloc/internal/spin.h" #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" @@ -1158,7 +1158,8 @@ malloc_conf_init(void) { /* Experimental feature. Will be documented later.*/ CONF_HANDLE_SIZE_T(opt_huge_threshold, "experimental_huge_threshold", - LARGE_MINCLASS, LARGE_MAXCLASS, yes, yes, false) + sc_data_global.large_minclass, + sc_data_global.large_maxclass, yes, yes, false) CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, (sizeof(size_t) << 3), yes, yes, false) @@ -1294,6 +1295,10 @@ static bool malloc_init_hard_a0_locked() { malloc_initializer = INITIALIZER; + sc_boot(); + sz_boot(&sc_data_global); + bin_boot(&sc_data_global); + if (config_prof) { prof_boot0(); } @@ -1747,12 +1752,13 @@ imalloc_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd, szind_t ind_large; size_t bumped_usize = usize; - if (usize <= SMALL_MAXCLASS) { - assert(((dopts->alignment == 0) ? sz_s2u(LARGE_MINCLASS) : - sz_sa2u(LARGE_MINCLASS, dopts->alignment)) - == LARGE_MINCLASS); - ind_large = sz_size2index(LARGE_MINCLASS); - bumped_usize = sz_s2u(LARGE_MINCLASS); + if (usize <= sc_data_global.small_maxclass) { + assert(((dopts->alignment == 0) ? 
+ sz_s2u(sc_data_global.large_minclass) : + sz_sa2u(sc_data_global.large_minclass, dopts->alignment)) + == sc_data_global.large_minclass); + ind_large = sz_size2index(sc_data_global.large_minclass); + bumped_usize = sz_s2u(sc_data_global.large_minclass); ret = imalloc_no_sample(sopts, dopts, tsd, bumped_usize, bumped_usize, ind_large); if (unlikely(ret == NULL)) { @@ -1855,16 +1861,18 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { if (dopts->alignment == 0) { ind = sz_size2index(size); - if (unlikely(ind >= NSIZES)) { + if (unlikely(ind >= SC_NSIZES)) { goto label_oom; } if (config_stats || (config_prof && opt_prof)) { usize = sz_index2size(ind); - assert(usize > 0 && usize <= LARGE_MAXCLASS); + assert(usize > 0 && usize + <= sc_data_global.large_maxclass); } } else { usize = sz_sa2u(size, dopts->alignment); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 + || usize > sc_data_global.large_maxclass)) { goto label_oom; } } @@ -1900,7 +1908,8 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { alloc_ctx_t alloc_ctx; if (likely((uintptr_t)tctx == (uintptr_t)1U)) { - alloc_ctx.slab = (usize <= SMALL_MAXCLASS); + alloc_ctx.slab = (usize + <= sc_data_global.small_maxclass); allocation = imalloc_no_sample( sopts, dopts, tsd, usize, usize, ind); } else if ((uintptr_t)tctx > (uintptr_t)1U) { @@ -2198,9 +2207,9 @@ irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize, if (tctx == NULL) { return NULL; } - if (usize <= SMALL_MAXCLASS) { - p = iralloc(tsd, old_ptr, old_usize, LARGE_MINCLASS, 0, false, - hook_args); + if (usize <= sc_data_global.small_maxclass) { + p = iralloc(tsd, old_ptr, old_usize, + sc_data_global.large_minclass, 0, false, hook_args); if (p == NULL) { return NULL; } @@ -2257,7 +2266,7 @@ ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) { rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); - assert(alloc_ctx.szind != NSIZES); + assert(alloc_ctx.szind != SC_NSIZES); size_t usize; if (config_prof && opt_prof) { @@ -2384,12 +2393,13 @@ je_realloc(void *ptr, size_t arg_size) { rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); - assert(alloc_ctx.szind != NSIZES); + assert(alloc_ctx.szind != SC_NSIZES); old_usize = sz_index2size(alloc_ctx.szind); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); if (config_prof && opt_prof) { usize = sz_s2u(size); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 + || usize > sc_data_global.large_maxclass)) { ret = NULL; } else { ret = irealloc_prof(tsd, ptr, old_usize, usize, @@ -2702,9 +2712,10 @@ irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize, if (tctx == NULL) { return NULL; } - if (usize <= SMALL_MAXCLASS) { - p = iralloct(tsdn, old_ptr, old_usize, LARGE_MINCLASS, - alignment, zero, tcache, arena, hook_args); + if (usize <= sc_data_global.small_maxclass) { + p = iralloct(tsdn, old_ptr, old_usize, + sc_data_global.large_minclass, alignment, zero, tcache, + arena, hook_args); if (p == NULL) { return NULL; } @@ -2804,7 +2815,7 @@ je_rallocx(void *ptr, size_t size, int flags) { rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); - 
assert(alloc_ctx.szind != NSIZES); + assert(alloc_ctx.szind != SC_NSIZES); old_usize = sz_index2size(alloc_ctx.szind); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); @@ -2813,7 +2824,8 @@ je_rallocx(void *ptr, size_t size, int flags) { if (config_prof && opt_prof) { usize = (alignment == 0) ? sz_s2u(size) : sz_sa2u(size, alignment); - if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { + if (unlikely(usize == 0 + || usize > sc_data_global.large_maxclass)) { goto label_oom; } p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, @@ -2898,17 +2910,19 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, */ if (alignment == 0) { usize_max = sz_s2u(size+extra); - assert(usize_max > 0 && usize_max <= LARGE_MAXCLASS); + assert(usize_max > 0 + && usize_max <= sc_data_global.large_maxclass); } else { usize_max = sz_sa2u(size+extra, alignment); - if (unlikely(usize_max == 0 || usize_max > LARGE_MAXCLASS)) { + if (unlikely(usize_max == 0 + || usize_max > sc_data_global.large_maxclass)) { /* * usize_max is out of range, and chances are that * allocation will fail, but use the maximum possible * value and carry on with prof_alloc_prep(), just in * case allocation succeeds. */ - usize_max = LARGE_MAXCLASS; + usize_max = sc_data_global.large_maxclass; } } tctx = prof_alloc_prep(tsd, usize_max, prof_active, false); @@ -2951,24 +2965,24 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); - assert(alloc_ctx.szind != NSIZES); + assert(alloc_ctx.szind != SC_NSIZES); old_usize = sz_index2size(alloc_ctx.szind); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); /* * The API explicitly absolves itself of protecting against (size + * extra) numerical overflow, but we may need to clamp extra to avoid - * exceeding LARGE_MAXCLASS. + * exceeding sc_data_global.large_maxclass. * * Ordinarily, size limit checking is handled deeper down, but here we * have to check as part of (size + extra) clamping, since we need the * clamped value in the above helper functions. 
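 * (Worked example, illustrative: if size == large_maxclass and
 * extra == 1, the clamp below sets extra = 0, so size + extra stays
 * within the maximum size class.)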
*/ - if (unlikely(size > LARGE_MAXCLASS)) { + if (unlikely(size > sc_data_global.large_maxclass)) { usize = old_usize; goto label_not_resized; } - if (unlikely(LARGE_MAXCLASS - size < extra)) { - extra = LARGE_MAXCLASS - size; + if (unlikely(sc_data_global.large_maxclass - size < extra)) { + extra = sc_data_global.large_maxclass - size; } if (config_prof && opt_prof) { @@ -3155,7 +3169,7 @@ je_nallocx(size_t size, int flags) { check_entry_exit_locking(tsdn); usize = inallocx(tsdn, size, flags); - if (unlikely(usize > LARGE_MAXCLASS)) { + if (unlikely(usize > sc_data_global.large_maxclass)) { LOG("core.nallocx.exit", "result: %zu", ZU(0)); return 0; } diff --git a/src/large.c b/src/large.c index 03eecfa..87d9ec0 100644 --- a/src/large.c +++ b/src/large.c @@ -28,7 +28,7 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, assert(!tsdn_null(tsdn) || arena != NULL); ausize = sz_sa2u(usize, alignment); - if (unlikely(ausize == 0 || ausize > LARGE_MAXCLASS)) { + if (unlikely(ausize == 0 || ausize > sc_data_global.large_maxclass)) { return NULL; } @@ -109,7 +109,7 @@ large_ralloc_no_move_shrink(tsdn_t *tsdn, extent_t *extent, size_t usize) { if (diff != 0) { extent_t *trail = extent_split_wrapper(tsdn, arena, &extent_hooks, extent, usize + sz_large_pad, - sz_size2index(usize), false, diff, NSIZES, false); + sz_size2index(usize), false, diff, SC_NSIZES, false); if (trail == NULL) { return true; } @@ -154,17 +154,17 @@ large_ralloc_no_move_expand(tsdn_t *tsdn, extent_t *extent, size_t usize, bool new_mapping; if ((trail = extents_alloc(tsdn, arena, &extent_hooks, &arena->extents_dirty, extent_past_get(extent), trailsize, 0, - CACHELINE, false, NSIZES, &is_zeroed_trail, &commit)) != NULL + CACHELINE, false, SC_NSIZES, &is_zeroed_trail, &commit)) != NULL || (trail = extents_alloc(tsdn, arena, &extent_hooks, &arena->extents_muzzy, extent_past_get(extent), trailsize, 0, - CACHELINE, false, NSIZES, &is_zeroed_trail, &commit)) != NULL) { + CACHELINE, false, SC_NSIZES, &is_zeroed_trail, &commit)) != NULL) { if (config_stats) { new_mapping = false; } } else { if ((trail = extent_alloc_wrapper(tsdn, arena, &extent_hooks, extent_past_get(extent), trailsize, 0, CACHELINE, false, - NSIZES, &is_zeroed_trail, &commit)) == NULL) { + SC_NSIZES, &is_zeroed_trail, &commit)) == NULL) { return true; } if (config_stats) { @@ -221,9 +221,10 @@ large_ralloc_no_move(tsdn_t *tsdn, extent_t *extent, size_t usize_min, size_t oldusize = extent_usize_get(extent); /* The following should have been caught by callers. */ - assert(usize_min > 0 && usize_max <= LARGE_MAXCLASS); + assert(usize_min > 0 && usize_max <= sc_data_global.large_maxclass); /* Both allocation sizes must be large to avoid a move. */ - assert(oldusize >= LARGE_MINCLASS && usize_max >= LARGE_MINCLASS); + assert(oldusize >= sc_data_global.large_minclass + && usize_max >= sc_data_global.large_minclass); if (usize_max > oldusize) { /* Attempt to expand the allocation in-place. */ @@ -277,9 +278,10 @@ large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize, size_t oldusize = extent_usize_get(extent); /* The following should have been caught by callers. */ - assert(usize > 0 && usize <= LARGE_MAXCLASS); + assert(usize > 0 && usize <= sc_data_global.large_maxclass); /* Both allocation sizes must be large to avoid a move. */ - assert(oldusize >= LARGE_MINCLASS && usize >= LARGE_MINCLASS); + assert(oldusize >= sc_data_global.large_minclass + && usize >= sc_data_global.large_minclass); /* Try to avoid moving the allocation. 
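 * (large_ralloc_no_move() grows or shrinks the backing extent in
 * place via the split/expand paths above, so success means no copy
 * is needed.)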
*/ if (!large_ralloc_no_move(tsdn, extent, usize, usize, zero)) { diff --git a/src/sc.c b/src/sc.c new file mode 100644 index 0000000..943d787 --- /dev/null +++ b/src/sc.c @@ -0,0 +1,62 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/size_classes.h" + +sc_data_t sc_data_global; + +static void +fill_sc(sc_data_t *data, int index, int lg_base, int lg_delta, int ndelta, + bool psz, bool bin, int pgs, int lg_delta_lookup) { + sc_t *sc = &data->sc[index]; + sc->index = index; + sc->lg_base = lg_base; + sc->lg_delta = lg_delta; + sc->ndelta = ndelta; + sc->psz = psz; + sc->bin = bin; + sc->pgs = pgs; + sc->lg_delta_lookup = lg_delta_lookup; +} + +void +sc_data_init(sc_data_t *data) { + assert(SC_NTINY == NTBINS); + assert(SC_NSIZES == NSIZES); + assert(SC_NBINS == NBINS); + assert(NPSIZES <= SC_NPSIZES_MAX); + assert(!data->initialized); + data->initialized = true; + data->ntiny = NTBINS; + data->nlbins = NLBINS; + data->nbins = NBINS; + data->nsizes = NSIZES; + data->lg_ceil_nsizes = LG_CEIL_NSIZES; + data->npsizes = NPSIZES; +#if SC_NTINY != 0 + data->lg_tiny_maxclass = LG_TINY_MAXCLASS; +#else + data->lg_tiny_maxclass = -1; +#endif + data->lookup_maxclass = LOOKUP_MAXCLASS; + data->small_maxclass = SMALL_MAXCLASS; + data->lg_large_minclass = LG_LARGE_MINCLASS; + data->large_minclass = LARGE_MINCLASS; + data->large_maxclass = LARGE_MAXCLASS; +#define no 0 +#define yes 1 +#define SC(index, lg_base_base, lg_delta, ndelta, psz, bin, pgs, \ + lg_delta_lookup) \ + fill_sc(data, index, lg_base_base, lg_delta, ndelta, psz, bin, \ + pgs, lg_delta_lookup); + SIZE_CLASSES +#undef no +#undef yes +#undef SC +} + +void +sc_boot() { + sc_data_init(&sc_data_global); +} diff --git a/src/sz.c b/src/sz.c index 9de77e4..e038728 100644 --- a/src/sz.c +++ b/src/sz.c @@ -2,106 +2,60 @@ #include "jemalloc/internal/sz.h" JEMALLOC_ALIGNED(CACHELINE) -const size_t sz_pind2sz_tab[NPSIZES+1] = { -#define PSZ_yes(lg_grp, ndelta, lg_delta) \ - (((ZU(1)<sc[i]; + if (sc->psz) { + sz_pind2sz_tab[pind] = (ZU(1) << sc->lg_base) + + (ZU(sc->ndelta) << sc->lg_delta); + pind++; + } + } + sz_pind2sz_tab[pind] = sc_data->large_maxclass + PAGE; +} JEMALLOC_ALIGNED(CACHELINE) -const size_t sz_index2size_tab[NSIZES] = { -#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup) \ - ((ZU(1)<sc[i]; + sz_index2size_tab[i] = (ZU(1) << sc->lg_base) + + (ZU(sc->ndelta) << (sc->lg_delta)); + } +} +/* + * To keep this table small, we divide sizes by the tiny min size, which gives + * the smallest interval for which the result can change. + */ JEMALLOC_ALIGNED(CACHELINE) -const uint8_t sz_size2index_tab[] = { -#if LG_TINY_MIN == 0 -/* The div module doesn't support division by 1. 
*/ -#error "Unsupported LG_TINY_MIN" -#define S2B_0(i) i, -#elif LG_TINY_MIN == 1 -#warning "Dangerous LG_TINY_MIN" -#define S2B_1(i) i, -#elif LG_TINY_MIN == 2 -#warning "Dangerous LG_TINY_MIN" -#define S2B_2(i) i, -#elif LG_TINY_MIN == 3 -#define S2B_3(i) i, -#elif LG_TINY_MIN == 4 -#define S2B_4(i) i, -#elif LG_TINY_MIN == 5 -#define S2B_5(i) i, -#elif LG_TINY_MIN == 6 -#define S2B_6(i) i, -#elif LG_TINY_MIN == 7 -#define S2B_7(i) i, -#elif LG_TINY_MIN == 8 -#define S2B_8(i) i, -#elif LG_TINY_MIN == 9 -#define S2B_9(i) i, -#elif LG_TINY_MIN == 10 -#define S2B_10(i) i, -#elif LG_TINY_MIN == 11 -#define S2B_11(i) i, -#else -#error "Unsupported LG_TINY_MIN" -#endif -#if LG_TINY_MIN < 1 -#define S2B_1(i) S2B_0(i) S2B_0(i) -#endif -#if LG_TINY_MIN < 2 -#define S2B_2(i) S2B_1(i) S2B_1(i) -#endif -#if LG_TINY_MIN < 3 -#define S2B_3(i) S2B_2(i) S2B_2(i) -#endif -#if LG_TINY_MIN < 4 -#define S2B_4(i) S2B_3(i) S2B_3(i) -#endif -#if LG_TINY_MIN < 5 -#define S2B_5(i) S2B_4(i) S2B_4(i) -#endif -#if LG_TINY_MIN < 6 -#define S2B_6(i) S2B_5(i) S2B_5(i) -#endif -#if LG_TINY_MIN < 7 -#define S2B_7(i) S2B_6(i) S2B_6(i) -#endif -#if LG_TINY_MIN < 8 -#define S2B_8(i) S2B_7(i) S2B_7(i) -#endif -#if LG_TINY_MIN < 9 -#define S2B_9(i) S2B_8(i) S2B_8(i) -#endif -#if LG_TINY_MIN < 10 -#define S2B_10(i) S2B_9(i) S2B_9(i) -#endif -#if LG_TINY_MIN < 11 -#define S2B_11(i) S2B_10(i) S2B_10(i) -#endif -#define S2B_no(i) -#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup) \ - S2B_##lg_delta_lookup(index) - SIZE_CLASSES -#undef S2B_3 -#undef S2B_4 -#undef S2B_5 -#undef S2B_6 -#undef S2B_7 -#undef S2B_8 -#undef S2B_9 -#undef S2B_10 -#undef S2B_11 -#undef S2B_no -#undef SC -}; +uint8_t sz_size2index_tab[SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN]; + +static void +sz_boot_size2index_tab(const sc_data_t *sc_data) { + size_t dst_max = (SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN); + size_t dst_ind = 0; + for (unsigned sc_ind = 0; sc_ind < SC_NSIZES && dst_ind < dst_max; + sc_ind++) { + const sc_t *sc = &sc_data->sc[sc_ind]; + size_t sz = (ZU(1) << sc->lg_base) + + (ZU(sc->ndelta) << sc->lg_delta); + size_t max_ind = ((sz - 1) >> SC_LG_TINY_MIN); + for (; dst_ind <= max_ind && dst_ind < dst_max; dst_ind++) { + sz_size2index_tab[dst_ind] = sc_ind; + } + } +} + +void +sz_boot(const sc_data_t *sc_data) { + sz_boot_pind2sz_tab(sc_data); + sz_boot_index2size_tab(sc_data); + sz_boot_size2index_tab(sc_data); +} diff --git a/src/tcache.c b/src/tcache.c index d624d92..edd047a 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -4,7 +4,7 @@ #include "jemalloc/internal/assert.h" #include "jemalloc/internal/mutex.h" -#include "jemalloc/internal/size_classes.h" +#include "jemalloc/internal/sc.h" /******************************************************************************/ /* Data. */ @@ -41,7 +41,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) { szind_t binind = tcache->next_gc_bin; cache_bin_t *tbin; - if (binind < NBINS) { + if (binind < SC_NBINS) { tbin = tcache_small_bin_get(tcache, binind); } else { tbin = tcache_large_bin_get(tcache, binind); @@ -50,7 +50,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) { /* * Flush (ceiling) 3/4 of the objects below the low water mark. */ - if (binind < NBINS) { + if (binind < SC_NBINS) { tcache_bin_flush_small(tsd, tcache, tbin, binind, tbin->ncached - tbin->low_water + (tbin->low_water >> 2)); @@ -72,7 +72,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) { * Increase fill count by 2X for small bins. Make sure * lg_fill_div stays greater than 0. 
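 * (Illustrative note: the per-fill count is
 * ncached_max >> lg_fill_div, so decrementing the divisor doubles
 * the number of items fetched per fill.)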
*/ - if (binind < NBINS && tcache->lg_fill_div[binind] > 1) { + if (binind < SC_NBINS && tcache->lg_fill_div[binind] > 1) { tcache->lg_fill_div[binind]--; } } @@ -105,7 +105,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t binind, unsigned rem) { bool merged_stats = false; - assert(binind < NBINS); + assert(binind < SC_NBINS); assert((cache_bin_sz_t)rem <= tbin->ncached); arena_t *arena = tcache->arena; @@ -369,10 +369,10 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) { size_t stack_offset = 0; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); - memset(tcache->bins_small, 0, sizeof(cache_bin_t) * NBINS); - memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - NBINS)); + memset(tcache->bins_small, 0, sizeof(cache_bin_t) * SC_NBINS); + memset(tcache->bins_large, 0, sizeof(cache_bin_t) * (nhbins - SC_NBINS)); unsigned i = 0; - for (; i < NBINS; i++) { + for (; i < SC_NBINS; i++) { tcache->lg_fill_div[i] = 1; stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); /* @@ -464,7 +464,7 @@ static void tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) { assert(tcache->arena != NULL); - for (unsigned i = 0; i < NBINS; i++) { + for (unsigned i = 0; i < SC_NBINS; i++) { cache_bin_t *tbin = tcache_small_bin_get(tcache, i); tcache_bin_flush_small(tsd, tcache, tbin, i, 0); @@ -472,7 +472,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) { assert(tbin->tstats.nrequests == 0); } } - for (unsigned i = NBINS; i < nhbins; i++) { + for (unsigned i = SC_NBINS; i < nhbins; i++) { cache_bin_t *tbin = tcache_large_bin_get(tcache, i); tcache_bin_flush_large(tsd, tbin, i, 0, tcache); @@ -538,7 +538,7 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) { cassert(config_stats); /* Merge and reset tcache stats. */ - for (i = 0; i < NBINS; i++) { + for (i = 0; i < SC_NBINS; i++) { bin_t *bin = &arena->bins[i]; cache_bin_t *tbin = tcache_small_bin_get(tcache, i); malloc_mutex_lock(tsdn, &bin->lock); @@ -658,8 +658,8 @@ bool tcache_boot(tsdn_t *tsdn) { /* If necessary, clamp opt_lg_tcache_max. 
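 * (The floor is small_maxclass so that every small size class
 * remains eligible for tcaching.)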
*/ if (opt_lg_tcache_max < 0 || (ZU(1) << opt_lg_tcache_max) < - SMALL_MAXCLASS) { - tcache_maxclass = SMALL_MAXCLASS; + sc_data_global.small_maxclass) { + tcache_maxclass = sc_data_global.small_maxclass; } else { tcache_maxclass = (ZU(1) << opt_lg_tcache_max); } @@ -679,7 +679,7 @@ tcache_boot(tsdn_t *tsdn) { } stack_nelms = 0; unsigned i; - for (i = 0; i < NBINS; i++) { + for (i = 0; i < SC_NBINS; i++) { if ((bin_infos[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) { tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_SMALL_MIN; diff --git a/test/unit/arena_reset.c b/test/unit/arena_reset.c index c1ccb09..96b042d 100644 --- a/test/unit/arena_reset.c +++ b/test/unit/arena_reset.c @@ -77,7 +77,7 @@ vsalloc(tsdn_t *tsdn, const void *ptr) { return 0; } - if (szind == NSIZES) { + if (szind == SC_NSIZES) { return 0; } diff --git a/test/unit/junk.c b/test/unit/junk.c index 243ced4..91c6e5b 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -123,13 +123,14 @@ test_junk(size_t sz_min, size_t sz_max) { TEST_BEGIN(test_junk_small) { test_skip_if(!config_fill); - test_junk(1, SMALL_MAXCLASS-1); + test_junk(1, sc_data_global.small_maxclass - 1); } TEST_END TEST_BEGIN(test_junk_large) { test_skip_if(!config_fill); - test_junk(SMALL_MAXCLASS+1, (1U << (LG_LARGE_MINCLASS+1))); + test_junk(sc_data_global.small_maxclass + 1, + (1U << (sc_data_global.lg_large_minclass + 1))); } TEST_END diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index d64b401..230ecb0 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -581,7 +581,7 @@ TEST_BEGIN(test_arena_i_retain_grow_limit) { assert_d_eq(mallctlbymib(mib, miblen, &default_limit, &sz, NULL, 0), 0, "Unexpected mallctl() failure"); - assert_zu_eq(default_limit, sz_pind2sz(EXTENT_GROW_MAX_PIND), + assert_zu_eq(default_limit, sz_pind2sz(sc_data_global.npsizes - 1), "Unexpected default for retain_grow_limit"); new_limit = PAGE - 1; @@ -686,8 +686,8 @@ TEST_BEGIN(test_arenas_constants) { TEST_ARENAS_CONSTANT(size_t, quantum, QUANTUM); TEST_ARENAS_CONSTANT(size_t, page, PAGE); - TEST_ARENAS_CONSTANT(unsigned, nbins, NBINS); - TEST_ARENAS_CONSTANT(unsigned, nlextents, NSIZES - NBINS); + TEST_ARENAS_CONSTANT(unsigned, nbins, SC_NBINS); + TEST_ARENAS_CONSTANT(unsigned, nlextents, SC_NSIZES - SC_NBINS); #undef TEST_ARENAS_CONSTANT } @@ -720,7 +720,8 @@ TEST_BEGIN(test_arenas_lextent_constants) { assert_zu_eq(name, expected, "Incorrect "#name" size"); \ } while (0) - TEST_ARENAS_LEXTENT_CONSTANT(size_t, size, LARGE_MINCLASS); + TEST_ARENAS_LEXTENT_CONSTANT(size_t, size, + sc_data_global.large_minclass); #undef TEST_ARENAS_LEXTENT_CONSTANT } diff --git a/test/unit/prof_gdump.c b/test/unit/prof_gdump.c index fcb434c..0b8d7c3 100644 --- a/test/unit/prof_gdump.c +++ b/test/unit/prof_gdump.c @@ -29,12 +29,12 @@ TEST_BEGIN(test_gdump) { prof_dump_open = prof_dump_open_intercept; did_prof_dump_open = false; - p = mallocx((1U << LG_LARGE_MINCLASS), 0); + p = mallocx((1U << sc_data_global.lg_large_minclass), 0); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_true(did_prof_dump_open, "Expected a profile dump"); did_prof_dump_open = false; - q = mallocx((1U << LG_LARGE_MINCLASS), 0); + q = mallocx((1U << sc_data_global.lg_large_minclass), 0); assert_ptr_not_null(q, "Unexpected mallocx() failure"); assert_true(did_prof_dump_open, "Expected a profile dump"); @@ -45,7 +45,7 @@ TEST_BEGIN(test_gdump) { "Unexpected mallctl failure while disabling prof.gdump"); assert(gdump_old); did_prof_dump_open = false; - r = mallocx((1U << LG_LARGE_MINCLASS), 0); + r = 
mallocx((1U << sc_data_global.lg_large_minclass), 0); assert_ptr_not_null(q, "Unexpected mallocx() failure"); assert_false(did_prof_dump_open, "Unexpected profile dump"); @@ -56,7 +56,7 @@ TEST_BEGIN(test_gdump) { "Unexpected mallctl failure while enabling prof.gdump"); assert(!gdump_old); did_prof_dump_open = false; - s = mallocx((1U << LG_LARGE_MINCLASS), 0); + s = mallocx((1U << sc_data_global.lg_large_minclass), 0); assert_ptr_not_null(q, "Unexpected mallocx() failure"); assert_true(did_prof_dump_open, "Expected a profile dump"); diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 908100f..4d1daf2 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -85,10 +85,10 @@ TEST_END TEST_BEGIN(test_rtree_extrema) { extent_t extent_a, extent_b; - extent_init(&extent_a, NULL, NULL, LARGE_MINCLASS, false, - sz_size2index(LARGE_MINCLASS), 0, extent_state_active, false, - false, true); - extent_init(&extent_b, NULL, NULL, 0, false, NSIZES, 0, + extent_init(&extent_a, NULL, NULL, sc_data_global.large_minclass, false, + sz_size2index(sc_data_global.large_minclass), 0, + extent_state_active, false, false, true); + extent_init(&extent_b, NULL, NULL, 0, false, SC_NSIZES, 0, extent_state_active, false, false, true); tsdn_t *tsdn = tsdn_fetch(); @@ -125,7 +125,7 @@ TEST_BEGIN(test_rtree_bits) { PAGE + (((uintptr_t)1) << LG_PAGE) - 1}; extent_t extent; - extent_init(&extent, NULL, NULL, 0, false, NSIZES, 0, + extent_init(&extent, NULL, NULL, 0, false, SC_NSIZES, 0, extent_state_active, false, false, true); rtree_t *rtree = &test_rtree; @@ -135,7 +135,7 @@ TEST_BEGIN(test_rtree_bits) { for (unsigned i = 0; i < sizeof(keys)/sizeof(uintptr_t); i++) { assert_false(rtree_write(tsdn, rtree, &rtree_ctx, keys[i], - &extent, NSIZES, false), + &extent, SC_NSIZES, false), "Unexpected rtree_write() failure"); for (unsigned j = 0; j < sizeof(keys)/sizeof(uintptr_t); j++) { assert_ptr_eq(rtree_extent_read(tsdn, rtree, &rtree_ctx, @@ -166,7 +166,7 @@ TEST_BEGIN(test_rtree_random) { rtree_ctx_data_init(&rtree_ctx); extent_t extent; - extent_init(&extent, NULL, NULL, 0, false, NSIZES, 0, + extent_init(&extent, NULL, NULL, 0, false, SC_NSIZES, 0, extent_state_active, false, false, true); assert_false(rtree_new(rtree, false), "Unexpected rtree_new() failure"); @@ -177,7 +177,8 @@ TEST_BEGIN(test_rtree_random) { &rtree_ctx, keys[i], false, true); assert_ptr_not_null(elm, "Unexpected rtree_leaf_elm_lookup() failure"); - rtree_leaf_elm_write(tsdn, rtree, elm, &extent, NSIZES, false); + rtree_leaf_elm_write(tsdn, rtree, elm, &extent, SC_NSIZES, + false); assert_ptr_eq(rtree_extent_read(tsdn, rtree, &rtree_ctx, keys[i], true), &extent, "rtree_extent_read() should return previously set value"); diff --git a/test/unit/size_classes.c b/test/unit/size_classes.c index bcff560..7c28e16 100644 --- a/test/unit/size_classes.c +++ b/test/unit/size_classes.c @@ -142,11 +142,11 @@ TEST_BEGIN(test_overflow) { max_size_class = get_max_size_class(); max_psz = max_size_class + PAGE; - assert_u_eq(sz_size2index(max_size_class+1), NSIZES, + assert_u_eq(sz_size2index(max_size_class+1), SC_NSIZES, "sz_size2index() should return NSIZES on overflow"); - assert_u_eq(sz_size2index(ZU(PTRDIFF_MAX)+1), NSIZES, + assert_u_eq(sz_size2index(ZU(PTRDIFF_MAX)+1), SC_NSIZES, "sz_size2index() should return NSIZES on overflow"); - assert_u_eq(sz_size2index(SIZE_T_MAX), NSIZES, + assert_u_eq(sz_size2index(SIZE_T_MAX), SC_NSIZES, "sz_size2index() should return NSIZES on overflow"); assert_zu_eq(sz_s2u(max_size_class+1), 0, @@ -156,13 +156,16 @@ 
TEST_BEGIN(test_overflow) { assert_zu_eq(sz_s2u(SIZE_T_MAX), 0, "sz_s2u() should return 0 on overflow"); - assert_u_eq(sz_psz2ind(max_size_class+1), NPSIZES, + assert_u_eq(sz_psz2ind(max_size_class+1), sc_data_global.npsizes, "sz_psz2ind() should return NPSIZES on overflow"); - assert_u_eq(sz_psz2ind(ZU(PTRDIFF_MAX)+1), NPSIZES, + assert_u_eq(sz_psz2ind(ZU(PTRDIFF_MAX)+1), sc_data_global.npsizes, "sz_psz2ind() should return NPSIZES on overflow"); - assert_u_eq(sz_psz2ind(SIZE_T_MAX), NPSIZES, + assert_u_eq(sz_psz2ind(SIZE_T_MAX), sc_data_global.npsizes, "sz_psz2ind() should return NPSIZES on overflow"); + assert_u_le(sc_data_global.npsizes, SC_NPSIZES_MAX, + "Dynamic value of npsizes is higher than static bound."); + assert_zu_eq(sz_psz2u(max_size_class+1), max_psz, "sz_psz2u() should return (LARGE_MAXCLASS + PAGE) for unsupported" " size"); diff --git a/test/unit/slab.c b/test/unit/slab.c index 7e662ae..ef71882 100644 --- a/test/unit/slab.c +++ b/test/unit/slab.c @@ -3,7 +3,7 @@ TEST_BEGIN(test_arena_slab_regind) { szind_t binind; - for (binind = 0; binind < NBINS; binind++) { + for (binind = 0; binind < SC_NBINS; binind++) { size_t regind; extent_t slab; const bin_info_t *bin_info = &bin_infos[binind]; diff --git a/test/unit/stats.c b/test/unit/stats.c index 231010e..8fe0f3a 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -33,7 +33,7 @@ TEST_BEGIN(test_stats_large) { size_t sz; int expected = config_stats ? 0 : ENOENT; - p = mallocx(SMALL_MAXCLASS+1, MALLOCX_ARENA(0)); + p = mallocx(sc_data_global.small_maxclass + 1, MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)), @@ -74,9 +74,10 @@ TEST_BEGIN(test_stats_arenas_summary) { uint64_t dirty_npurge, dirty_nmadvise, dirty_purged; uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged; - little = mallocx(SMALL_MAXCLASS, MALLOCX_ARENA(0)); + little = mallocx(sc_data_global.small_maxclass, MALLOCX_ARENA(0)); assert_ptr_not_null(little, "Unexpected mallocx() failure"); - large = mallocx((1U << LG_LARGE_MINCLASS), MALLOCX_ARENA(0)); + large = mallocx((1U << sc_data_global.lg_large_minclass), + MALLOCX_ARENA(0)); assert_ptr_not_null(large, "Unexpected mallocx() failure"); dallocx(little, 0); @@ -148,7 +149,7 @@ TEST_BEGIN(test_stats_arenas_small) { no_lazy_lock(); /* Lazy locking would dodge tcache testing. */ - p = mallocx(SMALL_MAXCLASS, MALLOCX_ARENA(0)); + p = mallocx(sc_data_global.small_maxclass, MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("thread.tcache.flush", NULL, NULL, NULL, 0), @@ -191,7 +192,7 @@ TEST_BEGIN(test_stats_arenas_large) { uint64_t epoch, nmalloc, ndalloc; int expected = config_stats ? 
0 : ENOENT; - p = mallocx((1U << LG_LARGE_MINCLASS), MALLOCX_ARENA(0)); + p = mallocx((1U << sc_data_global.lg_large_minclass), MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)), diff --git a/test/unit/zero.c b/test/unit/zero.c index 553692b..20a7062 100644 --- a/test/unit/zero.c +++ b/test/unit/zero.c @@ -41,13 +41,14 @@ test_zero(size_t sz_min, size_t sz_max) { TEST_BEGIN(test_zero_small) { test_skip_if(!config_fill); - test_zero(1, SMALL_MAXCLASS-1); + test_zero(1, sc_data_global.small_maxclass - 1); } TEST_END TEST_BEGIN(test_zero_large) { test_skip_if(!config_fill); - test_zero(SMALL_MAXCLASS+1, (1U << (LG_LARGE_MINCLASS+1))); + test_zero(sc_data_global.small_maxclass + 1, + 1U << (sc_data_global.lg_large_minclass + 1)); } TEST_END -- cgit v0.12 From 07b89c76736313159e952648a9df3bdcfe57eda2 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Mon, 18 Dec 2017 17:45:21 -0800 Subject: Move quantum detection into its own file. This is logically fairly independent. --- .../jemalloc/internal/jemalloc_internal_types.h | 75 +-------------------- include/jemalloc/internal/quantum.h | 77 ++++++++++++++++++++++ 2 files changed, 79 insertions(+), 73 deletions(-) create mode 100644 include/jemalloc/internal/quantum.h diff --git a/include/jemalloc/internal/jemalloc_internal_types.h b/include/jemalloc/internal/jemalloc_internal_types.h index 1b750b1..e296c5a 100644 --- a/include/jemalloc/internal/jemalloc_internal_types.h +++ b/include/jemalloc/internal/jemalloc_internal_types.h @@ -1,6 +1,8 @@ #ifndef JEMALLOC_INTERNAL_TYPES_H #define JEMALLOC_INTERNAL_TYPES_H +#include "jemalloc/internal/quantum.h" + /* Page size index type. */ typedef unsigned pszind_t; @@ -50,79 +52,6 @@ typedef int malloc_cpuid_t; /* Smallest size class to support. */ #define TINY_MIN (1U << LG_TINY_MIN) -/* - * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size - * classes). - */ -#ifndef LG_QUANTUM -# if (defined(__i386__) || defined(_M_IX86)) -# define LG_QUANTUM 4 -# endif -# ifdef __ia64__ -# define LG_QUANTUM 4 -# endif -# ifdef __alpha__ -# define LG_QUANTUM 4 -# endif -# if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__)) -# define LG_QUANTUM 4 -# endif -# if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64)) -# define LG_QUANTUM 4 -# endif -# ifdef __arm__ -# define LG_QUANTUM 3 -# endif -# ifdef __aarch64__ -# define LG_QUANTUM 4 -# endif -# ifdef __hppa__ -# define LG_QUANTUM 4 -# endif -# ifdef __m68k__ -# define LG_QUANTUM 3 -# endif -# ifdef __mips__ -# define LG_QUANTUM 3 -# endif -# ifdef __nios2__ -# define LG_QUANTUM 3 -# endif -# ifdef __or1k__ -# define LG_QUANTUM 3 -# endif -# ifdef __powerpc__ -# define LG_QUANTUM 4 -# endif -# if defined(__riscv) || defined(__riscv__) -# define LG_QUANTUM 4 -# endif -# ifdef __s390__ -# define LG_QUANTUM 4 -# endif -# if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \ - defined(__SH4_SINGLE_ONLY__)) -# define LG_QUANTUM 4 -# endif -# ifdef __tile__ -# define LG_QUANTUM 4 -# endif -# ifdef __le32__ -# define LG_QUANTUM 4 -# endif -# ifndef LG_QUANTUM -# error "Unknown minimum alignment for architecture; specify via " - "--with-lg-quantum" -# endif -#endif - -#define QUANTUM ((size_t)(1U << LG_QUANTUM)) -#define QUANTUM_MASK (QUANTUM - 1) - -/* Return the smallest quantum multiple that is >= a. 
*/ -#define QUANTUM_CEILING(a) \ - (((a) + QUANTUM_MASK) & ~QUANTUM_MASK) - #define LONG ((size_t)(1U << LG_SIZEOF_LONG)) #define LONG_MASK (LONG - 1) diff --git a/include/jemalloc/internal/quantum.h b/include/jemalloc/internal/quantum.h new file mode 100644 index 0000000..821086e --- /dev/null +++ b/include/jemalloc/internal/quantum.h @@ -0,0 +1,77 @@ +#ifndef JEMALLOC_INTERNAL_QUANTUM_H +#define JEMALLOC_INTERNAL_QUANTUM_H + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +#ifndef LG_QUANTUM +# if (defined(__i386__) || defined(_M_IX86)) +# define LG_QUANTUM 4 +# endif +# ifdef __ia64__ +# define LG_QUANTUM 4 +# endif +# ifdef __alpha__ +# define LG_QUANTUM 4 +# endif +# if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__)) +# define LG_QUANTUM 4 +# endif +# if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64)) +# define LG_QUANTUM 4 +# endif +# ifdef __arm__ +# define LG_QUANTUM 3 +# endif +# ifdef __aarch64__ +# define LG_QUANTUM 4 +# endif +# ifdef __hppa__ +# define LG_QUANTUM 4 +# endif +# ifdef __m68k__ +# define LG_QUANTUM 3 +# endif +# ifdef __mips__ +# define LG_QUANTUM 3 +# endif +# ifdef __nios2__ +# define LG_QUANTUM 3 +# endif +# ifdef __or1k__ +# define LG_QUANTUM 3 +# endif +# ifdef __powerpc__ +# define LG_QUANTUM 4 +# endif +# if defined(__riscv) || defined(__riscv__) +# define LG_QUANTUM 4 +# endif +# ifdef __s390__ +# define LG_QUANTUM 4 +# endif +# if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \ + defined(__SH4_SINGLE_ONLY__)) +# define LG_QUANTUM 4 +# endif +# ifdef __tile__ +# define LG_QUANTUM 4 +# endif +# ifdef __le32__ +# define LG_QUANTUM 4 +# endif +# ifndef LG_QUANTUM +# error "Unknown minimum alignment for architecture; specify via " + "--with-lg-quantum" +# endif +#endif + +#define QUANTUM ((size_t)(1U << LG_QUANTUM)) +#define QUANTUM_MASK (QUANTUM - 1) + +/* Return the smallest quantum multiple that is >= a. */ +#define QUANTUM_CEILING(a) \ + (((a) + QUANTUM_MASK) & ~QUANTUM_MASK) + +#endif /* JEMALLOC_INTERNAL_QUANTUM_H */ -- cgit v0.12 From 2f07e92adb7060045e9e8601126e5ec071091c42 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Fri, 22 Dec 2017 15:14:44 -0800 Subject: Add lg_ceil to bit_util. Also, add the bit_util test back to the Makefile. --- Makefile.in | 1 + include/jemalloc/internal/bit_util.h | 87 +++++++++--------------------------- test/unit/bit_util.c | 56 ++++++++++++++++++++++- 3 files changed, 76 insertions(+), 68 deletions(-) diff --git a/Makefile.in b/Makefile.in index 619aae7..a747d6e 100644 --- a/Makefile.in +++ b/Makefile.in @@ -167,6 +167,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/background_thread_enable.c \ $(srcroot)test/unit/base.c \ $(srcroot)test/unit/bitmap.c \ + $(srcroot)test/unit/bit_util.c \ $(srcroot)test/unit/ckh.c \ $(srcroot)test/unit/decay.c \ $(srcroot)test/unit/div.c \ diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h index 435b497..521f71b 100644 --- a/include/jemalloc/internal/bit_util.h +++ b/include/jemalloc/internal/bit_util.h @@ -160,74 +160,27 @@ lg_floor(size_t x) { } #endif +BIT_UTIL_INLINE unsigned +lg_ceil(size_t x) { + return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1); +} + #undef BIT_UTIL_INLINE -/* A compile-time version of lg_ceil */ -#define LG_CEIL(x) ( \ - (x) <= (1ULL << 0ULL) ? 0 : \ - (x) <= (1ULL << 1ULL) ? 1 : \ - (x) <= (1ULL << 2ULL) ? 2 : \ - (x) <= (1ULL << 3ULL) ? 3 : \ - (x) <= (1ULL << 4ULL) ? 4 : \ - (x) <= (1ULL << 5ULL) ? 
5 : \ - (x) <= (1ULL << 6ULL) ? 6 : \ - (x) <= (1ULL << 7ULL) ? 7 : \ - (x) <= (1ULL << 8ULL) ? 8 : \ - (x) <= (1ULL << 9ULL) ? 9 : \ - (x) <= (1ULL << 10ULL) ? 10 : \ - (x) <= (1ULL << 11ULL) ? 11 : \ - (x) <= (1ULL << 12ULL) ? 12 : \ - (x) <= (1ULL << 13ULL) ? 13 : \ - (x) <= (1ULL << 14ULL) ? 14 : \ - (x) <= (1ULL << 15ULL) ? 15 : \ - (x) <= (1ULL << 16ULL) ? 16 : \ - (x) <= (1ULL << 17ULL) ? 17 : \ - (x) <= (1ULL << 18ULL) ? 18 : \ - (x) <= (1ULL << 19ULL) ? 19 : \ - (x) <= (1ULL << 20ULL) ? 20 : \ - (x) <= (1ULL << 21ULL) ? 21 : \ - (x) <= (1ULL << 22ULL) ? 22 : \ - (x) <= (1ULL << 23ULL) ? 23 : \ - (x) <= (1ULL << 24ULL) ? 24 : \ - (x) <= (1ULL << 25ULL) ? 25 : \ - (x) <= (1ULL << 26ULL) ? 26 : \ - (x) <= (1ULL << 27ULL) ? 27 : \ - (x) <= (1ULL << 28ULL) ? 28 : \ - (x) <= (1ULL << 29ULL) ? 29 : \ - (x) <= (1ULL << 30ULL) ? 30 : \ - (x) <= (1ULL << 31ULL) ? 31 : \ - (x) <= (1ULL << 32ULL) ? 32 : \ - (x) <= (1ULL << 33ULL) ? 33 : \ - (x) <= (1ULL << 34ULL) ? 34 : \ - (x) <= (1ULL << 35ULL) ? 35 : \ - (x) <= (1ULL << 36ULL) ? 36 : \ - (x) <= (1ULL << 37ULL) ? 37 : \ - (x) <= (1ULL << 38ULL) ? 38 : \ - (x) <= (1ULL << 39ULL) ? 39 : \ - (x) <= (1ULL << 40ULL) ? 40 : \ - (x) <= (1ULL << 41ULL) ? 41 : \ - (x) <= (1ULL << 42ULL) ? 42 : \ - (x) <= (1ULL << 43ULL) ? 43 : \ - (x) <= (1ULL << 44ULL) ? 44 : \ - (x) <= (1ULL << 45ULL) ? 45 : \ - (x) <= (1ULL << 46ULL) ? 46 : \ - (x) <= (1ULL << 47ULL) ? 47 : \ - (x) <= (1ULL << 48ULL) ? 48 : \ - (x) <= (1ULL << 49ULL) ? 49 : \ - (x) <= (1ULL << 50ULL) ? 50 : \ - (x) <= (1ULL << 51ULL) ? 51 : \ - (x) <= (1ULL << 52ULL) ? 52 : \ - (x) <= (1ULL << 53ULL) ? 53 : \ - (x) <= (1ULL << 54ULL) ? 54 : \ - (x) <= (1ULL << 55ULL) ? 55 : \ - (x) <= (1ULL << 56ULL) ? 56 : \ - (x) <= (1ULL << 57ULL) ? 57 : \ - (x) <= (1ULL << 58ULL) ? 58 : \ - (x) <= (1ULL << 59ULL) ? 59 : \ - (x) <= (1ULL << 60ULL) ? 60 : \ - (x) <= (1ULL << 61ULL) ? 61 : \ - (x) <= (1ULL << 62ULL) ? 62 : \ - (x) <= (1ULL << 63ULL) ? 63 : \ - 64) +/* A compile-time version of lg_floor and lg_ceil. */ +#define LG_FLOOR_1(x) 0 +#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1)) +#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2)) +#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4)) +#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8)) +#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16)) +#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32)) +#if LG_SIZEOF_PTR == 2 +# define LG_FLOOR(x) LG_FLOOR_32((x)) +#else +# define LG_FLOOR(x) LG_FLOOR_64((x)) +#endif + +#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 
0 : 1)) #endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */ diff --git a/test/unit/bit_util.c b/test/unit/bit_util.c index 42a9701..b747deb 100644 --- a/test/unit/bit_util.c +++ b/test/unit/bit_util.c @@ -48,10 +48,64 @@ TEST_BEGIN(test_pow2_ceil_zu) { } TEST_END +void +assert_lg_ceil_range(size_t input, unsigned answer) { + if (input == 1) { + assert_u_eq(0, answer, "Got %u as lg_ceil of 1", answer); + return; + } + assert_zu_le(input, (ZU(1) << answer), + "Got %u as lg_ceil of %zu", answer, input); + assert_zu_gt(input, (ZU(1) << (answer - 1)), + "Got %u as lg_ceil of %zu", answer, input); +} + +void +assert_lg_floor_range(size_t input, unsigned answer) { + if (input == 1) { + assert_u_eq(0, answer, "Got %u as lg_floor of 1", answer); + return; + } + assert_zu_ge(input, (ZU(1) << answer), + "Got %u as lg_floor of %zu", answer, input); + assert_zu_lt(input, (ZU(1) << (answer + 1)), + "Got %u as lg_floor of %zu", answer, input); +} + +TEST_BEGIN(test_lg_ceil_floor) { + for (size_t i = 1; i < 10 * 1000 * 1000; i++) { + assert_lg_ceil_range(i, lg_ceil(i)); + assert_lg_ceil_range(i, LG_CEIL(i)); + assert_lg_floor_range(i, lg_floor(i)); + assert_lg_floor_range(i, LG_FLOOR(i)); + } + for (int i = 10; i < 8 * (1 << LG_SIZEOF_PTR) - 5; i++) { + for (size_t j = 0; j < (1 << 4); j++) { + size_t num1 = ((size_t)1 << i) + - j * ((size_t)1 << (i - 4)); + size_t num2 = ((size_t)1 << i) + + j * ((size_t)1 << (i - 4)); + assert_zu_ne(num1, 0, "Invalid lg argument"); + assert_zu_ne(num2, 0, "Invalid lg argument"); + assert_lg_ceil_range(num1, lg_ceil(num1)); + assert_lg_ceil_range(num1, LG_CEIL(num1)); + assert_lg_ceil_range(num2, lg_ceil(num2)); + assert_lg_ceil_range(num2, LG_CEIL(num2)); + + assert_lg_floor_range(num1, lg_floor(num1)); + assert_lg_floor_range(num1, LG_FLOOR(num1)); + assert_lg_floor_range(num2, lg_floor(num2)); + assert_lg_floor_range(num2, LG_FLOOR(num2)); + } + } +} +TEST_END + int main(void) { return test( test_pow2_ceil_u64, test_pow2_ceil_u32, - test_pow2_ceil_zu); + test_pow2_ceil_zu, + test_lg_ceil_floor); } -- cgit v0.12 From 4f55c0ec220ae97eb5bc7e2bebc07d5c6100fa83 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Fri, 22 Dec 2017 15:01:34 -0800 Subject: Translate size class computation from bash shell into C. This is the last big step in making size classes a runtime computation rather than a configure-time one. The compile-time computation has been left in, for now, to allow assertion checking that the results are identical. --- src/sc.c | 305 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 272 insertions(+), 33 deletions(-) diff --git a/src/sc.c b/src/sc.c index 943d787..1d343d3 100644 --- a/src/sc.c +++ b/src/sc.c @@ -1,62 +1,301 @@ #include "jemalloc/internal/jemalloc_preamble.h" #include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bit_util.h" #include "jemalloc/internal/sc.h" -#include "jemalloc/internal/size_classes.h" sc_data_t sc_data_global; +static size_t +reg_size_compute(int lg_base, int lg_delta, int ndelta) { + return (ZU(1) << lg_base) + (ZU(ndelta) << lg_delta); +} + +/* Returns the number of pages in the slab. 
*/ +static int +slab_size(int lg_page, int lg_base, int lg_delta, int ndelta) { + size_t page = (ZU(1) << lg_page); + size_t reg_size = reg_size_compute(lg_base, lg_delta, ndelta); + + size_t try_slab_size = page; + size_t try_nregs = try_slab_size / reg_size; + size_t perfect_slab_size = 0; + bool perfect = false; + /* + * This loop continues until we find the least common multiple of the + * page size and size class size. Size classes are all of the form + * base + ndelta * delta == (ndelta + base/ndelta) * delta, which is + * (ndelta + ngroup) * delta. The way we choose slabbing strategies + * means that delta is at most the page size and ndelta < ngroup. So + * the loop executes for at most 2 * ngroup - 1 iterations, which is + * also the bound on the number of pages in a slab chosen by default. + * With the current default settings, this is at most 7. + */ + while (!perfect) { + perfect_slab_size = try_slab_size; + size_t perfect_nregs = try_nregs; + try_slab_size += page; + try_nregs = try_slab_size / reg_size; + if (perfect_slab_size == perfect_nregs * reg_size) { + perfect = true; + } + } + return (int)(perfect_slab_size / page); +} + static void -fill_sc(sc_data_t *data, int index, int lg_base, int lg_delta, int ndelta, - bool psz, bool bin, int pgs, int lg_delta_lookup) { - sc_t *sc = &data->sc[index]; +size_class( + /* Output. */ + sc_t *sc, + /* Configuration decisions. */ + int lg_max_lookup, int lg_page, int lg_ngroup, + /* Inputs specific to the size class. */ + int index, int lg_base, int lg_delta, int ndelta) { sc->index = index; sc->lg_base = lg_base; sc->lg_delta = lg_delta; sc->ndelta = ndelta; - sc->psz = psz; - sc->bin = bin; - sc->pgs = pgs; - sc->lg_delta_lookup = lg_delta_lookup; + sc->psz = (reg_size_compute(lg_base, lg_delta, ndelta) + % (ZU(1) << lg_page) == 0); + size_t size = (ZU(1) << lg_base) + (ZU(ndelta) << lg_delta); + if (index == 0) { + assert(!sc->psz); + } + if (size < (ZU(1) << (lg_page + lg_ngroup))) { + sc->bin = true; + sc->pgs = slab_size(lg_page, lg_base, lg_delta, ndelta); + } else { + sc->bin = false; + sc->pgs = 0; + } + if (size <= (ZU(1) << lg_max_lookup)) { + sc->lg_delta_lookup = lg_delta; + } else { + sc->lg_delta_lookup = 0; + } +} + +static void +size_classes( + /* Output. */ + sc_data_t *sc_data, + /* Determined by the system. */ + size_t lg_ptr_size, int lg_quantum, + /* Configuration decisions. */ + int lg_tiny_min, int lg_max_lookup, int lg_page, int lg_ngroup) { + int ptr_bits = (1 << lg_ptr_size) * 8; + int ngroup = (1 << lg_ngroup); + int ntiny = 0; + int nlbins = 0; + int lg_tiny_maxclass = (unsigned)-1; + int nbins = 0; + int npsizes = 0; + + int index = 0; + + int ndelta = 0; + int lg_base = lg_tiny_min; + int lg_delta = lg_base; + + /* Outputs that we update as we go. */ + size_t lookup_maxclass = 0; + size_t small_maxclass = 0; + int lg_large_minclass = 0; + size_t large_maxclass = 0; + + /* Tiny size classes. */ + while (lg_base < lg_quantum) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + if (sc->lg_delta_lookup != 0) { + nlbins = index + 1; + } + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + ntiny++; + /* Final written value is correct. */ + lg_tiny_maxclass = lg_base; + index++; + lg_delta = lg_base; + lg_base++; + } + + /* First non-tiny (pseudo) group. */ + if (ntiny != 0) { + sc_t *sc = &sc_data->sc[index]; + /* + * See the note in sc.h; the first non-tiny size class has an + * unusual encoding. 
+ */ + lg_base--; + ndelta = 1; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + index++; + lg_base++; + lg_delta++; + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + } + while (ndelta < ngroup) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + index++; + ndelta++; + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + } + + /* All remaining groups. */ + lg_base = lg_base + lg_ngroup; + while (lg_base < ptr_bits - 1) { + ndelta = 1; + int ndelta_limit; + if (lg_base == ptr_bits - 2) { + ndelta_limit = ngroup - 1; + } else { + ndelta_limit = ngroup; + } + while (ndelta <= ndelta_limit) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + if (sc->lg_delta_lookup != 0) { + nlbins = index + 1; + /* Final written value is correct. */ + lookup_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + } + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + /* Final written value is correct. */ + small_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + if (lg_ngroup > 0) { + lg_large_minclass = lg_base + 1; + } else { + lg_large_minclass = lg_base + 2; + } + } + large_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + index++; + ndelta++; + } + lg_base++; + lg_delta++; + } + /* Additional outputs. */ + int nsizes = index; + unsigned lg_ceil_nsizes = lg_ceil(nsizes); + + /* Fill in the output data. */ + sc_data->ntiny = ntiny; + sc_data->nlbins = nlbins; + sc_data->nbins = nbins; + sc_data->nsizes = nsizes; + sc_data->lg_ceil_nsizes = lg_ceil_nsizes; + sc_data->npsizes = npsizes; + sc_data->lg_tiny_maxclass = lg_tiny_maxclass; + sc_data->lookup_maxclass = lookup_maxclass; + sc_data->small_maxclass = small_maxclass; + sc_data->lg_large_minclass = lg_large_minclass; + sc_data->large_minclass = (ZU(1) << lg_large_minclass); + sc_data->large_maxclass = large_maxclass; } +/* + * Defined later (after size_classes.h becomes visible), but called during + * initialization. + */ +static void sc_data_assert(sc_data_t *sc_data); + void -sc_data_init(sc_data_t *data) { +sc_data_init(sc_data_t *sc_data) { + assert(!sc_data->initialized); + + int lg_max_lookup = 12; + + size_classes(sc_data, LG_SIZEOF_PTR, LG_QUANTUM, SC_LG_TINY_MIN, + lg_max_lookup, LG_PAGE, 2); + + sc_data->initialized = true; + + sc_data_assert(sc_data); +} + +void +sc_boot() { + sc_data_init(&sc_data_global); +} + +/* + * We don't include size_classes.h until this point, to ensure only the asserts + * can see it. 
+ */ +#include "jemalloc/internal/size_classes.h" + +static void +sc_assert(sc_t *sc, int index, int lg_base, int lg_delta, int ndelta, int psz, + int bin, int pgs, int lg_delta_lookup) { + assert(sc->index == index); + assert(sc->lg_base == lg_base); + assert(sc->lg_delta == lg_delta); + assert(sc->ndelta == ndelta); + assert(sc->psz == psz); + assert(sc->bin == bin); + assert(sc->pgs == pgs); + assert(sc->lg_delta_lookup == lg_delta_lookup); +} + +static void +sc_data_assert(sc_data_t *sc_data) { assert(SC_NTINY == NTBINS); assert(SC_NSIZES == NSIZES); assert(SC_NBINS == NBINS); assert(NPSIZES <= SC_NPSIZES_MAX); - assert(!data->initialized); - data->initialized = true; - data->ntiny = NTBINS; - data->nlbins = NLBINS; - data->nbins = NBINS; - data->nsizes = NSIZES; - data->lg_ceil_nsizes = LG_CEIL_NSIZES; - data->npsizes = NPSIZES; -#if SC_NTINY != 0 - data->lg_tiny_maxclass = LG_TINY_MAXCLASS; + assert(sc_data->ntiny == NTBINS); + assert(sc_data->nlbins == NLBINS); + assert(sc_data->nbins == NBINS); + assert(sc_data->nsizes == NSIZES); + assert(sc_data->lg_ceil_nsizes == LG_CEIL_NSIZES); + assert(sc_data->npsizes == NPSIZES); +#if NTBINS > 0 + assert(sc_data->lg_tiny_maxclass == LG_TINY_MAXCLASS); #else - data->lg_tiny_maxclass = -1; + assert(sc_data->lg_tiny_maxclass == -1); #endif - data->lookup_maxclass = LOOKUP_MAXCLASS; - data->small_maxclass = SMALL_MAXCLASS; - data->lg_large_minclass = LG_LARGE_MINCLASS; - data->large_minclass = LARGE_MINCLASS; - data->large_maxclass = LARGE_MAXCLASS; + assert(sc_data->lookup_maxclass == LOOKUP_MAXCLASS); + assert(sc_data->small_maxclass == SMALL_MAXCLASS); + assert(sc_data->lg_large_minclass == LG_LARGE_MINCLASS); + assert(sc_data->large_minclass == LARGE_MINCLASS); + assert(sc_data->large_maxclass == LARGE_MAXCLASS); + assert(sc_data->initialized); #define no 0 #define yes 1 -#define SC(index, lg_base_base, lg_delta, ndelta, psz, bin, pgs, \ +#define SC(index, lg_base, lg_delta, ndelta, psz, bin, pgs, \ lg_delta_lookup) \ - fill_sc(data, index, lg_base_base, lg_delta, ndelta, psz, bin, \ - pgs, lg_delta_lookup); + sc_assert(&sc_data->sc[index], index, lg_base, lg_delta, \ + ndelta, psz, bin, pgs, lg_delta_lookup); SIZE_CLASSES #undef no #undef yes #undef SC } - -void -sc_boot() { - sc_data_init(&sc_data_global); -} -- cgit v0.12 From 0552aad91b955db7ad1806907255e943af2fdb88 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Fri, 5 Jan 2018 13:11:44 -0800 Subject: Kill size_classes.sh. We've moved size class computations to boot time; they were being used only to check that the computations resulted in equal values. 
--- .gitignore | 1 - configure.ac | 12 - include/jemalloc/internal/size_classes.sh | 361 ------------------------------ src/sc.c | 62 ----- 4 files changed, 436 deletions(-) delete mode 100755 include/jemalloc/internal/size_classes.sh diff --git a/.gitignore b/.gitignore index 19199cc..5ca0ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -30,7 +30,6 @@ /include/jemalloc/internal/public_namespace.h /include/jemalloc/internal/public_symbols.txt /include/jemalloc/internal/public_unnamespace.h -/include/jemalloc/internal/size_classes.h /include/jemalloc/jemalloc.h /include/jemalloc/jemalloc_defs.h /include/jemalloc/jemalloc_macros.h diff --git a/configure.ac b/configure.ac index 1c20911..8727087 100644 --- a/configure.ac +++ b/configure.ac @@ -968,7 +968,6 @@ cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_symbols.sh" cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_namespace.sh" cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_namespace.sh" cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_unnamespace.sh" -cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/size_classes.sh" cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_rename.sh" cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_mangle.sh" cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc.sh" @@ -981,7 +980,6 @@ cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_symbols_jet.awk" cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_symbols.txt" cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_namespace.h" cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_unnamespace.h" -cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/size_classes.h" cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_protos_jet.h" cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_rename.h" cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle.h" @@ -2177,16 +2175,6 @@ AC_CONFIG_COMMANDS([include/jemalloc/internal/public_unnamespace.h], [ srcdir="${srcdir}" objroot="${objroot}" ]) -AC_CONFIG_COMMANDS([include/jemalloc/internal/size_classes.h], [ - mkdir -p "${objroot}include/jemalloc/internal" - "${SHELL}" "${srcdir}/include/jemalloc/internal/size_classes.sh" "${LG_QUANTA}" 3 "${LG_PAGE_SIZES}" 2 > "${objroot}include/jemalloc/internal/size_classes.h" -], [ - SHELL="${SHELL}" - srcdir="${srcdir}" - objroot="${objroot}" - LG_QUANTA="${LG_QUANTA}" - LG_PAGE_SIZES="${LG_PAGE_SIZES}" -]) AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_protos_jet.h], [ mkdir -p "${objroot}include/jemalloc" cat "${srcdir}/include/jemalloc/jemalloc_protos.h.in" | sed -e 's/@je_@/jet_/g' > "${objroot}include/jemalloc/jemalloc_protos_jet.h" diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh deleted file mode 100755 index 998994d..0000000 --- a/include/jemalloc/internal/size_classes.sh +++ /dev/null @@ -1,361 +0,0 @@ -#!/bin/sh -# -# Usage: size_classes.sh - -# The following limits are chosen such that they cover all supported platforms. - -# Pointer sizes. -lg_zarr="2 3" - -# Quanta. -lg_qarr=$1 - -# The range of tiny size classes is [2^lg_tmin..2^(lg_q-1)]. -lg_tmin=$2 - -# Maximum lookup size. -lg_kmax=12 - -# Page sizes. -lg_parr=`echo $3 | tr ',' ' '` - -# Size class group size (number of size classes for each size doubling). 
-lg_g=$4 - -pow2() { - e=$1 - pow2_result=1 - while [ ${e} -gt 0 ] ; do - pow2_result=$((${pow2_result} + ${pow2_result})) - e=$((${e} - 1)) - done -} - -lg() { - x=$1 - lg_result=0 - while [ ${x} -gt 1 ] ; do - lg_result=$((${lg_result} + 1)) - x=$((${x} / 2)) - done -} - -lg_ceil() { - y=$1 - lg ${y}; lg_floor=${lg_result} - pow2 ${lg_floor}; pow2_floor=${pow2_result} - if [ ${pow2_floor} -lt ${y} ] ; then - lg_ceil_result=$((${lg_floor} + 1)) - else - lg_ceil_result=${lg_floor} - fi -} - -reg_size_compute() { - lg_grp=$1 - lg_delta=$2 - ndelta=$3 - - pow2 ${lg_grp}; grp=${pow2_result} - pow2 ${lg_delta}; delta=${pow2_result} - reg_size=$((${grp} + ${delta}*${ndelta})) -} - -slab_size() { - lg_p=$1 - lg_grp=$2 - lg_delta=$3 - ndelta=$4 - - pow2 ${lg_p}; p=${pow2_result} - reg_size_compute ${lg_grp} ${lg_delta} ${ndelta} - - # Compute smallest slab size that is an integer multiple of reg_size. - try_slab_size=${p} - try_nregs=$((${try_slab_size} / ${reg_size})) - perfect=0 - while [ ${perfect} -eq 0 ] ; do - perfect_slab_size=${try_slab_size} - perfect_nregs=${try_nregs} - - try_slab_size=$((${try_slab_size} + ${p})) - try_nregs=$((${try_slab_size} / ${reg_size})) - if [ ${perfect_slab_size} -eq $((${perfect_nregs} * ${reg_size})) ] ; then - perfect=1 - fi - done - - slab_size_pgs=$((${perfect_slab_size} / ${p})) -} - -size_class() { - index=$1 - lg_grp=$2 - lg_delta=$3 - ndelta=$4 - lg_p=$5 - lg_kmax=$6 - - if [ ${lg_delta} -ge ${lg_p} ] ; then - psz="yes" - else - pow2 ${lg_p}; p=${pow2_result} - pow2 ${lg_grp}; grp=${pow2_result} - pow2 ${lg_delta}; delta=${pow2_result} - sz=$((${grp} + ${delta} * ${ndelta})) - npgs=$((${sz} / ${p})) - if [ ${sz} -eq $((${npgs} * ${p})) ] ; then - psz="yes" - else - psz="no" - fi - fi - - lg ${ndelta}; lg_ndelta=${lg_result}; pow2 ${lg_ndelta} - if [ ${pow2_result} -lt ${ndelta} ] ; then - rem="yes" - else - rem="no" - fi - - lg_size=${lg_grp} - if [ $((${lg_delta} + ${lg_ndelta})) -eq ${lg_grp} ] ; then - lg_size=$((${lg_grp} + 1)) - else - lg_size=${lg_grp} - rem="yes" - fi - - if [ ${lg_size} -lt $((${lg_p} + ${lg_g})) ] ; then - bin="yes" - slab_size ${lg_p} ${lg_grp} ${lg_delta} ${ndelta}; pgs=${slab_size_pgs} - else - bin="no" - pgs=0 - fi - if [ ${lg_size} -lt ${lg_kmax} \ - -o ${lg_size} -eq ${lg_kmax} -a ${rem} = "no" ] ; then - lg_delta_lookup=${lg_delta} - else - lg_delta_lookup="no" - fi - printf ' SC(%3d, %6d, %8d, %6d, %3s, %3s, %3d, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${psz} ${bin} ${pgs} ${lg_delta_lookup} - # Defined upon return: - # - psz ("yes" or "no") - # - bin ("yes" or "no") - # - pgs - # - lg_delta_lookup (${lg_delta} or "no") -} - -sep_line() { - echo " \\" -} - -size_classes() { - lg_z=$1 - lg_q=$2 - lg_t=$3 - lg_p=$4 - lg_g=$5 - - pow2 $((${lg_z} + 3)); ptr_bits=${pow2_result} - pow2 ${lg_g}; g=${pow2_result} - - echo "#define SIZE_CLASSES \\" - echo " /* index, lg_grp, lg_delta, ndelta, psz, bin, pgs, lg_delta_lookup */ \\" - - ntbins=0 - nlbins=0 - lg_tiny_maxclass='"NA"' - nbins=0 - npsizes=0 - - # Tiny size classes. - ndelta=0 - index=0 - lg_grp=${lg_t} - lg_delta=${lg_grp} - while [ ${lg_grp} -lt ${lg_q} ] ; do - size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} - if [ ${lg_delta_lookup} != "no" ] ; then - nlbins=$((${index} + 1)) - fi - if [ ${psz} = "yes" ] ; then - npsizes=$((${npsizes} + 1)) - fi - if [ ${bin} != "no" ] ; then - nbins=$((${index} + 1)) - fi - ntbins=$((${ntbins} + 1)) - lg_tiny_maxclass=${lg_grp} # Final written value is correct. 
- index=$((${index} + 1)) - lg_delta=${lg_grp} - lg_grp=$((${lg_grp} + 1)) - done - - # First non-tiny group. - if [ ${ntbins} -gt 0 ] ; then - sep_line - # The first size class has an unusual encoding, because the size has to be - # split between grp and delta*ndelta. - lg_grp=$((${lg_grp} - 1)) - ndelta=1 - size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} - index=$((${index} + 1)) - lg_grp=$((${lg_grp} + 1)) - lg_delta=$((${lg_delta} + 1)) - if [ ${psz} = "yes" ] ; then - npsizes=$((${npsizes} + 1)) - fi - fi - while [ ${ndelta} -lt ${g} ] ; do - size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} - index=$((${index} + 1)) - ndelta=$((${ndelta} + 1)) - if [ ${psz} = "yes" ] ; then - npsizes=$((${npsizes} + 1)) - fi - done - - # All remaining groups. - lg_grp=$((${lg_grp} + ${lg_g})) - while [ ${lg_grp} -lt $((${ptr_bits} - 1)) ] ; do - sep_line - ndelta=1 - if [ ${lg_grp} -eq $((${ptr_bits} - 2)) ] ; then - ndelta_limit=$((${g} - 1)) - else - ndelta_limit=${g} - fi - while [ ${ndelta} -le ${ndelta_limit} ] ; do - size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} - if [ ${lg_delta_lookup} != "no" ] ; then - nlbins=$((${index} + 1)) - # Final written value is correct: - lookup_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" - fi - if [ ${psz} = "yes" ] ; then - npsizes=$((${npsizes} + 1)) - fi - if [ ${bin} != "no" ] ; then - nbins=$((${index} + 1)) - # Final written value is correct: - small_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" - if [ ${lg_g} -gt 0 ] ; then - lg_large_minclass=$((${lg_grp} + 1)) - else - lg_large_minclass=$((${lg_grp} + 2)) - fi - fi - # Final written value is correct: - large_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" - index=$((${index} + 1)) - ndelta=$((${ndelta} + 1)) - done - lg_grp=$((${lg_grp} + 1)) - lg_delta=$((${lg_delta} + 1)) - done - echo - nsizes=${index} - lg_ceil ${nsizes}; lg_ceil_nsizes=${lg_ceil_result} - - # Defined upon completion: - # - ntbins - # - nlbins - # - nbins - # - nsizes - # - lg_ceil_nsizes - # - npsizes - # - lg_tiny_maxclass - # - lookup_maxclass - # - small_maxclass - # - lg_large_minclass - # - large_maxclass -} - -cat < 256) -# error "Too many small size classes" -#endif - -#endif /* JEMALLOC_INTERNAL_SIZE_CLASSES_H */ -EOF diff --git a/src/sc.c b/src/sc.c index 1d343d3..61e1197 100644 --- a/src/sc.c +++ b/src/sc.c @@ -221,12 +221,6 @@ size_classes( sc_data->large_maxclass = large_maxclass; } -/* - * Defined later (after size_classes.h becomes visible), but called during - * initialization. - */ -static void sc_data_assert(sc_data_t *sc_data); - void sc_data_init(sc_data_t *sc_data) { assert(!sc_data->initialized); @@ -237,65 +231,9 @@ sc_data_init(sc_data_t *sc_data) { lg_max_lookup, LG_PAGE, 2); sc_data->initialized = true; - - sc_data_assert(sc_data); } void sc_boot() { sc_data_init(&sc_data_global); } - -/* - * We don't include size_classes.h until this point, to ensure only the asserts - * can see it. 
- */ -#include "jemalloc/internal/size_classes.h" - -static void -sc_assert(sc_t *sc, int index, int lg_base, int lg_delta, int ndelta, int psz, - int bin, int pgs, int lg_delta_lookup) { - assert(sc->index == index); - assert(sc->lg_base == lg_base); - assert(sc->lg_delta == lg_delta); - assert(sc->ndelta == ndelta); - assert(sc->psz == psz); - assert(sc->bin == bin); - assert(sc->pgs == pgs); - assert(sc->lg_delta_lookup == lg_delta_lookup); -} - -static void -sc_data_assert(sc_data_t *sc_data) { - assert(SC_NTINY == NTBINS); - assert(SC_NSIZES == NSIZES); - assert(SC_NBINS == NBINS); - assert(NPSIZES <= SC_NPSIZES_MAX); - assert(sc_data->ntiny == NTBINS); - assert(sc_data->nlbins == NLBINS); - assert(sc_data->nbins == NBINS); - assert(sc_data->nsizes == NSIZES); - assert(sc_data->lg_ceil_nsizes == LG_CEIL_NSIZES); - assert(sc_data->npsizes == NPSIZES); -#if NTBINS > 0 - assert(sc_data->lg_tiny_maxclass == LG_TINY_MAXCLASS); -#else - assert(sc_data->lg_tiny_maxclass == -1); -#endif - assert(sc_data->lookup_maxclass == LOOKUP_MAXCLASS); - assert(sc_data->small_maxclass == SMALL_MAXCLASS); - assert(sc_data->lg_large_minclass == LG_LARGE_MINCLASS); - assert(sc_data->large_minclass == LARGE_MINCLASS); - assert(sc_data->large_maxclass == LARGE_MAXCLASS); - assert(sc_data->initialized); -#define no 0 -#define yes 1 -#define SC(index, lg_base, lg_delta, ndelta, psz, bin, pgs, \ - lg_delta_lookup) \ - sc_assert(&sc_data->sc[index], index, lg_base, lg_delta, \ - ndelta, psz, bin, pgs, lg_delta_lookup); - SIZE_CLASSES -#undef no -#undef yes -#undef SC -} -- cgit v0.12 From 5b7fc9056c8114d0774282d293cd5c9cce4ff931 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Fri, 5 Jan 2018 13:33:37 -0800 Subject: Remove the --with-lg-page-sizes configure option. This appears to be unused. --- INSTALL.md | 7 ------- configure.ac | 5 ----- 2 files changed, 12 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index ef328c6..18cf288 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -221,13 +221,6 @@ any of the following arguments (not a definitive list) to 'configure': system page size may change between configuration and execution, e.g. when cross compiling. -* `--with-lg-page-sizes=` - - Specify the comma-separated base 2 logs of the page sizes to support. This - option may be useful when cross compiling in combination with - `--with-lg-page`, but its primary use case is for integration with FreeBSD's - libc, wherein jemalloc is embedded. - * `--with-lg-hugepage=` Specify the base 2 log of the system huge page size. This option is useful diff --git a/configure.ac b/configure.ac index 8727087..e18bc4b 100644 --- a/configure.ac +++ b/configure.ac @@ -1430,11 +1430,6 @@ if test "x${LG_PAGE}" != "xundefined" -a \ fi AC_DEFINE_UNQUOTED([LG_HUGEPAGE], [${je_cv_lg_hugepage}]) -AC_ARG_WITH([lg_page_sizes], - [AS_HELP_STRING([--with-lg-page-sizes=], - [Base 2 logs of system page sizes to support])], - [LG_PAGE_SIZES="$with_lg_page_sizes"], [LG_PAGE_SIZES="$LG_PAGE"]) - dnl ============================================================================ dnl jemalloc configuration. dnl -- cgit v0.12 From 017dca198c74792967771d00b7501beade5b6fd0 Mon Sep 17 00:00:00 2001 From: "David T. Goldblatt" Date: Wed, 18 Apr 2018 19:36:40 -0700 Subject: SC module: Add a note on style. 
---
 src/sc.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/sc.c b/src/sc.c
index 61e1197..f7458c5 100644
--- a/src/sc.c
+++ b/src/sc.c
@@ -4,6 +4,13 @@
 #include "jemalloc/internal/bit_util.h"
 #include "jemalloc/internal/sc.h"
 
+/*
+ * This module computes the size classes used to satisfy allocations. The logic
+ * here was ported more or less line-by-line from a shell script, and because of
+ * that is not the most idiomatic C. Eventually we should fix this, but for now
+ * at least the damage is compartmentalized to this file.
+ */
+
 sc_data_t sc_data_global;
 
 static size_t
-- 
cgit v0.12


From a7f68aed3ef53a194f6b932b92bddd8c84c43de4 Mon Sep 17 00:00:00 2001
From: "David T. Goldblatt"
Date: Wed, 18 Apr 2018 20:32:12 -0700
Subject: SC: Add page customization functionality.

---
 Makefile.in                    |  1 +
 include/jemalloc/internal/sc.h |  6 ++++++
 src/sc.c                       | 42 ++++++++++++++++++++++++++++++++++++++++++
 test/unit/sc.c                 | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 82 insertions(+)
 create mode 100644 test/unit/sc.c

diff --git a/Makefile.in b/Makefile.in
index a747d6e..05f67d9 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -204,6 +204,7 @@ TESTS_UNIT := \
 	$(srcroot)test/unit/rtree.c \
 	$(srcroot)test/unit/seq.c \
 	$(srcroot)test/unit/SFMT.c \
+	$(srcroot)test/unit/sc.c \
 	$(srcroot)test/unit/size_classes.c \
 	$(srcroot)test/unit/slab.c \
 	$(srcroot)test/unit/smoothstep.c \
diff --git a/include/jemalloc/internal/sc.h b/include/jemalloc/internal/sc.h
index df295bc..592115a 100644
--- a/include/jemalloc/internal/sc.h
+++ b/include/jemalloc/internal/sc.h
@@ -297,6 +297,12 @@ struct sc_data_s {
 extern sc_data_t sc_data_global;
 
 void sc_data_init(sc_data_t *data);
+/*
+ * Updates slab sizes in [begin, end] to be pgs pages in length, if possible.
+ * Otherwise, does its best to accommodate the request.
+ */
+void sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end,
+    int pgs);
 void sc_boot();
 
 #endif /* JEMALLOC_INTERNAL_SC_H */
diff --git a/src/sc.c b/src/sc.c
index f7458c5..e8eef1c 100644
--- a/src/sc.c
+++ b/src/sc.c
@@ -2,6 +2,8 @@
 
 #include "jemalloc/internal/assert.h"
 #include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/bitmap.h"
+#include "jemalloc/internal/pages.h"
 #include "jemalloc/internal/sc.h"
 
 /*
@@ -240,6 +242,46 @@ sc_data_init(sc_data_t *sc_data) {
 	sc_data->initialized = true;
 }
 
+static void
+sc_data_update_sc_slab_size(sc_t *sc, size_t reg_size, size_t pgs_guess) {
+	size_t min_pgs = reg_size / PAGE;
+	if (reg_size % PAGE != 0) {
+		min_pgs++;
+	}
+	/*
+	 * BITMAP_MAXBITS is actually determined by putting the smallest
+	 * possible size-class on one page, so this can never be 0.
+	 */
+	size_t max_pgs = BITMAP_MAXBITS * reg_size / PAGE;
+
+	assert(min_pgs <= max_pgs);
+	assert(min_pgs > 0);
+	assert(max_pgs >= 1);
+	if (pgs_guess < min_pgs) {
+		sc->pgs = (int)min_pgs;
+	} else if (pgs_guess > max_pgs) {
+		sc->pgs = (int)max_pgs;
+	} else {
+		sc->pgs = (int)pgs_guess;
+	}
+}
+
+void
+sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end, int pgs) {
+	assert(data->initialized);
+	for (int i = 0; i < data->nsizes; i++) {
+		sc_t *sc = &data->sc[i];
+		if (!sc->bin) {
+			break;
+		}
+		size_t reg_size = reg_size_compute(sc->lg_base, sc->lg_delta,
+		    sc->ndelta);
+		if (begin <= reg_size && reg_size <= end) {
+			sc_data_update_sc_slab_size(sc, reg_size, pgs);
+		}
+	}
+}
+
 void
 sc_boot() {
 	sc_data_init(&sc_data_global);
diff --git a/test/unit/sc.c b/test/unit/sc.c
new file mode 100644
index 0000000..bf51d8e
--- /dev/null
+++ b/test/unit/sc.c
@@ -0,0 +1,33 @@
+#include "test/jemalloc_test.h"
+
+TEST_BEGIN(test_update_slab_size) {
+	sc_data_t data;
+	memset(&data, 0, sizeof(data));
+	sc_data_init(&data);
+	sc_t *tiny = &data.sc[0];
+	size_t tiny_size = (ZU(1) << tiny->lg_base)
+	    + (ZU(tiny->ndelta) << tiny->lg_delta);
+	size_t pgs_too_big = (tiny_size * BITMAP_MAXBITS + PAGE - 1) / PAGE + 1;
+	sc_data_update_slab_size(&data, tiny_size, tiny_size, (int)pgs_too_big);
+	assert_zu_lt((size_t)tiny->pgs, pgs_too_big, "Allowed excessive pages");
+
+	sc_data_update_slab_size(&data, 1, 10 * PAGE, 1);
+	for (int i = 0; i < data.nbins; i++) {
+		sc_t *sc = &data.sc[i];
+		size_t reg_size = (ZU(1) << sc->lg_base)
+		    + (ZU(sc->ndelta) << sc->lg_delta);
+		if (reg_size <= PAGE) {
+			assert_d_eq(sc->pgs, 1, "Ignored valid page size hint");
+		} else {
+			assert_d_gt(sc->pgs, 1,
+			    "Allowed invalid page size hint");
+		}
+	}
+}
+TEST_END
+
+int
+main(void) {
+	return test(
+	    test_update_slab_size);
+}
-- 
cgit v0.12


From 4610ffa942a00d80a8e8af2365069bed7d561415 Mon Sep 17 00:00:00 2001
From: "David T. Goldblatt"
Date: Fri, 20 Apr 2018 19:12:45 -0700
Subject: Bootstrapping: Parse MALLOC_CONF before using slab sizes.

I.e., parse before booting the bin module or sz module. This lets us tweak
size class settings before committing to them by letting them leak into
other modules.

This commit does not actually do any tweaking of the size classes; it *just*
changes bootstrapping order; this may help in bisecting any bootstrapping
failures on poorly-tested architectures.
---
 src/jemalloc.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/jemalloc.c b/src/jemalloc.c
index 664c5f8..902bf9c 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1295,14 +1295,21 @@ static bool
 malloc_init_hard_a0_locked() {
 	malloc_initializer = INITIALIZER;
 
+	/*
+	 * Ordering here is somewhat tricky; we need sc_boot() first, since that
+	 * determines what the size classes will be, and then
+	 * malloc_conf_init(), since any slab size tweaking will need to be done
+	 * before sz_boot and bin_boot, which assume that the values they read
+	 * out of sc_data_global are final.
+	 */
 	sc_boot();
+	malloc_conf_init();
 	sz_boot(&sc_data_global);
 	bin_boot(&sc_data_global);
 
 	if (config_prof) {
 		prof_boot0();
 	}
-	malloc_conf_init();
 	if (opt_stats_print) {
 		/* Print statistics at exit. */
 		if (atexit(stats_print_atexit) != 0) {
-- 
cgit v0.12


From 5112d9e5fd2a15d6b75523a3a4122b726fbae479 Mon Sep 17 00:00:00 2001
From: "David T. Goldblatt"
Date: Fri, 20 Apr 2018 21:11:03 -0700
Subject: Add MALLOC_CONF parsing for dynamic slab sizes.

This actually enables us to change the values.
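The option value is a '|'-separated list of segments of the form
start_size-end_size:pages, as parsed by malloc_conf_slab_sizes_next() below.
A hedged usage sketch (this assumes an unprefixed build, where jemalloc reads
the malloc_conf global during initialization; the ranges and page counts are
arbitrary examples):

    #include <stdlib.h>

    /* 1..4096-byte classes get 17-page slabs; 100..200-byte classes get 1. */
    const char *malloc_conf = "slab_sizes:1-4096:17|100-200:1";

    int
    main(void) {
        void *p = malloc(100);  /* Served from a bin sized by the override. */
        free(p);
        return 0;
    }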
--- Makefile.in | 1 + src/jemalloc.c | 68 +++++++++++++++++++++++++++++++++++ test/integration/slab_sizes.c | 80 ++++++++++++++++++++++++++++++++++++++++++ test/integration/slab_sizes.sh | 4 +++ 4 files changed, 153 insertions(+) create mode 100644 test/integration/slab_sizes.c create mode 100644 test/integration/slab_sizes.sh diff --git a/Makefile.in b/Makefile.in index 05f67d9..8b2f5ca 100644 --- a/Makefile.in +++ b/Makefile.in @@ -230,6 +230,7 @@ TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/posix_memalign.c \ $(srcroot)test/integration/rallocx.c \ $(srcroot)test/integration/sdallocx.c \ + $(srcroot)test/integration/slab_sizes.c \ $(srcroot)test/integration/thread_arena.c \ $(srcroot)test/integration/thread_tcache_enabled.c \ $(srcroot)test/integration/xallocx.c diff --git a/src/jemalloc.c b/src/jemalloc.c index 902bf9c..4ffe5aa 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -765,6 +765,49 @@ init_opt_stats_print_opts(const char *v, size_t vlen) { } static bool +malloc_conf_slab_sizes_next(const char **slab_size_segment_cur, + size_t *vlen_left, size_t *slab_start, size_t *slab_end, size_t *pgs) { + const char *cur = *slab_size_segment_cur; + char *end; + uintmax_t um; + + set_errno(0); + + /* First number, then '-' */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0 || *end != '-') { + return true; + } + *slab_start = (size_t)um; + cur = end + 1; + + /* Second number, then ':' */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0 || *end != ':') { + return true; + } + *slab_end = (size_t)um; + cur = end + 1; + + /* Last number */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0) { + return true; + } + *pgs = (size_t)um; + + /* Consume the separator if there is one. */ + if (*end == '|') { + end++; + } + + *vlen_left -= end - *slab_size_segment_cur; + *slab_size_segment_cur = end; + + return false; +} + +static bool malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, char const **v_p, size_t *vlen_p) { bool accept; @@ -1192,6 +1235,31 @@ malloc_conf_init(void) { "max_background_threads", 1, opt_max_background_threads, yes, yes, true); + if (CONF_MATCH("slab_sizes")) { + bool err; + const char *slab_size_segment_cur = v; + size_t vlen_left = vlen; + do { + size_t slab_start; + size_t slab_end; + size_t pgs; + err = malloc_conf_slab_sizes_next( + &slab_size_segment_cur, + &vlen_left, &slab_start, &slab_end, + &pgs); + if (!err) { + sc_data_update_slab_size( + &sc_data_global, slab_start, + slab_end, (int)pgs); + } else { + malloc_conf_error( + "Invalid settings for " + "slab_sizes", k, klen, v, + vlen); + } + } while (!err && vlen_left > 0); + continue; + } if (config_prof) { CONF_HANDLE_BOOL(opt_prof, "prof") CONF_HANDLE_CHAR_P(opt_prof_prefix, diff --git a/test/integration/slab_sizes.c b/test/integration/slab_sizes.c new file mode 100644 index 0000000..af250c3 --- /dev/null +++ b/test/integration/slab_sizes.c @@ -0,0 +1,80 @@ +#include "test/jemalloc_test.h" + +/* Note that this test relies on the unusual slab sizes set in slab_sizes.sh. 
*/ + +TEST_BEGIN(test_slab_sizes) { + unsigned nbins; + size_t page; + size_t sizemib[4]; + size_t slabmib[4]; + size_t len; + + len = sizeof(nbins); + assert_d_eq(mallctl("arenas.nbins", &nbins, &len, NULL, 0), 0, + "nbins mallctl failure"); + + len = sizeof(page); + assert_d_eq(mallctl("arenas.page", &page, &len, NULL, 0), 0, + "page mallctl failure"); + + len = 4; + assert_d_eq(mallctlnametomib("arenas.bin.0.size", sizemib, &len), 0, + "bin size mallctlnametomib failure"); + + len = 4; + assert_d_eq(mallctlnametomib("arenas.bin.0.slab_size", slabmib, &len), + 0, "slab size mallctlnametomib failure"); + + size_t biggest_slab_seen = 0; + + for (unsigned i = 0; i < nbins; i++) { + size_t bin_size; + size_t slab_size; + len = sizeof(size_t); + sizemib[2] = i; + slabmib[2] = i; + assert_d_eq(mallctlbymib(sizemib, 4, (void *)&bin_size, &len, + NULL, 0), 0, "bin size mallctlbymib failure"); + + len = sizeof(size_t); + assert_d_eq(mallctlbymib(slabmib, 4, (void *)&slab_size, &len, + NULL, 0), 0, "slab size mallctlbymib failure"); + + if (bin_size < 100) { + /* + * Then we should be as close to 17 as possible. Since + * not all page sizes are valid (because of bitmap + * limitations on the number of items in a slab), we + * should at least make sure that the number of pages + * goes up. + */ + assert_zu_ge(slab_size, biggest_slab_seen, + "Slab sizes should go up"); + biggest_slab_seen = slab_size; + } else if ( + (100 <= bin_size && bin_size < 128) + || (128 < bin_size && bin_size <= 200)) { + assert_zu_eq(slab_size, page, + "Forced-small slabs should be small"); + } else if (bin_size == 128) { + assert_zu_eq(slab_size, 2 * page, + "Forced-2-page slab should be 2 pages"); + } else if (200 < bin_size && bin_size <= 4096) { + assert_zu_ge(slab_size, biggest_slab_seen, + "Slab sizes should go up"); + biggest_slab_seen = slab_size; + } + } + /* + * For any reasonable configuration, 17 pages should be a valid slab + * size for 4096-byte items. + */ + assert_zu_eq(biggest_slab_seen, 17 * page, "Didn't hit page target"); +} +TEST_END + +int +main(void) { + return test( + test_slab_sizes); +} diff --git a/test/integration/slab_sizes.sh b/test/integration/slab_sizes.sh new file mode 100644 index 0000000..07e3db8 --- /dev/null +++ b/test/integration/slab_sizes.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +# Some screwy-looking slab sizes. +export MALLOC_CONF="slab_sizes:1-4096:17|100-200:1|128-128:2" -- cgit v0.12 From 55e5cc1341de87ad06254d719946a5ecd05f06ab Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Wed, 11 Jul 2018 16:05:58 -0700 Subject: SC: Make some key size classes static. The largest small class, smallest large class, and largest large class may all be needed down fast paths; to avoid the risk of touching another cache line, we can make them available as constants. 
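Concretely, under one common configuration (LG_PAGE == 12 and
SC_LG_NGROUP == 2, i.e. 4 KiB pages and 4 classes per doubling), the new
constants work out to 14336 for SC_SMALL_MAXCLASS and 16384 for
SC_LARGE_MINCLASS. A small sketch that replays the arithmetic; the two
configuration values are assumed here rather than taken from the build:

    #include <stddef.h>
    #include <stdio.h>

    #define LG_PAGE 12          /* Assumed 4 KiB pages. */
    #define SC_LG_NGROUP 2      /* Assumed default group size. */
    #define SC_NGROUP (1 << SC_LG_NGROUP)

    /* These mirror the definitions added to sc.h. */
    #define SC_SMALL_MAX_BASE ((size_t)1 << (LG_PAGE + SC_LG_NGROUP - 1))
    #define SC_SMALL_MAX_DELTA ((size_t)1 << (LG_PAGE - 1))
    #define SC_SMALL_MAXCLASS \
        (SC_SMALL_MAX_BASE + (SC_NGROUP - 1) * SC_SMALL_MAX_DELTA)
    #define SC_LARGE_MINCLASS ((size_t)1 << (LG_PAGE + SC_LG_NGROUP))

    int
    main(void) {
        /* 8192 + 3 * 2048 == 14336: the largest slab-backed class. */
        printf("SC_SMALL_MAXCLASS = %zu\n", SC_SMALL_MAXCLASS);
        /* 1 << 14 == 16384: the smallest class backed by a large extent. */
        printf("SC_LARGE_MINCLASS = %zu\n", SC_LARGE_MINCLASS);
        return 0;
    }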
--- include/jemalloc/internal/arena_inlines_b.h | 6 +-- .../internal/jemalloc_internal_inlines_c.h | 2 +- include/jemalloc/internal/prof_inlines_a.h | 8 ++-- include/jemalloc/internal/sc.h | 19 ++++++++ include/jemalloc/internal/sz.h | 22 +++++----- include/jemalloc/internal/tcache_inlines.h | 4 +- src/arena.c | 50 +++++++++++----------- src/ckh.c | 6 +-- src/extent.c | 6 +-- src/jemalloc.c | 50 +++++++++++----------- src/large.c | 14 +++--- src/sc.c | 14 ++++++ src/tcache.c | 4 +- test/unit/junk.c | 4 +- test/unit/mallctl.c | 2 +- test/unit/rtree.c | 4 +- test/unit/stats.c | 6 +-- test/unit/zero.c | 4 +- 18 files changed, 129 insertions(+), 96 deletions(-) diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 8960396..2b3915a 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -111,7 +111,7 @@ arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero, assert(size != 0); if (likely(tcache != NULL)) { - if (likely(size <= sc_data_global.small_maxclass)) { + if (likely(size <= SC_SMALL_MAXCLASS)) { return tcache_alloc_small(tsdn_tsd(tsdn), arena, tcache, size, ind, zero, slow_path); } @@ -263,7 +263,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache, static inline void arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) { assert(ptr != NULL); - assert(size <= sc_data_global.large_maxclass); + assert(size <= SC_LARGE_MAXCLASS); szind_t szind; bool slab; @@ -309,7 +309,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, alloc_ctx_t *alloc_ctx, bool slow_path) { assert(!tsdn_null(tsdn) || tcache == NULL); assert(ptr != NULL); - assert(size <= sc_data_global.large_maxclass); + assert(size <= SC_LARGE_MAXCLASS); if (unlikely(tcache == NULL)) { arena_sdalloc_no_tcache(tsdn, ptr, size); diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 83ad10f..9c5fec6 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -142,7 +142,7 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t usize, copysize; usize = sz_sa2u(size, alignment); - if (unlikely(usize == 0 || usize > sc_data_global.large_maxclass)) { + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { return NULL; } p = ipalloct(tsdn, usize, alignment, zero, tcache, arena); diff --git a/include/jemalloc/internal/prof_inlines_a.h b/include/jemalloc/internal/prof_inlines_a.h index 07bfd9f..471d985 100644 --- a/include/jemalloc/internal/prof_inlines_a.h +++ b/include/jemalloc/internal/prof_inlines_a.h @@ -57,15 +57,15 @@ prof_accum_cancel(tsdn_t *tsdn, prof_accum_t *prof_accum, #ifdef JEMALLOC_ATOMIC_U64 a0 = atomic_load_u64(&prof_accum->accumbytes, ATOMIC_RELAXED); do { - a1 = (a0 >= sc_data_global.large_minclass - usize) - ? a0 - (sc_data_global.large_minclass - usize) : 0; + a1 = (a0 >= SC_LARGE_MINCLASS - usize) + ? a0 - (SC_LARGE_MINCLASS - usize) : 0; } while (!atomic_compare_exchange_weak_u64(&prof_accum->accumbytes, &a0, a1, ATOMIC_RELAXED, ATOMIC_RELAXED)); #else malloc_mutex_lock(tsdn, &prof_accum->mtx); a0 = prof_accum->accumbytes; - a1 = (a0 >= sc_data_global.large_minclass - usize) - ? a0 - (sc_data_global.large_minclass - usize) : 0; + a1 = (a0 >= SC_LARGE_MINCLASS - usize) + ? 
a0 - (SC_LARGE_MINCLASS - usize) : 0; prof_accum->accumbytes = a1; malloc_mutex_unlock(tsdn, &prof_accum->mtx); #endif diff --git a/include/jemalloc/internal/sc.h b/include/jemalloc/internal/sc.h index 592115a..5c94378 100644 --- a/include/jemalloc/internal/sc.h +++ b/include/jemalloc/internal/sc.h @@ -238,6 +238,25 @@ /* The largest size class in the lookup table. */ #define SC_LOOKUP_MAXCLASS ((size_t)1 << 12) +/* Internal, only used for the definition of SC_SMALL_MAXCLASS. */ +#define SC_SMALL_MAX_BASE ((size_t)1 << (LG_PAGE + SC_LG_NGROUP - 1)) +#define SC_SMALL_MAX_DELTA ((size_t)1 << (LG_PAGE - 1)) + +/* The largest size class allocated out of a slab. */ +#define SC_SMALL_MAXCLASS (SC_SMALL_MAX_BASE \ + + (SC_NGROUP - 1) * SC_SMALL_MAX_DELTA) + +/* The smallest size class not allocated out of a slab. */ +#define SC_LARGE_MINCLASS ((size_t)1ULL << (LG_PAGE + SC_LG_NGROUP)) +#define SC_LG_LARGE_MINCLASS (LG_PAGE + SC_LG_NGROUP) + +/* Internal; only used for the definition of SC_LARGE_MAXCLASS. */ +#define SC_MAX_BASE ((size_t)1 << (SC_PTR_BITS - 2)) +#define SC_MAX_DELTA ((size_t)1 << (SC_PTR_BITS - 2 - SC_LG_NGROUP)) + +/* The largest size class supported. */ +#define SC_LARGE_MAXCLASS (SC_MAX_BASE + (SC_NGROUP - 1) * SC_MAX_DELTA) + typedef struct sc_s sc_t; struct sc_s { /* Size class index, or -1 if not a valid size class. */ diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h index b37e796..e743d87 100644 --- a/include/jemalloc/internal/sz.h +++ b/include/jemalloc/internal/sz.h @@ -51,7 +51,7 @@ extern void sz_boot(const sc_data_t *sc_data); JEMALLOC_ALWAYS_INLINE pszind_t sz_psz2ind(size_t psz) { - if (unlikely(psz > sc_data_global.large_maxclass)) { + if (unlikely(psz > SC_LARGE_MAXCLASS)) { return sc_data_global.npsizes; } pszind_t x = lg_floor((psz<<1)-1); @@ -73,7 +73,7 @@ sz_psz2ind(size_t psz) { static inline size_t sz_pind2sz_compute(pszind_t pind) { if (unlikely(pind == sc_data_global.npsizes)) { - return sc_data_global.large_maxclass + PAGE; + return SC_LARGE_MAXCLASS + PAGE; } size_t grp = pind >> SC_LG_NGROUP; size_t mod = pind & ((ZU(1) << SC_LG_NGROUP) - 1); @@ -105,8 +105,8 @@ sz_pind2sz(pszind_t pind) { static inline size_t sz_psz2u(size_t psz) { - if (unlikely(psz > sc_data_global.large_maxclass)) { - return sc_data_global.large_maxclass + PAGE; + if (unlikely(psz > SC_LARGE_MAXCLASS)) { + return SC_LARGE_MAXCLASS + PAGE; } size_t x = lg_floor((psz<<1)-1); size_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ? @@ -119,7 +119,7 @@ sz_psz2u(size_t psz) { static inline szind_t sz_size2index_compute(size_t size) { - if (unlikely(size > sc_data_global.large_maxclass)) { + if (unlikely(size > SC_LARGE_MAXCLASS)) { return SC_NSIZES; } #if (SC_NTINY != 0) @@ -207,7 +207,7 @@ sz_index2size(szind_t index) { JEMALLOC_ALWAYS_INLINE size_t sz_s2u_compute(size_t size) { - if (unlikely(size > sc_data_global.large_maxclass)) { + if (unlikely(size > SC_LARGE_MAXCLASS)) { return 0; } #if (SC_NTINY > 0) @@ -262,7 +262,7 @@ sz_sa2u(size_t size, size_t alignment) { assert(alignment != 0 && ((alignment - 1) & alignment) == 0); /* Try for a small size class. */ - if (size <= sc_data_global.small_maxclass && alignment < PAGE) { + if (size <= SC_SMALL_MAXCLASS && alignment < PAGE) { /* * Round size up to the nearest multiple of alignment. 
* @@ -278,20 +278,20 @@ sz_sa2u(size_t size, size_t alignment) { * 192 | 11000000 | 64 */ usize = sz_s2u(ALIGNMENT_CEILING(size, alignment)); - if (usize < sc_data_global.large_minclass) { + if (usize < SC_LARGE_MINCLASS) { return usize; } } /* Large size class. Beware of overflow. */ - if (unlikely(alignment > sc_data_global.large_maxclass)) { + if (unlikely(alignment > SC_LARGE_MAXCLASS)) { return 0; } /* Make sure result is a large size class. */ - if (size <= sc_data_global.large_minclass) { - usize = sc_data_global.large_minclass; + if (size <= SC_LARGE_MINCLASS) { + usize = SC_LARGE_MINCLASS; } else { usize = sz_s2u(size); if (usize < size) { diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h index b060043..7c95646 100644 --- a/include/jemalloc/internal/tcache_inlines.h +++ b/include/jemalloc/internal/tcache_inlines.h @@ -167,7 +167,7 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, cache_bin_info_t *bin_info; assert(tcache_salloc(tsd_tsdn(tsd), ptr) - <= sc_data_global.small_maxclass); + <= SC_SMALL_MAXCLASS); if (slow_path && config_fill && unlikely(opt_junk_free)) { arena_dalloc_junk_small(ptr, &bin_infos[binind]); @@ -193,7 +193,7 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, cache_bin_info_t *bin_info; assert(tcache_salloc(tsd_tsdn(tsd), ptr) - > sc_data_global.small_maxclass); + > SC_SMALL_MAXCLASS); assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= tcache_maxclass); if (slow_path && config_fill && unlikely(opt_junk_free)) { diff --git a/src/arena.c b/src/arena.c index 07d9103..91043cf 100644 --- a/src/arena.c +++ b/src/arena.c @@ -296,8 +296,8 @@ arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { cassert(config_stats); - if (usize < sc_data_global.large_minclass) { - usize = sc_data_global.large_minclass; + if (usize < SC_LARGE_MINCLASS) { + usize = SC_LARGE_MINCLASS; } index = sz_size2index(usize); hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0; @@ -312,8 +312,8 @@ arena_large_dalloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { cassert(config_stats); - if (usize < sc_data_global.large_minclass) { - usize = sc_data_global.large_minclass; + if (usize < SC_LARGE_MINCLASS) { + usize = SC_LARGE_MINCLASS; } index = sz_size2index(usize); hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0; @@ -1389,7 +1389,7 @@ arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, return NULL; } - if (likely(size <= sc_data_global.small_maxclass)) { + if (likely(size <= SC_SMALL_MAXCLASS)) { return arena_malloc_small(tsdn, arena, ind, zero); } return large_malloc(tsdn, arena, sz_index2size(ind), zero); @@ -1400,7 +1400,7 @@ arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero, tcache_t *tcache) { void *ret; - if (usize <= sc_data_global.small_maxclass + if (usize <= SC_SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE && (usize & PAGE_MASK) == 0))) { /* Small; alignment doesn't require special slab placement. 
*/ @@ -1420,8 +1420,8 @@ void arena_prof_promote(tsdn_t *tsdn, const void *ptr, size_t usize) { cassert(config_prof); assert(ptr != NULL); - assert(isalloc(tsdn, ptr) == sc_data_global.large_minclass); - assert(usize <= sc_data_global.small_maxclass); + assert(isalloc(tsdn, ptr) == SC_LARGE_MINCLASS); + assert(usize <= SC_SMALL_MAXCLASS); rtree_ctx_t rtree_ctx_fallback; rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); @@ -1451,9 +1451,9 @@ arena_prof_demote(tsdn_t *tsdn, extent_t *extent, const void *ptr) { rtree_szind_slab_update(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr, SC_NBINS, false); - assert(isalloc(tsdn, ptr) == sc_data_global.large_minclass); + assert(isalloc(tsdn, ptr) == SC_LARGE_MINCLASS); - return sc_data_global.large_minclass; + return SC_LARGE_MINCLASS; } void @@ -1594,25 +1594,25 @@ arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra, bool zero, size_t *newsize) { bool ret; /* Calls with non-zero extra had to clamp extra. */ - assert(extra == 0 || size + extra <= sc_data_global.large_maxclass); + assert(extra == 0 || size + extra <= SC_LARGE_MAXCLASS); extent_t *extent = iealloc(tsdn, ptr); - if (unlikely(size > sc_data_global.large_maxclass)) { + if (unlikely(size > SC_LARGE_MAXCLASS)) { ret = true; goto done; } size_t usize_min = sz_s2u(size); size_t usize_max = sz_s2u(size + extra); - if (likely(oldsize <= sc_data_global.small_maxclass && usize_min - <= sc_data_global.small_maxclass)) { + if (likely(oldsize <= SC_SMALL_MAXCLASS && usize_min + <= SC_SMALL_MAXCLASS)) { /* * Avoid moving the allocation if the size class can be left the * same. */ assert(bin_infos[sz_size2index(oldsize)].reg_size == oldsize); - if ((usize_max > sc_data_global.small_maxclass + if ((usize_max > SC_SMALL_MAXCLASS || sz_size2index(usize_max) != sz_size2index(oldsize)) && (size > oldsize || usize_max < oldsize)) { ret = true; @@ -1621,8 +1621,8 @@ arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, arena_decay_tick(tsdn, extent_arena_get(extent)); ret = false; - } else if (oldsize >= sc_data_global.large_minclass - && usize_max >= sc_data_global.large_minclass) { + } else if (oldsize >= SC_LARGE_MINCLASS + && usize_max >= SC_LARGE_MINCLASS) { ret = large_ralloc_no_move(tsdn, extent, usize_min, usize_max, zero); } else { @@ -1643,7 +1643,7 @@ arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize, zero, tcache, true); } usize = sz_sa2u(usize, alignment); - if (unlikely(usize == 0 || usize > sc_data_global.large_maxclass)) { + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { return NULL; } return ipalloct(tsdn, usize, alignment, zero, tcache, arena); @@ -1654,11 +1654,11 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t alignment, bool zero, tcache_t *tcache, hook_ralloc_args_t *hook_args) { size_t usize = sz_s2u(size); - if (unlikely(usize == 0 || size > sc_data_global.large_maxclass)) { + if (unlikely(usize == 0 || size > SC_LARGE_MAXCLASS)) { return NULL; } - if (likely(usize <= sc_data_global.small_maxclass)) { + if (likely(usize <= SC_SMALL_MAXCLASS)) { /* Try to avoid moving the allocation. 
*/ UNUSED size_t newsize; if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero, @@ -1671,8 +1671,8 @@ arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, } } - if (oldsize >= sc_data_global.large_minclass - && usize >= sc_data_global.large_minclass) { + if (oldsize >= SC_LARGE_MINCLASS + && usize >= SC_LARGE_MINCLASS) { return large_ralloc(tsdn, arena, ptr, usize, alignment, zero, tcache, hook_args); } @@ -1985,10 +1985,10 @@ arena_init_huge(void) { bool huge_enabled; /* The threshold should be large size class. */ - if (opt_huge_threshold > sc_data_global.large_maxclass || - opt_huge_threshold < sc_data_global.large_minclass) { + if (opt_huge_threshold > SC_LARGE_MAXCLASS || + opt_huge_threshold < SC_LARGE_MINCLASS) { opt_huge_threshold = 0; - huge_threshold = sc_data_global.large_maxclass + PAGE; + huge_threshold = SC_LARGE_MAXCLASS + PAGE; huge_enabled = false; } else { /* Reserve the index for the huge arena. */ diff --git a/src/ckh.c b/src/ckh.c index 94c4fe6..1bf6df5 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -276,7 +276,7 @@ ckh_grow(tsd_t *tsd, ckh_t *ckh) { lg_curcells++; usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); if (unlikely(usize == 0 - || usize > sc_data_global.large_maxclass)) { + || usize > SC_LARGE_MAXCLASS)) { ret = true; goto label_return; } @@ -321,7 +321,7 @@ ckh_shrink(tsd_t *tsd, ckh_t *ckh) { lg_prevbuckets = ckh->lg_curbuckets; lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1; usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); - if (unlikely(usize == 0 || usize > sc_data_global.large_maxclass)) { + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { return; } tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, NULL, @@ -397,7 +397,7 @@ ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh->keycomp = keycomp; usize = sz_sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE); - if (unlikely(usize == 0 || usize > sc_data_global.large_maxclass)) { + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { ret = true; goto label_return; } diff --git a/src/extent.c b/src/extent.c index 0953940..74076b6 100644 --- a/src/extent.c +++ b/src/extent.c @@ -259,7 +259,7 @@ extent_size_quantize_ceil(size_t size) { size_t ret; assert(size > 0); - assert(size - sz_large_pad <= sc_data_global.large_maxclass); + assert(size - sz_large_pad <= SC_LARGE_MAXCLASS); assert((size & PAGE_MASK) == 0); ret = extent_size_quantize_floor(size); @@ -1625,7 +1625,7 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, if (!extents->delay_coalesce) { extent = extent_try_coalesce(tsdn, arena, r_extent_hooks, rtree_ctx, extents, extent, NULL, growing_retained); - } else if (extent_size_get(extent) >= sc_data_global.large_minclass) { + } else if (extent_size_get(extent) >= SC_LARGE_MINCLASS) { /* Always coalesce large extents eagerly. */ bool coalesced; size_t prev_size; @@ -1637,7 +1637,7 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, &coalesced, growing_retained); } while (coalesced && extent_size_get(extent) - >= prev_size + sc_data_global.large_minclass); + >= prev_size + SC_LARGE_MINCLASS); } extent_deactivate_locked(tsdn, arena, extents, extent); diff --git a/src/jemalloc.c b/src/jemalloc.c index 4ffe5aa..e66735c 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1201,8 +1201,8 @@ malloc_conf_init(void) { /* Experimental feature. 
Will be documented later.*/ CONF_HANDLE_SIZE_T(opt_huge_threshold, "experimental_huge_threshold", - sc_data_global.large_minclass, - sc_data_global.large_maxclass, yes, yes, false) + SC_LARGE_MINCLASS, + SC_LARGE_MAXCLASS, yes, yes, false) CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, (sizeof(size_t) << 3), yes, yes, false) @@ -1827,13 +1827,13 @@ imalloc_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd, szind_t ind_large; size_t bumped_usize = usize; - if (usize <= sc_data_global.small_maxclass) { + if (usize <= SC_SMALL_MAXCLASS) { assert(((dopts->alignment == 0) ? - sz_s2u(sc_data_global.large_minclass) : - sz_sa2u(sc_data_global.large_minclass, dopts->alignment)) - == sc_data_global.large_minclass); - ind_large = sz_size2index(sc_data_global.large_minclass); - bumped_usize = sz_s2u(sc_data_global.large_minclass); + sz_s2u(SC_LARGE_MINCLASS) : + sz_sa2u(SC_LARGE_MINCLASS, dopts->alignment)) + == SC_LARGE_MINCLASS); + ind_large = sz_size2index(SC_LARGE_MINCLASS); + bumped_usize = sz_s2u(SC_LARGE_MINCLASS); ret = imalloc_no_sample(sopts, dopts, tsd, bumped_usize, bumped_usize, ind_large); if (unlikely(ret == NULL)) { @@ -1942,12 +1942,12 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { if (config_stats || (config_prof && opt_prof)) { usize = sz_index2size(ind); assert(usize > 0 && usize - <= sc_data_global.large_maxclass); + <= SC_LARGE_MAXCLASS); } } else { usize = sz_sa2u(size, dopts->alignment); if (unlikely(usize == 0 - || usize > sc_data_global.large_maxclass)) { + || usize > SC_LARGE_MAXCLASS)) { goto label_oom; } } @@ -1984,7 +1984,7 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { alloc_ctx_t alloc_ctx; if (likely((uintptr_t)tctx == (uintptr_t)1U)) { alloc_ctx.slab = (usize - <= sc_data_global.small_maxclass); + <= SC_SMALL_MAXCLASS); allocation = imalloc_no_sample( sopts, dopts, tsd, usize, usize, ind); } else if ((uintptr_t)tctx > (uintptr_t)1U) { @@ -2282,9 +2282,9 @@ irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize, if (tctx == NULL) { return NULL; } - if (usize <= sc_data_global.small_maxclass) { + if (usize <= SC_SMALL_MAXCLASS) { p = iralloc(tsd, old_ptr, old_usize, - sc_data_global.large_minclass, 0, false, hook_args); + SC_LARGE_MINCLASS, 0, false, hook_args); if (p == NULL) { return NULL; } @@ -2474,7 +2474,7 @@ je_realloc(void *ptr, size_t arg_size) { if (config_prof && opt_prof) { usize = sz_s2u(size); if (unlikely(usize == 0 - || usize > sc_data_global.large_maxclass)) { + || usize > SC_LARGE_MAXCLASS)) { ret = NULL; } else { ret = irealloc_prof(tsd, ptr, old_usize, usize, @@ -2787,9 +2787,9 @@ irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize, if (tctx == NULL) { return NULL; } - if (usize <= sc_data_global.small_maxclass) { + if (usize <= SC_SMALL_MAXCLASS) { p = iralloct(tsdn, old_ptr, old_usize, - sc_data_global.large_minclass, alignment, zero, tcache, + SC_LARGE_MINCLASS, alignment, zero, tcache, arena, hook_args); if (p == NULL) { return NULL; @@ -2900,7 +2900,7 @@ je_rallocx(void *ptr, size_t size, int flags) { usize = (alignment == 0) ? 
sz_s2u(size) : sz_sa2u(size, alignment); if (unlikely(usize == 0 - || usize > sc_data_global.large_maxclass)) { + || usize > SC_LARGE_MAXCLASS)) { goto label_oom; } p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, @@ -2986,18 +2986,18 @@ ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, if (alignment == 0) { usize_max = sz_s2u(size+extra); assert(usize_max > 0 - && usize_max <= sc_data_global.large_maxclass); + && usize_max <= SC_LARGE_MAXCLASS); } else { usize_max = sz_sa2u(size+extra, alignment); if (unlikely(usize_max == 0 - || usize_max > sc_data_global.large_maxclass)) { + || usize_max > SC_LARGE_MAXCLASS)) { /* * usize_max is out of range, and chances are that * allocation will fail, but use the maximum possible * value and carry on with prof_alloc_prep(), just in * case allocation succeeds. */ - usize_max = sc_data_global.large_maxclass; + usize_max = SC_LARGE_MAXCLASS; } } tctx = prof_alloc_prep(tsd, usize_max, prof_active, false); @@ -3046,18 +3046,18 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { /* * The API explicitly absolves itself of protecting against (size + * extra) numerical overflow, but we may need to clamp extra to avoid - * exceeding sc_data_global.large_maxclass. + * exceeding SC_LARGE_MAXCLASS. * * Ordinarily, size limit checking is handled deeper down, but here we * have to check as part of (size + extra) clamping, since we need the * clamped value in the above helper functions. */ - if (unlikely(size > sc_data_global.large_maxclass)) { + if (unlikely(size > SC_LARGE_MAXCLASS)) { usize = old_usize; goto label_not_resized; } - if (unlikely(sc_data_global.large_maxclass - size < extra)) { - extra = sc_data_global.large_maxclass - size; + if (unlikely(SC_LARGE_MAXCLASS - size < extra)) { + extra = SC_LARGE_MAXCLASS - size; } if (config_prof && opt_prof) { @@ -3244,7 +3244,7 @@ je_nallocx(size_t size, int flags) { check_entry_exit_locking(tsdn); usize = inallocx(tsdn, size, flags); - if (unlikely(usize > sc_data_global.large_maxclass)) { + if (unlikely(usize > SC_LARGE_MAXCLASS)) { LOG("core.nallocx.exit", "result: %zu", ZU(0)); return 0; } diff --git a/src/large.c b/src/large.c index 87d9ec0..8407361 100644 --- a/src/large.c +++ b/src/large.c @@ -28,7 +28,7 @@ large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, assert(!tsdn_null(tsdn) || arena != NULL); ausize = sz_sa2u(usize, alignment); - if (unlikely(ausize == 0 || ausize > sc_data_global.large_maxclass)) { + if (unlikely(ausize == 0 || ausize > SC_LARGE_MAXCLASS)) { return NULL; } @@ -221,10 +221,10 @@ large_ralloc_no_move(tsdn_t *tsdn, extent_t *extent, size_t usize_min, size_t oldusize = extent_usize_get(extent); /* The following should have been caught by callers. */ - assert(usize_min > 0 && usize_max <= sc_data_global.large_maxclass); + assert(usize_min > 0 && usize_max <= SC_LARGE_MAXCLASS); /* Both allocation sizes must be large to avoid a move. */ - assert(oldusize >= sc_data_global.large_minclass - && usize_max >= sc_data_global.large_minclass); + assert(oldusize >= SC_LARGE_MINCLASS + && usize_max >= SC_LARGE_MINCLASS); if (usize_max > oldusize) { /* Attempt to expand the allocation in-place. */ @@ -278,10 +278,10 @@ large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize, size_t oldusize = extent_usize_get(extent); /* The following should have been caught by callers. 
*/ - assert(usize > 0 && usize <= sc_data_global.large_maxclass); + assert(usize > 0 && usize <= SC_LARGE_MAXCLASS); /* Both allocation sizes must be large to avoid a move. */ - assert(oldusize >= sc_data_global.large_minclass - && usize >= sc_data_global.large_minclass); + assert(oldusize >= SC_LARGE_MINCLASS + && usize >= SC_LARGE_MINCLASS); /* Try to avoid moving the allocation. */ if (!large_ralloc_no_move(tsdn, extent, usize, usize, zero)) { diff --git a/src/sc.c b/src/sc.c index e8eef1c..74c9101 100644 --- a/src/sc.c +++ b/src/sc.c @@ -228,6 +228,20 @@ size_classes( sc_data->lg_large_minclass = lg_large_minclass; sc_data->large_minclass = (ZU(1) << lg_large_minclass); sc_data->large_maxclass = large_maxclass; + + /* + * We compute these values in two ways: + * - Incrementally, as above. + * - In macros, in sc.h. + * The computation is easier when done incrementally, but putting it in + * a constant makes it available to the fast paths without having to + * touch the extra global cacheline. We assert, however, that the two + * computations are equivalent. + */ + assert(sc_data->small_maxclass == SC_SMALL_MAXCLASS); + assert(sc_data->large_minclass == SC_LARGE_MINCLASS); + assert(sc_data->lg_large_minclass == SC_LG_LARGE_MINCLASS); + assert(sc_data->large_maxclass == SC_LARGE_MAXCLASS); } void diff --git a/src/tcache.c b/src/tcache.c index edd047a..7346df8 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -658,8 +658,8 @@ bool tcache_boot(tsdn_t *tsdn) { /* If necessary, clamp opt_lg_tcache_max. */ if (opt_lg_tcache_max < 0 || (ZU(1) << opt_lg_tcache_max) < - sc_data_global.small_maxclass) { - tcache_maxclass = sc_data_global.small_maxclass; + SC_SMALL_MAXCLASS) { + tcache_maxclass = SC_SMALL_MAXCLASS; } else { tcache_maxclass = (ZU(1) << opt_lg_tcache_max); } diff --git a/test/unit/junk.c b/test/unit/junk.c index 91c6e5b..be8933a 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -123,13 +123,13 @@ test_junk(size_t sz_min, size_t sz_max) { TEST_BEGIN(test_junk_small) { test_skip_if(!config_fill); - test_junk(1, sc_data_global.small_maxclass - 1); + test_junk(1, SC_SMALL_MAXCLASS - 1); } TEST_END TEST_BEGIN(test_junk_large) { test_skip_if(!config_fill); - test_junk(sc_data_global.small_maxclass + 1, + test_junk(SC_SMALL_MAXCLASS + 1, (1U << (sc_data_global.lg_large_minclass + 1))); } TEST_END diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 230ecb0..f636200 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -721,7 +721,7 @@ TEST_BEGIN(test_arenas_lextent_constants) { } while (0) TEST_ARENAS_LEXTENT_CONSTANT(size_t, size, - sc_data_global.large_minclass); + SC_LARGE_MINCLASS); #undef TEST_ARENAS_LEXTENT_CONSTANT } diff --git a/test/unit/rtree.c b/test/unit/rtree.c index 4d1daf2..b017bc0 100644 --- a/test/unit/rtree.c +++ b/test/unit/rtree.c @@ -85,8 +85,8 @@ TEST_END TEST_BEGIN(test_rtree_extrema) { extent_t extent_a, extent_b; - extent_init(&extent_a, NULL, NULL, sc_data_global.large_minclass, false, - sz_size2index(sc_data_global.large_minclass), 0, + extent_init(&extent_a, NULL, NULL, SC_LARGE_MINCLASS, false, + sz_size2index(SC_LARGE_MINCLASS), 0, extent_state_active, false, false, true); extent_init(&extent_b, NULL, NULL, 0, false, SC_NSIZES, 0, extent_state_active, false, false, true); diff --git a/test/unit/stats.c b/test/unit/stats.c index 8fe0f3a..b8f549b 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -33,7 +33,7 @@ TEST_BEGIN(test_stats_large) { size_t sz; int expected = config_stats ? 
0 : ENOENT; - p = mallocx(sc_data_global.small_maxclass + 1, MALLOCX_ARENA(0)); + p = mallocx(SC_SMALL_MAXCLASS + 1, MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)), @@ -74,7 +74,7 @@ TEST_BEGIN(test_stats_arenas_summary) { uint64_t dirty_npurge, dirty_nmadvise, dirty_purged; uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged; - little = mallocx(sc_data_global.small_maxclass, MALLOCX_ARENA(0)); + little = mallocx(SC_SMALL_MAXCLASS, MALLOCX_ARENA(0)); assert_ptr_not_null(little, "Unexpected mallocx() failure"); large = mallocx((1U << sc_data_global.lg_large_minclass), MALLOCX_ARENA(0)); @@ -149,7 +149,7 @@ TEST_BEGIN(test_stats_arenas_small) { no_lazy_lock(); /* Lazy locking would dodge tcache testing. */ - p = mallocx(sc_data_global.small_maxclass, MALLOCX_ARENA(0)); + p = mallocx(SC_SMALL_MAXCLASS, MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("thread.tcache.flush", NULL, NULL, NULL, 0), diff --git a/test/unit/zero.c b/test/unit/zero.c index 20a7062..8b8d207 100644 --- a/test/unit/zero.c +++ b/test/unit/zero.c @@ -41,13 +41,13 @@ test_zero(size_t sz_min, size_t sz_max) { TEST_BEGIN(test_zero_small) { test_skip_if(!config_fill); - test_zero(1, sc_data_global.small_maxclass - 1); + test_zero(1, SC_SMALL_MAXCLASS - 1); } TEST_END TEST_BEGIN(test_zero_large) { test_skip_if(!config_fill); - test_zero(sc_data_global.small_maxclass + 1, + test_zero(SC_SMALL_MAXCLASS + 1, 1U << (sc_data_global.lg_large_minclass + 1)); } TEST_END -- cgit v0.12 From 0eb0641cac0c3031f84469953b5e75b380867ccb Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Tue, 10 Jul 2018 14:41:20 +0200 Subject: Simplify output of gen_travis.py script This commit simplifies the output of the `gen_travis.py` script by reusing addons. The `.travis.yml` script is updated to reflect these changes. 
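For readers who have not run into the YAML feature this relies on: an anchor (&name) labels a node the first time it appears, and an alias (*name) reuses that node later in the same document. A minimal generic sketch (keys are illustrative, not taken from this repository):

    - addons: &gcc_multilib    # first occurrence: define the anchor and use it
        apt:
          packages:
            - gcc-multilib
    - addons: *gcc_multilib    # later occurrences: alias the same node

The diff below does exactly this: the first -m32 job anchors its addons block as &gcc_multilib, and every subsequent multilib job collapses to a one-line *gcc_multilib alias.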
--- .travis.yml | 43 +++++++++--------------------------- scripts/gen_travis.py | 60 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 46 insertions(+), 57 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7d93ead..854f878 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ matrix: env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: + addons: &gcc_multilib apt: packages: - gcc-multilib @@ -41,10 +41,7 @@ matrix: env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=clang CXX=clang++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: - apt: - packages: - - gcc-multilib + addons: *gcc_multilib - os: linux env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux @@ -61,46 +58,25 @@ matrix: env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: - apt: - packages: - - gcc-multilib + addons: *gcc_multilib - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: - apt: - packages: - - gcc-multilib + addons: *gcc_multilib - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: - apt: - packages: - - gcc-multilib + addons: *gcc_multilib - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: - apt: - packages: - - gcc-multilib + addons: *gcc_multilib - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: - apt: - packages: - - gcc-multilib + addons: *gcc_multilib - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: - apt: - packages: - - gcc-multilib + addons: *gcc_multilib - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - addons: - apt: - packages: - - gcc-multilib + addons: *gcc_multilib - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux @@ -150,6 +126,7 @@ matrix: - make -j test/unit/log - test/unit/log + before_script: - autoconf - ./configure ${COMPILER_FLAGS:+ CC="$CC $COMPILER_FLAGS" CXX="$CXX $COMPILER_FLAGS" } $CONFIGURE_FLAGS diff --git a/scripts/gen_travis.py b/scripts/gen_travis.py index 6dd3929..1570883 100755 --- a/scripts/gen_travis.py +++ b/scripts/gen_travis.py @@ -4,6 +4,7 @@ from itertools import combinations travis_template = """\ language: generic +dist: precise matrix: include: @@ -61,47 +62,58 @@ unusual_combinations_to_test = [] for i in xrange(MAX_UNUSUAL_OPTIONS + 1): unusual_combinations_to_test += combinations(all_unusuals, i) -include_rows = "" -for unusual_combination 
in unusual_combinations_to_test:
- os = os_default
- if os_unusual in unusual_combination:
- os = os_unusual
-
- compilers = compilers_default
- if compilers_unusual in unusual_combination:
- compilers = compilers_unusual
+gcc_multilib_set = False
+# Formats a job from a combination of flags
+def format_job(combination):
+ global gcc_multilib_set

- compiler_flags = [
- x for x in unusual_combination if x in compiler_flag_unusuals]
+ os = os_unusual if os_unusual in combination else os_default
+ compilers = compilers_unusual if compilers_unusual in combination else compilers_default

- configure_flags = [
- x for x in unusual_combination if x in configure_flag_unusuals]
+ compiler_flags = [x for x in combination if x in compiler_flag_unusuals]
+ configure_flags = [x for x in combination if x in configure_flag_unusuals]
+ malloc_conf = [x for x in combination if x in malloc_conf_unusuals]

- malloc_conf = [
- x for x in unusual_combination if x in malloc_conf_unusuals]
 # Filter out unsupported configurations on OS X.
 if os == 'osx' and ('dss:primary' in malloc_conf or \
 'percpu_arena:percpu' in malloc_conf or 'background_thread:true' \
 in malloc_conf):
- continue
+ return ""
 if len(malloc_conf) > 0:
 configure_flags.append('--with-malloc-conf=' + ",".join(malloc_conf))

 # Filter out an unsupported configuration - heap profiling on OS X.
 if os == 'osx' and '--enable-prof' in configure_flags:
- continue
+ return ""

 # We get some spurious errors when -Warray-bounds is enabled.
 env_string = ('{} COMPILER_FLAGS="{}" CONFIGURE_FLAGS="{}" '
 'EXTRA_CFLAGS="-Werror -Wno-array-bounds"').format(
 compilers, " ".join(compiler_flags), " ".join(configure_flags))

- include_rows += ' - os: %s\n' % os
- include_rows += ' env: %s\n' % env_string
- if '-m32' in unusual_combination and os == 'linux':
- include_rows += ' addons:\n'
- include_rows += ' apt:\n'
- include_rows += ' packages:\n'
- include_rows += ' - gcc-multilib\n'
+ job = ""
+ job += ' - os: %s\n' % os
+ job += ' env: %s\n' % env_string
+ if '-m32' in combination and os == 'linux':
+ job += ' addons:'
+ if gcc_multilib_set:
+ job += ' *gcc_multilib\n'
+ else:
+ job += ' &gcc_multilib\n'
+ job += ' apt:\n'
+ job += ' packages:\n'
+ job += ' - gcc-multilib\n'
+ gcc_multilib_set = True
+ return job
+
+include_rows = ""
+for combination in unusual_combinations_to_test:
+ include_rows += format_job(combination)
+
+# Development build
+include_rows += '''\
+ - os: linux
+ env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+'''
 print travis_template % include_rows
-- cgit v0.12

From 6deed86deb48d3b432d972a139a413a9fb38283b Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Wed, 11 Jul 2018 15:18:40 +0200
Subject: Test that .travis.yml has been produced by gen_travis.py on CI

This commit checks on Travis CI that the current `.travis.yml` file equals the
output of the `gen_travis.py` script, and updates the `.travis.yml` file
accordingly.
--- .travis.yml | 5 +----
 scripts/gen_travis.py | 1 +
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 854f878..cd3be83 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -121,14 +121,11 @@ matrix:
 env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
 - os: linux
 env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- script:
- - make check
- - make -j test/unit/log
- - test/unit/log

 before_script:
 - autoconf
+ - scripts/gen_travis.py > travis_script && diff .travis.yml travis_script
 - ./configure ${COMPILER_FLAGS:+ CC="$CC $COMPILER_FLAGS" CXX="$CXX $COMPILER_FLAGS" } $CONFIGURE_FLAGS
 - make -j3
 - make -j3 tests
diff --git a/scripts/gen_travis.py b/scripts/gen_travis.py
index 1570883..4473205 100755
--- a/scripts/gen_travis.py
+++ b/scripts/gen_travis.py
@@ -12,6 +12,7 @@ matrix:

 before_script:
 - autoconf
+ - scripts/gen_travis.py > travis_script && diff .travis.yml travis_script
 - ./configure ${COMPILER_FLAGS:+ \
 CC="$CC $COMPILER_FLAGS" \
 CXX="$CXX $COMPILER_FLAGS" } \
-- cgit v0.12

From 4bc48718b2eb98e3646a86af816f9c6db29d1612 Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Tue, 17 Jul 2018 14:09:31 -0700
Subject: Tolerate experimental features for abort_conf.

Do not abort on unrecognized experimental options. This helps us test
experimental features with abort_conf enabled.
---
 src/jemalloc.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/jemalloc.c b/src/jemalloc.c
index e66735c..8e0a581 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -896,6 +896,11 @@ malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v,
 malloc_printf("<jemalloc>: %s: %.*s:%.*s\n", msg, (int)klen, k, (int)vlen, v);
 /* If abort_conf is set, error out after processing all options. */
+ const char *experimental = "experimental_";
+ if (strncmp(k, experimental, strlen(experimental)) == 0) {
+ /* However, tolerate experimental features. */
+ return;
+ }
 had_conf_error = true;
 }
-- cgit v0.12

From 3aba072cef71d0f2bacc4ef10932a46f1df43192 Mon Sep 17 00:00:00 2001
From: David Goldblatt
Date: Thu, 19 Jul 2018 17:08:10 -0700
Subject: SC: Remove global data.

The global data is mostly used only at initialization, or for easy access to
values we could compute statically. Instead of consuming that space (and
risking TLB misses), we can just pass around a pointer to stack data during
bootstrapping.
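The shape of the change, distilled into a standalone sketch (all names below are illustrative, not jemalloc's): a value that used to live in a global struct becomes a compile-time macro for the fast paths, the incremental computation runs on stack data during boot, and an assertion keeps the two in agreement.

    #include <assert.h>
    #include <stddef.h>

    /*
     * Static version of the value; fast paths read this macro and never
     * touch a global cacheline.
     */
    #define FOO_MAXCLASS (((size_t)1 << 13) + ((size_t)2 << 12))

    typedef struct {
        size_t maxclass;
    } foo_data_t;

    /*
     * Incremental boot-time computation (standing in for sc.c's loop),
     * using the same base + ndelta*delta shape as the size classes.
     */
    static void
    foo_data_init(foo_data_t *data) {
        size_t lg_base = 13;
        size_t ndelta = 2;
        size_t lg_delta = 12;
        data->maxclass = ((size_t)1 << lg_base) + (ndelta << lg_delta);
        /* The static and incremental computations must agree. */
        assert(data->maxclass == FOO_MAXCLASS);
    }

    int
    main(void) {
        foo_data_t data = {0}; /* stack data, passed by pointer at boot */
        foo_data_init(&data);
        return data.maxclass == FOO_MAXCLASS ? 0 : 1;
    }

The actual patch follows this pattern: sc_data_t moves onto malloc_init_hard_a0_locked()'s stack, boot functions take a pointer to it, and steady-state code reads the SC_* macros instead.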
--- include/jemalloc/internal/arena_externs.h | 2 +- include/jemalloc/internal/extent_structs.h | 4 ++-- include/jemalloc/internal/sc.h | 13 +++++-------- include/jemalloc/internal/sz.h | 27 ++++++++++++--------------- src/arena.c | 9 ++++----- src/base.c | 2 +- src/extent.c | 20 +++++++++----------- src/jemalloc.c | 19 ++++++++++++------- src/sc.c | 6 ++++-- src/sz.c | 6 ++++-- test/unit/junk.c | 3 +-- test/unit/mallctl.c | 2 +- test/unit/prof_gdump.c | 8 ++++---- test/unit/size_classes.c | 18 ++++++++++-------- test/unit/stats.c | 4 ++-- test/unit/zero.c | 3 +-- 16 files changed, 73 insertions(+), 73 deletions(-) diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index 7a46946..4f744ca 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -85,7 +85,7 @@ size_t arena_extent_sn_next(arena_t *arena); arena_t *arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks); bool arena_init_huge(void); arena_t *arena_choose_huge(tsd_t *tsd); -void arena_boot(void); +void arena_boot(sc_data_t *sc_data); void arena_prefork0(tsdn_t *tsdn, arena_t *arena); void arena_prefork1(tsdn_t *tsdn, arena_t *arena); void arena_prefork2(tsdn_t *tsdn, arena_t *arena); diff --git a/include/jemalloc/internal/extent_structs.h b/include/jemalloc/internal/extent_structs.h index c6c1e23..1983097 100644 --- a/include/jemalloc/internal/extent_structs.h +++ b/include/jemalloc/internal/extent_structs.h @@ -181,14 +181,14 @@ struct extents_s { * * Synchronization: mtx. */ - extent_heap_t heaps[SC_NPSIZES_MAX + 1]; + extent_heap_t heaps[SC_NPSIZES + 1]; /* * Bitmap for which set bits correspond to non-empty heaps. * * Synchronization: mtx. */ - bitmap_t bitmap[BITMAP_GROUPS(SC_NPSIZES_MAX + 1)]; + bitmap_t bitmap[BITMAP_GROUPS(SC_NPSIZES + 1)]; /* * LRU of all extents in heaps. diff --git a/include/jemalloc/internal/sc.h b/include/jemalloc/internal/sc.h index 5c94378..5b79bb4 100644 --- a/include/jemalloc/internal/sc.h +++ b/include/jemalloc/internal/sc.h @@ -182,6 +182,7 @@ #define SC_NGROUP (1ULL << SC_LG_NGROUP) #define SC_PTR_BITS ((1ULL << LG_SIZEOF_PTR) * 8) #define SC_NTINY (LG_QUANTUM - SC_LG_TINY_MIN) +#define SC_LG_TINY_MAXCLASS (LG_QUANTUM > SC_LG_TINY_MIN ? LG_QUANTUM - 1 : -1) #define SC_NPSEUDO SC_NGROUP #define SC_LG_FIRST_REGULAR_BASE (LG_QUANTUM + SC_LG_NGROUP) /* @@ -200,7 +201,7 @@ * because delta may be smaller than a page, this is not the same as the number * of size classes that are *multiples* of the page size. */ -#define SC_NPSIZES_MAX ( \ +#define SC_NPSIZES ( \ /* Start with all the size classes. */ \ SC_NSIZES \ /* Subtract out those groups with too small a base. */ \ @@ -209,11 +210,8 @@ - SC_NPSEUDO \ /* And the tiny group. */ \ - SC_NTINY \ - /* \ - * In the lg_base == lg_page - 1 group, only the last sc is big \ - * enough to make it to lg_page. \ - */ \ - - (SC_NGROUP - 1)) + /* Groups where ndelta*delta is not a multiple of the page size. */ \ + - (2 * (SC_NGROUP))) /* * We declare a size class is binnable if size < page size * group. Or, in other @@ -314,7 +312,6 @@ struct sc_data_s { sc_t sc[SC_NSIZES]; }; -extern sc_data_t sc_data_global; void sc_data_init(sc_data_t *data); /* * Updates slab sizes in [begin, end] to be pgs pages in length, if possible. 
@@ -322,6 +319,6 @@ void sc_data_init(sc_data_t *data); */ void sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end, int pgs); -void sc_boot(); +void sc_boot(sc_data_t *data); #endif /* JEMALLOC_INTERNAL_SC_H */ diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h index e743d87..69625ee 100644 --- a/include/jemalloc/internal/sz.h +++ b/include/jemalloc/internal/sz.h @@ -26,7 +26,7 @@ * sz_pind2sz_tab encodes the same information as could be computed by * sz_pind2sz_compute(). */ -extern size_t sz_pind2sz_tab[SC_NPSIZES_MAX + 1]; +extern size_t sz_pind2sz_tab[SC_NPSIZES + 1]; /* * sz_index2size_tab encodes the same information as could be computed (at * unacceptable cost in some code paths) by sz_index2size_compute(). @@ -52,7 +52,7 @@ extern void sz_boot(const sc_data_t *sc_data); JEMALLOC_ALWAYS_INLINE pszind_t sz_psz2ind(size_t psz) { if (unlikely(psz > SC_LARGE_MAXCLASS)) { - return sc_data_global.npsizes; + return SC_NPSIZES; } pszind_t x = lg_floor((psz<<1)-1); pszind_t shift = (x < SC_LG_NGROUP + LG_PAGE) ? @@ -72,7 +72,7 @@ sz_psz2ind(size_t psz) { static inline size_t sz_pind2sz_compute(pszind_t pind) { - if (unlikely(pind == sc_data_global.npsizes)) { + if (unlikely(pind == SC_NPSIZES)) { return SC_LARGE_MAXCLASS + PAGE; } size_t grp = pind >> SC_LG_NGROUP; @@ -99,7 +99,7 @@ sz_pind2sz_lookup(pszind_t pind) { static inline size_t sz_pind2sz(pszind_t pind) { - assert(pind < sc_data_global.npsizes + 1); + assert(pind < SC_NPSIZES + 1); return sz_pind2sz_lookup(pind); } @@ -123,9 +123,8 @@ sz_size2index_compute(size_t size) { return SC_NSIZES; } #if (SC_NTINY != 0) - if (size <= (ZU(1) << sc_data_global.lg_tiny_maxclass)) { - szind_t lg_tmin = sc_data_global.lg_tiny_maxclass - - sc_data_global.ntiny + 1; + if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) { + szind_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1; szind_t lg_ceil = lg_floor(pow2_ceil_zu(size)); return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin); } @@ -143,7 +142,7 @@ sz_size2index_compute(size_t size) { szind_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & ((ZU(1) << SC_LG_NGROUP) - 1); - szind_t index = sc_data_global.ntiny + grp + mod; + szind_t index = SC_NTINY + grp + mod; return index; } } @@ -168,13 +167,12 @@ sz_size2index(size_t size) { static inline size_t sz_index2size_compute(szind_t index) { #if (SC_NTINY > 0) - if (index < sc_data_global.ntiny) { - return (ZU(1) << (sc_data_global.lg_tiny_maxclass - - sc_data_global.ntiny + 1 + index)); + if (index < SC_NTINY) { + return (ZU(1) << (SC_LG_TINY_MAXCLASS - SC_NTINY + 1 + index)); } #endif { - size_t reduced_index = index - sc_data_global.ntiny; + size_t reduced_index = index - SC_NTINY; size_t grp = reduced_index >> SC_LG_NGROUP; size_t mod = reduced_index & ((ZU(1) << SC_LG_NGROUP) - 1); @@ -211,9 +209,8 @@ sz_s2u_compute(size_t size) { return 0; } #if (SC_NTINY > 0) - if (size <= (ZU(1) << sc_data_global.lg_tiny_maxclass)) { - size_t lg_tmin = sc_data_global.lg_tiny_maxclass - - sc_data_global.ntiny + 1; + if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) { + size_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1; size_t lg_ceil = lg_floor(pow2_ceil_zu(size)); return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : (ZU(1) << lg_ceil)); diff --git a/src/arena.c b/src/arena.c index 91043cf..da7fd78 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1754,8 +1754,7 @@ arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, size_t *old_limit, if (new_limit != NULL) { size_t limit = *new_limit; /* Grow no more than the new limit. 
*/ - if ((new_ind = sz_psz2ind(limit + 1) - 1) - >= sc_data_global.npsizes) { + if ((new_ind = sz_psz2ind(limit + 1) - 1) >= SC_NPSIZES) { return true; } } @@ -1899,7 +1898,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } arena->extent_grow_next = sz_psz2ind(HUGEPAGE); - arena->retain_grow_limit = sc_data_global.npsizes - 1; + arena->retain_grow_limit = sz_psz2ind(SC_LARGE_MAXCLASS); if (malloc_mutex_init(&arena->extent_grow_mtx, "extent_grow", WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) { goto label_error; @@ -2001,11 +2000,11 @@ arena_init_huge(void) { } void -arena_boot(void) { +arena_boot(sc_data_t *sc_data) { arena_dirty_decay_ms_default_set(opt_dirty_decay_ms); arena_muzzy_decay_ms_default_set(opt_muzzy_decay_ms); for (unsigned i = 0; i < SC_NBINS; i++) { - sc_t *sc = &sc_data_global.sc[i]; + sc_t *sc = &sc_data->sc[i]; div_init(&arena_binind_div_info[i], (1U << sc->lg_base) + (sc->ndelta << sc->lg_delta)); } diff --git a/src/base.c b/src/base.c index cabf66c..f3c6166 100644 --- a/src/base.c +++ b/src/base.c @@ -262,7 +262,7 @@ base_block_alloc(tsdn_t *tsdn, base_t *base, extent_hooks_t *extent_hooks, */ size_t min_block_size = HUGEPAGE_CEILING(sz_psz2u(header_size + gap_size + usize)); - pszind_t pind_next = (*pind_last + 1 < sc_data_global.npsizes) ? + pszind_t pind_next = (*pind_last + 1 < sz_psz2ind(SC_LARGE_MAXCLASS)) ? *pind_last + 1 : *pind_last; size_t next_block_size = HUGEPAGE_CEILING(sz_pind2sz(pind_next)); size_t block_size = (min_block_size > next_block_size) ? min_block_size diff --git a/src/extent.c b/src/extent.c index 74076b6..592974a 100644 --- a/src/extent.c +++ b/src/extent.c @@ -20,7 +20,7 @@ mutex_pool_t extent_mutex_pool; size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT; static const bitmap_info_t extents_bitmap_info = - BITMAP_INFO_INITIALIZER(SC_NPSIZES_MAX+1); + BITMAP_INFO_INITIALIZER(SC_NPSIZES+1); static void *extent_alloc_default(extent_hooks_t *extent_hooks, void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit, @@ -288,7 +288,7 @@ extents_init(tsdn_t *tsdn, extents_t *extents, extent_state_t state, malloc_mutex_rank_exclusive)) { return true; } - for (unsigned i = 0; i < sc_data_global.npsizes + 1; i++) { + for (unsigned i = 0; i < SC_NPSIZES + 1; i++) { extent_heap_new(&extents->heaps[i]); } bitmap_init(extents->bitmap, &extents_bitmap_info, true); @@ -375,7 +375,7 @@ extents_fit_alignment(extents_t *extents, size_t min_size, size_t max_size, &extents_bitmap_info, (size_t)pind); i < pind_max; i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, (size_t)i+1)) { - assert(i < sc_data_global.npsizes); + assert(i < SC_NPSIZES); assert(!extent_heap_empty(&extents->heaps[i])); extent_t *extent = extent_heap_first(&extents->heaps[i]); uintptr_t base = (uintptr_t)extent_base_get(extent); @@ -405,7 +405,7 @@ extents_best_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(size)); pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, (size_t)pind); - if (i < sc_data_global.npsizes + 1) { + if (i < SC_NPSIZES + 1) { /* * In order to reduce fragmentation, avoid reusing and splitting * large extents for much smaller sizes. 
@@ -434,7 +434,7 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, pszind_t pind = sz_psz2ind(extent_size_quantize_ceil(size)); for (pszind_t i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, (size_t)pind); - i < sc_data_global.npsizes + 1; + i < SC_NPSIZES + 1; i = (pszind_t)bitmap_ffu(extents->bitmap, &extents_bitmap_info, (size_t)i+1)) { assert(!extent_heap_empty(&extents->heaps[i])); @@ -443,10 +443,10 @@ extents_first_fit_locked(tsdn_t *tsdn, arena_t *arena, extents_t *extents, if (ret == NULL || extent_snad_comp(extent, ret) < 0) { ret = extent; } - if (i == sc_data_global.npsizes) { + if (i == SC_NPSIZES) { break; } - assert(i < sc_data_global.npsizes); + assert(i < SC_NPSIZES); } return ret; @@ -1249,13 +1249,11 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena, size_t alloc_size = sz_pind2sz(arena->extent_grow_next + egn_skip); while (alloc_size < alloc_size_min) { egn_skip++; - if (arena->extent_grow_next + egn_skip == - sc_data_global.npsizes) { + if (arena->extent_grow_next + egn_skip >= + sz_psz2ind(SC_LARGE_MAXCLASS)) { /* Outside legal range. */ goto label_err; } - assert(arena->extent_grow_next + egn_skip - < sc_data_global.npsizes); alloc_size = sz_pind2sz(arena->extent_grow_next + egn_skip); } diff --git a/src/jemalloc.c b/src/jemalloc.c index 8e0a581..d473664 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -920,7 +920,7 @@ malloc_slow_flag_init(void) { } static void -malloc_conf_init(void) { +malloc_conf_init(sc_data_t *sc_data) { unsigned i; char buf[PATH_MAX + 1]; const char *opts, *k, *v; @@ -1254,7 +1254,7 @@ malloc_conf_init(void) { &pgs); if (!err) { sc_data_update_slab_size( - &sc_data_global, slab_start, + sc_data, slab_start, slab_end, (int)pgs); } else { malloc_conf_error( @@ -1368,6 +1368,11 @@ static bool malloc_init_hard_a0_locked() { malloc_initializer = INITIALIZER; + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + sc_data_t sc_data = {0}; + JEMALLOC_DIAGNOSTIC_POP + /* * Ordering here is somewhat tricky; we need sc_boot() first, since that * determines what the size classes will be, and then @@ -1375,10 +1380,10 @@ malloc_init_hard_a0_locked() { * before sz_boot and bin_boot, which assume that the values they read * out of sc_data_global are final. */ - sc_boot(); - malloc_conf_init(); - sz_boot(&sc_data_global); - bin_boot(&sc_data_global); + sc_boot(&sc_data); + malloc_conf_init(&sc_data); + sz_boot(&sc_data); + bin_boot(&sc_data); if (config_prof) { prof_boot0(); @@ -1407,7 +1412,7 @@ malloc_init_hard_a0_locked() { if (config_prof) { prof_boot1(); } - arena_boot(); + arena_boot(&sc_data); if (tcache_boot(TSDN_NULL)) { return true; } diff --git a/src/sc.c b/src/sc.c index 74c9101..8784bdd 100644 --- a/src/sc.c +++ b/src/sc.c @@ -238,6 +238,8 @@ size_classes( * touch the extra global cacheline. We assert, however, that the two * computations are equivalent. 
*/ + assert(sc_data->npsizes == SC_NPSIZES); + assert(sc_data->lg_tiny_maxclass == SC_LG_TINY_MAXCLASS); assert(sc_data->small_maxclass == SC_SMALL_MAXCLASS); assert(sc_data->large_minclass == SC_LARGE_MINCLASS); assert(sc_data->lg_large_minclass == SC_LG_LARGE_MINCLASS); @@ -297,6 +299,6 @@ sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end, int pgs) { } void -sc_boot() { - sc_data_init(&sc_data_global); +sc_boot(sc_data_t *data) { + sc_data_init(data); } diff --git a/src/sz.c b/src/sz.c index e038728..77f89c6 100644 --- a/src/sz.c +++ b/src/sz.c @@ -2,7 +2,7 @@ #include "jemalloc/internal/sz.h" JEMALLOC_ALIGNED(CACHELINE) -size_t sz_pind2sz_tab[SC_NPSIZES_MAX+1]; +size_t sz_pind2sz_tab[SC_NPSIZES+1]; static void sz_boot_pind2sz_tab(const sc_data_t *sc_data) { @@ -15,7 +15,9 @@ sz_boot_pind2sz_tab(const sc_data_t *sc_data) { pind++; } } - sz_pind2sz_tab[pind] = sc_data->large_maxclass + PAGE; + for (int i = pind; i <= (int)SC_NPSIZES; i++) { + sz_pind2sz_tab[pind] = sc_data->large_maxclass + PAGE; + } } JEMALLOC_ALIGNED(CACHELINE) diff --git a/test/unit/junk.c b/test/unit/junk.c index be8933a..57e3ad4 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -129,8 +129,7 @@ TEST_END TEST_BEGIN(test_junk_large) { test_skip_if(!config_fill); - test_junk(SC_SMALL_MAXCLASS + 1, - (1U << (sc_data_global.lg_large_minclass + 1))); + test_junk(SC_SMALL_MAXCLASS + 1, (1U << (SC_LG_LARGE_MINCLASS + 1))); } TEST_END diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index f636200..452d884 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -581,7 +581,7 @@ TEST_BEGIN(test_arena_i_retain_grow_limit) { assert_d_eq(mallctlbymib(mib, miblen, &default_limit, &sz, NULL, 0), 0, "Unexpected mallctl() failure"); - assert_zu_eq(default_limit, sz_pind2sz(sc_data_global.npsizes - 1), + assert_zu_eq(default_limit, SC_LARGE_MAXCLASS, "Unexpected default for retain_grow_limit"); new_limit = PAGE - 1; diff --git a/test/unit/prof_gdump.c b/test/unit/prof_gdump.c index 0b8d7c3..f7e0aac 100644 --- a/test/unit/prof_gdump.c +++ b/test/unit/prof_gdump.c @@ -29,12 +29,12 @@ TEST_BEGIN(test_gdump) { prof_dump_open = prof_dump_open_intercept; did_prof_dump_open = false; - p = mallocx((1U << sc_data_global.lg_large_minclass), 0); + p = mallocx((1U << SC_LG_LARGE_MINCLASS), 0); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_true(did_prof_dump_open, "Expected a profile dump"); did_prof_dump_open = false; - q = mallocx((1U << sc_data_global.lg_large_minclass), 0); + q = mallocx((1U << SC_LG_LARGE_MINCLASS), 0); assert_ptr_not_null(q, "Unexpected mallocx() failure"); assert_true(did_prof_dump_open, "Expected a profile dump"); @@ -45,7 +45,7 @@ TEST_BEGIN(test_gdump) { "Unexpected mallctl failure while disabling prof.gdump"); assert(gdump_old); did_prof_dump_open = false; - r = mallocx((1U << sc_data_global.lg_large_minclass), 0); + r = mallocx((1U << SC_LG_LARGE_MINCLASS), 0); assert_ptr_not_null(q, "Unexpected mallocx() failure"); assert_false(did_prof_dump_open, "Unexpected profile dump"); @@ -56,7 +56,7 @@ TEST_BEGIN(test_gdump) { "Unexpected mallctl failure while enabling prof.gdump"); assert(!gdump_old); did_prof_dump_open = false; - s = mallocx((1U << sc_data_global.lg_large_minclass), 0); + s = mallocx((1U << SC_LG_LARGE_MINCLASS), 0); assert_ptr_not_null(q, "Unexpected mallocx() failure"); assert_true(did_prof_dump_open, "Expected a profile dump"); diff --git a/test/unit/size_classes.c b/test/unit/size_classes.c index 7c28e16..6947336 100644 --- a/test/unit/size_classes.c +++ 
b/test/unit/size_classes.c @@ -108,8 +108,13 @@ TEST_BEGIN(test_psize_classes) { size_class, sz_psz2ind(size_class), sz_pind2sz(sz_psz2ind(size_class))); - assert_u_eq(pind+1, sz_psz2ind(size_class+1), - "Next size_class does not round up properly"); + if (size_class == SC_LARGE_MAXCLASS) { + assert_u_eq(SC_NPSIZES, sz_psz2ind(size_class + 1), + "Next size_class does not round up properly"); + } else { + assert_u_eq(pind + 1, sz_psz2ind(size_class + 1), + "Next size_class does not round up properly"); + } assert_zu_eq(size_class, (pind > 0) ? sz_psz2u(sz_pind2sz(pind-1)+1) : sz_psz2u(1), @@ -156,16 +161,13 @@ TEST_BEGIN(test_overflow) { assert_zu_eq(sz_s2u(SIZE_T_MAX), 0, "sz_s2u() should return 0 on overflow"); - assert_u_eq(sz_psz2ind(max_size_class+1), sc_data_global.npsizes, + assert_u_eq(sz_psz2ind(max_size_class+1), SC_NPSIZES, "sz_psz2ind() should return NPSIZES on overflow"); - assert_u_eq(sz_psz2ind(ZU(PTRDIFF_MAX)+1), sc_data_global.npsizes, + assert_u_eq(sz_psz2ind(ZU(PTRDIFF_MAX)+1), SC_NPSIZES, "sz_psz2ind() should return NPSIZES on overflow"); - assert_u_eq(sz_psz2ind(SIZE_T_MAX), sc_data_global.npsizes, + assert_u_eq(sz_psz2ind(SIZE_T_MAX), SC_NPSIZES, "sz_psz2ind() should return NPSIZES on overflow"); - assert_u_le(sc_data_global.npsizes, SC_NPSIZES_MAX, - "Dynamic value of npsizes is higher than static bound."); - assert_zu_eq(sz_psz2u(max_size_class+1), max_psz, "sz_psz2u() should return (LARGE_MAXCLASS + PAGE) for unsupported" " size"); diff --git a/test/unit/stats.c b/test/unit/stats.c index b8f549b..4323bfa 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -76,7 +76,7 @@ TEST_BEGIN(test_stats_arenas_summary) { little = mallocx(SC_SMALL_MAXCLASS, MALLOCX_ARENA(0)); assert_ptr_not_null(little, "Unexpected mallocx() failure"); - large = mallocx((1U << sc_data_global.lg_large_minclass), + large = mallocx((1U << SC_LG_LARGE_MINCLASS), MALLOCX_ARENA(0)); assert_ptr_not_null(large, "Unexpected mallocx() failure"); @@ -192,7 +192,7 @@ TEST_BEGIN(test_stats_arenas_large) { uint64_t epoch, nmalloc, ndalloc; int expected = config_stats ? 0 : ENOENT; - p = mallocx((1U << sc_data_global.lg_large_minclass), MALLOCX_ARENA(0)); + p = mallocx((1U << SC_LG_LARGE_MINCLASS), MALLOCX_ARENA(0)); assert_ptr_not_null(p, "Unexpected mallocx() failure"); assert_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)), diff --git a/test/unit/zero.c b/test/unit/zero.c index 8b8d207..271fd5c 100644 --- a/test/unit/zero.c +++ b/test/unit/zero.c @@ -47,8 +47,7 @@ TEST_END TEST_BEGIN(test_zero_large) { test_skip_if(!config_fill); - test_zero(SC_SMALL_MAXCLASS + 1, - 1U << (sc_data_global.lg_large_minclass + 1)); + test_zero(SC_SMALL_MAXCLASS + 1, 1U << (SC_LG_LARGE_MINCLASS + 1)); } TEST_END -- cgit v0.12 From 013ab26c8674e07d40098f7385e570c6d8b0dee9 Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 26 Jul 2018 14:17:36 -0700 Subject: TSD: Add a tsd_nominal_list death assertion. A thread should have had its state transition away from nominal before it dies. This change adds that to the list of thread death assertions. 
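The mechanics, as a minimal standalone sketch built on raw pthreads TLS destructors (jemalloc's tsd machinery is considerably more involved; the names here are illustrative): a thread marks itself "nominal" while registered, unmarks itself during cleanup, and the destructor that runs at thread death asserts that the unmark already happened.

    #include <assert.h>
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    static pthread_key_t tsd_key;

    typedef struct {
        bool in_nominal_list; /* true while registered on a global list */
    } tsd_sketch_t;

    /* TLS destructor: runs at thread death, after cleanup should be done. */
    static void
    tsd_sketch_cleanup(void *arg) {
        tsd_sketch_t *tsd = arg;
        /* A dying thread must already have left the nominal state. */
        assert(!tsd->in_nominal_list);
        free(tsd);
    }

    static void *
    thread_body(void *arg) {
        (void)arg;
        tsd_sketch_t *tsd = calloc(1, sizeof(*tsd));
        pthread_setspecific(tsd_key, tsd);
        tsd->in_nominal_list = true;  /* state -> nominal */
        /* ... thread does its work ... */
        tsd->in_nominal_list = false; /* cleanup: leave nominal state */
        return NULL;
    }

    int
    main(void) {
        pthread_key_create(&tsd_key, tsd_sketch_cleanup);
        pthread_t thd;
        pthread_create(&thd, NULL, thread_body, NULL);
        pthread_join(thd, NULL);
        return 0;
    }

Compile with -pthread; removing the unmark line makes the assertion fire on the thread-exit path, which is exactly the class of bug the added assert is meant to catch.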
--- src/tsd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tsd.c b/src/tsd.c index f2b601d..26142ff 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -238,6 +238,7 @@ tsd_data_init(tsd_t *tsd) { static void assert_tsd_data_cleanup_done(tsd_t *tsd) { assert(!tsd_nominal(tsd)); + assert(!tsd_in_nominal_list(tsd)); assert(*tsd_arenap_get_unsafe(tsd) == NULL); assert(*tsd_iarenap_get_unsafe(tsd) == NULL); assert(*tsd_arenas_tdata_bypassp_get_unsafe(tsd) == true); -- cgit v0.12 From 41b7372eadee941b9164751b8d4963f915d3ceae Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Thu, 26 Jul 2018 14:42:37 -0700 Subject: TSD: Add fork support to tsd_nominal_tsds. In case of multithreaded fork, we want to leave the child in a reasonable state, in which tsd_nominal_tsds is either empty or contains only the forking thread. --- include/jemalloc/internal/tsd.h | 3 +++ src/jemalloc.c | 5 +++++ src/tsd.c | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index e5e82f4..59a1885 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -105,6 +105,9 @@ void tsd_cleanup(void *arg); tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal); void tsd_state_set(tsd_t *tsd, uint8_t new_state); void tsd_slow_update(tsd_t *tsd); +void tsd_prefork(tsd_t *tsd); +void tsd_postfork_parent(tsd_t *tsd); +void tsd_postfork_child(tsd_t *tsd); /* * Call ..._inc when your module wants to take all threads down the slow paths, diff --git a/src/jemalloc.c b/src/jemalloc.c index d473664..85ec9e0 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -3470,6 +3470,7 @@ _malloc_prefork(void) } } prof_prefork1(tsd_tsdn(tsd)); + tsd_prefork(tsd); } #ifndef JEMALLOC_MUTEX_INIT_CB @@ -3492,6 +3493,8 @@ _malloc_postfork(void) tsd = tsd_fetch(); + tsd_postfork_parent(tsd); + witness_postfork_parent(tsd_witness_tsdp_get(tsd)); /* Release all mutexes, now that fork() has completed. */ for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { @@ -3519,6 +3522,8 @@ jemalloc_postfork_child(void) { tsd = tsd_fetch(); + tsd_postfork_child(tsd); + witness_postfork_child(tsd_witness_tsdp_get(tsd)); /* Release all mutexes, now that fork() has completed. */ for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { diff --git a/src/tsd.c b/src/tsd.c index 26142ff..1204a0d 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -509,3 +509,23 @@ tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block) { malloc_mutex_unlock(TSDN_NULL, &head->lock); } #endif + +void +tsd_prefork(tsd_t *tsd) { + malloc_mutex_prefork(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +void +tsd_postfork_parent(tsd_t *tsd) { + malloc_mutex_postfork_parent(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +void +tsd_postfork_child(tsd_t *tsd) { + malloc_mutex_postfork_child(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_new(&tsd_nominal_tsds); + + if (tsd_state_get(tsd) <= tsd_state_nominal_max) { + tsd_add_nominal(tsd); + } +} -- cgit v0.12 From eb261e53a6bfaef9797395fe09d6a425b11acb42 Mon Sep 17 00:00:00 2001 From: Tyler Etzel Date: Thu, 5 Jul 2018 10:31:43 -0700 Subject: Small refactoring of emitter - Make API more clear for using as standalone json emitter - Support cases that weren't possible before, e.g. 
- emitting primitive values in an array - emitting nested arrays --- include/jemalloc/internal/emitter.h | 314 +++++++++++++++++++++--------------- src/stats.c | 92 +++++------ test/unit/emitter.c | 94 ++++++++--- 3 files changed, 301 insertions(+), 199 deletions(-) diff --git a/include/jemalloc/internal/emitter.h b/include/jemalloc/internal/emitter.h index 3a2b2f7..f8da228 100644 --- a/include/jemalloc/internal/emitter.h +++ b/include/jemalloc/internal/emitter.h @@ -60,17 +60,6 @@ struct emitter_row_s { ql_head(emitter_col_t) cols; }; -static inline void -emitter_row_init(emitter_row_t *row) { - ql_new(&row->cols); -} - -static inline void -emitter_col_init(emitter_col_t *col, emitter_row_t *row) { - ql_elm_new(col, link); - ql_tail_insert(&row->cols, col, link); -} - typedef struct emitter_s emitter_t; struct emitter_s { emitter_output_t output; @@ -80,18 +69,10 @@ struct emitter_s { int nesting_depth; /* True if we've already emitted a value at the given depth. */ bool item_at_depth; + /* True if we emitted a key and will emit corresponding value next. */ + bool emitted_key; }; -static inline void -emitter_init(emitter_t *emitter, emitter_output_t emitter_output, - void (*write_cb)(void *, const char *), void *cbopaque) { - emitter->output = emitter_output; - emitter->write_cb = write_cb; - emitter->cbopaque = cbopaque; - emitter->item_at_depth = false; - emitter->nesting_depth = 0; -} - /* Internal convenience function. Write to the emitter the given string. */ JEMALLOC_FORMAT_PRINTF(2, 3) static inline void @@ -103,18 +84,6 @@ emitter_printf(emitter_t *emitter, const char *format, ...) { va_end(ap); } -/* Write to the emitter the given string, but only in table mode. */ -JEMALLOC_FORMAT_PRINTF(2, 3) -static inline void -emitter_table_printf(emitter_t *emitter, const char *format, ...) { - if (emitter->output == emitter_output_table) { - va_list ap; - va_start(ap, format); - malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap); - va_end(ap); - } -} - static inline void emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier, emitter_justify_t justify, int width) { @@ -235,201 +204,278 @@ emitter_indent(emitter_t *emitter) { static inline void emitter_json_key_prefix(emitter_t *emitter) { + if (emitter->emitted_key) { + emitter->emitted_key = false; + return; + } emitter_printf(emitter, "%s\n", emitter->item_at_depth ? "," : ""); emitter_indent(emitter); } +/******************************************************************************/ +/* Public functions for emitter_t. */ + static inline void -emitter_begin(emitter_t *emitter) { - if (emitter->output == emitter_output_json) { - assert(emitter->nesting_depth == 0); - emitter_printf(emitter, "{"); - emitter_nest_inc(emitter); - } else { - // tabular init - emitter_printf(emitter, "%s", ""); - } +emitter_init(emitter_t *emitter, emitter_output_t emitter_output, + void (*write_cb)(void *, const char *), void *cbopaque) { + emitter->output = emitter_output; + emitter->write_cb = write_cb; + emitter->cbopaque = cbopaque; + emitter->item_at_depth = false; + emitter->emitted_key = false; + emitter->nesting_depth = 0; } +/******************************************************************************/ +/* JSON public API. */ + +/* + * Emits a key (e.g. as appears in an object). The next json entity emitted will + * be the corresponding value. 
+ */ static inline void -emitter_end(emitter_t *emitter) { +emitter_json_key(emitter_t *emitter, const char *json_key) { if (emitter->output == emitter_output_json) { - assert(emitter->nesting_depth == 1); - emitter_nest_dec(emitter); - emitter_printf(emitter, "\n}\n"); + emitter_json_key_prefix(emitter); + emitter_printf(emitter, "\"%s\": ", json_key); + emitter->emitted_key = true; } } -/* - * Note emits a different kv pair as well, but only in table mode. Omits the - * note if table_note_key is NULL. - */ static inline void -emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key, - emitter_type_t value_type, const void *value, - const char *table_note_key, emitter_type_t table_note_value_type, - const void *table_note_value) { +emitter_json_value(emitter_t *emitter, emitter_type_t value_type, + const void *value) { if (emitter->output == emitter_output_json) { - assert(emitter->nesting_depth > 0); emitter_json_key_prefix(emitter); - emitter_printf(emitter, "\"%s\": ", json_key); - emitter_print_value(emitter, emitter_justify_none, -1, - value_type, value); - } else { - emitter_indent(emitter); - emitter_printf(emitter, "%s: ", table_key); emitter_print_value(emitter, emitter_justify_none, -1, value_type, value); - if (table_note_key != NULL) { - emitter_printf(emitter, " (%s: ", table_note_key); - emitter_print_value(emitter, emitter_justify_none, -1, - table_note_value_type, table_note_value); - emitter_printf(emitter, ")"); - } - emitter_printf(emitter, "\n"); + emitter->item_at_depth = true; } - emitter->item_at_depth = true; } +/* Shorthand for calling emitter_json_key and then emitter_json_value. */ static inline void -emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key, +emitter_json_kv(emitter_t *emitter, const char *json_key, emitter_type_t value_type, const void *value) { - emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL, - emitter_type_bool, NULL); + emitter_json_key(emitter, json_key); + emitter_json_value(emitter, value_type, value); } static inline void -emitter_json_kv(emitter_t *emitter, const char *json_key, - emitter_type_t value_type, const void *value) { +emitter_json_array_begin(emitter_t *emitter) { if (emitter->output == emitter_output_json) { - emitter_kv(emitter, json_key, NULL, value_type, value); + emitter_json_key_prefix(emitter); + emitter_printf(emitter, "["); + emitter_nest_inc(emitter); } } +/* Shorthand for calling emitter_json_key and then emitter_json_array_begin. 
*/ static inline void -emitter_table_kv(emitter_t *emitter, const char *table_key, - emitter_type_t value_type, const void *value) { - if (emitter->output == emitter_output_table) { - emitter_kv(emitter, NULL, table_key, value_type, value); +emitter_json_array_kv_begin(emitter_t *emitter, const char *json_key) { + emitter_json_key(emitter, json_key); + emitter_json_array_begin(emitter); +} + +static inline void +emitter_json_array_end(emitter_t *emitter) { + if (emitter->output == emitter_output_json) { + assert(emitter->nesting_depth > 0); + emitter_nest_dec(emitter); + emitter_printf(emitter, "\n"); + emitter_indent(emitter); + emitter_printf(emitter, "]"); } } static inline void -emitter_dict_begin(emitter_t *emitter, const char *json_key, - const char *table_header) { +emitter_json_object_begin(emitter_t *emitter) { if (emitter->output == emitter_output_json) { emitter_json_key_prefix(emitter); - emitter_printf(emitter, "\"%s\": {", json_key); - emitter_nest_inc(emitter); - } else { - emitter_indent(emitter); - emitter_printf(emitter, "%s\n", table_header); + emitter_printf(emitter, "{"); emitter_nest_inc(emitter); } } +/* Shorthand for calling emitter_json_key and then emitter_json_object_begin. */ static inline void -emitter_dict_end(emitter_t *emitter) { +emitter_json_object_kv_begin(emitter_t *emitter, const char *json_key) { + emitter_json_key(emitter, json_key); + emitter_json_object_begin(emitter); +} + +static inline void +emitter_json_object_end(emitter_t *emitter) { if (emitter->output == emitter_output_json) { assert(emitter->nesting_depth > 0); emitter_nest_dec(emitter); emitter_printf(emitter, "\n"); emitter_indent(emitter); emitter_printf(emitter, "}"); - } else { - emitter_nest_dec(emitter); } } + +/******************************************************************************/ +/* Table public API. 
*/ + static inline void -emitter_json_dict_begin(emitter_t *emitter, const char *json_key) { - if (emitter->output == emitter_output_json) { - emitter_dict_begin(emitter, json_key, NULL); +emitter_table_dict_begin(emitter_t *emitter, const char *table_key) { + if (emitter->output == emitter_output_table) { + emitter_indent(emitter); + emitter_printf(emitter, "%s\n", table_key); + emitter_nest_inc(emitter); } } static inline void -emitter_json_dict_end(emitter_t *emitter) { - if (emitter->output == emitter_output_json) { - emitter_dict_end(emitter); +emitter_table_dict_end(emitter_t *emitter) { + if (emitter->output == emitter_output_table) { + emitter_nest_dec(emitter); } } static inline void -emitter_table_dict_begin(emitter_t *emitter, const char *table_key) { +emitter_table_kv_note(emitter_t *emitter, const char *table_key, + emitter_type_t value_type, const void *value, + const char *table_note_key, emitter_type_t table_note_value_type, + const void *table_note_value) { if (emitter->output == emitter_output_table) { - emitter_dict_begin(emitter, NULL, table_key); + emitter_indent(emitter); + emitter_printf(emitter, "%s: ", table_key); + emitter_print_value(emitter, emitter_justify_none, -1, + value_type, value); + if (table_note_key != NULL) { + emitter_printf(emitter, " (%s: ", table_note_key); + emitter_print_value(emitter, emitter_justify_none, -1, + table_note_value_type, table_note_value); + emitter_printf(emitter, ")"); + } + emitter_printf(emitter, "\n"); } + emitter->item_at_depth = true; } static inline void -emitter_table_dict_end(emitter_t *emitter) { +emitter_table_kv(emitter_t *emitter, const char *table_key, + emitter_type_t value_type, const void *value) { + emitter_table_kv_note(emitter, table_key, value_type, value, NULL, + emitter_type_bool, NULL); +} + + +/* Write to the emitter the given string, but only in table mode. */ +JEMALLOC_FORMAT_PRINTF(2, 3) +static inline void +emitter_table_printf(emitter_t *emitter, const char *format, ...) { if (emitter->output == emitter_output_table) { - emitter_dict_end(emitter); + va_list ap; + va_start(ap, format); + malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap); + va_end(ap); } } static inline void -emitter_json_arr_begin(emitter_t *emitter, const char *json_key) { - if (emitter->output == emitter_output_json) { - emitter_json_key_prefix(emitter); - emitter_printf(emitter, "\"%s\": [", json_key); - emitter_nest_inc(emitter); +emitter_table_row(emitter_t *emitter, emitter_row_t *row) { + if (emitter->output != emitter_output_table) { + return; + } + emitter_col_t *col; + ql_foreach(col, &row->cols, link) { + emitter_print_value(emitter, col->justify, col->width, + col->type, (const void *)&col->bool_val); } + emitter_table_printf(emitter, "\n"); +} + +static inline void +emitter_row_init(emitter_row_t *row) { + ql_new(&row->cols); } static inline void -emitter_json_arr_end(emitter_t *emitter) { +emitter_col_init(emitter_col_t *col, emitter_row_t *row) { + ql_elm_new(col, link); + ql_tail_insert(&row->cols, col, link); +} + + +/******************************************************************************/ +/* + * Generalized public API. Emits using either JSON or table, according to + * settings in the emitter_t. */ + +/* + * Note emits a different kv pair as well, but only in table mode. Omits the + * note if table_note_key is NULL. 
+ */ +static inline void +emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key, + emitter_type_t value_type, const void *value, + const char *table_note_key, emitter_type_t table_note_value_type, + const void *table_note_value) { if (emitter->output == emitter_output_json) { - assert(emitter->nesting_depth > 0); - emitter_nest_dec(emitter); - emitter_printf(emitter, "\n"); - emitter_indent(emitter); - emitter_printf(emitter, "]"); + emitter_json_key(emitter, json_key); + emitter_json_value(emitter, value_type, value); + } else { + emitter_table_kv_note(emitter, table_key, value_type, value, + table_note_key, table_note_value_type, table_note_value); } + emitter->item_at_depth = true; } static inline void -emitter_json_arr_obj_begin(emitter_t *emitter) { +emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key, + emitter_type_t value_type, const void *value) { + emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL, + emitter_type_bool, NULL); +} + +static inline void +emitter_dict_begin(emitter_t *emitter, const char *json_key, + const char *table_header) { if (emitter->output == emitter_output_json) { - emitter_json_key_prefix(emitter); - emitter_printf(emitter, "{"); - emitter_nest_inc(emitter); + emitter_json_key(emitter, json_key); + emitter_json_object_begin(emitter); + } else { + emitter_table_dict_begin(emitter, table_header); } } static inline void -emitter_json_arr_obj_end(emitter_t *emitter) { +emitter_dict_end(emitter_t *emitter) { if (emitter->output == emitter_output_json) { - assert(emitter->nesting_depth > 0); - emitter_nest_dec(emitter); - emitter_printf(emitter, "\n"); - emitter_indent(emitter); - emitter_printf(emitter, "}"); + emitter_json_object_end(emitter); + } else { + emitter_table_dict_end(emitter); } } static inline void -emitter_json_arr_value(emitter_t *emitter, emitter_type_t value_type, - const void *value) { +emitter_begin(emitter_t *emitter) { if (emitter->output == emitter_output_json) { - emitter_json_key_prefix(emitter); - emitter_print_value(emitter, emitter_justify_none, -1, - value_type, value); + assert(emitter->nesting_depth == 0); + emitter_printf(emitter, "{"); + emitter_nest_inc(emitter); + } else { + /* + * This guarantees that we always call write_cb at least once. + * This is useful if some invariant is established by each call + * to write_cb, but doesn't hold initially: e.g., some buffer + * holds a null-terminated string. 
+ */ + emitter_printf(emitter, "%s", ""); } } static inline void -emitter_table_row(emitter_t *emitter, emitter_row_t *row) { - if (emitter->output != emitter_output_table) { - return; - } - emitter_col_t *col; - ql_foreach(col, &row->cols, link) { - emitter_print_value(emitter, col->justify, col->width, - col->type, (const void *)&col->bool_val); +emitter_end(emitter_t *emitter) { + if (emitter->output == emitter_output_json) { + assert(emitter->nesting_depth == 1); + emitter_nest_dec(emitter); + emitter_printf(emitter, "\n}\n"); } - emitter_table_printf(emitter, "\n"); } #endif /* JEMALLOC_INTERNAL_EMITTER_H */ diff --git a/src/stats.c b/src/stats.c index 93a04b7..64d7323 100644 --- a/src/stats.c +++ b/src/stats.c @@ -287,7 +287,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { header_col_size.width -=5; emitter_table_printf(emitter, "bins:"); emitter_table_row(emitter, &header_row); - emitter_json_arr_begin(emitter, "bins"); + emitter_json_array_kv_begin(emitter, "bins"); for (j = 0, in_gap = false; j < nbins; j++) { uint64_t nslabs; @@ -333,7 +333,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { col_mutex32); } - emitter_json_arr_obj_begin(emitter); + emitter_json_object_begin(emitter); emitter_json_kv(emitter, "nmalloc", emitter_type_uint64, &nmalloc); emitter_json_kv(emitter, "ndalloc", emitter_type_uint64, @@ -351,12 +351,12 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { emitter_json_kv(emitter, "curslabs", emitter_type_size, &curslabs); if (mutex) { - emitter_json_dict_begin(emitter, "mutex"); + emitter_json_object_kv_begin(emitter, "mutex"); mutex_stats_emit(emitter, NULL, col_mutex64, col_mutex32); - emitter_json_dict_end(emitter); + emitter_json_object_end(emitter); } - emitter_json_arr_obj_end(emitter); + emitter_json_object_end(emitter); size_t availregs = nregs * curslabs; char util[6]; @@ -400,7 +400,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { emitter_table_row(emitter, &row); } - emitter_json_arr_end(emitter); /* Close "bins". */ + emitter_json_array_end(emitter); /* Close "bins". */ if (in_gap) { emitter_table_printf(emitter, " ---\n"); @@ -447,7 +447,7 @@ stats_arena_lextents_print(emitter_t *emitter, unsigned i) { header_size.width -= 6; emitter_table_printf(emitter, "large:"); emitter_table_row(emitter, &header_row); - emitter_json_arr_begin(emitter, "lextents"); + emitter_json_array_kv_begin(emitter, "lextents"); for (j = 0, in_gap = false; j < nlextents; j++) { uint64_t nmalloc, ndalloc, nrequests; @@ -471,10 +471,10 @@ stats_arena_lextents_print(emitter_t *emitter, unsigned i) { CTL_M2_M4_GET("stats.arenas.0.lextents.0.curlextents", i, j, &curlextents, size_t); - emitter_json_arr_obj_begin(emitter); + emitter_json_object_begin(emitter); emitter_json_kv(emitter, "curlextents", emitter_type_size, &curlextents); - emitter_json_arr_obj_end(emitter); + emitter_json_object_end(emitter); col_size.size_val = lextent_size; col_ind.unsigned_val = nbins + j; @@ -488,7 +488,7 @@ stats_arena_lextents_print(emitter_t *emitter, unsigned i) { emitter_table_row(emitter, &row); } } - emitter_json_arr_end(emitter); /* Close "lextents". */ + emitter_json_array_end(emitter); /* Close "lextents". 
*/ if (in_gap) { emitter_table_printf(emitter, " ---\n"); } @@ -504,19 +504,19 @@ stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind) { emitter_row_init(&row); mutex_stats_init_cols(&row, "", &col_name, col64, col32); - emitter_json_dict_begin(emitter, "mutexes"); + emitter_json_object_kv_begin(emitter, "mutexes"); emitter_table_row(emitter, &row); for (mutex_prof_arena_ind_t i = 0; i < mutex_prof_num_arena_mutexes; i++) { const char *name = arena_mutex_names[i]; - emitter_json_dict_begin(emitter, name); + emitter_json_object_kv_begin(emitter, name); mutex_stats_read_arena(arena_ind, i, name, &col_name, col64, col32); mutex_stats_emit(emitter, &row, col64, col32); - emitter_json_dict_end(emitter); /* Close the mutex dict. */ + emitter_json_object_end(emitter); /* Close the mutex dict. */ } - emitter_json_dict_end(emitter); /* End "mutexes". */ + emitter_json_object_end(emitter); /* End "mutexes". */ } static void @@ -738,7 +738,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, alloc_count_##name.type = emitter_type_##valtype; \ alloc_count_##name.valtype##_val = small_or_large##_##name; - emitter_json_dict_begin(emitter, "small"); + emitter_json_object_kv_begin(emitter, "small"); alloc_count_title.str_val = "small:"; GET_AND_EMIT_ALLOC_STAT(small, allocated, size) @@ -747,9 +747,9 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, GET_AND_EMIT_ALLOC_STAT(small, nrequests, uint64) emitter_table_row(emitter, &alloc_count_row); - emitter_json_dict_end(emitter); /* Close "small". */ + emitter_json_object_end(emitter); /* Close "small". */ - emitter_json_dict_begin(emitter, "large"); + emitter_json_object_kv_begin(emitter, "large"); alloc_count_title.str_val = "large:"; GET_AND_EMIT_ALLOC_STAT(large, allocated, size) @@ -758,7 +758,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, GET_AND_EMIT_ALLOC_STAT(large, nrequests, uint64) emitter_table_row(emitter, &alloc_count_row); - emitter_json_dict_end(emitter); /* Close "large". */ + emitter_json_object_end(emitter); /* Close "large". */ #undef GET_AND_EMIT_ALLOC_STAT @@ -980,7 +980,7 @@ stats_general_print(emitter_t *emitter) { * The json output sticks arena info into an "arenas" dict; the table * output puts them at the top-level. */ - emitter_json_dict_begin(emitter, "arenas"); + emitter_json_object_kv_begin(emitter, "arenas"); CTL_GET("arenas.narenas", &uv, unsigned); emitter_kv(emitter, "narenas", "Arenas", emitter_type_unsigned, &uv); @@ -1021,9 +1021,9 @@ stats_general_print(emitter_t *emitter) { * (not just omit the printing). */ if (emitter->output == emitter_output_json) { - emitter_json_arr_begin(emitter, "bin"); + emitter_json_array_kv_begin(emitter, "bin"); for (unsigned i = 0; i < nbins; i++) { - emitter_json_arr_obj_begin(emitter); + emitter_json_object_begin(emitter); CTL_M2_GET("arenas.bin.0.size", i, &sv, size_t); emitter_json_kv(emitter, "size", emitter_type_size, @@ -1037,9 +1037,9 @@ stats_general_print(emitter_t *emitter) { emitter_json_kv(emitter, "slab_size", emitter_type_size, &sv); - emitter_json_arr_obj_end(emitter); + emitter_json_object_end(emitter); } - emitter_json_arr_end(emitter); /* Close "bin". */ + emitter_json_array_end(emitter); /* Close "bin". 
*/ } unsigned nlextents; @@ -1048,20 +1048,20 @@ stats_general_print(emitter_t *emitter) { emitter_type_unsigned, &nlextents); if (emitter->output == emitter_output_json) { - emitter_json_arr_begin(emitter, "lextent"); + emitter_json_array_kv_begin(emitter, "lextent"); for (unsigned i = 0; i < nlextents; i++) { - emitter_json_arr_obj_begin(emitter); + emitter_json_object_begin(emitter); CTL_M2_GET("arenas.lextent.0.size", i, &sv, size_t); emitter_json_kv(emitter, "size", emitter_type_size, &sv); - emitter_json_arr_obj_end(emitter); + emitter_json_object_end(emitter); } - emitter_json_arr_end(emitter); /* Close "lextent". */ + emitter_json_array_end(emitter); /* Close "lextent". */ } - emitter_json_dict_end(emitter); /* Close "arenas" */ + emitter_json_object_end(emitter); /* Close "arenas" */ } static void @@ -1098,7 +1098,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, } /* Generic global stats. */ - emitter_json_dict_begin(emitter, "stats"); + emitter_json_object_kv_begin(emitter, "stats"); emitter_json_kv(emitter, "allocated", emitter_type_size, &allocated); emitter_json_kv(emitter, "active", emitter_type_size, &active); emitter_json_kv(emitter, "metadata", emitter_type_size, &metadata); @@ -1114,14 +1114,14 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, resident, mapped, retained); /* Background thread stats. */ - emitter_json_dict_begin(emitter, "background_thread"); + emitter_json_object_kv_begin(emitter, "background_thread"); emitter_json_kv(emitter, "num_threads", emitter_type_size, &num_background_threads); emitter_json_kv(emitter, "num_runs", emitter_type_uint64, &background_thread_num_runs); emitter_json_kv(emitter, "run_interval", emitter_type_uint64, &background_thread_run_interval); - emitter_json_dict_end(emitter); /* Close "background_thread". */ + emitter_json_object_end(emitter); /* Close "background_thread". */ emitter_table_printf(emitter, "Background threads: %zu, " "num_runs: %"FMTu64", run_interval: %"FMTu64" ns\n", @@ -1138,25 +1138,25 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, mutex_stats_init_cols(&row, "", &name, col64, col32); emitter_table_row(emitter, &row); - emitter_json_dict_begin(emitter, "mutexes"); + emitter_json_object_kv_begin(emitter, "mutexes"); for (int i = 0; i < mutex_prof_num_global_mutexes; i++) { mutex_stats_read_global(global_mutex_names[i], &name, col64, col32); - emitter_json_dict_begin(emitter, global_mutex_names[i]); + emitter_json_object_kv_begin(emitter, global_mutex_names[i]); mutex_stats_emit(emitter, &row, col64, col32); - emitter_json_dict_end(emitter); + emitter_json_object_end(emitter); } - emitter_json_dict_end(emitter); /* Close "mutexes". */ + emitter_json_object_end(emitter); /* Close "mutexes". */ } - emitter_json_dict_end(emitter); /* Close "stats". */ + emitter_json_object_end(emitter); /* Close "stats". */ if (merged || destroyed || unmerged) { unsigned narenas; - emitter_json_dict_begin(emitter, "stats.arenas"); + emitter_json_object_kv_begin(emitter, "stats.arenas"); CTL_GET("arenas.narenas", &narenas, unsigned); size_t mib[3]; @@ -1185,10 +1185,10 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, if (merged && (ninitialized > 1 || !unmerged)) { /* Print merged arena stats. 
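			 * (These are the stats of the MALLCTL_ARENAS_ALL
			 * pseudo-arena, i.e., an aggregate over all
			 * initialized arenas.)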
*/ emitter_table_printf(emitter, "Merged arenas stats:\n"); - emitter_json_dict_begin(emitter, "merged"); + emitter_json_object_kv_begin(emitter, "merged"); stats_arena_print(emitter, MALLCTL_ARENAS_ALL, bins, large, mutex); - emitter_json_dict_end(emitter); /* Close "merged". */ + emitter_json_object_end(emitter); /* Close "merged". */ } /* Destroyed stats. */ @@ -1196,10 +1196,10 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, /* Print destroyed arena stats. */ emitter_table_printf(emitter, "Destroyed arenas stats:\n"); - emitter_json_dict_begin(emitter, "destroyed"); + emitter_json_object_kv_begin(emitter, "destroyed"); stats_arena_print(emitter, MALLCTL_ARENAS_DESTROYED, bins, large, mutex); - emitter_json_dict_end(emitter); /* Close "destroyed". */ + emitter_json_object_end(emitter); /* Close "destroyed". */ } /* Unmerged stats. */ @@ -1209,18 +1209,18 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, char arena_ind_str[20]; malloc_snprintf(arena_ind_str, sizeof(arena_ind_str), "%u", i); - emitter_json_dict_begin(emitter, + emitter_json_object_kv_begin(emitter, arena_ind_str); emitter_table_printf(emitter, "arenas[%s]:\n", arena_ind_str); stats_arena_print(emitter, i, bins, large, mutex); /* Close "". */ - emitter_json_dict_end(emitter); + emitter_json_object_end(emitter); } } } - emitter_json_dict_end(emitter); /* Close "stats.arenas". */ + emitter_json_object_end(emitter); /* Close "stats.arenas". */ } } @@ -1273,7 +1273,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, cbopaque); emitter_begin(&emitter); emitter_table_printf(&emitter, "___ Begin jemalloc statistics ___\n"); - emitter_json_dict_begin(&emitter, "jemalloc"); + emitter_json_object_kv_begin(&emitter, "jemalloc"); if (general) { stats_general_print(&emitter); @@ -1283,7 +1283,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, bins, large, mutex); } - emitter_json_dict_end(&emitter); /* Closes the "jemalloc" dict. */ + emitter_json_object_end(&emitter); /* Closes the "jemalloc" dict. 
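+	 * In json mode the document emitted between emitter_begin() and
+	 * emitter_end() therefore has the overall shape
+	 *   {"jemalloc": {<general info>, "stats": ..., "stats.arenas": ...}}
+	 * with the inner keys present according to the requested options.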
*/ emitter_table_printf(&emitter, "--- End jemalloc statistics ---\n"); emitter_end(&emitter); } diff --git a/test/unit/emitter.c b/test/unit/emitter.c index 6ffd1c3..b4a693f 100644 --- a/test/unit/emitter.c +++ b/test/unit/emitter.c @@ -169,7 +169,7 @@ static void emit_nested_dict(emitter_t *emitter) { emitter_end(emitter); } -static const char *nested_dict_json = +static const char *nested_object_json = "{\n" "\t\"json1\": {\n" "\t\t\"json2\": {\n" @@ -183,7 +183,7 @@ static const char *nested_dict_json = "\t}\n" "}\n"; -static const char *nested_dict_table = +static const char *nested_object_table = "Dict 1\n" " Dict 2\n" " A primitive: 123\n" @@ -192,8 +192,8 @@ static const char *nested_dict_table = " Another primitive: 123\n"; TEST_BEGIN(test_nested_dict) { - assert_emit_output(&emit_nested_dict, nested_dict_json, - nested_dict_table); + assert_emit_output(&emit_nested_dict, nested_object_json, + nested_object_table); } TEST_END @@ -256,13 +256,14 @@ emit_modal(emitter_t *emitter) { int val = 123; emitter_begin(emitter); emitter_dict_begin(emitter, "j0", "T0"); - emitter_json_dict_begin(emitter, "j1"); + emitter_json_key(emitter, "j1"); + emitter_json_object_begin(emitter); emitter_kv(emitter, "i1", "I1", emitter_type_int, &val); emitter_json_kv(emitter, "i2", emitter_type_int, &val); emitter_table_kv(emitter, "I3", emitter_type_int, &val); emitter_table_dict_begin(emitter, "T1"); emitter_kv(emitter, "i4", "I4", emitter_type_int, &val); - emitter_json_dict_end(emitter); /* Close j1 */ + emitter_json_object_end(emitter); /* Close j1 */ emitter_kv(emitter, "i5", "I5", emitter_type_int, &val); emitter_table_dict_end(emitter); /* Close T1 */ emitter_kv(emitter, "i6", "I6", emitter_type_int, &val); @@ -302,24 +303,26 @@ emit_json_arr(emitter_t *emitter) { int ival = 123; emitter_begin(emitter); - emitter_json_dict_begin(emitter, "dict"); - emitter_json_arr_begin(emitter, "arr"); - emitter_json_arr_obj_begin(emitter); + emitter_json_key(emitter, "dict"); + emitter_json_object_begin(emitter); + emitter_json_key(emitter, "arr"); + emitter_json_array_begin(emitter); + emitter_json_object_begin(emitter); emitter_json_kv(emitter, "foo", emitter_type_int, &ival); - emitter_json_arr_obj_end(emitter); /* Close arr[0] */ + emitter_json_object_end(emitter); /* Close arr[0] */ /* arr[1] and arr[2] are primitives. */ - emitter_json_arr_value(emitter, emitter_type_int, &ival); - emitter_json_arr_value(emitter, emitter_type_int, &ival); - emitter_json_arr_obj_begin(emitter); + emitter_json_value(emitter, emitter_type_int, &ival); + emitter_json_value(emitter, emitter_type_int, &ival); + emitter_json_object_begin(emitter); emitter_json_kv(emitter, "bar", emitter_type_int, &ival); emitter_json_kv(emitter, "baz", emitter_type_int, &ival); - emitter_json_arr_obj_end(emitter); /* Close arr[3]. */ - emitter_json_arr_end(emitter); /* Close arr. */ - emitter_json_dict_end(emitter); /* Close dict. */ + emitter_json_object_end(emitter); /* Close arr[3]. */ + emitter_json_array_end(emitter); /* Close arr. */ + emitter_json_object_end(emitter); /* Close dict. 
*/ emitter_end(emitter); } -static const char *json_arr_json = +static const char *json_array_json = "{\n" "\t\"dict\": {\n" "\t\t\"arr\": [\n" @@ -336,10 +339,62 @@ static const char *json_arr_json = "\t}\n" "}\n"; -static const char *json_arr_table = ""; +static const char *json_array_table = ""; TEST_BEGIN(test_json_arr) { - assert_emit_output(&emit_json_arr, json_arr_json, json_arr_table); + assert_emit_output(&emit_json_arr, json_array_json, json_array_table); +} +TEST_END + +static void +emit_json_nested_array(emitter_t *emitter) { + int ival = 123; + char *sval = "foo"; + emitter_begin(emitter); + emitter_json_array_begin(emitter); + emitter_json_array_begin(emitter); + emitter_json_value(emitter, emitter_type_int, &ival); + emitter_json_value(emitter, emitter_type_string, &sval); + emitter_json_value(emitter, emitter_type_int, &ival); + emitter_json_value(emitter, emitter_type_string, &sval); + emitter_json_array_end(emitter); + emitter_json_array_begin(emitter); + emitter_json_value(emitter, emitter_type_int, &ival); + emitter_json_array_end(emitter); + emitter_json_array_begin(emitter); + emitter_json_value(emitter, emitter_type_string, &sval); + emitter_json_value(emitter, emitter_type_int, &ival); + emitter_json_array_end(emitter); + emitter_json_array_begin(emitter); + emitter_json_array_end(emitter); + emitter_json_array_end(emitter); + emitter_end(emitter); +} + +static const char *json_nested_array_json = +"{\n" +"\t[\n" +"\t\t[\n" +"\t\t\t123,\n" +"\t\t\t\"foo\",\n" +"\t\t\t123,\n" +"\t\t\t\"foo\"\n" +"\t\t],\n" +"\t\t[\n" +"\t\t\t123\n" +"\t\t],\n" +"\t\t[\n" +"\t\t\t\"foo\",\n" +"\t\t\t123\n" +"\t\t],\n" +"\t\t[\n" +"\t\t]\n" +"\t]\n" +"}\n"; + +TEST_BEGIN(test_json_nested_arr) { + assert_emit_output(&emit_json_nested_array, json_nested_array_json, + json_array_table); } TEST_END @@ -409,5 +464,6 @@ main(void) { test_types, test_modal, test_json_arr, + test_json_nested_arr, test_table_row); } -- cgit v0.12 From b664bd79356d7f6da6f413023f9aef014b85c145 Mon Sep 17 00:00:00 2001 From: Tyler Etzel Date: Thu, 5 Jul 2018 10:56:33 -0700 Subject: Add logging for sampled allocations - prof_opt_log flag starts logging automatically at runtime - prof_log_{start,stop} mallctl for manual control --- include/jemalloc/internal/arena_inlines_b.h | 26 ++ include/jemalloc/internal/extent_inlines.h | 10 + include/jemalloc/internal/extent_structs.h | 12 +- include/jemalloc/internal/large_externs.h | 3 + include/jemalloc/internal/prof_externs.h | 6 +- include/jemalloc/internal/prof_inlines_b.h | 21 +- include/jemalloc/internal/witness.h | 32 +- src/ctl.c | 44 ++- src/jemalloc.c | 1 + src/large.c | 10 + src/prof.c | 565 +++++++++++++++++++++++++++- 11 files changed, 704 insertions(+), 26 deletions(-) diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 2b3915a..8bf0a81 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -78,6 +78,32 @@ arena_prof_tctx_reset(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx) { large_prof_tctx_reset(tsdn, extent); } +JEMALLOC_ALWAYS_INLINE nstime_t +arena_prof_alloc_time_get(tsdn_t *tsdn, const void *ptr, + alloc_ctx_t *alloc_ctx) { + cassert(config_prof); + assert(ptr != NULL); + + extent_t *extent = iealloc(tsdn, ptr); + /* + * Unlike arena_prof_prof_tctx_{get, set}, we only call this once we're + * sure we have a sampled allocation. 
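+	 * (In this series the only reader is prof_alloc_time_get(), reached
+	 * from prof_try_log() when a sampled allocation is freed.)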
+ */ + assert(!extent_slab_get(extent)); + return large_prof_alloc_time_get(extent); +} + +JEMALLOC_ALWAYS_INLINE void +arena_prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx, + nstime_t t) { + cassert(config_prof); + assert(ptr != NULL); + + extent_t *extent = iealloc(tsdn, ptr); + assert(!extent_slab_get(extent)); + large_prof_alloc_time_set(extent, t); +} + JEMALLOC_ALWAYS_INLINE void arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) { tsd_t *tsd; diff --git a/include/jemalloc/internal/extent_inlines.h b/include/jemalloc/internal/extent_inlines.h index a43d00d..145fa2d 100644 --- a/include/jemalloc/internal/extent_inlines.h +++ b/include/jemalloc/internal/extent_inlines.h @@ -177,6 +177,11 @@ extent_prof_tctx_get(const extent_t *extent) { ATOMIC_ACQUIRE); } +static inline nstime_t +extent_prof_alloc_time_get(const extent_t *extent) { + return extent->e_alloc_time; +} + static inline void extent_arena_set(extent_t *extent, arena_t *arena) { unsigned arena_ind = (arena != NULL) ? arena_ind_get(arena) : ((1U << @@ -301,6 +306,11 @@ extent_prof_tctx_set(extent_t *extent, prof_tctx_t *tctx) { } static inline void +extent_prof_alloc_time_set(extent_t *extent, nstime_t t) { + nstime_copy(&extent->e_alloc_time, &t); +} + +static inline void extent_init(extent_t *extent, arena_t *arena, void *addr, size_t size, bool slab, szind_t szind, size_t sn, extent_state_t state, bool zeroed, bool committed, bool dumpable) { diff --git a/include/jemalloc/internal/extent_structs.h b/include/jemalloc/internal/extent_structs.h index 1983097..d709577 100644 --- a/include/jemalloc/internal/extent_structs.h +++ b/include/jemalloc/internal/extent_structs.h @@ -161,11 +161,13 @@ struct extent_s { /* Small region slab metadata. */ arena_slab_data_t e_slab_data; - /* - * Profile counters, used for large objects. Points to a - * prof_tctx_t. - */ - atomic_p_t e_prof_tctx; + /* Profiling data, used for large objects. */ + struct { + /* Time when this was allocated. */ + nstime_t e_alloc_time; + /* Points to a prof_tctx_t. */ + atomic_p_t e_prof_tctx; + }; }; }; typedef ql_head(extent_t) extent_list_t; diff --git a/include/jemalloc/internal/large_externs.h b/include/jemalloc/internal/large_externs.h index 88682ea..a05019e 100644 --- a/include/jemalloc/internal/large_externs.h +++ b/include/jemalloc/internal/large_externs.h @@ -26,4 +26,7 @@ prof_tctx_t *large_prof_tctx_get(tsdn_t *tsdn, const extent_t *extent); void large_prof_tctx_set(tsdn_t *tsdn, extent_t *extent, prof_tctx_t *tctx); void large_prof_tctx_reset(tsdn_t *tsdn, extent_t *extent); +nstime_t large_prof_alloc_time_get(const extent_t *extent); +void large_prof_alloc_time_set(extent_t *extent, nstime_t time); + #endif /* JEMALLOC_INTERNAL_LARGE_EXTERNS_H */ diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h index 0434869..74315ce 100644 --- a/include/jemalloc/internal/prof_externs.h +++ b/include/jemalloc/internal/prof_externs.h @@ -14,6 +14,7 @@ extern bool opt_prof_gdump; /* High-water memory dumping. */ extern bool opt_prof_final; /* Final profile dumping. */ extern bool opt_prof_leak; /* Dump leak summary at exit. */ extern bool opt_prof_accum; /* Report cumulative bytes. */ +extern bool opt_prof_log; /* Turn logging on at boot. */ extern char opt_prof_prefix[ /* Minimize memory bloat for non-prof builds. 
*/ #ifdef JEMALLOC_PROF @@ -45,7 +46,8 @@ extern size_t lg_prof_sample; void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx, bool updated); void prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize, prof_tctx_t *tctx); -void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx); +void prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize, + prof_tctx_t *tctx); void bt_init(prof_bt_t *bt, void **vec); void prof_backtrace(prof_bt_t *bt); prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt); @@ -72,6 +74,8 @@ void prof_reset(tsd_t *tsd, size_t lg_sample); void prof_tdata_cleanup(tsd_t *tsd); bool prof_active_get(tsdn_t *tsdn); bool prof_active_set(tsdn_t *tsdn, bool active); +bool prof_log_start(tsdn_t *tsdn, const char *filename); +bool prof_log_stop(tsdn_t *tsdn); const char *prof_thread_name_get(tsd_t *tsd); int prof_thread_name_set(tsd_t *tsd, const char *thread_name); bool prof_thread_active_get(tsd_t *tsd); diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index 6ff465a..5e0b064 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -61,6 +61,23 @@ prof_tctx_reset(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx) { arena_prof_tctx_reset(tsdn, ptr, tctx); } +JEMALLOC_ALWAYS_INLINE nstime_t +prof_alloc_time_get(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx) { + cassert(config_prof); + assert(ptr != NULL); + + return arena_prof_alloc_time_get(tsdn, ptr, alloc_ctx); +} + +JEMALLOC_ALWAYS_INLINE void +prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx, + nstime_t t) { + cassert(config_prof); + assert(ptr != NULL); + + arena_prof_alloc_time_set(tsdn, ptr, alloc_ctx, t); +} + JEMALLOC_ALWAYS_INLINE bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, prof_tdata_t **tdata_out) { @@ -187,7 +204,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx, * counters. */ if (unlikely(old_sampled)) { - prof_free_sampled_object(tsd, old_usize, old_tctx); + prof_free_sampled_object(tsd, ptr, old_usize, old_tctx); } } @@ -199,7 +216,7 @@ prof_free(tsd_t *tsd, const void *ptr, size_t usize, alloc_ctx_t *alloc_ctx) { assert(usize == isalloc(tsd_tsdn(tsd), ptr)); if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) { - prof_free_sampled_object(tsd, usize, tctx); + prof_free_sampled_object(tsd, ptr, usize, tctx); } } diff --git a/include/jemalloc/internal/witness.h b/include/jemalloc/internal/witness.h index 80ea70c..fff9e98 100644 --- a/include/jemalloc/internal/witness.h +++ b/include/jemalloc/internal/witness.h @@ -27,9 +27,9 @@ #define WITNESS_RANK_PROF_BT2GCTX 6U #define WITNESS_RANK_PROF_TDATAS 7U #define WITNESS_RANK_PROF_TDATA 8U -#define WITNESS_RANK_PROF_GCTX 9U - -#define WITNESS_RANK_BACKGROUND_THREAD 10U +#define WITNESS_RANK_PROF_LOG 9U +#define WITNESS_RANK_PROF_GCTX 10U +#define WITNESS_RANK_BACKGROUND_THREAD 11U /* * Used as an argument to witness_assert_depth_to_rank() in order to validate @@ -37,19 +37,19 @@ * witness_assert_depth_to_rank() is inclusive rather than exclusive, this * definition can have the same value as the minimally ranked core lock. 
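 * (The renumbering below follows from inserting WITNESS_RANK_PROF_LOG at
 * rank 9 above: every rank from the old WITNESS_RANK_PROF_GCTX onward,
 * including WITNESS_RANK_CORE, shifts up by one.)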
*/ -#define WITNESS_RANK_CORE 11U - -#define WITNESS_RANK_DECAY 11U -#define WITNESS_RANK_TCACHE_QL 12U -#define WITNESS_RANK_EXTENT_GROW 13U -#define WITNESS_RANK_EXTENTS 14U -#define WITNESS_RANK_EXTENT_AVAIL 15U - -#define WITNESS_RANK_EXTENT_POOL 16U -#define WITNESS_RANK_RTREE 17U -#define WITNESS_RANK_BASE 18U -#define WITNESS_RANK_ARENA_LARGE 19U -#define WITNESS_RANK_HOOK 20U +#define WITNESS_RANK_CORE 12U + +#define WITNESS_RANK_DECAY 12U +#define WITNESS_RANK_TCACHE_QL 13U +#define WITNESS_RANK_EXTENT_GROW 14U +#define WITNESS_RANK_EXTENTS 15U +#define WITNESS_RANK_EXTENT_AVAIL 16U + +#define WITNESS_RANK_EXTENT_POOL 17U +#define WITNESS_RANK_RTREE 18U +#define WITNESS_RANK_BASE 19U +#define WITNESS_RANK_ARENA_LARGE 20U +#define WITNESS_RANK_HOOK 21U #define WITNESS_RANK_LEAF 0xffffffffU #define WITNESS_RANK_BIN WITNESS_RANK_LEAF diff --git a/src/ctl.c b/src/ctl.c index 38529d0..448ec7b 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -148,6 +148,8 @@ CTL_PROTO(prof_gdump) CTL_PROTO(prof_reset) CTL_PROTO(prof_interval) CTL_PROTO(lg_prof_sample) +CTL_PROTO(prof_log_start) +CTL_PROTO(prof_log_stop) CTL_PROTO(stats_arenas_i_small_allocated) CTL_PROTO(stats_arenas_i_small_nmalloc) CTL_PROTO(stats_arenas_i_small_ndalloc) @@ -389,7 +391,9 @@ static const ctl_named_node_t prof_node[] = { {NAME("gdump"), CTL(prof_gdump)}, {NAME("reset"), CTL(prof_reset)}, {NAME("interval"), CTL(prof_interval)}, - {NAME("lg_sample"), CTL(lg_prof_sample)} + {NAME("lg_sample"), CTL(lg_prof_sample)}, + {NAME("log_start"), CTL(prof_log_start)}, + {NAME("log_stop"), CTL(prof_log_stop)} }; static const ctl_named_node_t stats_arenas_i_small_node[] = { @@ -2644,6 +2648,44 @@ label_return: CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t) CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t) +static int +prof_log_start_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + const char *filename = NULL; + + if (!config_prof) { + return ENOENT; + } + + WRITEONLY(); + WRITE(filename, const char *); + + if (prof_log_start(tsd_tsdn(tsd), filename)) { + ret = EFAULT; + goto label_return; + } + + ret = 0; +label_return: + return ret; +} + +static int +prof_log_stop_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + if (!config_prof) { + return ENOENT; + } + + if (prof_log_stop(tsd_tsdn(tsd))) { + return EFAULT; + } + + return 0; +} + /******************************************************************************/ CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats->allocated, size_t) diff --git a/src/jemalloc.c b/src/jemalloc.c index 85ec9e0..e8f110f 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1282,6 +1282,7 @@ malloc_conf_init(sc_data_t *sc_data) { CONF_HANDLE_BOOL(opt_prof_gdump, "prof_gdump") CONF_HANDLE_BOOL(opt_prof_final, "prof_final") CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak") + CONF_HANDLE_BOOL(opt_prof_log, "prof_log") } if (config_log) { if (CONF_MATCH("log")) { diff --git a/src/large.c b/src/large.c index 8407361..8e7a781 100644 --- a/src/large.c +++ b/src/large.c @@ -383,3 +383,13 @@ void large_prof_tctx_reset(tsdn_t *tsdn, extent_t *extent) { large_prof_tctx_set(tsdn, extent, (prof_tctx_t *)(uintptr_t)1U); } + +nstime_t +large_prof_alloc_time_get(const extent_t *extent) { + return extent_prof_alloc_time_get(extent); +} + +void +large_prof_alloc_time_set(extent_t *extent, nstime_t t) { + extent_prof_alloc_time_set(extent, t); +} diff --git 
a/src/prof.c b/src/prof.c index 405de4b..21421c0 100644 --- a/src/prof.c +++ b/src/prof.c @@ -7,6 +7,7 @@ #include "jemalloc/internal/hash.h" #include "jemalloc/internal/malloc_io.h" #include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/emitter.h" /******************************************************************************/ @@ -38,6 +39,7 @@ bool opt_prof_gdump = false; bool opt_prof_final = false; bool opt_prof_leak = false; bool opt_prof_accum = false; +bool opt_prof_log = false; char opt_prof_prefix[ /* Minimize memory bloat for non-prof builds. */ #ifdef JEMALLOC_PROF @@ -70,6 +72,96 @@ uint64_t prof_interval = 0; size_t lg_prof_sample; +typedef enum prof_logging_state_e prof_logging_state_t; +enum prof_logging_state_e { + prof_logging_state_stopped, + prof_logging_state_started, + prof_logging_state_dumping +}; + +/* + * - stopped: log_start never called, or previous log_stop has completed. + * - started: log_start called, log_stop not called yet. Allocations are logged. + * - dumping: log_stop called but not finished; samples are not logged anymore. + */ +prof_logging_state_t prof_logging_state = prof_logging_state_stopped; + +/* Incremented for every log file that is output. */ +static uint64_t log_seq = 0; +static char log_filename[ + /* Minimize memory bloat for non-prof builds. */ +#ifdef JEMALLOC_PROF + PATH_MAX + +#endif + 1]; + +/* Timestamp for most recent call to log_start(). */ +static nstime_t log_start_timestamp = NSTIME_ZERO_INITIALIZER; + +/* Increment these when adding to the log_bt and log_thr linked lists. */ +static size_t log_bt_index = 0; +static size_t log_thr_index = 0; + +/* Linked list node definitions. These are only used in prof.c. */ +typedef struct prof_bt_node_s prof_bt_node_t; + +struct prof_bt_node_s { + prof_bt_node_t *next; + size_t index; + prof_bt_t bt; + /* Variable size backtrace vector pointed to by bt. */ + void *vec[1]; +}; + +typedef struct prof_thr_node_s prof_thr_node_t; + +struct prof_thr_node_s { + prof_thr_node_t *next; + size_t index; + uint64_t thr_uid; + /* Variable size based on thr_name_sz. */ + char name[1]; +}; + +typedef struct prof_alloc_node_s prof_alloc_node_t; + +/* This is output when logging sampled allocations. */ +struct prof_alloc_node_s { + prof_alloc_node_t *next; + /* Indices into an array of thread data. */ + size_t alloc_thr_ind; + size_t free_thr_ind; + + /* Indices into an array of backtraces. */ + size_t alloc_bt_ind; + size_t free_bt_ind; + + uint64_t alloc_time_ns; + uint64_t free_time_ns; + + size_t usize; +}; + +/* + * Created on the first call to prof_log_start and deleted on prof_log_stop. + * These are the backtraces and threads that have already been logged by an + * allocation. + */ +static bool log_tables_initialized = false; +static ckh_t log_bt_node_set; +static ckh_t log_thr_node_set; + +/* Store linked lists for logged data. */ +static prof_bt_node_t *log_bt_first = NULL; +static prof_bt_node_t *log_bt_last = NULL; +static prof_thr_node_t *log_thr_first = NULL; +static prof_thr_node_t *log_thr_last = NULL; +static prof_alloc_node_t *log_alloc_first = NULL; +static prof_alloc_node_t *log_alloc_last = NULL; + +/* Protects the prof_logging_state and any log_{...} variable. */ +static malloc_mutex_t log_mtx; + /* * Table of mutexes that are shared among gctx's. These are leaf locks, so * there is no problem with using them for more than one gctx at the same time. 
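
A usage sketch, not part of the diff: assuming jemalloc is built with
profiling and run with opt.prof enabled, the logging state machine above is
driven either by opt_prof_log at boot or manually through the two new
mallctls. The function and file names below are hypothetical.

    #include <jemalloc/jemalloc.h>

    static void
    capture_workload(void (*workload)(void)) {
            /* newp points at a const char * holding the file name. */
            const char *filename = "prof_log.json";
            mallctl("prof.log_start", NULL, NULL, (void *)&filename,
                sizeof(const char *));
            workload();
            /* Transitions started -> dumping -> stopped, writing the json. */
            mallctl("prof.log_stop", NULL, NULL, NULL, 0);
    }

Passing NULL for newp instead makes prof_log_start() fall back to the
default "<opt_prof_prefix>.<pid>.<seq>.json" name.
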
@@ -145,6 +237,12 @@ static void prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached); static char *prof_thread_name_alloc(tsdn_t *tsdn, const char *thread_name); +/* Hashtable functions for log_bt_node_set and log_thr_node_set. */ +static void prof_thr_node_hash(const void *key, size_t r_hash[2]); +static bool prof_thr_node_keycomp(const void *k1, const void *k2); +static void prof_bt_node_hash(const void *key, size_t r_hash[2]); +static bool prof_bt_node_keycomp(const void *k1, const void *k2); + /******************************************************************************/ /* Red-black trees. */ @@ -242,6 +340,12 @@ prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize, prof_tctx_t *tctx) { prof_tctx_set(tsdn, ptr, usize, NULL, tctx); + /* Get the current time and set this in the extent_t. We'll read this + * when free() is called. */ + nstime_t t = NSTIME_ZERO_INITIALIZER; + nstime_update(&t); + prof_alloc_time_set(tsdn, ptr, NULL, t); + malloc_mutex_lock(tsdn, tctx->tdata->lock); tctx->cnts.curobjs++; tctx->cnts.curbytes += usize; @@ -253,14 +357,171 @@ prof_malloc_sample_object(tsdn_t *tsdn, const void *ptr, size_t usize, malloc_mutex_unlock(tsdn, tctx->tdata->lock); } +static size_t +prof_log_bt_index(tsd_t *tsd, prof_bt_t *bt) { + assert(prof_logging_state == prof_logging_state_started); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx); + + prof_bt_node_t dummy_node; + dummy_node.bt = *bt; + prof_bt_node_t *node; + + /* See if this backtrace is already cached in the table. */ + if (ckh_search(&log_bt_node_set, (void *)(&dummy_node), + (void **)(&node), NULL)) { + size_t sz = offsetof(prof_bt_node_t, vec) + + (bt->len * sizeof(void *)); + prof_bt_node_t *new_node = (prof_bt_node_t *) + ialloc(tsd, sz, sz_size2index(sz), false, true); + if (log_bt_first == NULL) { + log_bt_first = new_node; + log_bt_last = new_node; + } else { + log_bt_last->next = new_node; + log_bt_last = new_node; + } + + new_node->next = NULL; + new_node->index = log_bt_index; + /* + * Copy the backtrace: bt is inside a tdata or gctx, which + * might die before prof_log_stop is called. + */ + new_node->bt.len = bt->len; + memcpy(new_node->vec, bt->vec, bt->len * sizeof(void *)); + new_node->bt.vec = new_node->vec; + + log_bt_index++; + ckh_insert(tsd, &log_bt_node_set, (void *)new_node, NULL); + return new_node->index; + } else { + return node->index; + } +} +static size_t +prof_log_thr_index(tsd_t *tsd, uint64_t thr_uid, const char *name) { + assert(prof_logging_state == prof_logging_state_started); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx); + + prof_thr_node_t dummy_node; + dummy_node.thr_uid = thr_uid; + prof_thr_node_t *node; + + /* See if this thread is already cached in the table. 
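+	 * Note the inverted sense: ckh_search() returns true when the key is
+	 * absent, so the first branch below inserts a new node on a miss,
+	 * while the else branch returns the cached index on a hit.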
*/ + if (ckh_search(&log_thr_node_set, (void *)(&dummy_node), + (void **)(&node), NULL)) { + size_t sz = offsetof(prof_thr_node_t, name) + strlen(name) + 1; + prof_thr_node_t *new_node = (prof_thr_node_t *) + ialloc(tsd, sz, sz_size2index(sz), false, true); + if (log_thr_first == NULL) { + log_thr_first = new_node; + log_thr_last = new_node; + } else { + log_thr_last->next = new_node; + log_thr_last = new_node; + } + + new_node->next = NULL; + new_node->index = log_thr_index; + new_node->thr_uid = thr_uid; + strcpy(new_node->name, name); + + log_thr_index++; + ckh_insert(tsd, &log_thr_node_set, (void *)new_node, NULL); + return new_node->index; + } else { + return node->index; + } +} + +static void +prof_try_log(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + + prof_tdata_t *cons_tdata = prof_tdata_get(tsd, false); + if (cons_tdata == NULL) { + /* + * We decide not to log these allocations. cons_tdata will be + * NULL only when the current thread is in a weird state (e.g. + * it's being destroyed). + */ + return; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &log_mtx); + + if (prof_logging_state != prof_logging_state_started) { + goto label_done; + } + + if (!log_tables_initialized) { + bool err1 = ckh_new(tsd, &log_bt_node_set, PROF_CKH_MINITEMS, + prof_bt_node_hash, prof_bt_node_keycomp); + bool err2 = ckh_new(tsd, &log_thr_node_set, PROF_CKH_MINITEMS, + prof_thr_node_hash, prof_thr_node_keycomp); + if (err1 || err2) { + goto label_done; + } + log_tables_initialized = true; + } + + nstime_t alloc_time = prof_alloc_time_get(tsd_tsdn(tsd), ptr, + (alloc_ctx_t *)NULL); + nstime_t free_time = NSTIME_ZERO_INITIALIZER; + nstime_update(&free_time); + + prof_alloc_node_t *new_node = (prof_alloc_node_t *) + ialloc(tsd, sizeof(prof_alloc_node_t), + sz_size2index(sizeof(prof_alloc_node_t)), false, true); + + const char *prod_thr_name = (tctx->tdata->thread_name == NULL)? + "" : tctx->tdata->thread_name; + const char *cons_thr_name = prof_thread_name_get(tsd); + + prof_bt_t bt; + /* Initialize the backtrace, using the buffer in tdata to store it. */ + bt_init(&bt, cons_tdata->vec); + prof_backtrace(&bt); + prof_bt_t *cons_bt = &bt; + + /* We haven't destroyed tctx yet, so gctx should be good to read. 
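+	 * ("prod"/"cons" naming: the producer is the thread that made the
+	 * allocation, recovered from tctx; the consumer is the current
+	 * thread, which is freeing it.)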
*/ + prof_bt_t *prod_bt = &tctx->gctx->bt; + + new_node->next = NULL; + new_node->alloc_thr_ind = prof_log_thr_index(tsd, tctx->tdata->thr_uid, + prod_thr_name); + new_node->free_thr_ind = prof_log_thr_index(tsd, cons_tdata->thr_uid, + cons_thr_name); + new_node->alloc_bt_ind = prof_log_bt_index(tsd, prod_bt); + new_node->free_bt_ind = prof_log_bt_index(tsd, cons_bt); + new_node->alloc_time_ns = nstime_ns(&alloc_time); + new_node->free_time_ns = nstime_ns(&free_time); + new_node->usize = usize; + + if (log_alloc_first == NULL) { + log_alloc_first = new_node; + log_alloc_last = new_node; + } else { + log_alloc_last->next = new_node; + log_alloc_last = new_node; + } + +label_done: + malloc_mutex_unlock(tsd_tsdn(tsd), &log_mtx); +} + void -prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_tctx_t *tctx) { +prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize, + prof_tctx_t *tctx) { malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + assert(tctx->cnts.curobjs > 0); assert(tctx->cnts.curbytes >= usize); tctx->cnts.curobjs--; tctx->cnts.curbytes -= usize; + prof_try_log(tsd, ptr, usize, tctx); + if (prof_tctx_should_destroy(tsd_tsdn(tsd), tctx)) { prof_tctx_destroy(tsd, tctx); } else { @@ -1887,6 +2148,33 @@ prof_bt_keycomp(const void *k1, const void *k2) { return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); } +static void +prof_bt_node_hash(const void *key, size_t r_hash[2]) { + const prof_bt_node_t *bt_node = (prof_bt_node_t *)key; + prof_bt_hash((void *)(&bt_node->bt), r_hash); +} + +static bool +prof_bt_node_keycomp(const void *k1, const void *k2) { + const prof_bt_node_t *bt_node1 = (prof_bt_node_t *)k1; + const prof_bt_node_t *bt_node2 = (prof_bt_node_t *)k2; + return prof_bt_keycomp((void *)(&bt_node1->bt), + (void *)(&bt_node2->bt)); +} + +static void +prof_thr_node_hash(const void *key, size_t r_hash[2]) { + const prof_thr_node_t *thr_node = (prof_thr_node_t *)key; + hash(&thr_node->thr_uid, sizeof(uint64_t), 0x94122f35U, r_hash); +} + +static bool +prof_thr_node_keycomp(const void *k1, const void *k2) { + const prof_thr_node_t *thr_node1 = (prof_thr_node_t *)k1; + const prof_thr_node_t *thr_node2 = (prof_thr_node_t *)k2; + return thr_node1->thr_uid == thr_node2->thr_uid; +} + static uint64_t prof_thr_uid_alloc(tsdn_t *tsdn) { uint64_t thr_uid; @@ -2119,6 +2407,252 @@ prof_active_set(tsdn_t *tsdn, bool active) { return prof_active_old; } +bool +prof_log_start(tsdn_t *tsdn, const char *filename) { + if (!opt_prof || !prof_booted) { + return true; + } + + bool ret = false; + size_t buf_size = PATH_MAX + 1; + + malloc_mutex_lock(tsdn, &log_mtx); + + if (prof_logging_state != prof_logging_state_stopped) { + ret = true; + } else if (filename == NULL) { + /* Make default name. */ + malloc_snprintf(log_filename, buf_size, "%s.%d.%"FMTu64".json", + opt_prof_prefix, prof_getpid(), log_seq); + log_seq++; + prof_logging_state = prof_logging_state_started; + } else if (strlen(filename) >= buf_size) { + ret = true; + } else { + strcpy(log_filename, filename); + prof_logging_state = prof_logging_state_started; + } + + if (!ret) { + nstime_update(&log_start_timestamp); + } + + malloc_mutex_unlock(tsdn, &log_mtx); + + return ret; +} + +/* Used as an atexit function to stop logging on exit. 
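+ * Registered via atexit() in prof_boot2(). Harmless if logging is not
+ * running at exit, since prof_log_stop() returns early unless the state
+ * is prof_logging_state_started.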
*/ +static void +prof_log_stop_final(void) { + tsd_t *tsd = tsd_fetch(); + prof_log_stop(tsd_tsdn(tsd)); +} + +struct prof_emitter_cb_arg_s { + int fd; + ssize_t ret; +}; + +static void +prof_emitter_write_cb(void *opaque, const char *to_write) { + struct prof_emitter_cb_arg_s *arg = + (struct prof_emitter_cb_arg_s *)opaque; + size_t bytes = strlen(to_write); + arg->ret = write(arg->fd, (void *)to_write, bytes); +} + +/* + * prof_log_emit_{...} goes through the appropriate linked list, emitting each + * node to the json and deallocating it. + */ +static void +prof_log_emit_threads(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "threads"); + prof_thr_node_t *thr_node = log_thr_first; + prof_thr_node_t *thr_old_node; + while (thr_node != NULL) { + emitter_json_object_begin(emitter); + + emitter_json_kv(emitter, "thr_uid", emitter_type_uint64, + &thr_node->thr_uid); + + char *thr_name = thr_node->name; + + emitter_json_kv(emitter, "thr_name", emitter_type_string, + &thr_name); + + emitter_json_object_end(emitter); + thr_old_node = thr_node; + thr_node = thr_node->next; + idalloc(tsd, thr_old_node); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_traces(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "stack_traces"); + prof_bt_node_t *bt_node = log_bt_first; + prof_bt_node_t *bt_old_node; + /* + * Calculate how many hex digits we need: twice number of bytes, two for + * "0x", and then one more for terminating '\0'. + */ + char buf[2 * sizeof(intptr_t) + 3]; + size_t buf_sz = sizeof(buf); + while (bt_node != NULL) { + emitter_json_array_begin(emitter); + size_t i; + for (i = 0; i < bt_node->bt.len; i++) { + malloc_snprintf(buf, buf_sz, "%p", bt_node->bt.vec[i]); + char *trace_str = buf; + emitter_json_value(emitter, emitter_type_string, + &trace_str); + } + emitter_json_array_end(emitter); + + bt_old_node = bt_node; + bt_node = bt_node->next; + idalloc(tsd, bt_old_node); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_allocs(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "allocations"); + prof_alloc_node_t *alloc_node = log_alloc_first; + prof_alloc_node_t *alloc_old_node; + while (alloc_node != NULL) { + emitter_json_object_begin(emitter); + + emitter_json_kv(emitter, "alloc_thread", emitter_type_size, + &alloc_node->alloc_thr_ind); + + emitter_json_kv(emitter, "free_thread", emitter_type_size, + &alloc_node->free_thr_ind); + + emitter_json_kv(emitter, "alloc_trace", emitter_type_size, + &alloc_node->alloc_bt_ind); + + emitter_json_kv(emitter, "free_trace", emitter_type_size, + &alloc_node->free_bt_ind); + + emitter_json_kv(emitter, "alloc_timestamp", + emitter_type_uint64, &alloc_node->alloc_time_ns); + + emitter_json_kv(emitter, "free_timestamp", emitter_type_uint64, + &alloc_node->free_time_ns); + + emitter_json_kv(emitter, "usize", emitter_type_uint64, + &alloc_node->usize); + + emitter_json_object_end(emitter); + + alloc_old_node = alloc_node; + alloc_node = alloc_node->next; + idalloc(tsd, alloc_old_node); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_metadata(emitter_t *emitter) { + emitter_json_object_kv_begin(emitter, "info"); + + nstime_t now = NSTIME_ZERO_INITIALIZER; + + nstime_update(&now); + uint64_t ns = nstime_ns(&now) - nstime_ns(&log_start_timestamp); + emitter_json_kv(emitter, "duration", emitter_type_uint64, &ns); + + char *vers = JEMALLOC_VERSION; + emitter_json_kv(emitter, "version", + emitter_type_string, &vers); + + 
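+	/*
+	 * lg_sample_rate is lg_prof_sample, i.e. the base 2 logarithm of
+	 * the average number of bytes of allocation activity between
+	 * samples.
+	 */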
emitter_json_kv(emitter, "lg_sample_rate", + emitter_type_int, &lg_prof_sample); + + int pid = prof_getpid(); + emitter_json_kv(emitter, "pid", emitter_type_int, &pid); + + emitter_json_object_end(emitter); +} + + +bool +prof_log_stop(tsdn_t *tsdn) { + if (!opt_prof || !prof_booted) { + return true; + } + + tsd_t *tsd = tsdn_tsd(tsdn); + malloc_mutex_lock(tsdn, &log_mtx); + + if (prof_logging_state != prof_logging_state_started) { + malloc_mutex_unlock(tsdn, &log_mtx); + return true; + } + + /* + * Set the state to dumping. We'll set it to stopped when we're done. + * Since other threads won't be able to start/stop/log when the state is + * dumping, we don't have to hold the lock during the whole method. + */ + prof_logging_state = prof_logging_state_dumping; + malloc_mutex_unlock(tsdn, &log_mtx); + + + emitter_t emitter; + + /* Create a file. */ + int fd = creat(log_filename, 0644); + + if (fd == -1) { + malloc_printf(": creat() for log file \"%s\" " + " failed with %d\n", log_filename, errno); + if (opt_abort) { + abort(); + } + return true; + } + + /* Emit to json. */ + struct prof_emitter_cb_arg_s arg; + arg.fd = fd; + emitter_init(&emitter, emitter_output_json, &prof_emitter_write_cb, + (void *)(&arg)); + + emitter_json_object_begin(&emitter); + prof_log_emit_metadata(&emitter); + prof_log_emit_threads(tsd, &emitter); + prof_log_emit_traces(tsd, &emitter); + prof_log_emit_allocs(tsd, &emitter); + emitter_json_object_end(&emitter); + + /* Reset global state. */ + if (log_tables_initialized) { + ckh_delete(tsd, &log_bt_node_set); + ckh_delete(tsd, &log_thr_node_set); + } + log_tables_initialized = false; + log_bt_index = 0; + log_thr_index = 0; + log_bt_first = NULL; + log_bt_last = NULL; + log_thr_first = NULL; + log_thr_last = NULL; + log_alloc_first = NULL; + log_alloc_last = NULL; + + malloc_mutex_lock(tsdn, &log_mtx); + prof_logging_state = prof_logging_state_stopped; + malloc_mutex_unlock(tsdn, &log_mtx); + + return close(fd); +} + const char * prof_thread_name_get(tsd_t *tsd) { prof_tdata_t *tdata; @@ -2355,6 +2889,35 @@ prof_boot2(tsd_t *tsd) { } } + if (opt_prof_log) { + prof_log_start(tsd_tsdn(tsd), NULL); + } + + if (atexit(prof_log_stop_final) != 0) { + malloc_write(": Error in atexit() " + "for logging\n"); + if (opt_abort) { + abort(); + } + } + + if (malloc_mutex_init(&log_mtx, "prof_log", + WITNESS_RANK_PROF_LOG, malloc_mutex_rank_exclusive)) { + return true; + } + + if (ckh_new(tsd, &log_bt_node_set, PROF_CKH_MINITEMS, + prof_bt_node_hash, prof_bt_node_keycomp)) { + return true; + } + + if (ckh_new(tsd, &log_thr_node_set, PROF_CKH_MINITEMS, + prof_thr_node_hash, prof_thr_node_keycomp)) { + return true; + } + + log_tables_initialized = true; + gctx_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd), b0get(), PROF_NCTX_LOCKS * sizeof(malloc_mutex_t), CACHELINE); -- cgit v0.12 From 5e23f96dd4e4ff2847a85d44a01b66e4ed2da21f Mon Sep 17 00:00:00 2001 From: Tyler Etzel Date: Tue, 3 Jul 2018 11:10:09 -0700 Subject: Add unit tests for logging --- Makefile.in | 1 + include/jemalloc/internal/prof_externs.h | 13 ++- src/prof.c | 122 +++++++++++++++++++++++++- test/unit/prof_log.c | 146 +++++++++++++++++++++++++++++++ test/unit/prof_log.sh | 5 ++ 5 files changed, 284 insertions(+), 3 deletions(-) create mode 100644 test/unit/prof_log.c create mode 100644 test/unit/prof_log.sh diff --git a/Makefile.in b/Makefile.in index 8b2f5ca..49585ed 100644 --- a/Makefile.in +++ b/Makefile.in @@ -194,6 +194,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/prof_active.c \ $(srcroot)test/unit/prof_gdump.c \ 
$(srcroot)test/unit/prof_idump.c \ + $(srcroot)test/unit/prof_log.c \ $(srcroot)test/unit/prof_reset.c \ $(srcroot)test/unit/prof_tctx.c \ $(srcroot)test/unit/prof_thread_name.c \ diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h index 74315ce..094f3e1 100644 --- a/include/jemalloc/internal/prof_externs.h +++ b/include/jemalloc/internal/prof_externs.h @@ -74,8 +74,6 @@ void prof_reset(tsd_t *tsd, size_t lg_sample); void prof_tdata_cleanup(tsd_t *tsd); bool prof_active_get(tsdn_t *tsdn); bool prof_active_set(tsdn_t *tsdn, bool active); -bool prof_log_start(tsdn_t *tsdn, const char *filename); -bool prof_log_stop(tsdn_t *tsdn); const char *prof_thread_name_get(tsd_t *tsd); int prof_thread_name_set(tsd_t *tsd, const char *thread_name); bool prof_thread_active_get(tsd_t *tsd); @@ -93,4 +91,15 @@ void prof_postfork_parent(tsdn_t *tsdn); void prof_postfork_child(tsdn_t *tsdn); void prof_sample_threshold_update(prof_tdata_t *tdata); +bool prof_log_start(tsdn_t *tsdn, const char *filename); +bool prof_log_stop(tsdn_t *tsdn); +#ifdef JEMALLOC_JET +size_t prof_log_bt_count(void); +size_t prof_log_alloc_count(void); +size_t prof_log_thr_count(void); +bool prof_log_is_logging(void); +bool prof_log_rep_check(void); +void prof_log_dummy_set(bool new_value); +#endif + #endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */ diff --git a/src/prof.c b/src/prof.c index 21421c0..458c6cd 100644 --- a/src/prof.c +++ b/src/prof.c @@ -86,6 +86,10 @@ enum prof_logging_state_e { */ prof_logging_state_t prof_logging_state = prof_logging_state_stopped; +#ifdef JEMALLOC_JET +static bool prof_log_dummy = false; +#endif + /* Incremented for every log file that is output. */ static uint64_t log_seq = 0; static char log_filename[ @@ -2407,6 +2411,102 @@ prof_active_set(tsdn_t *tsdn, bool active) { return prof_active_old; } +#ifdef JEMALLOC_JET +size_t +prof_log_bt_count(void) { + size_t cnt = 0; + prof_bt_node_t *node = log_bt_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +size_t +prof_log_alloc_count(void) { + size_t cnt = 0; + prof_alloc_node_t *node = log_alloc_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +size_t +prof_log_thr_count(void) { + size_t cnt = 0; + prof_thr_node_t *node = log_thr_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +bool +prof_log_is_logging(void) { + return prof_logging_state == prof_logging_state_started; +} + +bool +prof_log_rep_check(void) { + if (prof_logging_state == prof_logging_state_stopped + && log_tables_initialized) { + return true; + } + + if (log_bt_last != NULL && log_bt_last->next != NULL) { + return true; + } + if (log_thr_last != NULL && log_thr_last->next != NULL) { + return true; + } + if (log_alloc_last != NULL && log_alloc_last->next != NULL) { + return true; + } + + size_t bt_count = prof_log_bt_count(); + size_t thr_count = prof_log_thr_count(); + size_t alloc_count = prof_log_alloc_count(); + + + if (prof_logging_state == prof_logging_state_stopped) { + if (bt_count != 0 || thr_count != 0 || alloc_count || 0) { + return true; + } + } + + prof_alloc_node_t *node = log_alloc_first; + while (node != NULL) { + if (node->alloc_bt_ind >= bt_count) { + return true; + } + if (node->free_bt_ind >= bt_count) { + return true; + } + if (node->alloc_thr_ind >= thr_count) { + return true; + } + if (node->free_thr_ind >= thr_count) { + return true; + } + if (node->alloc_time_ns > node->free_time_ns) { + return true; + } + node = 
node->next; + } + + return false; +} + +void +prof_log_dummy_set(bool new_value) { + prof_log_dummy = new_value; +} +#endif + bool prof_log_start(tsdn_t *tsdn, const char *filename) { if (!opt_prof || !prof_booted) { @@ -2459,6 +2559,11 @@ prof_emitter_write_cb(void *opaque, const char *to_write) { struct prof_emitter_cb_arg_s *arg = (struct prof_emitter_cb_arg_s *)opaque; size_t bytes = strlen(to_write); +#ifdef JEMALLOC_JET + if (prof_log_dummy) { + return; + } +#endif arg->ret = write(arg->fd, (void *)to_write, bytes); } @@ -2607,7 +2712,17 @@ prof_log_stop(tsdn_t *tsdn) { emitter_t emitter; /* Create a file. */ - int fd = creat(log_filename, 0644); + + int fd; +#ifdef JEMALLOC_JET + if (prof_log_dummy) { + fd = 0; + } else { + fd = creat(log_filename, 0644); + } +#else + fd = creat(log_filename, 0644); +#endif if (fd == -1) { malloc_printf(": creat() for log file \"%s\" " @@ -2650,6 +2765,11 @@ prof_log_stop(tsdn_t *tsdn) { prof_logging_state = prof_logging_state_stopped; malloc_mutex_unlock(tsdn, &log_mtx); +#ifdef JEMALLOC_JET + if (prof_log_dummy) { + return false; + } +#endif return close(fd); } diff --git a/test/unit/prof_log.c b/test/unit/prof_log.c new file mode 100644 index 0000000..6a3464b --- /dev/null +++ b/test/unit/prof_log.c @@ -0,0 +1,146 @@ +#include "test/jemalloc_test.h" + +#define N_PARAM 100 +#define N_THREADS 10 + +static void assert_rep() { + assert_b_eq(prof_log_rep_check(), false, "Rep check failed"); +} + +static void assert_log_empty() { + assert_zu_eq(prof_log_bt_count(), 0, + "The log has backtraces; it isn't empty"); + assert_zu_eq(prof_log_thr_count(), 0, + "The log has threads; it isn't empty"); + assert_zu_eq(prof_log_alloc_count(), 0, + "The log has allocations; it isn't empty"); +} + +void *buf[N_PARAM]; + +static void f() { + int i; + for (i = 0; i < N_PARAM; i++) { + buf[i] = malloc(100); + } + for (i = 0; i < N_PARAM; i++) { + free(buf[i]); + } +} + +TEST_BEGIN(test_prof_log_many_logs) { + int i; + + test_skip_if(!config_prof); + + for (i = 0; i < N_PARAM; i++) { + assert_b_eq(prof_log_is_logging(), false, + "Logging shouldn't have started yet"); + assert_d_eq(mallctl("prof.log_start", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl failure when starting logging"); + assert_b_eq(prof_log_is_logging(), true, + "Logging should be started by now"); + assert_log_empty(); + assert_rep(); + f(); + assert_zu_eq(prof_log_thr_count(), 1, "Wrong thread count"); + assert_rep(); + assert_b_eq(prof_log_is_logging(), true, + "Logging should still be on"); + assert_d_eq(mallctl("prof.log_stop", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl failure when stopping logging"); + assert_b_eq(prof_log_is_logging(), false, + "Logging should have turned off"); + } +} +TEST_END + +thd_t thr_buf[N_THREADS]; + +static void *f_thread(void *unused) { + int i; + for (i = 0; i < N_PARAM; i++) { + void *p = malloc(100); + memset(p, 100, sizeof(char)); + free(p); + } + + return NULL; +} + +TEST_BEGIN(test_prof_log_many_threads) { + + test_skip_if(!config_prof); + + int i; + assert_d_eq(mallctl("prof.log_start", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl failure when starting logging"); + for (i = 0; i < N_THREADS; i++) { + thd_create(&thr_buf[i], &f_thread, NULL); + } + + for (i = 0; i < N_THREADS; i++) { + thd_join(thr_buf[i], NULL); + } + assert_zu_eq(prof_log_thr_count(), N_THREADS, + "Wrong number of thread entries"); + assert_rep(); + assert_d_eq(mallctl("prof.log_stop", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl failure when stopping logging"); +} +TEST_END + +static 
void f3() { + void *p = malloc(100); + free(p); +} + +static void f1() { + void *p = malloc(100); + f3(); + free(p); +} + +static void f2() { + void *p = malloc(100); + free(p); +} + +TEST_BEGIN(test_prof_log_many_traces) { + + test_skip_if(!config_prof); + + assert_d_eq(mallctl("prof.log_start", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl failure when starting logging"); + int i; + assert_rep(); + assert_log_empty(); + for (i = 0; i < N_PARAM; i++) { + assert_rep(); + f1(); + assert_rep(); + f2(); + assert_rep(); + f3(); + assert_rep(); + } + /* + * There should be 8 total backtraces: two for malloc/free in f1(), + * two for malloc/free in f2(), two for malloc/free in f3(), and then + * two for malloc/free in f1()'s call to f3(). + */ + assert_zu_eq(prof_log_bt_count(), 8, + "Wrong number of backtraces given sample workload"); + assert_d_eq(mallctl("prof.log_stop", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl failure when stopping logging"); +} +TEST_END + +int +main(void) { + prof_log_dummy_set(true); + return test_no_reentrancy( + test_prof_log_many_logs, + test_prof_log_many_traces, + test_prof_log_many_threads); +} diff --git a/test/unit/prof_log.sh b/test/unit/prof_log.sh new file mode 100644 index 0000000..8fcc7d8 --- /dev/null +++ b/test/unit/prof_log.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +if [ "x${enable_prof}" = "x1" ] ; then + export MALLOC_CONF="prof:true,lg_prof_sample:0" +fi -- cgit v0.12 From 33f1aa5badd2f9caf91991bab60df64a37c394bb Mon Sep 17 00:00:00 2001 From: Tyler Etzel Date: Mon, 30 Jul 2018 13:31:34 -0700 Subject: Fix comment on SC_NPSIZES. --- include/jemalloc/internal/sc.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/jemalloc/internal/sc.h b/include/jemalloc/internal/sc.h index 5b79bb4..ef0a451 100644 --- a/include/jemalloc/internal/sc.h +++ b/include/jemalloc/internal/sc.h @@ -196,11 +196,7 @@ (SC_LG_BASE_MAX - SC_LG_FIRST_REGULAR_BASE + 1) - 1) #define SC_NSIZES (SC_NTINY + SC_NPSEUDO + SC_NREGULAR) -/* - * The number of size classes that are at least a page in size. Note that - * because delta may be smaller than a page, this is not the same as the number - * of size classes that are *multiples* of the page size. - */ + /* The number of size classes that are a multiple of the page size. */ #define SC_NPSIZES ( \ /* Start with all the size classes. */ \ SC_NSIZES \ -- cgit v0.12 From c14e6c08192034d9140d61197d7c4981ca293610 Mon Sep 17 00:00:00 2001 From: Tyler Etzel Date: Tue, 31 Jul 2018 09:49:49 -0700 Subject: Add extents information to mallocstats output - Show number/bytes of extents of each size that are dirty, muzzy, retained. --- doc/jemalloc.xml.in | 33 +++++++- include/jemalloc/internal/arena_externs.h | 3 +- include/jemalloc/internal/arena_stats.h | 16 ++++ include/jemalloc/internal/ctl.h | 1 + include/jemalloc/internal/extent_externs.h | 4 + include/jemalloc/internal/extent_structs.h | 2 + include/jemalloc/internal/stats.h | 3 +- src/arena.c | 25 +++++- src/ctl.c | 82 +++++++++++++++++++- src/extent.c | 36 +++++++++ src/stats.c | 117 +++++++++++++++++++++++++++-- 11 files changed, 307 insertions(+), 15 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 0dcfb98..08d4830 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -433,10 +433,11 @@ for (i = 0; i < nbins; i++) { arena statistics, respectively; b and l can be specified to omit per size class statistics for bins and large objects, respectively; x can be specified to omit all mutex - statistics. 
Unrecognized characters are silently ignored. Note that - thread caching may prevent some statistics from being completely up to - date, since extra locking would be required to merge counters that track - thread cache operations. + statistics; e can be used to omit extent statistics. + Unrecognized characters are silently ignored. Note that thread caching + may prevent some statistics from being completely up to date, since extra + locking would be required to merge counters that track thread cache + operations. The malloc_usable_size() function returns the usable size of the allocation pointed to by @@ -2925,6 +2926,30 @@ struct extent_hooks_s { counters. + + + stats.arenas.<i>.extents.<j>.n{extent_type} + (size_t) + r- + [] + + Number of extents of the given type in this arena in + the bucket corresponding to page size index <j>. The extent type + is one of dirty, muzzy, or retained. + + + + + stats.arenas.<i>.extents.<j>.{extent_type}_bytes + (size_t) + r- + [] + + Sum of the bytes managed by extents of the given type + in this arena in the bucket corresponding to page size index <j>. + The extent type is one of dirty, muzzy, or retained. + + stats.arenas.<i>.lextents.<j>.nmalloc diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index 4f744ca..073e587 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -25,7 +25,8 @@ void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, - bin_stats_t *bstats, arena_stats_large_t *lstats); + bin_stats_t *bstats, arena_stats_large_t *lstats, + arena_stats_extents_t *estats); void arena_extents_dirty_dalloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extent_t *extent); #ifdef JEMALLOC_JET diff --git a/include/jemalloc/internal/arena_stats.h b/include/jemalloc/internal/arena_stats.h index 6dacf74..470ddfc 100644 --- a/include/jemalloc/internal/arena_stats.h +++ b/include/jemalloc/internal/arena_stats.h @@ -50,6 +50,22 @@ struct arena_stats_decay_s { arena_stats_u64_t purged; }; +typedef struct arena_stats_extents_s arena_stats_extents_t; +struct arena_stats_extents_s { + /* + * Stats for a given index in the range [0, SC_NPSIZES] in an extents_t. + * We track both bytes and # of extents: two extents in the same bucket + * may have different sizes if adjacent size classes differ by more than + * a page, so bytes cannot always be derived from # of extents. + */ + atomic_zu_t ndirty; + atomic_zu_t dirty_bytes; + atomic_zu_t nmuzzy; + atomic_zu_t muzzy_bytes; + atomic_zu_t nretained; + atomic_zu_t retained_bytes; +}; + /* * Arena stats. 
Note that fields marked "derived" are not directly maintained * within the arena code; rather their values are derived during stats merge diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index 5576310..775fdec 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -42,6 +42,7 @@ typedef struct ctl_arena_stats_s { bin_stats_t bstats[SC_NBINS]; arena_stats_large_t lstats[SC_NSIZES - SC_NBINS]; + arena_stats_extents_t estats[SC_NPSIZES]; } ctl_arena_stats_t; typedef struct ctl_stats_s { diff --git a/include/jemalloc/internal/extent_externs.h b/include/jemalloc/internal/extent_externs.h index b8a4d02..8680251 100644 --- a/include/jemalloc/internal/extent_externs.h +++ b/include/jemalloc/internal/extent_externs.h @@ -31,6 +31,10 @@ bool extents_init(tsdn_t *tsdn, extents_t *extents, extent_state_t state, bool delay_coalesce); extent_state_t extents_state_get(const extents_t *extents); size_t extents_npages_get(extents_t *extents); +/* Get the number of extents in the given page size index. */ +size_t extents_nextents_get(extents_t *extents, pszind_t ind); +/* Get the sum total bytes of the extents in the given page size index. */ +size_t extents_nbytes_get(extents_t *extents, pszind_t ind); extent_t *extents_alloc(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extents_t *extents, void *new_addr, size_t size, size_t pad, size_t alignment, bool slab, szind_t szind, diff --git a/include/jemalloc/internal/extent_structs.h b/include/jemalloc/internal/extent_structs.h index d709577..50e77bf 100644 --- a/include/jemalloc/internal/extent_structs.h +++ b/include/jemalloc/internal/extent_structs.h @@ -184,6 +184,8 @@ struct extents_s { * Synchronization: mtx. */ extent_heap_t heaps[SC_NPSIZES + 1]; + atomic_zu_t nextents[SC_NPSIZES + 1]; + atomic_zu_t nbytes[SC_NPSIZES + 1]; /* * Bitmap for which set bits correspond to non-empty heaps. 
diff --git a/include/jemalloc/internal/stats.h b/include/jemalloc/internal/stats.h index 852e342..3b9e0ea 100644 --- a/include/jemalloc/internal/stats.h +++ b/include/jemalloc/internal/stats.h @@ -10,7 +10,8 @@ OPTION('a', unmerged, config_stats, false) \ OPTION('b', bins, true, false) \ OPTION('l', large, true, false) \ - OPTION('x', mutex, true, false) + OPTION('x', mutex, true, false) \ + OPTION('e', extents, true, false) enum { #define OPTION(o, v, d, s) stats_print_option_num_##v, diff --git a/src/arena.c b/src/arena.c index da7fd78..ab3f138 100644 --- a/src/arena.c +++ b/src/arena.c @@ -82,7 +82,8 @@ void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, - bin_stats_t *bstats, arena_stats_large_t *lstats) { + bin_stats_t *bstats, arena_stats_large_t *lstats, + arena_stats_extents_t *estats) { cassert(config_stats); arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms, @@ -153,6 +154,28 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, curlextents * sz_index2size(SC_NBINS + i)); } + for (pszind_t i = 0; i < SC_NPSIZES; i++) { + size_t dirty, muzzy, retained, dirty_bytes, muzzy_bytes, + retained_bytes; + dirty = extents_nextents_get(&arena->extents_dirty, i); + muzzy = extents_nextents_get(&arena->extents_muzzy, i); + retained = extents_nextents_get(&arena->extents_retained, i); + dirty_bytes = extents_nbytes_get(&arena->extents_dirty, i); + muzzy_bytes = extents_nbytes_get(&arena->extents_muzzy, i); + retained_bytes = + extents_nbytes_get(&arena->extents_retained, i); + + atomic_store_zu(&estats[i].ndirty, dirty, ATOMIC_RELAXED); + atomic_store_zu(&estats[i].nmuzzy, muzzy, ATOMIC_RELAXED); + atomic_store_zu(&estats[i].nretained, retained, ATOMIC_RELAXED); + atomic_store_zu(&estats[i].dirty_bytes, dirty_bytes, + ATOMIC_RELAXED); + atomic_store_zu(&estats[i].muzzy_bytes, muzzy_bytes, + ATOMIC_RELAXED); + atomic_store_zu(&estats[i].retained_bytes, retained_bytes, + ATOMIC_RELAXED); + } + arena_stats_unlock(tsdn, &arena->stats); /* tcache_bytes counts currently cached bytes. 
*/ diff --git a/src/ctl.c b/src/ctl.c index 448ec7b..10bdc8e 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -173,6 +173,13 @@ CTL_PROTO(stats_arenas_i_lextents_j_ndalloc) CTL_PROTO(stats_arenas_i_lextents_j_nrequests) CTL_PROTO(stats_arenas_i_lextents_j_curlextents) INDEX_PROTO(stats_arenas_i_lextents_j) +CTL_PROTO(stats_arenas_i_extents_j_ndirty) +CTL_PROTO(stats_arenas_i_extents_j_nmuzzy) +CTL_PROTO(stats_arenas_i_extents_j_nretained) +CTL_PROTO(stats_arenas_i_extents_j_dirty_bytes) +CTL_PROTO(stats_arenas_i_extents_j_muzzy_bytes) +CTL_PROTO(stats_arenas_i_extents_j_retained_bytes) +INDEX_PROTO(stats_arenas_i_extents_j) CTL_PROTO(stats_arenas_i_nthreads) CTL_PROTO(stats_arenas_i_uptime) CTL_PROTO(stats_arenas_i_dss) @@ -395,7 +402,6 @@ static const ctl_named_node_t prof_node[] = { {NAME("log_start"), CTL(prof_log_start)}, {NAME("log_stop"), CTL(prof_log_stop)} }; - static const ctl_named_node_t stats_arenas_i_small_node[] = { {NAME("allocated"), CTL(stats_arenas_i_small_allocated)}, {NAME("nmalloc"), CTL(stats_arenas_i_small_nmalloc)}, @@ -466,6 +472,23 @@ static const ctl_indexed_node_t stats_arenas_i_lextents_node[] = { {INDEX(stats_arenas_i_lextents_j)} }; +static const ctl_named_node_t stats_arenas_i_extents_j_node[] = { + {NAME("ndirty"), CTL(stats_arenas_i_extents_j_ndirty)}, + {NAME("nmuzzy"), CTL(stats_arenas_i_extents_j_nmuzzy)}, + {NAME("nretained"), CTL(stats_arenas_i_extents_j_nretained)}, + {NAME("dirty_bytes"), CTL(stats_arenas_i_extents_j_dirty_bytes)}, + {NAME("muzzy_bytes"), CTL(stats_arenas_i_extents_j_muzzy_bytes)}, + {NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)} +}; + +static const ctl_named_node_t super_stats_arenas_i_extents_j_node[] = { + {NAME(""), CHILD(named, stats_arenas_i_extents_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_extents_node[] = { + {INDEX(stats_arenas_i_extents_j)} +}; + #define OP(mtx) MUTEX_PROF_DATA_NODE(arenas_i_mutexes_##mtx) MUTEX_PROF_ARENA_MUTEXES #undef OP @@ -502,6 +525,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = { {NAME("large"), CHILD(named, stats_arenas_i_large)}, {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)}, {NAME("lextents"), CHILD(indexed, stats_arenas_i_lextents)}, + {NAME("extents"), CHILD(indexed, stats_arenas_i_extents)}, {NAME("mutexes"), CHILD(named, stats_arenas_i_mutexes)} }; static const ctl_named_node_t super_stats_arenas_i_node[] = { @@ -718,6 +742,8 @@ ctl_arena_clear(ctl_arena_t *ctl_arena) { sizeof(bin_stats_t)); memset(ctl_arena->astats->lstats, 0, (SC_NSIZES - SC_NBINS) * sizeof(arena_stats_large_t)); + memset(ctl_arena->astats->estats, 0, SC_NPSIZES * + sizeof(arena_stats_extents_t)); } } @@ -731,7 +757,7 @@ ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) { &ctl_arena->muzzy_decay_ms, &ctl_arena->pactive, &ctl_arena->pdirty, &ctl_arena->pmuzzy, &ctl_arena->astats->astats, ctl_arena->astats->bstats, - ctl_arena->astats->lstats); + ctl_arena->astats->lstats, ctl_arena->astats->estats); for (i = 0; i < SC_NBINS; i++) { ctl_arena->astats->allocated_small += @@ -845,6 +871,7 @@ MUTEX_PROF_ARENA_MUTEXES sdstats->astats.uptime = astats->astats.uptime; } + /* Merge bin stats. */ for (i = 0; i < SC_NBINS; i++) { sdstats->bstats[i].nmalloc += astats->bstats[i].nmalloc; sdstats->bstats[i].ndalloc += astats->bstats[i].ndalloc; @@ -871,6 +898,7 @@ MUTEX_PROF_ARENA_MUTEXES &astats->bstats[i].mutex_data); } + /* Merge stats for large allocations. 
*/ for (i = 0; i < SC_NSIZES - SC_NBINS; i++) { ctl_accum_arena_stats_u64(&sdstats->lstats[i].nmalloc, &astats->lstats[i].nmalloc); @@ -885,6 +913,22 @@ MUTEX_PROF_ARENA_MUTEXES assert(astats->lstats[i].curlextents == 0); } } + + /* Merge extents stats. */ + for (i = 0; i < SC_NPSIZES; i++) { + accum_atomic_zu(&sdstats->estats[i].ndirty, + &astats->estats[i].ndirty); + accum_atomic_zu(&sdstats->estats[i].nmuzzy, + &astats->estats[i].nmuzzy); + accum_atomic_zu(&sdstats->estats[i].nretained, + &astats->estats[i].nretained); + accum_atomic_zu(&sdstats->estats[i].dirty_bytes, + &astats->estats[i].dirty_bytes); + accum_atomic_zu(&sdstats->estats[i].muzzy_bytes, + &astats->estats[i].muzzy_bytes); + accum_atomic_zu(&sdstats->estats[i].retained_bytes, + &astats->estats[i].retained_bytes); + } } } @@ -2918,6 +2962,40 @@ stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, return super_stats_arenas_i_lextents_j_node; } +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_ndirty, + atomic_load_zu( + &arenas_i(mib[2])->astats->estats[mib[4]].ndirty, + ATOMIC_RELAXED), size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nmuzzy, + atomic_load_zu( + &arenas_i(mib[2])->astats->estats[mib[4]].nmuzzy, + ATOMIC_RELAXED), size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nretained, + atomic_load_zu( + &arenas_i(mib[2])->astats->estats[mib[4]].nretained, + ATOMIC_RELAXED), size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_dirty_bytes, + atomic_load_zu( + &arenas_i(mib[2])->astats->estats[mib[4]].dirty_bytes, + ATOMIC_RELAXED), size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_muzzy_bytes, + atomic_load_zu( + &arenas_i(mib[2])->astats->estats[mib[4]].muzzy_bytes, + ATOMIC_RELAXED), size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_retained_bytes, + atomic_load_zu( + &arenas_i(mib[2])->astats->estats[mib[4]].retained_bytes, + ATOMIC_RELAXED), size_t); + +static const ctl_named_node_t * +stats_arenas_i_extents_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j >= SC_NPSIZES) { + return NULL; + } + return super_stats_arenas_i_extents_j_node; +} + static const ctl_named_node_t * stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { diff --git a/src/extent.c b/src/extent.c index 592974a..1af93bb 100644 --- a/src/extent.c +++ b/src/extent.c @@ -309,6 +309,32 @@ extents_npages_get(extents_t *extents) { return atomic_load_zu(&extents->npages, ATOMIC_RELAXED); } +size_t +extents_nextents_get(extents_t *extents, pszind_t pind) { + return atomic_load_zu(&extents->nextents[pind], ATOMIC_RELAXED); +} + +size_t +extents_nbytes_get(extents_t *extents, pszind_t pind) { + return atomic_load_zu(&extents->nbytes[pind], ATOMIC_RELAXED); +} + +static void +extents_stats_add(extents_t *extent, pszind_t pind, size_t sz) { + size_t cur = atomic_load_zu(&extent->nextents[pind], ATOMIC_RELAXED); + atomic_store_zu(&extent->nextents[pind], cur + 1, ATOMIC_RELAXED); + cur = atomic_load_zu(&extent->nbytes[pind], ATOMIC_RELAXED); + atomic_store_zu(&extent->nbytes[pind], cur + sz, ATOMIC_RELAXED); +} + +static void +extents_stats_sub(extents_t *extent, pszind_t pind, size_t sz) { + size_t cur = atomic_load_zu(&extent->nextents[pind], ATOMIC_RELAXED); + atomic_store_zu(&extent->nextents[pind], cur - 1, ATOMIC_RELAXED); + cur = atomic_load_zu(&extent->nbytes[pind], ATOMIC_RELAXED); + atomic_store_zu(&extent->nbytes[pind], cur - sz, ATOMIC_RELAXED); +} + static void extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t 
*extent) { malloc_mutex_assert_owner(tsdn, &extents->mtx); @@ -322,6 +348,11 @@ extents_insert_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) { (size_t)pind); } extent_heap_insert(&extents->heaps[pind], extent); + + if (config_stats) { + extents_stats_add(extents, pind, size); + } + extent_list_append(&extents->lru, extent); size_t npages = size >> LG_PAGE; /* @@ -344,6 +375,11 @@ extents_remove_locked(tsdn_t *tsdn, extents_t *extents, extent_t *extent) { size_t psz = extent_size_quantize_floor(size); pszind_t pind = sz_psz2ind(psz); extent_heap_remove(&extents->heaps[pind], extent); + + if (config_stats) { + extents_stats_sub(extents, pind, size); + } + if (extent_heap_empty(&extents->heaps[pind])) { bitmap_set(extents->bitmap, &extents_bitmap_info, (size_t)pind); diff --git a/src/stats.c b/src/stats.c index 64d7323..754b641 100644 --- a/src/stats.c +++ b/src/stats.c @@ -495,6 +495,108 @@ stats_arena_lextents_print(emitter_t *emitter, unsigned i) { } static void +stats_arena_extents_print(emitter_t *emitter, unsigned i) { + unsigned j; + bool in_gap, in_gap_prev; + emitter_row_t header_row; + emitter_row_init(&header_row); + emitter_row_t row; + emitter_row_init(&row); +#define COL(name, left_or_right, col_width, etype) \ + emitter_col_t header_##name; \ + emitter_col_init(&header_##name, &header_row); \ + header_##name.justify = emitter_justify_##left_or_right; \ + header_##name.width = col_width; \ + header_##name.type = emitter_type_title; \ + header_##name.str_val = #name; \ + \ + emitter_col_t col_##name; \ + emitter_col_init(&col_##name, &row); \ + col_##name.justify = emitter_justify_##left_or_right; \ + col_##name.width = col_width; \ + col_##name.type = emitter_type_##etype; + + COL(size, right, 20, size) + COL(ind, right, 4, unsigned) + COL(ndirty, right, 13, size) + COL(dirty, right, 13, size) + COL(nmuzzy, right, 13, size) + COL(muzzy, right, 13, size) + COL(nretained, right, 13, size) + COL(retained, right, 13, size) + COL(ntotal, right, 13, size) + COL(total, right, 13, size) +#undef COL + + /* Label this section. 
*/ + header_size.width -= 8; + emitter_table_printf(emitter, "extents:"); + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "extents"); + + in_gap = false; + for (j = 0; j < SC_NPSIZES; j++) { + size_t ndirty, nmuzzy, nretained, total, dirty_bytes, + muzzy_bytes, retained_bytes, total_bytes; + CTL_M2_M4_GET("stats.arenas.0.extents.0.ndirty", i, j, + &ndirty, size_t); + CTL_M2_M4_GET("stats.arenas.0.extents.0.nmuzzy", i, j, + &nmuzzy, size_t); + CTL_M2_M4_GET("stats.arenas.0.extents.0.nretained", i, j, + &nretained, size_t); + CTL_M2_M4_GET("stats.arenas.0.extents.0.dirty_bytes", i, j, + &dirty_bytes, size_t); + CTL_M2_M4_GET("stats.arenas.0.extents.0.muzzy_bytes", i, j, + &muzzy_bytes, size_t); + CTL_M2_M4_GET("stats.arenas.0.extents.0.retained_bytes", i, j, + &retained_bytes, size_t); + total = ndirty + nmuzzy + nretained; + total_bytes = dirty_bytes + muzzy_bytes + retained_bytes; + + in_gap_prev = in_gap; + in_gap = (total == 0); + + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + emitter_json_object_begin(emitter); + emitter_json_kv(emitter, "ndirty", emitter_type_size, &ndirty); + emitter_json_kv(emitter, "nmuzzy", emitter_type_size, &nmuzzy); + emitter_json_kv(emitter, "nretained", emitter_type_size, + &nretained); + + emitter_json_kv(emitter, "dirty_bytes", emitter_type_size, + &dirty_bytes); + emitter_json_kv(emitter, "muzzy_bytes", emitter_type_size, + &muzzy_bytes); + emitter_json_kv(emitter, "retained_bytes", emitter_type_size, + &retained_bytes); + emitter_json_object_end(emitter); + + col_size.size_val = sz_pind2sz(j); + col_ind.size_val = j; + col_ndirty.size_val = ndirty; + col_dirty.size_val = dirty_bytes; + col_nmuzzy.size_val = nmuzzy; + col_muzzy.size_val = muzzy_bytes; + col_nretained.size_val = nretained; + col_retained.size_val = retained_bytes; + col_ntotal.size_val = total; + col_total.size_val = total_bytes; + + if (!in_gap) { + emitter_table_row(emitter, &row); + } + } + emitter_json_array_end(emitter); /* Close "extents". */ + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +static void stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind) { emitter_row_t row; emitter_col_t col_name; @@ -521,7 +623,7 @@ stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind) { static void stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, - bool mutex) { + bool mutex, bool extents) { unsigned nthreads; const char *dss; ssize_t dirty_decay_ms, muzzy_decay_ms; @@ -820,6 +922,9 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, if (large) { stats_arena_lextents_print(emitter, i); } + if (extents) { + stats_arena_extents_print(emitter, i); + } } static void @@ -1066,7 +1171,7 @@ stats_general_print(emitter_t *emitter) { static void stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, - bool unmerged, bool bins, bool large, bool mutex) { + bool unmerged, bool bins, bool large, bool mutex, bool extents) { /* * These should be deleted. We keep them around for a while, to aid in * the transition to the emitter code. @@ -1187,7 +1292,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, emitter_table_printf(emitter, "Merged arenas stats:\n"); emitter_json_object_kv_begin(emitter, "merged"); stats_arena_print(emitter, MALLCTL_ARENAS_ALL, bins, - large, mutex); + large, mutex, extents); emitter_json_object_end(emitter); /* Close "merged". 
*/ } @@ -1198,7 +1303,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, "Destroyed arenas stats:\n"); emitter_json_object_kv_begin(emitter, "destroyed"); stats_arena_print(emitter, MALLCTL_ARENAS_DESTROYED, - bins, large, mutex); + bins, large, mutex, extents); emitter_json_object_end(emitter); /* Close "destroyed". */ } @@ -1214,7 +1319,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, emitter_table_printf(emitter, "arenas[%s]:\n", arena_ind_str); stats_arena_print(emitter, i, bins, - large, mutex); + large, mutex, extents); /* Close "". */ emitter_json_object_end(emitter); } @@ -1280,7 +1385,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } if (config_stats) { stats_print_helper(&emitter, merged, destroyed, unmerged, - bins, large, mutex); + bins, large, mutex, extents); } emitter_json_object_end(&emitter); /* Closes the "jemalloc" dict. */ -- cgit v0.12 From 126252a7e6bd098d649f6a82a947c7c056816c2c Mon Sep 17 00:00:00 2001 From: Tyler Etzel Date: Wed, 1 Aug 2018 14:14:33 -0700 Subject: Add stats for the size of extent_avail heap --- doc/jemalloc.xml.in | 11 +++++++++++ include/jemalloc/internal/arena_stats.h | 3 +++ include/jemalloc/internal/arena_structs_b.h | 1 + src/arena.c | 4 ++++ src/ctl.c | 8 ++++++++ src/extent.c | 2 ++ src/stats.c | 3 ++- 7 files changed, 31 insertions(+), 1 deletion(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 08d4830..058e9db 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -2597,6 +2597,17 @@ struct extent_hooks_s { details. + + + stats.arenas.<i>.extent_avail + (size_t) + r- + [] + + Number of allocated (but unused) extent structs in this + arena. + + stats.arenas.<i>.base diff --git a/include/jemalloc/internal/arena_stats.h b/include/jemalloc/internal/arena_stats.h index 470ddfc..ef1e25b 100644 --- a/include/jemalloc/internal/arena_stats.h +++ b/include/jemalloc/internal/arena_stats.h @@ -87,6 +87,9 @@ struct arena_stats_s { */ atomic_zu_t retained; /* Derived. */ + /* Number of extent_t structs allocated by base, but not being used. */ + atomic_zu_t extent_avail; + arena_stats_decay_t decay_dirty; arena_stats_decay_t decay_muzzy; diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h index 96f25f8..509f11c 100644 --- a/include/jemalloc/internal/arena_structs_b.h +++ b/include/jemalloc/internal/arena_structs_b.h @@ -196,6 +196,7 @@ struct arena_s { * Synchronization: extent_avail_mtx. 
*/ extent_tree_t extent_avail; + atomic_zu_t extent_avail_cnt; malloc_mutex_t extent_avail_mtx; /* diff --git a/src/arena.c b/src/arena.c index ab3f138..29f447b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -100,6 +100,10 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, arena_stats_accum_zu(&astats->retained, extents_npages_get(&arena->extents_retained) << LG_PAGE); + atomic_store_zu(&astats->extent_avail, + atomic_load_zu(&arena->extent_avail_cnt, ATOMIC_RELAXED), + ATOMIC_RELAXED); + arena_stats_accum_u64(&astats->decay_dirty.npurge, arena_stats_read_u64(tsdn, &arena->stats, &arena->stats.decay_dirty.npurge)); diff --git a/src/ctl.c b/src/ctl.c index 10bdc8e..b482fc5 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -190,6 +190,7 @@ CTL_PROTO(stats_arenas_i_pdirty) CTL_PROTO(stats_arenas_i_pmuzzy) CTL_PROTO(stats_arenas_i_mapped) CTL_PROTO(stats_arenas_i_retained) +CTL_PROTO(stats_arenas_i_extent_avail) CTL_PROTO(stats_arenas_i_dirty_npurge) CTL_PROTO(stats_arenas_i_dirty_nmadvise) CTL_PROTO(stats_arenas_i_dirty_purged) @@ -510,6 +511,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = { {NAME("pmuzzy"), CTL(stats_arenas_i_pmuzzy)}, {NAME("mapped"), CTL(stats_arenas_i_mapped)}, {NAME("retained"), CTL(stats_arenas_i_retained)}, + {NAME("extent_avail"), CTL(stats_arenas_i_extent_avail)}, {NAME("dirty_npurge"), CTL(stats_arenas_i_dirty_npurge)}, {NAME("dirty_nmadvise"), CTL(stats_arenas_i_dirty_nmadvise)}, {NAME("dirty_purged"), CTL(stats_arenas_i_dirty_purged)}, @@ -804,6 +806,8 @@ ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena, &astats->astats.mapped); accum_atomic_zu(&sdstats->astats.retained, &astats->astats.retained); + accum_atomic_zu(&sdstats->astats.extent_avail, + &astats->astats.extent_avail); } ctl_accum_arena_stats_u64(&sdstats->astats.decay_dirty.npurge, @@ -2764,6 +2768,10 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_mapped, CTL_RO_CGEN(config_stats, stats_arenas_i_retained, atomic_load_zu(&arenas_i(mib[2])->astats->astats.retained, ATOMIC_RELAXED), size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_extent_avail, + atomic_load_zu(&arenas_i(mib[2])->astats->astats.extent_avail, + ATOMIC_RELAXED), + size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_npurge, ctl_arena_stats_read_u64( diff --git a/src/extent.c b/src/extent.c index 1af93bb..847e4b9 100644 --- a/src/extent.c +++ b/src/extent.c @@ -186,6 +186,7 @@ extent_alloc(tsdn_t *tsdn, arena_t *arena) { return base_alloc_extent(tsdn, arena->base); } extent_avail_remove(&arena->extent_avail, extent); + atomic_fetch_sub_zu(&arena->extent_avail_cnt, 1, ATOMIC_RELAXED); malloc_mutex_unlock(tsdn, &arena->extent_avail_mtx); return extent; } @@ -194,6 +195,7 @@ void extent_dalloc(tsdn_t *tsdn, arena_t *arena, extent_t *extent) { malloc_mutex_lock(tsdn, &arena->extent_avail_mtx); extent_avail_insert(&arena->extent_avail, extent); + atomic_fetch_add_zu(&arena->extent_avail_cnt, 1, ATOMIC_RELAXED); malloc_mutex_unlock(tsdn, &arena->extent_avail_mtx); } diff --git a/src/stats.c b/src/stats.c index 754b641..e4e1337 100644 --- a/src/stats.c +++ b/src/stats.c @@ -628,7 +628,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, const char *dss; ssize_t dirty_decay_ms, muzzy_decay_ms; size_t page, pactive, pdirty, pmuzzy, mapped, retained; - size_t base, internal, resident, metadata_thp; + size_t base, internal, resident, metadata_thp, extent_avail; uint64_t dirty_npurge, dirty_nmadvise, dirty_purged; uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged; size_t 
small_allocated; @@ -911,6 +911,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, GET_AND_EMIT_MEM_STAT(metadata_thp) GET_AND_EMIT_MEM_STAT(tcache_bytes) GET_AND_EMIT_MEM_STAT(resident) + GET_AND_EMIT_MEM_STAT(extent_avail) #undef GET_AND_EMIT_MEM_STAT if (mutex) { -- cgit v0.12 From e8ec9528abac90efe4e0cc3a29da8d7aea59f23d Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Fri, 3 Aug 2018 12:47:40 -0700 Subject: Allow the use of readlinkat over readlink. This can be useful in situations where readlink is disallowed. --- configure.ac | 16 ++++++++++++++++ include/jemalloc/internal/jemalloc_internal_defs.h.in | 6 ++++++ src/jemalloc.c | 5 +++++ 3 files changed, 27 insertions(+) diff --git a/configure.ac b/configure.ac index e18bc4b..cd5bdd6 100644 --- a/configure.ac +++ b/configure.ac @@ -1277,6 +1277,22 @@ if test "x$enable_log" = "x1" ; then fi AC_SUBST([enable_log]) +dnl Do not use readlinkat by default +AC_ARG_ENABLE([readlinkat], + [AS_HELP_STRING([--enable-readlinkat], [Use readlinkat over readlink])], +[if test "x$enable_readlinkat" = "xno" ; then + enable_readlinkat="0" +else + enable_readlinkat="1" +fi +], +[enable_readlinkat="0"] +) +if test "x$enable_readlinkat" = "x1" ; then + AC_DEFINE([JEMALLOC_READLINKAT], [ ]) +fi +AC_SUBST([enable_readlinkat]) + JE_COMPILABLE([a program using __builtin_unreachable], [ void foo (void) { diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 8dad9a1..cec41aa 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -246,6 +246,12 @@ #undef JEMALLOC_LOG /* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +#undef JEMALLOC_READLINKAT + +/* * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. */ #undef JEMALLOC_ZONE diff --git a/src/jemalloc.c b/src/jemalloc.c index e8f110f..2828c17 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -961,7 +961,12 @@ malloc_conf_init(sc_data_t *sc_data) { * Try to use the contents of the "/etc/malloc.conf" * symbolic link's name. */ +#ifndef JEMALLOC_READLINKAT linklen = readlink(linkname, buf, sizeof(buf) - 1); +#else + linklen = readlinkat(AT_FDCWD, linkname, buf, + sizeof(buf) - 1); +#endif if (linklen == -1) { /* No configuration specified. */ linklen = 0; -- cgit v0.12 From 0771ff2cea6dc18fcd3f6bf452b4224a4e17ae38 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sun, 5 Aug 2018 10:37:53 +0100 Subject: FreeBSD build changes and allow to run the tests. 
--- include/jemalloc/internal/mutex.h | 13 ++++++++++--- scripts/gen_run_tests.py | 25 +++++++++++++++++++------ src/pages.c | 2 -- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h index 651ce5f..5a955d9 100644 --- a/include/jemalloc/internal/mutex.h +++ b/include/jemalloc/internal/mutex.h @@ -115,9 +115,16 @@ struct malloc_mutex_s { {{{LOCK_PROF_DATA_INITIALIZER, 0}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} #elif (defined(JEMALLOC_MUTEX_INIT_CB)) -# define MALLOC_MUTEX_INITIALIZER \ - {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \ - WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# if (defined(JEMALLOC_DEBUG)) +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif + #else # define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT # if defined(JEMALLOC_DEBUG) diff --git a/scripts/gen_run_tests.py b/scripts/gen_run_tests.py index a87ecff..5052b3e 100755 --- a/scripts/gen_run_tests.py +++ b/scripts/gen_run_tests.py @@ -4,6 +4,7 @@ import sys from itertools import combinations from os import uname from multiprocessing import cpu_count +from subprocess import call # Later, we want to test extended vaddr support. Apparently, the "real" way of # checking this is flaky on OS X. @@ -13,13 +14,25 @@ nparallel = cpu_count() * 2 uname = uname()[0] +if "BSD" in uname: + make_cmd = 'gmake' +else: + make_cmd = 'make' + def powerset(items): result = [] for i in xrange(len(items) + 1): result += combinations(items, i) return result -possible_compilers = [('gcc', 'g++'), ('clang', 'clang++')] +possible_compilers = [] +for cc, cxx in (['gcc', 'g++'], ['clang', 'clang++']): + try: + cmd_ret = call([cc, "-v"]) + if cmd_ret == 0: + possible_compilers.append((cc, cxx)) + except: + pass possible_compiler_opts = [ '-m32', ] @@ -39,7 +52,7 @@ possible_malloc_conf_opts = [ ] print 'set -e' -print 'if [ -f Makefile ] ; then make relclean ; fi' +print 'if [ -f Makefile ] ; then %(make_cmd)s relclean ; fi' % {'make_cmd': make_cmd} print 'autoconf' print 'rm -rf run_tests.out' print 'mkdir run_tests.out' @@ -102,11 +115,11 @@ cd run_test_%(ind)d.out echo "==> %(config_line)s" >> run_test.log %(config_line)s >> run_test.log 2>&1 || abort -run_cmd make all tests -run_cmd make check -run_cmd make distclean +run_cmd %(make_cmd)s all tests +run_cmd %(make_cmd)s check +run_cmd %(make_cmd)s distclean EOF -chmod 755 run_test_%(ind)d.sh""" % {'ind': ind, 'config_line': config_line} +chmod 755 run_test_%(ind)d.sh""" % {'ind': ind, 'config_line': config_line, 'make_cmd': make_cmd} ind += 1 print 'for i in `seq 0 %(last_ind)d` ; do echo run_test_${i}.sh ; done | xargs -P %(nparallel)d -n 1 sh' % {'last_ind': ind-1, 'nparallel': nparallel} diff --git a/src/pages.c b/src/pages.c index cc967fc..9561eb3 100644 --- a/src/pages.c +++ b/src/pages.c @@ -390,8 +390,6 @@ os_page_detect(void) { SYSTEM_INFO si; GetSystemInfo(&si); return si.dwPageSize; -#elif defined(__FreeBSD__) - return getpagesize(); #else long result = sysconf(_SC_PAGESIZE); if (result == -1) { -- cgit v0.12 From 1f71e1ca4319de7788d53d1d0ba905995c7f52bd Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Wed, 1 Aug 2018 14:22:05 -0700 Subject: Add hook microbenchmark. 
--- Makefile.in | 4 ++- test/stress/hookbench.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 test/stress/hookbench.c diff --git a/Makefile.in b/Makefile.in index 49585ed..c35bb7e 100644 --- a/Makefile.in +++ b/Makefile.in @@ -242,7 +242,9 @@ else CPP_SRCS := TESTS_INTEGRATION_CPP := endif -TESTS_STRESS := $(srcroot)test/stress/microbench.c +TESTS_STRESS := $(srcroot)test/stress/microbench.c \ + $(srcroot)test/stress/hookbench.c + TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_INTEGRATION_CPP) $(TESTS_STRESS) diff --git a/test/stress/hookbench.c b/test/stress/hookbench.c new file mode 100644 index 0000000..97e90b0 --- /dev/null +++ b/test/stress/hookbench.c @@ -0,0 +1,73 @@ +#include "test/jemalloc_test.h" + +static void +noop_alloc_hook(void *extra, hook_alloc_t type, void *result, + uintptr_t result_raw, uintptr_t args_raw[3]) { +} + +static void +noop_dalloc_hook(void *extra, hook_dalloc_t type, void *address, + uintptr_t args_raw[3]) { +} + +static void +noop_expand_hook(void *extra, hook_expand_t type, void *address, + size_t old_usize, size_t new_usize, uintptr_t result_raw, + uintptr_t args_raw[4]) { +} + +static void +malloc_free_loop(int iters) { + for (int i = 0; i < iters; i++) { + void *p = mallocx(1, 0); + free(p); + } +} + +static void +test_hooked(int iters) { + hooks_t hooks = {&noop_alloc_hook, &noop_dalloc_hook, &noop_expand_hook, + NULL}; + + int err; + void *handles[HOOK_MAX]; + size_t sz = sizeof(handles[0]); + + for (int i = 0; i < HOOK_MAX; i++) { + err = mallctl("experimental.hooks.install", &handles[i], + &sz, &hooks, sizeof(hooks)); + assert(err == 0); + + timedelta_t timer; + timer_start(&timer); + malloc_free_loop(iters); + timer_stop(&timer); + malloc_printf("With %d hook%s: %"FMTu64"us\n", i + 1, + i + 1 == 1 ? "" : "s", timer_usec(&timer)); + } + for (int i = 0; i < HOOK_MAX; i++) { + err = mallctl("experimental.hooks.remove", NULL, NULL, + &handles[i], sizeof(handles[i])); + assert(err == 0); + } +} + +static void +test_unhooked(int iters) { + timedelta_t timer; + timer_start(&timer); + malloc_free_loop(iters); + timer_stop(&timer); + + malloc_printf("Without hooks: %"FMTu64"us\n", timer_usec(&timer)); +} + +int +main(void) { + /* Initialize */ + free(mallocx(1, 0)); + int iters = 10 * 1000 * 1000; + malloc_printf("Benchmarking hooks with %d iterations:\n", iters); + test_hooked(iters); + test_unhooked(iters); +} -- cgit v0.12 From 36eb0b3d77404f389cfddad6675fe1f479e76be7 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Tue, 10 Jul 2018 13:58:37 +0200 Subject: Add valgrind build bots to CI This commit adds two build-bots to CI that test the release builds of jemalloc on linux and macOS under valgrind. The macOS build is not enabled because valgrind reports errors about reads of uninitialized memory in some tests and segfaults in others. 
--- .travis.yml | 9 +++++++++ scripts/gen_travis.py | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/.travis.yml b/.travis.yml index cd3be83..07d3081 100644 --- a/.travis.yml +++ b/.travis.yml @@ -119,9 +119,18 @@ matrix: env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=dss:primary,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + # Development build - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + # Valgrind + - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" JEMALLOC_TEST_PREFIX="valgrind" + addons: + apt: + packages: + - valgrind + before_script: - autoconf diff --git a/scripts/gen_travis.py b/scripts/gen_travis.py index 4473205..743f1e5 100755 --- a/scripts/gen_travis.py +++ b/scripts/gen_travis.py @@ -113,8 +113,28 @@ for combination in unusual_combinations_to_test: # Development build include_rows += '''\ + # Development build - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" ''' +# Valgrind build bots +include_rows += ''' + # Valgrind + - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" JEMALLOC_TEST_PREFIX="valgrind" + addons: + apt: + packages: + - valgrind +''' + +# To enable valgrind on macosx add: +# +# - os: osx +# env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" JEMALLOC_TEST_PREFIX="valgrind" +# install: brew install valgrind +# +# It currently fails due to: https://github.com/jemalloc/jemalloc/issues/1274 + print travis_template % include_rows -- cgit v0.12 From 4c548a61c89b0472b9952fcc4090eb00c2a88870 Mon Sep 17 00:00:00 2001 From: Rajeev Misra Date: Fri, 10 Aug 2018 20:27:35 -0700 Subject: Bit_util: Use intrinsics for pow2_ceil, where available. --- include/jemalloc/internal/bit_util.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h index 521f71b..27a8c97 100644 --- a/include/jemalloc/internal/bit_util.h +++ b/include/jemalloc/internal/bit_util.h @@ -63,6 +63,22 @@ ffs_u32(uint32_t bitmap) { BIT_UTIL_INLINE uint64_t pow2_ceil_u64(uint64_t x) { +#if (defined(__amd64__) || defined(__x86_64__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) + if(unlikely(x <= 1)) { + return x; + } + size_t msb_on_index; +#if (defined(__amd64__) || defined(__x86_64__)) + asm ("bsrq %1, %0" + : "=r"(msb_on_index) // Outputs. + : "r"(x-1) // Inputs. + ); +#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ)) + msb_on_index = (63 ^ __builtin_clzll(x - 1)); +#endif + assert(msb_on_index < 63); + return 1ULL << (msb_on_index + 1); +#else x--; x |= x >> 1; x |= x >> 2; @@ -72,10 +88,27 @@ pow2_ceil_u64(uint64_t x) { x |= x >> 32; x++; return x; +#endif } BIT_UTIL_INLINE uint32_t pow2_ceil_u32(uint32_t x) { +#if (defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) + if(unlikely(x <= 1)) { + return x; + } + size_t msb_on_index; +#if (defined(__i386__)) + asm ("bsr %1, %0" + : "=r"(msb_on_index) // Outputs. 
+ : "r"(x-1) // Inputs. + ); +#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ)) + msb_on_index = (31 ^ __builtin_clz(x - 1)); +#endif + assert(msb_on_index < 31); + return 1U << (msb_on_index + 1); +#else x--; x |= x >> 1; x |= x >> 2; @@ -84,6 +117,7 @@ pow2_ceil_u32(uint32_t x) { x |= x >> 16; x++; return x; +#endif } /* Compute the smallest power of 2 that is >= x. */ -- cgit v0.12 From 9f43defb6eac30c36dbde25d82e88be23f97309f Mon Sep 17 00:00:00 2001 From: rustyx Date: Fri, 31 Aug 2018 15:45:47 +0200 Subject: Add sc.c to the MSVC project --- msvc/projects/vc2015/jemalloc/jemalloc.vcxproj | 1 + msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters | 3 +++ msvc/projects/vc2017/jemalloc/jemalloc.vcxproj | 1 + msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters | 3 +++ 4 files changed, 8 insertions(+) diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj index be252d7..ddc6781 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj @@ -59,6 +59,7 @@ + diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters index 00d0960..1dcf4ed 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters @@ -70,6 +70,9 @@ Source Files + + Source Files + Source Files diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj index 599cc42..21481d5 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj @@ -59,6 +59,7 @@ + diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters index b352721..466dc63 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters @@ -70,6 +70,9 @@ Source Files + + Source Files + Source Files -- cgit v0.12 From 88771fa0138c75a2d29601cc33025d81822b082a Mon Sep 17 00:00:00 2001 From: David Goldblatt Date: Wed, 12 Sep 2018 15:32:16 -0700 Subject: Bootstrapping: don't overwrite opt_prof_prefix. --- src/jemalloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 2828c17..15c0609 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1387,13 +1387,18 @@ malloc_init_hard_a0_locked() { * out of sc_data_global are final. */ sc_boot(&sc_data); + /* + * prof_boot0 only initializes opt_prof_prefix. We need to do it before + * we parse malloc_conf options, in case malloc_conf parsing overwrites + * it. + */ + if (config_prof) { + prof_boot0(); + } malloc_conf_init(&sc_data); sz_boot(&sc_data); bin_boot(&sc_data); - if (config_prof) { - prof_boot0(); - } if (opt_stats_print) { /* Print statistics at exit. */ if (atexit(stats_print_atexit) != 0) { -- cgit v0.12 From 115ce93562ab76f90a2509bf0640bc7df6b2d48f Mon Sep 17 00:00:00 2001 From: Rajeev Misra Date: Thu, 23 Aug 2018 20:58:48 -0700 Subject: bit_util: Don't use __builtin_clz on s390x There's an optimizer bug upstream that results in test failures; reported at https://bugzilla.redhat.com/show_bug.cgi?id=1619354. This works around the failure reported at https://github.com/jemalloc/jemalloc/issues/1307. 
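The clz-based rounding being guarded here can be read in isolation as the following sketch (assuming GCC/Clang's __builtin_clzll; the helper name is illustrative, not jemalloc's):

#include <assert.h>
#include <stdint.h>

/* Round x up to the next power of two; x == 0 and x == 1 return x. */
static uint64_t
pow2_ceil_u64_sketch(uint64_t x) {
	if (x <= 1) {
		return x;
	}
	/* Index of the highest set bit of x - 1: 63 - clz == 63 ^ clz. */
	unsigned msb_on_index = 63 ^ (unsigned)__builtin_clzll(x - 1);
	assert(msb_on_index < 63); /* x <= 2^63, so the shift is safe. */
	return 1ULL << (msb_on_index + 1);
}

/* E.g. pow2_ceil_u64_sketch(5) == 8 and pow2_ceil_u64_sketch(8) == 8. */

Computing the bit index of x - 1 rather than x is what makes exact powers of two map to themselves, which is why both the bsr asm and the __builtin_clz paths in the patch above operate on x - 1.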
--- include/jemalloc/internal/bit_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h index 27a8c97..8c59c39 100644 --- a/include/jemalloc/internal/bit_util.h +++ b/include/jemalloc/internal/bit_util.h @@ -93,7 +93,7 @@ pow2_ceil_u64(uint64_t x) { BIT_UTIL_INLINE uint32_t pow2_ceil_u32(uint32_t x) { -#if (defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) +#if ((defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) && (!defined(__s390__))) if(unlikely(x <= 1)) { return x; } -- cgit v0.12 From 676cdd66792ccb629a978837ea2a066d5db342cc Mon Sep 17 00:00:00 2001 From: Edward Tomasz Napierala Date: Sat, 23 Jun 2018 05:44:23 +0100 Subject: Disable runtime detection of lazy purging support on FreeBSD. The check doesn't seem to serve any purpose here, and this shaves off three syscalls on binary startup. --- src/pages.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/pages.c b/src/pages.c index 9561eb3..7ef3de7 100644 --- a/src/pages.c +++ b/src/pages.c @@ -586,6 +586,11 @@ pages_boot(void) { init_thp_state(); +#ifdef __FreeBSD__ + /* + * FreeBSD doesn't need the check; madvise(2) is known to work. + */ +#else /* Detect lazy purge runtime support. */ if (pages_can_purge_lazy) { bool committed = false; @@ -599,6 +604,7 @@ pages_boot(void) { } os_pages_unmap(madv_free_page, PAGE); } +#endif return false; } -- cgit v0.12 From f80c97e477d1b3fe7778c65d9439d673738b4131 Mon Sep 17 00:00:00 2001 From: Edward Tomasz Napierala Date: Sat, 23 Jun 2018 06:51:33 +0100 Subject: Rework the way jemalloc uses mmap(2) on FreeBSD. This makes it directly use MAP_EXCL and MAP_ALIGNED() instead of weird workarounds involving mapping at random places and then unmapping parts of them. --- src/pages.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/pages.c b/src/pages.c index 7ef3de7..88a9d63 100644 --- a/src/pages.c +++ b/src/pages.c @@ -180,6 +180,31 @@ pages_map(void *addr, size_t size, size_t alignment, bool *commit) { assert(alignment >= PAGE); assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr); +#if defined(__FreeBSD__) && defined(MAP_EXCL) + /* + * FreeBSD has mechanisms both to mmap at specific address without + * touching existing mappings, and to mmap with specific alignment. + */ + { + int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT; + int flags = mmap_flags; + + if (addr != NULL) { + flags |= MAP_FIXED | MAP_EXCL; + } else { + unsigned alignment_bits = ffs_zu(alignment); + assert(alignment_bits > 1); + flags |= MAP_ALIGNED(alignment_bits - 1); + } + + void *ret = mmap(addr, size, prot, flags, -1, 0); + if (ret == MAP_FAILED) { + ret = NULL; + } + + return ret; + } +#endif /* * Ideally, there would be a way to specify alignment to mmap() (like * NetBSD has), but in the absence of such a feature, we have to work -- cgit v0.12 From 856319dc8a3d15c3eddf83d106e01e6f63c349a7 Mon Sep 17 00:00:00 2001 From: jsteemann Date: Fri, 5 Oct 2018 01:29:19 +0200 Subject: check return value of `malloc_read_fd` in case `malloc_read_fd` returns a negative error number, the result would afterwards be casted to an unsigned size_t, and may have theoretically caused an out-of-bounds memory access in the following `strncmp` call. 
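The failure mode is worth seeing in isolation. In the sketch below (hypothetical helper and state string, not jemalloc's actual values), a negative ssize_t silently becomes a huge size_t at the strncmp() call:

#include <string.h>
#include <sys/types.h>

static int
thp_state_matches(const char *buf, ssize_t nread, const char *state) {
	if (nread < 0) {
		return 0; /* Read failed; buf contents are unspecified. */
	}
	/*
	 * Without the check above, a return value of -1 would convert to
	 * SIZE_MAX here, and strncmp() could walk far past the end of buf
	 * if buf happens to contain no NUL terminator.
	 */
	return strncmp(buf, state, (size_t)nread) == 0;
}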
--- src/pages.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pages.c b/src/pages.c index 88a9d63..479a89e 100644 --- a/src/pages.c +++ b/src/pages.c @@ -567,6 +567,10 @@ init_thp_state(void) { close(fd); #endif + if (nread < 0) { + goto label_error; + } + if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) { init_system_thp_mode = thp_mode_default; } else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) { -- cgit v0.12 From 09adf18f1aefcee71cc716f4f366c7e2e889b7fa Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 8 Oct 2018 12:29:57 -0700 Subject: Remove a branch from cache_bin_alloc_easy Combine the branches for checking for an empty cache_bin, and checking for the low watermark. --- include/jemalloc/internal/cache_bin.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h index 12f3ef2..40d942e 100644 --- a/include/jemalloc/internal/cache_bin.h +++ b/include/jemalloc/internal/cache_bin.h @@ -88,11 +88,21 @@ JEMALLOC_ALWAYS_INLINE void * cache_bin_alloc_easy(cache_bin_t *bin, bool *success) { void *ret; - if (unlikely(bin->ncached == 0)) { - bin->low_water = -1; - *success = false; - return NULL; + bin->ncached--; + + /* + * Check for both bin->ncached == 0 and ncached < low_water + * in a single branch. + */ + if (unlikely(bin->ncached <= bin->low_water)) { + bin->low_water = bin->ncached; + if (bin->ncached == -1) { + bin->ncached = 0; + *success = false; + return NULL; + } } + /* * success (instead of ret) should be checked upon the return of this * function. We avoid checking (ret == NULL) because there is never a @@ -101,12 +111,7 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) { * cacheline). */ *success = true; - ret = *(bin->avail - bin->ncached); - bin->ncached--; - - if (unlikely(bin->ncached < bin->low_water)) { - bin->low_water = bin->ncached; - } + ret = *(bin->avail - (bin->ncached + 1)); return ret; } -- cgit v0.12 From 9ed3bdc8484049bd304c771a1b10070d5d7c95db Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 9 Oct 2018 10:59:02 -0700 Subject: move bytes until sample to tsd. Fastpath allocation does not need to load tdata now, avoiding several branches. 
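The resulting fast path has roughly this shape (a simplified sketch with illustrative types, not the patch itself): the common case reads and updates only a thread-local counter, and the profiling tdata is consulted only once the counter is exhausted.

#include <stdbool.h>
#include <stdint.h>

struct tsd_sketch {
	uint64_t bytes_until_sample; /* Cached in TSD by this change. */
};

static bool
sample_check_sketch(struct tsd_sketch *tsd, uint64_t usize) {
	if (tsd->bytes_until_sample >= usize) { /* Likely branch. */
		tsd->bytes_until_sample -= usize;
		return false; /* No sample; tdata is never loaded. */
	}
	/* Slow path only: load tdata, recompute the threshold, sample. */
	return true;
}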
--- include/jemalloc/internal/prof_inlines_b.h | 8 +++++--- include/jemalloc/internal/prof_structs.h | 1 - include/jemalloc/internal/tsd.h | 2 ++ src/prof.c | 13 ++++++------- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index 5e0b064..bfc66f7 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -82,6 +82,7 @@ JEMALLOC_ALWAYS_INLINE bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, prof_tdata_t **tdata_out) { prof_tdata_t *tdata; + uint64_t bytes_until_sample; cassert(config_prof); @@ -98,9 +99,10 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, return true; } - if (likely(tdata->bytes_until_sample >= usize)) { - if (update) { - tdata->bytes_until_sample -= usize; + bytes_until_sample = tsd_bytes_until_sample_get(tsd); + if (likely(bytes_until_sample >= usize)) { + if (update && tsd_nominal(tsd)) { + tsd_bytes_until_sample_set(tsd, bytes_until_sample - usize); } return true; } else { diff --git a/include/jemalloc/internal/prof_structs.h b/include/jemalloc/internal/prof_structs.h index 0d58ae1..34ed482 100644 --- a/include/jemalloc/internal/prof_structs.h +++ b/include/jemalloc/internal/prof_structs.h @@ -169,7 +169,6 @@ struct prof_tdata_s { /* Sampling state. */ uint64_t prng_state; - uint64_t bytes_until_sample; /* State used to avoid dumping while operating on prof internals. */ bool enq; diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 59a1885..69fb05c 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -68,6 +68,7 @@ typedef void (*test_callback_t)(int *); O(offset_state, uint64_t, uint64_t) \ O(thread_allocated, uint64_t, uint64_t) \ O(thread_deallocated, uint64_t, uint64_t) \ + O(bytes_until_sample, uint64_t, uint64_t) \ O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \ O(iarena, arena_t *, arena_t *) \ @@ -86,6 +87,7 @@ typedef void (*test_callback_t)(int *); 0, \ 0, \ 0, \ + 0, \ NULL, \ RTREE_CTX_ZERO_INITIALIZER, \ NULL, \ diff --git a/src/prof.c b/src/prof.c index 458c6cd..83d492d 100644 --- a/src/prof.c +++ b/src/prof.c @@ -1136,15 +1136,12 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) { void prof_sample_threshold_update(prof_tdata_t *tdata) { #ifdef JEMALLOC_PROF - uint64_t r; - double u; - if (!config_prof) { return; } if (lg_prof_sample == 0) { - tdata->bytes_until_sample = 0; + tsd_bytes_until_sample_set(tsd_fetch(), 0); return; } @@ -1166,11 +1163,13 @@ prof_sample_threshold_update(prof_tdata_t *tdata) { * pp 500 * (http://luc.devroye.org/rnbookindex.html) */ - r = prng_lg_range_u64(&tdata->prng_state, 53); - u = (double)r * (1.0/9007199254740992.0L); - tdata->bytes_until_sample = (uint64_t)(log(u) / + uint64_t r = prng_lg_range_u64(&tdata->prng_state, 53); + double u = (double)r * (1.0/9007199254740992.0L); + uint64_t bytes_until_sample = (uint64_t)(log(u) / log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + (uint64_t)1U; + tsd_bytes_until_sample_set(tsd_fetch(), bytes_until_sample); + #endif } -- cgit v0.12 From 0ac524308d3f636d1a4b5149fa7adf24cf426d9c Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 9 Oct 2018 11:07:24 -0700 Subject: refactor prof accum, so that tdata is not loaded if we aren't going to sample. 
--- include/jemalloc/internal/prof_inlines_b.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index bfc66f7..b2f5a04 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -86,6 +86,14 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, cassert(config_prof); + bytes_until_sample = tsd_bytes_until_sample_get(tsd); + if (likely(bytes_until_sample >= usize)) { + if (update && tsd_nominal(tsd)) { + tsd_bytes_until_sample_set(tsd, bytes_until_sample - usize); + } + return true; + } + tdata = prof_tdata_get(tsd, true); if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) { tdata = NULL; @@ -99,22 +107,14 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, return true; } - bytes_until_sample = tsd_bytes_until_sample_get(tsd); - if (likely(bytes_until_sample >= usize)) { - if (update && tsd_nominal(tsd)) { - tsd_bytes_until_sample_set(tsd, bytes_until_sample - usize); - } + if (tsd_reentrancy_level_get(tsd) > 0) { return true; - } else { - if (tsd_reentrancy_level_get(tsd) > 0) { - return true; - } - /* Compute new sample threshold. */ - if (update) { - prof_sample_threshold_update(tdata); - } - return !tdata->active; } + /* Compute new sample threshold. */ + if (update) { + prof_sample_threshold_update(tdata); + } + return !tdata->active; } JEMALLOC_ALWAYS_INLINE prof_tctx_t * -- cgit v0.12 From d1a861fa80c66221be8c4d94e51128a4641809da Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 9 Oct 2018 11:16:19 -0700 Subject: add a check for SC_LARGE_MAXCLASS If we assume SC_LARGE_MAXCLASS will always fit in a SSIZE_T, then we can optimize some checks by unconditional subtraction, and then checking flags only, without a compare statement in x86. --- src/sc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/sc.c b/src/sc.c index 8784bdd..89ddb6b 100644 --- a/src/sc.c +++ b/src/sc.c @@ -244,6 +244,15 @@ size_classes( assert(sc_data->large_minclass == SC_LARGE_MINCLASS); assert(sc_data->lg_large_minclass == SC_LG_LARGE_MINCLASS); assert(sc_data->large_maxclass == SC_LARGE_MAXCLASS); + + /* + * In the allocation fastpath, we want to assume that we can + * unconditionally subtract the requested allocation size from + * a ssize_t, and detect passing through 0 correctly. This + * results in optimal generated code. For this to work, the + * maximum allocation size must be less than SSIZE_MAX. + */ + assert(SC_LARGE_MAXCLASS < SSIZE_MAX); } void -- cgit v0.12 From 997d86acc6d2cc632b79669ebf3f938290e9f5da Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 9 Oct 2018 11:25:36 -0700 Subject: restrict bytes_until_sample to int64_t. This allows optimal asm generation of sub bytes_until_sample, usize; je; for x86 arch. Subtraction is unconditional, and only flags are checked for the jump, no extra compare is necessary. This also reduces register pressure. 
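On x86 the win comes from folding the decrement and the comparison into the flags of a single instruction. A minimal sketch of the pattern (illustrative; a file-scope counter stands in for the TSD slot):

#include <stdbool.h>
#include <stdint.h>

static int64_t bytes_until_sample_sketch; /* int64_t, not uint64_t. */

static bool
consume_sketch(uint64_t usize) {
	bytes_until_sample_sketch -= (int64_t)usize;
	/*
	 * Typically compiles to sub plus a jump on the resulting flags;
	 * no separate cmp. This requires usize <= SSIZE_MAX (hence the
	 * SC_LARGE_MAXCLASS < SSIZE_MAX assertion added in the previous
	 * commit), so a single subtraction cannot wrap past the sign bit.
	 */
	return bytes_until_sample_sketch < 0; /* true => take a sample. */
}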
--- include/jemalloc/internal/prof_inlines_b.h | 14 +++++++++----- include/jemalloc/internal/tsd.h | 2 +- src/prof.c | 3 +++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index b2f5a04..085111f 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -82,17 +82,21 @@ JEMALLOC_ALWAYS_INLINE bool prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, prof_tdata_t **tdata_out) { prof_tdata_t *tdata; - uint64_t bytes_until_sample; + int64_t bytes_until_sample; cassert(config_prof); + ssize_t check = update ? 0 : usize; bytes_until_sample = tsd_bytes_until_sample_get(tsd); - if (likely(bytes_until_sample >= usize)) { - if (update && tsd_nominal(tsd)) { - tsd_bytes_until_sample_set(tsd, bytes_until_sample - usize); + if (update) { + bytes_until_sample -= usize; + if (tsd_nominal(tsd)) { + tsd_bytes_until_sample_set(tsd, bytes_until_sample); } - return true; } + if (likely(bytes_until_sample >= check)) { + return true; + } tdata = prof_tdata_get(tsd, true); if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) { diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 69fb05c..c931441 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -68,7 +68,7 @@ typedef void (*test_callback_t)(int *); O(offset_state, uint64_t, uint64_t) \ O(thread_allocated, uint64_t, uint64_t) \ O(thread_deallocated, uint64_t, uint64_t) \ - O(bytes_until_sample, uint64_t, uint64_t) \ + O(bytes_until_sample, int64_t, int64_t) \ O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \ O(iarena, arena_t *, arena_t *) \ diff --git a/src/prof.c b/src/prof.c index 83d492d..71de2d3 100644 --- a/src/prof.c +++ b/src/prof.c @@ -1168,6 +1168,9 @@ prof_sample_threshold_update(prof_tdata_t *tdata) { uint64_t bytes_until_sample = (uint64_t)(log(u) / log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + (uint64_t)1U; + if (bytes_until_sample > SSIZE_MAX) { + bytes_until_sample = SSIZE_MAX; + } tsd_bytes_until_sample_set(tsd_fetch(), bytes_until_sample); #endif -- cgit v0.12 From 325e3305fc7563600a710341d1f98cb8e04caaba Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Wed, 3 Oct 2018 14:47:31 -0700 Subject: remove malloc_init() off the fastpath --- include/jemalloc/internal/tsd_malloc_thread_cleanup.h | 1 - include/jemalloc/internal/tsd_tls.h | 1 - src/jemalloc.c | 19 ++++++++++++++++--- src/tsd.c | 12 +++++++----- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h index beb467a..bf8801e 100644 --- a/include/jemalloc/internal/tsd_malloc_thread_cleanup.h +++ b/include/jemalloc/internal/tsd_malloc_thread_cleanup.h @@ -47,7 +47,6 @@ tsd_get_allocates(void) { /* Get/set. */ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_get(bool init) { - assert(tsd_booted); return &tsd_tls; } JEMALLOC_ALWAYS_INLINE void diff --git a/include/jemalloc/internal/tsd_tls.h b/include/jemalloc/internal/tsd_tls.h index 757aaa0..f4f165c 100644 --- a/include/jemalloc/internal/tsd_tls.h +++ b/include/jemalloc/internal/tsd_tls.h @@ -40,7 +40,6 @@ tsd_get_allocates(void) { /* Get/set. 
*/ JEMALLOC_ALWAYS_INLINE tsd_t * tsd_get(bool init) { - assert(tsd_booted); return &tsd_tls; } diff --git a/src/jemalloc.c b/src/jemalloc.c index 15c0609..237bfe7 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2110,9 +2110,8 @@ label_invalid_alignment: return EINVAL; } -/* Returns the errno-style error code of the allocation. */ -JEMALLOC_ALWAYS_INLINE int -imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { +JEMALLOC_ALWAYS_INLINE bool +imalloc_init_check(static_opts_t *sopts, dynamic_opts_t *dopts) { if (unlikely(!malloc_initialized()) && unlikely(malloc_init())) { if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(sopts->oom_string); @@ -2122,6 +2121,16 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { set_errno(ENOMEM); *dopts->result = NULL; + return false; + } + + return true; +} + +/* Returns the errno-style error code of the allocation. */ +JEMALLOC_ALWAYS_INLINE int +imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { + if (tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) { return ENOMEM; } @@ -2134,6 +2143,10 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { sopts->slow = false; return imalloc_body(sopts, dopts, tsd); } else { + if (!tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) { + return ENOMEM; + } + sopts->slow = true; return imalloc_body(sopts, dopts, tsd); } diff --git a/src/tsd.c b/src/tsd.c index 1204a0d..f317d48 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -280,11 +280,13 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) { tsd_slow_update(tsd); } else if (tsd_state_get(tsd) == tsd_state_uninitialized) { if (!minimal) { - tsd_state_set(tsd, tsd_state_nominal); - tsd_slow_update(tsd); - /* Trigger cleanup handler registration. */ - tsd_set(tsd); - tsd_data_init(tsd); + if (tsd_booted) { + tsd_state_set(tsd, tsd_state_nominal); + tsd_slow_update(tsd); + /* Trigger cleanup handler registration. */ + tsd_set(tsd); + tsd_data_init(tsd); + } } else { tsd_state_set(tsd, tsd_state_minimal_initialized); tsd_set(tsd); -- cgit v0.12 From 08260a6b944a67a3d9f63e7eb738718fc760e0ea Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Wed, 15 Nov 2017 18:26:49 +0100 Subject: Add experimental API: smallocx_return_t smallocx(size, flags) --- Motivation: This new experimental memory-allocaction API returns a pointer to the allocation as well as the usable size of the allocated memory region. The `s` in `smallocx` stands for `sized`-`mallocx`, attempting to convey that this API returns the size of the allocated memory region. It should allow C++ P0901r0 [0] and Rust Alloc::alloc_excess to make use of it. The main purpose of these APIs is to improve telemetry. It is more accurate to register `smallocx(size, flags)` than `smallocx(nallocx(size), flags)`, for example. The latter will always line up perfectly with the existing size classes, causing a loss of telemetry information about the internal fragmentation induced by potentially poor size-classes choices. Instrumenting `nallocx` does not help much since user code can cache its result and use it repeatedly. --- Implementation: The implementation adds a new `usize` option to `static_opts_s` and an `usize` variable to `dynamic_opts_s`. These are then used to cache the result of `sz_index2size` and similar functions in the code paths in which they are unconditionally invoked. In the code-paths in which these functions are not unconditionally invoked, `smallocx` calls, as opposed to `mallocx`, these functions explicitly. 
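A hedged caller-side sketch (assuming a build configured with --enable-experimental-smallocx and the default, unprefixed public symbols; the function name below is illustrative):

#include <jemalloc/jemalloc.h>

void
smallocx_example(void) {
	smallocx_return_t r = smallocx(100, 0);
	if (r.ptr == NULL) {
		return; /* Allocation failed. */
	}
	/* All r.size bytes are usable, not just the 100 requested. */
	((char *)r.ptr)[r.size - 1] = 0;
	/* Passing the reported size back to sdallocx() is valid. */
	sdallocx(r.ptr, r.size, 0);
}

Registering r.size directly gives telemetry the true usable size without the nallocx() round trip described above.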
--- [0]: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0901r0.html --- configure.ac | 19 +++++- .../jemalloc/internal/jemalloc_internal_defs.h.in | 3 + include/jemalloc/jemalloc_protos.h.in | 4 ++ include/jemalloc/jemalloc_typedefs.h.in | 7 ++ src/jemalloc.c | 77 +++++++++++++++++++++- 5 files changed, 108 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index cd5bdd6..018ee3f 100644 --- a/configure.ac +++ b/configure.ac @@ -850,7 +850,7 @@ AC_ARG_WITH([export], fi] ) -public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_message malloc_stats_print malloc_usable_size mallocx nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx" +public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_message malloc_stats_print malloc_usable_size mallocx smallocx nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx" dnl Check for additional platform-specific public API functions. AC_CHECK_FUNC([memalign], [AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN], [ ]) @@ -1043,6 +1043,22 @@ if test "x$enable_stats" = "x1" ; then fi AC_SUBST([enable_stats]) +dnl Do not enable smallocx by default. +AC_ARG_ENABLE([experimental_smallocx], + [AS_HELP_STRING([--enable-experimental-smallocx], [Enable experimental smallocx API])], +[if test "x$enable_experimental_smallocx" = "xno" ; then +enable_experimental_smallocx="0" +else +enable_experimental_smallocx="1" +fi +], +[enable_experimental_smallocx="0"] +) +if test "x$enable_experimental_smallocx" = "x1" ; then + AC_DEFINE([JEMALLOC_EXPERIMENTAL_SMALLOCX_API]) +fi +AC_SUBST([enable_experimental_smallocx]) + dnl Do not enable profiling by default. AC_ARG_ENABLE([prof], [AS_HELP_STRING([--enable-prof], [Enable allocation profiling])], @@ -2281,6 +2297,7 @@ AC_MSG_RESULT([malloc_conf : ${config_malloc_conf}]) AC_MSG_RESULT([autogen : ${enable_autogen}]) AC_MSG_RESULT([debug : ${enable_debug}]) AC_MSG_RESULT([stats : ${enable_stats}]) +AC_MSG_RESULT([experimetal_smallocx : ${enable_experimental_smallocx}]) AC_MSG_RESULT([prof : ${enable_prof}]) AC_MSG_RESULT([prof-libunwind : ${enable_prof_libunwind}]) AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}]) diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index cec41aa..c1eb8ed 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -153,6 +153,9 @@ /* JEMALLOC_STATS enables statistics calculation. */ #undef JEMALLOC_STATS +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +#undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API + /* JEMALLOC_PROF enables allocation profiling. 
*/ #undef JEMALLOC_PROF diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index a78414b..05fc056 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -28,6 +28,10 @@ JEMALLOC_EXPORT void JEMALLOC_NOTHROW @je_@free(void *ptr) JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *@je_@mallocx(size_t size, int flags) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + smallocx_return_t JEMALLOC_NOTHROW @je_@smallocx(size_t size, int flags); +#endif JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *@je_@rallocx(void *ptr, size_t size, int flags) JEMALLOC_ALLOC_SIZE(2); diff --git a/include/jemalloc/jemalloc_typedefs.h.in b/include/jemalloc/jemalloc_typedefs.h.in index 1a58874..fe0d7d1 100644 --- a/include/jemalloc/jemalloc_typedefs.h.in +++ b/include/jemalloc/jemalloc_typedefs.h.in @@ -75,3 +75,10 @@ struct extent_hooks_s { extent_split_t *split; extent_merge_t *merge; }; + +#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API +typedef struct { + void *ptr; + size_t size; +} smallocx_return_t; +#endif diff --git a/src/jemalloc.c b/src/jemalloc.c index 237bfe7..01e2db9 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1747,6 +1747,11 @@ struct static_opts_s { * initialization) options. */ bool slow; + /* + * Return size + * + */ + bool usize; }; JEMALLOC_ALWAYS_INLINE void @@ -1760,6 +1765,7 @@ static_opts_init(static_opts_t *static_opts) { static_opts->oom_string = ""; static_opts->invalid_alignment_string = ""; static_opts->slow = false; + static_opts->usize = false; } /* @@ -1774,6 +1780,7 @@ static_opts_init(static_opts_t *static_opts) { typedef struct dynamic_opts_s dynamic_opts_t; struct dynamic_opts_s { void **result; + size_t usize; size_t num_items; size_t item_size; size_t alignment; @@ -1785,6 +1792,7 @@ struct dynamic_opts_s { JEMALLOC_ALWAYS_INLINE void dynamic_opts_init(dynamic_opts_t *dynamic_opts) { dynamic_opts->result = NULL; + dynamic_opts->usize = 0; dynamic_opts->num_items = 0; dynamic_opts->item_size = 0; dynamic_opts->alignment = 0; @@ -1960,13 +1968,15 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { if (unlikely(ind >= SC_NSIZES)) { goto label_oom; } - if (config_stats || (config_prof && opt_prof)) { + if (config_stats || (config_prof && opt_prof) || sopts->usize) { usize = sz_index2size(ind); + dopts->usize = usize; assert(usize > 0 && usize <= SC_LARGE_MAXCLASS); } } else { usize = sz_sa2u(size, dopts->alignment); + dopts->usize = usize; if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { goto label_oom; @@ -2759,6 +2769,71 @@ int __posix_memalign(void** r, size_t a, size_t s) PREALIAS(je_posix_memalign); * Begin non-standard functions. */ +#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +smallocx_return_t JEMALLOC_NOTHROW +/* + * The attribute JEMALLOC_ATTR(malloc) cannot be used due to: + * - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86488 + */ + je_smallocx(size_t size, int flags) { + /* + * Note: the attribute JEMALLOC_ALLOC_SIZE(1) cannot be + * used here because it makes writing beyond the `size` + * of the `ptr` undefined behavior, but the objective + * of this function is to allow writing beyond `size` + * up to `smallocx_return_t::size`. 
+ */ + smallocx_return_t ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.smallocx.entry", "size: %zu, flags: %d", size, flags); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.assert_nonempty_alloc = true; + sopts.null_out_result_on_error = true; + sopts.oom_string = ": Error in mallocx(): out of memory\n"; + sopts.usize = true; + + dopts.result = &ret.ptr; + dopts.num_items = 1; + dopts.item_size = size; + if (unlikely(flags != 0)) { + if ((flags & MALLOCX_LG_ALIGN_MASK) != 0) { + dopts.alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags); + } + + dopts.zero = MALLOCX_ZERO_GET(flags); + + if ((flags & MALLOCX_TCACHE_MASK) != 0) { + if ((flags & MALLOCX_TCACHE_MASK) + == MALLOCX_TCACHE_NONE) { + dopts.tcache_ind = TCACHE_IND_NONE; + } else { + dopts.tcache_ind = MALLOCX_TCACHE_GET(flags); + } + } else { + dopts.tcache_ind = TCACHE_IND_AUTOMATIC; + } + + if ((flags & MALLOCX_ARENA_MASK) != 0) + dopts.arena_ind = MALLOCX_ARENA_GET(flags); + } + + + + imalloc(&sopts, &dopts); + assert(dopts.usize == je_nallocx(size, flags)); + ret.size = dopts.usize; + + LOG("core.smallocx.exit", "result: %p, size: %zu", ret.ptr, ret.size); + return ret; +} +#endif + JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) -- cgit v0.12 From 730e57b08fe5bd6bdc38ca4ff6a73959984d8ef0 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Wed, 11 Jul 2018 15:04:48 +0200 Subject: Adapts mallocx integration tests for smallocx --- Makefile.in | 5 + test/integration/smallocx.c | 292 +++++++++++++++++++++++++++++++++++++++++++ test/integration/smallocx.sh | 5 + 3 files changed, 302 insertions(+) create mode 100644 test/integration/smallocx.c create mode 100644 test/integration/smallocx.sh diff --git a/Makefile.in b/Makefile.in index c35bb7e..3d99a40 100644 --- a/Makefile.in +++ b/Makefile.in @@ -57,6 +57,7 @@ cfgoutputs_out := @cfgoutputs_out@ enable_autogen := @enable_autogen@ enable_prof := @enable_prof@ enable_zone_allocator := @enable_zone_allocator@ +enable_experimental_smallocx := @enable_experimental_smallocx@ MALLOC_CONF := @JEMALLOC_CPREFIX@MALLOC_CONF link_whole_archive := @link_whole_archive@ DSO_LDFLAGS = @DSO_LDFLAGS@ @@ -235,6 +236,10 @@ TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/thread_arena.c \ $(srcroot)test/integration/thread_tcache_enabled.c \ $(srcroot)test/integration/xallocx.c +ifeq (@enable_experimental_smallocx@, 1) +TESTS_INTEGRATION += \ + $(srcroot)test/integration/smallocx.c +endif ifeq (@enable_cxx@, 1) CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp diff --git a/test/integration/smallocx.c b/test/integration/smallocx.c new file mode 100644 index 0000000..376fec2 --- /dev/null +++ b/test/integration/smallocx.c @@ -0,0 +1,292 @@ +#include "test/jemalloc_test.h" + +static unsigned +get_nsizes_impl(const char *cmd) { + unsigned ret; + size_t z; + + z = sizeof(unsigned); + assert_d_eq(mallctl(cmd, (void *)&ret, &z, NULL, 0), 0, + "Unexpected mallctl(\"%s\", ...) failure", cmd); + + return ret; +} + +static unsigned +get_nlarge(void) { + return get_nsizes_impl("arenas.nlextents"); +} + +static size_t +get_size_impl(const char *cmd, size_t ind) { + size_t ret; + size_t z; + size_t mib[4]; + size_t miblen = 4; + + z = sizeof(size_t); + assert_d_eq(mallctlnametomib(cmd, mib, &miblen), + 0, "Unexpected mallctlnametomib(\"%s\", ...) 
failure", cmd); + mib[2] = ind; + z = sizeof(size_t); + assert_d_eq(mallctlbymib(mib, miblen, (void *)&ret, &z, NULL, 0), + 0, "Unexpected mallctlbymib([\"%s\", %zu], ...) failure", cmd, ind); + + return ret; +} + +static size_t +get_large_size(size_t ind) { + return get_size_impl("arenas.lextent.0.size", ind); +} + +/* + * On systems which can't merge extents, tests that call this function generate + * a lot of dirty memory very quickly. Purging between cycles mitigates + * potential OOM on e.g. 32-bit Windows. + */ +static void +purge(void) { + assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl error"); +} + +/* + * GCC "-Walloc-size-larger-than" warning detects when one of the memory + * allocation functions is called with a size larger than the maximum size that + * they support. Here we want to explicitly test that the allocation functions + * do indeed fail properly when this is the case, which triggers the warning. + * Therefore we disable the warning for these tests. + */ +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN + +TEST_BEGIN(test_overflow) { + size_t largemax; + + largemax = get_large_size(get_nlarge()-1); + + assert_ptr_null(smallocx(largemax+1, 0).ptr, + "Expected OOM for smallocx(size=%#zx, 0)", largemax+1); + + assert_ptr_null(smallocx(ZU(PTRDIFF_MAX)+1, 0).ptr, + "Expected OOM for smallocx(size=%#zx, 0)", ZU(PTRDIFF_MAX)+1); + + assert_ptr_null(smallocx(SIZE_T_MAX, 0).ptr, + "Expected OOM for smallocx(size=%#zx, 0)", SIZE_T_MAX); + + assert_ptr_null(smallocx(1, MALLOCX_ALIGN(ZU(PTRDIFF_MAX)+1)).ptr, + "Expected OOM for smallocx(size=1, MALLOCX_ALIGN(%#zx))", + ZU(PTRDIFF_MAX)+1); +} +TEST_END + +static void * +remote_alloc(void *arg) { + unsigned arena; + size_t sz = sizeof(unsigned); + assert_d_eq(mallctl("arenas.create", (void *)&arena, &sz, NULL, 0), 0, + "Unexpected mallctl() failure"); + size_t large_sz; + sz = sizeof(size_t); + assert_d_eq(mallctl("arenas.lextent.0.size", (void *)&large_sz, &sz, + NULL, 0), 0, "Unexpected mallctl failure"); + + smallocx_return_t r = smallocx(large_sz, MALLOCX_ARENA(arena) + | MALLOCX_TCACHE_NONE); + void *ptr = r.ptr; + assert_zu_eq(r.size, nallocx(large_sz, MALLOCX_ARENA(arena) + | MALLOCX_TCACHE_NONE), + "Expected smalloc(size,flags).size == nallocx(size,flags)"); + void **ret = (void **)arg; + *ret = ptr; + + return NULL; +} + +TEST_BEGIN(test_remote_free) { + thd_t thd; + void *ret; + thd_create(&thd, remote_alloc, (void *)&ret); + thd_join(thd, NULL); + assert_ptr_not_null(ret, "Unexpected smallocx failure"); + + /* Avoid TCACHE_NONE to explicitly test tcache_flush(). */ + dallocx(ret, 0); + mallctl("thread.tcache.flush", NULL, NULL, NULL, 0); +} +TEST_END + +TEST_BEGIN(test_oom) { + size_t largemax; + bool oom; + void *ptrs[3]; + unsigned i; + + /* + * It should be impossible to allocate three objects that each consume + * nearly half the virtual address space. 
+ */ + largemax = get_large_size(get_nlarge()-1); + oom = false; + for (i = 0; i < sizeof(ptrs) / sizeof(void *); i++) { + ptrs[i] = smallocx(largemax, 0).ptr; + if (ptrs[i] == NULL) { + oom = true; + } + } + assert_true(oom, + "Expected OOM during series of calls to smallocx(size=%zu, 0)", + largemax); + for (i = 0; i < sizeof(ptrs) / sizeof(void *); i++) { + if (ptrs[i] != NULL) { + dallocx(ptrs[i], 0); + } + } + purge(); + +#if LG_SIZEOF_PTR == 3 + assert_ptr_null(smallocx(0x8000000000000000ULL, + MALLOCX_ALIGN(0x8000000000000000ULL)).ptr, + "Expected OOM for smallocx()"); + assert_ptr_null(smallocx(0x8000000000000000ULL, + MALLOCX_ALIGN(0x80000000)).ptr, + "Expected OOM for smallocx()"); +#else + assert_ptr_null(smallocx(0x80000000UL, MALLOCX_ALIGN(0x80000000UL)).ptr, + "Expected OOM for smallocx()"); +#endif +} +TEST_END + +/* Re-enable the "-Walloc-size-larger-than=" warning */ +JEMALLOC_DIAGNOSTIC_POP + +TEST_BEGIN(test_basic) { +#define MAXSZ (((size_t)1) << 23) + size_t sz; + + for (sz = 1; sz < MAXSZ; sz = nallocx(sz, 0) + 1) { + smallocx_return_t ret; + size_t nsz, rsz, smz; + void *p; + nsz = nallocx(sz, 0); + assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); + ret = smallocx(sz, 0); + p = ret.ptr; + smz = ret.size; + assert_ptr_not_null(p, + "Unexpected smallocx(size=%zx, flags=0) error", sz); + rsz = sallocx(p, 0); + assert_zu_ge(rsz, sz, "Real size smaller than expected"); + assert_zu_eq(nsz, rsz, "nallocx()/sallocx() size mismatch"); + assert_zu_eq(nsz, smz, "nallocx()/smallocx() size mismatch"); + dallocx(p, 0); + + ret = smallocx(sz, 0); + p = ret.ptr; + smz = ret.size; + assert_ptr_not_null(p, + "Unexpected smallocx(size=%zx, flags=0) error", sz); + dallocx(p, 0); + + nsz = nallocx(sz, MALLOCX_ZERO); + assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); + assert_zu_ne(smz, 0, "Unexpected smallocx() error"); + ret = smallocx(sz, MALLOCX_ZERO); + p = ret.ptr; + assert_ptr_not_null(p, + "Unexpected smallocx(size=%zx, flags=MALLOCX_ZERO) error", + nsz); + rsz = sallocx(p, 0); + assert_zu_eq(nsz, rsz, "nallocx()/sallocx() rsize mismatch"); + assert_zu_eq(nsz, smz, "nallocx()/smallocx() size mismatch"); + dallocx(p, 0); + purge(); + } +#undef MAXSZ +} +TEST_END + +TEST_BEGIN(test_alignment_and_size) { + const char *percpu_arena; + size_t sz = sizeof(percpu_arena); + + if(mallctl("opt.percpu_arena", (void *)&percpu_arena, &sz, NULL, 0) || + strcmp(percpu_arena, "disabled") != 0) { + test_skip("test_alignment_and_size skipped: " + "not working with percpu arena."); + }; +#define MAXALIGN (((size_t)1) << 23) +#define NITER 4 + size_t nsz, rsz, smz, alignment, total; + unsigned i; + void *ps[NITER]; + + for (i = 0; i < NITER; i++) { + ps[i] = NULL; + } + + for (alignment = 8; + alignment <= MAXALIGN; + alignment <<= 1) { + total = 0; + for (sz = 1; + sz < 3 * alignment && sz < (1U << 31); + sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) { + for (i = 0; i < NITER; i++) { + nsz = nallocx(sz, MALLOCX_ALIGN(alignment) | + MALLOCX_ZERO); + assert_zu_ne(nsz, 0, + "nallocx() error for alignment=%zu, " + "size=%zu (%#zx)", alignment, sz, sz); + smallocx_return_t ret = smallocx(sz, MALLOCX_ALIGN(alignment) | + MALLOCX_ZERO); + ps[i] = ret.ptr; + assert_ptr_not_null(ps[i], + "smallocx() error for alignment=%zu, " + "size=%zu (%#zx)", alignment, sz, sz); + rsz = sallocx(ps[i], 0); + smz = ret.size; + assert_zu_ge(rsz, sz, + "Real size smaller than expected for " + "alignment=%zu, size=%zu", alignment, sz); + assert_zu_eq(nsz, rsz, + "nallocx()/sallocx() size mismatch for " + "alignment=%zu, 
size=%zu", alignment, sz); + assert_zu_eq(nsz, smz, + "nallocx()/smallocx() size mismatch for " + "alignment=%zu, size=%zu", alignment, sz); + assert_ptr_null( + (void *)((uintptr_t)ps[i] & (alignment-1)), + "%p inadequately aligned for" + " alignment=%zu, size=%zu", ps[i], + alignment, sz); + total += rsz; + if (total >= (MAXALIGN << 1)) { + break; + } + } + for (i = 0; i < NITER; i++) { + if (ps[i] != NULL) { + dallocx(ps[i], 0); + ps[i] = NULL; + } + } + } + purge(); + } +#undef MAXALIGN +#undef NITER +} +TEST_END + +int +main(void) { + return test( + test_overflow, + test_oom, + test_remote_free, + test_basic, + test_alignment_and_size); +} diff --git a/test/integration/smallocx.sh b/test/integration/smallocx.sh new file mode 100644 index 0000000..d07f10f --- /dev/null +++ b/test/integration/smallocx.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +if [ "x${enable_fill}" = "x1" ] ; then + export MALLOC_CONF="junk:false" +fi -- cgit v0.12 From 741fca1bb7773e14cf929824b94506eb9f545e5e Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Wed, 11 Jul 2018 21:39:44 +0200 Subject: Hide smallocx even when enabled from the library API The experimental `smallocx` API is not exposed via header files, requiring the users to peek at `jemalloc`'s source code to manually add the external declarations to their own programs. This should reinforce that `smallocx` is experimental, and that `jemalloc` does not offer any kind of backwards compatiblity or ABI gurantees for it. --- include/jemalloc/jemalloc_protos.h.in | 4 ---- include/jemalloc/jemalloc_typedefs.h.in | 7 ------- src/jemalloc.c | 5 +++++ test/integration/smallocx.c | 7 +++++++ 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/jemalloc/jemalloc_protos.h.in b/include/jemalloc/jemalloc_protos.h.in index 05fc056..a78414b 100644 --- a/include/jemalloc/jemalloc_protos.h.in +++ b/include/jemalloc/jemalloc_protos.h.in @@ -28,10 +28,6 @@ JEMALLOC_EXPORT void JEMALLOC_NOTHROW @je_@free(void *ptr) JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *@je_@mallocx(size_t size, int flags) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); -#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN - smallocx_return_t JEMALLOC_NOTHROW @je_@smallocx(size_t size, int flags); -#endif JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *@je_@rallocx(void *ptr, size_t size, int flags) JEMALLOC_ALLOC_SIZE(2); diff --git a/include/jemalloc/jemalloc_typedefs.h.in b/include/jemalloc/jemalloc_typedefs.h.in index fe0d7d1..1a58874 100644 --- a/include/jemalloc/jemalloc_typedefs.h.in +++ b/include/jemalloc/jemalloc_typedefs.h.in @@ -75,10 +75,3 @@ struct extent_hooks_s { extent_split_t *split; extent_merge_t *merge; }; - -#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API -typedef struct { - void *ptr; - size_t size; -} smallocx_return_t; -#endif diff --git a/src/jemalloc.c b/src/jemalloc.c index 01e2db9..57d9f15 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2770,6 +2770,11 @@ int __posix_memalign(void** r, size_t a, size_t s) PREALIAS(je_posix_memalign); */ #ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API +typedef struct { + void *ptr; + size_t size; +} smallocx_return_t; + JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN smallocx_return_t JEMALLOC_NOTHROW /* diff --git a/test/integration/smallocx.c b/test/integration/smallocx.c index 376fec2..f49ec84 100644 --- a/test/integration/smallocx.c +++ b/test/integration/smallocx.c @@ -1,5 +1,12 @@ #include 
"test/jemalloc_test.h" +typedef struct { + void *ptr; + size_t size; +} smallocx_return_t; + +extern smallocx_return_t smallocx(size_t size, int flags); + static unsigned get_nsizes_impl(const char *cmd) { unsigned ret; -- cgit v0.12 From 837de32496b1f20524c723516775a11bf236f891 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Wed, 11 Jul 2018 15:11:53 +0200 Subject: Test smallocx on Travis-CI This commit updates the gen_travis script with a new build bot that covers the experimental `smallocx` API and updates the travis CI script to test this API under travis. --- .travis.yml | 3 +++ scripts/gen_travis.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/.travis.yml b/.travis.yml index 07d3081..38e6655 100644 --- a/.travis.yml +++ b/.travis.yml @@ -122,6 +122,9 @@ matrix: # Development build - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + # --enable-expermental-smallocx: + - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --enable-experimental-smallocx --enable-stats --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" # Valgrind - os: linux diff --git a/scripts/gen_travis.py b/scripts/gen_travis.py index 743f1e5..e92660f 100755 --- a/scripts/gen_travis.py +++ b/scripts/gen_travis.py @@ -118,6 +118,13 @@ include_rows += '''\ env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" ''' +# Enable-expermental-smallocx +include_rows += '''\ + # --enable-expermental-smallocx: + - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --enable-experimental-smallocx --enable-stats --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds" +''' + # Valgrind build bots include_rows += ''' # Valgrind -- cgit v0.12 From 01e2a38e5a5523350496b11af46cf1d4c1d74e4c Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Fri, 5 Oct 2018 13:11:21 +0200 Subject: Make `smallocx` symbol name depend on the `JEMALLOC_VERSION_GID` This comments concatenates the `JEMALLOC_VERSION_GID` to the `smallocx` symbol name, such that the symbol ends up exported as `smallocx_{git_hash}`. --- configure.ac | 126 ++++++++++++++++++---------------- include/jemalloc/jemalloc_macros.h.in | 1 + src/jemalloc.c | 15 ++-- test/integration/smallocx.c | 55 +++++++++------ 4 files changed, 110 insertions(+), 87 deletions(-) diff --git a/configure.ac b/configure.ac index 018ee3f..e27ea91 100644 --- a/configure.ac +++ b/configure.ac @@ -538,6 +538,66 @@ AC_PROG_NM AC_PROG_AWK +dnl ============================================================================ +dnl jemalloc version. +dnl + +AC_ARG_WITH([version], + [AS_HELP_STRING([--with-version=..--g], + [Version string])], + [ + echo "${with_version}" | grep ['^[0-9]\+\.[0-9]\+\.[0-9]\+-[0-9]\+-g[0-9a-f]\+$'] 2>&1 1>/dev/null + if test $? -eq 0 ; then + echo "$with_version" > "${objroot}VERSION" + else + echo "${with_version}" | grep ['^VERSION$'] 2>&1 1>/dev/null + if test $? -ne 0 ; then + AC_MSG_ERROR([${with_version} does not match ..--g or VERSION]) + fi + fi + ], [ + dnl Set VERSION if source directory is inside a git repository. + if test "x`test ! 
\"${srcroot}\" && cd \"${srcroot}\"; git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then + dnl Pattern globs aren't powerful enough to match both single- and + dnl double-digit version numbers, so iterate over patterns to support up + dnl to version 99.99.99 without any accidental matches. + for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \ + '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \ + '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \ + '[0-9][0-9].[0-9][0-9].[0-9]' \ + '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do + (test ! "${srcroot}" && cd "${srcroot}"; git describe --long --abbrev=40 --match="${pattern}") > "${objroot}VERSION.tmp" 2>/dev/null + if test $? -eq 0 ; then + mv "${objroot}VERSION.tmp" "${objroot}VERSION" + break + fi + done + fi + rm -f "${objroot}VERSION.tmp" + ]) + +if test ! -e "${objroot}VERSION" ; then + if test ! -e "${srcroot}VERSION" ; then + AC_MSG_RESULT( + [Missing VERSION file, and unable to generate it; creating bogus VERSION]) + echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${objroot}VERSION" + else + cp ${srcroot}VERSION ${objroot}VERSION + fi +fi +jemalloc_version=`cat "${objroot}VERSION"` +jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'` +jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'` +jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'` +jemalloc_version_nrev=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]4}'` +jemalloc_version_gid=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]5}'` +AC_SUBST([jemalloc_version]) +AC_SUBST([jemalloc_version_major]) +AC_SUBST([jemalloc_version_minor]) +AC_SUBST([jemalloc_version_bugfix]) +AC_SUBST([jemalloc_version_nrev]) +AC_SUBST([jemalloc_version_gid]) + dnl Platform-specific settings. abi and RPATH can probably be determined dnl programmatically, but doing so is error-prone, which makes it generally dnl not worth the trouble. @@ -850,7 +910,7 @@ AC_ARG_WITH([export], fi] ) -public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_message malloc_stats_print malloc_usable_size mallocx smallocx nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx" +public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_message malloc_stats_print malloc_usable_size mallocx smallocx_${jemalloc_version_gid} nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx" dnl Check for additional platform-specific public API functions. AC_CHECK_FUNC([memalign], [AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN], [ ]) @@ -991,6 +1051,10 @@ cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.i cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in" cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in" +dnl ============================================================================ +dnl jemalloc build options. +dnl + dnl Do not compile with debugging by default. AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], @@ -1463,66 +1527,6 @@ fi AC_DEFINE_UNQUOTED([LG_HUGEPAGE], [${je_cv_lg_hugepage}]) dnl ============================================================================ -dnl jemalloc configuration. 
-dnl - -AC_ARG_WITH([version], - [AS_HELP_STRING([--with-version=..--g], - [Version string])], - [ - echo "${with_version}" | grep ['^[0-9]\+\.[0-9]\+\.[0-9]\+-[0-9]\+-g[0-9a-f]\+$'] 2>&1 1>/dev/null - if test $? -eq 0 ; then - echo "$with_version" > "${objroot}VERSION" - else - echo "${with_version}" | grep ['^VERSION$'] 2>&1 1>/dev/null - if test $? -ne 0 ; then - AC_MSG_ERROR([${with_version} does not match ..--g or VERSION]) - fi - fi - ], [ - dnl Set VERSION if source directory is inside a git repository. - if test "x`test ! \"${srcroot}\" && cd \"${srcroot}\"; git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then - dnl Pattern globs aren't powerful enough to match both single- and - dnl double-digit version numbers, so iterate over patterns to support up - dnl to version 99.99.99 without any accidental matches. - for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \ - '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \ - '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \ - '[0-9][0-9].[0-9][0-9].[0-9]' \ - '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do - (test ! "${srcroot}" && cd "${srcroot}"; git describe --long --abbrev=40 --match="${pattern}") > "${objroot}VERSION.tmp" 2>/dev/null - if test $? -eq 0 ; then - mv "${objroot}VERSION.tmp" "${objroot}VERSION" - break - fi - done - fi - rm -f "${objroot}VERSION.tmp" - ]) - -if test ! -e "${objroot}VERSION" ; then - if test ! -e "${srcroot}VERSION" ; then - AC_MSG_RESULT( - [Missing VERSION file, and unable to generate it; creating bogus VERSION]) - echo "0.0.0-0-g0000000000000000000000000000000000000000" > "${objroot}VERSION" - else - cp ${srcroot}VERSION ${objroot}VERSION - fi -fi -jemalloc_version=`cat "${objroot}VERSION"` -jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'` -jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'` -jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'` -jemalloc_version_nrev=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]4}'` -jemalloc_version_gid=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]5}'` -AC_SUBST([jemalloc_version]) -AC_SUBST([jemalloc_version_major]) -AC_SUBST([jemalloc_version_minor]) -AC_SUBST([jemalloc_version_bugfix]) -AC_SUBST([jemalloc_version_nrev]) -AC_SUBST([jemalloc_version_gid]) - -dnl ============================================================================ dnl Configure pthreads. if test "x$abi" != "xpecoff" ; then diff --git a/include/jemalloc/jemalloc_macros.h.in b/include/jemalloc/jemalloc_macros.h.in index aee5543..a00ce11 100644 --- a/include/jemalloc/jemalloc_macros.h.in +++ b/include/jemalloc/jemalloc_macros.h.in @@ -10,6 +10,7 @@ #define JEMALLOC_VERSION_BUGFIX @jemalloc_version_bugfix@ #define JEMALLOC_VERSION_NREV @jemalloc_version_nrev@ #define JEMALLOC_VERSION_GID "@jemalloc_version_gid@" +#define JEMALLOC_VERSION_GID_IDENT @jemalloc_version_gid@ #define MALLOCX_LG_ALIGN(la) ((int)(la)) #if LG_SIZEOF_PTR == 2 diff --git a/src/jemalloc.c b/src/jemalloc.c index 57d9f15..f1bec9a 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1748,8 +1748,7 @@ struct static_opts_s { */ bool slow; /* - * Return size - * + * Return size. 
*/ bool usize; }; @@ -2770,6 +2769,11 @@ int __posix_memalign(void** r, size_t a, size_t s) PREALIAS(je_posix_memalign); */ #ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API + +#define JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y) x ## y +#define JEMALLOC_SMALLOCX_CONCAT_HELPER2(x, y) \ + JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y) + typedef struct { void *ptr; size_t size; @@ -2781,7 +2785,8 @@ smallocx_return_t JEMALLOC_NOTHROW * The attribute JEMALLOC_ATTR(malloc) cannot be used due to: * - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86488 */ - je_smallocx(size_t size, int flags) { +JEMALLOC_SMALLOCX_CONCAT_HELPER2(je_smallocx_, JEMALLOC_VERSION_GID_IDENT) + (size_t size, int flags) { /* * Note: the attribute JEMALLOC_ALLOC_SIZE(1) cannot be * used here because it makes writing beyond the `size` @@ -2828,8 +2833,6 @@ smallocx_return_t JEMALLOC_NOTHROW dopts.arena_ind = MALLOCX_ARENA_GET(flags); } - - imalloc(&sopts, &dopts); assert(dopts.usize == je_nallocx(size, flags)); ret.size = dopts.usize; @@ -2837,6 +2840,8 @@ smallocx_return_t JEMALLOC_NOTHROW LOG("core.smallocx.exit", "result: %p, size: %zu", ret.ptr, ret.size); return ret; } +#undef JEMALLOC_SMALLOCX_CONCAT_HELPER +#undef JEMALLOC_SMALLOCX_CONCAT_HELPER2 #endif JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN diff --git a/test/integration/smallocx.c b/test/integration/smallocx.c index f49ec84..2486752 100644 --- a/test/integration/smallocx.c +++ b/test/integration/smallocx.c @@ -1,11 +1,24 @@ #include "test/jemalloc_test.h" +#include "jemalloc/jemalloc_macros.h" + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#ifndef JEMALLOC_VERSION_GID_IDENT + #error "JEMALLOC_VERSION_GID_IDENT not defined" +#endif + +#define JOIN(x, y) x ## y +#define JOIN2(x, y) JOIN(x, y) +#define smallocx JOIN2(smallocx_, JEMALLOC_VERSION_GID_IDENT) typedef struct { void *ptr; size_t size; } smallocx_return_t; -extern smallocx_return_t smallocx(size_t size, int flags); +extern smallocx_return_t +smallocx(size_t size, int flags); static unsigned get_nsizes_impl(const char *cmd) { @@ -99,12 +112,12 @@ remote_alloc(void *arg) { assert_d_eq(mallctl("arenas.lextent.0.size", (void *)&large_sz, &sz, NULL, 0), 0, "Unexpected mallctl failure"); - smallocx_return_t r = smallocx(large_sz, MALLOCX_ARENA(arena) - | MALLOCX_TCACHE_NONE); + smallocx_return_t r + = smallocx(large_sz, MALLOCX_ARENA(arena) | MALLOCX_TCACHE_NONE); void *ptr = r.ptr; - assert_zu_eq(r.size, nallocx(large_sz, MALLOCX_ARENA(arena) - | MALLOCX_TCACHE_NONE), - "Expected smalloc(size,flags).size == nallocx(size,flags)"); + assert_zu_eq(r.size, + nallocx(large_sz, MALLOCX_ARENA(arena) | MALLOCX_TCACHE_NONE), + "Expected smalloc(size,flags).size == nallocx(size,flags)"); void **ret = (void **)arg; *ret = ptr; @@ -174,40 +187,40 @@ TEST_BEGIN(test_basic) { size_t sz; for (sz = 1; sz < MAXSZ; sz = nallocx(sz, 0) + 1) { - smallocx_return_t ret; + smallocx_return_t ret; size_t nsz, rsz, smz; void *p; nsz = nallocx(sz, 0); assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); ret = smallocx(sz, 0); - p = ret.ptr; - smz = ret.size; + p = ret.ptr; + smz = ret.size; assert_ptr_not_null(p, "Unexpected smallocx(size=%zx, flags=0) error", sz); rsz = sallocx(p, 0); assert_zu_ge(rsz, sz, "Real size smaller than expected"); assert_zu_eq(nsz, rsz, "nallocx()/sallocx() size mismatch"); - assert_zu_eq(nsz, smz, "nallocx()/smallocx() size mismatch"); + assert_zu_eq(nsz, smz, "nallocx()/smallocx() size mismatch"); dallocx(p, 0); ret = smallocx(sz, 0); - p = ret.ptr; - smz = ret.size; + p = ret.ptr; + smz = ret.size; 
assert_ptr_not_null(p, "Unexpected smallocx(size=%zx, flags=0) error", sz); dallocx(p, 0); nsz = nallocx(sz, MALLOCX_ZERO); assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); - assert_zu_ne(smz, 0, "Unexpected smallocx() error"); - ret = smallocx(sz, MALLOCX_ZERO); + assert_zu_ne(smz, 0, "Unexpected smallocx() error"); + ret = smallocx(sz, MALLOCX_ZERO); p = ret.ptr; assert_ptr_not_null(p, "Unexpected smallocx(size=%zx, flags=MALLOCX_ZERO) error", nsz); rsz = sallocx(p, 0); assert_zu_eq(nsz, rsz, "nallocx()/sallocx() rsize mismatch"); - assert_zu_eq(nsz, smz, "nallocx()/smallocx() size mismatch"); + assert_zu_eq(nsz, smz, "nallocx()/smallocx() size mismatch"); dallocx(p, 0); purge(); } @@ -247,23 +260,23 @@ TEST_BEGIN(test_alignment_and_size) { assert_zu_ne(nsz, 0, "nallocx() error for alignment=%zu, " "size=%zu (%#zx)", alignment, sz, sz); - smallocx_return_t ret = smallocx(sz, MALLOCX_ALIGN(alignment) | - MALLOCX_ZERO); + smallocx_return_t ret + = smallocx(sz, MALLOCX_ALIGN(alignment) | MALLOCX_ZERO); ps[i] = ret.ptr; assert_ptr_not_null(ps[i], "smallocx() error for alignment=%zu, " "size=%zu (%#zx)", alignment, sz, sz); rsz = sallocx(ps[i], 0); - smz = ret.size; + smz = ret.size; assert_zu_ge(rsz, sz, "Real size smaller than expected for " "alignment=%zu, size=%zu", alignment, sz); assert_zu_eq(nsz, rsz, "nallocx()/sallocx() size mismatch for " "alignment=%zu, size=%zu", alignment, sz); - assert_zu_eq(nsz, smz, - "nallocx()/smallocx() size mismatch for " - "alignment=%zu, size=%zu", alignment, sz); + assert_zu_eq(nsz, smz, + "nallocx()/smallocx() size mismatch for " + "alignment=%zu, size=%zu", alignment, sz); assert_ptr_null( (void *)((uintptr_t)ps[i] & (alignment-1)), "%p inadequately aligned for" -- cgit v0.12 From 2b112ea5932d280288882d8bb38e7942b166fe5a Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 9 Oct 2018 08:41:36 -0700 Subject: add test for zero-sized alloc and aligned alloc --- Makefile.in | 1 + test/integration/aligned_alloc.c | 12 +++++++++++- test/integration/malloc.c | 16 ++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 test/integration/malloc.c diff --git a/Makefile.in b/Makefile.in index 3d99a40..c9bd95a 100644 --- a/Makefile.in +++ b/Makefile.in @@ -226,6 +226,7 @@ endif TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ $(srcroot)test/integration/allocated.c \ $(srcroot)test/integration/extent.c \ + $(srcroot)test/integration/malloc.c \ $(srcroot)test/integration/mallocx.c \ $(srcroot)test/integration/MALLOCX_ARENA.c \ $(srcroot)test/integration/overflow.c \ diff --git a/test/integration/aligned_alloc.c b/test/integration/aligned_alloc.c index cfe1df9..4375b17 100644 --- a/test/integration/aligned_alloc.c +++ b/test/integration/aligned_alloc.c @@ -138,10 +138,20 @@ TEST_BEGIN(test_alignment_and_size) { } TEST_END +TEST_BEGIN(test_zero_alloc) { + void *res = aligned_alloc(8, 0); + assert(res); + size_t usable = malloc_usable_size(res); + assert(usable > 0); + free(res); +} +TEST_END + int main(void) { return test( test_alignment_errors, test_oom_errors, - test_alignment_and_size); + test_alignment_and_size, + test_zero_alloc); } diff --git a/test/integration/malloc.c b/test/integration/malloc.c new file mode 100644 index 0000000..8b33bc8 --- /dev/null +++ b/test/integration/malloc.c @@ -0,0 +1,16 @@ +#include "test/jemalloc_test.h" + +TEST_BEGIN(test_zero_alloc) { + void *res = malloc(0); + assert(res); + size_t usable = malloc_usable_size(res); + assert(usable > 0); + free(res); +} +TEST_END + +int 
+main(void) { + return test( + test_zero_alloc); +} -- cgit v0.12 From 4edbb7c64c83aa2059ade469bc798dadf3da194c Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 8 Oct 2018 10:11:04 -0700 Subject: sz: Support 0 size in size2index lookup/compute --- include/jemalloc/internal/sz.h | 13 ++++++++++--- src/sz.c | 7 ++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h index 69625ee..68e558a 100644 --- a/include/jemalloc/internal/sz.h +++ b/include/jemalloc/internal/sz.h @@ -122,6 +122,10 @@ sz_size2index_compute(size_t size) { if (unlikely(size > SC_LARGE_MAXCLASS)) { return SC_NSIZES; } + + if (size == 0) { + return 0; + } #if (SC_NTINY != 0) if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) { szind_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1; @@ -150,14 +154,14 @@ sz_size2index_compute(size_t size) { JEMALLOC_ALWAYS_INLINE szind_t sz_size2index_lookup(size_t size) { assert(size <= SC_LOOKUP_MAXCLASS); - szind_t ret = (sz_size2index_tab[(size-1) >> SC_LG_TINY_MIN]); + szind_t ret = (sz_size2index_tab[(size + (ZU(1) << SC_LG_TINY_MIN) - 1) + >> SC_LG_TINY_MIN]); assert(ret == sz_size2index_compute(size)); return ret; } JEMALLOC_ALWAYS_INLINE szind_t sz_size2index(size_t size) { - assert(size > 0); if (likely(size <= SC_LOOKUP_MAXCLASS)) { return sz_size2index_lookup(size); } @@ -208,6 +212,10 @@ sz_s2u_compute(size_t size) { if (unlikely(size > SC_LARGE_MAXCLASS)) { return 0; } + + if (size == 0) { + size++; + } #if (SC_NTINY > 0) if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) { size_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1; @@ -241,7 +249,6 @@ sz_s2u_lookup(size_t size) { */ JEMALLOC_ALWAYS_INLINE size_t sz_s2u(size_t size) { - assert(size > 0); if (likely(size <= SC_LOOKUP_MAXCLASS)) { return sz_s2u_lookup(size); } diff --git a/src/sz.c b/src/sz.c index 77f89c6..8633fb0 100644 --- a/src/sz.c +++ b/src/sz.c @@ -37,18 +37,19 @@ sz_boot_index2size_tab(const sc_data_t *sc_data) { * the smallest interval for which the result can change. */ JEMALLOC_ALIGNED(CACHELINE) -uint8_t sz_size2index_tab[SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN]; +uint8_t sz_size2index_tab[(SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1]; static void sz_boot_size2index_tab(const sc_data_t *sc_data) { - size_t dst_max = (SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN); + size_t dst_max = (SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1; size_t dst_ind = 0; for (unsigned sc_ind = 0; sc_ind < SC_NSIZES && dst_ind < dst_max; sc_ind++) { const sc_t *sc = &sc_data->sc[sc_ind]; size_t sz = (ZU(1) << sc->lg_base) + (ZU(sc->ndelta) << sc->lg_delta); - size_t max_ind = ((sz - 1) >> SC_LG_TINY_MIN); + size_t max_ind = ((sz + (ZU(1) << SC_LG_TINY_MIN) - 1) + >> SC_LG_TINY_MIN); for (; dst_ind <= max_ind && dst_ind < dst_max; dst_ind++) { sz_size2index_tab[dst_ind] = sc_ind; } -- cgit v0.12 From ac34afb4037d7e9e87efde2b8e913d87aae131da Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 8 Oct 2018 10:13:02 -0700 Subject: drop bump_empty_alloc option. Size class lookup support used instead. 
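A quick worked example of the lookup change above, a sketch assuming SC_LG_TINY_MIN == 3 (an 8-byte smallest size class): the table gains one leading slot, and the rounding in sz_size2index_lookup() routes size 0 through the same path as every other small request:

    /* index = (size + (1 << SC_LG_TINY_MIN) - 1) >> SC_LG_TINY_MIN */
    /* size 0: (0 + 7) >> 3 == 0   slot 0, smallest class (8 bytes) */
    /* size 1: (1 + 7) >> 3 == 1   slot 1, same smallest class      */
    /* size 8: (8 + 7) >> 3 == 1   still the 8-byte class           */
    /* size 9: (9 + 7) >> 3 == 2   next class up                    */

Slots 0 and 1 both map to class 0, so a size-0 request yields a minimal allocation without the dedicated bump-to-1 branch that the diff below removes.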
--- include/jemalloc/internal/arena_inlines_b.h | 1 - include/jemalloc/internal/jemalloc_internal_inlines_c.h | 1 - src/jemalloc.c | 17 +---------------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 8bf0a81..3d0121d 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -134,7 +134,6 @@ JEMALLOC_ALWAYS_INLINE void * arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero, tcache_t *tcache, bool slow_path) { assert(!tsdn_null(tsdn) || tcache == NULL); - assert(size != 0); if (likely(tcache != NULL)) { if (likely(size <= SC_SMALL_MAXCLASS)) { diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 9c5fec6..cdb10eb 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -43,7 +43,6 @@ iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache, bool is_internal, arena_t *arena, bool slow_path) { void *ret; - assert(size != 0); assert(!is_internal || tcache == NULL); assert(!is_internal || arena == NULL || arena_is_auto(arena)); if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) { diff --git a/src/jemalloc.c b/src/jemalloc.c index f1bec9a..0636c83 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1713,8 +1713,7 @@ typedef struct static_opts_s static_opts_t; struct static_opts_s { /* Whether or not allocation size may overflow. */ bool may_overflow; - /* Whether or not allocations of size 0 should be treated as size 1. */ - bool bump_empty_alloc; + /* * Whether to assert that allocations are not of size 0 (after any * bumping). @@ -1756,7 +1755,6 @@ struct static_opts_s { JEMALLOC_ALWAYS_INLINE void static_opts_init(static_opts_t *static_opts) { static_opts->may_overflow = false; - static_opts->bump_empty_alloc = false; static_opts->assert_nonempty_alloc = false; static_opts->null_out_result_on_error = false; static_opts->set_errno_on_error = false; @@ -1945,12 +1943,6 @@ imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { } /* Validate the user input. 
*/ - if (sopts->bump_empty_alloc) { - if (unlikely(size == 0)) { - size = 1; - } - } - if (sopts->assert_nonempty_alloc) { assert (size != 0); } @@ -2178,7 +2170,6 @@ je_malloc(size_t size) { static_opts_init(&sopts); dynamic_opts_init(&dopts); - sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.set_errno_on_error = true; sopts.oom_string = ": Error in malloc(): out of memory\n"; @@ -2215,7 +2206,6 @@ je_posix_memalign(void **memptr, size_t alignment, size_t size) { static_opts_init(&sopts); dynamic_opts_init(&dopts); - sopts.bump_empty_alloc = true; sopts.min_alignment = sizeof(void *); sopts.oom_string = ": Error allocating aligned memory: out of memory\n"; @@ -2256,7 +2246,6 @@ je_aligned_alloc(size_t alignment, size_t size) { static_opts_init(&sopts); dynamic_opts_init(&dopts); - sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.set_errno_on_error = true; sopts.min_alignment = 1; @@ -2296,7 +2285,6 @@ je_calloc(size_t num, size_t size) { dynamic_opts_init(&dopts); sopts.may_overflow = true; - sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.set_errno_on_error = true; sopts.oom_string = ": Error in calloc(): out of memory\n"; @@ -2539,7 +2527,6 @@ je_realloc(void *ptr, size_t arg_size) { static_opts_init(&sopts); dynamic_opts_init(&dopts); - sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.set_errno_on_error = true; sopts.oom_string = @@ -2643,7 +2630,6 @@ je_memalign(size_t alignment, size_t size) { static_opts_init(&sopts); dynamic_opts_init(&dopts); - sopts.bump_empty_alloc = true; sopts.min_alignment = 1; sopts.oom_string = ": Error allocating aligned memory: out of memory\n"; @@ -2683,7 +2669,6 @@ je_valloc(size_t size) { static_opts_init(&sopts); dynamic_opts_init(&dopts); - sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.min_alignment = PAGE; sopts.oom_string = -- cgit v0.12 From 0ec656eb7117127602f295510de694083353f23e Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 16 Oct 2018 10:23:08 -0700 Subject: ticker: add ticker_trytick For the fastpath, we want to tick, but undo the tick and jump to the slowpath if ticker would fire. --- include/jemalloc/internal/ticker.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/jemalloc/internal/ticker.h b/include/jemalloc/internal/ticker.h index 4b36047..52d0db4 100644 --- a/include/jemalloc/internal/ticker.h +++ b/include/jemalloc/internal/ticker.h @@ -75,4 +75,17 @@ ticker_tick(ticker_t *ticker) { return ticker_ticks(ticker, 1); } +/* + * Try to tick. If ticker would fire, return true, but rely on + * slowpath to reset ticker. + */ +static inline bool +ticker_trytick(ticker_t *ticker) { + --ticker->tick; + if (unlikely(ticker->tick < 0)) { + return true; + } + return false; +} + #endif /* JEMALLOC_INTERNAL_TICKER_H */ -- cgit v0.12 From 0f8313659e93379d930995ea2d2af0a079cc422e Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Wed, 10 Oct 2018 11:54:58 -0700 Subject: malloc: Add a fastpath This diff adds a fastpath that assumes size <= SC_LOOKUP_MAXCLASS, and that we hit tcache. If either of these is false, we fall back to the previous codepath (renamed 'malloc_default'). Crucially, we only tail call malloc_default, and with the same kind and number of arguments, so that both clang and gcc tail-calling will kick in - therefore malloc() gets treated as a leaf function, and there are *no* caller-saved registers. 
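A hedged sketch of the tail-call shape described above, using hypothetical helper names (`fastpath_usable`, `tcache_alloc_fast`); the patch's real fastpath follows in the diff below:

    #include <stddef.h>

    void *malloc_default(size_t size);    /* slowpath, as in the patch */
    int fastpath_usable(size_t size);     /* hypothetical predicate */
    void *tcache_alloc_fast(size_t size); /* hypothetical tcache hit path */

    /* Each bail-out is a tail call to a function with the identical
     * signature, which the compiler can lower to a plain jmp; malloc()
     * itself then needs no caller-saved register spills. */
    void *
    malloc(size_t size) {
    	if (!fastpath_usable(size)) {
    		return malloc_default(size);	/* tail call */
    	}
    	void *ret = tcache_alloc_fast(size);
    	if (ret == NULL) {
    		return malloc_default(size);	/* tail call again */
    	}
    	return ret;
    }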
Previously malloc() contained 5 caller saved registers on x64, resulting in at least 10 extra memory-movement instructions. In microbenchmarks this results in up to ~10% improvement in malloc() fastpath. In real programs, this is a ~1% CPU and latency improvement overall. --- src/jemalloc.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 8 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 0636c83..f1f9e39 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2152,15 +2152,9 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { return imalloc_body(sopts, dopts, tsd); } } -/******************************************************************************/ -/* - * Begin malloc(3)-compatible functions. - */ -JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN -void JEMALLOC_NOTHROW * -JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) -je_malloc(size_t size) { +void * +malloc_default(size_t size) { void *ret; static_opts_t sopts; dynamic_opts_t dopts; @@ -2193,6 +2187,93 @@ je_malloc(size_t size) { return ret; } +/******************************************************************************/ +/* + * Begin malloc(3)-compatible functions. + */ + +/* + * malloc() fastpath. + * + * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit + * tcache. If either of these is false, we tail-call to the slowpath, + * malloc_default(). Tail-calling is used to avoid any caller-saved + * registers. + * + * fastpath supports ticker and profiling, both of which will also + * tail-call to the slowpath if they fire. + */ +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) +je_malloc(size_t size) { + LOG("core.malloc.entry", "size: %zu", size); + + if (tsd_get_allocates() && unlikely(!malloc_initialized())) { + return malloc_default(size); + } + + tsd_t *tsd = tsd_get(false); + if (unlikely(!tsd || !tsd_fast(tsd) || (size > SC_LOOKUP_MAXCLASS))) { + return malloc_default(size); + } + + tcache_t *tcache = tsd_tcachep_get(tsd); + + if (unlikely(ticker_trytick(&tcache->gc_ticker))) { + return malloc_default(size); + } + + szind_t ind = sz_size2index_lookup(size); + size_t usize; + if (config_stats || config_prof) { + usize = sz_index2size(ind); + } + /* Fast path relies on size being a bin. I.e. SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS */ + assert(ind < SC_NBINS); + assert(size <= SC_SMALL_MAXCLASS); + + if (config_prof) { + int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd); + bytes_until_sample -= usize; + tsd_bytes_until_sample_set(tsd, bytes_until_sample); + + if (unlikely(bytes_until_sample < 0)) { + /* + * Avoid a prof_active check on the fastpath. + * If prof_active is false, set bytes_until_sample to + * a large value. If prof_active is set to true, + * bytes_until_sample will be reset. 
+ */ + if (!prof_active) { + tsd_bytes_until_sample_set(tsd, SSIZE_MAX); + } + return malloc_default(size); + } + } + + cache_bin_t *bin = tcache_small_bin_get(tcache, ind); + bool tcache_success; + void* ret = cache_bin_alloc_easy(bin, &tcache_success); + + if (tcache_success) { + if (config_stats) { + *tsd_thread_allocatedp_get(tsd) += usize; + bin->tstats.nrequests++; + } + if (config_prof) { + tcache->prof_accumbytes += usize; + } + + LOG("core.malloc.exit", "result: %p", ret); + + /* Fastpath success */ + return ret; + } + + return malloc_default(size); +} + JEMALLOC_EXPORT int JEMALLOC_NOTHROW JEMALLOC_ATTR(nonnull(1)) je_posix_memalign(void **memptr, size_t alignment, size_t size) { -- cgit v0.12 From 936bc2aa15504076f884ed97a51e169924fe4a89 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 23 Oct 2018 08:12:46 -0700 Subject: prof: Fix memory regression The diff 'refactor prof accum...' moved the bytes_until_sample subtraction before the load of tdata. If tdata is null, tdata_get(true) will overwrite bytes_until_sample, but we still sample the current allocation. Instead, do the subtraction and check logic again, to keep the previous behavior. blame-rev: 0ac524308d3f636d1a4b5149fa7adf24cf426d9c --- include/jemalloc/internal/prof_inlines_b.h | 36 +++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/include/jemalloc/internal/prof_inlines_b.h b/include/jemalloc/internal/prof_inlines_b.h index 085111f..8358bff 100644 --- a/include/jemalloc/internal/prof_inlines_b.h +++ b/include/jemalloc/internal/prof_inlines_b.h @@ -79,15 +79,10 @@ prof_alloc_time_set(tsdn_t *tsdn, const void *ptr, alloc_ctx_t *alloc_ctx, } JEMALLOC_ALWAYS_INLINE bool -prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, - prof_tdata_t **tdata_out) { - prof_tdata_t *tdata; - int64_t bytes_until_sample; - - cassert(config_prof); +prof_sample_check(tsd_t *tsd, size_t usize, bool update) { ssize_t check = update ? 0 : usize; - bytes_until_sample = tsd_bytes_until_sample_get(tsd); + int64_t bytes_until_sample = tsd_bytes_until_sample_get(tsd); if (update) { bytes_until_sample -= usize; if (tsd_nominal(tsd)) { @@ -96,8 +91,24 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, } if (likely(bytes_until_sample >= check)) { return true; - } + } + + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, + prof_tdata_t **tdata_out) { + prof_tdata_t *tdata; + cassert(config_prof); + + /* Fastpath: no need to load tdata */ + if (likely(prof_sample_check(tsd, usize, update))) { + return true; + } + + bool booted = tsd_prof_tdata_get(tsd); tdata = prof_tdata_get(tsd, true); if (unlikely((uintptr_t)tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)) { tdata = NULL; @@ -111,6 +122,15 @@ prof_sample_accum_update(tsd_t *tsd, size_t usize, bool update, return true; } + /* + * If this was the first creation of tdata, then + * prof_tdata_get() reset bytes_until_sample, so decrement and + * check it again + */ + if (!booted && prof_sample_check(tsd, usize, update)) { + return true; + } + if (tsd_reentrancy_level_get(tsd) > 0) { return true; } -- cgit v0.12 From ceba1dde2774e4eae659a548263970cd9b74d319 Mon Sep 17 00:00:00 2001 From: Edward Tomasz Napierala Date: Sat, 6 Oct 2018 16:43:07 +0100 Subject: Make use of pthread_set_name_np(3) on FreeBSD. 
--- include/jemalloc/internal/jemalloc_internal_decls.h | 3 +++ src/background_thread.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h index be70df5..7d6053e 100644 --- a/include/jemalloc/internal/jemalloc_internal_decls.h +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -31,6 +31,9 @@ # include # endif # include +# ifdef __FreeBSD__ +# include +# endif # include # ifdef JEMALLOC_OS_UNFAIR_LOCK # include diff --git a/src/background_thread.c b/src/background_thread.c index feed856..24f6730 100644 --- a/src/background_thread.c +++ b/src/background_thread.c @@ -509,6 +509,8 @@ background_thread_entry(void *ind_arg) { assert(thread_ind < max_background_threads); #ifdef JEMALLOC_HAVE_PTHREAD_SETNAME_NP pthread_setname_np(pthread_self(), "jemalloc_bg_thd"); +#elif defined(__FreeBSD__) + pthread_set_name_np(pthread_self(), "jemalloc_bg_thd"); #endif if (opt_percpu_arena != percpu_arena_disabled) { set_current_thread_affinity((int)thread_ind); -- cgit v0.12 From be0749f59151ffecbdf7d9f82193350f018904dd Mon Sep 17 00:00:00 2001 From: Justin Hibbits Date: Tue, 23 Oct 2018 16:41:14 -0500 Subject: Restrict lwsync to powerpc64 only Nearly all 32-bit powerpc hardware treats lwsync as sync, and some cores (Freescale e500) trap lwsync as an illegal instruction, which then gets emulated in the kernel. To avoid unnecessary traps on the e500, use sync on all 32-bit powerpc. This pessimizes 32-bit software running on 64-bit hardware, but those numbers should be slim. --- include/jemalloc/internal/atomic_gcc_sync.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/atomic_gcc_sync.h b/include/jemalloc/internal/atomic_gcc_sync.h index 06a0acf..e02b7cb 100644 --- a/include/jemalloc/internal/atomic_gcc_sync.h +++ b/include/jemalloc/internal/atomic_gcc_sync.h @@ -27,8 +27,10 @@ atomic_fence(atomic_memory_order_t mo) { asm volatile("" ::: "memory"); # if defined(__i386__) || defined(__x86_64__) /* This is implicit on x86. */ -# elif defined(__ppc__) +# elif defined(__ppc64__) asm volatile("lwsync"); +# elif defined(__ppc__) + asm volatile("sync"); # elif defined(__sparc__) && defined(__arch64__) if (mo == atomic_memory_order_acquire) { asm volatile("membar #LoadLoad | #LoadStore"); -- cgit v0.12 From 50b473c8839f5408df179bdf6f2b3fd2cf5c3b2f Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 2 Nov 2018 14:01:45 -0700 Subject: Set commit properly for FreeBSD w/ overcommit. When overcommit is enabled, commit needs to be set when doing mmap(). The regression was introduced in f80c97e. --- src/pages.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pages.c b/src/pages.c index 479a89e..9f3085c 100644 --- a/src/pages.c +++ b/src/pages.c @@ -186,6 +186,10 @@ pages_map(void *addr, size_t size, size_t alignment, bool *commit) { * touching existing mappings, and to mmap with specific alignment. */ { + if (os_overcommits) { + *commit = true; + } + int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT; int flags = mmap_flags; -- cgit v0.12 From 8dabf81df1b7db0fd16903abab889dfd61b4c07f Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Wed, 31 Oct 2018 14:54:53 -0700 Subject: Bypass extent_dalloc when retain is enabled. When retain is enabled, the default dalloc hook does nothing (since we avoid munmap). But the overhead preparing the call is high, specifically the extent de-register and re-register involve locking and extent / rtree modifications. 
Bypass the call with retain in this diff. --- src/extent.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/extent.c b/src/extent.c index 847e4b9..b787b21 100644 --- a/src/extent.c +++ b/src/extent.c @@ -1696,6 +1696,12 @@ extent_dalloc_gap(tsdn_t *tsdn, arena_t *arena, extent_t *extent) { } static bool +extent_may_dalloc(void) { + /* With retain enabled, the default dalloc always fails. */ + return !opt_retain; +} + +static bool extent_dalloc_default_impl(void *addr, size_t size) { if (!have_dss || !extent_in_dss(addr)) { return extent_dalloc_mmap(addr, size); @@ -1750,16 +1756,20 @@ extent_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena, witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); - /* - * Deregister first to avoid a race with other allocating threads, and - * reregister if deallocation fails. - */ - extent_deregister(tsdn, extent); - if (!extent_dalloc_wrapper_try(tsdn, arena, r_extent_hooks, extent)) { - return; + /* Avoid calling the default extent_dalloc unless have to. */ + if (*r_extent_hooks != &extent_hooks_default || extent_may_dalloc()) { + /* + * Deregister first to avoid a race with other allocating + * threads, and reregister if deallocation fails. + */ + extent_deregister(tsdn, extent); + if (!extent_dalloc_wrapper_try(tsdn, arena, r_extent_hooks, + extent)) { + return; + } + extent_reregister(tsdn, extent); } - extent_reregister(tsdn, extent); if (*r_extent_hooks != &extent_hooks_default) { extent_hook_pre_reentrancy(tsdn, arena); } -- cgit v0.12 From d66f97662879a1a0c61ee12ba4b760fa6f458eef Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 23 Oct 2018 13:50:42 -0700 Subject: Optimize large deallocation. We eagerly coalesce large buffers when deallocating, however the previous logic around this introduced extra lock overhead -- when coalescing we always lock the neighbors even if they are active, while for active extents nothing can be done. This commit checks if the neighbor extents are potentially active before locking, and avoids locking if possible. This speeds up large_dalloc by ~20%. It also fixes some undesired behavior: we could stop coalescing because a small buffer was merged, while a large neighbor was ignored on the other side. --- src/extent.c | 58 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/src/extent.c b/src/extent.c index b787b21..ab71215 100644 --- a/src/extent.c +++ b/src/extent.c @@ -134,13 +134,16 @@ typedef enum { static lock_result_t extent_rtree_leaf_elm_try_lock(tsdn_t *tsdn, rtree_leaf_elm_t *elm, - extent_t **result) { + extent_t **result, bool inactive_only) { extent_t *extent1 = rtree_leaf_elm_extent_read(tsdn, &extents_rtree, elm, true); - if (extent1 == NULL) { + /* Slab implies active extents and should be skipped. */ + if (extent1 == NULL || (inactive_only && rtree_leaf_elm_slab_read(tsdn, + &extents_rtree, elm, true))) { return lock_result_no_extent; } + /* * It's possible that the extent changed out from under us, and with it * the leaf->extent mapping. We have to recheck while holding the lock. @@ -163,7 +166,8 @@ extent_rtree_leaf_elm_try_lock(tsdn_t *tsdn, rtree_leaf_elm_t *elm, * address, and NULL otherwise. 
*/ static extent_t * -extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr) { +extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr, + bool inactive_only) { extent_t *ret = NULL; rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)addr, false, false); @@ -172,7 +176,8 @@ extent_lock_from_addr(tsdn_t *tsdn, rtree_ctx_t *rtree_ctx, void *addr) { } lock_result_t lock_result; do { - lock_result = extent_rtree_leaf_elm_try_lock(tsdn, elm, &ret); + lock_result = extent_rtree_leaf_elm_try_lock(tsdn, elm, &ret, + inactive_only); } while (lock_result == lock_result_failure); return ret; } @@ -917,7 +922,8 @@ extent_recycle_extract(tsdn_t *tsdn, arena_t *arena, extent_hooks_assure_initialized(arena, r_extent_hooks); extent_t *extent; if (new_addr != NULL) { - extent = extent_lock_from_addr(tsdn, rtree_ctx, new_addr); + extent = extent_lock_from_addr(tsdn, rtree_ctx, new_addr, + false); if (extent != NULL) { /* * We might null-out extent to report an error, but we @@ -1088,8 +1094,8 @@ extent_recycle_split(tsdn_t *tsdn, arena_t *arena, extent_deregister_no_gdump_sub(tsdn, to_leak); extents_leak(tsdn, arena, r_extent_hooks, extents, to_leak, growing_retained); - assert(extent_lock_from_addr(tsdn, rtree_ctx, leak) - == NULL); + assert(extent_lock_from_addr(tsdn, rtree_ctx, leak, + false) == NULL); } return NULL; } @@ -1567,9 +1573,15 @@ extent_coalesce(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, } static extent_t * -extent_try_coalesce(tsdn_t *tsdn, arena_t *arena, +extent_try_coalesce_impl(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents, - extent_t *extent, bool *coalesced, bool growing_retained) { + extent_t *extent, bool *coalesced, bool growing_retained, + bool inactive_only) { + /* + * We avoid checking / locking inactive neighbors for large size + * classes, since they are eagerly coalesced on deallocation which can + * cause lock contention. + */ /* * Continue attempting to coalesce until failure, to protect against * races with other threads that are thwarted by this one. @@ -1580,7 +1592,7 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena, /* Try to coalesce forward. */ extent_t *next = extent_lock_from_addr(tsdn, rtree_ctx, - extent_past_get(extent)); + extent_past_get(extent), inactive_only); if (next != NULL) { /* * extents->mtx only protects against races for @@ -1606,7 +1618,7 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena, /* Try to coalesce backward. 
*/ extent_t *prev = extent_lock_from_addr(tsdn, rtree_ctx, - extent_before_get(extent)); + extent_before_get(extent), inactive_only); if (prev != NULL) { bool can_coalesce = extent_can_coalesce(arena, extents, extent, prev); @@ -1632,6 +1644,22 @@ extent_try_coalesce(tsdn_t *tsdn, arena_t *arena, return extent; } +static extent_t * +extent_try_coalesce(tsdn_t *tsdn, arena_t *arena, + extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents, + extent_t *extent, bool *coalesced, bool growing_retained) { + return extent_try_coalesce_impl(tsdn, arena, r_extent_hooks, rtree_ctx, + extents, extent, coalesced, growing_retained, false); +} + +static extent_t * +extent_try_coalesce_large(tsdn_t *tsdn, arena_t *arena, + extent_hooks_t **r_extent_hooks, rtree_ctx_t *rtree_ctx, extents_t *extents, + extent_t *extent, bool *coalesced, bool growing_retained) { + return extent_try_coalesce_impl(tsdn, arena, r_extent_hooks, rtree_ctx, + extents, extent, coalesced, growing_retained, true); +} + /* * Does the metadata management portions of putting an unused extent into the * given extents_t (coalesces, deregisters slab interiors, the heap operations). @@ -1664,16 +1692,12 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, } else if (extent_size_get(extent) >= SC_LARGE_MINCLASS) { /* Always coalesce large extents eagerly. */ bool coalesced; - size_t prev_size; do { - prev_size = extent_size_get(extent); assert(extent_state_get(extent) == extent_state_active); - extent = extent_try_coalesce(tsdn, arena, + extent = extent_try_coalesce_large(tsdn, arena, r_extent_hooks, rtree_ctx, extents, extent, &coalesced, growing_retained); - } while (coalesced && - extent_size_get(extent) - >= prev_size + SC_LARGE_MINCLASS); + } while (coalesced); } extent_deactivate_locked(tsdn, arena, extents, extent); -- cgit v0.12 From 7ee0b6cc37ecbecf8f53ba46326258275053ca50 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 8 Nov 2018 12:24:38 -0800 Subject: Properly trigger decay on tcache destroy. When destroying tcache, decay may not be triggered since tsd is non-nominal. Explicitly decay to avoid pathological cases. --- src/tcache.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/tcache.c b/src/tcache.c index 7346df8..bc9e435 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -497,6 +497,7 @@ tcache_flush(tsd_t *tsd) { static void tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) { tcache_flush_cache(tsd, tcache); + arena_t *arena = tcache->arena; tcache_arena_dissociate(tsd_tsdn(tsd), tcache); if (tsd_tcache) { @@ -509,6 +510,23 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) { /* Release both the tcache struct and avail array. */ idalloctm(tsd_tsdn(tsd), tcache, NULL, NULL, true, true); } + + /* + * The deallocation and tcache flush above may not trigger decay since + * we are on the tcache shutdown path (potentially with non-nominal + * tsd). Manually trigger decay to avoid pathological cases. Also + * include arena 0 because the tcache array is allocated from it. + */ + arena_decay(tsd_tsdn(tsd), arena_get(tsd_tsdn(tsd), 0, false), + false, false); + + unsigned nthreads = arena_nthreads_get(arena, false); + if (nthreads == 0) { + /* Force purging when no threads assigned to the arena anymore. */ + arena_decay(tsd_tsdn(tsd), arena, false, true); + } else { + arena_decay(tsd_tsdn(tsd), arena, false, false); + } } /* For auto tcache (embedded in TSD) only.
*/ -- cgit v0.12 From cd2931ad9bbd78208565716ab102e86d858c2fff Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 8 Nov 2018 16:20:48 -0800 Subject: Fix tcaches_flush. The regression was introduced in 3a1363b. --- src/tcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tcache.c b/src/tcache.c index bc9e435..7859da9 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -655,7 +655,7 @@ tcaches_flush(tsd_t *tsd, unsigned ind) { tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind]); malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); if (tcache != NULL) { - tcache_destroy(tsd, tcache, false); + tcache_flush_cache(tsd, tcache); } } -- cgit v0.12 From a4c6b9ae011628d012dd8eaab39fb60aa595b922 Mon Sep 17 00:00:00 2001 From: Edward Tomasz Napierala Date: Thu, 25 Oct 2018 16:06:42 +0100 Subject: Restore a FreeBSD-specific getpagesize(3) optimization. It was removed in 0771ff2cea6dc18fcd3f6bf452b4224a4e17ae38. Add a comment explaining its purpose. --- src/pages.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/pages.c b/src/pages.c index 9f3085c..13de27a 100644 --- a/src/pages.c +++ b/src/pages.c @@ -419,6 +419,12 @@ os_page_detect(void) { SYSTEM_INFO si; GetSystemInfo(&si); return si.dwPageSize; +#elif defined(__FreeBSD__) + /* + * This returns the value obtained from + * the auxv vector, avoiding a syscall. + */ + return getpagesize(); #else long result = sysconf(_SC_PAGESIZE); if (result == -1) { -- cgit v0.12 From 5e795297b33f25329a034fd898ee7d80c57b9a8f Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 18 Oct 2018 12:51:54 -0700 Subject: rtree: add rtree_szind_slab_read_fast For a free fastpath, we want something that will not make additional calls. Assume most free() calls will hit the L1 cache, and use a custom rtree function for this. Additionally, roll the ptr=NULL check in to the rtree cache check. --- include/jemalloc/internal/rtree.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index 8564965..16ccbeb 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -452,6 +452,42 @@ rtree_extent_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, return false; } +/* + * Try to read szind_slab from the L1 cache. Returns true on a hit, + * and fills in r_szind and r_slab. Otherwise returns false. + * + * Key is allowed to be NULL in order to save an extra branch on the + * fastpath. returns false in this case. 
+ */ +JEMALLOC_ALWAYS_INLINE bool +rtree_szind_slab_read_fast(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, szind_t *r_szind, bool *r_slab) { + rtree_leaf_elm_t *elm; + + size_t slot = rtree_cache_direct_map(key); + uintptr_t leafkey = rtree_leafkey(key); + assert(leafkey != RTREE_LEAFKEY_INVALID); + + if (likely(rtree_ctx->cache[slot].leafkey == leafkey)) { + rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf; + assert(leaf != NULL); + uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); + elm = &leaf[subkey]; + +#ifdef RTREE_LEAF_COMPACT + uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, + elm, true); + *r_szind = rtree_leaf_elm_bits_szind_get(bits); + *r_slab = rtree_leaf_elm_bits_slab_get(bits); +#else + *r_szind = rtree_leaf_elm_szind_read(tsdn, rtree, elm, true); + *r_slab = rtree_leaf_elm_slab_read(tsdn, rtree, elm, true); +#endif + return true; + } else { + return false; + } +} JEMALLOC_ALWAYS_INLINE bool rtree_szind_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, szind_t *r_szind, bool *r_slab) { -- cgit v0.12 From e2ab215324d7d19e37f4be87beb7a179528a300f Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 18 Oct 2018 13:13:57 -0700 Subject: refactor tcache_dalloc_small Add a cache_bin_dalloc_easy (to match the alloc_easy function), and use it in tcache_dalloc_small. It will also be used in the new free fastpath. --- include/jemalloc/internal/cache_bin.h | 16 ++++++++++++++-- include/jemalloc/internal/tcache_inlines.h | 7 +++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h index 40d942e..d14556a 100644 --- a/include/jemalloc/internal/cache_bin.h +++ b/include/jemalloc/internal/cache_bin.h @@ -90,7 +90,7 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) { bin->ncached--; - /* + /* * Check for both bin->ncached == 0 and ncached < low_water * in a single branch. */ @@ -102,7 +102,7 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) { return NULL; } } - + /* * success (instead of ret) should be checked upon the return of this * function. 
We avoid checking (ret == NULL) because there is never a @@ -116,4 +116,16 @@ cache_bin_alloc_easy(cache_bin_t *bin, bool *success) { return ret; } +JEMALLOC_ALWAYS_INLINE bool +cache_bin_dalloc_easy(cache_bin_t *bin, cache_bin_info_t *bin_info, void *ptr) { + if (unlikely(bin->ncached == bin_info->ncached_max)) { + return false; + } + assert(bin->ncached < bin_info->ncached_max); + bin->ncached++; + *(bin->avail - bin->ncached) = ptr; + + return true; +} + #endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */ diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h index 7c95646..c2c3ac3 100644 --- a/include/jemalloc/internal/tcache_inlines.h +++ b/include/jemalloc/internal/tcache_inlines.h @@ -175,13 +175,12 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, bin = tcache_small_bin_get(tcache, binind); bin_info = &tcache_bin_info[binind]; - if (unlikely(bin->ncached == bin_info->ncached_max)) { + if (unlikely(!cache_bin_dalloc_easy(bin, bin_info, ptr))) { tcache_bin_flush_small(tsd, tcache, bin, binind, (bin_info->ncached_max >> 1)); + bool ret = cache_bin_dalloc_easy(bin, bin_info, ptr); + assert(ret); } - assert(bin->ncached < bin_info->ncached_max); - bin->ncached++; - *(bin->avail - bin->ncached) = ptr; tcache_event(tsd, tcache); } -- cgit v0.12 From 794e29c0abbd77624d1e5599313ebd77bdc17ccc Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 18 Oct 2018 13:14:04 -0700 Subject: Add a free() and sdallocx (where flags=0) fastpath Add unsized and sized deallocation fastpaths. Similar to the malloc() fastpath, this removes all frame manipulation for the majority of free() calls. The performance advantages here are less than that of the malloc() fastpath, but prod tests still show half a percent or so of improvement. Stats and sampling are both supported (sdallocx needs a sampling check; for rtree lookups, slab will only be set for unsampled objects). We don't support flush; any flush requests go to the slowpath. --- src/jemalloc.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 86 insertions(+), 11 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index f1f9e39..68a21f9 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -2147,12 +2147,13 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { if (!tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) { return ENOMEM; } - + sopts->slow = true; return imalloc_body(sopts, dopts, tsd); } } +JEMALLOC_NOINLINE void * malloc_default(size_t size) { void *ret; @@ -2239,7 +2240,7 @@ je_malloc(size_t size) { tsd_bytes_until_sample_set(tsd, bytes_until_sample); if (unlikely(bytes_until_sample < 0)) { - /* + /* * Avoid a prof_active check on the fastpath. * If prof_active is false, set bytes_until_sample to * a large value.
If prof_active is set to true, @@ -2650,10 +2651,9 @@ je_realloc(void *ptr, size_t arg_size) { return ret; } -JEMALLOC_EXPORT void JEMALLOC_NOTHROW -je_free(void *ptr) { - LOG("core.free.entry", "ptr: %p", ptr); - +JEMALLOC_NOINLINE +void +free_default(void *ptr) { UTRACE(ptr, 0, 0); if (likely(ptr != NULL)) { /* @@ -2685,6 +2685,73 @@ je_free(void *ptr) { } check_entry_exit_locking(tsd_tsdn(tsd)); } +} + +JEMALLOC_ALWAYS_INLINE +bool free_fastpath(void *ptr, size_t size, bool size_hint) { + tsd_t *tsd = tsd_get(false); + if (unlikely(!tsd || !tsd_fast(tsd))) { + return false; + } + + tcache_t *tcache = tsd_tcachep_get(tsd); + + alloc_ctx_t alloc_ctx; + /* + * If !config_cache_oblivious, we can check PAGE alignment to + * detect sampled objects. Otherwise addresses are + * randomized, and we have to look it up in the rtree anyway. + * See also isfree(). + */ + if (!size_hint || config_cache_oblivious) { + rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); + bool res = rtree_szind_slab_read_fast(tsd_tsdn(tsd), &extents_rtree, + rtree_ctx, (uintptr_t)ptr, + &alloc_ctx.szind, &alloc_ctx.slab); + assert(alloc_ctx.szind != SC_NSIZES); + + /* Note: profiled objects will have alloc_ctx.slab set */ + if (!res || !alloc_ctx.slab) { + return false; + } + } else { + /* + * Check for both sizes that are too large, and for sampled objects. + * Sampled objects are always page-aligned. The sampled object check + * will also check for null ptr. + */ + if (size > SC_LOOKUP_MAXCLASS || (((uintptr_t)ptr & PAGE_MASK) == 0)) { + return false; + } + alloc_ctx.szind = sz_size2index_lookup(size); + } + + if (unlikely(ticker_trytick(&tcache->gc_ticker))) { + return false; + } + + cache_bin_t *bin = tcache_small_bin_get(tcache, alloc_ctx.szind); + cache_bin_info_t *bin_info = &tcache_bin_info[alloc_ctx.szind]; + if (!cache_bin_dalloc_easy(bin, bin_info, ptr)) { + return false; + } + + if (config_stats) { + size_t usize = sz_index2size(alloc_ctx.szind); + *tsd_thread_deallocatedp_get(tsd) += usize; + } + + return true; +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_free(void *ptr) { + LOG("core.free.entry", "ptr: %p", ptr); + + if (!free_fastpath(ptr, 0, false)) { + free_default(ptr); + } + LOG("core.free.exit", ""); } @@ -3362,14 +3429,11 @@ inallocx(tsdn_t *tsdn, size_t size, int flags) { return usize; } -JEMALLOC_EXPORT void JEMALLOC_NOTHROW -je_sdallocx(void *ptr, size_t size, int flags) { +JEMALLOC_NOINLINE void +sdallocx_default(void *ptr, size_t size, int flags) { assert(ptr != NULL); assert(malloc_initialized() || IS_INITIALIZER); - LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, - size, flags); - tsd_t *tsd = tsd_fetch(); bool fast = tsd_fast(tsd); size_t usize = inallocx(tsd_tsdn(tsd), size, flags); @@ -3409,6 +3473,17 @@ je_sdallocx(void *ptr, size_t size, int flags) { } check_entry_exit_locking(tsd_tsdn(tsd)); +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_sdallocx(void *ptr, size_t size, int flags) { + LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, + size, flags); + + if (flags !=0 || !free_fastpath(ptr, size, true)) { + sdallocx_default(ptr, size, flags); + } + LOG("core.sdallocx.exit", ""); } -- cgit v0.12 From 1f561157042a779be12a2159a385de0416133f6b Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 9 Nov 2018 14:45:06 -0800 Subject: Fix tcache_flush (follow up cd2931a). Also catch invalid tcache id. 
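As a rough standalone C sketch (hypothetical names, not jemalloc's actual types), the slot lifecycle now works like this: NULL means the id was never allocated, while a sentinel marks a slot that was flushed and should be recreated lazily on the next lookup:

#include <stdint.h>
#include <stdlib.h>

typedef struct cache_s { int filled; } cache_t;

/* Sentinel: flushed (destroyed) but the id stays reserved. */
#define SLOT_NEED_REINIT ((cache_t *)(uintptr_t)1)

static cache_t *slots[8];

static cache_t *
slot_get(unsigned ind) {
	if (slots[ind] == NULL) {
		abort();	/* Invalid id: never allocated. */
	}
	if (slots[ind] == SLOT_NEED_REINIT) {
		/* Lazily recreate after an earlier flush. */
		slots[ind] = calloc(1, sizeof(cache_t));
	}
	return slots[ind];
}

static void
slot_flush(unsigned ind) {
	cache_t *c = slots[ind];
	slots[ind] = SLOT_NEED_REINIT;
	if (c != NULL && c != SLOT_NEED_REINIT) {
		free(c);	/* Stands in for tcache_destroy(). */
	}
}

The sentinel keeps flush simple to reason about: the expensive destroy happens immediately, and re-creation is deferred until the cache is actually used again.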
--- include/jemalloc/internal/tcache_inlines.h | 3 +++ include/jemalloc/internal/tcache_types.h | 3 +++ src/tcache.c | 19 ++++++++++++++----- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h index c2c3ac3..5eca20e 100644 --- a/include/jemalloc/internal/tcache_inlines.h +++ b/include/jemalloc/internal/tcache_inlines.h @@ -216,6 +216,9 @@ JEMALLOC_ALWAYS_INLINE tcache_t * tcaches_get(tsd_t *tsd, unsigned ind) { tcaches_t *elm = &tcaches[ind]; if (unlikely(elm->tcache == NULL)) { + malloc_printf("<jemalloc>: invalid tcache id (%u).\n", ind); + abort(); + } else if (unlikely(elm->tcache == TCACHES_ELM_NEED_REINIT)) { elm->tcache = tcache_create_explicit(tsd); } return elm->tcache; diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h index f953b8c..dce6938 100644 --- a/include/jemalloc/internal/tcache_types.h +++ b/include/jemalloc/internal/tcache_types.h @@ -53,4 +53,7 @@ typedef struct tcaches_s tcaches_t; /* Used in TSD static initializer only. Will be initialized to opt_tcache. */ #define TCACHE_ENABLED_ZERO_INITIALIZER false +/* Used for explicit tcache only. Means flushed but not destroyed. */ +#define TCACHES_ELM_NEED_REINIT ((tcache_t *)(uintptr_t)1) + #endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */ diff --git a/src/tcache.c b/src/tcache.c index 7859da9..ee632f6 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -638,24 +638,33 @@ label_return: } static tcache_t * -tcaches_elm_remove(tsd_t *tsd, tcaches_t *elm) { +tcaches_elm_remove(tsd_t *tsd, tcaches_t *elm, bool allow_reinit) { malloc_mutex_assert_owner(tsd_tsdn(tsd), &tcaches_mtx); if (elm->tcache == NULL) { return NULL; } tcache_t *tcache = elm->tcache; - elm->tcache = NULL; + if (allow_reinit) { + elm->tcache = TCACHES_ELM_NEED_REINIT; + } else { + elm->tcache = NULL; + } + + if (tcache == TCACHES_ELM_NEED_REINIT) { + return NULL; + } return tcache; } void tcaches_flush(tsd_t *tsd, unsigned ind) { malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx); - tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind]); + tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind], true); malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); if (tcache != NULL) { - tcache_flush_cache(tsd, tcache); + /* Destroy the tcache; recreate in tcaches_get() if needed. */ + tcache_destroy(tsd, tcache, false); } } @@ -663,7 +672,7 @@ void tcaches_destroy(tsd_t *tsd, unsigned ind) { malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx); tcaches_t *elm = &tcaches[ind]; - tcache_t *tcache = tcaches_elm_remove(tsd, elm); + tcache_t *tcache = tcaches_elm_remove(tsd, elm, false); elm->next = tcaches_avail; tcaches_avail = elm; malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); -- cgit v0.12 From 57553c3b1a5592dc4c03f3c6831d9b794e523865 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 12 Nov 2018 11:15:03 -0800 Subject: Avoid touching all pages in extent_recycle for debug build. We may have a large number of pages with *zero set (since they are populated on demand). Only check the first page to avoid paging in all of them.
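As a minimal standalone illustration of the first-page-only check (PAGE_SZ and the helper are hypothetical stand-ins, not jemalloc's PAGE machinery):

#include <assert.h>
#include <stddef.h>

#define PAGE_SZ 4096

/*
 * Assert that a range believed to be zeroed really starts zeroed,
 * touching at most one page so the debug check does not fault in
 * the entire (possibly demand-zeroed) mapping.
 */
static void
assert_zeroed_first_page(const void *addr, size_t size) {
	const size_t *p = (const size_t *)addr;
	size_t nwords = (size < PAGE_SZ ? size : PAGE_SZ) / sizeof(size_t);
	for (size_t i = 0; i < nwords; i++) {
		assert(p[i] == 0);
	}
}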
--- src/extent.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/extent.c b/src/extent.c index ab71215..9605dac 100644 --- a/src/extent.c +++ b/src/extent.c @@ -1162,14 +1162,15 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, if (*zero) { void *addr = extent_base_get(extent); - size_t size = extent_size_get(extent); if (!extent_zeroed_get(extent)) { + size_t size = extent_size_get(extent); if (pages_purge_forced(addr, size)) { memset(addr, 0, size); } } else if (config_debug) { size_t *p = (size_t *)(uintptr_t)addr; - for (size_t i = 0; i < size / sizeof(size_t); i++) { + /* Check the first page only. */ + for (size_t i = 0; i < PAGE / sizeof(size_t); i++) { assert(p[i] == 0); } } -- cgit v0.12 From 4b82872ebf5e8b701e8b37c6d1297ceb88405df8 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 18 Oct 2018 09:49:45 -0700 Subject: arena: Refactor tcache_fill to batch fill from slab Refactor tcache_fill, introducing a new function arena_slab_reg_alloc_batch, which will fill multiple pointers from a slab. There should be no functional changes here, but allows future optimization on reg_alloc_batch. --- src/arena.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/src/arena.c b/src/arena.c index 29f447b..fc2a7df 100644 --- a/src/arena.c +++ b/src/arena.c @@ -268,6 +268,27 @@ arena_slab_reg_alloc(extent_t *slab, const bin_info_t *bin_info) { return ret; } +static void +arena_slab_reg_alloc_batch(extent_t *slab, const bin_info_t *bin_info, + unsigned cnt, void** ptrs) { + arena_slab_data_t *slab_data = extent_slab_data_get(slab); + + assert(extent_nfree_get(slab) > 0); + assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info)); + + size_t regind = 0; + for (unsigned i = 0; i < cnt; i++) { + void *ret; + + regind = bitmap_sfu(slab_data->bitmap, &bin_info->bitmap_info); + ret = (void *)((uintptr_t)extent_addr_get(slab) + + (uintptr_t)(bin_info->reg_size * regind)); + extent_nfree_dec(slab); + + *(ptrs + i) = ret; + } +} + #ifndef JEMALLOC_JET static #endif @@ -1286,7 +1307,7 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin, void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) { - unsigned i, nfill; + unsigned i, nfill, cnt; bin_t *bin; assert(tbin->ncached == 0); @@ -1297,32 +1318,43 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, bin = &arena->bins[binind]; malloc_mutex_lock(tsdn, &bin->lock); for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> - tcache->lg_fill_div[binind]); i < nfill; i++) { + tcache->lg_fill_div[binind]); i < nfill; i += cnt) { extent_t *slab; - void *ptr; if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) > 0) { - ptr = arena_slab_reg_alloc(slab, &bin_infos[binind]); + unsigned tofill = nfill - i; + cnt = tofill < extent_nfree_get(slab) ? + tofill : extent_nfree_get(slab); + arena_slab_reg_alloc_batch( + slab, &bin_infos[binind], cnt, + tbin->avail - nfill + i); } else { - ptr = arena_bin_malloc_hard(tsdn, arena, bin, binind); - } - if (ptr == NULL) { + cnt = 1; + void *ptr = arena_bin_malloc_hard(tsdn, arena, bin, + binind); /* * OOM. tbin->avail isn't yet filled down to its first * element, so the successful allocations (if any) must * be moved just before tbin->avail before bailing out. 
*/ - if (i > 0) { - memmove(tbin->avail - i, tbin->avail - nfill, - i * sizeof(void *)); + if (ptr == NULL) { + if (i > 0) { + memmove(tbin->avail - i, + tbin->avail - nfill, + i * sizeof(void *)); + } + break; } - break; + /* Insert such that low regions get used first. */ + *(tbin->avail - nfill + i) = ptr; } if (config_fill && unlikely(opt_junk_alloc)) { - arena_alloc_junk_small(ptr, &bin_infos[binind], true); + for (unsigned j = 0; j < cnt; j++) { + void* ptr = *(tbin->avail - nfill + i + j); + arena_alloc_junk_small(ptr, &bin_infos[binind], + true); + } } - /* Insert such that low regions get used first. */ - *(tbin->avail - nfill + i) = ptr; } if (config_stats) { bin->stats.nmalloc += i; -- cgit v0.12 From 17aa470760cefb3057be746f7022196035f0cfbe Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 29 Oct 2018 15:09:21 -0700 Subject: add extent_nfree_sub --- include/jemalloc/internal/extent_inlines.h | 6 ++++++ src/arena.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/jemalloc/internal/extent_inlines.h b/include/jemalloc/internal/extent_inlines.h index 145fa2d..c931fd5 100644 --- a/include/jemalloc/internal/extent_inlines.h +++ b/include/jemalloc/internal/extent_inlines.h @@ -265,6 +265,12 @@ extent_nfree_dec(extent_t *extent) { } static inline void +extent_nfree_sub(extent_t *extent, uint64_t n) { + assert(extent_slab_get(extent)); + extent->e_bits -= (n << EXTENT_BITS_NFREE_SHIFT); +} + +static inline void extent_sn_set(extent_t *extent, size_t sn) { extent->e_bits = (extent->e_bits & ~EXTENT_BITS_SN_MASK) | ((uint64_t)sn << EXTENT_BITS_SN_SHIFT); diff --git a/src/arena.c b/src/arena.c index fc2a7df..841f295 100644 --- a/src/arena.c +++ b/src/arena.c @@ -283,10 +283,10 @@ arena_slab_reg_alloc_batch(extent_t *slab, const bin_info_t *bin_info, regind = bitmap_sfu(slab_data->bitmap, &bin_info->bitmap_info); ret = (void *)((uintptr_t)extent_addr_get(slab) + (uintptr_t)(bin_info->reg_size * regind)); - extent_nfree_dec(slab); *(ptrs + i) = ret; } + extent_nfree_sub(slab, cnt); } #ifndef JEMALLOC_JET -- cgit v0.12 From 13c237c7ef5baa63c820539e0cfef4c4c5c74ea2 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 29 Oct 2018 16:01:09 -0700 Subject: Add a fastpath for arena_slab_reg_alloc_batch Also adds a configure.ac check for __builtin_popcount, which is used in the new fastpath. --- configure.ac | 17 +++++++- include/jemalloc/internal/bit_util.h | 19 +++++++++ .../jemalloc/internal/jemalloc_internal_defs.h.in | 6 +++ src/arena.c | 45 +++++++++++++++++----- 4 files changed, 77 insertions(+), 10 deletions(-) diff --git a/configure.ac b/configure.ac index e27ea91..5cfe9af 100644 --- a/configure.ac +++ b/configure.ac @@ -1429,6 +1429,21 @@ else fi fi +JE_COMPILABLE([a program using __builtin_popcountl], [ +#include +#include +#include +], [ + { + int rv = __builtin_popcountl(0x08); + printf("%d\n", rv); + } +], [je_cv_gcc_builtin_popcountl]) +if test "x${je_cv_gcc_builtin_popcountl}" = "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNT], [__builtin_popcount]) + AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTL], [__builtin_popcountl]) +fi + AC_ARG_WITH([lg_quantum], [AS_HELP_STRING([--with-lg-quantum=], [Base 2 log of minimum allocation alignment])], @@ -1901,7 +1916,7 @@ if test "x${je_cv_madvise}" = "xyes" ; then if test "x${je_cv_madv_dontdump}" = "xyes" ; then AC_DEFINE([JEMALLOC_MADVISE_DONTDUMP], [ ]) fi - + dnl Check for madvise(..., MADV_[NO]HUGEPAGE). 
JE_COMPILABLE([madvise(..., MADV_[[NO]]HUGEPAGE)], [ #include diff --git a/include/jemalloc/internal/bit_util.h b/include/jemalloc/internal/bit_util.h index 8c59c39..c045eb8 100644 --- a/include/jemalloc/internal/bit_util.h +++ b/include/jemalloc/internal/bit_util.h @@ -27,6 +27,25 @@ ffs_u(unsigned bitmap) { return JEMALLOC_INTERNAL_FFS(bitmap); } +#ifdef JEMALLOC_INTERNAL_POPCOUNTL +BIT_UTIL_INLINE unsigned +popcount_lu(unsigned long bitmap) { + return JEMALLOC_INTERNAL_POPCOUNTL(bitmap); +} +#endif + +/* + * Clears first unset bit in bitmap, and returns + * place of bit. bitmap *must not* be 0. + */ + +BIT_UTIL_INLINE size_t +cfs_lu(unsigned long* bitmap) { + size_t bit = ffs_lu(*bitmap) - 1; + *bitmap ^= ZU(1) << bit; + return bit; +} + BIT_UTIL_INLINE unsigned ffs_zu(size_t bitmap) { #if LG_SIZEOF_PTR == LG_SIZEOF_INT diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index c1eb8ed..3eac275 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -237,6 +237,12 @@ #undef JEMALLOC_INTERNAL_FFS /* + * popcount*() functions to use for bitmapping. + */ +#undef JEMALLOC_INTERNAL_POPCOUNTL +#undef JEMALLOC_INTERNAL_POPCOUNT + +/* * If defined, explicitly attempt to more uniformly distribute large allocation * pointer alignments across all cache indices. */ diff --git a/src/arena.c b/src/arena.c index 841f295..5fc90c5 100644 --- a/src/arena.c +++ b/src/arena.c @@ -273,19 +273,46 @@ arena_slab_reg_alloc_batch(extent_t *slab, const bin_info_t *bin_info, unsigned cnt, void** ptrs) { arena_slab_data_t *slab_data = extent_slab_data_get(slab); - assert(extent_nfree_get(slab) > 0); + assert(extent_nfree_get(slab) >= cnt); assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info)); - size_t regind = 0; +#if (! defined JEMALLOC_INTERNAL_POPCOUNTL) || (defined BITMAP_USE_TREE) for (unsigned i = 0; i < cnt; i++) { - void *ret; - - regind = bitmap_sfu(slab_data->bitmap, &bin_info->bitmap_info); - ret = (void *)((uintptr_t)extent_addr_get(slab) + + size_t regind = bitmap_sfu(slab_data->bitmap, + &bin_info->bitmap_info); + *(ptrs + i) = (void *)((uintptr_t)extent_addr_get(slab) + (uintptr_t)(bin_info->reg_size * regind)); + } +#else + unsigned group = 0; + bitmap_t g = slab_data->bitmap[group]; + unsigned i = 0; + while (i < cnt) { + while (g == 0) { + g = slab_data->bitmap[++group]; + } + size_t shift = group << LG_BITMAP_GROUP_NBITS; + size_t pop = popcount_lu(g); + if (pop > (cnt - i)) { + pop = cnt - i; + } - *(ptrs + i) = ret; + /* + * Load from memory locations only once, outside the + * hot loop below. + */ + uintptr_t base = (uintptr_t)extent_addr_get(slab); + uintptr_t regsize = (uintptr_t)bin_info->reg_size; + while (pop--) { + size_t bit = cfs_lu(&g); + size_t regind = shift + bit; + *(ptrs + i) = (void *)(base + regsize * regind); + + i++; + } + slab_data->bitmap[group] = g; } +#endif extent_nfree_sub(slab, cnt); } @@ -1331,7 +1358,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, } else { cnt = 1; void *ptr = arena_bin_malloc_hard(tsdn, arena, bin, - binind); + binind); /* * OOM. 
tbin->avail isn't yet filled down to its first * element, so the successful allocations (if any) must @@ -1352,7 +1379,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, for (unsigned j = 0; j < cnt; j++) { void* ptr = *(tbin->avail - nfill + i + j); arena_alloc_junk_small(ptr, &bin_infos[binind], - true); + true); } } } -- cgit v0.12 From 43f3b1ad0cd0900797688aa8b52b1face6416999 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 8 Nov 2018 15:43:09 -0800 Subject: Deprecate OSSpinLock. --- configure.ac | 20 ++------------------ .../jemalloc/internal/jemalloc_internal_defs.h.in | 6 ------ include/jemalloc/internal/jemalloc_preamble.h.in | 2 +- include/jemalloc/internal/mutex.h | 10 ---------- src/mutex.c | 4 +--- test/include/test/jemalloc_test.h.in | 2 +- test/include/test/mtx.h | 2 -- test/src/mtx.c | 7 ------- 8 files changed, 5 insertions(+), 48 deletions(-) diff --git a/configure.ac b/configure.ac index 5cfe9af..072808c 100644 --- a/configure.ac +++ b/configure.ac @@ -1845,7 +1845,7 @@ fi dnl ============================================================================ dnl Check for atomic(3) operations as provided on Darwin. dnl We need this not for the atomic operations (which are provided above), but -dnl rather for the OSSpinLock type it exposes. +dnl rather for the OS_unfair_lock type it exposes. JE_COMPILABLE([Darwin OSAtomic*()], [ #include @@ -2012,21 +2012,6 @@ if test "x${je_cv_os_unfair_lock}" = "xyes" ; then fi dnl ============================================================================ -dnl Check for spinlock(3) operations as provided on Darwin. - -JE_COMPILABLE([Darwin OSSpin*()], [ -#include -#include -], [ - OSSpinLock lock = 0; - OSSpinLockLock(&lock); - OSSpinLockUnlock(&lock); -], [je_cv_osspin]) -if test "x${je_cv_osspin}" = "xyes" ; then - AC_DEFINE([JEMALLOC_OSSPIN], [ ]) -fi - -dnl ============================================================================ dnl Darwin-related configuration. AC_ARG_ENABLE([zone-allocator], @@ -2079,8 +2064,7 @@ dnl ============================================================================ dnl Enable background threads if possible. if test "x${have_pthread}" = "x1" -a "x${have_dlsym}" = "x1" \ - -a "x${je_cv_os_unfair_lock}" != "xyes" \ - -a "x${je_cv_osspin}" != "xyes" ; then + -a "x${je_cv_os_unfair_lock}" != "xyes" ; then AC_DEFINE([JEMALLOC_BACKGROUND_THREAD]) fi diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 3eac275..3e94c02 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -78,12 +78,6 @@ */ #undef JEMALLOC_OS_UNFAIR_LOCK -/* - * Defined if OSSpin*() functions are available, as provided by Darwin, and - * documented in the spinlock(3) manual page. - */ -#undef JEMALLOC_OSSPIN - /* Defined if syscall(2) is usable. 
*/ #undef JEMALLOC_USE_SYSCALL diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in index 1b12aee..857fa32 100644 --- a/include/jemalloc/internal/jemalloc_preamble.h.in +++ b/include/jemalloc/internal/jemalloc_preamble.h.in @@ -21,7 +21,7 @@ # include "../jemalloc@install_suffix@.h" #endif -#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN)) +#if defined(JEMALLOC_OSATOMIC) #include #endif diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h index 5a955d9..c530cc9 100644 --- a/include/jemalloc/internal/mutex.h +++ b/include/jemalloc/internal/mutex.h @@ -37,8 +37,6 @@ struct malloc_mutex_s { # endif #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) os_unfair_lock lock; -#elif (defined(JEMALLOC_OSSPIN)) - OSSpinLock lock; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) pthread_mutex_t lock; malloc_mutex_t *postponed_next; @@ -84,10 +82,6 @@ struct malloc_mutex_s { # define MALLOC_MUTEX_LOCK(m) os_unfair_lock_lock(&(m)->lock) # define MALLOC_MUTEX_UNLOCK(m) os_unfair_lock_unlock(&(m)->lock) # define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock)) -#elif (defined(JEMALLOC_OSSPIN)) -# define MALLOC_MUTEX_LOCK(m) OSSpinLockLock(&(m)->lock) -# define MALLOC_MUTEX_UNLOCK(m) OSSpinLockUnlock(&(m)->lock) -# define MALLOC_MUTEX_TRYLOCK(m) (!OSSpinLockTry(&(m)->lock)) #else # define MALLOC_MUTEX_LOCK(m) pthread_mutex_lock(&(m)->lock) # define MALLOC_MUTEX_UNLOCK(m) pthread_mutex_unlock(&(m)->lock) @@ -110,10 +104,6 @@ struct malloc_mutex_s { {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} # endif -#elif (defined(JEMALLOC_OSSPIN)) -# define MALLOC_MUTEX_INITIALIZER \ - {{{LOCK_PROF_DATA_INITIALIZER, 0}}, \ - WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} #elif (defined(JEMALLOC_MUTEX_INIT_CB)) # if (defined(JEMALLOC_DEBUG)) # define MALLOC_MUTEX_INITIALIZER \ diff --git a/src/mutex.c b/src/mutex.c index 55e37ad..eb6c4c6 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -144,9 +144,7 @@ malloc_mutex_init(malloc_mutex_t *mutex, const char *name, } # endif #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) - mutex->lock = OS_UNFAIR_LOCK_INIT; -#elif (defined(JEMALLOC_OSSPIN)) - mutex->lock = 0; + mutex->lock = OS_UNFAIR_LOCK_INIT; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) if (postpone_init) { mutex->postponed_next = postponed_mutexes; diff --git a/test/include/test/jemalloc_test.h.in b/test/include/test/jemalloc_test.h.in index 0209aea..c46af5d 100644 --- a/test/include/test/jemalloc_test.h.in +++ b/test/include/test/jemalloc_test.h.in @@ -25,7 +25,7 @@ extern "C" { #include "test/jemalloc_test_defs.h" -#ifdef JEMALLOC_OSSPIN +#if defined(JEMALLOC_OSATOMIC) # include #endif diff --git a/test/include/test/mtx.h b/test/include/test/mtx.h index 58afbc3..066a213 100644 --- a/test/include/test/mtx.h +++ b/test/include/test/mtx.h @@ -10,8 +10,6 @@ typedef struct { CRITICAL_SECTION lock; #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) os_unfair_lock lock; -#elif (defined(JEMALLOC_OSSPIN)) - OSSpinLock lock; #else pthread_mutex_t lock; #endif diff --git a/test/src/mtx.c b/test/src/mtx.c index a393c01..d9ce375 100644 --- a/test/src/mtx.c +++ b/test/src/mtx.c @@ -13,8 +13,6 @@ mtx_init(mtx_t *mtx) { } #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) mtx->lock = OS_UNFAIR_LOCK_INIT; -#elif (defined(JEMALLOC_OSSPIN)) - mtx->lock = 0; #else pthread_mutexattr_t attr; @@ -35,7 +33,6 @@ void mtx_fini(mtx_t *mtx) { #ifdef _WIN32 #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) -#elif 
(defined(JEMALLOC_OSSPIN)) #else pthread_mutex_destroy(&mtx->lock); #endif @@ -47,8 +44,6 @@ mtx_lock(mtx_t *mtx) { EnterCriticalSection(&mtx->lock); #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) os_unfair_lock_lock(&mtx->lock); -#elif (defined(JEMALLOC_OSSPIN)) - OSSpinLockLock(&mtx->lock); #else pthread_mutex_lock(&mtx->lock); #endif @@ -60,8 +55,6 @@ mtx_unlock(mtx_t *mtx) { LeaveCriticalSection(&mtx->lock); #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) os_unfair_lock_unlock(&mtx->lock); -#elif (defined(JEMALLOC_OSSPIN)) - OSSpinLockUnlock(&mtx->lock); #else pthread_mutex_unlock(&mtx->lock); #endif -- cgit v0.12 From c4063ce439523d382f2dfbbc5bf6da657e6badb0 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 15 Nov 2018 13:01:05 -0800 Subject: Set the default number of background threads to 4. The setting has been tested in production for a while. No negative effects were observed, and we were able to reduce the number of threads per process. --- include/jemalloc/internal/background_thread_structs.h | 1 + src/background_thread.c | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/jemalloc/internal/background_thread_structs.h b/include/jemalloc/internal/background_thread_structs.h index c1107df..c02aa43 100644 --- a/include/jemalloc/internal/background_thread_structs.h +++ b/include/jemalloc/internal/background_thread_structs.h @@ -9,6 +9,7 @@ #define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX #define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT +#define DEFAULT_NUM_BACKGROUND_THREAD 4 typedef enum { background_thread_stopped, diff --git a/src/background_thread.c b/src/background_thread.c index 24f6730..813867e 100644 --- a/src/background_thread.c +++ b/src/background_thread.c @@ -13,7 +13,7 @@ JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS #define BACKGROUND_THREAD_DEFAULT false /* Read-only after initialization. */ bool opt_background_thread = BACKGROUND_THREAD_DEFAULT; -size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT; +size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT + 1; /* Used for thread creation, termination and stats. */ malloc_mutex_t background_thread_lock; @@ -872,9 +872,8 @@ background_thread_boot1(tsdn_t *tsdn) { assert(have_background_thread); assert(narenas_total_get() > 0); - if (opt_max_background_threads == MAX_BACKGROUND_THREAD_LIMIT && - ncpus < MAX_BACKGROUND_THREAD_LIMIT) { - opt_max_background_threads = ncpus; + if (opt_max_background_threads > MAX_BACKGROUND_THREAD_LIMIT) { + opt_max_background_threads = DEFAULT_NUM_BACKGROUND_THREAD; } max_background_threads = opt_max_background_threads; -- cgit v0.12 From b23336af96e6ef9efb47591ce7bf2c8a1eab866b Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 26 Nov 2018 08:11:00 -0800 Subject: mutex: fix trylock spin wait contention If there are 3 or more threads spin-waiting on the same mutex, there will be excessive exclusive cacheline contention because pthread_trylock() immediately tries to CAS in a new value, instead of first checking if the lock is locked. This diff adds a 'locked' hint flag, and we only spin-wait without trylock()ing while it is set. I don't know of any other portable way to get the same behavior as pthread_mutex_lock(). This is pretty easy to test via ttest, e.g. ./ttest1 500 3 10000 1 100 Throughput is nearly 3x higher. The regression traces back to the mutex profiling changes; however, we almost never have 3 or more threads contending in properly configured production workloads, so the impact is limited. Still worth fixing.
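As a standalone sketch of the test-and-test-and-set idea the fix applies (C11 atomics around a plain pthread mutex; hypothetical names, not the actual jemalloc mutex):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

typedef struct {
	pthread_mutex_t lock;
	atomic_bool locked;	/* Hint only; the mutex stays authoritative. */
} hinted_mutex_t;

static void
hinted_lock(hinted_mutex_t *m) {
	for (int spin = 0; spin < 1000; spin++) {
		/*
		 * Read-only check first: spinning on a shared cache line
		 * avoids the CAS storm that back-to-back trylock() causes.
		 */
		if (!atomic_load_explicit(&m->locked, memory_order_relaxed) &&
		    pthread_mutex_trylock(&m->lock) == 0) {
			atomic_store_explicit(&m->locked, true,
			    memory_order_relaxed);
			return;
		}
	}
	pthread_mutex_lock(&m->lock);	/* Give up spinning; block. */
	atomic_store_explicit(&m->locked, true, memory_order_relaxed);
}

static void
hinted_unlock(hinted_mutex_t *m) {
	atomic_store_explicit(&m->locked, false, memory_order_relaxed);
	pthread_mutex_unlock(&m->lock);
}

Because the flag is only a hint, a stale read costs at most one extra spin iteration; mutual exclusion still comes entirely from the underlying mutex.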
--- include/jemalloc/internal/mutex.h | 21 +++++++++++++++------ src/mutex.c | 3 ++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h index c530cc9..8f4a307 100644 --- a/include/jemalloc/internal/mutex.h +++ b/include/jemalloc/internal/mutex.h @@ -43,6 +43,11 @@ struct malloc_mutex_s { #else pthread_mutex_t lock; #endif + /* + * Hint flag to avoid exclusive cache line contention + * during spin waiting + */ + atomic_b_t locked; }; /* * We only touch witness when configured w/ debug. However we @@ -97,21 +102,21 @@ struct malloc_mutex_s { #elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) # if defined(JEMALLOC_DEBUG) # define MALLOC_MUTEX_INITIALIZER \ - {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \ + {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} # else # define MALLOC_MUTEX_INITIALIZER \ - {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \ + {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} # endif #elif (defined(JEMALLOC_MUTEX_INIT_CB)) # if (defined(JEMALLOC_DEBUG)) # define MALLOC_MUTEX_INITIALIZER \ - {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} # else # define MALLOC_MUTEX_INITIALIZER \ - {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} # endif @@ -119,11 +124,11 @@ struct malloc_mutex_s { # define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT # if defined(JEMALLOC_DEBUG) # define MALLOC_MUTEX_INITIALIZER \ - {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} # else # define MALLOC_MUTEX_INITIALIZER \ - {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \ WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} # endif #endif @@ -148,6 +153,7 @@ void malloc_mutex_lock_slow(malloc_mutex_t *mutex); static inline void malloc_mutex_lock_final(malloc_mutex_t *mutex) { MALLOC_MUTEX_LOCK(mutex); + atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED); } static inline bool @@ -173,6 +179,7 @@ malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) { witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); if (isthreaded) { if (malloc_mutex_trylock_final(mutex)) { + atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED); return true; } mutex_owner_stats_update(tsdn, mutex); @@ -212,6 +219,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) { if (isthreaded) { if (malloc_mutex_trylock_final(mutex)) { malloc_mutex_lock_slow(mutex); + atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED); } mutex_owner_stats_update(tsdn, mutex); } @@ -220,6 +228,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) { static inline void malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) { + atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED); witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness); if (isthreaded) { MALLOC_MUTEX_UNLOCK(mutex); diff --git a/src/mutex.c b/src/mutex.c index 
eb6c4c6..3f920f5 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -55,7 +55,8 @@ malloc_mutex_lock_slow(malloc_mutex_t *mutex) { int cnt = 0, max_cnt = MALLOC_MUTEX_MAX_SPIN; do { spin_cpu_spinwait(); - if (!malloc_mutex_trylock_final(mutex)) { + if (!atomic_load_b(&mutex->locked, ATOMIC_RELAXED) + && !malloc_mutex_trylock_final(mutex)) { data->n_spin_acquired++; return; } -- cgit v0.12 From 37b89139252db18c95ebce3e0eac67817fa4a8ab Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 12 Nov 2018 15:56:04 -0800 Subject: Add support for sharded bins within an arena. This makes it possible to have multiple sets of bins in an arena, which improves arena scalability because the bins (especially the small ones) are always the limiting factor in production workloads. A bin shard is picked on allocation; each extent tracks the bin shard id for deallocation. The number of shards will be determined using runtime options. --- include/jemalloc/internal/arena_externs.h | 6 +- include/jemalloc/internal/arena_structs_b.h | 5 +- include/jemalloc/internal/bin.h | 18 ++- include/jemalloc/internal/extent_inlines.h | 26 +++++ include/jemalloc/internal/extent_structs.h | 15 ++- include/jemalloc/internal/mutex.h | 22 ++++ include/jemalloc/internal/tsd.h | 2 + src/arena.c | 163 ++++++++++++++++++---------- src/bin.c | 6 + src/ctl.c | 6 +- src/jemalloc.c | 3 + src/tcache.c | 18 +-- 12 files changed, 217 insertions(+), 73 deletions(-) diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index 073e587..04d9954 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -63,8 +63,8 @@ void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, void arena_prof_promote(tsdn_t *tsdn, const void *ptr, size_t usize); void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache, bool slow_path); -void arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, - extent_t *extent, void *ptr); +void arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + szind_t binind, extent_t *extent, void *ptr); void arena_dalloc_small(tsdn_t *tsdn, void *ptr); bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra, bool zero, size_t *newsize); @@ -86,6 +86,8 @@ size_t arena_extent_sn_next(arena_t *arena); arena_t *arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks); bool arena_init_huge(void); arena_t *arena_choose_huge(tsd_t *tsd); +bin_t *arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind, + unsigned *binshard); void arena_boot(sc_data_t *sc_data); void arena_prefork0(tsdn_t *tsdn, arena_t *arena); void arena_prefork1(tsdn_t *tsdn, arena_t *arena); diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h index 509f11c..950bd13 100644 --- a/include/jemalloc/internal/arena_structs_b.h +++ b/include/jemalloc/internal/arena_structs_b.h @@ -90,6 +90,9 @@ struct arena_s { */ atomic_u_t nthreads[2]; + /* Next bin shard for binding new threads. Synchronization: atomic. */ + atomic_u_t binshard_next; + /* * When percpu_arena is enabled, to amortize the cost of reading / * updating the current CPU id, track the most recent thread accessing @@ -204,7 +207,7 @@ struct arena_s { * * Synchronization: internal. */ - bin_t bins[SC_NBINS]; + bins_t bins[SC_NBINS]; /* * Base allocator, from which arena metadata are allocated.
diff --git a/include/jemalloc/internal/bin.h b/include/jemalloc/internal/bin.h index e04b6c6..3fddef7 100644 --- a/include/jemalloc/internal/bin.h +++ b/include/jemalloc/internal/bin.h @@ -7,6 +7,11 @@ #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/sc.h" +#define BIN_SHARDS_MAX (1 << EXTENT_BITS_BINSHARD_WIDTH) + +extern unsigned opt_bin_shard_maxszind; +extern unsigned opt_n_bin_shards; + /* * A bin contains a set of extents that are currently being used for slab * allocations. @@ -42,6 +47,9 @@ struct bin_info_s { /* Total number of regions in a slab for this bin's size class. */ uint32_t nregs; + /* Number of sharded bins in each arena for this size class. */ + uint32_t n_shards; + /* * Metadata used to manipulate bitmaps for slabs associated with this * bin. @@ -51,7 +59,6 @@ struct bin_info_s { extern bin_info_t bin_infos[SC_NBINS]; - typedef struct bin_s bin_t; struct bin_s { /* All operations on bin_t fields require lock ownership. */ @@ -79,6 +86,13 @@ struct bin_s { bin_stats_t stats; }; +/* A set of sharded bins of the same size class. */ +typedef struct bins_s bins_t; +struct bins_s { + /* Sharded bins. Dynamically sized. */ + bin_t *bin_shards; +}; + void bin_infos_init(sc_data_t *sc_data, bin_info_t bin_infos[SC_NBINS]); void bin_boot(); @@ -94,7 +108,7 @@ void bin_postfork_child(tsdn_t *tsdn, bin_t *bin); static inline void bin_stats_merge(tsdn_t *tsdn, bin_stats_t *dst_bin_stats, bin_t *bin) { malloc_mutex_lock(tsdn, &bin->lock); - malloc_mutex_prof_read(tsdn, &dst_bin_stats->mutex_data, &bin->lock); + malloc_mutex_prof_accum(tsdn, &dst_bin_stats->mutex_data, &bin->lock); dst_bin_stats->nmalloc += bin->stats.nmalloc; dst_bin_stats->ndalloc += bin->stats.ndalloc; dst_bin_stats->nrequests += bin->stats.nrequests; diff --git a/include/jemalloc/internal/extent_inlines.h b/include/jemalloc/internal/extent_inlines.h index c931fd5..b572860 100644 --- a/include/jemalloc/internal/extent_inlines.h +++ b/include/jemalloc/internal/extent_inlines.h @@ -70,6 +70,14 @@ extent_usize_get(const extent_t *extent) { return sz_index2size(extent_szind_get(extent)); } +static inline unsigned +extent_binshard_get(const extent_t *extent) { + unsigned binshard = (unsigned)((extent->e_bits & + EXTENT_BITS_BINSHARD_MASK) >> EXTENT_BITS_BINSHARD_SHIFT); + assert(binshard < bin_infos[extent_szind_get(extent)].n_shards); + return binshard; +} + static inline size_t extent_sn_get(const extent_t *extent) { return (size_t)((extent->e_bits & EXTENT_BITS_SN_MASK) >> @@ -191,6 +199,14 @@ extent_arena_set(extent_t *extent, arena_t *arena) { } static inline void +extent_binshard_set(extent_t *extent, unsigned binshard) { + /* The assertion assumes szind is set already. */ + assert(binshard < bin_infos[extent_szind_get(extent)].n_shards); + extent->e_bits = (extent->e_bits & ~EXTENT_BITS_BINSHARD_MASK) | + ((uint64_t)binshard << EXTENT_BITS_BINSHARD_SHIFT); +} + +static inline void extent_addr_set(extent_t *extent, void *addr) { extent->e_addr = addr; } @@ -253,6 +269,16 @@ extent_nfree_set(extent_t *extent, unsigned nfree) { } static inline void +extent_nfree_binshard_set(extent_t *extent, unsigned nfree, unsigned binshard) { + /* The assertion assumes szind is set already. 
*/ + assert(binshard < bin_infos[extent_szind_get(extent)].n_shards); + extent->e_bits = (extent->e_bits & + (~EXTENT_BITS_NFREE_MASK & ~EXTENT_BITS_BINSHARD_MASK)) | + ((uint64_t)binshard << EXTENT_BITS_BINSHARD_SHIFT) | + ((uint64_t)nfree << EXTENT_BITS_NFREE_SHIFT); +} + +static inline void extent_nfree_inc(extent_t *extent) { assert(extent_slab_get(extent)); extent->e_bits += ((uint64_t)1U << EXTENT_BITS_NFREE_SHIFT); diff --git a/include/jemalloc/internal/extent_structs.h b/include/jemalloc/internal/extent_structs.h index 50e77bf..1626452 100644 --- a/include/jemalloc/internal/extent_structs.h +++ b/include/jemalloc/internal/extent_structs.h @@ -29,9 +29,10 @@ struct extent_s { * t: state * i: szind * f: nfree + * s: bin_shard * n: sn * - * nnnnnnnn ... nnnnffff ffffffii iiiiiitt zdcbaaaa aaaaaaaa + * nnnnnnnn ... nnnnnnss ssssffff ffffffii iiiiiitt zdcbaaaa aaaaaaaa * * arena_ind: Arena from which this extent came, or all 1 bits if * unassociated. @@ -76,6 +77,8 @@ struct extent_s { * * nfree: Number of free regions in slab. * + * bin_shard: the shard of the bin from which this extent came. + * * sn: Serial number (potentially non-unique). * * Serial numbers may wrap around if !opt_retain, but as long as @@ -121,7 +124,15 @@ struct extent_s { #define EXTENT_BITS_NFREE_SHIFT (EXTENT_BITS_SZIND_WIDTH + EXTENT_BITS_SZIND_SHIFT) #define EXTENT_BITS_NFREE_MASK MASK(EXTENT_BITS_NFREE_WIDTH, EXTENT_BITS_NFREE_SHIFT) -#define EXTENT_BITS_SN_SHIFT (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT) +#define EXTENT_BITS_BINSHARD_WIDTH 6 +#define EXTENT_BITS_BINSHARD_SHIFT (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT) +#define EXTENT_BITS_BINSHARD_MASK MASK(EXTENT_BITS_BINSHARD_WIDTH, EXTENT_BITS_BINSHARD_SHIFT) + +/* Will make dynamic options. */ +#define OPT_N_BIN_SHARDS (1) +#define OPT_BIN_SHARD_MAXSZIND (0) + +#define EXTENT_BITS_SN_SHIFT (EXTENT_BITS_BINSHARD_WIDTH + EXTENT_BITS_BINSHARD_SHIFT) #define EXTENT_BITS_SN_MASK (UINT64_MAX << EXTENT_BITS_SN_SHIFT) /* Pointer to the extent that this structure is responsible for. */ diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h index 8f4a307..7c24f07 100644 --- a/include/jemalloc/internal/mutex.h +++ b/include/jemalloc/internal/mutex.h @@ -263,4 +263,26 @@ malloc_mutex_prof_read(tsdn_t *tsdn, mutex_prof_data_t *data, atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED); } +static inline void +malloc_mutex_prof_accum(tsdn_t *tsdn, mutex_prof_data_t *data, + malloc_mutex_t *mutex) { + mutex_prof_data_t *source = &mutex->prof_data; + /* Can only read holding the mutex. */ + malloc_mutex_assert_owner(tsdn, mutex); + + nstime_add(&data->tot_wait_time, &source->tot_wait_time); + if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) { + nstime_copy(&data->max_wait_time, &source->max_wait_time); + } + data->n_wait_times += source->n_wait_times; + data->n_spin_acquired += source->n_spin_acquired; + if (data->max_n_thds < source->max_n_thds) { + data->max_n_thds = source->max_n_thds; + } + /* n_wait_thds is not reported. 
*/ + atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED); + data->n_owner_switches += source->n_owner_switches; + data->n_lock_ops += source->n_lock_ops; +} + #endif /* JEMALLOC_INTERNAL_MUTEX_H */ diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index c931441..4dc2274 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -74,6 +74,7 @@ typedef void (*test_callback_t)(int *); O(iarena, arena_t *, arena_t *) \ O(arena, arena_t *, arena_t *) \ O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\ + O(binshard, unsigned, unsigned) \ O(tcache, tcache_t, tcache_t) \ O(witness_tsd, witness_tsd_t, witness_tsdn_t) \ MALLOC_TEST_TSD @@ -93,6 +94,7 @@ typedef void (*test_callback_t)(int *); NULL, \ NULL, \ NULL, \ + ((unsigned)-1), \ TCACHE_ZERO_INITIALIZER, \ WITNESS_TSD_INITIALIZER \ MALLOC_TEST_TSD_INITIALIZER \ diff --git a/src/arena.c b/src/arena.c index 5fc90c5..893c9b5 100644 --- a/src/arena.c +++ b/src/arena.c @@ -233,7 +233,10 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, nstime_subtract(&astats->uptime, &arena->create_time); for (szind_t i = 0; i < SC_NBINS; i++) { - bin_stats_merge(tsdn, &bstats[i], &arena->bins[i]); + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_stats_merge(tsdn, &bstats[i], + &arena->bins[i].bin_shards[j]); + } } } @@ -1039,6 +1042,37 @@ arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, extent_t *slab) { extent_list_remove(&bin->slabs_full, slab); } +static void +arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) { + extent_t *slab; + + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + if (bin->slabcur != NULL) { + slab = bin->slabcur; + bin->slabcur = NULL; + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + while ((slab = extent_heap_remove_first(&bin->slabs_nonfull)) != NULL) { + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + for (slab = extent_list_first(&bin->slabs_full); slab != NULL; + slab = extent_list_first(&bin->slabs_full)) { + arena_bin_slabs_full_remove(arena, bin, slab); + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + if (config_stats) { + bin->stats.curregs = 0; + bin->stats.curslabs = 0; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); +} + void arena_reset(tsd_t *tsd, arena_t *arena) { /* @@ -1085,34 +1119,10 @@ arena_reset(tsd_t *tsd, arena_t *arena) { /* Bins. 
*/ for (unsigned i = 0; i < SC_NBINS; i++) { - extent_t *slab; - bin_t *bin = &arena->bins[i]; - malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); - if (bin->slabcur != NULL) { - slab = bin->slabcur; - bin->slabcur = NULL; - malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); - arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); - malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); - } - while ((slab = extent_heap_remove_first(&bin->slabs_nonfull)) != - NULL) { - malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); - arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); - malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + arena_bin_reset(tsd, arena, + &arena->bins[i].bin_shards[j]); } - for (slab = extent_list_first(&bin->slabs_full); slab != NULL; - slab = extent_list_first(&bin->slabs_full)) { - arena_bin_slabs_full_remove(arena, bin, slab); - malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); - arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); - malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); - } - if (config_stats) { - bin->stats.curregs = 0; - bin->stats.curslabs = 0; - } - malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); } atomic_store_zu(&arena->nactive, 0, ATOMIC_RELAXED); @@ -1197,7 +1207,7 @@ arena_slab_alloc_hard(tsdn_t *tsdn, arena_t *arena, } static extent_t * -arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, +arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, unsigned binshard, const bin_info_t *bin_info) { witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0); @@ -1225,7 +1235,7 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, /* Initialize slab internals. */ arena_slab_data_t *slab_data = extent_slab_data_get(slab); - extent_nfree_set(slab, bin_info->nregs); + extent_nfree_binshard_set(slab, bin_info->nregs, binshard); bitmap_init(slab_data->bitmap, &bin_info->bitmap_info, false); arena_nactive_add(arena, extent_size_get(slab) >> LG_PAGE); @@ -1235,7 +1245,7 @@ arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, static extent_t * arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin, - szind_t binind) { + szind_t binind, unsigned binshard) { extent_t *slab; const bin_info_t *bin_info; @@ -1251,7 +1261,7 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin, /* Allocate a new slab. */ malloc_mutex_unlock(tsdn, &bin->lock); /******************************/ - slab = arena_slab_alloc(tsdn, arena, binind, bin_info); + slab = arena_slab_alloc(tsdn, arena, binind, binshard, bin_info); /********************************/ malloc_mutex_lock(tsdn, &bin->lock); if (slab != NULL) { @@ -1278,7 +1288,7 @@ arena_bin_nonfull_slab_get(tsdn_t *tsdn, arena_t *arena, bin_t *bin, /* Re-fill bin->slabcur, then call arena_slab_reg_alloc(). 
*/ static void * arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin, - szind_t binind) { + szind_t binind, unsigned binshard) { const bin_info_t *bin_info; extent_t *slab; @@ -1287,7 +1297,7 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin, arena_bin_slabs_full_insert(arena, bin, bin->slabcur); bin->slabcur = NULL; } - slab = arena_bin_nonfull_slab_get(tsdn, arena, bin, binind); + slab = arena_bin_nonfull_slab_get(tsdn, arena, bin, binind, binshard); if (bin->slabcur != NULL) { /* * Another thread updated slabcur while this one ran without the @@ -1331,19 +1341,39 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, bin_t *bin, return arena_slab_reg_alloc(slab, bin_info); } +/* Choose a bin shard and return the locked bin. */ +bin_t * +arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind, + unsigned *binshard) { + bin_t *bin; + if (binind >= opt_bin_shard_maxszind || tsdn_null(tsdn) || + tsd_arena_get(tsdn_tsd(tsdn)) == NULL) { + *binshard = 0; + } else { + *binshard = tsd_binshard_get(tsdn_tsd(tsdn)) % + bin_infos[binind].n_shards; + } + assert(*binshard < bin_infos[binind].n_shards); + bin = &arena->bins[binind].bin_shards[*binshard]; + malloc_mutex_lock(tsdn, &bin->lock); + + return bin; +} + void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, cache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) { unsigned i, nfill, cnt; - bin_t *bin; assert(tbin->ncached == 0); if (config_prof && arena_prof_accum(tsdn, arena, prof_accumbytes)) { prof_idump(tsdn); } - bin = &arena->bins[binind]; - malloc_mutex_lock(tsdn, &bin->lock); + + unsigned binshard; + bin_t *bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard); + for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> tcache->lg_fill_div[binind]); i < nfill; i += cnt) { extent_t *slab; @@ -1358,7 +1388,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, } else { cnt = 1; void *ptr = arena_bin_malloc_hard(tsdn, arena, bin, - binind); + binind, binshard); /* * OOM. 
tbin->avail isn't yet filled down to its first * element, so the successful allocations (if any) must @@ -1417,14 +1447,14 @@ arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) { extent_t *slab; assert(binind < SC_NBINS); - bin = &arena->bins[binind]; usize = sz_index2size(binind); + unsigned binshard; + bin = arena_bin_choose_lock(tsdn, arena, binind, &binshard); - malloc_mutex_lock(tsdn, &bin->lock); if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) > 0) { ret = arena_slab_reg_alloc(slab, &bin_infos[binind]); } else { - ret = arena_bin_malloc_hard(tsdn, arena, bin, binind); + ret = arena_bin_malloc_hard(tsdn, arena, bin, binind, binshard); } if (ret == NULL) { @@ -1623,11 +1653,9 @@ arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, extent_t *slab, } static void -arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, extent_t *slab, - void *ptr, bool junked) { +arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + szind_t binind, extent_t *slab, void *ptr, bool junked) { arena_slab_data_t *slab_data = extent_slab_data_get(slab); - szind_t binind = extent_szind_get(slab); - bin_t *bin = &arena->bins[binind]; const bin_info_t *bin_info = &bin_infos[binind]; if (!junked && config_fill && unlikely(opt_junk_free)) { @@ -1651,18 +1679,21 @@ arena_dalloc_bin_locked_impl(tsdn_t *tsdn, arena_t *arena, extent_t *slab, } void -arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, extent_t *extent, - void *ptr) { - arena_dalloc_bin_locked_impl(tsdn, arena, extent, ptr, true); +arena_dalloc_bin_junked_locked(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + szind_t binind, extent_t *extent, void *ptr) { + arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, extent, ptr, + true); } static void arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, extent_t *extent, void *ptr) { szind_t binind = extent_szind_get(extent); - bin_t *bin = &arena->bins[binind]; + unsigned binshard = extent_binshard_get(extent); + bin_t *bin = &arena->bins[binind].bin_shards[binshard]; malloc_mutex_lock(tsdn, &bin->lock); - arena_dalloc_bin_locked_impl(tsdn, arena, extent, ptr, false); + arena_dalloc_bin_locked_impl(tsdn, arena, bin, binind, extent, ptr, + false); malloc_mutex_unlock(tsdn, &bin->lock); } @@ -1892,7 +1923,10 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } } - arena = (arena_t *)base_alloc(tsdn, base, sizeof(arena_t), CACHELINE); + size_t arena_size = sizeof(arena_t) + + sizeof(bin_t) * opt_n_bin_shards * opt_bin_shard_maxszind + + sizeof(bin_t) * (SC_NBINS - opt_bin_shard_maxszind); + arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE); if (arena == NULL) { goto label_error; } @@ -1997,12 +2031,20 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } /* Initialize bins. */ + uintptr_t bin_addr = (uintptr_t)arena + sizeof(arena_t); + atomic_store_u(&arena->binshard_next, 0, ATOMIC_RELEASE); for (i = 0; i < SC_NBINS; i++) { - bool err = bin_init(&arena->bins[i]); - if (err) { - goto label_error; + unsigned nshards = bin_infos[i].n_shards; + arena->bins[i].bin_shards = (bin_t *)bin_addr; + bin_addr += nshards * sizeof(bin_t); + for (unsigned j = 0; j < nshards; j++) { + bool err = bin_init(&arena->bins[i].bin_shards[j]); + if (err) { + goto label_error; + } } } + assert(bin_addr == (uintptr_t)arena + arena_size); arena->base = base; /* Set arena before creating background threads. 
*/ @@ -2139,7 +2181,9 @@ arena_prefork6(tsdn_t *tsdn, arena_t *arena) { void arena_prefork7(tsdn_t *tsdn, arena_t *arena) { for (unsigned i = 0; i < SC_NBINS; i++) { - bin_prefork(tsdn, &arena->bins[i]); + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_prefork(tsdn, &arena->bins[i].bin_shards[j]); + } } } @@ -2148,7 +2192,10 @@ arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) { unsigned i; for (i = 0; i < SC_NBINS; i++) { - bin_postfork_parent(tsdn, &arena->bins[i]); + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_postfork_parent(tsdn, + &arena->bins[i].bin_shards[j]); + } } malloc_mutex_postfork_parent(tsdn, &arena->large_mtx); base_postfork_parent(tsdn, arena->base); @@ -2192,7 +2239,9 @@ arena_postfork_child(tsdn_t *tsdn, arena_t *arena) { } for (i = 0; i < SC_NBINS; i++) { - bin_postfork_child(tsdn, &arena->bins[i]); + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_postfork_child(tsdn, &arena->bins[i].bin_shards[j]); + } } malloc_mutex_postfork_child(tsdn, &arena->large_mtx); base_postfork_child(tsdn, arena->base); diff --git a/src/bin.c b/src/bin.c index e62babd..8dd964f 100644 --- a/src/bin.c +++ b/src/bin.c @@ -6,6 +6,9 @@ #include "jemalloc/internal/sc.h" #include "jemalloc/internal/witness.h" +unsigned opt_bin_shard_maxszind; +unsigned opt_n_bin_shards; + bin_info_t bin_infos[SC_NBINS]; void @@ -18,6 +21,7 @@ bin_infos_init(sc_data_t *sc_data, bin_info_t bin_infos[SC_NBINS]) { bin_info->slab_size = (sc->pgs << LG_PAGE); bin_info->nregs = (uint32_t)(bin_info->slab_size / bin_info->reg_size); + bin_info->n_shards = (i < opt_bin_shard_maxszind) ? opt_n_bin_shards : 1; bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER( bin_info->nregs); bin_info->bitmap_info = bitmap_info; @@ -27,6 +31,8 @@ bin_infos_init(sc_data_t *sc_data, bin_info_t bin_infos[SC_NBINS]) { void bin_boot(sc_data_t *sc_data) { assert(sc_data->initialized); + opt_bin_shard_maxszind = OPT_BIN_SHARD_MAXSZIND; + opt_n_bin_shards = OPT_N_BIN_SHARDS; bin_infos_init(sc_data, bin_infos); } diff --git a/src/ctl.c b/src/ctl.c index b482fc5..72ad587 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -2913,8 +2913,10 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, MUTEX_PROF_RESET(arena->base->mtx); for (szind_t i = 0; i < SC_NBINS; i++) { - bin_t *bin = &arena->bins[i]; - MUTEX_PROF_RESET(bin->lock); + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_t *bin = &arena->bins[i].bin_shards[j]; + MUTEX_PROF_RESET(bin->lock); + } } } #undef MUTEX_PROF_RESET diff --git a/src/jemalloc.c b/src/jemalloc.c index 68a21f9..c635ecb 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -379,6 +379,9 @@ arena_bind(tsd_t *tsd, unsigned ind, bool internal) { tsd_iarena_set(tsd, arena); } else { tsd_arena_set(tsd, arena); + unsigned binshard = atomic_fetch_add_u(&arena->binshard_next, 1, + ATOMIC_RELAXED) % BIN_SHARDS_MAX; + tsd_binshard_set(tsd, binshard); } } diff --git a/src/tcache.c b/src/tcache.c index ee632f6..51e3131 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -121,7 +121,9 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, /* Lock the arena bin associated with the first object. 
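	 * With sharded bins, the shard index is read from that extent;
	 * only objects from the same arena and shard are freed under this
	 * lock, and the rest are kept for a later flush pass.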
*/ extent_t *extent = item_extent[0]; arena_t *bin_arena = extent_arena_get(extent); - bin_t *bin = &bin_arena->bins[binind]; + unsigned binshard = extent_binshard_get(extent); + assert(binshard < bin_infos[binind].n_shards); + bin_t *bin = &bin_arena->bins[binind].bin_shards[binshard]; if (config_prof && bin_arena == arena) { if (arena_prof_accum(tsd_tsdn(tsd), arena, @@ -145,9 +147,10 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, extent = item_extent[i]; assert(ptr != NULL && extent != NULL); - if (extent_arena_get(extent) == bin_arena) { + if (extent_arena_get(extent) == bin_arena + && extent_binshard_get(extent) == binshard) { arena_dalloc_bin_junked_locked(tsd_tsdn(tsd), - bin_arena, extent, ptr); + bin_arena, bin, binind, extent, ptr); } else { /* * This object was allocated via a different @@ -169,8 +172,9 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, * The flush loop didn't happen to flush to this thread's * arena, so the stats didn't get merged. Manually do so now. */ - bin_t *bin = &arena->bins[binind]; - malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + unsigned binshard; + bin_t *bin = arena_bin_choose_lock(tsd_tsdn(tsd), arena, binind, + &binshard); bin->stats.nflushes++; bin->stats.nrequests += tbin->tstats.nrequests; tbin->tstats.nrequests = 0; @@ -557,9 +561,9 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) { /* Merge and reset tcache stats. */ for (i = 0; i < SC_NBINS; i++) { - bin_t *bin = &arena->bins[i]; cache_bin_t *tbin = tcache_small_bin_get(tcache, i); - malloc_mutex_lock(tsdn, &bin->lock); + unsigned binshard; + bin_t *bin = arena_bin_choose_lock(tsdn, arena, i, &binshard); bin->stats.nrequests += tbin->tstats.nrequests; malloc_mutex_unlock(tsdn, &bin->lock); tbin->tstats.nrequests = 0; -- cgit v0.12 From 3f9f2833f6228e07673d75c9bce6f5fb58c5f3b0 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 20 Nov 2018 13:51:32 -0800 Subject: Add opt.bin_shards to specify number of bin shards. The option uses the same format as "slab_sizes" to specify number of shards for each bin size. --- include/jemalloc/internal/bin.h | 10 +++---- include/jemalloc/internal/extent_structs.h | 4 --- src/arena.c | 11 +++---- src/bin.c | 48 +++++++++++++++++++++++------- src/jemalloc.c | 39 +++++++++++++++++++----- 5 files changed, 81 insertions(+), 31 deletions(-) diff --git a/include/jemalloc/internal/bin.h b/include/jemalloc/internal/bin.h index 3fddef7..baa0acf 100644 --- a/include/jemalloc/internal/bin.h +++ b/include/jemalloc/internal/bin.h @@ -8,9 +8,7 @@ #include "jemalloc/internal/sc.h" #define BIN_SHARDS_MAX (1 << EXTENT_BITS_BINSHARD_WIDTH) - -extern unsigned opt_bin_shard_maxszind; -extern unsigned opt_n_bin_shards; +#define N_BIN_SHARDS_DEFAULT 1 /* * A bin contains a set of extents that are currently being used for slab @@ -93,8 +91,10 @@ struct bins_s { bin_t *bin_shards; }; -void bin_infos_init(sc_data_t *sc_data, bin_info_t bin_infos[SC_NBINS]); -void bin_boot(); +void bin_shard_sizes_boot(unsigned bin_shards[SC_NBINS]); +bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size, + size_t end_size, size_t nshards); +void bin_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]); /* Initializes a bin to empty. Returns true on error. 
*/ bool bin_init(bin_t *bin); diff --git a/include/jemalloc/internal/extent_structs.h b/include/jemalloc/internal/extent_structs.h index 1626452..ceb1897 100644 --- a/include/jemalloc/internal/extent_structs.h +++ b/include/jemalloc/internal/extent_structs.h @@ -128,10 +128,6 @@ struct extent_s { #define EXTENT_BITS_BINSHARD_SHIFT (EXTENT_BITS_NFREE_WIDTH + EXTENT_BITS_NFREE_SHIFT) #define EXTENT_BITS_BINSHARD_MASK MASK(EXTENT_BITS_BINSHARD_WIDTH, EXTENT_BITS_BINSHARD_SHIFT) -/* Will make dynamic options. */ -#define OPT_N_BIN_SHARDS (1) -#define OPT_BIN_SHARD_MAXSZIND (0) - #define EXTENT_BITS_SN_SHIFT (EXTENT_BITS_BINSHARD_WIDTH + EXTENT_BITS_BINSHARD_SHIFT) #define EXTENT_BITS_SN_MASK (UINT64_MAX << EXTENT_BITS_SN_SHIFT) diff --git a/src/arena.c b/src/arena.c index 893c9b5..7017bd7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1346,8 +1346,7 @@ bin_t * arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind, unsigned *binshard) { bin_t *bin; - if (binind >= opt_bin_shard_maxszind || tsdn_null(tsdn) || - tsd_arena_get(tsdn_tsd(tsdn)) == NULL) { + if (tsdn_null(tsdn) || tsd_arena_get(tsdn_tsd(tsdn)) == NULL) { *binshard = 0; } else { *binshard = tsd_binshard_get(tsdn_tsd(tsdn)) % @@ -1923,9 +1922,11 @@ arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { } } - size_t arena_size = sizeof(arena_t) + - sizeof(bin_t) * opt_n_bin_shards * opt_bin_shard_maxszind + - sizeof(bin_t) * (SC_NBINS - opt_bin_shard_maxszind); + unsigned nbins_total = 0; + for (i = 0; i < SC_NBINS; i++) { + nbins_total += bin_infos[i].n_shards; + } + size_t arena_size = sizeof(arena_t) + sizeof(bin_t) * nbins_total; arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE); if (arena == NULL) { goto label_error; diff --git a/src/bin.c b/src/bin.c index 8dd964f..bca6b12 100644 --- a/src/bin.c +++ b/src/bin.c @@ -6,13 +6,11 @@ #include "jemalloc/internal/sc.h" #include "jemalloc/internal/witness.h" -unsigned opt_bin_shard_maxszind; -unsigned opt_n_bin_shards; - bin_info_t bin_infos[SC_NBINS]; -void -bin_infos_init(sc_data_t *sc_data, bin_info_t bin_infos[SC_NBINS]) { +static void +bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], + bin_info_t bin_infos[SC_NBINS]) { for (unsigned i = 0; i < SC_NBINS; i++) { bin_info_t *bin_info = &bin_infos[i]; sc_t *sc = &sc_data->sc[i]; @@ -21,19 +19,49 @@ bin_infos_init(sc_data_t *sc_data, bin_info_t bin_infos[SC_NBINS]) { bin_info->slab_size = (sc->pgs << LG_PAGE); bin_info->nregs = (uint32_t)(bin_info->slab_size / bin_info->reg_size); - bin_info->n_shards = (i < opt_bin_shard_maxszind) ? opt_n_bin_shards : 1; + bin_info->n_shards = bin_shard_sizes[i]; bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER( bin_info->nregs); bin_info->bitmap_info = bitmap_info; } } +bool +bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size, + size_t end_size, size_t nshards) { + if (nshards > BIN_SHARDS_MAX || nshards == 0) { + return true; + } + + if (start_size > SC_SMALL_MAXCLASS) { + return false; + } + if (end_size > SC_SMALL_MAXCLASS) { + end_size = SC_SMALL_MAXCLASS; + } + + /* Compute the index since this may happen before sz init. */ + szind_t ind1 = sz_size2index_compute(start_size); + szind_t ind2 = sz_size2index_compute(end_size); + for (unsigned i = ind1; i <= ind2; i++) { + bin_shard_sizes[i] = (unsigned)nshards; + } + + return false; +} + +void +bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) { + /* Load the default number of shards. 
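+	 * These defaults may be overridden per size class by the
+	 * bin_shards option during malloc_conf parsing.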
*/ + for (unsigned i = 0; i < SC_NBINS; i++) { + bin_shard_sizes[i] = N_BIN_SHARDS_DEFAULT; + } +} + void -bin_boot(sc_data_t *sc_data) { +bin_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { assert(sc_data->initialized); - opt_bin_shard_maxszind = OPT_BIN_SHARD_MAXSZIND; - opt_n_bin_shards = OPT_N_BIN_SHARDS; - bin_infos_init(sc_data, bin_infos); + bin_infos_init(sc_data, bin_shard_sizes, bin_infos); } bool diff --git a/src/jemalloc.c b/src/jemalloc.c index c635ecb..1f7ed2e 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -767,9 +767,10 @@ init_opt_stats_print_opts(const char *v, size_t vlen) { assert(opts_len == strlen(opt_stats_print_opts)); } +/* Reads the next size pair in a multi-sized option. */ static bool -malloc_conf_slab_sizes_next(const char **slab_size_segment_cur, - size_t *vlen_left, size_t *slab_start, size_t *slab_end, size_t *pgs) { +malloc_conf_multi_sizes_next(const char **slab_size_segment_cur, + size_t *vlen_left, size_t *slab_start, size_t *slab_end, size_t *new_size) { const char *cur = *slab_size_segment_cur; char *end; uintmax_t um; @@ -797,7 +798,7 @@ malloc_conf_slab_sizes_next(const char **slab_size_segment_cur, if (get_errno() != 0) { return true; } - *pgs = (size_t)um; + *new_size = (size_t)um; /* Consume the separator if there is one. */ if (*end == '|') { @@ -923,7 +924,7 @@ malloc_slow_flag_init(void) { } static void -malloc_conf_init(sc_data_t *sc_data) { +malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { unsigned i; char buf[PATH_MAX + 1]; const char *opts, *k, *v; @@ -1161,6 +1162,28 @@ malloc_conf_init(sc_data_t *sc_data) { } CONF_HANDLE_UNSIGNED(opt_narenas, "narenas", 1, UINT_MAX, yes, no, false) + if (CONF_MATCH("bin_shards")) { + const char *bin_shards_segment_cur = v; + size_t vlen_left = vlen; + do { + size_t size_start; + size_t size_end; + size_t nshards; + bool err = malloc_conf_multi_sizes_next( + &bin_shards_segment_cur, &vlen_left, + &size_start, &size_end, &nshards); + if (err || bin_update_shard_size( + bin_shard_sizes, size_start, + size_end, nshards)) { + malloc_conf_error( + "Invalid settings for " + "bin_shards", k, klen, v, + vlen); + break; + } + } while (vlen_left > 0); + continue; + } CONF_HANDLE_SSIZE_T(opt_dirty_decay_ms, "dirty_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) < QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) : @@ -1256,7 +1279,7 @@ malloc_conf_init(sc_data_t *sc_data) { size_t slab_start; size_t slab_end; size_t pgs; - err = malloc_conf_slab_sizes_next( + err = malloc_conf_multi_sizes_next( &slab_size_segment_cur, &vlen_left, &slab_start, &slab_end, &pgs); @@ -1390,6 +1413,8 @@ malloc_init_hard_a0_locked() { * out of sc_data_global are final. */ sc_boot(&sc_data); + unsigned bin_shard_sizes[SC_NBINS]; + bin_shard_sizes_boot(bin_shard_sizes); /* * prof_boot0 only initializes opt_prof_prefix. We need to do it before * we parse malloc_conf options, in case malloc_conf parsing overwrites @@ -1398,9 +1423,9 @@ malloc_init_hard_a0_locked() { if (config_prof) { prof_boot0(); } - malloc_conf_init(&sc_data); + malloc_conf_init(&sc_data, bin_shard_sizes); sz_boot(&sc_data); - bin_boot(&sc_data); + bin_boot(&sc_data, bin_shard_sizes); if (opt_stats_print) { /* Print statistics at exit. */ -- cgit v0.12 From 45bb4483baef0f9bb1362349d9838ee041c42754 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Wed, 21 Nov 2018 11:17:31 -0800 Subject: Add stats for arenas.bin.i.nshards. 
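
The new value is also readable programmatically. A minimal read-side sketch
(hypothetical caller code, not part of this patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int main(void) {
        uint32_t nshards;
        size_t sz = sizeof(nshards);
        /* Number of shards configured for the smallest size class. */
        if (mallctl("arenas.bin.0.nshards", &nshards, &sz, NULL, 0) == 0) {
            printf("arenas.bin.0.nshards: %u\n", nshards);
        }
        return 0;
    }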
--- src/ctl.c | 5 ++++- src/stats.c | 9 ++++++++- test/unit/mallctl.c | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/ctl.c b/src/ctl.c index 72ad587..a150891 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -127,6 +127,7 @@ INDEX_PROTO(arena_i) CTL_PROTO(arenas_bin_i_size) CTL_PROTO(arenas_bin_i_nregs) CTL_PROTO(arenas_bin_i_slab_size) +CTL_PROTO(arenas_bin_i_nshards) INDEX_PROTO(arenas_bin_i) CTL_PROTO(arenas_lextent_i_size) INDEX_PROTO(arenas_lextent_i) @@ -355,7 +356,8 @@ static const ctl_indexed_node_t arena_node[] = { static const ctl_named_node_t arenas_bin_i_node[] = { {NAME("size"), CTL(arenas_bin_i_size)}, {NAME("nregs"), CTL(arenas_bin_i_nregs)}, - {NAME("slab_size"), CTL(arenas_bin_i_slab_size)} + {NAME("slab_size"), CTL(arenas_bin_i_slab_size)}, + {NAME("nshards"), CTL(arenas_bin_i_nshards)} }; static const ctl_named_node_t super_arenas_bin_i_node[] = { {NAME(""), CHILD(named, arenas_bin_i)} @@ -2490,6 +2492,7 @@ CTL_RO_NL_GEN(arenas_nhbins, nhbins, unsigned) CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t) CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t) CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nshards, bin_infos[mib[2]].n_shards, uint32_t) static const ctl_named_node_t * arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { diff --git a/src/stats.c b/src/stats.c index e4e1337..e2a1100 100644 --- a/src/stats.c +++ b/src/stats.c @@ -249,6 +249,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { COL(nmalloc, right, 13, uint64) COL(ndalloc, right, 13, uint64) COL(nrequests, right, 13, uint64) + COL(nshards, right, 9, unsigned) COL(curregs, right, 13, size) COL(curslabs, right, 13, size) COL(regs, right, 5, unsigned) @@ -293,7 +294,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { uint64_t nslabs; size_t reg_size, slab_size, curregs; size_t curslabs; - uint32_t nregs; + uint32_t nregs, nshards; uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes; uint64_t nreslabs; @@ -310,6 +311,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { CTL_M2_GET("arenas.bin.0.size", j, ®_size, size_t); CTL_M2_GET("arenas.bin.0.nregs", j, &nregs, uint32_t); CTL_M2_GET("arenas.bin.0.slab_size", j, &slab_size, size_t); + CTL_M2_GET("arenas.bin.0.nshards", j, &nshards, uint32_t); CTL_M2_M4_GET("stats.arenas.0.bins.0.nmalloc", i, j, &nmalloc, uint64_t); @@ -383,6 +385,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { col_nmalloc.uint64_val = nmalloc; col_ndalloc.uint64_val = ndalloc; col_nrequests.uint64_val = nrequests; + col_nshards.unsigned_val = nshards; col_curregs.size_val = curregs; col_curslabs.size_val = curslabs; col_regs.unsigned_val = nregs; @@ -1143,6 +1146,10 @@ stats_general_print(emitter_t *emitter) { emitter_json_kv(emitter, "slab_size", emitter_type_size, &sv); + CTL_M2_GET("arenas.bin.0.nshards", i, &u32v, uint32_t); + emitter_json_kv(emitter, "nshards", emitter_type_uint32, + &u32v); + emitter_json_object_end(emitter); } emitter_json_array_end(emitter); /* Close "bin". 
*/ diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 452d884..039a881 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -706,6 +706,7 @@ TEST_BEGIN(test_arenas_bin_constants) { TEST_ARENAS_BIN_CONSTANT(uint32_t, nregs, bin_infos[0].nregs); TEST_ARENAS_BIN_CONSTANT(size_t, slab_size, bin_infos[0].slab_size); + TEST_ARENAS_BIN_CONSTANT(uint32_t, nshards, bin_infos[0].n_shards); #undef TEST_ARENAS_BIN_CONSTANT } -- cgit v0.12 From 98b56ab23dd4d3dc826f06906e6c51c9c9d4d52a Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 27 Nov 2018 12:38:47 -0800 Subject: Store the bin shard selection in TSD. This avoids having to choose bin shard on the fly, also will allow flexible bin binding for each thread. --- include/jemalloc/internal/bin.h | 4 +--- include/jemalloc/internal/bin_types.h | 17 +++++++++++++++++ include/jemalloc/internal/tsd.h | 5 +++-- src/arena.c | 3 +-- src/jemalloc.c | 11 ++++++++--- 5 files changed, 30 insertions(+), 10 deletions(-) create mode 100644 include/jemalloc/internal/bin_types.h diff --git a/include/jemalloc/internal/bin.h b/include/jemalloc/internal/bin.h index baa0acf..f542c88 100644 --- a/include/jemalloc/internal/bin.h +++ b/include/jemalloc/internal/bin.h @@ -2,14 +2,12 @@ #define JEMALLOC_INTERNAL_BIN_H #include "jemalloc/internal/bin_stats.h" +#include "jemalloc/internal/bin_types.h" #include "jemalloc/internal/extent_types.h" #include "jemalloc/internal/extent_structs.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/sc.h" -#define BIN_SHARDS_MAX (1 << EXTENT_BITS_BINSHARD_WIDTH) -#define N_BIN_SHARDS_DEFAULT 1 - /* * A bin contains a set of extents that are currently being used for slab * allocations. diff --git a/include/jemalloc/internal/bin_types.h b/include/jemalloc/internal/bin_types.h new file mode 100644 index 0000000..3533606 --- /dev/null +++ b/include/jemalloc/internal/bin_types.h @@ -0,0 +1,17 @@ +#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H +#define JEMALLOC_INTERNAL_BIN_TYPES_H + +#include "jemalloc/internal/sc.h" + +#define BIN_SHARDS_MAX (1 << EXTENT_BITS_BINSHARD_WIDTH) +#define N_BIN_SHARDS_DEFAULT 1 + +/* Used in TSD static initializer only. Real init in arena_bind(). 
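+ * There, each size class gets shard % n_shards from a per-arena counter.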
*/ +#define TSD_BINSHARDS_ZERO_INITIALIZER {{UINT8_MAX}} + +typedef struct tsd_binshards_s tsd_binshards_t; +struct tsd_binshards_s { + uint8_t binshard[SC_NBINS]; +}; + +#endif /* JEMALLOC_INTERNAL_BIN_TYPES_H */ diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 4dc2274..00a9500 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -3,6 +3,7 @@ #include "jemalloc/internal/arena_types.h" #include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bin_types.h" #include "jemalloc/internal/jemalloc_internal_externs.h" #include "jemalloc/internal/prof_types.h" #include "jemalloc/internal/ql.h" @@ -74,7 +75,7 @@ typedef void (*test_callback_t)(int *); O(iarena, arena_t *, arena_t *) \ O(arena, arena_t *, arena_t *) \ O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\ - O(binshard, unsigned, unsigned) \ + O(binshards, tsd_binshards_t, tsd_binshards_t)\ O(tcache, tcache_t, tcache_t) \ O(witness_tsd, witness_tsd_t, witness_tsdn_t) \ MALLOC_TEST_TSD @@ -94,7 +95,7 @@ typedef void (*test_callback_t)(int *); NULL, \ NULL, \ NULL, \ - ((unsigned)-1), \ + TSD_BINSHARDS_ZERO_INITIALIZER, \ TCACHE_ZERO_INITIALIZER, \ WITNESS_TSD_INITIALIZER \ MALLOC_TEST_TSD_INITIALIZER \ diff --git a/src/arena.c b/src/arena.c index 7017bd7..d34de85 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1349,8 +1349,7 @@ arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind, if (tsdn_null(tsdn) || tsd_arena_get(tsdn_tsd(tsdn)) == NULL) { *binshard = 0; } else { - *binshard = tsd_binshard_get(tsdn_tsd(tsdn)) % - bin_infos[binind].n_shards; + *binshard = tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind]; } assert(*binshard < bin_infos[binind].n_shards); bin = &arena->bins[binind].bin_shards[*binshard]; diff --git a/src/jemalloc.c b/src/jemalloc.c index 1f7ed2e..1620d0d 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -379,9 +379,14 @@ arena_bind(tsd_t *tsd, unsigned ind, bool internal) { tsd_iarena_set(tsd, arena); } else { tsd_arena_set(tsd, arena); - unsigned binshard = atomic_fetch_add_u(&arena->binshard_next, 1, - ATOMIC_RELAXED) % BIN_SHARDS_MAX; - tsd_binshard_set(tsd, binshard); + unsigned shard = atomic_fetch_add_u(&arena->binshard_next, 1, + ATOMIC_RELAXED); + tsd_binshards_t *bins = tsd_binshardsp_get(tsd); + for (unsigned i = 0; i < SC_NBINS; i++) { + assert(bin_infos[i].n_shards > 0 && + bin_infos[i].n_shards <= BIN_SHARDS_MAX); + bins->binshard[i] = shard % bin_infos[i].n_shards; + } } } -- cgit v0.12 From 711a61f3b41880718eb23fcfdd572d0daa5fb6ca Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Wed, 28 Nov 2018 16:23:18 -0800 Subject: Add unit test for sharded bins. 
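
The test runs with "narenas:1,bin_shards:1-160:16|129-512:4|256-256:8".
Each segment has the form <size_start>-<size_end>:<nshards>, and later
segments overwrite earlier ones where the size ranges overlap, so the
settings the test expects are: 16 shards for size classes up to 128 bytes,
4 shards for 129-512 bytes, and 8 shards for the 256-byte bin.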
--- Makefile.in | 1 + test/unit/binshard.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++ test/unit/binshard.sh | 3 ++ 3 files changed, 107 insertions(+) create mode 100644 test/unit/binshard.c create mode 100644 test/unit/binshard.sh diff --git a/Makefile.in b/Makefile.in index c9bd95a..31a9cea 100644 --- a/Makefile.in +++ b/Makefile.in @@ -169,6 +169,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/base.c \ $(srcroot)test/unit/bitmap.c \ $(srcroot)test/unit/bit_util.c \ + $(srcroot)test/unit/binshard.c \ $(srcroot)test/unit/ckh.c \ $(srcroot)test/unit/decay.c \ $(srcroot)test/unit/div.c \ diff --git a/test/unit/binshard.c b/test/unit/binshard.c new file mode 100644 index 0000000..829ba43 --- /dev/null +++ b/test/unit/binshard.c @@ -0,0 +1,103 @@ +#include "test/jemalloc_test.h" + +/* Config -- "narenas:1,bin_shards:1-160:16|129-512:4|256-256:8" */ + +static void * +thd_start(void *varg) { + void *ptr, *ptr2; + extent_t *extent; + unsigned shard1, shard2; + + tsdn_t *tsdn = tsdn_fetch(); + /* Try triggering allocations from sharded bins. */ + for (unsigned i = 0; i < 1024; i++) { + ptr = mallocx(1, MALLOCX_TCACHE_NONE); + ptr2 = mallocx(129, MALLOCX_TCACHE_NONE); + + extent = iealloc(tsdn, ptr); + shard1 = extent_binshard_get(extent); + dallocx(ptr, 0); + assert_u_lt(shard1, 16, "Unexpected bin shard used"); + + extent = iealloc(tsdn, ptr2); + shard2 = extent_binshard_get(extent); + dallocx(ptr2, 0); + assert_u_lt(shard2, 4, "Unexpected bin shard used"); + + if (shard1 > 0 || shard2 > 0) { + /* Triggered sharded bin usage. */ + return (void *)(uintptr_t)shard1; + } + } + + return NULL; +} + +TEST_BEGIN(test_bin_shard_mt) { +#define NTHREADS 16 + thd_t thds[NTHREADS]; + unsigned i; + for (i = 0; i < NTHREADS; i++) { + thd_create(&thds[i], thd_start, NULL); + } + bool sharded = false; + for (i = 0; i < NTHREADS; i++) { + void *ret; + thd_join(thds[i], &ret); + if (ret != NULL) { + sharded = true; + } + } + assert_b_eq(sharded, true, "Did not find sharded bins"); +} +TEST_END + +TEST_BEGIN(test_bin_shard) { + unsigned nbins, i; + size_t mib[4], mib2[4]; + size_t miblen, miblen2, len; + + len = sizeof(nbins); + assert_d_eq(mallctl("arenas.nbins", (void *)&nbins, &len, NULL, 0), 0, + "Unexpected mallctl() failure"); + + miblen = 4; + assert_d_eq(mallctlnametomib("arenas.bin.0.nshards", mib, &miblen), 0, + "Unexpected mallctlnametomib() failure"); + miblen2 = 4; + assert_d_eq(mallctlnametomib("arenas.bin.0.size", mib2, &miblen2), 0, + "Unexpected mallctlnametomib() failure"); + + for (i = 0; i < nbins; i++) { + uint32_t nshards; + size_t size, sz1, sz2; + + mib[2] = i; + sz1 = sizeof(nshards); + assert_d_eq(mallctlbymib(mib, miblen, (void *)&nshards, &sz1, + NULL, 0), 0, "Unexpected mallctlbymib() failure"); + + mib2[2] = i; + sz2 = sizeof(size); + assert_d_eq(mallctlbymib(mib2, miblen2, (void *)&size, &sz2, + NULL, 0), 0, "Unexpected mallctlbymib() failure"); + + if (size >= 1 && size <= 128) { + assert_u_eq(nshards, 16, "Unexpected nshards"); + } else if (size == 256) { + assert_u_eq(nshards, 8, "Unexpected nshards"); + } else if (size > 128 && size <= 512) { + assert_u_eq(nshards, 4, "Unexpected nshards"); + } else { + assert_u_eq(nshards, 1, "Unexpected nshards"); + } + } +} +TEST_END + +int +main(void) { + return test_no_reentrancy( + test_bin_shard, + test_bin_shard_mt); +} diff --git a/test/unit/binshard.sh b/test/unit/binshard.sh new file mode 100644 index 0000000..c1d58c8 --- /dev/null +++ b/test/unit/binshard.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +export 
MALLOC_CONF="narenas:1,bin_shards:1-160:16|129-512:4|256-256:8"
--
cgit v0.12


From 99f4eefb61ae1f13e47af6eac34748fd0a789404 Mon Sep 17 00:00:00 2001
From: Qi Wang
Date: Fri, 7 Dec 2018 18:06:04 -0800
Subject: Fix incorrect stats merging with sharded bins.

With sharded bins, we may not flush all items from the same arena in one
run. Adjust the stats merging logic accordingly.
---
 src/tcache.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/tcache.c b/src/tcache.c
index 51e3131..92be273 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -134,8 +134,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin,
 		}

 		malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
-		if (config_stats && bin_arena == arena) {
-			assert(!merged_stats);
+		if (config_stats && bin_arena == arena && !merged_stats) {
 			merged_stats = true;
 			bin->stats.nflushes++;
 			bin->stats.nrequests += tbin->tstats.nrequests;
--
cgit v0.12


From 36de5189c70fee959ebcdfadd8dfa374ff430de5 Mon Sep 17 00:00:00 2001
From: Alexander Zinoviev
Date: Mon, 10 Dec 2018 11:29:44 -0800
Subject: Add rate counters to stats
---
 include/jemalloc/internal/emitter.h    |   2 +
 include/jemalloc/internal/mutex_prof.h |  25 +-
 src/stats.c                            | 454 +++++++++++++++++----------------
 3 files changed, 249 insertions(+), 232 deletions(-)

diff --git a/include/jemalloc/internal/emitter.h b/include/jemalloc/internal/emitter.h
index f8da228..0a8bc2c 100644
--- a/include/jemalloc/internal/emitter.h
+++ b/include/jemalloc/internal/emitter.h
@@ -45,7 +45,9 @@ struct emitter_col_s {
 		int int_val;
 		unsigned unsigned_val;
 		uint32_t uint32_val;
+		uint32_t uint32_t_val;
 		uint64_t uint64_val;
+		uint64_t uint64_t_val;
 		size_t size_val;
 		ssize_t ssize_val;
 		const char *str_val;
diff --git a/include/jemalloc/internal/mutex_prof.h b/include/jemalloc/internal/mutex_prof.h
index ce183d3..2cb8fb0 100644
--- a/include/jemalloc/internal/mutex_prof.h
+++ b/include/jemalloc/internal/mutex_prof.h
@@ -35,22 +35,31 @@ typedef enum {
 	mutex_prof_num_arena_mutexes
 } mutex_prof_arena_ind_t;

+/*
+ * The fourth parameter is a boolean value that is true for derived rate counters
+ * and false for real ones.
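+ * For example, num_ops_ps is derived from num_ops: it is recomputed as
+ * num_ops divided by the uptime in seconds rather than read from the
+ * mutex counters.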
+ */ #define MUTEX_PROF_UINT64_COUNTERS \ - OP(num_ops, uint64_t, "n_lock_ops") \ - OP(num_wait, uint64_t, "n_waiting") \ - OP(num_spin_acq, uint64_t, "n_spin_acq") \ - OP(num_owner_switch, uint64_t, "n_owner_switch") \ - OP(total_wait_time, uint64_t, "total_wait_ns") \ - OP(max_wait_time, uint64_t, "max_wait_ns") + OP(num_ops, uint64_t, "n_lock_ops", false, num_ops) \ + OP(num_ops_ps, uint64_t, "(#/sec)", true, num_ops) \ + OP(num_wait, uint64_t, "n_waiting", false, num_wait) \ + OP(num_wait_ps, uint64_t, "(#/sec)", true, num_wait) \ + OP(num_spin_acq, uint64_t, "n_spin_acq", false, num_spin_acq) \ + OP(num_spin_acq_ps, uint64_t, "(#/sec)", true, num_spin_acq) \ + OP(num_owner_switch, uint64_t, "n_owner_switch", false, num_owner_switch) \ + OP(num_owner_switch_ps, uint64_t, "(#/sec)", true, num_owner_switch) \ + OP(total_wait_time, uint64_t, "total_wait_ns", false, total_wait_time) \ + OP(total_wait_time_ps, uint64_t, "(#/sec)", true, total_wait_time) \ + OP(max_wait_time, uint64_t, "max_wait_ns", false, max_wait_time) #define MUTEX_PROF_UINT32_COUNTERS \ - OP(max_num_thds, uint32_t, "max_n_thds") + OP(max_num_thds, uint32_t, "max_n_thds", false, max_num_thds) #define MUTEX_PROF_COUNTERS \ MUTEX_PROF_UINT64_COUNTERS \ MUTEX_PROF_UINT32_COUNTERS -#define OP(counter, type, human) mutex_counter_##counter, +#define OP(counter, type, human, derived, base_counter) mutex_counter_##counter, #define COUNTER_ENUM(counter_list, t) \ typedef enum { \ diff --git a/src/stats.c b/src/stats.c index e2a1100..f105e26 100644 --- a/src/stats.c +++ b/src/stats.c @@ -52,6 +52,20 @@ char opt_stats_print_opts[stats_print_tot_num_options+1] = ""; /******************************************************************************/ +static uint64_t +rate_per_second(uint64_t value, uint64_t uptime_ns) { + uint64_t billion = 1000000000; + if (uptime_ns == 0 || value == 0) { + return 0; + } + if (uptime_ns < billion) { + return value; + } else { + uint64_t uptime_s = uptime_ns / billion; + return value / uptime_s; + } +} + /* Calculate x.yyy and output a string (takes a fixed sized char array). */ static bool get_rate_str(uint64_t dividend, uint64_t divisor, char str[6]) { @@ -104,12 +118,12 @@ mutex_stats_init_cols(emitter_row_t *row, const char *table_name, #define WIDTH_uint32_t 12 #define WIDTH_uint64_t 16 -#define OP(counter, counter_type, human) \ +#define OP(counter, counter_type, human, derived, base_counter) \ col = &col_##counter_type[k_##counter_type]; \ ++k_##counter_type; \ emitter_col_init(col, row); \ col->justify = emitter_justify_right; \ - col->width = WIDTH_##counter_type; \ + col->width = derived ? 
8 : WIDTH_##counter_type; \ col->type = emitter_type_title; \ col->str_val = human; MUTEX_PROF_COUNTERS @@ -121,7 +135,8 @@ mutex_stats_init_cols(emitter_row_t *row, const char *table_name, static void mutex_stats_read_global(const char *name, emitter_col_t *col_name, emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], - emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) { + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { char cmd[MUTEX_CTL_STR_MAX_LENGTH]; col_name->str_val = name; @@ -129,12 +144,17 @@ mutex_stats_read_global(const char *name, emitter_col_t *col_name, emitter_col_t *dst; #define EMITTER_TYPE_uint32_t emitter_type_uint32 #define EMITTER_TYPE_uint64_t emitter_type_uint64 -#define OP(counter, counter_type, human) \ +#define OP(counter, counter_type, human, derived, base_counter) \ dst = &col_##counter_type[mutex_counter_##counter]; \ dst->type = EMITTER_TYPE_##counter_type; \ - gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH, \ - "mutexes", name, #counter); \ - CTL_GET(cmd, (counter_type *)&dst->bool_val, counter_type); + if (!derived) { \ + gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH, \ + "mutexes", name, #counter); \ + CTL_GET(cmd, (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \ + } MUTEX_PROF_COUNTERS #undef OP #undef EMITTER_TYPE_uint32_t @@ -145,7 +165,8 @@ static void mutex_stats_read_arena(unsigned arena_ind, mutex_prof_arena_ind_t mutex_ind, const char *name, emitter_col_t *col_name, emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], - emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) { + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { char cmd[MUTEX_CTL_STR_MAX_LENGTH]; col_name->str_val = name; @@ -153,13 +174,17 @@ mutex_stats_read_arena(unsigned arena_ind, mutex_prof_arena_ind_t mutex_ind, emitter_col_t *dst; #define EMITTER_TYPE_uint32_t emitter_type_uint32 #define EMITTER_TYPE_uint64_t emitter_type_uint64 -#define OP(counter, counter_type, human) \ +#define OP(counter, counter_type, human, derived, base_counter) \ dst = &col_##counter_type[mutex_counter_##counter]; \ dst->type = EMITTER_TYPE_##counter_type; \ - gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH, \ - "arenas.0.mutexes", arena_mutex_names[mutex_ind], #counter);\ - CTL_M2_GET(cmd, arena_ind, \ - (counter_type *)&dst->bool_val, counter_type); + if (!derived) { \ + gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH, \ + "arenas.0.mutexes", arena_mutex_names[mutex_ind], #counter);\ + CTL_M2_GET(cmd, arena_ind, (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \ + } MUTEX_PROF_COUNTERS #undef OP #undef EMITTER_TYPE_uint32_t @@ -169,19 +194,25 @@ mutex_stats_read_arena(unsigned arena_ind, mutex_prof_arena_ind_t mutex_ind, static void mutex_stats_read_arena_bin(unsigned arena_ind, unsigned bin_ind, emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], - emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) { + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { char cmd[MUTEX_CTL_STR_MAX_LENGTH]; emitter_col_t *dst; #define EMITTER_TYPE_uint32_t emitter_type_uint32 #define EMITTER_TYPE_uint64_t 
emitter_type_uint64 -#define OP(counter, counter_type, human) \ +#define OP(counter, counter_type, human, derived, base_counter) \ dst = &col_##counter_type[mutex_counter_##counter]; \ dst->type = EMITTER_TYPE_##counter_type; \ - gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH, \ - "arenas.0.bins.0","mutex", #counter); \ - CTL_M2_M4_GET(cmd, arena_ind, bin_ind, \ - (counter_type *)&dst->bool_val, counter_type); + if (!derived) { \ + gen_mutex_ctl_str(cmd, MUTEX_CTL_STR_MAX_LENGTH, \ + "arenas.0.bins.0","mutex", #counter); \ + CTL_M2_M4_GET(cmd, arena_ind, bin_ind, \ + (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = rate_per_second(base->counter_type##_val, uptime); \ + } MUTEX_PROF_COUNTERS #undef OP #undef EMITTER_TYPE_uint32_t @@ -204,19 +235,38 @@ mutex_stats_emit(emitter_t *emitter, emitter_row_t *row, #define EMITTER_TYPE_uint32_t emitter_type_uint32 #define EMITTER_TYPE_uint64_t emitter_type_uint64 -#define OP(counter, type, human) \ - col = &col_##type[k_##type]; \ - ++k_##type; \ - emitter_json_kv(emitter, #counter, EMITTER_TYPE_##type, \ - (const void *)&col->bool_val); +#define OP(counter, type, human, derived, base_counter) \ + if (!derived) { \ + col = &col_##type[k_##type]; \ + ++k_##type; \ + emitter_json_kv(emitter, #counter, EMITTER_TYPE_##type, \ + (const void *)&col->bool_val); \ + } MUTEX_PROF_COUNTERS; #undef OP #undef EMITTER_TYPE_uint32_t #undef EMITTER_TYPE_uint64_t } +#define COL(row_name, column_name, left_or_right, col_width, etype) \ + emitter_col_t col_##column_name; \ + emitter_col_init(&col_##column_name, &row_name); \ + col_##column_name.justify = emitter_justify_##left_or_right; \ + col_##column_name.width = col_width; \ + col_##column_name.type = emitter_type_##etype; + +#define COL_HDR(row_name, column_name, human, left_or_right, col_width, etype) \ + COL(row_name, column_name, left_or_right, col_width, etype) \ + emitter_col_t header_##column_name; \ + emitter_col_init(&header_##column_name, &header_##row_name); \ + header_##column_name.justify = emitter_justify_##left_or_right; \ + header_##column_name.width = col_width; \ + header_##column_name.type = emitter_type_title; \ + header_##column_name.str_val = human ? 
human : #column_name; + + static void -stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { +stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i, uint64_t uptime) { size_t page; bool in_gap, in_gap_prev; unsigned nbins, j; @@ -230,44 +280,36 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { emitter_row_t row; emitter_row_init(&row); -#define COL(name, left_or_right, col_width, etype) \ - emitter_col_t col_##name; \ - emitter_col_init(&col_##name, &row); \ - col_##name.justify = emitter_justify_##left_or_right; \ - col_##name.width = col_width; \ - col_##name.type = emitter_type_##etype; \ - emitter_col_t header_col_##name; \ - emitter_col_init(&header_col_##name, &header_row); \ - header_col_##name.justify = emitter_justify_##left_or_right; \ - header_col_##name.width = col_width; \ - header_col_##name.type = emitter_type_title; \ - header_col_##name.str_val = #name; - - COL(size, right, 20, size) - COL(ind, right, 4, unsigned) - COL(allocated, right, 13, uint64) - COL(nmalloc, right, 13, uint64) - COL(ndalloc, right, 13, uint64) - COL(nrequests, right, 13, uint64) - COL(nshards, right, 9, unsigned) - COL(curregs, right, 13, size) - COL(curslabs, right, 13, size) - COL(regs, right, 5, unsigned) - COL(pgs, right, 4, size) + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, allocated, NULL, right, 13, uint64) + COL_HDR(row, nmalloc, NULL, right, 13, uint64) + COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, ndalloc, NULL, right, 13, uint64) + COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nrequests, NULL, right, 13, uint64) + COL_HDR(row, nrequests_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nshards, NULL, right, 9, unsigned) + COL_HDR(row, curregs, NULL, right, 13, size) + COL_HDR(row, curslabs, NULL, right, 13, size) + COL_HDR(row, regs, NULL, right, 5, unsigned) + COL_HDR(row, pgs, NULL, right, 4, size) /* To buffer a right- and left-justified column. */ - COL(justify_spacer, right, 1, title) - COL(util, right, 6, title) - COL(nfills, right, 13, uint64) - COL(nflushes, right, 13, uint64) - COL(nslabs, right, 13, uint64) - COL(nreslabs, right, 13, uint64) -#undef COL + COL_HDR(row, justify_spacer, NULL, right, 1, title) + COL_HDR(row, util, NULL, right, 6, title) + COL_HDR(row, nfills, NULL, right, 13, uint64) + COL_HDR(row, nfills_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nflushes, NULL, right, 13, uint64) + COL_HDR(row, nflushes_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nslabs, NULL, right, 13, uint64) + COL_HDR(row, nreslabs, NULL, right, 13, uint64) + COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64) /* Don't want to actually print the name. */ - header_col_justify_spacer.str_val = " "; + header_justify_spacer.str_val = " "; col_justify_spacer.str_val = " "; - emitter_col_t col_mutex64[mutex_prof_num_uint64_t_counters]; emitter_col_t col_mutex32[mutex_prof_num_uint32_t_counters]; @@ -285,7 +327,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { * We print a "bins:" header as part of the table row; we need to adjust * the header size column to compensate. 
*/ - header_col_size.width -=5; + header_size.width -=5; emitter_table_printf(emitter, "bins:"); emitter_table_row(emitter, &header_row); emitter_json_array_kv_begin(emitter, "bins"); @@ -332,7 +374,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { if (mutex) { mutex_stats_read_arena_bin(i, j, col_mutex64, - col_mutex32); + col_mutex32, uptime); } emitter_json_object_begin(emitter); @@ -383,8 +425,11 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { col_ind.unsigned_val = j; col_allocated.size_val = curregs * reg_size; col_nmalloc.uint64_val = nmalloc; + col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime); col_ndalloc.uint64_val = ndalloc; + col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime); col_nrequests.uint64_val = nrequests; + col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime); col_nshards.unsigned_val = nshards; col_curregs.size_val = curregs; col_curslabs.size_val = curslabs; @@ -392,9 +437,12 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { col_pgs.size_val = slab_size / page; col_util.str_val = util; col_nfills.uint64_val = nfills; + col_nfills_ps.uint64_val = rate_per_second(nfills, uptime); col_nflushes.uint64_val = nflushes; + col_nflushes_ps.uint64_val = rate_per_second(nflushes, uptime); col_nslabs.uint64_val = nslabs; col_nreslabs.uint64_val = nreslabs; + col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime); /* * Note that mutex columns were initialized above, if mutex == @@ -411,7 +459,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i) { } static void -stats_arena_lextents_print(emitter_t *emitter, unsigned i) { +stats_arena_lextents_print(emitter_t *emitter, unsigned i, uint64_t uptime) { unsigned nbins, nlextents, j; bool in_gap, in_gap_prev; @@ -423,28 +471,16 @@ stats_arena_lextents_print(emitter_t *emitter, unsigned i) { emitter_row_t row; emitter_row_init(&row); -#define COL(name, left_or_right, col_width, etype) \ - emitter_col_t header_##name; \ - emitter_col_init(&header_##name, &header_row); \ - header_##name.justify = emitter_justify_##left_or_right; \ - header_##name.width = col_width; \ - header_##name.type = emitter_type_title; \ - header_##name.str_val = #name; \ - \ - emitter_col_t col_##name; \ - emitter_col_init(&col_##name, &row); \ - col_##name.justify = emitter_justify_##left_or_right; \ - col_##name.width = col_width; \ - col_##name.type = emitter_type_##etype; - - COL(size, right, 20, size) - COL(ind, right, 4, unsigned) - COL(allocated, right, 13, size) - COL(nmalloc, right, 13, uint64) - COL(ndalloc, right, 13, uint64) - COL(nrequests, right, 13, uint64) - COL(curlextents, right, 13, size) -#undef COL + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, allocated, NULL, right, 13, size) + COL_HDR(row, nmalloc, NULL, right, 13, uint64) + COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, ndalloc, NULL, right, 13, uint64) + COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nrequests, NULL, right, 13, uint64) + COL_HDR(row, nrequests_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, curlextents, NULL, right, 13, size) /* As with bins, we label the large extents table. 
*/ header_size.width -= 6; @@ -483,8 +519,11 @@ stats_arena_lextents_print(emitter_t *emitter, unsigned i) { col_ind.unsigned_val = nbins + j; col_allocated.size_val = curlextents * lextent_size; col_nmalloc.uint64_val = nmalloc; + col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime); col_ndalloc.uint64_val = ndalloc; + col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime); col_nrequests.uint64_val = nrequests; + col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime); col_curlextents.size_val = curlextents; if (!in_gap) { @@ -505,31 +544,17 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) { emitter_row_init(&header_row); emitter_row_t row; emitter_row_init(&row); -#define COL(name, left_or_right, col_width, etype) \ - emitter_col_t header_##name; \ - emitter_col_init(&header_##name, &header_row); \ - header_##name.justify = emitter_justify_##left_or_right; \ - header_##name.width = col_width; \ - header_##name.type = emitter_type_title; \ - header_##name.str_val = #name; \ - \ - emitter_col_t col_##name; \ - emitter_col_init(&col_##name, &row); \ - col_##name.justify = emitter_justify_##left_or_right; \ - col_##name.width = col_width; \ - col_##name.type = emitter_type_##etype; - - COL(size, right, 20, size) - COL(ind, right, 4, unsigned) - COL(ndirty, right, 13, size) - COL(dirty, right, 13, size) - COL(nmuzzy, right, 13, size) - COL(muzzy, right, 13, size) - COL(nretained, right, 13, size) - COL(retained, right, 13, size) - COL(ntotal, right, 13, size) - COL(total, right, 13, size) -#undef COL + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, ndirty, NULL, right, 13, size) + COL_HDR(row, dirty, NULL, right, 13, size) + COL_HDR(row, nmuzzy, NULL, right, 13, size) + COL_HDR(row, muzzy, NULL, right, 13, size) + COL_HDR(row, nretained, NULL, right, 13, size) + COL_HDR(row, retained, NULL, right, 13, size) + COL_HDR(row, ntotal, NULL, right, 13, size) + COL_HDR(row, total, NULL, right, 13, size) /* Label this section. */ header_size.width -= 8; @@ -600,7 +625,7 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) { } static void -stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind) { +stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind, uint64_t uptime) { emitter_row_t row; emitter_col_t col_name; emitter_col_t col64[mutex_prof_num_uint64_t_counters]; @@ -617,7 +642,7 @@ stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind) { const char *name = arena_mutex_names[i]; emitter_json_object_kv_begin(emitter, name); mutex_stats_read_arena(arena_ind, i, name, &col_name, col64, - col32); + col32, uptime); mutex_stats_emit(emitter, &row, col64, col32); emitter_json_object_end(emitter); /* Close the mutex dict. */ } @@ -699,98 +724,74 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, &muzzy_purged); /* Table-style emission. 
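	 * (the decay columns below are declared with the shared COL() helper)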
*/ - emitter_col_t decay_type; - emitter_col_init(&decay_type, &decay_row); - decay_type.justify = emitter_justify_right; - decay_type.width = 9; - decay_type.type = emitter_type_title; - decay_type.str_val = "decaying:"; - - emitter_col_t decay_time; - emitter_col_init(&decay_time, &decay_row); - decay_time.justify = emitter_justify_right; - decay_time.width = 6; - decay_time.type = emitter_type_title; - decay_time.str_val = "time"; - - emitter_col_t decay_npages; - emitter_col_init(&decay_npages, &decay_row); - decay_npages.justify = emitter_justify_right; - decay_npages.width = 13; - decay_npages.type = emitter_type_title; - decay_npages.str_val = "npages"; - - emitter_col_t decay_sweeps; - emitter_col_init(&decay_sweeps, &decay_row); - decay_sweeps.justify = emitter_justify_right; - decay_sweeps.width = 13; - decay_sweeps.type = emitter_type_title; - decay_sweeps.str_val = "sweeps"; - - emitter_col_t decay_madvises; - emitter_col_init(&decay_madvises, &decay_row); - decay_madvises.justify = emitter_justify_right; - decay_madvises.width = 13; - decay_madvises.type = emitter_type_title; - decay_madvises.str_val = "madvises"; - - emitter_col_t decay_purged; - emitter_col_init(&decay_purged, &decay_row); - decay_purged.justify = emitter_justify_right; - decay_purged.width = 13; - decay_purged.type = emitter_type_title; - decay_purged.str_val = "purged"; + COL(decay_row, decay_type, right, 9, title); + col_decay_type.str_val = "decaying:"; + + COL(decay_row, decay_time, right, 6, title); + col_decay_time.str_val = "time"; + + COL(decay_row, decay_npages, right, 13, title); + col_decay_npages.str_val = "npages"; + + COL(decay_row, decay_sweeps, right, 13, title); + col_decay_sweeps.str_val = "sweeps"; + + COL(decay_row, decay_madvises, right, 13, title); + col_decay_madvises.str_val = "madvises"; + + COL(decay_row, decay_purged, right, 13, title); + col_decay_purged.str_val = "purged"; /* Title row. */ emitter_table_row(emitter, &decay_row); /* Dirty row. */ - decay_type.str_val = "dirty:"; + col_decay_type.str_val = "dirty:"; if (dirty_decay_ms >= 0) { - decay_time.type = emitter_type_ssize; - decay_time.ssize_val = dirty_decay_ms; + col_decay_time.type = emitter_type_ssize; + col_decay_time.ssize_val = dirty_decay_ms; } else { - decay_time.type = emitter_type_title; - decay_time.str_val = "N/A"; + col_decay_time.type = emitter_type_title; + col_decay_time.str_val = "N/A"; } - decay_npages.type = emitter_type_size; - decay_npages.size_val = pdirty; + col_decay_npages.type = emitter_type_size; + col_decay_npages.size_val = pdirty; - decay_sweeps.type = emitter_type_uint64; - decay_sweeps.uint64_val = dirty_npurge; + col_decay_sweeps.type = emitter_type_uint64; + col_decay_sweeps.uint64_val = dirty_npurge; - decay_madvises.type = emitter_type_uint64; - decay_madvises.uint64_val = dirty_nmadvise; + col_decay_madvises.type = emitter_type_uint64; + col_decay_madvises.uint64_val = dirty_nmadvise; - decay_purged.type = emitter_type_uint64; - decay_purged.uint64_val = dirty_purged; + col_decay_purged.type = emitter_type_uint64; + col_decay_purged.uint64_val = dirty_purged; emitter_table_row(emitter, &decay_row); /* Muzzy row. 
*/ - decay_type.str_val = "muzzy:"; + col_decay_type.str_val = "muzzy:"; if (muzzy_decay_ms >= 0) { - decay_time.type = emitter_type_ssize; - decay_time.ssize_val = muzzy_decay_ms; + col_decay_time.type = emitter_type_ssize; + col_decay_time.ssize_val = muzzy_decay_ms; } else { - decay_time.type = emitter_type_title; - decay_time.str_val = "N/A"; + col_decay_time.type = emitter_type_title; + col_decay_time.str_val = "N/A"; } - decay_npages.type = emitter_type_size; - decay_npages.size_val = pmuzzy; + col_decay_npages.type = emitter_type_size; + col_decay_npages.size_val = pmuzzy; - decay_sweeps.type = emitter_type_uint64; - decay_sweeps.uint64_val = muzzy_npurge; + col_decay_sweeps.type = emitter_type_uint64; + col_decay_sweeps.uint64_val = muzzy_npurge; - decay_madvises.type = emitter_type_uint64; - decay_madvises.uint64_val = muzzy_nmadvise; + col_decay_madvises.type = emitter_type_uint64; + col_decay_madvises.uint64_val = muzzy_nmadvise; - decay_purged.type = emitter_type_uint64; - decay_purged.uint64_val = muzzy_purged; + col_decay_purged.type = emitter_type_uint64; + col_decay_purged.uint64_val = muzzy_purged; emitter_table_row(emitter, &decay_row); @@ -798,69 +799,71 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, emitter_row_t alloc_count_row; emitter_row_init(&alloc_count_row); - emitter_col_t alloc_count_title; - emitter_col_init(&alloc_count_title, &alloc_count_row); - alloc_count_title.justify = emitter_justify_left; - alloc_count_title.width = 21; - alloc_count_title.type = emitter_type_title; - alloc_count_title.str_val = ""; - - emitter_col_t alloc_count_allocated; - emitter_col_init(&alloc_count_allocated, &alloc_count_row); - alloc_count_allocated.justify = emitter_justify_right; - alloc_count_allocated.width = 16; - alloc_count_allocated.type = emitter_type_title; - alloc_count_allocated.str_val = "allocated"; - - emitter_col_t alloc_count_nmalloc; - emitter_col_init(&alloc_count_nmalloc, &alloc_count_row); - alloc_count_nmalloc.justify = emitter_justify_right; - alloc_count_nmalloc.width = 16; - alloc_count_nmalloc.type = emitter_type_title; - alloc_count_nmalloc.str_val = "nmalloc"; - - emitter_col_t alloc_count_ndalloc; - emitter_col_init(&alloc_count_ndalloc, &alloc_count_row); - alloc_count_ndalloc.justify = emitter_justify_right; - alloc_count_ndalloc.width = 16; - alloc_count_ndalloc.type = emitter_type_title; - alloc_count_ndalloc.str_val = "ndalloc"; - - emitter_col_t alloc_count_nrequests; - emitter_col_init(&alloc_count_nrequests, &alloc_count_row); - alloc_count_nrequests.justify = emitter_justify_right; - alloc_count_nrequests.width = 16; - alloc_count_nrequests.type = emitter_type_title; - alloc_count_nrequests.str_val = "nrequests"; + COL(alloc_count_row, count_title, left, 21, title); + col_count_title.str_val = ""; + + COL(alloc_count_row, count_allocated, right, 16, title); + col_count_allocated.str_val = "allocated"; + + COL(alloc_count_row, count_nmalloc, right, 16, title); + col_count_nmalloc.str_val = "nmalloc"; + COL(alloc_count_row, count_nmalloc_ps, right, 8, title); + col_count_nmalloc_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_ndalloc, right, 16, title); + col_count_ndalloc.str_val = "ndalloc"; + COL(alloc_count_row, count_ndalloc_ps, right, 8, title); + col_count_ndalloc_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_nrequests, right, 16, title); + col_count_nrequests.str_val = "nrequests"; + COL(alloc_count_row, count_nrequests_ps, right, 8, title); + col_count_nrequests_ps.str_val = "(#/sec)"; 
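+	/*
+	 * The (#/sec) columns are derived rates; they are filled in below
+	 * via rate_per_second() using the arena uptime.
+	 */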
emitter_table_row(emitter, &alloc_count_row); + col_count_nmalloc_ps.type = emitter_type_uint64; + col_count_ndalloc_ps.type = emitter_type_uint64; + col_count_nrequests_ps.type = emitter_type_uint64; + #define GET_AND_EMIT_ALLOC_STAT(small_or_large, name, valtype) \ CTL_M2_GET("stats.arenas.0." #small_or_large "." #name, i, \ &small_or_large##_##name, valtype##_t); \ emitter_json_kv(emitter, #name, emitter_type_##valtype, \ &small_or_large##_##name); \ - alloc_count_##name.type = emitter_type_##valtype; \ - alloc_count_##name.valtype##_val = small_or_large##_##name; + col_count_##name.type = emitter_type_##valtype; \ + col_count_##name.valtype##_val = small_or_large##_##name; emitter_json_object_kv_begin(emitter, "small"); - alloc_count_title.str_val = "small:"; + col_count_title.str_val = "small:"; GET_AND_EMIT_ALLOC_STAT(small, allocated, size) GET_AND_EMIT_ALLOC_STAT(small, nmalloc, uint64) + col_count_nmalloc_ps.uint64_val = + rate_per_second(col_count_nmalloc.uint64_val, uptime); GET_AND_EMIT_ALLOC_STAT(small, ndalloc, uint64) + col_count_ndalloc_ps.uint64_val = + rate_per_second(col_count_ndalloc.uint64_val, uptime); GET_AND_EMIT_ALLOC_STAT(small, nrequests, uint64) + col_count_nrequests_ps.uint64_val = + rate_per_second(col_count_nrequests.uint64_val, uptime); emitter_table_row(emitter, &alloc_count_row); emitter_json_object_end(emitter); /* Close "small". */ emitter_json_object_kv_begin(emitter, "large"); - alloc_count_title.str_val = "large:"; + col_count_title.str_val = "large:"; GET_AND_EMIT_ALLOC_STAT(large, allocated, size) GET_AND_EMIT_ALLOC_STAT(large, nmalloc, uint64) + col_count_nmalloc_ps.uint64_val = + rate_per_second(col_count_nmalloc.uint64_val, uptime); GET_AND_EMIT_ALLOC_STAT(large, ndalloc, uint64) + col_count_ndalloc_ps.uint64_val = + rate_per_second(col_count_ndalloc.uint64_val, uptime); GET_AND_EMIT_ALLOC_STAT(large, nrequests, uint64) + col_count_nrequests_ps.uint64_val = + rate_per_second(col_count_nrequests.uint64_val, uptime); emitter_table_row(emitter, &alloc_count_row); emitter_json_object_end(emitter); /* Close "large". */ @@ -868,11 +871,11 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, #undef GET_AND_EMIT_ALLOC_STAT /* Aggregated small + large stats are emitter only in table mode. 
*/ - alloc_count_title.str_val = "total:"; - alloc_count_allocated.size_val = small_allocated + large_allocated; - alloc_count_nmalloc.uint64_val = small_nmalloc + large_nmalloc; - alloc_count_ndalloc.uint64_val = small_ndalloc + large_ndalloc; - alloc_count_nrequests.uint64_val = small_nrequests + large_nrequests; + col_count_title.str_val = "total:"; + col_count_allocated.size_val = small_allocated + large_allocated; + col_count_nmalloc.uint64_val = small_nmalloc + large_nmalloc; + col_count_ndalloc.uint64_val = small_ndalloc + large_ndalloc; + col_count_nrequests.uint64_val = small_nrequests + large_nrequests; emitter_table_row(emitter, &alloc_count_row); emitter_row_t mem_count_row; @@ -918,13 +921,13 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, #undef GET_AND_EMIT_MEM_STAT if (mutex) { - stats_arena_mutexes_print(emitter, i); + stats_arena_mutexes_print(emitter, i, uptime); } if (bins) { - stats_arena_bins_print(emitter, mutex, i); + stats_arena_bins_print(emitter, mutex, i, uptime); } if (large) { - stats_arena_lextents_print(emitter, i); + stats_arena_lextents_print(emitter, i, uptime); } if (extents) { stats_arena_extents_print(emitter, i); @@ -1246,6 +1249,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, emitter_col_t name; emitter_col_t col64[mutex_prof_num_uint64_t_counters]; emitter_col_t col32[mutex_prof_num_uint32_t_counters]; + uint64_t uptime; emitter_row_init(&row); mutex_stats_init_cols(&row, "", &name, col64, col32); @@ -1253,9 +1257,11 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, emitter_table_row(emitter, &row); emitter_json_object_kv_begin(emitter, "mutexes"); + CTL_M2_GET("stats.arenas.0.uptime", 0, &uptime, uint64_t); + for (int i = 0; i < mutex_prof_num_global_mutexes; i++) { mutex_stats_read_global(global_mutex_names[i], &name, - col64, col32); + col64, col32, uptime); emitter_json_object_kv_begin(emitter, global_mutex_names[i]); mutex_stats_emit(emitter, &row, col64, col32); emitter_json_object_end(emitter); -- cgit v0.12 From 441335d924984022a3e17c3f013a0ad33806a5ff Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 17 Dec 2018 15:29:37 -0800 Subject: Add unit test for producer-consumer pattern. --- test/unit/binshard.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/test/unit/binshard.c b/test/unit/binshard.c index 829ba43..406c46c 100644 --- a/test/unit/binshard.c +++ b/test/unit/binshard.c @@ -2,6 +2,54 @@ /* Config -- "narenas:1,bin_shards:1-160:16|129-512:4|256-256:8" */ +#define NTHREADS 16 +#define REMOTE_NALLOC 256 + +static void * +thd_producer(void *varg) { + void **mem = varg; + unsigned arena, i; + size_t sz; + + sz = sizeof(arena); + /* Remote arena. */ + assert_d_eq(mallctl("arenas.create", (void *)&arena, &sz, NULL, 0), 0, + "Unexpected mallctl() failure"); + for (i = 0; i < REMOTE_NALLOC / 2; i++) { + mem[i] = mallocx(1, MALLOCX_TCACHE_NONE | MALLOCX_ARENA(arena)); + } + + /* Remote bin. */ + for (; i < REMOTE_NALLOC; i++) { + mem[i] = mallocx(1, MALLOCX_TCACHE_NONE | MALLOCX_ARENA(0)); + } + + return NULL; +} + +TEST_BEGIN(test_producer_consumer) { + thd_t thds[NTHREADS]; + void *mem[NTHREADS][REMOTE_NALLOC]; + unsigned i; + + /* Create producer threads to allocate. */ + for (i = 0; i < NTHREADS; i++) { + thd_create(&thds[i], thd_producer, mem[i]); + } + for (i = 0; i < NTHREADS; i++) { + thd_join(thds[i], NULL); + } + /* Remote deallocation by the current thread. 
*/ + for (i = 0; i < NTHREADS; i++) { + for (unsigned j = 0; j < REMOTE_NALLOC; j++) { + assert_ptr_not_null(mem[i][j], + "Unexpected remote allocation failure"); + dallocx(mem[i][j], 0); + } + } +} +TEST_END + static void * thd_start(void *varg) { void *ptr, *ptr2; @@ -34,7 +82,6 @@ thd_start(void *varg) { } TEST_BEGIN(test_bin_shard_mt) { -#define NTHREADS 16 thd_t thds[NTHREADS]; unsigned i; for (i = 0; i < NTHREADS; i++) { @@ -99,5 +146,6 @@ int main(void) { return test_no_reentrancy( test_bin_shard, - test_bin_shard_mt); + test_bin_shard_mt, + test_producer_consumer); } -- cgit v0.12 From 7241bf5b745ba5ec24b26b0ef2bd30b1c0a428dc Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 3 Dec 2018 18:30:58 -0800 Subject: Only read arena index from extent on the tcache flush path. Add extent_arena_ind_get() to avoid loading the actual arena ptr in case we just need to check arena matching. --- include/jemalloc/internal/extent_inlines.h | 19 ++++++++++--------- src/tcache.c | 14 +++++++++----- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/jemalloc/internal/extent_inlines.h b/include/jemalloc/internal/extent_inlines.h index b572860..63b710d 100644 --- a/include/jemalloc/internal/extent_inlines.h +++ b/include/jemalloc/internal/extent_inlines.h @@ -35,18 +35,19 @@ extent_unlock2(tsdn_t *tsdn, extent_t *extent1, extent_t *extent2) { (uintptr_t)extent2); } -static inline arena_t * -extent_arena_get(const extent_t *extent) { +static inline unsigned +extent_arena_ind_get(const extent_t *extent) { unsigned arena_ind = (unsigned)((extent->e_bits & EXTENT_BITS_ARENA_MASK) >> EXTENT_BITS_ARENA_SHIFT); - /* - * The following check is omitted because we should never actually read - * a NULL arena pointer. - */ - if (false && arena_ind >= MALLOCX_ARENA_LIMIT) { - return NULL; - } assert(arena_ind < MALLOCX_ARENA_LIMIT); + + return arena_ind; +} + +static inline arena_t * +extent_arena_get(const extent_t *extent) { + unsigned arena_ind = extent_arena_ind_get(extent); + return (arena_t *)atomic_load_p(&arenas[arena_ind], ATOMIC_ACQUIRE); } diff --git a/src/tcache.c b/src/tcache.c index 92be273..182e8bf 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -120,7 +120,9 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, while (nflush > 0) { /* Lock the arena bin associated with the first object. */ extent_t *extent = item_extent[0]; - arena_t *bin_arena = extent_arena_get(extent); + unsigned bin_arena_ind = extent_arena_ind_get(extent); + arena_t *bin_arena = arena_get(tsd_tsdn(tsd), bin_arena_ind, + false); unsigned binshard = extent_binshard_get(extent); assert(binshard < bin_infos[binind].n_shards); bin_t *bin = &bin_arena->bins[binind].bin_shards[binshard]; @@ -146,7 +148,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, extent = item_extent[i]; assert(ptr != NULL && extent != NULL); - if (extent_arena_get(extent) == bin_arena + if (extent_arena_ind_get(extent) == bin_arena_ind && extent_binshard_get(extent) == binshard) { arena_dalloc_bin_junked_locked(tsd_tsdn(tsd), bin_arena, bin, binind, extent, ptr); @@ -208,7 +210,9 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, while (nflush > 0) { /* Lock the arena associated with the first object.
*/ extent_t *extent = item_extent[0]; - arena_t *locked_arena = extent_arena_get(extent); + unsigned locked_arena_ind = extent_arena_ind_get(extent); + arena_t *locked_arena = arena_get(tsd_tsdn(tsd), + locked_arena_ind, false); bool idump; if (config_prof) { @@ -223,7 +227,7 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, void *ptr = *(tbin->avail - 1 - i); assert(ptr != NULL); extent = item_extent[i]; - if (extent_arena_get(extent) == locked_arena) { + if (extent_arena_ind_get(extent) == locked_arena_ind) { large_dalloc_prep_junked_locked(tsd_tsdn(tsd), extent); } @@ -253,7 +257,7 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, extent = item_extent[i]; assert(ptr != NULL && extent != NULL); - if (extent_arena_get(extent) == locked_arena) { + if (extent_arena_ind_get(extent) == locked_arena_ind) { large_dalloc_finish(tsd_tsdn(tsd), extent); } else { /* -- cgit v0.12 From 4e920d2c9d5aecc9dec7069a0c9736b1f14eead9 Mon Sep 17 00:00:00 2001 From: John Ericson Date: Fri, 14 Dec 2018 15:28:34 -0500 Subject: Add --{enable,disable}-{static,shared} to configure script My distro offers a custom toolchain where it's not possible to make static libs, so it's insufficient to just delete the libs I don't want. I actually need to avoid building them in the first place. --- Makefile.in | 17 +++++++++++++++-- configure.ac | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/Makefile.in b/Makefile.in index 31a9cea..b788a09 100644 --- a/Makefile.in +++ b/Makefile.in @@ -55,6 +55,8 @@ cfghdrs_out := @cfghdrs_out@ cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@) cfgoutputs_out := @cfgoutputs_out@ enable_autogen := @enable_autogen@ +enable_shared := @enable_shared@ +enable_static := @enable_static@ enable_prof := @enable_prof@ enable_zone_allocator := @enable_zone_allocator@ enable_experimental_smallocx := @enable_experimental_smallocx@ @@ -430,7 +432,12 @@ $(objroot)test/stress/%$(EXE): $(objroot)test/stress/%.$(O) $(C_JET_OBJS) $(C_TE build_lib_shared: $(DSOS) build_lib_static: $(STATIC_LIBS) -build_lib: build_lib_shared build_lib_static +ifeq ($(enable_shared), 1) +build_lib: build_lib_shared +endif +ifeq ($(enable_static), 1) +build_lib: build_lib_static +endif install_bin: $(INSTALL) -d $(BINDIR) @@ -467,7 +474,13 @@ install_lib_pc: $(PC) $(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig; \ done -install_lib: install_lib_shared install_lib_static install_lib_pc +ifeq ($(enable_shared), 1) +install_lib: install_lib_shared +endif +ifeq ($(enable_static), 1) +install_lib: install_lib_static +endif +install_lib: install_lib_pc install_doc_html: $(INSTALL) -d $(DATADIR)/doc/jemalloc$(install_suffix) diff --git a/configure.ac b/configure.ac index 072808c..e9093e8 100644 --- a/configure.ac +++ b/configure.ac @@ -878,6 +878,36 @@ AC_PROG_RANLIB AC_PATH_PROG([LD], [ld], [false], [$PATH]) AC_PATH_PROG([AUTOCONF], [autoconf], [false], [$PATH]) +dnl Enable shared libs +AC_ARG_ENABLE([shared], + [AS_HELP_STRING([--enable-shared], [Build shared libaries])], +if test "x$enable_shared" = "xno" ; then + enable_shared="0" +else + enable_shared="1" +fi +, +enable_shared="1" +) +AC_SUBST([enable_shared]) + +dnl Enable static libs +AC_ARG_ENABLE([static], + [AS_HELP_STRING([--enable-static], [Build static libaries])], +if test "x$enable_static" = "xno" ; then + enable_static="0" +else + enable_static="1" +fi +, +enable_static="1" +) +AC_SUBST([enable_static]) + +if test "$enable_shared$enable_static" = "00" ; then + 
AC_MSG_ERROR([Please enable one of shared or static builds]) +fi + dnl Perform no name mangling by default. AC_ARG_WITH([mangling], [AS_HELP_STRING([--with-mangling=], [Mangle symbols in ])], @@ -2297,6 +2327,8 @@ AC_MSG_RESULT([JEMALLOC_PRIVATE_NAMESPACE]) AC_MSG_RESULT([ : ${JEMALLOC_PRIVATE_NAMESPACE}]) AC_MSG_RESULT([install_suffix : ${install_suffix}]) AC_MSG_RESULT([malloc_conf : ${config_malloc_conf}]) +AC_MSG_RESULT([shared libs : ${enable_shared}]) +AC_MSG_RESULT([static libs : ${enable_static}]) AC_MSG_RESULT([autogen : ${enable_autogen}]) AC_MSG_RESULT([debug : ${enable_debug}]) AC_MSG_RESULT([stats : ${enable_stats}]) -- cgit v0.12 From daa0e436ba232d67b832e1b270b13c5061eebfe9 Mon Sep 17 00:00:00 2001 From: Leonardo Santagada Date: Wed, 31 Oct 2018 12:03:42 +0100 Subject: implement malloc_getcpu for windows --- include/jemalloc/internal/jemalloc_internal_inlines_a.h | 4 +++- include/jemalloc/internal/jemalloc_preamble.h.in | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/include/jemalloc/internal/jemalloc_internal_inlines_a.h index 8adc02a..ddde9b4 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_a.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_a.h @@ -10,7 +10,9 @@ JEMALLOC_ALWAYS_INLINE malloc_cpuid_t malloc_getcpu(void) { assert(have_percpu_arena); -#if defined(JEMALLOC_HAVE_SCHED_GETCPU) +#if defined(_WIN32) + return GetCurrentProcessorNumber(); +#elif defined(JEMALLOC_HAVE_SCHED_GETCPU) return (malloc_cpuid_t)sched_getcpu(); #else not_reached(); diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in index 857fa32..4bfdb32 100644 --- a/include/jemalloc/internal/jemalloc_preamble.h.in +++ b/include/jemalloc/internal/jemalloc_preamble.h.in @@ -161,7 +161,7 @@ static const bool config_log = false #endif ; -#ifdef JEMALLOC_HAVE_SCHED_GETCPU +#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU) /* Currently percpu_arena depends on sched_getcpu. */ #define JEMALLOC_PERCPU_ARENA #endif -- cgit v0.12 From 471191075d6a88eb1364fb5f332237eb3d512872 Mon Sep 17 00:00:00 2001 From: Faidon Liambotis Date: Tue, 8 Jan 2019 03:31:53 +0200 Subject: Replace -lpthread with -pthread This automatically adds -latomic if and when needed, e.g. on riscv64 systems. Fixes #1401. 
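
As an illustrative aside (not part of this patch): the AC_CHECK_LIB probe being edited here effectively compiles and links a tiny program that references pthread_create. A minimal sketch of such a program, assuming a hypothetical `cc -pthread probe.c` compile line (-pthread both defines the required preprocessor macros and pulls in the right libraries, which a bare -lpthread does not guarantee):

#include <pthread.h>

static void *
run(void *arg) {
	return arg;
}

int
main(void) {
	pthread_t thd;

	/* If this links and runs, the toolchain's -pthread support works. */
	if (pthread_create(&thd, NULL, run, NULL) != 0) {
		return 1;
	}
	return pthread_join(thd, NULL);
}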
--- Makefile.in | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.in b/Makefile.in index b788a09..2d59e59 100644 --- a/Makefile.in +++ b/Makefile.in @@ -420,7 +420,7 @@ $(objroot)test/unit/%$(EXE): $(objroot)test/unit/%.$(O) $(C_JET_OBJS) $(C_TESTLI $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) @mkdir -p $(@D) - $(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -lpthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS) + $(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS) $(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) @mkdir -p $(@D) diff --git a/configure.ac b/configure.ac index e9093e8..c0911db 100644 --- a/configure.ac +++ b/configure.ac @@ -1579,7 +1579,7 @@ if test "x$abi" != "xpecoff" ; then AC_CHECK_HEADERS([pthread.h], , [AC_MSG_ERROR([pthread.h is missing])]) dnl Some systems may embed pthreads functionality in libc; check for libpthread dnl first, but try libc too before failing. - AC_CHECK_LIB([pthread], [pthread_create], [JE_APPEND_VS(LIBS, -lpthread)], + AC_CHECK_LIB([pthread], [pthread_create], [JE_APPEND_VS(LIBS, -pthread)], [AC_SEARCH_LIBS([pthread_create], , , AC_MSG_ERROR([libpthread is missing]))]) wrap_syms="${wrap_syms} pthread_create" -- cgit v0.12 From 6910fcb208e2703f72bcbfbd1db22426d02b1e27 Mon Sep 17 00:00:00 2001 From: Li-Wen Hsu Date: Fri, 4 Jan 2019 17:07:09 +0800 Subject: Add Cirrus-CI config for FreeBSD builds --- .cirrus.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .cirrus.yml diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 0000000..019d2c3 --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,21 @@ +env: + CIRRUS_CLONE_DEPTH: 1 + ARCH: amd64 + +task: + freebsd_instance: + matrix: + image: freebsd-12-0-release-amd64 + image: freebsd-11-2-release-amd64 + install_script: + - sed -i.bak -e 's,pkg+http://pkg.FreeBSD.org/\${ABI}/quarterly,pkg+http://pkg.FreeBSD.org/\${ABI}/latest,' /etc/pkg/FreeBSD.conf + - pkg upgrade -y + - pkg install -y autoconf gmake + script: + - autoconf + #- ./configure ${COMPILER_FLAGS:+ CC="$CC $COMPILER_FLAGS" CXX="$CXX $COMPILER_FLAGS" } $CONFIGURE_FLAGS + - ./configure + - export JFLAG=`sysctl -n kern.smp.cpus` + - gmake -j${JFLAG} + - gmake -j${JFLAG} tests + - gmake check -- cgit v0.12 From 646af596d8c4ffefc1f7edf432aa2b4e669bcc78 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 9 Jan 2019 17:07:11 -0800 Subject: Customize cloning to include tags so that VERSION is valid. 
--- .cirrus.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 019d2c3..8b1b38d 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -10,7 +10,11 @@ task: install_script: - sed -i.bak -e 's,pkg+http://pkg.FreeBSD.org/\${ABI}/quarterly,pkg+http://pkg.FreeBSD.org/\${ABI}/latest,' /etc/pkg/FreeBSD.conf - pkg upgrade -y - - pkg install -y autoconf gmake + - pkg install -y autoconf git gmake + clone_script: + - git clone --tags --branch=${CIRRUS_BASE_BRANCH} https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git ${CIRRUS_WORKING_DIR} + - git fetch origin ${CIRRUS_BRANCH}/head:${CIRRUS_BRANCH} + - git checkout ${CIRRUS_BRANCH} script: - autoconf #- ./configure ${COMPILER_FLAGS:+ CC="$CC $COMPILER_FLAGS" CXX="$CXX $COMPILER_FLAGS" } $CONFIGURE_FLAGS -- cgit v0.12 From fc13a7f1fa7d1cfc1d393d7a448e68d0f433d840 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 11 Jan 2019 12:38:14 -0800 Subject: Remove --branch=${CIRRUS_BASE_BRANCH} in git clone command. The --branch parameter is unnecessary, and may avoid problems when testing directly on the dev branch. --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 8b1b38d..5e6756a 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -12,7 +12,7 @@ task: - pkg upgrade -y - pkg install -y autoconf git gmake clone_script: - - git clone --tags --branch=${CIRRUS_BASE_BRANCH} https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git ${CIRRUS_WORKING_DIR} + - git clone --tags https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git ${CIRRUS_WORKING_DIR} - git fetch origin ${CIRRUS_BRANCH}/head:${CIRRUS_BRANCH} - git checkout ${CIRRUS_BRANCH} script: -- cgit v0.12 From 0ecd5addb1215f5ae9fad2b9cb4cf91ed5376ee8 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 11 Jan 2019 11:22:11 -0800 Subject: Force purge on thread death only when w/o bg thds. --- src/tcache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tcache.c b/src/tcache.c index 182e8bf..9125179 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -527,8 +527,8 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) { arena_decay(tsd_tsdn(tsd), arena_get(tsd_tsdn(tsd), 0, false), false, false); - unsigned nthreads = arena_nthreads_get(arena, false); - if (nthreads == 0) { + if (arena_nthreads_get(arena, false) == 0 && + !background_thread_enabled()) { /* Force purging when no threads assigned to the arena anymore. */ arena_decay(tsd_tsdn(tsd), arena, false, true); } else { -- cgit v0.12 From f459454afe019251712728b983d2eed0b03f5c80 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 10 Aug 2018 16:08:50 -0700 Subject: Avoid potential issues on extent zero-out. When custom extent_hooks or transparent huge pages are in use, the purging semantics may change, which means we may not get zeroed pages on repopulating. Fix the issue by manually memsetting in such cases.
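
For illustration only (separate from the diff that follows): the non-default-hooks case arises when an application installs its own hooks through the documented arena.<i>.extent_hooks mallctl. A minimal sketch that merely reads arena 0's current hooks; writing a custom extent_hooks_t through the same mallctl is what changes the purging semantics described above (error handling trimmed for brevity):

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	extent_hooks_t *hooks;
	size_t sz = sizeof(hooks);

	/* Read arena 0's current (default) extent hooks. */
	if (mallctl("arena.0.extent_hooks", (void *)&hooks, &sz, NULL,
	    0) != 0) {
		return 1;
	}
	printf("arena 0 alloc hook: %p\n", (void *)hooks->alloc);
	return 0;
}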
--- include/jemalloc/internal/arena_inlines_b.h | 5 +++++ src/extent.c | 25 +++++++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index 3d0121d..c7d35b7 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -8,6 +8,11 @@ #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" +JEMALLOC_ALWAYS_INLINE bool +arena_has_default_hooks(arena_t *arena) { + return (extent_hooks_get(arena) == &extent_hooks_default); +} + JEMALLOC_ALWAYS_INLINE arena_t * arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) { if (arena != NULL) { diff --git a/src/extent.c b/src/extent.c index 9605dac..fd6c837 100644 --- a/src/extent.c +++ b/src/extent.c @@ -1102,6 +1102,17 @@ extent_recycle_split(tsdn_t *tsdn, arena_t *arena, unreachable(); } +static bool +extent_need_manual_zero(arena_t *arena) { + /* + * Need to manually zero the extent on repopulating if either; 1) non + * default extent hooks installed (in which case the purge semantics may + * change); or 2) transparent huge pages enabled. + */ + return (!arena_has_default_hooks(arena) || + (opt_thp == thp_mode_always)); +} + /* * Tries to satisfy the given allocation request by reusing one of the extents * in the given extents_t. @@ -1141,7 +1152,9 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extent, growing_retained); return NULL; } - extent_zeroed_set(extent, true); + if (!extent_need_manual_zero(arena)) { + extent_zeroed_set(extent, true); + } } if (extent_committed_get(extent)) { @@ -1164,7 +1177,8 @@ extent_recycle(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, void *addr = extent_base_get(extent); if (!extent_zeroed_get(extent)) { size_t size = extent_size_get(extent); - if (pages_purge_forced(addr, size)) { + if (extent_need_manual_zero(arena) || + pages_purge_forced(addr, size)) { memset(addr, 0, size); } } else if (config_debug) { @@ -1391,7 +1405,9 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena, &arena->extents_retained, extent, true); goto label_err; } - extent_zeroed_set(extent, true); + if (!extent_need_manual_zero(arena)) { + extent_zeroed_set(extent, true); + } } /* @@ -1425,7 +1441,8 @@ extent_grow_retained(tsdn_t *tsdn, arena_t *arena, if (*zero && !extent_zeroed_get(extent)) { void *addr = extent_base_get(extent); size_t size = extent_size_get(extent); - if (pages_purge_forced(addr, size)) { + if (extent_need_manual_zero(arena) || + pages_purge_forced(addr, size)) { memset(addr, 0, size); } } -- cgit v0.12 From 225d89998bae562b13b681f74019697b66e07f02 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 14 Jan 2019 07:10:39 -0800 Subject: Revert "Remove --branch=${CIRRUS_BASE_BRANCH} in git clone command." This reverts commit fc13a7f1fa7d1cfc1d393d7a448e68d0f433d840. 
--- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 5e6756a..8b1b38d 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -12,7 +12,7 @@ task: - pkg upgrade -y - pkg install -y autoconf git gmake clone_script: - - git clone --tags https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git ${CIRRUS_WORKING_DIR} + - git clone --tags --branch=${CIRRUS_BASE_BRANCH} https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git ${CIRRUS_WORKING_DIR} - git fetch origin ${CIRRUS_BRANCH}/head:${CIRRUS_BRANCH} - git checkout ${CIRRUS_BRANCH} script: -- cgit v0.12 From b6f1f2669a0961fa463afede7d4b190d79647c90 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 14 Jan 2019 07:11:04 -0800 Subject: Revert "Customize cloning to include tags so that VERSION is valid." This reverts commit 646af596d8c4ffefc1f7edf432aa2b4e669bcc78. --- .cirrus.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 8b1b38d..019d2c3 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -10,11 +10,7 @@ task: install_script: - sed -i.bak -e 's,pkg+http://pkg.FreeBSD.org/\${ABI}/quarterly,pkg+http://pkg.FreeBSD.org/\${ABI}/latest,' /etc/pkg/FreeBSD.conf - pkg upgrade -y - - pkg install -y autoconf git gmake - clone_script: - - git clone --tags --branch=${CIRRUS_BASE_BRANCH} https://x-access-token:${CIRRUS_REPO_CLONE_TOKEN}@github.com/${CIRRUS_REPO_FULL_NAME}.git ${CIRRUS_WORKING_DIR} - - git fetch origin ${CIRRUS_BRANCH}/head:${CIRRUS_BRANCH} - - git checkout ${CIRRUS_BRANCH} + - pkg install -y autoconf gmake script: - autoconf #- ./configure ${COMPILER_FLAGS:+ CC="$CC $COMPILER_FLAGS" CXX="$CXX $COMPILER_FLAGS" } $CONFIGURE_FLAGS -- cgit v0.12 From bbe8e6a9097203c7b29140b5410c787a6e204593 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 14 Jan 2019 14:16:09 -0800 Subject: Avoid creating bg thds for the huge arena alone. For low arena count settings, the huge threshold feature may trigger an unwanted bg thd creation. Given that the huge arena does eager purging by default, bypass bg thd creation when initializing the huge arena.
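
For illustration only (not part of the diff below), the effect is observable from application code via the documented mallctl interface; a minimal sketch, assuming a build with stats enabled (error handling omitted for brevity):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	bool enable = true;
	uint64_t epoch = 1;
	size_t num_thds, sz = sizeof(num_thds);

	/* Enable background threads, refresh stats, then read the count. */
	mallctl("background_thread", NULL, NULL, (void *)&enable,
	    sizeof(enable));
	mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch));
	mallctl("stats.background_thread.num_threads", (void *)&num_thds, &sz,
	    NULL, 0);
	printf("background threads: %zu\n", num_thds);
	return 0;
}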
--- include/jemalloc/internal/arena_externs.h | 1 + src/arena.c | 8 ++++++++ src/background_thread.c | 20 ++++++++++++++++---- src/ctl.c | 11 +++++++++++ src/jemalloc.c | 18 +++++++----------- 5 files changed, 43 insertions(+), 15 deletions(-) diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index 04d9954..bcc016e 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -85,6 +85,7 @@ void arena_nthreads_dec(arena_t *arena, bool internal); size_t arena_extent_sn_next(arena_t *arena); arena_t *arena_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks); bool arena_init_huge(void); +bool arena_is_huge(unsigned arena_ind); arena_t *arena_choose_huge(tsd_t *tsd); bin_t *arena_bin_choose_lock(tsdn_t *tsdn, arena_t *arena, szind_t binind, unsigned *binshard); diff --git a/src/arena.c b/src/arena.c index d34de85..552a0f3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -2127,6 +2127,14 @@ arena_init_huge(void) { return huge_enabled; } +bool +arena_is_huge(unsigned arena_ind) { + if (huge_arena_ind == 0) { + return false; + } + return (arena_ind == huge_arena_ind); +} + void arena_boot(sc_data_t *sc_data) { arena_dirty_decay_ms_default_set(opt_dirty_decay_ms); diff --git a/src/background_thread.c b/src/background_thread.c index 813867e..acf8083 100644 --- a/src/background_thread.c +++ b/src/background_thread.c @@ -535,9 +535,8 @@ background_thread_init(tsd_t *tsd, background_thread_info_t *info) { n_background_threads++; } -/* Create a new background thread if needed. */ -bool -background_thread_create(tsd_t *tsd, unsigned arena_ind) { +static bool +background_thread_create_locked(tsd_t *tsd, unsigned arena_ind) { assert(have_background_thread); malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); @@ -590,6 +589,19 @@ background_thread_create(tsd_t *tsd, unsigned arena_ind) { return false; } +/* Create a new background thread if needed. */ +bool +background_thread_create(tsd_t *tsd, unsigned arena_ind) { + assert(have_background_thread); + + bool ret; + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + ret = background_thread_create_locked(tsd, arena_ind); + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + + return ret; +} + bool background_threads_enable(tsd_t *tsd) { assert(n_background_threads == 0); @@ -623,7 +635,7 @@ background_threads_enable(tsd_t *tsd) { } } - return background_thread_create(tsd, 0); + return background_thread_create_locked(tsd, 0); } bool diff --git a/src/ctl.c b/src/ctl.c index a150891..81e8fbe 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -2276,6 +2276,17 @@ arena_i_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen, ret = EINVAL; goto label_return; } + if (arena_is_huge(arena_ind) && *(ssize_t *)newp > 0) { + /* + * By default the huge arena purges eagerly. If it is + * set to non-zero decay time afterwards, background + * thread might be needed. + */ + if (background_thread_create(tsd, arena_ind)) { + ret = EFAULT; + goto label_return; + } + } if (dirty ? 
arena_dirty_decay_ms_set(tsd_tsdn(tsd), arena, *(ssize_t *)newp) : arena_muzzy_decay_ms_set(tsd_tsdn(tsd), arena, *(ssize_t *)newp)) { diff --git a/src/jemalloc.c b/src/jemalloc.c index 1620d0d..2a47dcb 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -344,12 +344,12 @@ arena_new_create_background_thread(tsdn_t *tsdn, unsigned ind) { if (ind == 0) { return; } - if (have_background_thread) { - bool err; - malloc_mutex_lock(tsdn, &background_thread_lock); - err = background_thread_create(tsdn_tsd(tsdn), ind); - malloc_mutex_unlock(tsdn, &background_thread_lock); - if (err) { + /* + * Avoid creating a new background thread just for the huge arena, which + * purges eagerly by default. + */ + if (have_background_thread && !arena_is_huge(ind)) { + if (background_thread_create(tsdn_tsd(tsdn), ind)) { malloc_printf(": error in background thread " "creation for arena %u. Abort.\n", ind); abort(); @@ -1719,11 +1719,7 @@ malloc_init_hard(void) { * sets isthreaded) needs to be called without holding any lock. */ background_thread_ctl_init(tsd_tsdn(tsd)); - - malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); - bool err = background_thread_create(tsd, 0); - malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); - if (err) { + if (background_thread_create(tsd, 0)) { return true; } } -- cgit v0.12 From 7a815c1b7c796ef35e7ede60cb2dd44aba9626b4 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 15 Jan 2019 16:14:18 -0800 Subject: Un-experimental the huge_threshold feature. --- src/ctl.c | 2 +- src/jemalloc.c | 3 +-- src/stats.c | 2 +- test/unit/huge.c | 2 +- test/unit/mallctl.c | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/ctl.c b/src/ctl.c index 81e8fbe..0ec9224 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -300,7 +300,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("dss"), CTL(opt_dss)}, {NAME("narenas"), CTL(opt_narenas)}, {NAME("percpu_arena"), CTL(opt_percpu_arena)}, - {NAME("experimental_huge_threshold"), CTL(opt_huge_threshold)}, + {NAME("huge_threshold"), CTL(opt_huge_threshold)}, {NAME("background_thread"), CTL(opt_background_thread)}, {NAME("max_background_threads"), CTL(opt_max_background_threads)}, {NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)}, diff --git a/src/jemalloc.c b/src/jemalloc.c index 2a47dcb..6745df6 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1241,8 +1241,7 @@ malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { /* Experimental feature. Will be documented later.*/ CONF_HANDLE_SIZE_T(opt_huge_threshold, - "experimental_huge_threshold", - SC_LARGE_MINCLASS, + "huge_threshold", SC_LARGE_MINCLASS, SC_LARGE_MAXCLASS, yes, yes, false) CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, diff --git a/src/stats.c b/src/stats.c index f105e26..8794880 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1022,7 +1022,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_CHAR_P("dss") OPT_WRITE_UNSIGNED("narenas") OPT_WRITE_CHAR_P("percpu_arena") - OPT_WRITE_SIZE_T("experimental_huge_threshold") + OPT_WRITE_SIZE_T("huge_threshold") OPT_WRITE_CHAR_P("metadata_thp") OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread") OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms") diff --git a/test/unit/huge.c b/test/unit/huge.c index f371198..7e54d07 100644 --- a/test/unit/huge.c +++ b/test/unit/huge.c @@ -1,7 +1,7 @@ #include "test/jemalloc_test.h" /* Threshold: 2 << 20 = 2097152. 
*/ -const char *malloc_conf = "experimental_huge_threshold:2097152"; +const char *malloc_conf = "huge_threshold:2097152"; #define HUGE_SZ (2 << 20) #define SMALL_SZ (8) diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 039a881..b8b9340 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -164,7 +164,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(const char *, dss, always); TEST_MALLCTL_OPT(unsigned, narenas, always); TEST_MALLCTL_OPT(const char *, percpu_arena, always); - TEST_MALLCTL_OPT(size_t, experimental_huge_threshold, always); + TEST_MALLCTL_OPT(size_t, huge_threshold, always); TEST_MALLCTL_OPT(bool, background_thread, always); TEST_MALLCTL_OPT(ssize_t, dirty_decay_ms, always); TEST_MALLCTL_OPT(ssize_t, muzzy_decay_ms, always); -- cgit v0.12 From 8c9571376e65c8099ea315261c24e940410386c8 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 18 Jan 2019 15:22:44 -0800 Subject: Fix stats output (rate for total # of requests). The rate calculation for the total row was missing. --- src/stats.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/stats.c b/src/stats.c index 8794880..986f51b 100644 --- a/src/stats.c +++ b/src/stats.c @@ -876,6 +876,12 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, col_count_nmalloc.uint64_val = small_nmalloc + large_nmalloc; col_count_ndalloc.uint64_val = small_ndalloc + large_ndalloc; col_count_nrequests.uint64_val = small_nrequests + large_nrequests; + col_count_nmalloc_ps.uint64_val = + rate_per_second(col_count_nmalloc.uint64_val, uptime); + col_count_ndalloc_ps.uint64_val = + rate_per_second(col_count_ndalloc.uint64_val, uptime); + col_count_nrequests_ps.uint64_val = + rate_per_second(col_count_nrequests.uint64_val, uptime); emitter_table_row(emitter, &alloc_count_row); emitter_row_t mem_count_row; -- cgit v0.12 From 522d1e7b4b603d9ddc11c684c16d37113a9c0c12 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 18 Jan 2019 15:51:01 -0800 Subject: Tweak the spacing for nrequests in stats output. --- src/stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stats.c b/src/stats.c index 986f51b..75ccf3b 100644 --- a/src/stats.c +++ b/src/stats.c @@ -289,7 +289,7 @@ stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i, uint64_t upti COL_HDR(row, ndalloc, NULL, right, 13, uint64) COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64) COL_HDR(row, nrequests, NULL, right, 13, uint64) - COL_HDR(row, nrequests_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nrequests_ps, "(#/sec)", right, 10, uint64) COL_HDR(row, nshards, NULL, right, 9, unsigned) COL_HDR(row, curregs, NULL, right, 13, size) COL_HDR(row, curslabs, NULL, right, 13, size) @@ -817,7 +817,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, COL(alloc_count_row, count_nrequests, right, 16, title); col_count_nrequests.str_val = "nrequests"; - COL(alloc_count_row, count_nrequests_ps, right, 8, title); + COL(alloc_count_row, count_nrequests_ps, right, 10, title); col_count_nrequests_ps.str_val = "(#/sec)"; emitter_table_row(emitter, &alloc_count_row); -- cgit v0.12 From a7b0a124c3ebe505cfd8c2d5cc797b8f0c96fbc6 Mon Sep 17 00:00:00 2001 From: Edward Tomasz Napierala Date: Fri, 30 Nov 2018 13:57:49 +0000 Subject: Mention different mmap(2) behaviour with retain:true. 
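
As an illustrative aside (not part of this patch), retain can be requested from application code through the documented malloc_conf global; a minimal sketch:

#include <stdlib.h>

/*
 * Read by jemalloc during initialization; note that per the documentation
 * amended below, retain:true is already the default on platforms such as
 * 64-bit Linux.
 */
const char *malloc_conf = "retain:true";

int
main(void) {
	void *p = malloc(1);

	free(p);
	return 0;
}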
--- doc/jemalloc.xml.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 058e9db..a73f0ad 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -944,6 +944,9 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", munmap 2 or equivalent (see stats.retained for related details). + It also makes jemalloc use + mmap2 + in a more greedy way, mapping larger chunks in one go. This option is disabled by default unless discarding virtual memory is known to trigger platform-specific performance problems, e.g. for [64-bit] Linux, which -- cgit v0.12 From d3145014a00d6420824a45bb24fa9237a553d8dc Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 18 Jan 2019 14:20:07 -0800 Subject: Explicitly use arena 0 in alignment and OOM tests. This helps us avoid issues with size-based routing (i.e. the huge_threshold feature). --- test/integration/mallocx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c index ce5069a..645d4db 100644 --- a/test/integration/mallocx.c +++ b/test/integration/mallocx.c @@ -126,7 +126,7 @@ TEST_BEGIN(test_oom) { largemax = get_large_size(get_nlarge()-1); oom = false; for (i = 0; i < sizeof(ptrs) / sizeof(void *); i++) { - ptrs[i] = mallocx(largemax, 0); + ptrs[i] = mallocx(largemax, MALLOCX_ARENA(0)); if (ptrs[i] == NULL) { oom = true; } @@ -223,12 +223,12 @@ TEST_BEGIN(test_alignment_and_size) { sz += (alignment >> (LG_SIZEOF_PTR-1)) - 1) { for (i = 0; i < NITER; i++) { nsz = nallocx(sz, MALLOCX_ALIGN(alignment) | - MALLOCX_ZERO); + MALLOCX_ZERO | MALLOCX_ARENA(0)); assert_zu_ne(nsz, 0, "nallocx() error for alignment=%zu, " "size=%zu (%#zx)", alignment, sz, sz); ps[i] = mallocx(sz, MALLOCX_ALIGN(alignment) | - MALLOCX_ZERO); + MALLOCX_ZERO | MALLOCX_ARENA(0)); assert_ptr_not_null(ps[i], "mallocx() error for alignment=%zu, " "size=%zu (%#zx)", alignment, sz, sz); -- cgit v0.12 From 350809dc5d43ea994de04f7a970b6978a8fec6d2 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Wed, 16 Jan 2019 12:25:24 -0800 Subject: Set huge_threshold to 8M by default. This feature uses a dedicated arena to handle huge requests, which significantly reduces VM fragmentation. In production workloads we tested, it often reduces VM size by >30%. --- include/jemalloc/internal/arena_types.h | 2 +- src/jemalloc.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h index c40ae6f..cf07cc0 100644 --- a/include/jemalloc/internal/arena_types.h +++ b/include/jemalloc/internal/arena_types.h @@ -46,6 +46,6 @@ typedef enum { * When allocation_size >= huge_threshold, use the dedicated huge arena (unless * have explicitly spicified arena index). 0 disables the feature. */ -#define HUGE_THRESHOLD_DEFAULT 0 +#define HUGE_THRESHOLD_DEFAULT (8 << 20) #endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */ diff --git a/src/jemalloc.c b/src/jemalloc.c index 6745df6..6bfc613 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1239,7 +1239,14 @@ malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max", -1, (sizeof(size_t) << 3) - 1) - /* Experimental feature. Will be documented later.*/ + /* + * The runtime option of huge_threshold remains + * undocumented. It may be tweaked in the next major + * release (6.0). The default value 8M is rather + * conservative / safe.
Tuning it further down may + * improve fragmentation a bit more, but may also cause + * contention on the huge arena. + */ CONF_HANDLE_SIZE_T(opt_huge_threshold, "huge_threshold", SC_LARGE_MINCLASS, SC_LARGE_MAXCLASS, yes, yes, false) CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, -- cgit v0.12 From e3db480f6f3c147a8630c0ec45fde1da5764270b Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 24 Jan 2019 16:15:04 -0800 Subject: Rename huge_threshold to oversize_threshold. The keyword huge tends to remind people of huge pages, which is not relevant to the feature. --- doc/jemalloc.xml.in | 4 +--- include/jemalloc/internal/arena_externs.h | 4 ++-- include/jemalloc/internal/arena_inlines_b.h | 2 +- include/jemalloc/internal/arena_types.h | 6 +++--- src/arena.c | 14 +++++++------- src/ctl.c | 6 +++--- src/jemalloc.c | 6 +++--- src/stats.c | 2 +- test/unit/huge.c | 2 +- test/unit/mallctl.c | 4 ++-- 10 files changed, 24 insertions(+), 26 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index a73f0ad..fe322e1 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1059,9 +1059,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", linkend="arena.i.dirty_decay_ms">arena.<i>.dirty_decay_ms for related dynamic control options. See opt.muzzy_decay_ms - for a description of muzzy pages. Note that when the huge_threshold - feature is enabled, the special auto arenas may use its own decay - settings. + for a description of muzzy pages. diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index bcc016e..2bdddb7 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -16,8 +16,8 @@ extern const char *percpu_arena_mode_names[]; extern const uint64_t h_steps[SMOOTHSTEP_NSTEPS]; extern malloc_mutex_t arenas_lock; -extern size_t opt_huge_threshold; -extern size_t huge_threshold; +extern size_t opt_oversize_threshold; +extern size_t oversize_threshold; void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index c7d35b7..b7cdcea 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -24,7 +24,7 @@ arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) { * 1) is using auto arena selection (i.e. arena == NULL), and 2) the * thread is not assigned to a manual arena. */ - if (unlikely(size >= huge_threshold)) { + if (unlikely(size >= oversize_threshold)) { arena_t *tsd_arena = tsd_arena_get(tsd); if (tsd_arena == NULL || arena_is_auto(tsd_arena)) { return arena_choose_huge(tsd); diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h index cf07cc0..8917ea3 100644 --- a/include/jemalloc/internal/arena_types.h +++ b/include/jemalloc/internal/arena_types.h @@ -43,9 +43,9 @@ typedef enum { #define PERCPU_ARENA_DEFAULT percpu_arena_disabled /* - * When allocation_size >= huge_threshold, use the dedicated huge arena (unless - * have explicitly spicified arena index). 0 disables the feature. + * When allocation_size >= oversize_threshold, use the dedicated huge arena + * (unless have explicitly spicified arena index). 0 disables the feature.
*/ -#define HUGE_THRESHOLD_DEFAULT (8 << 20) +#define OVERSIZE_THRESHOLD_DEFAULT (8 << 20) #endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */ diff --git a/src/arena.c b/src/arena.c index 552a0f3..60eac23 100644 --- a/src/arena.c +++ b/src/arena.c @@ -43,8 +43,8 @@ const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { static div_info_t arena_binind_div_info[SC_NBINS]; -size_t opt_huge_threshold = HUGE_THRESHOLD_DEFAULT; -size_t huge_threshold = HUGE_THRESHOLD_DEFAULT; +size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; +size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; static unsigned huge_arena_ind; /******************************************************************************/ @@ -2112,15 +2112,15 @@ arena_init_huge(void) { bool huge_enabled; /* The threshold should be large size class. */ - if (opt_huge_threshold > SC_LARGE_MAXCLASS || - opt_huge_threshold < SC_LARGE_MINCLASS) { - opt_huge_threshold = 0; - huge_threshold = SC_LARGE_MAXCLASS + PAGE; + if (opt_oversize_threshold > SC_LARGE_MAXCLASS || + opt_oversize_threshold < SC_LARGE_MINCLASS) { + opt_oversize_threshold = 0; + oversize_threshold = SC_LARGE_MAXCLASS + PAGE; huge_enabled = false; } else { /* Reserve the index for the huge arena. */ huge_arena_ind = narenas_total_get(); - huge_threshold = opt_huge_threshold; + oversize_threshold = opt_oversize_threshold; huge_enabled = true; } diff --git a/src/ctl.c b/src/ctl.c index 0ec9224..09310a9 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -85,7 +85,7 @@ CTL_PROTO(opt_retain) CTL_PROTO(opt_dss) CTL_PROTO(opt_narenas) CTL_PROTO(opt_percpu_arena) -CTL_PROTO(opt_huge_threshold) +CTL_PROTO(opt_oversize_threshold) CTL_PROTO(opt_background_thread) CTL_PROTO(opt_max_background_threads) CTL_PROTO(opt_dirty_decay_ms) @@ -300,7 +300,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("dss"), CTL(opt_dss)}, {NAME("narenas"), CTL(opt_narenas)}, {NAME("percpu_arena"), CTL(opt_percpu_arena)}, - {NAME("huge_threshold"), CTL(opt_huge_threshold)}, + {NAME("oversize_threshold"), CTL(opt_oversize_threshold)}, {NAME("background_thread"), CTL(opt_background_thread)}, {NAME("max_background_threads"), CTL(opt_max_background_threads)}, {NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)}, @@ -1716,7 +1716,7 @@ CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned) CTL_RO_NL_GEN(opt_percpu_arena, percpu_arena_mode_names[opt_percpu_arena], const char *) -CTL_RO_NL_GEN(opt_huge_threshold, opt_huge_threshold, size_t) +CTL_RO_NL_GEN(opt_oversize_threshold, opt_oversize_threshold, size_t) CTL_RO_NL_GEN(opt_background_thread, opt_background_thread, bool) CTL_RO_NL_GEN(opt_max_background_threads, opt_max_background_threads, size_t) CTL_RO_NL_GEN(opt_dirty_decay_ms, opt_dirty_decay_ms, ssize_t) diff --git a/src/jemalloc.c b/src/jemalloc.c index 6bfc613..855a98b 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1240,15 +1240,15 @@ malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { -1, (sizeof(size_t) << 3) - 1) /* - * The runtime option of huge_threshold remains + * The runtime option of oversize_threshold remains * undocumented. It may be tweaked in the next major * release (6.0). The default value 8M is rather * conservative / safe. Tuning it further down may * improve fragmentation a bit more, but may also cause * contention on the huge arena. 
*/ - CONF_HANDLE_SIZE_T(opt_huge_threshold, - "huge_threshold", SC_LARGE_MINCLASS, + CONF_HANDLE_SIZE_T(opt_oversize_threshold, + "oversize_threshold", SC_LARGE_MINCLASS, SC_LARGE_MAXCLASS, yes, yes, false) CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, diff --git a/src/stats.c b/src/stats.c index 75ccf3b..eb21075 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1028,7 +1028,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_CHAR_P("dss") OPT_WRITE_UNSIGNED("narenas") OPT_WRITE_CHAR_P("percpu_arena") - OPT_WRITE_SIZE_T("huge_threshold") + OPT_WRITE_SIZE_T("oversize_threshold") OPT_WRITE_CHAR_P("metadata_thp") OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread") OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms") diff --git a/test/unit/huge.c b/test/unit/huge.c index 7e54d07..ab72cf0 100644 --- a/test/unit/huge.c +++ b/test/unit/huge.c @@ -1,7 +1,7 @@ #include "test/jemalloc_test.h" /* Threshold: 2 << 20 = 2097152. */ -const char *malloc_conf = "huge_threshold:2097152"; +const char *malloc_conf = "oversize_threshold:2097152"; #define HUGE_SZ (2 << 20) #define SMALL_SZ (8) diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index b8b9340..498f9e0 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -164,7 +164,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(const char *, dss, always); TEST_MALLCTL_OPT(unsigned, narenas, always); TEST_MALLCTL_OPT(const char *, percpu_arena, always); - TEST_MALLCTL_OPT(size_t, huge_threshold, always); + TEST_MALLCTL_OPT(size_t, oversize_threshold, always); TEST_MALLCTL_OPT(bool, background_thread, always); TEST_MALLCTL_OPT(ssize_t, dirty_decay_ms, always); TEST_MALLCTL_OPT(ssize_t, muzzy_decay_ms, always); @@ -342,7 +342,7 @@ TEST_BEGIN(test_thread_arena) { sz = sizeof(unsigned); assert_d_eq(mallctl("arenas.narenas", (void *)&narenas, &sz, NULL, 0), 0, "Unexpected mallctl() failure"); - if (opt_huge_threshold != 0) { + if (opt_oversize_threshold != 0) { narenas--; } assert_u_eq(narenas, opt_narenas, "Number of arenas incorrect"); -- cgit v0.12 From 374dc30d3dc6c5b664fda9b1fa0510559e568b6a Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 24 Jan 2019 16:18:30 -0800 Subject: Update copyright dates. --- COPYING | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/COPYING b/COPYING index 98458d9..3b7fd35 100644 --- a/COPYING +++ b/COPYING @@ -1,10 +1,10 @@ Unless otherwise specified, files in the jemalloc source distribution are subject to the following license: -------------------------------------------------------------------------------- -Copyright (C) 2002-2018 Jason Evans . +Copyright (C) 2002-present Jason Evans . All rights reserved. Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. -Copyright (C) 2009-2018 Facebook, Inc. All rights reserved. +Copyright (C) 2009-present Facebook, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -- cgit v0.12 From b33eb26dee1c161572b209a8fe3f58419ce4874f Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 28 Jan 2019 14:05:20 -0800 Subject: Tweak the spacing for the total_wait_time per second. 
--- src/stats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stats.c b/src/stats.c index eb21075..4c427e0 100644 --- a/src/stats.c +++ b/src/stats.c @@ -130,6 +130,7 @@ mutex_stats_init_cols(emitter_row_t *row, const char *table_name, #undef OP #undef WIDTH_uint32_t #undef WIDTH_uint64_t + col_uint64_t[mutex_counter_total_wait_time_ps].width = 10; } static void -- cgit v0.12 From e13400c919e6b6730284ff011875bbcdd6821f1c Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 22 Jan 2019 13:59:23 -0800 Subject: Sanity check szind on tcache flush. This adds some overhead to the tcache flush path (which is one of the popular paths). Guard it behind a config option. --- configure.ac | 16 +++++++++ .../jemalloc/internal/jemalloc_internal_defs.h.in | 3 ++ src/tcache.c | 42 ++++++++++++++++++++-- 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index c0911db..8049ded 100644 --- a/configure.ac +++ b/configure.ac @@ -1403,6 +1403,22 @@ if test "x$enable_readlinkat" = "x1" ; then fi AC_SUBST([enable_readlinkat]) +dnl Avoid the extra size checking by default +AC_ARG_ENABLE([extra-size-check], + [AS_HELP_STRING([--enable-extra-size-check], + [Perform additonal size related sanity checks])], +[if test "x$enable_extra_size_check" = "xno" ; then + enable_extra_size_check="0" +else + enable_extra_size_check="1" +fi +], +[enable_extra_size_check=="0"] +) +if test "x$enable_extra_size_check" = "x1" ; then + AC_DEFINE([JEMALLOC_EXTRA_SIZE_CHECK], [ ]) +fi +AC_SUBST([enable_extra_size_check]) JE_COMPILABLE([a program using __builtin_unreachable], [ void foo (void) { diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 3e94c02..4f0359a 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -372,4 +372,7 @@ */ #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE +/* Performs additional size-matching sanity checks when defined. */ +#undef JEMALLOC_EXTRA_SIZE_CHECK + #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/src/tcache.c b/src/tcache.c index 9125179..be4fb87 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -100,6 +100,34 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, return ret; } +/* Enabled with --enable-extra-size-check. */ +#ifdef JEMALLOC_EXTRA_SIZE_CHECK +static void +tbin_extents_lookup_size_check(tsdn_t *tsdn, cache_bin_t *tbin, szind_t binind, + size_t nflush, extent_t **extents){ + rtree_ctx_t rtree_ctx_fallback; + rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); + + /* + * Verify that the items in the tcache all have the correct size; this + * is useful for catching sized deallocation bugs, also to fail early + * instead of corrupting metadata. Since this can be turned on for opt + * builds, avoid the branch in the loop. 
+ */ + szind_t szind; + size_t sz_sum = binind * nflush; + for (unsigned i = 0 ; i < nflush; i++) { + rtree_extent_szind_read(tsdn, &extents_rtree, + rtree_ctx, (uintptr_t)*(tbin->avail - 1 - i), true, + &extents[i], &szind); + sz_sum -= szind; + } + if (sz_sum != 0) { + abort(); + } +} +#endif + void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, szind_t binind, unsigned rem) { @@ -112,11 +140,16 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, assert(arena != NULL); unsigned nflush = tbin->ncached - rem; VARIABLE_ARRAY(extent_t *, item_extent, nflush); + +#ifndef JEMALLOC_EXTRA_SIZE_CHECK /* Look up extent once per item. */ for (unsigned i = 0 ; i < nflush; i++) { item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i)); } - +#else + tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind, nflush, + item_extent); +#endif while (nflush > 0) { /* Lock the arena bin associated with the first object. */ extent_t *extent = item_extent[0]; @@ -202,11 +235,16 @@ tcache_bin_flush_large(tsd_t *tsd, cache_bin_t *tbin, szind_t binind, assert(tcache_arena != NULL); unsigned nflush = tbin->ncached - rem; VARIABLE_ARRAY(extent_t *, item_extent, nflush); + +#ifndef JEMALLOC_EXTRA_SIZE_CHECK /* Look up extent once per item. */ for (unsigned i = 0 ; i < nflush; i++) { item_extent[i] = iealloc(tsd_tsdn(tsd), *(tbin->avail - 1 - i)); } - +#else + tbin_extents_lookup_size_check(tsd_tsdn(tsd), tbin, binind, nflush, + item_extent); +#endif while (nflush > 0) { /* Lock the arena associated with the first object. */ extent_t *extent = item_extent[0]; -- cgit v0.12 From 8e9a613122251d4c519059f8e1e11f27f6572b4c Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 28 Jan 2019 15:25:30 -0800 Subject: Disable muzzy decay by default. --- include/jemalloc/internal/arena_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h index 8917ea3..624937e 100644 --- a/include/jemalloc/internal/arena_types.h +++ b/include/jemalloc/internal/arena_types.h @@ -9,7 +9,7 @@ /* Default decay times in milliseconds. */ #define DIRTY_DECAY_MS_DEFAULT ZD(10 * 1000) -#define MUZZY_DECAY_MS_DEFAULT ZD(10 * 1000) +#define MUZZY_DECAY_MS_DEFAULT (0) /* Number of event ticks between time checks. */ #define DECAY_NTICKS_PER_UPDATE 1000 -- cgit v0.12 From 1f55a15467357bb559701687dbef1be84047ddfe Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 7 Jun 2018 12:27:19 -0700 Subject: Add configure option --disable-libdl. This makes it possible to build a fully static binary. --- INSTALL.md | 5 +++++ configure.ac | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 18cf288..b8f729b 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -269,6 +269,11 @@ any of the following arguments (not a definitive list) to 'configure': in the same process, which will almost certainly result in confusing runtime crashes if pointers leak from one implementation to the other. +* `--disable-libdl` + + Disable the usage of libdl, namely dlsym(3) which is required by the lazy + lock option. This can allow building static binaries.
+ The following environment variables (not a definitive list) impact configure's behavior: diff --git a/configure.ac b/configure.ac index 8049ded..fd468df 100644 --- a/configure.ac +++ b/configure.ac @@ -1588,6 +1588,21 @@ fi AC_DEFINE_UNQUOTED([LG_HUGEPAGE], [${je_cv_lg_hugepage}]) dnl ============================================================================ +dnl Enable libdl by default. +AC_ARG_ENABLE([libdl], + [AS_HELP_STRING([--disable-libdl], + [Do not use libdl])], +[if test "x$enable_libdl" = "xno" ; then + enable_libdl="0" +else + enable_libdl="1" +fi +], +[enable_libdl="1"] +) +AC_SUBST([libdl]) + +dnl ============================================================================ dnl Configure pthreads. if test "x$abi" != "xpecoff" ; then @@ -1600,15 +1615,21 @@ if test "x$abi" != "xpecoff" ; then AC_MSG_ERROR([libpthread is missing]))]) wrap_syms="${wrap_syms} pthread_create" have_pthread="1" - dnl Check if we have dlsym support. - have_dlsym="1" - AC_CHECK_HEADERS([dlfcn.h], - AC_CHECK_FUNC([dlsym], [], - [AC_CHECK_LIB([dl], [dlsym], [LIBS="$LIBS -ldl"], [have_dlsym="0"])]), - [have_dlsym="0"]) - if test "x$have_dlsym" = "x1" ; then - AC_DEFINE([JEMALLOC_HAVE_DLSYM], [ ]) + +dnl Check if we have dlsym support. + if test "x$enable_libdl" = "x1" ; then + have_dlsym="1" + AC_CHECK_HEADERS([dlfcn.h], + AC_CHECK_FUNC([dlsym], [], + [AC_CHECK_LIB([dl], [dlsym], [LIBS="$LIBS -ldl"], [have_dlsym="0"])]), + [have_dlsym="0"]) + if test "x$have_dlsym" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_DLSYM], [ ]) + fi + else + have_dlsym="0" fi + JE_COMPILABLE([pthread_atfork(3)], [ #include <pthread.h> ], [ -- cgit v0.12 From 2db2d2ef5e1cf2eb2c0de362c916d0f7a2f1a9ef Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 7 Jun 2018 12:28:40 -0700 Subject: Make background_thread not dependent on libdl. When libdl is not used, background_thread can still be enabled. --- configure.ac | 3 +-- src/background_thread.c | 9 ++++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index fd468df..a668e67 100644 --- a/configure.ac +++ b/configure.ac @@ -2130,8 +2130,7 @@ fi dnl ============================================================================ dnl Enable background threads if possible. -if test "x${have_pthread}" = "x1" -a "x${have_dlsym}" = "x1" \ - -a "x${je_cv_os_unfair_lock}" != "xyes" ; then +if test "x${have_pthread}" = "x1" -a "x${je_cv_os_unfair_lock}" != "xyes" ; then AC_DEFINE([JEMALLOC_BACKGROUND_THREAD]) fi diff --git a/src/background_thread.c b/src/background_thread.c index acf8083..5ed6c1c 100644 --- a/src/background_thread.c +++ b/src/background_thread.c @@ -27,7 +27,6 @@ background_thread_info_t *background_thread_info; /******************************************************************************/ #ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER -#include <dlfcn.h> static int (*pthread_create_fptr)(pthread_t *__restrict, const pthread_attr_t *, void *(*)(void *), void *__restrict); @@ -820,6 +819,10 @@ background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) { #undef BILLION #undef BACKGROUND_THREAD_MIN_INTERVAL_NS +#ifdef JEMALLOC_HAVE_DLSYM +#include <dlfcn.h> +#endif + static bool pthread_create_fptr_init(void) { if (pthread_create_fptr != NULL) { @@ -830,7 +833,11 @@ pthread_create_fptr_init(void) { * wrapper for pthread_create; and 2) application may define its own * wrapper as well (and can call malloc within the wrapper).
*/ +#ifdef JEMALLOC_HAVE_DLSYM pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create"); +#else + pthread_create_fptr = NULL; +#endif if (pthread_create_fptr == NULL) { if (config_lazy_lock) { malloc_write("<jemalloc>: Error in dlsym(RTLD_NEXT, " -- cgit v0.12 From 23b15e764b3d87c8e69a348d60d13e7e44f137b5 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Wed, 6 Feb 2019 13:36:56 -0800 Subject: Add --disable-libdl to travis. --- .travis.yml | 23 +++++++++++++++++++++++ scripts/gen_travis.py | 1 + 2 files changed, 24 insertions(+) diff --git a/.travis.yml b/.travis.yml index 38e6655..40b2eb5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,6 +22,8 @@ matrix: - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds" @@ -38,6 +40,8 @@ matrix: - os: osx env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: osx + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: osx env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=clang CXX=clang++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="" EXTRA_CFLAGS="-Werror -Wno-array-bounds" @@ -49,6 +53,8 @@ matrix: - os: linux env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux + env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: linux env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=clang CXX=clang++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds" @@ -66,6 +72,9 @@ matrix: env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds" addons: *gcc_multilib - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + addons: *gcc_multilib + - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" addons: *gcc_multilib - os: linux @@ -82,6 +91,8 @@ matrix: - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds" @@ -92,6 +103,8 @@ - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --disable-stats"
EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds" @@ -100,6 +113,8 @@ matrix: - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds" @@ -108,6 +123,14 @@ matrix: - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: linux + env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds" + - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false,dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds" - os: linux env: CC=gcc CXX=g++ COMPILER_FLAGS="" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false,percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds" diff --git a/scripts/gen_travis.py b/scripts/gen_travis.py index e92660f..65b0b67 100755 --- a/scripts/gen_travis.py +++ b/scripts/gen_travis.py @@ -45,6 +45,7 @@ configure_flag_unusuals = [ '--enable-debug', '--enable-prof', '--disable-stats', + '--disable-libdl', ] malloc_conf_unusuals = [ -- cgit v0.12 From 9015deb126d7b2b90ef822cf0183f96abb9b97f9 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 7 Feb 2019 18:47:49 -0800 Subject: Add build_doc by default. However, skip building the docs (and output warnings) if XML support is missing. This allows `make install` to succeed w/o `make dist`. 
--- Makefile.in | 17 ++++++++++++++++- configure.ac | 3 +++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Makefile.in b/Makefile.in index 2d59e59..0777f6a 100644 --- a/Makefile.in +++ b/Makefile.in @@ -47,6 +47,7 @@ REV := @rev@ install_suffix := @install_suffix@ ABI := @abi@ XSLTPROC := @XSLTPROC@ +XSLROOT := @XSLROOT@ AUTOCONF := @AUTOCONF@ _RPATH = @RPATH@ RPATH = $(if $(1),$(call _RPATH,$(1))) @@ -294,10 +295,24 @@ all: build_lib dist: build_doc $(objroot)doc/%.html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl +ifneq ($(XSLROOT),) $(XSLTPROC) -o $@ $(objroot)doc/html.xsl $< +else +ifeq ($(wildcard $(DOCS_HTML)),) + @echo "Missing xsltproc. Doc not built." > $@ +endif + @echo "Missing xsltproc. "$@" not (re)built." +endif $(objroot)doc/%.3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl +ifneq ($(XSLROOT),) $(XSLTPROC) -o $@ $(objroot)doc/manpages.xsl $< +else +ifeq ($(wildcard $(DOCS_MAN3)),) + @echo "Missing xsltproc. Doc not built." > $@ +endif + @echo "Missing xsltproc. "$@" not (re)built." +endif build_doc_html: $(DOCS_HTML) build_doc_man: $(DOCS_MAN3) @@ -496,7 +511,7 @@ install_doc_man: $(INSTALL) -m 644 $$d $(MANDIR)/man3; \ done -install_doc: install_doc_html install_doc_man +install_doc: build_doc install_doc_html install_doc_man install: install_bin install_include install_lib install_doc diff --git a/configure.ac b/configure.ac index a668e67..9b00bbf 100644 --- a/configure.ac +++ b/configure.ac @@ -175,6 +175,9 @@ fi ], XSLROOT="${DEFAULT_XSLROOT}" ) +if test "x$XSLTPROC" = "xfalse" ; then + XSLROOT="" +fi AC_SUBST([XSLROOT]) dnl If CFLAGS isn't defined, set CFLAGS to something reasonable. Otherwise, -- cgit v0.12 From dca7060d5e49b8a07179a1f13bf39f6d30e709c8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 20 Feb 2019 18:45:23 -0800 Subject: Avoid redefining tsd_t. This fixes a build failure when integrating with FreeBSD's libc. This regression was introduced by d1e11d48d4c706e17ef3508e2ddb910f109b779f (Move tsd link and in_hook after tcache.). --- include/jemalloc/internal/tcache_structs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h index 2708703..172ef90 100644 --- a/include/jemalloc/internal/tcache_structs.h +++ b/include/jemalloc/internal/tcache_structs.h @@ -5,9 +5,9 @@ #include "jemalloc/internal/ql.h" #include "jemalloc/internal/sc.h" #include "jemalloc/internal/ticker.h" +#include "jemalloc/internal/tsd_types.h" /* Various uses of this struct need it to be a named type. */ -typedef struct tsd_s tsd_t; typedef ql_elm(tsd_t) tsd_link_t; struct tcache_s { -- cgit v0.12 From 18450d0abe36757fe6e4eb08f6b15f8ce943f9cb Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 19 Feb 2019 15:58:13 -0800 Subject: Guard libgcc unwind init with opt_prof. Only triggers libgcc unwind init when prof is enabled. This helps work around some bootstrapping issues. --- src/prof.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/prof.c b/src/prof.c index 71de2d3..296de52 100644 --- a/src/prof.c +++ b/src/prof.c @@ -3067,16 +3067,14 @@ prof_boot2(tsd_t *tsd) { return true; } } - } - #ifdef JEMALLOC_PROF_LIBGCC - /* - * Cause the backtracing machinery to allocate its internal state - * before enabling profiling. - */ - _Unwind_Backtrace(prof_unwind_init_callback, NULL); + /* + * Cause the backtracing machinery to allocate its internal + * state before enabling profiling. + */ + _Unwind_Backtrace(prof_unwind_init_callback, NULL); #endif - + } prof_booted = true; return false; -- cgit v0.12 From cbdb1807cea6828d0f61e1a0516613efc3e7189e Mon Sep 17 00:00:00 2001 From: Dave Rigby Date: Fri, 22 Feb 2019 19:00:46 +0000 Subject: Stringify tls_callback linker directive Proposed fix for #1444 - ensure that `tls_callback` in the `#pragma comment(linker)` directive gets the same prefix added as it does in the C declaration.
--- src/tsd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tsd.c b/src/tsd.c index f317d48..2eceed9 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -472,7 +472,7 @@ _tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) { # pragma comment(linker, "/INCLUDE:_tls_callback") # else # pragma comment(linker, "/INCLUDE:_tls_used") -# pragma comment(linker, "/INCLUDE:tls_callback") +# pragma comment(linker, "/INCLUDE:" STRINGIFY(tls_callback) ) # endif # pragma section(".CRT$XLY",long,read) #endif -- cgit v0.12 From 775fe302a75c4770edd9708e7348e626c96dfe58 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 22 Feb 2019 11:10:11 -0800 Subject: Remove JE_FORCE_SYNC_COMPARE_AND_SWAP_[48]. These macros have been unused since d4ac7582f32f506d5203bea2f0115076202add38 (Introduce a backport of C11 atomics). --- configure.ac | 34 ---------------------- .../jemalloc/internal/jemalloc_internal_defs.h.in | 16 ---------- 2 files changed, 50 deletions(-) diff --git a/configure.ac b/configure.ac index 9b00bbf..4dafed5 100644 --- a/configure.ac +++ b/configure.ac @@ -2006,40 +2006,6 @@ esac fi dnl ============================================================================ -dnl Check whether __sync_{add,sub}_and_fetch() are available despite -dnl __GCC_HAVE_SYNC_COMPARE_AND_SWAP_n macros being undefined. - -AC_DEFUN([JE_SYNC_COMPARE_AND_SWAP_CHECK],[ - AC_CACHE_CHECK([whether to force $1-bit __sync_{add,sub}_and_fetch()], - [je_cv_sync_compare_and_swap_$2], - [AC_LINK_IFELSE([AC_LANG_PROGRAM([ - #include <stdint.h> - ], - [ - #ifndef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_$2 - { - uint$1_t x$1 = 0; - __sync_add_and_fetch(&x$1, 42); - __sync_sub_and_fetch(&x$1, 1); - } - #else - #error __GCC_HAVE_SYNC_COMPARE_AND_SWAP_$2 is defined, no need to force - #endif - ])], - [je_cv_sync_compare_and_swap_$2=yes], - [je_cv_sync_compare_and_swap_$2=no])]) - - if test "x${je_cv_sync_compare_and_swap_$2}" = "xyes" ; then - AC_DEFINE([JE_FORCE_SYNC_COMPARE_AND_SWAP_$2], [ ]) - fi -]) - -if test "x${je_cv_atomic9}" != "xyes" -a "x${je_cv_osatomic}" != "xyes" ; then - JE_SYNC_COMPARE_AND_SWAP_CHECK(32, 4) - JE_SYNC_COMPARE_AND_SWAP_CHECK(64, 8) -fi - -dnl ============================================================================ dnl Check for __builtin_clz() and __builtin_clzl(). AC_CACHE_CHECK([for __builtin_clz], diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 4f0359a..7914b2f 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -53,22 +53,6 @@ #undef JEMALLOC_GCC_SYNC_ATOMICS /* - * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and - * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the - * functions are defined in libgcc instead of being inlines). - */ -#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4 - -/* - * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and - * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite - * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the - * functions are defined in libgcc instead of being inlines). - */ -#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8 - -/* * Defined if __builtin_clz() and __builtin_clzl() are available.
*/ #undef JEMALLOC_HAVE_BUILTIN_CLZ -- cgit v0.12 From ac24ffb21e28ba1ed86250fa6a6dcaf02b43f7da Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 22 Feb 2019 13:00:14 -0800 Subject: Fix a syntax error in configure.ac Introduced in e13400c919e6b6730284ff011875bbcdd6821f1c. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 4dafed5..afaaf5d 100644 --- a/configure.ac +++ b/configure.ac @@ -1416,7 +1416,7 @@ else enable_extra_size_check="1" fi ], -[enable_extra_size_check=="0"] +[enable_extra_size_check="0"] ) if test "x$enable_extra_size_check" = "x1" ; then AC_DEFINE([JEMALLOC_EXTRA_SIZE_CHECK], [ ]) -- cgit v0.12 From 14d3686c9f3ed28f1ef4c9ec5f7bde945473194b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 9 Mar 2019 10:51:03 -0800 Subject: Do not use #pragma GCC diagnostic with gcc < 4.6. This regression was introduced by 3d29d11ac2c1583b9959f73c0548545018d31c8a (Clean compilation -Wextra). --- .../jemalloc/internal/jemalloc_internal_macros.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal_macros.h b/include/jemalloc/internal/jemalloc_internal_macros.h index ec8782e..d8ea06f 100644 --- a/include/jemalloc/internal/jemalloc_internal_macros.h +++ b/include/jemalloc/internal/jemalloc_internal_macros.h @@ -30,7 +30,7 @@ # define restrict #endif -/* Various function pointers are statick and immutable except during testing. */ +/* Various function pointers are static and immutable except during testing. */ #ifdef JEMALLOC_JET # define JET_MUTABLE #else @@ -47,7 +47,6 @@ #define JEMALLOC_FALLTHROUGH /* falls through */ #endif - /* Diagnostic suppression macros */ #if defined(_MSC_VER) && !defined(__clang__) # define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push)) @@ -57,7 +56,9 @@ # define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS # define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN # define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS -#elif defined(__GNUC__) || defined(__clang__) +/* #pragma GCC diagnostic first appeared in gcc 4.6. */ +#elif (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && \ + (__GNUC_MINOR__ > 5)))) || defined(__clang__) /* * The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang * diagnostic suppression macros and should not be used anywhere else. @@ -65,14 +66,16 @@ # define JEMALLOC_PRAGMA__(X) _Pragma(#X) # define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push) # define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop) -# define JEMALLOC_DIAGNOSTIC_IGNORE(W) JEMALLOC_PRAGMA__(GCC diagnostic ignored W) +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) \ + JEMALLOC_PRAGMA__(GCC diagnostic ignored W) /* * The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and - * all clang versions up to version 7 (currently trunk, unreleased). - * This macro suppresses the warning for the affected compiler versions only. + * all clang versions up to version 7 (currently trunk, unreleased). This macro + * suppresses the warning for the affected compiler versions only. 
*/ -# if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || defined(__clang__) +# if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || \ + defined(__clang__) # define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \ JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers") # else @@ -103,9 +106,8 @@ #endif /* - * Disables spurious diagnostics for all headers - * Since these headers are not included by users directly, - * it does not affect their diagnostic settings. + * Disables spurious diagnostics for all headers. Since these headers are not + * included by users directly, it does not affect their diagnostic settings. */ JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS -- cgit v0.12 From 06f0850427e26cb24950de60bbe70bc192ffce6a Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 7 Mar 2019 15:58:26 -0800 Subject: Detect if 8-bit atomics are available. In some rare cases (older compiler, e.g. gcc 4.2 w/ MIPS), 8-bit atomics might be unavailable. Detect such cases so that we can workaround. --- configure.ac | 25 ++++++++++++++++++++++ include/jemalloc/internal/atomic.h | 7 ++++++ .../jemalloc/internal/jemalloc_internal_defs.h.in | 7 ++++++ 3 files changed, 39 insertions(+) diff --git a/configure.ac b/configure.ac index afaaf5d..96f76d3 100644 --- a/configure.ac +++ b/configure.ac @@ -1896,6 +1896,19 @@ JE_COMPILABLE([GCC __atomic atomics], [ ], [je_cv_gcc_atomic_atomics]) if test "x${je_cv_gcc_atomic_atomics}" = "xyes" ; then AC_DEFINE([JEMALLOC_GCC_ATOMIC_ATOMICS]) + + dnl check for 8-bit atomic support + JE_COMPILABLE([GCC 8-bit __atomic atomics], [ + ], [ + unsigned char x = 0; + int val = 1; + int y = __atomic_fetch_add(&x, val, __ATOMIC_RELAXED); + int after_add = (int)x; + return after_add == 1; + ], [je_cv_gcc_u8_atomic_atomics]) + if test "x${je_cv_gcc_u8_atomic_atomics}" = "xyes" ; then + AC_DEFINE([JEMALLOC_GCC_U8_ATOMIC_ATOMICS]) + fi fi dnl ============================================================================ @@ -1910,6 +1923,18 @@ JE_COMPILABLE([GCC __sync atomics], [ ], [je_cv_gcc_sync_atomics]) if test "x${je_cv_gcc_sync_atomics}" = "xyes" ; then AC_DEFINE([JEMALLOC_GCC_SYNC_ATOMICS]) + + dnl check for 8-bit atomic support + JE_COMPILABLE([GCC 8-bit __sync atomics], [ + ], [ + unsigned char x = 0; + int before_add = __sync_fetch_and_add(&x, 1); + int after_add = (int)x; + return (before_add == 0) && (after_add == 1); + ], [je_cv_gcc_u8_sync_atomics]) + if test "x${je_cv_gcc_u8_sync_atomics}" = "xyes" ; then + AC_DEFINE([JEMALLOC_GCC_U8_SYNC_ATOMICS]) + fi fi dnl ============================================================================ diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index bb751cf..a76f54c 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -3,10 +3,17 @@ #define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE +#define JEMALLOC_U8_ATOMICS #if defined(JEMALLOC_GCC_ATOMIC_ATOMICS) # include "jemalloc/internal/atomic_gcc_atomic.h" +# if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS) +# undef JEMALLOC_U8_ATOMICS +# endif #elif defined(JEMALLOC_GCC_SYNC_ATOMICS) # include "jemalloc/internal/atomic_gcc_sync.h" +# if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS) +# undef JEMALLOC_U8_ATOMICS +# endif #elif defined(_MSC_VER) # include "jemalloc/internal/atomic_msvc.h" #elif defined(JEMALLOC_C11_ATOMICS) diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 7914b2f..05016b1 100644 --- 
a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -48,9 +48,16 @@ /* Defined if GCC __atomic atomics are available. */ #undef JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#undef JEMALLOC_GCC_U8_ATOMIC_ATOMICS /* Defined if GCC __sync atomics are available. */ #undef JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#undef JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* Defined if 8-bit atomics are supported. */ + /* * Defined if __builtin_clz() and __builtin_clzl() are available. -- cgit v0.12 From b804d0f019df87d8cc96e3c812e98793256cb418 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 7 Mar 2019 16:01:55 -0800 Subject: Fallback to 32-bit when 8-bit atomics are missing for TSD. When that happens, it might cause a slowdown in fast-path operations; however, such cases are very rare. --- include/jemalloc/internal/tsd.h | 19 +++++++++++++++++-- src/tsd.c | 13 +++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 00a9500..9ba2600 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -169,6 +169,18 @@ enum { */ #define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n +#ifdef JEMALLOC_U8_ATOMICS +# define tsd_state_t atomic_u8_t +# define tsd_atomic_load atomic_load_u8 +# define tsd_atomic_store atomic_store_u8 +# define tsd_atomic_exchange atomic_exchange_u8 +#else +# define tsd_state_t atomic_u32_t +# define tsd_atomic_load atomic_load_u32 +# define tsd_atomic_store atomic_store_u32 +# define tsd_atomic_exchange atomic_exchange_u32 +#endif + /* The actual tsd. */ struct tsd_s { /* @@ -177,8 +189,11 @@ struct tsd_s { * setters below. */ - /* We manually limit the state to just a single byte. */ - atomic_u8_t state; + /* + * We manually limit the state to just a single byte. Unless the 8-bit + * atomics are unavailable (which is rare). + */ + tsd_state_t state; #define O(n, t, nt) \ t TSD_MANGLE(n); MALLOC_TSD diff --git a/src/tsd.c b/src/tsd.c index 2eceed9..d5fb4d6 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -113,9 +113,9 @@ tsd_force_recompute(tsdn_t *tsdn) { malloc_mutex_lock(tsdn, &tsd_nominal_tsds_lock); tsd_t *remote_tsd; ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tcache).tsd_link) { - assert(atomic_load_u8(&remote_tsd->state, ATOMIC_RELAXED) + assert(tsd_atomic_load(&remote_tsd->state, ATOMIC_RELAXED) <= tsd_state_nominal_max); - atomic_store_u8(&remote_tsd->state, tsd_state_nominal_recompute, + tsd_atomic_store(&remote_tsd->state, tsd_state_nominal_recompute, ATOMIC_RELAXED); } malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock); @@ -172,7 +172,7 @@ tsd_slow_update(tsd_t *tsd) { uint8_t old_state; do { uint8_t new_state = tsd_state_compute(tsd); - old_state = atomic_exchange_u8(&tsd->state, new_state, + old_state = tsd_atomic_exchange(&tsd->state, new_state, ATOMIC_ACQUIRE); } while (old_state == tsd_state_nominal_recompute); } @@ -181,14 +181,14 @@ void tsd_state_set(tsd_t *tsd, uint8_t new_state) { /* Only the tsd module can change the state *to* recompute. */ assert(new_state != tsd_state_nominal_recompute); - uint8_t old_state = atomic_load_u8(&tsd->state, ATOMIC_RELAXED); + uint8_t old_state = tsd_atomic_load(&tsd->state, ATOMIC_RELAXED); if (old_state > tsd_state_nominal_max) { /* * Not currently in the nominal list, but it might need to be * inserted there.
*/ assert(!tsd_in_nominal_list(tsd)); - atomic_store_u8(&tsd->state, new_state, ATOMIC_RELAXED); + tsd_atomic_store(&tsd->state, new_state, ATOMIC_RELAXED); if (new_state <= tsd_state_nominal_max) { tsd_add_nominal(tsd); } @@ -201,7 +201,8 @@ tsd_state_set(tsd_t *tsd, uint8_t new_state) { assert(tsd_in_nominal_list(tsd)); if (new_state > tsd_state_nominal_max) { tsd_remove_nominal(tsd); - atomic_store_u8(&tsd->state, new_state, ATOMIC_RELAXED); + tsd_atomic_store(&tsd->state, new_state, + ATOMIC_RELAXED); } else { /* * This is the tricky case. We're transitioning from -- cgit v0.12 From f6c30cbafab1a841dd08f00541ed9651054bbe4a Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Mon, 11 Mar 2019 13:17:20 -0700 Subject: Remove some unused comments. --- include/jemalloc/internal/jemalloc_internal_defs.h.in | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 05016b1..21b6514 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -56,9 +56,6 @@ /* and the 8-bit variant support. */ #undef JEMALLOC_GCC_U8_SYNC_ATOMICS -/* Defined if 8-bit atomics are supported. */ - - /* * Defined if __builtin_clz() and __builtin_clzl() are available. */ -- cgit v0.12 From fb56766ca9b398d07e2def5ead75a021fc08da03 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 12 Mar 2019 15:02:41 -0700 Subject: Eagerly purge oversized merged extents. This change improves memory usage slightly, at virtually no CPU cost. --- include/jemalloc/internal/arena_inlines_b.h | 20 ++++++++++++++++++++ src/extent.c | 7 +++++++ test/unit/decay.c | 12 +++++++++--- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index b7cdcea..614dedd 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -135,6 +135,26 @@ arena_decay_tick(tsdn_t *tsdn, arena_t *arena) { arena_decay_ticks(tsdn, arena, 1); } +/* Purge a single extent to retained / unmapped directly. */ +JEMALLOC_ALWAYS_INLINE void +arena_decay_extent(tsdn_t *tsdn,arena_t *arena, extent_hooks_t **r_extent_hooks, + extent_t *extent) { + size_t extent_size = extent_size_get(extent); + extent_dalloc_wrapper(tsdn, arena, + r_extent_hooks, extent); + if (config_stats) { + /* Update stats accordingly. */ + arena_stats_lock(tsdn, &arena->stats); + arena_stats_add_u64(tsdn, &arena->stats, + &arena->decay_dirty.stats->nmadvise, 1); + arena_stats_add_u64(tsdn, &arena->stats, + &arena->decay_dirty.stats->purged, extent_size >> LG_PAGE); + arena_stats_sub_zu(tsdn, &arena->stats, &arena->stats.mapped, + extent_size); + arena_stats_unlock(tsdn, &arena->stats); + } +} + JEMALLOC_ALWAYS_INLINE void * arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero, tcache_t *tcache, bool slow_path) { diff --git a/src/extent.c b/src/extent.c index fd6c837..3396a9d 100644 --- a/src/extent.c +++ b/src/extent.c @@ -1708,6 +1708,7 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, extent = extent_try_coalesce(tsdn, arena, r_extent_hooks, rtree_ctx, extents, extent, NULL, growing_retained); } else if (extent_size_get(extent) >= SC_LARGE_MINCLASS) { + assert(extents == &arena->extents_dirty); /* Always coalesce large extents eagerly. 
*/ bool coalesced; do { @@ -1716,6 +1717,12 @@ extent_record(tsdn_t *tsdn, arena_t *arena, extent_hooks_t **r_extent_hooks, r_extent_hooks, rtree_ctx, extents, extent, &coalesced, growing_retained); } while (coalesced); + if (extent_size_get(extent) >= oversize_threshold) { + /* Shortcut to purge the oversize extent eagerly. */ + malloc_mutex_unlock(tsdn, &extents->mtx); + arena_decay_extent(tsdn, arena, r_extent_hooks, extent); + return; + } } extent_deactivate_locked(tsdn, arena, extents, extent); diff --git a/test/unit/decay.c b/test/unit/decay.c index f727bf9..cf3c079 100644 --- a/test/unit/decay.c +++ b/test/unit/decay.c @@ -122,6 +122,12 @@ get_arena_dirty_npurge(unsigned arena_ind) { } static uint64_t +get_arena_dirty_purged(unsigned arena_ind) { + do_epoch(); + return get_arena_npurge_impl("stats.arenas.0.dirty_purged", arena_ind); +} + +static uint64_t get_arena_muzzy_npurge(unsigned arena_ind) { do_epoch(); return get_arena_npurge_impl("stats.arenas.0.muzzy_npurge", arena_ind); @@ -559,7 +565,7 @@ TEST_BEGIN(test_decay_now) { TEST_END TEST_BEGIN(test_decay_never) { - test_skip_if(check_background_thread_enabled()); + test_skip_if(check_background_thread_enabled() || !config_stats); unsigned arena_ind = do_arena_create(-1, -1); int flags = MALLOCX_ARENA(arena_ind) | MALLOCX_TCACHE_NONE; @@ -579,8 +585,8 @@ TEST_BEGIN(test_decay_never) { dallocx(ptrs[i], flags); size_t pdirty = get_arena_pdirty(arena_ind); size_t pmuzzy = get_arena_pmuzzy(arena_ind); - assert_zu_gt(pdirty, pdirty_prev, - "Expected dirty pages to increase."); + assert_zu_gt(pdirty + (size_t)get_arena_dirty_purged(arena_ind), + pdirty_prev, "Expected dirty pages to increase."); assert_zu_eq(pmuzzy, 0, "Unexpected muzzy pages"); pdirty_prev = pdirty; } -- cgit v0.12 From a4d017f5e5aea12b745e67679ba40753f6d7a778 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 21 Mar 2019 22:21:43 -0700 Subject: Output message before aborting on tcache size-matching check. --- src/tcache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tcache.c b/src/tcache.c index be4fb87..e7b970d 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -123,6 +123,9 @@ tbin_extents_lookup_size_check(tsdn_t *tsdn, cache_bin_t *tbin, szind_t binind, sz_sum -= szind; } if (sz_sum != 0) { + malloc_printf("<jemalloc>: size mismatch in thread cache " + "detected, likely caused by sized deallocation bugs by " + "application. Abort.\n"); abort(); } } -- cgit v0.12 From 788a657cee745c1f827ddf1db50d580bd5e4347b Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Wed, 27 Mar 2019 21:47:20 -0700 Subject: Allow low values of oversize_threshold to disable the feature. We should allow a way to easily disable the feature (e.g. not reserving the arena id at all). --- src/jemalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index 855a98b..bb70395 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1248,8 +1248,8 @@ malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { * contention on the huge arena. */ CONF_HANDLE_SIZE_T(opt_oversize_threshold, - "oversize_threshold", SC_LARGE_MINCLASS, - SC_LARGE_MAXCLASS, yes, yes, false) + "oversize_threshold", 0, SC_LARGE_MAXCLASS, no, yes, + false) CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, (sizeof(size_t) << 3), yes, yes, false) -- cgit v0.12 From ce03e4c7b8ddeaec5e72c8fb160e378f418ed651 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Thu, 28 Mar 2019 20:30:56 -0700 Subject: Document opt.oversize_threshold.
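For reference, once documented below, the option can be read back at runtime through the usual mallctl interface. A minimal sketch, assuming the standard unprefixed jemalloc API (not part of the patch itself):

    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int main(void) {
    	size_t thresh;
    	size_t sz = sizeof(thresh);
    	/* opt.* values are read-only snapshots of the boot-time options. */
    	if (mallctl("opt.oversize_threshold", &thresh, &sz, NULL, 0) == 0) {
    		printf("oversize_threshold: %zu bytes\n", thresh);
    	}
    	return 0;
    }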
--- doc/jemalloc.xml.in | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index fe322e1..4acc212 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -992,6 +992,24 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", number of CPUs, or one if there is a single CPU.
+ + opt.oversize_threshold + (size_t) + r- + + The threshold in bytes above which requests are considered + oversize. Allocation requests with greater sizes are fulfilled from a + dedicated arena (automatically managed, however not within + narenas), in order to reduce fragmentation by not + mixing huge allocations with small ones. In addition, the reserved + special arena may have its own default decay settings. Note that + requests with arena index specified via + MALLOCX_ARENA, or threads associated with explicit + arenas will not be considered. The default threshold is 8MiB. Values + not within large size classes disable this feature. + + opt.percpu_arena @@ -1013,7 +1031,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", opt.background_thread - (const bool) + (bool) r- Internal background worker threads enabled/disabled. @@ -1028,7 +1046,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", opt.max_background_threads - (const size_t) + (size_t) r- Maximum number of background threads that will be created @@ -1059,7 +1077,11 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", linkend="arena.i.dirty_decay_ms">arena.<i>.dirty_decay_ms for related dynamic control options. See opt.muzzy_decay_ms - for a description of muzzy pages. + for a description of muzzy pages. Note + that when the oversize_threshold + feature is enabled, the arenas reserved for oversize requests may have + its own default decay settings. -- cgit v0.12 From 59d98919482b2a101c4092428a4c0092abb797a1 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 29 Mar 2019 13:27:20 -0700 Subject: Add the missing unlock in the error path of extent_register. --- src/extent.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/extent.c b/src/extent.c index 3396a9d..62086c7 100644 --- a/src/extent.c +++ b/src/extent.c @@ -796,6 +796,7 @@ extent_register_impl(tsdn_t *tsdn, extent_t *extent, bool gdump_add) { if (extent_rtree_leaf_elms_lookup(tsdn, rtree_ctx, extent, false, true, &elm_a, &elm_b)) { + extent_unlock(tsdn, extent); return true; } -- cgit v0.12 From 0101d5ebef7230ef5aa1597be425e2a60e92f348 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Fri, 29 Mar 2019 13:31:02 -0700 Subject: Avoid check_min for opt_lg_extent_max_active_fit. This fixes a compiler warning. --- src/jemalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jemalloc.c b/src/jemalloc.c index bb70395..c8afa9c 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1252,7 +1252,7 @@ malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { false) CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, - (sizeof(size_t) << 3), yes, yes, false) + (sizeof(size_t) << 3), no, yes, false) if (strncmp("percpu_arena", k, klen) == 0) { bool match = false; -- cgit v0.12 From 064d6e570e7073096471413f6a5159541478eb01 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Sun, 31 Mar 2019 17:45:22 -0700 Subject: Tweak the wording about oversize_threshold. --- doc/jemalloc.xml.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 4acc212..fd0edb3 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1002,9 +1002,9 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay", oversize. Allocation requests with greater sizes are fulfilled from a dedicated arena (automatically managed, however not within narenas), in order to reduce fragmentation by not - mixing huge allocations with small ones.
In addition, the reserved - special arena may have its own default decay settings. Note that - requests with arena index specified via + mixing huge allocations with small ones. In addition, the decay API + guarantees on the extents greater than the specified threshold may be + overridden. Note that requests with arena index specified via MALLOCX_ARENA, or threads associated with explicit arenas will not be considered. The default threshold is 8MiB. Values not within large size classes disable this feature. -- cgit v0.12 From 6fe11633b066d74bdbb0f037a373af6e12a8b6c2 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 2 Apr 2019 13:02:56 -0700 Subject: Fix the binshard unit test. The test attempts to trigger usage of multiple sharded bins, which percpu_arena makes less reliable. --- test/unit/binshard.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/unit/binshard.c b/test/unit/binshard.c index 406c46c..d7a8df8 100644 --- a/test/unit/binshard.c +++ b/test/unit/binshard.c @@ -82,6 +82,9 @@ thd_start(void *varg) { } TEST_BEGIN(test_bin_shard_mt) { + test_skip_if(have_percpu_arena && + PERCPU_ARENA_ENABLED(opt_percpu_arena)); + thd_t thds[NTHREADS]; unsigned i; for (i = 0; i < NTHREADS; i++) { -- cgit v0.12 From 978a7a21ae5fe8e5367732b2dba9f92742aef9f1 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 2 Apr 2019 13:34:50 -0700 Subject: Use iallocztm instead of ialloc in prof_log functions. Explicitly use iallocztm for internal allocations. ialloc could trigger arena creation, which may cause lock order reversal (narenas_mtx and log_mtx). --- src/prof.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/prof.c b/src/prof.c index 296de52..4d7d65d 100644 --- a/src/prof.c +++ b/src/prof.c @@ -376,7 +376,8 @@ prof_log_bt_index(tsd_t *tsd, prof_bt_t *bt) { size_t sz = offsetof(prof_bt_node_t, vec) + (bt->len * sizeof(void *)); prof_bt_node_t *new_node = (prof_bt_node_t *) - ialloc(tsd, sz, sz_size2index(sz), false, true); + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, + true, arena_get(TSDN_NULL, 0, true), true); if (log_bt_first == NULL) { log_bt_first = new_node; log_bt_last = new_node; @@ -416,7 +417,8 @@ prof_log_thr_index(tsd_t *tsd, uint64_t thr_uid, const char *name) { (void **)(&node), NULL)) { size_t sz = offsetof(prof_thr_node_t, name) + strlen(name) + 1; prof_thr_node_t *new_node = (prof_thr_node_t *) - ialloc(tsd, sz, sz_size2index(sz), false, true); + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, + true, arena_get(TSDN_NULL, 0, true), true); if (log_thr_first == NULL) { log_thr_first = new_node; log_thr_last = new_node; @@ -474,10 +476,11 @@ prof_try_log(tsd_t *tsd, const void *ptr, size_t usize, prof_tctx_t *tctx) { nstime_t free_time = NSTIME_ZERO_INITIALIZER; nstime_update(&free_time); + size_t sz = sizeof(prof_alloc_node_t); prof_alloc_node_t *new_node = (prof_alloc_node_t *) - ialloc(tsd, sizeof(prof_alloc_node_t), - sz_size2index(sizeof(prof_alloc_node_t)), false, true); - + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, true, + arena_get(TSDN_NULL, 0, true), true); + const char *prod_thr_name = (tctx->tdata->thread_name == NULL)? "" : tctx->tdata->thread_name; const char *cons_thr_name = prof_thread_name_get(tsd); -- cgit v0.12 From f7489dc8f1fac233b0cd4e40331de8b738b1f2e2 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 19 Mar 2019 16:57:55 -0700 Subject: Update Changelog for 5.2.0.
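Several of the options called out in the release notes below are boot-time knobs. One way an application can opt in is the malloc_conf global that jemalloc reads during initialization; a hedged sketch (option names come from the notes, the values are illustrative only):

    #include <jemalloc/jemalloc.h>

    /* Read by jemalloc at bootstrap, like the MALLOC_CONF environment
     * variable: 16 MiB oversize threshold, background threads enabled. */
    const char *malloc_conf = "oversize_threshold:16777216,"
        "background_thread:true,max_background_threads:4";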
--- ChangeLog | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 29a00fb..7c73a8f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,7 +4,110 @@ brevity. Much more detail can be found in the git revision history: https://github.com/jemalloc/jemalloc -* 5.1.0 (May 4th, 2018) +* 5.2.0 (April 2, 2019) + + This release includes a few notable improvements, which are summarized below: + 1) improved fast-path performance from the optimizations by @djwatson; 2) + reduced virtual memory fragmentation and metadata usage; and 3) bug fixes on + setting the number of background threads. In addition, peak / spike memory + usage is improved with certain allocation patterns. As usual, the release and + prior dev versions have gone through large-scale production testing. + + New features: + - Implement oversize_threshold, which uses a dedicated arena for allocations + crossing the specified threshold to reduce fragmentation. (@interwq) + - Add extents usage information to stats. (@tyleretzel) + - Log time information for sampled allocations. (@tyleretzel) + - Support 0 size in sdallocx. (@djwatson) + - Output rate for certain counters in malloc_stats. (@zinoale) + - Add configure option --enable-readlinkat, which allows the use of readlinkat + over readlink. (@davidtgoldblatt) + - Add configure options --{enable,disable}-{static,shared} to allow not + building unwanted libraries. (@Ericson2314) + - Add configure option --disable-libdl to enable fully static builds. + (@interwq) + - Add mallctl interfaces: + + opt.oversize_threshold (@interwq) + + stats.arenas.<i>.extent_avail (@tyleretzel) + + stats.arenas.<i>.extents.<j>.n{dirty,muzzy,retained} (@tyleretzel) + + stats.arenas.<i>.extents.<j>.{dirty,muzzy,retained}_bytes + (@tyleretzel) + + Portability improvements: + - Update MSVC builds. (@maksqwe, @rustyx) + - Workaround a compiler optimizer bug on s390x. (@rkmisra) + - Make use of pthread_set_name_np(3) on FreeBSD. (@trasz) + - Implement malloc_getcpu() to enable percpu_arena for windows. (@santagada) + - Link against -pthread instead of -lpthread. (@paravoid) + - Make background_thread not dependent on libdl. (@interwq) + - Add stringify to fix a linker directive issue on MSVC. (@daverigby) + - Detect and fall back when 8-bit atomics are unavailable. (@interwq) + - Fall back to the default pthread_create if dlsym(3) fails. (@interwq) + + Optimizations and refactors: + - Refactor the TSD module. (@davidtgoldblatt) + - Avoid taking extents_muzzy mutex when muzzy is disabled. (@interwq) + - Avoid taking large_mtx for auto arenas on the tcache flush path. (@interwq) + - Optimize ixalloc by avoiding a size lookup. (@interwq) + - Implement opt.oversize_threshold which uses a dedicated arena for requests + crossing the threshold, also eagerly purges the oversize extents. Default + the threshold to 8 MiB. (@interwq) + - Clean compilation with -Wextra. (@gnzlbg, @jasone) + - Refactor the size class module. (@davidtgoldblatt) + - Refactor the stats emitter. (@tyleretzel) + - Optimize pow2_ceil. (@rkmisra) + - Avoid runtime detection of lazy purging on FreeBSD. (@trasz) + - Optimize mmap(2) alignment handling on FreeBSD. (@trasz) + - Improve error handling for THP state initialization. (@jsteemann) + - Rework the malloc() fast path. (@djwatson) + - Rework the free() fast path. (@djwatson) + - Refactor and optimize the tcache fill / flush paths. (@djwatson) + - Optimize sync / lwsync on PowerPC.
(@chmeeedalf) + - Bypass extent_dalloc() when retain is enabled. (@interwq) + - Optimize the locking on large deallocation. (@interwq) + - Reduce the number of pages committed from sanity checking in debug build. + (@trasz, @interwq) + - Deprecate OSSpinLock. (@interwq) + - Lower the default number of background threads to 4 (when the feature + is enabled). (@interwq) + - Optimize the trylock spin wait. (@djwatson) + - Use arena index for arena-matching checks. (@interwq) + - Avoid forced decay on thread termination when using background threads. + (@interwq) + - Disable muzzy decay by default. (@djwatson, @interwq) + - Only initialize libgcc unwinder when profiling is enabled. (@paravoid, + @interwq) + + Bug fixes (all only relevant to jemalloc 5.x): + - Fix background thread index issues with max_background_threads. (@djwatson, + @interwq) + - Fix stats output for opt.lg_extent_max_active_fit. (@interwq) + - Fix opt.prof_prefix initialization. (@davidtgoldblatt) + - Properly trigger decay on tcache destroy. (@interwq, @amosbird) + - Fix tcache.flush. (@interwq) + - Detect whether explicit extent zero out is necessary with huge pages or + custom extent hooks, which may change the purge semantics. (@interwq) + - Fix a side effect caused by extent_max_active_fit combined with decay-based + purging, where freed extents can accumulate and not be reused for an + extended period of time. (@interwq, @mpghf) + - Fix a missing unlock on extent register error handling. (@zoulasc) + + Testing: + - Simplify the Travis script output. (@gnzlbg) + - Update the test scripts for FreeBSD. (@devnexen) + - Add unit tests for the producer-consumer pattern. (@interwq) + - Add Cirrus-CI config for FreeBSD builds. (@jasone) + - Add size-matching sanity checks on tcache flush. (@davidtgoldblatt, + @interwq) + + Incompatible changes: + - Remove --with-lg-page-sizes. (@davidtgoldblatt) + + Documentation: + - Attempt to build docs by default, however skip doc building when xsltproc + is missing. (@interwq, @cmuellner) + +* 5.1.0 (May 4, 2018) This release is primarily about fine-tuning, ranging from several new features to numerous notable performance and portability enhancements. The release and -- cgit v0.12
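As a closing illustration of the 8-bit-atomics fallback used by the TSD commits above: when a platform only provides wider atomics, the state field can be widened while callers keep operating on the narrow type. A minimal C11 sketch with hypothetical names (jemalloc itself goes through its atomic_u8_t/atomic_u32_t wrappers rather than raw stdatomic):

    #include <stdatomic.h>
    #include <stdint.h>

    #ifdef HAVE_U8_ATOMICS              /* hypothetical feature-test macro */
    typedef _Atomic uint8_t state_t;
    #else
    typedef _Atomic uint32_t state_t;   /* wider storage; values still fit in 8 bits */
    #endif

    static inline uint8_t
    state_load(state_t *s) {
    	/* Callers always see uint8_t; only the storage width differs. */
    	return (uint8_t)atomic_load_explicit(s, memory_order_relaxed);
    }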