From fa5d245aef7087c19c375590a7ee2966a0ae339a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 10:25:59 -0700 Subject: Set default symbol visibility to hidden. Compile with -fvisibility=hidden rather than -fvisibility=internal, in order to avoid PLT lookups for internal functions. Also fix a regression that caused the -fvisibility flag to be omitted, due to: Port to Mac OS X. 2dbecf1f6267fae7a161b9c39cfd4d04ce168a29 --- jemalloc/configure.ac | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index 46a2bd4..f10641b 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -157,17 +157,6 @@ case "${host_cpu}" in esac AC_DEFINE_UNQUOTED([CPU_SPINWAIT], [$CPU_SPINWAIT]) -JE_COMPILABLE([__attribute__ syntax], - [static __attribute__((unused)) void foo(void){}], - [], - [attribute]) -if test "x${attribute}" = "xyes" ; then - AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ]) - if test "x$GCC" = "xyes" -a "${abi}" = "xelf"; then - JE_CFLAGS_APPEND([-fvisibility=internal]) - fi -fi - dnl Platform-specific settings. abi and RPATH can probably be determined dnl programmatically, but doing so is error-prone, which makes it generally dnl not worth the trouble. @@ -227,6 +216,17 @@ esac AC_SUBST([abi]) AC_SUBST([RPATH]) +JE_COMPILABLE([__attribute__ syntax], + [static __attribute__((unused)) void foo(void){}], + [], + [attribute]) +if test "x${attribute}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ]) + if test "x${GCC}" = "xyes" -a "x${abi}" = "xelf"; then + JE_CFLAGS_APPEND([-fvisibility=hidden]) + fi +fi + JE_COMPILABLE([mremap(...MREMAP_FIXED...)], [ #define _GNU_SOURCE #include -- cgit v0.12 From ff7450727f64180367f430b1b747f9e682e26df4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 14 Mar 2011 22:22:29 -0700 Subject: Expand a comment regarding geometric sampling. --- jemalloc/include/jemalloc/internal/prof.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 7864000..db63465 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -247,8 +247,22 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) double u; /* - * Compute prof_sample_threshold as a geometrically distributed random + * Compute sample threshold as a geometrically distributed random * variable with mean (2^opt_lg_prof_sample). + * + * __ __ + * | log(u) | 1 + * prof_tdata->threshold = | -------- |, where p = ------------------- + * | log(1-p) | opt_lg_prof_sample + * 2 + * + * For more information on the math, see: + * + * Non-Uniform Random Variate Generation + * Luc Devroye + * Springer-Verlag, New York, 1986 + * pp 500 + * (http://cg.scs.carleton.ca/~luc/rnbookindex.html) */ prn64(r, 53, prof_tdata->prn_state, (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); @@ -374,7 +388,7 @@ prof_sample_accum_update(size_t size) /* Take care to avoid integer overflow. */ if (size >= prof_tdata->threshold - prof_tdata->accum) { prof_tdata->accum -= (prof_tdata->threshold - size); - /* Compute new prof_sample_threshold. */ + /* Compute new sample threshold. 
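+			 * Drawing a fresh geometric deviate after every
+			 * sample keeps the distances between samples
+			 * independent; see the expanded derivation in
+			 * prof_sample_threshold_update() above.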
*/ prof_sample_threshold_update(prof_tdata); while (prof_tdata->accum >= prof_tdata->threshold) { prof_tdata->accum -= prof_tdata->threshold; -- cgit v0.12 From 41ade967c29ea9312c0b7390ee43bc0c63373f39 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 6 Mar 2011 22:56:36 -0800 Subject: Reduce size of small_size2bin lookup table. Convert all direct small_size2bin[...] accesses to SMALL_SIZE2BIN(...) macro calls, and use a couple of cheap math operations to allow compacting the table by 4X or 8X, on 32- and 64-bit systems, respectively. --- jemalloc/include/jemalloc/internal/arena.h | 7 ++ .../jemalloc/internal/jemalloc_internal.h.in | 4 +- jemalloc/include/jemalloc/internal/tcache.h | 2 +- jemalloc/src/arena.c | 80 ++++++++++++---------- 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index a43d1fa..78828ef 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -19,6 +19,7 @@ #ifdef JEMALLOC_TINY /* Smallest size class to support. */ # define LG_TINY_MIN LG_SIZEOF_PTR +# define TINY_MIN (1U << LG_TINY_MIN) #endif /* @@ -389,7 +390,13 @@ struct arena_s { extern size_t opt_lg_qspace_max; extern size_t opt_lg_cspace_max; extern ssize_t opt_lg_dirty_mult; +/* + * small_size2bin is a compact lookup table that rounds request sizes up to + * size classes. In order to reduce cache footprint, the table is compressed, + * and all accesses are via the SMALL_SIZE2BIN macro. + */ extern uint8_t const *small_size2bin; +#define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN]) /* Various bin-related settings. */ #ifdef JEMALLOC_TINY /* Number of (2^n)-spaced tiny bins. */ diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index aab2bfb..0f58a7a 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -402,7 +402,7 @@ s2u(size_t size) { if (size <= small_maxclass) - return (arenas[0]->bins[small_size2bin[size]].reg_size); + return (arenas[0]->bins[SMALL_SIZE2BIN(size)].reg_size); if (size <= arena_maxclass) return (PAGE_CEILING(size)); return (CHUNK_CEILING(size)); @@ -448,7 +448,7 @@ sa2u(size_t size, size_t alignment, size_t *run_size_p) if (usize <= arena_maxclass && alignment <= PAGE_SIZE) { if (usize <= small_maxclass) { return - (arenas[0]->bins[small_size2bin[usize]].reg_size); + (arenas[0]->bins[SMALL_SIZE2BIN(usize)].reg_size); } return (PAGE_CEILING(usize)); } else { diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index f431c66..7b71172 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -223,7 +223,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) size_t binind; tcache_bin_t *tbin; - binind = small_size2bin[size]; + binind = SMALL_SIZE2BIN(size); assert(binind < nbins); tbin = &tcache->tbins[binind]; ret = tcache_alloc_easy(tbin); diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 3cf15ff..2811fd1 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -25,26 +25,27 @@ size_t mspace_mask; /* * const_small_size2bin is a static constant lookup table that in the common - * case can be used as-is for small_size2bin. For dynamically linked programs, - * this avoids a page of memory overhead per process. 
+ * case can be used as-is for small_size2bin. */ -#define S2B_1(i) i, -#define S2B_2(i) S2B_1(i) S2B_1(i) -#define S2B_4(i) S2B_2(i) S2B_2(i) +#if (LG_TINY_MIN == 2) +#define S2B_4(i) i, #define S2B_8(i) S2B_4(i) S2B_4(i) +#elif (LG_TINY_MIN == 3) +#define S2B_8(i) i, +#else +# error "Unsupported LG_TINY_MIN" +#endif #define S2B_16(i) S2B_8(i) S2B_8(i) #define S2B_32(i) S2B_16(i) S2B_16(i) #define S2B_64(i) S2B_32(i) S2B_32(i) #define S2B_128(i) S2B_64(i) S2B_64(i) #define S2B_256(i) S2B_128(i) S2B_128(i) /* - * The number of elements in const_small_size2bin is dependent on page size - * and on the definition for SUBPAGE. If SUBPAGE changes, the '- 255' must also - * change, along with the addition/removal of static lookup table element - * definitions. + * The number of elements in const_small_size2bin is dependent on the + * definition for SUBPAGE. */ -static const uint8_t const_small_size2bin[STATIC_PAGE_SIZE - 255] = { - S2B_1(0xffU) /* 0 */ +static JEMALLOC_ATTR(aligned(CACHELINE)) + const uint8_t const_small_size2bin[] = { #if (LG_QUANTUM == 4) /* 16-byte quantum **********************/ # ifdef JEMALLOC_TINY @@ -1475,7 +1476,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) arena_run_t *run; size_t binind; - binind = small_size2bin[size]; + binind = SMALL_SIZE2BIN(size); assert(binind < nbins); bin = &arena->bins[binind]; size = bin->reg_size; @@ -1713,7 +1714,7 @@ arena_prof_promoted(const void *ptr, size_t size) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; - binind = small_size2bin[size]; + binind = SMALL_SIZE2BIN(size); assert(binind < nbins); chunk->map[pageind-map_bias].bits = (chunk->map[pageind-map_bias].bits & ~CHUNK_MAP_CLASS_MASK) | ((binind+1) << CHUNK_MAP_CLASS_SHIFT); @@ -2166,11 +2167,11 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, */ if (oldsize <= arena_maxclass) { if (oldsize <= small_maxclass) { - assert(choose_arena()->bins[small_size2bin[ - oldsize]].reg_size == oldsize); + assert(choose_arena()->bins[SMALL_SIZE2BIN( + oldsize)].reg_size == oldsize); if ((size + extra <= small_maxclass && - small_size2bin[size + extra] == - small_size2bin[oldsize]) || (size <= oldsize && + SMALL_SIZE2BIN(size + extra) == + SMALL_SIZE2BIN(oldsize)) || (size <= oldsize && size + extra >= oldsize)) { #ifdef JEMALLOC_FILL if (opt_junk && size < oldsize) { @@ -2371,40 +2372,39 @@ small_size2bin_validate(void) { size_t i, size, binind; - assert(small_size2bin[0] == 0xffU); i = 1; # ifdef JEMALLOC_TINY /* Tiny. */ for (; i < (1U << LG_TINY_MIN); i++) { size = pow2_ceil(1U << LG_TINY_MIN); binind = ffs((int)(size >> (LG_TINY_MIN + 1))); - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } for (; i < qspace_min; i++) { size = pow2_ceil(i); binind = ffs((int)(size >> (LG_TINY_MIN + 1))); - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } # endif /* Quantum-spaced. */ for (; i <= qspace_max; i++) { size = QUANTUM_CEILING(i); binind = ntbins + (size >> LG_QUANTUM) - 1; - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } /* Cacheline-spaced. */ for (; i <= cspace_max; i++) { size = CACHELINE_CEILING(i); binind = ntbins + nqbins + ((size - cspace_min) >> LG_CACHELINE); - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } /* Sub-page. 
*/ for (; i <= sspace_max; i++) { size = SUBPAGE_CEILING(i); binind = ntbins + nqbins + ncbins + ((size - sspace_min) >> LG_SUBPAGE); - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } } #endif @@ -2415,12 +2415,12 @@ small_size2bin_init(void) if (opt_lg_qspace_max != LG_QSPACE_MAX_DEFAULT || opt_lg_cspace_max != LG_CSPACE_MAX_DEFAULT - || sizeof(const_small_size2bin) != small_maxclass + 1) + || (sizeof(const_small_size2bin) != ((small_maxclass-1) >> + LG_TINY_MIN) + 1)) return (small_size2bin_init_hard()); small_size2bin = const_small_size2bin; #ifdef JEMALLOC_DEBUG - assert(sizeof(const_small_size2bin) == small_maxclass + 1); small_size2bin_validate(); #endif return (false); @@ -2431,49 +2431,52 @@ small_size2bin_init_hard(void) { size_t i, size, binind; uint8_t *custom_small_size2bin; +#define CUSTOM_SMALL_SIZE2BIN(s) \ + custom_small_size2bin[(s-1) >> LG_TINY_MIN] assert(opt_lg_qspace_max != LG_QSPACE_MAX_DEFAULT || opt_lg_cspace_max != LG_CSPACE_MAX_DEFAULT - || sizeof(const_small_size2bin) != small_maxclass + 1); + || (sizeof(const_small_size2bin) != ((small_maxclass-1) >> + LG_TINY_MIN) + 1)); - custom_small_size2bin = (uint8_t *)base_alloc(small_maxclass + 1); + custom_small_size2bin = (uint8_t *) + base_alloc(small_maxclass >> LG_TINY_MIN); if (custom_small_size2bin == NULL) return (true); - custom_small_size2bin[0] = 0xffU; i = 1; #ifdef JEMALLOC_TINY /* Tiny. */ - for (; i < (1U << LG_TINY_MIN); i++) { + for (; i < (1U << LG_TINY_MIN); i += TINY_MIN) { size = pow2_ceil(1U << LG_TINY_MIN); binind = ffs((int)(size >> (LG_TINY_MIN + 1))); - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } - for (; i < qspace_min; i++) { + for (; i < qspace_min; i += TINY_MIN) { size = pow2_ceil(i); binind = ffs((int)(size >> (LG_TINY_MIN + 1))); - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } #endif /* Quantum-spaced. */ - for (; i <= qspace_max; i++) { + for (; i <= qspace_max; i += TINY_MIN) { size = QUANTUM_CEILING(i); binind = ntbins + (size >> LG_QUANTUM) - 1; - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } /* Cacheline-spaced. */ - for (; i <= cspace_max; i++) { + for (; i <= cspace_max; i += TINY_MIN) { size = CACHELINE_CEILING(i); binind = ntbins + nqbins + ((size - cspace_min) >> LG_CACHELINE); - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } /* Sub-page. */ - for (; i <= sspace_max; i++) { + for (; i <= sspace_max; i += TINY_MIN) { size = SUBPAGE_CEILING(i); binind = ntbins + nqbins + ncbins + ((size - sspace_min) >> LG_SUBPAGE); - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } small_size2bin = custom_small_size2bin; @@ -2481,6 +2484,7 @@ small_size2bin_init_hard(void) small_size2bin_validate(); #endif return (false); +#undef CUSTOM_SMALL_SIZE2BIN } bool -- cgit v0.12 From 1b17768e249cf910d242be5b53a6f2dea18eeb2c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 09:40:07 -0700 Subject: Fix a build dependency regression. Fix the automatic header dependency generation to handle the .pic.o suffix. This regression was due to: Build both PIC and no PIC static libraries af5d6987f829ccd6e14dd1f57586cfb072a533c7 --- jemalloc/Makefile.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index 6dfaf5b..7a13f21 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -94,6 +94,7 @@ doc: $(DOCS) # Include generated dependency files. 
# -include $(CSRCS:@srcroot@%.c=@objroot@%.d) +-include $(CSRCS:@srcroot@%.c=@objroot@%.pic.d) @objroot@src/%.o: @srcroot@src/%.c @mkdir -p $(@D) @@ -103,7 +104,7 @@ doc: $(DOCS) @objroot@src/%.pic.o: @srcroot@src/%.c @mkdir -p $(@D) $(CC) $(CFLAGS) -fPIC -DPIC -c $(CPPFLAGS) -o $@ $< - @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" + @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $(basename $@))))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.pic.o \2/g\" > $(@:%.o=%.d)" %.$(SO) : %.$(SO).$(REV) @mkdir -p $(@D) -- cgit v0.12 From 49f7e8f35ac63d0dd526cf68791dc0ca29538ac9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 13:59:15 -0700 Subject: Create arena_bin_info_t. Move read-only fields from arena_bin_t into arena_bin_info_t, primarily in order to avoid false cacheline sharing. --- jemalloc/include/jemalloc/internal/arena.h | 95 +++-- .../jemalloc/internal/jemalloc_internal.h.in | 8 +- jemalloc/include/jemalloc/internal/tcache.h | 6 +- jemalloc/src/arena.c | 426 ++++++++++++--------- jemalloc/src/ctl.c | 6 +- jemalloc/src/tcache.c | 6 +- 6 files changed, 324 insertions(+), 223 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 78828ef..467ec65 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -71,6 +71,7 @@ typedef struct arena_chunk_map_s arena_chunk_map_t; typedef struct arena_chunk_s arena_chunk_t; typedef struct arena_run_s arena_run_t; +typedef struct arena_bin_info_s arena_bin_info_t; typedef struct arena_bin_s arena_bin_t; typedef struct arena_s arena_t; @@ -218,6 +219,33 @@ struct arena_run_s { unsigned nfree; }; +/* + * Read-only information associated with each element for arena_t's bins array + * is stored separately, partly to reduce memory usage (only one copy, rather + * than one per arena), but mainly to avoid false cacheline sharing. + */ +struct arena_bin_info_s { + /* Size of regions in a run for this bin's size class. */ + size_t reg_size; + + /* Total size of a run for this bin's size class. */ + size_t run_size; + + /* Total number of regions in a run for this bin's size class. */ + uint32_t nregs; + +#ifdef JEMALLOC_PROF + /* + * Offset of first (prof_ctx_t *) in a run header for this bin's size + * class, or 0 if (opt_prof == false). + */ + uint32_t ctx0_offset; +#endif + + /* Offset of first region in a run for this bin's size class. */ + uint32_t reg0_offset; +}; + struct arena_bin_s { /* * All operations on runcur, runs, and stats require that lock be @@ -242,26 +270,6 @@ struct arena_bin_s { */ arena_run_tree_t runs; - /* Size of regions in a run for this bin's size class. */ - size_t reg_size; - - /* Total size of a run for this bin's size class. */ - size_t run_size; - - /* Total number of regions in a run for this bin's size class. */ - uint32_t nregs; - -#ifdef JEMALLOC_PROF - /* - * Offset of first (prof_ctx_t *) in a run header for this bin's size - * class, or 0 if (opt_prof == false). - */ - uint32_t ctx0_offset; -#endif - - /* Offset of first region in a run for this bin's size class. */ - uint32_t reg0_offset; - #ifdef JEMALLOC_STATS /* Bin statistics. 
*/ malloc_bin_stats_t stats; @@ -398,6 +406,8 @@ extern ssize_t opt_lg_dirty_mult; extern uint8_t const *small_size2bin; #define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN]) +extern arena_bin_info_t *arena_bin_info; + /* Various bin-related settings. */ #ifdef JEMALLOC_TINY /* Number of (2^n)-spaced tiny bins. */ # define ntbins ((unsigned)(LG_QUANTUM - LG_TINY_MIN)) @@ -463,7 +473,8 @@ bool arena_boot(void); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -unsigned arena_run_regind(arena_run_t *run, arena_bin_t *bin, +size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); +unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr, size_t size); # ifdef JEMALLOC_PROF prof_ctx_t *arena_prof_ctx_get(const void *ptr); @@ -473,8 +484,16 @@ void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) +JEMALLOC_INLINE size_t +arena_bin_index(arena_t *arena, arena_bin_t *bin) +{ + size_t binind = bin - arena->bins; + assert(binind < nbins); + return (binind); +} + JEMALLOC_INLINE unsigned -arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, +arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr, size_t size) { unsigned shift, diff, regind; @@ -485,7 +504,8 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, * Avoid doing division with a variable divisor if possible. Using * actual division here can reduce allocator throughput by over 20%! */ - diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset); + diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - + bin_info->reg0_offset); /* Rescale (factor powers of 2 out of the numerator and denominator). 
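	 * Factoring the common power of two out of diff and size leaves an
	 * odd divisor, so the division below can usually be replaced by
	 * multiplication with a precomputed SIZE_INV() inverse.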
*/ shift = ffs(size) - 1; @@ -531,7 +551,7 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, #undef SIZE_INV_SHIFT } assert(diff == regind * size); - assert(regind < bin->nregs); + assert(regind < bin_info->nregs); return (regind); } @@ -558,13 +578,15 @@ arena_prof_ctx_get(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - arena_bin_t *bin = run->bin; + size_t binind = arena_bin_index(chunk->arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; unsigned regind; assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin, ptr, bin->reg_size); + regind = arena_run_regind(run, bin_info, ptr, + bin_info->reg_size); ret = *(prof_ctx_t **)((uintptr_t)run + - bin->ctx0_offset + (regind * + bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t *))); } } else @@ -593,11 +615,15 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) PAGE_SHIFT)); arena_bin_t *bin = run->bin; unsigned regind; + size_t binind; + arena_bin_info_t *bin_info; assert(run->magic == ARENA_RUN_MAGIC); regind = arena_run_regind(run, bin, ptr, bin->reg_size); + binind = arena_bin_index(chunk->arena, bin); + bin_info = &arena_bin_info[binind]; - *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset + *((prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t *)))) = ctx; } else assert((uintptr_t)ctx == (uintptr_t)1U); @@ -637,10 +663,17 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); - assert(((uintptr_t)ptr - ((uintptr_t)run + - (uintptr_t)run->bin->reg0_offset)) % - run->bin->reg_size == 0); bin = run->bin; +#ifndef NDEBUG + { + size_t binind = arena_bin_index(arena, bin); + arena_bin_info_t *bin_info = + &arena_bin_info[binind]; + assert(((uintptr_t)ptr - ((uintptr_t)run + + (uintptr_t)bin_info->reg0_offset)) % + bin_info->reg_size == 0); + } +#endif malloc_mutex_lock(&bin->lock); arena_dalloc_bin(arena, chunk, ptr, mapelm); malloc_mutex_unlock(&bin->lock); diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 0f58a7a..34b2a23 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -402,7 +402,7 @@ s2u(size_t size) { if (size <= small_maxclass) - return (arenas[0]->bins[SMALL_SIZE2BIN(size)].reg_size); + return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size); if (size <= arena_maxclass) return (PAGE_CEILING(size)); return (CHUNK_CEILING(size)); @@ -446,10 +446,8 @@ sa2u(size_t size, size_t alignment, size_t *run_size_p) } if (usize <= arena_maxclass && alignment <= PAGE_SIZE) { - if (usize <= small_maxclass) { - return - (arenas[0]->bins[SMALL_SIZE2BIN(usize)].reg_size); - } + if (usize <= small_maxclass) + return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size); return (PAGE_CEILING(usize)); } else { size_t run_size; diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index 7b71172..ab02545 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -232,7 +232,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) if (ret == NULL) return (NULL); } - assert(arena_salloc(ret) == tcache->arena->bins[binind].reg_size); + assert(arena_salloc(ret) == 
arena_bin_info[binind].reg_size); if (zero == false) { #ifdef JEMALLOC_FILL @@ -248,7 +248,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) tbin->tstats.nrequests++; #endif #ifdef JEMALLOC_PROF - tcache->prof_accumbytes += tcache->arena->bins[binind].reg_size; + tcache->prof_accumbytes += arena_bin_info[binind].reg_size; #endif tcache_event(tcache); return (ret); @@ -331,7 +331,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) #ifdef JEMALLOC_FILL if (opt_junk) - memset(ptr, 0x5a, bin->reg_size); + memset(ptr, 0x5a, arena_bin_info[binind].reg_size); #endif tbin = &tcache->tbins[binind]; diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 2811fd1..e49b8ed 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -8,6 +8,7 @@ size_t opt_lg_qspace_max = LG_QSPACE_MAX_DEFAULT; size_t opt_lg_cspace_max = LG_CSPACE_MAX_DEFAULT; ssize_t opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT; uint8_t const *small_size2bin; +arena_bin_info_t *arena_bin_info; /* Various bin-related settings. */ unsigned nqbins; @@ -174,7 +175,6 @@ static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize, bool dirty); static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin); static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); -static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); static void arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, @@ -192,6 +192,9 @@ static bool small_size2bin_init(void); static void small_size2bin_validate(void); #endif static bool small_size2bin_init_hard(void); +static size_t bin_info_run_size_calc(arena_bin_info_t *bin_info, + size_t min_run_size); +static bool bin_info_init(void); /******************************************************************************/ @@ -247,7 +250,7 @@ rb_gen(static JEMALLOC_ATTR(unused), arena_avail_tree_, arena_avail_tree_t, arena_chunk_map_t, u.rb_link, arena_avail_comp) static inline void * -arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) +arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { void *ret; @@ -261,16 +264,16 @@ arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) assert(ret != NULL); /* Write-after free can cause assertion failure. */ assert((uintptr_t)ret >= (uintptr_t)run + - (uintptr_t)bin->reg0_offset); + (uintptr_t)bin_info->reg0_offset); assert((uintptr_t)ret < (uintptr_t)run->next); assert(((uintptr_t)ret - ((uintptr_t)run + - (uintptr_t)bin->reg0_offset)) % (uintptr_t)bin->reg_size == - 0); + (uintptr_t)bin_info->reg0_offset)) % + (uintptr_t)bin_info->reg_size == 0); run->avail = *(void **)ret; return (ret); } ret = run->next; - run->next = (void *)((uintptr_t)ret + (uintptr_t)bin->reg_size); + run->next = (void *)((uintptr_t)ret + (uintptr_t)bin_info->reg_size); assert(ret != NULL); return (ret); } @@ -279,22 +282,27 @@ static inline void arena_run_reg_dalloc(arena_run_t *run, void *ptr) { - assert(run->nfree < run->bin->nregs); +#ifndef NDEBUG + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + size_t binind = arena_bin_index(chunk->arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; + assert(run->nfree < bin_info->nregs); /* Freeing an interior pointer can cause assertion failure. 
*/ assert(((uintptr_t)ptr - ((uintptr_t)run + - (uintptr_t)run->bin->reg0_offset)) % (uintptr_t)run->bin->reg_size + (uintptr_t)bin_info->reg0_offset)) % (uintptr_t)bin_info->reg_size == 0); /* * Freeing a pointer lower than region zero can cause assertion * failure. */ assert((uintptr_t)ptr >= (uintptr_t)run + - (uintptr_t)run->bin->reg0_offset); + (uintptr_t)bin_info->reg0_offset); /* * Freeing a pointer past in the run's frontier can cause assertion * failure. */ assert((uintptr_t)ptr < (uintptr_t)run->next); +#endif *(void **)ptr = run->avail; run->avail = ptr; @@ -765,7 +773,11 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) assert((mapelm->bits >> PAGE_SHIFT) == 0); assert(run->magic == ARENA_RUN_MAGIC); - pageind += run->bin->run_size >> PAGE_SHIFT; + size_t binind = arena_bin_index(arena, + run->bin); + arena_bin_info_t *bin_info = + &arena_bin_info[binind]; + pageind += bin_info->run_size >> PAGE_SHIFT; } } } @@ -947,8 +959,11 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) CHUNK_MAP_LARGE) != 0); assert((chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits & CHUNK_MAP_ALLOCATED) != 0); - } else - size = run->bin->run_size; + } else { + size_t binind = arena_bin_index(arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; + size = bin_info->run_size; + } run_pages = (size >> PAGE_SHIFT); arena->nactive -= run_pages; @@ -1175,6 +1190,8 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) { arena_chunk_map_t *mapelm; arena_run_t *run; + size_t binind; + arena_bin_info_t *bin_info; /* Look for a usable run. */ mapelm = arena_run_tree_first(&bin->runs); @@ -1198,18 +1215,21 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) } /* No existing runs have any space available. */ + binind = arena_bin_index(arena, bin); + bin_info = &arena_bin_info[binind]; + /* Allocate a new run. */ malloc_mutex_unlock(&bin->lock); /******************************/ malloc_mutex_lock(&arena->lock); - run = arena_run_alloc(arena, bin->run_size, false, false); + run = arena_run_alloc(arena, bin_info->run_size, false, false); if (run != NULL) { /* Initialize run internals. */ run->bin = bin; run->avail = NULL; run->next = (void *)((uintptr_t)run + - (uintptr_t)bin->reg0_offset); - run->nfree = bin->nregs; + (uintptr_t)bin_info->reg0_offset); + run->nfree = bin_info->nregs; #ifdef JEMALLOC_DEBUG run->magic = ARENA_RUN_MAGIC; #endif @@ -1260,18 +1280,23 @@ static void * arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) { void *ret; + size_t binind; + arena_bin_info_t *bin_info; arena_run_t *run; + binind = arena_bin_index(arena, bin); + bin_info = &arena_bin_info[binind]; bin->runcur = NULL; run = arena_bin_nonfull_run_get(arena, bin); if (bin->runcur != NULL && bin->runcur->nfree > 0) { + /* * Another thread updated runcur while this one ran without the * bin lock in arena_bin_nonfull_run_get(). */ assert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); - ret = arena_run_reg_alloc(bin->runcur, bin); + ret = arena_run_reg_alloc(bin->runcur, bin_info); if (run != NULL) { arena_chunk_t *chunk; @@ -1284,7 +1309,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) * from the run. 
*/ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - if (run->nfree == bin->nregs) + if (run->nfree == bin_info->nregs) arena_dalloc_bin_run(arena, chunk, run, bin); else arena_bin_lower_run(arena, chunk, run, bin); @@ -1300,7 +1325,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) assert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); - return (arena_run_reg_alloc(bin->runcur, bin)); + return (arena_run_reg_alloc(bin->runcur, bin_info)); } #ifdef JEMALLOC_PROF @@ -1342,7 +1367,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind malloc_mutex_lock(&bin->lock); for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) - ptr = arena_run_reg_alloc(run, bin); + ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) @@ -1351,7 +1376,8 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind tbin->avail = ptr; } #ifdef JEMALLOC_STATS - bin->stats.allocated += (i - tbin->ncached) * bin->reg_size; + bin->stats.allocated += (i - tbin->ncached) * + arena_bin_info[binind].reg_size; bin->stats.nmalloc += i; bin->stats.nrequests += tbin->tstats.nrequests; bin->stats.nfills++; @@ -1362,112 +1388,6 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind } #endif -/* - * Calculate bin->run_size such that it meets the following constraints: - * - * *) bin->run_size >= min_run_size - * *) bin->run_size <= arena_maxclass - * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). - * - * bin->nregs and bin->reg0_offset are also calculated here, since these - * settings are all interdependent. - */ -static size_t -arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size) -{ - size_t try_run_size, good_run_size; - uint32_t try_nregs, good_nregs; - uint32_t try_hdr_size, good_hdr_size; -#ifdef JEMALLOC_PROF - uint32_t try_ctx0_offset, good_ctx0_offset; -#endif - uint32_t try_reg0_offset, good_reg0_offset; - - assert(min_run_size >= PAGE_SIZE); - assert(min_run_size <= arena_maxclass); - - /* - * Calculate known-valid settings before entering the run_size - * expansion loop, so that the first part of the loop always copies - * valid settings. - * - * The do..while loop iteratively reduces the number of regions until - * the run header and the regions no longer overlap. A closed formula - * would be quite messy, since there is an interdependency between the - * header's mask length and the number of regions. - */ - try_run_size = min_run_size; - try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin->reg_size) - + 1; /* Counter-act try_nregs-- in loop. */ - do { - try_nregs--; - try_hdr_size = sizeof(arena_run_t); -#ifdef JEMALLOC_PROF - if (opt_prof && prof_promote == false) { - /* Pad to a quantum boundary. */ - try_hdr_size = QUANTUM_CEILING(try_hdr_size); - try_ctx0_offset = try_hdr_size; - /* Add space for one (prof_ctx_t *) per region. */ - try_hdr_size += try_nregs * sizeof(prof_ctx_t *); - } else - try_ctx0_offset = 0; -#endif - try_reg0_offset = try_run_size - (try_nregs * bin->reg_size); - } while (try_hdr_size > try_reg0_offset); - - /* run_size expansion loop. */ - do { - /* - * Copy valid settings before trying more aggressive settings. 
- */ - good_run_size = try_run_size; - good_nregs = try_nregs; - good_hdr_size = try_hdr_size; -#ifdef JEMALLOC_PROF - good_ctx0_offset = try_ctx0_offset; -#endif - good_reg0_offset = try_reg0_offset; - - /* Try more aggressive settings. */ - try_run_size += PAGE_SIZE; - try_nregs = ((try_run_size - sizeof(arena_run_t)) / - bin->reg_size) + 1; /* Counter-act try_nregs-- in loop. */ - do { - try_nregs--; - try_hdr_size = sizeof(arena_run_t); -#ifdef JEMALLOC_PROF - if (opt_prof && prof_promote == false) { - /* Pad to a quantum boundary. */ - try_hdr_size = QUANTUM_CEILING(try_hdr_size); - try_ctx0_offset = try_hdr_size; - /* - * Add space for one (prof_ctx_t *) per region. - */ - try_hdr_size += try_nregs * - sizeof(prof_ctx_t *); - } -#endif - try_reg0_offset = try_run_size - (try_nregs * - bin->reg_size); - } while (try_hdr_size > try_reg0_offset); - } while (try_run_size <= arena_maxclass - && try_run_size <= arena_maxclass - && RUN_MAX_OVRHD * (bin->reg_size << 3) > RUN_MAX_OVRHD_RELAX - && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size); - - assert(good_hdr_size <= good_reg0_offset); - - /* Copy final settings. */ - bin->run_size = good_run_size; - bin->nregs = good_nregs; -#ifdef JEMALLOC_PROF - bin->ctx0_offset = good_ctx0_offset; -#endif - bin->reg0_offset = good_reg0_offset; - - return (good_run_size); -} - void * arena_malloc_small(arena_t *arena, size_t size, bool zero) { @@ -1479,11 +1399,11 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) binind = SMALL_SIZE2BIN(size); assert(binind < nbins); bin = &arena->bins[binind]; - size = bin->reg_size; + size = arena_bin_info[binind].reg_size; malloc_mutex_lock(&bin->lock); if ((run = bin->runcur) != NULL && run->nfree > 0) - ret = arena_run_reg_alloc(run, bin); + ret = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ret = arena_bin_malloc_hard(arena, bin); @@ -1688,10 +1608,12 @@ arena_salloc(const void *ptr) (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); + size_t binind = arena_bin_index(chunk->arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + - (uintptr_t)run->bin->reg0_offset)) % run->bin->reg_size == + (uintptr_t)bin_info->reg0_offset)) % bin_info->reg_size == 0); - ret = run->bin->reg_size; + ret = bin_info->reg_size; } else { assert(((uintptr_t)ptr & PAGE_MASK) == 0); ret = mapbits & ~PAGE_MASK; @@ -1739,10 +1661,12 @@ arena_salloc_demote(const void *ptr) (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); + size_t binind = arena_bin_index(chunk->arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + - (uintptr_t)run->bin->reg0_offset)) % run->bin->reg_size == + (uintptr_t)bin_info->reg0_offset)) % bin_info->reg_size == 0); - ret = run->bin->reg_size; + ret = bin_info->reg_size; } else { assert(((uintptr_t)ptr & PAGE_MASK) == 0); ret = mapbits & ~PAGE_MASK; @@ -1751,7 +1675,7 @@ arena_salloc_demote(const void *ptr) size_t binind = ((mapbits & CHUNK_MAP_CLASS_MASK) >> CHUNK_MAP_CLASS_SHIFT) - 1; assert(binind < nbins); - ret = chunk->arena->bins[binind].reg_size; + ret = arena_bin_info[binind].reg_size; } assert(ret != 0); } @@ -1768,17 +1692,22 @@ arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, /* Dissociate run from bin. 
*/ if (run == bin->runcur) bin->runcur = NULL; - else if (bin->nregs != 1) { - size_t run_pageind = (((uintptr_t)run - (uintptr_t)chunk)) >> - PAGE_SHIFT; - arena_chunk_map_t *run_mapelm = - &chunk->map[run_pageind-map_bias]; - /* - * This block's conditional is necessary because if the run - * only contains one region, then it never gets inserted into - * the non-full runs tree. - */ - arena_run_tree_remove(&bin->runs, run_mapelm); + else { + size_t binind = arena_bin_index(chunk->arena, bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; + + if (bin_info->nregs != 1) { + size_t run_pageind = (((uintptr_t)run - + (uintptr_t)chunk)) >> PAGE_SHIFT; + arena_chunk_map_t *run_mapelm = + &chunk->map[run_pageind-map_bias]; + /* + * This block's conditional is necessary because if the + * run only contains one region, then it never gets + * inserted into the non-full runs tree. + */ + arena_run_tree_remove(&bin->runs, run_mapelm); + } } } @@ -1786,15 +1715,20 @@ static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin) { + size_t binind; + arena_bin_info_t *bin_info; size_t npages, run_ind, past; assert(run != bin->runcur); assert(arena_run_tree_search(&bin->runs, &chunk->map[ (((uintptr_t)run-(uintptr_t)chunk)>>PAGE_SHIFT)-map_bias]) == NULL); + binind = arena_bin_index(chunk->arena, run->bin); + bin_info = &arena_bin_info[binind]; + malloc_mutex_unlock(&bin->lock); /******************************/ - npages = bin->run_size >> PAGE_SHIFT; + npages = bin_info->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk) >> PAGE_SHIFT); @@ -1814,7 +1748,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, chunk->map[run_ind+npages-1-map_bias].bits = CHUNK_MAP_LARGE | (chunk->map[run_ind+npages-1-map_bias].bits & CHUNK_MAP_FLAGS_MASK); - chunk->map[run_ind-map_bias].bits = bin->run_size | + chunk->map[run_ind-map_bias].bits = bin_info->run_size | CHUNK_MAP_LARGE | (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); arena_run_trim_tail(arena, chunk, run, (npages << PAGE_SHIFT), @@ -1885,8 +1819,10 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; + size_t binind = arena_bin_index(arena, bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; #if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS)) - size = bin->reg_size; + size = bin_info->reg_size; #endif #ifdef JEMALLOC_FILL @@ -1895,7 +1831,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, #endif arena_run_reg_dalloc(run, ptr); - if (run->nfree == bin->nregs) { + if (run->nfree == bin_info->nregs) { arena_dissociate_bin_run(chunk, run, bin); arena_dalloc_bin_run(arena, chunk, run, bin); } else if (run->nfree == 1 && run != bin->runcur) @@ -2167,8 +2103,8 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, */ if (oldsize <= arena_maxclass) { if (oldsize <= small_maxclass) { - assert(choose_arena()->bins[SMALL_SIZE2BIN( - oldsize)].reg_size == oldsize); + assert(arena_bin_info[SMALL_SIZE2BIN(oldsize)].reg_size + == oldsize); if ((size + extra <= small_maxclass && SMALL_SIZE2BIN(size + extra) == SMALL_SIZE2BIN(oldsize)) || (size <= oldsize && @@ -2248,7 +2184,6 @@ arena_new(arena_t *arena, unsigned ind) { unsigned i; arena_bin_t *bin; - size_t prev_run_size; arena->ind = ind; @@ 
-2284,8 +2219,6 @@ arena_new(arena_t *arena, unsigned ind) arena_avail_tree_new(&arena->runs_avail_dirty); /* Initialize bins. */ - prev_run_size = PAGE_SIZE; - i = 0; #ifdef JEMALLOC_TINY /* (2^n)-spaced tiny bins. */ @@ -2295,11 +2228,6 @@ arena_new(arena_t *arena, unsigned ind) return (true); bin->runcur = NULL; arena_run_tree_new(&bin->runs); - - bin->reg_size = (1U << (LG_TINY_MIN + i)); - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2313,11 +2241,6 @@ arena_new(arena_t *arena, unsigned ind) return (true); bin->runcur = NULL; arena_run_tree_new(&bin->runs); - - bin->reg_size = (i - ntbins + 1) << LG_QUANTUM; - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2330,12 +2253,6 @@ arena_new(arena_t *arena, unsigned ind) return (true); bin->runcur = NULL; arena_run_tree_new(&bin->runs); - - bin->reg_size = cspace_min + ((i - (ntbins + nqbins)) << - LG_CACHELINE); - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2348,12 +2265,6 @@ arena_new(arena_t *arena, unsigned ind) return (true); bin->runcur = NULL; arena_run_tree_new(&bin->runs); - - bin->reg_size = sspace_min + ((i - (ntbins + nqbins + ncbins)) - << LG_SUBPAGE); - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2487,6 +2398,162 @@ small_size2bin_init_hard(void) #undef CUSTOM_SMALL_SIZE2BIN } +/* + * Calculate bin_info->run_size such that it meets the following constraints: + * + * *) bin_info->run_size >= min_run_size + * *) bin_info->run_size <= arena_maxclass + * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). + * + * bin_info->nregs and bin_info->reg0_offset are also calculated here, since + * these settings are all interdependent. + */ +static size_t +bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) +{ + size_t try_run_size, good_run_size; + uint32_t try_nregs, good_nregs; + uint32_t try_hdr_size, good_hdr_size; +#ifdef JEMALLOC_PROF + uint32_t try_ctx0_offset, good_ctx0_offset; +#endif + uint32_t try_reg0_offset, good_reg0_offset; + + assert(min_run_size >= PAGE_SIZE); + assert(min_run_size <= arena_maxclass); + + /* + * Calculate known-valid settings before entering the run_size + * expansion loop, so that the first part of the loop always copies + * valid settings. + * + * The do..while loop iteratively reduces the number of regions until + * the run header and the regions no longer overlap. A closed formula + * would be quite messy, since there is an interdependency between the + * header's mask length and the number of regions. + */ + try_run_size = min_run_size; + try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin_info->reg_size) + + 1; /* Counter-act try_nregs-- in loop. */ + do { + try_nregs--; + try_hdr_size = sizeof(arena_run_t); +#ifdef JEMALLOC_PROF + if (opt_prof && prof_promote == false) { + /* Pad to a quantum boundary. */ + try_hdr_size = QUANTUM_CEILING(try_hdr_size); + try_ctx0_offset = try_hdr_size; + /* Add space for one (prof_ctx_t *) per region. 
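+			 * (When profiling without promotion, the run
+			 * header carries one prof_ctx_t pointer per
+			 * region; ctx0_offset records where that array
+			 * begins.)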
*/ + try_hdr_size += try_nregs * sizeof(prof_ctx_t *); + } else + try_ctx0_offset = 0; +#endif + try_reg0_offset = try_run_size - (try_nregs * + bin_info->reg_size); + } while (try_hdr_size > try_reg0_offset); + + /* run_size expansion loop. */ + do { + /* + * Copy valid settings before trying more aggressive settings. + */ + good_run_size = try_run_size; + good_nregs = try_nregs; + good_hdr_size = try_hdr_size; +#ifdef JEMALLOC_PROF + good_ctx0_offset = try_ctx0_offset; +#endif + good_reg0_offset = try_reg0_offset; + + /* Try more aggressive settings. */ + try_run_size += PAGE_SIZE; + try_nregs = ((try_run_size - sizeof(arena_run_t)) / + bin_info->reg_size) + + 1; /* Counter-act try_nregs-- in loop. */ + do { + try_nregs--; + try_hdr_size = sizeof(arena_run_t); +#ifdef JEMALLOC_PROF + if (opt_prof && prof_promote == false) { + /* Pad to a quantum boundary. */ + try_hdr_size = QUANTUM_CEILING(try_hdr_size); + try_ctx0_offset = try_hdr_size; + /* + * Add space for one (prof_ctx_t *) per region. + */ + try_hdr_size += try_nregs * + sizeof(prof_ctx_t *); + } +#endif + try_reg0_offset = try_run_size - (try_nregs * + bin_info->reg_size); + } while (try_hdr_size > try_reg0_offset); + } while (try_run_size <= arena_maxclass + && try_run_size <= arena_maxclass + && RUN_MAX_OVRHD * (bin_info->reg_size << 3) > RUN_MAX_OVRHD_RELAX + && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size); + + assert(good_hdr_size <= good_reg0_offset); + + /* Copy final settings. */ + bin_info->run_size = good_run_size; + bin_info->nregs = good_nregs; +#ifdef JEMALLOC_PROF + bin_info->ctx0_offset = good_ctx0_offset; +#endif + bin_info->reg0_offset = good_reg0_offset; + + return (good_run_size); +} + +static bool +bin_info_init(void) +{ + arena_bin_info_t *bin_info; + unsigned i; + size_t prev_run_size; + + arena_bin_info = base_alloc(sizeof(arena_bin_info_t) * nbins); + if (arena_bin_info == NULL) + return (true); + + prev_run_size = PAGE_SIZE; + i = 0; +#ifdef JEMALLOC_TINY + /* (2^n)-spaced tiny bins. */ + for (; i < ntbins; i++) { + bin_info = &arena_bin_info[i]; + bin_info->reg_size = (1U << (LG_TINY_MIN + i)); + prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + } +#endif + + /* Quantum-spaced bins. */ + for (; i < ntbins + nqbins; i++) { + bin_info = &arena_bin_info[i]; + bin_info->reg_size = (i - ntbins + 1) << LG_QUANTUM; + prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + } + + /* Cacheline-spaced bins. */ + for (; i < ntbins + nqbins + ncbins; i++) { + bin_info = &arena_bin_info[i]; + bin_info->reg_size = cspace_min + ((i - (ntbins + nqbins)) << + LG_CACHELINE); + prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + } + + /* Subpage-spaced bins. */ + for (; i < nbins; i++) { + bin_info = &arena_bin_info[i]; + bin_info->reg_size = sspace_min + ((i - (ntbins + nqbins + + ncbins)) << LG_SUBPAGE); + prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + } + + return (false); +} + bool arena_boot(void) { @@ -2545,9 +2612,6 @@ arena_boot(void) abort(); } - if (small_size2bin_init()) - return (true); - /* * Compute the header size such that it is large enough to contain the * page map. 
The page map is biased to omit entries for the header @@ -2571,5 +2635,11 @@ arena_boot(void) arena_maxclass = chunksize - (map_bias << PAGE_SHIFT); + if (small_size2bin_init()) + return (true); + + if (bin_info_init()) + return (true); + return (false); } diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index 1b28da4..c32e955 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -1289,9 +1289,9 @@ CTL_RO_NL_GEN(opt_overcommit, opt_overcommit, bool) /******************************************************************************/ -CTL_RO_NL_GEN(arenas_bin_i_size, arenas[0]->bins[mib[2]].reg_size, size_t) -CTL_RO_NL_GEN(arenas_bin_i_nregs, arenas[0]->bins[mib[2]].nregs, uint32_t) -CTL_RO_NL_GEN(arenas_bin_i_run_size, arenas[0]->bins[mib[2]].run_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t) +CTL_RO_NL_GEN(arenas_bin_i_run_size, arena_bin_info[mib[2]].run_size, size_t) const ctl_node_t * arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i) { diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index e9b067d..88e1cc7 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -253,9 +253,9 @@ tcache_create(arena_t *arena) tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); for (i = 0; i < nbins; i++) { - if ((arena->bins[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { - tcache->tbins[i].ncached_max = (arena->bins[i].nregs << - 1); + if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { + tcache->tbins[i].ncached_max = (arena_bin_info[i].nregs + << 1); } else tcache->tbins[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX; } -- cgit v0.12 From 819d11be068e3f86e31db0956f5a0b29d9971e7f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 14:25:56 -0700 Subject: Add missing error checks. Add missing error checks for pthread_mutex_init() calls. In practice, mutex initialization never fails, so this is merely good hygiene. --- jemalloc/src/jemalloc.c | 3 ++- jemalloc/src/rtree.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 61a36c7..c1aadda 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -735,7 +735,8 @@ malloc_init_hard(void) */ ARENA_SET(arenas[0]); - malloc_mutex_init(&arenas_lock); + if (malloc_mutex_init(&arenas_lock)) + return (true); #ifdef JEMALLOC_PROF if (prof_boot2()) { diff --git a/jemalloc/src/rtree.c b/jemalloc/src/rtree.c index 7753743..eb440aa 100644 --- a/jemalloc/src/rtree.c +++ b/jemalloc/src/rtree.c @@ -20,7 +20,10 @@ rtree_new(unsigned bits) memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) * height)); - malloc_mutex_init(&ret->mutex); + if (malloc_mutex_init(&ret->mutex)) { + /* Leak the rtree. */ + return (NULL); + } ret->height = height; if (bits_per_level * height > bits) ret->level2bits[0] = bits % bits_per_level; -- cgit v0.12 From b602daa6710dab61d8e1ca0cd3c44ac8a564fd9f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 22:19:45 -0700 Subject: Clean up after arena_bin_info_t change. Fix a couple of problems related to the addition of arena_bin_info_t. 
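To make the affected computation concrete, here is a minimal standalone
sketch of mapping a pointer back to its region index. bin_info_t and
run_regind() are simplified stand-ins rather than jemalloc's actual types,
and plain division stands in for the inverse-multiplication trick that the
real arena_run_regind() uses:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for arena_bin_info_t; illustrative only. */
typedef struct {
	uint32_t	nregs;		/* Regions per run. */
	uint32_t	reg0_offset;	/* Offset of region 0 in the run. */
	size_t		reg_size;	/* Size of each region. */
} bin_info_t;

/* Map a pointer within a run to its region index. */
static unsigned
run_regind(uintptr_t run, const bin_info_t *bin_info, uintptr_t ptr)
{
	uintptr_t diff = ptr - (run + bin_info->reg0_offset);

	assert(diff % bin_info->reg_size == 0);
	return ((unsigned)(diff / bin_info->reg_size));
}

int
main(void)
{
	bin_info_t bi = {126, 64, 32};
	uintptr_t run = 0x10000;

	/* Region 5 starts 64 + 5*32 bytes into the run. */
	printf("%u\n", run_regind(run, &bi, run + 64 + 5 * 32));
	return (0);
}

The signature change below (arena_run_regind() losing its size argument)
follows from reg_size being recoverable from bin_info, as in the sketch.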
--- jemalloc/include/jemalloc/internal/arena.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 467ec65..bd983f2 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -475,7 +475,7 @@ bool arena_boot(void); #ifndef JEMALLOC_ENABLE_INLINE size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, - const void *ptr, size_t size); + const void *ptr); # ifdef JEMALLOC_PROF prof_ctx_t *arena_prof_ctx_get(const void *ptr); void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); @@ -493,10 +493,10 @@ arena_bin_index(arena_t *arena, arena_bin_t *bin) } JEMALLOC_INLINE unsigned -arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr, - size_t size) +arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) { unsigned shift, diff, regind; + size_t size; assert(run->magic == ARENA_RUN_MAGIC); @@ -508,6 +508,7 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr, bin_info->reg0_offset); /* Rescale (factor powers of 2 out of the numerator and denominator). */ + size = bin_info->reg_size; shift = ffs(size) - 1; diff >>= shift; size >>= shift; @@ -583,8 +584,7 @@ arena_prof_ctx_get(const void *ptr) unsigned regind; assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin_info, ptr, - bin_info->reg_size); + regind = arena_run_regind(run, bin_info, ptr); ret = *(prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t *))); @@ -614,14 +614,14 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); arena_bin_t *bin = run->bin; - unsigned regind; size_t binind; arena_bin_info_t *bin_info; + unsigned regind; assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin, ptr, bin->reg_size); binind = arena_bin_index(chunk->arena, bin); bin_info = &arena_bin_info[binind]; + regind = arena_run_regind(run, bin_info, ptr); *((prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t *)))) = ctx; -- cgit v0.12 From 77f350be08c8b9cd03ceed820b3113dbac9b4151 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 22:23:12 -0700 Subject: Improve backtracing-related configuration. Clean up configuration for backtracing when profiling is enabled, and document the configuration logic in INSTALL. Disable libgcc-based backtracing except on x64 (where it is known to work). Add the --disable-prof-gcc option. --- jemalloc/INSTALL | 21 +++-- jemalloc/configure.ac | 121 +++++++++++++++++++-------- jemalloc/include/jemalloc/jemalloc_defs.h.in | 3 + jemalloc/src/prof.c | 78 ++++++++--------- 4 files changed, 140 insertions(+), 83 deletions(-) diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index e0a5dc4..11a457a 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -62,18 +62,23 @@ any of the following arguments (not a definitive list) to 'configure': --enable-prof Enable heap profiling and leak detection functionality. See the "opt.prof" - option documentation for usage details. + option documentation for usage details. 
When enabled, there are several + approaches to backtracing, and the configure script chooses the first one + in the following list that appears to function correctly: ---disable-prof-libgcc - Disable the use of libgcc's backtracing functionality. Ordinarily, libgcc's - backtracing functionality is superior to the alternatives, but it may fail - to capture backtraces on some systems. + + libunwind (requires --enable-prof-libunwind) + + libgcc (unless --disable-prof-libgcc) + + gcc intrinsics (unless --disable-prof-gcc) --enable-prof-libunwind Use the libunwind library (http://www.nongnu.org/libunwind/) for stack - backtracing. libunwind is quite slow, but it tends to work across a wider - variety of system configurations than the default backtracing code, which is - based on libgcc functionality or gcc intrinsics. + backtracing. + +--disable-prof-libgcc + Disable the use of libgcc's backtracing functionality. + +--disable-prof-gcc + Disable the use of gcc intrinsics for backtracing. --with-static-libunwind= Statically link against the specified libunwind.a rather than dynamically diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index f10641b..dfe2b9b 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -404,17 +404,12 @@ fi ], [enable_prof="0"] ) -AC_ARG_ENABLE([prof-libgcc], - [AS_HELP_STRING([--disable-prof-libgcc], - [Do not use libgcc for backtracing])], -[if test "x$enable_prof_libgcc" = "xno" ; then - enable_prof_libgcc="0" +if test "x$enable_prof" = "x1" ; then + backtrace_method="" else - enable_prof_libgcc="1" + backtrace_method="N/A" fi -], -[enable_prof_libgcc="1"] -) + AC_ARG_ENABLE([prof-libunwind], [AS_HELP_STRING([--enable-prof-libunwind], [Use libunwind for backtracing])], [if test "x$enable_prof_libunwind" = "xno" ; then @@ -438,39 +433,90 @@ else fi, LUNWIND="-lunwind" ) -if test "x$enable_prof" = "x1" ; then - LIBS="$LIBS -lm" - AC_DEFINE([JEMALLOC_PROF], [ ]) - if test "x$enable_prof_libunwind" = "x1" ; then - AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) - if test "x$LUNWIND" = "x-lunwind" ; then - AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], - [enable_prof_libunwind="0"]) - else - LIBS="$LIBS $LUNWIND" - fi - if test "x${enable_prof_libunwind}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ]) - fi +if test "x$backtrace_method" = "x" -a "x$enable_prof_libunwind" = "x1" ; then + AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) + if test "x$LUNWIND" = "x-lunwind" ; then + AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], + [enable_prof_libunwind="0"]) + else + LIBS="$LIBS $LUNWIND" + fi + if test "x${enable_prof_libunwind}" = "x1" ; then + backtrace_method="libunwind" + AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ]) fi fi -AC_SUBST([enable_prof]) -dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics -dnl for backtracing. 
-if test "x$enable_prof" = "x1" -a "x$enable_prof_libgcc" = "x1" ; then - if test "x$enable_prof_libunwind" = "x0" -a "x$GCC" = "xyes" ; then - enable_prof_libgcc="1" - AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) - AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) - if test "x${enable_prof_libgcc}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) - fi - else - enable_prof_libgcc="0" +AC_ARG_ENABLE([prof-libgcc], + [AS_HELP_STRING([--disable-prof-libgcc], + [Do not use libgcc for backtracing])], +[if test "x$enable_prof_libgcc" = "xno" ; then + enable_prof_libgcc="0" +else + enable_prof_libgcc="1" +fi +], +[enable_prof_libgcc="1"] +) +if test "x$backtrace_method" = "x" -a "x$enable_prof_libgcc" = "x1" \ + -a "x$GCC" = "xyes" ; then + AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) + AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) + dnl The following is conservative, in that it only has entries for CPUs on + dnl which jemalloc has been tested. + AC_MSG_CHECKING([libgcc-based backtracing reliability on ${host_cpu}]) + case "${host_cpu}" in + i[[3456]]86) + AC_MSG_RESULT([unreliable]) + enable_prof_libgcc="0"; + ;; + x86_64) + AC_MSG_RESULT([reliable]) + ;; + *) + AC_MSG_RESULT([unreliable]) + enable_prof_libgcc="0"; + ;; + esac + if test "x${enable_prof_libgcc}" = "x1" ; then + backtrace_method="libgcc" + AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) fi +else + enable_prof_libgcc="0" +fi + +AC_ARG_ENABLE([prof-gcc], + [AS_HELP_STRING([--disable-prof-gcc], + [Do not use gcc intrinsics for backtracing])], +[if test "x$enable_prof_gcc" = "xno" ; then + enable_prof_gcc="0" +else + enable_prof_gcc="1" +fi +], +[enable_prof_gcc="1"] +) +if test "x$backtrace_method" = "x" -a "x$enable_prof_gcc" = "x1" \ + -a "x$GCC" = "xyes" ; then + backtrace_method="gcc intrinsics" + AC_DEFINE([JEMALLOC_PROF_GCC], [ ]) +else + enable_prof_gcc="0" fi +if test "x$backtrace_method" = "x" ; then + backtrace_method="none (disabling profiling)" + enable_prof="0" +fi +AC_MSG_CHECKING([configured backtracing method]) +AC_MSG_RESULT([$backtrace_method]) +if test "x$enable_prof" = "x1" ; then + LIBS="$LIBS -lm" + AC_DEFINE([JEMALLOC_PROF], [ ]) +fi +AC_SUBST([enable_prof]) + dnl Enable tiny allocations by default. AC_ARG_ENABLE([tiny], [AS_HELP_STRING([--disable-tiny], [Disable tiny (sub-quantum) allocations])], @@ -810,8 +856,9 @@ AC_MSG_RESULT([cc-silence : ${enable_cc_silence}]) AC_MSG_RESULT([debug : ${enable_debug}]) AC_MSG_RESULT([stats : ${enable_stats}]) AC_MSG_RESULT([prof : ${enable_prof}]) -AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}]) AC_MSG_RESULT([prof-libunwind : ${enable_prof_libunwind}]) +AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}]) +AC_MSG_RESULT([prof-gcc : ${enable_prof_gcc}]) AC_MSG_RESULT([tiny : ${enable_tiny}]) AC_MSG_RESULT([tcache : ${enable_tcache}]) AC_MSG_RESULT([fill : ${enable_fill}]) diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 5f46c5c..773c9f8 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -53,6 +53,9 @@ /* Use libgcc for profile backtracing if defined. */ #undef JEMALLOC_PROF_LIBGCC +/* Use gcc intrinsics for profile backtracing if defined. */ +#undef JEMALLOC_PROF_GCC + /* * JEMALLOC_TINY enables support for tiny objects, which are smaller than one * quantum. 
diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 3566c6d..8370042 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -3,15 +3,15 @@ #ifdef JEMALLOC_PROF /******************************************************************************/ -#ifdef JEMALLOC_PROF_LIBGCC -#include -#endif - #ifdef JEMALLOC_PROF_LIBUNWIND #define UNW_LOCAL_ONLY #include #endif +#ifdef JEMALLOC_PROF_LIBGCC +#include +#endif + /******************************************************************************/ /* Data. */ @@ -169,39 +169,7 @@ prof_leave(void) prof_gdump(); } -#ifdef JEMALLOC_PROF_LIBGCC -static _Unwind_Reason_Code -prof_unwind_init_callback(struct _Unwind_Context *context, void *arg) -{ - - return (_URC_NO_REASON); -} - -static _Unwind_Reason_Code -prof_unwind_callback(struct _Unwind_Context *context, void *arg) -{ - prof_unwind_data_t *data = (prof_unwind_data_t *)arg; - - if (data->nignore > 0) - data->nignore--; - else { - data->bt->vec[data->bt->len] = (void *)_Unwind_GetIP(context); - data->bt->len++; - if (data->bt->len == data->max) - return (_URC_END_OF_STACK); - } - - return (_URC_NO_REASON); -} - -void -prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) -{ - prof_unwind_data_t data = {bt, nignore, max}; - - _Unwind_Backtrace(prof_unwind_callback, &data); -} -#elif defined(JEMALLOC_PROF_LIBUNWIND) +#ifdef JEMALLOC_PROF_LIBUNWIND void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { @@ -236,7 +204,41 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) break; } } -#else +#endif +#ifdef JEMALLOC_PROF_LIBGCC +static _Unwind_Reason_Code +prof_unwind_init_callback(struct _Unwind_Context *context, void *arg) +{ + + return (_URC_NO_REASON); +} + +static _Unwind_Reason_Code +prof_unwind_callback(struct _Unwind_Context *context, void *arg) +{ + prof_unwind_data_t *data = (prof_unwind_data_t *)arg; + + if (data->nignore > 0) + data->nignore--; + else { + data->bt->vec[data->bt->len] = (void *)_Unwind_GetIP(context); + data->bt->len++; + if (data->bt->len == data->max) + return (_URC_END_OF_STACK); + } + + return (_URC_NO_REASON); +} + +void +prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) +{ + prof_unwind_data_t data = {bt, nignore, max}; + + _Unwind_Backtrace(prof_unwind_callback, &data); +} +#endif +#ifdef JEMALLOC_PROF_GCC void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { -- cgit v0.12 From 84c8eefeffa246607790ad12e28b0f6a24ecc59d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 16 Mar 2011 10:30:13 -0700 Subject: Use bitmaps to track small regions. The previous free list implementation, which embedded singly linked lists in available regions, had the unfortunate side effect of causing many cache misses during thread cache fills. Fix this in two places: - arena_run_t: Use a new bitmap implementation to track which regions are available. Furthermore, revert to preferring the lowest available region (as jemalloc did with its old bitmap-based approach). - tcache_t: Move read-only tcache_bin_t metadata into tcache_bin_info_t, and add a contiguous array of pointers to tcache_t in order to track cached objects. This substantially increases the size of tcache_t, but results in much higher data locality for common tcache operations. As a side benefit, it is again possible to efficiently flush the least recently used cached objects, so this change changes flushing from MRU to LRU. The new bitmap implementation uses a multi-level summary approach to make finding the lowest available region very fast. 
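
As a concrete illustration of the summary levels (assuming 64-bit longs,
hence 64-bit bitmap groups): a run with 512 regions needs 512/64 = 8
level-0 groups, which are summarized by a single level-1 group, so
bitmap_sfu() finds the lowest free region with two ffsl() calls no matter
which bits are set; even a maximal bitmap of 2^18 bits needs only
4096 + 64 + 1 groups across three levels.
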
In practice, bitmaps only have one or two levels, though the implementation is general enough to handle extremely large bitmaps, mainly so that large page sizes can still be entertained. Fix tcache_bin_flush_large() to always flush statistics, in the same way that tcache_bin_flush_small() was recently fixed. Use JEMALLOC_DEBUG rather than NDEBUG. Add dassert(), and use it for debug-only asserts. --- jemalloc/Makefile.in | 10 +- jemalloc/configure.ac | 18 ++ jemalloc/include/jemalloc/internal/arena.h | 41 +++-- jemalloc/include/jemalloc/internal/bitmap.h | 184 +++++++++++++++++++++ .../jemalloc/internal/jemalloc_internal.h.in | 32 +++- jemalloc/include/jemalloc/internal/prof.h | 4 +- jemalloc/include/jemalloc/internal/tcache.h | 51 ++++-- jemalloc/include/jemalloc/jemalloc_defs.h.in | 3 + jemalloc/src/arena.c | 107 ++++++------ jemalloc/src/bitmap.c | 90 ++++++++++ jemalloc/src/ckh.c | 12 +- jemalloc/src/jemalloc.c | 5 +- jemalloc/src/tcache.c | 129 ++++++++++----- jemalloc/test/bitmap.c | 153 +++++++++++++++++ jemalloc/test/bitmap.exp | 2 + 15 files changed, 702 insertions(+), 139 deletions(-) create mode 100644 jemalloc/include/jemalloc/internal/bitmap.h create mode 100644 jemalloc/src/bitmap.c create mode 100644 jemalloc/test/bitmap.c create mode 100644 jemalloc/test/bitmap.exp diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index 7a13f21..8ee4c93 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -46,7 +46,7 @@ BINS := @srcroot@bin/pprof CHDRS := @objroot@include/jemalloc/jemalloc@install_suffix@.h \ @objroot@include/jemalloc/jemalloc_defs@install_suffix@.h CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/base.c \ - @srcroot@src/chunk.c @srcroot@src/chunk_dss.c \ + @srcroot@src/bitmap.c @srcroot@src/chunk.c @srcroot@src/chunk_dss.c \ @srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \ @srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \ @srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \ @@ -65,8 +65,9 @@ DOCS_HTML := $(DOCS_XML:@objroot@%.xml=@srcroot@%.html) DOCS_MAN3 := $(DOCS_XML:@objroot@%.xml=@srcroot@%.3) DOCS := $(DOCS_HTML) $(DOCS_MAN3) CTESTS := @srcroot@test/allocated.c @srcroot@test/allocm.c \ - @srcroot@test/mremap.c @srcroot@test/posix_memalign.c \ - @srcroot@test/rallocm.c @srcroot@test/thread_arena.c + @srcroot@test/bitmap.c @srcroot@test/mremap.c \ + @srcroot@test/posix_memalign.c @srcroot@test/rallocm.c \ + @srcroot@test/thread_arena.c .PHONY: all dist doc_html doc_man doc .PHONY: install_bin install_include install_lib @@ -127,6 +128,9 @@ doc: $(DOCS) $(CC) $(CFLAGS) -c $(CPPFLAGS) -I@objroot@test -o $@ $< @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) -I@objroot@test $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" +# Automatic dependency generation misses #include "*.c". 
+@objroot@test/bitmap.o : @objroot@src/bitmap.o + @objroot@test/%: @objroot@test/%.o \ @objroot@lib/libjemalloc@install_suffix@.$(SO) @mkdir -p $(@D) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index dfe2b9b..dc77d75 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -132,6 +132,16 @@ else fi AC_DEFINE_UNQUOTED([LG_SIZEOF_INT], [$LG_SIZEOF_INT]) +AC_CHECK_SIZEOF([long]) +if test "x${ac_cv_sizeof_long}" = "x8" ; then + LG_SIZEOF_LONG=3 +elif test "x${ac_cv_sizeof_long}" = "x4" ; then + LG_SIZEOF_LONG=2 +else + AC_MSG_ERROR([Unsupported long size: ${ac_cv_sizeof_long}]) +fi +AC_DEFINE_UNQUOTED([LG_SIZEOF_LONG], [$LG_SIZEOF_LONG]) + AC_CANONICAL_HOST dnl CPU-specific settings. CPU_SPINWAIT="" @@ -753,6 +763,14 @@ if test "x${enable_tls}" = "x0" ; then fi dnl ============================================================================ +dnl Check for ffsl(3), and fail if not found. This function exists on all +dnl platforms that jemalloc currently has a chance of functioning on without +dnl modification. + +AC_CHECK_FUNC([ffsl], [], + [AC_MSG_ERROR([Cannot build without ffsl(3)])]) + +dnl ============================================================================ dnl Check for allocator-related functions that should be wrapped. AC_CHECK_FUNC([memalign], diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index bd983f2..1744b45 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -209,18 +209,15 @@ struct arena_run_s { /* Bin this run is associated with. */ arena_bin_t *bin; - /* Stack of available freed regions, or NULL. */ - void *avail; - - /* Next region that has never been allocated, or run boundary. */ - void *next; + /* Index of next region that has never been allocated, or nregs. */ + uint32_t nextind; /* Number of free regions in run. */ unsigned nfree; }; /* - * Read-only information associated with each element for arena_t's bins array + * Read-only information associated with each element of arena_t's bins array * is stored separately, partly to reduce memory usage (only one copy, rather * than one per arena), but mainly to avoid false cacheline sharing. */ @@ -234,6 +231,18 @@ struct arena_bin_info_s { /* Total number of regions in a run for this bin's size class. */ uint32_t nregs; + /* + * Offset of first bitmap_t element in a run header for this bin's size + * class. + */ + uint32_t bitmap_offset; + + /* + * Metadata used to manipulate bitmaps for runs associated with this + * bin. + */ + bitmap_info_t bitmap_info; + #ifdef JEMALLOC_PROF /* * Offset of first (prof_ctx_t *) in a run header for this bin's size @@ -397,7 +406,7 @@ struct arena_s { extern size_t opt_lg_qspace_max; extern size_t opt_lg_cspace_max; -extern ssize_t opt_lg_dirty_mult; +extern ssize_t opt_lg_dirty_mult; /* * small_size2bin is a compact lookup table that rounds request sizes up to * size classes. In order to reduce cache footprint, the table is compressed, @@ -498,7 +507,13 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) unsigned shift, diff, regind; size_t size; - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); + /* + * Freeing a pointer lower than region zero can cause assertion + * failure. + */ + assert((uintptr_t)ptr >= (uintptr_t)run + + (uintptr_t)bin_info->reg0_offset); /* * Avoid doing division with a variable divisor if possible. 
Using @@ -583,7 +598,7 @@ arena_prof_ctx_get(const void *ptr) arena_bin_info_t *bin_info = &arena_bin_info[binind]; unsigned regind; - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); regind = arena_run_regind(run, bin_info, ptr); ret = *(prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset + (regind * @@ -618,7 +633,7 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) arena_bin_info_t *bin_info; unsigned regind; - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); binind = arena_bin_index(chunk->arena, bin); bin_info = &arena_bin_info[binind]; regind = arena_run_regind(run, bin_info, ptr); @@ -639,7 +654,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) arena_chunk_map_t *mapelm; assert(arena != NULL); - assert(arena->magic == ARENA_MAGIC); + dassert(arena->magic == ARENA_MAGIC); assert(chunk->arena == arena); assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); @@ -662,9 +677,9 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; -#ifndef NDEBUG +#ifdef JEMALLOC_DEBUG { size_t binind = arena_bin_index(arena, bin); arena_bin_info_t *bin_info = diff --git a/jemalloc/include/jemalloc/internal/bitmap.h b/jemalloc/include/jemalloc/internal/bitmap.h new file mode 100644 index 0000000..4bb2212 --- /dev/null +++ b/jemalloc/include/jemalloc/internal/bitmap.h @@ -0,0 +1,184 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ +#define LG_BITMAP_MAXBITS 18 + +typedef struct bitmap_level_s bitmap_level_t; +typedef struct bitmap_info_s bitmap_info_t; +typedef unsigned long bitmap_t; +#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG + +/* Number of bits per group. */ +#define LG_BITMAP_GROUP_NBITS (LG_SIZEOF_BITMAP + 3) +#define BITMAP_GROUP_NBITS (ZU(1) << LG_BITMAP_GROUP_NBITS) +#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1) + +/* Maximum number of levels possible. */ +#define BITMAP_MAX_LEVELS \ + (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \ + + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP) + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +struct bitmap_level_s { + /* Offset of this level's groups within the array of groups. */ + size_t group_offset; +}; + +struct bitmap_info_s { + /* Logical number of bits in bitmap (stored at bottom level). */ + size_t nbits; + + /* Number of levels necessary for nbits. */ + unsigned nlevels; + + /* + * Only the first (nlevels+1) elements are used, and levels are ordered + * bottom to top (e.g. the bottom level is stored in levels[0]). 
+ */ + bitmap_level_t levels[BITMAP_MAX_LEVELS+1]; +}; + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +void bitmap_info_init(bitmap_info_t *binfo, size_t nbits); +size_t bitmap_info_ngroups(const bitmap_info_t *binfo); +size_t bitmap_size(size_t nbits); +void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#ifndef JEMALLOC_ENABLE_INLINE +bool bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo); +bool bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit); +void bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit); +size_t bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo); +void bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_BITMAP_C_)) +JEMALLOC_INLINE bool +bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) +{ + unsigned rgoff = binfo->levels[binfo->nlevels].group_offset - 1; + bitmap_t rg = bitmap[rgoff]; + /* The bitmap is full iff the root group is 0. */ + return (rg == 0); +} + +JEMALLOC_INLINE bool +bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) +{ + size_t goff; + bitmap_t g; + + assert(bit < binfo->nbits); + goff = bit >> LG_BITMAP_GROUP_NBITS; + g = bitmap[goff]; + return (!(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)))); +} + +JEMALLOC_INLINE void +bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) +{ + size_t goff; + bitmap_t *gp; + bitmap_t g; + + assert(bit < binfo->nbits); + assert(bitmap_get(bitmap, binfo, bit) == false); + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[goff]; + g = *gp; + assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))); + g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + assert(bitmap_get(bitmap, binfo, bit)); + /* Propagate group state transitions up the tree. */ + if (g == 0) { + unsigned i; + for (i = 1; i < binfo->nlevels; i++) { + bit = goff; + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[binfo->levels[i].group_offset + goff]; + g = *gp; + assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))); + g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + if (g != 0) + break; + } + } +} + +/* sfu: set first unset. */ +JEMALLOC_INLINE size_t +bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) +{ + size_t bit; + bitmap_t g; + unsigned i; + + assert(bitmap_full(bitmap, binfo) == false); + + i = binfo->nlevels - 1; + g = bitmap[binfo->levels[i].group_offset]; + bit = ffsl(g) - 1; + while (i > 0) { + i--; + g = bitmap[binfo->levels[i].group_offset + bit]; + bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1); + } + + bitmap_set(bitmap, binfo, bit); + return (bit); +} + +JEMALLOC_INLINE void +bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) +{ + size_t goff; + bitmap_t *gp; + bitmap_t g; + bool propagate; + + assert(bit < binfo->nbits); + assert(bitmap_get(bitmap, binfo, bit)); + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[goff]; + g = *gp; + propagate = (g == 0); + assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0); + g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + assert(bitmap_get(bitmap, binfo, bit) == false); + /* Propagate group state transitions up the tree. 
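	 * A hedged example, assuming 64-bit groups: freeing region 70 sets
	 * bit 6 (70 & 63) of level-0 group 1 (70 >> 6); the raw bits are
	 * inverted, so 1 means free. If that group was 0 beforehand (fully
	 * allocated), bit 1 of the level-1 group is set as well, so that
	 * bitmap_sfu() can reach the newly freed region from the root.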
*/ + if (propagate) { + unsigned i; + for (i = 1; i < binfo->nlevels; i++) { + bit = goff; + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[binfo->levels[i].group_offset + goff]; + g = *gp; + propagate = (g == 0); + assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) + == 0); + g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + if (propagate == false) + break; + } + } +} + +#endif + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 34b2a23..a80fc7c 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -55,8 +55,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); * Define a custom assert() in order to reduce the chances of deadlock during * assertion failure. */ -#ifdef JEMALLOC_DEBUG -# define assert(e) do { \ +#ifndef assert +# ifdef JEMALLOC_DEBUG +# define assert(e) do { \ if (!(e)) { \ char line_buf[UMAX2S_BUFSIZE]; \ malloc_write(": "); \ @@ -70,8 +71,15 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); abort(); \ } \ } while (0) +# else +# define assert(e) +# endif +#endif + +#ifdef JEMALLOC_DEBUG +# define dassert(e) assert(e) #else -#define assert(e) +# define dassert(e) #endif /* @@ -146,7 +154,19 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #define QUANTUM_CEILING(a) \ (((a) + QUANTUM_MASK) & ~QUANTUM_MASK) +#define LONG ((size_t)(1U << LG_SIZEOF_LONG)) +#define LONG_MASK (LONG - 1) + +/* Return the smallest long multiple that is >= a. */ +#define LONG_CEILING(a) \ + (((a) + LONG_MASK) & ~LONG_MASK) + #define SIZEOF_PTR (1U << LG_SIZEOF_PTR) +#define PTR_MASK (SIZEOF_PTR - 1) + +/* Return the smallest (void *) multiple that is >= a. */ +#define PTR_CEILING(a) \ + (((a) + PTR_MASK) & ~PTR_MASK) /* * Maximum size of L1 cache line. This is used to avoid cache line aliasing. @@ -199,6 +219,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" @@ -222,6 +243,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" @@ -335,6 +357,7 @@ void jemalloc_postfork(void); #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" @@ -545,6 +568,7 @@ thread_allocated_get(void) #endif #endif +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/arena.h" @@ -628,7 +652,7 @@ isalloc(const void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. 
*/ - assert(chunk->arena->magic == ARENA_MAGIC); + dassert(chunk->arena->magic == ARENA_MAGIC); #ifdef JEMALLOC_PROF ret = arena_salloc_demote(ptr); diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index db63465..f943873 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -348,7 +348,7 @@ prof_ctx_get(const void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); + dassert(chunk->arena->magic == ARENA_MAGIC); ret = arena_prof_ctx_get(ptr); } else @@ -367,7 +367,7 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); + dassert(chunk->arena->magic == ARENA_MAGIC); arena_prof_ctx_set(ptr, ctx); } else diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index ab02545..5434d32 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -2,6 +2,7 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +typedef struct tcache_bin_info_s tcache_bin_info_t; typedef struct tcache_bin_s tcache_bin_t; typedef struct tcache_s tcache_t; @@ -32,14 +33,21 @@ typedef struct tcache_s tcache_t; /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS +/* + * Read-only information associated with each element of tcache_t's tbins array + * is stored separately, mainly to reduce memory usage. + */ +struct tcache_bin_info_s { + unsigned ncached_max; /* Upper limit on ncached. */ +}; + struct tcache_bin_s { # ifdef JEMALLOC_STATS tcache_bin_stats_t tstats; # endif unsigned low_water; /* Min # cached since last GC. */ unsigned ncached; /* # of cached objects. */ - unsigned ncached_max; /* Upper limit on ncached. */ - void *avail; /* Chain of available objects. */ + void **avail; /* Stack of available objects. */ }; struct tcache_s { @@ -53,6 +61,12 @@ struct tcache_s { unsigned ev_cnt; /* Event count since incremental GC. */ unsigned next_gc_bin; /* Next bin to GC. */ tcache_bin_t tbins[1]; /* Dynamically sized. */ + /* + * The pointer stacks associated with tbins follow as a contiguous + * array. During tcache initialization, the avail pointer in each + * element of tbins is initialized to point to the proper offset within + * this array. + */ }; #endif /* JEMALLOC_H_STRUCTS */ @@ -63,6 +77,8 @@ extern bool opt_tcache; extern ssize_t opt_lg_tcache_max; extern ssize_t opt_lg_tcache_gc_sweep; +extern tcache_bin_info_t *tcache_bin_info; + /* Map of thread-specific caches. 
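 * A condensed sketch of the new avail-stack protocol, paraphrasing
 * tcache_alloc_easy() and tcache_dalloc_small() in this diff, with
 * ncached indexing one past the most recently cached object:
 *
 *	allocate: tbin->ncached--; ret = tbin->avail[tbin->ncached];
 *	cache:    tbin->avail[tbin->ncached] = ptr; tbin->ncached++;
 *
 * Unlike the old embedded free list, a refill or pop touches only the
 * contiguous pointer array rather than each cached object, which is the
 * locality win described in the commit message.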
*/ #ifndef NO_TLS extern __thread tcache_t *tcache_tls @@ -109,7 +125,7 @@ void tcache_destroy(tcache_t *tcache); #ifdef JEMALLOC_STATS void tcache_stats_merge(tcache_t *tcache, arena_t *arena); #endif -void tcache_boot(void); +bool tcache_boot(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ @@ -211,8 +227,7 @@ tcache_alloc_easy(tcache_bin_t *tbin) tbin->ncached--; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; - ret = tbin->avail; - tbin->avail = *(void **)ret; + ret = tbin->avail[tbin->ncached]; return (ret); } @@ -312,6 +327,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) arena_run_t *run; arena_bin_t *bin; tcache_bin_t *tbin; + tcache_bin_info_t *tbin_info; size_t pageind, binind; arena_chunk_map_t *mapelm; @@ -323,7 +339,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) mapelm = &chunk->map[pageind-map_bias]; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; binind = ((uintptr_t)bin - (uintptr_t)&arena->bins) / sizeof(arena_bin_t); @@ -335,16 +351,17 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) #endif tbin = &tcache->tbins[binind]; - if (tbin->ncached == tbin->ncached_max) { - tcache_bin_flush_small(tbin, binind, (tbin->ncached_max >> 1) + tbin_info = &tcache_bin_info[binind]; + if (tbin->ncached == tbin_info->ncached_max) { + tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >> + 1) #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) , tcache #endif ); } - assert(tbin->ncached < tbin->ncached_max); - *(void **)ptr = tbin->avail; - tbin->avail = ptr; + assert(tbin->ncached < tbin_info->ncached_max); + tbin->avail[tbin->ncached] = ptr; tbin->ncached++; tcache_event(tcache); @@ -357,6 +374,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) arena_chunk_t *chunk; size_t pageind, binind; tcache_bin_t *tbin; + tcache_bin_info_t *tbin_info; assert((size & PAGE_MASK) == 0); assert(arena_salloc(ptr) > small_maxclass); @@ -373,16 +391,17 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) #endif tbin = &tcache->tbins[binind]; - if (tbin->ncached == tbin->ncached_max) { - tcache_bin_flush_large(tbin, binind, (tbin->ncached_max >> 1) + tbin_info = &tcache_bin_info[binind]; + if (tbin->ncached == tbin_info->ncached_max) { + tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >> + 1) #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) , tcache #endif ); } - assert(tbin->ncached < tbin->ncached_max); - *(void **)ptr = tbin->avail; - tbin->avail = ptr; + assert(tbin->ncached < tbin_info->ncached_max); + tbin->avail[tbin->ncached] = ptr; tbin->ncached++; tcache_event(tcache); diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 773c9f8..d669841 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -140,4 +140,7 @@ /* sizeof(int) == 2^LG_SIZEOF_INT. */ #undef LG_SIZEOF_INT +/* sizeof(long) == 2^LG_SIZEOF_LONG. 
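 * (For example, on typical LP64 platforms sizeof(long) == 8, so
 * LG_SIZEOF_LONG == 3 and bitmap groups hold 64 bits; on ILP32 platforms
 * sizeof(long) == 4, LG_SIZEOF_LONG == 2, and groups hold 32 bits.)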
*/ +#undef LG_SIZEOF_LONG + #endif /* JEMALLOC_DEFS_H_ */ diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index e49b8ed..87bd9bb 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -253,59 +253,45 @@ static inline void * arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { void *ret; + unsigned regind; + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); assert(run->nfree > 0); + assert(bitmap_full(bitmap, &bin_info->bitmap_info) == false); + regind = bitmap_sfu(bitmap, &bin_info->bitmap_info); + ret = (void *)((uintptr_t)run + (uintptr_t)bin_info->reg0_offset + + (uintptr_t)(bin_info->reg_size * regind)); run->nfree--; - ret = run->avail; - if (ret != NULL) { - /* Double free can cause assertion failure.*/ - assert(ret != NULL); - /* Write-after free can cause assertion failure. */ - assert((uintptr_t)ret >= (uintptr_t)run + - (uintptr_t)bin_info->reg0_offset); - assert((uintptr_t)ret < (uintptr_t)run->next); - assert(((uintptr_t)ret - ((uintptr_t)run + - (uintptr_t)bin_info->reg0_offset)) % - (uintptr_t)bin_info->reg_size == 0); - run->avail = *(void **)ret; - return (ret); - } - ret = run->next; - run->next = (void *)((uintptr_t)ret + (uintptr_t)bin_info->reg_size); - assert(ret != NULL); + if (regind == run->nextind) + run->nextind++; + assert(regind < run->nextind); return (ret); } static inline void arena_run_reg_dalloc(arena_run_t *run, void *ptr) { - -#ifndef NDEBUG arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; + unsigned regind = arena_run_regind(run, bin_info, ptr); + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); + assert(run->nfree < bin_info->nregs); /* Freeing an interior pointer can cause assertion failure. */ assert(((uintptr_t)ptr - ((uintptr_t)run + (uintptr_t)bin_info->reg0_offset)) % (uintptr_t)bin_info->reg_size == 0); - /* - * Freeing a pointer lower than region zero can cause assertion - * failure. - */ assert((uintptr_t)ptr >= (uintptr_t)run + (uintptr_t)bin_info->reg0_offset); - /* - * Freeing a pointer past in the run's frontier can cause assertion - * failure. - */ - assert((uintptr_t)ptr < (uintptr_t)run->next); -#endif + /* Freeing an unallocated pointer can cause assertion failure. */ + assert(bitmap_get(bitmap, &bin_info->bitmap_info, regind)); - *(void **)ptr = run->avail; - run->avail = ptr; + bitmap_unset(bitmap, &bin_info->bitmap_info, regind); run->nfree++; } @@ -772,7 +758,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) chunk + (uintptr_t)(pageind << PAGE_SHIFT)); assert((mapelm->bits >> PAGE_SHIFT) == 0); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(arena, run->bin); arena_bin_info_t *bin_info = @@ -1224,12 +1210,14 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) malloc_mutex_lock(&arena->lock); run = arena_run_alloc(arena, bin_info->run_size, false, false); if (run != NULL) { + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); + /* Initialize run internals. 
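		 * A hedged illustration of the indexing used by
		 * arena_run_reg_alloc() above (numbers assumed, not from
		 * the patch): region regind lives at
		 *
		 *	(uintptr_t)run + reg0_offset + regind * reg_size
		 *
		 * so with reg0_offset == 96 and reg_size == 32, region 3 of
		 * a run at 0x1000 sits at 0x1000 + 96 + 3*32 == 0x10c0.
		 * Handing out the lowest free regind from bitmap_sfu() is
		 * what restores lowest-address-first allocation.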
*/ run->bin = bin; - run->avail = NULL; - run->next = (void *)((uintptr_t)run + - (uintptr_t)bin_info->reg0_offset); + run->nextind = 0; run->nfree = bin_info->nregs; + bitmap_init(bitmap, &bin_info->bitmap_info); #ifdef JEMALLOC_DEBUG run->magic = ARENA_RUN_MAGIC; #endif @@ -1289,12 +1277,11 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) bin->runcur = NULL; run = arena_bin_nonfull_run_get(arena, bin); if (bin->runcur != NULL && bin->runcur->nfree > 0) { - /* * Another thread updated runcur while this one ran without the * bin lock in arena_bin_nonfull_run_get(). */ - assert(bin->runcur->magic == ARENA_RUN_MAGIC); + dassert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); ret = arena_run_reg_alloc(bin->runcur, bin_info); if (run != NULL) { @@ -1302,7 +1289,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) /* * arena_run_alloc() may have allocated run, or it may - * have pulled it from the bin's run tree. Therefore + * have pulled run from the bin's run tree. Therefore * it is unsafe to make any assumptions about how run * has previously been used, and arena_bin_lower_run() * must be called, as if a region were just deallocated @@ -1322,7 +1309,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) bin->runcur = run; - assert(bin->runcur->magic == ARENA_RUN_MAGIC); + dassert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); return (arena_run_reg_alloc(bin->runcur, bin_info)); @@ -1365,15 +1352,15 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind #endif bin = &arena->bins[binind]; malloc_mutex_lock(&bin->lock); - for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) { + for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> 1); + i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) break; - *(void **)ptr = tbin->avail; - tbin->avail = ptr; + tbin->avail[i] = ptr; } #ifdef JEMALLOC_STATS bin->stats.allocated += (i - tbin->ncached) * @@ -1607,7 +1594,7 @@ arena_salloc(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + @@ -1660,7 +1647,7 @@ arena_salloc_demote(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + @@ -1730,8 +1717,9 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, /******************************/ npages = bin_info->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk) - >> PAGE_SHIFT); + past = (size_t)(PAGE_CEILING((uintptr_t)run + + (uintptr_t)bin_info->reg0_offset + (uintptr_t)(run->nextind * + bin_info->reg_size) - (uintptr_t)chunk) >> PAGE_SHIFT); malloc_mutex_lock(&arena->lock); /* @@ -1817,7 +1805,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t 
*chunk, void *ptr, pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; size_t binind = arena_bin_index(arena, bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; @@ -2065,7 +2053,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - assert(arena->magic == ARENA_MAGIC); + dassert(arena->magic == ARENA_MAGIC); if (psize < oldsize) { #ifdef JEMALLOC_FILL @@ -2405,8 +2393,8 @@ small_size2bin_init_hard(void) * *) bin_info->run_size <= arena_maxclass * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). * - * bin_info->nregs and bin_info->reg0_offset are also calculated here, since - * these settings are all interdependent. + * bin_info->nregs, bin_info->bitmap_offset, and bin_info->reg0_offset are also + * calculated here, since these settings are all interdependent. */ static size_t bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) @@ -2414,6 +2402,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) size_t try_run_size, good_run_size; uint32_t try_nregs, good_nregs; uint32_t try_hdr_size, good_hdr_size; + uint32_t try_bitmap_offset, good_bitmap_offset; #ifdef JEMALLOC_PROF uint32_t try_ctx0_offset, good_ctx0_offset; #endif @@ -2438,6 +2427,11 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) do { try_nregs--; try_hdr_size = sizeof(arena_run_t); + /* Pad to a long boundary. */ + try_hdr_size = LONG_CEILING(try_hdr_size); + try_bitmap_offset = try_hdr_size; + /* Add space for bitmap. */ + try_hdr_size += bitmap_size(try_nregs); #ifdef JEMALLOC_PROF if (opt_prof && prof_promote == false) { /* Pad to a quantum boundary. */ @@ -2460,6 +2454,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) good_run_size = try_run_size; good_nregs = try_nregs; good_hdr_size = try_hdr_size; + good_bitmap_offset = try_bitmap_offset; #ifdef JEMALLOC_PROF good_ctx0_offset = try_ctx0_offset; #endif @@ -2473,6 +2468,11 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) do { try_nregs--; try_hdr_size = sizeof(arena_run_t); + /* Pad to a long boundary. */ + try_hdr_size = LONG_CEILING(try_hdr_size); + try_bitmap_offset = try_hdr_size; + /* Add space for bitmap. */ + try_hdr_size += bitmap_size(try_nregs); #ifdef JEMALLOC_PROF if (opt_prof && prof_promote == false) { /* Pad to a quantum boundary. */ @@ -2498,6 +2498,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) /* Copy final settings. */ bin_info->run_size = good_run_size; bin_info->nregs = good_nregs; + bin_info->bitmap_offset = good_bitmap_offset; #ifdef JEMALLOC_PROF bin_info->ctx0_offset = good_ctx0_offset; #endif @@ -2525,6 +2526,7 @@ bin_info_init(void) bin_info = &arena_bin_info[i]; bin_info->reg_size = (1U << (LG_TINY_MIN + i)); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } #endif @@ -2533,6 +2535,7 @@ bin_info_init(void) bin_info = &arena_bin_info[i]; bin_info->reg_size = (i - ntbins + 1) << LG_QUANTUM; prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } /* Cacheline-spaced bins. 
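	 * A plausible LP64 debug-build example of the layout computed by
	 * bin_info_run_size_calc() (sizes assumed, not from the patch):
	 * with sizeof(arena_run_t) == 20, LONG_CEILING() pads the header
	 * to 24, so bitmap_offset == 24; a run with 253 regions then needs
	 * bitmap_size(253) == 40 bytes (four level-0 groups plus one
	 * summary group), for a 64-byte header before any prof_ctx
	 * padding.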
*/ @@ -2541,6 +2544,7 @@ bin_info_init(void) bin_info->reg_size = cspace_min + ((i - (ntbins + nqbins)) << LG_CACHELINE); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } /* Subpage-spaced bins. */ @@ -2549,6 +2553,7 @@ bin_info_init(void) bin_info->reg_size = sspace_min + ((i - (ntbins + nqbins + ncbins)) << LG_SUBPAGE); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } return (false); diff --git a/jemalloc/src/bitmap.c b/jemalloc/src/bitmap.c new file mode 100644 index 0000000..b47e262 --- /dev/null +++ b/jemalloc/src/bitmap.c @@ -0,0 +1,90 @@ +#define JEMALLOC_BITMAP_C_ +#include "jemalloc/internal/jemalloc_internal.h" + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +static size_t bits2groups(size_t nbits); + +/******************************************************************************/ + +static size_t +bits2groups(size_t nbits) +{ + + return ((nbits >> LG_BITMAP_GROUP_NBITS) + + !!(nbits & BITMAP_GROUP_NBITS_MASK)); +} + +void +bitmap_info_init(bitmap_info_t *binfo, size_t nbits) +{ + unsigned i; + size_t group_count; + + assert(nbits > 0); + assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS)); + + /* + * Compute the number of groups necessary to store nbits bits, and + * progressively work upward through the levels until reaching a level + * that requires only one group. + */ + binfo->levels[0].group_offset = 0; + group_count = bits2groups(nbits); + for (i = 1; group_count > 1; i++) { + assert(i < BITMAP_MAX_LEVELS); + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + group_count = bits2groups(group_count); + } + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + binfo->nlevels = i; + binfo->nbits = nbits; +} + +size_t +bitmap_info_ngroups(const bitmap_info_t *binfo) +{ + + return (binfo->levels[binfo->nlevels].group_offset << LG_SIZEOF_BITMAP); +} + +size_t +bitmap_size(size_t nbits) +{ + bitmap_info_t binfo; + + bitmap_info_init(&binfo, nbits); + return (bitmap_info_ngroups(&binfo)); +} + +void +bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo) +{ + size_t extra; + unsigned i; + + /* + * Bits are actually inverted with regard to the external bitmap + * interface, so the bitmap starts out with all 1 bits, except for + * trailing unused bits (if any). Note that each group uses bit 0 to + * correspond to the first logical bit in the group, so extra bits + * are the most significant bits of the last group. 
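	 * A hedged example, assuming 64-bit groups: for nbits == 500, the
	 * last of the eight level-0 groups covers only 500 - 448 == 52
	 * bits, so extra == (64 - (500 & 63)) & 63 == 12 and that group is
	 * shifted right by 12, clearing its 12 unused high bits.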
+ */ + memset(bitmap, 0xffU, binfo->levels[binfo->nlevels].group_offset << + LG_SIZEOF_BITMAP); + extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK)) + & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) + bitmap[binfo->levels[1].group_offset - 1] >>= extra; + for (i = 1; i < binfo->nlevels; i++) { + size_t group_count = binfo->levels[i].group_offset - + binfo->levels[i-1].group_offset; + extra = (BITMAP_GROUP_NBITS - (group_count & + BITMAP_GROUP_NBITS_MASK)) & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) + bitmap[binfo->levels[i+1].group_offset - 1] >>= extra; + } +} diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index e386a53..75ae7fd 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -73,7 +73,7 @@ ckh_isearch(ckh_t *ckh, const void *key) size_t hash1, hash2, bucket, cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); ckh->hash(key, ckh->lg_curbuckets, &hash1, &hash2); @@ -396,7 +396,7 @@ ckh_delete(ckh_t *ckh) { assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); #ifdef CKH_VERBOSE malloc_printf( @@ -421,7 +421,7 @@ ckh_count(ckh_t *ckh) { assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); return (ckh->count); } @@ -452,7 +452,7 @@ ckh_insert(ckh_t *ckh, const void *key, const void *data) bool ret; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); assert(ckh_search(ckh, key, NULL, NULL)); #ifdef CKH_COUNT @@ -477,7 +477,7 @@ ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data) size_t cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); cell = ckh_isearch(ckh, searchkey); if (cell != SIZE_T_MAX) { @@ -509,7 +509,7 @@ ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data) size_t cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); cell = ckh_isearch(ckh, searchkey); if (cell != SIZE_T_MAX) { diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index c1aadda..9f2fa92 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -693,7 +693,10 @@ malloc_init_hard(void) } #ifdef JEMALLOC_TCACHE - tcache_boot(); + if (tcache_boot()) { + malloc_mutex_unlock(&init_lock); + return (true); + } #endif if (huge_boot()) { diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 88e1cc7..2f4804e 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -8,6 +8,9 @@ bool opt_tcache = true; ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; +tcache_bin_info_t *tcache_bin_info; +static unsigned stack_nelms; /* Total stack elms per tcache. */ + /* Map of thread-specific caches. */ #ifndef NO_TLS __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); @@ -55,21 +58,19 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - void *flush, *deferred, *ptr; + void *ptr; unsigned i, nflush, ndeferred; - bool first_pass; #ifdef JEMALLOC_STATS bool merged_stats = false; #endif assert(binind < nbins); assert(rem <= tbin->ncached); - assert(tbin->ncached > 0 || tbin->avail == NULL); - for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass = - true; flush != NULL; flush = deferred, nflush = ndeferred) { + for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena bin associated with the first object. 
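		 * A condensed sketch of the pass structure, paraphrasing
		 * this function rather than quoting it: each pass locks
		 * the bin belonging to the arena that owns avail[0],
		 * returns every object owned by that arena, and compacts
		 * the remainder to the front of avail for the next pass:
		 *
		 *	while (nflush > 0) {
		 *		lock bin of arena owning avail[0];
		 *		for (i = ndeferred = 0; i < nflush; i++) {
		 *			if (avail[i] belongs to that arena)
		 *				return avail[i] to its run;
		 *			else
		 *				avail[ndeferred++] = avail[i];
		 *		}
		 *		unlock;
		 *		nflush = ndeferred;
		 *	}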
*/ - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + tbin->avail[0]); arena_t *arena = chunk->arena; arena_bin_t *bin = &arena->bins[binind]; @@ -92,12 +93,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem tbin->tstats.nrequests = 0; } #endif - deferred = NULL; ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = flush; + ptr = tbin->avail[i]; assert(ptr != NULL); - flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) { size_t pageind = ((uintptr_t)ptr - @@ -112,17 +111,11 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem * locked. Stash the object, so that it can be * handled in a future pass. */ - *(void **)ptr = deferred; - deferred = ptr; + tbin->avail[ndeferred] = ptr; ndeferred++; } } malloc_mutex_unlock(&bin->lock); - - if (first_pass) { - tbin->avail = flush; - first_pass = false; - } } #ifdef JEMALLOC_STATS if (merged_stats == false) { @@ -139,6 +132,8 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem } #endif + memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], + rem * sizeof(void *)); tbin->ncached = rem; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -151,18 +146,19 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - void *flush, *deferred, *ptr; + void *ptr; unsigned i, nflush, ndeferred; - bool first_pass; +#ifdef JEMALLOC_STATS + bool merged_stats = false; +#endif assert(binind < nhbins); assert(rem <= tbin->ncached); - assert(tbin->ncached > 0 || tbin->avail == NULL); - for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass = - true; flush != NULL; flush = deferred, nflush = ndeferred) { + for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena associated with the first object. */ - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + tbin->avail[0]); arena_t *arena = chunk->arena; malloc_mutex_lock(&arena->lock); @@ -174,6 +170,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem tcache->prof_accumbytes = 0; #endif #ifdef JEMALLOC_STATS + merged_stats = true; arena->stats.nrequests_large += tbin->tstats.nrequests; arena->stats.lstats[binind - nbins].nrequests += tbin->tstats.nrequests; @@ -182,12 +179,10 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) } #endif - deferred = NULL; ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = flush; + ptr = tbin->avail[i]; assert(ptr != NULL); - flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) arena_dalloc_large(arena, chunk, ptr); @@ -198,19 +193,30 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem * Stash the object, so that it can be handled * in a future pass. */ - *(void **)ptr = deferred; - deferred = ptr; + tbin->avail[ndeferred] = ptr; ndeferred++; } } malloc_mutex_unlock(&arena->lock); - - if (first_pass) { - tbin->avail = flush; - first_pass = false; - } } +#ifdef JEMALLOC_STATS + if (merged_stats == false) { + /* + * The flush loop didn't happen to flush to this thread's + * arena, so the stats didn't get merged. Manually do so now. 
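		 * Illustration of the retention step shared with
		 * tcache_bin_flush_small() (numbers assumed): with
		 * ncached == 8 and rem == 3, the five oldest objects,
		 * avail[0..4], are flushed, and the memmove() below slides
		 * the three most recently cached, avail[5..7], down to
		 * avail[0..2]; flushing is therefore LRU, as the commit
		 * message notes.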
+ */ + arena_t *arena = tcache->arena; + malloc_mutex_lock(&arena->lock); + arena->stats.nrequests_large += tbin->tstats.nrequests; + arena->stats.lstats[binind - nbins].nrequests += + tbin->tstats.nrequests; + tbin->tstats.nrequests = 0; + malloc_mutex_unlock(&arena->lock); + } +#endif + memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], + rem * sizeof(void *)); tbin->ncached = rem; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -220,10 +226,14 @@ tcache_t * tcache_create(arena_t *arena) { tcache_t *tcache; - size_t size; + size_t size, stack_offset; unsigned i; size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins); + /* Naturally align the pointer stacks. */ + size = PTR_CEILING(size); + stack_offset = size; + size += stack_nelms * sizeof(void *); /* * Round up to the nearest multiple of the cacheline size, in order to * avoid the possibility of false cacheline sharing. @@ -236,6 +246,8 @@ tcache_create(arena_t *arena) if (size <= small_maxclass) tcache = (tcache_t *)arena_malloc_small(arena, size, true); + else if (size <= tcache_maxclass) + tcache = (tcache_t *)arena_malloc_large(arena, size, true); else tcache = (tcache_t *)icalloc(size); @@ -252,15 +264,11 @@ tcache_create(arena_t *arena) tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); - for (i = 0; i < nbins; i++) { - if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { - tcache->tbins[i].ncached_max = (arena_bin_info[i].nregs - << 1); - } else - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX; + for (i = 0; i < nhbins; i++) { + tcache->tbins[i].avail = (void **)((uintptr_t)tcache + + (uintptr_t)stack_offset); + stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); } - for (; i < nhbins; i++) - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE; TCACHE_SET(tcache); @@ -271,6 +279,7 @@ void tcache_destroy(tcache_t *tcache) { unsigned i; + size_t tcache_size; #ifdef JEMALLOC_STATS /* Unlink from list of extant tcaches. */ @@ -327,7 +336,8 @@ tcache_destroy(tcache_t *tcache) } #endif - if (arena_salloc(tcache) <= small_maxclass) { + tcache_size = arena_salloc(tcache); + if (tcache_size <= small_maxclass) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); arena_t *arena = chunk->arena; size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >> @@ -341,6 +351,13 @@ tcache_destroy(tcache_t *tcache) malloc_mutex_lock(&bin->lock); arena_dalloc_bin(arena, chunk, tcache, mapelm); malloc_mutex_unlock(&bin->lock); + } else if (tcache_size <= tcache_maxclass) { + arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); + arena_t *arena = chunk->arena; + + malloc_mutex_lock(&arena->lock); + arena_dalloc_large(arena, chunk, tcache); + malloc_mutex_unlock(&arena->lock); } else idalloc(tcache); } @@ -397,11 +414,13 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) } #endif -void +bool tcache_boot(void) { if (opt_tcache) { + unsigned i; + /* * If necessary, clamp opt_lg_tcache_max, now that * small_maxclass and arena_maxclass are known. @@ -416,6 +435,28 @@ tcache_boot(void) nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT); + /* Initialize tcache_bin_info. 
*/ + tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins * + sizeof(tcache_bin_info_t)); + if (tcache_bin_info == NULL) + return (true); + stack_nelms = 0; + for (i = 0; i < nbins; i++) { + if ((arena_bin_info[i].nregs << 1) <= + TCACHE_NSLOTS_SMALL_MAX) { + tcache_bin_info[i].ncached_max = + (arena_bin_info[i].nregs << 1); + } else { + tcache_bin_info[i].ncached_max = + TCACHE_NSLOTS_SMALL_MAX; + } + stack_nelms += tcache_bin_info[i].ncached_max; + } + for (; i < nhbins; i++) { + tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE; + stack_nelms += tcache_bin_info[i].ncached_max; + } + /* Compute incremental GC event threshold. */ if (opt_lg_tcache_gc_sweep >= 0) { tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) / @@ -431,6 +472,8 @@ tcache_boot(void) abort(); } } + + return (false); } /******************************************************************************/ #endif /* JEMALLOC_TCACHE */ diff --git a/jemalloc/test/bitmap.c b/jemalloc/test/bitmap.c new file mode 100644 index 0000000..7a017c8 --- /dev/null +++ b/jemalloc/test/bitmap.c @@ -0,0 +1,153 @@ +#define JEMALLOC_MANGLE +#include "jemalloc_test.h" + +/* + * Avoid using the assert() from jemalloc_internal.h, since it requires + * internal libjemalloc functionality. + * */ +#include + +/* + * Directly include the bitmap code, since it isn't exposed outside + * libjemalloc. + */ +#include "../src/bitmap.c" + +#define MAXBITS 4500 + +static void +test_bitmap_size(void) +{ + size_t i, prev_size; + + prev_size = 0; + for (i = 1; i <= MAXBITS; i++) { + size_t size = bitmap_size(i); + assert(size >= prev_size); + prev_size = size; + } +} + +static void +test_bitmap_init(void) +{ + size_t i; + + for (i = 1; i <= MAXBITS; i++) { + bitmap_info_t binfo; + bitmap_info_init(&binfo, i); + { + size_t j; + bitmap_t bitmap[bitmap_info_ngroups(&binfo)]; + bitmap_init(bitmap, &binfo); + + for (j = 0; j < i; j++) + assert(bitmap_get(bitmap, &binfo, j) == false); + + } + } +} + +static void +test_bitmap_set(void) +{ + size_t i; + + for (i = 1; i <= MAXBITS; i++) { + bitmap_info_t binfo; + bitmap_info_init(&binfo, i); + { + size_t j; + bitmap_t bitmap[bitmap_info_ngroups(&binfo)]; + bitmap_init(bitmap, &binfo); + + for (j = 0; j < i; j++) + bitmap_set(bitmap, &binfo, j); + assert(bitmap_full(bitmap, &binfo)); + } + } +} + +static void +test_bitmap_unset(void) +{ + size_t i; + + for (i = 1; i <= MAXBITS; i++) { + bitmap_info_t binfo; + bitmap_info_init(&binfo, i); + { + size_t j; + bitmap_t bitmap[bitmap_info_ngroups(&binfo)]; + bitmap_init(bitmap, &binfo); + + for (j = 0; j < i; j++) + bitmap_set(bitmap, &binfo, j); + assert(bitmap_full(bitmap, &binfo)); + for (j = 0; j < i; j++) + bitmap_unset(bitmap, &binfo, j); + for (j = 0; j < i; j++) + bitmap_set(bitmap, &binfo, j); + assert(bitmap_full(bitmap, &binfo)); + } + } +} + +static void +test_bitmap_sfu(void) +{ + size_t i; + + for (i = 1; i <= MAXBITS; i++) { + bitmap_info_t binfo; + bitmap_info_init(&binfo, i); + { + ssize_t j; + bitmap_t bitmap[bitmap_info_ngroups(&binfo)]; + bitmap_init(bitmap, &binfo); + + /* Iteratively set bits starting at the beginning. */ + for (j = 0; j < i; j++) + assert(bitmap_sfu(bitmap, &binfo) == j); + assert(bitmap_full(bitmap, &binfo)); + + /* + * Iteratively unset bits starting at the end, and + * verify that bitmap_sfu() reaches the unset bits. 
+ */ + for (j = i - 1; j >= 0; j--) { + bitmap_unset(bitmap, &binfo, j); + assert(bitmap_sfu(bitmap, &binfo) == j); + bitmap_unset(bitmap, &binfo, j); + } + assert(bitmap_get(bitmap, &binfo, 0) == false); + + /* + * Iteratively set bits starting at the beginning, and + * verify that bitmap_sfu() looks past them. + */ + for (j = 1; j < i; j++) { + bitmap_set(bitmap, &binfo, j - 1); + assert(bitmap_sfu(bitmap, &binfo) == j); + bitmap_unset(bitmap, &binfo, j); + } + assert(bitmap_sfu(bitmap, &binfo) == i - 1); + assert(bitmap_full(bitmap, &binfo)); + } + } +} + +int +main(void) +{ + fprintf(stderr, "Test begin\n"); + + test_bitmap_size(); + test_bitmap_init(); + test_bitmap_set(); + test_bitmap_unset(); + test_bitmap_sfu(); + + fprintf(stderr, "Test end\n"); + return (0); +} diff --git a/jemalloc/test/bitmap.exp b/jemalloc/test/bitmap.exp new file mode 100644 index 0000000..369a88d --- /dev/null +++ b/jemalloc/test/bitmap.exp @@ -0,0 +1,2 @@ +Test begin +Test end -- cgit v0.12 From 9c43c13a35220c10d97a886616899189daceb359 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 10:53:15 -0700 Subject: Reverse tcache fill order. Refill the thread cache such that low regions get used first. This fixes a regression due to the recent transition to bitmap-based region management. --- jemalloc/src/arena.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 87bd9bb..a1fa2a3 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -1360,7 +1360,8 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) break; - tbin->avail[i] = ptr; + /* Insert such that low regions get used first. */ + tbin->avail[nfill - 1 - i] = ptr; } #ifdef JEMALLOC_STATS bin->stats.allocated += (i - tbin->ncached) * -- cgit v0.12 From 597632be188d2bcc135dad2145cc46ef44897aad Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 13:41:33 -0700 Subject: Improve thread-->arena assignment. Rather than blindly assigning threads to arenas in round-robin fashion, choose the lowest-numbered arena that currently has the smallest number of threads assigned to it. Add the "stats.arenas..nthreads" mallctl. --- jemalloc/doc/jemalloc.xml.in | 10 +++ jemalloc/include/jemalloc/internal/arena.h | 14 ++++- jemalloc/include/jemalloc/internal/ctl.h | 1 + .../jemalloc/internal/jemalloc_internal.h.in | 3 +- jemalloc/src/arena.c | 1 + jemalloc/src/ctl.c | 13 ++++ jemalloc/src/jemalloc.c | 71 ++++++++++++++++++---- jemalloc/src/stats.c | 4 ++ 8 files changed, 101 insertions(+), 16 deletions(-) diff --git a/jemalloc/doc/jemalloc.xml.in b/jemalloc/doc/jemalloc.xml.in index 97893c1..2bde890 100644 --- a/jemalloc/doc/jemalloc.xml.in +++ b/jemalloc/doc/jemalloc.xml.in @@ -1644,6 +1644,16 @@ malloc_conf = "xmalloc:true";]]> + stats.arenas.<i>.nthreads + (unsigned) + r- + + Number of threads currently assigned to + arena. + + + + stats.arenas.<i>.pactive (size_t) r- diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 1744b45..94b7f3d 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -295,8 +295,18 @@ struct arena_s { unsigned ind; /* - * All non-bin-related operations on this arena require that lock be - * locked. + * Number of threads currently assigned to this arena. This field is + * protected by arenas_lock. 
+ */ + unsigned nthreads; + + /* + * There are three classes of arena operations from a locking + * perspective: + * 1) Thread asssignment (modifies nthreads) is protected by + * arenas_lock. + * 2) Bin-related operations are protected by bin locks. + * 3) Chunk- and run-related operations are protected by this mutex. */ malloc_mutex_t lock; diff --git a/jemalloc/include/jemalloc/internal/ctl.h b/jemalloc/include/jemalloc/internal/ctl.h index 8776ad1..f1f5eb7 100644 --- a/jemalloc/include/jemalloc/internal/ctl.h +++ b/jemalloc/include/jemalloc/internal/ctl.h @@ -29,6 +29,7 @@ struct ctl_node_s { struct ctl_arena_stats_s { bool initialized; + unsigned nthreads; size_t pactive; size_t pdirty; #ifdef JEMALLOC_STATS diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index a80fc7c..a7472c0 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -293,6 +293,7 @@ extern size_t lg_pagesize; extern unsigned ncpus; extern malloc_mutex_t arenas_lock; /* Protects arenas initialization. */ +extern pthread_key_t arenas_tsd; #ifndef NO_TLS /* * Map of pthread_self() --> arenas[???], used for selecting an arena to use @@ -302,9 +303,9 @@ extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); # define ARENA_GET() arenas_tls # define ARENA_SET(v) do { \ arenas_tls = (v); \ + pthread_setspecific(arenas_tsd, (void *)(v)); \ } while (0) #else -extern pthread_key_t arenas_tsd; # define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) # define ARENA_SET(v) do { \ pthread_setspecific(arenas_tsd, (void *)(v)); \ diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index a1fa2a3..022f9ec 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -2175,6 +2175,7 @@ arena_new(arena_t *arena, unsigned ind) arena_bin_t *bin; arena->ind = ind; + arena->nthreads = 0; if (malloc_mutex_init(&arena->lock)) return (true); diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index c32e955..b4f280d 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -182,6 +182,7 @@ CTL_PROTO(stats_arenas_i_lruns_j_highruns) CTL_PROTO(stats_arenas_i_lruns_j_curruns) INDEX_PROTO(stats_arenas_i_lruns_j) #endif +CTL_PROTO(stats_arenas_i_nthreads) CTL_PROTO(stats_arenas_i_pactive) CTL_PROTO(stats_arenas_i_pdirty) #ifdef JEMALLOC_STATS @@ -434,6 +435,7 @@ static const ctl_node_t stats_arenas_i_lruns_node[] = { #endif static const ctl_node_t stats_arenas_i_node[] = { + {NAME("nthreads"), CTL(stats_arenas_i_nthreads)}, {NAME("pactive"), CTL(stats_arenas_i_pactive)}, {NAME("pdirty"), CTL(stats_arenas_i_pdirty)} #ifdef JEMALLOC_STATS @@ -620,6 +622,7 @@ ctl_arena_refresh(arena_t *arena, unsigned i) ctl_arena_clear(astats); + sstats->nthreads += astats->nthreads; #ifdef JEMALLOC_STATS ctl_arena_stats_amerge(astats, arena); /* Merge into sum stats as well. */ @@ -657,10 +660,17 @@ ctl_refresh(void) * Clear sum stats, since they will be merged into by * ctl_arena_refresh(). 
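	 * A hedged usage sketch for the statistic this plumbs through
	 * (application code, not part of ctl_refresh()):
	 *
	 *	unsigned nthreads;
	 *	size_t sz = sizeof(nthreads);
	 *	JEMALLOC_P(mallctl)("stats.arenas.0.nthreads", &nthreads,
	 *	    &sz, NULL, 0);
	 *
	 * As with other stats.* values, a fresh snapshot is taken when
	 * the "epoch" mallctl is written.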
*/ + ctl_stats.arenas[narenas].nthreads = 0; ctl_arena_clear(&ctl_stats.arenas[narenas]); malloc_mutex_lock(&arenas_lock); memcpy(tarenas, arenas, sizeof(arena_t *) * narenas); + for (i = 0; i < narenas; i++) { + if (arenas[i] != NULL) + ctl_stats.arenas[i].nthreads = arenas[i]->nthreads; + else + ctl_stats.arenas[i].nthreads = 0; + } malloc_mutex_unlock(&arenas_lock); for (i = 0; i < narenas; i++) { bool initialized = (tarenas[i] != NULL); @@ -1129,6 +1139,8 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, malloc_mutex_lock(&arenas_lock); if ((arena = arenas[newind]) == NULL) arena = arenas_extend(newind); + arenas[oldind]->nthreads--; + arenas[newind]->nthreads++; malloc_mutex_unlock(&arenas_lock); if (arena == NULL) { ret = EAGAIN; @@ -1536,6 +1548,7 @@ stats_arenas_i_lruns_j_index(const size_t *mib, size_t miblen, size_t j) } #endif +CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned) CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t) CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t) #ifdef JEMALLOC_STATS diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 9f2fa92..ecd521c 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -7,12 +7,10 @@ malloc_mutex_t arenas_lock; arena_t **arenas; unsigned narenas; -static unsigned next_arena; +pthread_key_t arenas_tsd; #ifndef NO_TLS __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); -#else -pthread_key_t arenas_tsd; #endif #ifdef JEMALLOC_STATS @@ -70,6 +68,7 @@ size_t opt_narenas = 0; static void wrtmessage(void *cbopaque, const char *s); static void stats_print_atexit(void); static unsigned malloc_ncpus(void); +static void arenas_cleanup(void *arg); #if (defined(JEMALLOC_STATS) && defined(NO_TLS)) static void thread_allocated_cleanup(void *arg); #endif @@ -147,13 +146,53 @@ choose_arena_hard(void) arena_t *ret; if (narenas > 1) { + unsigned i, choose, first_null; + + choose = 0; + first_null = narenas; malloc_mutex_lock(&arenas_lock); - if ((ret = arenas[next_arena]) == NULL) - ret = arenas_extend(next_arena); - next_arena = (next_arena + 1) % narenas; + assert(arenas[i] != NULL); + for (i = 1; i < narenas; i++) { + if (arenas[i] != NULL) { + /* + * Choose the first arena that has the lowest + * number of threads assigned to it. + */ + if (arenas[i]->nthreads < + arenas[choose]->nthreads) + choose = i; + } else if (first_null == narenas) { + /* + * Record the index of the first uninitialized + * arena, in case all extant arenas are in use. + * + * NB: It is possible for there to be + * discontinuities in terms of initialized + * versus uninitialized arenas, due to the + * "thread.arena" mallctl. + */ + first_null = i; + } + } + + if (arenas[choose] == 0 || first_null == narenas) { + /* + * Use an unloaded arena, or the least loaded arena if + * all arenas are already initialized. + */ + ret = arenas[choose]; + } else { + /* Initialize a new arena. 
*/ + ret = arenas_extend(first_null); + } + ret->nthreads++; malloc_mutex_unlock(&arenas_lock); - } else + } else { ret = arenas[0]; + malloc_mutex_lock(&arenas_lock); + ret->nthreads++; + malloc_mutex_unlock(&arenas_lock); + } ARENA_SET(ret); @@ -259,6 +298,16 @@ malloc_ncpus(void) return (ret); } +static void +arenas_cleanup(void *arg) +{ + arena_t *arena = (arena_t *)arg; + + malloc_mutex_lock(&arenas_lock); + arena->nthreads--; + malloc_mutex_unlock(&arenas_lock); +} + #if (defined(JEMALLOC_STATS) && defined(NO_TLS)) static void thread_allocated_cleanup(void *arg) @@ -737,6 +786,7 @@ malloc_init_hard(void) * threaded mode. */ ARENA_SET(arenas[0]); + arenas[0]->nthreads++; if (malloc_mutex_init(&arenas_lock)) return (true); @@ -779,14 +829,10 @@ malloc_init_hard(void) malloc_write(")\n"); } - next_arena = (narenas > 0) ? 1 : 0; - -#ifdef NO_TLS - if (pthread_key_create(&arenas_tsd, NULL) != 0) { + if (pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) { malloc_mutex_unlock(&init_lock); return (true); } -#endif /* Allocate and initialize arenas. */ arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas); @@ -819,7 +865,6 @@ malloc_init_hard(void) return (false); } - #ifdef JEMALLOC_ZONE JEMALLOC_ATTR(constructor) void diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 3dfe0d2..81105c4 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -319,6 +319,7 @@ static void stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, unsigned i) { + unsigned nthreads; size_t pagesize, pactive, pdirty, mapped; uint64_t npurge, nmadvise, purged; size_t small_allocated; @@ -328,6 +329,9 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("arenas.pagesize", &pagesize, size_t); + CTL_I_GET("stats.arenas.0.nthreads", &nthreads, unsigned); + malloc_cprintf(write_cb, cbopaque, + "assigned threads: %u\n", nthreads); CTL_I_GET("stats.arenas.0.pactive", &pactive, size_t); CTL_I_GET("stats.arenas.0.pdirty", &pdirty, size_t); CTL_I_GET("stats.arenas.0.npurge", &npurge, uint64_t); -- cgit v0.12 From 0657f12acd43eb2082a71230341449eca648bc9b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 17:56:14 -0700 Subject: Add the "stats.cactive" mallctl. Add the "stats.cactive" mallctl, which can be used to efficiently and repeatedly query approximately how much active memory the application is utilizing. 
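
As a usage sketch (not part of this patch): "stats.cactive" hands back a pointer to the live counter rather than a snapshot, so a caller fetches the pointer once and dereferences it on every poll. The following assumes a --enable-stats build with no installation prefix, so the public symbol is plain mallctl(); the volatile load merely approximates the internal atomic read, which is not exported.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	size_t *cactive;
	size_t sz = sizeof(size_t *);

	/* Fetch the pointer to the counter once; it never moves. */
	if (mallctl("stats.cactive", &cactive, &sz, NULL, 0) != 0)
		return (1);

	/*
	 * A volatile load of an aligned size_t stands in for an atomic
	 * read here; per the documentation added below, a real consumer
	 * should use an actual atomic operation when dereferencing.
	 */
	printf("active ceiling: %zu bytes\n", *(volatile size_t *)cactive);
	return (0);
}

Because each arena rounds its contribution up to a chunk multiple, the value read this way is a ceiling on active memory, cheap enough to poll in a monitoring loop without taking the epoch.
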
--- jemalloc/Makefile.in | 15 +++++---- jemalloc/doc/jemalloc.xml.in | 19 +++++++++++ jemalloc/include/jemalloc/internal/hash.h | 2 +- .../jemalloc/internal/jemalloc_internal.h.in | 4 +++ jemalloc/include/jemalloc/internal/mb.h | 2 +- jemalloc/include/jemalloc/internal/rtree.h | 2 +- jemalloc/include/jemalloc/internal/stats.h | 37 ++++++++++++++++++++-- jemalloc/src/arena.c | 34 ++++++++++++++++++++ jemalloc/src/ckh.c | 2 +- jemalloc/src/ctl.c | 3 ++ jemalloc/src/hash.c | 2 +- jemalloc/src/huge.c | 3 ++ jemalloc/src/jemalloc.c | 2 +- jemalloc/src/mb.c | 2 +- jemalloc/src/rtree.c | 2 +- jemalloc/src/stats.c | 15 +++++++-- 16 files changed, 126 insertions(+), 20 deletions(-) diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index 8ee4c93..26da0e2 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -45,13 +45,13 @@ endif BINS := @srcroot@bin/pprof CHDRS := @objroot@include/jemalloc/jemalloc@install_suffix@.h \ @objroot@include/jemalloc/jemalloc_defs@install_suffix@.h -CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/base.c \ - @srcroot@src/bitmap.c @srcroot@src/chunk.c @srcroot@src/chunk_dss.c \ - @srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \ - @srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \ - @srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \ - @srcroot@src/prof.c @srcroot@src/rtree.c \ - @srcroot@src/stats.c @srcroot@src/tcache.c +CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/atomic.c \ + @srcroot@src/base.c @srcroot@src/bitmap.c @srcroot@src/chunk.c \ + @srcroot@src/chunk_dss.c @srcroot@src/chunk_mmap.c \ + @srcroot@src/chunk_swap.c @srcroot@src/ckh.c @srcroot@src/ctl.c \ + @srcroot@src/extent.c @srcroot@src/hash.c @srcroot@src/huge.c \ + @srcroot@src/mb.c @srcroot@src/mutex.c @srcroot@src/prof.c \ + @srcroot@src/rtree.c @srcroot@src/stats.c @srcroot@src/tcache.c ifeq (macho, @abi@) CSRCS += @srcroot@src/zone.c endif @@ -96,6 +96,7 @@ doc: $(DOCS) # -include $(CSRCS:@srcroot@%.c=@objroot@%.d) -include $(CSRCS:@srcroot@%.c=@objroot@%.pic.d) +-include $(CTESTS:@srcroot@%.c=@objroot@%.d) @objroot@src/%.o: @srcroot@src/%.c @mkdir -p $(@D) diff --git a/jemalloc/doc/jemalloc.xml.in b/jemalloc/doc/jemalloc.xml.in index 2bde890..13f3aae 100644 --- a/jemalloc/doc/jemalloc.xml.in +++ b/jemalloc/doc/jemalloc.xml.in @@ -1535,6 +1535,25 @@ malloc_conf = "xmalloc:true";]]> option for additional information. + + + stats.cactive + (size_t *) + r- + [] + + Pointer to a counter that contains an approximate count + of the current number of bytes in active pages. The estimate may be + high, but never low, because each arena rounds up to the nearest + multiple of the chunk size when computing its contribution to the + counter. Note that the epoch mallctl has no bearing + on this counter. Furthermore, counter consistency is maintained via + atomic operations, so it is necessary to use an atomic operation in + order to guarantee a consistent read when dereferencing the pointer. 
+ + + stats.allocated diff --git a/jemalloc/include/jemalloc/internal/hash.h b/jemalloc/include/jemalloc/internal/hash.h index 9073d83..93905bf 100644 --- a/jemalloc/include/jemalloc/internal/hash.h +++ b/jemalloc/include/jemalloc/internal/hash.h @@ -17,7 +17,7 @@ uint64_t hash(const void *key, size_t len, uint64_t seed); #endif -#if (defined(JEMALLOC_ENABLE_INLINE) || defined(HASH_C_)) +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_HASH_C_)) /* * The following hash function is based on MurmurHash64A(), placed into the * public domain by Austin Appleby. See http://murmurhash.googlepages.com/ for diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index a7472c0..90cd604 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -213,6 +213,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #define PAGE_CEILING(s) \ (((s) + PAGE_MASK) & ~PAGE_MASK) +#include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" @@ -237,6 +238,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); /******************************************************************************/ #define JEMALLOC_H_STRUCTS +#include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" @@ -352,6 +354,7 @@ int buferror(int errnum, char *buf, size_t buflen); void jemalloc_prefork(void); void jemalloc_postfork(void); +#include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" @@ -376,6 +379,7 @@ void jemalloc_postfork(void); /******************************************************************************/ #define JEMALLOC_H_INLINES +#include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" diff --git a/jemalloc/include/jemalloc/internal/mb.h b/jemalloc/include/jemalloc/internal/mb.h index 1707aa9..dc9f2a5 100644 --- a/jemalloc/include/jemalloc/internal/mb.h +++ b/jemalloc/include/jemalloc/internal/mb.h @@ -17,7 +17,7 @@ void mb_write(void); #endif -#if (defined(JEMALLOC_ENABLE_INLINE) || defined(MB_C_)) +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_MB_C_)) #ifdef __i386__ /* * According to the Intel Architecture Software Developer's Manual, current diff --git a/jemalloc/include/jemalloc/internal/rtree.h b/jemalloc/include/jemalloc/internal/rtree.h index 9d58eba..95d6355 100644 --- a/jemalloc/include/jemalloc/internal/rtree.h +++ b/jemalloc/include/jemalloc/internal/rtree.h @@ -49,7 +49,7 @@ void *rtree_get(rtree_t *rtree, uintptr_t key); bool rtree_set(rtree_t *rtree, uintptr_t key, void *val); #endif -#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_)) +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_)) #define RTREE_GET_GENERATE(f) \ /* The least significant bits of the key are ignored. 
*/ \ JEMALLOC_INLINE void * \ diff --git a/jemalloc/include/jemalloc/internal/stats.h b/jemalloc/include/jemalloc/internal/stats.h index 3fc2080..2a9b31d 100644 --- a/jemalloc/include/jemalloc/internal/stats.h +++ b/jemalloc/include/jemalloc/internal/stats.h @@ -154,6 +154,10 @@ struct chunk_stats_s { extern bool opt_stats_print; +#ifdef JEMALLOC_STATS +extern size_t stats_cactive; +#endif + char *u2s(uint64_t x, unsigned base, char *s); #ifdef JEMALLOC_STATS void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque, @@ -166,9 +170,38 @@ void stats_print(void (*write)(void *, const char *), void *cbopaque, #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ -#ifdef JEMALLOC_STATS #ifdef JEMALLOC_H_INLINES +#ifdef JEMALLOC_STATS + +#ifndef JEMALLOC_ENABLE_INLINE +size_t stats_cactive_get(void); +void stats_cactive_add(size_t size); +void stats_cactive_sub(size_t size); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_STATS_C_)) +JEMALLOC_INLINE size_t +stats_cactive_get(void) +{ + + return (atomic_read_z(&stats_cactive)); +} + +JEMALLOC_INLINE void +stats_cactive_add(size_t size) +{ + + atomic_add_z(&stats_cactive, size); +} + +JEMALLOC_INLINE void +stats_cactive_sub(size_t size) +{ + + atomic_sub_z(&stats_cactive, size); +} +#endif -#endif /* JEMALLOC_H_INLINES */ #endif /* JEMALLOC_STATS */ +#endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 022f9ec..4cbca57 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -315,6 +315,9 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, size_t old_ndirty, run_ind, total_pages, need_pages, rem_pages, i; size_t flag_dirty; arena_avail_tree_t *runs_avail; +#ifdef JEMALLOC_STATS + size_t cactive_diff; +#endif chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); old_ndirty = chunk->ndirty; @@ -333,6 +336,13 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, rem_pages = total_pages - need_pages; arena_avail_tree_remove(runs_avail, &chunk->map[run_ind-map_bias]); +#ifdef JEMALLOC_STATS + /* Update stats_cactive if nactive is crossing a chunk multiple. */ + cactive_diff = CHUNK_CEILING((arena->nactive + need_pages) << + PAGE_SHIFT) - CHUNK_CEILING(arena->nactive << PAGE_SHIFT); + if (cactive_diff != 0) + stats_cactive_add(cactive_diff); +#endif arena->nactive += need_pages; /* Keep track of trailing unused pages for later use. */ @@ -720,6 +730,9 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) assert(pageind + npages <= chunk_npages); if (mapelm->bits & CHUNK_MAP_DIRTY) { size_t i; +#ifdef JEMALLOC_STATS + size_t cactive_diff; +#endif arena_avail_tree_remove( &arena->runs_avail_dirty, mapelm); @@ -742,6 +755,17 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) CHUNK_MAP_ALLOCATED; } +#ifdef JEMALLOC_STATS + /* + * Update stats_cactive if nactive is crossing a + * chunk multiple. + */ + cactive_diff = CHUNK_CEILING((arena->nactive + + npages) << PAGE_SHIFT) - + CHUNK_CEILING(arena->nactive << PAGE_SHIFT); + if (cactive_diff != 0) + stats_cactive_add(cactive_diff); +#endif arena->nactive += npages; /* Append to list for later processing. 
*/ ql_elm_new(mapelm, u.ql_link); @@ -930,6 +954,9 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) arena_chunk_t *chunk; size_t size, run_ind, run_pages, flag_dirty; arena_avail_tree_t *runs_avail; +#ifdef JEMALLOC_STATS + size_t cactive_diff; +#endif chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) @@ -951,6 +978,13 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) size = bin_info->run_size; } run_pages = (size >> PAGE_SHIFT); +#ifdef JEMALLOC_STATS + /* Update stats_cactive if nactive is crossing a chunk multiple. */ + cactive_diff = CHUNK_CEILING(arena->nactive << PAGE_SHIFT) - + CHUNK_CEILING((arena->nactive - run_pages) << PAGE_SHIFT); + if (cactive_diff != 0) + stats_cactive_sub(cactive_diff); +#endif arena->nactive -= run_pages; /* diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index 75ae7fd..22319ab 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -34,7 +34,7 @@ * respectively. * ******************************************************************************/ -#define CKH_C_ +#define JEMALLOC_CKH_C_ #include "jemalloc/internal/jemalloc_internal.h" /******************************************************************************/ diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index b4f280d..40fdbac 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -193,6 +193,7 @@ CTL_PROTO(stats_arenas_i_purged) #endif INDEX_PROTO(stats_arenas_i) #ifdef JEMALLOC_STATS +CTL_PROTO(stats_cactive) CTL_PROTO(stats_allocated) CTL_PROTO(stats_active) CTL_PROTO(stats_mapped) @@ -460,6 +461,7 @@ static const ctl_node_t stats_arenas_node[] = { static const ctl_node_t stats_node[] = { #ifdef JEMALLOC_STATS + {NAME("cactive"), CTL(stats_cactive)}, {NAME("allocated"), CTL(stats_allocated)}, {NAME("active"), CTL(stats_active)}, {NAME("mapped"), CTL(stats_mapped)}, @@ -1580,6 +1582,7 @@ RETURN: } #ifdef JEMALLOC_STATS +CTL_RO_GEN(stats_cactive, &stats_cactive, size_t *) CTL_RO_GEN(stats_allocated, ctl_stats.allocated, size_t) CTL_RO_GEN(stats_active, ctl_stats.active, size_t) CTL_RO_GEN(stats_mapped, ctl_stats.mapped, size_t) diff --git a/jemalloc/src/hash.c b/jemalloc/src/hash.c index 6a13d7a..cfa4da0 100644 --- a/jemalloc/src/hash.c +++ b/jemalloc/src/hash.c @@ -1,2 +1,2 @@ -#define HASH_C_ +#define JEMALLOC_HASH_C_ #include "jemalloc/internal/jemalloc_internal.h" diff --git a/jemalloc/src/huge.c b/jemalloc/src/huge.c index de09198..ac3f3a0 100644 --- a/jemalloc/src/huge.c +++ b/jemalloc/src/huge.c @@ -50,6 +50,7 @@ huge_malloc(size_t size, bool zero) malloc_mutex_lock(&huge_mtx); extent_tree_ad_insert(&huge, node); #ifdef JEMALLOC_STATS + stats_cactive_add(csize); huge_nmalloc++; huge_allocated += csize; #endif @@ -134,6 +135,7 @@ huge_palloc(size_t size, size_t alignment, bool zero) malloc_mutex_lock(&huge_mtx); extent_tree_ad_insert(&huge, node); #ifdef JEMALLOC_STATS + stats_cactive_add(chunk_size); huge_nmalloc++; huge_allocated += chunk_size; #endif @@ -278,6 +280,7 @@ huge_dalloc(void *ptr, bool unmap) extent_tree_ad_remove(&huge, node); #ifdef JEMALLOC_STATS + stats_cactive_sub(node->size); huge_ndalloc++; huge_allocated -= node->size; #endif diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index ecd521c..0efafde 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -151,7 +151,7 @@ choose_arena_hard(void) choose = 0; first_null = narenas; malloc_mutex_lock(&arenas_lock); - assert(arenas[i] != NULL); + assert(arenas[0] != NULL); for (i = 1; i < narenas; i++) 
{ if (arenas[i] != NULL) { /* diff --git a/jemalloc/src/mb.c b/jemalloc/src/mb.c index 30a1a2e..dc2c0a2 100644 --- a/jemalloc/src/mb.c +++ b/jemalloc/src/mb.c @@ -1,2 +1,2 @@ -#define MB_C_ +#define JEMALLOC_MB_C_ #include "jemalloc/internal/jemalloc_internal.h" diff --git a/jemalloc/src/rtree.c b/jemalloc/src/rtree.c index eb440aa..eb0ff1e 100644 --- a/jemalloc/src/rtree.c +++ b/jemalloc/src/rtree.c @@ -1,4 +1,4 @@ -#define RTREE_C_ +#define JEMALLOC_RTREE_C_ #include "jemalloc/internal/jemalloc_internal.h" rtree_t * diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 81105c4..cbbbb5b 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -39,6 +39,10 @@ bool opt_stats_print = false; +#ifdef JEMALLOC_STATS +size_t stats_cactive = 0; +#endif + /******************************************************************************/ /* Function prototypes for non-inline static functions. */ @@ -673,21 +677,26 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, #ifdef JEMALLOC_STATS { int err; - size_t ssz; + size_t sszp, ssz; + size_t *cactive; size_t allocated, active, mapped; size_t chunks_current, chunks_high, swap_avail; uint64_t chunks_total; size_t huge_allocated; uint64_t huge_nmalloc, huge_ndalloc; + sszp = sizeof(size_t *); ssz = sizeof(size_t); + CTL_GET("stats.cactive", &cactive, size_t *); CTL_GET("stats.allocated", &allocated, size_t); CTL_GET("stats.active", &active, size_t); CTL_GET("stats.mapped", &mapped, size_t); malloc_cprintf(write_cb, cbopaque, - "Allocated: %zu, active: %zu, mapped: %zu\n", allocated, - active, mapped); + "Allocated: %zu, active: %zu, mapped: %zu\n", + allocated, active, mapped); + malloc_cprintf(write_cb, cbopaque, + "Current active ceiling: %zu\n", atomic_read_z(cactive)); /* Print chunk stats. */ CTL_GET("stats.chunks.total", &chunks_total, uint64_t); -- cgit v0.12 From 92d3284ff8548c85b9b928f2615b96da4c4b2618 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 18:15:37 -0700 Subject: Add atomic.[ch]. Add atomic.[ch], which should have been part of the previous commit. 
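
For reference, everything in the new header reduces to two gcc __sync intrinsics (guarded by __GCC_HAVE_SYNC_COMPARE_AND_SWAP_{4,8}, as the #ifdefs below show). A minimal standalone sketch of the same primitives, assuming a compiler that provides them:

#include <stdint.h>
#include <stdio.h>

static uint64_t counter;

int
main(void)
{

	/* atomic_add_uint64() and atomic_sub_uint64() reduce to these: */
	__sync_add_and_fetch(&counter, 8);
	__sync_sub_and_fetch(&counter, 4);

	/* atomic_read_uint64() is simply an atomic add of zero: */
	printf("counter: %llu\n", (unsigned long long)
	    __sync_add_and_fetch(&counter, 0));
	return (0);
}

The size_t wrappers (atomic_add_z() et al.) dispatch on LG_SIZEOF_PTR, mapping to the 64-bit pair on LP64 and to the 32-bit pair otherwise.
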
--- jemalloc/include/jemalloc/internal/atomic.h | 77 +++++++++++++++++++++++++++++ jemalloc/src/atomic.c | 2 + 2 files changed, 79 insertions(+) create mode 100644 jemalloc/include/jemalloc/internal/atomic.h create mode 100644 jemalloc/src/atomic.c diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h new file mode 100644 index 0000000..43faeaf --- /dev/null +++ b/jemalloc/include/jemalloc/internal/atomic.h @@ -0,0 +1,77 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +#define atomic_read_uint64(p) atomic_add_uint64(p, 0) +#define atomic_read_uint32(p) atomic_add_uint32(p, 0) + +#if (LG_SIZEOF_PTR == 3) +# define atomic_read_z(p) atomic_add_uint64(p, 0) +# define atomic_add_z(p, x) atomic_add_uint64(p, x) +# define atomic_sub_z(p, x) atomic_sub_uint64(p, x) +#elif (LG_SIZEOF_PTR == 2) +# define atomic_read_z(p) atomic_add_uint32(p, 0) +# define atomic_add_z(p, x) atomic_add_uint32(p, x) +# define atomic_sub_z(p, x) atomic_sub_uint32(p, x) +#endif + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#ifndef JEMALLOC_ENABLE_INLINE +uint64_t atomic_add_uint64(uint64_t *p, uint64_t x); +uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x); +uint32_t atomic_add_uint32(uint32_t *p, uint32_t x); +uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_)) +/* 64-bit operations. */ +#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 +JEMALLOC_INLINE uint64_t +atomic_add_uint64(uint64_t *p, uint64_t x) +{ + + return (__sync_add_and_fetch(p, x)); +} + +JEMALLOC_INLINE uint64_t +atomic_sub_uint64(uint64_t *p, uint64_t x) +{ + + return (__sync_sub_and_fetch(p, x)); +} +#else +# error "Missing implementation for 64-bit atomic operations" +#endif + +/* 32-bit operations. */ +#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 +JEMALLOC_INLINE uint32_t +atomic_add_uint32(uint32_t *p, uint32_t x) +{ + + return (__sync_add_and_fetch(p, x)); +} + +JEMALLOC_INLINE uint32_t +atomic_sub_uint32(uint32_t *p, uint32_t x) +{ + + return (__sync_sub_and_fetch(p, x)); +} +#else +# error "Missing implementation for 32-bit atomic operations" +#endif +#endif + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/src/atomic.c b/jemalloc/src/atomic.c new file mode 100644 index 0000000..77ee313 --- /dev/null +++ b/jemalloc/src/atomic.c @@ -0,0 +1,2 @@ +#define JEMALLOC_ATOMIC_C_ +#include "jemalloc/internal/jemalloc_internal.h" -- cgit v0.12 From 9a8fc41bb9752129510f3387f5c20cb798ff6b1a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 18:18:42 -0700 Subject: Update pprof. Import updated pprof from google-perftools 1.7. 
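
The imported script is Perl; one behavioral change worth calling out is the new --maxdegree option, implemented as an edge-pruning pass in PrintDot() (see the "Print edges" hunk later in this patch). A C transliteration of that policy, with hypothetical types invented purely for illustration:

#include <math.h>
#include <stdbool.h>
#include <stdlib.h>

/* Hypothetical edge record; pprof itself keys a Perl hash instead. */
typedef struct {
	int	src;
	int	dst;
	double	weight;		/* Edge count; may be negative. */
} edge_t;

static int
edge_cmp(const void *a, const void *b)
{
	double wa = ((const edge_t *)a)->weight;
	double wb = ((const edge_t *)b)->weight;

	/* Decreasing count, as in the new PrintDot(). */
	return ((wa < wb) - (wa > wb));
}

static void
prune_edges(edge_t *edges, int nedges, int nnodes, double edgelimit,
    int maxdegree, bool *keep)
{
	int *indeg = calloc(nnodes, sizeof(int));
	int *outdeg = calloc(nnodes, sizeof(int));
	int i;

	if (indeg == NULL || outdeg == NULL)
		abort();
	qsort(edges, nedges, sizeof(edge_t), edge_cmp);
	for (i = 0; i < nedges; i++) {
		edge_t *e = &edges[i];

		if (indeg[e->dst] == 0) {
			/* First edge into a node is kept for reachability. */
			keep[i] = true;
		} else if (fabs(e->weight) <= edgelimit) {
			/* Below the --edgefraction threshold. */
			keep[i] = false;
		} else if (outdeg[e->src] >= maxdegree ||
		    indeg[e->dst] >= maxdegree) {
			/* Fan-in/fan-out capped by --maxdegree. */
			keep[i] = false;
		} else
			keep[i] = true;
		if (keep[i]) {
			outdeg[e->src]++;
			indeg[e->dst]++;
		}
	}
	free(indeg);
	free(outdeg);
}

Processing edges heaviest-first means the degree caps discard only the least significant edges, which keeps dense call graphs readable without disconnecting any node.
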
--- jemalloc/bin/pprof | 209 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 160 insertions(+), 49 deletions(-) diff --git a/jemalloc/bin/pprof b/jemalloc/bin/pprof index 1655f07..280ddcc 100755 --- a/jemalloc/bin/pprof +++ b/jemalloc/bin/pprof @@ -72,7 +72,7 @@ use strict; use warnings; use Getopt::Long; -my $PPROF_VERSION = "1.5"; +my $PPROF_VERSION = "1.7"; # These are the object tools we use which can come from a # user-specified location using --tools, from the PPROF_TOOLS @@ -89,6 +89,7 @@ my %obj_tool_map = ( ); my $DOT = "dot"; # leave non-absolute, since it may be in /usr/local my $GV = "gv"; +my $EVINCE = "evince"; # could also be xpdf or perhaps acroread my $KCACHEGRIND = "kcachegrind"; my $PS2PDF = "ps2pdf"; # These are used for dynamic profiles @@ -103,6 +104,7 @@ my $GROWTH_PAGE = "/pprof/growth"; my $CONTENTION_PAGE = "/pprof/contention"; my $WALL_PAGE = "/pprof/wall(?:\\?.*)?"; # accepts options like namefilter my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; +my $CENSUSPROFILE_PAGE = "/pprof/censusprofile"; # must support "?seconds=#" my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; @@ -110,7 +112,7 @@ my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; # All the alternatives must begin with /. my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . - "$FILTEREDPROFILE_PAGE)"; + "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)"; # default binary name my $UNKNOWN_BINARY = "(unknown)"; @@ -148,7 +150,7 @@ pprof [options] The / can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile, $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall, - or /pprof/filteredprofile. + $CENSUSPROFILE_PAGE, or /pprof/filteredprofile. For instance: "pprof http://myserver.com:80$HEAP_PAGE". If / is omitted, the service defaults to $PROFILE_PAGE (cpu profiling). pprof --symbols @@ -180,6 +182,7 @@ Output type: --text Generate text report --callgrind Generate callgrind format to stdout --gv Generate Postscript and display + --evince Generate PDF and display --web Generate SVG and display --list= Generate source listing of matching routines --disasm= Generate disassembly of matching routines @@ -208,6 +211,7 @@ Call-graph Options: --nodecount= Show at most so many nodes [default=80] --nodefraction= Hide nodes below *total [default=.005] --edgefraction= Hide edges below *total [default=.001] + --maxdegree= Max incoming/outgoing edges per node [default=8] --focus= Focus on nodes matching --ignore= Ignore nodes matching --scale= Set GV scaling [default=0] @@ -304,6 +308,7 @@ sub Init() { $main::opt_disasm = ""; $main::opt_symbols = 0; $main::opt_gv = 0; + $main::opt_evince = 0; $main::opt_web = 0; $main::opt_dot = 0; $main::opt_ps = 0; @@ -315,6 +320,7 @@ sub Init() { $main::opt_nodecount = 80; $main::opt_nodefraction = 0.005; $main::opt_edgefraction = 0.001; + $main::opt_maxdegree = 8; $main::opt_focus = ''; $main::opt_ignore = ''; $main::opt_scale = 0; @@ -372,6 +378,7 @@ sub Init() { "disasm=s" => \$main::opt_disasm, "symbols!" => \$main::opt_symbols, "gv!" => \$main::opt_gv, + "evince!" => \$main::opt_evince, "web!" => \$main::opt_web, "dot!" => \$main::opt_dot, "ps!" 
=> \$main::opt_ps, @@ -383,6 +390,7 @@ sub Init() { "nodecount=i" => \$main::opt_nodecount, "nodefraction=f" => \$main::opt_nodefraction, "edgefraction=f" => \$main::opt_edgefraction, + "maxdegree=i" => \$main::opt_maxdegree, "focus=s" => \$main::opt_focus, "ignore=s" => \$main::opt_ignore, "scale=i" => \$main::opt_scale, @@ -452,6 +460,7 @@ sub Init() { ($main::opt_disasm eq '' ? 0 : 1) + ($main::opt_symbols == 0 ? 0 : 1) + $main::opt_gv + + $main::opt_evince + $main::opt_web + $main::opt_dot + $main::opt_ps + @@ -646,6 +655,8 @@ sub Main() { if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { if ($main::opt_gv) { RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), ""); } elsif ($main::opt_web) { my $tmp = TempName($main::next_tmpfile, "svg"); RunWeb($tmp); @@ -708,6 +719,12 @@ sub RunGV { } } +sub RunEvince { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + system("$EVINCE " . $fname . $bg); +} + sub RunWeb { my $fname = shift; print STDERR "Loading web page file:///$fname\n"; @@ -805,6 +822,7 @@ sub InteractiveCommand { $main::opt_disasm = 0; $main::opt_list = 0; $main::opt_gv = 0; + $main::opt_evince = 0; $main::opt_cum = 0; if (m/^\s*(text|top)(\d*)\s*(.*)/) { @@ -878,11 +896,14 @@ sub InteractiveCommand { PrintDisassembly($libs, $flat, $cumulative, $routine, $total); return 1; } - if (m/^\s*(gv|web)\s*(.*)/) { + if (m/^\s*(gv|web|evince)\s*(.*)/) { $main::opt_gv = 0; + $main::opt_evince = 0; $main::opt_web = 0; if ($1 eq "gv") { $main::opt_gv = 1; + } elsif ($1 eq "evince") { + $main::opt_evince = 1; } elsif ($1 eq "web") { $main::opt_web = 1; } @@ -902,6 +923,8 @@ sub InteractiveCommand { if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { if ($main::opt_gv) { RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), " &"); } elsif ($main::opt_web) { RunWeb(TempName($main::next_tmpfile, "svg")); } @@ -1685,6 +1708,8 @@ sub PrintDot { my $output; if ($main::opt_gv) { $output = "| $DOT -Tps2 >" . TempName($main::next_tmpfile, "ps"); + } elsif ($main::opt_evince) { + $output = "| $DOT -Tps2 | $PS2PDF - " . 
TempName($main::next_tmpfile, "pdf"); } elsif ($main::opt_ps) { $output = "| $DOT -Tps2"; } elsif ($main::opt_pdf) { @@ -1792,12 +1817,38 @@ sub PrintDot { } } - # Print edges - foreach my $e (keys(%edge)) { + # Print edges (process in order of decreasing counts) + my %indegree = (); # Number of incoming edges added per node so far + my %outdegree = (); # Number of outgoing edges added per node so far + foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) { my @x = split(/\001/, $e); $n = $edge{$e}; - if (abs($n) > $edgelimit) { + # Initialize degree of kept incoming and outgoing edges if necessary + my $src = $x[0]; + my $dst = $x[1]; + if (!exists($outdegree{$src})) { $outdegree{$src} = 0; } + if (!exists($indegree{$dst})) { $indegree{$dst} = 0; } + + my $keep; + if ($indegree{$dst} == 0) { + # Keep edge if needed for reachability + $keep = 1; + } elsif (abs($n) <= $edgelimit) { + # Drop if we are below --edgefraction + $keep = 0; + } elsif ($outdegree{$src} >= $main::opt_maxdegree || + $indegree{$dst} >= $main::opt_maxdegree) { + # Keep limited number of in/out edges per node + $keep = 0; + } else { + $keep = 1; + } + + if ($keep) { + $outdegree{$src}++; + $indegree{$dst}++; + # Compute line width based on edge count my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0); if ($fraction > 1) { $fraction = 1; } @@ -2135,6 +2186,19 @@ function handleMouseUp(evt) { EOF } +# Return a small number that identifies the argument. +# Multiple calls with the same argument will return the same number. +# Calls with different arguments will return different numbers. +sub ShortIdFor { + my $key = shift; + my $id = $main::uniqueid{$key}; + if (!defined($id)) { + $id = keys(%main::uniqueid) + 1; + $main::uniqueid{$key} = $id; + } + return $id; +} + # Translate a stack of addresses into a stack of symbols sub TranslateStack { my $symbols = shift; @@ -2172,6 +2236,15 @@ sub TranslateStack { if ($j > 2) { $func = "$func (inline)"; } + + # Do not merge nodes corresponding to Callback::Run since that + # causes confusing cycles in dot display. Instead, we synthesize + # a unique name for this frame per caller. + if ($func =~ m/Callback.*::Run$/) { + my $caller = ($i > 0) ? $addrs[$i-1] : 0; + $func = "Run#" . ShortIdFor($caller); + } + if ($main::opt_addresses) { push(@result, "$a $func $fileline"); } elsif ($main::opt_lines) { @@ -2415,7 +2488,16 @@ sub RemoveUninterestingFrames { # old code out of the system. 
$skip_regexp = "TCMalloc|^tcmalloc::"; } elsif ($main::profile_type eq 'contention') { - foreach my $vname ('Mutex::Unlock', 'Mutex::UnlockSlow') { + foreach my $vname ('base::RecordLockProfileData', + 'base::SubmitMutexProfileData', + 'base::SubmitSpinLockProfileData', + 'Mutex::Unlock', + 'Mutex::UnlockSlow', + 'Mutex::ReaderUnlock', + 'MutexLock::~MutexLock', + 'SpinLock::Unlock', + 'SpinLock::SlowUnlock', + 'SpinLockHolder::~SpinLockHolder') { $skip{$vname} = 1; } } elsif ($main::profile_type eq 'cpu') { @@ -2955,7 +3037,7 @@ sub FetchDynamicProfile { my $fetcher = AddFetchTimeout($URL_FETCHER, $fetch_timeout); my $cmd = "$fetcher '$url' > '$tmp_profile'"; - if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/){ + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){ print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; if ($encourage_patience) { print STDERR "Be patient...\n"; @@ -3154,24 +3236,47 @@ BEGIN { } } -# Return the next line from the profile file, assuming it's a text -# line (which in this case means, doesn't start with a NUL byte). If -# it's not a text line, return "". At EOF, return undef, like perl does. -# Input file should be in binmode. -sub ReadProfileLine { +# Reads the top, 'header' section of a profile, and returns the last +# line of the header, commonly called a 'header line'. The header +# section of a profile consists of zero or more 'command' lines that +# are instructions to pprof, which pprof executes when reading the +# header. All 'command' lines start with a %. After the command +# lines is the 'header line', which is a profile-specific line that +# indicates what type of profile it is, and perhaps other global +# information about the profile. For instance, here's a header line +# for a heap profile: +# heap profile: 53: 38236 [ 5525: 1284029] @ heapprofile +# For historical reasons, the CPU profile does not contain a text- +# readable header line. If the profile looks like a CPU profile, +# this function returns "". If no header line could be found, this +# function returns undef. +# +# The following commands are recognized: +# %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:' +# +# The input file should be in binmode. +sub ReadProfileHeader { local *PROFILE = shift; my $firstchar = ""; my $line = ""; read(PROFILE, $firstchar, 1); - seek(PROFILE, -1, 1); # unread the firstchar - if ($firstchar eq "\0") { + seek(PROFILE, -1, 1); # unread the firstchar + if ($firstchar !~ /[[:print:]]/) { # is not a text character return ""; } - $line = ; - if (defined($line)) { + while (defined($line = )) { $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /^%warn\s+(.*)/) { # 'warn' command + # Note this matches both '%warn blah\n' and '%warn\n'. + print STDERR "WARNING: $1\n"; # print the rest of the line + } elsif ($line =~ /^%/) { + print STDERR "Ignoring unknown command from profile header: $line"; + } else { + # End of commands, must be the header line. + return $line; + } } - return $line; + return undef; # got to EOF without seeing a header line } sub IsSymbolizedProfileFile { @@ -3182,7 +3287,7 @@ sub IsSymbolizedProfileFile { # Check if the file contains a symbol-section marker. 
open(TFILE, "<$file_name"); binmode TFILE; - my $firstline = ReadProfileLine(*TFILE); + my $firstline = ReadProfileHeader(*TFILE); close(TFILE); if (!$firstline) { return 0; @@ -3202,14 +3307,7 @@ sub IsSymbolizedProfileFile { sub ReadProfile { my $prog = shift; my $fname = shift; - - if (IsSymbolizedProfileFile($fname) && !$main::use_symbolized_profile) { - # we have both a binary and symbolized profiles, abort - usage("Symbolized profile '$fname' cannot be used with a binary arg. " . - "Try again without passing '$prog'."); - } - - $main::profile_type = ''; + my $result; # return value $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash my $contention_marker = $&; @@ -3226,40 +3324,45 @@ sub ReadProfile { # whole firstline, since it may be gigabytes(!) of data. open(PROFILE, "<$fname") || error("$fname: $!\n"); binmode PROFILE; # New perls do UTF-8 processing - my $header = ReadProfileLine(*PROFILE); + my $header = ReadProfileHeader(*PROFILE); if (!defined($header)) { # means "at EOF" error("Profile is empty.\n"); } my $symbols; if ($header =~ m/^--- *$symbol_marker/o) { + # Verify that the user asked for a symbolized profile + if (!$main::use_symbolized_profile) { + # we have both a binary and symbolized profiles, abort + error("FATAL ERROR: Symbolized profile\n $fname\ncannot be used with " . + "a binary arg. Try again without passing\n $prog\n"); + } # Read the symbol section of the symbolized profile file. $symbols = ReadSymbols(*PROFILE{IO}); # Read the next line to get the header for the remaining profile. - $header = ReadProfileLine(*PROFILE) || ""; + $header = ReadProfileHeader(*PROFILE) || ""; } - my $result; - + $main::profile_type = ''; if ($header =~ m/^heap profile:.*$growth_marker/o) { $main::profile_type = 'growth'; - $result = ReadHeapProfile($prog, $fname, $header); + $result = ReadHeapProfile($prog, *PROFILE, $header); } elsif ($header =~ m/^heap profile:/) { $main::profile_type = 'heap'; - $result = ReadHeapProfile($prog, $fname, $header); + $result = ReadHeapProfile($prog, *PROFILE, $header); } elsif ($header =~ m/^--- *$contention_marker/o) { $main::profile_type = 'contention'; - $result = ReadSynchProfile($prog, $fname); + $result = ReadSynchProfile($prog, *PROFILE); } elsif ($header =~ m/^--- *Stacks:/) { print STDERR "Old format contention profile: mistakenly reports " . 
"condition variable signals as lock contentions.\n"; $main::profile_type = 'contention'; - $result = ReadSynchProfile($prog, $fname); + $result = ReadSynchProfile($prog, *PROFILE); } elsif ($header =~ m/^--- *$profile_marker/) { # the binary cpu profile data starts immediately after this line $main::profile_type = 'cpu'; - $result = ReadCPUProfile($prog, $fname); + $result = ReadCPUProfile($prog, $fname, *PROFILE); } else { if (defined($symbols)) { # a symbolized profile contains a format we don't recognize, bail out @@ -3267,9 +3370,11 @@ sub ReadProfile { } # no ascii header present -- must be a CPU profile $main::profile_type = 'cpu'; - $result = ReadCPUProfile($prog, $fname); + $result = ReadCPUProfile($prog, $fname, *PROFILE); } + close(PROFILE); + # if we got symbols along with the profile, return those as well if (defined($symbols)) { $result->{symbols} = $symbols; @@ -3308,7 +3413,8 @@ sub FixCallerAddresses { # CPU profile reader sub ReadCPUProfile { my $prog = shift; - my $fname = shift; + my $fname = shift; # just used for logging + local *PROFILE = shift; my $version; my $period; my $i; @@ -3375,7 +3481,6 @@ sub ReadCPUProfile { my $map = ''; seek(PROFILE, $i * 4, 0); read(PROFILE, $map, (stat PROFILE)[7]); - close(PROFILE); my $r = {}; $r->{version} = $version; @@ -3389,7 +3494,7 @@ sub ReadCPUProfile { sub ReadHeapProfile { my $prog = shift; - my $fname = shift; + local *PROFILE = shift; my $header = shift; my $index = 1; @@ -3534,14 +3639,14 @@ sub ReadHeapProfile { if ($n1 != 0) { my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); my $scale_factor = 1/(1 - exp(-$ratio)); - $n1 *= $scale_factor; - $s1 *= $scale_factor; + $n1 *= $scale_factor; + $s1 *= $scale_factor; } if ($n2 != 0) { my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); my $scale_factor = 1/(1 - exp(-$ratio)); - $n2 *= $scale_factor; - $s2 *= $scale_factor; + $n2 *= $scale_factor; + $s2 *= $scale_factor; } } else { # Remote-heap version 1 @@ -3574,7 +3679,9 @@ sub ReadHeapProfile { } sub ReadSynchProfile { - my ($prog, $fname, $header) = @_; + my $prog = shift; + local *PROFILE = shift; + my $header = shift; my $map = ''; my $profile = {}; @@ -3649,7 +3756,6 @@ sub ReadSynchProfile { $map .= $line; } } - close PROFILE; if (!$seen_clockrate) { printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n", @@ -4098,8 +4204,9 @@ sub ExtractSymbols { # advance through the libraries as we advance the pc. Sometimes the # addresses of libraries may overlap with the addresses of the main # binary, so to make sure the libraries 'win', we iterate over the - # libraries in reverse order (binary will have the lowest start addr). - my @pcs = (sort { $a cmp $b } keys(%{$pcset})); + # libraries in reverse order (which assumes the binary doesn't start + # in the middle of a library, which seems a fair assumption). + my @pcs = (sort { $a cmp $b } keys(%{$pcset})); # pcset is 0-extended strings foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { my $libname = $lib->[0]; my $start = $lib->[1]; @@ -4109,14 +4216,18 @@ sub ExtractSymbols { # Get list of pcs that belong in this library. my $contained = []; my ($start_pc_index, $finish_pc_index); + # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index]. for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; $finish_pc_index--) { last if $pcs[$finish_pc_index - 1] le $finish; } + # Find smallest start_pc_index such that $start <= $pc[$start_pc_index]. 
for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; $start_pc_index--) { last if $pcs[$start_pc_index - 1] lt $start; } + # This keeps PC values higher than $pc[$finish_pc_index] in @pcs, + # in case there are overlaps in libraries and the main binary. @{$contained} = splice(@pcs, $start_pc_index, $finish_pc_index - $start_pc_index); # Map to symbols -- cgit v0.12 From 763baa6cfcc8a9df9d3b7f676b2193ac7cd5ef51 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 19:10:31 -0700 Subject: Add atomic operation support for OS X. --- jemalloc/configure.ac | 22 +++++++++++++++++ jemalloc/include/jemalloc/internal/atomic.h | 28 ++++++++++++++++++++++ .../jemalloc/internal/jemalloc_internal.h.in | 4 ++++ jemalloc/include/jemalloc/jemalloc_defs.h.in | 6 +++++ 4 files changed, 60 insertions(+) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index dc77d75..c40d22f 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -771,6 +771,28 @@ AC_CHECK_FUNC([ffsl], [], [AC_MSG_ERROR([Cannot build without ffsl(3)])]) dnl ============================================================================ +dnl Check for atomic(3) operations as provided on Darwin. + +JE_COMPILABLE([Darwin OSAtomic*()], [ +#include +#include +], [ + { + int32_t x32 = 0; + volatile int32_t *x32p = &x32; + OSAtomicAdd32(1, x32p); + } + { + int64_t x64 = 0; + volatile int64_t *x64p = &x64; + OSAtomicAdd64(1, x64p); + } +], [osatomic]) +if test "x${osatomic}" = "xyes" ; then + AC_DEFINE([JEMALLOC_OSATOMIC]) +fi + +dnl ============================================================================ dnl Check for allocator-related functions that should be wrapped. AC_CHECK_FUNC([memalign], diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h index 43faeaf..089affa 100644 --- a/jemalloc/include/jemalloc/internal/atomic.h +++ b/jemalloc/include/jemalloc/internal/atomic.h @@ -49,6 +49,20 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (__sync_sub_and_fetch(p, x)); } +#elif (defined(JEMALLOC_OSATOMIC)) +JEMALLOC_INLINE uint64_t +atomic_add_uint64(uint64_t *p, uint64_t x) +{ + + return (OSAtomicAdd64((int64_t)x, (int64_t *)p)); +} + +JEMALLOC_INLINE uint64_t +atomic_sub_uint64(uint64_t *p, uint64_t x) +{ + + return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p)); +} #else # error "Missing implementation for 64-bit atomic operations" #endif @@ -68,6 +82,20 @@ atomic_sub_uint32(uint32_t *p, uint32_t x) return (__sync_sub_and_fetch(p, x)); } +#elif (defined(JEMALLOC_OSATOMIC)) +JEMALLOC_INLINE uint32_t +atomic_add_uint32(uint32_t *p, uint32_t x) +{ + + return (OSAtomicAdd32((int32_t)x, (int32_t *)p)); +} + +JEMALLOC_INLINE uint32_t +atomic_sub_uint32(uint32_t *p, uint32_t x) +{ + + return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p)); +} #else # error "Missing implementation for 32-bit atomic operations" #endif diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 90cd604..f660bc8 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -33,6 +33,10 @@ #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" +#ifdef JEMALLOC_OSATOMIC +#include +#endif + #ifdef JEMALLOC_ZONE #include #include diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index d669841..c08c5a2 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ 
b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -24,6 +24,12 @@ */ #undef CPU_SPINWAIT +/* + * Defined if OSAtomic*() functions are available, as provided by Darwin, and + * documented in the atomic(3) manual page. + */ +#undef JEMALLOC_OSATOMIC + /* Defined if __attribute__((...)) syntax is supported. */ #undef JEMALLOC_HAVE_ATTR #ifdef JEMALLOC_HAVE_ATTR -- cgit v0.12 From 893a0ed7c8c11962524ba6f2adeb304d038be2a9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 19:30:18 -0700 Subject: Use OSSpinLock*() for locking on OS X. pthread_mutex_lock() can call malloc() on OS X (!!!), which causes deadlock. Work around this by using spinlocks that are built of more primitive stuff. --- jemalloc/configure.ac | 15 +++++++++++++ jemalloc/include/jemalloc/internal/atomic.h | 18 ++++++++++----- .../jemalloc/internal/jemalloc_internal.h.in | 2 +- jemalloc/include/jemalloc/internal/mutex.h | 26 ++++++++++++++++++---- jemalloc/include/jemalloc/jemalloc_defs.h.in | 6 +++++ jemalloc/src/jemalloc.c | 8 ++++++- jemalloc/src/mutex.c | 6 +++++ 7 files changed, 69 insertions(+), 12 deletions(-) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index c40d22f..412d3d1 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -793,6 +793,21 @@ if test "x${osatomic}" = "xyes" ; then fi dnl ============================================================================ +dnl Check for spinlock(3) operations as provided on Darwin. + +JE_COMPILABLE([Darwin OSSpin*()], [ +#include +#include +], [ + OSSpinLock lock = 0; + OSSpinLockLock(&lock); + OSSpinLockUnlock(&lock); +], [osspin]) +if test "x${osspin}" = "xyes" ; then + AC_DEFINE([JEMALLOC_OSSPIN]) +fi + +dnl ============================================================================ dnl Check for allocator-related functions that should be wrapped. 
AC_CHECK_FUNC([memalign], diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h index 089affa..f1f0c2b 100644 --- a/jemalloc/include/jemalloc/internal/atomic.h +++ b/jemalloc/include/jemalloc/internal/atomic.h @@ -13,13 +13,19 @@ #define atomic_read_uint32(p) atomic_add_uint32(p, 0) #if (LG_SIZEOF_PTR == 3) -# define atomic_read_z(p) atomic_add_uint64(p, 0) -# define atomic_add_z(p, x) atomic_add_uint64(p, x) -# define atomic_sub_z(p, x) atomic_sub_uint64(p, x) +# define atomic_read_z(p) \ + (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)0) +# define atomic_add_z(p, x) \ + (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x) +# define atomic_sub_z(p, x) \ + (size_t)atomic_sub_uint64((uint64_t *)p, (uint64_t)x) #elif (LG_SIZEOF_PTR == 2) -# define atomic_read_z(p) atomic_add_uint32(p, 0) -# define atomic_add_z(p, x) atomic_add_uint32(p, x) -# define atomic_sub_z(p, x) atomic_sub_uint32(p, x) +# define atomic_read_z(p) \ + (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)0) +# define atomic_add_z(p, x) \ + (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x) +# define atomic_sub_z(p, x) \ + (size_t)atomic_sub_uint32((uint32_t *)p, (uint32_t)x) #endif #endif /* JEMALLOC_H_EXTERNS */ diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index f660bc8..fc944a8 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -33,7 +33,7 @@ #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" -#ifdef JEMALLOC_OSATOMIC +#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN)) #include #endif diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h index dcca01e..62947ce 100644 --- a/jemalloc/include/jemalloc/internal/mutex.h +++ b/jemalloc/include/jemalloc/internal/mutex.h @@ -1,7 +1,11 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +#ifdef JEMALLOC_OSSPIN +typedef OSSpinLock malloc_mutex_t; +#else typedef pthread_mutex_t malloc_mutex_t; +#endif #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP # define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP @@ -41,17 +45,26 @@ JEMALLOC_INLINE void malloc_mutex_lock(malloc_mutex_t *mutex) { - if (isthreaded) + if (isthreaded) { +#ifdef JEMALLOC_OSSPIN + OSSpinLockLock(mutex); +#else pthread_mutex_lock(mutex); +#endif + } } JEMALLOC_INLINE bool malloc_mutex_trylock(malloc_mutex_t *mutex) { - if (isthreaded) + if (isthreaded) { +#ifdef JEMALLOC_OSSPIN + return (OSSpinLockTry(mutex) == false); +#else return (pthread_mutex_trylock(mutex) != 0); - else +#endif + } else return (false); } @@ -59,8 +72,13 @@ JEMALLOC_INLINE void malloc_mutex_unlock(malloc_mutex_t *mutex) { - if (isthreaded) + if (isthreaded) { +#ifdef JEMALLOC_OSSPIN + OSSpinLockUnlock(mutex); +#else pthread_mutex_unlock(mutex); +#endif + } } #endif diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index c08c5a2..d8c81d7 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -30,6 +30,12 @@ */ #undef JEMALLOC_OSATOMIC +/* + * Defined if OSSpin*() functions are available, as provided by Darwin, and + * documented in the spinlock(3) manual page. + */ +#undef JEMALLOC_OSSPIN + /* Defined if __attribute__((...)) syntax is supported. 
*/ #undef JEMALLOC_HAVE_ATTR #ifdef JEMALLOC_HAVE_ATTR diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 0efafde..dccce6b 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -28,7 +28,13 @@ static bool malloc_initialized = false; static pthread_t malloc_initializer = (unsigned long)0; /* Used to avoid initialization races. */ -static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; +static malloc_mutex_t init_lock = +#ifdef JEMALLOC_OSSPIN + 0 +#else + MALLOC_MUTEX_INITIALIZER +#endif + ; #ifdef DYNAMIC_PAGE_SHIFT size_t pagesize; diff --git a/jemalloc/src/mutex.c b/jemalloc/src/mutex.c index 3ecb18a..ca89ef1 100644 --- a/jemalloc/src/mutex.c +++ b/jemalloc/src/mutex.c @@ -55,6 +55,9 @@ pthread_create(pthread_t *__restrict thread, bool malloc_mutex_init(malloc_mutex_t *mutex) { +#ifdef JEMALLOC_OSSPIN + *mutex = 0; +#else pthread_mutexattr_t attr; if (pthread_mutexattr_init(&attr) != 0) @@ -70,6 +73,7 @@ malloc_mutex_init(malloc_mutex_t *mutex) } pthread_mutexattr_destroy(&attr); +#endif return (false); } @@ -77,8 +81,10 @@ void malloc_mutex_destroy(malloc_mutex_t *mutex) { +#ifndef JEMALLOC_OSSPIN if (pthread_mutex_destroy(mutex) != 0) { malloc_write(": Error in pthread_mutex_destroy()\n"); abort(); } +#endif } -- cgit v0.12 From 1dcb4f86b23a5760f5a717ace716360b63b33fad Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 21 Mar 2011 00:18:17 -0700 Subject: Dynamically adjust tcache fill count. Dynamically adjust tcache fill count (number of objects allocated per tcache refill) such that if GC has to flush inactive objects, the fill count gradually decreases. Conversely, if refills occur while the fill count is depressed, the fill count gradually increases back to its maximum value. --- jemalloc/include/jemalloc/internal/tcache.h | 24 +++++++++++++++++++++--- jemalloc/src/arena.c | 7 +++---- jemalloc/src/tcache.c | 5 +++-- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index 5434d32..da3c68c 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -45,7 +45,8 @@ struct tcache_bin_s { # ifdef JEMALLOC_STATS tcache_bin_stats_t tstats; # endif - unsigned low_water; /* Min # cached since last GC. */ + int low_water; /* Min # cached since last GC. */ + unsigned lg_fill_div; /* Fill (ncached_max >> lg_fill_div). */ unsigned ncached; /* # of cached objects. */ void **avail; /* Stack of available objects. */ }; @@ -184,6 +185,7 @@ tcache_event(tcache_t *tcache) if (tcache->ev_cnt == tcache_gc_incr) { size_t binind = tcache->next_gc_bin; tcache_bin_t *tbin = &tcache->tbins[binind]; + tcache_bin_info_t *tbin_info = &tcache_bin_info[binind]; if (tbin->low_water > 0) { /* @@ -207,6 +209,20 @@ tcache_event(tcache_t *tcache) #endif ); } + /* + * Reduce fill count by 2X. Limit lg_fill_div such that + * the fill count is always at least 1. + */ + if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1)) + >= 1) + tbin->lg_fill_div++; + } else if (tbin->low_water < 0) { + /* + * Increase fill count by 2X. Make sure lg_fill_div + * stays greater than 0. 
+ */ + if (tbin->lg_fill_div > 1) + tbin->lg_fill_div--; } tbin->low_water = tbin->ncached; @@ -222,10 +238,12 @@ tcache_alloc_easy(tcache_bin_t *tbin) { void *ret; - if (tbin->ncached == 0) + if (tbin->ncached == 0) { + tbin->low_water = -1; return (NULL); + } tbin->ncached--; - if (tbin->ncached < tbin->low_water) + if ((int)tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; ret = tbin->avail[tbin->ncached]; return (ret); diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 4cbca57..0f4f12a 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -1386,8 +1386,8 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind #endif bin = &arena->bins[binind]; malloc_mutex_lock(&bin->lock); - for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> 1); - i < nfill; i++) { + for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> + tbin->lg_fill_div); i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else @@ -1398,8 +1398,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind tbin->avail[nfill - 1 - i] = ptr; } #ifdef JEMALLOC_STATS - bin->stats.allocated += (i - tbin->ncached) * - arena_bin_info[binind].reg_size; + bin->stats.allocated += i * arena_bin_info[binind].reg_size; bin->stats.nmalloc += i; bin->stats.nrequests += tbin->tstats.nrequests; bin->stats.nfills++; diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 2f4804e..31c329e 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -135,7 +135,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], rem * sizeof(void *)); tbin->ncached = rem; - if (tbin->ncached < tbin->low_water) + if ((int)tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; } @@ -218,7 +218,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], rem * sizeof(void *)); tbin->ncached = rem; - if (tbin->ncached < tbin->low_water) + if ((int)tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; } @@ -265,6 +265,7 @@ tcache_create(arena_t *arena) tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); for (i = 0; i < nhbins; i++) { + tcache->tbins[i].lg_fill_div = 1; tcache->tbins[i].avail = (void **)((uintptr_t)tcache + (uintptr_t)stack_offset); stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); -- cgit v0.12 From 47e57f9bdadfaf999c9dea5d126edf3a4f1b2995 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 22 Mar 2011 09:00:56 -0700 Subject: Avoid overflow in arena_run_regind(). Fix a regression due to: Remove an arena_bin_run_size_calc() constraint. 2a6f2af6e446a98a635caadd281a23ca09a491cb The removed constraint required that small run headers fit in one page, which indirectly limited runs such that they would not cause overflow in arena_run_regind(). Add an explicit constraint to arena_bin_run_size_calc() based on the largest number of regions that arena_run_regind() can handle (2^11 as currently configured). 
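
The 2^11 bound falls out of the fixed-point arithmetic: arena_run_regind() divides by multiplying with a precomputed reciprocal, and with SIZE_INV_SHIFT now defined as ((sizeof(unsigned) << 3) - LG_RUN_MAXREGS), the intermediate product diff * size_inv stays within 32 bits as long as the region index is below RUN_MAXREGS. A standalone check of the arithmetic for one illustrative region size (a sketch; the real code indexes a size_invs[] table rather than invoking SIZE_INV() directly):

#include <assert.h>
#include <stdio.h>

#define	LG_RUN_MAXREGS	11
#define	SIZE_INV_SHIFT	((sizeof(unsigned) << 3) - LG_RUN_MAXREGS)
#define	SIZE_INV(s)	(((1U << SIZE_INV_SHIFT) / (s)) + 1)

int
main(void)
{
	unsigned reg_size = 48;	/* Illustrative small size class. */
	unsigned inv = SIZE_INV(reg_size);
	unsigned k;

	for (k = 0; k < (1U << LG_RUN_MAXREGS); k++) {
		unsigned diff = k * reg_size;
		unsigned regind = (diff * inv) >> SIZE_INV_SHIFT;

		/* Reciprocal multiplication agrees with true division. */
		assert(regind == diff / reg_size);
	}
	printf("SIZE_INV(%u) valid for %u region indices\n", reg_size, k);
	return (0);
}

Push the region count past RUN_MAXREGS and the multiply can overflow, which is why bin_info_run_size_calc() now clamps try_nregs as well.
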
--- jemalloc/include/jemalloc/internal/arena.h | 8 ++++++-- jemalloc/include/jemalloc/internal/atomic.h | 4 +++- jemalloc/include/jemalloc/internal/bitmap.h | 2 +- jemalloc/include/jemalloc/internal/jemalloc_internal.h.in | 2 +- jemalloc/src/arena.c | 12 +++++++++++- jemalloc/test/bitmap.c | 6 +++++- 6 files changed, 27 insertions(+), 7 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 94b7f3d..b80c118 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -58,6 +58,10 @@ #define RUN_MAX_OVRHD 0x0000003dU #define RUN_MAX_OVRHD_RELAX 0x00001800U +/* Maximum number of regions in one run. */ +#define LG_RUN_MAXREGS 11 +#define RUN_MAXREGS (1U << LG_RUN_MAXREGS) + /* * The minimum ratio of active:dirty pages per arena is computed as: * @@ -556,8 +560,8 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) * divide by 0, and 1 and 2 are both powers of two, which are * handled above. */ -#define SIZE_INV_SHIFT 21 -#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) +#define SIZE_INV_SHIFT ((sizeof(unsigned) << 3) - LG_RUN_MAXREGS) +#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) static const unsigned size_invs[] = { SIZE_INV(3), SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7), diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h index f1f0c2b..821c2ef 100644 --- a/jemalloc/include/jemalloc/internal/atomic.h +++ b/jemalloc/include/jemalloc/internal/atomic.h @@ -70,7 +70,9 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p)); } #else -# error "Missing implementation for 64-bit atomic operations" +# if (LG_SIZEOF_PTR == 3) +# error "Missing implementation for 64-bit atomic operations" +# endif #endif /* 32-bit operations. */ diff --git a/jemalloc/include/jemalloc/internal/bitmap.h b/jemalloc/include/jemalloc/internal/bitmap.h index 4bb2212..605ebac 100644 --- a/jemalloc/include/jemalloc/internal/bitmap.h +++ b/jemalloc/include/jemalloc/internal/bitmap.h @@ -2,7 +2,7 @@ #ifdef JEMALLOC_H_TYPES /* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ -#define LG_BITMAP_MAXBITS 18 +#define LG_BITMAP_MAXBITS LG_RUN_MAXREGS typedef struct bitmap_level_s bitmap_level_t; typedef struct bitmap_info_s bitmap_info_t; diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index fc944a8..f82385d 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -224,9 +224,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" -#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 0f4f12a..0693f36 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -2427,6 +2427,7 @@ small_size2bin_init_hard(void) * *) bin_info->run_size >= min_run_size * *) bin_info->run_size <= arena_maxclass * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). 
+ * *) bin_info->nregs <= RUN_MAXREGS * * bin_info->nregs, bin_info->bitmap_offset, and bin_info->reg0_offset are also * calculated here, since these settings are all interdependent. @@ -2459,6 +2460,10 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) try_run_size = min_run_size; try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin_info->reg_size) + 1; /* Counter-act try_nregs-- in loop. */ + if (try_nregs > RUN_MAXREGS) { + try_nregs = RUN_MAXREGS + + 1; /* Counter-act try_nregs-- in loop. */ + } do { try_nregs--; try_hdr_size = sizeof(arena_run_t); @@ -2500,6 +2505,10 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin_info->reg_size) + 1; /* Counter-act try_nregs-- in loop. */ + if (try_nregs > RUN_MAXREGS) { + try_nregs = RUN_MAXREGS + + 1; /* Counter-act try_nregs-- in loop. */ + } do { try_nregs--; try_hdr_size = sizeof(arena_run_t); @@ -2526,7 +2535,8 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) } while (try_run_size <= arena_maxclass && try_run_size <= arena_maxclass && RUN_MAX_OVRHD * (bin_info->reg_size << 3) > RUN_MAX_OVRHD_RELAX - && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size); + && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size + && try_nregs < RUN_MAXREGS); assert(good_hdr_size <= good_reg0_offset); diff --git a/jemalloc/test/bitmap.c b/jemalloc/test/bitmap.c index 7a017c8..adfaacf 100644 --- a/jemalloc/test/bitmap.c +++ b/jemalloc/test/bitmap.c @@ -13,7 +13,11 @@ */ #include "../src/bitmap.c" -#define MAXBITS 4500 +#if (LG_BITMAP_MAXBITS > 12) +# define MAXBITS 4500 +#else +# define MAXBITS (1U << LG_BITMAP_MAXBITS) +#endif static void test_bitmap_size(void) -- cgit v0.12 From c957398b4f973158de323366dbd424b7bb812ddf Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 23 Mar 2011 00:27:50 -0700 Subject: Fix bootstrapping order bug. Initialize arenas_tsd earlier, so that the non-TLS case works when profiling is enabled. --- jemalloc/src/jemalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index dccce6b..4d24470 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -797,6 +797,11 @@ malloc_init_hard(void) if (malloc_mutex_init(&arenas_lock)) return (true); + if (pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) { + malloc_mutex_unlock(&init_lock); + return (true); + } + #ifdef JEMALLOC_PROF if (prof_boot2()) { malloc_mutex_unlock(&init_lock); @@ -835,11 +840,6 @@ malloc_init_hard(void) malloc_write(")\n"); } - if (pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) { - malloc_mutex_unlock(&init_lock); - return (true); - } - /* Allocate and initialize arenas. */ arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas); if (arenas == NULL) { -- cgit v0.12 From eacb896c014d822cf563490d1c1f1cdc3cda24a2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 23 Mar 2011 00:30:30 -0700 Subject: Fix rallocm() rsize bug. Add code to set *rsize even when profiling is enabled.
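
An illustrative aside, not part of the patch: callers of jemalloc 2.x's experimental allocm() family rely on rsize to learn the usable size of the resulting allocation, so a reallocation path that skipped the *rsize store (as the profiling path did before this fix) left the caller reading a stale value. A hedged sketch of such a caller, assuming a build where the experimental API from jemalloc/jemalloc.h is exported under its unprefixed names (in-tree the declarations go through JEMALLOC_P()):

    #include <stdio.h>
    #include <jemalloc/jemalloc.h> /* experimental allocm/rallocm/dallocm */

    int
    main(void)
    {
        void *p;
        size_t rsize;

        /* Allocate at least 100 bytes; rsize receives the usable size. */
        if (allocm(&p, &rsize, 100, 0) != ALLOCM_SUCCESS)
            return (1);

        /*
         * Grow to at least 200 bytes.  The patch guarantees that rsize
         * is updated here even when heap profiling is enabled.
         */
        if (rallocm(&p, &rsize, 200, 0, 0) == ALLOCM_SUCCESS)
            printf("usable size is now %zu\n", rsize);

        dallocm(p, 0);
        return (0);
    }
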
--- jemalloc/src/jemalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 4d24470..1b8a278 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -1677,6 +1677,8 @@ JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra, usize = isalloc(q); } prof_realloc(q, usize, cnt, old_size, old_ctx); + if (rsize != NULL) + *rsize = usize; } else #endif { -- cgit v0.12 From 38d9210c464c4ad49655a4da6bc84ea4fbec83d2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 23 Mar 2011 00:37:29 -0700 Subject: Fix error detection for ipalloc() when profiling. sa2u() returns 0 on overflow, but the profiling code was blindly calling sa2u() and allowing the error to silently propagate, ultimately ending in a later assertion failure. Refactor all ipalloc() callers to call sa2u(), check for overflow before calling ipalloc(), and pass usize rather than size. This allows ipalloc() to avoid calling sa2u() in the common case. --- .../jemalloc/internal/jemalloc_internal.h.in | 59 +++++++++++++-------- jemalloc/src/arena.c | 19 ++++--- jemalloc/src/ckh.c | 28 +++++++--- jemalloc/src/jemalloc.c | 61 +++++++++++++--------- 4 files changed, 105 insertions(+), 62 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index f82385d..254adb6 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -589,7 +589,7 @@ thread_allocated_get(void) #ifndef JEMALLOC_ENABLE_INLINE void *imalloc(size_t size); void *icalloc(size_t size); -void *ipalloc(size_t size, size_t alignment, bool zero); +void *ipalloc(size_t usize, size_t alignment, bool zero); size_t isalloc(const void *ptr); # ifdef JEMALLOC_IVSALLOC size_t ivsalloc(const void *ptr); # endif @@ -623,28 +623,39 @@ icalloc(size_t size) } JEMALLOC_INLINE void * -ipalloc(size_t size, size_t alignment, bool zero) +ipalloc(size_t usize, size_t alignment, bool zero) { void *ret; - size_t usize; - size_t run_size -# ifdef JEMALLOC_CC_SILENCE - = 0 -# endif - ; - usize = sa2u(size, alignment, &run_size); - if (usize == 0) - return (NULL); + assert(usize != 0); + assert(usize == sa2u(usize, alignment, NULL)); + if (usize <= arena_maxclass && alignment <= PAGE_SIZE) ret = arena_malloc(usize, zero); - else if (run_size <= arena_maxclass) { - ret = arena_palloc(choose_arena(), usize, run_size, alignment, - zero); - } else if (alignment <= chunksize) - ret = huge_malloc(usize, zero); - else - ret = huge_palloc(usize, alignment, zero); + else { + size_t run_size +#ifdef JEMALLOC_CC_SILENCE + = 0 +#endif + ; + + /* + * Ideally we would only ever call sa2u() once per aligned + * allocation request, and the caller of this function has + * already done so once. However, it's rather burdensome to + * require every caller to pass in run_size, especially given + * that it's only relevant to large allocations. Therefore, + * just call it again here in order to get run_size.
+ */ + sa2u(usize, alignment, &run_size); + if (run_size <= arena_maxclass) { + ret = arena_palloc(choose_arena(), usize, run_size, + alignment, zero); + } else if (alignment <= chunksize) + ret = huge_malloc(usize, zero); + else + ret = huge_palloc(usize, alignment, zero); + } assert(((uintptr_t)ret & (alignment - 1)) == 0); return (ret); @@ -715,7 +726,7 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) != 0) { - size_t copysize; + size_t usize, copysize; /* * Existing object alignment is inadequate; allocate new space * and copy. */ if (no_move) return (NULL); - ret = ipalloc(size + extra, alignment, zero); + usize = sa2u(size + extra, alignment, NULL); + if (usize == 0) + return (NULL); + ret = ipalloc(usize, alignment, zero); if (ret == NULL) { if (extra == 0) return (NULL); /* Try again, without extra this time. */ - ret = ipalloc(size, alignment, zero); + usize = sa2u(size, alignment, NULL); + if (usize == 0) + return (NULL); + ret = ipalloc(usize, alignment, zero); if (ret == NULL) return (NULL); } diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 0693f36..1954da9 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -2165,24 +2165,29 @@ arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, if (ret != NULL) return (ret); - /* * size and oldsize are different enough that we need to move the * object. In that case, fall back to allocating new space and * copying. */ - if (alignment != 0) - ret = ipalloc(size + extra, alignment, zero); - else + if (alignment != 0) { + size_t usize = sa2u(size + extra, alignment, NULL); + if (usize == 0) + return (NULL); + ret = ipalloc(usize, alignment, zero); + } else ret = arena_malloc(size + extra, zero); if (ret == NULL) { if (extra == 0) return (NULL); /* Try again, this time without extra.
*/ - if (alignment != 0) - ret = ipalloc(size, alignment, zero); - else + if (alignment != 0) { + size_t usize = sa2u(size, alignment, NULL); + if (usize == 0) + return (NULL); + ret = ipalloc(usize, alignment, zero); + } else ret = arena_malloc(size, zero); if (ret == NULL) diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index 22319ab..143b5b5 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -262,9 +262,15 @@ ckh_grow(ckh_t *ckh) lg_prevbuckets = ckh->lg_curbuckets; lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS; while (true) { + size_t usize; + lg_curcells++; - tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells, - ZU(1) << LG_CACHELINE, true); + usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE, NULL); + if (usize == 0) { + ret = true; + goto RETURN; + } + tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); if (tab == NULL) { ret = true; goto RETURN; @@ -295,7 +301,7 @@ static void ckh_shrink(ckh_t *ckh) { ckhc_t *tab, *ttab; - size_t lg_curcells; + size_t lg_curcells, usize; unsigned lg_prevbuckets; /* @@ -304,8 +310,10 @@ ckh_shrink(ckh_t *ckh) */ lg_prevbuckets = ckh->lg_curbuckets; lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1; - tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells, - ZU(1) << LG_CACHELINE, true); + usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE, NULL); + if (usize == 0) + return; + tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -340,7 +348,7 @@ bool ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) { bool ret; - size_t mincells; + size_t mincells, usize; unsigned lg_mincells; assert(minitems > 0); @@ -375,8 +383,12 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) ckh->hash = hash; ckh->keycomp = keycomp; - ckh->tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_mincells, - (ZU(1) << LG_CACHELINE), true); + usize = sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE, NULL); + if (usize == 0) { + ret = true; + goto RETURN; + } + ckh->tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); if (ckh->tab == NULL) { ret = true; goto RETURN; } diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 1b8a278..e287516 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -993,14 +993,12 @@ int JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) { int ret; - void *result; -#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) size_t usize -# ifdef JEMALLOC_CC_SILENCE +#ifdef JEMALLOC_CC_SILENCE = 0 -# endif - ; #endif + ; + void *result; #ifdef JEMALLOC_PROF prof_thr_cnt_t *cnt # ifdef JEMALLOC_CC_SILENCE @@ -1050,34 +1048,37 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) goto RETURN; } + usize = sa2u(size, alignment, NULL); + if (usize == 0) { + result = NULL; + ret = ENOMEM; + goto RETURN; + } + #ifdef JEMALLOC_PROF if (opt_prof) { - usize = sa2u(size, alignment, NULL); if ((cnt = prof_alloc_prep(usize)) == NULL) { result = NULL; ret = EINVAL; } else { if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { - result = ipalloc(small_maxclass+1, - alignment, false); + assert(sa2u(small_maxclass+1, + alignment, NULL) != 0); + result = ipalloc(sa2u(small_maxclass+1, + alignment, NULL), alignment, false); if (result != NULL) { arena_prof_promoted(result, usize); } } else { - result = ipalloc(size, alignment, + result = ipalloc(usize, alignment, false); } } } else #endif - { -#ifdef JEMALLOC_STATS - usize
= sa2u(size, alignment, NULL); -#endif - result = ipalloc(size, alignment, false); - } + result = ipalloc(usize, alignment, false); } if (result == NULL) { @@ -1531,15 +1532,18 @@ JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, } JEMALLOC_INLINE void * -iallocm(size_t size, size_t alignment, bool zero) +iallocm(size_t usize, size_t alignment, bool zero) { + assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize, alignment, + NULL))); + if (alignment != 0) - return (ipalloc(size, alignment, zero)); + return (ipalloc(usize, alignment, zero)); else if (zero) - return (icalloc(size)); + return (icalloc(usize)); else - return (imalloc(size)); + return (imalloc(usize)); } JEMALLOC_ATTR(nonnull(1)) @@ -1562,20 +1566,27 @@ JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) if (malloc_init()) goto OOM; + usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment, + NULL); + if (usize == 0) + goto OOM; + #ifdef JEMALLOC_PROF if (opt_prof) { - usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment, - NULL); if ((cnt = prof_alloc_prep(usize)) == NULL) goto OOM; if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { - p = iallocm(small_maxclass+1, alignment, zero); + size_t usize_promoted = (alignment == 0) ? + s2u(small_maxclass+1) : sa2u(small_maxclass+1, + alignment, NULL); + assert(usize_promoted != 0); + p = iallocm(usize_promoted, alignment, zero); if (p == NULL) goto OOM; arena_prof_promoted(p, usize); } else { - p = iallocm(size, alignment, zero); + p = iallocm(usize, alignment, zero); if (p == NULL) goto OOM; } @@ -1585,15 +1596,13 @@ JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) } else #endif { - p = iallocm(size, alignment, zero); + p = iallocm(usize, alignment, zero); if (p == NULL) goto OOM; #ifndef JEMALLOC_STATS if (rsize != NULL) #endif { - usize = (alignment == 0) ? s2u(size) : sa2u(size, - alignment, NULL); #ifdef JEMALLOC_STATS if (rsize != NULL) #endif -- cgit v0.12 From 4bcd987251826a7f9c49a1e2e6968bbb639a06c8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 22 Mar 2011 15:30:22 -0700 Subject: Update ChangeLog for 2.2.0. --- jemalloc/ChangeLog | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/jemalloc/ChangeLog b/jemalloc/ChangeLog index 08526c8..6db63db 100644 --- a/jemalloc/ChangeLog +++ b/jemalloc/ChangeLog @@ -6,6 +6,35 @@ found in the git revision history: http://www.canonware.com/cgi-bin/gitweb.cgi?p=jemalloc.git git://canonware.com/jemalloc.git +* 2.2.0 (March 22, 2011) + + This version incorporates several improvements to algorithms and data + structures that tend to reduce fragmentation and increase speed. + + New features: + - Add the "stats.cactive" mallctl. + - Update pprof (from google-perftools 1.7). + - Improve backtracing-related configuration logic, and add the + --disable-prof-libgcc option. + + Bug fixes: + - Change default symbol visibility from "internal" to "hidden", which + decreases the overhead of library-internal function calls. + - Fix symbol visibility so that it is also set on OS X. + - Fix a build dependency regression caused by the introduction of the .pic.o + suffix for PIC object files. + - Add missing checks for mutex initialization failures. + - Don't use libgcc-based backtracing except on x64, where it is known to work. + - Fix deadlocks on OS X that were due to memory allocation in + pthread_mutex_lock().
+ - Heap profiling-specific fixes: + + Fix memory corruption due to integer overflow in small region index + computation, when using a small enough sample interval that profiling + context pointers are stored in small run headers. + + Fix a bootstrap ordering bug that only occurred with TLS disabled. + + Fix a rallocm() rsize bug. + + Fix error detection bugs for aligned memory allocation. + * 2.1.3 (March 14, 2011) Bug fixes: -- cgit v0.12
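
A closing illustrative aside, not part of the patch series: the error-detection fix above reduces to one caller-side idiom, namely compute the usable size first, treat zero as overflow, and only then allocate. The self-contained sketch below shows the pattern; my_sa2u() is a hypothetical, simplified stand-in for jemalloc's sa2u(), and posix_memalign() stands in for ipalloc():

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Hypothetical analogue of sa2u(): round size up to a multiple of
     * alignment (a power of two), returning 0 on overflow.
     */
    static size_t
    my_sa2u(size_t size, size_t alignment)
    {
        size_t usize = (size + alignment - 1) & ~(alignment - 1);

        return ((usize < size) ? 0 : usize);
    }

    int
    main(void)
    {
        void *p;
        size_t usize;

        /* Overflow is detected up front, before any allocator runs. */
        usize = my_sa2u((size_t)-8, 64);
        if (usize == 0)
            fprintf(stderr, "aligned size overflows; reject request\n");

        /* The common case: check the size, then allocate exactly usize. */
        usize = my_sa2u(100, 64);
        if (usize != 0 && posix_memalign(&p, 64, usize) == 0) {
            printf("allocated %zu bytes, 64-byte aligned\n", usize);
            free(p);
        }
        return (0);
    }
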