Diffstat (limited to 'jemalloc/include')
-rw-r--r--   jemalloc/include/jemalloc/internal/arena.h                 | 167
-rw-r--r--   jemalloc/include/jemalloc/internal/atomic.h                | 113
-rw-r--r--   jemalloc/include/jemalloc/internal/bitmap.h                | 184
-rw-r--r--   jemalloc/include/jemalloc/internal/ctl.h                   |   1
-rw-r--r--   jemalloc/include/jemalloc/internal/hash.h                  |   2
-rw-r--r--   jemalloc/include/jemalloc/internal/jemalloc_internal.h.in  | 110
-rw-r--r--   jemalloc/include/jemalloc/internal/mb.h                    |   2
-rw-r--r--   jemalloc/include/jemalloc/internal/mutex.h                 |  26
-rw-r--r--   jemalloc/include/jemalloc/internal/prof.h                  |  22
-rw-r--r--   jemalloc/include/jemalloc/internal/rtree.h                 |   2
-rw-r--r--   jemalloc/include/jemalloc/internal/stats.h                 |  37
-rw-r--r--   jemalloc/include/jemalloc/internal/tcache.h                |  83
-rw-r--r--   jemalloc/include/jemalloc/jemalloc_defs.h.in               |  18

13 files changed, 651 insertions(+), 116 deletions(-)
diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h
index a43d1fa..b80c118 100644
--- a/jemalloc/include/jemalloc/internal/arena.h
+++ b/jemalloc/include/jemalloc/internal/arena.h
@@ -19,6 +19,7 @@
 #ifdef JEMALLOC_TINY
    /* Smallest size class to support. */
 #  define LG_TINY_MIN      LG_SIZEOF_PTR
+#  define TINY_MIN         (1U << LG_TINY_MIN)
 #endif
 
@@ -57,6 +58,10 @@
 #define RUN_MAX_OVRHD       0x0000003dU
 #define RUN_MAX_OVRHD_RELAX 0x00001800U
 
+/* Maximum number of regions in one run. */
+#define LG_RUN_MAXREGS      11
+#define RUN_MAXREGS         (1U << LG_RUN_MAXREGS)
+
 /*
  * The minimum ratio of active:dirty pages per arena is computed as:
  *
@@ -70,6 +75,7 @@
 typedef struct arena_chunk_map_s arena_chunk_map_t;
 typedef struct arena_chunk_s arena_chunk_t;
 typedef struct arena_run_s arena_run_t;
+typedef struct arena_bin_info_s arena_bin_info_t;
 typedef struct arena_bin_s arena_bin_t;
 typedef struct arena_s arena_t;
 
@@ -207,16 +213,52 @@ struct arena_run_s {
     /* Bin this run is associated with. */
     arena_bin_t *bin;
 
-    /* Stack of available freed regions, or NULL. */
-    void *avail;
-
-    /* Next region that has never been allocated, or run boundary. */
-    void *next;
+    /* Index of next region that has never been allocated, or nregs. */
+    uint32_t nextind;
 
     /* Number of free regions in run. */
     unsigned nfree;
 };
 
+/*
+ * Read-only information associated with each element of arena_t's bins array
+ * is stored separately, partly to reduce memory usage (only one copy, rather
+ * than one per arena), but mainly to avoid false cacheline sharing.
+ */
+struct arena_bin_info_s {
+    /* Size of regions in a run for this bin's size class. */
+    size_t reg_size;
+
+    /* Total size of a run for this bin's size class. */
+    size_t run_size;
+
+    /* Total number of regions in a run for this bin's size class. */
+    uint32_t nregs;
+
+    /*
+     * Offset of first bitmap_t element in a run header for this bin's size
+     * class.
+     */
+    uint32_t bitmap_offset;
+
+    /*
+     * Metadata used to manipulate bitmaps for runs associated with this
+     * bin.
+     */
+    bitmap_info_t bitmap_info;
+
+#ifdef JEMALLOC_PROF
+    /*
+     * Offset of first (prof_ctx_t *) in a run header for this bin's size
+     * class, or 0 if (opt_prof == false).
+     */
+    uint32_t ctx0_offset;
+#endif
+
+    /* Offset of first region in a run for this bin's size class. */
+    uint32_t reg0_offset;
+};
+
 struct arena_bin_s {
     /*
      * All operations on runcur, runs, and stats require that lock be
@@ -241,26 +283,6 @@ struct arena_bin_s {
      */
     arena_run_tree_t runs;
 
-    /* Size of regions in a run for this bin's size class. */
-    size_t reg_size;
-
-    /* Total size of a run for this bin's size class. */
-    size_t run_size;
-
-    /* Total number of regions in a run for this bin's size class. */
-    uint32_t nregs;
-
-#ifdef JEMALLOC_PROF
-    /*
-     * Offset of first (prof_ctx_t *) in a run header for this bin's size
-     * class, or 0 if (opt_prof == false).
-     */
-    uint32_t ctx0_offset;
-#endif
-
-    /* Offset of first region in a run for this bin's size class. */
-    uint32_t reg0_offset;
-
 #ifdef JEMALLOC_STATS
     /* Bin statistics. */
     malloc_bin_stats_t stats;
@@ -277,8 +299,18 @@ struct arena_s {
     unsigned ind;
 
     /*
-     * All non-bin-related operations on this arena require that lock be
-     * locked.
+     * Number of threads currently assigned to this arena.  This field is
+     * protected by arenas_lock.
+     */
+    unsigned nthreads;
+
+    /*
+     * There are three classes of arena operations from a locking
+     * perspective:
+     * 1) Thread assignment (modifies nthreads) is protected by
+     *    arenas_lock.
+     * 2) Bin-related operations are protected by bin locks.
+     * 3) Chunk- and run-related operations are protected by this mutex.
      */
     malloc_mutex_t lock;
 
@@ -388,8 +420,16 @@ struct arena_s {
 
 extern size_t opt_lg_qspace_max;
 extern size_t opt_lg_cspace_max;
-extern ssize_t opt_lg_dirty_mult;
+extern ssize_t opt_lg_dirty_mult;
+/*
+ * small_size2bin is a compact lookup table that rounds request sizes up to
+ * size classes.  In order to reduce cache footprint, the table is compressed,
+ * and all accesses are via the SMALL_SIZE2BIN macro.
+ */
 extern uint8_t const *small_size2bin;
+#define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN])
+
+extern arena_bin_info_t *arena_bin_info;
 
 /* Various bin-related settings. */
 #ifdef JEMALLOC_TINY       /* Number of (2^n)-spaced tiny bins. */
@@ -456,8 +496,9 @@ bool arena_boot(void);
 #ifdef JEMALLOC_H_INLINES
 
 #ifndef JEMALLOC_ENABLE_INLINE
-unsigned arena_run_regind(arena_run_t *run, arena_bin_t *bin,
-    const void *ptr, size_t size);
+size_t arena_bin_index(arena_t *arena, arena_bin_t *bin);
+unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info,
+    const void *ptr);
 #  ifdef JEMALLOC_PROF
 prof_ctx_t *arena_prof_ctx_get(const void *ptr);
 void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
@@ -466,21 +507,37 @@ void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_))
+JEMALLOC_INLINE size_t
+arena_bin_index(arena_t *arena, arena_bin_t *bin)
+{
+    size_t binind = bin - arena->bins;
+    assert(binind < nbins);
+    return (binind);
+}
+
 JEMALLOC_INLINE unsigned
-arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
-    size_t size)
+arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr)
 {
     unsigned shift, diff, regind;
+    size_t size;
 
-    assert(run->magic == ARENA_RUN_MAGIC);
+    dassert(run->magic == ARENA_RUN_MAGIC);
+    /*
+     * Freeing a pointer lower than region zero can cause assertion
+     * failure.
+     */
+    assert((uintptr_t)ptr >= (uintptr_t)run +
+        (uintptr_t)bin_info->reg0_offset);
 
     /*
      * Avoid doing division with a variable divisor if possible.  Using
      * actual division here can reduce allocator throughput by over 20%!
      */
-    diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset);
+    diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run -
+        bin_info->reg0_offset);
 
     /* Rescale (factor powers of 2 out of the numerator and denominator). */
+    size = bin_info->reg_size;
     shift = ffs(size) - 1;
     diff >>= shift;
     size >>= shift;
@@ -503,8 +560,8 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
          * divide by 0, and 1 and 2 are both powers of two, which are
          * handled above.
          */
-#define SIZE_INV_SHIFT 21
-#define SIZE_INV(s)    (((1U << SIZE_INV_SHIFT) / (s)) + 1)
+#define SIZE_INV_SHIFT ((sizeof(unsigned) << 3) - LG_RUN_MAXREGS)
+#define SIZE_INV(s)    (((1U << SIZE_INV_SHIFT) / (s)) + 1)
         static const unsigned size_invs[] = {
             SIZE_INV(3),
             SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
@@ -524,7 +581,7 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
 #undef SIZE_INV_SHIFT
     }
     assert(diff == regind * size);
-    assert(regind < bin->nregs);
+    assert(regind < bin_info->nregs);
 
     return (regind);
 }
@@ -551,13 +608,14 @@ arena_prof_ctx_get(const void *ptr)
             arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
                 (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
                 PAGE_SHIFT));
-            arena_bin_t *bin = run->bin;
+            size_t binind = arena_bin_index(chunk->arena, run->bin);
+            arena_bin_info_t *bin_info = &arena_bin_info[binind];
             unsigned regind;
 
-            assert(run->magic == ARENA_RUN_MAGIC);
-            regind = arena_run_regind(run, bin, ptr, bin->reg_size);
+            dassert(run->magic == ARENA_RUN_MAGIC);
+            regind = arena_run_regind(run, bin_info, ptr);
             ret = *(prof_ctx_t **)((uintptr_t)run +
-                bin->ctx0_offset + (regind *
+                bin_info->ctx0_offset + (regind *
                 sizeof(prof_ctx_t *)));
         }
     } else
@@ -585,12 +643,16 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
                 (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
                 PAGE_SHIFT));
             arena_bin_t *bin = run->bin;
+            size_t binind;
+            arena_bin_info_t *bin_info;
             unsigned regind;
 
-            assert(run->magic == ARENA_RUN_MAGIC);
-            regind = arena_run_regind(run, bin, ptr, bin->reg_size);
+            dassert(run->magic == ARENA_RUN_MAGIC);
+            binind = arena_bin_index(chunk->arena, bin);
+            bin_info = &arena_bin_info[binind];
+            regind = arena_run_regind(run, bin_info, ptr);
 
-            *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset
+            *((prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset
                 + (regind * sizeof(prof_ctx_t *)))) = ctx;
         } else
             assert((uintptr_t)ctx == (uintptr_t)1U);
@@ -606,7 +668,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
     arena_chunk_map_t *mapelm;
 
     assert(arena != NULL);
-    assert(arena->magic == ARENA_MAGIC);
+    dassert(arena->magic == ARENA_MAGIC);
     assert(chunk->arena == arena);
     assert(ptr != NULL);
     assert(CHUNK_ADDR2BASE(ptr) != ptr);
@@ -629,11 +691,18 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
         run = (arena_run_t *)((uintptr_t)chunk +
             (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) <<
             PAGE_SHIFT));
-        assert(run->magic == ARENA_RUN_MAGIC);
-        assert(((uintptr_t)ptr - ((uintptr_t)run +
-            (uintptr_t)run->bin->reg0_offset)) %
-            run->bin->reg_size == 0);
+        dassert(run->magic == ARENA_RUN_MAGIC);
         bin = run->bin;
+#ifdef JEMALLOC_DEBUG
+        {
+            size_t binind = arena_bin_index(arena, bin);
+            arena_bin_info_t *bin_info =
+                &arena_bin_info[binind];
+            assert(((uintptr_t)ptr - ((uintptr_t)run +
+                (uintptr_t)bin_info->reg0_offset)) %
+                bin_info->reg_size == 0);
+        }
+#endif
         malloc_mutex_lock(&bin->lock);
         arena_dalloc_bin(arena, chunk, ptr, mapelm);
         malloc_mutex_unlock(&bin->lock);
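Reviewer note: with SIZE_INV_SHIFT now derived from LG_RUN_MAXREGS (32 - 11 = 21 for
a 32-bit unsigned), the multiply-by-reciprocal in arena_run_regind() stays exact for
every region index a run can hold. A standalone sketch of the trick (hypothetical
test driver, not part of the patch):

    #include <assert.h>
    #include <stdio.h>

    #define LG_RUN_MAXREGS 11
    #define SIZE_INV_SHIFT ((sizeof(unsigned) << 3) - LG_RUN_MAXREGS)
    #define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1)

    int
    main(void)
    {
        unsigned size = 7; /* A rescaled (odd) region size, as in size_invs[]. */
        unsigned regind;

        /* Check every region index a run could contain. */
        for (regind = 0; regind < (1U << LG_RUN_MAXREGS); regind++) {
            unsigned diff = regind * size;

            /* Multiply by the precomputed reciprocal instead of dividing. */
            assert(((diff * SIZE_INV(size)) >> SIZE_INV_SHIFT) == regind);
        }
        printf("reciprocal division is exact for size %u\n", size);
        return (0);
    }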
diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h
new file mode 100644
index 0000000..821c2ef
--- /dev/null
+++ b/jemalloc/include/jemalloc/internal/atomic.h
@@ -0,0 +1,113 @@
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+#define atomic_read_uint64(p) atomic_add_uint64(p, 0)
+#define atomic_read_uint32(p) atomic_add_uint32(p, 0)
+
+#if (LG_SIZEOF_PTR == 3)
+#  define atomic_read_z(p) \
+    (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)0)
+#  define atomic_add_z(p, x) \
+    (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x)
+#  define atomic_sub_z(p, x) \
+    (size_t)atomic_sub_uint64((uint64_t *)p, (uint64_t)x)
+#elif (LG_SIZEOF_PTR == 2)
+#  define atomic_read_z(p) \
+    (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)0)
+#  define atomic_add_z(p, x) \
+    (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x)
+#  define atomic_sub_z(p, x) \
+    (size_t)atomic_sub_uint32((uint32_t *)p, (uint32_t)x)
+#endif
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#ifndef JEMALLOC_ENABLE_INLINE
+uint64_t atomic_add_uint64(uint64_t *p, uint64_t x);
+uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x);
+uint32_t atomic_add_uint32(uint32_t *p, uint32_t x);
+uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_))
+/* 64-bit operations. */
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+JEMALLOC_INLINE uint64_t
+atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+
+    return (__sync_add_and_fetch(p, x));
+}
+
+JEMALLOC_INLINE uint64_t
+atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+
+    return (__sync_sub_and_fetch(p, x));
+}
+#elif (defined(JEMALLOC_OSATOMIC))
+JEMALLOC_INLINE uint64_t
+atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+
+    return (OSAtomicAdd64((int64_t)x, (int64_t *)p));
+}
+
+JEMALLOC_INLINE uint64_t
+atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+
+    return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p));
+}
+#else
+#  if (LG_SIZEOF_PTR == 3)
+#    error "Missing implementation for 64-bit atomic operations"
+#  endif
+#endif
+
+/* 32-bit operations. */
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+JEMALLOC_INLINE uint32_t
+atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+
+    return (__sync_add_and_fetch(p, x));
+}
+
+JEMALLOC_INLINE uint32_t
+atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+
+    return (__sync_sub_and_fetch(p, x));
+}
+#elif (defined(JEMALLOC_OSATOMIC))
+JEMALLOC_INLINE uint32_t
+atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+
+    return (OSAtomicAdd32((int32_t)x, (int32_t *)p));
+}
+
+JEMALLOC_INLINE uint32_t
+atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+
+    return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p));
+}
+#else
+#  error "Missing implementation for 32-bit atomic operations"
+#endif
+#endif
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
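The atomic_*_z() macros above select a 32- or 64-bit backend by LG_SIZEOF_PTR. A
minimal usage sketch, assuming a GCC-style compiler where the __sync builtins (one
of the backends in this header) are available:

    #include <stddef.h>
    #include <stdio.h>

    static size_t counter; /* Stand-in for a shared statistic. */

    int
    main(void)
    {
        /* atomic_add_z()/atomic_sub_z() expand to these builtins when
         * LG_SIZEOF_PTR == 3 and __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 is set. */
        __sync_add_and_fetch(&counter, 16);
        __sync_sub_and_fetch(&counter, 4);

        /* atomic_read_z() is an atomic add of 0 that returns the value. */
        printf("%zu\n", __sync_add_and_fetch(&counter, (size_t)0)); /* 12 */
        return (0);
    }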
diff --git a/jemalloc/include/jemalloc/internal/bitmap.h b/jemalloc/include/jemalloc/internal/bitmap.h
new file mode 100644
index 0000000..605ebac
--- /dev/null
+++ b/jemalloc/include/jemalloc/internal/bitmap.h
@@ -0,0 +1,184 @@
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
+#define LG_BITMAP_MAXBITS LG_RUN_MAXREGS
+
+typedef struct bitmap_level_s bitmap_level_t;
+typedef struct bitmap_info_s bitmap_info_t;
+typedef unsigned long bitmap_t;
+#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG
+
+/* Number of bits per group. */
+#define LG_BITMAP_GROUP_NBITS      (LG_SIZEOF_BITMAP + 3)
+#define BITMAP_GROUP_NBITS         (ZU(1) << LG_BITMAP_GROUP_NBITS)
+#define BITMAP_GROUP_NBITS_MASK    (BITMAP_GROUP_NBITS-1)
+
+/* Maximum number of levels possible. */
+#define BITMAP_MAX_LEVELS \
+    (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \
+    + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP)
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+struct bitmap_level_s {
+    /* Offset of this level's groups within the array of groups. */
+    size_t group_offset;
+};
+
+struct bitmap_info_s {
+    /* Logical number of bits in bitmap (stored at bottom level). */
+    size_t nbits;
+
+    /* Number of levels necessary for nbits. */
+    unsigned nlevels;
+
+    /*
+     * Only the first (nlevels+1) elements are used, and levels are ordered
+     * bottom to top (e.g. the bottom level is stored in levels[0]).
+     */
+    bitmap_level_t levels[BITMAP_MAX_LEVELS+1];
+};
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+void bitmap_info_init(bitmap_info_t *binfo, size_t nbits);
+size_t bitmap_info_ngroups(const bitmap_info_t *binfo);
+size_t bitmap_size(size_t nbits);
+void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo);
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#ifndef JEMALLOC_ENABLE_INLINE
+bool bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo);
+bool bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
+void bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
+size_t bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo);
+void bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_BITMAP_C_))
+JEMALLOC_INLINE bool
+bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo)
+{
+    unsigned rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
+    bitmap_t rg = bitmap[rgoff];
+
+    /* The bitmap is full iff the root group is 0. */
+    return (rg == 0);
+}
+
+JEMALLOC_INLINE bool
+bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
+{
+    size_t goff;
+    bitmap_t g;
+
+    assert(bit < binfo->nbits);
+    goff = bit >> LG_BITMAP_GROUP_NBITS;
+    g = bitmap[goff];
+    return (!(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))));
+}
+
+JEMALLOC_INLINE void
+bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
+{
+    size_t goff;
+    bitmap_t *gp;
+    bitmap_t g;
+
+    assert(bit < binfo->nbits);
+    assert(bitmap_get(bitmap, binfo, bit) == false);
+    goff = bit >> LG_BITMAP_GROUP_NBITS;
+    gp = &bitmap[goff];
+    g = *gp;
+    assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)));
+    g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
+    *gp = g;
+    assert(bitmap_get(bitmap, binfo, bit));
+    /* Propagate group state transitions up the tree. */
+    if (g == 0) {
+        unsigned i;
+        for (i = 1; i < binfo->nlevels; i++) {
+            bit = goff;
+            goff = bit >> LG_BITMAP_GROUP_NBITS;
+            gp = &bitmap[binfo->levels[i].group_offset + goff];
+            g = *gp;
+            assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)));
+            g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
+            *gp = g;
+            if (g != 0)
+                break;
+        }
+    }
+}
+
+/* sfu: set first unset. */
+JEMALLOC_INLINE size_t
+bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo)
+{
+    size_t bit;
+    bitmap_t g;
+    unsigned i;
+
+    assert(bitmap_full(bitmap, binfo) == false);
+
+    i = binfo->nlevels - 1;
+    g = bitmap[binfo->levels[i].group_offset];
+    bit = ffsl(g) - 1;
+    while (i > 0) {
+        i--;
+        g = bitmap[binfo->levels[i].group_offset + bit];
+        bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1);
+    }
+
+    bitmap_set(bitmap, binfo, bit);
+    return (bit);
+}
+
+JEMALLOC_INLINE void
+bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
+{
+    size_t goff;
+    bitmap_t *gp;
+    bitmap_t g;
+    bool propagate;
+
+    assert(bit < binfo->nbits);
+    assert(bitmap_get(bitmap, binfo, bit));
+    goff = bit >> LG_BITMAP_GROUP_NBITS;
+    gp = &bitmap[goff];
+    g = *gp;
+    propagate = (g == 0);
+    assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0);
+    g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
+    *gp = g;
+    assert(bitmap_get(bitmap, binfo, bit) == false);
+    /* Propagate group state transitions up the tree. */
+    if (propagate) {
+        unsigned i;
+        for (i = 1; i < binfo->nlevels; i++) {
+            bit = goff;
+            goff = bit >> LG_BITMAP_GROUP_NBITS;
+            gp = &bitmap[binfo->levels[i].group_offset + goff];
+            g = *gp;
+            propagate = (g == 0);
+            assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)))
+                == 0);
+            g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
+            *gp = g;
+            if (propagate == false)
+                break;
+        }
+    }
+}
+
+#endif
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
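bitmap_sfu() above is the allocation fast path: it walks the summary levels from
the root down, so finding the first free region costs a few group reads rather
than a linear scan. A usage sketch, assuming bitmap_info_init(), bitmap_size(),
and bitmap_init() from src/bitmap.c (declared here but implemented outside this
diff), plus <assert.h> and <stdlib.h>:

    static void
    bitmap_demo(size_t nregs) /* e.g. arena_bin_info[binind].nregs */
    {
        bitmap_info_t binfo;
        bitmap_t *bitmap;
        size_t regind;

        bitmap_info_init(&binfo, nregs);      /* One bit per region. */
        bitmap = malloc(bitmap_size(nregs));  /* In a real run this storage
                                               * sits at bitmap_offset in the
                                               * run header. */
        bitmap_init(bitmap, &binfo);          /* All bits start unset (free). */

        regind = bitmap_sfu(bitmap, &binfo);  /* Allocate: set first unset. */
        assert(regind == 0);
        assert(bitmap_get(bitmap, &binfo, regind));
        bitmap_unset(bitmap, &binfo, regind); /* Deallocate: clear the bit. */
        free(bitmap);
    }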
diff --git a/jemalloc/include/jemalloc/internal/ctl.h b/jemalloc/include/jemalloc/internal/ctl.h
index 8776ad1..f1f5eb7 100644
--- a/jemalloc/include/jemalloc/internal/ctl.h
+++ b/jemalloc/include/jemalloc/internal/ctl.h
@@ -29,6 +29,7 @@ struct ctl_node_s {
 
 struct ctl_arena_stats_s {
     bool initialized;
+    unsigned nthreads;
     size_t pactive;
     size_t pdirty;
 #ifdef JEMALLOC_STATS
diff --git a/jemalloc/include/jemalloc/internal/hash.h b/jemalloc/include/jemalloc/internal/hash.h
index 9073d83..93905bf 100644
--- a/jemalloc/include/jemalloc/internal/hash.h
+++ b/jemalloc/include/jemalloc/internal/hash.h
@@ -17,7 +17,7 @@ uint64_t hash(const void *key, size_t len, uint64_t seed);
 #endif
 
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(HASH_C_))
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_HASH_C_))
 /*
  * The following hash function is based on MurmurHash64A(), placed into the
  * public domain by Austin Appleby.  See http://murmurhash.googlepages.com/ for
diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
index aab2bfb..254adb6 100644
--- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
@@ -33,6 +33,10 @@
 #define JEMALLOC_MANGLE
 #include "../jemalloc@install_suffix@.h"
 
+#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN))
+#include <libkern/OSAtomic.h>
+#endif
+
 #ifdef JEMALLOC_ZONE
 #include <mach/mach_error.h>
 #include <mach/mach_init.h>
@@ -55,8 +59,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
  * Define a custom assert() in order to reduce the chances of deadlock during
  * assertion failure.
  */
-#ifdef JEMALLOC_DEBUG
-#  define assert(e) do { \
+#ifndef assert
+#  ifdef JEMALLOC_DEBUG
+#    define assert(e) do { \
 	if (!(e)) { \
 		char line_buf[UMAX2S_BUFSIZE]; \
 		malloc_write("<jemalloc>: "); \
@@ -70,8 +75,15 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 		abort(); \
 	} \
 } while (0)
+#  else
+#    define assert(e)
+#  endif
+#endif
+
+#ifdef JEMALLOC_DEBUG
+#  define dassert(e) assert(e)
 #else
-#define assert(e)
+#  define dassert(e)
 #endif
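Reviewer note on the hunks above: the #ifndef assert guard lets an embedder supply
an always-on assert(), but the magic fields that the new dassert() checks only
exist when JEMALLOC_DEBUG is defined, so those checks must compile away with the
debug build rather than with assert(). A sketch of the situation (layout
abbreviated; not part of the patch):

    struct arena_run_s {
        /* ... */
    #ifdef JEMALLOC_DEBUG
        uint32_t magic; /* Present only in debug builds. */
    #endif
    };

    /* Safe in all builds: dassert() is a no-op unless JEMALLOC_DEBUG. */
    dassert(run->magic == ARENA_RUN_MAGIC);
    /* Would fail to compile with an external assert() in a non-debug build:
     * assert(run->magic == ARENA_RUN_MAGIC); */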
@@ -146,7 +158,19 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 #define QUANTUM_CEILING(a) \
 	(((a) + QUANTUM_MASK) & ~QUANTUM_MASK)
 
+#define LONG          ((size_t)(1U << LG_SIZEOF_LONG))
+#define LONG_MASK     (LONG - 1)
+
+/* Return the smallest long multiple that is >= a. */
+#define LONG_CEILING(a) \
+	(((a) + LONG_MASK) & ~LONG_MASK)
+
 #define SIZEOF_PTR    (1U << LG_SIZEOF_PTR)
+#define PTR_MASK      (SIZEOF_PTR - 1)
+
+/* Return the smallest (void *) multiple that is >= a. */
+#define PTR_CEILING(a) \
+	(((a) + PTR_MASK) & ~PTR_MASK)
 
 /*
  * Maximum size of L1 cache line.  This is used to avoid cache line aliasing.
@@ -193,6 +217,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 #define PAGE_CEILING(s) \
 	(((s) + PAGE_MASK) & ~PAGE_MASK)
 
+#include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/prn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
@@ -201,6 +226,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 #include "jemalloc/internal/mb.h"
 #include "jemalloc/internal/extent.h"
 #include "jemalloc/internal/arena.h"
+#include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/base.h"
 #include "jemalloc/internal/chunk.h"
 #include "jemalloc/internal/huge.h"
@@ -216,12 +242,14 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s);
 /******************************************************************************/
 #define JEMALLOC_H_STRUCTS
 
+#include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/prn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/mb.h"
+#include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/extent.h"
 #include "jemalloc/internal/arena.h"
 #include "jemalloc/internal/base.h"
@@ -271,6 +299,7 @@ extern size_t lg_pagesize;
 extern unsigned ncpus;
 
 extern malloc_mutex_t arenas_lock; /* Protects arenas initialization. */
+extern pthread_key_t arenas_tsd;
 #ifndef NO_TLS
 /*
  * Map of pthread_self() --> arenas[???], used for selecting an arena to use
@@ -280,9 +309,9 @@ extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec"));
 #  define ARENA_GET() arenas_tls
 #  define ARENA_SET(v) do { \
 	arenas_tls = (v); \
+	pthread_setspecific(arenas_tsd, (void *)(v)); \
 } while (0)
 #else
-extern pthread_key_t arenas_tsd;
 #  define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd))
 #  define ARENA_SET(v) do { \
 	pthread_setspecific(arenas_tsd, (void *)(v)); \
@@ -329,12 +358,14 @@ int buferror(int errnum, char *buf, size_t buflen);
 void jemalloc_prefork(void);
 void jemalloc_postfork(void);
 
+#include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/prn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
 #include "jemalloc/internal/ctl.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/mb.h"
+#include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/extent.h"
 #include "jemalloc/internal/arena.h"
 #include "jemalloc/internal/base.h"
@@ -352,6 +383,7 @@ void jemalloc_postfork(void);
 /******************************************************************************/
 #define JEMALLOC_H_INLINES
 
+#include "jemalloc/internal/atomic.h"
 #include "jemalloc/internal/prn.h"
 #include "jemalloc/internal/ckh.h"
 #include "jemalloc/internal/stats.h"
@@ -402,7 +434,7 @@ s2u(size_t size)
 {
 
 	if (size <= small_maxclass)
-		return (arenas[0]->bins[small_size2bin[size]].reg_size);
+		return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size);
 	if (size <= arena_maxclass)
 		return (PAGE_CEILING(size));
 	return (CHUNK_CEILING(size));
@@ -446,10 +478,8 @@ sa2u(size_t size, size_t alignment, size_t *run_size_p)
 	}
 
 	if (usize <= arena_maxclass && alignment <= PAGE_SIZE) {
-		if (usize <= small_maxclass) {
-			return
-			    (arenas[0]->bins[small_size2bin[usize]].reg_size);
-		}
+		if (usize <= small_maxclass)
+			return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size);
 		return (PAGE_CEILING(usize));
 	} else {
 		size_t run_size;
@@ -547,6 +577,7 @@ thread_allocated_get(void)
 #endif
 #endif
 
+#include "jemalloc/internal/bitmap.h"
 #include "jemalloc/internal/rtree.h"
 #include "jemalloc/internal/tcache.h"
 #include "jemalloc/internal/arena.h"
@@ -558,7 +589,7 @@ thread_allocated_get(void)
 #ifndef JEMALLOC_ENABLE_INLINE
 void *imalloc(size_t size);
 void *icalloc(size_t size);
-void *ipalloc(size_t size, size_t alignment, bool zero);
+void *ipalloc(size_t usize, size_t alignment, bool zero);
 size_t isalloc(const void *ptr);
 #  ifdef JEMALLOC_IVSALLOC
 size_t ivsalloc(const void *ptr);
@@ -592,28 +623,39 @@ icalloc(size_t size)
 }
 
 JEMALLOC_INLINE void *
-ipalloc(size_t size, size_t alignment, bool zero)
+ipalloc(size_t usize, size_t alignment, bool zero)
 {
 	void *ret;
-	size_t usize;
-	size_t run_size
-#  ifdef JEMALLOC_CC_SILENCE
-	    = 0
-#  endif
-	    ;
 
-	usize = sa2u(size, alignment, &run_size);
-	if (usize == 0)
-		return (NULL);
+	assert(usize != 0);
+	assert(usize == sa2u(usize, alignment, NULL));
+
 	if (usize <= arena_maxclass && alignment <= PAGE_SIZE)
 		ret = arena_malloc(usize, zero);
-	else if (run_size <= arena_maxclass) {
-		ret = arena_palloc(choose_arena(), usize, run_size, alignment,
-		    zero);
-	} else if (alignment <= chunksize)
-		ret = huge_malloc(usize, zero);
-	else
-		ret = huge_palloc(usize, alignment, zero);
+	else {
+		size_t run_size
+#ifdef JEMALLOC_CC_SILENCE
+		    = 0
+#endif
+		    ;
+
+		/*
+		 * Ideally we would only ever call sa2u() once per aligned
+		 * allocation request, and the caller of this function has
+		 * already done so once.  However, it's rather burdensome to
+		 * require every caller to pass in run_size, especially given
+		 * that it's only relevant to large allocations.  Therefore,
+		 * just call it again here in order to get run_size.
+		 */
+		sa2u(usize, alignment, &run_size);
+		if (run_size <= arena_maxclass) {
+			ret = arena_palloc(choose_arena(), usize, run_size,
+			    alignment, zero);
+		} else if (alignment <= chunksize)
+			ret = huge_malloc(usize, zero);
+		else
+			ret = huge_palloc(usize, alignment, zero);
+	}
 
 	assert(((uintptr_t)ret & (alignment - 1)) == 0);
 	return (ret);
@@ -630,7 +672,7 @@ isalloc(const void *ptr)
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (chunk != ptr) {
 		/* Region. */
-		assert(chunk->arena->magic == ARENA_MAGIC);
+		dassert(chunk->arena->magic == ARENA_MAGIC);
 
 #ifdef JEMALLOC_PROF
 		ret = arena_salloc_demote(ptr);
@@ -684,7 +726,7 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
 
 	if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
 	    != 0) {
-		size_t copysize;
+		size_t usize, copysize;
 
 		/*
 		 * Existing object alignment is inadequate; allocate new space
@@ -692,12 +734,18 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
 		 * and copy.
 		 */
 		if (no_move)
 			return (NULL);
-		ret = ipalloc(size + extra, alignment, zero);
+		usize = sa2u(size + extra, alignment, NULL);
+		if (usize == 0)
+			return (NULL);
+		ret = ipalloc(usize, alignment, zero);
 		if (ret == NULL) {
 			if (extra == 0)
 				return (NULL);
 			/* Try again, without extra this time. */
-			ret = ipalloc(size, alignment, zero);
+			usize = sa2u(size, alignment, NULL);
+			if (usize == 0)
+				return (NULL);
+			ret = ipalloc(usize, alignment, zero);
 			if (ret == NULL)
 				return (NULL);
 		}
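ipalloc() now takes the pre-rounded usize instead of the raw request, so each
caller runs sa2u() once and handles overflow itself; the iralloc() hunk above
follows exactly this pattern. A caller sketch (hypothetical wrapper name):

    void *
    aligned_alloc_sketch(size_t size, size_t alignment)
    {
        size_t usize = sa2u(size, alignment, NULL);

        if (usize == 0) /* Request too large; sa2u() reports overflow as 0. */
            return (NULL);
        return (ipalloc(usize, alignment, false));
    }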
diff --git a/jemalloc/include/jemalloc/internal/mb.h b/jemalloc/include/jemalloc/internal/mb.h
index 1707aa9..dc9f2a5 100644
--- a/jemalloc/include/jemalloc/internal/mb.h
+++ b/jemalloc/include/jemalloc/internal/mb.h
@@ -17,7 +17,7 @@ void mb_write(void);
 #endif
 
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(MB_C_))
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_MB_C_))
 #ifdef __i386__
 /*
  * According to the Intel Architecture Software Developer's Manual, current
diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h
index dcca01e..62947ce 100644
--- a/jemalloc/include/jemalloc/internal/mutex.h
+++ b/jemalloc/include/jemalloc/internal/mutex.h
@@ -1,7 +1,11 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_TYPES
 
+#ifdef JEMALLOC_OSSPIN
+typedef OSSpinLock malloc_mutex_t;
+#else
 typedef pthread_mutex_t malloc_mutex_t;
+#endif
 
 #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
 #  define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
@@ -41,17 +45,26 @@ JEMALLOC_INLINE void
 malloc_mutex_lock(malloc_mutex_t *mutex)
 {
 
-	if (isthreaded)
+	if (isthreaded) {
+#ifdef JEMALLOC_OSSPIN
+		OSSpinLockLock(mutex);
+#else
 		pthread_mutex_lock(mutex);
+#endif
+	}
 }
 
 JEMALLOC_INLINE bool
 malloc_mutex_trylock(malloc_mutex_t *mutex)
 {
 
-	if (isthreaded)
+	if (isthreaded) {
+#ifdef JEMALLOC_OSSPIN
+		return (OSSpinLockTry(mutex) == false);
+#else
 		return (pthread_mutex_trylock(mutex) != 0);
-	else
+#endif
+	} else
 		return (false);
 }
 
@@ -59,8 +72,13 @@ JEMALLOC_INLINE void
 malloc_mutex_unlock(malloc_mutex_t *mutex)
 {
 
-	if (isthreaded)
+	if (isthreaded) {
+#ifdef JEMALLOC_OSSPIN
+		OSSpinLockUnlock(mutex);
+#else
 		pthread_mutex_unlock(mutex);
+#endif
+	}
 }
 
 #endif
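Note the return conventions reconciled above: malloc_mutex_trylock() returns true
on failure, OSSpinLockTry() returns true on success (hence the == false
inversion), and pthread_mutex_trylock() returns 0 on success. Caller sketch
(some_mutex is hypothetical):

    if (malloc_mutex_trylock(&some_mutex) == false) {
        /* Lock acquired; critical section. */
        malloc_mutex_unlock(&some_mutex);
    } else {
        /* Lock was contended; fall back or retry. */
    }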
diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h
index 7864000..f943873 100644
--- a/jemalloc/include/jemalloc/internal/prof.h
+++ b/jemalloc/include/jemalloc/internal/prof.h
@@ -247,8 +247,22 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata)
 	double u;
 
 	/*
-	 * Compute prof_sample_threshold as a geometrically distributed random
+	 * Compute sample threshold as a geometrically distributed random
 	 * variable with mean (2^opt_lg_prof_sample).
+	 *
+	 *                         __        __
+	 *                         |  log(u)  |                     1
+	 * prof_tdata->threshold = | -------- |, where p = -------------------
+	 *                         | log(1-p) |             opt_lg_prof_sample
+	 *                                                 2
+	 *
+	 * For more information on the math, see:
+	 *
+	 *   Non-Uniform Random Variate Generation
+	 *   Luc Devroye
+	 *   Springer-Verlag, New York, 1986
+	 *   pp 500
+	 *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
 	 */
 	prn64(r, 53, prof_tdata->prn_state, (uint64_t)6364136223846793005LLU,
 	    (uint64_t)1442695040888963407LLU);
@@ -334,7 +348,7 @@ prof_ctx_get(const void *ptr)
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (chunk != ptr) {
 		/* Region. */
-		assert(chunk->arena->magic == ARENA_MAGIC);
+		dassert(chunk->arena->magic == ARENA_MAGIC);
 
 		ret = arena_prof_ctx_get(ptr);
 	} else
@@ -353,7 +367,7 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
 	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 	if (chunk != ptr) {
 		/* Region. */
-		assert(chunk->arena->magic == ARENA_MAGIC);
+		dassert(chunk->arena->magic == ARENA_MAGIC);
 
 		arena_prof_ctx_set(ptr, ctx);
 	} else
@@ -374,7 +388,7 @@ prof_sample_accum_update(size_t size)
 	/* Take care to avoid integer overflow. */
 	if (size >= prof_tdata->threshold - prof_tdata->accum) {
 		prof_tdata->accum -= (prof_tdata->threshold - size);
-		/* Compute new prof_sample_threshold. */
+		/* Compute new sample threshold. */
 		prof_sample_threshold_update(prof_tdata);
 		while (prof_tdata->accum >= prof_tdata->threshold) {
 			prof_tdata->accum -= prof_tdata->threshold;
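The restored comment above is the inverse-CDF derivation for geometric sampling.
A standalone numeric sketch of the same computation (illustrative lg sample of 19
and a fixed u in place of the prn64() draw; build with -lm):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        unsigned lg_sample = 19; /* Stand-in for opt_lg_prof_sample. */
        double p = 1.0 / (double)((uint64_t)1U << lg_sample);
        double u = 0.5;          /* Uniform draw from (0, 1). */
        uint64_t threshold = (uint64_t)(log(u) / log(1.0 - p)) + 1;

        /* Prints 363409, about ln(2) * 2^19; the mean over all u is 2^19. */
        printf("%llu\n", (unsigned long long)threshold);
        return (0);
    }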
diff --git a/jemalloc/include/jemalloc/internal/rtree.h b/jemalloc/include/jemalloc/internal/rtree.h
index 9d58eba..95d6355 100644
--- a/jemalloc/include/jemalloc/internal/rtree.h
+++ b/jemalloc/include/jemalloc/internal/rtree.h
@@ -49,7 +49,7 @@ void *rtree_get(rtree_t *rtree, uintptr_t key);
 bool rtree_set(rtree_t *rtree, uintptr_t key, void *val);
 #endif
 
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_))
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_))
 #define RTREE_GET_GENERATE(f) \
 /* The least significant bits of the key are ignored. */ \
 JEMALLOC_INLINE void * \
diff --git a/jemalloc/include/jemalloc/internal/stats.h b/jemalloc/include/jemalloc/internal/stats.h
index 3fc2080..2a9b31d 100644
--- a/jemalloc/include/jemalloc/internal/stats.h
+++ b/jemalloc/include/jemalloc/internal/stats.h
@@ -154,6 +154,10 @@ struct chunk_stats_s {
 
 extern bool opt_stats_print;
 
+#ifdef JEMALLOC_STATS
+extern size_t stats_cactive;
+#endif
+
 char *u2s(uint64_t x, unsigned base, char *s);
 #ifdef JEMALLOC_STATS
 void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque,
@@ -166,9 +170,38 @@ void stats_print(void (*write)(void *, const char *), void *cbopaque,
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
-#ifdef JEMALLOC_STATS
 #ifdef JEMALLOC_H_INLINES
 
+#ifdef JEMALLOC_STATS
+
+#ifndef JEMALLOC_ENABLE_INLINE
+size_t stats_cactive_get(void);
+void stats_cactive_add(size_t size);
+void stats_cactive_sub(size_t size);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_STATS_C_))
+JEMALLOC_INLINE size_t
+stats_cactive_get(void)
+{
+
+	return (atomic_read_z(&stats_cactive));
+}
+
+JEMALLOC_INLINE void
+stats_cactive_add(size_t size)
+{
+
+	atomic_add_z(&stats_cactive, size);
+}
+
+JEMALLOC_INLINE void
+stats_cactive_sub(size_t size)
+{
+
+	atomic_sub_z(&stats_cactive, size);
+}
+#endif
 
-#endif /* JEMALLOC_H_INLINES */
 #endif /* JEMALLOC_STATS */
+#endif /* JEMALLOC_H_INLINES */
 /******************************************************************************/
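stats_cactive is maintained with the atomic_*_z() wrappers from atomic.h, so
writers on different arenas and lock-free readers need no mutex. Sketch of the
intended call pattern (the real call sites live in the .c files, outside this
diff; npages is hypothetical):

    size_t npages = 4;

    stats_cactive_add(npages << PAGE_SHIFT); /* Run pages become active. */
    stats_cactive_sub(npages << PAGE_SHIFT); /* Run pages become inactive. */
    size_t snapshot = stats_cactive_get();   /* Atomic read (add of 0). */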
diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h
index f431c66..da3c68c 100644
--- a/jemalloc/include/jemalloc/internal/tcache.h
+++ b/jemalloc/include/jemalloc/internal/tcache.h
@@ -2,6 +2,7 @@
 /******************************************************************************/
 #ifdef JEMALLOC_H_TYPES
 
+typedef struct tcache_bin_info_s tcache_bin_info_t;
 typedef struct tcache_bin_s tcache_bin_t;
 typedef struct tcache_s tcache_t;
 
@@ -32,14 +33,22 @@ typedef struct tcache_s tcache_t;
 /******************************************************************************/
 #ifdef JEMALLOC_H_STRUCTS
 
+/*
+ * Read-only information associated with each element of tcache_t's tbins array
+ * is stored separately, mainly to reduce memory usage.
+ */
+struct tcache_bin_info_s {
+    unsigned ncached_max;  /* Upper limit on ncached. */
+};
+
 struct tcache_bin_s {
 #  ifdef JEMALLOC_STATS
     tcache_bin_stats_t tstats;
 #  endif
-    unsigned low_water;    /* Min # cached since last GC. */
+    int      low_water;    /* Min # cached since last GC. */
+    unsigned lg_fill_div;  /* Fill (ncached_max >> lg_fill_div). */
     unsigned ncached;      /* # of cached objects. */
-    unsigned ncached_max;  /* Upper limit on ncached. */
-    void     *avail;       /* Chain of available objects. */
+    void     **avail;      /* Stack of available objects. */
 };
 
 struct tcache_s {
@@ -53,6 +62,12 @@ struct tcache_s {
     unsigned ev_cnt;       /* Event count since incremental GC. */
     unsigned next_gc_bin;  /* Next bin to GC. */
     tcache_bin_t tbins[1]; /* Dynamically sized. */
+    /*
+     * The pointer stacks associated with tbins follow as a contiguous
+     * array.  During tcache initialization, the avail pointer in each
+     * element of tbins is initialized to point to the proper offset within
+     * this array.
+     */
 };
 
 #endif /* JEMALLOC_H_STRUCTS */
@@ -63,6 +78,8 @@ extern bool opt_tcache;
 extern ssize_t opt_lg_tcache_max;
 extern ssize_t opt_lg_tcache_gc_sweep;
 
+extern tcache_bin_info_t *tcache_bin_info;
+
 /* Map of thread-specific caches. */
 #ifndef NO_TLS
 extern __thread tcache_t *tcache_tls
@@ -109,7 +126,7 @@ void tcache_destroy(tcache_t *tcache);
 #ifdef JEMALLOC_STATS
 void tcache_stats_merge(tcache_t *tcache, arena_t *arena);
 #endif
-void tcache_boot(void);
+bool tcache_boot(void);
 
 #endif /* JEMALLOC_H_EXTERNS */
 /******************************************************************************/
@@ -168,6 +185,7 @@ tcache_event(tcache_t *tcache)
     if (tcache->ev_cnt == tcache_gc_incr) {
         size_t binind = tcache->next_gc_bin;
         tcache_bin_t *tbin = &tcache->tbins[binind];
+        tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
 
         if (tbin->low_water > 0) {
             /*
@@ -191,6 +209,20 @@ tcache_event(tcache_t *tcache)
 #endif
                 );
             }
+            /*
+             * Reduce fill count by 2X.  Limit lg_fill_div such that
+             * the fill count is always at least 1.
+             */
+            if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1))
+                >= 1)
+                tbin->lg_fill_div++;
+        } else if (tbin->low_water < 0) {
+            /*
+             * Increase fill count by 2X.  Make sure lg_fill_div
+             * stays greater than 0.
+             */
+            if (tbin->lg_fill_div > 1)
+                tbin->lg_fill_div--;
         }
         tbin->low_water = tbin->ncached;
 
@@ -206,13 +238,14 @@ tcache_alloc_easy(tcache_bin_t *tbin)
 {
     void *ret;
 
-    if (tbin->ncached == 0)
+    if (tbin->ncached == 0) {
+        tbin->low_water = -1;
         return (NULL);
+    }
     tbin->ncached--;
-    if (tbin->ncached < tbin->low_water)
+    if ((int)tbin->ncached < tbin->low_water)
         tbin->low_water = tbin->ncached;
-    ret = tbin->avail;
-    tbin->avail = *(void **)ret;
+    ret = tbin->avail[tbin->ncached];
     return (ret);
 }
 
@@ -223,7 +256,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
     size_t binind;
     tcache_bin_t *tbin;
 
-    binind = small_size2bin[size];
+    binind = SMALL_SIZE2BIN(size);
     assert(binind < nbins);
     tbin = &tcache->tbins[binind];
     ret = tcache_alloc_easy(tbin);
@@ -232,7 +265,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
         if (ret == NULL)
             return (NULL);
     }
-    assert(arena_salloc(ret) == tcache->arena->bins[binind].reg_size);
+    assert(arena_salloc(ret) == arena_bin_info[binind].reg_size);
 
     if (zero == false) {
 #ifdef JEMALLOC_FILL
@@ -248,7 +281,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
     tbin->tstats.nrequests++;
 #endif
 #ifdef JEMALLOC_PROF
-    tcache->prof_accumbytes += tcache->arena->bins[binind].reg_size;
+    tcache->prof_accumbytes += arena_bin_info[binind].reg_size;
 #endif
     tcache_event(tcache);
     return (ret);
@@ -312,6 +345,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
     arena_run_t *run;
     arena_bin_t *bin;
     tcache_bin_t *tbin;
+    tcache_bin_info_t *tbin_info;
     size_t pageind, binind;
     arena_chunk_map_t *mapelm;
 
@@ -323,7 +357,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
     mapelm = &chunk->map[pageind-map_bias];
     run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind -
         (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT));
-    assert(run->magic == ARENA_RUN_MAGIC);
+    dassert(run->magic == ARENA_RUN_MAGIC);
     bin = run->bin;
     binind = ((uintptr_t)bin - (uintptr_t)&arena->bins) /
         sizeof(arena_bin_t);
@@ -331,20 +365,21 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
 
 #ifdef JEMALLOC_FILL
     if (opt_junk)
-        memset(ptr, 0x5a, bin->reg_size);
+        memset(ptr, 0x5a, arena_bin_info[binind].reg_size);
 #endif
 
     tbin = &tcache->tbins[binind];
-    if (tbin->ncached == tbin->ncached_max) {
-        tcache_bin_flush_small(tbin, binind, (tbin->ncached_max >> 1)
+    tbin_info = &tcache_bin_info[binind];
+    if (tbin->ncached == tbin_info->ncached_max) {
+        tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >>
+            1)
 #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
             , tcache
 #endif
             );
     }
-    assert(tbin->ncached < tbin->ncached_max);
-    *(void **)ptr = tbin->avail;
-    tbin->avail = ptr;
+    assert(tbin->ncached < tbin_info->ncached_max);
+    tbin->avail[tbin->ncached] = ptr;
     tbin->ncached++;
 
     tcache_event(tcache);
@@ -357,6 +392,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
     arena_chunk_t *chunk;
     size_t pageind, binind;
     tcache_bin_t *tbin;
+    tcache_bin_info_t *tbin_info;
 
     assert((size & PAGE_MASK) == 0);
     assert(arena_salloc(ptr) > small_maxclass);
@@ -373,16 +409,17 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
 #endif
 
     tbin = &tcache->tbins[binind];
-    if (tbin->ncached == tbin->ncached_max) {
-        tcache_bin_flush_large(tbin, binind, (tbin->ncached_max >> 1)
+    tbin_info = &tcache_bin_info[binind];
+    if (tbin->ncached == tbin_info->ncached_max) {
+        tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >>
+            1)
 #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF))
             , tcache
 #endif
             );
     }
-    assert(tbin->ncached < tbin->ncached_max);
-    *(void **)ptr = tbin->avail;
-    tbin->avail = ptr;
+    assert(tbin->ncached < tbin_info->ncached_max);
+    tbin->avail[tbin->ncached] = ptr;
     tbin->ncached++;
 
     tcache_event(tcache);
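The tbins change above replaces the intrusive freelist, where each cached object
held a pointer to the next, with a per-bin pointer stack, so caching or reusing
an object no longer dirties the object's own memory. Side-by-side sketch of the
push on the deallocation path:

    /* Old: intrusive chain threaded through the free objects themselves. */
    *(void **)ptr = tbin->avail;
    tbin->avail = ptr;

    /* New: array-based stack indexed by ncached; the object is untouched. */
    tbin->avail[tbin->ncached] = ptr;
    tbin->ncached++;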
diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in
index 5f46c5c..d8c81d7 100644
--- a/jemalloc/include/jemalloc/jemalloc_defs.h.in
+++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in
@@ -24,6 +24,18 @@
  */
 #undef CPU_SPINWAIT
 
+/*
+ * Defined if OSAtomic*() functions are available, as provided by Darwin, and
+ * documented in the atomic(3) manual page.
+ */
+#undef JEMALLOC_OSATOMIC
+
+/*
+ * Defined if OSSpin*() functions are available, as provided by Darwin, and
+ * documented in the spinlock(3) manual page.
+ */
+#undef JEMALLOC_OSSPIN
+
 /* Defined if __attribute__((...)) syntax is supported. */
 #undef JEMALLOC_HAVE_ATTR
 #ifdef JEMALLOC_HAVE_ATTR
@@ -53,6 +65,9 @@
 /* Use libgcc for profile backtracing if defined. */
 #undef JEMALLOC_PROF_LIBGCC
 
+/* Use gcc intrinsics for profile backtracing if defined. */
+#undef JEMALLOC_PROF_GCC
+
 /*
  * JEMALLOC_TINY enables support for tiny objects, which are smaller than one
  * quantum.
@@ -137,4 +152,7 @@
 /* sizeof(int) == 2^LG_SIZEOF_INT. */
 #undef LG_SIZEOF_INT
 
+/* sizeof(long) == 2^LG_SIZEOF_LONG. */
+#undef LG_SIZEOF_LONG
+
 #endif /* JEMALLOC_DEFS_H_ */
