Diffstat (limited to 'jemalloc/include')
| -rw-r--r-- | jemalloc/include/jemalloc/internal/arena.h | 188 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/chunk.h | 8 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/chunk_mmap.h | 2 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/ckh.h | 2 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/ctl.h | 10 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/huge.h | 7 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/jemalloc_internal.h.in | 478 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/mutex.h | 7 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/prof.h | 443 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/rtree.h | 161 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/stats.h | 2 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/tcache.h | 46 |
| -rw-r--r-- | jemalloc/include/jemalloc/internal/zone.h | 23 |
| -rw-r--r-- | jemalloc/include/jemalloc/jemalloc.h.in | 26 |
| -rw-r--r-- | jemalloc/include/jemalloc/jemalloc_defs.h.in | 36 |
15 files changed, 1207 insertions, 232 deletions
diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index c1955f1..9556c2c 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -121,17 +121,17 @@ struct arena_chunk_map_s { * * p : run page offset * s : run size - * c : size class (used only if prof_promote is true) + * c : (binind+1) for size class (used only if prof_promote is true) * x : don't care * - : 0 * + : 1 - * [DZLA] : bit set - * [dzla] : bit unset + * [DULA] : bit set + * [dula] : bit unset * * Unallocated (clean): - * ssssssss ssssssss ssss---- ----dz-- - * xxxxxxxx xxxxxxxx xxxx---- -----Zxx - * ssssssss ssssssss ssss---- ----dZ-- + * ssssssss ssssssss ssss---- ----du-- + * xxxxxxxx xxxxxxxx xxxx---- -----Uxx + * ssssssss ssssssss ssss---- ----dU-- * * Unallocated (dirty): * ssssssss ssssssss ssss---- ----D--- @@ -144,7 +144,7 @@ struct arena_chunk_map_s { * pppppppp pppppppp pppp---- ----d--a * * Large: - * ssssssss ssssssss ssss++++ ++++D-la + * ssssssss ssssssss ssss---- ----D-la * xxxxxxxx xxxxxxxx xxxx---- ----xxxx * -------- -------- -------- ----D-la * @@ -152,7 +152,7 @@ struct arena_chunk_map_s { * ssssssss ssssssss sssscccc ccccD-la * * Large (not sampled, size == PAGE_SIZE): - * ssssssss ssssssss ssss++++ ++++D-la + * ssssssss ssssssss ssss---- ----D-la */ size_t bits; #ifdef JEMALLOC_PROF @@ -161,7 +161,7 @@ struct arena_chunk_map_s { #endif #define CHUNK_MAP_FLAGS_MASK ((size_t)0xfU) #define CHUNK_MAP_DIRTY ((size_t)0x8U) -#define CHUNK_MAP_ZEROED ((size_t)0x4U) +#define CHUNK_MAP_UNZEROED ((size_t)0x4U) #define CHUNK_MAP_LARGE ((size_t)0x2U) #define CHUNK_MAP_ALLOCATED ((size_t)0x1U) #define CHUNK_MAP_KEY CHUNK_MAP_ALLOCATED @@ -187,7 +187,12 @@ struct arena_chunk_s { /* Number of dirty pages. */ size_t ndirty; - /* Map of pages within chunk that keeps track of free/large/small. */ + /* + * Map of pages within chunk that keeps track of free/large/small. The + * first map_bias entries are omitted, since the chunk header does not + * need to be tracked in the map. This omission saves a header page + * for common chunk sizes (e.g. 4 MiB). + */ arena_chunk_map_t map[1]; /* Dynamically sized. 
*/ }; typedef rb_tree(arena_chunk_t) arena_chunk_tree_t; @@ -416,8 +421,12 @@ extern size_t sspace_min; extern size_t sspace_max; #define small_maxclass sspace_max -#define nlclasses (chunk_npages - arena_chunk_header_npages) +#define nlclasses (chunk_npages - map_bias) +void arena_purge_all(arena_t *arena); +#ifdef JEMALLOC_PROF +void arena_prof_accum(arena_t *arena, uint64_t accumbytes); +#endif #ifdef JEMALLOC_TCACHE void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind @@ -426,20 +435,15 @@ void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, # endif ); #endif -#ifdef JEMALLOC_PROF -void arena_prof_accum(arena_t *arena, uint64_t accumbytes); -#endif void *arena_malloc_small(arena_t *arena, size_t size, bool zero); void *arena_malloc_large(arena_t *arena, size_t size, bool zero); void *arena_malloc(size_t size, bool zero); -void *arena_palloc(arena_t *arena, size_t alignment, size_t size, - size_t alloc_size); +void *arena_palloc(arena_t *arena, size_t size, size_t alloc_size, + size_t alignment, bool zero); size_t arena_salloc(const void *ptr); #ifdef JEMALLOC_PROF void arena_prof_promoted(const void *ptr, size_t size); size_t arena_salloc_demote(const void *ptr); -prof_ctx_t *arena_prof_ctx_get(const void *ptr); -void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); #endif void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_chunk_map_t *mapelm); @@ -449,7 +453,10 @@ void arena_stats_merge(arena_t *arena, size_t *nactive, size_t *ndirty, arena_stats_t *astats, malloc_bin_stats_t *bstats, malloc_large_stats_t *lstats); #endif -void *arena_ralloc(void *ptr, size_t size, size_t oldsize); +void *arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, + size_t extra, bool zero); +void *arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero); bool arena_new(arena_t *arena, unsigned ind); bool arena_boot(void); @@ -458,10 +465,149 @@ bool arena_boot(void); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +unsigned arena_run_regind(arena_run_t *run, arena_bin_t *bin, + const void *ptr, size_t size); +# ifdef JEMALLOC_PROF +prof_ctx_t *arena_prof_ctx_get(const void *ptr); +void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +# endif void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) +JEMALLOC_INLINE unsigned +arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, + size_t size) +{ + unsigned shift, diff, regind; + + assert(run->magic == ARENA_RUN_MAGIC); + + /* + * Avoid doing division with a variable divisor if possible. Using + * actual division here can reduce allocator throughput by over 20%! + */ + diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset); + + /* Rescale (factor powers of 2 out of the numerator and denominator). */ + shift = ffs(size) - 1; + diff >>= shift; + size >>= shift; + + if (size == 1) { + /* The divisor was a power of 2. */ + regind = diff; + } else { + /* + * To divide by a number D that is not a power of two we + * multiply by (2^21 / D) and then right shift by 21 positions. + * + * X / D + * + * becomes + * + * (X * size_invs[D - 3]) >> SIZE_INV_SHIFT + * + * We can omit the first three elements, because we never + * divide by 0, and 1 and 2 are both powers of two, which are + * handled above. 
+ */ +#define SIZE_INV_SHIFT 21 +#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) + static const unsigned size_invs[] = { + SIZE_INV(3), + SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7), + SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11), + SIZE_INV(12), SIZE_INV(13), SIZE_INV(14), SIZE_INV(15), + SIZE_INV(16), SIZE_INV(17), SIZE_INV(18), SIZE_INV(19), + SIZE_INV(20), SIZE_INV(21), SIZE_INV(22), SIZE_INV(23), + SIZE_INV(24), SIZE_INV(25), SIZE_INV(26), SIZE_INV(27), + SIZE_INV(28), SIZE_INV(29), SIZE_INV(30), SIZE_INV(31) + }; + + if (size <= ((sizeof(size_invs) / sizeof(unsigned)) + 2)) + regind = (diff * size_invs[size - 3]) >> SIZE_INV_SHIFT; + else + regind = diff / size; +#undef SIZE_INV +#undef SIZE_INV_SHIFT + } + assert(diff == regind * size); + assert(regind < bin->nregs); + + return (regind); +} + +#ifdef JEMALLOC_PROF +JEMALLOC_INLINE prof_ctx_t * +arena_prof_ctx_get(const void *ptr) +{ + prof_ctx_t *ret; + arena_chunk_t *chunk; + size_t pageind, mapbits; + + assert(ptr != NULL); + assert(CHUNK_ADDR2BASE(ptr) != ptr); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; + assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); + if ((mapbits & CHUNK_MAP_LARGE) == 0) { + if (prof_promote) + ret = (prof_ctx_t *)(uintptr_t)1U; + else { + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << + PAGE_SHIFT)); + arena_bin_t *bin = run->bin; + unsigned regind; + + assert(run->magic == ARENA_RUN_MAGIC); + regind = arena_run_regind(run, bin, ptr, bin->reg_size); + ret = *(prof_ctx_t **)((uintptr_t)run + + bin->ctx0_offset + (regind * + sizeof(prof_ctx_t *))); + } + } else + ret = chunk->map[pageind-map_bias].prof_ctx; + + return (ret); +} + +JEMALLOC_INLINE void +arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +{ + arena_chunk_t *chunk; + size_t pageind, mapbits; + + assert(ptr != NULL); + assert(CHUNK_ADDR2BASE(ptr) != ptr); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapbits = chunk->map[pageind-map_bias].bits; + assert((mapbits & CHUNK_MAP_ALLOCATED) != 0); + if ((mapbits & CHUNK_MAP_LARGE) == 0) { + if (prof_promote == false) { + arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << + PAGE_SHIFT)); + arena_bin_t *bin = run->bin; + unsigned regind; + + assert(run->magic == ARENA_RUN_MAGIC); + regind = arena_run_regind(run, bin, ptr, bin->reg_size); + + *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset + + (regind * sizeof(prof_ctx_t *)))) = ctx; + } else + assert((uintptr_t)ctx == (uintptr_t)1U); + } else + chunk->map[pageind-map_bias].prof_ctx = ctx; +} +#endif + JEMALLOC_INLINE void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) { @@ -474,8 +620,8 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapelm = &chunk->map[pageind-map_bias]; assert((mapelm->bits & CHUNK_MAP_ALLOCATED) != 0); if ((mapelm->bits & CHUNK_MAP_LARGE) == 0) { /* Small allocation. 
*/ diff --git a/jemalloc/include/jemalloc/internal/chunk.h b/jemalloc/include/jemalloc/internal/chunk.h index 1f6abf7..a60f0ad 100644 --- a/jemalloc/include/jemalloc/internal/chunk.h +++ b/jemalloc/include/jemalloc/internal/chunk.h @@ -39,13 +39,17 @@ extern malloc_mutex_t chunks_mtx; extern chunk_stats_t stats_chunks; #endif +#ifdef JEMALLOC_IVSALLOC +extern rtree_t *chunks_rtree; +#endif + extern size_t chunksize; extern size_t chunksize_mask; /* (chunksize - 1). */ extern size_t chunk_npages; -extern size_t arena_chunk_header_npages; +extern size_t map_bias; /* Number of arena chunk header pages. */ extern size_t arena_maxclass; /* Max size class for arenas. */ -void *chunk_alloc(size_t size, bool *zero); +void *chunk_alloc(size_t size, bool base, bool *zero); void chunk_dealloc(void *chunk, size_t size); bool chunk_boot(void); diff --git a/jemalloc/include/jemalloc/internal/chunk_mmap.h b/jemalloc/include/jemalloc/internal/chunk_mmap.h index dc52448..07b50a4 100644 --- a/jemalloc/include/jemalloc/internal/chunk_mmap.h +++ b/jemalloc/include/jemalloc/internal/chunk_mmap.h @@ -13,6 +13,8 @@ void *chunk_alloc_mmap(size_t size); void *chunk_alloc_mmap_noreserve(size_t size); void chunk_dealloc_mmap(void *chunk, size_t size); +bool chunk_mmap_boot(void); + #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES diff --git a/jemalloc/include/jemalloc/internal/ckh.h b/jemalloc/include/jemalloc/internal/ckh.h index c39ea5c..d4e391b 100644 --- a/jemalloc/include/jemalloc/internal/ckh.h +++ b/jemalloc/include/jemalloc/internal/ckh.h @@ -45,7 +45,7 @@ struct ckh_s { #endif /* Used for pseudo-random number generation. */ -#define CKH_A 12345 +#define CKH_A 1103515241 #define CKH_C 12347 uint32_t prn_state; diff --git a/jemalloc/include/jemalloc/internal/ctl.h b/jemalloc/include/jemalloc/internal/ctl.h index 7bbf21e..8776ad1 100644 --- a/jemalloc/include/jemalloc/internal/ctl.h +++ b/jemalloc/include/jemalloc/internal/ctl.h @@ -82,9 +82,9 @@ bool ctl_boot(void); #define xmallctl(name, oldp, oldlenp, newp, newlen) do { \ if (JEMALLOC_P(mallctl)(name, oldp, oldlenp, newp, newlen) \ != 0) { \ - malloc_write("<jemalloc>: Invalid xmallctl(\""); \ + malloc_write("<jemalloc>: Failure in xmallctl(\""); \ malloc_write(name); \ - malloc_write("\", ...) call\n"); \ + malloc_write("\", ...)\n"); \ abort(); \ } \ } while (0) @@ -92,9 +92,9 @@ bool ctl_boot(void); #define xmallctlnametomib(name, mibp, miblenp) do { \ if (JEMALLOC_P(mallctlnametomib)(name, mibp, miblenp) != 0) { \ malloc_write( \ - "<jemalloc>: Invalid xmallctlnametomib(\""); \ + "<jemalloc>: Failure in xmallctlnametomib(\""); \ malloc_write(name); \ - malloc_write("\", ...) 
call\n"); \ + malloc_write("\", ...)\n"); \ abort(); \ } \ } while (0) @@ -103,7 +103,7 @@ bool ctl_boot(void); if (JEMALLOC_P(mallctlbymib)(mib, miblen, oldp, oldlenp, newp, \ newlen) != 0) { \ malloc_write( \ - "<jemalloc>: Invalid xmallctlbymib() call\n"); \ + "<jemalloc>: Failure in xmallctlbymib()\n"); \ abort(); \ } \ } while (0) diff --git a/jemalloc/include/jemalloc/internal/huge.h b/jemalloc/include/jemalloc/internal/huge.h index 0c0582f..bf23127 100644 --- a/jemalloc/include/jemalloc/internal/huge.h +++ b/jemalloc/include/jemalloc/internal/huge.h @@ -20,8 +20,11 @@ extern size_t huge_allocated; extern malloc_mutex_t huge_mtx; void *huge_malloc(size_t size, bool zero); -void *huge_palloc(size_t alignment, size_t size); -void *huge_ralloc(void *ptr, size_t size, size_t oldsize); +void *huge_palloc(size_t size, size_t alignment, bool zero); +void *huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, + size_t extra); +void *huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero); void huge_dalloc(void *ptr); size_t huge_salloc(const void *ptr); #ifdef JEMALLOC_PROF diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 2c3f32f..3d25300 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -17,16 +17,29 @@ #include <stdio.h> #include <stdlib.h> #include <stdint.h> +#include <stddef.h> +#ifndef offsetof +# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) +#endif #include <inttypes.h> #include <string.h> #include <strings.h> +#include <ctype.h> #include <unistd.h> #include <fcntl.h> #include <pthread.h> +#include <math.h> #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" +#ifdef JEMALLOC_ZONE +#include <mach/mach_error.h> +#include <mach/mach_init.h> +#include <mach/vm_map.h> +#include <malloc/malloc.h> +#endif + #ifdef JEMALLOC_LAZY_LOCK #include <dlfcn.h> #endif @@ -49,7 +62,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); malloc_write("<jemalloc>: "); \ malloc_write(__FILE__); \ malloc_write(":"); \ - malloc_write(umax2s(__LINE__, 10, line_buf)); \ + malloc_write(u2s(__LINE__, 10, line_buf)); \ malloc_write(": Failed assertion: "); \ malloc_write("\""); \ malloc_write(#e); \ @@ -77,6 +90,8 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); /******************************************************************************/ #define JEMALLOC_H_TYPES +#define ALLOCM_LG_ALIGN_MASK ((int)0x3f) + #define ZU(z) ((size_t)z) #ifndef __DECONST @@ -92,8 +107,8 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); # define JEMALLOC_INLINE static inline #endif -/* Size of stack-allocated buffer passed to strerror_r(). */ -#define STRERROR_BUF 64 +/* Size of stack-allocated buffer passed to buferror(). */ +#define BUFERROR_BUF 64 /* Minimum alignment of allocations is 2^LG_QUANTUM bytes. 
*/ #ifdef __i386__ @@ -159,6 +174,16 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #define STATIC_PAGE_SIZE ((size_t)(1U << STATIC_PAGE_SHIFT)) #define STATIC_PAGE_MASK ((size_t)(STATIC_PAGE_SIZE - 1)) +#ifdef PAGE_SHIFT +# undef PAGE_SHIFT +#endif +#ifdef PAGE_SIZE +# undef PAGE_SIZE +#endif +#ifdef PAGE_MASK +# undef PAGE_MASK +#endif + #ifdef DYNAMIC_PAGE_SHIFT # define PAGE_SHIFT lg_pagesize # define PAGE_SIZE pagesize @@ -184,8 +209,12 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #include "jemalloc/internal/prof.h" #undef JEMALLOC_H_TYPES @@ -203,8 +232,12 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #include "jemalloc/internal/prof.h" #undef JEMALLOC_H_STRUCTS @@ -224,6 +257,7 @@ extern bool opt_xmalloc; #ifdef JEMALLOC_FILL extern bool opt_zero; #endif +extern size_t opt_narenas; #ifdef DYNAMIC_PAGE_SHIFT extern size_t pagesize; @@ -240,8 +274,19 @@ extern malloc_mutex_t arenas_lock; /* Protects arenas initialization. */ * Map of pthread_self() --> arenas[???], used for selecting an arena to use * for allocations. */ -extern __thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec")); +extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); +# define ARENA_GET() arenas_tls +# define ARENA_SET(v) do { \ + arenas_tls = (v); \ +} while (0) +#else +extern pthread_key_t arenas_tsd; +# define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) +# define ARENA_SET(v) do { \ + pthread_setspecific(arenas_tsd, (void *)(v)); \ +} while (0) #endif + /* * Arenas that are used to service external requests. Not all elements of the * arenas array are necessarily used; arenas are created lazily as needed. @@ -249,10 +294,55 @@ extern __thread arena_t *arenas_map JEMALLOC_ATTR(tls_model("initial-exec")); extern arena_t **arenas; extern unsigned narenas; +#ifdef JEMALLOC_STATS +typedef struct { + uint64_t allocated; + uint64_t deallocated; +} thread_allocated_t; +# ifndef NO_TLS +extern __thread thread_allocated_t thread_allocated_tls; +# define ALLOCATED_GET() thread_allocated_tls.allocated +# define DEALLOCATED_GET() thread_allocated_tls.deallocated +# define ALLOCATED_ADD(a, d) do { \ + thread_allocated_tls.allocated += a; \ + thread_allocated_tls.deallocated += d; \ +} while (0) +# else +extern pthread_key_t thread_allocated_tsd; +# define ALLOCATED_GET() \ + (uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL) \ + ? ((thread_allocated_t *) \ + pthread_getspecific(thread_allocated_tsd))->allocated : 0) +# define DEALLOCATED_GET() \ + (uint64_t)((pthread_getspecific(thread_allocated_tsd) != NULL) \ + ? 
((thread_allocated_t \ + *)pthread_getspecific(thread_allocated_tsd))->deallocated : \ + 0) +# define ALLOCATED_ADD(a, d) do { \ + thread_allocated_t *thread_allocated = (thread_allocated_t *) \ + pthread_getspecific(thread_allocated_tsd); \ + if (thread_allocated != NULL) { \ + thread_allocated->allocated += (a); \ + thread_allocated->deallocated += (d); \ + } else { \ + thread_allocated = (thread_allocated_t *) \ + imalloc(sizeof(thread_allocated_t)); \ + if (thread_allocated != NULL) { \ + pthread_setspecific(thread_allocated_tsd, \ + thread_allocated); \ + thread_allocated->allocated = (a); \ + thread_allocated->deallocated = (d); \ + } \ + } \ +} while (0) +# endif +#endif + arena_t *arenas_extend(unsigned ind); -#ifndef NO_TLS arena_t *choose_arena_hard(void); -#endif +int buferror(int errnum, char *buf, size_t buflen); +void jemalloc_prefork(void); +void jemalloc_postfork(void); #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" @@ -265,8 +355,12 @@ arena_t *choose_arena_hard(void); #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" +#include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/hash.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif #include "jemalloc/internal/prof.h" #undef JEMALLOC_H_EXTERNS @@ -285,133 +379,55 @@ arena_t *choose_arena_hard(void); #include "jemalloc/internal/huge.h" #ifndef JEMALLOC_ENABLE_INLINE +size_t pow2_ceil(size_t x); +size_t s2u(size_t size); +size_t sa2u(size_t size, size_t alignment, size_t *run_size_p); void malloc_write(const char *s); arena_t *choose_arena(void); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) -/* - * Wrapper around malloc_message() that avoids the need for - * JEMALLOC_P(malloc_message)(...) throughout the code. - */ -JEMALLOC_INLINE void -malloc_write(const char *s) +/* Compute the smallest power of 2 that is >= x. */ +JEMALLOC_INLINE size_t +pow2_ceil(size_t x) { - JEMALLOC_P(malloc_message)(NULL, s); + x--; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; +#if (LG_SIZEOF_PTR == 3) + x |= x >> 32; +#endif + x++; + return (x); } /* - * Choose an arena based on a per-thread value (fast-path code, calls slow-path - * code if necessary). + * Compute usable size that would result from allocating an object with the + * specified size. */ -JEMALLOC_INLINE arena_t * -choose_arena(void) -{ - arena_t *ret; - - /* - * We can only use TLS if this is a PIC library, since for the static - * library version, libc's malloc is used by TLS allocation, which - * introduces a bootstrapping issue. - */ -#ifndef NO_TLS - ret = arenas_map; - if (ret == NULL) { - ret = choose_arena_hard(); - assert(ret != NULL); - } -#else - if (isthreaded && narenas > 1) { - unsigned long ind; - - /* - * Hash pthread_self() to one of the arenas. There is a prime - * number of arenas, so this has a reasonable chance of - * working. Even so, the hashing can be easily thwarted by - * inconvenient pthread_self() values. Without specific - * knowledge of how pthread_self() calculates values, we can't - * easily do much better than this. - */ - ind = (unsigned long) pthread_self() % narenas; - - /* - * Optimistially assume that arenas[ind] has been initialized. - * At worst, we find out that some other thread has already - * done so, after acquiring the lock in preparation. 
Note that - * this lazy locking also has the effect of lazily forcing - * cache coherency; without the lock acquisition, there's no - * guarantee that modification of arenas[ind] by another thread - * would be seen on this CPU for an arbitrary amount of time. - * - * In general, this approach to modifying a synchronized value - * isn't a good idea, but in this case we only ever modify the - * value once, so things work out well. - */ - ret = arenas[ind]; - if (ret == NULL) { - /* - * Avoid races with another thread that may have already - * initialized arenas[ind]. - */ - malloc_mutex_lock(&arenas_lock); - if (arenas[ind] == NULL) - ret = arenas_extend((unsigned)ind); - else - ret = arenas[ind]; - malloc_mutex_unlock(&arenas_lock); - } - } else - ret = arenas[0]; -#endif - - assert(ret != NULL); - return (ret); -} -#endif - -#include "jemalloc/internal/tcache.h" -#include "jemalloc/internal/arena.h" -#include "jemalloc/internal/hash.h" -#include "jemalloc/internal/prof.h" - -#ifndef JEMALLOC_ENABLE_INLINE -void *imalloc(size_t size); -void *icalloc(size_t size); -void *ipalloc(size_t alignment, size_t size); -size_t isalloc(const void *ptr); -void *iralloc(void *ptr, size_t size); -void idalloc(void *ptr); -#endif - -#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) -JEMALLOC_INLINE void * -imalloc(size_t size) -{ - - assert(size != 0); - - if (size <= arena_maxclass) - return (arena_malloc(size, false)); - else - return (huge_malloc(size, false)); -} - -JEMALLOC_INLINE void * -icalloc(size_t size) +JEMALLOC_INLINE size_t +s2u(size_t size) { + if (size <= small_maxclass) + return arenas[0]->bins[small_size2bin[size]].reg_size; if (size <= arena_maxclass) - return (arena_malloc(size, true)); - else - return (huge_malloc(size, true)); + return PAGE_CEILING(size); + return CHUNK_CEILING(size); } -JEMALLOC_INLINE void * -ipalloc(size_t alignment, size_t size) +/* + * Compute usable size that would result from allocating an object with the + * specified size and alignment. + */ +JEMALLOC_INLINE size_t +sa2u(size_t size, size_t alignment, size_t *run_size_p) { - void *ret; - size_t ceil_size; + size_t usize; /* * Round size up to the nearest multiple of alignment. @@ -431,20 +447,23 @@ ipalloc(size_t alignment, size_t size) * will further round up to a power of two, but that never causes * correctness issues. */ - ceil_size = (size + (alignment - 1)) & (-alignment); + usize = (size + (alignment - 1)) & (-alignment); /* - * (ceil_size < size) protects against the combination of maximal + * (usize < size) protects against the combination of maximal * alignment and size greater than maximal alignment. */ - if (ceil_size < size) { + if (usize < size) { /* size_t overflow. */ - return (NULL); + return (0); } - if (ceil_size <= PAGE_SIZE || (alignment <= PAGE_SIZE - && ceil_size <= arena_maxclass)) - ret = arena_malloc(ceil_size, false); - else { + if (usize <= arena_maxclass && alignment <= PAGE_SIZE) { + if (usize <= small_maxclass) { + return + (arenas[0]->bins[small_size2bin[usize]].reg_size); + } + return (PAGE_CEILING(usize)); + } else { size_t run_size; /* @@ -452,30 +471,30 @@ ipalloc(size_t alignment, size_t size) * permanently; it makes later calculations simpler. */ alignment = PAGE_CEILING(alignment); - ceil_size = PAGE_CEILING(size); + usize = PAGE_CEILING(size); /* - * (ceil_size < size) protects against very large sizes within + * (usize < size) protects against very large sizes within * PAGE_SIZE of SIZE_T_MAX. 
* - * (ceil_size + alignment < ceil_size) protects against the - * combination of maximal alignment and ceil_size large enough + * (usize + alignment < usize) protects against the + * combination of maximal alignment and usize large enough * to cause overflow. This is similar to the first overflow * check above, but it needs to be repeated due to the new - * ceil_size value, which may now be *equal* to maximal + * usize value, which may now be *equal* to maximal * alignment, whereas before we only detected overflow if the * original size was *greater* than maximal alignment. */ - if (ceil_size < size || ceil_size + alignment < ceil_size) { + if (usize < size || usize + alignment < usize) { /* size_t overflow. */ - return (NULL); + return (0); } /* * Calculate the size of the over-size run that arena_palloc() * would need to allocate in order to guarantee the alignment. */ - if (ceil_size >= alignment) - run_size = ceil_size + alignment - PAGE_SIZE; + if (usize >= alignment) + run_size = usize + alignment - PAGE_SIZE; else { /* * It is possible that (alignment << 1) will cause @@ -488,15 +507,112 @@ ipalloc(size_t alignment, size_t size) */ run_size = (alignment << 1) - PAGE_SIZE; } + if (run_size_p != NULL) + *run_size_p = run_size; - if (run_size <= arena_maxclass) { - ret = arena_palloc(choose_arena(), alignment, ceil_size, - run_size); - } else if (alignment <= chunksize) - ret = huge_malloc(ceil_size, false); - else - ret = huge_palloc(alignment, ceil_size); + if (run_size <= arena_maxclass) + return (PAGE_CEILING(usize)); + return (CHUNK_CEILING(usize)); } +} + +/* + * Wrapper around malloc_message() that avoids the need for + * JEMALLOC_P(malloc_message)(...) throughout the code. + */ +JEMALLOC_INLINE void +malloc_write(const char *s) +{ + + JEMALLOC_P(malloc_message)(NULL, s); +} + +/* + * Choose an arena based on a per-thread value (fast-path code, calls slow-path + * code if necessary). 
+ */ +JEMALLOC_INLINE arena_t * +choose_arena(void) +{ + arena_t *ret; + + ret = ARENA_GET(); + if (ret == NULL) { + ret = choose_arena_hard(); + assert(ret != NULL); + } + + return (ret); +} +#endif + +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/tcache.h" +#include "jemalloc/internal/arena.h" +#include "jemalloc/internal/hash.h" +#ifdef JEMALLOC_ZONE +#include "jemalloc/internal/zone.h" +#endif + +#ifndef JEMALLOC_ENABLE_INLINE +void *imalloc(size_t size); +void *icalloc(size_t size); +void *ipalloc(size_t size, size_t alignment, bool zero); +size_t isalloc(const void *ptr); +# ifdef JEMALLOC_IVSALLOC +size_t ivsalloc(const void *ptr); +# endif +void idalloc(void *ptr); +void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment, + bool zero, bool no_move); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +JEMALLOC_INLINE void * +imalloc(size_t size) +{ + + assert(size != 0); + + if (size <= arena_maxclass) + return (arena_malloc(size, false)); + else + return (huge_malloc(size, false)); +} + +JEMALLOC_INLINE void * +icalloc(size_t size) +{ + + if (size <= arena_maxclass) + return (arena_malloc(size, true)); + else + return (huge_malloc(size, true)); +} + +JEMALLOC_INLINE void * +ipalloc(size_t size, size_t alignment, bool zero) +{ + void *ret; + size_t usize; + size_t run_size +# ifdef JEMALLOC_CC_SILENCE + = 0 +# endif + ; + + usize = sa2u(size, alignment, &run_size); + if (usize == 0) + return (NULL); + if (usize <= arena_maxclass && alignment <= PAGE_SIZE) + ret = arena_malloc(usize, zero); + else if (run_size <= arena_maxclass) { + ret = arena_palloc(choose_arena(), usize, run_size, alignment, + zero); + } else if (alignment <= chunksize) + ret = huge_malloc(usize, zero); + else + ret = huge_palloc(usize, alignment, zero); assert(((uintptr_t)ret & (alignment - 1)) == 0); return (ret); @@ -526,21 +642,18 @@ isalloc(const void *ptr) return (ret); } -JEMALLOC_INLINE void * -iralloc(void *ptr, size_t size) +#ifdef JEMALLOC_IVSALLOC +JEMALLOC_INLINE size_t +ivsalloc(const void *ptr) { - size_t oldsize; - assert(ptr != NULL); - assert(size != 0); + /* Return 0 if ptr is not within a chunk managed by jemalloc. */ + if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == NULL) + return (0); - oldsize = isalloc(ptr); - - if (size <= arena_maxclass) - return (arena_ralloc(ptr, size, oldsize)); - else - return (huge_ralloc(ptr, size, oldsize)); + return (isalloc(ptr)); } +#endif JEMALLOC_INLINE void idalloc(void *ptr) @@ -555,7 +668,70 @@ idalloc(void *ptr) else huge_dalloc(ptr); } + +JEMALLOC_INLINE void * +iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, + bool no_move) +{ + void *ret; + size_t oldsize; + + assert(ptr != NULL); + assert(size != 0); + + oldsize = isalloc(ptr); + + if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) + != 0) { + size_t copysize; + + /* + * Existing object alignment is inadquate; allocate new space + * and copy. + */ + if (no_move) + return (NULL); + ret = ipalloc(size + extra, alignment, zero); + if (ret == NULL) { + if (extra == 0) + return (NULL); + /* Try again, without extra this time. */ + ret = ipalloc(size, alignment, zero); + if (ret == NULL) + return (NULL); + } + /* + * Copy at most size bytes (not size+extra), since the caller + * has no expectation that the extra bytes will be reliably + * preserved. + */ + copysize = (size < oldsize) ? 
size : oldsize; + memcpy(ret, ptr, copysize); + idalloc(ptr); + return (ret); + } + + if (no_move) { + if (size <= arena_maxclass) { + return (arena_ralloc_no_move(ptr, oldsize, size, + extra, zero)); + } else { + return (huge_ralloc_no_move(ptr, oldsize, size, + extra)); + } + } else { + if (size + extra <= arena_maxclass) { + return (arena_ralloc(ptr, oldsize, size, extra, + alignment, zero)); + } else { + return (huge_ralloc(ptr, oldsize, size, extra, + alignment, zero)); + } + } +} #endif +#include "jemalloc/internal/prof.h" + #undef JEMALLOC_H_INLINES /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h index 108bfa8..dcca01e 100644 --- a/jemalloc/include/jemalloc/internal/mutex.h +++ b/jemalloc/include/jemalloc/internal/mutex.h @@ -3,6 +3,12 @@ typedef pthread_mutex_t malloc_mutex_t; +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +# define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +#else +# define MALLOC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER +#endif + #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS @@ -18,6 +24,7 @@ extern bool isthreaded; #endif bool malloc_mutex_init(malloc_mutex_t *mutex); +void malloc_mutex_destroy(malloc_mutex_t *mutex); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index fb55fb9..7864000 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -6,20 +6,25 @@ typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; typedef struct prof_thr_cnt_s prof_thr_cnt_t; typedef struct prof_ctx_s prof_ctx_t; -typedef struct prof_s prof_t; +typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ -#define LG_PROF_BT_MAX_DEFAULT 2 +#define PROF_PREFIX_DEFAULT "jeprof" +#define LG_PROF_BT_MAX_DEFAULT 7 #define LG_PROF_SAMPLE_DEFAULT 0 -#define LG_PROF_INTERVAL_DEFAULT 30 +#define LG_PROF_INTERVAL_DEFAULT -1 +#define LG_PROF_TCMAX_DEFAULT -1 /* * Hard limit on stack backtrace depth. Note that the version of * prof_backtrace() that is based on __builtin_return_address() necessarily has - * a hard-coded number of backtrace frame handlers, so increasing - * LG_PROF_BT_MAX requires changing prof_backtrace(). + * a hard-coded number of backtrace frame handlers. */ -#define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */ +#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND)) +# define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1) +#else +# define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */ +#endif #define PROF_BT_MAX (1U << LG_PROF_BT_MAX) /* Initial hash table size. */ @@ -34,16 +39,16 @@ typedef struct prof_s prof_t; struct prof_bt_s { /* Backtrace, stored as len program counters. */ - void **vec; - unsigned len; + void **vec; + unsigned len; }; #ifdef JEMALLOC_PROF_LIBGCC /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */ typedef struct { - prof_bt_t *bt; - unsigned nignore; - unsigned max; + prof_bt_t *bt; + unsigned nignore; + unsigned max; } prof_unwind_data_t; #endif @@ -51,11 +56,11 @@ struct prof_cnt_s { /* * Profiling counters. 
An allocation/deallocation pair can operate on * different prof_thr_cnt_t objects that are linked into the same - * prof_ctx_t sets_ql, so it is possible for the cur* counters to go + * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go * negative. In principle it is possible for the *bytes counters to - * overflow/underflow, but a general solution would require some form - * of 128-bit counter solution; this implementation doesn't bother to - * solve that problem. + * overflow/underflow, but a general solution would require something + * like 128-bit counters; this implementation doesn't bother to solve + * that problem. */ int64_t curobjs; int64_t curbytes; @@ -64,15 +69,18 @@ struct prof_cnt_s { }; struct prof_thr_cnt_s { - /* Linkage into prof_ctx_t's sets_ql. */ - ql_elm(prof_thr_cnt_t) link; + /* Linkage into prof_ctx_t's cnts_ql. */ + ql_elm(prof_thr_cnt_t) cnts_link; + + /* Linkage into thread's LRU. */ + ql_elm(prof_thr_cnt_t) lru_link; /* * Associated context. If a thread frees an object that it did not * allocate, it is possible that the context is not cached in the * thread's hash table, in which case it must be able to look up the * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's sets_ql. + * and link it into the prof_ctx_t's cnts_ql. */ prof_ctx_t *ctx; @@ -101,11 +109,11 @@ struct prof_ctx_s { /* Associated backtrace. */ prof_bt_t *bt; - /* Protects cnt_merged and sets_ql. */ + /* Protects cnt_merged and cnts_ql. */ malloc_mutex_t lock; - /* Temporary storage for aggregation during dump. */ - prof_cnt_t cnt_dump; + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; /* When threads exit, they merge their stats into cnt_merged. */ prof_cnt_t cnt_merged; @@ -117,6 +125,31 @@ struct prof_ctx_s { ql_head(prof_thr_cnt_t) cnts_ql; }; +struct prof_tdata_s { + /* + * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a + * cache of backtraces, with associated thread-specific prof_thr_cnt_t + * objects. Other threads may read the prof_thr_cnt_t contents, but no + * others will ever write them. + * + * Upon thread exit, the thread must merge all the prof_thr_cnt_t + * counter data into the associated prof_ctx_t objects, and unlink/free + * the prof_thr_cnt_t objects. + */ + ckh_t bt2cnt; + + /* LRU for contents of bt2cnt. */ + ql_head(prof_thr_cnt_t) lru_ql; + + /* Backtrace vector, used for calls to prof_backtrace(). */ + void **vec; + + /* Sampling state. */ + uint64_t prn_state; + uint64_t threshold; + uint64_t accum; +}; + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS @@ -129,11 +162,14 @@ extern bool opt_prof; * to notice state changes. */ extern bool opt_prof_active; -extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ -extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ +extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ +extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ -extern bool opt_prof_udump; /* High-water memory dumping. */ -extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_gdump; /* High-water memory dumping. */ +extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_accum; /* Report cumulative bytes. 
*/ +extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread bactrace cache) */ +extern char opt_prof_prefix[PATH_MAX + 1]; /* * Profile dump interval, measured in bytes allocated. Each arena triggers a @@ -150,25 +186,362 @@ extern uint64_t prof_interval; */ extern bool prof_promote; -bool prof_init(prof_t *prof, bool master); -void prof_destroy(prof_t *prof); +/* (1U << opt_lg_prof_bt_max). */ +extern unsigned prof_bt_max; -prof_thr_cnt_t *prof_alloc_prep(size_t size); -prof_ctx_t *prof_ctx_get(const void *ptr); -void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt); -void prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, - size_t old_size, prof_ctx_t *old_ctx); -void prof_free(const void *ptr); +/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +#ifndef NO_TLS +extern __thread prof_tdata_t *prof_tdata_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +# define PROF_TCACHE_GET() prof_tdata_tls +# define PROF_TCACHE_SET(v) do { \ + prof_tdata_tls = (v); \ + pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ +} while (0) +#else +# define PROF_TCACHE_GET() \ + ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd)) +# define PROF_TCACHE_SET(v) do { \ + pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ +} while (0) +#endif +/* + * Same contents as b2cnt_tls, but initialized such that the TSD destructor is + * called when a thread exits, so that prof_tdata_tls contents can be merged, + * unlinked, and deallocated. + */ +extern pthread_key_t prof_tdata_tsd; + +void bt_init(prof_bt_t *bt, void **vec); +void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); +prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); void prof_idump(void); bool prof_mdump(const char *filename); -void prof_udump(void); +void prof_gdump(void); +prof_tdata_t *prof_tdata_init(void); void prof_boot0(void); -bool prof_boot1(void); +void prof_boot1(void); +bool prof_boot2(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ #ifdef JEMALLOC_H_INLINES +#ifndef JEMALLOC_ENABLE_INLINE +void prof_sample_threshold_update(prof_tdata_t *prof_tdata); +prof_thr_cnt_t *prof_alloc_prep(size_t size); +prof_ctx_t *prof_ctx_get(const void *ptr); +void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); +bool prof_sample_accum_update(size_t size); +void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt); +void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + size_t old_size, prof_ctx_t *old_ctx); +void prof_free(const void *ptr, size_t size); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) +JEMALLOC_INLINE void +prof_sample_threshold_update(prof_tdata_t *prof_tdata) +{ + uint64_t r; + double u; + + /* + * Compute prof_sample_threshold as a geometrically distributed random + * variable with mean (2^opt_lg_prof_sample). + */ + prn64(r, 53, prof_tdata->prn_state, + (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); + u = (double)r * (1.0/9007199254740992.0L); + prof_tdata->threshold = (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) + + (uint64_t)1U; +} + +JEMALLOC_INLINE prof_thr_cnt_t * +prof_alloc_prep(size_t size) +{ +#ifdef JEMALLOC_ENABLE_INLINE + /* This function does not have its own stack frame, because it is inlined. 
*/ +# define NIGNORE 1 +#else +# define NIGNORE 2 +#endif + prof_thr_cnt_t *ret; + prof_tdata_t *prof_tdata; + prof_bt_t bt; + + assert(size == s2u(size)); + + prof_tdata = PROF_TCACHE_GET(); + if (prof_tdata == NULL) { + prof_tdata = prof_tdata_init(); + if (prof_tdata == NULL) + return (NULL); + } + + if (opt_prof_active == false) { + /* Sampling is currently inactive, so avoid sampling. */ + ret = (prof_thr_cnt_t *)(uintptr_t)1U; + } else if (opt_lg_prof_sample == 0) { + /* + * Don't bother with sampling logic, since sampling interval is + * 1. + */ + bt_init(&bt, prof_tdata->vec); + prof_backtrace(&bt, NIGNORE, prof_bt_max); + ret = prof_lookup(&bt); + } else { + if (prof_tdata->threshold == 0) { + /* + * Initialize. Seed the prng differently for each + * thread. + */ + prof_tdata->prn_state = (uint64_t)(uintptr_t)&size; + prof_sample_threshold_update(prof_tdata); + } + + /* + * Determine whether to capture a backtrace based on whether + * size is enough for prof_accum to reach + * prof_tdata->threshold. However, delay updating these + * variables until prof_{m,re}alloc(), because we don't know + * for sure that the allocation will succeed. + * + * Use subtraction rather than addition to avoid potential + * integer overflow. + */ + if (size >= prof_tdata->threshold - prof_tdata->accum) { + bt_init(&bt, prof_tdata->vec); + prof_backtrace(&bt, NIGNORE, prof_bt_max); + ret = prof_lookup(&bt); + } else + ret = (prof_thr_cnt_t *)(uintptr_t)1U; + } + + return (ret); +#undef NIGNORE +} + +JEMALLOC_INLINE prof_ctx_t * +prof_ctx_get(const void *ptr) +{ + prof_ctx_t *ret; + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) { + /* Region. */ + assert(chunk->arena->magic == ARENA_MAGIC); + + ret = arena_prof_ctx_get(ptr); + } else + ret = huge_prof_ctx_get(ptr); + + return (ret); +} + +JEMALLOC_INLINE void +prof_ctx_set(const void *ptr, prof_ctx_t *ctx) +{ + arena_chunk_t *chunk; + + assert(ptr != NULL); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + if (chunk != ptr) { + /* Region. */ + assert(chunk->arena->magic == ARENA_MAGIC); + + arena_prof_ctx_set(ptr, ctx); + } else + huge_prof_ctx_set(ptr, ctx); +} + +JEMALLOC_INLINE bool +prof_sample_accum_update(size_t size) +{ + prof_tdata_t *prof_tdata; + + /* Sampling logic is unnecessary if the interval is 1. */ + assert(opt_lg_prof_sample != 0); + + prof_tdata = PROF_TCACHE_GET(); + assert(prof_tdata != NULL); + + /* Take care to avoid integer overflow. */ + if (size >= prof_tdata->threshold - prof_tdata->accum) { + prof_tdata->accum -= (prof_tdata->threshold - size); + /* Compute new prof_sample_threshold. */ + prof_sample_threshold_update(prof_tdata); + while (prof_tdata->accum >= prof_tdata->threshold) { + prof_tdata->accum -= prof_tdata->threshold; + prof_sample_threshold_update(prof_tdata); + } + return (false); + } else { + prof_tdata->accum += size; + return (true); + } +} + +JEMALLOC_INLINE void +prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) +{ + + assert(ptr != NULL); + assert(size == isalloc(ptr)); + + if (opt_lg_prof_sample != 0) { + if (prof_sample_accum_update(size)) { + /* + * Don't sample. For malloc()-like allocation, it is + * always possible to tell in advance how large an + * object's usable size will be, so there should never + * be a difference between the size passed to + * prof_alloc_prep() and prof_malloc(). 
+ */ + assert((uintptr_t)cnt == (uintptr_t)1U); + } + } + + if ((uintptr_t)cnt > (uintptr_t)1U) { + prof_ctx_set(ptr, cnt->ctx); + + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } + /*********/ + mb_write(); + /*********/ + cnt->epoch++; + /*********/ + mb_write(); + /*********/ + } else + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); +} + +JEMALLOC_INLINE void +prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, + size_t old_size, prof_ctx_t *old_ctx) +{ + prof_thr_cnt_t *told_cnt; + + assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); + + if (ptr != NULL) { + assert(size == isalloc(ptr)); + if (opt_lg_prof_sample != 0) { + if (prof_sample_accum_update(size)) { + /* + * Don't sample. The size passed to + * prof_alloc_prep() was larger than what + * actually got allocated, so a backtrace was + * captured for this allocation, even though + * its actual size was insufficient to cross + * the sample threshold. + */ + cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + } + } + } + + if ((uintptr_t)old_ctx > (uintptr_t)1U) { + told_cnt = prof_lookup(old_ctx->bt); + if (told_cnt == NULL) { + /* + * It's too late to propagate OOM for this realloc(), + * so operate directly on old_cnt->ctx->cnt_merged. + */ + malloc_mutex_lock(&old_ctx->lock); + old_ctx->cnt_merged.curobjs--; + old_ctx->cnt_merged.curbytes -= old_size; + malloc_mutex_unlock(&old_ctx->lock); + told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + } + } else + told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; + + if ((uintptr_t)told_cnt > (uintptr_t)1U) + told_cnt->epoch++; + if ((uintptr_t)cnt > (uintptr_t)1U) { + prof_ctx_set(ptr, cnt->ctx); + cnt->epoch++; + } else + prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U); + /*********/ + mb_write(); + /*********/ + if ((uintptr_t)told_cnt > (uintptr_t)1U) { + told_cnt->cnts.curobjs--; + told_cnt->cnts.curbytes -= old_size; + } + if ((uintptr_t)cnt > (uintptr_t)1U) { + cnt->cnts.curobjs++; + cnt->cnts.curbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } + } + /*********/ + mb_write(); + /*********/ + if ((uintptr_t)told_cnt > (uintptr_t)1U) + told_cnt->epoch++; + if ((uintptr_t)cnt > (uintptr_t)1U) + cnt->epoch++; + /*********/ + mb_write(); /* Not strictly necessary. */ +} + +JEMALLOC_INLINE void +prof_free(const void *ptr, size_t size) +{ + prof_ctx_t *ctx = prof_ctx_get(ptr); + + if ((uintptr_t)ctx > (uintptr_t)1) { + assert(size == isalloc(ptr)); + prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); + + if (tcnt != NULL) { + tcnt->epoch++; + /*********/ + mb_write(); + /*********/ + tcnt->cnts.curobjs--; + tcnt->cnts.curbytes -= size; + /*********/ + mb_write(); + /*********/ + tcnt->epoch++; + /*********/ + mb_write(); + /*********/ + } else { + /* + * OOM during free() cannot be propagated, so operate + * directly on cnt->ctx->cnt_merged. 
+ */ + malloc_mutex_lock(&ctx->lock); + ctx->cnt_merged.curobjs--; + ctx->cnt_merged.curbytes -= size; + malloc_mutex_unlock(&ctx->lock); + } + } +} +#endif + #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ #endif /* JEMALLOC_PROF */ diff --git a/jemalloc/include/jemalloc/internal/rtree.h b/jemalloc/include/jemalloc/internal/rtree.h new file mode 100644 index 0000000..9d58eba --- /dev/null +++ b/jemalloc/include/jemalloc/internal/rtree.h @@ -0,0 +1,161 @@ +/* + * This radix tree implementation is tailored to the singular purpose of + * tracking which chunks are currently owned by jemalloc. This functionality + * is mandatory for OS X, where jemalloc must be able to respond to object + * ownership queries. + * + ******************************************************************************* + */ +#ifdef JEMALLOC_H_TYPES + +typedef struct rtree_s rtree_t; + +/* + * Size of each radix tree node (must be a power of 2). This impacts tree + * depth. + */ +#if (LG_SIZEOF_PTR == 2) +# define RTREE_NODESIZE (1U << 14) +#else +# define RTREE_NODESIZE CACHELINE +#endif + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +struct rtree_s { + malloc_mutex_t mutex; + void **root; + unsigned height; + unsigned level2bits[1]; /* Dynamically sized. */ +}; + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +rtree_t *rtree_new(unsigned bits); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#ifndef JEMALLOC_ENABLE_INLINE +#ifndef JEMALLOC_DEBUG +void *rtree_get_locked(rtree_t *rtree, uintptr_t key); +#endif +void *rtree_get(rtree_t *rtree, uintptr_t key); +bool rtree_set(rtree_t *rtree, uintptr_t key, void *val); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_)) +#define RTREE_GET_GENERATE(f) \ +/* The least significant bits of the key are ignored. */ \ +JEMALLOC_INLINE void * \ +f(rtree_t *rtree, uintptr_t key) \ +{ \ + void *ret; \ + uintptr_t subkey; \ + unsigned i, lshift, height, bits; \ + void **node, **child; \ + \ + RTREE_LOCK(&rtree->mutex); \ + for (i = lshift = 0, height = rtree->height, node = rtree->root;\ + i < height - 1; \ + i++, lshift += bits, node = child) { \ + bits = rtree->level2bits[i]; \ + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \ + 3)) - bits); \ + child = (void**)node[subkey]; \ + if (child == NULL) { \ + RTREE_UNLOCK(&rtree->mutex); \ + return (NULL); \ + } \ + } \ + \ + /* \ + * node is a leaf, so it contains values rather than node \ + * pointers. 
\ + */ \ + bits = rtree->level2bits[i]; \ + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - \ + bits); \ + ret = node[subkey]; \ + RTREE_UNLOCK(&rtree->mutex); \ + \ + RTREE_GET_VALIDATE \ + return (ret); \ +} + +#ifdef JEMALLOC_DEBUG +# define RTREE_LOCK(l) malloc_mutex_lock(l) +# define RTREE_UNLOCK(l) malloc_mutex_unlock(l) +# define RTREE_GET_VALIDATE +RTREE_GET_GENERATE(rtree_get_locked) +# undef RTREE_LOCK +# undef RTREE_UNLOCK +# undef RTREE_GET_VALIDATE +#endif + +#define RTREE_LOCK(l) +#define RTREE_UNLOCK(l) +#ifdef JEMALLOC_DEBUG + /* + * Suppose that it were possible for a jemalloc-allocated chunk to be + * munmap()ped, followed by a different allocator in another thread re-using + * overlapping virtual memory, all without invalidating the cached rtree + * value. The result would be a false positive (the rtree would claim that + * jemalloc owns memory that it had actually discarded). This scenario + * seems impossible, but the following assertion is a prudent sanity check. + */ +# define RTREE_GET_VALIDATE \ + assert(rtree_get_locked(rtree, key) == ret); +#else +# define RTREE_GET_VALIDATE +#endif +RTREE_GET_GENERATE(rtree_get) +#undef RTREE_LOCK +#undef RTREE_UNLOCK +#undef RTREE_GET_VALIDATE + +JEMALLOC_INLINE bool +rtree_set(rtree_t *rtree, uintptr_t key, void *val) +{ + uintptr_t subkey; + unsigned i, lshift, height, bits; + void **node, **child; + + malloc_mutex_lock(&rtree->mutex); + for (i = lshift = 0, height = rtree->height, node = rtree->root; + i < height - 1; + i++, lshift += bits, node = child) { + bits = rtree->level2bits[i]; + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - + bits); + child = (void**)node[subkey]; + if (child == NULL) { + child = (void**)base_alloc(sizeof(void *) << + rtree->level2bits[i+1]); + if (child == NULL) { + malloc_mutex_unlock(&rtree->mutex); + return (true); + } + memset(child, 0, sizeof(void *) << + rtree->level2bits[i+1]); + node[subkey] = child; + } + } + + /* node is a leaf, so it contains values rather than node pointers. */ + bits = rtree->level2bits[i]; + subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits); + node[subkey] = val; + malloc_mutex_unlock(&rtree->mutex); + + return (false); +} +#endif + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/stats.h b/jemalloc/include/jemalloc/internal/stats.h index cbf035f..3fc2080 100644 --- a/jemalloc/include/jemalloc/internal/stats.h +++ b/jemalloc/include/jemalloc/internal/stats.h @@ -154,7 +154,7 @@ struct chunk_stats_s { extern bool opt_stats_print; -char *umax2s(uintmax_t x, unsigned base, char *s); +char *u2s(uint64_t x, unsigned base, char *s); #ifdef JEMALLOC_STATS void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque, const char *format, ...) JEMALLOC_ATTR(format(printf, 3, 4)); diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index a8be436..1ad91a9 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -17,7 +17,7 @@ typedef struct tcache_s tcache_t; /* Number of cache slots for large size classes. */ #define TCACHE_NSLOTS_LARGE 20 -/* (1U << opt_lg_tcache_maxclass) is used to compute tcache_maxclass. */ +/* (1U << opt_lg_tcache_max) is used to compute tcache_maxclass. 
*/ #define LG_TCACHE_MAXCLASS_DEFAULT 15 /* @@ -61,12 +61,25 @@ struct tcache_s { #ifdef JEMALLOC_H_EXTERNS extern bool opt_tcache; -extern ssize_t opt_lg_tcache_maxclass; +extern ssize_t opt_lg_tcache_max; extern ssize_t opt_lg_tcache_gc_sweep; /* Map of thread-specific caches. */ +#ifndef NO_TLS extern __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); +# define TCACHE_GET() tcache_tls +# define TCACHE_SET(v) do { \ + tcache_tls = (tcache_t *)(v); \ + pthread_setspecific(tcache_tsd, (void *)(v)); \ +} while (0) +#else +# define TCACHE_GET() ((tcache_t *)pthread_getspecific(tcache_tsd)) +# define TCACHE_SET(v) do { \ + pthread_setspecific(tcache_tsd, (void *)(v)); \ +} while (0) +#endif +extern pthread_key_t tcache_tsd; /* * Number of tcache bins. There are nbins small-object bins, plus 0 or more @@ -122,14 +135,23 @@ tcache_get(void) if ((isthreaded & opt_tcache) == false) return (NULL); - tcache = tcache_tls; - if ((uintptr_t)tcache <= (uintptr_t)1) { + tcache = TCACHE_GET(); + if ((uintptr_t)tcache <= (uintptr_t)2) { if (tcache == NULL) { tcache = tcache_create(choose_arena()); if (tcache == NULL) return (NULL); - } else + } else { + if (tcache == (void *)(uintptr_t)1) { + /* + * Make a note that an allocator function was + * called after the tcache_thread_cleanup() was + * called. + */ + TCACHE_SET((uintptr_t)2); + } return (NULL); + } } return (tcache); @@ -258,9 +280,9 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero) } else { #ifdef JEMALLOC_PROF arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret); - size_t pageind = (unsigned)(((uintptr_t)ret - (uintptr_t)chunk) - >> PAGE_SHIFT); - chunk->map[pageind].bits |= CHUNK_MAP_CLASS_MASK; + size_t pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> + PAGE_SHIFT); + chunk->map[pageind-map_bias].bits &= ~CHUNK_MAP_CLASS_MASK; #endif if (zero == false) { #ifdef JEMALLOC_FILL @@ -299,8 +321,8 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; + mapelm = &chunk->map[pageind-map_bias]; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); @@ -339,7 +361,6 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) arena_chunk_t *chunk; size_t pageind, binind; tcache_bin_t *tbin; - arena_chunk_map_t *mapelm; assert((size & PAGE_MASK) == 0); assert(arena_salloc(ptr) > small_maxclass); @@ -347,8 +368,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT); - mapelm = &chunk->map[pageind]; + pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; binind = nbins + (size >> PAGE_SHIFT) - 1; #ifdef JEMALLOC_FILL diff --git a/jemalloc/include/jemalloc/internal/zone.h b/jemalloc/include/jemalloc/internal/zone.h new file mode 100644 index 0000000..859b529 --- /dev/null +++ b/jemalloc/include/jemalloc/internal/zone.h @@ -0,0 +1,23 @@ +#ifndef JEMALLOC_ZONE +# error "This source file is for zones on Darwin (OS X)." 
+#endif +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +malloc_zone_t *create_zone(void); +void szone2ozone(malloc_zone_t *zone); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/include/jemalloc/jemalloc.h.in b/jemalloc/include/jemalloc/jemalloc.h.in index 8ef8183..4dd3981 100644 --- a/jemalloc/include/jemalloc/jemalloc.h.in +++ b/jemalloc/include/jemalloc/jemalloc.h.in @@ -4,6 +4,9 @@ extern "C" { #endif +#include <limits.h> +#include <strings.h> + #define JEMALLOC_VERSION "@jemalloc_version@" #define JEMALLOC_VERSION_MAJOR @jemalloc_version_major@ #define JEMALLOC_VERSION_MINOR @jemalloc_version_minor@ @@ -16,7 +19,20 @@ extern "C" { # define JEMALLOC_P(s) s #endif -extern const char *JEMALLOC_P(malloc_options); +#define ALLOCM_LG_ALIGN ((int)0x3f) +#if LG_SIZEOF_PTR == 2 +#define ALLOCM_ALIGN(a) (ffs(a)-1) +#else +#define ALLOCM_ALIGN(a) ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31) +#endif +#define ALLOCM_ZERO ((int)0x40) +#define ALLOCM_NO_MOVE ((int)0x80) + +#define ALLOCM_SUCCESS 0 +#define ALLOCM_ERR_OOM 1 +#define ALLOCM_ERR_NOT_MOVED 2 + +extern const char *JEMALLOC_P(malloc_conf); extern void (*JEMALLOC_P(malloc_message))(void *, const char *); void *JEMALLOC_P(malloc)(size_t size) JEMALLOC_ATTR(malloc); @@ -36,6 +52,14 @@ int JEMALLOC_P(mallctlnametomib)(const char *name, size_t *mibp, int JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +int JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) + JEMALLOC_ATTR(nonnull(1)); +int JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, + size_t extra, int flags) JEMALLOC_ATTR(nonnull(1)); +int JEMALLOC_P(sallocm)(const void *ptr, size_t *rsize, int flags) + JEMALLOC_ATTR(nonnull(1)); +int JEMALLOC_P(dallocm)(void *ptr, int flags) JEMALLOC_ATTR(nonnull(1)); + #ifdef __cplusplus }; #endif diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 8b98d67..b8f3f36 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -13,6 +13,7 @@ * the API prefixing. */ #undef JEMALLOC_PREFIX +#undef JEMALLOC_CPREFIX #if (defined(JEMALLOC_PREFIX) && defined(JEMALLOC_MANGLE)) #undef JEMALLOC_P #endif @@ -31,6 +32,9 @@ # define JEMALLOC_ATTR(s) #endif +/* JEMALLOC_CC_SILENCE enables code that silences unuseful compiler warnings. */ +#undef JEMALLOC_CC_SILENCE + /* * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables * inline functions. @@ -92,6 +96,38 @@ /* TLS is used to map arenas and magazine caches to threads. */ #undef NO_TLS +/* + * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside + * within jemalloc-owned chunks before dereferencing them. + */ +#undef JEMALLOC_IVSALLOC + +/* + * Define overrides for non-standard allocator-related functions if they + * are present on the system. 
+ */ +#undef JEMALLOC_OVERRIDE_MEMALIGN +#undef JEMALLOC_OVERRIDE_VALLOC + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#undef JEMALLOC_ZONE +#undef JEMALLOC_ZONE_VERSION + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_DONTNEED) : On Linux, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched. + * madvise(..., MADV_FREE) : On FreeBSD and Darwin, this marks pages as being + * unused, such that they will be discarded rather + * than swapped out. + */ +#undef JEMALLOC_PURGE_MADVISE_DONTNEED +#undef JEMALLOC_PURGE_MADVISE_FREE + /* sizeof(void *) == 2^LG_SIZEOF_PTR. */ #undef LG_SIZEOF_PTR |
