From 8a03cf039cd06f9fa6972711195055d865673966 Mon Sep 17 00:00:00 2001
From: Jason Evans <jasone@canonware.com>
Date: Mon, 4 May 2015 09:58:36 -0700
Subject: Implement cache index randomization for large allocations.

Extract szad size quantization into {extent,run}_quantize(), and
quantize szad run sizes to the union of valid small region run sizes and
large run sizes.
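
The quantize-down idiom that {extent,run}_quantize() implement is simply
"round down to the largest size that a normal request could have
produced".  A toy standalone sketch of that idiom follows; the size table
is hypothetical rather than jemalloc's real size classes, and the real
run_quantize() additionally consults the small-run table and large_pad:

    #include <stddef.h>

    /* Hypothetical quantized sizes, smallest first. */
    static const size_t quantized_sizes[] = {
        4096, 8192, 12288, 16384, 20480, 24576
    };

    /*
     * Round size down to the largest quantized size <= size
     * (assumes size >= quantized_sizes[0]).
     */
    static size_t
    quantize_down(size_t size)
    {
        size_t i, q = quantized_sizes[0];

        for (i = 0; i < sizeof(quantized_sizes) /
            sizeof(quantized_sizes[0]); i++) {
            if (quantized_sizes[i] <= size)
                q = quantized_sizes[i];
        }
        return (q);
    }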

Refactor iteration in arena_run_first_fit() to use
run_quantize{,_first,_next}(), and add support for padded large runs.

For large allocations that have no specified alignment constraints,
compute a pseudo-random offset from the beginning of the first backing
page that is a multiple of the cache line size.  Under typical
configurations with 4-KiB pages and 64-byte cache lines, this results in
a uniform distribution among 64 page boundary offsets.
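
As a point of reference, a minimal standalone sketch of that offset
computation is shown below.  It mirrors the prng64() call added to
arena_malloc_large(), but the LG_PAGE/LG_CACHELINE values and the seed
are assumptions for illustration, not the build-time configuration:

    #include <stdint.h>

    #define LG_PAGE      12  /* assume 4 KiB pages */
    #define LG_CACHELINE  6  /* assume 64-byte cache lines */

    /* Stands in for the per-arena offset_state added by this patch. */
    static uint64_t offset_state = 1;

    static uintptr_t
    random_large_offset(void)
    {
        uint64_t r;

        /* Same LCG step and constants as prng64(). */
        offset_state = (offset_state * UINT64_C(6364136223846793009)) +
            UINT64_C(1442695040888963409);
        /* Keep the top LG_PAGE - LG_CACHELINE bits: a value in [0, 63]. */
        r = offset_state >> (64 - (LG_PAGE - LG_CACHELINE));
        /* Cache-line-aligned offset within one page: 0, 64, ..., 4032. */
        return ((uintptr_t)r << LG_CACHELINE);
    }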

Add the --disable-cache-oblivious option, primarily intended for
performance testing.

This resolves #13.
---
 ChangeLog                                          |   3 +
 INSTALL                                            |   9 +
 configure.ac                                       |  18 ++
 include/jemalloc/internal/arena.h                  |  53 +++--
 include/jemalloc/internal/jemalloc_internal.h.in   |   7 +
 .../jemalloc/internal/jemalloc_internal_defs.h.in  |   6 +
 include/jemalloc/internal/prng.h                   |  12 +-
 src/arena.c                                        | 216 +++++++++++++++++----
 src/extent.c                                       |  25 ++-
 src/jemalloc.c                                     |   3 +-
 10 files changed, 279 insertions(+), 73 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 33139f9..b6fa366 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -101,6 +101,9 @@ found in the git revision history:
     run fragmentation, smaller runs reduce external fragmentation for small size
     classes, and packed (less uniformly aligned) metadata layout improves CPU
     cache set distribution.
+  - Randomly distribute large allocation base pointer alignment relative to page
+    boundaries in order to more uniformly utilize CPU cache sets.  This can be
+    disabled via the --disable-cache-oblivious configure option.
   - Micro-optimize the fast paths for the public API functions.
   - Refactor thread-specific data to reside in a single structure.  This assures
     that only a single TLS read is necessary per call into the public API.
diff --git a/INSTALL b/INSTALL
index cd760ca..8d39687 100644
--- a/INSTALL
+++ b/INSTALL
@@ -185,6 +185,15 @@ any of the following arguments (not a definitive list) to 'configure':
     thread-local variables via the __thread keyword.  If TLS is available,
     jemalloc uses it for several purposes.
 
+--disable-cache-oblivious
+    Disable cache-oblivious large allocation alignment for large allocation
+    requests with no alignment constraints.  If this feature is disabled, all
+    large allocations are page-aligned as an implementation artifact, which can
+    severely harm CPU cache utilization.  However, the cache-oblivious layout
+    comes at the cost of one extra page per large allocation, which in the
+    most extreme case increases physical memory usage for the 16 KiB size class
+    to 20 KiB.
+
 --with-xslroot=<path>
     Specify where to find DocBook XSL stylesheets when building the
     documentation.
diff --git a/configure.ac b/configure.ac
index 05c0d56..bb6f3a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -952,6 +952,23 @@ if test "x$enable_xmalloc" = "x1" ; then
 fi
 AC_SUBST([enable_xmalloc])
 
+dnl Support cache-oblivious allocation alignment by default.
+AC_ARG_ENABLE([cache-oblivious],
+  [AS_HELP_STRING([--disable-cache-oblivious],
+                  [Disable support for cache-oblivious allocation alignment])],
+[if test "x$enable_cache_oblivious" = "xno" ; then
+  enable_cache_oblivious="0"
+else
+  enable_cache_oblivious="1"
+fi
+],
+[enable_cache_oblivious="1"]
+)
+if test "x$enable_cache_oblivious" = "x1" ; then
+  AC_DEFINE([JEMALLOC_CACHE_OBLIVIOUS], [ ])
+fi
+AC_SUBST([enable_cache_oblivious])
+
 dnl ============================================================================
 dnl Check for  __builtin_ffsl(), then ffsl(3), and fail if neither are found.
 dnl One of those two functions should (theoretically) exist on all platforms
@@ -1663,4 +1680,5 @@ AC_MSG_RESULT([xmalloc            : ${enable_xmalloc}])
 AC_MSG_RESULT([munmap             : ${enable_munmap}])
 AC_MSG_RESULT([lazy_lock          : ${enable_lazy_lock}])
 AC_MSG_RESULT([tls                : ${enable_tls}])
+AC_MSG_RESULT([cache-oblivious    : ${enable_cache_oblivious}])
 AC_MSG_RESULT([===============================================================================])
diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
index dff99fb..fba1b81 100644
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@@ -290,6 +290,12 @@ struct arena_s {
 
 	uint64_t		prof_accumbytes;
 
+	/*
+	 * PRNG state for cache index randomization of large allocation base
+	 * pointers.
+	 */
+	uint64_t		offset_state;
+
 	dss_prec_t		dss_prec;
 
 	/*
@@ -394,7 +400,15 @@ struct arena_s {
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS
 
-extern ssize_t	opt_lg_dirty_mult;
+static const size_t	large_pad =
+#ifdef JEMALLOC_CACHE_OBLIVIOUS
+    PAGE
+#else
+    0
+#endif
+    ;
+
+extern ssize_t		opt_lg_dirty_mult;
 
 extern arena_bin_info_t	arena_bin_info[NBINS];
 
@@ -475,7 +489,7 @@ void	arena_stats_merge(arena_t *arena, const char **dss,
     arena_stats_t *astats, malloc_bin_stats_t *bstats,
     malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats);
 arena_t	*arena_new(unsigned ind);
-void	arena_boot(void);
+bool	arena_boot(void);
 void	arena_prefork(arena_t *arena);
 void	arena_postfork_parent(arena_t *arena);
 void	arena_postfork_child(arena_t *arena);
@@ -721,7 +735,7 @@ arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size,
 {
 	size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
 
-	assert((size & PAGE_MASK) == 0);
+	assert(size == PAGE_CEILING(size));
 	assert((flags & ~CHUNK_MAP_FLAGS_MASK) == 0);
 	assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == flags);
 	arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags);
@@ -734,7 +748,7 @@ arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind,
 	size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
 	size_t mapbits = arena_mapbitsp_read(mapbitsp);
 
-	assert((size & PAGE_MASK) == 0);
+	assert(size == PAGE_CEILING(size));
 	assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0);
 	arena_mapbitsp_write(mapbitsp, size | (mapbits & PAGE_MASK));
 }
@@ -747,7 +761,7 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size,
 	size_t mapbits = arena_mapbitsp_read(mapbitsp);
 	size_t unzeroed;
 
-	assert((size & PAGE_MASK) == 0);
+	assert(size == PAGE_CEILING(size));
 	assert((flags & CHUNK_MAP_DIRTY) == flags);
 	unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */
 	arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags
@@ -762,7 +776,8 @@ arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind,
 	size_t mapbits = arena_mapbitsp_read(mapbitsp);
 
 	assert(binind <= BININD_INVALID);
-	assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS);
+	assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS +
+	    large_pad);
 	arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_BININD_MASK) |
 	    (binind << CHUNK_MAP_BININD_SHIFT));
 }
@@ -1107,13 +1122,16 @@ arena_salloc(const void *ptr, bool demote)
 			 * end up looking at binind to determine that ptr is a
 			 * small allocation.
 			 */
-			assert(((uintptr_t)ptr & PAGE_MASK) == 0);
-			ret = arena_mapbits_large_size_get(chunk, pageind);
+			assert(config_cache_oblivious || ((uintptr_t)ptr &
+			    PAGE_MASK) == 0);
+			ret = arena_mapbits_large_size_get(chunk, pageind) -
+			    large_pad;
 			assert(ret != 0);
-			assert(pageind + (ret>>LG_PAGE) <= chunk_npages);
+			assert(pageind + ((ret+large_pad)>>LG_PAGE) <=
+			    chunk_npages);
 			assert(arena_mapbits_dirty_get(chunk, pageind) ==
 			    arena_mapbits_dirty_get(chunk,
-			    pageind+(ret>>LG_PAGE)-1));
+			    pageind+((ret+large_pad)>>LG_PAGE)-1));
 		} else {
 			/*
 			 * Small allocation (possibly promoted to a large
@@ -1157,11 +1175,13 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache)
 			size_t size = arena_mapbits_large_size_get(chunk,
 			    pageind);
 
-			assert(((uintptr_t)ptr & PAGE_MASK) == 0);
+			assert(config_cache_oblivious || ((uintptr_t)ptr &
+			    PAGE_MASK) == 0);
 
-			if (likely(tcache != NULL) && size <= tcache_maxclass)
-				tcache_dalloc_large(tsd, tcache, ptr, size);
-			else {
+			if (likely(tcache != NULL) && size <= tcache_maxclass) {
+				tcache_dalloc_large(tsd, tcache, ptr, size -
+				    large_pad);
+			} else {
 				arena_dalloc_large(extent_node_arena_get(
 				    &chunk->node), chunk, ptr);
 			}
@@ -1188,7 +1208,7 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache)
 				 */
 				assert(((uintptr_t)ptr & PAGE_MASK) == 0);
 				size = arena_mapbits_large_size_get(chunk,
-				    pageind);
+				    pageind) - large_pad;
 			}
 		}
 		assert(s2u(size) == s2u(arena_salloc(ptr, false)));
@@ -1205,7 +1225,8 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache)
 				    &chunk->node), chunk, ptr, pageind);
 			}
 		} else {
-			assert(((uintptr_t)ptr & PAGE_MASK) == 0);
+			assert(config_cache_oblivious || ((uintptr_t)ptr &
+			    PAGE_MASK) == 0);
 
 			if (likely(tcache != NULL) && size <= tcache_maxclass)
 				tcache_dalloc_large(tsd, tcache, ptr, size);
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index b398f31..910ebf7 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -126,6 +126,13 @@ static const bool config_ivsalloc =
     false
 #endif
     ;
+static const bool config_cache_oblivious =
+#ifdef JEMALLOC_CACHE_OBLIVIOUS
+    true
+#else
+    false
+#endif
+    ;
 
 #ifdef JEMALLOC_C11ATOMICS
 #include <stdatomic.h>
diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
index a943d23..ed8347a 100644
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -193,6 +193,12 @@
 #undef JEMALLOC_IVSALLOC
 
 /*
+ * If defined, explicitly attempt to more uniformly distribute large allocation
+ * pointer alignments across all cache indices.
+ */
+#undef JEMALLOC_CACHE_OBLIVIOUS
+
+/*
  * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
  */
 #undef JEMALLOC_ZONE
diff --git a/include/jemalloc/internal/prng.h b/include/jemalloc/internal/prng.h
index c6b1797..216d0ef 100644
--- a/include/jemalloc/internal/prng.h
+++ b/include/jemalloc/internal/prng.h
@@ -26,22 +26,22 @@
  *   const uint32_t a, c : See above discussion.
  */
 #define	prng32(r, lg_range, state, a, c) do {				\
-	assert(lg_range > 0);						\
-	assert(lg_range <= 32);						\
+	assert((lg_range) > 0);						\
+	assert((lg_range) <= 32);					\
 									\
 	r = (state * (a)) + (c);					\
 	state = r;							\
-	r >>= (32 - lg_range);						\
+	r >>= (32 - (lg_range));					\
 } while (false)
 
 /* Same as prng32(), but 64 bits of pseudo-randomness, using uint64_t. */
 #define	prng64(r, lg_range, state, a, c) do {				\
-	assert(lg_range > 0);						\
-	assert(lg_range <= 64);						\
+	assert((lg_range) > 0);						\
+	assert((lg_range) <= 64);					\
 									\
 	r = (state * (a)) + (c);					\
 	state = r;							\
-	r >>= (64 - lg_range);						\
+	r >>= (64 - (lg_range));					\
 } while (false)
 
 #endif /* JEMALLOC_H_TYPES */
diff --git a/src/arena.c b/src/arena.c
index 3041068..a053adf 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -12,6 +12,8 @@ size_t		map_bias;
 size_t		map_misc_offset;
 size_t		arena_maxrun; /* Max run size for arenas. */
 size_t		arena_maxclass; /* Max size class for arenas. */
+static size_t	small_maxrun; /* Max run size used for small size classes. */
+static bool	*small_run_tab; /* Valid small run page multiples. */
 unsigned	nlclasses; /* Number of large size classes. */
 unsigned	nhclasses; /* Number of huge size classes. */
 
@@ -56,33 +58,102 @@ arena_run_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b)
 rb_gen(static UNUSED, arena_run_tree_, arena_run_tree_t, arena_chunk_map_misc_t,
     rb_link, arena_run_comp)
 
+static size_t
+run_quantize(size_t size)
+{
+	size_t qsize;
+
+	assert(size != 0);
+	assert(size == PAGE_CEILING(size));
+
+	/* Don't change sizes that are valid small run sizes. */
+	if (size <= small_maxrun && small_run_tab[size >> LG_PAGE])
+		return (size);
+
+	/*
+	 * Round down to the nearest run size that can actually be requested
+	 * during normal large allocation.  Add large_pad so that cache index
+	 * randomization can offset the allocation from the page boundary.
+	 */
+	qsize = index2size(size2index(size - large_pad + 1) - 1) + large_pad;
+	if (qsize <= SMALL_MAXCLASS + large_pad)
+		return (run_quantize(size - large_pad));
+	assert(qsize <= size);
+	return (qsize);
+}
+
+static size_t
+run_quantize_next(size_t size)
+{
+	size_t large_run_size_next;
+
+	assert(size != 0);
+	assert(size == PAGE_CEILING(size));
+
+	/*
+	 * Return the next quantized size greater than the input size.
+	 * Quantized sizes comprise the union of run sizes that back small
+	 * region runs, and run sizes that back large regions with no explicit
+	 * alignment constraints.
+	 */
+
+	if (size > SMALL_MAXCLASS) {
+		large_run_size_next = PAGE_CEILING(index2size(size2index(size -
+		    large_pad) + 1) + large_pad);
+	} else
+		large_run_size_next = SIZE_T_MAX;
+	if (size >= small_maxrun)
+		return (large_run_size_next);
+
+	while (true) {
+		size += PAGE;
+		assert(size <= small_maxrun);
+		if (small_run_tab[size >> LG_PAGE]) {
+			if (large_run_size_next < size)
+				return (large_run_size_next);
+			return (size);
+		}
+	}
+}
+
+static size_t
+run_quantize_first(size_t size)
+{
+	size_t qsize = run_quantize(size);
+
+	if (qsize < size) {
+		/*
+		 * Skip a quantization that may have an adequately large run,
+		 * because under-sized runs may be mixed in.  This only happens
+		 * when an unusual size is requested, i.e. for aligned
+		 * allocation, and is just one of several places where linear
+		 * search would potentially find sufficiently aligned available
+		 * memory somewhere lower.
+		 */
+		qsize = run_quantize_next(size);
+	}
+	return (qsize);
+}
+
 JEMALLOC_INLINE_C int
 arena_avail_comp(arena_chunk_map_misc_t *a, arena_chunk_map_misc_t *b)
 {
 	int ret;
 	uintptr_t a_miscelm = (uintptr_t)a;
-	size_t a_size;
-	size_t b_size = arena_miscelm_to_bits(b) & ~PAGE_MASK;
-	index_t a_index, b_index;
+	size_t a_qsize;
+	size_t b_qsize = run_quantize(arena_miscelm_to_bits(b) & ~PAGE_MASK);
 
 	if (a_miscelm & CHUNK_MAP_KEY) {
-		a_size = a_miscelm & ~PAGE_MASK;
-		assert(a_size == s2u(a_size));
+		size_t a_size = a_miscelm & ~PAGE_MASK;
+		a_qsize = run_quantize(a_size);
 	} else
-		a_size = arena_miscelm_to_bits(a) & ~PAGE_MASK;
+		a_qsize = run_quantize(arena_miscelm_to_bits(a) & ~PAGE_MASK);
 
 	/*
-	 * Compute the index of the largest size class that the run can satisfy
-	 * a request for.
+	 * Compare based on quantized size rather than size, in order to sort
+	 * equally useful runs only by address.
 	 */
-	a_index = size2index(a_size + 1) - 1;
-	b_index = size2index(b_size + 1) - 1;
-
-	/*
-	 * Compare based on size class index rather than size, in order to
-	 * sort equally useful runs only by address.
-	 */
-	ret = (a_index > b_index) - (a_index < b_index);
+	ret = (a_qsize > b_qsize) - (a_qsize < b_qsize);
 	if (ret == 0) {
 		if (!(a_miscelm & CHUNK_MAP_KEY)) {
 			uintptr_t b_miscelm = (uintptr_t)b;
@@ -913,7 +984,7 @@ static arena_run_t *
 arena_run_first_fit(arena_t *arena, size_t size)
 {
 	arena_run_t *run;
-	index_t index, max_index;
+	size_t search_size, max_size;
 
 	assert(size == s2u(size));
 	assert(size == PAGE_CEILING(size));
@@ -924,14 +995,14 @@ arena_run_first_fit(arena_t *arena, size_t size)
 	 * and choose the lowest of the runs found.
 	 */
 	run = NULL;
-	for (index = size2index(size), max_index = size2index(arena_maxclass);
-	    index <= max_index;) {
+	for (search_size = run_quantize_first(size), max_size =
+	    run_quantize(arena_maxclass + large_pad); search_size <= max_size;
+	    search_size = run_quantize_next(search_size)) {
 		arena_run_t *currun;
 		arena_chunk_t *currun_chunk;
 		size_t currun_pageind, currun_size;
-		size_t usize = PAGE_CEILING(index2size(index));
-		arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *)(usize |
-		    CHUNK_MAP_KEY);
+		arena_chunk_map_misc_t *key = (arena_chunk_map_misc_t *)
+		    (search_size | CHUNK_MAP_KEY);
 		arena_chunk_map_misc_t *miscelm =
 		    arena_avail_tree_nsearch(&arena->runs_avail, key);
 		if (miscelm == NULL)
@@ -939,12 +1010,13 @@ arena_run_first_fit(arena_t *arena, size_t size)
 		currun = &miscelm->run;
 		if (run == NULL || (uintptr_t)currun < (uintptr_t)run)
 			run = currun;
+		/* Skip iteration(s) if run is larger than the search size. */
 		currun_chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(currun);
 		currun_pageind = arena_miscelm_to_pageind(miscelm);
 		currun_size = arena_mapbits_unallocated_size_get(currun_chunk,
 		    currun_pageind);
-		assert(size2index(currun_size) + 1 > index);
-		index = size2index(currun_size) + 1;
+		assert(currun_size >= search_size);
+		search_size = currun_size;
 	}
 
 	return (run);
@@ -966,7 +1038,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero)
 	arena_run_t *run;
 
 	assert(size <= arena_maxrun);
-	assert((size & PAGE_MASK) == 0);
+	assert(size == PAGE_CEILING(size));
 
 	/* Search the arena's chunks for the lowest best fit. */
 	run = arena_run_alloc_large_helper(arena, size, zero);
@@ -994,7 +1066,7 @@ arena_run_alloc_large(arena_t *arena, size_t size, bool zero)
 static arena_run_t *
 arena_run_alloc_small_helper(arena_t *arena, size_t size, index_t binind)
 {
-	arena_run_t *run = arena_run_first_fit(arena, PAGE_CEILING(size));
+	arena_run_t *run = arena_run_first_fit(arena, size);
 	if (run != NULL)
 		arena_run_split_small(arena, run, size, binind);
 	return (run);
@@ -1007,7 +1079,7 @@ arena_run_alloc_small(arena_t *arena, size_t size, index_t binind)
 	arena_run_t *run;
 
 	assert(size <= arena_maxrun);
-	assert((size & PAGE_MASK) == 0);
+	assert(size == PAGE_CEILING(size));
 	assert(binind != BININD_INVALID);
 
 	/* Search the arena's chunks for the lowest best fit. */
@@ -1965,6 +2037,8 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 {
 	void *ret;
 	size_t usize;
+	uint64_t r;
+	uintptr_t random_offset;
 	arena_run_t *run;
 	arena_chunk_map_misc_t *miscelm;
 	UNUSED bool idump;
@@ -1972,13 +2046,25 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
 	/* Large allocation. */
 	usize = s2u(size);
 	malloc_mutex_lock(&arena->lock);
-	run = arena_run_alloc_large(arena, usize, zero);
+	if (config_cache_oblivious) {
+		/*
+		 * Compute a uniformly distributed offset within the first page
+		 * that is a multiple of the cacheline size, e.g. [0 .. 63) * 64
+		 * for 4 KiB pages and 64-byte cachelines.
+		 */
+		prng64(r, LG_PAGE - LG_CACHELINE, arena->offset_state,
+		    UINT64_C(6364136223846793009), UINT64_C(1442695040888963409));
+		random_offset = ((uintptr_t)r) << LG_CACHELINE;
+	} else
+		random_offset = 0;
+	run = arena_run_alloc_large(arena, usize + large_pad, zero);
 	if (run == NULL) {
 		malloc_mutex_unlock(&arena->lock);
 		return (NULL);
 	}
 	miscelm = arena_run_to_miscelm(run);
-	ret = arena_miscelm_to_rpages(miscelm);
+	ret = (void *)((uintptr_t)arena_miscelm_to_rpages(miscelm) +
+	    random_offset);
 	if (config_stats) {
 		index_t index = size2index(usize) - NBINS;
 
@@ -2019,14 +2105,14 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
 	arena_chunk_map_misc_t *miscelm;
 	void *rpages;
 
-	assert((size & PAGE_MASK) == 0);
+	assert(size == PAGE_CEILING(size));
 
 	arena = arena_choose(tsd, arena);
 	if (unlikely(arena == NULL))
 		return (NULL);
 
 	alignment = PAGE_CEILING(alignment);
-	alloc_size = size + alignment - PAGE;
+	alloc_size = size + large_pad + alignment - PAGE;
 
 	malloc_mutex_lock(&arena->lock);
 	run = arena_run_alloc_large(arena, alloc_size, false);
@@ -2041,7 +2127,7 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
 	leadsize = ALIGNMENT_CEILING((uintptr_t)rpages, alignment) -
 	    (uintptr_t)rpages;
 	assert(alloc_size >= leadsize + size);
-	trailsize = alloc_size - leadsize - size;
+	trailsize = alloc_size - leadsize - size - large_pad;
 	if (leadsize != 0) {
 		arena_chunk_map_misc_t *head_miscelm = miscelm;
 		arena_run_t *head_run = run;
@@ -2055,10 +2141,10 @@ arena_palloc_large(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment,
 		    alloc_size - leadsize);
 	}
 	if (trailsize != 0) {
-		arena_run_trim_tail(arena, chunk, run, size + trailsize, size,
-		    false);
+		arena_run_trim_tail(arena, chunk, run, size + large_pad +
+		    trailsize, size + large_pad, false);
 	}
-	arena_run_init_large(arena, run, size, zero);
+	arena_run_init_large(arena, run, size + large_pad, zero);
 	ret = arena_miscelm_to_rpages(miscelm);
 
 	if (config_stats) {
@@ -2088,7 +2174,8 @@ arena_palloc(tsd_t *tsd, arena_t *arena, size_t usize, size_t alignment,
 {
 	void *ret;
 
-	if (usize <= SMALL_MAXCLASS && alignment < PAGE)
+	if (usize <= SMALL_MAXCLASS && (alignment < PAGE || (alignment == PAGE
+	    && (usize & PAGE_MASK) == 0)))
 		ret = arena_malloc(tsd, arena, usize, zero, tcache);
 	else {
 		if (likely(usize <= arena_maxclass)) {
@@ -2292,7 +2379,8 @@ arena_dalloc_large_locked_impl(arena_t *arena, arena_chunk_t *chunk,
 	arena_run_t *run = &miscelm->run;
 
 	if (config_fill || config_stats) {
-		size_t usize = arena_mapbits_large_size_get(chunk, pageind);
+		size_t usize = arena_mapbits_large_size_get(chunk, pageind) -
+		    large_pad;
 
 		if (!junked)
 			arena_dalloc_junk_large(ptr, usize);
@@ -2341,7 +2429,8 @@ arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, void *ptr,
 	 * allocations.
 	 */
 	malloc_mutex_lock(&arena->lock);
-	arena_run_trim_tail(arena, chunk, run, oldsize, size, true);
+	arena_run_trim_tail(arena, chunk, run, oldsize + large_pad, size +
+	    large_pad, true);
 	if (config_stats) {
 		index_t oldindex = size2index(oldsize) - NBINS;
 		index_t index = size2index(size) - NBINS;
@@ -2370,7 +2459,8 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr,
 	size_t followsize;
 	size_t usize_min = s2u(size);
 
-	assert(oldsize == arena_mapbits_large_size_get(chunk, pageind));
+	assert(oldsize == arena_mapbits_large_size_get(chunk, pageind) -
+	    large_pad);
 
 	/* Try to extend the run. */
 	assert(usize_min > oldsize);
@@ -2391,7 +2481,7 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr,
 		while (oldsize + followsize < usize)
 			usize = index2size(size2index(usize)-1);
 		assert(usize >= usize_min);
-		splitsize = usize - oldsize;
+		splitsize = usize - oldsize + large_pad;
 
 		run = &arena_miscelm_get(chunk, pageind+npages)->run;
 		arena_run_split_large(arena, run, splitsize, zero);
@@ -2755,6 +2845,18 @@ arena_new(unsigned ind)
 	if (config_prof)
 		arena->prof_accumbytes = 0;
 
+	if (config_cache_oblivious) {
+		/*
+		 * A nondeterministic seed based on the address of arena reduces
+		 * the likelihood of lockstep non-uniform cache index
+		 * utilization among identical concurrent processes, but at the
+		 * cost of test repeatability.  For debug builds, instead use a
+		 * deterministic seed.
+		 */
+		arena->offset_state = config_debug ? ind :
+		    (uint64_t)(uintptr_t)arena;
+	}
+
 	arena->dss_prec = chunk_dss_prec_get();
 
 	arena->spare = NULL;
@@ -2890,6 +2992,9 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info)
 	bin_info->reg0_offset = actual_run_size - (actual_nregs *
 	    bin_info->reg_interval) - pad_size + bin_info->redzone_size;
 
+	if (actual_run_size > small_maxrun)
+		small_maxrun = actual_run_size;
+
 	assert(bin_info->reg0_offset - bin_info->redzone_size + (bin_info->nregs
 	    * bin_info->reg_interval) + pad_size == bin_info->run_size);
 }
@@ -2899,7 +3004,7 @@ bin_info_init(void)
 {
 	arena_bin_info_t *bin_info;
 
-#define	BIN_INFO_INIT_bin_yes(index, size) \
+#define	BIN_INFO_INIT_bin_yes(index, size)				\
 	bin_info = &arena_bin_info[index];				\
 	bin_info->reg_size = size;					\
 	bin_info_run_size_calc(bin_info);				\
@@ -2913,7 +3018,33 @@ bin_info_init(void)
 #undef SC
 }
 
-void
+static bool
+small_run_size_init(void)
+{
+
+	assert(small_maxrun != 0);
+
+	small_run_tab = (bool *)base_alloc(sizeof(bool) * (small_maxrun >>
+	    LG_PAGE));
+	if (small_run_tab == NULL)
+		return (true);
+
+#define	TAB_INIT_bin_yes(index, size) {					\
+		arena_bin_info_t *bin_info = &arena_bin_info[index];	\
+		small_run_tab[bin_info->run_size >> LG_PAGE] = true;	\
+	}
+#define	TAB_INIT_bin_no(index, size)
+#define	SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup)	\
+	TAB_INIT_bin_##bin(index, (ZU(1)<<lg_grp) + (ZU(ndelta)<<lg_delta))
+	SIZE_CLASSES
+#undef TAB_INIT_bin_yes
+#undef TAB_INIT_bin_no
+#undef SC
+
+	return (false);
+}
+
+bool
 arena_boot(void)
 {
 	size_t header_size;
@@ -2961,6 +3092,7 @@ arena_boot(void)
 	nhclasses = NSIZES - nlclasses - NBINS;
 
 	bin_info_init();
+	return (small_run_size_init());
 }
 
 void
diff --git a/src/extent.c b/src/extent.c
index e16f8f6..13f9441 100644
--- a/src/extent.c
+++ b/src/extent.c
@@ -3,20 +3,29 @@
 
 /******************************************************************************/
 
+JEMALLOC_INLINE_C size_t
+extent_quantize(size_t size)
+{
+
+	/*
+	 * Round down to the nearest chunk size that can actually be requested
+	 * during normal huge allocation.
+	 */
+	return (index2size(size2index(size + 1) - 1));
+}
+
 JEMALLOC_INLINE_C int
 extent_szad_comp(extent_node_t *a, extent_node_t *b)
 {
 	int ret;
-	size_t a_size = extent_node_size_get(a);
-	size_t b_size = extent_node_size_get(b);
+	size_t a_qsize = extent_quantize(extent_node_size_get(a));
+	size_t b_qsize = extent_quantize(extent_node_size_get(b));
+
 	/*
-	 * Compute the index of the largest size class that the chunk can
-	 * satisfy a request for.
+	 * Compare based on quantized size rather than size, in order to sort
+	 * equally useful extents only by address.
 	 */
-	size_t a_index = size2index(a_size + 1) - 1;
-	size_t b_index = size2index(b_size + 1) - 1;
-
-	ret = (a_index > b_index) - (a_index < b_index);
+	ret = (a_qsize > b_qsize) - (a_qsize < b_qsize);
 	if (ret == 0) {
 		uintptr_t a_addr = (uintptr_t)extent_node_addr_get(a);
 		uintptr_t b_addr = (uintptr_t)extent_node_addr_get(b);
diff --git a/src/jemalloc.c b/src/jemalloc.c
index a2d1c5c..7f26652 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1182,7 +1182,8 @@ malloc_init_hard_a0_locked(void)
 		return (true);
 	if (config_prof)
 		prof_boot1();
-	arena_boot();
+	if (arena_boot())
+		return (true);
 	if (config_tcache && tcache_boot())
 		return (true);
 	if (malloc_mutex_init(&arenas_lock))
-- 
cgit v0.12