author | Qi Wang <interwq@gwu.edu> | 2015-10-27 22:12:10 (GMT) |
---|---|---|
committer | Jason Evans <je@fb.com> | 2015-11-10 22:28:34 (GMT) |
commit | f4a0f32d340985de477bbe329ecdaecd69ed1055 | |
tree | a148610f4d2253186c59e671dcb065ce3647d2f5 /src/tcache.c | |
parent | 710ca112e31e8621177d08162f60158c27dd2974 | |
Fast-path improvement: reduce # of branches and unnecessary operations.
- Combine multiple runtime branches into a single malloc_slow check (see the sketch after this list).
- Avoid calling arena_choose / size2index / index2size on fast path.
- A few micro optimizations.
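The first item follows a common pattern: fold several rarely-true runtime conditions into one boolean computed at initialization, so the allocation fast path pays for a single well-predicted branch instead of one branch per feature. Below is a minimal standalone sketch of that idea, not jemalloc's actual code; the names `opt_zero`, `opt_junk`, `opt_profile`, `malloc_slow_init`, and `my_malloc` are hypothetical.

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Rarely-true feature flags, normally fixed at startup. */
static bool opt_zero;    /* zero-fill allocations */
static bool opt_junk;    /* junk-fill allocations */
static bool opt_profile; /* heap profiling enabled */

/* Folded result: true iff any slow-path feature is active. */
static bool malloc_slow;

static void
malloc_slow_init(void)
{
	/* Pay for the individual tests once, not per allocation. */
	malloc_slow = opt_zero || opt_junk || opt_profile;
}

static void *
my_malloc(size_t size)
{
	if (!malloc_slow) {
		/* Fast path: one branch covers all rare features. */
		return (malloc(size));
	}
	/* Slow path: only here re-examine the individual flags. */
	if (opt_profile) {
		/* ... record a sample ... */
	}
	return (malloc(size));
}

int
main(void)
{
	malloc_slow_init();
	void *p = my_malloc(64);

	printf("%p\n", p);
	free(p);
	return (0);
}
```

Since the slow path is taken only when at least one feature is active, it can afford to re-test the individual flags there.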
Diffstat (limited to 'src/tcache.c')
-rw-r--r-- | src/tcache.c | 33 |
1 file changed, 19 insertions, 14 deletions
```diff
diff --git a/src/tcache.c b/src/tcache.c
index fdafd0c..78c6230 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -72,7 +72,7 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache)
 
 void *
 tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
-    tcache_bin_t *tbin, szind_t binind)
+    tcache_bin_t *tbin, szind_t binind, bool *tcache_success)
 {
 	void *ret;
 
@@ -80,7 +80,7 @@ tcache_alloc_small_hard(tsd_t *tsd, arena_t *arena, tcache_t *tcache,
 	    tcache->prof_accumbytes : 0);
 	if (config_prof)
 		tcache->prof_accumbytes = 0;
-	ret = tcache_alloc_easy(tbin);
+	ret = tcache_alloc_easy(tbin, tcache_success);
 
 	return (ret);
 }
@@ -102,7 +102,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena bin associated with the first object. */
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
-		    tbin->avail[0]);
+		    *(tbin->avail - 1));
 		arena_t *bin_arena = extent_node_arena_get(&chunk->node);
 		arena_bin_t *bin = &bin_arena->bins[binind];
 
@@ -122,7 +122,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 		}
 		ndeferred = 0;
 		for (i = 0; i < nflush; i++) {
-			ptr = tbin->avail[i];
+			ptr = *(tbin->avail - 1 - i);
 			assert(ptr != NULL);
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 			if (extent_node_arena_get(&chunk->node) == bin_arena) {
@@ -139,7 +139,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 				 * locked.  Stash the object, so that it can be
 				 * handled in a future pass.
 				 */
-				tbin->avail[ndeferred] = ptr;
+				*(tbin->avail - 1 - ndeferred) = ptr;
 				ndeferred++;
 			}
 		}
@@ -158,8 +158,8 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
 		malloc_mutex_unlock(&bin->lock);
 	}
 
-	memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
-	    rem * sizeof(void *));
+	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+	    sizeof(void *));
 	tbin->ncached = rem;
 	if ((int)tbin->ncached < tbin->low_water)
 		tbin->low_water = tbin->ncached;
@@ -182,7 +182,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 	for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) {
 		/* Lock the arena associated with the first object. */
 		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(
-		    tbin->avail[0]);
+		    *(tbin->avail - 1));
 		arena_t *locked_arena = extent_node_arena_get(&chunk->node);
 		UNUSED bool idump;
 
@@ -206,7 +206,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 		}
 		ndeferred = 0;
 		for (i = 0; i < nflush; i++) {
-			ptr = tbin->avail[i];
+			ptr = *(tbin->avail - 1 - i);
 			assert(ptr != NULL);
 			chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
 			if (extent_node_arena_get(&chunk->node) ==
@@ -220,7 +220,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 				 * Stash the object, so that it can be handled
 				 * in a future pass.
 				 */
-				tbin->avail[ndeferred] = ptr;
+				*(tbin->avail - 1 - ndeferred) = ptr;
 				ndeferred++;
 			}
 		}
@@ -241,8 +241,8 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
 		malloc_mutex_unlock(&arena->lock);
 	}
 
-	memmove(tbin->avail, &tbin->avail[tbin->ncached - rem],
-	    rem * sizeof(void *));
+	memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
+	    sizeof(void *));
 	tbin->ncached = rem;
 	if ((int)tbin->ncached < tbin->low_water)
 		tbin->low_water = tbin->ncached;
@@ -333,9 +333,14 @@ tcache_create(tsd_t *tsd, arena_t *arena)
 	assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
 	for (i = 0; i < nhbins; i++) {
 		tcache->tbins[i].lg_fill_div = 1;
+		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
+		/*
+		 * avail points past the available space.  Allocations will
+		 * access the slots toward higher addresses (for the benefit of
+		 * prefetch).
+		 */
 		tcache->tbins[i].avail = (void **)((uintptr_t)tcache +
 		    (uintptr_t)stack_offset);
-		stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
 	}
 
 	return (tcache);
@@ -379,7 +384,7 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache)
 	    arena_prof_accum(arena, tcache->prof_accumbytes))
 		prof_idump();
 
-	idalloctm(tsd, tcache, false, true);
+	idalloctm(tsd, tcache, false, true, true);
 }
 
 void
```
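The recurring `tbin->avail[i]` → `*(tbin->avail - 1 - i)` rewrites above reflect the new layout described by the comment added in `tcache_create`: `avail` now points just past the bin's slot array, and the `ncached` cached objects occupy the slots immediately below it. The toy program below sketches that discipline, including the `tcache_success` out-parameter style visible in the new `tcache_alloc_small_hard` signature; the names `toy_bin_t`, `toy_push`, `toy_pop`, and `NSLOTS` are hypothetical stand-ins, not jemalloc's actual `tcache_bin_t` API.

```c
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 8

/* Miniature of a tcache bin after this change. */
typedef struct {
	void **avail;	/* points one past the slot array */
	int ncached;	/* occupied slots: avail[-ncached .. -1] */
} toy_bin_t;

static void *slots[NSLOTS];

/*
 * Deallocation pushes downward: the newest object sits at the
 * lowest occupied address.
 */
static void
toy_push(toy_bin_t *tbin, void *ptr)
{
	assert(tbin->ncached < NSLOTS);
	tbin->ncached++;
	*(tbin->avail - tbin->ncached) = ptr;
}

/*
 * Allocation pops the newest object.  Success is reported via an
 * out-parameter so the caller can branch on a boolean instead of
 * comparing the returned pointer against NULL.
 */
static void *
toy_pop(toy_bin_t *tbin, bool *success)
{
	void *ret;

	if (tbin->ncached == 0) {
		*success = false;
		return (NULL);
	}
	*success = true;
	ret = *(tbin->avail - tbin->ncached);
	tbin->ncached--;
	return (ret);
}

int
main(void)
{
	toy_bin_t tbin = { slots + NSLOTS, 0 };
	int a, b;
	bool ok;

	toy_push(&tbin, &a);
	toy_push(&tbin, &b);
	printf("%d\n", toy_pop(&tbin, &ok) == &b);	/* LIFO: prints 1 */
	printf("%d\n", toy_pop(&tbin, &ok) == &a);	/* prints 1 */
	toy_pop(&tbin, &ok);
	printf("%d\n", ok);				/* empty: prints 0 */
	return (0);
}
```

Because a pop reads `*(avail - ncached)` and then decrements `ncached`, consecutive pops touch ascending addresses, which is the prefetch benefit the in-diff comment refers to; the flush loops instead walk downward from `*(avail - 1)`, the oldest end of the stack, and the `memmove` keeps the remaining `rem` newest objects packed just below `avail`.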