author      Jason Evans <je@fb.com>    2011-03-16 17:30:13 (GMT)
committer   Jason Evans <je@fb.com>    2011-03-17 23:29:32 (GMT)
commit      84c8eefeffa246607790ad12e28b0f6a24ecc59d (patch)
tree        3b62221829ca737794d161f36ad0683016a63771 /jemalloc/src
parent      77f350be08c8b9cd03ceed820b3113dbac9b4151 (diff)
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions are
  available. Furthermore, revert to preferring the lowest available region
  (as jemalloc did with its old bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into tcache_bin_info_t, and
  add a contiguous array of pointers to tcache_t in order to track cached
  objects. This substantially increases the size of tcache_t, but results in
  much higher data locality for common tcache operations. As a side benefit,
  it is again possible to efficiently flush the least recently used cached
  objects, so this change switches flushing from MRU to LRU.
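To make the tcache_t change concrete, here is a minimal sketch of the new
layout and of LRU flushing, using simplified, hypothetical types and helper
names (tbin_push, tbin_pop, tbin_flush); the real structures and flush logic
live in jemalloc's internal headers and in tcache.c, as shown in the diff
below.

```c
#include <assert.h>
#include <string.h>

/* Read-only limits shared by all threads (one entry per bin). */
typedef struct {
	unsigned	ncached_max;	/* Upper limit on cached objects. */
} tcache_bin_info_t;

/* Per-thread bin: a contiguous stack of cached object pointers. */
typedef struct {
	unsigned	ncached;	/* Current number of cached objects. */
	unsigned	low_water;	/* Minimum ncached since last GC sweep. */
	void		**avail;	/* avail[0] is the oldest (LRU) object. */
} tcache_bin_t;

/* Cache fill: push a freshly allocated object onto the stack. */
static void
tbin_push(tcache_bin_t *tbin, const tcache_bin_info_t *tbin_info, void *ptr)
{

	assert(tbin->ncached < tbin_info->ncached_max);
	tbin->avail[tbin->ncached++] = ptr;
}

/* Cache hit: pop the most recently cached object (hot end of the stack). */
static void *
tbin_pop(tcache_bin_t *tbin)
{

	if (tbin->ncached == 0)
		return (NULL);
	return (tbin->avail[--tbin->ncached]);
}

/*
 * Flush all but rem objects, oldest first (LRU), then slide the survivors
 * down so the stack stays contiguous -- mirroring the memmove() calls in
 * tcache_bin_flush_small()/_large().
 */
static void
tbin_flush(tcache_bin_t *tbin, unsigned rem, void (*dalloc)(void *))
{
	unsigned i, nflush = tbin->ncached - rem;

	for (i = 0; i < nflush; i++)
		dalloc(tbin->avail[i]);
	memmove(tbin->avail, &tbin->avail[nflush], rem * sizeof(void *));
	tbin->ncached = rem;
	if (tbin->ncached < tbin->low_water)
		tbin->low_water = tbin->ncached;
}
```

Because the cached pointers now sit in one contiguous array inside tcache_t, a
fill or flush walks sequential memory rather than chasing per-object links,
which is where the locality win comes from.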
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
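As a rough illustration of the multi-level summary idea, the sketch below uses
just two levels and assumed names (two_level_bitmap_t, bitmap_sfu_sketch,
bitmap_unset_sketch). Here a set bit means "region free"; the real
implementation inverts the bit sense relative to its external interface and
generalizes to as many summary levels as the maximum bitmap size requires.

```c
#include <assert.h>
#include <limits.h>
#include <stddef.h>

#define	GROUP_NBITS	(sizeof(unsigned long) * CHAR_BIT)

typedef struct {
	unsigned long	summary;	/* One bit per base-level group. */
	unsigned long	*groups;	/* Base-level bits, one per region. */
} two_level_bitmap_t;

/* Find, clear, and return the index of the lowest free region. */
static size_t
bitmap_sfu_sketch(two_level_bitmap_t *b)
{
	size_t grp, bit;

	assert(b->summary != 0);	/* At least one region is free. */
	grp = (size_t)__builtin_ctzl(b->summary);
	bit = (size_t)__builtin_ctzl(b->groups[grp]);
	b->groups[grp] &= ~(1UL << bit);	/* Mark region allocated. */
	if (b->groups[grp] == 0)		/* Group exhausted: */
		b->summary &= ~(1UL << grp);	/* propagate to the summary. */
	return (grp * GROUP_NBITS + bit);
}

/* Free a region: set its bit again and re-mark its group in the summary. */
static void
bitmap_unset_sketch(two_level_bitmap_t *b, size_t regind)
{
	size_t grp = regind / GROUP_NBITS;
	size_t bit = regind % GROUP_NBITS;

	b->groups[grp] |= 1UL << bit;
	b->summary |= 1UL << grp;
}
```

Finding the lowest free region is then two find-first-set operations plus a
couple of mask updates, independent of how many regions the run contains (up
to one summary word's worth of groups in this simplified two-level form).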
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
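A minimal sketch of what such a debug-only assertion macro can look like,
assuming the JEMALLOC_DEBUG configuration macro mentioned above (the actual
definition lives in jemalloc's internal headers):

```c
#include <assert.h>

/*
 * dassert() compiles away entirely unless jemalloc itself was built with
 * debugging enabled, independent of whether the embedding application
 * defines NDEBUG.
 */
#ifdef JEMALLOC_DEBUG
#  define dassert(e)	assert(e)
#else
#  define dassert(e)
#endif
```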
Diffstat (limited to 'jemalloc/src')

-rw-r--r--  jemalloc/src/arena.c    | 107
-rw-r--r--  jemalloc/src/bitmap.c   |  90
-rw-r--r--  jemalloc/src/ckh.c      |  12
-rw-r--r--  jemalloc/src/jemalloc.c |   5
-rw-r--r--  jemalloc/src/tcache.c   | 129

5 files changed, 242 insertions, 101 deletions
diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index e49b8ed..87bd9bb 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -253,59 +253,45 @@ static inline void * arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { void *ret; + unsigned regind; + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); assert(run->nfree > 0); + assert(bitmap_full(bitmap, &bin_info->bitmap_info) == false); + regind = bitmap_sfu(bitmap, &bin_info->bitmap_info); + ret = (void *)((uintptr_t)run + (uintptr_t)bin_info->reg0_offset + + (uintptr_t)(bin_info->reg_size * regind)); run->nfree--; - ret = run->avail; - if (ret != NULL) { - /* Double free can cause assertion failure.*/ - assert(ret != NULL); - /* Write-after free can cause assertion failure. */ - assert((uintptr_t)ret >= (uintptr_t)run + - (uintptr_t)bin_info->reg0_offset); - assert((uintptr_t)ret < (uintptr_t)run->next); - assert(((uintptr_t)ret - ((uintptr_t)run + - (uintptr_t)bin_info->reg0_offset)) % - (uintptr_t)bin_info->reg_size == 0); - run->avail = *(void **)ret; - return (ret); - } - ret = run->next; - run->next = (void *)((uintptr_t)ret + (uintptr_t)bin_info->reg_size); - assert(ret != NULL); + if (regind == run->nextind) + run->nextind++; + assert(regind < run->nextind); return (ret); } static inline void arena_run_reg_dalloc(arena_run_t *run, void *ptr) { - -#ifndef NDEBUG arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; + unsigned regind = arena_run_regind(run, bin_info, ptr); + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); + assert(run->nfree < bin_info->nregs); /* Freeing an interior pointer can cause assertion failure. */ assert(((uintptr_t)ptr - ((uintptr_t)run + (uintptr_t)bin_info->reg0_offset)) % (uintptr_t)bin_info->reg_size == 0); - /* - * Freeing a pointer lower than region zero can cause assertion - * failure. - */ assert((uintptr_t)ptr >= (uintptr_t)run + (uintptr_t)bin_info->reg0_offset); - /* - * Freeing a pointer past in the run's frontier can cause assertion - * failure. - */ - assert((uintptr_t)ptr < (uintptr_t)run->next); -#endif + /* Freeing an unallocated pointer can cause assertion failure. */ + assert(bitmap_get(bitmap, &bin_info->bitmap_info, regind)); - *(void **)ptr = run->avail; - run->avail = ptr; + bitmap_unset(bitmap, &bin_info->bitmap_info, regind); run->nfree++; } @@ -772,7 +758,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) chunk + (uintptr_t)(pageind << PAGE_SHIFT)); assert((mapelm->bits >> PAGE_SHIFT) == 0); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(arena, run->bin); arena_bin_info_t *bin_info = @@ -1224,12 +1210,14 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) malloc_mutex_lock(&arena->lock); run = arena_run_alloc(arena, bin_info->run_size, false, false); if (run != NULL) { + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); + /* Initialize run internals. 
*/ run->bin = bin; - run->avail = NULL; - run->next = (void *)((uintptr_t)run + - (uintptr_t)bin_info->reg0_offset); + run->nextind = 0; run->nfree = bin_info->nregs; + bitmap_init(bitmap, &bin_info->bitmap_info); #ifdef JEMALLOC_DEBUG run->magic = ARENA_RUN_MAGIC; #endif @@ -1289,12 +1277,11 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) bin->runcur = NULL; run = arena_bin_nonfull_run_get(arena, bin); if (bin->runcur != NULL && bin->runcur->nfree > 0) { - /* * Another thread updated runcur while this one ran without the * bin lock in arena_bin_nonfull_run_get(). */ - assert(bin->runcur->magic == ARENA_RUN_MAGIC); + dassert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); ret = arena_run_reg_alloc(bin->runcur, bin_info); if (run != NULL) { @@ -1302,7 +1289,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) /* * arena_run_alloc() may have allocated run, or it may - * have pulled it from the bin's run tree. Therefore + * have pulled run from the bin's run tree. Therefore * it is unsafe to make any assumptions about how run * has previously been used, and arena_bin_lower_run() * must be called, as if a region were just deallocated @@ -1322,7 +1309,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) bin->runcur = run; - assert(bin->runcur->magic == ARENA_RUN_MAGIC); + dassert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); return (arena_run_reg_alloc(bin->runcur, bin_info)); @@ -1365,15 +1352,15 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind #endif bin = &arena->bins[binind]; malloc_mutex_lock(&bin->lock); - for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) { + for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> 1); + i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) break; - *(void **)ptr = tbin->avail; - tbin->avail = ptr; + tbin->avail[i] = ptr; } #ifdef JEMALLOC_STATS bin->stats.allocated += (i - tbin->ncached) * @@ -1607,7 +1594,7 @@ arena_salloc(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + @@ -1660,7 +1647,7 @@ arena_salloc_demote(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + @@ -1730,8 +1717,9 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, /******************************/ npages = bin_info->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk) - >> PAGE_SHIFT); + past = (size_t)(PAGE_CEILING((uintptr_t)run + + (uintptr_t)bin_info->reg0_offset + (uintptr_t)(run->nextind * + bin_info->reg_size) - (uintptr_t)chunk) >> PAGE_SHIFT); malloc_mutex_lock(&arena->lock); /* @@ -1817,7 +1805,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t 
*chunk, void *ptr, pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; size_t binind = arena_bin_index(arena, bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; @@ -2065,7 +2053,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - assert(arena->magic == ARENA_MAGIC); + dassert(arena->magic == ARENA_MAGIC); if (psize < oldsize) { #ifdef JEMALLOC_FILL @@ -2405,8 +2393,8 @@ small_size2bin_init_hard(void) * *) bin_info->run_size <= arena_maxclass * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). * - * bin_info->nregs and bin_info->reg0_offset are also calculated here, since - * these settings are all interdependent. + * bin_info->nregs, bin_info->bitmap_offset, and bin_info->reg0_offset are also + * calculated here, since these settings are all interdependent. */ static size_t bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) @@ -2414,6 +2402,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) size_t try_run_size, good_run_size; uint32_t try_nregs, good_nregs; uint32_t try_hdr_size, good_hdr_size; + uint32_t try_bitmap_offset, good_bitmap_offset; #ifdef JEMALLOC_PROF uint32_t try_ctx0_offset, good_ctx0_offset; #endif @@ -2438,6 +2427,11 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) do { try_nregs--; try_hdr_size = sizeof(arena_run_t); + /* Pad to a long boundary. */ + try_hdr_size = LONG_CEILING(try_hdr_size); + try_bitmap_offset = try_hdr_size; + /* Add space for bitmap. */ + try_hdr_size += bitmap_size(try_nregs); #ifdef JEMALLOC_PROF if (opt_prof && prof_promote == false) { /* Pad to a quantum boundary. */ @@ -2460,6 +2454,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) good_run_size = try_run_size; good_nregs = try_nregs; good_hdr_size = try_hdr_size; + good_bitmap_offset = try_bitmap_offset; #ifdef JEMALLOC_PROF good_ctx0_offset = try_ctx0_offset; #endif @@ -2473,6 +2468,11 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) do { try_nregs--; try_hdr_size = sizeof(arena_run_t); + /* Pad to a long boundary. */ + try_hdr_size = LONG_CEILING(try_hdr_size); + try_bitmap_offset = try_hdr_size; + /* Add space for bitmap. */ + try_hdr_size += bitmap_size(try_nregs); #ifdef JEMALLOC_PROF if (opt_prof && prof_promote == false) { /* Pad to a quantum boundary. */ @@ -2498,6 +2498,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) /* Copy final settings. */ bin_info->run_size = good_run_size; bin_info->nregs = good_nregs; + bin_info->bitmap_offset = good_bitmap_offset; #ifdef JEMALLOC_PROF bin_info->ctx0_offset = good_ctx0_offset; #endif @@ -2525,6 +2526,7 @@ bin_info_init(void) bin_info = &arena_bin_info[i]; bin_info->reg_size = (1U << (LG_TINY_MIN + i)); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } #endif @@ -2533,6 +2535,7 @@ bin_info_init(void) bin_info = &arena_bin_info[i]; bin_info->reg_size = (i - ntbins + 1) << LG_QUANTUM; prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } /* Cacheline-spaced bins. 
*/ @@ -2541,6 +2544,7 @@ bin_info_init(void) bin_info->reg_size = cspace_min + ((i - (ntbins + nqbins)) << LG_CACHELINE); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } /* Subpage-spaced bins. */ @@ -2549,6 +2553,7 @@ bin_info_init(void) bin_info->reg_size = sspace_min + ((i - (ntbins + nqbins + ncbins)) << LG_SUBPAGE); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } return (false); diff --git a/jemalloc/src/bitmap.c b/jemalloc/src/bitmap.c new file mode 100644 index 0000000..b47e262 --- /dev/null +++ b/jemalloc/src/bitmap.c @@ -0,0 +1,90 @@ +#define JEMALLOC_BITMAP_C_ +#include "jemalloc/internal/jemalloc_internal.h" + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +static size_t bits2groups(size_t nbits); + +/******************************************************************************/ + +static size_t +bits2groups(size_t nbits) +{ + + return ((nbits >> LG_BITMAP_GROUP_NBITS) + + !!(nbits & BITMAP_GROUP_NBITS_MASK)); +} + +void +bitmap_info_init(bitmap_info_t *binfo, size_t nbits) +{ + unsigned i; + size_t group_count; + + assert(nbits > 0); + assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS)); + + /* + * Compute the number of groups necessary to store nbits bits, and + * progressively work upward through the levels until reaching a level + * that requires only one group. + */ + binfo->levels[0].group_offset = 0; + group_count = bits2groups(nbits); + for (i = 1; group_count > 1; i++) { + assert(i < BITMAP_MAX_LEVELS); + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + group_count = bits2groups(group_count); + } + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + binfo->nlevels = i; + binfo->nbits = nbits; +} + +size_t +bitmap_info_ngroups(const bitmap_info_t *binfo) +{ + + return (binfo->levels[binfo->nlevels].group_offset << LG_SIZEOF_BITMAP); +} + +size_t +bitmap_size(size_t nbits) +{ + bitmap_info_t binfo; + + bitmap_info_init(&binfo, nbits); + return (bitmap_info_ngroups(&binfo)); +} + +void +bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo) +{ + size_t extra; + unsigned i; + + /* + * Bits are actually inverted with regard to the external bitmap + * interface, so the bitmap starts out with all 1 bits, except for + * trailing unused bits (if any). Note that each group uses bit 0 to + * correspond to the first logical bit in the group, so extra bits + * are the most significant bits of the last group. 
+ */ + memset(bitmap, 0xffU, binfo->levels[binfo->nlevels].group_offset << + LG_SIZEOF_BITMAP); + extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK)) + & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) + bitmap[binfo->levels[1].group_offset - 1] >>= extra; + for (i = 1; i < binfo->nlevels; i++) { + size_t group_count = binfo->levels[i].group_offset - + binfo->levels[i-1].group_offset; + extra = (BITMAP_GROUP_NBITS - (group_count & + BITMAP_GROUP_NBITS_MASK)) & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) + bitmap[binfo->levels[i+1].group_offset - 1] >>= extra; + } +} diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index e386a53..75ae7fd 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -73,7 +73,7 @@ ckh_isearch(ckh_t *ckh, const void *key) size_t hash1, hash2, bucket, cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); ckh->hash(key, ckh->lg_curbuckets, &hash1, &hash2); @@ -396,7 +396,7 @@ ckh_delete(ckh_t *ckh) { assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); #ifdef CKH_VERBOSE malloc_printf( @@ -421,7 +421,7 @@ ckh_count(ckh_t *ckh) { assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); return (ckh->count); } @@ -452,7 +452,7 @@ ckh_insert(ckh_t *ckh, const void *key, const void *data) bool ret; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); assert(ckh_search(ckh, key, NULL, NULL)); #ifdef CKH_COUNT @@ -477,7 +477,7 @@ ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data) size_t cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); cell = ckh_isearch(ckh, searchkey); if (cell != SIZE_T_MAX) { @@ -509,7 +509,7 @@ ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data) size_t cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); cell = ckh_isearch(ckh, searchkey); if (cell != SIZE_T_MAX) { diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index c1aadda..9f2fa92 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -693,7 +693,10 @@ malloc_init_hard(void) } #ifdef JEMALLOC_TCACHE - tcache_boot(); + if (tcache_boot()) { + malloc_mutex_unlock(&init_lock); + return (true); + } #endif if (huge_boot()) { diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 88e1cc7..2f4804e 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -8,6 +8,9 @@ bool opt_tcache = true; ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; +tcache_bin_info_t *tcache_bin_info; +static unsigned stack_nelms; /* Total stack elms per tcache. */ + /* Map of thread-specific caches. */ #ifndef NO_TLS __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); @@ -55,21 +58,19 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - void *flush, *deferred, *ptr; + void *ptr; unsigned i, nflush, ndeferred; - bool first_pass; #ifdef JEMALLOC_STATS bool merged_stats = false; #endif assert(binind < nbins); assert(rem <= tbin->ncached); - assert(tbin->ncached > 0 || tbin->avail == NULL); - for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass = - true; flush != NULL; flush = deferred, nflush = ndeferred) { + for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena bin associated with the first object. 
*/ - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + tbin->avail[0]); arena_t *arena = chunk->arena; arena_bin_t *bin = &arena->bins[binind]; @@ -92,12 +93,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem tbin->tstats.nrequests = 0; } #endif - deferred = NULL; ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = flush; + ptr = tbin->avail[i]; assert(ptr != NULL); - flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) { size_t pageind = ((uintptr_t)ptr - @@ -112,17 +111,11 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem * locked. Stash the object, so that it can be * handled in a future pass. */ - *(void **)ptr = deferred; - deferred = ptr; + tbin->avail[ndeferred] = ptr; ndeferred++; } } malloc_mutex_unlock(&bin->lock); - - if (first_pass) { - tbin->avail = flush; - first_pass = false; - } } #ifdef JEMALLOC_STATS if (merged_stats == false) { @@ -139,6 +132,8 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem } #endif + memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], + rem * sizeof(void *)); tbin->ncached = rem; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -151,18 +146,19 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - void *flush, *deferred, *ptr; + void *ptr; unsigned i, nflush, ndeferred; - bool first_pass; +#ifdef JEMALLOC_STATS + bool merged_stats = false; +#endif assert(binind < nhbins); assert(rem <= tbin->ncached); - assert(tbin->ncached > 0 || tbin->avail == NULL); - for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass = - true; flush != NULL; flush = deferred, nflush = ndeferred) { + for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena associated with the first object. */ - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + tbin->avail[0]); arena_t *arena = chunk->arena; malloc_mutex_lock(&arena->lock); @@ -174,6 +170,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem tcache->prof_accumbytes = 0; #endif #ifdef JEMALLOC_STATS + merged_stats = true; arena->stats.nrequests_large += tbin->tstats.nrequests; arena->stats.lstats[binind - nbins].nrequests += tbin->tstats.nrequests; @@ -182,12 +179,10 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) } #endif - deferred = NULL; ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = flush; + ptr = tbin->avail[i]; assert(ptr != NULL); - flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) arena_dalloc_large(arena, chunk, ptr); @@ -198,19 +193,30 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem * Stash the object, so that it can be handled * in a future pass. */ - *(void **)ptr = deferred; - deferred = ptr; + tbin->avail[ndeferred] = ptr; ndeferred++; } } malloc_mutex_unlock(&arena->lock); - - if (first_pass) { - tbin->avail = flush; - first_pass = false; - } } +#ifdef JEMALLOC_STATS + if (merged_stats == false) { + /* + * The flush loop didn't happen to flush to this thread's + * arena, so the stats didn't get merged. Manually do so now. 
+ */ + arena_t *arena = tcache->arena; + malloc_mutex_lock(&arena->lock); + arena->stats.nrequests_large += tbin->tstats.nrequests; + arena->stats.lstats[binind - nbins].nrequests += + tbin->tstats.nrequests; + tbin->tstats.nrequests = 0; + malloc_mutex_unlock(&arena->lock); + } +#endif + memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], + rem * sizeof(void *)); tbin->ncached = rem; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -220,10 +226,14 @@ tcache_t * tcache_create(arena_t *arena) { tcache_t *tcache; - size_t size; + size_t size, stack_offset; unsigned i; size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins); + /* Naturally align the pointer stacks. */ + size = PTR_CEILING(size); + stack_offset = size; + size += stack_nelms * sizeof(void *); /* * Round up to the nearest multiple of the cacheline size, in order to * avoid the possibility of false cacheline sharing. @@ -236,6 +246,8 @@ tcache_create(arena_t *arena) if (size <= small_maxclass) tcache = (tcache_t *)arena_malloc_small(arena, size, true); + else if (size <= tcache_maxclass) + tcache = (tcache_t *)arena_malloc_large(arena, size, true); else tcache = (tcache_t *)icalloc(size); @@ -252,15 +264,11 @@ tcache_create(arena_t *arena) tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); - for (i = 0; i < nbins; i++) { - if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { - tcache->tbins[i].ncached_max = (arena_bin_info[i].nregs - << 1); - } else - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX; + for (i = 0; i < nhbins; i++) { + tcache->tbins[i].avail = (void **)((uintptr_t)tcache + + (uintptr_t)stack_offset); + stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); } - for (; i < nhbins; i++) - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE; TCACHE_SET(tcache); @@ -271,6 +279,7 @@ void tcache_destroy(tcache_t *tcache) { unsigned i; + size_t tcache_size; #ifdef JEMALLOC_STATS /* Unlink from list of extant tcaches. */ @@ -327,7 +336,8 @@ tcache_destroy(tcache_t *tcache) } #endif - if (arena_salloc(tcache) <= small_maxclass) { + tcache_size = arena_salloc(tcache); + if (tcache_size <= small_maxclass) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); arena_t *arena = chunk->arena; size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >> @@ -341,6 +351,13 @@ tcache_destroy(tcache_t *tcache) malloc_mutex_lock(&bin->lock); arena_dalloc_bin(arena, chunk, tcache, mapelm); malloc_mutex_unlock(&bin->lock); + } else if (tcache_size <= tcache_maxclass) { + arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); + arena_t *arena = chunk->arena; + + malloc_mutex_lock(&arena->lock); + arena_dalloc_large(arena, chunk, tcache); + malloc_mutex_unlock(&arena->lock); } else idalloc(tcache); } @@ -397,11 +414,13 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) } #endif -void +bool tcache_boot(void) { if (opt_tcache) { + unsigned i; + /* * If necessary, clamp opt_lg_tcache_max, now that * small_maxclass and arena_maxclass are known. @@ -416,6 +435,28 @@ tcache_boot(void) nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT); + /* Initialize tcache_bin_info. 
*/ + tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins * + sizeof(tcache_bin_info_t)); + if (tcache_bin_info == NULL) + return (true); + stack_nelms = 0; + for (i = 0; i < nbins; i++) { + if ((arena_bin_info[i].nregs << 1) <= + TCACHE_NSLOTS_SMALL_MAX) { + tcache_bin_info[i].ncached_max = + (arena_bin_info[i].nregs << 1); + } else { + tcache_bin_info[i].ncached_max = + TCACHE_NSLOTS_SMALL_MAX; + } + stack_nelms += tcache_bin_info[i].ncached_max; + } + for (; i < nhbins; i++) { + tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE; + stack_nelms += tcache_bin_info[i].ncached_max; + } + /* Compute incremental GC event threshold. */ if (opt_lg_tcache_gc_sweep >= 0) { tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) / @@ -431,6 +472,8 @@ tcache_boot(void) abort(); } } + + return (false); } /******************************************************************************/ #endif /* JEMALLOC_TCACHE */ |