author      Jason Evans <je@fb.com>    2011-03-16 17:30:13 (GMT)
committer   Jason Evans <je@fb.com>    2011-03-17 23:29:32 (GMT)
commit      84c8eefeffa246607790ad12e28b0f6a24ecc59d (patch)
tree        3b62221829ca737794d161f36ad0683016a63771 /jemalloc/src
parent      77f350be08c8b9cd03ceed820b3113dbac9b4151 (diff)
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked
lists in available regions, had the unfortunate side effect of causing
many cache misses during thread cache fills. Fix this in two places:
- arena_run_t: Use a new bitmap implementation to track which regions are
  available. Furthermore, revert to preferring the lowest available region
  (as jemalloc did with its old bitmap-based approach).
- tcache_t: Move read-only tcache_bin_t metadata into tcache_bin_info_t, and
  add a contiguous array of pointers to tcache_t in order to track cached
  objects. This substantially increases the size of tcache_t, but results in
  much higher data locality for common tcache operations. As a side benefit,
  it is again possible to efficiently flush the least recently used cached
  objects, so this change switches flushing from MRU to LRU.
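To make the tcache_t change concrete, here is a minimal sketch of the new
layout and of LRU flushing, using simplified, hypothetical types and helper
names (tbin_push, tbin_pop, tbin_flush); the real structures and flush logic
live in jemalloc's internal headers and in tcache.c, as shown in the diff
below.

```c
#include <assert.h>
#include <string.h>

/* Read-only limits shared by all threads (one entry per bin). */
typedef struct {
	unsigned	ncached_max;	/* Upper limit on cached objects. */
} tcache_bin_info_t;

/* Per-thread bin: a contiguous stack of cached object pointers. */
typedef struct {
	unsigned	ncached;	/* Current number of cached objects. */
	unsigned	low_water;	/* Minimum ncached since last GC sweep. */
	void		**avail;	/* avail[0] is the oldest (LRU) object. */
} tcache_bin_t;

/* Cache fill: push a freshly allocated object onto the stack. */
static void
tbin_push(tcache_bin_t *tbin, const tcache_bin_info_t *tbin_info, void *ptr)
{

	assert(tbin->ncached < tbin_info->ncached_max);
	tbin->avail[tbin->ncached++] = ptr;
}

/* Cache hit: pop the most recently cached object (hot end of the stack). */
static void *
tbin_pop(tcache_bin_t *tbin)
{

	if (tbin->ncached == 0)
		return (NULL);
	return (tbin->avail[--tbin->ncached]);
}

/*
 * Flush all but rem objects, oldest first (LRU), then slide the survivors
 * down so the stack stays contiguous -- mirroring the memmove() calls in
 * tcache_bin_flush_small()/_large().
 */
static void
tbin_flush(tcache_bin_t *tbin, unsigned rem, void (*dalloc)(void *))
{
	unsigned i, nflush = tbin->ncached - rem;

	for (i = 0; i < nflush; i++)
		dalloc(tbin->avail[i]);
	memmove(tbin->avail, &tbin->avail[nflush], rem * sizeof(void *));
	tbin->ncached = rem;
	if (tbin->ncached < tbin->low_water)
		tbin->low_water = tbin->ncached;
}
```

Because the cached pointers now sit in one contiguous array inside tcache_t, a
fill or flush walks sequential memory rather than chasing per-object links,
which is where the locality win comes from.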
The new bitmap implementation uses a multi-level summary approach to
make finding the lowest available region very fast. In practice,
bitmaps only have one or two levels, though the implementation is
general enough to handle extremely large bitmaps, mainly so that large
page sizes can still be entertained.
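As a rough illustration of the multi-level summary idea, the sketch below uses
just two levels and assumed names (two_level_bitmap_t, bitmap_sfu_sketch,
bitmap_unset_sketch). Here a set bit means "region free"; the real
implementation inverts the bit sense relative to its external interface and
generalizes to as many summary levels as the maximum bitmap size requires.

```c
#include <assert.h>
#include <limits.h>
#include <stddef.h>

#define	GROUP_NBITS	(sizeof(unsigned long) * CHAR_BIT)

typedef struct {
	unsigned long	summary;	/* One bit per base-level group. */
	unsigned long	*groups;	/* Base-level bits, one per region. */
} two_level_bitmap_t;

/* Find, clear, and return the index of the lowest free region. */
static size_t
bitmap_sfu_sketch(two_level_bitmap_t *b)
{
	size_t grp, bit;

	assert(b->summary != 0);	/* At least one region is free. */
	grp = (size_t)__builtin_ctzl(b->summary);
	bit = (size_t)__builtin_ctzl(b->groups[grp]);
	b->groups[grp] &= ~(1UL << bit);	/* Mark region allocated. */
	if (b->groups[grp] == 0)		/* Group exhausted: */
		b->summary &= ~(1UL << grp);	/* propagate to the summary. */
	return (grp * GROUP_NBITS + bit);
}

/* Free a region: set its bit again and re-mark its group in the summary. */
static void
bitmap_unset_sketch(two_level_bitmap_t *b, size_t regind)
{
	size_t grp = regind / GROUP_NBITS;
	size_t bit = regind % GROUP_NBITS;

	b->groups[grp] |= 1UL << bit;
	b->summary |= 1UL << grp;
}
```

Finding the lowest free region is then two find-first-set operations plus a
couple of mask updates, independent of how many regions the run contains (up
to one summary word's worth of groups in this simplified two-level form).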
Fix tcache_bin_flush_large() to always flush statistics, in the same way
that tcache_bin_flush_small() was recently fixed.
Use JEMALLOC_DEBUG rather than NDEBUG.
Add dassert(), and use it for debug-only asserts.
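A minimal sketch of what such a debug-only assertion macro can look like,
assuming the JEMALLOC_DEBUG configuration macro mentioned above (the actual
definition lives in jemalloc's internal headers):

```c
#include <assert.h>

/*
 * dassert() compiles away entirely unless jemalloc itself was built with
 * debugging enabled, independent of whether the embedding application
 * defines NDEBUG.
 */
#ifdef JEMALLOC_DEBUG
#  define dassert(e)	assert(e)
#else
#  define dassert(e)
#endif
```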
Diffstat (limited to 'jemalloc/src')

-rw-r--r--  jemalloc/src/arena.c    | 107
-rw-r--r--  jemalloc/src/bitmap.c   |  90
-rw-r--r--  jemalloc/src/ckh.c      |  12
-rw-r--r--  jemalloc/src/jemalloc.c |   5
-rw-r--r--  jemalloc/src/tcache.c   | 129

5 files changed, 242 insertions, 101 deletions
diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index e49b8ed..87bd9bb 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -253,59 +253,45 @@ static inline void * arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { void *ret; + unsigned regind; + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); assert(run->nfree > 0); + assert(bitmap_full(bitmap, &bin_info->bitmap_info) == false); + regind = bitmap_sfu(bitmap, &bin_info->bitmap_info); + ret = (void *)((uintptr_t)run + (uintptr_t)bin_info->reg0_offset + + (uintptr_t)(bin_info->reg_size * regind)); run->nfree--; - ret = run->avail; - if (ret != NULL) { - /* Double free can cause assertion failure.*/ - assert(ret != NULL); - /* Write-after free can cause assertion failure. */ - assert((uintptr_t)ret >= (uintptr_t)run + - (uintptr_t)bin_info->reg0_offset); - assert((uintptr_t)ret < (uintptr_t)run->next); - assert(((uintptr_t)ret - ((uintptr_t)run + - (uintptr_t)bin_info->reg0_offset)) % - (uintptr_t)bin_info->reg_size == 0); - run->avail = *(void **)ret; - return (ret); - } - ret = run->next; - run->next = (void *)((uintptr_t)ret + (uintptr_t)bin_info->reg_size); - assert(ret != NULL); + if (regind == run->nextind) + run->nextind++; + assert(regind < run->nextind); return (ret); } static inline void arena_run_reg_dalloc(arena_run_t *run, void *ptr) { - -#ifndef NDEBUG arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; + unsigned regind = arena_run_regind(run, bin_info, ptr); + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); + assert(run->nfree < bin_info->nregs); /* Freeing an interior pointer can cause assertion failure. */ assert(((uintptr_t)ptr - ((uintptr_t)run + (uintptr_t)bin_info->reg0_offset)) % (uintptr_t)bin_info->reg_size == 0); - /* - * Freeing a pointer lower than region zero can cause assertion - * failure. - */ assert((uintptr_t)ptr >= (uintptr_t)run + (uintptr_t)bin_info->reg0_offset); - /* - * Freeing a pointer past in the run's frontier can cause assertion - * failure. - */ - assert((uintptr_t)ptr < (uintptr_t)run->next); -#endif + /* Freeing an unallocated pointer can cause assertion failure. */ + assert(bitmap_get(bitmap, &bin_info->bitmap_info, regind)); - *(void **)ptr = run->avail; - run->avail = ptr; + bitmap_unset(bitmap, &bin_info->bitmap_info, regind); run->nfree++; } @@ -772,7 +758,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) chunk + (uintptr_t)(pageind << PAGE_SHIFT)); assert((mapelm->bits >> PAGE_SHIFT) == 0); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(arena, run->bin); arena_bin_info_t *bin_info = @@ -1224,12 +1210,14 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) malloc_mutex_lock(&arena->lock); run = arena_run_alloc(arena, bin_info->run_size, false, false); if (run != NULL) { + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); + /* Initialize run internals. 
*/ run->bin = bin; - run->avail = NULL; - run->next = (void *)((uintptr_t)run + - (uintptr_t)bin_info->reg0_offset); + run->nextind = 0; run->nfree = bin_info->nregs; + bitmap_init(bitmap, &bin_info->bitmap_info); #ifdef JEMALLOC_DEBUG run->magic = ARENA_RUN_MAGIC; #endif @@ -1289,12 +1277,11 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) bin->runcur = NULL; run = arena_bin_nonfull_run_get(arena, bin); if (bin->runcur != NULL && bin->runcur->nfree > 0) { - /* * Another thread updated runcur while this one ran without the * bin lock in arena_bin_nonfull_run_get(). */ - assert(bin->runcur->magic == ARENA_RUN_MAGIC); + dassert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); ret = arena_run_reg_alloc(bin->runcur, bin_info); if (run != NULL) { @@ -1302,7 +1289,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) /* * arena_run_alloc() may have allocated run, or it may - * have pulled it from the bin's run tree. Therefore + * have pulled run from the bin's run tree. Therefore * it is unsafe to make any assumptions about how run * has previously been used, and arena_bin_lower_run() * must be called, as if a region were just deallocated @@ -1322,7 +1309,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) bin->runcur = run; - assert(bin->runcur->magic == ARENA_RUN_MAGIC); + dassert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); return (arena_run_reg_alloc(bin->runcur, bin_info)); @@ -1365,15 +1352,15 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind #endif bin = &arena->bins[binind]; malloc_mutex_lock(&bin->lock); - for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) { + for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> 1); + i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) break; - *(void **)ptr = tbin->avail; - tbin->avail = ptr; + tbin->avail[i] = ptr; } #ifdef JEMALLOC_STATS bin->stats.allocated += (i - tbin->ncached) * @@ -1607,7 +1594,7 @@ arena_salloc(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + @@ -1660,7 +1647,7 @@ arena_salloc_demote(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + @@ -1730,8 +1717,9 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, /******************************/ npages = bin_info->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk) - >> PAGE_SHIFT); + past = (size_t)(PAGE_CEILING((uintptr_t)run + + (uintptr_t)bin_info->reg0_offset + (uintptr_t)(run->nextind * + bin_info->reg_size) - (uintptr_t)chunk) >> PAGE_SHIFT); malloc_mutex_lock(&arena->lock); /* @@ -1817,7 +1805,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t 
*chunk, void *ptr, pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; size_t binind = arena_bin_index(arena, bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; @@ -2065,7 +2053,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - assert(arena->magic == ARENA_MAGIC); + dassert(arena->magic == ARENA_MAGIC); if (psize < oldsize) { #ifdef JEMALLOC_FILL @@ -2405,8 +2393,8 @@ small_size2bin_init_hard(void) * *) bin_info->run_size <= arena_maxclass * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). * - * bin_info->nregs and bin_info->reg0_offset are also calculated here, since - * these settings are all interdependent. + * bin_info->nregs, bin_info->bitmap_offset, and bin_info->reg0_offset are also + * calculated here, since these settings are all interdependent. */ static size_t bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) @@ -2414,6 +2402,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) size_t try_run_size, good_run_size; uint32_t try_nregs, good_nregs; uint32_t try_hdr_size, good_hdr_size; + uint32_t try_bitmap_offset, good_bitmap_offset; #ifdef JEMALLOC_PROF uint32_t try_ctx0_offset, good_ctx0_offset; #endif @@ -2438,6 +2427,11 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) do { try_nregs--; try_hdr_size = sizeof(arena_run_t); + /* Pad to a long boundary. */ + try_hdr_size = LONG_CEILING(try_hdr_size); + try_bitmap_offset = try_hdr_size; + /* Add space for bitmap. */ + try_hdr_size += bitmap_size(try_nregs); #ifdef JEMALLOC_PROF if (opt_prof && prof_promote == false) { /* Pad to a quantum boundary. */ @@ -2460,6 +2454,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) good_run_size = try_run_size; good_nregs = try_nregs; good_hdr_size = try_hdr_size; + good_bitmap_offset = try_bitmap_offset; #ifdef JEMALLOC_PROF good_ctx0_offset = try_ctx0_offset; #endif @@ -2473,6 +2468,11 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) do { try_nregs--; try_hdr_size = sizeof(arena_run_t); + /* Pad to a long boundary. */ + try_hdr_size = LONG_CEILING(try_hdr_size); + try_bitmap_offset = try_hdr_size; + /* Add space for bitmap. */ + try_hdr_size += bitmap_size(try_nregs); #ifdef JEMALLOC_PROF if (opt_prof && prof_promote == false) { /* Pad to a quantum boundary. */ @@ -2498,6 +2498,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) /* Copy final settings. */ bin_info->run_size = good_run_size; bin_info->nregs = good_nregs; + bin_info->bitmap_offset = good_bitmap_offset; #ifdef JEMALLOC_PROF bin_info->ctx0_offset = good_ctx0_offset; #endif @@ -2525,6 +2526,7 @@ bin_info_init(void) bin_info = &arena_bin_info[i]; bin_info->reg_size = (1U << (LG_TINY_MIN + i)); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } #endif @@ -2533,6 +2535,7 @@ bin_info_init(void) bin_info = &arena_bin_info[i]; bin_info->reg_size = (i - ntbins + 1) << LG_QUANTUM; prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } /* Cacheline-spaced bins. 
*/ @@ -2541,6 +2544,7 @@ bin_info_init(void) bin_info->reg_size = cspace_min + ((i - (ntbins + nqbins)) << LG_CACHELINE); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } /* Subpage-spaced bins. */ @@ -2549,6 +2553,7 @@ bin_info_init(void) bin_info->reg_size = sspace_min + ((i - (ntbins + nqbins + ncbins)) << LG_SUBPAGE); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } return (false); diff --git a/jemalloc/src/bitmap.c b/jemalloc/src/bitmap.c new file mode 100644 index 0000000..b47e262 --- /dev/null +++ b/jemalloc/src/bitmap.c @@ -0,0 +1,90 @@ +#define JEMALLOC_BITMAP_C_ +#include "jemalloc/internal/jemalloc_internal.h" + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +static size_t bits2groups(size_t nbits); + +/******************************************************************************/ + +static size_t +bits2groups(size_t nbits) +{ + + return ((nbits >> LG_BITMAP_GROUP_NBITS) + + !!(nbits & BITMAP_GROUP_NBITS_MASK)); +} + +void +bitmap_info_init(bitmap_info_t *binfo, size_t nbits) +{ + unsigned i; + size_t group_count; + + assert(nbits > 0); + assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS)); + + /* + * Compute the number of groups necessary to store nbits bits, and + * progressively work upward through the levels until reaching a level + * that requires only one group. + */ + binfo->levels[0].group_offset = 0; + group_count = bits2groups(nbits); + for (i = 1; group_count > 1; i++) { + assert(i < BITMAP_MAX_LEVELS); + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + group_count = bits2groups(group_count); + } + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + binfo->nlevels = i; + binfo->nbits = nbits; +} + +size_t +bitmap_info_ngroups(const bitmap_info_t *binfo) +{ + + return (binfo->levels[binfo->nlevels].group_offset << LG_SIZEOF_BITMAP); +} + +size_t +bitmap_size(size_t nbits) +{ + bitmap_info_t binfo; + + bitmap_info_init(&binfo, nbits); + return (bitmap_info_ngroups(&binfo)); +} + +void +bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo) +{ + size_t extra; + unsigned i; + + /* + * Bits are actually inverted with regard to the external bitmap + * interface, so the bitmap starts out with all 1 bits, except for + * trailing unused bits (if any). Note that each group uses bit 0 to + * correspond to the first logical bit in the group, so extra bits + * are the most significant bits of the last group. 
+ */ + memset(bitmap, 0xffU, binfo->levels[binfo->nlevels].group_offset << + LG_SIZEOF_BITMAP); + extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK)) + & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) + bitmap[binfo->levels[1].group_offset - 1] >>= extra; + for (i = 1; i < binfo->nlevels; i++) { + size_t group_count = binfo->levels[i].group_offset - + binfo->levels[i-1].group_offset; + extra = (BITMAP_GROUP_NBITS - (group_count & + BITMAP_GROUP_NBITS_MASK)) & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) + bitmap[binfo->levels[i+1].group_offset - 1] >>= extra; + } +} diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index e386a53..75ae7fd 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -73,7 +73,7 @@ ckh_isearch(ckh_t *ckh, const void *key) size_t hash1, hash2, bucket, cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); ckh->hash(key, ckh->lg_curbuckets, &hash1, &hash2); @@ -396,7 +396,7 @@ ckh_delete(ckh_t *ckh) { assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); #ifdef CKH_VERBOSE malloc_printf( @@ -421,7 +421,7 @@ ckh_count(ckh_t *ckh) { assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); return (ckh->count); } @@ -452,7 +452,7 @@ ckh_insert(ckh_t *ckh, const void *key, const void *data) bool ret; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); assert(ckh_search(ckh, key, NULL, NULL)); #ifdef CKH_COUNT @@ -477,7 +477,7 @@ ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data) size_t cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); cell = ckh_isearch(ckh, searchkey); if (cell != SIZE_T_MAX) { @@ -509,7 +509,7 @@ ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data) size_t cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); cell = ckh_isearch(ckh, searchkey); if (cell != SIZE_T_MAX) { diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index c1aadda..9f2fa92 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -693,7 +693,10 @@ malloc_init_hard(void) } #ifdef JEMALLOC_TCACHE - tcache_boot(); + if (tcache_boot()) { + malloc_mutex_unlock(&init_lock); + return (true); + } #endif if (huge_boot()) { diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 88e1cc7..2f4804e 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -8,6 +8,9 @@ bool opt_tcache = true; ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; +tcache_bin_info_t *tcache_bin_info; +static unsigned stack_nelms; /* Total stack elms per tcache. */ + /* Map of thread-specific caches. */ #ifndef NO_TLS __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); @@ -55,21 +58,19 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - void *flush, *deferred, *ptr; + void *ptr; unsigned i, nflush, ndeferred; - bool first_pass; #ifdef JEMALLOC_STATS bool merged_stats = false; #endif assert(binind < nbins); assert(rem <= tbin->ncached); - assert(tbin->ncached > 0 || tbin->avail == NULL); - for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass = - true; flush != NULL; flush = deferred, nflush = ndeferred) { + for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena bin associated with the first object. 
*/ - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + tbin->avail[0]); arena_t *arena = chunk->arena; arena_bin_t *bin = &arena->bins[binind]; @@ -92,12 +93,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem tbin->tstats.nrequests = 0; } #endif - deferred = NULL; ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = flush; + ptr = tbin->avail[i]; assert(ptr != NULL); - flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) { size_t pageind = ((uintptr_t)ptr - @@ -112,17 +111,11 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem * locked. Stash the object, so that it can be * handled in a future pass. */ - *(void **)ptr = deferred; - deferred = ptr; + tbin->avail[ndeferred] = ptr; ndeferred++; } } malloc_mutex_unlock(&bin->lock); - - if (first_pass) { - tbin->avail = flush; - first_pass = false; - } } #ifdef JEMALLOC_STATS if (merged_stats == false) { @@ -139,6 +132,8 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem } #endif + memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], + rem * sizeof(void *)); tbin->ncached = rem; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -151,18 +146,19 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - void *flush, *deferred, *ptr; + void *ptr; unsigned i, nflush, ndeferred; - bool first_pass; +#ifdef JEMALLOC_STATS + bool merged_stats = false; +#endif assert(binind < nhbins); assert(rem <= tbin->ncached); - assert(tbin->ncached > 0 || tbin->avail == NULL); - for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass = - true; flush != NULL; flush = deferred, nflush = ndeferred) { + for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena associated with the first object. */ - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + tbin->avail[0]); arena_t *arena = chunk->arena; malloc_mutex_lock(&arena->lock); @@ -174,6 +170,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem tcache->prof_accumbytes = 0; #endif #ifdef JEMALLOC_STATS + merged_stats = true; arena->stats.nrequests_large += tbin->tstats.nrequests; arena->stats.lstats[binind - nbins].nrequests += tbin->tstats.nrequests; @@ -182,12 +179,10 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) } #endif - deferred = NULL; ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = flush; + ptr = tbin->avail[i]; assert(ptr != NULL); - flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) arena_dalloc_large(arena, chunk, ptr); @@ -198,19 +193,30 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem * Stash the object, so that it can be handled * in a future pass. */ - *(void **)ptr = deferred; - deferred = ptr; + tbin->avail[ndeferred] = ptr; ndeferred++; } } malloc_mutex_unlock(&arena->lock); - - if (first_pass) { - tbin->avail = flush; - first_pass = false; - } } +#ifdef JEMALLOC_STATS + if (merged_stats == false) { + /* + * The flush loop didn't happen to flush to this thread's + * arena, so the stats didn't get merged. Manually do so now. 
+ */ + arena_t *arena = tcache->arena; + malloc_mutex_lock(&arena->lock); + arena->stats.nrequests_large += tbin->tstats.nrequests; + arena->stats.lstats[binind - nbins].nrequests += + tbin->tstats.nrequests; + tbin->tstats.nrequests = 0; + malloc_mutex_unlock(&arena->lock); + } +#endif + memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], + rem * sizeof(void *)); tbin->ncached = rem; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -220,10 +226,14 @@ tcache_t * tcache_create(arena_t *arena) { tcache_t *tcache; - size_t size; + size_t size, stack_offset; unsigned i; size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins); + /* Naturally align the pointer stacks. */ + size = PTR_CEILING(size); + stack_offset = size; + size += stack_nelms * sizeof(void *); /* * Round up to the nearest multiple of the cacheline size, in order to * avoid the possibility of false cacheline sharing. @@ -236,6 +246,8 @@ tcache_create(arena_t *arena) if (size <= small_maxclass) tcache = (tcache_t *)arena_malloc_small(arena, size, true); + else if (size <= tcache_maxclass) + tcache = (tcache_t *)arena_malloc_large(arena, size, true); else tcache = (tcache_t *)icalloc(size); @@ -252,15 +264,11 @@ tcache_create(arena_t *arena) tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); - for (i = 0; i < nbins; i++) { - if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { - tcache->tbins[i].ncached_max = (arena_bin_info[i].nregs - << 1); - } else - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX; + for (i = 0; i < nhbins; i++) { + tcache->tbins[i].avail = (void **)((uintptr_t)tcache + + (uintptr_t)stack_offset); + stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); } - for (; i < nhbins; i++) - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE; TCACHE_SET(tcache); @@ -271,6 +279,7 @@ void tcache_destroy(tcache_t *tcache) { unsigned i; + size_t tcache_size; #ifdef JEMALLOC_STATS /* Unlink from list of extant tcaches. */ @@ -327,7 +336,8 @@ tcache_destroy(tcache_t *tcache) } #endif - if (arena_salloc(tcache) <= small_maxclass) { + tcache_size = arena_salloc(tcache); + if (tcache_size <= small_maxclass) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); arena_t *arena = chunk->arena; size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >> @@ -341,6 +351,13 @@ tcache_destroy(tcache_t *tcache) malloc_mutex_lock(&bin->lock); arena_dalloc_bin(arena, chunk, tcache, mapelm); malloc_mutex_unlock(&bin->lock); + } else if (tcache_size <= tcache_maxclass) { + arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); + arena_t *arena = chunk->arena; + + malloc_mutex_lock(&arena->lock); + arena_dalloc_large(arena, chunk, tcache); + malloc_mutex_unlock(&arena->lock); } else idalloc(tcache); } @@ -397,11 +414,13 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) } #endif -void +bool tcache_boot(void) { if (opt_tcache) { + unsigned i; + /* * If necessary, clamp opt_lg_tcache_max, now that * small_maxclass and arena_maxclass are known. @@ -416,6 +435,28 @@ tcache_boot(void) nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT); + /* Initialize tcache_bin_info. 
*/ + tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins * + sizeof(tcache_bin_info_t)); + if (tcache_bin_info == NULL) + return (true); + stack_nelms = 0; + for (i = 0; i < nbins; i++) { + if ((arena_bin_info[i].nregs << 1) <= + TCACHE_NSLOTS_SMALL_MAX) { + tcache_bin_info[i].ncached_max = + (arena_bin_info[i].nregs << 1); + } else { + tcache_bin_info[i].ncached_max = + TCACHE_NSLOTS_SMALL_MAX; + } + stack_nelms += tcache_bin_info[i].ncached_max; + } + for (; i < nhbins; i++) { + tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE; + stack_nelms += tcache_bin_info[i].ncached_max; + } + /* Compute incremental GC event threshold. */ if (opt_lg_tcache_gc_sweep >= 0) { tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) / @@ -431,6 +472,8 @@ tcache_boot(void) abort(); } } + + return (false); } /******************************************************************************/ #endif /* JEMALLOC_TCACHE */ |