| author | Qi Wang <interwq@gwu.edu> | 2017-04-06 19:35:22 (GMT) |
| --- | --- | --- |
| committer | Qi Wang <interwq@gmail.com> | 2017-04-07 21:06:17 (GMT) |
| commit | 36bd90b96212772f1adbd421a6b091b542278995 (patch) | |
| tree | b9b833c8124a4bd8615064cd746d4e8a3dccb0c6 | |
| parent | 4dec507546040896338d8bbdb2075c7ad3a4b9f3 (diff) | |
Optimizing TSD and thread cache layout.
1) Re-organize TSD so that frequently accessed fields are closer to the
beginning and more compact. Assuming 64-bit, the first 2.5 cachelines now
contain everything needed on the tcache fast path, except the tcache struct itself.
2) Re-organize tcache and tbins. Take lg_fill_div out of tbin, and reduce tbin
to 24 bytes (down from 32). Split tbins into tbins_small and tbins_large, and
place tbins_small close to the beginning.
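To make the layout argument concrete, here is a minimal, self-contained C sketch of the idea: hot fields packed at the front, cold fields pushed to the back. The type and field names (`bin_example_t`, `tsd_example_t`, `NBINS_EXAMPLE`) are hypothetical stand-ins, not jemalloc's actual definitions; the sketch only shows how a 24-byte bin and a hot-first ordering keep the fast-path data within the first few 64-byte cachelines.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHELINE 64u         /* assumed 64B cachelines, as in the commit message */
#define NBINS_EXAMPLE 4       /* hypothetical small-bin count, much smaller than jemalloc's */

/* Hypothetical per-bin cache: 24 bytes on LP64, mirroring the slimmed-down tbin. */
typedef struct {
    int32_t   low_water;   /* min # cached since last GC */
    uint32_t  ncached;     /* # of cached objects, kept next to the stats counter */
    uint64_t  nrequests;   /* stats counter sharing the same cacheline as ncached */
    void    **avail;       /* pointer stack of cached objects */
} bin_example_t;

/* Hypothetical TSD-like struct: fast-path fields first, slow-path fields last. */
typedef struct {
    /* --- hot: read/written on every malloc/free fast path --- */
    uint8_t       state;
    uint8_t       cache_enabled;
    uint64_t      thread_allocated;
    uint64_t      thread_deallocated;
    bin_example_t bins_small[NBINS_EXAMPLE];
    /* --- cold: only touched on slow paths (fill, flush, stats) --- */
    void         *arena;
    void         *arenas_tdata;
    uint8_t       lg_fill_div[NBINS_EXAMPLE];
} tsd_example_t;

int main(void) {
    /* Everything before `arena` is the hot prefix; its size bounds the number
     * of cachelines the fast path has to touch. */
    size_t hot_end = offsetof(tsd_example_t, arena);

    printf("sizeof(bin_example_t) = %zu bytes\n", sizeof(bin_example_t));
    printf("hot prefix ends at byte %zu (%.2f cachelines)\n",
        hot_end, (double)hot_end / CACHELINE);

    assert(sizeof(bin_example_t) == 24); /* holds on LP64 targets */
    return 0;
}
```

Moving rarely used members (arena pointers, the stats-aggregation link, GC bookkeeping) behind the hot prefix is the same kind of reordering the diff below applies to `tcache_s` and to the `MALLOC_TSD` field list.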
| -rw-r--r-- | include/jemalloc/internal/arena_externs.h | 2 |
| -rw-r--r-- | include/jemalloc/internal/jemalloc_internal.h.in | 69 |
| -rw-r--r-- | include/jemalloc/internal/rtree_structs.h | 3 |
| -rw-r--r-- | include/jemalloc/internal/tcache_inlines.h | 12 |
| -rw-r--r-- | include/jemalloc/internal/tcache_structs.h | 30 |
| -rw-r--r-- | include/jemalloc/internal/tcache_types.h | 7 |
| -rw-r--r-- | include/jemalloc/internal/tsd_structs.h | 55 |
| -rw-r--r-- | include/jemalloc/internal/tsd_types.h | 14 |
| -rw-r--r-- | src/arena.c | 16 |
| -rw-r--r-- | src/tcache.c | 76 |
10 files changed, 185 insertions, 99 deletions
diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index a35fe18..0f86dc0 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -51,7 +51,7 @@ bool arena_muzzy_decay_time_set(tsdn_t *tsdn, arena_t *arena,
 void arena_decay(tsdn_t *tsdn, arena_t *arena, bool all);
 void arena_reset(tsd_t *tsd, arena_t *arena);
 void arena_destroy(tsd_t *tsd, arena_t *arena);
-void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena,
+void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
     tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
 void arena_alloc_junk_small(void *ptr, const arena_bin_info_t *bin_info,
     bool zero);
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 3b137fc..c00912b 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -538,33 +538,35 @@ bool malloc_initialized(void);
 #include "jemalloc/internal/mutex_inlines.h"

 #ifndef JEMALLOC_ENABLE_INLINE
-pszind_t psz2ind(size_t psz);
-size_t pind2sz_compute(pszind_t pind);
-size_t pind2sz_lookup(pszind_t pind);
-size_t pind2sz(pszind_t pind);
-size_t psz2u(size_t psz);
-szind_t size2index_compute(size_t size);
-szind_t size2index_lookup(size_t size);
-szind_t size2index(size_t size);
-size_t index2size_compute(szind_t index);
-size_t index2size_lookup(szind_t index);
-size_t index2size(szind_t index);
-size_t s2u_compute(size_t size);
-size_t s2u_lookup(size_t size);
-size_t s2u(size_t size);
-size_t sa2u(size_t size, size_t alignment);
-arena_t *arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal);
-arena_t *arena_choose(tsd_t *tsd, arena_t *arena);
-arena_t *arena_ichoose(tsd_t *tsd, arena_t *arena);
-arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind,
+pszind_t psz2ind(size_t psz);
+size_t pind2sz_compute(pszind_t pind);
+size_t pind2sz_lookup(pszind_t pind);
+size_t pind2sz(pszind_t pind);
+size_t psz2u(size_t psz);
+szind_t size2index_compute(size_t size);
+szind_t size2index_lookup(size_t size);
+szind_t size2index(size_t size);
+size_t index2size_compute(szind_t index);
+size_t index2size_lookup(szind_t index);
+size_t index2size(szind_t index);
+size_t s2u_compute(size_t size);
+size_t s2u_lookup(size_t size);
+size_t s2u(size_t size);
+size_t sa2u(size_t size, size_t alignment);
+arena_t *arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal);
+arena_t *arena_choose(tsd_t *tsd, arena_t *arena);
+arena_t *arena_ichoose(tsd_t *tsd, arena_t *arena);
+arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind,
     bool refresh_if_missing);
-arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing);
-ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind);
-bool tcache_available(tsd_t *tsd);
-tcache_t *tcache_get(tsd_t *tsd);
-malloc_cpuid_t malloc_getcpu(void);
-unsigned percpu_arena_choose(void);
-unsigned percpu_arena_ind_limit(void);
+arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing);
+ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind);
+bool tcache_available(tsd_t *tsd);
+tcache_bin_t *tcache_small_bin_get(tcache_t *tcache, szind_t binind);
+tcache_bin_t *tcache_large_bin_get(tcache_t *tcache, szind_t binind);
+tcache_t *tcache_get(tsd_t *tsd);
+malloc_cpuid_t malloc_getcpu(void);
+unsigned percpu_arena_choose(void);
+unsigned percpu_arena_ind_limit(void);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
@@ -933,6 +935,18 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) {
     return &tdata->decay_ticker;
 }

+JEMALLOC_ALWAYS_INLINE tcache_bin_t *
+tcache_small_bin_get(tcache_t *tcache, szind_t binind) {
+    assert(binind < NBINS);
+    return &tcache->tbins_small[binind];
+}
+
+JEMALLOC_ALWAYS_INLINE tcache_bin_t *
+tcache_large_bin_get(tcache_t *tcache, szind_t binind) {
+    assert(binind >= NBINS && binind < nhbins);
+    return &tcache->tbins_large[binind - NBINS];
+}
+
 JEMALLOC_ALWAYS_INLINE bool
 tcache_available(tsd_t *tsd) {
     cassert(config_tcache);
@@ -945,7 +959,8 @@ tcache_available(tsd_t *tsd) {
     if (likely(tsd_tcache_enabled_get(tsd) == true)) {
         /* Associated arena == null implies tcache init in progress. */
         if (tsd_tcachep_get(tsd)->arena != NULL) {
-            assert(tsd_tcachep_get(tsd)->tbins[0].avail != NULL);
+            assert(tcache_small_bin_get(tsd_tcachep_get(tsd),
+                0)->avail != NULL);
         }
         return true;
     }
diff --git a/include/jemalloc/internal/rtree_structs.h b/include/jemalloc/internal/rtree_structs.h
index 8dd9cda..123248a 100644
--- a/include/jemalloc/internal/rtree_structs.h
+++ b/include/jemalloc/internal/rtree_structs.h
@@ -53,9 +53,6 @@ struct rtree_ctx_cache_elm_s {
 };

 struct rtree_ctx_s {
-#ifndef _MSC_VER
-    JEMALLOC_ALIGNED(CACHELINE)
-#endif
     rtree_ctx_cache_elm_t cache[RTREE_CTX_NCACHE];
 };

diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h
index 929d8a7..dae43f9 100644
--- a/include/jemalloc/internal/tcache_inlines.h
+++ b/include/jemalloc/internal/tcache_inlines.h
@@ -73,7 +73,7 @@ tcache_alloc_easy(tcache_bin_t *tbin, bool *tcache_success) {
     ret = *(tbin->avail - tbin->ncached);
     tbin->ncached--;

-    if (unlikely((int)tbin->ncached < tbin->low_water)) {
+    if (unlikely((low_water_t)tbin->ncached < tbin->low_water)) {
         tbin->low_water = tbin->ncached;
     }

@@ -89,7 +89,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
     size_t usize JEMALLOC_CC_SILENCE_INIT(0);

     assert(binind < NBINS);
-    tbin = &tcache->tbins[binind];
+    tbin = tcache_small_bin_get(tcache, binind);
     ret = tcache_alloc_easy(tbin, &tcache_success);
     assert(tcache_success == (ret != NULL));
     if (unlikely(!tcache_success)) {
@@ -150,8 +150,8 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
     tcache_bin_t *tbin;
     bool tcache_success;

-    assert(binind < nhbins);
-    tbin = &tcache->tbins[binind];
+    assert(binind >= NBINS && binind < nhbins);
+    tbin = tcache_large_bin_get(tcache, binind);
     ret = tcache_alloc_easy(tbin, &tcache_success);
     assert(tcache_success == (ret != NULL));
     if (unlikely(!tcache_success)) {
@@ -215,7 +215,7 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
         arena_dalloc_junk_small(ptr, &arena_bin_info[binind]);
     }

-    tbin = &tcache->tbins[binind];
+    tbin = tcache_small_bin_get(tcache, binind);
     tbin_info = &tcache_bin_info[binind];
     if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
         tcache_bin_flush_small(tsd, tcache, tbin, binind,
@@ -241,7 +241,7 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
         large_dalloc_junk(ptr, index2size(binind));
     }

-    tbin = &tcache->tbins[binind];
+    tbin = tcache_large_bin_get(tcache, binind);
     tbin_info = &tcache_bin_info[binind];
     if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
         tcache_bin_flush_large(tsd, tbin, binind,
diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h
index d7ec4b6..4e10160 100644
--- a/include/jemalloc/internal/tcache_structs.h
+++ b/include/jemalloc/internal/tcache_structs.h
@@ -10,10 +10,14 @@ struct tcache_bin_info_s {
 };

 struct tcache_bin_s {
+    low_water_t low_water;  /* Min # cached since last GC. */
+    uint32_t ncached;       /* # of cached objects. */
+    /*
+     * ncached and stats are both modified frequently.  Let's keep them
+     * close so that they have a higher chance of being on the same
+     * cacheline, thus less write-backs.
+     */
     tcache_bin_stats_t tstats;
-    int low_water;          /* Min # cached since last GC. */
-    unsigned lg_fill_div;   /* Fill (ncached_max >> lg_fill_div). */
-    unsigned ncached;       /* # of cached objects. */
     /*
      * To make use of adjacent cacheline prefetch, the items in the avail
      * stack goes to higher address for newer allocations.  avail points
@@ -25,11 +29,9 @@ struct tcache_bin_s {
 };

 struct tcache_s {
-    ql_elm(tcache_t) link;      /* Used for aggregating stats. */
+    /* Data accessed frequently first: prof, ticker and small bins. */
     uint64_t prof_accumbytes;   /* Cleared after arena_prof_accum(). */
     ticker_t gc_ticker;         /* Drives incremental GC. */
-    szind_t next_gc_bin;        /* Next bin to GC. */
-    arena_t *arena;             /* Associated arena. */
     /*
      * The pointer stacks associated with tbins follow as a contiguous
      * array.  During tcache initialization, the avail pointer in each
@@ -37,9 +39,21 @@ struct tcache_s {
      * this array.
      */
 #ifdef JEMALLOC_TCACHE
-    tcache_bin_t tbins[NSIZES];
+    tcache_bin_t tbins_small[NBINS];
+#else
+    tcache_bin_t tbins_small[0];
+#endif
+    /* Data accessed less often below. */
+    ql_elm(tcache_t) link;      /* Used for aggregating stats. */
+    arena_t *arena;             /* Associated arena. */
+    szind_t next_gc_bin;        /* Next bin to GC. */
+#ifdef JEMALLOC_TCACHE
+    /* For small bins, fill (ncached_max >> lg_fill_div). */
+    uint8_t lg_fill_div[NBINS];
+    tcache_bin_t tbins_large[NSIZES-NBINS];
 #else
-    tcache_bin_t tbins[0];
+    uint8_t lg_fill_div[0];
+    tcache_bin_t tbins_large[0];
 #endif
 };
diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h
index 70f8960..a60db6f 100644
--- a/include/jemalloc/internal/tcache_types.h
+++ b/include/jemalloc/internal/tcache_types.h
@@ -6,6 +6,9 @@ typedef struct tcache_bin_s tcache_bin_t;
 typedef struct tcache_s tcache_t;
 typedef struct tcaches_s tcaches_t;

+/* ncached is cast to this type for comparison. */
+typedef int32_t low_water_t;
+
 /*
  * tcache pointers close to NULL are used to encode state information that is
  * used for two purposes: preventing thread caching on a per thread basis and
@@ -48,9 +51,9 @@ typedef struct tcaches_s tcaches_t;
     ((TCACHE_GC_SWEEP / NBINS) + ((TCACHE_GC_SWEEP / NBINS == 0) ? 0 : 1))

 /* Used in TSD static initializer only. Real init in tcache_data_init(). */
-#define TCACHE_ZERO_INITIALIZER {{NULL}}
+#define TCACHE_ZERO_INITIALIZER {0}

 /* Used in TSD static initializer only. Will be initialized to opt_tcache. */
-#define TCACHE_ENABLED_DEFAULT false
+#define TCACHE_ENABLED_ZERO_INITIALIZER false

 #endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */
diff --git a/include/jemalloc/internal/tsd_structs.h b/include/jemalloc/internal/tsd_structs.h
index f327c76..2dca0bd 100644
--- a/include/jemalloc/internal/tsd_structs.h
+++ b/include/jemalloc/internal/tsd_structs.h
@@ -14,19 +14,54 @@ struct tsd_init_head_s {
 };
 #endif

+/*
+ * Thread-Specific-Data layout
+ * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
+ * s: state
+ * e: tcache_enabled
+ * m: thread_allocated (config_stats)
+ * f: thread_deallocated (config_stats)
+ * p: prof_tdata (config_prof)
+ * c: rtree_ctx (rtree cache accessed on deallocation)
+ * t: tcache
+ * --- data not accessed on tcache fast path: arena related fields ---
+ * d: arenas_tdata_bypass
+ * r: narenas_tdata
+ * x: blank space (1 byte)
+ * i: iarena
+ * a: arena
+ * o: arenas_tdata
+ * Loading TSD data is on the critical path of basically all malloc operations.
+ * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
+ * Use a compact layout to reduce cache footprint.
+ * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
+ * |---------------------------- 1st cacheline ----------------------------|
+ * | sedxrrrr mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
+ * |---------------------------- 2nd cacheline ----------------------------|
+ * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
+ * |---------------------------- 3nd cacheline ----------------------------|
+ * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
+ * +-------------------------------------------------------------------------+
+ * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
+ *
+ * The last 3 members (i, a and o) before tcache isn't really needed on tcache
+ * fast path.  However we have a number of unused tcache bins and witnesses
+ * (never touched unless config_debug) at the end of tcache, so we place them
+ * there to avoid breaking the cachelines and possibly paging in an extra page.
+ */
 #define MALLOC_TSD \
 /*  O(name,                 type,           [gs]et, init,   cleanup) */ \
-    O(tcache,               tcache_t,       yes,    no,     yes) \
+    O(tcache_enabled,       bool,           yes,    yes,    no) \
+    O(arenas_tdata_bypass,  bool,           no,     no,     no) \
+    O(narenas_tdata,        uint32_t,       yes,    no,     no) \
     O(thread_allocated,     uint64_t,       yes,    no,     no) \
     O(thread_deallocated,   uint64_t,       yes,    no,     no) \
     O(prof_tdata,           prof_tdata_t *, yes,    no,     yes) \
+    O(rtree_ctx,            rtree_ctx_t,    no,     yes,    no) \
     O(iarena,               arena_t *,      yes,    no,     yes) \
     O(arena,                arena_t *,      yes,    no,     yes) \
     O(arenas_tdata,         arena_tdata_t *,yes,    no,     yes) \
-    O(narenas_tdata,        unsigned,       yes,    no,     no) \
-    O(arenas_tdata_bypass,  bool,           no,     no,     no) \
-    O(tcache_enabled,       bool,           yes,    yes,    no) \
-    O(rtree_ctx,            rtree_ctx_t,    no,     yes,    no) \
+    O(tcache,               tcache_t,       yes,    no,     yes) \
     O(witnesses,            witness_list_t, no,     no,     yes) \
     O(rtree_leaf_elm_witnesses, rtree_leaf_elm_witness_tsd_t, \
         no,     no,     no) \
@@ -34,17 +69,17 @@ struct tsd_init_head_s {

 #define TSD_INITIALIZER { \
     tsd_state_uninitialized, \
-    TCACHE_ZERO_INITIALIZER, \
+    TCACHE_ENABLED_ZERO_INITIALIZER, \
+    false, \
+    0, \
     0, \
     0, \
     NULL, \
+    RTREE_CTX_ZERO_INITIALIZER, \
     NULL, \
     NULL, \
     NULL, \
-    0, \
-    false, \
-    TCACHE_ENABLED_DEFAULT, \
-    RTREE_CTX_ZERO_INITIALIZER, \
+    TCACHE_ZERO_INITIALIZER, \
     ql_head_initializer(witnesses), \
     RTREE_ELM_WITNESS_TSD_INITIALIZER, \
     false \
diff --git a/include/jemalloc/internal/tsd_types.h b/include/jemalloc/internal/tsd_types.h
index 29c6378..4d5fef5 100644
--- a/include/jemalloc/internal/tsd_types.h
+++ b/include/jemalloc/internal/tsd_types.h
@@ -17,12 +17,14 @@ typedef struct tsdn_s tsdn_t;

 #define TSDN_NULL ((tsdn_t *)0)

-typedef enum {
-    tsd_state_uninitialized,
-    tsd_state_nominal,
-    tsd_state_purgatory,
-    tsd_state_reincarnated
-} tsd_state_t;
+enum {
+    tsd_state_uninitialized = 0,
+    tsd_state_nominal = 1,
+    tsd_state_purgatory = 2,
+    tsd_state_reincarnated = 3
+};
+/* Manually limit tsd_state_t to a single byte. */
+typedef uint8_t tsd_state_t;

 /*
  * TLS/TSD-agnostic macro-based implementation of thread-specific data.  There
diff --git a/src/arena.c b/src/arena.c
index feb1f76..b78719e 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -287,8 +287,14 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
         atomic_store_zu(&astats->tcache_bytes, 0, ATOMIC_RELAXED);
         malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
         ql_foreach(tcache, &arena->tcache_ql, link) {
-            for (szind_t i = 0; i < nhbins; i++) {
-                tbin = &tcache->tbins[i];
+            szind_t i = 0;
+            for (; i < NBINS; i++) {
+                tbin = tcache_small_bin_get(tcache, i);
+                arena_stats_accum_zu(&astats->tcache_bytes,
+                    tbin->ncached * index2size(i));
+            }
+            for (; i < nhbins; i++) {
+                tbin = tcache_large_bin_get(tcache, i);
                 arena_stats_accum_zu(&astats->tcache_bytes,
                     tbin->ncached * index2size(i));
             }
@@ -1317,8 +1323,8 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
 }

 void
-arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_bin_t *tbin,
-    szind_t binind, uint64_t prof_accumbytes) {
+arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
+    tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
     unsigned i, nfill;
     arena_bin_t *bin;

@@ -1330,7 +1336,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_bin_t *tbin,
     bin = &arena->bins[binind];
     malloc_mutex_lock(tsdn, &bin->lock);
     for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >>
-        tbin->lg_fill_div); i < nfill; i++) {
+        tcache->lg_fill_div[binind]); i < nfill; i++) {
         extent_t *slab;
         void *ptr;
         if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) >
diff --git a/src/tcache.c b/src/tcache.c
index b8ce4a0..34b46af 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -40,9 +40,13 @@ tcache_salloc(tsdn_t *tsdn, const void *ptr) {
 void
 tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
     szind_t binind = tcache->next_gc_bin;
-    tcache_bin_t *tbin = &tcache->tbins[binind];
-    tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
+    tcache_bin_t *tbin;
+    if (binind < NBINS) {
+        tbin = tcache_small_bin_get(tcache, binind);
+    } else {
+        tbin = tcache_large_bin_get(tcache, binind);
+    }

     if (tbin->low_water > 0) {
         /*
          * Flush (ceiling) 3/4 of the objects below the low water mark.
@@ -51,24 +55,26 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
             tcache_bin_flush_small(tsd, tcache, tbin, binind,
                 tbin->ncached - tbin->low_water +
                 (tbin->low_water >> 2));
+            /*
+             * Reduce fill count by 2X.  Limit lg_fill_div such that
+             * the fill count is always at least 1.
+             */
+            tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
+            if ((tbin_info->ncached_max >>
+                (tcache->lg_fill_div[binind] + 1)) >= 1) {
+                tcache->lg_fill_div[binind]++;
+            }
         } else {
             tcache_bin_flush_large(tsd, tbin, binind, tbin->ncached
                 - tbin->low_water + (tbin->low_water >> 2), tcache);
         }
-        /*
-         * Reduce fill count by 2X.  Limit lg_fill_div such that the
-         * fill count is always at least 1.
-         */
-        if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1)) >= 1) {
-            tbin->lg_fill_div++;
-        }
     } else if (tbin->low_water < 0) {
         /*
-         * Increase fill count by 2X.  Make sure lg_fill_div stays
-         * greater than 0.
+         * Increase fill count by 2X for small bins.  Make sure
+         * lg_fill_div stays greater than 0.
          */
-        if (tbin->lg_fill_div > 1) {
-            tbin->lg_fill_div--;
+        if (binind < NBINS && tcache->lg_fill_div[binind] > 1) {
+            tcache->lg_fill_div[binind]--;
         }
     }
     tbin->low_water = tbin->ncached;
@@ -85,8 +91,8 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
     void *ret;

     assert(tcache->arena);
-    arena_tcache_fill_small(tsdn, arena, tbin, binind, config_prof ?
-        tcache->prof_accumbytes : 0);
+    arena_tcache_fill_small(tsdn, arena, tcache, tbin, binind,
+        config_prof ? tcache->prof_accumbytes : 0);
     if (config_prof) {
         tcache->prof_accumbytes = 0;
     }
@@ -175,7 +181,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
     memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
         sizeof(void *));
     tbin->ncached = rem;
-    if ((int)tbin->ncached < tbin->low_water) {
+    if ((low_water_t)tbin->ncached < tbin->low_water) {
         tbin->low_water = tbin->ncached;
     }
 }
@@ -273,7 +279,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
     memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
         sizeof(void *));
     tbin->ncached = rem;
-    if ((int)tbin->ncached < tbin->low_water) {
+    if ((low_water_t)tbin->ncached < tbin->low_water) {
         tbin->low_water = tbin->ncached;
     }
 }
@@ -347,17 +353,24 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
     size_t stack_offset = 0;

     assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
-    memset(tcache->tbins, 0, sizeof(tcache_bin_t) * nhbins);
-    for (unsigned i = 0; i < nhbins; i++) {
-        tcache->tbins[i].lg_fill_div = 1;
+    memset(tcache->tbins_small, 0, sizeof(tcache_bin_t) * NBINS);
+    memset(tcache->tbins_large, 0, sizeof(tcache_bin_t) * (nhbins - NBINS));
+    unsigned i = 0;
+    for (; i < NBINS; i++) {
+        tcache->lg_fill_div[i] = 1;
         stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
         /*
          * avail points past the available space.  Allocations will
          * access the slots toward higher addresses (for the benefit of
          * prefetch).
          */
-        tcache->tbins[i].avail = (void **)((uintptr_t)avail_stack +
-            (uintptr_t)stack_offset);
+        tcache_small_bin_get(tcache, i)->avail =
+            (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
+    }
+    for (; i < nhbins; i++) {
+        stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
+        tcache_large_bin_get(tcache, i)->avail =
+            (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
     }
     assert(stack_offset == stack_nelms * sizeof(void *));
 }
@@ -370,7 +383,7 @@ tsd_tcache_data_init(tsd_t *tsd) {
     }

     tcache_t *tcache = &tsd->tcache;
-    assert(tcache->tbins[0].avail == NULL);
+    assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
     size_t size = stack_nelms * sizeof(void *);
     /* Avoid false cacheline sharing. */
     size = sa2u(size, CACHELINE);
@@ -443,7 +456,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
     unsigned i;

     for (i = 0; i < NBINS; i++) {
-        tcache_bin_t *tbin = &tcache->tbins[i];
+        tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
         tcache_bin_flush_small(tsd, tcache, tbin, i, 0);

         if (config_stats) {
@@ -451,7 +464,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
         }
     }
     for (; i < nhbins; i++) {
-        tcache_bin_t *tbin = &tcache->tbins[i];
+        tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
         tcache_bin_flush_large(tsd, tbin, i, 0, tcache);

         if (config_stats) {
@@ -483,7 +496,8 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {

     if (tsd_tcache) {
         /* Release the avail array for the TSD embedded auto tcache. */
-        void *avail_array = (void *)((uintptr_t)tcache->tbins[0].avail -
+        void *avail_array =
+            (void *)((uintptr_t)tcache_small_bin_get(tcache, 0)->avail -
             (uintptr_t)tcache_bin_info[0].ncached_max * sizeof(void *));
         idalloctm(tsd_tsdn(tsd), avail_array, NULL, true, true);
     } else {
@@ -503,16 +517,16 @@ tcache_cleanup(tsd_t *tsd) {
     if (!tcache_available(tsd)) {
         assert(tsd_tcache_enabled_get(tsd) == false);
         if (config_debug) {
-            assert(tcache->tbins[0].avail == NULL);
+            assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
         }
         return;
     }
     assert(tsd_tcache_enabled_get(tsd));
-    assert(tcache->tbins[0].avail != NULL);
+    assert(tcache_small_bin_get(tcache, 0)->avail != NULL);

     tcache_destroy(tsd, tcache, true);
     if (config_debug) {
-        tcache->tbins[0].avail = NULL;
+        tcache_small_bin_get(tcache, 0)->avail = NULL;
     }
 }

@@ -525,7 +539,7 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
     /* Merge and reset tcache stats. */
     for (i = 0; i < NBINS; i++) {
         arena_bin_t *bin = &arena->bins[i];
-        tcache_bin_t *tbin = &tcache->tbins[i];
+        tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
         malloc_mutex_lock(tsdn, &bin->lock);
         bin->stats.nrequests += tbin->tstats.nrequests;
         malloc_mutex_unlock(tsdn, &bin->lock);
@@ -533,7 +547,7 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
     }

     for (; i < nhbins; i++) {
-        tcache_bin_t *tbin = &tcache->tbins[i];
+        tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
         arena_stats_large_nrequests_add(tsdn, &arena->stats, i,
             tbin->tstats.nrequests);
         tbin->tstats.nrequests = 0;
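A side note on the `(low_water_t)` casts in the flush paths above: `ncached` is now unsigned (`uint32_t`) while `low_water` is a signed `int32_t` that can legitimately go negative (see the `tbin->low_water < 0` branch in `tcache_event_hard`). Comparing the two without a cast would go through the usual arithmetic conversions and misbehave. A small standalone sketch of that pitfall, using illustrative local variables rather than jemalloc's structs:

```c
#include <stdint.h>
#include <stdio.h>

typedef int32_t low_water_t; /* as defined in tcache_types.h above */

int main(void) {
    uint32_t    ncached   = 0;  /* bin has just gone empty */
    low_water_t low_water = -1; /* low_water may be negative between GC passes */

    /* Mixed signed/unsigned comparison: -1 is converted to UINT32_MAX, so the
     * check is wrongly true (compilers flag this with -Wsign-compare). */
    printf("without cast: %d\n", ncached < low_water);              /* prints 1 */

    /* The cast used in the diff keeps both operands signed. */
    printf("with cast:    %d\n", (low_water_t)ncached < low_water); /* prints 0 */
    return 0;
}
```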