| author | Qi Wang <interwq@gwu.edu> | 2017-04-06 19:35:22 (GMT) |
| --- | --- | --- |
| committer | Qi Wang <interwq@gmail.com> | 2017-04-07 21:06:17 (GMT) |
| commit | 36bd90b96212772f1adbd421a6b091b542278995 (patch) | |
| tree | b9b833c8124a4bd8615064cd746d4e8a3dccb0c6 | |
| parent | 4dec507546040896338d8bbdb2075c7ad3a4b9f3 (diff) | |
Optimizing TSD and thread cache layout.
1) Re-organize TSD so that frequently accessed fields are closer to the
beginning and more compact. Assuming 64-bit, the first 2.5 cachelines now
contain everything needed on the tcache fast path, except the tcache struct itself.
2) Re-organize tcache and tbins. Take lg_fill_div out of tbin, and reduce tbin
to 24 bytes (down from 32). Split tbins into tbins_small and tbins_large, and
place tbins_small close to the beginning.
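To make the layout argument concrete, here is a minimal, self-contained C sketch of the idea: hot fields packed at the front, cold fields pushed to the back. The type and field names (`bin_example_t`, `tsd_example_t`, `NBINS_EXAMPLE`) are hypothetical stand-ins, not jemalloc's actual definitions; the sketch only shows how a 24-byte bin and a hot-first ordering keep the fast-path data within the first few 64-byte cachelines.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHELINE 64u         /* assumed 64B cachelines, as in the commit message */
#define NBINS_EXAMPLE 4       /* hypothetical small-bin count, much smaller than jemalloc's */

/* Hypothetical per-bin cache: 24 bytes on LP64, mirroring the slimmed-down tbin. */
typedef struct {
    int32_t   low_water;   /* min # cached since last GC */
    uint32_t  ncached;     /* # of cached objects, kept next to the stats counter */
    uint64_t  nrequests;   /* stats counter sharing the same cacheline as ncached */
    void    **avail;       /* pointer stack of cached objects */
} bin_example_t;

/* Hypothetical TSD-like struct: fast-path fields first, slow-path fields last. */
typedef struct {
    /* --- hot: read/written on every malloc/free fast path --- */
    uint8_t       state;
    uint8_t       cache_enabled;
    uint64_t      thread_allocated;
    uint64_t      thread_deallocated;
    bin_example_t bins_small[NBINS_EXAMPLE];
    /* --- cold: only touched on slow paths (fill, flush, stats) --- */
    void         *arena;
    void         *arenas_tdata;
    uint8_t       lg_fill_div[NBINS_EXAMPLE];
} tsd_example_t;

int main(void) {
    /* Everything before `arena` is the hot prefix; its size bounds the number
     * of cachelines the fast path has to touch. */
    size_t hot_end = offsetof(tsd_example_t, arena);

    printf("sizeof(bin_example_t) = %zu bytes\n", sizeof(bin_example_t));
    printf("hot prefix ends at byte %zu (%.2f cachelines)\n",
        hot_end, (double)hot_end / CACHELINE);

    assert(sizeof(bin_example_t) == 24); /* holds on LP64 targets */
    return 0;
}
```

Moving rarely used members (arena pointers, the stats-aggregation link, GC bookkeeping) behind the hot prefix is the same kind of reordering the diff below applies to `tcache_s` and to the `MALLOC_TSD` field list.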
| -rw-r--r-- | include/jemalloc/internal/arena_externs.h | 2 |
| -rw-r--r-- | include/jemalloc/internal/jemalloc_internal.h.in | 69 |
| -rw-r--r-- | include/jemalloc/internal/rtree_structs.h | 3 |
| -rw-r--r-- | include/jemalloc/internal/tcache_inlines.h | 12 |
| -rw-r--r-- | include/jemalloc/internal/tcache_structs.h | 30 |
| -rw-r--r-- | include/jemalloc/internal/tcache_types.h | 7 |
| -rw-r--r-- | include/jemalloc/internal/tsd_structs.h | 55 |
| -rw-r--r-- | include/jemalloc/internal/tsd_types.h | 14 |
| -rw-r--r-- | src/arena.c | 16 |
| -rw-r--r-- | src/tcache.c | 76 |
10 files changed, 185 insertions, 99 deletions
diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index a35fe18..0f86dc0 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -51,7 +51,7 @@ bool arena_muzzy_decay_time_set(tsdn_t *tsdn, arena_t *arena,
 void arena_decay(tsdn_t *tsdn, arena_t *arena, bool all);
 void arena_reset(tsd_t *tsd, arena_t *arena);
 void arena_destroy(tsd_t *tsd, arena_t *arena);
-void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena,
+void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
     tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
 void arena_alloc_junk_small(void *ptr, const arena_bin_info_t *bin_info,
     bool zero);
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 3b137fc..c00912b 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -538,33 +538,35 @@ bool malloc_initialized(void);
 #include "jemalloc/internal/mutex_inlines.h"

 #ifndef JEMALLOC_ENABLE_INLINE
-pszind_t psz2ind(size_t psz);
-size_t pind2sz_compute(pszind_t pind);
-size_t pind2sz_lookup(pszind_t pind);
-size_t pind2sz(pszind_t pind);
-size_t psz2u(size_t psz);
-szind_t size2index_compute(size_t size);
-szind_t size2index_lookup(size_t size);
-szind_t size2index(size_t size);
-size_t index2size_compute(szind_t index);
-size_t index2size_lookup(szind_t index);
-size_t index2size(szind_t index);
-size_t s2u_compute(size_t size);
-size_t s2u_lookup(size_t size);
-size_t s2u(size_t size);
-size_t sa2u(size_t size, size_t alignment);
-arena_t *arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal);
-arena_t *arena_choose(tsd_t *tsd, arena_t *arena);
-arena_t *arena_ichoose(tsd_t *tsd, arena_t *arena);
-arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind,
+pszind_t psz2ind(size_t psz);
+size_t pind2sz_compute(pszind_t pind);
+size_t pind2sz_lookup(pszind_t pind);
+size_t pind2sz(pszind_t pind);
+size_t psz2u(size_t psz);
+szind_t size2index_compute(size_t size);
+szind_t size2index_lookup(size_t size);
+szind_t size2index(size_t size);
+size_t index2size_compute(szind_t index);
+size_t index2size_lookup(szind_t index);
+size_t index2size(szind_t index);
+size_t s2u_compute(size_t size);
+size_t s2u_lookup(size_t size);
+size_t s2u(size_t size);
+size_t sa2u(size_t size, size_t alignment);
+arena_t *arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal);
+arena_t *arena_choose(tsd_t *tsd, arena_t *arena);
+arena_t *arena_ichoose(tsd_t *tsd, arena_t *arena);
+arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind,
     bool refresh_if_missing);
-arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing);
-ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind);
-bool tcache_available(tsd_t *tsd);
-tcache_t *tcache_get(tsd_t *tsd);
-malloc_cpuid_t malloc_getcpu(void);
-unsigned percpu_arena_choose(void);
-unsigned percpu_arena_ind_limit(void);
+arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing);
+ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind);
+bool tcache_available(tsd_t *tsd);
+tcache_bin_t *tcache_small_bin_get(tcache_t *tcache, szind_t binind);
+tcache_bin_t *tcache_large_bin_get(tcache_t *tcache, szind_t binind);
+tcache_t *tcache_get(tsd_t *tsd);
+malloc_cpuid_t malloc_getcpu(void);
+unsigned percpu_arena_choose(void);
+unsigned percpu_arena_ind_limit(void);
 #endif

 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
@@ -933,6 +935,18 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) {
     return &tdata->decay_ticker;
 }

+JEMALLOC_ALWAYS_INLINE tcache_bin_t *
+tcache_small_bin_get(tcache_t *tcache, szind_t binind) {
+    assert(binind < NBINS);
+    return &tcache->tbins_small[binind];
+}
+
+JEMALLOC_ALWAYS_INLINE tcache_bin_t *
+tcache_large_bin_get(tcache_t *tcache, szind_t binind) {
+    assert(binind >= NBINS && binind < nhbins);
+    return &tcache->tbins_large[binind - NBINS];
+}
+
 JEMALLOC_ALWAYS_INLINE bool
 tcache_available(tsd_t *tsd) {
     cassert(config_tcache);
@@ -945,7 +959,8 @@ tcache_available(tsd_t *tsd) {
     if (likely(tsd_tcache_enabled_get(tsd) == true)) {
         /* Associated arena == null implies tcache init in progress. */
         if (tsd_tcachep_get(tsd)->arena != NULL) {
-            assert(tsd_tcachep_get(tsd)->tbins[0].avail != NULL);
+            assert(tcache_small_bin_get(tsd_tcachep_get(tsd),
+                0)->avail != NULL);
         }
         return true;
     }
diff --git a/include/jemalloc/internal/rtree_structs.h b/include/jemalloc/internal/rtree_structs.h
index 8dd9cda..123248a 100644
--- a/include/jemalloc/internal/rtree_structs.h
+++ b/include/jemalloc/internal/rtree_structs.h
@@ -53,9 +53,6 @@ struct rtree_ctx_cache_elm_s {
 };

 struct rtree_ctx_s {
-#ifndef _MSC_VER
-    JEMALLOC_ALIGNED(CACHELINE)
-#endif
     rtree_ctx_cache_elm_t cache[RTREE_CTX_NCACHE];
 };

diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h
index 929d8a7..dae43f9 100644
--- a/include/jemalloc/internal/tcache_inlines.h
+++ b/include/jemalloc/internal/tcache_inlines.h
@@ -73,7 +73,7 @@ tcache_alloc_easy(tcache_bin_t *tbin, bool *tcache_success) {
     ret = *(tbin->avail - tbin->ncached);
     tbin->ncached--;

-    if (unlikely((int)tbin->ncached < tbin->low_water)) {
+    if (unlikely((low_water_t)tbin->ncached < tbin->low_water)) {
         tbin->low_water = tbin->ncached;
     }

@@ -89,7 +89,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
     size_t usize JEMALLOC_CC_SILENCE_INIT(0);

     assert(binind < NBINS);
-    tbin = &tcache->tbins[binind];
+    tbin = tcache_small_bin_get(tcache, binind);
     ret = tcache_alloc_easy(tbin, &tcache_success);
     assert(tcache_success == (ret != NULL));
     if (unlikely(!tcache_success)) {
@@ -150,8 +150,8 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
     tcache_bin_t *tbin;
     bool tcache_success;

-    assert(binind < nhbins);
-    tbin = &tcache->tbins[binind];
+    assert(binind >= NBINS && binind < nhbins);
+    tbin = tcache_large_bin_get(tcache, binind);
     ret = tcache_alloc_easy(tbin, &tcache_success);
     assert(tcache_success == (ret != NULL));
     if (unlikely(!tcache_success)) {
@@ -215,7 +215,7 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
         arena_dalloc_junk_small(ptr, &arena_bin_info[binind]);
     }

-    tbin = &tcache->tbins[binind];
+    tbin = tcache_small_bin_get(tcache, binind);
     tbin_info = &tcache_bin_info[binind];
     if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
         tcache_bin_flush_small(tsd, tcache, tbin, binind,
@@ -241,7 +241,7 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
         large_dalloc_junk(ptr, index2size(binind));
     }

-    tbin = &tcache->tbins[binind];
+    tbin = tcache_large_bin_get(tcache, binind);
     tbin_info = &tcache_bin_info[binind];
     if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
         tcache_bin_flush_large(tsd, tbin, binind,
diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h
index d7ec4b6..4e10160 100644
--- a/include/jemalloc/internal/tcache_structs.h
+++ b/include/jemalloc/internal/tcache_structs.h
@@ -10,10 +10,14 @@ struct tcache_bin_info_s {
 };

 struct tcache_bin_s {
+    low_water_t low_water;  /* Min # cached since last GC. */
+    uint32_t ncached;       /* # of cached objects. */
+    /*
+     * ncached and stats are both modified frequently.  Let's keep them
+     * close so that they have a higher chance of being on the same
+     * cacheline, thus less write-backs.
+     */
     tcache_bin_stats_t tstats;
-    int low_water;          /* Min # cached since last GC. */
-    unsigned lg_fill_div;   /* Fill (ncached_max >> lg_fill_div). */
-    unsigned ncached;       /* # of cached objects. */
     /*
      * To make use of adjacent cacheline prefetch, the items in the avail
      * stack goes to higher address for newer allocations.  avail points
@@ -25,11 +29,9 @@ struct tcache_bin_s {
 };

 struct tcache_s {
-    ql_elm(tcache_t) link;      /* Used for aggregating stats. */
+    /* Data accessed frequently first: prof, ticker and small bins. */
     uint64_t prof_accumbytes;   /* Cleared after arena_prof_accum(). */
     ticker_t gc_ticker;         /* Drives incremental GC. */
-    szind_t next_gc_bin;        /* Next bin to GC. */
-    arena_t *arena;             /* Associated arena. */
     /*
      * The pointer stacks associated with tbins follow as a contiguous
      * array.  During tcache initialization, the avail pointer in each
@@ -37,9 +39,21 @@ struct tcache_s {
      * this array.
      */
 #ifdef JEMALLOC_TCACHE
-    tcache_bin_t tbins[NSIZES];
+    tcache_bin_t tbins_small[NBINS];
+#else
+    tcache_bin_t tbins_small[0];
+#endif
+    /* Data accessed less often below. */
+    ql_elm(tcache_t) link;      /* Used for aggregating stats. */
+    arena_t *arena;             /* Associated arena. */
+    szind_t next_gc_bin;        /* Next bin to GC. */
+#ifdef JEMALLOC_TCACHE
+    /* For small bins, fill (ncached_max >> lg_fill_div). */
+    uint8_t lg_fill_div[NBINS];
+    tcache_bin_t tbins_large[NSIZES-NBINS];
 #else
-    tcache_bin_t tbins[0];
+    uint8_t lg_fill_div[0];
+    tcache_bin_t tbins_large[0];
 #endif
 };
diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h
index 70f8960..a60db6f 100644
--- a/include/jemalloc/internal/tcache_types.h
+++ b/include/jemalloc/internal/tcache_types.h
@@ -6,6 +6,9 @@ typedef struct tcache_bin_s tcache_bin_t;
 typedef struct tcache_s tcache_t;
 typedef struct tcaches_s tcaches_t;

+/* ncached is cast to this type for comparison. */
+typedef int32_t low_water_t;
+
 /*
  * tcache pointers close to NULL are used to encode state information that is
  * used for two purposes: preventing thread caching on a per thread basis and
@@ -48,9 +51,9 @@ typedef struct tcaches_s tcaches_t;
     ((TCACHE_GC_SWEEP / NBINS) + ((TCACHE_GC_SWEEP / NBINS == 0) ? 0 : 1))

 /* Used in TSD static initializer only. Real init in tcache_data_init(). */
-#define TCACHE_ZERO_INITIALIZER {{NULL}}
+#define TCACHE_ZERO_INITIALIZER {0}

 /* Used in TSD static initializer only. Will be initialized to opt_tcache. */
-#define TCACHE_ENABLED_DEFAULT false
+#define TCACHE_ENABLED_ZERO_INITIALIZER false

 #endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */
diff --git a/include/jemalloc/internal/tsd_structs.h b/include/jemalloc/internal/tsd_structs.h
index f327c76..2dca0bd 100644
--- a/include/jemalloc/internal/tsd_structs.h
+++ b/include/jemalloc/internal/tsd_structs.h
@@ -14,19 +14,54 @@ struct tsd_init_head_s {
 };
 #endif

+/*
+ * Thread-Specific-Data layout
+ * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
+ * s: state
+ * e: tcache_enabled
+ * m: thread_allocated (config_stats)
+ * f: thread_deallocated (config_stats)
+ * p: prof_tdata (config_prof)
+ * c: rtree_ctx (rtree cache accessed on deallocation)
+ * t: tcache
+ * --- data not accessed on tcache fast path: arena related fields ---
+ * d: arenas_tdata_bypass
+ * r: narenas_tdata
+ * x: blank space (1 byte)
+ * i: iarena
+ * a: arena
+ * o: arenas_tdata
+ * Loading TSD data is on the critical path of basically all malloc operations.
+ * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
+ * Use a compact layout to reduce cache footprint.
+ * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
+ * |---------------------------- 1st cacheline ----------------------------|
+ * | sedxrrrr mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
+ * |---------------------------- 2nd cacheline ----------------------------|
+ * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
+ * |---------------------------- 3nd cacheline ----------------------------|
+ * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
+ * +-------------------------------------------------------------------------+
+ * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
+ *
+ * The last 3 members (i, a and o) before tcache isn't really needed on tcache
+ * fast path.  However we have a number of unused tcache bins and witnesses
+ * (never touched unless config_debug) at the end of tcache, so we place them
+ * there to avoid breaking the cachelines and possibly paging in an extra page.
+ */
 #define MALLOC_TSD \
 /*  O(name,                 type,           [gs]et, init,   cleanup) */ \
-    O(tcache,               tcache_t,       yes,    no,     yes) \
+    O(tcache_enabled,       bool,           yes,    yes,    no) \
+    O(arenas_tdata_bypass,  bool,           no,     no,     no) \
+    O(narenas_tdata,        uint32_t,       yes,    no,     no) \
     O(thread_allocated,     uint64_t,       yes,    no,     no) \
     O(thread_deallocated,   uint64_t,       yes,    no,     no) \
     O(prof_tdata,           prof_tdata_t *, yes,    no,     yes) \
+    O(rtree_ctx,            rtree_ctx_t,    no,     yes,    no) \
     O(iarena,               arena_t *,      yes,    no,     yes) \
     O(arena,                arena_t *,      yes,    no,     yes) \
     O(arenas_tdata,         arena_tdata_t *,yes,    no,     yes) \
-    O(narenas_tdata,        unsigned,       yes,    no,     no) \
-    O(arenas_tdata_bypass,  bool,           no,     no,     no) \
-    O(tcache_enabled,       bool,           yes,    yes,    no) \
-    O(rtree_ctx,            rtree_ctx_t,    no,     yes,    no) \
+    O(tcache,               tcache_t,       yes,    no,     yes) \
     O(witnesses,            witness_list_t, no,     no,     yes) \
     O(rtree_leaf_elm_witnesses, rtree_leaf_elm_witness_tsd_t, \
         no,     no,     no) \
@@ -34,17 +69,17 @@ struct tsd_init_head_s {

 #define TSD_INITIALIZER { \
     tsd_state_uninitialized, \
-    TCACHE_ZERO_INITIALIZER, \
+    TCACHE_ENABLED_ZERO_INITIALIZER, \
+    false, \
+    0, \
     0, \
     0, \
     NULL, \
+    RTREE_CTX_ZERO_INITIALIZER, \
     NULL, \
     NULL, \
     NULL, \
-    0, \
-    false, \
-    TCACHE_ENABLED_DEFAULT, \
-    RTREE_CTX_ZERO_INITIALIZER, \
+    TCACHE_ZERO_INITIALIZER, \
     ql_head_initializer(witnesses), \
     RTREE_ELM_WITNESS_TSD_INITIALIZER, \
     false \
diff --git a/include/jemalloc/internal/tsd_types.h b/include/jemalloc/internal/tsd_types.h
index 29c6378..4d5fef5 100644
--- a/include/jemalloc/internal/tsd_types.h
+++ b/include/jemalloc/internal/tsd_types.h
@@ -17,12 +17,14 @@ typedef struct tsdn_s tsdn_t;

 #define TSDN_NULL ((tsdn_t *)0)

-typedef enum {
-    tsd_state_uninitialized,
-    tsd_state_nominal,
-    tsd_state_purgatory,
-    tsd_state_reincarnated
-} tsd_state_t;
+enum {
+    tsd_state_uninitialized = 0,
+    tsd_state_nominal = 1,
+    tsd_state_purgatory = 2,
+    tsd_state_reincarnated = 3
+};
+/* Manually limit tsd_state_t to a single byte. */
+typedef uint8_t tsd_state_t;

 /*
  * TLS/TSD-agnostic macro-based implementation of thread-specific data.  There
diff --git a/src/arena.c b/src/arena.c
index feb1f76..b78719e 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -287,8 +287,14 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
         atomic_store_zu(&astats->tcache_bytes, 0, ATOMIC_RELAXED);
         malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
         ql_foreach(tcache, &arena->tcache_ql, link) {
-            for (szind_t i = 0; i < nhbins; i++) {
-                tbin = &tcache->tbins[i];
+            szind_t i = 0;
+            for (; i < NBINS; i++) {
+                tbin = tcache_small_bin_get(tcache, i);
+                arena_stats_accum_zu(&astats->tcache_bytes,
+                    tbin->ncached * index2size(i));
+            }
+            for (; i < nhbins; i++) {
+                tbin = tcache_large_bin_get(tcache, i);
                 arena_stats_accum_zu(&astats->tcache_bytes,
                     tbin->ncached * index2size(i));
             }
@@ -1317,8 +1323,8 @@ arena_bin_malloc_hard(tsdn_t *tsdn, arena_t *arena, arena_bin_t *bin,
 }

 void
-arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_bin_t *tbin,
-    szind_t binind, uint64_t prof_accumbytes) {
+arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
+    tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes) {
     unsigned i, nfill;
     arena_bin_t *bin;

@@ -1330,7 +1336,7 @@ arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_bin_t *tbin,
     bin = &arena->bins[binind];
     malloc_mutex_lock(tsdn, &bin->lock);
     for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >>
-        tbin->lg_fill_div); i < nfill; i++) {
+        tcache->lg_fill_div[binind]); i < nfill; i++) {
         extent_t *slab;
         void *ptr;
         if ((slab = bin->slabcur) != NULL && extent_nfree_get(slab) >
diff --git a/src/tcache.c b/src/tcache.c
index b8ce4a0..34b46af 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -40,9 +40,13 @@ tcache_salloc(tsdn_t *tsdn, const void *ptr) {
 void
 tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
     szind_t binind = tcache->next_gc_bin;
-    tcache_bin_t *tbin = &tcache->tbins[binind];
-    tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
+    tcache_bin_t *tbin;
+    if (binind < NBINS) {
+        tbin = tcache_small_bin_get(tcache, binind);
+    } else {
+        tbin = tcache_large_bin_get(tcache, binind);
+    }

     if (tbin->low_water > 0) {
         /*
          * Flush (ceiling) 3/4 of the objects below the low water mark.
@@ -51,24 +55,26 @@ tcache_event_hard(tsd_t *tsd, tcache_t *tcache) {
             tcache_bin_flush_small(tsd, tcache, tbin, binind,
                 tbin->ncached - tbin->low_water +
                 (tbin->low_water >> 2));
+            /*
+             * Reduce fill count by 2X.  Limit lg_fill_div such that
+             * the fill count is always at least 1.
+             */
+            tcache_bin_info_t *tbin_info = &tcache_bin_info[binind];
+            if ((tbin_info->ncached_max >>
+                (tcache->lg_fill_div[binind] + 1)) >= 1) {
+                tcache->lg_fill_div[binind]++;
+            }
         } else {
             tcache_bin_flush_large(tsd, tbin, binind, tbin->ncached
                 - tbin->low_water + (tbin->low_water >> 2), tcache);
         }
-        /*
-         * Reduce fill count by 2X.  Limit lg_fill_div such that the
-         * fill count is always at least 1.
-         */
-        if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1)) >= 1) {
-            tbin->lg_fill_div++;
-        }
     } else if (tbin->low_water < 0) {
         /*
-         * Increase fill count by 2X.  Make sure lg_fill_div stays
-         * greater than 0.
+         * Increase fill count by 2X for small bins.  Make sure
+         * lg_fill_div stays greater than 0.
          */
-        if (tbin->lg_fill_div > 1) {
-            tbin->lg_fill_div--;
+        if (binind < NBINS && tcache->lg_fill_div[binind] > 1) {
+            tcache->lg_fill_div[binind]--;
         }
     }
     tbin->low_water = tbin->ncached;
@@ -85,8 +91,8 @@ tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
     void *ret;

     assert(tcache->arena);
-    arena_tcache_fill_small(tsdn, arena, tbin, binind, config_prof ?
-        tcache->prof_accumbytes : 0);
+    arena_tcache_fill_small(tsdn, arena, tcache, tbin, binind,
+        config_prof ? tcache->prof_accumbytes : 0);
     if (config_prof) {
         tcache->prof_accumbytes = 0;
     }
@@ -175,7 +181,7 @@ tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, tcache_bin_t *tbin,
     memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
         sizeof(void *));
     tbin->ncached = rem;
-    if ((int)tbin->ncached < tbin->low_water) {
+    if ((low_water_t)tbin->ncached < tbin->low_water) {
         tbin->low_water = tbin->ncached;
     }
 }
@@ -273,7 +279,7 @@ tcache_bin_flush_large(tsd_t *tsd, tcache_bin_t *tbin, szind_t binind,
     memmove(tbin->avail - rem, tbin->avail - tbin->ncached, rem *
         sizeof(void *));
     tbin->ncached = rem;
-    if ((int)tbin->ncached < tbin->low_water) {
+    if ((low_water_t)tbin->ncached < tbin->low_water) {
         tbin->low_water = tbin->ncached;
     }
 }
@@ -347,17 +353,24 @@ tcache_init(tsd_t *tsd, tcache_t *tcache, void *avail_stack) {
     size_t stack_offset = 0;

     assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0);
-    memset(tcache->tbins, 0, sizeof(tcache_bin_t) * nhbins);
-    for (unsigned i = 0; i < nhbins; i++) {
-        tcache->tbins[i].lg_fill_div = 1;
+    memset(tcache->tbins_small, 0, sizeof(tcache_bin_t) * NBINS);
+    memset(tcache->tbins_large, 0, sizeof(tcache_bin_t) * (nhbins - NBINS));
+    unsigned i = 0;
+    for (; i < NBINS; i++) {
+        tcache->lg_fill_div[i] = 1;
         stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
         /*
          * avail points past the available space.  Allocations will
          * access the slots toward higher addresses (for the benefit of
          * prefetch).
          */
-        tcache->tbins[i].avail = (void **)((uintptr_t)avail_stack +
-            (uintptr_t)stack_offset);
+        tcache_small_bin_get(tcache, i)->avail =
+            (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
+    }
+    for (; i < nhbins; i++) {
+        stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
+        tcache_large_bin_get(tcache, i)->avail =
+            (void **)((uintptr_t)avail_stack + (uintptr_t)stack_offset);
     }
     assert(stack_offset == stack_nelms * sizeof(void *));
 }
@@ -370,7 +383,7 @@ tsd_tcache_data_init(tsd_t *tsd) {
     }

     tcache_t *tcache = &tsd->tcache;
-    assert(tcache->tbins[0].avail == NULL);
+    assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
     size_t size = stack_nelms * sizeof(void *);
     /* Avoid false cacheline sharing. */
     size = sa2u(size, CACHELINE);
@@ -443,7 +456,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
     unsigned i;

     for (i = 0; i < NBINS; i++) {
-        tcache_bin_t *tbin = &tcache->tbins[i];
+        tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
         tcache_bin_flush_small(tsd, tcache, tbin, i, 0);

         if (config_stats) {
@@ -451,7 +464,7 @@ tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
         }
     }
     for (; i < nhbins; i++) {
-        tcache_bin_t *tbin = &tcache->tbins[i];
+        tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
         tcache_bin_flush_large(tsd, tbin, i, 0, tcache);

         if (config_stats) {
@@ -483,7 +496,8 @@ tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) {

     if (tsd_tcache) {
         /* Release the avail array for the TSD embedded auto tcache. */
-        void *avail_array = (void *)((uintptr_t)tcache->tbins[0].avail -
+        void *avail_array =
+            (void *)((uintptr_t)tcache_small_bin_get(tcache, 0)->avail -
             (uintptr_t)tcache_bin_info[0].ncached_max * sizeof(void *));
         idalloctm(tsd_tsdn(tsd), avail_array, NULL, true, true);
     } else {
@@ -503,16 +517,16 @@ tcache_cleanup(tsd_t *tsd) {
     if (!tcache_available(tsd)) {
         assert(tsd_tcache_enabled_get(tsd) == false);
         if (config_debug) {
-            assert(tcache->tbins[0].avail == NULL);
+            assert(tcache_small_bin_get(tcache, 0)->avail == NULL);
         }
         return;
     }
     assert(tsd_tcache_enabled_get(tsd));
-    assert(tcache->tbins[0].avail != NULL);
+    assert(tcache_small_bin_get(tcache, 0)->avail != NULL);

     tcache_destroy(tsd, tcache, true);
     if (config_debug) {
-        tcache->tbins[0].avail = NULL;
+        tcache_small_bin_get(tcache, 0)->avail = NULL;
     }
 }

@@ -525,7 +539,7 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
     /* Merge and reset tcache stats. */
     for (i = 0; i < NBINS; i++) {
         arena_bin_t *bin = &arena->bins[i];
-        tcache_bin_t *tbin = &tcache->tbins[i];
+        tcache_bin_t *tbin = tcache_small_bin_get(tcache, i);
         malloc_mutex_lock(tsdn, &bin->lock);
         bin->stats.nrequests += tbin->tstats.nrequests;
         malloc_mutex_unlock(tsdn, &bin->lock);
@@ -533,7 +547,7 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
     }

     for (; i < nhbins; i++) {
-        tcache_bin_t *tbin = &tcache->tbins[i];
+        tcache_bin_t *tbin = tcache_large_bin_get(tcache, i);
         arena_stats_large_nrequests_add(tsdn, &arena->stats, i,
             tbin->tstats.nrequests);
         tbin->tstats.nrequests = 0;
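A side note on the `(low_water_t)` casts in the flush paths above: `ncached` is now unsigned (`uint32_t`) while `low_water` is a signed `int32_t` that can legitimately go negative (see the `tbin->low_water < 0` branch in `tcache_event_hard`). Comparing the two without a cast would go through the usual arithmetic conversions and misbehave. A small standalone sketch of that pitfall, using illustrative local variables rather than jemalloc's structs:

```c
#include <stdint.h>
#include <stdio.h>

typedef int32_t low_water_t; /* as defined in tcache_types.h above */

int main(void) {
    uint32_t    ncached   = 0;  /* bin has just gone empty */
    low_water_t low_water = -1; /* low_water may be negative between GC passes */

    /* Mixed signed/unsigned comparison: -1 is converted to UINT32_MAX, so the
     * check is wrongly true (compilers flag this with -Wsign-compare). */
    printf("without cast: %d\n", ncached < low_water);              /* prints 1 */

    /* The cast used in the diff keeps both operands signed. */
    printf("with cast:    %d\n", (low_water_t)ncached < low_water); /* prints 0 */
    return 0;
}
```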