author    Qi Wang <interwq@gwu.edu>    2017-04-06 19:35:22 (GMT)
committer Qi Wang <interwq@gmail.com>  2017-04-07 21:06:17 (GMT)
commit    36bd90b96212772f1adbd421a6b091b542278995 (patch)
tree      b9b833c8124a4bd8615064cd746d4e8a3dccb0c6 /include/jemalloc
parent    4dec507546040896338d8bbdb2075c7ad3a4b9f3 (diff)
Optimizing TSD and thread cache layout.
1) Re-organize TSD so that frequently accessed fields are closer to the beginning and more compact. Assuming 64-bit, the first 2.5 cachelines now contain everything needed on the tcache fast path, except the tcache struct itself.
2) Re-organize tcache and tbins. Take lg_fill_div out of tbin, and reduce tbin to 24 bytes (down from 32). Split tbins into tbins_small and tbins_large, and place tbins_small close to the beginning.
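For reference, the 32 to 24 byte figure follows from the field sizes on a 64-bit target, assuming tcache_bin_stats_t holds a single uint64_t counter: pointers and uint64_t take 8 bytes, low_water_t and uint32_t take 4. A rough accounting sketch (illustrative ordering only; see the tcache_structs.h diff below for the real declaration):

    /* Before: 32 bytes, including 4 bytes of padding before avail. */
    struct tcache_bin_before {
        tcache_bin_stats_t tstats;      /* 8 */
        int                low_water;   /* 4 */
        unsigned           lg_fill_div; /* 4 */
        unsigned           ncached;     /* 4 */
                                        /* 4 bytes padding */
        void             **avail;       /* 8 */
    };
    /* After: 24 bytes, no padding; lg_fill_div moves into tcache_t. */
    struct tcache_bin_after {
        low_water_t        low_water;   /* 4 */
        uint32_t           ncached;     /* 4 */
        tcache_bin_stats_t tstats;      /* 8 */
        void             **avail;       /* 8 */
    };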
Diffstat (limited to 'include/jemalloc')
-rw-r--r--  include/jemalloc/internal/arena_externs.h         |  2
-rw-r--r--  include/jemalloc/internal/jemalloc_internal.h.in  | 69
-rw-r--r--  include/jemalloc/internal/rtree_structs.h         |  3
-rw-r--r--  include/jemalloc/internal/tcache_inlines.h        | 12
-rw-r--r--  include/jemalloc/internal/tcache_structs.h        | 30
-rw-r--r--  include/jemalloc/internal/tcache_types.h          |  7
-rw-r--r--  include/jemalloc/internal/tsd_structs.h           | 55
-rw-r--r--  include/jemalloc/internal/tsd_types.h             | 14
8 files changed, 129 insertions, 63 deletions
diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index a35fe18..0f86dc0 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -51,7 +51,7 @@ bool arena_muzzy_decay_time_set(tsdn_t *tsdn, arena_t *arena,
void arena_decay(tsdn_t *tsdn, arena_t *arena, bool all);
void arena_reset(tsd_t *tsd, arena_t *arena);
void arena_destroy(tsd_t *tsd, arena_t *arena);
-void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena,
+void arena_tcache_fill_small(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache,
tcache_bin_t *tbin, szind_t binind, uint64_t prof_accumbytes);
void arena_alloc_junk_small(void *ptr, const arena_bin_info_t *bin_info,
bool zero);
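The new tcache_t * argument is consistent with lg_fill_div moving out of tcache_bin_t and into a per-tcache array (see the tcache_structs.h diff below): the small-bin fill path now needs the tcache to compute its batch size. A minimal sketch of that computation, using the tcache_bin_info table referenced elsewhere in this diff; the actual fill loop lives in arena.c and is not shown here:

    /* Hypothetical helper: objects to request when refilling a small bin. */
    static unsigned
    tcache_fill_count(tcache_t *tcache, szind_t binind) {
        assert(binind < NBINS);
        return tcache_bin_info[binind].ncached_max >>
            tcache->lg_fill_div[binind];
    }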
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 3b137fc..c00912b 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -538,33 +538,35 @@ bool malloc_initialized(void);
#include "jemalloc/internal/mutex_inlines.h"
#ifndef JEMALLOC_ENABLE_INLINE
-pszind_t psz2ind(size_t psz);
-size_t pind2sz_compute(pszind_t pind);
-size_t pind2sz_lookup(pszind_t pind);
-size_t pind2sz(pszind_t pind);
-size_t psz2u(size_t psz);
-szind_t size2index_compute(size_t size);
-szind_t size2index_lookup(size_t size);
-szind_t size2index(size_t size);
-size_t index2size_compute(szind_t index);
-size_t index2size_lookup(szind_t index);
-size_t index2size(szind_t index);
-size_t s2u_compute(size_t size);
-size_t s2u_lookup(size_t size);
-size_t s2u(size_t size);
-size_t sa2u(size_t size, size_t alignment);
-arena_t *arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal);
-arena_t *arena_choose(tsd_t *tsd, arena_t *arena);
-arena_t *arena_ichoose(tsd_t *tsd, arena_t *arena);
-arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind,
+pszind_t psz2ind(size_t psz);
+size_t pind2sz_compute(pszind_t pind);
+size_t pind2sz_lookup(pszind_t pind);
+size_t pind2sz(pszind_t pind);
+size_t psz2u(size_t psz);
+szind_t size2index_compute(size_t size);
+szind_t size2index_lookup(size_t size);
+szind_t size2index(size_t size);
+size_t index2size_compute(szind_t index);
+size_t index2size_lookup(szind_t index);
+size_t index2size(szind_t index);
+size_t s2u_compute(size_t size);
+size_t s2u_lookup(size_t size);
+size_t s2u(size_t size);
+size_t sa2u(size_t size, size_t alignment);
+arena_t *arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal);
+arena_t *arena_choose(tsd_t *tsd, arena_t *arena);
+arena_t *arena_ichoose(tsd_t *tsd, arena_t *arena);
+arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind,
bool refresh_if_missing);
-arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing);
-ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind);
-bool tcache_available(tsd_t *tsd);
-tcache_t *tcache_get(tsd_t *tsd);
-malloc_cpuid_t malloc_getcpu(void);
-unsigned percpu_arena_choose(void);
-unsigned percpu_arena_ind_limit(void);
+arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing);
+ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind);
+bool tcache_available(tsd_t *tsd);
+tcache_bin_t *tcache_small_bin_get(tcache_t *tcache, szind_t binind);
+tcache_bin_t *tcache_large_bin_get(tcache_t *tcache, szind_t binind);
+tcache_t *tcache_get(tsd_t *tsd);
+malloc_cpuid_t malloc_getcpu(void);
+unsigned percpu_arena_choose(void);
+unsigned percpu_arena_ind_limit(void);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
@@ -933,6 +935,18 @@ decay_ticker_get(tsd_t *tsd, unsigned ind) {
return &tdata->decay_ticker;
}
+JEMALLOC_ALWAYS_INLINE tcache_bin_t *
+tcache_small_bin_get(tcache_t *tcache, szind_t binind) {
+ assert(binind < NBINS);
+ return &tcache->tbins_small[binind];
+}
+
+JEMALLOC_ALWAYS_INLINE tcache_bin_t *
+tcache_large_bin_get(tcache_t *tcache, szind_t binind) {
+ assert(binind >= NBINS && binind < nhbins);
+ return &tcache->tbins_large[binind - NBINS];
+}
+
JEMALLOC_ALWAYS_INLINE bool
tcache_available(tsd_t *tsd) {
cassert(config_tcache);
@@ -945,7 +959,8 @@ tcache_available(tsd_t *tsd) {
if (likely(tsd_tcache_enabled_get(tsd) == true)) {
/* Associated arena == null implies tcache init in progress. */
if (tsd_tcachep_get(tsd)->arena != NULL) {
- assert(tsd_tcachep_get(tsd)->tbins[0].avail != NULL);
+ assert(tcache_small_bin_get(tsd_tcachep_get(tsd),
+ 0)->avail != NULL);
}
return true;
}
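With the bins split into two arrays, a caller selects the accessor by comparing binind against NBINS: small bins occupy indices 0..NBINS-1 of tbins_small, and large bins NBINS..nhbins-1 are stored in tbins_large at binind - NBINS. A hypothetical wrapper, shown only to make the index mapping explicit (the call sites in tcache_inlines.h below already know statically which side they are on):

    /* Hypothetical convenience wrapper, not part of this commit. */
    JEMALLOC_ALWAYS_INLINE tcache_bin_t *
    tcache_bin_get(tcache_t *tcache, szind_t binind) {
        if (binind < NBINS) {
            return tcache_small_bin_get(tcache, binind);
        }
        return tcache_large_bin_get(tcache, binind);
    }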
diff --git a/include/jemalloc/internal/rtree_structs.h b/include/jemalloc/internal/rtree_structs.h
index 8dd9cda..123248a 100644
--- a/include/jemalloc/internal/rtree_structs.h
+++ b/include/jemalloc/internal/rtree_structs.h
@@ -53,9 +53,6 @@ struct rtree_ctx_cache_elm_s {
};
struct rtree_ctx_s {
-#ifndef _MSC_VER
- JEMALLOC_ALIGNED(CACHELINE)
-#endif
rtree_ctx_cache_elm_t cache[RTREE_CTX_NCACHE];
};
diff --git a/include/jemalloc/internal/tcache_inlines.h b/include/jemalloc/internal/tcache_inlines.h
index 929d8a7..dae43f9 100644
--- a/include/jemalloc/internal/tcache_inlines.h
+++ b/include/jemalloc/internal/tcache_inlines.h
@@ -73,7 +73,7 @@ tcache_alloc_easy(tcache_bin_t *tbin, bool *tcache_success) {
ret = *(tbin->avail - tbin->ncached);
tbin->ncached--;
- if (unlikely((int)tbin->ncached < tbin->low_water)) {
+ if (unlikely((low_water_t)tbin->ncached < tbin->low_water)) {
tbin->low_water = tbin->ncached;
}
@@ -89,7 +89,7 @@ tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
size_t usize JEMALLOC_CC_SILENCE_INIT(0);
assert(binind < NBINS);
- tbin = &tcache->tbins[binind];
+ tbin = tcache_small_bin_get(tcache, binind);
ret = tcache_alloc_easy(tbin, &tcache_success);
assert(tcache_success == (ret != NULL));
if (unlikely(!tcache_success)) {
@@ -150,8 +150,8 @@ tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size,
tcache_bin_t *tbin;
bool tcache_success;
- assert(binind < nhbins);
- tbin = &tcache->tbins[binind];
+ assert(binind >= NBINS && binind < nhbins);
+ tbin = tcache_large_bin_get(tcache, binind);
ret = tcache_alloc_easy(tbin, &tcache_success);
assert(tcache_success == (ret != NULL));
if (unlikely(!tcache_success)) {
@@ -215,7 +215,7 @@ tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
arena_dalloc_junk_small(ptr, &arena_bin_info[binind]);
}
- tbin = &tcache->tbins[binind];
+ tbin = tcache_small_bin_get(tcache, binind);
tbin_info = &tcache_bin_info[binind];
if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
tcache_bin_flush_small(tsd, tcache, tbin, binind,
@@ -241,7 +241,7 @@ tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind,
large_dalloc_junk(ptr, index2size(binind));
}
- tbin = &tcache->tbins[binind];
+ tbin = tcache_large_bin_get(tcache, binind);
tbin_info = &tcache_bin_info[binind];
if (unlikely(tbin->ncached == tbin_info->ncached_max)) {
tcache_bin_flush_large(tsd, tbin, binind,
diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h
index d7ec4b6..4e10160 100644
--- a/include/jemalloc/internal/tcache_structs.h
+++ b/include/jemalloc/internal/tcache_structs.h
@@ -10,10 +10,14 @@ struct tcache_bin_info_s {
};
struct tcache_bin_s {
+ low_water_t low_water; /* Min # cached since last GC. */
+ uint32_t ncached; /* # of cached objects. */
+ /*
+ * ncached and stats are both modified frequently. Let's keep them
+ * close so that they have a higher chance of being on the same
+ * cacheline, thus fewer write-backs.
+ */
tcache_bin_stats_t tstats;
- int low_water; /* Min # cached since last GC. */
- unsigned lg_fill_div; /* Fill (ncached_max >> lg_fill_div). */
- unsigned ncached; /* # of cached objects. */
/*
* To make use of adjacent cacheline prefetch, the items in the avail
* stack go to higher addresses for newer allocations. avail points
@@ -25,11 +29,9 @@ struct tcache_bin_s {
};
struct tcache_s {
- ql_elm(tcache_t) link; /* Used for aggregating stats. */
+ /* Data accessed frequently first: prof, ticker and small bins. */
uint64_t prof_accumbytes;/* Cleared after arena_prof_accum(). */
ticker_t gc_ticker; /* Drives incremental GC. */
- szind_t next_gc_bin; /* Next bin to GC. */
- arena_t *arena; /* Associated arena. */
/*
* The pointer stacks associated with tbins follow as a contiguous
* array. During tcache initialization, the avail pointer in each
@@ -37,9 +39,21 @@ struct tcache_s {
* this array.
*/
#ifdef JEMALLOC_TCACHE
- tcache_bin_t tbins[NSIZES];
+ tcache_bin_t tbins_small[NBINS];
+#else
+ tcache_bin_t tbins_small[0];
+#endif
+ /* Data accessed less often below. */
+ ql_elm(tcache_t) link; /* Used for aggregating stats. */
+ arena_t *arena; /* Associated arena. */
+ szind_t next_gc_bin; /* Next bin to GC. */
+#ifdef JEMALLOC_TCACHE
+ /* For small bins, fill (ncached_max >> lg_fill_div). */
+ uint8_t lg_fill_div[NBINS];
+ tcache_bin_t tbins_large[NSIZES-NBINS];
#else
- tcache_bin_t tbins[0];
+ uint8_t lg_fill_div[0];
+ tcache_bin_t tbins_large[0];
#endif
};
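The comment in the hunk above notes that the avail pointer stacks follow tcache_t as one contiguous array. A hedged sketch of how initialization could wire the split bins into that region; the names (tcache_bin_info, ncached_max, the new getters) come from this diff, but the body is illustrative rather than the actual tcache.c code:

    /* Illustrative only: point each bin's avail at the end of its slice of
     * the pointer-stack region, so cached items sit at avail[-ncached..-1]
     * as tcache_alloc_easy expects. */
    static void
    tcache_avail_init_sketch(tcache_t *tcache, void **stack_base) {
        void **cur = stack_base;
        for (szind_t i = 0; i < NBINS; i++) {
            cur += tcache_bin_info[i].ncached_max;
            tcache_small_bin_get(tcache, i)->avail = cur;
        }
        for (szind_t i = NBINS; i < nhbins; i++) {
            cur += tcache_bin_info[i].ncached_max;
            tcache_large_bin_get(tcache, i)->avail = cur;
        }
    }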
diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h
index 70f8960..a60db6f 100644
--- a/include/jemalloc/internal/tcache_types.h
+++ b/include/jemalloc/internal/tcache_types.h
@@ -6,6 +6,9 @@ typedef struct tcache_bin_s tcache_bin_t;
typedef struct tcache_s tcache_t;
typedef struct tcaches_s tcaches_t;
+/* ncached is cast to this type for comparison. */
+typedef int32_t low_water_t;
+
/*
* tcache pointers close to NULL are used to encode state information that is
* used for two purposes: preventing thread caching on a per thread basis and
@@ -48,9 +51,9 @@ typedef struct tcaches_s tcaches_t;
((TCACHE_GC_SWEEP / NBINS) + ((TCACHE_GC_SWEEP / NBINS == 0) ? 0 : 1))
/* Used in TSD static initializer only. Real init in tcache_data_init(). */
-#define TCACHE_ZERO_INITIALIZER {{NULL}}
+#define TCACHE_ZERO_INITIALIZER {0}
/* Used in TSD static initializer only. Will be initialized to opt_tcache. */
-#define TCACHE_ENABLED_DEFAULT false
+#define TCACHE_ENABLED_ZERO_INITIALIZER false
#endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */
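low_water is signed because it can drop below zero; in jemalloc's tcache code it is set to -1 when a bin runs empty, which is why tcache_alloc_easy casts the unsigned ncached to low_water_t before comparing. A standalone illustration of what that cast prevents (assumes only the typedef above):

    #include <assert.h>
    #include <stdint.h>
    typedef int32_t low_water_t;
    int
    main(void) {
        uint32_t ncached = 0;
        low_water_t low_water = -1;  /* bin was emptied */
        /* Signed compare: 0 < -1 is false, so low_water stays put. */
        assert(!((low_water_t)ncached < low_water));
        /* An unsigned compare would see -1 as UINT32_MAX and misfire. */
        assert(ncached < (uint32_t)low_water);
        return 0;
    }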
diff --git a/include/jemalloc/internal/tsd_structs.h b/include/jemalloc/internal/tsd_structs.h
index f327c76..2dca0bd 100644
--- a/include/jemalloc/internal/tsd_structs.h
+++ b/include/jemalloc/internal/tsd_structs.h
@@ -14,19 +14,54 @@ struct tsd_init_head_s {
};
#endif
+/*
+ * Thread-Specific-Data layout
+ * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
+ * s: state
+ * e: tcache_enabled
+ * m: thread_allocated (config_stats)
+ * f: thread_deallocated (config_stats)
+ * p: prof_tdata (config_prof)
+ * c: rtree_ctx (rtree cache accessed on deallocation)
+ * t: tcache
+ * --- data not accessed on tcache fast path: arena related fields ---
+ * d: arenas_tdata_bypass
+ * r: narenas_tdata
+ * x: blank space (1 byte)
+ * i: iarena
+ * a: arena
+ * o: arenas_tdata
+ * Loading TSD data is on the critical path of basically all malloc operations.
+ * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
+ * Use a compact layout to reduce cache footprint.
+ * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
+ * |---------------------------- 1st cacheline ----------------------------|
+ * | sedxrrrr mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
+ * |---------------------------- 2nd cacheline ----------------------------|
+ * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
+ * |---------------------------- 3rd cacheline ----------------------------|
+ * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
+ * +-------------------------------------------------------------------------+
+ * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
+ *
+ * The last 3 members (i, a and o) before tcache aren't really needed on the
+ * tcache fast path. However, we have a number of unused tcache bins and witnesses
+ * (never touched unless config_debug) at the end of tcache, so we place them
+ * there to avoid breaking the cachelines and possibly paging in an extra page.
+ */
#define MALLOC_TSD \
/* O(name, type, [gs]et, init, cleanup) */ \
- O(tcache, tcache_t, yes, no, yes) \
+ O(tcache_enabled, bool, yes, yes, no) \
+ O(arenas_tdata_bypass, bool, no, no, no) \
+ O(narenas_tdata, uint32_t, yes, no, no) \
O(thread_allocated, uint64_t, yes, no, no) \
O(thread_deallocated, uint64_t, yes, no, no) \
O(prof_tdata, prof_tdata_t *, yes, no, yes) \
+ O(rtree_ctx, rtree_ctx_t, no, yes, no) \
O(iarena, arena_t *, yes, no, yes) \
O(arena, arena_t *, yes, no, yes) \
O(arenas_tdata, arena_tdata_t *,yes, no, yes) \
- O(narenas_tdata, unsigned, yes, no, no) \
- O(arenas_tdata_bypass, bool, no, no, no) \
- O(tcache_enabled, bool, yes, yes, no) \
- O(rtree_ctx, rtree_ctx_t, no, yes, no) \
+ O(tcache, tcache_t, yes, no, yes) \
O(witnesses, witness_list_t, no, no, yes) \
O(rtree_leaf_elm_witnesses, rtree_leaf_elm_witness_tsd_t, \
no, no, no) \
@@ -34,17 +69,17 @@ struct tsd_init_head_s {
#define TSD_INITIALIZER { \
tsd_state_uninitialized, \
- TCACHE_ZERO_INITIALIZER, \
+ TCACHE_ENABLED_ZERO_INITIALIZER, \
+ false, \
+ 0, \
0, \
0, \
NULL, \
+ RTREE_CTX_ZERO_INITIALIZER, \
NULL, \
NULL, \
NULL, \
- 0, \
- false, \
- TCACHE_ENABLED_DEFAULT, \
- RTREE_CTX_ZERO_INITIALIZER, \
+ TCACHE_ZERO_INITIALIZER, \
ql_head_initializer(witnesses), \
RTREE_ELM_WITNESS_TSD_INITIALIZER, \
false \
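MALLOC_TSD is an X-macro list: struct tsd_s (defined further down in tsd_structs.h, outside this hunk) expands each O() entry into a field, so the order of the entries above is the in-memory order drawn in the layout comment. A simplified sketch of that expansion pattern, not the verbatim definition:

    /* Each O(name, type, ...) entry becomes one field, placed right after
     * the 1-byte state, matching the cacheline diagram above. */
    struct tsd_s {
        tsd_state_t state;
    #define O(n, t, gs, i, c) t n;
        MALLOC_TSD
    #undef O
    };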
diff --git a/include/jemalloc/internal/tsd_types.h b/include/jemalloc/internal/tsd_types.h
index 29c6378..4d5fef5 100644
--- a/include/jemalloc/internal/tsd_types.h
+++ b/include/jemalloc/internal/tsd_types.h
@@ -17,12 +17,14 @@ typedef struct tsdn_s tsdn_t;
#define TSDN_NULL ((tsdn_t *)0)
-typedef enum {
- tsd_state_uninitialized,
- tsd_state_nominal,
- tsd_state_purgatory,
- tsd_state_reincarnated
-} tsd_state_t;
+enum {
+ tsd_state_uninitialized = 0,
+ tsd_state_nominal = 1,
+ tsd_state_purgatory = 2,
+ tsd_state_reincarnated = 3
+};
+/* Manually limit tsd_state_t to a single byte. */
+typedef uint8_t tsd_state_t;
/*
* TLS/TSD-agnostic macro-based implementation of thread-specific data. There