author     Qi Wang <interwq@gwu.edu>      2017-02-03 01:02:05 (GMT)
committer  Qi Wang <interwq@gmail.com>    2017-03-09 07:19:01 (GMT)
commit     ec532e2c5c0b25fb7ab09383fe5a274583a90def (patch)
tree       b3306921b534baa43b1bb698d086041226d64d6e /include/jemalloc
parent     8721e19c0414dce0f47a627ff948130d4294b4d7 (diff)
Implement per-CPU arena.
The new feature, opt.percpu_arena, determines the thread-arena association
dynamically based on the CPU id. Three modes are supported: "percpu",
"phycpu", and "disabled".
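
For illustration, the mode can be selected at process startup through
jemalloc's global malloc_conf string. This is a minimal usage sketch, not
part of the change itself; it assumes the "percpu_arena" option name is
wired into the regular malloc_conf parsing (the opt_percpu_arena handling
this commit adds):

    /*
     * Sketch: opt in to per-CPU arenas for the whole process. The
     * "percpu_arena:percpu" string assumes this commit's option name.
     */
    const char *malloc_conf = "percpu_arena:percpu";

    #include <stdlib.h>

    int main(void) {
    	void *p = malloc(64);	/* served by the current CPU's arena */
    	free(p);
    	return 0;
    }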
"percpu" uses the current core id (with help from sched_getcpu())
directly as the arena index, while "phycpu" will assign threads on the
same physical CPU to the same arena. In other words, "percpu" means # of
arenas == # of CPUs, while "phycpu" has # of arenas == 1/2 * (# of
CPUs). Note that no runtime check on whether hyper threading is enabled
is added yet.
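
The mapping itself is a few lines; here is a standalone sketch mirroring
percpu_arena_choose from the diff below (ncpus and the phycpu flag are
plain parameters here rather than jemalloc globals):

    #include <stdbool.h>

    /*
     * Map a CPU id to an arena index. In "percpu" mode the mapping is the
     * identity; in "phycpu" mode hyper thread siblings cpuid and
     * cpuid - ncpus/2 land on the same arena.
     */
    static unsigned
    arena_ind_for_cpu(unsigned cpuid, unsigned ncpus, bool phycpu) {
    	if (!phycpu || cpuid < ncpus / 2) {
    		return cpuid;
    	}
    	return cpuid - ncpus / 2;
    }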
When the feature is enabled, threads are migrated between arenas whenever
a CPU change is detected. In the current design, to reduce the overhead of
reading the CPU id, each arena tracks the thread that accessed it most
recently; the CPU id is re-read, and the arena association updated if
necessary, only when a different thread comes in.
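
The caching scheme reduces to a single pointer compare on the fast path. A
sketch of the same idea, independent of jemalloc's tsd machinery (last_thd
below plays the role of the per-arena field added to arena_structs_b.h in
the diff; the thread identity is an opaque pointer):

    #include <stdbool.h>

    typedef struct {
    	const void *last_thd;	/* most recent thread seen on this arena */
    } arena_sketch_t;

    /*
     * Return whether the caller must re-read the CPU id: only when it is
     * not the thread that touched this arena last.
     */
    static bool
    must_check_cpu(arena_sketch_t *arena, const void *self) {
    	if (arena->last_thd == self) {
    		return false;	/* fast path: skip sched_getcpu() */
    	}
    	arena->last_thd = self;	/* new thread: record and recheck CPU */
    	return true;
    }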
Diffstat (limited to 'include/jemalloc')

-rw-r--r--  include/jemalloc/internal/arena_externs.h                4
-rw-r--r--  include/jemalloc/internal/arena_inlines_a.h             25
-rw-r--r--  include/jemalloc/internal/arena_structs_b.h              7
-rw-r--r--  include/jemalloc/internal/arena_types.h                 16
-rw-r--r--  include/jemalloc/internal/jemalloc_internal.h.in       123
-rw-r--r--  include/jemalloc/internal/jemalloc_internal_defs.h.in    3
-rw-r--r--  include/jemalloc/internal/private_symbols.txt            4

7 files changed, 162 insertions, 20 deletions
diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h
index 2df5518..349bae9 100644
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@@ -13,6 +13,10 @@ extern ssize_t opt_decay_time;
 
 extern const arena_bin_info_t arena_bin_info[NBINS];
 
+extern percpu_arena_mode_t percpu_arena_mode;
+extern const char *opt_percpu_arena;
+extern const char *percpu_arena_mode_names[];
+
 void arena_stats_large_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
     szind_t szind, uint64_t nrequests);
 void arena_stats_mapped_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
diff --git a/include/jemalloc/internal/arena_inlines_a.h b/include/jemalloc/internal/arena_inlines_a.h
index ea7e099..9dd5304 100644
--- a/include/jemalloc/internal/arena_inlines_a.h
+++ b/include/jemalloc/internal/arena_inlines_a.h
@@ -7,6 +7,7 @@ void arena_internal_add(arena_t *arena, size_t size);
 void arena_internal_sub(arena_t *arena, size_t size);
 size_t arena_internal_get(arena_t *arena);
 bool arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes);
+void percpu_arena_update(tsd_t *tsd, unsigned cpu);
 #endif /* JEMALLOC_ENABLE_INLINE */
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_))
@@ -42,6 +43,30 @@ arena_prof_accum(tsdn_t *tsdn, arena_t *arena, uint64_t accumbytes) {
 	return prof_accum_add(tsdn, &arena->prof_accum, accumbytes);
 }
 
+JEMALLOC_INLINE void
+percpu_arena_update(tsd_t *tsd, unsigned cpu) {
+	assert(have_percpu_arena);
+	arena_t *oldarena = tsd_arena_get(tsd);
+	assert(oldarena != NULL);
+	unsigned oldind = arena_ind_get(oldarena);
+
+	if (oldind != cpu) {
+		unsigned newind = cpu;
+		arena_t *newarena = arena_get(tsd_tsdn(tsd), newind, true);
+		assert(newarena != NULL);
+
+		/* Set new arena/tcache associations. */
+		arena_migrate(tsd, oldind, newind);
+		if (config_tcache) {
+			tcache_t *tcache = tsd_tcache_get(tsd);
+			if (tcache) {
+				tcache_arena_reassociate(tsd_tsdn(tsd), tcache,
+				    newarena);
+			}
+		}
+	}
+}
+
 #endif /* (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) */
 
 #endif /* JEMALLOC_INTERNAL_ARENA_INLINES_A_H */
diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h
index ebcdbc4..ba8bb8a 100644
--- a/include/jemalloc/internal/arena_structs_b.h
+++ b/include/jemalloc/internal/arena_structs_b.h
@@ -125,6 +125,13 @@ struct arena_s {
 	 */
 	unsigned		nthreads[2];
 
+	/*
+	 * When percpu_arena is enabled, to amortize the cost of reading /
+	 * updating the current CPU id, track the most recent thread accessing
+	 * this arena, and only read CPU if there is a mismatch.
+	 */
+	tsdn_t			*last_thd;
+
 	/* Synchronization: internal. */
 	arena_stats_t		stats;
diff --git a/include/jemalloc/internal/arena_types.h b/include/jemalloc/internal/arena_types.h
index d821be4..067c9ee 100644
--- a/include/jemalloc/internal/arena_types.h
+++ b/include/jemalloc/internal/arena_types.h
@@ -19,4 +19,20 @@ typedef struct arena_bin_s arena_bin_t;
 typedef struct arena_s arena_t;
 typedef struct arena_tdata_s arena_tdata_t;
 
+typedef enum {
+	percpu_arena_disabled	= 0,
+	percpu_arena		= 1,
+	per_phycpu_arena	= 2,	/* i.e. hyper threads share arena. */
+
+	percpu_arena_mode_limit	= 3
+} percpu_arena_mode_t;
+
+#ifdef JEMALLOC_PERCPU_ARENA
+#define	PERCPU_ARENA_MODE_DEFAULT	percpu_arena
+#define	OPT_PERCPU_ARENA_DEFAULT	"percpu"
+#else
+#define	PERCPU_ARENA_MODE_DEFAULT	percpu_arena_disabled
+#define	OPT_PERCPU_ARENA_DEFAULT	"disabled"
+#endif
+
 #endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 8d2ec7d..97b41bb 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -145,6 +145,17 @@ static const bool have_thp =
     false
 #endif
     ;
+#ifdef JEMALLOC_HAVE_SCHED_GETCPU
+/* Currently percpu_arena depends on sched_getcpu. */
+#define JEMALLOC_PERCPU_ARENA
+#endif
+static const bool have_percpu_arena =
+#ifdef JEMALLOC_PERCPU_ARENA
+    true
+#else
+    false
+#endif
+    ;
 
 #if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN))
 #include <libkern/OSAtomic.h>
@@ -220,6 +231,9 @@ typedef unsigned pszind_t;
 /* Size class index type. */
 typedef unsigned szind_t;
 
+/* Processor / core id type. */
+typedef int malloc_cpuid_t;
+
 /*
  * Flags bits:
  *
@@ -455,7 +469,7 @@ extern unsigned narenas_auto;
  * Arenas that are used to service external requests.  Not all elements of the
  * arenas array are necessarily used; arenas are created lazily as needed.
  */
-extern arena_t **arenas;
+extern arena_t *arenas[];
 
 /*
  * pind2sz_tab encodes the same information as could be computed by
@@ -548,6 +562,10 @@ arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind,
     bool refresh_if_missing);
 arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing);
 ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind);
+malloc_cpuid_t malloc_getcpu(void);
+unsigned percpu_arena_choose(void);
+unsigned percpu_arena_ind_limit(void);
+
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
@@ -818,32 +836,53 @@ sa2u(size_t size, size_t alignment) {
 	return usize;
 }
 
-/* Choose an arena based on a per-thread value. */
-JEMALLOC_INLINE arena_t *
-arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
-	arena_t *ret;
-
-	if (arena != NULL) {
-		return arena;
-	}
+JEMALLOC_ALWAYS_INLINE malloc_cpuid_t
+malloc_getcpu(void) {
+	assert(have_percpu_arena);
+#if defined(JEMALLOC_HAVE_SCHED_GETCPU)
+	return (malloc_cpuid_t)sched_getcpu();
+#else
+	not_reached();
+	return -1;
+#endif
+}
 
-	ret = internal ? tsd_iarena_get(tsd) : tsd_arena_get(tsd);
-	if (unlikely(ret == NULL)) {
-		ret = arena_choose_hard(tsd, internal);
+/* Return the chosen arena index based on current cpu. */
+JEMALLOC_ALWAYS_INLINE unsigned
+percpu_arena_choose(void) {
+	unsigned arena_ind;
+	assert(have_percpu_arena && (percpu_arena_mode !=
+	    percpu_arena_disabled));
+
+	malloc_cpuid_t cpuid = malloc_getcpu();
+	assert(cpuid >= 0);
+	if ((percpu_arena_mode == percpu_arena) ||
+	    ((unsigned)cpuid < ncpus / 2)) {
+		arena_ind = cpuid;
+	} else {
+		assert(percpu_arena_mode == per_phycpu_arena);
+		/* Hyper threads on the same physical CPU share arena. */
+		arena_ind = cpuid - ncpus / 2;
 	}
-	return ret;
+	return arena_ind;
 }
 
-JEMALLOC_INLINE arena_t *
-arena_choose(tsd_t *tsd, arena_t *arena) {
-	return arena_choose_impl(tsd, arena, false);
+/* Return the limit of percpu auto arena range, i.e. arenas[0...ind_limit). */
+JEMALLOC_ALWAYS_INLINE unsigned
+percpu_arena_ind_limit(void) {
+	assert(have_percpu_arena && (percpu_arena_mode !=
+	    percpu_arena_disabled));
+	if (percpu_arena_mode == per_phycpu_arena && ncpus > 1) {
+		if (ncpus % 2) {
+			/* This likely means a misconfig. */
+			return ncpus / 2 + 1;
+		}
+		return ncpus / 2;
+	} else {
+		return ncpus;
+	}
 }
 
-JEMALLOC_INLINE arena_t *
-arena_ichoose(tsd_t *tsd, arena_t *arena) {
-	return arena_choose_impl(tsd, arena, true);
-}
+
 JEMALLOC_INLINE arena_tdata_t *
 arena_tdata_get(tsd_t *tsd, unsigned ind, bool refresh_if_missing) {
@@ -912,6 +951,50 @@ extent_t *iealloc(tsdn_t *tsdn, const void *ptr);
 #endif
 
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
+/* Choose an arena based on a per-thread value. */
+JEMALLOC_INLINE arena_t *
+arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
+	arena_t *ret;
+
+	if (arena != NULL) {
+		return arena;
+	}
+
+	ret = internal ? tsd_iarena_get(tsd) : tsd_arena_get(tsd);
+	if (unlikely(ret == NULL)) {
+		ret = arena_choose_hard(tsd, internal);
+	}
+
+	assert(ret != NULL);
+	/*
+	 * Note that for percpu arena, if the current arena is outside of the
+	 * auto percpu arena range, (i.e. thread is assigned to a manually
+	 * managed arena), then percpu arena is skipped.
+	 */
+	if (have_percpu_arena && (percpu_arena_mode !=
+	    percpu_arena_disabled) &&
+	    (arena_ind_get(ret) < percpu_arena_ind_limit()) &&
+	    (ret->last_thd != tsd_tsdn(tsd))) {
+		unsigned ind = percpu_arena_choose();
+		if (arena_ind_get(ret) != ind) {
+			percpu_arena_update(tsd, ind);
+			ret = tsd_arena_get(tsd);
+		}
+		ret->last_thd = tsd_tsdn(tsd);
+	}
+
+	return ret;
+}
+
+JEMALLOC_INLINE arena_t *
+arena_choose(tsd_t *tsd, arena_t *arena) {
+	return arena_choose_impl(tsd, arena, false);
+}
+
+JEMALLOC_INLINE arena_t *
+arena_ichoose(tsd_t *tsd, arena_t *arena) {
+	return arena_choose_impl(tsd, arena, true);
+}
+
 JEMALLOC_ALWAYS_INLINE extent_t *
 iealloc(tsdn_t *tsdn, const void *ptr) {
 	return extent_lookup(tsdn, ptr, true);
diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in
index b2e0077..500f427 100644
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@@ -308,6 +308,9 @@
 /* Adaptive mutex support in pthreads. */
 #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP
 
+/* GNU specific sched_getcpu support */
+#undef JEMALLOC_HAVE_SCHED_GETCPU
+
 /*
  * If defined, jemalloc symbols are not exported (doesn't work when
  * JEMALLOC_PREFIX is not defined).
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 64bea33..c0211e5 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -258,6 +258,7 @@ large_salloc
 lg_floor
 lg_prof_sample
 malloc_cprintf
+malloc_getcpu
 malloc_mutex_assert_not_owner
 malloc_mutex_assert_owner
 malloc_mutex_boot
@@ -330,6 +331,9 @@ pages_purge_forced
 pages_purge_lazy
 pages_trim
 pages_unmap
+percpu_arena_choose
+percpu_arena_ind_limit
+percpu_arena_update
 pind2sz
 pind2sz_compute
 pind2sz_lookup