diff options
Diffstat (limited to 'include/jemalloc/internal/prof.h')
| -rw-r--r-- | include/jemalloc/internal/prof.h | 177 |
1 files changed, 104 insertions, 73 deletions
diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index e9064ba..c3e3f9e 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -1,4 +1,3 @@ -#ifdef JEMALLOC_PROF /******************************************************************************/ #ifdef JEMALLOC_H_TYPES @@ -10,28 +9,41 @@ typedef struct prof_tdata_s prof_tdata_t; /* Option defaults. */ #define PROF_PREFIX_DEFAULT "jeprof" -#define LG_PROF_BT_MAX_DEFAULT 7 -#define LG_PROF_SAMPLE_DEFAULT 0 +#define LG_PROF_SAMPLE_DEFAULT 19 #define LG_PROF_INTERVAL_DEFAULT -1 -#define LG_PROF_TCMAX_DEFAULT -1 /* - * Hard limit on stack backtrace depth. Note that the version of - * prof_backtrace() that is based on __builtin_return_address() necessarily has - * a hard-coded number of backtrace frame handlers. + * Hard limit on stack backtrace depth. The version of prof_backtrace() that + * is based on __builtin_return_address() necessarily has a hard-coded number + * of backtrace frame handlers, and should be kept in sync with this setting. */ -#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND)) -# define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1) -#else -# define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */ -#endif -#define PROF_BT_MAX (1U << LG_PROF_BT_MAX) +#define PROF_BT_MAX 128 + +/* Maximum number of backtraces to store in each per thread LRU cache. */ +#define PROF_TCMAX 1024 /* Initial hash table size. */ -#define PROF_CKH_MINITEMS 64 +#define PROF_CKH_MINITEMS 64 /* Size of memory buffer to use when writing dump files. */ -#define PROF_DUMP_BUF_SIZE 65536 +#define PROF_DUMP_BUFSIZE 65536 + +/* Size of stack-allocated buffer used by prof_printf(). */ +#define PROF_PRINTF_BUFSIZE 128 + +/* + * Number of mutexes shared among all ctx's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. + */ +#define PROF_NCTX_LOCKS 1024 + +/* + * prof_tdata pointers close to NULL are used to encode state information that + * is used for cleaning up during thread shutdown. + */ +#define PROF_TDATA_STATE_REINCARNATED ((prof_tdata_t *)(uintptr_t)1) +#define PROF_TDATA_STATE_PURGATORY ((prof_tdata_t *)(uintptr_t)2) +#define PROF_TDATA_STATE_MAX PROF_TDATA_STATE_PURGATORY #endif /* JEMALLOC_H_TYPES */ /******************************************************************************/ @@ -109,8 +121,18 @@ struct prof_ctx_s { /* Associated backtrace. */ prof_bt_t *bt; - /* Protects cnt_merged and cnts_ql. */ - malloc_mutex_t lock; + /* Protects nlimbo, cnt_merged, and cnts_ql. */ + malloc_mutex_t *lock; + + /* + * Number of threads that currently cause this ctx to be in a state of + * limbo due to one of: + * - Initializing per thread counters associated with this ctx. + * - Preparing to destroy this ctx. + * nlimbo must be 1 (single destroyer) in order to safely destroy the + * ctx. + */ + unsigned nlimbo; /* Temporary storage for summation during dump. */ prof_cnt_t cnt_summed; @@ -145,9 +167,14 @@ struct prof_tdata_s { void **vec; /* Sampling state. */ - uint64_t prn_state; + uint64_t prng_state; uint64_t threshold; uint64_t accum; + + /* State used to avoid dumping while operating on prof internals. */ + bool enq; + bool enq_idump; + bool enq_gdump; }; #endif /* JEMALLOC_H_STRUCTS */ @@ -162,13 +189,12 @@ extern bool opt_prof; * to notice state changes. */ extern bool opt_prof_active; -extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ extern bool opt_prof_gdump; /* High-water memory dumping. */ +extern bool opt_prof_final; /* Final profile dumping. */ extern bool opt_prof_leak; /* Dump leak summary at exit. */ extern bool opt_prof_accum; /* Report cumulative bytes. */ -extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread bactrace cache) */ extern char opt_prof_prefix[PATH_MAX + 1]; /* @@ -186,39 +212,14 @@ extern uint64_t prof_interval; */ extern bool prof_promote; -/* (1U << opt_lg_prof_bt_max). */ -extern unsigned prof_bt_max; - -/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ -#ifndef NO_TLS -extern __thread prof_tdata_t *prof_tdata_tls - JEMALLOC_ATTR(tls_model("initial-exec")); -# define PROF_TCACHE_GET() prof_tdata_tls -# define PROF_TCACHE_SET(v) do { \ - prof_tdata_tls = (v); \ - pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ -} while (0) -#else -# define PROF_TCACHE_GET() \ - ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd)) -# define PROF_TCACHE_SET(v) do { \ - pthread_setspecific(prof_tdata_tsd, (void *)(v)); \ -} while (0) -#endif -/* - * Same contents as b2cnt_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that prof_tdata_tls contents can be merged, - * unlinked, and deallocated. - */ -extern pthread_key_t prof_tdata_tsd; - void bt_init(prof_bt_t *bt, void **vec); -void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max); +void prof_backtrace(prof_bt_t *bt, unsigned nignore); prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); void prof_idump(void); bool prof_mdump(const char *filename); void prof_gdump(void); prof_tdata_t *prof_tdata_init(void); +void prof_tdata_cleanup(void *arg); void prof_boot0(void); void prof_boot1(void); bool prof_boot2(void); @@ -233,13 +234,13 @@ bool prof_boot2(void); \ assert(size == s2u(size)); \ \ - prof_tdata = PROF_TCACHE_GET(); \ - if (prof_tdata == NULL) { \ - prof_tdata = prof_tdata_init(); \ - if (prof_tdata == NULL) { \ + prof_tdata = prof_tdata_get(); \ + if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { \ + if (prof_tdata != NULL) \ + ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ + else \ ret = NULL; \ - break; \ - } \ + break; \ } \ \ if (opt_prof_active == false) { \ @@ -249,13 +250,13 @@ bool prof_boot2(void); /* Don't bother with sampling logic, since sampling */\ /* interval is 1. */\ bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt, nignore, prof_bt_max); \ + prof_backtrace(&bt, nignore); \ ret = prof_lookup(&bt); \ } else { \ if (prof_tdata->threshold == 0) { \ /* Initialize. Seed the prng differently for */\ /* each thread. */\ - prof_tdata->prn_state = \ + prof_tdata->prng_state = \ (uint64_t)(uintptr_t)&size; \ prof_sample_threshold_update(prof_tdata); \ } \ @@ -272,7 +273,7 @@ bool prof_boot2(void); if (size >= prof_tdata->threshold - \ prof_tdata->accum) { \ bt_init(&bt, prof_tdata->vec); \ - prof_backtrace(&bt, nignore, prof_bt_max); \ + prof_backtrace(&bt, nignore); \ ret = prof_lookup(&bt); \ } else \ ret = (prof_thr_cnt_t *)(uintptr_t)1U; \ @@ -280,6 +281,9 @@ bool prof_boot2(void); } while (0) #ifndef JEMALLOC_ENABLE_INLINE +malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *) + +prof_tdata_t *prof_tdata_get(void); void prof_sample_threshold_update(prof_tdata_t *prof_tdata); prof_ctx_t *prof_ctx_get(const void *ptr); void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); @@ -291,12 +295,35 @@ void prof_free(const void *ptr, size_t size); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_)) +/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ +malloc_tsd_externs(prof_tdata, prof_tdata_t *) +malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL, + prof_tdata_cleanup) + +JEMALLOC_INLINE prof_tdata_t * +prof_tdata_get(void) +{ + prof_tdata_t *prof_tdata; + + cassert(config_prof); + + prof_tdata = *prof_tdata_tsd_get(); + if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { + if (prof_tdata == NULL) + prof_tdata = prof_tdata_init(); + } + + return (prof_tdata); +} + JEMALLOC_INLINE void prof_sample_threshold_update(prof_tdata_t *prof_tdata) { uint64_t r; double u; + cassert(config_prof); + /* * Compute sample threshold as a geometrically distributed random * variable with mean (2^opt_lg_prof_sample). @@ -315,8 +342,8 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) * pp 500 * (http://cg.scs.carleton.ca/~luc/rnbookindex.html) */ - prn64(r, 53, prof_tdata->prn_state, - (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); + prng64(r, 53, prof_tdata->prng_state, + UINT64_C(6364136223846793005), UINT64_C(1442695040888963407)); u = (double)r * (1.0/9007199254740992.0L); prof_tdata->threshold = (uint64_t)(log(u) / log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) @@ -329,13 +356,12 @@ prof_ctx_get(const void *ptr) prof_ctx_t *ret; arena_chunk_t *chunk; + cassert(config_prof); assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - dassert(chunk->arena->magic == ARENA_MAGIC); - ret = arena_prof_ctx_get(ptr); } else ret = huge_prof_ctx_get(ptr); @@ -348,13 +374,12 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) { arena_chunk_t *chunk; + cassert(config_prof); assert(ptr != NULL); chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - dassert(chunk->arena->magic == ARENA_MAGIC); - arena_prof_ctx_set(ptr, ctx); } else huge_prof_ctx_set(ptr, ctx); @@ -365,11 +390,13 @@ prof_sample_accum_update(size_t size) { prof_tdata_t *prof_tdata; + cassert(config_prof); /* Sampling logic is unnecessary if the interval is 1. */ assert(opt_lg_prof_sample != 0); - prof_tdata = PROF_TCACHE_GET(); - assert(prof_tdata != NULL); + prof_tdata = *prof_tdata_tsd_get(); + if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + return (true); /* Take care to avoid integer overflow. */ if (size >= prof_tdata->threshold - prof_tdata->accum) { @@ -391,8 +418,9 @@ JEMALLOC_INLINE void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt) { + cassert(config_prof); assert(ptr != NULL); - assert(size == isalloc(ptr)); + assert(size == isalloc(ptr, true)); if (opt_lg_prof_sample != 0) { if (prof_sample_accum_update(size)) { @@ -437,10 +465,11 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, { prof_thr_cnt_t *told_cnt; + cassert(config_prof); assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U); if (ptr != NULL) { - assert(size == isalloc(ptr)); + assert(size == isalloc(ptr, true)); if (opt_lg_prof_sample != 0) { if (prof_sample_accum_update(size)) { /* @@ -463,10 +492,10 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt, * It's too late to propagate OOM for this realloc(), * so operate directly on old_cnt->ctx->cnt_merged. */ - malloc_mutex_lock(&old_ctx->lock); + malloc_mutex_lock(old_ctx->lock); old_ctx->cnt_merged.curobjs--; old_ctx->cnt_merged.curbytes -= old_size; - malloc_mutex_unlock(&old_ctx->lock); + malloc_mutex_unlock(old_ctx->lock); told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U; } } else @@ -510,9 +539,12 @@ prof_free(const void *ptr, size_t size) { prof_ctx_t *ctx = prof_ctx_get(ptr); + cassert(config_prof); + if ((uintptr_t)ctx > (uintptr_t)1) { - assert(size == isalloc(ptr)); - prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt); + prof_thr_cnt_t *tcnt; + assert(size == isalloc(ptr, true)); + tcnt = prof_lookup(ctx->bt); if (tcnt != NULL) { tcnt->epoch++; @@ -533,10 +565,10 @@ prof_free(const void *ptr, size_t size) * OOM during free() cannot be propagated, so operate * directly on cnt->ctx->cnt_merged. */ - malloc_mutex_lock(&ctx->lock); + malloc_mutex_lock(ctx->lock); ctx->cnt_merged.curobjs--; ctx->cnt_merged.curbytes -= size; - malloc_mutex_unlock(&ctx->lock); + malloc_mutex_unlock(ctx->lock); } } } @@ -544,4 +576,3 @@ prof_free(const void *ptr, size_t size) #endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ -#endif /* JEMALLOC_PROF */ |
