From a881cd2c61c1ced56f87fcb9d7ef6e92b81e6c58 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sat, 2 Oct 2010 15:18:50 -0700 Subject: Make cumulative heap profile data optional. Add the R option to control whether cumulative heap profile data are maintained. Add the T option to control the size of per thread backtrace caches, primarily because when the R option is specified, backtraces that no longer have allocations associated with them are discarded as soon as no thread caches refer to them. --- jemalloc/doc/jemalloc.3.in | 55 ++++++ jemalloc/include/jemalloc/internal/mutex.h | 1 + jemalloc/include/jemalloc/internal/prof.h | 63 ++++--- jemalloc/src/ctl.c | 6 + jemalloc/src/jemalloc.c | 15 ++ jemalloc/src/mutex.c | 10 + jemalloc/src/prof.c | 288 ++++++++++++++++++----------- jemalloc/src/stats.c | 14 ++ 8 files changed, 328 insertions(+), 124 deletions(-) diff --git a/jemalloc/doc/jemalloc.3.in b/jemalloc/doc/jemalloc.3.in index 2ac093a..23e6ca0 100644 --- a/jemalloc/doc/jemalloc.3.in +++ b/jemalloc/doc/jemalloc.3.in @@ -484,6 +484,12 @@ will disable dirty page purging. @roff_prof@.Dq S @roff_prof@option for probabilistic sampling control. @roff_prof@See the +@roff_prof@.Dq R +@roff_prof@option for control of cumulative sample reporting. +@roff_prof@See the +@roff_prof@.Dq T +@roff_prof@option for control of per thread backtrace caching. +@roff_prof@See the @roff_prof@.Dq I @roff_prof@option for information on interval-triggered profile dumping, and the @roff_prof@.Dq U @@ -595,6 +601,18 @@ Double/halve the size of the maximum size class that is a multiple of the quantum (8 or 16 bytes, depending on architecture). Above this size, cacheline spacing is used for size classes. The default value is 128 bytes. +@roff_prof@.It R +@roff_prof@Enable/disable reporting of cumulative object/byte counts in profile +@roff_prof@dumps. +@roff_prof@If this option is enabled, every unique backtrace must be stored for +@roff_prof@the duration of execution. +@roff_prof@Depending on the application, this can impose a large memory +@roff_prof@overhead, and the cumulative counts are not always of interest. +@roff_prof@See the +@roff_prof@.Dq T +@roff_prof@option for control of per thread backtrace caching, which has +@roff_prof@important interactions. +@roff_prof@This option is enabled by default. @roff_prof@.It S @roff_prof@Double/halve the average interval between allocation samples, as @roff_prof@measured in bytes of allocation activity. @@ -602,6 +620,22 @@ The default value is 128 bytes. @roff_prof@also decreases the computational overhead. @roff_prof@The default sample interval is one (i.e. all allocations are @roff_prof@sampled). +@roff_prof@.It T +@roff_prof@Double/halve the maximum per thread backtrace cache used for heap +@roff_prof@profiling. +@roff_prof@A backtrace can only be discarded if the +@roff_prof@.Dq R +@roff_prof@option is disabled, and no thread caches currently refer to the +@roff_prof@backtrace. +@roff_prof@Therefore, a backtrace cache limit should be imposed if the +@roff_prof@intention is to limit how much memory is used by backtraces. +@roff_prof@By default, no limit is imposed. +@roff_prof@This is internally encoded as (1 << -1), and each +@roff_prof@.Dq T +@roff_prof@that is specified increments the shift amount. +@roff_prof@Therefore, e.g. +@roff_prof@.Ev JEMALLOC_OPTIONS=11T +@roff_prof@specifies a backtrace cache limit of 1024 backtraces. @roff_prof@.It U @roff_prof@Trigger a memory profile dump every time the total virtual memory @roff_prof@exceeds the previous maximum. 
@@ -992,6 +1026,27 @@ option. @roff_prof@option. @roff_prof@.Ed .\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.prof_accum (bool) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@See the +@roff_prof@.Dq R +@roff_prof@option. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_tcmax (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@See the +@roff_prof@.Dq T +@roff_prof@option. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- +@roff_prof@.It Sy "opt.lg_prof_sample (ssize_t) r-" +@roff_prof@.Bd -ragged -offset indent -compact +@roff_prof@See the +@roff_prof@.Dq S +@roff_prof@option. +@roff_prof@.Ed +.\"----------------------------------------------------------------------------- @roff_prof@.It Sy "opt.lg_prof_interval (ssize_t) r-" @roff_prof@.Bd -ragged -offset indent -compact @roff_prof@See the diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h index 8113415..dcca01e 100644 --- a/jemalloc/include/jemalloc/internal/mutex.h +++ b/jemalloc/include/jemalloc/internal/mutex.h @@ -24,6 +24,7 @@ extern bool isthreaded; #endif bool malloc_mutex_init(malloc_mutex_t *mutex); +void malloc_mutex_destroy(malloc_mutex_t *mutex); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 2c195f3..a8f67bb 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -6,12 +6,13 @@ typedef struct prof_bt_s prof_bt_t; typedef struct prof_cnt_s prof_cnt_t; typedef struct prof_thr_cnt_s prof_thr_cnt_t; typedef struct prof_ctx_s prof_ctx_t; -typedef struct prof_s prof_t; +typedef struct prof_tcache_s prof_tcache_t; /* Option defaults. */ #define LG_PROF_BT_MAX_DEFAULT 2 #define LG_PROF_SAMPLE_DEFAULT 0 #define LG_PROF_INTERVAL_DEFAULT -1 +#define LG_PROF_TCMAX_DEFAULT -1 /* * Hard limit on stack backtrace depth. Note that the version of @@ -41,9 +42,9 @@ struct prof_bt_s { #ifdef JEMALLOC_PROF_LIBGCC /* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */ typedef struct { - prof_bt_t *bt; - unsigned nignore; - unsigned max; + prof_bt_t *bt; + unsigned nignore; + unsigned max; } prof_unwind_data_t; #endif @@ -51,11 +52,11 @@ struct prof_cnt_s { /* * Profiling counters. An allocation/deallocation pair can operate on * different prof_thr_cnt_t objects that are linked into the same - * prof_ctx_t sets_ql, so it is possible for the cur* counters to go + * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go * negative. In principle it is possible for the *bytes counters to - * overflow/underflow, but a general solution would require some form - * of 128-bit counter solution; this implementation doesn't bother to - * solve that problem. + * overflow/underflow, but a general solution would require something + * like 128-bit counters; this implementation doesn't bother to solve + * that problem. */ int64_t curobjs; int64_t curbytes; @@ -64,15 +65,18 @@ struct prof_cnt_s { }; struct prof_thr_cnt_s { - /* Linkage into prof_ctx_t's sets_ql. */ - ql_elm(prof_thr_cnt_t) link; + /* Linkage into prof_ctx_t's cnts_ql. */ + ql_elm(prof_thr_cnt_t) cnts_link; + + /* Linkage into thread's LRU. 
*/ + ql_elm(prof_thr_cnt_t) lru_link; /* * Associated context. If a thread frees an object that it did not * allocate, it is possible that the context is not cached in the * thread's hash table, in which case it must be able to look up the * context, insert a new prof_thr_cnt_t into the thread's hash table, - * and link it into the prof_ctx_t's sets_ql. + * and link it into the prof_ctx_t's cnts_ql. */ prof_ctx_t *ctx; @@ -101,11 +105,11 @@ struct prof_ctx_s { /* Associated backtrace. */ prof_bt_t *bt; - /* Protects cnt_merged and sets_ql. */ + /* Protects cnt_merged and cnts_ql. */ malloc_mutex_t lock; - /* Temporary storage for aggregation during dump. */ - prof_cnt_t cnt_dump; + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; /* When threads exit, they merge their stats into cnt_merged. */ prof_cnt_t cnt_merged; @@ -117,6 +121,24 @@ struct prof_ctx_s { ql_head(prof_thr_cnt_t) cnts_ql; }; +/* + * Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread + * keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t + * objects. Other threads may read the prof_thr_cnt_t contents, but no others + * will ever write them. + * + * Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data + * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t + * objects. + */ +struct prof_tcache_s { + /* (prof_bt_t *)-->(prof_thr_cnt_t *). */ + ckh_t bt2cnt; + + /* LRU for contents of bt2cnt. */ + ql_head(prof_thr_cnt_t) lru_ql; +}; + #endif /* JEMALLOC_H_STRUCTS */ /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS @@ -129,11 +151,13 @@ extern bool opt_prof; * to notice state changes. */ extern bool opt_prof_active; -extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ -extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ +extern size_t opt_lg_prof_bt_max; /* Maximum backtrace depth. */ +extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ -extern bool opt_prof_udump; /* High-water memory dumping. */ -extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_udump; /* High-water memory dumping. */ +extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_accum; /* Report cumulative bytes. */ +extern ssize_t opt_lg_prof_tcmax; /* lg(max per thread bactrace cache) */ /* * Profile dump interval, measured in bytes allocated. 
Each arena triggers a @@ -150,9 +174,6 @@ extern uint64_t prof_interval; */ extern bool prof_promote; -bool prof_init(prof_t *prof, bool master); -void prof_destroy(prof_t *prof); - prof_thr_cnt_t *prof_alloc_prep(size_t size); prof_ctx_t *prof_ctx_get(const void *ptr); void prof_malloc(const void *ptr, prof_thr_cnt_t *cnt); diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index e904fdf..7642259 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -82,6 +82,8 @@ CTL_PROTO(opt_lg_prof_sample) CTL_PROTO(opt_lg_prof_interval) CTL_PROTO(opt_prof_udump) CTL_PROTO(opt_prof_leak) +CTL_PROTO(opt_prof_accum) +CTL_PROTO(opt_lg_prof_tcmax) #endif CTL_PROTO(opt_stats_print) CTL_PROTO(opt_lg_qspace_max) @@ -260,6 +262,8 @@ static const ctl_node_t opt_node[] = { {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, {NAME("prof_udump"), CTL(opt_prof_udump)}, {NAME("prof_leak"), CTL(opt_prof_leak)}, + {NAME("prof_accum"), CTL(opt_prof_accum)}, + {NAME("lg_prof_tcmax"), CTL(opt_lg_prof_tcmax)}, #endif {NAME("stats_print"), CTL(opt_stats_print)}, {NAME("lg_qspace_max"), CTL(opt_lg_qspace_max)}, @@ -1207,6 +1211,8 @@ CTL_RO_GEN(opt_lg_prof_sample, opt_lg_prof_sample, size_t) CTL_RO_GEN(opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) CTL_RO_GEN(opt_prof_udump, opt_prof_udump, bool) CTL_RO_GEN(opt_prof_leak, opt_prof_leak, bool) +CTL_RO_GEN(opt_prof_accum, opt_prof_accum, bool) +CTL_RO_GEN(opt_lg_prof_tcmax, opt_lg_prof_tcmax, ssize_t) #endif CTL_RO_GEN(opt_stats_print, opt_stats_print, bool) CTL_RO_GEN(opt_lg_qspace_max, opt_lg_qspace_max, size_t) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 98c4485..6f9ec76 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -512,6 +512,12 @@ MALLOC_OUT: opt_lg_qspace_max++; break; #ifdef JEMALLOC_PROF + case 'r': + opt_prof_accum = false; + break; + case 'R': + opt_prof_accum = true; + break; case 's': if (opt_lg_prof_sample > 0) opt_lg_prof_sample--; @@ -521,6 +527,15 @@ MALLOC_OUT: (sizeof(uint64_t) << 3)) opt_lg_prof_sample++; break; + case 't': + if (opt_lg_prof_tcmax >= 0) + opt_lg_prof_tcmax--; + break; + case 'T': + if (opt_lg_prof_tcmax + 1 < + (sizeof(size_t) << 3)) + opt_lg_prof_tcmax++; + break; case 'u': opt_prof_udump = false; break; diff --git a/jemalloc/src/mutex.c b/jemalloc/src/mutex.c index 337312b..3ecb18a 100644 --- a/jemalloc/src/mutex.c +++ b/jemalloc/src/mutex.c @@ -72,3 +72,13 @@ malloc_mutex_init(malloc_mutex_t *mutex) return (false); } + +void +malloc_mutex_destroy(malloc_mutex_t *mutex) +{ + + if (pthread_mutex_destroy(mutex) != 0) { + malloc_write(": Error in pthread_mutex_destroy()\n"); + abort(); + } +} diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 8d13451..7ffda23 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -24,47 +24,41 @@ size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; bool opt_prof_udump = false; bool opt_prof_leak = false; +bool opt_prof_accum = true; +ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT; uint64_t prof_interval; bool prof_promote; /* * Global hash of (prof_bt_t *)-->(prof_ctx_t *). This is the master data - * structure that knows about all backtraces ever captured. + * structure that knows about all backtraces currently captured. */ static ckh_t bt2ctx; static malloc_mutex_t bt2ctx_mtx; -/* - * Thread-specific hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread - * keeps a cache of backtraces, with associated thread-specific prof_thr_cnt_t - * objects. 
Other threads may read the prof_thr_cnt_t contents, but no others - * will ever write them. - * - * Upon thread exit, the thread must merge all the prof_thr_cnt_t counter data - * into the associated prof_ctx_t objects, and unlink/free the prof_thr_cnt_t - * objects. - */ +/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */ #ifndef NO_TLS -static __thread ckh_t *bt2cnt_tls JEMALLOC_ATTR(tls_model("initial-exec")); -# define BT2CNT_GET() bt2cnt_tls -# define BT2CNT_SET(v) do { \ - bt2cnt_tls = (v); \ - pthread_setspecific(bt2cnt_tsd, (void *)(v)); \ +static __thread prof_tcache_t *prof_tcache_tls + JEMALLOC_ATTR(tls_model("initial-exec")); +# define PROF_TCACHE_GET() prof_tcache_tls +# define PROF_TCACHE_SET(v) do { \ + prof_tcache_tls = (v); \ + pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ } while (0) #else -# define BT2CNT_GET() ((ckh_t *)pthread_getspecific(bt2cnt_tsd)) -# define BT2CNT_SET(v) do { \ - pthread_setspecific(bt2cnt_tsd, (void *)(v)); \ +# define PROF_TCACHE_GET() ((ckh_t *)pthread_getspecific(prof_tcache_tsd)) +# define PROF_TCACHE_SET(v) do { \ + pthread_setspecific(prof_tcache_tsd, (void *)(v)); \ } while (0) #endif /* * Same contents as b2cnt_tls, but initialized such that the TSD destructor is - * called when a thread exits, so that bt2cnt_tls contents can be merged, + * called when a thread exits, so that prof_tcache_tls contents can be merged, * unlinked, and deallocated. */ -static pthread_key_t bt2cnt_tsd; +static pthread_key_t prof_tcache_tsd; /* (1U << opt_lg_prof_bt_max). */ static unsigned prof_bt_max; @@ -137,6 +131,7 @@ static bool enq_udump; static prof_bt_t *bt_dup(prof_bt_t *bt); static void bt_init(prof_bt_t *bt, void **vec); +static void bt_destroy(prof_bt_t *bt); #ifdef JEMALLOC_PROF_LIBGCC static _Unwind_Reason_Code prof_unwind_init_callback( struct _Unwind_Context *context, void *arg); @@ -148,8 +143,10 @@ static prof_thr_cnt_t *prof_lookup(prof_bt_t *bt); static void prof_ctx_set(const void *ptr, prof_ctx_t *ctx); static bool prof_flush(bool propagate_err); static bool prof_write(const char *s, bool propagate_err); -static void prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, +static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx); +static void prof_ctx_destroy(prof_ctx_t *ctx); +static void prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt); static bool prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err); static bool prof_dump_maps(bool propagate_err); @@ -160,7 +157,7 @@ static void prof_fdump(void); static void prof_bt_hash(const void *key, unsigned minbits, size_t *hash1, size_t *hash2); static bool prof_bt_keycomp(const void *k1, const void *k2); -static void bt2cnt_thread_cleanup(void *arg); +static void prof_tcache_cleanup(void *arg); #ifdef NO_TLS static void prof_sample_state_thread_cleanup(void *arg); #endif @@ -175,6 +172,13 @@ bt_init(prof_bt_t *bt, void **vec) bt->len = 0; } +static void +bt_destroy(prof_bt_t *bt) +{ + + idalloc(bt); +} + static prof_bt_t * bt_dup(prof_bt_t *bt) { @@ -487,23 +491,25 @@ prof_lookup(prof_bt_t *bt) prof_thr_cnt_t *p; void *v; } ret; - ckh_t *bt2cnt = BT2CNT_GET(); + prof_tcache_t *prof_tcache = PROF_TCACHE_GET(); - if (bt2cnt == NULL) { + if (prof_tcache == NULL) { /* Initialize an empty cache for this thread. 
*/ - bt2cnt = (ckh_t *)imalloc(sizeof(ckh_t)); - if (bt2cnt == NULL) + prof_tcache = (prof_tcache_t *)imalloc(sizeof(prof_tcache_t)); + if (prof_tcache == NULL) return (NULL); - if (ckh_new(bt2cnt, PROF_CKH_MINITEMS, prof_bt_hash, - prof_bt_keycomp)) { - idalloc(bt2cnt); + + if (ckh_new(&prof_tcache->bt2cnt, PROF_CKH_MINITEMS, + prof_bt_hash, prof_bt_keycomp)) { + idalloc(prof_tcache); return (NULL); } + ql_new(&prof_tcache->lru_ql); - BT2CNT_SET(bt2cnt); + PROF_TCACHE_SET(prof_tcache); } - if (ckh_search(bt2cnt, bt, NULL, &ret.v)) { + if (ckh_search(&prof_tcache->bt2cnt, bt, NULL, &ret.v)) { union { prof_bt_t *p; void *v; @@ -519,7 +525,6 @@ prof_lookup(prof_bt_t *bt) */ prof_enter(); if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { - /* bt has never been seen before. Insert it. */ ctx.v = imalloc(sizeof(prof_ctx_t)); if (ctx.v == NULL) { @@ -544,28 +549,60 @@ prof_lookup(prof_bt_t *bt) if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { /* OOM. */ prof_leave(); + malloc_mutex_destroy(&ctx.p->lock); idalloc(btkey.v); idalloc(ctx.v); return (NULL); } } + /* + * Acquire ctx's lock before releasing bt2ctx_mtx, in order to + * avoid a race condition with prof_ctx_destroy(). + */ + malloc_mutex_lock(&ctx.p->lock); prof_leave(); /* Link a prof_thd_cnt_t into ctx for this thread. */ - ret.v = imalloc(sizeof(prof_thr_cnt_t)); - if (ret.p == NULL) - return (NULL); - ql_elm_new(ret.p, link); + if (opt_lg_prof_tcmax >= 0 && ckh_count(&prof_tcache->bt2cnt) + == (ZU(1) << opt_lg_prof_tcmax)) { + assert(ckh_count(&prof_tcache->bt2cnt) > 0); + /* + * Flush the least least recently used cnt in order to + * keep bt2cnt from becoming too large. + */ + ret.p = ql_last(&prof_tcache->lru_ql, lru_link); + assert(ret.v != NULL); + ckh_remove(&prof_tcache->bt2cnt, ret.p->ctx->bt, NULL, + NULL); + ql_remove(&prof_tcache->lru_ql, ret.p, lru_link); + prof_ctx_merge(ret.p->ctx, ret.p); + /* ret can now be re-used. */ + } else { + assert(opt_lg_prof_tcmax < 0 || + ckh_count(&prof_tcache->bt2cnt) < (ZU(1) << + opt_lg_prof_tcmax)); + /* Allocate and partially initialize a new cnt. */ + ret.v = imalloc(sizeof(prof_thr_cnt_t)); + if (ret.p == NULL) + return (NULL); + ql_elm_new(ret.p, cnts_link); + ql_elm_new(ret.p, lru_link); + } + /* Finish initializing ret. */ ret.p->ctx = ctx.p; ret.p->epoch = 0; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(bt2cnt, btkey.v, ret.v)) { + if (ckh_insert(&prof_tcache->bt2cnt, btkey.v, ret.v)) { idalloc(ret.v); return (NULL); } - malloc_mutex_lock(&ctx.p->lock); - ql_tail_insert(&ctx.p->cnts_ql, ret.p, link); + ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link); + ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link); malloc_mutex_unlock(&ctx.p->lock); + } else { + /* Move ret to the front of the LRU. 
*/ + ql_remove(&prof_tcache->lru_ql, ret.p, lru_link); + ql_head_insert(&prof_tcache->lru_ql, ret.p, lru_link); } return (ret.p); @@ -729,8 +766,10 @@ prof_malloc(const void *ptr, prof_thr_cnt_t *cnt) /*********/ cnt->cnts.curobjs++; cnt->cnts.curbytes += size; - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } /*********/ mb_write(); /*********/ @@ -796,8 +835,10 @@ prof_realloc(const void *ptr, prof_thr_cnt_t *cnt, const void *old_ptr, if ((uintptr_t)cnt > (uintptr_t)1U) { cnt->cnts.curobjs++; cnt->cnts.curbytes += size; - cnt->cnts.accumobjs++; - cnt->cnts.accumbytes += size; + if (opt_prof_accum) { + cnt->cnts.accumobjs++; + cnt->cnts.accumbytes += size; + } } /*********/ mb_write(); @@ -896,15 +937,15 @@ prof_write(const char *s, bool propagate_err) } static void -prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) +prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) { prof_thr_cnt_t *thr_cnt; prof_cnt_t tcnt; malloc_mutex_lock(&ctx->lock); - memcpy(&ctx->cnt_dump, &ctx->cnt_merged, sizeof(prof_cnt_t)); - ql_foreach(thr_cnt, &ctx->cnts_ql, link) { + memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t)); + ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) { volatile unsigned *epoch = &thr_cnt->epoch; while (true) { @@ -921,22 +962,77 @@ prof_ctx_merge(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) break; } - ctx->cnt_dump.curobjs += tcnt.curobjs; - ctx->cnt_dump.curbytes += tcnt.curbytes; - ctx->cnt_dump.accumobjs += tcnt.accumobjs; - ctx->cnt_dump.accumbytes += tcnt.accumbytes; + ctx->cnt_summed.curobjs += tcnt.curobjs; + ctx->cnt_summed.curbytes += tcnt.curbytes; + if (opt_prof_accum) { + ctx->cnt_summed.accumobjs += tcnt.accumobjs; + ctx->cnt_summed.accumbytes += tcnt.accumbytes; + } if (tcnt.curobjs != 0) (*leak_nctx)++; } - /* Merge into cnt_all. */ - cnt_all->curobjs += ctx->cnt_dump.curobjs; - cnt_all->curbytes += ctx->cnt_dump.curbytes; - cnt_all->accumobjs += ctx->cnt_dump.accumobjs; - cnt_all->accumbytes += ctx->cnt_dump.accumbytes; + /* Add to cnt_all. */ + cnt_all->curobjs += ctx->cnt_summed.curobjs; + cnt_all->curbytes += ctx->cnt_summed.curbytes; + if (opt_prof_accum) { + cnt_all->accumobjs += ctx->cnt_summed.accumobjs; + cnt_all->accumbytes += ctx->cnt_summed.accumbytes; + } + + malloc_mutex_unlock(&ctx->lock); +} + +static void +prof_ctx_destroy(prof_ctx_t *ctx) +{ + + /* + * Check that ctx is still unused by any thread cache before destroying + * it. prof_lookup() interlocks bt2ctx_mtx and ctx->lock in order to + * avoid a race condition with this function. + */ + prof_enter(); + malloc_mutex_lock(&ctx->lock); + if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0) { + assert(ctx->cnt_merged.curbytes == 0); + assert(ctx->cnt_merged.accumobjs == 0); + assert(ctx->cnt_merged.accumbytes == 0); + /* Remove ctx from bt2ctx. */ + ckh_remove(&bt2ctx, ctx->bt, NULL, NULL); + prof_leave(); + /* Destroy ctx. */ + malloc_mutex_unlock(&ctx->lock); + bt_destroy(ctx->bt); + malloc_mutex_destroy(&ctx->lock); + idalloc(ctx); + } else { + malloc_mutex_unlock(&ctx->lock); + prof_leave(); + } +} + +static void +prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) +{ + bool destroy; + /* Merge cnt stats and detach from ctx. 
*/ + malloc_mutex_lock(&ctx->lock); + ctx->cnt_merged.curobjs += cnt->cnts.curobjs; + ctx->cnt_merged.curbytes += cnt->cnts.curbytes; + ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; + ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; + ql_remove(&ctx->cnts_ql, cnt, cnts_link); + if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL && + ctx->cnt_merged.curobjs == 0) + destroy = true; + else + destroy = false; malloc_mutex_unlock(&ctx->lock); + if (destroy) + prof_ctx_destroy(ctx); } static bool @@ -945,15 +1041,22 @@ prof_dump_ctx(prof_ctx_t *ctx, prof_bt_t *bt, bool propagate_err) char buf[UMAX2S_BUFSIZE]; unsigned i; - if (prof_write(umax2s(ctx->cnt_dump.curobjs, 10, buf), propagate_err) + if (opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) { + assert(ctx->cnt_summed.curbytes == 0); + assert(ctx->cnt_summed.accumobjs == 0); + assert(ctx->cnt_summed.accumbytes == 0); + return (false); + } + + if (prof_write(umax2s(ctx->cnt_summed.curobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.curbytes, 10, buf), + || prof_write(umax2s(ctx->cnt_summed.curbytes, 10, buf), propagate_err) || prof_write(" [", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.accumobjs, 10, buf), + || prof_write(umax2s(ctx->cnt_summed.accumobjs, 10, buf), propagate_err) || prof_write(": ", propagate_err) - || prof_write(umax2s(ctx->cnt_dump.accumbytes, 10, buf), + || prof_write(umax2s(ctx->cnt_summed.accumbytes, 10, buf), propagate_err) || prof_write("] @", propagate_err)) return (true); @@ -1060,7 +1163,7 @@ prof_dump(const char *filename, bool leakcheck, bool propagate_err) leak_nctx = 0; for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) { - prof_ctx_merge(ctx.p, &cnt_all, &leak_nctx); + prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx); } /* Dump profile header. */ @@ -1319,54 +1422,33 @@ prof_bt_keycomp(const void *k1, const void *k2) } static void -bt2cnt_thread_cleanup(void *arg) +prof_tcache_cleanup(void *arg) { - ckh_t *bt2cnt; + prof_tcache_t *prof_tcache; - bt2cnt = BT2CNT_GET(); - if (bt2cnt != NULL) { - ql_head(prof_thr_cnt_t) cnts_ql; - size_t tabind; - union { - prof_thr_cnt_t *p; - void *v; - } cnt; - - /* Iteratively merge cnt's into the global stats. */ - ql_new(&cnts_ql); - tabind = 0; - while (ckh_iter(bt2cnt, &tabind, NULL, &cnt.v) == - false) { - prof_ctx_t *ctx = cnt.p->ctx; - /* Merge stats and detach from ctx. */ - malloc_mutex_lock(&ctx->lock); - ctx->cnt_merged.curobjs += cnt.p->cnts.curobjs; - ctx->cnt_merged.curbytes += cnt.p->cnts.curbytes; - ctx->cnt_merged.accumobjs += cnt.p->cnts.accumobjs; - ctx->cnt_merged.accumbytes += cnt.p->cnts.accumbytes; - ql_remove(&ctx->cnts_ql, cnt.p, link); - malloc_mutex_unlock(&ctx->lock); + prof_tcache = PROF_TCACHE_GET(); + if (prof_tcache != NULL) { + prof_thr_cnt_t *cnt; - /* - * Stash cnt for deletion after finishing with - * ckh_iter(). - */ - ql_tail_insert(&cnts_ql, cnt.p, link); - } + /* + * Delete the hash table. All of its contents can still be + * iterated over via the LRU. + */ + ckh_delete(&prof_tcache->bt2cnt); /* - * Delete the hash table now that cnts_ql has a list of all - * cnt's. + * Iteratively merge cnt's into the global stats and delete + * them. */ - ckh_delete(bt2cnt); - idalloc(bt2cnt); - BT2CNT_SET(NULL); - - /* Delete cnt's. 
*/ - while ((cnt.p = ql_last(&cnts_ql, link)) != NULL) { - ql_remove(&cnts_ql, cnt.p, link); - idalloc(cnt.v); + while ((cnt = ql_last(&prof_tcache->lru_ql, lru_link)) != + NULL) { + prof_ctx_merge(cnt->ctx, cnt); + ql_remove(&prof_tcache->lru_ql, cnt, lru_link); + idalloc(cnt); } + + idalloc(prof_tcache); + PROF_TCACHE_SET(NULL); } } @@ -1419,7 +1501,7 @@ prof_boot1(void) return (true); if (malloc_mutex_init(&bt2ctx_mtx)) return (true); - if (pthread_key_create(&bt2cnt_tsd, bt2cnt_thread_cleanup) + if (pthread_key_create(&prof_tcache_tsd, prof_tcache_cleanup) != 0) { malloc_write( ": Error in pthread_key_create()\n"); diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 9dc7529..9b3271b 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -469,6 +469,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, if ((err = JEMALLOC_P(mallctl)("opt.stats_print", &bv, &bsz, NULL, 0)) == 0) write_cb(cbopaque, bv ? "P" : "p"); + if ((err = JEMALLOC_P(mallctl)("opt.prof_accum", &bv, &bsz, + NULL, 0)) == 0) + write_cb(cbopaque, bv ? "R" : "r"); if ((err = JEMALLOC_P(mallctl)("opt.prof_udump", &bv, &bsz, NULL, 0)) == 0) write_cb(cbopaque, bv ? "U" : "u"); @@ -580,6 +583,17 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, write_cb(cbopaque, umax2s((1U << sv), 10, s)); write_cb(cbopaque, "\n"); + CTL_GET("opt.lg_prof_tcmax", &ssv, ssize_t); + write_cb(cbopaque, + "Maximum per thread backtrace cache: "); + if (ssv >= 0) { + write_cb(cbopaque, umax2s((1U << ssv), 10, s)); + write_cb(cbopaque, " (2^"); + write_cb(cbopaque, umax2s(ssv, 10, s)); + write_cb(cbopaque, ")\n"); + } else + write_cb(cbopaque, "N/A\n"); + CTL_GET("opt.lg_prof_sample", &sv, size_t); write_cb(cbopaque, "Average profile sample interval: "); write_cb(cbopaque, umax2s((1U << sv), 10, s)); -- cgit v0.12
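
Note for reviewers (not part of the patch): below is a minimal, standalone sketch of how the new r/R and t/T option letters drive opt_prof_accum and opt_lg_prof_tcmax, and of the arithmetic behind the man page's JEMALLOC_OPTIONS=11T example (eleven increments from the default of -1 give a shift of 10, i.e. a limit of 1024 backtraces). The r/R/t/T cases mirror the switch cases added to jemalloc.c above; the parse_prof_opts() helper and the decimal repeat prefix (so that "11T" applies 'T' eleven times) are assumptions made for illustration only and are not shown in the hunks.

/*
 * Standalone sketch, not part of the patch.  parse_prof_opts() is a
 * hypothetical helper: the r/R/t/T cases mirror the switch added to
 * jemalloc.c above, while the decimal repeat prefix is assumed from the
 * man page's 11T example.
 */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

#define LG_PROF_TCMAX_DEFAULT -1

static bool opt_prof_accum = true;	/* Toggled by 'r'/'R'. */
static ssize_t opt_lg_prof_tcmax = LG_PROF_TCMAX_DEFAULT; /* Adjusted by 't'/'T'. */

static void
parse_prof_opts(const char *opts)
{

	for (const char *p = opts; *p != '\0'; p++) {
		/* An optional decimal prefix repeats the following flag. */
		unsigned long nreps = 1;
		if (isdigit((unsigned char)*p)) {
			nreps = 0;
			while (isdigit((unsigned char)*p)) {
				nreps = nreps * 10 + (unsigned long)(*p - '0');
				p++;
			}
			if (*p == '\0')
				break;
		}
		for (unsigned long i = 0; i < nreps; i++) {
			switch (*p) {
			case 'r': opt_prof_accum = false; break;
			case 'R': opt_prof_accum = true; break;
			case 't':
				if (opt_lg_prof_tcmax >= 0)
					opt_lg_prof_tcmax--;
				break;
			case 'T':
				if ((size_t)(opt_lg_prof_tcmax + 1) <
				    (sizeof(size_t) << 3))
					opt_lg_prof_tcmax++;
				break;
			default:
				break;	/* Other flags ignored in this sketch. */
			}
		}
	}
}

int
main(void)
{

	/* "r11T": disable cumulative counts, then apply 'T' eleven times. */
	parse_prof_opts("r11T");
	printf("opt_prof_accum = %s\n", opt_prof_accum ? "true" : "false");
	if (opt_lg_prof_tcmax >= 0) {
		/* Same limit computation prof_lookup() uses for its eviction check. */
		printf("backtrace cache limit = %zu\n",
		    (size_t)1 << opt_lg_prof_tcmax);
	} else
		printf("backtrace cache limit = unlimited\n");
	return (0);
}

With "r11T" the sketch prints opt_prof_accum = false and a cache limit of 1024 backtraces, matching the 11T example in the updated man page text.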