From fa5d245aef7087c19c375590a7ee2966a0ae339a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 10:25:59 -0700 Subject: Set default symbol visibility to hidden. Compile with -fvisibility=hidden rather than -fvisibility=internal, in order to avoid PLT lookups for internal functions. Also fix a regression that caused the -fvisibility flag to be omitted, due to: Port to Mac OS X. 2dbecf1f6267fae7a161b9c39cfd4d04ce168a29 --- jemalloc/configure.ac | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index 46a2bd4..f10641b 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -157,17 +157,6 @@ case "${host_cpu}" in esac AC_DEFINE_UNQUOTED([CPU_SPINWAIT], [$CPU_SPINWAIT]) -JE_COMPILABLE([__attribute__ syntax], - [static __attribute__((unused)) void foo(void){}], - [], - [attribute]) -if test "x${attribute}" = "xyes" ; then - AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ]) - if test "x$GCC" = "xyes" -a "${abi}" = "xelf"; then - JE_CFLAGS_APPEND([-fvisibility=internal]) - fi -fi - dnl Platform-specific settings. abi and RPATH can probably be determined dnl programmatically, but doing so is error-prone, which makes it generally dnl not worth the trouble. @@ -227,6 +216,17 @@ esac AC_SUBST([abi]) AC_SUBST([RPATH]) +JE_COMPILABLE([__attribute__ syntax], + [static __attribute__((unused)) void foo(void){}], + [], + [attribute]) +if test "x${attribute}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ]) + if test "x${GCC}" = "xyes" -a "x${abi}" = "xelf"; then + JE_CFLAGS_APPEND([-fvisibility=hidden]) + fi +fi + JE_COMPILABLE([mremap(...MREMAP_FIXED...)], [ #define _GNU_SOURCE #include -- cgit v0.12 From ff7450727f64180367f430b1b747f9e682e26df4 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 14 Mar 2011 22:22:29 -0700 Subject: Expand a comment regarding geometric sampling. --- jemalloc/include/jemalloc/internal/prof.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index 7864000..db63465 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -247,8 +247,22 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata) double u; /* - * Compute prof_sample_threshold as a geometrically distributed random + * Compute sample threshold as a geometrically distributed random * variable with mean (2^opt_lg_prof_sample). + * + * __ __ + * | log(u) | 1 + * prof_tdata->threshold = | -------- |, where p = ------------------- + * | log(1-p) | opt_lg_prof_sample + * 2 + * + * For more information on the math, see: + * + * Non-Uniform Random Variate Generation + * Luc Devroye + * Springer-Verlag, New York, 1986 + * pp 500 + * (http://cg.scs.carleton.ca/~luc/rnbookindex.html) */ prn64(r, 53, prof_tdata->prn_state, (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU); @@ -374,7 +388,7 @@ prof_sample_accum_update(size_t size) /* Take care to avoid integer overflow. */ if (size >= prof_tdata->threshold - prof_tdata->accum) { prof_tdata->accum -= (prof_tdata->threshold - size); - /* Compute new prof_sample_threshold. */ + /* Compute new sample threshold. 
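+			 * Drawing a fresh geometric deviate after every
+			 * sample keeps the distances between samples
+			 * independent; see the expanded derivation in
+			 * prof_sample_threshold_update() above.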
*/ prof_sample_threshold_update(prof_tdata); while (prof_tdata->accum >= prof_tdata->threshold) { prof_tdata->accum -= prof_tdata->threshold; -- cgit v0.12 From 41ade967c29ea9312c0b7390ee43bc0c63373f39 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Sun, 6 Mar 2011 22:56:36 -0800 Subject: Reduce size of small_size2bin lookup table. Convert all direct small_size2bin[...] accesses to SMALL_SIZE2BIN(...) macro calls, and use a couple of cheap math operations to allow compacting the table by 4X or 8X, on 32- and 64-bit systems, respectively. --- jemalloc/include/jemalloc/internal/arena.h | 7 ++ .../jemalloc/internal/jemalloc_internal.h.in | 4 +- jemalloc/include/jemalloc/internal/tcache.h | 2 +- jemalloc/src/arena.c | 80 ++++++++++++---------- 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index a43d1fa..78828ef 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -19,6 +19,7 @@ #ifdef JEMALLOC_TINY /* Smallest size class to support. */ # define LG_TINY_MIN LG_SIZEOF_PTR +# define TINY_MIN (1U << LG_TINY_MIN) #endif /* @@ -389,7 +390,13 @@ struct arena_s { extern size_t opt_lg_qspace_max; extern size_t opt_lg_cspace_max; extern ssize_t opt_lg_dirty_mult; +/* + * small_size2bin is a compact lookup table that rounds request sizes up to + * size classes. In order to reduce cache footprint, the table is compressed, + * and all accesses are via the SMALL_SIZE2BIN macro. + */ extern uint8_t const *small_size2bin; +#define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN]) /* Various bin-related settings. */ #ifdef JEMALLOC_TINY /* Number of (2^n)-spaced tiny bins. */ diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index aab2bfb..0f58a7a 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -402,7 +402,7 @@ s2u(size_t size) { if (size <= small_maxclass) - return (arenas[0]->bins[small_size2bin[size]].reg_size); + return (arenas[0]->bins[SMALL_SIZE2BIN(size)].reg_size); if (size <= arena_maxclass) return (PAGE_CEILING(size)); return (CHUNK_CEILING(size)); @@ -448,7 +448,7 @@ sa2u(size_t size, size_t alignment, size_t *run_size_p) if (usize <= arena_maxclass && alignment <= PAGE_SIZE) { if (usize <= small_maxclass) { return - (arenas[0]->bins[small_size2bin[usize]].reg_size); + (arenas[0]->bins[SMALL_SIZE2BIN(usize)].reg_size); } return (PAGE_CEILING(usize)); } else { diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index f431c66..7b71172 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -223,7 +223,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) size_t binind; tcache_bin_t *tbin; - binind = small_size2bin[size]; + binind = SMALL_SIZE2BIN(size); assert(binind < nbins); tbin = &tcache->tbins[binind]; ret = tcache_alloc_easy(tbin); diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 3cf15ff..2811fd1 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -25,26 +25,27 @@ size_t mspace_mask; /* * const_small_size2bin is a static constant lookup table that in the common - * case can be used as-is for small_size2bin. For dynamically linked programs, - * this avoids a page of memory overhead per process. 
+ * case can be used as-is for small_size2bin. */ -#define S2B_1(i) i, -#define S2B_2(i) S2B_1(i) S2B_1(i) -#define S2B_4(i) S2B_2(i) S2B_2(i) +#if (LG_TINY_MIN == 2) +#define S2B_4(i) i, #define S2B_8(i) S2B_4(i) S2B_4(i) +#elif (LG_TINY_MIN == 3) +#define S2B_8(i) i, +#else +# error "Unsupported LG_TINY_MIN" +#endif #define S2B_16(i) S2B_8(i) S2B_8(i) #define S2B_32(i) S2B_16(i) S2B_16(i) #define S2B_64(i) S2B_32(i) S2B_32(i) #define S2B_128(i) S2B_64(i) S2B_64(i) #define S2B_256(i) S2B_128(i) S2B_128(i) /* - * The number of elements in const_small_size2bin is dependent on page size - * and on the definition for SUBPAGE. If SUBPAGE changes, the '- 255' must also - * change, along with the addition/removal of static lookup table element - * definitions. + * The number of elements in const_small_size2bin is dependent on the + * definition for SUBPAGE. */ -static const uint8_t const_small_size2bin[STATIC_PAGE_SIZE - 255] = { - S2B_1(0xffU) /* 0 */ +static JEMALLOC_ATTR(aligned(CACHELINE)) + const uint8_t const_small_size2bin[] = { #if (LG_QUANTUM == 4) /* 16-byte quantum **********************/ # ifdef JEMALLOC_TINY @@ -1475,7 +1476,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) arena_run_t *run; size_t binind; - binind = small_size2bin[size]; + binind = SMALL_SIZE2BIN(size); assert(binind < nbins); bin = &arena->bins[binind]; size = bin->reg_size; @@ -1713,7 +1714,7 @@ arena_prof_promoted(const void *ptr, size_t size) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; - binind = small_size2bin[size]; + binind = SMALL_SIZE2BIN(size); assert(binind < nbins); chunk->map[pageind-map_bias].bits = (chunk->map[pageind-map_bias].bits & ~CHUNK_MAP_CLASS_MASK) | ((binind+1) << CHUNK_MAP_CLASS_SHIFT); @@ -2166,11 +2167,11 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, */ if (oldsize <= arena_maxclass) { if (oldsize <= small_maxclass) { - assert(choose_arena()->bins[small_size2bin[ - oldsize]].reg_size == oldsize); + assert(choose_arena()->bins[SMALL_SIZE2BIN( + oldsize)].reg_size == oldsize); if ((size + extra <= small_maxclass && - small_size2bin[size + extra] == - small_size2bin[oldsize]) || (size <= oldsize && + SMALL_SIZE2BIN(size + extra) == + SMALL_SIZE2BIN(oldsize)) || (size <= oldsize && size + extra >= oldsize)) { #ifdef JEMALLOC_FILL if (opt_junk && size < oldsize) { @@ -2371,40 +2372,39 @@ small_size2bin_validate(void) { size_t i, size, binind; - assert(small_size2bin[0] == 0xffU); i = 1; # ifdef JEMALLOC_TINY /* Tiny. */ for (; i < (1U << LG_TINY_MIN); i++) { size = pow2_ceil(1U << LG_TINY_MIN); binind = ffs((int)(size >> (LG_TINY_MIN + 1))); - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } for (; i < qspace_min; i++) { size = pow2_ceil(i); binind = ffs((int)(size >> (LG_TINY_MIN + 1))); - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } # endif /* Quantum-spaced. */ for (; i <= qspace_max; i++) { size = QUANTUM_CEILING(i); binind = ntbins + (size >> LG_QUANTUM) - 1; - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } /* Cacheline-spaced. */ for (; i <= cspace_max; i++) { size = CACHELINE_CEILING(i); binind = ntbins + nqbins + ((size - cspace_min) >> LG_CACHELINE); - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } /* Sub-page. 
*/ for (; i <= sspace_max; i++) { size = SUBPAGE_CEILING(i); binind = ntbins + nqbins + ncbins + ((size - sspace_min) >> LG_SUBPAGE); - assert(small_size2bin[i] == binind); + assert(SMALL_SIZE2BIN(i) == binind); } } #endif @@ -2415,12 +2415,12 @@ small_size2bin_init(void) if (opt_lg_qspace_max != LG_QSPACE_MAX_DEFAULT || opt_lg_cspace_max != LG_CSPACE_MAX_DEFAULT - || sizeof(const_small_size2bin) != small_maxclass + 1) + || (sizeof(const_small_size2bin) != ((small_maxclass-1) >> + LG_TINY_MIN) + 1)) return (small_size2bin_init_hard()); small_size2bin = const_small_size2bin; #ifdef JEMALLOC_DEBUG - assert(sizeof(const_small_size2bin) == small_maxclass + 1); small_size2bin_validate(); #endif return (false); @@ -2431,49 +2431,52 @@ small_size2bin_init_hard(void) { size_t i, size, binind; uint8_t *custom_small_size2bin; +#define CUSTOM_SMALL_SIZE2BIN(s) \ + custom_small_size2bin[(s-1) >> LG_TINY_MIN] assert(opt_lg_qspace_max != LG_QSPACE_MAX_DEFAULT || opt_lg_cspace_max != LG_CSPACE_MAX_DEFAULT - || sizeof(const_small_size2bin) != small_maxclass + 1); + || (sizeof(const_small_size2bin) != ((small_maxclass-1) >> + LG_TINY_MIN) + 1)); - custom_small_size2bin = (uint8_t *)base_alloc(small_maxclass + 1); + custom_small_size2bin = (uint8_t *) + base_alloc(small_maxclass >> LG_TINY_MIN); if (custom_small_size2bin == NULL) return (true); - custom_small_size2bin[0] = 0xffU; i = 1; #ifdef JEMALLOC_TINY /* Tiny. */ - for (; i < (1U << LG_TINY_MIN); i++) { + for (; i < (1U << LG_TINY_MIN); i += TINY_MIN) { size = pow2_ceil(1U << LG_TINY_MIN); binind = ffs((int)(size >> (LG_TINY_MIN + 1))); - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } - for (; i < qspace_min; i++) { + for (; i < qspace_min; i += TINY_MIN) { size = pow2_ceil(i); binind = ffs((int)(size >> (LG_TINY_MIN + 1))); - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } #endif /* Quantum-spaced. */ - for (; i <= qspace_max; i++) { + for (; i <= qspace_max; i += TINY_MIN) { size = QUANTUM_CEILING(i); binind = ntbins + (size >> LG_QUANTUM) - 1; - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } /* Cacheline-spaced. */ - for (; i <= cspace_max; i++) { + for (; i <= cspace_max; i += TINY_MIN) { size = CACHELINE_CEILING(i); binind = ntbins + nqbins + ((size - cspace_min) >> LG_CACHELINE); - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } /* Sub-page. */ - for (; i <= sspace_max; i++) { + for (; i <= sspace_max; i += TINY_MIN) { size = SUBPAGE_CEILING(i); binind = ntbins + nqbins + ncbins + ((size - sspace_min) >> LG_SUBPAGE); - custom_small_size2bin[i] = binind; + CUSTOM_SMALL_SIZE2BIN(i) = binind; } small_size2bin = custom_small_size2bin; @@ -2481,6 +2484,7 @@ small_size2bin_init_hard(void) small_size2bin_validate(); #endif return (false); +#undef CUSTOM_SMALL_SIZE2BIN } bool -- cgit v0.12 From 1b17768e249cf910d242be5b53a6f2dea18eeb2c Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 09:40:07 -0700 Subject: Fix a build dependency regression. Fix the automatic header dependency generation to handle the .pic.o suffix. This regression was due to: Build both PIC and no PIC static libraries af5d6987f829ccd6e14dd1f57586cfb072a533c7 --- jemalloc/Makefile.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index 6dfaf5b..7a13f21 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -94,6 +94,7 @@ doc: $(DOCS) # Include generated dependency files. 
# -include $(CSRCS:@srcroot@%.c=@objroot@%.d) +-include $(CSRCS:@srcroot@%.c=@objroot@%.pic.d) @objroot@src/%.o: @srcroot@src/%.c @mkdir -p $(@D) @@ -103,7 +104,7 @@ doc: $(DOCS) @objroot@src/%.pic.o: @srcroot@src/%.c @mkdir -p $(@D) $(CC) $(CFLAGS) -fPIC -DPIC -c $(CPPFLAGS) -o $@ $< - @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" + @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) $< | sed \"s/\($(subst /,\/,$(notdir $(basename $(basename $@))))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.pic.o \2/g\" > $(@:%.o=%.d)" %.$(SO) : %.$(SO).$(REV) @mkdir -p $(@D) -- cgit v0.12 From 49f7e8f35ac63d0dd526cf68791dc0ca29538ac9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 13:59:15 -0700 Subject: Create arena_bin_info_t. Move read-only fields from arena_bin_t into arena_bin_info_t, primarily in order to avoid false cacheline sharing. --- jemalloc/include/jemalloc/internal/arena.h | 95 +++-- .../jemalloc/internal/jemalloc_internal.h.in | 8 +- jemalloc/include/jemalloc/internal/tcache.h | 6 +- jemalloc/src/arena.c | 426 ++++++++++++--------- jemalloc/src/ctl.c | 6 +- jemalloc/src/tcache.c | 6 +- 6 files changed, 324 insertions(+), 223 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 78828ef..467ec65 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -71,6 +71,7 @@ typedef struct arena_chunk_map_s arena_chunk_map_t; typedef struct arena_chunk_s arena_chunk_t; typedef struct arena_run_s arena_run_t; +typedef struct arena_bin_info_s arena_bin_info_t; typedef struct arena_bin_s arena_bin_t; typedef struct arena_s arena_t; @@ -218,6 +219,33 @@ struct arena_run_s { unsigned nfree; }; +/* + * Read-only information associated with each element for arena_t's bins array + * is stored separately, partly to reduce memory usage (only one copy, rather + * than one per arena), but mainly to avoid false cacheline sharing. + */ +struct arena_bin_info_s { + /* Size of regions in a run for this bin's size class. */ + size_t reg_size; + + /* Total size of a run for this bin's size class. */ + size_t run_size; + + /* Total number of regions in a run for this bin's size class. */ + uint32_t nregs; + +#ifdef JEMALLOC_PROF + /* + * Offset of first (prof_ctx_t *) in a run header for this bin's size + * class, or 0 if (opt_prof == false). + */ + uint32_t ctx0_offset; +#endif + + /* Offset of first region in a run for this bin's size class. */ + uint32_t reg0_offset; +}; + struct arena_bin_s { /* * All operations on runcur, runs, and stats require that lock be @@ -242,26 +270,6 @@ struct arena_bin_s { */ arena_run_tree_t runs; - /* Size of regions in a run for this bin's size class. */ - size_t reg_size; - - /* Total size of a run for this bin's size class. */ - size_t run_size; - - /* Total number of regions in a run for this bin's size class. */ - uint32_t nregs; - -#ifdef JEMALLOC_PROF - /* - * Offset of first (prof_ctx_t *) in a run header for this bin's size - * class, or 0 if (opt_prof == false). - */ - uint32_t ctx0_offset; -#endif - - /* Offset of first region in a run for this bin's size class. */ - uint32_t reg0_offset; - #ifdef JEMALLOC_STATS /* Bin statistics. 
*/ malloc_bin_stats_t stats; @@ -398,6 +406,8 @@ extern ssize_t opt_lg_dirty_mult; extern uint8_t const *small_size2bin; #define SMALL_SIZE2BIN(s) (small_size2bin[(s-1) >> LG_TINY_MIN]) +extern arena_bin_info_t *arena_bin_info; + /* Various bin-related settings. */ #ifdef JEMALLOC_TINY /* Number of (2^n)-spaced tiny bins. */ # define ntbins ((unsigned)(LG_QUANTUM - LG_TINY_MIN)) @@ -463,7 +473,8 @@ bool arena_boot(void); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -unsigned arena_run_regind(arena_run_t *run, arena_bin_t *bin, +size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); +unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr, size_t size); # ifdef JEMALLOC_PROF prof_ctx_t *arena_prof_ctx_get(const void *ptr); @@ -473,8 +484,16 @@ void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ARENA_C_)) +JEMALLOC_INLINE size_t +arena_bin_index(arena_t *arena, arena_bin_t *bin) +{ + size_t binind = bin - arena->bins; + assert(binind < nbins); + return (binind); +} + JEMALLOC_INLINE unsigned -arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, +arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr, size_t size) { unsigned shift, diff, regind; @@ -485,7 +504,8 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, * Avoid doing division with a variable divisor if possible. Using * actual division here can reduce allocator throughput by over 20%! */ - diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset); + diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - + bin_info->reg0_offset); /* Rescale (factor powers of 2 out of the numerator and denominator). 
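	 * Factoring the common power of two out of diff and size leaves an
	 * odd divisor, so the division below can usually be replaced by
	 * multiplication with a precomputed SIZE_INV() inverse.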
*/ shift = ffs(size) - 1; @@ -531,7 +551,7 @@ arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr, #undef SIZE_INV_SHIFT } assert(diff == regind * size); - assert(regind < bin->nregs); + assert(regind < bin_info->nregs); return (regind); } @@ -558,13 +578,15 @@ arena_prof_ctx_get(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - arena_bin_t *bin = run->bin; + size_t binind = arena_bin_index(chunk->arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; unsigned regind; assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin, ptr, bin->reg_size); + regind = arena_run_regind(run, bin_info, ptr, + bin_info->reg_size); ret = *(prof_ctx_t **)((uintptr_t)run + - bin->ctx0_offset + (regind * + bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t *))); } } else @@ -593,11 +615,15 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) PAGE_SHIFT)); arena_bin_t *bin = run->bin; unsigned regind; + size_t binind; + arena_bin_info_t *bin_info; assert(run->magic == ARENA_RUN_MAGIC); regind = arena_run_regind(run, bin, ptr, bin->reg_size); + binind = arena_bin_index(chunk->arena, bin); + bin_info = &arena_bin_info[binind]; - *((prof_ctx_t **)((uintptr_t)run + bin->ctx0_offset + *((prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t *)))) = ctx; } else assert((uintptr_t)ctx == (uintptr_t)1U); @@ -637,10 +663,17 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); - assert(((uintptr_t)ptr - ((uintptr_t)run + - (uintptr_t)run->bin->reg0_offset)) % - run->bin->reg_size == 0); bin = run->bin; +#ifndef NDEBUG + { + size_t binind = arena_bin_index(arena, bin); + arena_bin_info_t *bin_info = + &arena_bin_info[binind]; + assert(((uintptr_t)ptr - ((uintptr_t)run + + (uintptr_t)bin_info->reg0_offset)) % + bin_info->reg_size == 0); + } +#endif malloc_mutex_lock(&bin->lock); arena_dalloc_bin(arena, chunk, ptr, mapelm); malloc_mutex_unlock(&bin->lock); diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 0f58a7a..34b2a23 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -402,7 +402,7 @@ s2u(size_t size) { if (size <= small_maxclass) - return (arenas[0]->bins[SMALL_SIZE2BIN(size)].reg_size); + return (arena_bin_info[SMALL_SIZE2BIN(size)].reg_size); if (size <= arena_maxclass) return (PAGE_CEILING(size)); return (CHUNK_CEILING(size)); @@ -446,10 +446,8 @@ sa2u(size_t size, size_t alignment, size_t *run_size_p) } if (usize <= arena_maxclass && alignment <= PAGE_SIZE) { - if (usize <= small_maxclass) { - return - (arenas[0]->bins[SMALL_SIZE2BIN(usize)].reg_size); - } + if (usize <= small_maxclass) + return (arena_bin_info[SMALL_SIZE2BIN(usize)].reg_size); return (PAGE_CEILING(usize)); } else { size_t run_size; diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index 7b71172..ab02545 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -232,7 +232,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) if (ret == NULL) return (NULL); } - assert(arena_salloc(ret) == tcache->arena->bins[binind].reg_size); + assert(arena_salloc(ret) == 
arena_bin_info[binind].reg_size); if (zero == false) { #ifdef JEMALLOC_FILL @@ -248,7 +248,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero) tbin->tstats.nrequests++; #endif #ifdef JEMALLOC_PROF - tcache->prof_accumbytes += tcache->arena->bins[binind].reg_size; + tcache->prof_accumbytes += arena_bin_info[binind].reg_size; #endif tcache_event(tcache); return (ret); @@ -331,7 +331,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) #ifdef JEMALLOC_FILL if (opt_junk) - memset(ptr, 0x5a, bin->reg_size); + memset(ptr, 0x5a, arena_bin_info[binind].reg_size); #endif tbin = &tcache->tbins[binind]; diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 2811fd1..e49b8ed 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -8,6 +8,7 @@ size_t opt_lg_qspace_max = LG_QSPACE_MAX_DEFAULT; size_t opt_lg_cspace_max = LG_CSPACE_MAX_DEFAULT; ssize_t opt_lg_dirty_mult = LG_DIRTY_MULT_DEFAULT; uint8_t const *small_size2bin; +arena_bin_info_t *arena_bin_info; /* Various bin-related settings. */ unsigned nqbins; @@ -174,7 +175,6 @@ static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize, bool dirty); static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin); static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); -static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); static void arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, @@ -192,6 +192,9 @@ static bool small_size2bin_init(void); static void small_size2bin_validate(void); #endif static bool small_size2bin_init_hard(void); +static size_t bin_info_run_size_calc(arena_bin_info_t *bin_info, + size_t min_run_size); +static bool bin_info_init(void); /******************************************************************************/ @@ -247,7 +250,7 @@ rb_gen(static JEMALLOC_ATTR(unused), arena_avail_tree_, arena_avail_tree_t, arena_chunk_map_t, u.rb_link, arena_avail_comp) static inline void * -arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) +arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { void *ret; @@ -261,16 +264,16 @@ arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) assert(ret != NULL); /* Write-after free can cause assertion failure. */ assert((uintptr_t)ret >= (uintptr_t)run + - (uintptr_t)bin->reg0_offset); + (uintptr_t)bin_info->reg0_offset); assert((uintptr_t)ret < (uintptr_t)run->next); assert(((uintptr_t)ret - ((uintptr_t)run + - (uintptr_t)bin->reg0_offset)) % (uintptr_t)bin->reg_size == - 0); + (uintptr_t)bin_info->reg0_offset)) % + (uintptr_t)bin_info->reg_size == 0); run->avail = *(void **)ret; return (ret); } ret = run->next; - run->next = (void *)((uintptr_t)ret + (uintptr_t)bin->reg_size); + run->next = (void *)((uintptr_t)ret + (uintptr_t)bin_info->reg_size); assert(ret != NULL); return (ret); } @@ -279,22 +282,27 @@ static inline void arena_run_reg_dalloc(arena_run_t *run, void *ptr) { - assert(run->nfree < run->bin->nregs); +#ifndef NDEBUG + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + size_t binind = arena_bin_index(chunk->arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; + assert(run->nfree < bin_info->nregs); /* Freeing an interior pointer can cause assertion failure. 
*/ assert(((uintptr_t)ptr - ((uintptr_t)run + - (uintptr_t)run->bin->reg0_offset)) % (uintptr_t)run->bin->reg_size + (uintptr_t)bin_info->reg0_offset)) % (uintptr_t)bin_info->reg_size == 0); /* * Freeing a pointer lower than region zero can cause assertion * failure. */ assert((uintptr_t)ptr >= (uintptr_t)run + - (uintptr_t)run->bin->reg0_offset); + (uintptr_t)bin_info->reg0_offset); /* * Freeing a pointer past in the run's frontier can cause assertion * failure. */ assert((uintptr_t)ptr < (uintptr_t)run->next); +#endif *(void **)ptr = run->avail; run->avail = ptr; @@ -765,7 +773,11 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) assert((mapelm->bits >> PAGE_SHIFT) == 0); assert(run->magic == ARENA_RUN_MAGIC); - pageind += run->bin->run_size >> PAGE_SHIFT; + size_t binind = arena_bin_index(arena, + run->bin); + arena_bin_info_t *bin_info = + &arena_bin_info[binind]; + pageind += bin_info->run_size >> PAGE_SHIFT; } } } @@ -947,8 +959,11 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) CHUNK_MAP_LARGE) != 0); assert((chunk->map[run_ind+(size>>PAGE_SHIFT)-1-map_bias].bits & CHUNK_MAP_ALLOCATED) != 0); - } else - size = run->bin->run_size; + } else { + size_t binind = arena_bin_index(arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; + size = bin_info->run_size; + } run_pages = (size >> PAGE_SHIFT); arena->nactive -= run_pages; @@ -1175,6 +1190,8 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) { arena_chunk_map_t *mapelm; arena_run_t *run; + size_t binind; + arena_bin_info_t *bin_info; /* Look for a usable run. */ mapelm = arena_run_tree_first(&bin->runs); @@ -1198,18 +1215,21 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) } /* No existing runs have any space available. */ + binind = arena_bin_index(arena, bin); + bin_info = &arena_bin_info[binind]; + /* Allocate a new run. */ malloc_mutex_unlock(&bin->lock); /******************************/ malloc_mutex_lock(&arena->lock); - run = arena_run_alloc(arena, bin->run_size, false, false); + run = arena_run_alloc(arena, bin_info->run_size, false, false); if (run != NULL) { /* Initialize run internals. */ run->bin = bin; run->avail = NULL; run->next = (void *)((uintptr_t)run + - (uintptr_t)bin->reg0_offset); - run->nfree = bin->nregs; + (uintptr_t)bin_info->reg0_offset); + run->nfree = bin_info->nregs; #ifdef JEMALLOC_DEBUG run->magic = ARENA_RUN_MAGIC; #endif @@ -1260,18 +1280,23 @@ static void * arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) { void *ret; + size_t binind; + arena_bin_info_t *bin_info; arena_run_t *run; + binind = arena_bin_index(arena, bin); + bin_info = &arena_bin_info[binind]; bin->runcur = NULL; run = arena_bin_nonfull_run_get(arena, bin); if (bin->runcur != NULL && bin->runcur->nfree > 0) { + /* * Another thread updated runcur while this one ran without the * bin lock in arena_bin_nonfull_run_get(). */ assert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); - ret = arena_run_reg_alloc(bin->runcur, bin); + ret = arena_run_reg_alloc(bin->runcur, bin_info); if (run != NULL) { arena_chunk_t *chunk; @@ -1284,7 +1309,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) * from the run. 
*/ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - if (run->nfree == bin->nregs) + if (run->nfree == bin_info->nregs) arena_dalloc_bin_run(arena, chunk, run, bin); else arena_bin_lower_run(arena, chunk, run, bin); @@ -1300,7 +1325,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) assert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); - return (arena_run_reg_alloc(bin->runcur, bin)); + return (arena_run_reg_alloc(bin->runcur, bin_info)); } #ifdef JEMALLOC_PROF @@ -1342,7 +1367,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind malloc_mutex_lock(&bin->lock); for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) - ptr = arena_run_reg_alloc(run, bin); + ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) @@ -1351,7 +1376,8 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind tbin->avail = ptr; } #ifdef JEMALLOC_STATS - bin->stats.allocated += (i - tbin->ncached) * bin->reg_size; + bin->stats.allocated += (i - tbin->ncached) * + arena_bin_info[binind].reg_size; bin->stats.nmalloc += i; bin->stats.nrequests += tbin->tstats.nrequests; bin->stats.nfills++; @@ -1362,112 +1388,6 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind } #endif -/* - * Calculate bin->run_size such that it meets the following constraints: - * - * *) bin->run_size >= min_run_size - * *) bin->run_size <= arena_maxclass - * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). - * - * bin->nregs and bin->reg0_offset are also calculated here, since these - * settings are all interdependent. - */ -static size_t -arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size) -{ - size_t try_run_size, good_run_size; - uint32_t try_nregs, good_nregs; - uint32_t try_hdr_size, good_hdr_size; -#ifdef JEMALLOC_PROF - uint32_t try_ctx0_offset, good_ctx0_offset; -#endif - uint32_t try_reg0_offset, good_reg0_offset; - - assert(min_run_size >= PAGE_SIZE); - assert(min_run_size <= arena_maxclass); - - /* - * Calculate known-valid settings before entering the run_size - * expansion loop, so that the first part of the loop always copies - * valid settings. - * - * The do..while loop iteratively reduces the number of regions until - * the run header and the regions no longer overlap. A closed formula - * would be quite messy, since there is an interdependency between the - * header's mask length and the number of regions. - */ - try_run_size = min_run_size; - try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin->reg_size) - + 1; /* Counter-act try_nregs-- in loop. */ - do { - try_nregs--; - try_hdr_size = sizeof(arena_run_t); -#ifdef JEMALLOC_PROF - if (opt_prof && prof_promote == false) { - /* Pad to a quantum boundary. */ - try_hdr_size = QUANTUM_CEILING(try_hdr_size); - try_ctx0_offset = try_hdr_size; - /* Add space for one (prof_ctx_t *) per region. */ - try_hdr_size += try_nregs * sizeof(prof_ctx_t *); - } else - try_ctx0_offset = 0; -#endif - try_reg0_offset = try_run_size - (try_nregs * bin->reg_size); - } while (try_hdr_size > try_reg0_offset); - - /* run_size expansion loop. */ - do { - /* - * Copy valid settings before trying more aggressive settings. 
- */ - good_run_size = try_run_size; - good_nregs = try_nregs; - good_hdr_size = try_hdr_size; -#ifdef JEMALLOC_PROF - good_ctx0_offset = try_ctx0_offset; -#endif - good_reg0_offset = try_reg0_offset; - - /* Try more aggressive settings. */ - try_run_size += PAGE_SIZE; - try_nregs = ((try_run_size - sizeof(arena_run_t)) / - bin->reg_size) + 1; /* Counter-act try_nregs-- in loop. */ - do { - try_nregs--; - try_hdr_size = sizeof(arena_run_t); -#ifdef JEMALLOC_PROF - if (opt_prof && prof_promote == false) { - /* Pad to a quantum boundary. */ - try_hdr_size = QUANTUM_CEILING(try_hdr_size); - try_ctx0_offset = try_hdr_size; - /* - * Add space for one (prof_ctx_t *) per region. - */ - try_hdr_size += try_nregs * - sizeof(prof_ctx_t *); - } -#endif - try_reg0_offset = try_run_size - (try_nregs * - bin->reg_size); - } while (try_hdr_size > try_reg0_offset); - } while (try_run_size <= arena_maxclass - && try_run_size <= arena_maxclass - && RUN_MAX_OVRHD * (bin->reg_size << 3) > RUN_MAX_OVRHD_RELAX - && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size); - - assert(good_hdr_size <= good_reg0_offset); - - /* Copy final settings. */ - bin->run_size = good_run_size; - bin->nregs = good_nregs; -#ifdef JEMALLOC_PROF - bin->ctx0_offset = good_ctx0_offset; -#endif - bin->reg0_offset = good_reg0_offset; - - return (good_run_size); -} - void * arena_malloc_small(arena_t *arena, size_t size, bool zero) { @@ -1479,11 +1399,11 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) binind = SMALL_SIZE2BIN(size); assert(binind < nbins); bin = &arena->bins[binind]; - size = bin->reg_size; + size = arena_bin_info[binind].reg_size; malloc_mutex_lock(&bin->lock); if ((run = bin->runcur) != NULL && run->nfree > 0) - ret = arena_run_reg_alloc(run, bin); + ret = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ret = arena_bin_malloc_hard(arena, bin); @@ -1688,10 +1608,12 @@ arena_salloc(const void *ptr) (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); + size_t binind = arena_bin_index(chunk->arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + - (uintptr_t)run->bin->reg0_offset)) % run->bin->reg_size == + (uintptr_t)bin_info->reg0_offset)) % bin_info->reg_size == 0); - ret = run->bin->reg_size; + ret = bin_info->reg_size; } else { assert(((uintptr_t)ptr & PAGE_MASK) == 0); ret = mapbits & ~PAGE_MASK; @@ -1739,10 +1661,12 @@ arena_salloc_demote(const void *ptr) (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); + size_t binind = arena_bin_index(chunk->arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + - (uintptr_t)run->bin->reg0_offset)) % run->bin->reg_size == + (uintptr_t)bin_info->reg0_offset)) % bin_info->reg_size == 0); - ret = run->bin->reg_size; + ret = bin_info->reg_size; } else { assert(((uintptr_t)ptr & PAGE_MASK) == 0); ret = mapbits & ~PAGE_MASK; @@ -1751,7 +1675,7 @@ arena_salloc_demote(const void *ptr) size_t binind = ((mapbits & CHUNK_MAP_CLASS_MASK) >> CHUNK_MAP_CLASS_SHIFT) - 1; assert(binind < nbins); - ret = chunk->arena->bins[binind].reg_size; + ret = arena_bin_info[binind].reg_size; } assert(ret != 0); } @@ -1768,17 +1692,22 @@ arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, /* Dissociate run from bin. 
*/ if (run == bin->runcur) bin->runcur = NULL; - else if (bin->nregs != 1) { - size_t run_pageind = (((uintptr_t)run - (uintptr_t)chunk)) >> - PAGE_SHIFT; - arena_chunk_map_t *run_mapelm = - &chunk->map[run_pageind-map_bias]; - /* - * This block's conditional is necessary because if the run - * only contains one region, then it never gets inserted into - * the non-full runs tree. - */ - arena_run_tree_remove(&bin->runs, run_mapelm); + else { + size_t binind = arena_bin_index(chunk->arena, bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; + + if (bin_info->nregs != 1) { + size_t run_pageind = (((uintptr_t)run - + (uintptr_t)chunk)) >> PAGE_SHIFT; + arena_chunk_map_t *run_mapelm = + &chunk->map[run_pageind-map_bias]; + /* + * This block's conditional is necessary because if the + * run only contains one region, then it never gets + * inserted into the non-full runs tree. + */ + arena_run_tree_remove(&bin->runs, run_mapelm); + } } } @@ -1786,15 +1715,20 @@ static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin) { + size_t binind; + arena_bin_info_t *bin_info; size_t npages, run_ind, past; assert(run != bin->runcur); assert(arena_run_tree_search(&bin->runs, &chunk->map[ (((uintptr_t)run-(uintptr_t)chunk)>>PAGE_SHIFT)-map_bias]) == NULL); + binind = arena_bin_index(chunk->arena, run->bin); + bin_info = &arena_bin_info[binind]; + malloc_mutex_unlock(&bin->lock); /******************************/ - npages = bin->run_size >> PAGE_SHIFT; + npages = bin_info->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk) >> PAGE_SHIFT); @@ -1814,7 +1748,7 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, chunk->map[run_ind+npages-1-map_bias].bits = CHUNK_MAP_LARGE | (chunk->map[run_ind+npages-1-map_bias].bits & CHUNK_MAP_FLAGS_MASK); - chunk->map[run_ind-map_bias].bits = bin->run_size | + chunk->map[run_ind-map_bias].bits = bin_info->run_size | CHUNK_MAP_LARGE | (chunk->map[run_ind-map_bias].bits & CHUNK_MAP_FLAGS_MASK); arena_run_trim_tail(arena, chunk, run, (npages << PAGE_SHIFT), @@ -1885,8 +1819,10 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); assert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; + size_t binind = arena_bin_index(arena, bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; #if (defined(JEMALLOC_FILL) || defined(JEMALLOC_STATS)) - size = bin->reg_size; + size = bin_info->reg_size; #endif #ifdef JEMALLOC_FILL @@ -1895,7 +1831,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr, #endif arena_run_reg_dalloc(run, ptr); - if (run->nfree == bin->nregs) { + if (run->nfree == bin_info->nregs) { arena_dissociate_bin_run(chunk, run, bin); arena_dalloc_bin_run(arena, chunk, run, bin); } else if (run->nfree == 1 && run != bin->runcur) @@ -2167,8 +2103,8 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, */ if (oldsize <= arena_maxclass) { if (oldsize <= small_maxclass) { - assert(choose_arena()->bins[SMALL_SIZE2BIN( - oldsize)].reg_size == oldsize); + assert(arena_bin_info[SMALL_SIZE2BIN(oldsize)].reg_size + == oldsize); if ((size + extra <= small_maxclass && SMALL_SIZE2BIN(size + extra) == SMALL_SIZE2BIN(oldsize)) || (size <= oldsize && @@ -2248,7 +2184,6 @@ arena_new(arena_t *arena, unsigned ind) { unsigned i; arena_bin_t *bin; - size_t prev_run_size; arena->ind = ind; @@ 
-2284,8 +2219,6 @@ arena_new(arena_t *arena, unsigned ind) arena_avail_tree_new(&arena->runs_avail_dirty); /* Initialize bins. */ - prev_run_size = PAGE_SIZE; - i = 0; #ifdef JEMALLOC_TINY /* (2^n)-spaced tiny bins. */ @@ -2295,11 +2228,6 @@ arena_new(arena_t *arena, unsigned ind) return (true); bin->runcur = NULL; arena_run_tree_new(&bin->runs); - - bin->reg_size = (1U << (LG_TINY_MIN + i)); - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2313,11 +2241,6 @@ arena_new(arena_t *arena, unsigned ind) return (true); bin->runcur = NULL; arena_run_tree_new(&bin->runs); - - bin->reg_size = (i - ntbins + 1) << LG_QUANTUM; - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2330,12 +2253,6 @@ arena_new(arena_t *arena, unsigned ind) return (true); bin->runcur = NULL; arena_run_tree_new(&bin->runs); - - bin->reg_size = cspace_min + ((i - (ntbins + nqbins)) << - LG_CACHELINE); - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2348,12 +2265,6 @@ arena_new(arena_t *arena, unsigned ind) return (true); bin->runcur = NULL; arena_run_tree_new(&bin->runs); - - bin->reg_size = sspace_min + ((i - (ntbins + nqbins + ncbins)) - << LG_SUBPAGE); - - prev_run_size = arena_bin_run_size_calc(bin, prev_run_size); - #ifdef JEMALLOC_STATS memset(&bin->stats, 0, sizeof(malloc_bin_stats_t)); #endif @@ -2487,6 +2398,162 @@ small_size2bin_init_hard(void) #undef CUSTOM_SMALL_SIZE2BIN } +/* + * Calculate bin_info->run_size such that it meets the following constraints: + * + * *) bin_info->run_size >= min_run_size + * *) bin_info->run_size <= arena_maxclass + * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). + * + * bin_info->nregs and bin_info->reg0_offset are also calculated here, since + * these settings are all interdependent. + */ +static size_t +bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) +{ + size_t try_run_size, good_run_size; + uint32_t try_nregs, good_nregs; + uint32_t try_hdr_size, good_hdr_size; +#ifdef JEMALLOC_PROF + uint32_t try_ctx0_offset, good_ctx0_offset; +#endif + uint32_t try_reg0_offset, good_reg0_offset; + + assert(min_run_size >= PAGE_SIZE); + assert(min_run_size <= arena_maxclass); + + /* + * Calculate known-valid settings before entering the run_size + * expansion loop, so that the first part of the loop always copies + * valid settings. + * + * The do..while loop iteratively reduces the number of regions until + * the run header and the regions no longer overlap. A closed formula + * would be quite messy, since there is an interdependency between the + * header's mask length and the number of regions. + */ + try_run_size = min_run_size; + try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin_info->reg_size) + + 1; /* Counter-act try_nregs-- in loop. */ + do { + try_nregs--; + try_hdr_size = sizeof(arena_run_t); +#ifdef JEMALLOC_PROF + if (opt_prof && prof_promote == false) { + /* Pad to a quantum boundary. */ + try_hdr_size = QUANTUM_CEILING(try_hdr_size); + try_ctx0_offset = try_hdr_size; + /* Add space for one (prof_ctx_t *) per region. 
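+			 * (When profiling without promotion, the run
+			 * header carries one prof_ctx_t pointer per
+			 * region; ctx0_offset records where that array
+			 * begins.)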
*/ + try_hdr_size += try_nregs * sizeof(prof_ctx_t *); + } else + try_ctx0_offset = 0; +#endif + try_reg0_offset = try_run_size - (try_nregs * + bin_info->reg_size); + } while (try_hdr_size > try_reg0_offset); + + /* run_size expansion loop. */ + do { + /* + * Copy valid settings before trying more aggressive settings. + */ + good_run_size = try_run_size; + good_nregs = try_nregs; + good_hdr_size = try_hdr_size; +#ifdef JEMALLOC_PROF + good_ctx0_offset = try_ctx0_offset; +#endif + good_reg0_offset = try_reg0_offset; + + /* Try more aggressive settings. */ + try_run_size += PAGE_SIZE; + try_nregs = ((try_run_size - sizeof(arena_run_t)) / + bin_info->reg_size) + + 1; /* Counter-act try_nregs-- in loop. */ + do { + try_nregs--; + try_hdr_size = sizeof(arena_run_t); +#ifdef JEMALLOC_PROF + if (opt_prof && prof_promote == false) { + /* Pad to a quantum boundary. */ + try_hdr_size = QUANTUM_CEILING(try_hdr_size); + try_ctx0_offset = try_hdr_size; + /* + * Add space for one (prof_ctx_t *) per region. + */ + try_hdr_size += try_nregs * + sizeof(prof_ctx_t *); + } +#endif + try_reg0_offset = try_run_size - (try_nregs * + bin_info->reg_size); + } while (try_hdr_size > try_reg0_offset); + } while (try_run_size <= arena_maxclass + && try_run_size <= arena_maxclass + && RUN_MAX_OVRHD * (bin_info->reg_size << 3) > RUN_MAX_OVRHD_RELAX + && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size); + + assert(good_hdr_size <= good_reg0_offset); + + /* Copy final settings. */ + bin_info->run_size = good_run_size; + bin_info->nregs = good_nregs; +#ifdef JEMALLOC_PROF + bin_info->ctx0_offset = good_ctx0_offset; +#endif + bin_info->reg0_offset = good_reg0_offset; + + return (good_run_size); +} + +static bool +bin_info_init(void) +{ + arena_bin_info_t *bin_info; + unsigned i; + size_t prev_run_size; + + arena_bin_info = base_alloc(sizeof(arena_bin_info_t) * nbins); + if (arena_bin_info == NULL) + return (true); + + prev_run_size = PAGE_SIZE; + i = 0; +#ifdef JEMALLOC_TINY + /* (2^n)-spaced tiny bins. */ + for (; i < ntbins; i++) { + bin_info = &arena_bin_info[i]; + bin_info->reg_size = (1U << (LG_TINY_MIN + i)); + prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + } +#endif + + /* Quantum-spaced bins. */ + for (; i < ntbins + nqbins; i++) { + bin_info = &arena_bin_info[i]; + bin_info->reg_size = (i - ntbins + 1) << LG_QUANTUM; + prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + } + + /* Cacheline-spaced bins. */ + for (; i < ntbins + nqbins + ncbins; i++) { + bin_info = &arena_bin_info[i]; + bin_info->reg_size = cspace_min + ((i - (ntbins + nqbins)) << + LG_CACHELINE); + prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + } + + /* Subpage-spaced bins. */ + for (; i < nbins; i++) { + bin_info = &arena_bin_info[i]; + bin_info->reg_size = sspace_min + ((i - (ntbins + nqbins + + ncbins)) << LG_SUBPAGE); + prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + } + + return (false); +} + bool arena_boot(void) { @@ -2545,9 +2612,6 @@ arena_boot(void) abort(); } - if (small_size2bin_init()) - return (true); - /* * Compute the header size such that it is large enough to contain the * page map. 
The page map is biased to omit entries for the header @@ -2571,5 +2635,11 @@ arena_boot(void) arena_maxclass = chunksize - (map_bias << PAGE_SHIFT); + if (small_size2bin_init()) + return (true); + + if (bin_info_init()) + return (true); + return (false); } diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index 1b28da4..c32e955 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -1289,9 +1289,9 @@ CTL_RO_NL_GEN(opt_overcommit, opt_overcommit, bool) /******************************************************************************/ -CTL_RO_NL_GEN(arenas_bin_i_size, arenas[0]->bins[mib[2]].reg_size, size_t) -CTL_RO_NL_GEN(arenas_bin_i_nregs, arenas[0]->bins[mib[2]].nregs, uint32_t) -CTL_RO_NL_GEN(arenas_bin_i_run_size, arenas[0]->bins[mib[2]].run_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t) +CTL_RO_NL_GEN(arenas_bin_i_run_size, arena_bin_info[mib[2]].run_size, size_t) const ctl_node_t * arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i) { diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index e9b067d..88e1cc7 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -253,9 +253,9 @@ tcache_create(arena_t *arena) tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); for (i = 0; i < nbins; i++) { - if ((arena->bins[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { - tcache->tbins[i].ncached_max = (arena->bins[i].nregs << - 1); + if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { + tcache->tbins[i].ncached_max = (arena_bin_info[i].nregs + << 1); } else tcache->tbins[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX; } -- cgit v0.12 From 819d11be068e3f86e31db0956f5a0b29d9971e7f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 14:25:56 -0700 Subject: Add missing error checks. Add missing error checks for pthread_mutex_init() calls. In practice, mutex initialization never fails, so this is merely good hygiene. --- jemalloc/src/jemalloc.c | 3 ++- jemalloc/src/rtree.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 61a36c7..c1aadda 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -735,7 +735,8 @@ malloc_init_hard(void) */ ARENA_SET(arenas[0]); - malloc_mutex_init(&arenas_lock); + if (malloc_mutex_init(&arenas_lock)) + return (true); #ifdef JEMALLOC_PROF if (prof_boot2()) { diff --git a/jemalloc/src/rtree.c b/jemalloc/src/rtree.c index 7753743..eb440aa 100644 --- a/jemalloc/src/rtree.c +++ b/jemalloc/src/rtree.c @@ -20,7 +20,10 @@ rtree_new(unsigned bits) memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) * height)); - malloc_mutex_init(&ret->mutex); + if (malloc_mutex_init(&ret->mutex)) { + /* Leak the rtree. */ + return (NULL); + } ret->height = height; if (bits_per_level * height > bits) ret->level2bits[0] = bits % bits_per_level; -- cgit v0.12 From b602daa6710dab61d8e1ca0cd3c44ac8a564fd9f Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 22:19:45 -0700 Subject: Clean up after arena_bin_info_t change. Fix a couple of problems related to the addition of arena_bin_info_t. 
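To make the affected computation concrete, here is a minimal standalone
sketch of mapping a pointer back to its region index. bin_info_t and
run_regind() are simplified stand-ins rather than jemalloc's actual types,
and plain division stands in for the inverse-multiplication trick that the
real arena_run_regind() uses:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for arena_bin_info_t; illustrative only. */
typedef struct {
	uint32_t	nregs;		/* Regions per run. */
	uint32_t	reg0_offset;	/* Offset of region 0 in the run. */
	size_t		reg_size;	/* Size of each region. */
} bin_info_t;

/* Map a pointer within a run to its region index. */
static unsigned
run_regind(uintptr_t run, const bin_info_t *bin_info, uintptr_t ptr)
{
	uintptr_t diff = ptr - (run + bin_info->reg0_offset);

	assert(diff % bin_info->reg_size == 0);
	return ((unsigned)(diff / bin_info->reg_size));
}

int
main(void)
{
	bin_info_t bi = {126, 64, 32};
	uintptr_t run = 0x10000;

	/* Region 5 starts 64 + 5*32 bytes into the run. */
	printf("%u\n", run_regind(run, &bi, run + 64 + 5 * 32));
	return (0);
}

The signature change below (arena_run_regind() losing its size argument)
follows from reg_size being recoverable from bin_info, as in the sketch.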
--- jemalloc/include/jemalloc/internal/arena.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 467ec65..bd983f2 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -475,7 +475,7 @@ bool arena_boot(void); #ifndef JEMALLOC_ENABLE_INLINE size_t arena_bin_index(arena_t *arena, arena_bin_t *bin); unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, - const void *ptr, size_t size); + const void *ptr); # ifdef JEMALLOC_PROF prof_ctx_t *arena_prof_ctx_get(const void *ptr); void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx); @@ -493,10 +493,10 @@ arena_bin_index(arena_t *arena, arena_bin_t *bin) } JEMALLOC_INLINE unsigned -arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr, - size_t size) +arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) { unsigned shift, diff, regind; + size_t size; assert(run->magic == ARENA_RUN_MAGIC); @@ -508,6 +508,7 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr, bin_info->reg0_offset); /* Rescale (factor powers of 2 out of the numerator and denominator). */ + size = bin_info->reg_size; shift = ffs(size) - 1; diff >>= shift; size >>= shift; @@ -583,8 +584,7 @@ arena_prof_ctx_get(const void *ptr) unsigned regind; assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin_info, ptr, - bin_info->reg_size); + regind = arena_run_regind(run, bin_info, ptr); ret = *(prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t *))); @@ -614,14 +614,14 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); arena_bin_t *bin = run->bin; - unsigned regind; size_t binind; arena_bin_info_t *bin_info; + unsigned regind; assert(run->magic == ARENA_RUN_MAGIC); - regind = arena_run_regind(run, bin, ptr, bin->reg_size); binind = arena_bin_index(chunk->arena, bin); bin_info = &arena_bin_info[binind]; + regind = arena_run_regind(run, bin_info, ptr); *((prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t *)))) = ctx; -- cgit v0.12 From 77f350be08c8b9cd03ceed820b3113dbac9b4151 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 15 Mar 2011 22:23:12 -0700 Subject: Improve backtracing-related configuration. Clean up configuration for backtracing when profiling is enabled, and document the configuration logic in INSTALL. Disable libgcc-based backtracing except on x64 (where it is known to work). Add the --disable-prof-gcc option. --- jemalloc/INSTALL | 21 +++-- jemalloc/configure.ac | 121 +++++++++++++++++++-------- jemalloc/include/jemalloc/jemalloc_defs.h.in | 3 + jemalloc/src/prof.c | 78 ++++++++--------- 4 files changed, 140 insertions(+), 83 deletions(-) diff --git a/jemalloc/INSTALL b/jemalloc/INSTALL index e0a5dc4..11a457a 100644 --- a/jemalloc/INSTALL +++ b/jemalloc/INSTALL @@ -62,18 +62,23 @@ any of the following arguments (not a definitive list) to 'configure': --enable-prof Enable heap profiling and leak detection functionality. See the "opt.prof" - option documentation for usage details. + option documentation for usage details. 
When enabled, there are several + approaches to backtracing, and the configure script chooses the first one + in the following list that appears to function correctly: ---disable-prof-libgcc - Disable the use of libgcc's backtracing functionality. Ordinarily, libgcc's - backtracing functionality is superior to the alternatives, but it may fail - to capture backtraces on some systems. + + libunwind (requires --enable-prof-libunwind) + + libgcc (unless --disable-prof-libgcc) + + gcc intrinsics (unless --disable-prof-gcc) --enable-prof-libunwind Use the libunwind library (http://www.nongnu.org/libunwind/) for stack - backtracing. libunwind is quite slow, but it tends to work across a wider - variety of system configurations than the default backtracing code, which is - based on libgcc functionality or gcc intrinsics. + backtracing. + +--disable-prof-libgcc + Disable the use of libgcc's backtracing functionality. + +--disable-prof-gcc + Disable the use of gcc intrinsics for backtracing. --with-static-libunwind= Statically link against the specified libunwind.a rather than dynamically diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index f10641b..dfe2b9b 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -404,17 +404,12 @@ fi ], [enable_prof="0"] ) -AC_ARG_ENABLE([prof-libgcc], - [AS_HELP_STRING([--disable-prof-libgcc], - [Do not use libgcc for backtracing])], -[if test "x$enable_prof_libgcc" = "xno" ; then - enable_prof_libgcc="0" +if test "x$enable_prof" = "x1" ; then + backtrace_method="" else - enable_prof_libgcc="1" + backtrace_method="N/A" fi -], -[enable_prof_libgcc="1"] -) + AC_ARG_ENABLE([prof-libunwind], [AS_HELP_STRING([--enable-prof-libunwind], [Use libunwind for backtracing])], [if test "x$enable_prof_libunwind" = "xno" ; then @@ -438,39 +433,90 @@ else fi, LUNWIND="-lunwind" ) -if test "x$enable_prof" = "x1" ; then - LIBS="$LIBS -lm" - AC_DEFINE([JEMALLOC_PROF], [ ]) - if test "x$enable_prof_libunwind" = "x1" ; then - AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) - if test "x$LUNWIND" = "x-lunwind" ; then - AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], - [enable_prof_libunwind="0"]) - else - LIBS="$LIBS $LUNWIND" - fi - if test "x${enable_prof_libunwind}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ]) - fi +if test "x$backtrace_method" = "x" -a "x$enable_prof_libunwind" = "x1" ; then + AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) + if test "x$LUNWIND" = "x-lunwind" ; then + AC_CHECK_LIB([unwind], [backtrace], [LIBS="$LIBS $LUNWIND"], + [enable_prof_libunwind="0"]) + else + LIBS="$LIBS $LUNWIND" + fi + if test "x${enable_prof_libunwind}" = "x1" ; then + backtrace_method="libunwind" + AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ]) fi fi -AC_SUBST([enable_prof]) -dnl If libunwind isn't enabled, try to use libgcc rather than gcc intrinsics -dnl for backtracing. 
-if test "x$enable_prof" = "x1" -a "x$enable_prof_libgcc" = "x1" ; then - if test "x$enable_prof_libunwind" = "x0" -a "x$GCC" = "xyes" ; then - enable_prof_libgcc="1" - AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) - AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) - if test "x${enable_prof_libgcc}" = "x1" ; then - AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) - fi - else - enable_prof_libgcc="0" +AC_ARG_ENABLE([prof-libgcc], + [AS_HELP_STRING([--disable-prof-libgcc], + [Do not use libgcc for backtracing])], +[if test "x$enable_prof_libgcc" = "xno" ; then + enable_prof_libgcc="0" +else + enable_prof_libgcc="1" +fi +], +[enable_prof_libgcc="1"] +) +if test "x$backtrace_method" = "x" -a "x$enable_prof_libgcc" = "x1" \ + -a "x$GCC" = "xyes" ; then + AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) + AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [LIBS="$LIBS -lgcc"], [enable_prof_libgcc="0"]) + dnl The following is conservative, in that it only has entries for CPUs on + dnl which jemalloc has been tested. + AC_MSG_CHECKING([libgcc-based backtracing reliability on ${host_cpu}]) + case "${host_cpu}" in + i[[3456]]86) + AC_MSG_RESULT([unreliable]) + enable_prof_libgcc="0"; + ;; + x86_64) + AC_MSG_RESULT([reliable]) + ;; + *) + AC_MSG_RESULT([unreliable]) + enable_prof_libgcc="0"; + ;; + esac + if test "x${enable_prof_libgcc}" = "x1" ; then + backtrace_method="libgcc" + AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ]) fi +else + enable_prof_libgcc="0" +fi + +AC_ARG_ENABLE([prof-gcc], + [AS_HELP_STRING([--disable-prof-gcc], + [Do not use gcc intrinsics for backtracing])], +[if test "x$enable_prof_gcc" = "xno" ; then + enable_prof_gcc="0" +else + enable_prof_gcc="1" +fi +], +[enable_prof_gcc="1"] +) +if test "x$backtrace_method" = "x" -a "x$enable_prof_gcc" = "x1" \ + -a "x$GCC" = "xyes" ; then + backtrace_method="gcc intrinsics" + AC_DEFINE([JEMALLOC_PROF_GCC], [ ]) +else + enable_prof_gcc="0" fi +if test "x$backtrace_method" = "x" ; then + backtrace_method="none (disabling profiling)" + enable_prof="0" +fi +AC_MSG_CHECKING([configured backtracing method]) +AC_MSG_RESULT([$backtrace_method]) +if test "x$enable_prof" = "x1" ; then + LIBS="$LIBS -lm" + AC_DEFINE([JEMALLOC_PROF], [ ]) +fi +AC_SUBST([enable_prof]) + dnl Enable tiny allocations by default. AC_ARG_ENABLE([tiny], [AS_HELP_STRING([--disable-tiny], [Disable tiny (sub-quantum) allocations])], @@ -810,8 +856,9 @@ AC_MSG_RESULT([cc-silence : ${enable_cc_silence}]) AC_MSG_RESULT([debug : ${enable_debug}]) AC_MSG_RESULT([stats : ${enable_stats}]) AC_MSG_RESULT([prof : ${enable_prof}]) -AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}]) AC_MSG_RESULT([prof-libunwind : ${enable_prof_libunwind}]) +AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}]) +AC_MSG_RESULT([prof-gcc : ${enable_prof_gcc}]) AC_MSG_RESULT([tiny : ${enable_tiny}]) AC_MSG_RESULT([tcache : ${enable_tcache}]) AC_MSG_RESULT([fill : ${enable_fill}]) diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 5f46c5c..773c9f8 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -53,6 +53,9 @@ /* Use libgcc for profile backtracing if defined. */ #undef JEMALLOC_PROF_LIBGCC +/* Use gcc intrinsics for profile backtracing if defined. */ +#undef JEMALLOC_PROF_GCC + /* * JEMALLOC_TINY enables support for tiny objects, which are smaller than one * quantum. 
diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c index 3566c6d..8370042 100644 --- a/jemalloc/src/prof.c +++ b/jemalloc/src/prof.c @@ -3,15 +3,15 @@ #ifdef JEMALLOC_PROF /******************************************************************************/ -#ifdef JEMALLOC_PROF_LIBGCC -#include -#endif - #ifdef JEMALLOC_PROF_LIBUNWIND #define UNW_LOCAL_ONLY #include #endif +#ifdef JEMALLOC_PROF_LIBGCC +#include +#endif + /******************************************************************************/ /* Data. */ @@ -169,39 +169,7 @@ prof_leave(void) prof_gdump(); } -#ifdef JEMALLOC_PROF_LIBGCC -static _Unwind_Reason_Code -prof_unwind_init_callback(struct _Unwind_Context *context, void *arg) -{ - - return (_URC_NO_REASON); -} - -static _Unwind_Reason_Code -prof_unwind_callback(struct _Unwind_Context *context, void *arg) -{ - prof_unwind_data_t *data = (prof_unwind_data_t *)arg; - - if (data->nignore > 0) - data->nignore--; - else { - data->bt->vec[data->bt->len] = (void *)_Unwind_GetIP(context); - data->bt->len++; - if (data->bt->len == data->max) - return (_URC_END_OF_STACK); - } - - return (_URC_NO_REASON); -} - -void -prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) -{ - prof_unwind_data_t data = {bt, nignore, max}; - - _Unwind_Backtrace(prof_unwind_callback, &data); -} -#elif defined(JEMALLOC_PROF_LIBUNWIND) +#ifdef JEMALLOC_PROF_LIBUNWIND void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { @@ -236,7 +204,41 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) break; } } -#else +#endif +#ifdef JEMALLOC_PROF_LIBGCC +static _Unwind_Reason_Code +prof_unwind_init_callback(struct _Unwind_Context *context, void *arg) +{ + + return (_URC_NO_REASON); +} + +static _Unwind_Reason_Code +prof_unwind_callback(struct _Unwind_Context *context, void *arg) +{ + prof_unwind_data_t *data = (prof_unwind_data_t *)arg; + + if (data->nignore > 0) + data->nignore--; + else { + data->bt->vec[data->bt->len] = (void *)_Unwind_GetIP(context); + data->bt->len++; + if (data->bt->len == data->max) + return (_URC_END_OF_STACK); + } + + return (_URC_NO_REASON); +} + +void +prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) +{ + prof_unwind_data_t data = {bt, nignore, max}; + + _Unwind_Backtrace(prof_unwind_callback, &data); +} +#endif +#ifdef JEMALLOC_PROF_GCC void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max) { -- cgit v0.12 From 84c8eefeffa246607790ad12e28b0f6a24ecc59d Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 16 Mar 2011 10:30:13 -0700 Subject: Use bitmaps to track small regions. The previous free list implementation, which embedded singly linked lists in available regions, had the unfortunate side effect of causing many cache misses during thread cache fills. Fix this in two places: - arena_run_t: Use a new bitmap implementation to track which regions are available. Furthermore, revert to preferring the lowest available region (as jemalloc did with its old bitmap-based approach). - tcache_t: Move read-only tcache_bin_t metadata into tcache_bin_info_t, and add a contiguous array of pointers to tcache_t in order to track cached objects. This substantially increases the size of tcache_t, but results in much higher data locality for common tcache operations. As a side benefit, it is again possible to efficiently flush the least recently used cached objects, so this change changes flushing from MRU to LRU. The new bitmap implementation uses a multi-level summary approach to make finding the lowest available region very fast. 
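
As a concrete illustration of the summary levels (assuming 64-bit longs,
hence 64-bit bitmap groups): a run with 512 regions needs 512/64 = 8
level-0 groups, which are summarized by a single level-1 group, so
bitmap_sfu() finds the lowest free region with two ffsl() calls no matter
which bits are set; even a maximal bitmap of 2^18 bits needs only
4096 + 64 + 1 groups across three levels.
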
In practice, bitmaps only have one or two levels, though the implementation is general enough to handle extremely large bitmaps, mainly so that large page sizes can still be entertained. Fix tcache_bin_flush_large() to always flush statistics, in the same way that tcache_bin_flush_small() was recently fixed. Use JEMALLOC_DEBUG rather than NDEBUG. Add dassert(), and use it for debug-only asserts. --- jemalloc/Makefile.in | 10 +- jemalloc/configure.ac | 18 ++ jemalloc/include/jemalloc/internal/arena.h | 41 +++-- jemalloc/include/jemalloc/internal/bitmap.h | 184 +++++++++++++++++++++ .../jemalloc/internal/jemalloc_internal.h.in | 32 +++- jemalloc/include/jemalloc/internal/prof.h | 4 +- jemalloc/include/jemalloc/internal/tcache.h | 51 ++++-- jemalloc/include/jemalloc/jemalloc_defs.h.in | 3 + jemalloc/src/arena.c | 107 ++++++------ jemalloc/src/bitmap.c | 90 ++++++++++ jemalloc/src/ckh.c | 12 +- jemalloc/src/jemalloc.c | 5 +- jemalloc/src/tcache.c | 129 ++++++++++----- jemalloc/test/bitmap.c | 153 +++++++++++++++++ jemalloc/test/bitmap.exp | 2 + 15 files changed, 702 insertions(+), 139 deletions(-) create mode 100644 jemalloc/include/jemalloc/internal/bitmap.h create mode 100644 jemalloc/src/bitmap.c create mode 100644 jemalloc/test/bitmap.c create mode 100644 jemalloc/test/bitmap.exp diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index 7a13f21..8ee4c93 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -46,7 +46,7 @@ BINS := @srcroot@bin/pprof CHDRS := @objroot@include/jemalloc/jemalloc@install_suffix@.h \ @objroot@include/jemalloc/jemalloc_defs@install_suffix@.h CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/base.c \ - @srcroot@src/chunk.c @srcroot@src/chunk_dss.c \ + @srcroot@src/bitmap.c @srcroot@src/chunk.c @srcroot@src/chunk_dss.c \ @srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \ @srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \ @srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \ @@ -65,8 +65,9 @@ DOCS_HTML := $(DOCS_XML:@objroot@%.xml=@srcroot@%.html) DOCS_MAN3 := $(DOCS_XML:@objroot@%.xml=@srcroot@%.3) DOCS := $(DOCS_HTML) $(DOCS_MAN3) CTESTS := @srcroot@test/allocated.c @srcroot@test/allocm.c \ - @srcroot@test/mremap.c @srcroot@test/posix_memalign.c \ - @srcroot@test/rallocm.c @srcroot@test/thread_arena.c + @srcroot@test/bitmap.c @srcroot@test/mremap.c \ + @srcroot@test/posix_memalign.c @srcroot@test/rallocm.c \ + @srcroot@test/thread_arena.c .PHONY: all dist doc_html doc_man doc .PHONY: install_bin install_include install_lib @@ -127,6 +128,9 @@ doc: $(DOCS) $(CC) $(CFLAGS) -c $(CPPFLAGS) -I@objroot@test -o $@ $< @$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) -I@objroot@test $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)" +# Automatic dependency generation misses #include "*.c". 
+@objroot@test/bitmap.o : @objroot@src/bitmap.o + @objroot@test/%: @objroot@test/%.o \ @objroot@lib/libjemalloc@install_suffix@.$(SO) @mkdir -p $(@D) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index dfe2b9b..dc77d75 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -132,6 +132,16 @@ else fi AC_DEFINE_UNQUOTED([LG_SIZEOF_INT], [$LG_SIZEOF_INT]) +AC_CHECK_SIZEOF([long]) +if test "x${ac_cv_sizeof_long}" = "x8" ; then + LG_SIZEOF_LONG=3 +elif test "x${ac_cv_sizeof_long}" = "x4" ; then + LG_SIZEOF_LONG=2 +else + AC_MSG_ERROR([Unsupported long size: ${ac_cv_sizeof_long}]) +fi +AC_DEFINE_UNQUOTED([LG_SIZEOF_LONG], [$LG_SIZEOF_LONG]) + AC_CANONICAL_HOST dnl CPU-specific settings. CPU_SPINWAIT="" @@ -753,6 +763,14 @@ if test "x${enable_tls}" = "x0" ; then fi dnl ============================================================================ +dnl Check for ffsl(3), and fail if not found. This function exists on all +dnl platforms that jemalloc currently has a chance of functioning on without +dnl modification. + +AC_CHECK_FUNC([ffsl], [], + [AC_MSG_ERROR([Cannot build without ffsl(3)])]) + +dnl ============================================================================ dnl Check for allocator-related functions that should be wrapped. AC_CHECK_FUNC([memalign], diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index bd983f2..1744b45 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -209,18 +209,15 @@ struct arena_run_s { /* Bin this run is associated with. */ arena_bin_t *bin; - /* Stack of available freed regions, or NULL. */ - void *avail; - - /* Next region that has never been allocated, or run boundary. */ - void *next; + /* Index of next region that has never been allocated, or nregs. */ + uint32_t nextind; /* Number of free regions in run. */ unsigned nfree; }; /* - * Read-only information associated with each element for arena_t's bins array + * Read-only information associated with each element of arena_t's bins array * is stored separately, partly to reduce memory usage (only one copy, rather * than one per arena), but mainly to avoid false cacheline sharing. */ @@ -234,6 +231,18 @@ struct arena_bin_info_s { /* Total number of regions in a run for this bin's size class. */ uint32_t nregs; + /* + * Offset of first bitmap_t element in a run header for this bin's size + * class. + */ + uint32_t bitmap_offset; + + /* + * Metadata used to manipulate bitmaps for runs associated with this + * bin. + */ + bitmap_info_t bitmap_info; + #ifdef JEMALLOC_PROF /* * Offset of first (prof_ctx_t *) in a run header for this bin's size @@ -397,7 +406,7 @@ struct arena_s { extern size_t opt_lg_qspace_max; extern size_t opt_lg_cspace_max; -extern ssize_t opt_lg_dirty_mult; +extern ssize_t opt_lg_dirty_mult; /* * small_size2bin is a compact lookup table that rounds request sizes up to * size classes. In order to reduce cache footprint, the table is compressed, @@ -498,7 +507,13 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) unsigned shift, diff, regind; size_t size; - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); + /* + * Freeing a pointer lower than region zero can cause assertion + * failure. + */ + assert((uintptr_t)ptr >= (uintptr_t)run + + (uintptr_t)bin_info->reg0_offset); /* * Avoid doing division with a variable divisor if possible. 
Using @@ -583,7 +598,7 @@ arena_prof_ctx_get(const void *ptr) arena_bin_info_t *bin_info = &arena_bin_info[binind]; unsigned regind; - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); regind = arena_run_regind(run, bin_info, ptr); ret = *(prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset + (regind * @@ -618,7 +633,7 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx) arena_bin_info_t *bin_info; unsigned regind; - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); binind = arena_bin_index(chunk->arena, bin); bin_info = &arena_bin_info[binind]; regind = arena_run_regind(run, bin_info, ptr); @@ -639,7 +654,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) arena_chunk_map_t *mapelm; assert(arena != NULL); - assert(arena->magic == ARENA_MAGIC); + dassert(arena->magic == ARENA_MAGIC); assert(chunk->arena == arena); assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); @@ -662,9 +677,9 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; -#ifndef NDEBUG +#ifdef JEMALLOC_DEBUG { size_t binind = arena_bin_index(arena, bin); arena_bin_info_t *bin_info = diff --git a/jemalloc/include/jemalloc/internal/bitmap.h b/jemalloc/include/jemalloc/internal/bitmap.h new file mode 100644 index 0000000..4bb2212 --- /dev/null +++ b/jemalloc/include/jemalloc/internal/bitmap.h @@ -0,0 +1,184 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ +#define LG_BITMAP_MAXBITS 18 + +typedef struct bitmap_level_s bitmap_level_t; +typedef struct bitmap_info_s bitmap_info_t; +typedef unsigned long bitmap_t; +#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG + +/* Number of bits per group. */ +#define LG_BITMAP_GROUP_NBITS (LG_SIZEOF_BITMAP + 3) +#define BITMAP_GROUP_NBITS (ZU(1) << LG_BITMAP_GROUP_NBITS) +#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1) + +/* Maximum number of levels possible. */ +#define BITMAP_MAX_LEVELS \ + (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \ + + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP) + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +struct bitmap_level_s { + /* Offset of this level's groups within the array of groups. */ + size_t group_offset; +}; + +struct bitmap_info_s { + /* Logical number of bits in bitmap (stored at bottom level). */ + size_t nbits; + + /* Number of levels necessary for nbits. */ + unsigned nlevels; + + /* + * Only the first (nlevels+1) elements are used, and levels are ordered + * bottom to top (e.g. the bottom level is stored in levels[0]). 
+ */ + bitmap_level_t levels[BITMAP_MAX_LEVELS+1]; +}; + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +void bitmap_info_init(bitmap_info_t *binfo, size_t nbits); +size_t bitmap_info_ngroups(const bitmap_info_t *binfo); +size_t bitmap_size(size_t nbits); +void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo); + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#ifndef JEMALLOC_ENABLE_INLINE +bool bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo); +bool bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit); +void bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit); +size_t bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo); +void bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_BITMAP_C_)) +JEMALLOC_INLINE bool +bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) +{ + unsigned rgoff = binfo->levels[binfo->nlevels].group_offset - 1; + bitmap_t rg = bitmap[rgoff]; + /* The bitmap is full iff the root group is 0. */ + return (rg == 0); +} + +JEMALLOC_INLINE bool +bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) +{ + size_t goff; + bitmap_t g; + + assert(bit < binfo->nbits); + goff = bit >> LG_BITMAP_GROUP_NBITS; + g = bitmap[goff]; + return (!(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)))); +} + +JEMALLOC_INLINE void +bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) +{ + size_t goff; + bitmap_t *gp; + bitmap_t g; + + assert(bit < binfo->nbits); + assert(bitmap_get(bitmap, binfo, bit) == false); + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[goff]; + g = *gp; + assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))); + g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + assert(bitmap_get(bitmap, binfo, bit)); + /* Propagate group state transitions up the tree. */ + if (g == 0) { + unsigned i; + for (i = 1; i < binfo->nlevels; i++) { + bit = goff; + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[binfo->levels[i].group_offset + goff]; + g = *gp; + assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))); + g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + if (g != 0) + break; + } + } +} + +/* sfu: set first unset. */ +JEMALLOC_INLINE size_t +bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) +{ + size_t bit; + bitmap_t g; + unsigned i; + + assert(bitmap_full(bitmap, binfo) == false); + + i = binfo->nlevels - 1; + g = bitmap[binfo->levels[i].group_offset]; + bit = ffsl(g) - 1; + while (i > 0) { + i--; + g = bitmap[binfo->levels[i].group_offset + bit]; + bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1); + } + + bitmap_set(bitmap, binfo, bit); + return (bit); +} + +JEMALLOC_INLINE void +bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) +{ + size_t goff; + bitmap_t *gp; + bitmap_t g; + bool propagate; + + assert(bit < binfo->nbits); + assert(bitmap_get(bitmap, binfo, bit)); + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[goff]; + g = *gp; + propagate = (g == 0); + assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0); + g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + assert(bitmap_get(bitmap, binfo, bit) == false); + /* Propagate group state transitions up the tree. 
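	 * A hedged example, assuming 64-bit groups: freeing region 70 sets
	 * bit 6 (70 & 63) of level-0 group 1 (70 >> 6); the raw bits are
	 * inverted, so 1 means free. If that group was 0 beforehand (fully
	 * allocated), bit 1 of the level-1 group is set as well, so that
	 * bitmap_sfu() can reach the newly freed region from the root.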
*/ + if (propagate) { + unsigned i; + for (i = 1; i < binfo->nlevels; i++) { + bit = goff; + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[binfo->levels[i].group_offset + goff]; + g = *gp; + propagate = (g == 0); + assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) + == 0); + g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + if (propagate == false) + break; + } + } +} + +#endif + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 34b2a23..a80fc7c 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -55,8 +55,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); * Define a custom assert() in order to reduce the chances of deadlock during * assertion failure. */ -#ifdef JEMALLOC_DEBUG -# define assert(e) do { \ +#ifndef assert +# ifdef JEMALLOC_DEBUG +# define assert(e) do { \ if (!(e)) { \ char line_buf[UMAX2S_BUFSIZE]; \ malloc_write(": "); \ @@ -70,8 +71,15 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); abort(); \ } \ } while (0) +# else +# define assert(e) +# endif +#endif + +#ifdef JEMALLOC_DEBUG +# define dassert(e) assert(e) #else -#define assert(e) +# define dassert(e) #endif /* @@ -146,7 +154,19 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #define QUANTUM_CEILING(a) \ (((a) + QUANTUM_MASK) & ~QUANTUM_MASK) +#define LONG ((size_t)(1U << LG_SIZEOF_LONG)) +#define LONG_MASK (LONG - 1) + +/* Return the smallest long multiple that is >= a. */ +#define LONG_CEILING(a) \ + (((a) + LONG_MASK) & ~LONG_MASK) + #define SIZEOF_PTR (1U << LG_SIZEOF_PTR) +#define PTR_MASK (SIZEOF_PTR - 1) + +/* Return the smallest (void *) multiple that is >= a. */ +#define PTR_CEILING(a) \ + (((a) + PTR_MASK) & ~PTR_MASK) /* * Maximum size of L1 cache line. This is used to avoid cache line aliasing. @@ -199,6 +219,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" @@ -222,6 +243,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" @@ -335,6 +357,7 @@ void jemalloc_postfork(void); #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" #include "jemalloc/internal/base.h" @@ -545,6 +568,7 @@ thread_allocated_get(void) #endif #endif +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/tcache.h" #include "jemalloc/internal/arena.h" @@ -628,7 +652,7 @@ isalloc(const void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. 
*/ - assert(chunk->arena->magic == ARENA_MAGIC); + dassert(chunk->arena->magic == ARENA_MAGIC); #ifdef JEMALLOC_PROF ret = arena_salloc_demote(ptr); diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h index db63465..f943873 100644 --- a/jemalloc/include/jemalloc/internal/prof.h +++ b/jemalloc/include/jemalloc/internal/prof.h @@ -348,7 +348,7 @@ prof_ctx_get(const void *ptr) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); + dassert(chunk->arena->magic == ARENA_MAGIC); ret = arena_prof_ctx_get(ptr); } else @@ -367,7 +367,7 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx) chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk != ptr) { /* Region. */ - assert(chunk->arena->magic == ARENA_MAGIC); + dassert(chunk->arena->magic == ARENA_MAGIC); arena_prof_ctx_set(ptr, ctx); } else diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index ab02545..5434d32 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -2,6 +2,7 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +typedef struct tcache_bin_info_s tcache_bin_info_t; typedef struct tcache_bin_s tcache_bin_t; typedef struct tcache_s tcache_t; @@ -32,14 +33,21 @@ typedef struct tcache_s tcache_t; /******************************************************************************/ #ifdef JEMALLOC_H_STRUCTS +/* + * Read-only information associated with each element of tcache_t's tbins array + * is stored separately, mainly to reduce memory usage. + */ +struct tcache_bin_info_s { + unsigned ncached_max; /* Upper limit on ncached. */ +}; + struct tcache_bin_s { # ifdef JEMALLOC_STATS tcache_bin_stats_t tstats; # endif unsigned low_water; /* Min # cached since last GC. */ unsigned ncached; /* # of cached objects. */ - unsigned ncached_max; /* Upper limit on ncached. */ - void *avail; /* Chain of available objects. */ + void **avail; /* Stack of available objects. */ }; struct tcache_s { @@ -53,6 +61,12 @@ struct tcache_s { unsigned ev_cnt; /* Event count since incremental GC. */ unsigned next_gc_bin; /* Next bin to GC. */ tcache_bin_t tbins[1]; /* Dynamically sized. */ + /* + * The pointer stacks associated with tbins follow as a contiguous + * array. During tcache initialization, the avail pointer in each + * element of tbins is initialized to point to the proper offset within + * this array. + */ }; #endif /* JEMALLOC_H_STRUCTS */ @@ -63,6 +77,8 @@ extern bool opt_tcache; extern ssize_t opt_lg_tcache_max; extern ssize_t opt_lg_tcache_gc_sweep; +extern tcache_bin_info_t *tcache_bin_info; + /* Map of thread-specific caches. 
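 * A condensed sketch of the new avail-stack protocol, paraphrasing
 * tcache_alloc_easy() and tcache_dalloc_small() in this diff, with
 * ncached indexing one past the most recently cached object:
 *
 *	allocate: tbin->ncached--; ret = tbin->avail[tbin->ncached];
 *	cache:    tbin->avail[tbin->ncached] = ptr; tbin->ncached++;
 *
 * Unlike the old embedded free list, a refill or pop touches only the
 * contiguous pointer array rather than each cached object, which is the
 * locality win described in the commit message.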
*/ #ifndef NO_TLS extern __thread tcache_t *tcache_tls @@ -109,7 +125,7 @@ void tcache_destroy(tcache_t *tcache); #ifdef JEMALLOC_STATS void tcache_stats_merge(tcache_t *tcache, arena_t *arena); #endif -void tcache_boot(void); +bool tcache_boot(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ @@ -211,8 +227,7 @@ tcache_alloc_easy(tcache_bin_t *tbin) tbin->ncached--; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; - ret = tbin->avail; - tbin->avail = *(void **)ret; + ret = tbin->avail[tbin->ncached]; return (ret); } @@ -312,6 +327,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) arena_run_t *run; arena_bin_t *bin; tcache_bin_t *tbin; + tcache_bin_info_t *tbin_info; size_t pageind, binind; arena_chunk_map_t *mapelm; @@ -323,7 +339,7 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) mapelm = &chunk->map[pageind-map_bias]; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; binind = ((uintptr_t)bin - (uintptr_t)&arena->bins) / sizeof(arena_bin_t); @@ -335,16 +351,17 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr) #endif tbin = &tcache->tbins[binind]; - if (tbin->ncached == tbin->ncached_max) { - tcache_bin_flush_small(tbin, binind, (tbin->ncached_max >> 1) + tbin_info = &tcache_bin_info[binind]; + if (tbin->ncached == tbin_info->ncached_max) { + tcache_bin_flush_small(tbin, binind, (tbin_info->ncached_max >> + 1) #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) , tcache #endif ); } - assert(tbin->ncached < tbin->ncached_max); - *(void **)ptr = tbin->avail; - tbin->avail = ptr; + assert(tbin->ncached < tbin_info->ncached_max); + tbin->avail[tbin->ncached] = ptr; tbin->ncached++; tcache_event(tcache); @@ -357,6 +374,7 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) arena_chunk_t *chunk; size_t pageind, binind; tcache_bin_t *tbin; + tcache_bin_info_t *tbin_info; assert((size & PAGE_MASK) == 0); assert(arena_salloc(ptr) > small_maxclass); @@ -373,16 +391,17 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size) #endif tbin = &tcache->tbins[binind]; - if (tbin->ncached == tbin->ncached_max) { - tcache_bin_flush_large(tbin, binind, (tbin->ncached_max >> 1) + tbin_info = &tcache_bin_info[binind]; + if (tbin->ncached == tbin_info->ncached_max) { + tcache_bin_flush_large(tbin, binind, (tbin_info->ncached_max >> + 1) #if (defined(JEMALLOC_STATS) || defined(JEMALLOC_PROF)) , tcache #endif ); } - assert(tbin->ncached < tbin->ncached_max); - *(void **)ptr = tbin->avail; - tbin->avail = ptr; + assert(tbin->ncached < tbin_info->ncached_max); + tbin->avail[tbin->ncached] = ptr; tbin->ncached++; tcache_event(tcache); diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index 773c9f8..d669841 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -140,4 +140,7 @@ /* sizeof(int) == 2^LG_SIZEOF_INT. */ #undef LG_SIZEOF_INT +/* sizeof(long) == 2^LG_SIZEOF_LONG. 
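 * (For example, on typical LP64 platforms sizeof(long) == 8, so
 * LG_SIZEOF_LONG == 3 and bitmap groups hold 64 bits; on ILP32 platforms
 * sizeof(long) == 4, LG_SIZEOF_LONG == 2, and groups hold 32 bits.)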
*/ +#undef LG_SIZEOF_LONG + #endif /* JEMALLOC_DEFS_H_ */ diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index e49b8ed..87bd9bb 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -253,59 +253,45 @@ static inline void * arena_run_reg_alloc(arena_run_t *run, arena_bin_info_t *bin_info) { void *ret; + unsigned regind; + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); assert(run->nfree > 0); + assert(bitmap_full(bitmap, &bin_info->bitmap_info) == false); + regind = bitmap_sfu(bitmap, &bin_info->bitmap_info); + ret = (void *)((uintptr_t)run + (uintptr_t)bin_info->reg0_offset + + (uintptr_t)(bin_info->reg_size * regind)); run->nfree--; - ret = run->avail; - if (ret != NULL) { - /* Double free can cause assertion failure.*/ - assert(ret != NULL); - /* Write-after free can cause assertion failure. */ - assert((uintptr_t)ret >= (uintptr_t)run + - (uintptr_t)bin_info->reg0_offset); - assert((uintptr_t)ret < (uintptr_t)run->next); - assert(((uintptr_t)ret - ((uintptr_t)run + - (uintptr_t)bin_info->reg0_offset)) % - (uintptr_t)bin_info->reg_size == 0); - run->avail = *(void **)ret; - return (ret); - } - ret = run->next; - run->next = (void *)((uintptr_t)ret + (uintptr_t)bin_info->reg_size); - assert(ret != NULL); + if (regind == run->nextind) + run->nextind++; + assert(regind < run->nextind); return (ret); } static inline void arena_run_reg_dalloc(arena_run_t *run, void *ptr) { - -#ifndef NDEBUG arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; + unsigned regind = arena_run_regind(run, bin_info, ptr); + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); + assert(run->nfree < bin_info->nregs); /* Freeing an interior pointer can cause assertion failure. */ assert(((uintptr_t)ptr - ((uintptr_t)run + (uintptr_t)bin_info->reg0_offset)) % (uintptr_t)bin_info->reg_size == 0); - /* - * Freeing a pointer lower than region zero can cause assertion - * failure. - */ assert((uintptr_t)ptr >= (uintptr_t)run + (uintptr_t)bin_info->reg0_offset); - /* - * Freeing a pointer past in the run's frontier can cause assertion - * failure. - */ - assert((uintptr_t)ptr < (uintptr_t)run->next); -#endif + /* Freeing an unallocated pointer can cause assertion failure. */ + assert(bitmap_get(bitmap, &bin_info->bitmap_info, regind)); - *(void **)ptr = run->avail; - run->avail = ptr; + bitmap_unset(bitmap, &bin_info->bitmap_info, regind); run->nfree++; } @@ -772,7 +758,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) chunk + (uintptr_t)(pageind << PAGE_SHIFT)); assert((mapelm->bits >> PAGE_SHIFT) == 0); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(arena, run->bin); arena_bin_info_t *bin_info = @@ -1224,12 +1210,14 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) malloc_mutex_lock(&arena->lock); run = arena_run_alloc(arena, bin_info->run_size, false, false); if (run != NULL) { + bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + + (uintptr_t)bin_info->bitmap_offset); + /* Initialize run internals. 
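		 * A hedged illustration of the indexing used by
		 * arena_run_reg_alloc() above (numbers assumed, not from
		 * the patch): region regind lives at
		 *
		 *	(uintptr_t)run + reg0_offset + regind * reg_size
		 *
		 * so with reg0_offset == 96 and reg_size == 32, region 3 of
		 * a run at 0x1000 sits at 0x1000 + 96 + 3*32 == 0x10c0.
		 * Handing out the lowest free regind from bitmap_sfu() is
		 * what restores lowest-address-first allocation.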
*/ run->bin = bin; - run->avail = NULL; - run->next = (void *)((uintptr_t)run + - (uintptr_t)bin_info->reg0_offset); + run->nextind = 0; run->nfree = bin_info->nregs; + bitmap_init(bitmap, &bin_info->bitmap_info); #ifdef JEMALLOC_DEBUG run->magic = ARENA_RUN_MAGIC; #endif @@ -1289,12 +1277,11 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) bin->runcur = NULL; run = arena_bin_nonfull_run_get(arena, bin); if (bin->runcur != NULL && bin->runcur->nfree > 0) { - /* * Another thread updated runcur while this one ran without the * bin lock in arena_bin_nonfull_run_get(). */ - assert(bin->runcur->magic == ARENA_RUN_MAGIC); + dassert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); ret = arena_run_reg_alloc(bin->runcur, bin_info); if (run != NULL) { @@ -1302,7 +1289,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) /* * arena_run_alloc() may have allocated run, or it may - * have pulled it from the bin's run tree. Therefore + * have pulled run from the bin's run tree. Therefore * it is unsafe to make any assumptions about how run * has previously been used, and arena_bin_lower_run() * must be called, as if a region were just deallocated @@ -1322,7 +1309,7 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) bin->runcur = run; - assert(bin->runcur->magic == ARENA_RUN_MAGIC); + dassert(bin->runcur->magic == ARENA_RUN_MAGIC); assert(bin->runcur->nfree > 0); return (arena_run_reg_alloc(bin->runcur, bin_info)); @@ -1365,15 +1352,15 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind #endif bin = &arena->bins[binind]; malloc_mutex_lock(&bin->lock); - for (i = 0, nfill = (tbin->ncached_max >> 1); i < nfill; i++) { + for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> 1); + i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) break; - *(void **)ptr = tbin->avail; - tbin->avail = ptr; + tbin->avail[i] = ptr; } #ifdef JEMALLOC_STATS bin->stats.allocated += (i - tbin->ncached) * @@ -1607,7 +1594,7 @@ arena_salloc(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + @@ -1660,7 +1647,7 @@ arena_salloc_demote(const void *ptr) arena_run_t *run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); size_t binind = arena_bin_index(chunk->arena, run->bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; assert(((uintptr_t)ptr - ((uintptr_t)run + @@ -1730,8 +1717,9 @@ arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, /******************************/ npages = bin_info->run_size >> PAGE_SHIFT; run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> PAGE_SHIFT); - past = (size_t)((PAGE_CEILING((uintptr_t)run->next) - (uintptr_t)chunk) - >> PAGE_SHIFT); + past = (size_t)(PAGE_CEILING((uintptr_t)run + + (uintptr_t)bin_info->reg0_offset + (uintptr_t)(run->nextind * + bin_info->reg_size) - (uintptr_t)chunk) >> PAGE_SHIFT); malloc_mutex_lock(&arena->lock); /* @@ -1817,7 +1805,7 @@ arena_dalloc_bin(arena_t *arena, arena_chunk_t 
*chunk, void *ptr, pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)((pageind - (mapelm->bits >> PAGE_SHIFT)) << PAGE_SHIFT)); - assert(run->magic == ARENA_RUN_MAGIC); + dassert(run->magic == ARENA_RUN_MAGIC); bin = run->bin; size_t binind = arena_bin_index(arena, bin); arena_bin_info_t *bin_info = &arena_bin_info[binind]; @@ -2065,7 +2053,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); arena = chunk->arena; - assert(arena->magic == ARENA_MAGIC); + dassert(arena->magic == ARENA_MAGIC); if (psize < oldsize) { #ifdef JEMALLOC_FILL @@ -2405,8 +2393,8 @@ small_size2bin_init_hard(void) * *) bin_info->run_size <= arena_maxclass * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). * - * bin_info->nregs and bin_info->reg0_offset are also calculated here, since - * these settings are all interdependent. + * bin_info->nregs, bin_info->bitmap_offset, and bin_info->reg0_offset are also + * calculated here, since these settings are all interdependent. */ static size_t bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) @@ -2414,6 +2402,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) size_t try_run_size, good_run_size; uint32_t try_nregs, good_nregs; uint32_t try_hdr_size, good_hdr_size; + uint32_t try_bitmap_offset, good_bitmap_offset; #ifdef JEMALLOC_PROF uint32_t try_ctx0_offset, good_ctx0_offset; #endif @@ -2438,6 +2427,11 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) do { try_nregs--; try_hdr_size = sizeof(arena_run_t); + /* Pad to a long boundary. */ + try_hdr_size = LONG_CEILING(try_hdr_size); + try_bitmap_offset = try_hdr_size; + /* Add space for bitmap. */ + try_hdr_size += bitmap_size(try_nregs); #ifdef JEMALLOC_PROF if (opt_prof && prof_promote == false) { /* Pad to a quantum boundary. */ @@ -2460,6 +2454,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) good_run_size = try_run_size; good_nregs = try_nregs; good_hdr_size = try_hdr_size; + good_bitmap_offset = try_bitmap_offset; #ifdef JEMALLOC_PROF good_ctx0_offset = try_ctx0_offset; #endif @@ -2473,6 +2468,11 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) do { try_nregs--; try_hdr_size = sizeof(arena_run_t); + /* Pad to a long boundary. */ + try_hdr_size = LONG_CEILING(try_hdr_size); + try_bitmap_offset = try_hdr_size; + /* Add space for bitmap. */ + try_hdr_size += bitmap_size(try_nregs); #ifdef JEMALLOC_PROF if (opt_prof && prof_promote == false) { /* Pad to a quantum boundary. */ @@ -2498,6 +2498,7 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) /* Copy final settings. */ bin_info->run_size = good_run_size; bin_info->nregs = good_nregs; + bin_info->bitmap_offset = good_bitmap_offset; #ifdef JEMALLOC_PROF bin_info->ctx0_offset = good_ctx0_offset; #endif @@ -2525,6 +2526,7 @@ bin_info_init(void) bin_info = &arena_bin_info[i]; bin_info->reg_size = (1U << (LG_TINY_MIN + i)); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } #endif @@ -2533,6 +2535,7 @@ bin_info_init(void) bin_info = &arena_bin_info[i]; bin_info->reg_size = (i - ntbins + 1) << LG_QUANTUM; prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } /* Cacheline-spaced bins. 
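	 * A plausible LP64 debug-build example of the layout computed by
	 * bin_info_run_size_calc() (sizes assumed, not from the patch):
	 * with sizeof(arena_run_t) == 20, LONG_CEILING() pads the header
	 * to 24, so bitmap_offset == 24; a run with 253 regions then needs
	 * bitmap_size(253) == 40 bytes (four level-0 groups plus one
	 * summary group), for a 64-byte header before any prof_ctx
	 * padding.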
*/ @@ -2541,6 +2544,7 @@ bin_info_init(void) bin_info->reg_size = cspace_min + ((i - (ntbins + nqbins)) << LG_CACHELINE); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } /* Subpage-spaced bins. */ @@ -2549,6 +2553,7 @@ bin_info_init(void) bin_info->reg_size = sspace_min + ((i - (ntbins + nqbins + ncbins)) << LG_SUBPAGE); prev_run_size = bin_info_run_size_calc(bin_info, prev_run_size); + bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); } return (false); diff --git a/jemalloc/src/bitmap.c b/jemalloc/src/bitmap.c new file mode 100644 index 0000000..b47e262 --- /dev/null +++ b/jemalloc/src/bitmap.c @@ -0,0 +1,90 @@ +#define JEMALLOC_BITMAP_C_ +#include "jemalloc/internal/jemalloc_internal.h" + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +static size_t bits2groups(size_t nbits); + +/******************************************************************************/ + +static size_t +bits2groups(size_t nbits) +{ + + return ((nbits >> LG_BITMAP_GROUP_NBITS) + + !!(nbits & BITMAP_GROUP_NBITS_MASK)); +} + +void +bitmap_info_init(bitmap_info_t *binfo, size_t nbits) +{ + unsigned i; + size_t group_count; + + assert(nbits > 0); + assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS)); + + /* + * Compute the number of groups necessary to store nbits bits, and + * progressively work upward through the levels until reaching a level + * that requires only one group. + */ + binfo->levels[0].group_offset = 0; + group_count = bits2groups(nbits); + for (i = 1; group_count > 1; i++) { + assert(i < BITMAP_MAX_LEVELS); + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + group_count = bits2groups(group_count); + } + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + binfo->nlevels = i; + binfo->nbits = nbits; +} + +size_t +bitmap_info_ngroups(const bitmap_info_t *binfo) +{ + + return (binfo->levels[binfo->nlevels].group_offset << LG_SIZEOF_BITMAP); +} + +size_t +bitmap_size(size_t nbits) +{ + bitmap_info_t binfo; + + bitmap_info_init(&binfo, nbits); + return (bitmap_info_ngroups(&binfo)); +} + +void +bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo) +{ + size_t extra; + unsigned i; + + /* + * Bits are actually inverted with regard to the external bitmap + * interface, so the bitmap starts out with all 1 bits, except for + * trailing unused bits (if any). Note that each group uses bit 0 to + * correspond to the first logical bit in the group, so extra bits + * are the most significant bits of the last group. 
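	 * A hedged example, assuming 64-bit groups: for nbits == 500, the
	 * last of the eight level-0 groups covers only 500 - 448 == 52
	 * bits, so extra == (64 - (500 & 63)) & 63 == 12 and that group is
	 * shifted right by 12, clearing its 12 unused high bits.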
+ */ + memset(bitmap, 0xffU, binfo->levels[binfo->nlevels].group_offset << + LG_SIZEOF_BITMAP); + extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK)) + & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) + bitmap[binfo->levels[1].group_offset - 1] >>= extra; + for (i = 1; i < binfo->nlevels; i++) { + size_t group_count = binfo->levels[i].group_offset - + binfo->levels[i-1].group_offset; + extra = (BITMAP_GROUP_NBITS - (group_count & + BITMAP_GROUP_NBITS_MASK)) & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) + bitmap[binfo->levels[i+1].group_offset - 1] >>= extra; + } +} diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index e386a53..75ae7fd 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -73,7 +73,7 @@ ckh_isearch(ckh_t *ckh, const void *key) size_t hash1, hash2, bucket, cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); ckh->hash(key, ckh->lg_curbuckets, &hash1, &hash2); @@ -396,7 +396,7 @@ ckh_delete(ckh_t *ckh) { assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); #ifdef CKH_VERBOSE malloc_printf( @@ -421,7 +421,7 @@ ckh_count(ckh_t *ckh) { assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); return (ckh->count); } @@ -452,7 +452,7 @@ ckh_insert(ckh_t *ckh, const void *key, const void *data) bool ret; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); assert(ckh_search(ckh, key, NULL, NULL)); #ifdef CKH_COUNT @@ -477,7 +477,7 @@ ckh_remove(ckh_t *ckh, const void *searchkey, void **key, void **data) size_t cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); cell = ckh_isearch(ckh, searchkey); if (cell != SIZE_T_MAX) { @@ -509,7 +509,7 @@ ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data) size_t cell; assert(ckh != NULL); - assert(ckh->magic == CKH_MAGIC); + dassert(ckh->magic == CKH_MAGIC); cell = ckh_isearch(ckh, searchkey); if (cell != SIZE_T_MAX) { diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index c1aadda..9f2fa92 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -693,7 +693,10 @@ malloc_init_hard(void) } #ifdef JEMALLOC_TCACHE - tcache_boot(); + if (tcache_boot()) { + malloc_mutex_unlock(&init_lock); + return (true); + } #endif if (huge_boot()) { diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 88e1cc7..2f4804e 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -8,6 +8,9 @@ bool opt_tcache = true; ssize_t opt_lg_tcache_max = LG_TCACHE_MAXCLASS_DEFAULT; ssize_t opt_lg_tcache_gc_sweep = LG_TCACHE_GC_SWEEP_DEFAULT; +tcache_bin_info_t *tcache_bin_info; +static unsigned stack_nelms; /* Total stack elms per tcache. */ + /* Map of thread-specific caches. */ #ifndef NO_TLS __thread tcache_t *tcache_tls JEMALLOC_ATTR(tls_model("initial-exec")); @@ -55,21 +58,19 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - void *flush, *deferred, *ptr; + void *ptr; unsigned i, nflush, ndeferred; - bool first_pass; #ifdef JEMALLOC_STATS bool merged_stats = false; #endif assert(binind < nbins); assert(rem <= tbin->ncached); - assert(tbin->ncached > 0 || tbin->avail == NULL); - for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass = - true; flush != NULL; flush = deferred, nflush = ndeferred) { + for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena bin associated with the first object. 
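		 * A condensed sketch of the pass structure, paraphrasing
		 * this function rather than quoting it: each pass locks
		 * the bin belonging to the arena that owns avail[0],
		 * returns every object owned by that arena, and compacts
		 * the remainder to the front of avail for the next pass:
		 *
		 *	while (nflush > 0) {
		 *		lock bin of arena owning avail[0];
		 *		for (i = ndeferred = 0; i < nflush; i++) {
		 *			if (avail[i] belongs to that arena)
		 *				return avail[i] to its run;
		 *			else
		 *				avail[ndeferred++] = avail[i];
		 *		}
		 *		unlock;
		 *		nflush = ndeferred;
		 *	}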
*/ - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + tbin->avail[0]); arena_t *arena = chunk->arena; arena_bin_t *bin = &arena->bins[binind]; @@ -92,12 +93,10 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem tbin->tstats.nrequests = 0; } #endif - deferred = NULL; ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = flush; + ptr = tbin->avail[i]; assert(ptr != NULL); - flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) { size_t pageind = ((uintptr_t)ptr - @@ -112,17 +111,11 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem * locked. Stash the object, so that it can be * handled in a future pass. */ - *(void **)ptr = deferred; - deferred = ptr; + tbin->avail[ndeferred] = ptr; ndeferred++; } } malloc_mutex_unlock(&bin->lock); - - if (first_pass) { - tbin->avail = flush; - first_pass = false; - } } #ifdef JEMALLOC_STATS if (merged_stats == false) { @@ -139,6 +132,8 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem } #endif + memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], + rem * sizeof(void *)); tbin->ncached = rem; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -151,18 +146,19 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #endif ) { - void *flush, *deferred, *ptr; + void *ptr; unsigned i, nflush, ndeferred; - bool first_pass; +#ifdef JEMALLOC_STATS + bool merged_stats = false; +#endif assert(binind < nhbins); assert(rem <= tbin->ncached); - assert(tbin->ncached > 0 || tbin->avail == NULL); - for (flush = tbin->avail, nflush = tbin->ncached - rem, first_pass = - true; flush != NULL; flush = deferred, nflush = ndeferred) { + for (nflush = tbin->ncached - rem; nflush > 0; nflush = ndeferred) { /* Lock the arena associated with the first object. */ - arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(flush); + arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE( + tbin->avail[0]); arena_t *arena = chunk->arena; malloc_mutex_lock(&arena->lock); @@ -174,6 +170,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem tcache->prof_accumbytes = 0; #endif #ifdef JEMALLOC_STATS + merged_stats = true; arena->stats.nrequests_large += tbin->tstats.nrequests; arena->stats.lstats[binind - nbins].nrequests += tbin->tstats.nrequests; @@ -182,12 +179,10 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem #if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) } #endif - deferred = NULL; ndeferred = 0; for (i = 0; i < nflush; i++) { - ptr = flush; + ptr = tbin->avail[i]; assert(ptr != NULL); - flush = *(void **)ptr; chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); if (chunk->arena == arena) arena_dalloc_large(arena, chunk, ptr); @@ -198,19 +193,30 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem * Stash the object, so that it can be handled * in a future pass. */ - *(void **)ptr = deferred; - deferred = ptr; + tbin->avail[ndeferred] = ptr; ndeferred++; } } malloc_mutex_unlock(&arena->lock); - - if (first_pass) { - tbin->avail = flush; - first_pass = false; - } } +#ifdef JEMALLOC_STATS + if (merged_stats == false) { + /* + * The flush loop didn't happen to flush to this thread's + * arena, so the stats didn't get merged. Manually do so now. 
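		 * Illustration of the retention step shared with
		 * tcache_bin_flush_small() (numbers assumed): with
		 * ncached == 8 and rem == 3, the five oldest objects,
		 * avail[0..4], are flushed, and the memmove() below slides
		 * the three most recently cached, avail[5..7], down to
		 * avail[0..2]; flushing is therefore LRU, as the commit
		 * message notes.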
+ */ + arena_t *arena = tcache->arena; + malloc_mutex_lock(&arena->lock); + arena->stats.nrequests_large += tbin->tstats.nrequests; + arena->stats.lstats[binind - nbins].nrequests += + tbin->tstats.nrequests; + tbin->tstats.nrequests = 0; + malloc_mutex_unlock(&arena->lock); + } +#endif + memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], + rem * sizeof(void *)); tbin->ncached = rem; if (tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; @@ -220,10 +226,14 @@ tcache_t * tcache_create(arena_t *arena) { tcache_t *tcache; - size_t size; + size_t size, stack_offset; unsigned i; size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins); + /* Naturally align the pointer stacks. */ + size = PTR_CEILING(size); + stack_offset = size; + size += stack_nelms * sizeof(void *); /* * Round up to the nearest multiple of the cacheline size, in order to * avoid the possibility of false cacheline sharing. @@ -236,6 +246,8 @@ tcache_create(arena_t *arena) if (size <= small_maxclass) tcache = (tcache_t *)arena_malloc_small(arena, size, true); + else if (size <= tcache_maxclass) + tcache = (tcache_t *)arena_malloc_large(arena, size, true); else tcache = (tcache_t *)icalloc(size); @@ -252,15 +264,11 @@ tcache_create(arena_t *arena) tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); - for (i = 0; i < nbins; i++) { - if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX) { - tcache->tbins[i].ncached_max = (arena_bin_info[i].nregs - << 1); - } else - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX; + for (i = 0; i < nhbins; i++) { + tcache->tbins[i].avail = (void **)((uintptr_t)tcache + + (uintptr_t)stack_offset); + stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); } - for (; i < nhbins; i++) - tcache->tbins[i].ncached_max = TCACHE_NSLOTS_LARGE; TCACHE_SET(tcache); @@ -271,6 +279,7 @@ void tcache_destroy(tcache_t *tcache) { unsigned i; + size_t tcache_size; #ifdef JEMALLOC_STATS /* Unlink from list of extant tcaches. */ @@ -327,7 +336,8 @@ tcache_destroy(tcache_t *tcache) } #endif - if (arena_salloc(tcache) <= small_maxclass) { + tcache_size = arena_salloc(tcache); + if (tcache_size <= small_maxclass) { arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); arena_t *arena = chunk->arena; size_t pageind = ((uintptr_t)tcache - (uintptr_t)chunk) >> @@ -341,6 +351,13 @@ tcache_destroy(tcache_t *tcache) malloc_mutex_lock(&bin->lock); arena_dalloc_bin(arena, chunk, tcache, mapelm); malloc_mutex_unlock(&bin->lock); + } else if (tcache_size <= tcache_maxclass) { + arena_chunk_t *chunk = CHUNK_ADDR2BASE(tcache); + arena_t *arena = chunk->arena; + + malloc_mutex_lock(&arena->lock); + arena_dalloc_large(arena, chunk, tcache); + malloc_mutex_unlock(&arena->lock); } else idalloc(tcache); } @@ -397,11 +414,13 @@ tcache_stats_merge(tcache_t *tcache, arena_t *arena) } #endif -void +bool tcache_boot(void) { if (opt_tcache) { + unsigned i; + /* * If necessary, clamp opt_lg_tcache_max, now that * small_maxclass and arena_maxclass are known. @@ -416,6 +435,28 @@ tcache_boot(void) nhbins = nbins + (tcache_maxclass >> PAGE_SHIFT); + /* Initialize tcache_bin_info. 
*/ + tcache_bin_info = (tcache_bin_info_t *)base_alloc(nhbins * + sizeof(tcache_bin_info_t)); + if (tcache_bin_info == NULL) + return (true); + stack_nelms = 0; + for (i = 0; i < nbins; i++) { + if ((arena_bin_info[i].nregs << 1) <= + TCACHE_NSLOTS_SMALL_MAX) { + tcache_bin_info[i].ncached_max = + (arena_bin_info[i].nregs << 1); + } else { + tcache_bin_info[i].ncached_max = + TCACHE_NSLOTS_SMALL_MAX; + } + stack_nelms += tcache_bin_info[i].ncached_max; + } + for (; i < nhbins; i++) { + tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE; + stack_nelms += tcache_bin_info[i].ncached_max; + } + /* Compute incremental GC event threshold. */ if (opt_lg_tcache_gc_sweep >= 0) { tcache_gc_incr = ((1U << opt_lg_tcache_gc_sweep) / @@ -431,6 +472,8 @@ tcache_boot(void) abort(); } } + + return (false); } /******************************************************************************/ #endif /* JEMALLOC_TCACHE */ diff --git a/jemalloc/test/bitmap.c b/jemalloc/test/bitmap.c new file mode 100644 index 0000000..7a017c8 --- /dev/null +++ b/jemalloc/test/bitmap.c @@ -0,0 +1,153 @@ +#define JEMALLOC_MANGLE +#include "jemalloc_test.h" + +/* + * Avoid using the assert() from jemalloc_internal.h, since it requires + * internal libjemalloc functionality. + * */ +#include + +/* + * Directly include the bitmap code, since it isn't exposed outside + * libjemalloc. + */ +#include "../src/bitmap.c" + +#define MAXBITS 4500 + +static void +test_bitmap_size(void) +{ + size_t i, prev_size; + + prev_size = 0; + for (i = 1; i <= MAXBITS; i++) { + size_t size = bitmap_size(i); + assert(size >= prev_size); + prev_size = size; + } +} + +static void +test_bitmap_init(void) +{ + size_t i; + + for (i = 1; i <= MAXBITS; i++) { + bitmap_info_t binfo; + bitmap_info_init(&binfo, i); + { + size_t j; + bitmap_t bitmap[bitmap_info_ngroups(&binfo)]; + bitmap_init(bitmap, &binfo); + + for (j = 0; j < i; j++) + assert(bitmap_get(bitmap, &binfo, j) == false); + + } + } +} + +static void +test_bitmap_set(void) +{ + size_t i; + + for (i = 1; i <= MAXBITS; i++) { + bitmap_info_t binfo; + bitmap_info_init(&binfo, i); + { + size_t j; + bitmap_t bitmap[bitmap_info_ngroups(&binfo)]; + bitmap_init(bitmap, &binfo); + + for (j = 0; j < i; j++) + bitmap_set(bitmap, &binfo, j); + assert(bitmap_full(bitmap, &binfo)); + } + } +} + +static void +test_bitmap_unset(void) +{ + size_t i; + + for (i = 1; i <= MAXBITS; i++) { + bitmap_info_t binfo; + bitmap_info_init(&binfo, i); + { + size_t j; + bitmap_t bitmap[bitmap_info_ngroups(&binfo)]; + bitmap_init(bitmap, &binfo); + + for (j = 0; j < i; j++) + bitmap_set(bitmap, &binfo, j); + assert(bitmap_full(bitmap, &binfo)); + for (j = 0; j < i; j++) + bitmap_unset(bitmap, &binfo, j); + for (j = 0; j < i; j++) + bitmap_set(bitmap, &binfo, j); + assert(bitmap_full(bitmap, &binfo)); + } + } +} + +static void +test_bitmap_sfu(void) +{ + size_t i; + + for (i = 1; i <= MAXBITS; i++) { + bitmap_info_t binfo; + bitmap_info_init(&binfo, i); + { + ssize_t j; + bitmap_t bitmap[bitmap_info_ngroups(&binfo)]; + bitmap_init(bitmap, &binfo); + + /* Iteratively set bits starting at the beginning. */ + for (j = 0; j < i; j++) + assert(bitmap_sfu(bitmap, &binfo) == j); + assert(bitmap_full(bitmap, &binfo)); + + /* + * Iteratively unset bits starting at the end, and + * verify that bitmap_sfu() reaches the unset bits. 
+ */ + for (j = i - 1; j >= 0; j--) { + bitmap_unset(bitmap, &binfo, j); + assert(bitmap_sfu(bitmap, &binfo) == j); + bitmap_unset(bitmap, &binfo, j); + } + assert(bitmap_get(bitmap, &binfo, 0) == false); + + /* + * Iteratively set bits starting at the beginning, and + * verify that bitmap_sfu() looks past them. + */ + for (j = 1; j < i; j++) { + bitmap_set(bitmap, &binfo, j - 1); + assert(bitmap_sfu(bitmap, &binfo) == j); + bitmap_unset(bitmap, &binfo, j); + } + assert(bitmap_sfu(bitmap, &binfo) == i - 1); + assert(bitmap_full(bitmap, &binfo)); + } + } +} + +int +main(void) +{ + fprintf(stderr, "Test begin\n"); + + test_bitmap_size(); + test_bitmap_init(); + test_bitmap_set(); + test_bitmap_unset(); + test_bitmap_sfu(); + + fprintf(stderr, "Test end\n"); + return (0); +} diff --git a/jemalloc/test/bitmap.exp b/jemalloc/test/bitmap.exp new file mode 100644 index 0000000..369a88d --- /dev/null +++ b/jemalloc/test/bitmap.exp @@ -0,0 +1,2 @@ +Test begin +Test end -- cgit v0.12 From 9c43c13a35220c10d97a886616899189daceb359 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 10:53:15 -0700 Subject: Reverse tcache fill order. Refill the thread cache such that low regions get used first. This fixes a regression due to the recent transition to bitmap-based region management. --- jemalloc/src/arena.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 87bd9bb..a1fa2a3 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -1360,7 +1360,8 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind ptr = arena_bin_malloc_hard(arena, bin); if (ptr == NULL) break; - tbin->avail[i] = ptr; + /* Insert such that low regions get used first. */ + tbin->avail[nfill - 1 - i] = ptr; } #ifdef JEMALLOC_STATS bin->stats.allocated += (i - tbin->ncached) * -- cgit v0.12 From 597632be188d2bcc135dad2145cc46ef44897aad Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 13:41:33 -0700 Subject: Improve thread-->arena assignment. Rather than blindly assigning threads to arenas in round-robin fashion, choose the lowest-numbered arena that currently has the smallest number of threads assigned to it. Add the "stats.arenas..nthreads" mallctl. --- jemalloc/doc/jemalloc.xml.in | 10 +++ jemalloc/include/jemalloc/internal/arena.h | 14 ++++- jemalloc/include/jemalloc/internal/ctl.h | 1 + .../jemalloc/internal/jemalloc_internal.h.in | 3 +- jemalloc/src/arena.c | 1 + jemalloc/src/ctl.c | 13 ++++ jemalloc/src/jemalloc.c | 71 ++++++++++++++++++---- jemalloc/src/stats.c | 4 ++ 8 files changed, 101 insertions(+), 16 deletions(-) diff --git a/jemalloc/doc/jemalloc.xml.in b/jemalloc/doc/jemalloc.xml.in index 97893c1..2bde890 100644 --- a/jemalloc/doc/jemalloc.xml.in +++ b/jemalloc/doc/jemalloc.xml.in @@ -1644,6 +1644,16 @@ malloc_conf = "xmalloc:true";]]> + stats.arenas.<i>.nthreads + (unsigned) + r- + + Number of threads currently assigned to + arena. + + + + stats.arenas.<i>.pactive (size_t) r- diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 1744b45..94b7f3d 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -295,8 +295,18 @@ struct arena_s { unsigned ind; /* - * All non-bin-related operations on this arena require that lock be - * locked. + * Number of threads currently assigned to this arena. This field is + * protected by arenas_lock. 
+ */ + unsigned nthreads; + + /* + * There are three classes of arena operations from a locking + * perspective: + * 1) Thread asssignment (modifies nthreads) is protected by + * arenas_lock. + * 2) Bin-related operations are protected by bin locks. + * 3) Chunk- and run-related operations are protected by this mutex. */ malloc_mutex_t lock; diff --git a/jemalloc/include/jemalloc/internal/ctl.h b/jemalloc/include/jemalloc/internal/ctl.h index 8776ad1..f1f5eb7 100644 --- a/jemalloc/include/jemalloc/internal/ctl.h +++ b/jemalloc/include/jemalloc/internal/ctl.h @@ -29,6 +29,7 @@ struct ctl_node_s { struct ctl_arena_stats_s { bool initialized; + unsigned nthreads; size_t pactive; size_t pdirty; #ifdef JEMALLOC_STATS diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index a80fc7c..a7472c0 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -293,6 +293,7 @@ extern size_t lg_pagesize; extern unsigned ncpus; extern malloc_mutex_t arenas_lock; /* Protects arenas initialization. */ +extern pthread_key_t arenas_tsd; #ifndef NO_TLS /* * Map of pthread_self() --> arenas[???], used for selecting an arena to use @@ -302,9 +303,9 @@ extern __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); # define ARENA_GET() arenas_tls # define ARENA_SET(v) do { \ arenas_tls = (v); \ + pthread_setspecific(arenas_tsd, (void *)(v)); \ } while (0) #else -extern pthread_key_t arenas_tsd; # define ARENA_GET() ((arena_t *)pthread_getspecific(arenas_tsd)) # define ARENA_SET(v) do { \ pthread_setspecific(arenas_tsd, (void *)(v)); \ diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index a1fa2a3..022f9ec 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -2175,6 +2175,7 @@ arena_new(arena_t *arena, unsigned ind) arena_bin_t *bin; arena->ind = ind; + arena->nthreads = 0; if (malloc_mutex_init(&arena->lock)) return (true); diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index c32e955..b4f280d 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -182,6 +182,7 @@ CTL_PROTO(stats_arenas_i_lruns_j_highruns) CTL_PROTO(stats_arenas_i_lruns_j_curruns) INDEX_PROTO(stats_arenas_i_lruns_j) #endif +CTL_PROTO(stats_arenas_i_nthreads) CTL_PROTO(stats_arenas_i_pactive) CTL_PROTO(stats_arenas_i_pdirty) #ifdef JEMALLOC_STATS @@ -434,6 +435,7 @@ static const ctl_node_t stats_arenas_i_lruns_node[] = { #endif static const ctl_node_t stats_arenas_i_node[] = { + {NAME("nthreads"), CTL(stats_arenas_i_nthreads)}, {NAME("pactive"), CTL(stats_arenas_i_pactive)}, {NAME("pdirty"), CTL(stats_arenas_i_pdirty)} #ifdef JEMALLOC_STATS @@ -620,6 +622,7 @@ ctl_arena_refresh(arena_t *arena, unsigned i) ctl_arena_clear(astats); + sstats->nthreads += astats->nthreads; #ifdef JEMALLOC_STATS ctl_arena_stats_amerge(astats, arena); /* Merge into sum stats as well. */ @@ -657,10 +660,17 @@ ctl_refresh(void) * Clear sum stats, since they will be merged into by * ctl_arena_refresh(). 
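	 * A hedged usage sketch for the statistic this plumbs through
	 * (application code, not part of ctl_refresh()):
	 *
	 *	unsigned nthreads;
	 *	size_t sz = sizeof(nthreads);
	 *	JEMALLOC_P(mallctl)("stats.arenas.0.nthreads", &nthreads,
	 *	    &sz, NULL, 0);
	 *
	 * As with other stats.* values, a fresh snapshot is taken when
	 * the "epoch" mallctl is written.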
*/ + ctl_stats.arenas[narenas].nthreads = 0; ctl_arena_clear(&ctl_stats.arenas[narenas]); malloc_mutex_lock(&arenas_lock); memcpy(tarenas, arenas, sizeof(arena_t *) * narenas); + for (i = 0; i < narenas; i++) { + if (arenas[i] != NULL) + ctl_stats.arenas[i].nthreads = arenas[i]->nthreads; + else + ctl_stats.arenas[i].nthreads = 0; + } malloc_mutex_unlock(&arenas_lock); for (i = 0; i < narenas; i++) { bool initialized = (tarenas[i] != NULL); @@ -1129,6 +1139,8 @@ thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, malloc_mutex_lock(&arenas_lock); if ((arena = arenas[newind]) == NULL) arena = arenas_extend(newind); + arenas[oldind]->nthreads--; + arenas[newind]->nthreads++; malloc_mutex_unlock(&arenas_lock); if (arena == NULL) { ret = EAGAIN; @@ -1536,6 +1548,7 @@ stats_arenas_i_lruns_j_index(const size_t *mib, size_t miblen, size_t j) } #endif +CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned) CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t) CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t) #ifdef JEMALLOC_STATS diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 9f2fa92..ecd521c 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -7,12 +7,10 @@ malloc_mutex_t arenas_lock; arena_t **arenas; unsigned narenas; -static unsigned next_arena; +pthread_key_t arenas_tsd; #ifndef NO_TLS __thread arena_t *arenas_tls JEMALLOC_ATTR(tls_model("initial-exec")); -#else -pthread_key_t arenas_tsd; #endif #ifdef JEMALLOC_STATS @@ -70,6 +68,7 @@ size_t opt_narenas = 0; static void wrtmessage(void *cbopaque, const char *s); static void stats_print_atexit(void); static unsigned malloc_ncpus(void); +static void arenas_cleanup(void *arg); #if (defined(JEMALLOC_STATS) && defined(NO_TLS)) static void thread_allocated_cleanup(void *arg); #endif @@ -147,13 +146,53 @@ choose_arena_hard(void) arena_t *ret; if (narenas > 1) { + unsigned i, choose, first_null; + + choose = 0; + first_null = narenas; malloc_mutex_lock(&arenas_lock); - if ((ret = arenas[next_arena]) == NULL) - ret = arenas_extend(next_arena); - next_arena = (next_arena + 1) % narenas; + assert(arenas[i] != NULL); + for (i = 1; i < narenas; i++) { + if (arenas[i] != NULL) { + /* + * Choose the first arena that has the lowest + * number of threads assigned to it. + */ + if (arenas[i]->nthreads < + arenas[choose]->nthreads) + choose = i; + } else if (first_null == narenas) { + /* + * Record the index of the first uninitialized + * arena, in case all extant arenas are in use. + * + * NB: It is possible for there to be + * discontinuities in terms of initialized + * versus uninitialized arenas, due to the + * "thread.arena" mallctl. + */ + first_null = i; + } + } + + if (arenas[choose] == 0 || first_null == narenas) { + /* + * Use an unloaded arena, or the least loaded arena if + * all arenas are already initialized. + */ + ret = arenas[choose]; + } else { + /* Initialize a new arena. 
*/ + ret = arenas_extend(first_null); + } + ret->nthreads++; malloc_mutex_unlock(&arenas_lock); - } else + } else { ret = arenas[0]; + malloc_mutex_lock(&arenas_lock); + ret->nthreads++; + malloc_mutex_unlock(&arenas_lock); + } ARENA_SET(ret); @@ -259,6 +298,16 @@ malloc_ncpus(void) return (ret); } +static void +arenas_cleanup(void *arg) +{ + arena_t *arena = (arena_t *)arg; + + malloc_mutex_lock(&arenas_lock); + arena->nthreads--; + malloc_mutex_unlock(&arenas_lock); +} + #if (defined(JEMALLOC_STATS) && defined(NO_TLS)) static void thread_allocated_cleanup(void *arg) @@ -737,6 +786,7 @@ malloc_init_hard(void) * threaded mode. */ ARENA_SET(arenas[0]); + arenas[0]->nthreads++; if (malloc_mutex_init(&arenas_lock)) return (true); @@ -779,14 +829,10 @@ malloc_init_hard(void) malloc_write(")\n"); } - next_arena = (narenas > 0) ? 1 : 0; - -#ifdef NO_TLS - if (pthread_key_create(&arenas_tsd, NULL) != 0) { + if (pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) { malloc_mutex_unlock(&init_lock); return (true); } -#endif /* Allocate and initialize arenas. */ arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas); @@ -819,7 +865,6 @@ malloc_init_hard(void) return (false); } - #ifdef JEMALLOC_ZONE JEMALLOC_ATTR(constructor) void diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 3dfe0d2..81105c4 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -319,6 +319,7 @@ static void stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, unsigned i) { + unsigned nthreads; size_t pagesize, pactive, pdirty, mapped; uint64_t npurge, nmadvise, purged; size_t small_allocated; @@ -328,6 +329,9 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("arenas.pagesize", &pagesize, size_t); + CTL_I_GET("stats.arenas.0.nthreads", &nthreads, unsigned); + malloc_cprintf(write_cb, cbopaque, + "assigned threads: %u\n", nthreads); CTL_I_GET("stats.arenas.0.pactive", &pactive, size_t); CTL_I_GET("stats.arenas.0.pdirty", &pdirty, size_t); CTL_I_GET("stats.arenas.0.npurge", &npurge, uint64_t); -- cgit v0.12 From 0657f12acd43eb2082a71230341449eca648bc9b Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 17:56:14 -0700 Subject: Add the "stats.cactive" mallctl. Add the "stats.cactive" mallctl, which can be used to efficiently and repeatedly query approximately how much active memory the application is utilizing. 
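
As a usage sketch (not part of this patch): "stats.cactive" hands back a pointer to the live counter rather than a snapshot, so a caller fetches the pointer once and dereferences it on every poll. The following assumes a --enable-stats build with no installation prefix, so the public symbol is plain mallctl(); the volatile load merely approximates the internal atomic read, which is not exported.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	size_t *cactive;
	size_t sz = sizeof(size_t *);

	/* Fetch the pointer to the counter once; it never moves. */
	if (mallctl("stats.cactive", &cactive, &sz, NULL, 0) != 0)
		return (1);

	/*
	 * A volatile load of an aligned size_t stands in for an atomic
	 * read here; per the documentation added below, a real consumer
	 * should use an actual atomic operation when dereferencing.
	 */
	printf("active ceiling: %zu bytes\n", *(volatile size_t *)cactive);
	return (0);
}

Because each arena rounds its contribution up to a chunk multiple, the value read this way is a ceiling on active memory, cheap enough to poll in a monitoring loop without taking the epoch.
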
--- jemalloc/Makefile.in | 15 +++++---- jemalloc/doc/jemalloc.xml.in | 19 +++++++++++ jemalloc/include/jemalloc/internal/hash.h | 2 +- .../jemalloc/internal/jemalloc_internal.h.in | 4 +++ jemalloc/include/jemalloc/internal/mb.h | 2 +- jemalloc/include/jemalloc/internal/rtree.h | 2 +- jemalloc/include/jemalloc/internal/stats.h | 37 ++++++++++++++++++++-- jemalloc/src/arena.c | 34 ++++++++++++++++++++ jemalloc/src/ckh.c | 2 +- jemalloc/src/ctl.c | 3 ++ jemalloc/src/hash.c | 2 +- jemalloc/src/huge.c | 3 ++ jemalloc/src/jemalloc.c | 2 +- jemalloc/src/mb.c | 2 +- jemalloc/src/rtree.c | 2 +- jemalloc/src/stats.c | 15 +++++++-- 16 files changed, 126 insertions(+), 20 deletions(-) diff --git a/jemalloc/Makefile.in b/jemalloc/Makefile.in index 8ee4c93..26da0e2 100644 --- a/jemalloc/Makefile.in +++ b/jemalloc/Makefile.in @@ -45,13 +45,13 @@ endif BINS := @srcroot@bin/pprof CHDRS := @objroot@include/jemalloc/jemalloc@install_suffix@.h \ @objroot@include/jemalloc/jemalloc_defs@install_suffix@.h -CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/base.c \ - @srcroot@src/bitmap.c @srcroot@src/chunk.c @srcroot@src/chunk_dss.c \ - @srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \ - @srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \ - @srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \ - @srcroot@src/prof.c @srcroot@src/rtree.c \ - @srcroot@src/stats.c @srcroot@src/tcache.c +CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/atomic.c \ + @srcroot@src/base.c @srcroot@src/bitmap.c @srcroot@src/chunk.c \ + @srcroot@src/chunk_dss.c @srcroot@src/chunk_mmap.c \ + @srcroot@src/chunk_swap.c @srcroot@src/ckh.c @srcroot@src/ctl.c \ + @srcroot@src/extent.c @srcroot@src/hash.c @srcroot@src/huge.c \ + @srcroot@src/mb.c @srcroot@src/mutex.c @srcroot@src/prof.c \ + @srcroot@src/rtree.c @srcroot@src/stats.c @srcroot@src/tcache.c ifeq (macho, @abi@) CSRCS += @srcroot@src/zone.c endif @@ -96,6 +96,7 @@ doc: $(DOCS) # -include $(CSRCS:@srcroot@%.c=@objroot@%.d) -include $(CSRCS:@srcroot@%.c=@objroot@%.pic.d) +-include $(CTESTS:@srcroot@%.c=@objroot@%.d) @objroot@src/%.o: @srcroot@src/%.c @mkdir -p $(@D) diff --git a/jemalloc/doc/jemalloc.xml.in b/jemalloc/doc/jemalloc.xml.in index 2bde890..13f3aae 100644 --- a/jemalloc/doc/jemalloc.xml.in +++ b/jemalloc/doc/jemalloc.xml.in @@ -1535,6 +1535,25 @@ malloc_conf = "xmalloc:true";]]> option for additional information. + + + stats.cactive + (size_t *) + r- + [] + + Pointer to a counter that contains an approximate count + of the current number of bytes in active pages. The estimate may be + high, but never low, because each arena rounds up to the nearest + multiple of the chunk size when computing its contribution to the + counter. Note that the epoch mallctl has no bearing + on this counter. Furthermore, counter consistency is maintained via + atomic operations, so it is necessary to use an atomic operation in + order to guarantee a consistent read when dereferencing the pointer. 
+ + + stats.allocated diff --git a/jemalloc/include/jemalloc/internal/hash.h b/jemalloc/include/jemalloc/internal/hash.h index 9073d83..93905bf 100644 --- a/jemalloc/include/jemalloc/internal/hash.h +++ b/jemalloc/include/jemalloc/internal/hash.h @@ -17,7 +17,7 @@ uint64_t hash(const void *key, size_t len, uint64_t seed); #endif -#if (defined(JEMALLOC_ENABLE_INLINE) || defined(HASH_C_)) +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_HASH_C_)) /* * The following hash function is based on MurmurHash64A(), placed into the * public domain by Austin Appleby. See http://murmurhash.googlepages.com/ for diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index a7472c0..90cd604 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -213,6 +213,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #define PAGE_CEILING(s) \ (((s) + PAGE_MASK) & ~PAGE_MASK) +#include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" @@ -237,6 +238,7 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); /******************************************************************************/ #define JEMALLOC_H_STRUCTS +#include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" @@ -352,6 +354,7 @@ int buferror(int errnum, char *buf, size_t buflen); void jemalloc_prefork(void); void jemalloc_postfork(void); +#include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" @@ -376,6 +379,7 @@ void jemalloc_postfork(void); /******************************************************************************/ #define JEMALLOC_H_INLINES +#include "jemalloc/internal/atomic.h" #include "jemalloc/internal/prn.h" #include "jemalloc/internal/ckh.h" #include "jemalloc/internal/stats.h" diff --git a/jemalloc/include/jemalloc/internal/mb.h b/jemalloc/include/jemalloc/internal/mb.h index 1707aa9..dc9f2a5 100644 --- a/jemalloc/include/jemalloc/internal/mb.h +++ b/jemalloc/include/jemalloc/internal/mb.h @@ -17,7 +17,7 @@ void mb_write(void); #endif -#if (defined(JEMALLOC_ENABLE_INLINE) || defined(MB_C_)) +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_MB_C_)) #ifdef __i386__ /* * According to the Intel Architecture Software Developer's Manual, current diff --git a/jemalloc/include/jemalloc/internal/rtree.h b/jemalloc/include/jemalloc/internal/rtree.h index 9d58eba..95d6355 100644 --- a/jemalloc/include/jemalloc/internal/rtree.h +++ b/jemalloc/include/jemalloc/internal/rtree.h @@ -49,7 +49,7 @@ void *rtree_get(rtree_t *rtree, uintptr_t key); bool rtree_set(rtree_t *rtree, uintptr_t key, void *val); #endif -#if (defined(JEMALLOC_ENABLE_INLINE) || defined(RTREE_C_)) +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_)) #define RTREE_GET_GENERATE(f) \ /* The least significant bits of the key are ignored. 
*/ \ JEMALLOC_INLINE void * \ diff --git a/jemalloc/include/jemalloc/internal/stats.h b/jemalloc/include/jemalloc/internal/stats.h index 3fc2080..2a9b31d 100644 --- a/jemalloc/include/jemalloc/internal/stats.h +++ b/jemalloc/include/jemalloc/internal/stats.h @@ -154,6 +154,10 @@ struct chunk_stats_s { extern bool opt_stats_print; +#ifdef JEMALLOC_STATS +extern size_t stats_cactive; +#endif + char *u2s(uint64_t x, unsigned base, char *s); #ifdef JEMALLOC_STATS void malloc_cprintf(void (*write)(void *, const char *), void *cbopaque, @@ -166,9 +170,38 @@ void stats_print(void (*write)(void *, const char *), void *cbopaque, #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ -#ifdef JEMALLOC_STATS #ifdef JEMALLOC_H_INLINES +#ifdef JEMALLOC_STATS + +#ifndef JEMALLOC_ENABLE_INLINE +size_t stats_cactive_get(void); +void stats_cactive_add(size_t size); +void stats_cactive_sub(size_t size); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_STATS_C_)) +JEMALLOC_INLINE size_t +stats_cactive_get(void) +{ + + return (atomic_read_z(&stats_cactive)); +} + +JEMALLOC_INLINE void +stats_cactive_add(size_t size) +{ + + atomic_add_z(&stats_cactive, size); +} + +JEMALLOC_INLINE void +stats_cactive_sub(size_t size) +{ + + atomic_sub_z(&stats_cactive, size); +} +#endif -#endif /* JEMALLOC_H_INLINES */ #endif /* JEMALLOC_STATS */ +#endif /* JEMALLOC_H_INLINES */ /******************************************************************************/ diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 022f9ec..4cbca57 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -315,6 +315,9 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, size_t old_ndirty, run_ind, total_pages, need_pages, rem_pages, i; size_t flag_dirty; arena_avail_tree_t *runs_avail; +#ifdef JEMALLOC_STATS + size_t cactive_diff; +#endif chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); old_ndirty = chunk->ndirty; @@ -333,6 +336,13 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, rem_pages = total_pages - need_pages; arena_avail_tree_remove(runs_avail, &chunk->map[run_ind-map_bias]); +#ifdef JEMALLOC_STATS + /* Update stats_cactive if nactive is crossing a chunk multiple. */ + cactive_diff = CHUNK_CEILING((arena->nactive + need_pages) << + PAGE_SHIFT) - CHUNK_CEILING(arena->nactive << PAGE_SHIFT); + if (cactive_diff != 0) + stats_cactive_add(cactive_diff); +#endif arena->nactive += need_pages; /* Keep track of trailing unused pages for later use. */ @@ -720,6 +730,9 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) assert(pageind + npages <= chunk_npages); if (mapelm->bits & CHUNK_MAP_DIRTY) { size_t i; +#ifdef JEMALLOC_STATS + size_t cactive_diff; +#endif arena_avail_tree_remove( &arena->runs_avail_dirty, mapelm); @@ -742,6 +755,17 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk) CHUNK_MAP_ALLOCATED; } +#ifdef JEMALLOC_STATS + /* + * Update stats_cactive if nactive is crossing a + * chunk multiple. + */ + cactive_diff = CHUNK_CEILING((arena->nactive + + npages) << PAGE_SHIFT) - + CHUNK_CEILING(arena->nactive << PAGE_SHIFT); + if (cactive_diff != 0) + stats_cactive_add(cactive_diff); +#endif arena->nactive += npages; /* Append to list for later processing. 
*/ ql_elm_new(mapelm, u.ql_link); @@ -930,6 +954,9 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) arena_chunk_t *chunk; size_t size, run_ind, run_pages, flag_dirty; arena_avail_tree_t *runs_avail; +#ifdef JEMALLOC_STATS + size_t cactive_diff; +#endif chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) @@ -951,6 +978,13 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty) size = bin_info->run_size; } run_pages = (size >> PAGE_SHIFT); +#ifdef JEMALLOC_STATS + /* Update stats_cactive if nactive is crossing a chunk multiple. */ + cactive_diff = CHUNK_CEILING(arena->nactive << PAGE_SHIFT) - + CHUNK_CEILING((arena->nactive - run_pages) << PAGE_SHIFT); + if (cactive_diff != 0) + stats_cactive_sub(cactive_diff); +#endif arena->nactive -= run_pages; /* diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index 75ae7fd..22319ab 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -34,7 +34,7 @@ * respectively. * ******************************************************************************/ -#define CKH_C_ +#define JEMALLOC_CKH_C_ #include "jemalloc/internal/jemalloc_internal.h" /******************************************************************************/ diff --git a/jemalloc/src/ctl.c b/jemalloc/src/ctl.c index b4f280d..40fdbac 100644 --- a/jemalloc/src/ctl.c +++ b/jemalloc/src/ctl.c @@ -193,6 +193,7 @@ CTL_PROTO(stats_arenas_i_purged) #endif INDEX_PROTO(stats_arenas_i) #ifdef JEMALLOC_STATS +CTL_PROTO(stats_cactive) CTL_PROTO(stats_allocated) CTL_PROTO(stats_active) CTL_PROTO(stats_mapped) @@ -460,6 +461,7 @@ static const ctl_node_t stats_arenas_node[] = { static const ctl_node_t stats_node[] = { #ifdef JEMALLOC_STATS + {NAME("cactive"), CTL(stats_cactive)}, {NAME("allocated"), CTL(stats_allocated)}, {NAME("active"), CTL(stats_active)}, {NAME("mapped"), CTL(stats_mapped)}, @@ -1580,6 +1582,7 @@ RETURN: } #ifdef JEMALLOC_STATS +CTL_RO_GEN(stats_cactive, &stats_cactive, size_t *) CTL_RO_GEN(stats_allocated, ctl_stats.allocated, size_t) CTL_RO_GEN(stats_active, ctl_stats.active, size_t) CTL_RO_GEN(stats_mapped, ctl_stats.mapped, size_t) diff --git a/jemalloc/src/hash.c b/jemalloc/src/hash.c index 6a13d7a..cfa4da0 100644 --- a/jemalloc/src/hash.c +++ b/jemalloc/src/hash.c @@ -1,2 +1,2 @@ -#define HASH_C_ +#define JEMALLOC_HASH_C_ #include "jemalloc/internal/jemalloc_internal.h" diff --git a/jemalloc/src/huge.c b/jemalloc/src/huge.c index de09198..ac3f3a0 100644 --- a/jemalloc/src/huge.c +++ b/jemalloc/src/huge.c @@ -50,6 +50,7 @@ huge_malloc(size_t size, bool zero) malloc_mutex_lock(&huge_mtx); extent_tree_ad_insert(&huge, node); #ifdef JEMALLOC_STATS + stats_cactive_add(csize); huge_nmalloc++; huge_allocated += csize; #endif @@ -134,6 +135,7 @@ huge_palloc(size_t size, size_t alignment, bool zero) malloc_mutex_lock(&huge_mtx); extent_tree_ad_insert(&huge, node); #ifdef JEMALLOC_STATS + stats_cactive_add(chunk_size); huge_nmalloc++; huge_allocated += chunk_size; #endif @@ -278,6 +280,7 @@ huge_dalloc(void *ptr, bool unmap) extent_tree_ad_remove(&huge, node); #ifdef JEMALLOC_STATS + stats_cactive_sub(node->size); huge_ndalloc++; huge_allocated -= node->size; #endif diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index ecd521c..0efafde 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -151,7 +151,7 @@ choose_arena_hard(void) choose = 0; first_null = narenas; malloc_mutex_lock(&arenas_lock); - assert(arenas[i] != NULL); + assert(arenas[0] != NULL); for (i = 1; i < narenas; i++) 
{ if (arenas[i] != NULL) { /* diff --git a/jemalloc/src/mb.c b/jemalloc/src/mb.c index 30a1a2e..dc2c0a2 100644 --- a/jemalloc/src/mb.c +++ b/jemalloc/src/mb.c @@ -1,2 +1,2 @@ -#define MB_C_ +#define JEMALLOC_MB_C_ #include "jemalloc/internal/jemalloc_internal.h" diff --git a/jemalloc/src/rtree.c b/jemalloc/src/rtree.c index eb440aa..eb0ff1e 100644 --- a/jemalloc/src/rtree.c +++ b/jemalloc/src/rtree.c @@ -1,4 +1,4 @@ -#define RTREE_C_ +#define JEMALLOC_RTREE_C_ #include "jemalloc/internal/jemalloc_internal.h" rtree_t * diff --git a/jemalloc/src/stats.c b/jemalloc/src/stats.c index 81105c4..cbbbb5b 100644 --- a/jemalloc/src/stats.c +++ b/jemalloc/src/stats.c @@ -39,6 +39,10 @@ bool opt_stats_print = false; +#ifdef JEMALLOC_STATS +size_t stats_cactive = 0; +#endif + /******************************************************************************/ /* Function prototypes for non-inline static functions. */ @@ -673,21 +677,26 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, #ifdef JEMALLOC_STATS { int err; - size_t ssz; + size_t sszp, ssz; + size_t *cactive; size_t allocated, active, mapped; size_t chunks_current, chunks_high, swap_avail; uint64_t chunks_total; size_t huge_allocated; uint64_t huge_nmalloc, huge_ndalloc; + sszp = sizeof(size_t *); ssz = sizeof(size_t); + CTL_GET("stats.cactive", &cactive, size_t *); CTL_GET("stats.allocated", &allocated, size_t); CTL_GET("stats.active", &active, size_t); CTL_GET("stats.mapped", &mapped, size_t); malloc_cprintf(write_cb, cbopaque, - "Allocated: %zu, active: %zu, mapped: %zu\n", allocated, - active, mapped); + "Allocated: %zu, active: %zu, mapped: %zu\n", + allocated, active, mapped); + malloc_cprintf(write_cb, cbopaque, + "Current active ceiling: %zu\n", atomic_read_z(cactive)); /* Print chunk stats. */ CTL_GET("stats.chunks.total", &chunks_total, uint64_t); -- cgit v0.12 From 92d3284ff8548c85b9b928f2615b96da4c4b2618 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 18:15:37 -0700 Subject: Add atomic.[ch]. Add atomic.[ch], which should have been part of the previous commit. 
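
For reference, everything in the new header reduces to two gcc __sync intrinsics (guarded by __GCC_HAVE_SYNC_COMPARE_AND_SWAP_{4,8}, as the #ifdefs below show). A minimal standalone sketch of the same primitives, assuming a compiler that provides them:

#include <stdint.h>
#include <stdio.h>

static uint64_t counter;

int
main(void)
{

	/* atomic_add_uint64() and atomic_sub_uint64() reduce to these: */
	__sync_add_and_fetch(&counter, 8);
	__sync_sub_and_fetch(&counter, 4);

	/* atomic_read_uint64() is simply an atomic add of zero: */
	printf("counter: %llu\n", (unsigned long long)
	    __sync_add_and_fetch(&counter, 0));
	return (0);
}

The size_t wrappers (atomic_add_z() et al.) dispatch on LG_SIZEOF_PTR, mapping to the 64-bit pair on LP64 and to the 32-bit pair otherwise.
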
--- jemalloc/include/jemalloc/internal/atomic.h | 77 +++++++++++++++++++++++++++++ jemalloc/src/atomic.c | 2 + 2 files changed, 79 insertions(+) create mode 100644 jemalloc/include/jemalloc/internal/atomic.h create mode 100644 jemalloc/src/atomic.c diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h new file mode 100644 index 0000000..43faeaf --- /dev/null +++ b/jemalloc/include/jemalloc/internal/atomic.h @@ -0,0 +1,77 @@ +/******************************************************************************/ +#ifdef JEMALLOC_H_TYPES + +#endif /* JEMALLOC_H_TYPES */ +/******************************************************************************/ +#ifdef JEMALLOC_H_STRUCTS + +#endif /* JEMALLOC_H_STRUCTS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_EXTERNS + +#define atomic_read_uint64(p) atomic_add_uint64(p, 0) +#define atomic_read_uint32(p) atomic_add_uint32(p, 0) + +#if (LG_SIZEOF_PTR == 3) +# define atomic_read_z(p) atomic_add_uint64(p, 0) +# define atomic_add_z(p, x) atomic_add_uint64(p, x) +# define atomic_sub_z(p, x) atomic_sub_uint64(p, x) +#elif (LG_SIZEOF_PTR == 2) +# define atomic_read_z(p) atomic_add_uint32(p, 0) +# define atomic_add_z(p, x) atomic_add_uint32(p, x) +# define atomic_sub_z(p, x) atomic_sub_uint32(p, x) +#endif + +#endif /* JEMALLOC_H_EXTERNS */ +/******************************************************************************/ +#ifdef JEMALLOC_H_INLINES + +#ifndef JEMALLOC_ENABLE_INLINE +uint64_t atomic_add_uint64(uint64_t *p, uint64_t x); +uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x); +uint32_t atomic_add_uint32(uint32_t *p, uint32_t x); +uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x); +#endif + +#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_)) +/* 64-bit operations. */ +#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 +JEMALLOC_INLINE uint64_t +atomic_add_uint64(uint64_t *p, uint64_t x) +{ + + return (__sync_add_and_fetch(p, x)); +} + +JEMALLOC_INLINE uint64_t +atomic_sub_uint64(uint64_t *p, uint64_t x) +{ + + return (__sync_sub_and_fetch(p, x)); +} +#else +# error "Missing implementation for 64-bit atomic operations" +#endif + +/* 32-bit operations. */ +#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 +JEMALLOC_INLINE uint32_t +atomic_add_uint32(uint32_t *p, uint32_t x) +{ + + return (__sync_add_and_fetch(p, x)); +} + +JEMALLOC_INLINE uint32_t +atomic_sub_uint32(uint32_t *p, uint32_t x) +{ + + return (__sync_sub_and_fetch(p, x)); +} +#else +# error "Missing implementation for 32-bit atomic operations" +#endif +#endif + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ diff --git a/jemalloc/src/atomic.c b/jemalloc/src/atomic.c new file mode 100644 index 0000000..77ee313 --- /dev/null +++ b/jemalloc/src/atomic.c @@ -0,0 +1,2 @@ +#define JEMALLOC_ATOMIC_C_ +#include "jemalloc/internal/jemalloc_internal.h" -- cgit v0.12 From 9a8fc41bb9752129510f3387f5c20cb798ff6b1a Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 18:18:42 -0700 Subject: Update pprof. Import updated pprof from google-perftools 1.7. 
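
The imported script is Perl; one behavioral change worth calling out is the new --maxdegree option, implemented as an edge-pruning pass in PrintDot() (see the "Print edges" hunk later in this patch). A C transliteration of that policy, with hypothetical types invented purely for illustration:

#include <math.h>
#include <stdbool.h>
#include <stdlib.h>

/* Hypothetical edge record; pprof itself keys a Perl hash instead. */
typedef struct {
	int	src;
	int	dst;
	double	weight;		/* Edge count; may be negative. */
} edge_t;

static int
edge_cmp(const void *a, const void *b)
{
	double wa = ((const edge_t *)a)->weight;
	double wb = ((const edge_t *)b)->weight;

	/* Decreasing count, as in the new PrintDot(). */
	return ((wa < wb) - (wa > wb));
}

static void
prune_edges(edge_t *edges, int nedges, int nnodes, double edgelimit,
    int maxdegree, bool *keep)
{
	int *indeg = calloc(nnodes, sizeof(int));
	int *outdeg = calloc(nnodes, sizeof(int));
	int i;

	if (indeg == NULL || outdeg == NULL)
		abort();
	qsort(edges, nedges, sizeof(edge_t), edge_cmp);
	for (i = 0; i < nedges; i++) {
		edge_t *e = &edges[i];

		if (indeg[e->dst] == 0) {
			/* First edge into a node is kept for reachability. */
			keep[i] = true;
		} else if (fabs(e->weight) <= edgelimit) {
			/* Below the --edgefraction threshold. */
			keep[i] = false;
		} else if (outdeg[e->src] >= maxdegree ||
		    indeg[e->dst] >= maxdegree) {
			/* Fan-in/fan-out capped by --maxdegree. */
			keep[i] = false;
		} else
			keep[i] = true;
		if (keep[i]) {
			outdeg[e->src]++;
			indeg[e->dst]++;
		}
	}
	free(indeg);
	free(outdeg);
}

Processing edges heaviest-first means the degree caps discard only the least significant edges, which keeps dense call graphs readable without disconnecting any node.
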
--- jemalloc/bin/pprof | 209 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 160 insertions(+), 49 deletions(-) diff --git a/jemalloc/bin/pprof b/jemalloc/bin/pprof index 1655f07..280ddcc 100755 --- a/jemalloc/bin/pprof +++ b/jemalloc/bin/pprof @@ -72,7 +72,7 @@ use strict; use warnings; use Getopt::Long; -my $PPROF_VERSION = "1.5"; +my $PPROF_VERSION = "1.7"; # These are the object tools we use which can come from a # user-specified location using --tools, from the PPROF_TOOLS @@ -89,6 +89,7 @@ my %obj_tool_map = ( ); my $DOT = "dot"; # leave non-absolute, since it may be in /usr/local my $GV = "gv"; +my $EVINCE = "evince"; # could also be xpdf or perhaps acroread my $KCACHEGRIND = "kcachegrind"; my $PS2PDF = "ps2pdf"; # These are used for dynamic profiles @@ -103,6 +104,7 @@ my $GROWTH_PAGE = "/pprof/growth"; my $CONTENTION_PAGE = "/pprof/contention"; my $WALL_PAGE = "/pprof/wall(?:\\?.*)?"; # accepts options like namefilter my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; +my $CENSUSPROFILE_PAGE = "/pprof/censusprofile"; # must support "?seconds=#" my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; @@ -110,7 +112,7 @@ my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; # All the alternatives must begin with /. my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . - "$FILTEREDPROFILE_PAGE)"; + "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)"; # default binary name my $UNKNOWN_BINARY = "(unknown)"; @@ -148,7 +150,7 @@ pprof [options] The / can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile, $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall, - or /pprof/filteredprofile. + $CENSUSPROFILE_PAGE, or /pprof/filteredprofile. For instance: "pprof http://myserver.com:80$HEAP_PAGE". If / is omitted, the service defaults to $PROFILE_PAGE (cpu profiling). pprof --symbols @@ -180,6 +182,7 @@ Output type: --text Generate text report --callgrind Generate callgrind format to stdout --gv Generate Postscript and display + --evince Generate PDF and display --web Generate SVG and display --list= Generate source listing of matching routines --disasm= Generate disassembly of matching routines @@ -208,6 +211,7 @@ Call-graph Options: --nodecount= Show at most so many nodes [default=80] --nodefraction= Hide nodes below *total [default=.005] --edgefraction= Hide edges below *total [default=.001] + --maxdegree= Max incoming/outgoing edges per node [default=8] --focus= Focus on nodes matching --ignore= Ignore nodes matching --scale= Set GV scaling [default=0] @@ -304,6 +308,7 @@ sub Init() { $main::opt_disasm = ""; $main::opt_symbols = 0; $main::opt_gv = 0; + $main::opt_evince = 0; $main::opt_web = 0; $main::opt_dot = 0; $main::opt_ps = 0; @@ -315,6 +320,7 @@ sub Init() { $main::opt_nodecount = 80; $main::opt_nodefraction = 0.005; $main::opt_edgefraction = 0.001; + $main::opt_maxdegree = 8; $main::opt_focus = ''; $main::opt_ignore = ''; $main::opt_scale = 0; @@ -372,6 +378,7 @@ sub Init() { "disasm=s" => \$main::opt_disasm, "symbols!" => \$main::opt_symbols, "gv!" => \$main::opt_gv, + "evince!" => \$main::opt_evince, "web!" => \$main::opt_web, "dot!" => \$main::opt_dot, "ps!" 
=> \$main::opt_ps, @@ -383,6 +390,7 @@ sub Init() { "nodecount=i" => \$main::opt_nodecount, "nodefraction=f" => \$main::opt_nodefraction, "edgefraction=f" => \$main::opt_edgefraction, + "maxdegree=i" => \$main::opt_maxdegree, "focus=s" => \$main::opt_focus, "ignore=s" => \$main::opt_ignore, "scale=i" => \$main::opt_scale, @@ -452,6 +460,7 @@ sub Init() { ($main::opt_disasm eq '' ? 0 : 1) + ($main::opt_symbols == 0 ? 0 : 1) + $main::opt_gv + + $main::opt_evince + $main::opt_web + $main::opt_dot + $main::opt_ps + @@ -646,6 +655,8 @@ sub Main() { if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { if ($main::opt_gv) { RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), ""); } elsif ($main::opt_web) { my $tmp = TempName($main::next_tmpfile, "svg"); RunWeb($tmp); @@ -708,6 +719,12 @@ sub RunGV { } } +sub RunEvince { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + system("$EVINCE " . $fname . $bg); +} + sub RunWeb { my $fname = shift; print STDERR "Loading web page file:///$fname\n"; @@ -805,6 +822,7 @@ sub InteractiveCommand { $main::opt_disasm = 0; $main::opt_list = 0; $main::opt_gv = 0; + $main::opt_evince = 0; $main::opt_cum = 0; if (m/^\s*(text|top)(\d*)\s*(.*)/) { @@ -878,11 +896,14 @@ sub InteractiveCommand { PrintDisassembly($libs, $flat, $cumulative, $routine, $total); return 1; } - if (m/^\s*(gv|web)\s*(.*)/) { + if (m/^\s*(gv|web|evince)\s*(.*)/) { $main::opt_gv = 0; + $main::opt_evince = 0; $main::opt_web = 0; if ($1 eq "gv") { $main::opt_gv = 1; + } elsif ($1 eq "evince") { + $main::opt_evince = 1; } elsif ($1 eq "web") { $main::opt_web = 1; } @@ -902,6 +923,8 @@ sub InteractiveCommand { if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { if ($main::opt_gv) { RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), " &"); } elsif ($main::opt_web) { RunWeb(TempName($main::next_tmpfile, "svg")); } @@ -1685,6 +1708,8 @@ sub PrintDot { my $output; if ($main::opt_gv) { $output = "| $DOT -Tps2 >" . TempName($main::next_tmpfile, "ps"); + } elsif ($main::opt_evince) { + $output = "| $DOT -Tps2 | $PS2PDF - " . 
TempName($main::next_tmpfile, "pdf"); } elsif ($main::opt_ps) { $output = "| $DOT -Tps2"; } elsif ($main::opt_pdf) { @@ -1792,12 +1817,38 @@ sub PrintDot { } } - # Print edges - foreach my $e (keys(%edge)) { + # Print edges (process in order of decreasing counts) + my %indegree = (); # Number of incoming edges added per node so far + my %outdegree = (); # Number of outgoing edges added per node so far + foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) { my @x = split(/\001/, $e); $n = $edge{$e}; - if (abs($n) > $edgelimit) { + # Initialize degree of kept incoming and outgoing edges if necessary + my $src = $x[0]; + my $dst = $x[1]; + if (!exists($outdegree{$src})) { $outdegree{$src} = 0; } + if (!exists($indegree{$dst})) { $indegree{$dst} = 0; } + + my $keep; + if ($indegree{$dst} == 0) { + # Keep edge if needed for reachability + $keep = 1; + } elsif (abs($n) <= $edgelimit) { + # Drop if we are below --edgefraction + $keep = 0; + } elsif ($outdegree{$src} >= $main::opt_maxdegree || + $indegree{$dst} >= $main::opt_maxdegree) { + # Keep limited number of in/out edges per node + $keep = 0; + } else { + $keep = 1; + } + + if ($keep) { + $outdegree{$src}++; + $indegree{$dst}++; + # Compute line width based on edge count my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0); if ($fraction > 1) { $fraction = 1; } @@ -2135,6 +2186,19 @@ function handleMouseUp(evt) { EOF } +# Return a small number that identifies the argument. +# Multiple calls with the same argument will return the same number. +# Calls with different arguments will return different numbers. +sub ShortIdFor { + my $key = shift; + my $id = $main::uniqueid{$key}; + if (!defined($id)) { + $id = keys(%main::uniqueid) + 1; + $main::uniqueid{$key} = $id; + } + return $id; +} + # Translate a stack of addresses into a stack of symbols sub TranslateStack { my $symbols = shift; @@ -2172,6 +2236,15 @@ sub TranslateStack { if ($j > 2) { $func = "$func (inline)"; } + + # Do not merge nodes corresponding to Callback::Run since that + # causes confusing cycles in dot display. Instead, we synthesize + # a unique name for this frame per caller. + if ($func =~ m/Callback.*::Run$/) { + my $caller = ($i > 0) ? $addrs[$i-1] : 0; + $func = "Run#" . ShortIdFor($caller); + } + if ($main::opt_addresses) { push(@result, "$a $func $fileline"); } elsif ($main::opt_lines) { @@ -2415,7 +2488,16 @@ sub RemoveUninterestingFrames { # old code out of the system. 
$skip_regexp = "TCMalloc|^tcmalloc::"; } elsif ($main::profile_type eq 'contention') { - foreach my $vname ('Mutex::Unlock', 'Mutex::UnlockSlow') { + foreach my $vname ('base::RecordLockProfileData', + 'base::SubmitMutexProfileData', + 'base::SubmitSpinLockProfileData', + 'Mutex::Unlock', + 'Mutex::UnlockSlow', + 'Mutex::ReaderUnlock', + 'MutexLock::~MutexLock', + 'SpinLock::Unlock', + 'SpinLock::SlowUnlock', + 'SpinLockHolder::~SpinLockHolder') { $skip{$vname} = 1; } } elsif ($main::profile_type eq 'cpu') { @@ -2955,7 +3037,7 @@ sub FetchDynamicProfile { my $fetcher = AddFetchTimeout($URL_FETCHER, $fetch_timeout); my $cmd = "$fetcher '$url' > '$tmp_profile'"; - if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/){ + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){ print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; if ($encourage_patience) { print STDERR "Be patient...\n"; @@ -3154,24 +3236,47 @@ BEGIN { } } -# Return the next line from the profile file, assuming it's a text -# line (which in this case means, doesn't start with a NUL byte). If -# it's not a text line, return "". At EOF, return undef, like perl does. -# Input file should be in binmode. -sub ReadProfileLine { +# Reads the top, 'header' section of a profile, and returns the last +# line of the header, commonly called a 'header line'. The header +# section of a profile consists of zero or more 'command' lines that +# are instructions to pprof, which pprof executes when reading the +# header. All 'command' lines start with a %. After the command +# lines is the 'header line', which is a profile-specific line that +# indicates what type of profile it is, and perhaps other global +# information about the profile. For instance, here's a header line +# for a heap profile: +# heap profile: 53: 38236 [ 5525: 1284029] @ heapprofile +# For historical reasons, the CPU profile does not contain a text- +# readable header line. If the profile looks like a CPU profile, +# this function returns "". If no header line could be found, this +# function returns undef. +# +# The following commands are recognized: +# %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:' +# +# The input file should be in binmode. +sub ReadProfileHeader { local *PROFILE = shift; my $firstchar = ""; my $line = ""; read(PROFILE, $firstchar, 1); - seek(PROFILE, -1, 1); # unread the firstchar - if ($firstchar eq "\0") { + seek(PROFILE, -1, 1); # unread the firstchar + if ($firstchar !~ /[[:print:]]/) { # is not a text character return ""; } - $line = ; - if (defined($line)) { + while (defined($line = )) { $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /^%warn\s+(.*)/) { # 'warn' command + # Note this matches both '%warn blah\n' and '%warn\n'. + print STDERR "WARNING: $1\n"; # print the rest of the line + } elsif ($line =~ /^%/) { + print STDERR "Ignoring unknown command from profile header: $line"; + } else { + # End of commands, must be the header line. + return $line; + } } - return $line; + return undef; # got to EOF without seeing a header line } sub IsSymbolizedProfileFile { @@ -3182,7 +3287,7 @@ sub IsSymbolizedProfileFile { # Check if the file contains a symbol-section marker. 
open(TFILE, "<$file_name"); binmode TFILE; - my $firstline = ReadProfileLine(*TFILE); + my $firstline = ReadProfileHeader(*TFILE); close(TFILE); if (!$firstline) { return 0; @@ -3202,14 +3307,7 @@ sub IsSymbolizedProfileFile { sub ReadProfile { my $prog = shift; my $fname = shift; - - if (IsSymbolizedProfileFile($fname) && !$main::use_symbolized_profile) { - # we have both a binary and symbolized profiles, abort - usage("Symbolized profile '$fname' cannot be used with a binary arg. " . - "Try again without passing '$prog'."); - } - - $main::profile_type = ''; + my $result; # return value $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash my $contention_marker = $&; @@ -3226,40 +3324,45 @@ sub ReadProfile { # whole firstline, since it may be gigabytes(!) of data. open(PROFILE, "<$fname") || error("$fname: $!\n"); binmode PROFILE; # New perls do UTF-8 processing - my $header = ReadProfileLine(*PROFILE); + my $header = ReadProfileHeader(*PROFILE); if (!defined($header)) { # means "at EOF" error("Profile is empty.\n"); } my $symbols; if ($header =~ m/^--- *$symbol_marker/o) { + # Verify that the user asked for a symbolized profile + if (!$main::use_symbolized_profile) { + # we have both a binary and symbolized profiles, abort + error("FATAL ERROR: Symbolized profile\n $fname\ncannot be used with " . + "a binary arg. Try again without passing\n $prog\n"); + } # Read the symbol section of the symbolized profile file. $symbols = ReadSymbols(*PROFILE{IO}); # Read the next line to get the header for the remaining profile. - $header = ReadProfileLine(*PROFILE) || ""; + $header = ReadProfileHeader(*PROFILE) || ""; } - my $result; - + $main::profile_type = ''; if ($header =~ m/^heap profile:.*$growth_marker/o) { $main::profile_type = 'growth'; - $result = ReadHeapProfile($prog, $fname, $header); + $result = ReadHeapProfile($prog, *PROFILE, $header); } elsif ($header =~ m/^heap profile:/) { $main::profile_type = 'heap'; - $result = ReadHeapProfile($prog, $fname, $header); + $result = ReadHeapProfile($prog, *PROFILE, $header); } elsif ($header =~ m/^--- *$contention_marker/o) { $main::profile_type = 'contention'; - $result = ReadSynchProfile($prog, $fname); + $result = ReadSynchProfile($prog, *PROFILE); } elsif ($header =~ m/^--- *Stacks:/) { print STDERR "Old format contention profile: mistakenly reports " . 
"condition variable signals as lock contentions.\n"; $main::profile_type = 'contention'; - $result = ReadSynchProfile($prog, $fname); + $result = ReadSynchProfile($prog, *PROFILE); } elsif ($header =~ m/^--- *$profile_marker/) { # the binary cpu profile data starts immediately after this line $main::profile_type = 'cpu'; - $result = ReadCPUProfile($prog, $fname); + $result = ReadCPUProfile($prog, $fname, *PROFILE); } else { if (defined($symbols)) { # a symbolized profile contains a format we don't recognize, bail out @@ -3267,9 +3370,11 @@ sub ReadProfile { } # no ascii header present -- must be a CPU profile $main::profile_type = 'cpu'; - $result = ReadCPUProfile($prog, $fname); + $result = ReadCPUProfile($prog, $fname, *PROFILE); } + close(PROFILE); + # if we got symbols along with the profile, return those as well if (defined($symbols)) { $result->{symbols} = $symbols; @@ -3308,7 +3413,8 @@ sub FixCallerAddresses { # CPU profile reader sub ReadCPUProfile { my $prog = shift; - my $fname = shift; + my $fname = shift; # just used for logging + local *PROFILE = shift; my $version; my $period; my $i; @@ -3375,7 +3481,6 @@ sub ReadCPUProfile { my $map = ''; seek(PROFILE, $i * 4, 0); read(PROFILE, $map, (stat PROFILE)[7]); - close(PROFILE); my $r = {}; $r->{version} = $version; @@ -3389,7 +3494,7 @@ sub ReadCPUProfile { sub ReadHeapProfile { my $prog = shift; - my $fname = shift; + local *PROFILE = shift; my $header = shift; my $index = 1; @@ -3534,14 +3639,14 @@ sub ReadHeapProfile { if ($n1 != 0) { my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); my $scale_factor = 1/(1 - exp(-$ratio)); - $n1 *= $scale_factor; - $s1 *= $scale_factor; + $n1 *= $scale_factor; + $s1 *= $scale_factor; } if ($n2 != 0) { my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); my $scale_factor = 1/(1 - exp(-$ratio)); - $n2 *= $scale_factor; - $s2 *= $scale_factor; + $n2 *= $scale_factor; + $s2 *= $scale_factor; } } else { # Remote-heap version 1 @@ -3574,7 +3679,9 @@ sub ReadHeapProfile { } sub ReadSynchProfile { - my ($prog, $fname, $header) = @_; + my $prog = shift; + local *PROFILE = shift; + my $header = shift; my $map = ''; my $profile = {}; @@ -3649,7 +3756,6 @@ sub ReadSynchProfile { $map .= $line; } } - close PROFILE; if (!$seen_clockrate) { printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n", @@ -4098,8 +4204,9 @@ sub ExtractSymbols { # advance through the libraries as we advance the pc. Sometimes the # addresses of libraries may overlap with the addresses of the main # binary, so to make sure the libraries 'win', we iterate over the - # libraries in reverse order (binary will have the lowest start addr). - my @pcs = (sort { $a cmp $b } keys(%{$pcset})); + # libraries in reverse order (which assumes the binary doesn't start + # in the middle of a library, which seems a fair assumption). + my @pcs = (sort { $a cmp $b } keys(%{$pcset})); # pcset is 0-extended strings foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { my $libname = $lib->[0]; my $start = $lib->[1]; @@ -4109,14 +4216,18 @@ sub ExtractSymbols { # Get list of pcs that belong in this library. my $contained = []; my ($start_pc_index, $finish_pc_index); + # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index]. for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; $finish_pc_index--) { last if $pcs[$finish_pc_index - 1] le $finish; } + # Find smallest start_pc_index such that $start <= $pc[$start_pc_index]. 
for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; $start_pc_index--) { last if $pcs[$start_pc_index - 1] lt $start; } + # This keeps PC values higher than $pc[$finish_pc_index] in @pcs, + # in case there are overlaps in libraries and the main binary. @{$contained} = splice(@pcs, $start_pc_index, $finish_pc_index - $start_pc_index); # Map to symbols -- cgit v0.12 From 763baa6cfcc8a9df9d3b7f676b2193ac7cd5ef51 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 19:10:31 -0700 Subject: Add atomic operation support for OS X. --- jemalloc/configure.ac | 22 +++++++++++++++++ jemalloc/include/jemalloc/internal/atomic.h | 28 ++++++++++++++++++++++ .../jemalloc/internal/jemalloc_internal.h.in | 4 ++++ jemalloc/include/jemalloc/jemalloc_defs.h.in | 6 +++++ 4 files changed, 60 insertions(+) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index dc77d75..c40d22f 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -771,6 +771,28 @@ AC_CHECK_FUNC([ffsl], [], [AC_MSG_ERROR([Cannot build without ffsl(3)])]) dnl ============================================================================ +dnl Check for atomic(3) operations as provided on Darwin. + +JE_COMPILABLE([Darwin OSAtomic*()], [ +#include +#include +], [ + { + int32_t x32 = 0; + volatile int32_t *x32p = &x32; + OSAtomicAdd32(1, x32p); + } + { + int64_t x64 = 0; + volatile int64_t *x64p = &x64; + OSAtomicAdd64(1, x64p); + } +], [osatomic]) +if test "x${osatomic}" = "xyes" ; then + AC_DEFINE([JEMALLOC_OSATOMIC]) +fi + +dnl ============================================================================ dnl Check for allocator-related functions that should be wrapped. AC_CHECK_FUNC([memalign], diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h index 43faeaf..089affa 100644 --- a/jemalloc/include/jemalloc/internal/atomic.h +++ b/jemalloc/include/jemalloc/internal/atomic.h @@ -49,6 +49,20 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (__sync_sub_and_fetch(p, x)); } +#elif (defined(JEMALLOC_OSATOMIC)) +JEMALLOC_INLINE uint64_t +atomic_add_uint64(uint64_t *p, uint64_t x) +{ + + return (OSAtomicAdd64((int64_t)x, (int64_t *)p)); +} + +JEMALLOC_INLINE uint64_t +atomic_sub_uint64(uint64_t *p, uint64_t x) +{ + + return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p)); +} #else # error "Missing implementation for 64-bit atomic operations" #endif @@ -68,6 +82,20 @@ atomic_sub_uint32(uint32_t *p, uint32_t x) return (__sync_sub_and_fetch(p, x)); } +#elif (defined(JEMALLOC_OSATOMIC)) +JEMALLOC_INLINE uint32_t +atomic_add_uint32(uint32_t *p, uint32_t x) +{ + + return (OSAtomicAdd32((int32_t)x, (int32_t *)p)); +} + +JEMALLOC_INLINE uint32_t +atomic_sub_uint32(uint32_t *p, uint32_t x) +{ + + return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p)); +} #else # error "Missing implementation for 32-bit atomic operations" #endif diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index 90cd604..f660bc8 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -33,6 +33,10 @@ #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" +#ifdef JEMALLOC_OSATOMIC +#include +#endif + #ifdef JEMALLOC_ZONE #include #include diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index d669841..c08c5a2 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ 
b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -24,6 +24,12 @@ */ #undef CPU_SPINWAIT +/* + * Defined if OSAtomic*() functions are available, as provided by Darwin, and + * documented in the atomic(3) manual page. + */ +#undef JEMALLOC_OSATOMIC + /* Defined if __attribute__((...)) syntax is supported. */ #undef JEMALLOC_HAVE_ATTR #ifdef JEMALLOC_HAVE_ATTR -- cgit v0.12 From 893a0ed7c8c11962524ba6f2adeb304d038be2a9 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Fri, 18 Mar 2011 19:30:18 -0700 Subject: Use OSSpinLock*() for locking on OS X. pthread_mutex_lock() can call malloc() on OS X (!!!), which causes deadlock. Work around this by using spinlocks that are built of more primitive stuff. --- jemalloc/configure.ac | 15 +++++++++++++ jemalloc/include/jemalloc/internal/atomic.h | 18 ++++++++++----- .../jemalloc/internal/jemalloc_internal.h.in | 2 +- jemalloc/include/jemalloc/internal/mutex.h | 26 ++++++++++++++++++---- jemalloc/include/jemalloc/jemalloc_defs.h.in | 6 +++++ jemalloc/src/jemalloc.c | 8 ++++++- jemalloc/src/mutex.c | 6 +++++ 7 files changed, 69 insertions(+), 12 deletions(-) diff --git a/jemalloc/configure.ac b/jemalloc/configure.ac index c40d22f..412d3d1 100644 --- a/jemalloc/configure.ac +++ b/jemalloc/configure.ac @@ -793,6 +793,21 @@ if test "x${osatomic}" = "xyes" ; then fi dnl ============================================================================ +dnl Check for spinlock(3) operations as provided on Darwin. + +JE_COMPILABLE([Darwin OSSpin*()], [ +#include +#include +], [ + OSSpinLock lock = 0; + OSSpinLockLock(&lock); + OSSpinLockUnlock(&lock); +], [osspin]) +if test "x${osspin}" = "xyes" ; then + AC_DEFINE([JEMALLOC_OSSPIN]) +fi + +dnl ============================================================================ dnl Check for allocator-related functions that should be wrapped. 
AC_CHECK_FUNC([memalign], diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h index 089affa..f1f0c2b 100644 --- a/jemalloc/include/jemalloc/internal/atomic.h +++ b/jemalloc/include/jemalloc/internal/atomic.h @@ -13,13 +13,19 @@ #define atomic_read_uint32(p) atomic_add_uint32(p, 0) #if (LG_SIZEOF_PTR == 3) -# define atomic_read_z(p) atomic_add_uint64(p, 0) -# define atomic_add_z(p, x) atomic_add_uint64(p, x) -# define atomic_sub_z(p, x) atomic_sub_uint64(p, x) +# define atomic_read_z(p) \ + (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)0) +# define atomic_add_z(p, x) \ + (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x) +# define atomic_sub_z(p, x) \ + (size_t)atomic_sub_uint64((uint64_t *)p, (uint64_t)x) #elif (LG_SIZEOF_PTR == 2) -# define atomic_read_z(p) atomic_add_uint32(p, 0) -# define atomic_add_z(p, x) atomic_add_uint32(p, x) -# define atomic_sub_z(p, x) atomic_sub_uint32(p, x) +# define atomic_read_z(p) \ + (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)0) +# define atomic_add_z(p, x) \ + (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x) +# define atomic_sub_z(p, x) \ + (size_t)atomic_sub_uint32((uint32_t *)p, (uint32_t)x) #endif #endif /* JEMALLOC_H_EXTERNS */ diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index f660bc8..fc944a8 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -33,7 +33,7 @@ #define JEMALLOC_MANGLE #include "../jemalloc@install_suffix@.h" -#ifdef JEMALLOC_OSATOMIC +#if (defined(JEMALLOC_OSATOMIC) || defined(JEMALLOC_OSSPIN)) #include #endif diff --git a/jemalloc/include/jemalloc/internal/mutex.h b/jemalloc/include/jemalloc/internal/mutex.h index dcca01e..62947ce 100644 --- a/jemalloc/include/jemalloc/internal/mutex.h +++ b/jemalloc/include/jemalloc/internal/mutex.h @@ -1,7 +1,11 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES +#ifdef JEMALLOC_OSSPIN +typedef OSSpinLock malloc_mutex_t; +#else typedef pthread_mutex_t malloc_mutex_t; +#endif #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP # define MALLOC_MUTEX_INITIALIZER PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP @@ -41,17 +45,26 @@ JEMALLOC_INLINE void malloc_mutex_lock(malloc_mutex_t *mutex) { - if (isthreaded) + if (isthreaded) { +#ifdef JEMALLOC_OSSPIN + OSSpinLockLock(mutex); +#else pthread_mutex_lock(mutex); +#endif + } } JEMALLOC_INLINE bool malloc_mutex_trylock(malloc_mutex_t *mutex) { - if (isthreaded) + if (isthreaded) { +#ifdef JEMALLOC_OSSPIN + return (OSSpinLockTry(mutex) == false); +#else return (pthread_mutex_trylock(mutex) != 0); - else +#endif + } else return (false); } @@ -59,8 +72,13 @@ JEMALLOC_INLINE void malloc_mutex_unlock(malloc_mutex_t *mutex) { - if (isthreaded) + if (isthreaded) { +#ifdef JEMALLOC_OSSPIN + OSSpinLockUnlock(mutex); +#else pthread_mutex_unlock(mutex); +#endif + } } #endif diff --git a/jemalloc/include/jemalloc/jemalloc_defs.h.in b/jemalloc/include/jemalloc/jemalloc_defs.h.in index c08c5a2..d8c81d7 100644 --- a/jemalloc/include/jemalloc/jemalloc_defs.h.in +++ b/jemalloc/include/jemalloc/jemalloc_defs.h.in @@ -30,6 +30,12 @@ */ #undef JEMALLOC_OSATOMIC +/* + * Defined if OSSpin*() functions are available, as provided by Darwin, and + * documented in the spinlock(3) manual page. + */ +#undef JEMALLOC_OSSPIN + /* Defined if __attribute__((...)) syntax is supported. 
*/ #undef JEMALLOC_HAVE_ATTR #ifdef JEMALLOC_HAVE_ATTR diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 0efafde..dccce6b 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -28,7 +28,13 @@ static bool malloc_initialized = false; static pthread_t malloc_initializer = (unsigned long)0; /* Used to avoid initialization races. */ -static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; +static malloc_mutex_t init_lock = +#ifdef JEMALLOC_OSSPIN + 0 +#else + MALLOC_MUTEX_INITIALIZER +#endif + ; #ifdef DYNAMIC_PAGE_SHIFT size_t pagesize; diff --git a/jemalloc/src/mutex.c b/jemalloc/src/mutex.c index 3ecb18a..ca89ef1 100644 --- a/jemalloc/src/mutex.c +++ b/jemalloc/src/mutex.c @@ -55,6 +55,9 @@ pthread_create(pthread_t *__restrict thread, bool malloc_mutex_init(malloc_mutex_t *mutex) { +#ifdef JEMALLOC_OSSPIN + *mutex = 0; +#else pthread_mutexattr_t attr; if (pthread_mutexattr_init(&attr) != 0) @@ -70,6 +73,7 @@ malloc_mutex_init(malloc_mutex_t *mutex) } pthread_mutexattr_destroy(&attr); +#endif return (false); } @@ -77,8 +81,10 @@ void malloc_mutex_destroy(malloc_mutex_t *mutex) { +#ifndef JEMALLOC_OSSPIN if (pthread_mutex_destroy(mutex) != 0) { malloc_write(": Error in pthread_mutex_destroy()\n"); abort(); } +#endif } -- cgit v0.12 From 1dcb4f86b23a5760f5a717ace716360b63b33fad Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Mon, 21 Mar 2011 00:18:17 -0700 Subject: Dynamically adjust tcache fill count. Dynamically adjust tcache fill count (number of objects allocated per tcache refill) such that if GC has to flush inactive objects, the fill count gradually decreases. Conversely, if refills occur while the fill count is depressed, the fill count gradually increases back to its maximum value. --- jemalloc/include/jemalloc/internal/tcache.h | 24 +++++++++++++++++++++--- jemalloc/src/arena.c | 7 +++---- jemalloc/src/tcache.c | 5 +++-- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h index 5434d32..da3c68c 100644 --- a/jemalloc/include/jemalloc/internal/tcache.h +++ b/jemalloc/include/jemalloc/internal/tcache.h @@ -45,7 +45,8 @@ struct tcache_bin_s { # ifdef JEMALLOC_STATS tcache_bin_stats_t tstats; # endif - unsigned low_water; /* Min # cached since last GC. */ + int low_water; /* Min # cached since last GC. */ + unsigned lg_fill_div; /* Fill (ncached_max >> lg_fill_div). */ unsigned ncached; /* # of cached objects. */ void **avail; /* Stack of available objects. */ }; @@ -184,6 +185,7 @@ tcache_event(tcache_t *tcache) if (tcache->ev_cnt == tcache_gc_incr) { size_t binind = tcache->next_gc_bin; tcache_bin_t *tbin = &tcache->tbins[binind]; + tcache_bin_info_t *tbin_info = &tcache_bin_info[binind]; if (tbin->low_water > 0) { /* @@ -207,6 +209,20 @@ tcache_event(tcache_t *tcache) #endif ); } + /* + * Reduce fill count by 2X. Limit lg_fill_div such that + * the fill count is always at least 1. + */ + if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1)) + >= 1) + tbin->lg_fill_div++; + } else if (tbin->low_water < 0) { + /* + * Increase fill count by 2X. Make sure lg_fill_div + * stays greater than 0. 
+ */ + if (tbin->lg_fill_div > 1) + tbin->lg_fill_div--; } tbin->low_water = tbin->ncached; @@ -222,10 +238,12 @@ tcache_alloc_easy(tcache_bin_t *tbin) { void *ret; - if (tbin->ncached == 0) + if (tbin->ncached == 0) { + tbin->low_water = -1; return (NULL); + } tbin->ncached--; - if (tbin->ncached < tbin->low_water) + if ((int)tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; ret = tbin->avail[tbin->ncached]; return (ret); diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 4cbca57..0f4f12a 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -1386,8 +1386,8 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind #endif bin = &arena->bins[binind]; malloc_mutex_lock(&bin->lock); - for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> 1); - i < nfill; i++) { + for (i = 0, nfill = (tcache_bin_info[binind].ncached_max >> + tbin->lg_fill_div); i < nfill; i++) { if ((run = bin->runcur) != NULL && run->nfree > 0) ptr = arena_run_reg_alloc(run, &arena_bin_info[binind]); else @@ -1398,8 +1398,7 @@ arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin, size_t binind tbin->avail[nfill - 1 - i] = ptr; } #ifdef JEMALLOC_STATS - bin->stats.allocated += (i - tbin->ncached) * - arena_bin_info[binind].reg_size; + bin->stats.allocated += i * arena_bin_info[binind].reg_size; bin->stats.nmalloc += i; bin->stats.nrequests += tbin->tstats.nrequests; bin->stats.nfills++; diff --git a/jemalloc/src/tcache.c b/jemalloc/src/tcache.c index 2f4804e..31c329e 100644 --- a/jemalloc/src/tcache.c +++ b/jemalloc/src/tcache.c @@ -135,7 +135,7 @@ tcache_bin_flush_small(tcache_bin_t *tbin, size_t binind, unsigned rem memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], rem * sizeof(void *)); tbin->ncached = rem; - if (tbin->ncached < tbin->low_water) + if ((int)tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; } @@ -218,7 +218,7 @@ tcache_bin_flush_large(tcache_bin_t *tbin, size_t binind, unsigned rem memmove(tbin->avail, &tbin->avail[tbin->ncached - rem], rem * sizeof(void *)); tbin->ncached = rem; - if (tbin->ncached < tbin->low_water) + if ((int)tbin->ncached < tbin->low_water) tbin->low_water = tbin->ncached; } @@ -265,6 +265,7 @@ tcache_create(arena_t *arena) tcache->arena = arena; assert((TCACHE_NSLOTS_SMALL_MAX & 1U) == 0); for (i = 0; i < nhbins; i++) { + tcache->tbins[i].lg_fill_div = 1; tcache->tbins[i].avail = (void **)((uintptr_t)tcache + (uintptr_t)stack_offset); stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *); -- cgit v0.12 From 47e57f9bdadfaf999c9dea5d126edf3a4f1b2995 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 22 Mar 2011 09:00:56 -0700 Subject: Avoid overflow in arena_run_regind(). Fix a regression due to: Remove an arena_bin_run_size_calc() constraint. 2a6f2af6e446a98a635caadd281a23ca09a491cb The removed constraint required that small run headers fit in one page, which indirectly limited runs such that they would not cause overflow in arena_run_regind(). Add an explicit constraint to arena_bin_run_size_calc() based on the largest number of regions that arena_run_regind() can handle (2^11 as currently configured). 
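
The 2^11 bound falls out of the fixed-point arithmetic: arena_run_regind() divides by multiplying with a precomputed reciprocal, and with SIZE_INV_SHIFT now defined as ((sizeof(unsigned) << 3) - LG_RUN_MAXREGS), the intermediate product diff * size_inv stays within 32 bits as long as the region index is below RUN_MAXREGS. A standalone check of the arithmetic for one illustrative region size (a sketch; the real code indexes a size_invs[] table rather than invoking SIZE_INV() directly):

#include <assert.h>
#include <stdio.h>

#define	LG_RUN_MAXREGS	11
#define	SIZE_INV_SHIFT	((sizeof(unsigned) << 3) - LG_RUN_MAXREGS)
#define	SIZE_INV(s)	(((1U << SIZE_INV_SHIFT) / (s)) + 1)

int
main(void)
{
	unsigned reg_size = 48;	/* Illustrative small size class. */
	unsigned inv = SIZE_INV(reg_size);
	unsigned k;

	for (k = 0; k < (1U << LG_RUN_MAXREGS); k++) {
		unsigned diff = k * reg_size;
		unsigned regind = (diff * inv) >> SIZE_INV_SHIFT;

		/* Reciprocal multiplication agrees with true division. */
		assert(regind == diff / reg_size);
	}
	printf("SIZE_INV(%u) valid for %u region indices\n", reg_size, k);
	return (0);
}

Push the region count past RUN_MAXREGS and the multiply can overflow, which is why bin_info_run_size_calc() now clamps try_nregs as well.
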
--- jemalloc/include/jemalloc/internal/arena.h | 8 ++++++-- jemalloc/include/jemalloc/internal/atomic.h | 4 +++- jemalloc/include/jemalloc/internal/bitmap.h | 2 +- jemalloc/include/jemalloc/internal/jemalloc_internal.h.in | 2 +- jemalloc/src/arena.c | 12 +++++++++++- jemalloc/test/bitmap.c | 6 +++++- 6 files changed, 27 insertions(+), 7 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h index 94b7f3d..b80c118 100644 --- a/jemalloc/include/jemalloc/internal/arena.h +++ b/jemalloc/include/jemalloc/internal/arena.h @@ -58,6 +58,10 @@ #define RUN_MAX_OVRHD 0x0000003dU #define RUN_MAX_OVRHD_RELAX 0x00001800U +/* Maximum number of regions in one run. */ +#define LG_RUN_MAXREGS 11 +#define RUN_MAXREGS (1U << LG_RUN_MAXREGS) + /* * The minimum ratio of active:dirty pages per arena is computed as: * @@ -556,8 +560,8 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr) * divide by 0, and 1 and 2 are both powers of two, which are * handled above. */ -#define SIZE_INV_SHIFT 21 -#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) +#define SIZE_INV_SHIFT ((sizeof(unsigned) << 3) - LG_RUN_MAXREGS) +#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s)) + 1) static const unsigned size_invs[] = { SIZE_INV(3), SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7), diff --git a/jemalloc/include/jemalloc/internal/atomic.h b/jemalloc/include/jemalloc/internal/atomic.h index f1f0c2b..821c2ef 100644 --- a/jemalloc/include/jemalloc/internal/atomic.h +++ b/jemalloc/include/jemalloc/internal/atomic.h @@ -70,7 +70,9 @@ atomic_sub_uint64(uint64_t *p, uint64_t x) return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p)); } #else -# error "Missing implementation for 64-bit atomic operations" +# if (LG_SIZEOF_PTR == 3) +# error "Missing implementation for 64-bit atomic operations" +# endif #endif /* 32-bit operations. */ diff --git a/jemalloc/include/jemalloc/internal/bitmap.h b/jemalloc/include/jemalloc/internal/bitmap.h index 4bb2212..605ebac 100644 --- a/jemalloc/include/jemalloc/internal/bitmap.h +++ b/jemalloc/include/jemalloc/internal/bitmap.h @@ -2,7 +2,7 @@ #ifdef JEMALLOC_H_TYPES /* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ -#define LG_BITMAP_MAXBITS 18 +#define LG_BITMAP_MAXBITS LG_RUN_MAXREGS typedef struct bitmap_level_s bitmap_level_t; typedef struct bitmap_info_s bitmap_info_t; diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index fc944a8..f82385d 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -224,9 +224,9 @@ extern void (*JEMALLOC_P(malloc_message))(void *wcbopaque, const char *s); #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/mb.h" -#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/extent.h" #include "jemalloc/internal/arena.h" +#include "jemalloc/internal/bitmap.h" #include "jemalloc/internal/base.h" #include "jemalloc/internal/chunk.h" #include "jemalloc/internal/huge.h" diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 0f4f12a..0693f36 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -2427,6 +2427,7 @@ small_size2bin_init_hard(void) * *) bin_info->run_size >= min_run_size * *) bin_info->run_size <= arena_maxclass * *) run header overhead <= RUN_MAX_OVRHD (or header overhead relaxed). 
+ * *) bin_info->nregs <= RUN_MAXREGS * * bin_info->nregs, bin_info->bitmap_offset, and bin_info->reg0_offset are also * calculated here, since these settings are all interdependent. @@ -2459,6 +2460,10 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) try_run_size = min_run_size; try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin_info->reg_size) + 1; /* Counter-act try_nregs-- in loop. */ + if (try_nregs > RUN_MAXREGS) { + try_nregs = RUN_MAXREGS + + 1; /* Counter-act try_nregs-- in loop. */ + } do { try_nregs--; try_hdr_size = sizeof(arena_run_t); @@ -2500,6 +2505,10 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin_info->reg_size) + 1; /* Counter-act try_nregs-- in loop. */ + if (try_nregs > RUN_MAXREGS) { + try_nregs = RUN_MAXREGS + + 1; /* Counter-act try_nregs-- in loop. */ + } do { try_nregs--; try_hdr_size = sizeof(arena_run_t); @@ -2526,7 +2535,8 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size) } while (try_run_size <= arena_maxclass && try_run_size <= arena_maxclass && RUN_MAX_OVRHD * (bin_info->reg_size << 3) > RUN_MAX_OVRHD_RELAX - && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size); + && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size + && try_nregs < RUN_MAXREGS); assert(good_hdr_size <= good_reg0_offset); diff --git a/jemalloc/test/bitmap.c b/jemalloc/test/bitmap.c index 7a017c8..adfaacf 100644 --- a/jemalloc/test/bitmap.c +++ b/jemalloc/test/bitmap.c @@ -13,7 +13,11 @@ */ #include "../src/bitmap.c" -#define MAXBITS 4500 +#if (LG_BITMAP_MAXBITS > 12) +# define MAXBITS 4500 +#else +# define MAXBITS (1U << LG_BITMAP_MAXBITS) +#endif static void test_bitmap_size(void) -- cgit v0.12 From c957398b4f973158de323366dbd424b7bb812ddf Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 23 Mar 2011 00:27:50 -0700 Subject: Fix bootstrapping order bug. Initialize arenas_tsd earlier, so that the non-TLS case works when profiling is enabled. --- jemalloc/src/jemalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index dccce6b..4d24470 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -797,6 +797,11 @@ malloc_init_hard(void) if (malloc_mutex_init(&arenas_lock)) return (true); + if (pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) { + malloc_mutex_unlock(&init_lock); + return (true); + } + #ifdef JEMALLOC_PROF if (prof_boot2()) { malloc_mutex_unlock(&init_lock); @@ -835,11 +840,6 @@ malloc_init_hard(void) malloc_write(")\n"); } - if (pthread_key_create(&arenas_tsd, arenas_cleanup) != 0) { - malloc_mutex_unlock(&init_lock); - return (true); - } - /* Allocate and initialize arenas. */ arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas); if (arenas == NULL) { -- cgit v0.12 From eacb896c014d822cf563490d1c1f1cdc3cda24a2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 23 Mar 2011 00:30:30 -0700 Subject: Fix rallocm() rsize bug. Add code to set *rsize even when profiling is enabled.
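
An illustrative aside, not part of the patch: callers of jemalloc 2.x's experimental allocm() family rely on rsize to learn the usable size of the resulting allocation, so a reallocation path that skipped the *rsize store (as the profiling path did before this fix) left the caller reading a stale value. A hedged sketch of such a caller, assuming a build where the experimental API from jemalloc/jemalloc.h is exported under its unprefixed names (in-tree the declarations go through JEMALLOC_P()):

    #include <stdio.h>
    #include <jemalloc/jemalloc.h> /* experimental allocm/rallocm/dallocm */

    int
    main(void)
    {
        void *p;
        size_t rsize;

        /* Allocate at least 100 bytes; rsize receives the usable size. */
        if (allocm(&p, &rsize, 100, 0) != ALLOCM_SUCCESS)
            return (1);

        /*
         * Grow to at least 200 bytes.  The patch guarantees that rsize
         * is updated here even when heap profiling is enabled.
         */
        if (rallocm(&p, &rsize, 200, 0, 0) == ALLOCM_SUCCESS)
            printf("usable size is now %zu\n", rsize);

        dallocm(p, 0);
        return (0);
    }
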
--- jemalloc/src/jemalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 4d24470..1b8a278 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -1677,6 +1677,8 @@ JEMALLOC_P(rallocm)(void **ptr, size_t *rsize, size_t size, size_t extra, usize = isalloc(q); } prof_realloc(q, usize, cnt, old_size, old_ctx); + if (rsize != NULL) + *rsize = usize; } else #endif { -- cgit v0.12 From 38d9210c464c4ad49655a4da6bc84ea4fbec83d2 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Wed, 23 Mar 2011 00:37:29 -0700 Subject: Fix error detection for ipalloc() when profiling. sa2u() returns 0 on overflow, but the profiling code was blindly calling sa2u() and allowing the error to silently propagate, ultimately ending in a later assertion failure. Refactor all ipalloc() callers to call sa2u(), check for overflow before calling ipalloc(), and pass usize rather than size. This allows ipalloc() to avoid calling sa2u() in the common case. --- .../jemalloc/internal/jemalloc_internal.h.in | 59 +++++++++++++-------- jemalloc/src/arena.c | 19 ++++--- jemalloc/src/ckh.c | 28 +++++++--- jemalloc/src/jemalloc.c | 61 +++++++++++++--------- 4 files changed, 105 insertions(+), 62 deletions(-) diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in index f82385d..254adb6 100644 --- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in +++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in @@ -589,7 +589,7 @@ thread_allocated_get(void) #ifndef JEMALLOC_ENABLE_INLINE void *imalloc(size_t size); void *icalloc(size_t size); -void *ipalloc(size_t size, size_t alignment, bool zero); +void *ipalloc(size_t usize, size_t alignment, bool zero); size_t isalloc(const void *ptr); # ifdef JEMALLOC_IVSALLOC size_t ivsalloc(const void *ptr); # endif @@ -623,28 +623,39 @@ icalloc(size_t size) } JEMALLOC_INLINE void * -ipalloc(size_t size, size_t alignment, bool zero) +ipalloc(size_t usize, size_t alignment, bool zero) { void *ret; - size_t usize; - size_t run_size -# ifdef JEMALLOC_CC_SILENCE - = 0 -# endif - ; - usize = sa2u(size, alignment, &run_size); - if (usize == 0) - return (NULL); + assert(usize != 0); + assert(usize == sa2u(usize, alignment, NULL)); + if (usize <= arena_maxclass && alignment <= PAGE_SIZE) ret = arena_malloc(usize, zero); - else if (run_size <= arena_maxclass) { - ret = arena_palloc(choose_arena(), usize, run_size, alignment, - zero); - } else if (alignment <= chunksize) - ret = huge_malloc(usize, zero); - else - ret = huge_palloc(usize, alignment, zero); + else { + size_t run_size +#ifdef JEMALLOC_CC_SILENCE + = 0 +#endif + ; + + /* + * Ideally we would only ever call sa2u() once per aligned + * allocation request, and the caller of this function has + * already done so once. However, it's rather burdensome to + * require every caller to pass in run_size, especially given + * that it's only relevant to large allocations. Therefore, + * just call it again here in order to get run_size.
+ */ + sa2u(usize, alignment, &run_size); + if (run_size <= arena_maxclass) { + ret = arena_palloc(choose_arena(), usize, run_size, + alignment, zero); + } else if (alignment <= chunksize) + ret = huge_malloc(usize, zero); + else + ret = huge_palloc(usize, alignment, zero); + } assert(((uintptr_t)ret & (alignment - 1)) == 0); return (ret); @@ -715,7 +726,7 @@ iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero, if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) != 0) { - size_t copysize; + size_t usize, copysize; /* * Existing object alignment is inadequate; allocate new space * and copy. */ if (no_move) return (NULL); - ret = ipalloc(size + extra, alignment, zero); + usize = sa2u(size + extra, alignment, NULL); + if (usize == 0) + return (NULL); + ret = ipalloc(usize, alignment, zero); if (ret == NULL) { if (extra == 0) return (NULL); /* Try again, without extra this time. */ - ret = ipalloc(size, alignment, zero); + usize = sa2u(size, alignment, NULL); + if (usize == 0) + return (NULL); + ret = ipalloc(usize, alignment, zero); if (ret == NULL) return (NULL); } diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c index 0693f36..1954da9 100644 --- a/jemalloc/src/arena.c +++ b/jemalloc/src/arena.c @@ -2165,24 +2165,29 @@ arena_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, if (ret != NULL) return (ret); - /* * size and oldsize are different enough that we need to move the * object. In that case, fall back to allocating new space and * copying. */ - if (alignment != 0) - ret = ipalloc(size + extra, alignment, zero); - else + if (alignment != 0) { + size_t usize = sa2u(size + extra, alignment, NULL); + if (usize == 0) + return (NULL); + ret = ipalloc(usize, alignment, zero); + } else ret = arena_malloc(size + extra, zero); if (ret == NULL) { if (extra == 0) return (NULL); /* Try again, this time without extra.
*/ - if (alignment != 0) - ret = ipalloc(size, alignment, zero); - else + if (alignment != 0) { + size_t usize = sa2u(size, alignment, NULL); + if (usize == 0) + return (NULL); + ret = ipalloc(usize, alignment, zero); + } else ret = arena_malloc(size, zero); if (ret == NULL) diff --git a/jemalloc/src/ckh.c b/jemalloc/src/ckh.c index 22319ab..143b5b5 100644 --- a/jemalloc/src/ckh.c +++ b/jemalloc/src/ckh.c @@ -262,9 +262,15 @@ ckh_grow(ckh_t *ckh) lg_prevbuckets = ckh->lg_curbuckets; lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS; while (true) { + size_t usize; + lg_curcells++; - tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells, - ZU(1) << LG_CACHELINE, true); + usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE, NULL); + if (usize == 0) { + ret = true; + goto RETURN; + } + tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); if (tab == NULL) { ret = true; goto RETURN; @@ -295,7 +301,7 @@ static void ckh_shrink(ckh_t *ckh) { ckhc_t *tab, *ttab; - size_t lg_curcells; + size_t lg_curcells, usize; unsigned lg_prevbuckets; /* @@ -304,8 +310,10 @@ ckh_shrink(ckh_t *ckh) */ lg_prevbuckets = ckh->lg_curbuckets; lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1; - tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_curcells, - ZU(1) << LG_CACHELINE, true); + usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE, NULL); + if (usize == 0) + return; + tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -340,7 +348,7 @@ bool ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) { bool ret; - size_t mincells; + size_t mincells, usize; unsigned lg_mincells; assert(minitems > 0); @@ -375,8 +383,12 @@ ckh_new(ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) ckh->hash = hash; ckh->keycomp = keycomp; - ckh->tab = (ckhc_t *)ipalloc(sizeof(ckhc_t) << lg_mincells, - (ZU(1) << LG_CACHELINE), true); + usize = sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE, NULL); + if (usize == 0) { + ret = true; + goto RETURN; + } + ckh->tab = (ckhc_t *)ipalloc(usize, CACHELINE, true); if (ckh->tab == NULL) { ret = true; goto RETURN; } diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c index 1b8a278..e287516 100644 --- a/jemalloc/src/jemalloc.c +++ b/jemalloc/src/jemalloc.c @@ -993,14 +993,12 @@ int JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) { int ret; - void *result; -#if (defined(JEMALLOC_PROF) || defined(JEMALLOC_STATS)) size_t usize -# ifdef JEMALLOC_CC_SILENCE +#ifdef JEMALLOC_CC_SILENCE = 0 -# endif - ; #endif + ; + void *result; #ifdef JEMALLOC_PROF prof_thr_cnt_t *cnt # ifdef JEMALLOC_CC_SILENCE @@ -1050,34 +1048,37 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size) goto RETURN; } + usize = sa2u(size, alignment, NULL); + if (usize == 0) { + result = NULL; + ret = ENOMEM; + goto RETURN; + } + #ifdef JEMALLOC_PROF if (opt_prof) { - usize = sa2u(size, alignment, NULL); if ((cnt = prof_alloc_prep(usize)) == NULL) { result = NULL; ret = EINVAL; } else { if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { - result = ipalloc(small_maxclass+1, - alignment, false); + assert(sa2u(small_maxclass+1, + alignment, NULL) != 0); + result = ipalloc(sa2u(small_maxclass+1, + alignment, NULL), alignment, false); if (result != NULL) { arena_prof_promoted(result, usize); } } else { - result = ipalloc(size, alignment, + result = ipalloc(usize, alignment, false); } } } else #endif - { -#ifdef JEMALLOC_STATS - usize
= sa2u(size, alignment, NULL); -#endif - result = ipalloc(size, alignment, false); - } + result = ipalloc(usize, alignment, false); } if (result == NULL) { @@ -1531,15 +1532,18 @@ JEMALLOC_P(mallctlbymib)(const size_t *mib, size_t miblen, void *oldp, } JEMALLOC_INLINE void * -iallocm(size_t size, size_t alignment, bool zero) +iallocm(size_t usize, size_t alignment, bool zero) { + assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize, alignment, + NULL))); + if (alignment != 0) - return (ipalloc(size, alignment, zero)); + return (ipalloc(usize, alignment, zero)); else if (zero) - return (icalloc(size)); + return (icalloc(usize)); else - return (imalloc(size)); + return (imalloc(usize)); } JEMALLOC_ATTR(nonnull(1)) @@ -1562,20 +1566,27 @@ JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) if (malloc_init()) goto OOM; + usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment, + NULL); + if (usize == 0) + goto OOM; + #ifdef JEMALLOC_PROF if (opt_prof) { - usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment, - NULL); if ((cnt = prof_alloc_prep(usize)) == NULL) goto OOM; if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= small_maxclass) { - p = iallocm(small_maxclass+1, alignment, zero); + size_t usize_promoted = (alignment == 0) ? + s2u(small_maxclass+1) : sa2u(small_maxclass+1, + alignment, NULL); + assert(usize_promoted != 0); + p = iallocm(usize_promoted, alignment, zero); if (p == NULL) goto OOM; arena_prof_promoted(p, usize); } else { - p = iallocm(size, alignment, zero); + p = iallocm(usize, alignment, zero); if (p == NULL) goto OOM; } @@ -1585,15 +1596,13 @@ JEMALLOC_P(allocm)(void **ptr, size_t *rsize, size_t size, int flags) } else #endif { - p = iallocm(size, alignment, zero); + p = iallocm(usize, alignment, zero); if (p == NULL) goto OOM; #ifndef JEMALLOC_STATS if (rsize != NULL) #endif { - usize = (alignment == 0) ? s2u(size) : sa2u(size, - alignment, NULL); #ifdef JEMALLOC_STATS if (rsize != NULL) #endif -- cgit v0.12 From 4bcd987251826a7f9c49a1e2e6968bbb639a06c8 Mon Sep 17 00:00:00 2001 From: Jason Evans Date: Tue, 22 Mar 2011 15:30:22 -0700 Subject: Update ChangeLog for 2.2.0. --- jemalloc/ChangeLog | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/jemalloc/ChangeLog b/jemalloc/ChangeLog index 08526c8..6db63db 100644 --- a/jemalloc/ChangeLog +++ b/jemalloc/ChangeLog @@ -6,6 +6,35 @@ found in the git revision history: http://www.canonware.com/cgi-bin/gitweb.cgi?p=jemalloc.git git://canonware.com/jemalloc.git +* 2.2.0 (March 22, 2011) + + This version incorporates several improvements to algorithms and data + structures that tend to reduce fragmentation and increase speed. + + New features: + - Add the "stats.cactive" mallctl. + - Update pprof (from google-perftools 1.7). + - Improve backtracing-related configuration logic, and add the + --disable-prof-libgcc option. + + Bug fixes: + - Change default symbol visibility from "internal" to "hidden", which + decreases the overhead of library-internal function calls. + - Fix symbol visibility so that it is also set on OS X. + - Fix a build dependency regression caused by the introduction of the .pic.o + suffix for PIC object files. + - Add missing checks for mutex initialization failures. + - Don't use libgcc-based backtracing except on x64, where it is known to work. + - Fix deadlocks on OS X that were due to memory allocation in + pthread_mutex_lock().
+ - Heap profiling-specific fixes: + + Fix memory corruption due to integer overflow in small region index + computation, when using a small enough sample interval that profiling + context pointers are stored in small run headers. + + Fix a bootstrap ordering bug that only occurred with TLS disabled. + + Fix a rallocm() rsize bug. + + Fix error detection bugs for aligned memory allocation. + * 2.1.3 (March 14, 2011) Bug fixes: -- cgit v0.12
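
A closing illustrative aside, not part of the patch series: the error-detection fix above reduces to one caller-side idiom, namely compute the usable size first, treat zero as overflow, and only then allocate. The self-contained sketch below shows the pattern; my_sa2u() is a hypothetical, simplified stand-in for jemalloc's sa2u(), and posix_memalign() stands in for ipalloc():

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Hypothetical analogue of sa2u(): round size up to a multiple of
     * alignment (a power of two), returning 0 on overflow.
     */
    static size_t
    my_sa2u(size_t size, size_t alignment)
    {
        size_t usize = (size + alignment - 1) & ~(alignment - 1);

        return ((usize < size) ? 0 : usize);
    }

    int
    main(void)
    {
        void *p;
        size_t usize;

        /* Overflow is detected up front, before any allocator runs. */
        usize = my_sa2u((size_t)-8, 64);
        if (usize == 0)
            fprintf(stderr, "aligned size overflows; reject request\n");

        /* The common case: check the size, then allocate exactly usize. */
        usize = my_sa2u(100, 64);
        if (usize != 0 && posix_memalign(&p, 64, usize) == 0) {
            printf("allocated %zu bytes, 64-byte aligned\n", usize);
            free(p);
        }
        return (0);
    }
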