diff options
author | Jason Evans <jasone@canonware.com> | 2016-06-03 01:43:10 (GMT) |
---|---|---|
committer | Jason Evans <jasone@canonware.com> | 2016-06-06 03:59:57 (GMT) |
commit | 6f29a8392403f70bfa1080964a65540b6f3699fe (patch) | |
tree | 60dcedc3dc0b77c6e3280979ae5a5a267834de4e /include | |
parent | 7be2ebc23f0f145e095e7230d7d8a202b8dcc55e (diff) | |
download | jemalloc-6f29a8392403f70bfa1080964a65540b6f3699fe.zip jemalloc-6f29a8392403f70bfa1080964a65540b6f3699fe.tar.gz jemalloc-6f29a8392403f70bfa1080964a65540b6f3699fe.tar.bz2 |
Add rtree lookup path caching.
rtree-based extent lookups remain more expensive than chunk-based run
lookups, but with this optimization the fast path slowdown is ~3 CPU
cycles per metadata lookup (on Intel Core i7-4980HQ), versus ~11 cycles
prior. The path caching speedup tends to degrade gracefully unless
allocated memory is spread far apart (as is the case when using a
mixture of sbrk() and mmap()).
Diffstat (limited to 'include')
-rw-r--r-- | include/jemalloc/internal/extent.h | 5 | ||||
-rw-r--r-- | include/jemalloc/internal/private_symbols.txt | 5 | ||||
-rw-r--r-- | include/jemalloc/internal/rtree.h | 183 | ||||
-rw-r--r-- | include/jemalloc/internal/tsd.h | 19 |
4 files changed, 176 insertions, 36 deletions
diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index a41a15f..8b8dbe8 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -181,8 +181,11 @@ void extent_ring_remove(extent_t *extent); JEMALLOC_INLINE extent_t * extent_lookup(tsdn_t *tsdn, const void *ptr, bool dependent) { + rtree_ctx_t rtree_ctx_fallback; + rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback); - return (rtree_read(tsdn, &extents_rtree, (uintptr_t)ptr, dependent)); + return (rtree_read(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr, + dependent)); } JEMALLOC_INLINE arena_t * diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index d4e5525..07e7f28 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -399,6 +399,7 @@ rtree_child_read rtree_child_read_hard rtree_child_tryread rtree_clear +rtree_ctx_start_level rtree_delete rtree_elm_acquire rtree_elm_lookup @@ -502,6 +503,9 @@ tsd_nominal tsd_prof_tdata_get tsd_prof_tdata_set tsd_prof_tdatap_get +tsd_rtree_ctx_get +tsd_rtree_ctx_set +tsd_rtree_ctxp_get tsd_rtree_elm_witnesses_get tsd_rtree_elm_witnesses_set tsd_rtree_elm_witnessesp_get @@ -529,6 +533,7 @@ tsd_witnesses_set tsd_witnessesp_get tsdn_fetch tsdn_null +tsdn_rtree_ctx tsdn_tsd witness_assert_lockless witness_assert_not_owner diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h index a47a79e..fc88dfe 100644 --- a/include/jemalloc/internal/rtree.h +++ b/include/jemalloc/internal/rtree.h @@ -10,6 +10,7 @@ typedef struct rtree_elm_s rtree_elm_t; typedef struct rtree_elm_witness_s rtree_elm_witness_t; typedef struct rtree_elm_witness_tsd_s rtree_elm_witness_tsd_t; typedef struct rtree_level_s rtree_level_t; +typedef struct rtree_ctx_s rtree_ctx_t; typedef struct rtree_s rtree_t; /* @@ -25,6 +26,13 @@ typedef struct rtree_s rtree_t; /* Used for two-stage lock-free node initialization. */ #define RTREE_NODE_INITIALIZING ((rtree_elm_t *)0x1) +#define RTREE_CTX_INITIALIZER { \ + false, \ + 0, \ + 0, \ + {NULL /* C initializes all trailing elements to NULL. */} \ +} + /* * Maximum number of concurrently acquired elements per thread. This controls * how many witness_t structures are embedded in tsd. Ideally rtree_elm_t would @@ -78,9 +86,9 @@ struct rtree_level_s { * * Suppose keys comprise 48 bits, and LG_RTREE_BITS_PER_LEVEL is 4. * This results in a 3-level tree, and the leftmost leaf can be directly - * accessed via subtrees[2], the subtree prefixed by 0x0000 (excluding - * 0x00000000) can be accessed via subtrees[1], and the remainder of the - * tree can be accessed via subtrees[0]. + * accessed via levels[2], the subtree prefixed by 0x0000 (excluding + * 0x00000000) can be accessed via levels[1], and the remainder of the + * tree can be accessed via levels[0]. * * levels[0] : [<unused> | 0x0001******** | 0x0002******** | ...] * @@ -90,7 +98,7 @@ struct rtree_level_s { * * This has practical implications on x64, which currently uses only the * lower 47 bits of virtual address space in userland, thus leaving - * subtrees[0] unused and avoiding a level of tree traversal. + * levels[0] unused and avoiding a level of tree traversal. */ union { void *subtree_pun; @@ -105,13 +113,31 @@ struct rtree_level_s { unsigned cumbits; }; +struct rtree_ctx_s { + /* If false, key/elms have not yet been initialized by a lookup. */ + bool valid; + /* Key that corresponds to the tree path recorded in elms. */ + uintptr_t key; + /* Memoized rtree_start_level(key). */ + unsigned start_level; + /* + * A path through rtree, driven by key. Only elements that could + * actually be used for subsequent lookups are initialized, i.e. if + * start_level = rtree_start_level(key) is non-zero, the first + * start_level elements are uninitialized. The last element contains a + * pointer to the leaf node element that corresponds to key, so that + * exact matches require no tree node offset computation. + */ + rtree_elm_t *elms[RTREE_HEIGHT_MAX + 1]; +}; + struct rtree_s { unsigned height; /* * Precomputed table used to convert from the number of leading 0 key * bits to which subtree level to start at. */ - unsigned start_level[RTREE_HEIGHT_MAX]; + unsigned start_level[RTREE_HEIGHT_MAX + 1]; rtree_level_t levels[RTREE_HEIGHT_MAX]; }; @@ -143,7 +169,9 @@ void rtree_elm_witness_release(tsdn_t *tsdn, const rtree_t *rtree, #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE -unsigned rtree_start_level(rtree_t *rtree, uintptr_t key); +unsigned rtree_start_level(const rtree_t *rtree, uintptr_t key); +unsigned rtree_ctx_start_level(const rtree_t *rtree, + const rtree_ctx_t *rtree_ctx, uintptr_t key); uintptr_t rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level); bool rtree_node_valid(rtree_elm_t *node); @@ -156,33 +184,55 @@ rtree_elm_t *rtree_subtree_tryread(rtree_t *rtree, unsigned level, bool dependent); rtree_elm_t *rtree_subtree_read(tsdn_t *tsdn, rtree_t *rtree, unsigned level, bool dependent); -rtree_elm_t *rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, - bool dependent, bool init_missing); - -bool rtree_write(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, - const extent_t *extent); -extent_t *rtree_read(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, - bool dependent); -rtree_elm_t *rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, - bool dependent, bool init_missing); +rtree_elm_t *rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, + rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, bool init_missing); + +bool rtree_write(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, const extent_t *extent); +extent_t *rtree_read(tsdn_t *tsdn, rtree_t *rtree, + rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent); +rtree_elm_t *rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, + rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, bool init_missing); extent_t *rtree_elm_read_acquired(tsdn_t *tsdn, const rtree_t *rtree, rtree_elm_t *elm); void rtree_elm_write_acquired(tsdn_t *tsdn, const rtree_t *rtree, rtree_elm_t *elm, const extent_t *extent); void rtree_elm_release(tsdn_t *tsdn, const rtree_t *rtree, rtree_elm_t *elm); -void rtree_clear(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key); +void rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_)) JEMALLOC_ALWAYS_INLINE unsigned -rtree_start_level(rtree_t *rtree, uintptr_t key) +rtree_start_level(const rtree_t *rtree, uintptr_t key) { unsigned start_level; if (unlikely(key == 0)) return (rtree->height - 1); - start_level = rtree->start_level[lg_floor(key) >> + start_level = rtree->start_level[(lg_floor(key) + 1) >> + LG_RTREE_BITS_PER_LEVEL]; + assert(start_level < rtree->height); + return (start_level); +} + +JEMALLOC_ALWAYS_INLINE unsigned +rtree_ctx_start_level(const rtree_t *rtree, const rtree_ctx_t *rtree_ctx, + uintptr_t key) +{ + unsigned start_level; + uintptr_t key_diff; + + /* Compute the difference between old and new lookup keys. */ + key_diff = key ^ rtree_ctx->key; + assert(key_diff != 0); /* Handled in rtree_elm_lookup(). */ + + /* + * Compute the last traversal path element at which the keys' paths + * are the same. + */ + start_level = rtree->start_level[(lg_floor(key_diff) + 1) >> LG_RTREE_BITS_PER_LEVEL]; assert(start_level < rtree->height); return (start_level); @@ -291,8 +341,8 @@ rtree_subtree_read(tsdn_t *tsdn, rtree_t *rtree, unsigned level, bool dependent) } JEMALLOC_ALWAYS_INLINE rtree_elm_t * -rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent, - bool init_missing) +rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, bool dependent, bool init_missing) { uintptr_t subkey; unsigned start_level; @@ -300,35 +350,95 @@ rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent, assert(!dependent || !init_missing); - start_level = rtree_start_level(rtree, key); + if (dependent || init_missing) { + if (likely(rtree_ctx->valid)) { + if (key == rtree_ctx->key) + return (rtree_ctx->elms[rtree->height]); + else { + unsigned no_ctx_start_level = + rtree_start_level(rtree, key); + unsigned ctx_start_level; + + if (likely(no_ctx_start_level <= + rtree_ctx->start_level && (ctx_start_level = + rtree_ctx_start_level(rtree, rtree_ctx, + key)) >= rtree_ctx->start_level)) { + start_level = ctx_start_level; + node = rtree_ctx->elms[ctx_start_level]; + } else { + start_level = no_ctx_start_level; + node = init_missing ? + rtree_subtree_read(tsdn, rtree, + no_ctx_start_level, dependent) : + rtree_subtree_tryread(rtree, + no_ctx_start_level, dependent); + rtree_ctx->start_level = + no_ctx_start_level; + rtree_ctx->elms[no_ctx_start_level] = + node; + } + } + } else { + unsigned no_ctx_start_level = rtree_start_level(rtree, + key); + + start_level = no_ctx_start_level; + node = init_missing ? rtree_subtree_read(tsdn, rtree, + no_ctx_start_level, dependent) : + rtree_subtree_tryread(rtree, no_ctx_start_level, + dependent); + rtree_ctx->valid = true; + rtree_ctx->start_level = no_ctx_start_level; + rtree_ctx->elms[no_ctx_start_level] = node; + } + rtree_ctx->key = key; + } else { + start_level = rtree_start_level(rtree, key); + node = init_missing ? rtree_subtree_read(tsdn, rtree, + start_level, dependent) : rtree_subtree_tryread(rtree, + start_level, dependent); + } - node = init_missing ? rtree_subtree_read(tsdn, rtree, start_level, - dependent) : rtree_subtree_tryread(rtree, start_level, dependent); #define RTREE_GET_BIAS (RTREE_HEIGHT_MAX - rtree->height) switch (start_level + RTREE_GET_BIAS) { #define RTREE_GET_SUBTREE(level) \ case level: \ assert(level < (RTREE_HEIGHT_MAX-1)); \ - if (!dependent && unlikely(!rtree_node_valid(node))) \ + if (!dependent && unlikely(!rtree_node_valid(node))) { \ + if (init_missing) \ + rtree_ctx->valid = false; \ return (NULL); \ + } \ subkey = rtree_subkey(rtree, key, level - \ RTREE_GET_BIAS); \ node = init_missing ? rtree_child_read(tsdn, rtree, \ &node[subkey], level - RTREE_GET_BIAS, dependent) : \ rtree_child_tryread(&node[subkey], dependent); \ + if (dependent || init_missing) { \ + rtree_ctx->elms[level - RTREE_GET_BIAS + 1] = \ + node; \ + } \ /* Fall through. */ #define RTREE_GET_LEAF(level) \ case level: \ assert(level == (RTREE_HEIGHT_MAX-1)); \ - if (!dependent && unlikely(!rtree_node_valid(node))) \ + if (!dependent && unlikely(!rtree_node_valid(node))) { \ + if (init_missing) \ + rtree_ctx->valid = false; \ return (NULL); \ + } \ subkey = rtree_subkey(rtree, key, level - \ RTREE_GET_BIAS); \ /* \ * node is a leaf, so it contains values rather than \ * child pointers. \ */ \ - return (&node[subkey]); + node = &node[subkey]; \ + if (dependent || init_missing) { \ + rtree_ctx->elms[level - RTREE_GET_BIAS + 1] = \ + node; \ + } \ + return (node); #if RTREE_HEIGHT_MAX > 1 RTREE_GET_SUBTREE(0) #endif @@ -387,14 +497,15 @@ rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent, } JEMALLOC_INLINE bool -rtree_write(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, const extent_t *extent) +rtree_write(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key, + const extent_t *extent) { rtree_elm_t *elm; assert(extent != NULL); /* Use rtree_clear() for this case. */ assert(((uintptr_t)extent & (uintptr_t)0x1) == (uintptr_t)0x0); - elm = rtree_elm_lookup(tsdn, rtree, key, false, true); + elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, false, true); if (elm == NULL) return (true); assert(rtree_elm_read(elm, false) == NULL); @@ -404,11 +515,12 @@ rtree_write(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, const extent_t *extent) } JEMALLOC_ALWAYS_INLINE extent_t * -rtree_read(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent) +rtree_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key, + bool dependent) { rtree_elm_t *elm; - elm = rtree_elm_lookup(tsdn, rtree, key, dependent, false); + elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, dependent, false); if (elm == NULL) return (NULL); @@ -416,12 +528,13 @@ rtree_read(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent) } JEMALLOC_INLINE rtree_elm_t * -rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent, - bool init_missing) +rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, bool dependent, bool init_missing) { rtree_elm_t *elm; - elm = rtree_elm_lookup(tsdn, rtree, key, dependent, init_missing); + elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, dependent, + init_missing); if (!dependent && elm == NULL) return (NULL); { @@ -481,11 +594,11 @@ rtree_elm_release(tsdn_t *tsdn, const rtree_t *rtree, rtree_elm_t *elm) } JEMALLOC_INLINE void -rtree_clear(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key) +rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key) { rtree_elm_t *elm; - elm = rtree_elm_acquire(tsdn, rtree, key, true, false); + elm = rtree_elm_acquire(tsdn, rtree, rtree_ctx, key, true, false); rtree_elm_write_acquired(tsdn, rtree, elm, NULL); rtree_elm_release(tsdn, rtree, elm); } diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index 988edf5..2355f9c 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -572,6 +572,7 @@ struct tsd_init_head_s { O(narenas_tdata, unsigned, no) \ O(arenas_tdata_bypass, bool, no) \ O(tcache_enabled, tcache_enabled_t, no) \ + O(rtree_ctx, rtree_ctx_t, no) \ O(witnesses, witness_list_t, yes) \ O(rtree_elm_witnesses, rtree_elm_witness_tsd_t,no) \ O(witness_fork, bool, no) \ @@ -588,6 +589,7 @@ struct tsd_init_head_s { 0, \ false, \ tcache_enabled_default, \ + RTREE_CTX_INITIALIZER, \ ql_head_initializer(witnesses), \ RTREE_ELM_WITNESS_TSD_INITIALIZER, \ false \ @@ -651,6 +653,7 @@ MALLOC_TSD tsdn_t *tsdn_fetch(void); bool tsdn_null(const tsdn_t *tsdn); tsd_t *tsdn_tsd(tsdn_t *tsdn); +rtree_ctx_t *tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TSD_C_)) @@ -741,6 +744,22 @@ tsdn_tsd(tsdn_t *tsdn) return (&tsdn->tsd); } + +JEMALLOC_ALWAYS_INLINE rtree_ctx_t * +tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) +{ + + /* + * If tsd cannot be accessed, initialize the fallback rtree_ctx and + * return a pointer to it. + */ + if (unlikely(tsdn_null(tsdn))) { + static const rtree_ctx_t rtree_ctx = RTREE_CTX_INITIALIZER; + memcpy(fallback, &rtree_ctx, sizeof(rtree_ctx_t)); + return (fallback); + } + return (tsd_rtree_ctxp_get(tsdn_tsd(tsdn))); +} #endif #endif /* JEMALLOC_H_INLINES */ |