path: root/include
author     Jason Evans <jasone@canonware.com>    2016-06-03 01:43:10 (GMT)
committer  Jason Evans <jasone@canonware.com>    2016-06-06 03:59:57 (GMT)
commit     6f29a8392403f70bfa1080964a65540b6f3699fe (patch)
tree       60dcedc3dc0b77c6e3280979ae5a5a267834de4e /include
parent     7be2ebc23f0f145e095e7230d7d8a202b8dcc55e (diff)
Add rtree lookup path caching.
rtree-based extent lookups remain more expensive than chunk-based run lookups, but with this optimization the fast path slowdown is ~3 CPU cycles per metadata lookup (on Intel Core i7-4980HQ), versus ~11 cycles prior. The path caching speedup tends to degrade gracefully unless allocated memory is spread far apart (as is the case when using a mixture of sbrk() and mmap()).
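
To make the optimization concrete, here is a minimal standalone sketch of the caching idea (toy types and names, not the jemalloc code in the diff below): a per-thread context remembers the key and the leaf element of the most recent lookup, so an exact repeat lookup costs only a compare and a load instead of a full radix-tree walk.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define TOY_BITS_PER_LEVEL 8
#define TOY_FANOUT (1U << TOY_BITS_PER_LEVEL)

/* Toy two-level radix tree over 16-bit keys. */
typedef struct {
    void **slots[TOY_FANOUT];  /* Root level: pointers to leaf arrays. */
} toy_rtree_t;

/* Per-thread lookup cache, analogous in spirit to rtree_ctx_t. */
typedef struct {
    bool valid;     /* Has any lookup populated this cache yet? */
    uint16_t key;   /* Key of the most recent lookup. */
    void **elm;     /* Leaf slot that the key mapped to. */
} toy_rtree_ctx_t;

/* Slow path: walk both levels, creating the leaf if needed, then memoize. */
static void **
toy_rtree_lookup_hard(toy_rtree_t *rtree, toy_rtree_ctx_t *ctx, uint16_t key)
{
    unsigned i0 = key >> TOY_BITS_PER_LEVEL;
    unsigned i1 = key & (TOY_FANOUT - 1);
    void **leaf = rtree->slots[i0];

    if (leaf == NULL) {
        leaf = calloc(TOY_FANOUT, sizeof(void *));
        if (leaf == NULL)
            return (NULL);
        rtree->slots[i0] = leaf;
    }
    ctx->valid = true;
    ctx->key = key;
    ctx->elm = &leaf[i1];
    return (ctx->elm);
}

/* Fast path: an exact key match avoids the tree walk entirely. */
static void **
toy_rtree_lookup(toy_rtree_t *rtree, toy_rtree_ctx_t *ctx, uint16_t key)
{
    if (ctx->valid && key == ctx->key)
        return (ctx->elm);
    return (toy_rtree_lookup_hard(rtree, ctx, key));
}

int
main(void)
{
    static toy_rtree_t rtree;  /* Zero-initialized root level. */
    toy_rtree_ctx_t ctx = {false, 0, NULL};
    int obj;
    void **elm;

    elm = toy_rtree_lookup(&rtree, &ctx, 0x1234);  /* Full walk. */
    if (elm != NULL)
        *elm = &obj;
    elm = toy_rtree_lookup(&rtree, &ctx, 0x1234);  /* Served from the cache. */
    return (elm != NULL && *elm == &obj ? 0 : 1);
}

The actual change goes further: rtree_ctx_t records the whole traversal path in elms[], so a new key that merely shares a prefix with the cached key can restart partway down the tree rather than at the root (see rtree_ctx_start_level() in the rtree.h diff below).
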
Diffstat (limited to 'include')
-rw-r--r--   include/jemalloc/internal/extent.h                5
-rw-r--r--   include/jemalloc/internal/private_symbols.txt     5
-rw-r--r--   include/jemalloc/internal/rtree.h               183
-rw-r--r--   include/jemalloc/internal/tsd.h                  19
4 files changed, 176 insertions, 36 deletions
diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h
index a41a15f..8b8dbe8 100644
--- a/include/jemalloc/internal/extent.h
+++ b/include/jemalloc/internal/extent.h
@@ -181,8 +181,11 @@ void extent_ring_remove(extent_t *extent);
JEMALLOC_INLINE extent_t *
extent_lookup(tsdn_t *tsdn, const void *ptr, bool dependent)
{
+ rtree_ctx_t rtree_ctx_fallback;
+ rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
- return (rtree_read(tsdn, &extents_rtree, (uintptr_t)ptr, dependent));
+ return (rtree_read(tsdn, &extents_rtree, rtree_ctx, (uintptr_t)ptr,
+ dependent));
}
JEMALLOC_INLINE arena_t *
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index d4e5525..07e7f28 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -399,6 +399,7 @@ rtree_child_read
rtree_child_read_hard
rtree_child_tryread
rtree_clear
+rtree_ctx_start_level
rtree_delete
rtree_elm_acquire
rtree_elm_lookup
@@ -502,6 +503,9 @@ tsd_nominal
tsd_prof_tdata_get
tsd_prof_tdata_set
tsd_prof_tdatap_get
+tsd_rtree_ctx_get
+tsd_rtree_ctx_set
+tsd_rtree_ctxp_get
tsd_rtree_elm_witnesses_get
tsd_rtree_elm_witnesses_set
tsd_rtree_elm_witnessesp_get
@@ -529,6 +533,7 @@ tsd_witnesses_set
tsd_witnessesp_get
tsdn_fetch
tsdn_null
+tsdn_rtree_ctx
tsdn_tsd
witness_assert_lockless
witness_assert_not_owner
diff --git a/include/jemalloc/internal/rtree.h b/include/jemalloc/internal/rtree.h
index a47a79e..fc88dfe 100644
--- a/include/jemalloc/internal/rtree.h
+++ b/include/jemalloc/internal/rtree.h
@@ -10,6 +10,7 @@ typedef struct rtree_elm_s rtree_elm_t;
typedef struct rtree_elm_witness_s rtree_elm_witness_t;
typedef struct rtree_elm_witness_tsd_s rtree_elm_witness_tsd_t;
typedef struct rtree_level_s rtree_level_t;
+typedef struct rtree_ctx_s rtree_ctx_t;
typedef struct rtree_s rtree_t;
/*
@@ -25,6 +26,13 @@ typedef struct rtree_s rtree_t;
/* Used for two-stage lock-free node initialization. */
#define RTREE_NODE_INITIALIZING ((rtree_elm_t *)0x1)
+#define RTREE_CTX_INITIALIZER { \
+ false, \
+ 0, \
+ 0, \
+ {NULL /* C initializes all trailing elements to NULL. */} \
+}
+
/*
* Maximum number of concurrently acquired elements per thread. This controls
* how many witness_t structures are embedded in tsd. Ideally rtree_elm_t would
@@ -78,9 +86,9 @@ struct rtree_level_s {
*
* Suppose keys comprise 48 bits, and LG_RTREE_BITS_PER_LEVEL is 4.
* This results in a 3-level tree, and the leftmost leaf can be directly
- * accessed via subtrees[2], the subtree prefixed by 0x0000 (excluding
- * 0x00000000) can be accessed via subtrees[1], and the remainder of the
- * tree can be accessed via subtrees[0].
+ * accessed via levels[2], the subtree prefixed by 0x0000 (excluding
+ * 0x00000000) can be accessed via levels[1], and the remainder of the
+ * tree can be accessed via levels[0].
*
* levels[0] : [<unused> | 0x0001******** | 0x0002******** | ...]
*
@@ -90,7 +98,7 @@ struct rtree_level_s {
*
* This has practical implications on x64, which currently uses only the
* lower 47 bits of virtual address space in userland, thus leaving
- * subtrees[0] unused and avoiding a level of tree traversal.
+ * levels[0] unused and avoiding a level of tree traversal.
*/
union {
void *subtree_pun;
@@ -105,13 +113,31 @@ struct rtree_level_s {
unsigned cumbits;
};
+struct rtree_ctx_s {
+ /* If false, key/elms have not yet been initialized by a lookup. */
+ bool valid;
+ /* Key that corresponds to the tree path recorded in elms. */
+ uintptr_t key;
+ /* Memoized rtree_start_level(key). */
+ unsigned start_level;
+ /*
+ * A path through rtree, driven by key. Only elements that could
+ * actually be used for subsequent lookups are initialized, i.e. if
+ * start_level = rtree_start_level(key) is non-zero, the first
+ * start_level elements are uninitialized. The last element contains a
+ * pointer to the leaf node element that corresponds to key, so that
+ * exact matches require no tree node offset computation.
+ */
+ rtree_elm_t *elms[RTREE_HEIGHT_MAX + 1];
+};
+
struct rtree_s {
unsigned height;
/*
* Precomputed table used to convert from the number of leading 0 key
* bits to which subtree level to start at.
*/
- unsigned start_level[RTREE_HEIGHT_MAX];
+ unsigned start_level[RTREE_HEIGHT_MAX + 1];
rtree_level_t levels[RTREE_HEIGHT_MAX];
};
@@ -143,7 +169,9 @@ void rtree_elm_witness_release(tsdn_t *tsdn, const rtree_t *rtree,
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
-unsigned rtree_start_level(rtree_t *rtree, uintptr_t key);
+unsigned rtree_start_level(const rtree_t *rtree, uintptr_t key);
+unsigned rtree_ctx_start_level(const rtree_t *rtree,
+ const rtree_ctx_t *rtree_ctx, uintptr_t key);
uintptr_t rtree_subkey(rtree_t *rtree, uintptr_t key, unsigned level);
bool rtree_node_valid(rtree_elm_t *node);
@@ -156,33 +184,55 @@ rtree_elm_t *rtree_subtree_tryread(rtree_t *rtree, unsigned level,
bool dependent);
rtree_elm_t *rtree_subtree_read(tsdn_t *tsdn, rtree_t *rtree,
unsigned level, bool dependent);
-rtree_elm_t *rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key,
- bool dependent, bool init_missing);
-
-bool rtree_write(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key,
- const extent_t *extent);
-extent_t *rtree_read(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key,
- bool dependent);
-rtree_elm_t *rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key,
- bool dependent, bool init_missing);
+rtree_elm_t *rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree,
+ rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, bool init_missing);
+
+bool rtree_write(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+ uintptr_t key, const extent_t *extent);
+extent_t *rtree_read(tsdn_t *tsdn, rtree_t *rtree,
+ rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent);
+rtree_elm_t *rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree,
+ rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, bool init_missing);
extent_t *rtree_elm_read_acquired(tsdn_t *tsdn, const rtree_t *rtree,
rtree_elm_t *elm);
void rtree_elm_write_acquired(tsdn_t *tsdn, const rtree_t *rtree,
rtree_elm_t *elm, const extent_t *extent);
void rtree_elm_release(tsdn_t *tsdn, const rtree_t *rtree, rtree_elm_t *elm);
-void rtree_clear(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key);
+void rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+ uintptr_t key);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_))
JEMALLOC_ALWAYS_INLINE unsigned
-rtree_start_level(rtree_t *rtree, uintptr_t key)
+rtree_start_level(const rtree_t *rtree, uintptr_t key)
{
unsigned start_level;
if (unlikely(key == 0))
return (rtree->height - 1);
- start_level = rtree->start_level[lg_floor(key) >>
+ start_level = rtree->start_level[(lg_floor(key) + 1) >>
+ LG_RTREE_BITS_PER_LEVEL];
+ assert(start_level < rtree->height);
+ return (start_level);
+}
+
+JEMALLOC_ALWAYS_INLINE unsigned
+rtree_ctx_start_level(const rtree_t *rtree, const rtree_ctx_t *rtree_ctx,
+ uintptr_t key)
+{
+ unsigned start_level;
+ uintptr_t key_diff;
+
+ /* Compute the difference between old and new lookup keys. */
+ key_diff = key ^ rtree_ctx->key;
+ assert(key_diff != 0); /* Handled in rtree_elm_lookup(). */
+
+ /*
+ * Compute the last traversal path element at which the keys' paths
+ * are the same.
+ */
+ start_level = rtree->start_level[(lg_floor(key_diff) + 1) >>
LG_RTREE_BITS_PER_LEVEL];
assert(start_level < rtree->height);
return (start_level);
@@ -291,8 +341,8 @@ rtree_subtree_read(tsdn_t *tsdn, rtree_t *rtree, unsigned level, bool dependent)
}
JEMALLOC_ALWAYS_INLINE rtree_elm_t *
-rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent,
- bool init_missing)
+rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+ uintptr_t key, bool dependent, bool init_missing)
{
uintptr_t subkey;
unsigned start_level;
@@ -300,35 +350,95 @@ rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent,
assert(!dependent || !init_missing);
- start_level = rtree_start_level(rtree, key);
+ if (dependent || init_missing) {
+ if (likely(rtree_ctx->valid)) {
+ if (key == rtree_ctx->key)
+ return (rtree_ctx->elms[rtree->height]);
+ else {
+ unsigned no_ctx_start_level =
+ rtree_start_level(rtree, key);
+ unsigned ctx_start_level;
+
+ if (likely(no_ctx_start_level <=
+ rtree_ctx->start_level && (ctx_start_level =
+ rtree_ctx_start_level(rtree, rtree_ctx,
+ key)) >= rtree_ctx->start_level)) {
+ start_level = ctx_start_level;
+ node = rtree_ctx->elms[ctx_start_level];
+ } else {
+ start_level = no_ctx_start_level;
+ node = init_missing ?
+ rtree_subtree_read(tsdn, rtree,
+ no_ctx_start_level, dependent) :
+ rtree_subtree_tryread(rtree,
+ no_ctx_start_level, dependent);
+ rtree_ctx->start_level =
+ no_ctx_start_level;
+ rtree_ctx->elms[no_ctx_start_level] =
+ node;
+ }
+ }
+ } else {
+ unsigned no_ctx_start_level = rtree_start_level(rtree,
+ key);
+
+ start_level = no_ctx_start_level;
+ node = init_missing ? rtree_subtree_read(tsdn, rtree,
+ no_ctx_start_level, dependent) :
+ rtree_subtree_tryread(rtree, no_ctx_start_level,
+ dependent);
+ rtree_ctx->valid = true;
+ rtree_ctx->start_level = no_ctx_start_level;
+ rtree_ctx->elms[no_ctx_start_level] = node;
+ }
+ rtree_ctx->key = key;
+ } else {
+ start_level = rtree_start_level(rtree, key);
+ node = init_missing ? rtree_subtree_read(tsdn, rtree,
+ start_level, dependent) : rtree_subtree_tryread(rtree,
+ start_level, dependent);
+ }
- node = init_missing ? rtree_subtree_read(tsdn, rtree, start_level,
- dependent) : rtree_subtree_tryread(rtree, start_level, dependent);
#define RTREE_GET_BIAS (RTREE_HEIGHT_MAX - rtree->height)
switch (start_level + RTREE_GET_BIAS) {
#define RTREE_GET_SUBTREE(level) \
case level: \
assert(level < (RTREE_HEIGHT_MAX-1)); \
- if (!dependent && unlikely(!rtree_node_valid(node))) \
+ if (!dependent && unlikely(!rtree_node_valid(node))) { \
+ if (init_missing) \
+ rtree_ctx->valid = false; \
return (NULL); \
+ } \
subkey = rtree_subkey(rtree, key, level - \
RTREE_GET_BIAS); \
node = init_missing ? rtree_child_read(tsdn, rtree, \
&node[subkey], level - RTREE_GET_BIAS, dependent) : \
rtree_child_tryread(&node[subkey], dependent); \
+ if (dependent || init_missing) { \
+ rtree_ctx->elms[level - RTREE_GET_BIAS + 1] = \
+ node; \
+ } \
/* Fall through. */
#define RTREE_GET_LEAF(level) \
case level: \
assert(level == (RTREE_HEIGHT_MAX-1)); \
- if (!dependent && unlikely(!rtree_node_valid(node))) \
+ if (!dependent && unlikely(!rtree_node_valid(node))) { \
+ if (init_missing) \
+ rtree_ctx->valid = false; \
return (NULL); \
+ } \
subkey = rtree_subkey(rtree, key, level - \
RTREE_GET_BIAS); \
/* \
* node is a leaf, so it contains values rather than \
* child pointers. \
*/ \
- return (&node[subkey]);
+ node = &node[subkey]; \
+ if (dependent || init_missing) { \
+ rtree_ctx->elms[level - RTREE_GET_BIAS + 1] = \
+ node; \
+ } \
+ return (node);
#if RTREE_HEIGHT_MAX > 1
RTREE_GET_SUBTREE(0)
#endif
@@ -387,14 +497,15 @@ rtree_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent,
}
JEMALLOC_INLINE bool
-rtree_write(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, const extent_t *extent)
+rtree_write(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key,
+ const extent_t *extent)
{
rtree_elm_t *elm;
assert(extent != NULL); /* Use rtree_clear() for this case. */
assert(((uintptr_t)extent & (uintptr_t)0x1) == (uintptr_t)0x0);
- elm = rtree_elm_lookup(tsdn, rtree, key, false, true);
+ elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, false, true);
if (elm == NULL)
return (true);
assert(rtree_elm_read(elm, false) == NULL);
@@ -404,11 +515,12 @@ rtree_write(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, const extent_t *extent)
}
JEMALLOC_ALWAYS_INLINE extent_t *
-rtree_read(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent)
+rtree_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key,
+ bool dependent)
{
rtree_elm_t *elm;
- elm = rtree_elm_lookup(tsdn, rtree, key, dependent, false);
+ elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, dependent, false);
if (elm == NULL)
return (NULL);
@@ -416,12 +528,13 @@ rtree_read(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent)
}
JEMALLOC_INLINE rtree_elm_t *
-rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key, bool dependent,
- bool init_missing)
+rtree_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
+ uintptr_t key, bool dependent, bool init_missing)
{
rtree_elm_t *elm;
- elm = rtree_elm_lookup(tsdn, rtree, key, dependent, init_missing);
+ elm = rtree_elm_lookup(tsdn, rtree, rtree_ctx, key, dependent,
+ init_missing);
if (!dependent && elm == NULL)
return (NULL);
{
@@ -481,11 +594,11 @@ rtree_elm_release(tsdn_t *tsdn, const rtree_t *rtree, rtree_elm_t *elm)
}
JEMALLOC_INLINE void
-rtree_clear(tsdn_t *tsdn, rtree_t *rtree, uintptr_t key)
+rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key)
{
rtree_elm_t *elm;
- elm = rtree_elm_acquire(tsdn, rtree, key, true, false);
+ elm = rtree_elm_acquire(tsdn, rtree, rtree_ctx, key, true, false);
rtree_elm_write_acquired(tsdn, rtree, elm, NULL);
rtree_elm_release(tsdn, rtree, elm);
}
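
The rtree_ctx_start_level() fast path above hinges on a small bit trick: XORing the new key with the cached key clears their common prefix, so the position of the highest remaining set bit tells how far down the cached path is still valid. A rough standalone illustration, assuming a hypothetical 4-bits-per-level tree over 16-bit keys and using a direct formula in place of jemalloc's precomputed start_level table and height bias:

#include <stdint.h>
#include <stdio.h>

#define TOY_LG_BITS_PER_LEVEL 2  /* 4 key bits per level. */
#define TOY_KEY_BITS 16
#define TOY_HEIGHT (TOY_KEY_BITS >> TOY_LG_BITS_PER_LEVEL)  /* 4 levels. */

/* Index of the highest set bit; x must be nonzero. */
static unsigned
toy_lg_floor(uint16_t x)
{
    unsigned r = 0;

    while (x >>= 1)
        r++;
    return (r);
}

/*
 * Level at which the paths for key and cached_key diverge (0 = root,
 * TOY_HEIGHT-1 = leaf). XOR clears the shared prefix, so the longer the
 * common prefix, the deeper the cached path remains usable.
 */
static unsigned
toy_divergence_level(uint16_t key, uint16_t cached_key)
{
    uint16_t key_diff = key ^ cached_key;  /* Assumed nonzero, as in the diff. */

    return (TOY_HEIGHT - 1 - (toy_lg_floor(key_diff) >> TOY_LG_BITS_PER_LEVEL));
}

int
main(void)
{
    /* Top three nibbles match: only the leaf level must be re-walked. */
    printf("%u\n", toy_divergence_level(0x1234, 0x1237));  /* Prints 3. */
    /* Top nibble differs: restart from the root. */
    printf("%u\n", toy_divergence_level(0x1234, 0xf234));  /* Prints 0. */
    return (0);
}

In the real lookup, the cached elms[] entry at that level becomes the starting node, so only the levels below the divergence point are re-walked.
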
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 988edf5..2355f9c 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -572,6 +572,7 @@ struct tsd_init_head_s {
O(narenas_tdata, unsigned, no) \
O(arenas_tdata_bypass, bool, no) \
O(tcache_enabled, tcache_enabled_t, no) \
+ O(rtree_ctx, rtree_ctx_t, no) \
O(witnesses, witness_list_t, yes) \
O(rtree_elm_witnesses, rtree_elm_witness_tsd_t,no) \
O(witness_fork, bool, no) \
@@ -588,6 +589,7 @@ struct tsd_init_head_s {
0, \
false, \
tcache_enabled_default, \
+ RTREE_CTX_INITIALIZER, \
ql_head_initializer(witnesses), \
RTREE_ELM_WITNESS_TSD_INITIALIZER, \
false \
@@ -651,6 +653,7 @@ MALLOC_TSD
tsdn_t *tsdn_fetch(void);
bool tsdn_null(const tsdn_t *tsdn);
tsd_t *tsdn_tsd(tsdn_t *tsdn);
+rtree_ctx_t *tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_TSD_C_))
@@ -741,6 +744,22 @@ tsdn_tsd(tsdn_t *tsdn)
return (&tsdn->tsd);
}
+
+JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
+tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback)
+{
+
+ /*
+ * If tsd cannot be accessed, initialize the fallback rtree_ctx and
+ * return a pointer to it.
+ */
+ if (unlikely(tsdn_null(tsdn))) {
+ static const rtree_ctx_t rtree_ctx = RTREE_CTX_INITIALIZER;
+ memcpy(fallback, &rtree_ctx, sizeof(rtree_ctx_t));
+ return (fallback);
+ }
+ return (tsd_rtree_ctxp_get(tsdn_tsd(tsdn)));
+}
#endif
#endif /* JEMALLOC_H_INLINES */