author     Jason Evans <jasone@canonware.com>    2017-03-20 23:38:21 (GMT)
committer  Jason Evans <jasone@canonware.com>    2017-03-23 01:33:32 (GMT)
commit     0ee0e0c155a05d0d028a9972ad86b9eaac4ccabd (patch)
tree       2bbd812dc4d63b4eae1fba1db567d7a0b2062b93
parent     ce41ab0c57d5f0c9310200d0fecea99ef334a834 (diff)
Implement compact rtree leaf element representation.
If a single virtual address pointer has enough unused bits to pack {szind_t, extent_t *, bool, bool}, use a single pointer-sized field in each rtree leaf element, rather than using three separate fields. This has little impact on access speed (fewer loads/stores, but more bit twiddling), except that denser representation increases TLB effectiveness.
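To make the packing concrete, here is a standalone C sketch of the scheme the message describes, mirroring the bit layout used in rtree_inlines.h below. It assumes a 64-bit build with LG_VADDR = 48 significant virtual address bits; the pack()/unpack_*() names and the szind_t typedef are illustrative only, not jemalloc APIs.

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LG_VADDR   48               /* significant VA bits (assumed) */
#define RTREE_NHIB (64 - LG_VADDR)  /* unused high bits in a pointer */

typedef unsigned szind_t;

/*
 * szind in the bits above LG_VADDR, the extent pointer in the low LG_VADDR
 * bits (its low bits are known-zero because extents are aligned), slab in
 * bit 1, lock in bit 0:  00000000 xxxxxxxx eeeeeeee [...] eeeeeeee eeee00bk
 */
static uintptr_t
pack(szind_t szind, void *extent, bool slab, bool locked) {
	return ((uintptr_t)szind << LG_VADDR) |
	    ((uintptr_t)extent & (((uintptr_t)1 << LG_VADDR) - 1)) |
	    ((uintptr_t)slab << 1) | (uintptr_t)locked;
}

static szind_t
unpack_szind(uintptr_t bits) {
	return (szind_t)(bits >> LG_VADDR);
}

static void *
unpack_extent(uintptr_t bits) {
	/* Restore sign-extended high bits, then mask the slab and lock bits. */
	return (void *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB) >>
	    RTREE_NHIB) & ~(uintptr_t)0x3);
}

static bool
unpack_slab(uintptr_t bits) {
	return (bool)((bits >> 1) & 1);
}

static bool
unpack_locked(uintptr_t bits) {
	return (bool)(bits & 1);
}

int
main(void) {
	/* Any suitably aligned address works; use a static object. */
	static long long dummy_extent;
	uintptr_t bits = pack(42, &dummy_extent, true, false);

	assert(unpack_szind(bits) == 42);
	assert(unpack_extent(bits) == (void *)&dummy_extent);
	assert(unpack_slab(bits) && !unpack_locked(bits));
	printf("packed: 0x%016jx\n", (uintmax_t)bits);
	return 0;
}
```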
-rw-r--r--  include/jemalloc/internal/private_symbols.txt     5
-rw-r--r--  include/jemalloc/internal/rtree_inlines.h        127
-rw-r--r--  include/jemalloc/internal/rtree_structs.h         19
-rw-r--r--  include/jemalloc/internal/rtree_types.h            4
-rwxr-xr-x  include/jemalloc/internal/size_classes.sh         15
5 files changed, 163 insertions, 7 deletions
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 169e7d1..35c7028 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -419,6 +419,11 @@ rtree_extent_szind_read
rtree_leaf_alloc
rtree_leaf_dalloc
rtree_leaf_elm_acquire
+rtree_leaf_elm_bits_extent_get
+rtree_leaf_elm_bits_locked_get
+rtree_leaf_elm_bits_read
+rtree_leaf_elm_bits_slab_get
+rtree_leaf_elm_bits_szind_get
rtree_leaf_elm_extent_read
rtree_leaf_elm_extent_write
rtree_leaf_elm_lookup
diff --git a/include/jemalloc/internal/rtree_inlines.h b/include/jemalloc/internal/rtree_inlines.h
index 9c337b8..6f92df9 100644
--- a/include/jemalloc/internal/rtree_inlines.h
+++ b/include/jemalloc/internal/rtree_inlines.h
@@ -4,6 +4,14 @@
#ifndef JEMALLOC_ENABLE_INLINE
uintptr_t rtree_leafkey(uintptr_t key);
uintptr_t rtree_subkey(uintptr_t key, unsigned level);
+# ifdef RTREE_LEAF_COMPACT
+uintptr_t rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree,
+ rtree_leaf_elm_t *elm, bool acquired, bool dependent);
+extent_t *rtree_leaf_elm_bits_extent_get(uintptr_t bits);
+szind_t rtree_leaf_elm_bits_szind_get(uintptr_t bits);
+bool rtree_leaf_elm_bits_slab_get(uintptr_t bits);
+bool rtree_leaf_elm_bits_locked_get(uintptr_t bits);
+# endif
extent_t *rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree,
rtree_leaf_elm_t *elm, bool acquired, bool dependent);
szind_t rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree,
@@ -75,6 +83,42 @@ rtree_subkey(uintptr_t key, unsigned level) {
* dependent on a previous rtree write, which means a stale read
* could result if synchronization were omitted here.
*/
+# ifdef RTREE_LEAF_COMPACT
+JEMALLOC_ALWAYS_INLINE uintptr_t
+rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
+ bool acquired, bool dependent) {
+ if (config_debug && acquired) {
+ assert(dependent);
+ rtree_leaf_elm_witness_access(tsdn, rtree, elm);
+ }
+
+ return (uintptr_t)atomic_load_p(&elm->le_bits, dependent
+ ? ATOMIC_RELAXED : ATOMIC_ACQUIRE);
+}
+
+JEMALLOC_ALWAYS_INLINE extent_t *
+rtree_leaf_elm_bits_extent_get(uintptr_t bits) {
+ /* Restore sign-extended high bits, mask slab and lock bits. */
+ return (extent_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB) >>
+ RTREE_NHIB) & ~((uintptr_t)0x3));
+}
+
+JEMALLOC_ALWAYS_INLINE szind_t
+rtree_leaf_elm_bits_szind_get(uintptr_t bits) {
+ return (szind_t)(bits >> LG_VADDR);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+rtree_leaf_elm_bits_slab_get(uintptr_t bits) {
+ return (bool)((bits >> 1) & (uintptr_t)0x1);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+rtree_leaf_elm_bits_locked_get(uintptr_t bits) {
+ return (bool)(bits & (uintptr_t)0x1);
+}
+# endif
+
JEMALLOC_ALWAYS_INLINE extent_t *
rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool acquired, bool dependent) {
@@ -83,6 +127,12 @@ rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
+#ifdef RTREE_LEAF_COMPACT
+ uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, acquired,
+ dependent);
+ assert(!acquired || rtree_leaf_elm_bits_locked_get(bits));
+ return rtree_leaf_elm_bits_extent_get(bits);
+#else
extent_t *extent = (extent_t *)atomic_load_p(&elm->le_extent, dependent
? ATOMIC_RELAXED : ATOMIC_ACQUIRE);
assert(!acquired || ((uintptr_t)extent & (uintptr_t)0x1) ==
@@ -90,6 +140,7 @@ rtree_leaf_elm_extent_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
/* Mask lock bit. */
extent = (extent_t *)((uintptr_t)extent & ~((uintptr_t)0x1));
return extent;
+#endif
}
JEMALLOC_ALWAYS_INLINE szind_t
@@ -100,8 +151,15 @@ rtree_leaf_elm_szind_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
+#ifdef RTREE_LEAF_COMPACT
+ uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, acquired,
+ dependent);
+ assert(!acquired || rtree_leaf_elm_bits_locked_get(bits));
+ return rtree_leaf_elm_bits_szind_get(bits);
+#else
return (szind_t)atomic_load_u(&elm->le_szind, dependent ? ATOMIC_RELAXED
: ATOMIC_ACQUIRE);
+#endif
}
JEMALLOC_ALWAYS_INLINE bool
@@ -112,8 +170,15 @@ rtree_leaf_elm_slab_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
+#ifdef RTREE_LEAF_COMPACT
+ uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, acquired,
+ dependent);
+ assert(!acquired || rtree_leaf_elm_bits_locked_get(bits));
+ return rtree_leaf_elm_bits_slab_get(bits);
+#else
return atomic_load_b(&elm->le_slab, dependent ? ATOMIC_RELAXED :
ATOMIC_ACQUIRE);
+#endif
}
JEMALLOC_INLINE void
@@ -124,11 +189,21 @@ rtree_leaf_elm_extent_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
assert(((uintptr_t)extent & (uintptr_t)0x1) == (uintptr_t)0x0);
+#ifdef RTREE_LEAF_COMPACT
+ uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
+ acquired, acquired);
+ uintptr_t bits = ((uintptr_t)rtree_leaf_elm_bits_szind_get(old_bits) <<
+ LG_VADDR) | ((uintptr_t)extent & (((uintptr_t)0x1 << LG_VADDR) - 1))
+ | ((uintptr_t)rtree_leaf_elm_bits_slab_get(old_bits) << 1) |
+ (uintptr_t)acquired;
+ atomic_store_p(&elm->le_bits, (void *)bits, ATOMIC_RELEASE);
+#else
if (acquired) {
/* Overlay lock bit. */
extent = (extent_t *)((uintptr_t)extent | (uintptr_t)0x1);
}
atomic_store_p(&elm->le_extent, extent, ATOMIC_RELEASE);
+#endif
}
JEMALLOC_INLINE void
@@ -139,7 +214,18 @@ rtree_leaf_elm_szind_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
}
assert(szind <= NSIZES);
+#ifdef RTREE_LEAF_COMPACT
+ uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
+ acquired, acquired);
+ uintptr_t bits = ((uintptr_t)szind << LG_VADDR) |
+ ((uintptr_t)rtree_leaf_elm_bits_extent_get(old_bits) &
+ (((uintptr_t)0x1 << LG_VADDR) - 1)) |
+ ((uintptr_t)rtree_leaf_elm_bits_slab_get(old_bits) << 1) |
+ (uintptr_t)acquired;
+ atomic_store_p(&elm->le_bits, (void *)bits, ATOMIC_RELEASE);
+#else
atomic_store_u(&elm->le_szind, szind, ATOMIC_RELEASE);
+#endif
}
JEMALLOC_INLINE void
@@ -149,12 +235,35 @@ rtree_leaf_elm_slab_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
rtree_leaf_elm_witness_access(tsdn, rtree, elm);
}
+#ifdef RTREE_LEAF_COMPACT
+ uintptr_t old_bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm,
+ acquired, acquired);
+ uintptr_t bits = ((uintptr_t)rtree_leaf_elm_bits_szind_get(old_bits) <<
+ LG_VADDR) | ((uintptr_t)rtree_leaf_elm_bits_extent_get(old_bits) &
+ (((uintptr_t)0x1 << LG_VADDR) - 1)) | ((uintptr_t)slab << 1) |
+ (uintptr_t)acquired;
+ atomic_store_p(&elm->le_bits, (void *)bits, ATOMIC_RELEASE);
+#else
atomic_store_b(&elm->le_slab, slab, ATOMIC_RELEASE);
+#endif
}
JEMALLOC_INLINE void
rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
bool acquired, extent_t *extent, szind_t szind, bool slab) {
+#ifdef RTREE_LEAF_COMPACT
+ if (config_debug && acquired) {
+ rtree_leaf_elm_witness_access(tsdn, rtree, elm);
+ }
+ assert(!slab || szind < NBINS);
+
+ uintptr_t bits = ((uintptr_t)szind << LG_VADDR) |
+ ((uintptr_t)extent & (((uintptr_t)0x1 << LG_VADDR) - 1)) |
+ ((uintptr_t)slab << 1) |
+ (uintptr_t)acquired;
+
+ atomic_store_p(&elm->le_bits, (void *)bits, ATOMIC_RELEASE);
+#else
rtree_leaf_elm_slab_write(tsdn, rtree, elm, acquired, slab);
rtree_leaf_elm_szind_write(tsdn, rtree, elm, acquired, szind);
/*
@@ -162,6 +271,7 @@ rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm,
* as soon as the extent field is non-NULL.
*/
rtree_leaf_elm_extent_write(tsdn, rtree, elm, acquired, extent);
+#endif
}
JEMALLOC_ALWAYS_INLINE rtree_leaf_elm_t *
@@ -317,19 +427,24 @@ rtree_leaf_elm_acquire(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx,
spin_t spinner = SPIN_INITIALIZER;
while (true) {
/* The least significant bit serves as a lock. */
- void *extent_and_lock = atomic_load_p(&elm->le_extent,
+#ifdef RTREE_LEAF_COMPACT
+# define RTREE_FIELD_WITH_LOCK le_bits
+#else
+# define RTREE_FIELD_WITH_LOCK le_extent
+#endif
+ void *bits = atomic_load_p(&elm->RTREE_FIELD_WITH_LOCK,
ATOMIC_RELAXED);
- if (likely(((uintptr_t)extent_and_lock & (uintptr_t)0x1) == 0))
- {
- void *locked = (void *)((uintptr_t)extent_and_lock
- | (uintptr_t)0x1);
+ if (likely(((uintptr_t)bits & (uintptr_t)0x1) == 0)) {
+ void *locked = (void *)((uintptr_t)bits |
+ (uintptr_t)0x1);
if (likely(atomic_compare_exchange_strong_p(
- &elm->le_extent, &extent_and_lock, locked,
+ &elm->RTREE_FIELD_WITH_LOCK, &bits, locked,
ATOMIC_ACQUIRE, ATOMIC_RELAXED))) {
break;
}
}
spin_adaptive(&spinner);
+#undef RTREE_FIELD_WITH_LOCK
}
if (config_debug) {
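The rtree_leaf_elm_acquire() loop above is the classic "low bit as a spinlock" idiom: spin until a compare-and-exchange sets the least significant bit of whichever word now carries the lock. A minimal sketch in plain C11 atomics (jemalloc uses its own atomic_*_p wrappers and spin_adaptive(), not reproduced here; the elm_lock/elm_unlock names are illustrative, and in the patch itself the bit is cleared by rewriting the element through the *_write() helpers):

```c
#include <stdatomic.h>
#include <stdint.h>

static void
elm_lock(_Atomic(uintptr_t) *bits) {
	for (;;) {
		uintptr_t cur = atomic_load_explicit(bits,
		    memory_order_relaxed);
		if ((cur & (uintptr_t)0x1) == 0 &&
		    atomic_compare_exchange_weak_explicit(bits, &cur,
		    cur | (uintptr_t)0x1, memory_order_acquire,
		    memory_order_relaxed)) {
			return;  /* Lock bit set; element is acquired. */
		}
		/* Stand-in for spin_adaptive(): real code backs off here. */
	}
}

static void
elm_unlock(_Atomic(uintptr_t) *bits) {
	/* Simplified counterpart: clear bit 0 with a release store. */
	uintptr_t cur = atomic_load_explicit(bits, memory_order_relaxed);
	atomic_store_explicit(bits, cur & ~(uintptr_t)0x1,
	    memory_order_release);
}

int
main(void) {
	_Atomic(uintptr_t) bits = 0;
	elm_lock(&bits);
	/* ... read/modify the packed fields while bit 0 is held ... */
	elm_unlock(&bits);
	return 0;
}
```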
diff --git a/include/jemalloc/internal/rtree_structs.h b/include/jemalloc/internal/rtree_structs.h
index 3ecdf81..8dd9cda 100644
--- a/include/jemalloc/internal/rtree_structs.h
+++ b/include/jemalloc/internal/rtree_structs.h
@@ -6,9 +6,26 @@ struct rtree_node_elm_s {
};
struct rtree_leaf_elm_s {
- atomic_p_t le_extent; /* (extent_t *) */
+#ifdef RTREE_LEAF_COMPACT
+ /*
+ * Single pointer-width field containing all three leaf element fields.
+ * For example, on a 64-bit x64 system with 48 significant virtual
+ * memory address bits, the index, extent, and slab fields are packed as
+ * such:
+ *
+ * x: index
+ * e: extent
+ * b: slab
+ * k: lock
+ *
+ * 00000000 xxxxxxxx eeeeeeee [...] eeeeeeee eeee00bk
+ */
+ atomic_p_t le_bits;
+#else
+ atomic_p_t le_extent; /* (extent_t *), lock in low bit */
atomic_u_t le_szind; /* (szind_t) */
atomic_b_t le_slab; /* (bool) */
+#endif
};
struct rtree_leaf_elm_witness_s {
diff --git a/include/jemalloc/internal/rtree_types.h b/include/jemalloc/internal/rtree_types.h
index 18fc5b0..de3893b 100644
--- a/include/jemalloc/internal/rtree_types.h
+++ b/include/jemalloc/internal/rtree_types.h
@@ -33,6 +33,10 @@ typedef struct rtree_s rtree_t;
#else
# error Unsupported number of significant virtual address bits
#endif
+/* Use compact leaf representation if virtual address encoding allows. */
+#if RTREE_NHIB >= LG_CEIL_NSIZES
+# define RTREE_LEAF_COMPACT
+#endif
/*
* Number of leafkey/leaf pairs to cache. Each entry supports an entire leaf,
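As a quick sanity check of the new gate: the compact layout is only enabled when a pointer has at least as many unused high bits as are needed to store a size-class index. The numbers below are illustrative for a common x86-64 configuration; the real LG_VADDR and LG_CEIL_NSIZES values come from configure and size_classes.sh respectively.

```c
/* Illustrative values only: a 64-bit pointer with 48 significant VA bits
 * leaves 16 unused high bits, while a size-class index for a few hundred
 * size classes needs on the order of 8 bits. */
#define LG_VADDR        48
#define RTREE_NHIB      (64 - LG_VADDR)   /* unused high pointer bits */
#define LG_CEIL_NSIZES  8                 /* bits needed for a szind_t */

#if RTREE_NHIB >= LG_CEIL_NSIZES          /* 16 >= 8: compact layout fits */
#  define RTREE_LEAF_COMPACT
#endif
```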
diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh
index 06892d8..60bdbd2 100755
--- a/include/jemalloc/internal/size_classes.sh
+++ b/include/jemalloc/internal/size_classes.sh
@@ -40,6 +40,17 @@ lg() {
done
}
+lg_ceil() {
+ y=$1
+ lg ${y}; lg_floor=${lg_result}
+ pow2 ${lg_floor}; pow2_floor=${pow2_result}
+ if [ ${pow2_floor} -lt ${y} ] ; then
+ lg_ceil_result=$((${lg_floor} + 1))
+ else
+ lg_ceil_result=${lg_floor}
+ fi
+}
+
reg_size_compute() {
lg_grp=$1
lg_delta=$2
@@ -246,12 +257,14 @@ size_classes() {
done
echo
nsizes=${index}
+ lg_ceil ${nsizes}; lg_ceil_nsizes=${lg_ceil_result}
# Defined upon completion:
# - ntbins
# - nlbins
# - nbins
# - nsizes
+ # - lg_ceil_nsizes
# - npsizes
# - lg_tiny_maxclass
# - lookup_maxclass
@@ -286,6 +299,7 @@ cat <<EOF
* NLBINS: Number of bins supported by the lookup table.
* NBINS: Number of small size class bins.
* NSIZES: Number of size classes.
+ * LG_CEIL_NSIZES: Number of bits required to store NSIZES.
* NPSIZES: Number of size classes that are a multiple of (1U << LG_PAGE).
* LG_TINY_MAXCLASS: Lg of maximum tiny size class.
* LOOKUP_MAXCLASS: Maximum size class included in lookup table.
@@ -311,6 +325,7 @@ for lg_z in ${lg_zarr} ; do
echo "#define NLBINS ${nlbins}"
echo "#define NBINS ${nbins}"
echo "#define NSIZES ${nsizes}"
+ echo "#define LG_CEIL_NSIZES ${lg_ceil_nsizes}"
echo "#define NPSIZES ${npsizes}"
echo "#define LG_TINY_MAXCLASS ${lg_tiny_maxclass}"
echo "#define LOOKUP_MAXCLASS ${lookup_maxclass}"