From 45f087eb033927338b9df847eb9be6886ef48cf7 Mon Sep 17 00:00:00 2001
From: Jason Evans <jasone@canonware.com>
Date: Sun, 16 Apr 2017 09:25:56 -0700
Subject: Revert "Remove BITMAP_USE_TREE."

Some systems use a native 64 KiB page size, which means that the bitmap
for the smallest size class can be 8192 bits, not just 512 bits as when
the page size is 4 KiB.  Linear search in bitmap_{sfu,ffu}() is
unacceptably slow for such large bitmaps.

This reverts commit 7c00f04ff40a34627e31488d02ff1081c749c7ba.
---
 include/jemalloc/internal/bitmap_inlines.h |  95 +++++++++++++++++++++++++
 include/jemalloc/internal/bitmap_structs.h |  11 +++
 include/jemalloc/internal/bitmap_types.h   | 107 +++++++++++++++++++++++++++++
 src/bitmap.c                               |  78 +++++++++++++++++++++
 test/unit/bitmap.c                         |  16 +++++
 5 files changed, 307 insertions(+)

diff --git a/include/jemalloc/internal/bitmap_inlines.h b/include/jemalloc/internal/bitmap_inlines.h
index fc4bad4..c236201 100644
--- a/include/jemalloc/internal/bitmap_inlines.h
+++ b/include/jemalloc/internal/bitmap_inlines.h
@@ -16,6 +16,12 @@ void bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_BITMAP_C_))
 JEMALLOC_INLINE bool
 bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) {
+#ifdef BITMAP_USE_TREE
+	size_t rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
+	bitmap_t rg = bitmap[rgoff];
+	/* The bitmap is full iff the root group is 0. */
+	return (rg == 0);
+#else
 	size_t i;
 
 	for (i = 0; i < binfo->ngroups; i++) {
@@ -24,6 +30,7 @@ bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) {
 		}
 	}
 	return true;
+#endif
 }
 
 JEMALLOC_INLINE bool
@@ -52,6 +59,24 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
 	g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
 	*gp = g;
 	assert(bitmap_get(bitmap, binfo, bit));
+#ifdef BITMAP_USE_TREE
+	/* Propagate group state transitions up the tree. */
+	if (g == 0) {
+		unsigned i;
+		for (i = 1; i < binfo->nlevels; i++) {
+			bit = goff;
+			goff = bit >> LG_BITMAP_GROUP_NBITS;
+			gp = &bitmap[binfo->levels[i].group_offset + goff];
+			g = *gp;
+			assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
+			g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
+			*gp = g;
+			if (g != 0) {
+				break;
+			}
+		}
+	}
+#endif
 }
 
 /* ffu: find first unset >= bit. */
@@ -59,6 +84,44 @@ JEMALLOC_INLINE size_t
 bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) {
 	assert(min_bit < binfo->nbits);
 
+#ifdef BITMAP_USE_TREE
+	size_t bit = 0;
+	for (unsigned level = binfo->nlevels; level--;) {
+		size_t lg_bits_per_group = (LG_BITMAP_GROUP_NBITS * (level +
+		    1));
+		bitmap_t group = bitmap[binfo->levels[level].group_offset + (bit
+		    >> lg_bits_per_group)];
+		unsigned group_nmask = ((min_bit > bit) ? (min_bit - bit) : 0)
+		    >> (lg_bits_per_group - LG_BITMAP_GROUP_NBITS);
+		assert(group_nmask <= BITMAP_GROUP_NBITS);
+		bitmap_t group_mask = ~((1LU << group_nmask) - 1);
+		bitmap_t group_masked = group & group_mask;
+		if (group_masked == 0LU) {
+			if (group == 0LU) {
+				return binfo->nbits;
+			}
+			/*
+			 * min_bit was preceded by one or more unset bits in
+			 * this group, but there are no other unset bits in this
+			 * group.  Try again starting at the first bit of the
+			 * next sibling.  This will recurse at most once per
+			 * non-root level.
+			 */
+			size_t sib_base = bit + (ZU(1) << lg_bits_per_group);
+			assert(sib_base > min_bit);
+			assert(sib_base > bit);
+			if (sib_base >= binfo->nbits) {
+				return binfo->nbits;
+			}
+			return bitmap_ffu(bitmap, binfo, sib_base);
+		}
+		bit += (ffs_lu(group_masked) - 1) << (lg_bits_per_group -
+		    LG_BITMAP_GROUP_NBITS);
+	}
+	assert(bit >= min_bit);
+	assert(bit < binfo->nbits);
+	return bit;
+#else
 	size_t i = min_bit >> LG_BITMAP_GROUP_NBITS;
 	bitmap_t g = bitmap[i] & ~((1LU << (min_bit & BITMAP_GROUP_NBITS_MASK))
 	    - 1);
@@ -72,6 +135,7 @@ bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) {
 		g = bitmap[i];
 	} while (i < binfo->ngroups);
 	return binfo->nbits;
+#endif
 }
 
 /* sfu: set first unset. */
@@ -83,6 +147,16 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) {
 
 	assert(!bitmap_full(bitmap, binfo));
 
+#ifdef BITMAP_USE_TREE
+	i = binfo->nlevels - 1;
+	g = bitmap[binfo->levels[i].group_offset];
+	bit = ffs_lu(g) - 1;
+	while (i > 0) {
+		i--;
+		g = bitmap[binfo->levels[i].group_offset + bit];
+		bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffs_lu(g) - 1);
+	}
+#else
 	i = 0;
 	g = bitmap[0];
 	while ((bit = ffs_lu(g)) == 0) {
@@ -90,6 +164,7 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) {
 		g = bitmap[i];
 	}
 	bit = (i << LG_BITMAP_GROUP_NBITS) + (bit - 1);
+#endif
 	bitmap_set(bitmap, binfo, bit);
 	return bit;
 }
@@ -111,6 +186,26 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
 	g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
 	*gp = g;
 	assert(!bitmap_get(bitmap, binfo, bit));
+#ifdef BITMAP_USE_TREE
+	/* Propagate group state transitions up the tree. */
+	if (propagate) {
+		unsigned i;
+		for (i = 1; i < binfo->nlevels; i++) {
+			bit = goff;
+			goff = bit >> LG_BITMAP_GROUP_NBITS;
+			gp = &bitmap[binfo->levels[i].group_offset + goff];
+			g = *gp;
+			propagate = (g == 0);
+			assert((g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)))
+			    == 0);
+			g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
+			*gp = g;
+			if (!propagate) {
+				break;
+			}
+		}
+	}
+#endif /* BITMAP_USE_TREE */
 }
 
 #endif
diff --git a/include/jemalloc/internal/bitmap_structs.h b/include/jemalloc/internal/bitmap_structs.h
index dde1532..297ae66 100644
--- a/include/jemalloc/internal/bitmap_structs.h
+++ b/include/jemalloc/internal/bitmap_structs.h
@@ -10,8 +10,19 @@ struct bitmap_info_s {
 	/* Logical number of bits in bitmap (stored at bottom level). */
 	size_t nbits;
 
+#ifdef BITMAP_USE_TREE
+	/* Number of levels necessary for nbits. */
+	unsigned nlevels;
+
+	/*
+	 * Only the first (nlevels+1) elements are used, and levels are ordered
+	 * bottom to top (e.g. the bottom level is stored in levels[0]).
+	 */
+	bitmap_level_t levels[BITMAP_MAX_LEVELS+1];
+#else /* BITMAP_USE_TREE */
 	/* Number of groups necessary for nbits. */
 	size_t ngroups;
+#endif /* BITMAP_USE_TREE */
 };
 
 #endif /* JEMALLOC_INTERNAL_BITMAP_STRUCTS_H */
diff --git a/include/jemalloc/internal/bitmap_types.h b/include/jemalloc/internal/bitmap_types.h
index 091ccea..b334769 100644
--- a/include/jemalloc/internal/bitmap_types.h
+++ b/include/jemalloc/internal/bitmap_types.h
@@ -21,10 +21,115 @@ typedef unsigned long bitmap_t;
 #define BITMAP_GROUP_NBITS		(1U << LG_BITMAP_GROUP_NBITS)
 #define BITMAP_GROUP_NBITS_MASK		(BITMAP_GROUP_NBITS-1)
 
+/*
+ * Do some analysis on how big the bitmap is before we use a tree.  For a brute
+ * force linear search, if we would have to call ffs_lu() more than 2^3 times,
+ * use a tree instead.
+ */
+#if LG_BITMAP_MAXBITS - LG_BITMAP_GROUP_NBITS > 3
+#  define BITMAP_USE_TREE
+#endif
+
 /* Number of groups required to store a given number of bits. */
 #define BITMAP_BITS2GROUPS(nbits)					\
     (((nbits) + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS)
 
+/*
+ * Number of groups required at a particular level for a given number of bits.
+ */
+#define BITMAP_GROUPS_L0(nbits)						\
+    BITMAP_BITS2GROUPS(nbits)
+#define BITMAP_GROUPS_L1(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits))
+#define BITMAP_GROUPS_L2(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))
+#define BITMAP_GROUPS_L3(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(		\
+	BITMAP_BITS2GROUPS((nbits)))))
+#define BITMAP_GROUPS_L4(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(		\
+	BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))))
+
+/*
+ * Assuming the number of levels, number of groups required for a given number
+ * of bits.
+ */
+#define BITMAP_GROUPS_1_LEVEL(nbits)					\
+    BITMAP_GROUPS_L0(nbits)
+#define BITMAP_GROUPS_2_LEVEL(nbits)					\
+    (BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits))
+#define BITMAP_GROUPS_3_LEVEL(nbits)					\
+    (BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits))
+#define BITMAP_GROUPS_4_LEVEL(nbits)					\
+    (BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits))
+#define BITMAP_GROUPS_5_LEVEL(nbits)					\
+    (BITMAP_GROUPS_4_LEVEL(nbits) + BITMAP_GROUPS_L4(nbits))
+
+/*
+ * Maximum number of groups required to support LG_BITMAP_MAXBITS.
+ */
+#ifdef BITMAP_USE_TREE
+
+#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_1_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_2_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_3_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_4_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 5
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_5_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_5_LEVEL(BITMAP_MAXBITS)
+#else
+#  error "Unsupported bitmap size"
+#endif
+
+/*
+ * Maximum number of levels possible.  This could be statically computed based
+ * on LG_BITMAP_MAXBITS:
+ *
+ * #define BITMAP_MAX_LEVELS \
+ *     (LG_BITMAP_MAXBITS / LG_BITMAP_GROUP_NBITS) \
+ *     + !!(LG_BITMAP_MAXBITS % LG_BITMAP_GROUP_NBITS)
+ *
+ * However, that would not allow the generic BITMAP_INFO_INITIALIZER() macro, so
+ * instead hardcode BITMAP_MAX_LEVELS to the largest number supported by the
+ * various cascading macros.  The only additional cost this incurs is some
+ * unused trailing entries in bitmap_info_t structures; the bitmaps themselves
+ * are not impacted.
+ */
+#define BITMAP_MAX_LEVELS	5
+
+#define BITMAP_INFO_INITIALIZER(nbits) {				\
+	/* nbits. */							\
+	nbits,								\
+	/* nlevels. */							\
+	(BITMAP_GROUPS_L0(nbits) > BITMAP_GROUPS_L1(nbits)) +		\
+	    (BITMAP_GROUPS_L1(nbits) > BITMAP_GROUPS_L2(nbits)) +	\
+	    (BITMAP_GROUPS_L2(nbits) > BITMAP_GROUPS_L3(nbits)) +	\
+	    (BITMAP_GROUPS_L3(nbits) > BITMAP_GROUPS_L4(nbits)) + 1,	\
+	/* levels. */							\
+	{								\
+		{0},							\
+		{BITMAP_GROUPS_L0(nbits)},				\
+		{BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)},	\
+		{BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits) +	\
+		    BITMAP_GROUPS_L0(nbits)},				\
+		{BITMAP_GROUPS_L3(nbits) + BITMAP_GROUPS_L2(nbits) +	\
+		    BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)},	\
+		{BITMAP_GROUPS_L4(nbits) + BITMAP_GROUPS_L3(nbits) +	\
+		     BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits)	\
+		     + BITMAP_GROUPS_L0(nbits)}				\
+	}								\
+}
+
+#else /* BITMAP_USE_TREE */
+
 #define BITMAP_GROUPS(nbits)	BITMAP_BITS2GROUPS(nbits)
 #define BITMAP_GROUPS_MAX	BITMAP_BITS2GROUPS(BITMAP_MAXBITS)
 
@@ -35,4 +140,6 @@ typedef unsigned long bitmap_t;
 	BITMAP_BITS2GROUPS(nbits)					\
 }
 
+#endif /* BITMAP_USE_TREE */
+
 #endif /* JEMALLOC_INTERNAL_BITMAP_TYPES_H */
diff --git a/src/bitmap.c b/src/bitmap.c
index 275636b..468b317 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -6,6 +6,82 @@
 
 /******************************************************************************/
 
+#ifdef BITMAP_USE_TREE
+
+void
+bitmap_info_init(bitmap_info_t *binfo, size_t nbits) {
+	unsigned i;
+	size_t group_count;
+
+	assert(nbits > 0);
+	assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS));
+
+	/*
+	 * Lay out level 0 (the nbits logical bits), then add levels above it,
+	 * each with one bit per group below, until a level fits in one group.
+	 * levels[nlevels].group_offset ends up as the total group count.
+	 */
+	binfo->levels[0].group_offset = 0;
+	group_count = BITMAP_BITS2GROUPS(nbits);
+	for (i = 1; group_count > 1; i++) {
+		assert(i < BITMAP_MAX_LEVELS);
+		binfo->levels[i].group_offset = binfo->levels[i-1].group_offset
+		    + group_count;
+		group_count = BITMAP_BITS2GROUPS(group_count);
+	}
+	binfo->levels[i].group_offset = binfo->levels[i-1].group_offset
+	    + group_count;
+	assert(binfo->levels[i].group_offset <= BITMAP_GROUPS_MAX);
+	binfo->nlevels = i;
+	binfo->nbits = nbits;
+}
+
+static size_t
+bitmap_info_ngroups(const bitmap_info_t *binfo) {
+	return binfo->levels[binfo->nlevels].group_offset;
+}
+
+void
+bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill) {
+	size_t extra;
+	unsigned i;
+
+	/*
+	 * Storage is inverted relative to the external interface: a stored 1
+	 * bit means "unset", which lets ffs_lu() locate the first unset bit.
+	 */
+
+	if (fill) {
+		/* The "filled" bitmap starts out with all 0 bits. */
+		memset(bitmap, 0, bitmap_size(binfo));
+		return;
+	}
+
+	/*
+	 * The "empty" bitmap starts out with all 1 bits, except for trailing
+	 * unused bits (if any).  Note that each group uses bit 0 to correspond
+	 * to the first logical bit in the group, so extra bits are the most
+	 * significant bits of the last group.
+	 */
+	memset(bitmap, 0xffU, bitmap_size(binfo));
+	extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK))
+	    & BITMAP_GROUP_NBITS_MASK;
+	if (extra != 0) {
+		bitmap[binfo->levels[1].group_offset - 1] >>= extra;
+	}
+	for (i = 1; i < binfo->nlevels; i++) {
+		size_t group_count = binfo->levels[i].group_offset -
+		    binfo->levels[i-1].group_offset;
+		extra = (BITMAP_GROUP_NBITS - (group_count &
+		    BITMAP_GROUP_NBITS_MASK)) & BITMAP_GROUP_NBITS_MASK;
+		if (extra != 0) {
+			bitmap[binfo->levels[i+1].group_offset - 1] >>= extra;
+		}
+	}
+}
+
+#else /* BITMAP_USE_TREE */
+
 void
 bitmap_info_init(bitmap_info_t *binfo, size_t nbits) {
 	assert(nbits > 0);
@@ -37,6 +113,8 @@ bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill) {
 	}
 }
 
+#endif /* BITMAP_USE_TREE */
+
 size_t
 bitmap_size(const bitmap_info_t *binfo) {
 	return (bitmap_info_ngroups(binfo) << LG_SIZEOF_BITMAP);
diff --git a/test/unit/bitmap.c b/test/unit/bitmap.c
index f65ed53..cafb203 100644
--- a/test/unit/bitmap.c
+++ b/test/unit/bitmap.c
@@ -103,8 +103,24 @@ test_bitmap_initializer_body(const bitmap_info_t *binfo, size_t nbits) {
 	assert_zu_eq(binfo->nbits, binfo_dyn.nbits,
 	    "Unexpected difference between static and dynamic initialization, "
 	    "nbits=%zu", nbits);
+#ifdef BITMAP_USE_TREE
+	assert_u_eq(binfo->nlevels, binfo_dyn.nlevels,
+	    "Unexpected difference between static and dynamic initialization, "
+	    "nbits=%zu", nbits);
+	{
+		unsigned i;
+		/* levels[nlevels] holds the total group count; check it too. */
+		for (i = 0; i <= binfo->nlevels; i++) {
+			assert_zu_eq(binfo->levels[i].group_offset,
+			    binfo_dyn.levels[i].group_offset,
+			    "Unexpected difference between static and dynamic "
+			    "initialization, nbits=%zu, level=%u", nbits, i);
+		}
+	}
+#else
 	assert_zu_eq(binfo->ngroups, binfo_dyn.ngroups,
 	    "Unexpected difference between static and dynamic initialization");
+#endif
 }
 
 TEST_BEGIN(test_bitmap_initializer) {
-- 
cgit v0.12