From 9c0549007dcb64f4ff35d37390a9a6a8d3cea880 Mon Sep 17 00:00:00 2001
From: David Goldblatt <davidgoldblatt@fb.com>
Date: Fri, 11 Aug 2017 17:34:21 -0700
Subject: Make arena stats collection go through cache bins.

This eliminates the need for the arena stats code to "know" about tcaches; all
it needs is a cache_bin_array_descriptor_t telling it where to find the
cache_bins whose stats it should aggregate.
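
To make the shape of the change concrete, here is a minimal standalone
sketch of the pattern (hypothetical demo_* names throughout; a plain next
pointer stands in for the ql.h list macros, and the bin counts are shrunk
to toy constants):

    /* Sketch only: demo_* stand-ins, not the real jemalloc types. */
    #include <stddef.h>
    #include <stdio.h>

    #define DEMO_NBINS  4   /* stand-in for NBINS */
    #define DEMO_NHBINS 6   /* stand-in for nhbins */

    typedef struct demo_cache_bin_s {
        int ncached;    /* cached allocations in this bin */
    } demo_cache_bin_t;

    /* All the arena ever sees: where one thread's bins live. */
    typedef struct demo_descriptor_s {
        struct demo_descriptor_s *next;
        demo_cache_bin_t *bins_small;
        demo_cache_bin_t *bins_large;
    } demo_descriptor_t;

    /* The tcache embeds a descriptor; the arena never sees the tcache. */
    typedef struct demo_tcache_s {
        demo_cache_bin_t bins_small[DEMO_NBINS];
        demo_descriptor_t descriptor;
        demo_cache_bin_t bins_large[DEMO_NHBINS - DEMO_NBINS];
    } demo_tcache_t;

    /* Arena-side merge: walks descriptors only, no tcache_t in sight. */
    static int
    demo_merge_ncached(demo_descriptor_t *head) {
        int total = 0;
        for (demo_descriptor_t *d = head; d != NULL; d = d->next) {
            for (int i = 0; i < DEMO_NBINS; i++) {
                total += d->bins_small[i].ncached;
            }
            for (int i = DEMO_NBINS; i < DEMO_NHBINS; i++) {
                total += d->bins_large[i - DEMO_NBINS].ncached;
            }
        }
        return total;
    }

    int
    main(void) {
        demo_tcache_t t = {{{1}, {2}, {3}, {4}}, {NULL, NULL, NULL},
            {{5}, {6}}};
        t.descriptor.bins_small = t.bins_small;
        t.descriptor.bins_large = t.bins_large;
        printf("%d\n", demo_merge_ncached(&t.descriptor)); /* 21 */
        return 0;
    }

The real patch does the same walk in arena_stats_merge(), with ql_foreach
over arena->cache_bin_array_descriptor_ql.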
---
 include/jemalloc/internal/arena_structs_b.h | 11 ++++----
 include/jemalloc/internal/cache_bin.h       | 34 ++++++++++++++++++++++++-
 include/jemalloc/internal/tcache_structs.h  | 39 +++++++++++++++++++++++------
 src/arena.c                                 |  8 +++---
 src/tcache.c                                |  9 +++++++
 5 files changed, 84 insertions(+), 17 deletions(-)

diff --git a/include/jemalloc/internal/arena_structs_b.h b/include/jemalloc/internal/arena_structs_b.h
index d1fffec..c4e4310 100644
--- a/include/jemalloc/internal/arena_structs_b.h
+++ b/include/jemalloc/internal/arena_structs_b.h
@@ -162,14 +162,15 @@ struct arena_s {
 	arena_stats_t		stats;
 
 	/*
-	 * List of tcaches for extant threads associated with this arena.
-	 * Stats from these are merged incrementally, and at exit if
-	 * opt_stats_print is enabled.
+	 * Lists of tcaches and cache_bin_array_descriptors for extant threads
+	 * associated with this arena.  Stats from these are merged
+	 * incrementally, and at exit if opt_stats_print is enabled.
 	 *
 	 * Synchronization: tcache_ql_mtx.
 	 */
-	ql_head(tcache_t)	tcache_ql;
-	malloc_mutex_t		tcache_ql_mtx;
+	ql_head(tcache_t)			tcache_ql;
+	ql_head(cache_bin_array_descriptor_t)	cache_bin_array_descriptor_ql;
+	malloc_mutex_t				tcache_ql_mtx;
 
 	/* Synchronization: internal. */
 	prof_accum_t		prof_accum;
diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h
index 37025b5..85d9de0 100644
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@@ -1,6 +1,19 @@
 #ifndef JEMALLOC_INTERNAL_CACHE_BIN_H
 #define JEMALLOC_INTERNAL_CACHE_BIN_H
 
+#include "jemalloc/internal/ql.h"
+
+/*
+ * The cache_bins are the mechanism that the tcache and the arena use to
+ * communicate.  The tcache fills from and flushes to the arena by passing a
+ * cache_bin_t to fill/flush.  When the arena needs to pull stats from the
+ * tcaches associated with it, it does so by iterating over its
+ * cache_bin_array_descriptor_t objects and reading out the per-bin stats
+ * they describe.  This way, the arena need not know that the tcache exists
+ * at all.
+ */
+
+
 /*
  * The count of the number of cached allocations in a bin.  We make this signed
  * so that negative numbers can encode "invalid" states (e.g. a low water mark
@@ -51,6 +64,26 @@ struct cache_bin_s {
 	void **avail;
 };
 
+typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t;
+struct cache_bin_array_descriptor_s {
+	/*
+	 * The arena keeps a list of the cache bins associated with it, for
+	 * stats collection.
+	 */
+	ql_elm(cache_bin_array_descriptor_t) link;
+	/* Pointers to the tcache bins. */
+	cache_bin_t *bins_small;
+	cache_bin_t *bins_large;
+};
+
+static inline void
+cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
+    cache_bin_t *bins_small, cache_bin_t *bins_large) {
+	ql_elm_new(descriptor, link);
+	descriptor->bins_small = bins_small;
+	descriptor->bins_large = bins_large;
+}
+
 JEMALLOC_ALWAYS_INLINE void *
 cache_alloc_easy(cache_bin_t *bin, bool *success) {
 	void *ret;
@@ -76,7 +109,6 @@ cache_alloc_easy(cache_bin_t *bin, bool *success) {
 	}
 
 	return ret;
-
 }
 
 #endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
diff --git a/include/jemalloc/internal/tcache_structs.h b/include/jemalloc/internal/tcache_structs.h
index ad0fe66..07b7387 100644
--- a/include/jemalloc/internal/tcache_structs.h
+++ b/include/jemalloc/internal/tcache_structs.h
@@ -7,21 +7,46 @@
 #include "jemalloc/internal/ticker.h"
 
 struct tcache_s {
-	/* Data accessed frequently first: prof, ticker and small bins. */
-	uint64_t	prof_accumbytes;/* Cleared after arena_prof_accum(). */
-	ticker_t	gc_ticker;	/* Drives incremental GC. */
+	/*
+	 * To minimize our cache footprint, we put the frequently accessed data
+	 * together at the start of this struct.
+	 */
+
+	/* Cleared after arena_prof_accum(). */
+	uint64_t	prof_accumbytes;
+	/* Drives incremental GC. */
+	ticker_t	gc_ticker;
 	/*
 	 * The pointer stacks associated with bins follow as a contiguous array.
 	 * During tcache initialization, the avail pointer in each element of
 	 * tbins is initialized to point to the proper offset within this array.
 	 */
 	cache_bin_t	bins_small[NBINS];
-	/* Data accessed less often below. */
-	ql_elm(tcache_t) link;		/* Used for aggregating stats. */
-	arena_t		*arena;		/* Associated arena. */
-	szind_t		next_gc_bin;	/* Next bin to GC. */
+
+	/*
+	 * This data is less hot; we can be a little less careful with our
+	 * footprint here.
+	 */
+	/* Lets us track all the tcaches in an arena. */
+	ql_elm(tcache_t) link;
+	/*
+	 * The descriptor lets the arena find our cache bins without seeing the
+	 * tcache definition.  This lets arenas aggregate stats across tcaches
+	 * without depending on the tcache type at all.
+	 */
+	cache_bin_array_descriptor_t cache_bin_array_descriptor;
+
+	/* The arena this tcache is associated with. */
+	arena_t		*arena;
+	/* Next bin to GC. */
+	szind_t		next_gc_bin;
 	/* For small bins, fill (ncached_max >> lg_fill_div). */
 	uint8_t		lg_fill_div[NBINS];
+	/*
+	 * We put the cache bins for large size classes at the end of the
+	 * struct, since some of them might never get used.  Keeping them at
+	 * the end may let us avoid touching an extra page entirely.
+	 */
 	cache_bin_t	bins_large[NSIZES-NBINS];
 };
 
diff --git a/src/arena.c b/src/arena.c
index 60b482e..19aafaf 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -303,16 +303,16 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
 	/* tcache_bytes counts currently cached bytes. */
 	atomic_store_zu(&astats->tcache_bytes, 0, ATOMIC_RELAXED);
 	malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
-	tcache_t *tcache;
-	ql_foreach(tcache, &arena->tcache_ql, link) {
+	cache_bin_array_descriptor_t *descriptor;
+	ql_foreach(descriptor, &arena->cache_bin_array_descriptor_ql, link) {
 		szind_t i = 0;
 		for (; i < NBINS; i++) {
-			cache_bin_t *tbin = tcache_small_bin_get(tcache, i);
+			cache_bin_t *tbin = &descriptor->bins_small[i];
 			arena_stats_accum_zu(&astats->tcache_bytes,
 			    tbin->ncached * sz_index2size(i));
 		}
 		for (; i < nhbins; i++) {
-			cache_bin_t *tbin = tcache_large_bin_get(tcache, i);
+			cache_bin_t *tbin = &descriptor->bins_large[i - NBINS];
 			arena_stats_accum_zu(&astats->tcache_bytes,
 			    tbin->ncached * sz_index2size(i));
 		}
diff --git a/src/tcache.c b/src/tcache.c
index 7d32d4d..e22f806 100644
--- a/src/tcache.c
+++ b/src/tcache.c
@@ -291,8 +291,15 @@ tcache_arena_associate(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) {
 	if (config_stats) {
 		/* Link into list of extant tcaches. */
 		malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx);
+
 		ql_elm_new(tcache, link);
 		ql_tail_insert(&arena->tcache_ql, tcache, link);
+		cache_bin_array_descriptor_init(
+		    &tcache->cache_bin_array_descriptor, tcache->bins_small,
+		    tcache->bins_large);
+		ql_tail_insert(&arena->cache_bin_array_descriptor_ql,
+		    &tcache->cache_bin_array_descriptor, link);
+
 		malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx);
 	}
 }
@@ -316,6 +323,8 @@ tcache_arena_dissociate(tsdn_t *tsdn, tcache_t *tcache) {
 			assert(in_ql);
 		}
 		ql_remove(&arena->tcache_ql, tcache, link);
+		ql_remove(&arena->cache_bin_array_descriptor_ql,
+		    &tcache->cache_bin_array_descriptor, link);
 		tcache_stats_merge(tsdn, tcache, arena);
 		malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx);
 	}
-- 
cgit v0.12