From 47b20bb6544de9cdd4ca7ab870d6ad257c0ce4ff Mon Sep 17 00:00:00 2001
From: Qi Wang <interwq@gwu.edu>
Date: Thu, 24 Aug 2017 14:29:28 -0700
Subject: Change opt.metadata_thp to [disabled,auto,always].

To avoid the high RSS caused by THP + low usage arena (i.e. THP becomes a
significant percentage), added a new "auto" option which will only start using
THP after a base allocator used up the first THP region.  Starting from the
second hugepage (in a single arena), "auto" behaves the same as "always",
i.e. madvise hugepage right away.
---
 doc/jemalloc.xml.in                      | 12 +++++----
 include/jemalloc/internal/base_externs.h |  3 ++-
 include/jemalloc/internal/base_inlines.h |  4 +++
 include/jemalloc/internal/base_types.h   | 17 +++++++++++-
 src/base.c                               | 46 +++++++++++++++++++++++---------
 src/ctl.c                                |  3 ++-
 src/jemalloc.c                           | 18 ++++++++++++-
 src/pages.c                              |  2 +-
 src/stats.c                              |  2 +-
 test/unit/mallctl.c                      |  2 +-
 10 files changed, 84 insertions(+), 25 deletions(-)

diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in
index f1712f0..0c95604 100644
--- a/doc/jemalloc.xml.in
+++ b/doc/jemalloc.xml.in
@@ -919,13 +919,15 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
       <varlistentry id="opt.metadata_thp">
         <term>
           <mallctl>opt.metadata_thp</mallctl>
-          (<type>bool</type>)
+          (<type>const char *</type>)
           <literal>r-</literal>
         </term>
-        <listitem><para>If true, allow jemalloc to use transparent huge page
-        (THP) for internal metadata (see <link
-        linkend="stats.metadata">stats.metadata</link> for details).  This
-        option is disabled by default.</para></listitem>
+        <listitem><para>Controls whether to allow jemalloc to use transparent
+        huge page (THP) for internal metadata (see <link
+        linkend="stats.metadata">stats.metadata</link>).  <quote>always</quote>
+        allows such usage.  <quote>auto</quote> uses no THP initially, but may
+        begin to do so when metadata usage reaches certain level.  The default
+        is <quote>disabled</quote>.</para></listitem>
       </varlistentry>
 
       <varlistentry id="opt.retain">
diff --git a/include/jemalloc/internal/base_externs.h b/include/jemalloc/internal/base_externs.h
index a5cb8a8..6cd1187 100644
--- a/include/jemalloc/internal/base_externs.h
+++ b/include/jemalloc/internal/base_externs.h
@@ -1,7 +1,8 @@
 #ifndef JEMALLOC_INTERNAL_BASE_EXTERNS_H
 #define JEMALLOC_INTERNAL_BASE_EXTERNS_H
 
-extern bool opt_metadata_thp;
+extern metadata_thp_mode_t opt_metadata_thp;
+extern const char *metadata_thp_mode_names[];
 
 base_t *b0get(void);
 base_t *base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks);
diff --git a/include/jemalloc/internal/base_inlines.h b/include/jemalloc/internal/base_inlines.h
index 931560b..aec0e2e 100644
--- a/include/jemalloc/internal/base_inlines.h
+++ b/include/jemalloc/internal/base_inlines.h
@@ -6,4 +6,8 @@ base_ind_get(const base_t *base) {
 	return base->ind;
 }
 
+static inline bool
+metadata_thp_enabled(void) {
+	return (opt_metadata_thp != metadata_thp_disabled);
+}
 #endif /* JEMALLOC_INTERNAL_BASE_INLINES_H */
diff --git a/include/jemalloc/internal/base_types.h b/include/jemalloc/internal/base_types.h
index 6e71033..97e38a9 100644
--- a/include/jemalloc/internal/base_types.h
+++ b/include/jemalloc/internal/base_types.h
@@ -4,6 +4,21 @@
 typedef struct base_block_s base_block_t;
 typedef struct base_s base_t;
 
-#define METADATA_THP_DEFAULT false
+#define METADATA_THP_DEFAULT metadata_thp_disabled
+
+typedef enum {
+	metadata_thp_disabled   = 0,
+	/*
+	 * Lazily enable hugepage for metadata. To avoid high RSS caused by THP
+	 * + low usage arena (i.e. THP becomes a significant percentage), the
+	 * "auto" option only starts using THP after a base allocator used up
+	 * the first THP region.  Starting from the second hugepage (in a single
+	 * arena), "auto" behaves the same as "always", i.e. madvise hugepage
+	 * right away.
+	 */
+	metadata_thp_auto       = 1,
+	metadata_thp_always     = 2,
+	metadata_thp_mode_limit = 3
+} metadata_thp_mode_t;
 
 #endif /* JEMALLOC_INTERNAL_BASE_TYPES_H */
diff --git a/src/base.c b/src/base.c
index 9925978..9cb02b6 100644
--- a/src/base.c
+++ b/src/base.c
@@ -12,7 +12,13 @@
 
 static base_t *b0;
 
-bool opt_metadata_thp = METADATA_THP_DEFAULT;
+metadata_thp_mode_t opt_metadata_thp = METADATA_THP_DEFAULT;
+
+const char *metadata_thp_mode_names[] = {
+	"disabled",
+	"auto",
+	"always"
+};
 
 /******************************************************************************/
 
@@ -24,7 +30,7 @@ base_map(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, size_t size)
 
 	/* We use hugepage sizes regardless of opt_metadata_thp. */
 	assert(size == HUGEPAGE_CEILING(size));
-	size_t alignment = opt_metadata_thp ? HUGEPAGE : PAGE;
+	size_t alignment = metadata_thp_enabled() ? HUGEPAGE : PAGE;
 	if (extent_hooks == &extent_hooks_default) {
 		addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit);
 	} else {
@@ -36,12 +42,6 @@ base_map(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, size_t size)
 		post_reentrancy(tsd);
 	}
 
-	if (addr != NULL && opt_metadata_thp && thp_state_madvise) {
-		assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 &&
-		    (size & HUGEPAGE_MASK) == 0);
-		pages_huge(addr, size);
-	}
-
 	return addr;
 }
 
@@ -101,7 +101,7 @@ base_unmap(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind, void *addr,
 		post_reentrancy(tsd);
 	}
 label_done:
-	if (opt_metadata_thp && thp_state_madvise) {
+	if (metadata_thp_enabled() && thp_state_madvise) {
 		/* Set NOHUGEPAGE after unmap to avoid kernel defrag. */
 		assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 &&
 		    (size & HUGEPAGE_MASK) == 0);
@@ -181,8 +181,8 @@ base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, extent_t *extent,
  * On success a pointer to the initialized base_block_t header is returned.
  */
 static base_block_t *
-base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind,
-    pszind_t *pind_last, size_t *extent_sn_next, size_t size,
+base_block_alloc(tsdn_t *tsdn, base_t *base, extent_hooks_t *extent_hooks,
+    unsigned ind, pszind_t *pind_last, size_t *extent_sn_next, size_t size,
     size_t alignment) {
 	alignment = ALIGNMENT_CEILING(alignment, QUANTUM);
 	size_t usize = ALIGNMENT_CEILING(size, alignment);
@@ -208,6 +208,26 @@ base_block_alloc(tsdn_t *tsdn, extent_hooks_t *extent_hooks, unsigned ind,
 	if (block == NULL) {
 		return NULL;
 	}
+
+	if (metadata_thp_enabled() && thp_state_madvise) {
+		void *addr = (void *)block;
+		assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 &&
+		    (block_size & HUGEPAGE_MASK) == 0);
+		/* base == NULL indicates this is a new base. */
+		if (base != NULL || opt_metadata_thp == metadata_thp_always) {
+			/* Use hugepage for the new block. */
+			pages_huge(addr, block_size);
+		}
+		if (base != NULL && opt_metadata_thp == metadata_thp_auto) {
+			/* Make the first block THP lazily. */
+			base_block_t *first_block = base->blocks;
+			if (first_block->next == NULL) {
+				assert((first_block->size & HUGEPAGE_MASK) == 0);
+				pages_huge(first_block, first_block->size);
+			}
+		}
+	}
+
 	*pind_last = sz_psz2ind(block_size);
 	block->size = block_size;
 	block->next = NULL;
@@ -231,7 +251,7 @@ base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) {
 	 * called.
 	 */
 	malloc_mutex_unlock(tsdn, &base->mtx);
-	base_block_t *block = base_block_alloc(tsdn, extent_hooks,
+	base_block_t *block = base_block_alloc(tsdn, base, extent_hooks,
 	    base_ind_get(base), &base->pind_last, &base->extent_sn_next, size,
 	    alignment);
 	malloc_mutex_lock(tsdn, &base->mtx);
@@ -259,7 +279,7 @@ base_t *
 base_new(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) {
 	pszind_t pind_last = 0;
 	size_t extent_sn_next = 0;
-	base_block_t *block = base_block_alloc(tsdn, extent_hooks, ind,
+	base_block_t *block = base_block_alloc(tsdn, NULL, extent_hooks, ind,
 	    &pind_last, &extent_sn_next, sizeof(base_t), QUANTUM);
 	if (block == NULL) {
 		return NULL;
diff --git a/src/ctl.c b/src/ctl.c
index c299103..ace10b0 100644
--- a/src/ctl.c
+++ b/src/ctl.c
@@ -1570,7 +1570,8 @@ CTL_RO_CONFIG_GEN(config_xmalloc, bool)
 
 CTL_RO_NL_GEN(opt_abort, opt_abort, bool)
 CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool)
-CTL_RO_NL_GEN(opt_metadata_thp, opt_metadata_thp, bool)
+CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp],
+    const char *)
 CTL_RO_NL_GEN(opt_retain, opt_retain, bool)
 CTL_RO_NL_GEN(opt_dss, opt_dss, const char *)
 CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned)
diff --git a/src/jemalloc.c b/src/jemalloc.c
index cbae259..3c0ea7d 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1055,7 +1055,23 @@ malloc_conf_init(void) {
 			if (opt_abort_conf && had_conf_error) {
 				malloc_abort_invalid_conf();
 			}
-			CONF_HANDLE_BOOL(opt_metadata_thp, "metadata_thp")
+			if (strncmp("metadata_thp", k, klen) == 0) {
+				int i;
+				bool match = false;
+				for (i = 0; i < metadata_thp_mode_limit; i++) {
+					if (strncmp(metadata_thp_mode_names[i],
+					    v, vlen) == 0) {
+						opt_metadata_thp = i;
+						match = true;
+						break;
+					}
+				}
+				if (!match) {
+					malloc_conf_error("Invalid conf value",
+					    k, klen, v, vlen);
+				}
+				continue;
+			}
 			CONF_HANDLE_BOOL(opt_retain, "retain")
 			if (strncmp("dss", k, klen) == 0) {
 				int i;
diff --git a/src/pages.c b/src/pages.c
index 70f1fd3..4ca3107 100644
--- a/src/pages.c
+++ b/src/pages.c
@@ -418,7 +418,7 @@ os_overcommits_proc(void) {
 static void
 init_thp_state(void) {
 	if (!have_madvise_huge) {
-		if (opt_metadata_thp && opt_abort) {
+		if (metadata_thp_enabled() && opt_abort) {
 			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
 			abort();
 		}
diff --git a/src/stats.c b/src/stats.c
index 746cc42..e1a3f8c 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -802,11 +802,11 @@ stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque,
 	}
 	OPT_WRITE_BOOL(abort, ",")
 	OPT_WRITE_BOOL(abort_conf, ",")
-	OPT_WRITE_BOOL(metadata_thp, ",")
 	OPT_WRITE_BOOL(retain, ",")
 	OPT_WRITE_CHAR_P(dss, ",")
 	OPT_WRITE_UNSIGNED(narenas, ",")
 	OPT_WRITE_CHAR_P(percpu_arena, ",")
+	OPT_WRITE_CHAR_P(metadata_thp, ",")
 	OPT_WRITE_BOOL_MUTABLE(background_thread, background_thread, ",")
 	OPT_WRITE_SSIZE_T_MUTABLE(dirty_decay_ms, arenas.dirty_decay_ms, ",")
 	OPT_WRITE_SSIZE_T_MUTABLE(muzzy_decay_ms, arenas.muzzy_decay_ms, ",")
diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c
index 0b14e78..5612cce 100644
--- a/test/unit/mallctl.c
+++ b/test/unit/mallctl.c
@@ -158,7 +158,7 @@ TEST_BEGIN(test_mallctl_opt) {
 
 	TEST_MALLCTL_OPT(bool, abort, always);
 	TEST_MALLCTL_OPT(bool, abort_conf, always);
-	TEST_MALLCTL_OPT(bool, metadata_thp, always);
+	TEST_MALLCTL_OPT(const char *, metadata_thp, always);
 	TEST_MALLCTL_OPT(bool, retain, always);
 	TEST_MALLCTL_OPT(const char *, dss, always);
 	TEST_MALLCTL_OPT(unsigned, narenas, always);
-- 
cgit v0.12