summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjhendersonHDF <jhenderson@hdfgroup.org>2023-04-04 17:35:52 (GMT)
committerGitHub <noreply@github.com>2023-04-04 17:35:52 (GMT)
commit101210c16ee978f51a9effcc56813f00c16cb1cd (patch)
tree214d9e9a30259ac0b10a928d2e43f021497498d0
parentf13f8705b18dbd21ea08192d7167a24a4f5309fe (diff)
downloadhdf5-101210c16ee978f51a9effcc56813f00c16cb1cd.zip
hdf5-101210c16ee978f51a9effcc56813f00c16cb1cd.tar.gz
hdf5-101210c16ee978f51a9effcc56813f00c16cb1cd.tar.bz2
Subfiling VFD - fix issues with I/O concentrator selection strategies (#2571) (#2618)
Fix multiple bugs with the SELECT_IOC_EVERY_NTH_RANK and SELECT_IOC_TOTAL I/O concentrator selection strategies and add a regression test for them
-rw-r--r--release_docs/RELEASE.txt20
-rw-r--r--src/H5FDsubfiling/H5subfiling_common.c110
-rw-r--r--testpar/t_subfiling_vfd.c195
3 files changed, 286 insertions, 39 deletions
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index 9eaf33b..1a90c23 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -160,6 +160,26 @@ Bug Fixes since HDF5-1.14.0 release
(JTH - 2023/03/23)
+ - Fixed issues in the Subfiling VFD when using the SELECT_IOC_EVERY_NTH_RANK
+ or SELECT_IOC_TOTAL I/O concentrator selection strategies
+
+ Multiple bugs involving these I/O concentrator selection strategies
+ were fixed, including:
+
+ * A bug that caused the selection strategy to be altered when
+ criteria for the strategy was specified in the
+ H5FD_SUBFILING_IOC_SELECTION_CRITERIA environment variable as
+ a single value, rather than in the old and undocumented
+ 'integer:integer' format
+ * Two bugs which caused a request for 'N' I/O concentrators to
+ result in 'N - 1' I/O concentrators being assigned, which also
+ lead to issues if only 1 I/O concentrator was requested
+
+ Also added a regression test for these two I/O concentrator selection
+ strategies to prevent future issues.
+
+ (JTH - 2023/03/15)
+
- Fixed an issue with collective metadata writes of global heap data
New test failures in parallel netCDF started occurring with debug
diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c
index b58c4d3..e4dcf25 100644
--- a/src/H5FDsubfiling/H5subfiling_common.c
+++ b/src/H5FDsubfiling/H5subfiling_common.c
@@ -992,17 +992,24 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_
HDprintf("invalid IOC selection strategy string '%s' for strategy "
"SELECT_IOC_EVERY_NTH_RANK; defaulting to SELECT_IOC_ONE_PER_NODE\n",
ioc_sel_str);
+
ioc_select_val = 1;
ioc_selection_type = SELECT_IOC_ONE_PER_NODE;
+
+ H5_CHECK_OVERFLOW(iocs_per_node, long, int);
+ ioc_count = (int)iocs_per_node;
+
+ break;
}
}
+ if (ioc_select_val > comm_size)
+ ioc_select_val = comm_size;
+
H5_CHECK_OVERFLOW(ioc_select_val, long, int);
- ioc_count = (comm_size / (int)ioc_select_val);
+ ioc_count = ((comm_size - 1) / (int)ioc_select_val) + 1;
- if ((comm_size % ioc_select_val) != 0) {
- ioc_count++;
- }
+ rank_multiple = (int)ioc_select_val;
break;
}
@@ -1017,19 +1024,31 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_
if (ioc_sel_str) {
errno = 0;
ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0);
- if ((ERANGE == errno) || (ioc_select_val <= 0) || (ioc_select_val >= comm_size)) {
+ if ((ERANGE == errno) || (ioc_select_val <= 0)) {
HDprintf("invalid IOC selection strategy string '%s' for strategy SELECT_IOC_TOTAL; "
"defaulting to SELECT_IOC_ONE_PER_NODE\n",
ioc_sel_str);
+
ioc_select_val = 1;
ioc_selection_type = SELECT_IOC_ONE_PER_NODE;
+
+ H5_CHECK_OVERFLOW(iocs_per_node, long, int);
+ ioc_count = (int)iocs_per_node;
+
+ break;
}
}
+ if (ioc_select_val > comm_size)
+ ioc_select_val = comm_size;
+
H5_CHECK_OVERFLOW(ioc_select_val, long, int);
ioc_count = (int)ioc_select_val;
- rank_multiple = (comm_size / ioc_count);
+ if (ioc_select_val > 1)
+ rank_multiple = (comm_size - 1) / ((int)ioc_select_val - 1);
+ else
+ rank_multiple = 1;
break;
}
@@ -1145,34 +1164,48 @@ get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_t
*ioc_sel_info_str = NULL;
if (env_value) {
- long check_value;
-
/*
- * For non-default options, the environment variable
- * should have the following form: integer:[integer|string]
- * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC
- * or WithConfig == 2:/<full_path_to_config_file>
+ * Parse I/O Concentrator selection strategy criteria as
+ * either a single value or two colon-separated values of
+ * the form 'integer:[integer|string]'. If two values are
+ * given, the first value specifies the I/O Concentrator
+ * selection strategy to use (given as the integer value
+ * corresponding to the H5FD_subfiling_ioc_select_t enum
+ * value for that strategy) and the second value specifies
+ * the criteria for that strategy.
+ *
+ * For example, to assign every 64th MPI rank as an I/O
+ * Concentrator, the criteria string should have the format
+ * '1:64' to specify the "every Nth rank" strategy with a
+ * criteria of '64'.
*/
- if ((opt_value = HDstrchr(env_value, ':')))
+ opt_value = HDstrchr(env_value, ':');
+ if (opt_value) {
+ long check_value;
+
*opt_value++ = '\0';
- errno = 0;
- check_value = HDstrtol(env_value, NULL, 0);
+ errno = 0;
+ check_value = HDstrtol(env_value, NULL, 0);
- if (errno == ERANGE)
- H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
- "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
- " environment variable");
+ if (errno == ERANGE)
+ H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL,
+ "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
+ " environment variable");
- if ((check_value < 0) || (check_value >= ioc_selection_options))
- H5_SUBFILING_GOTO_ERROR(
- H5E_VFL, H5E_BADVALUE, FAIL,
- "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
- " environment variable",
- check_value);
+ if ((check_value < 0) || (check_value >= ioc_selection_options))
+ H5_SUBFILING_GOTO_ERROR(
+ H5E_VFL, H5E_BADVALUE, FAIL,
+ "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA
+ " environment variable",
+ check_value);
- *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value;
- *ioc_sel_info_str = opt_value;
+ *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value;
+ *ioc_sel_info_str = opt_value;
+ }
+ else {
+ *ioc_sel_info_str = env_value;
+ }
}
done:
@@ -1562,8 +1595,8 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
max_iocs = app_topology->n_io_concentrators;
- if (NULL == (app_topology->io_concentrators = HDmalloc((size_t)app_topology->n_io_concentrators *
- sizeof(*app_topology->io_concentrators))))
+ if (NULL == (app_topology->io_concentrators =
+ HDmalloc((size_t)max_iocs * sizeof(*app_topology->io_concentrators))))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"couldn't allocate array of I/O concentrator ranks");
@@ -1622,30 +1655,29 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
case SELECT_IOC_EVERY_NTH_RANK:
case SELECT_IOC_TOTAL: {
- int world_size = app_layout->world_size;
- int ioc_next = 0;
+ int num_iocs_assigned = 0;
+ int world_size = app_layout->world_size;
HDassert(rank_stride > 0);
- for (int i = 0; ioc_next < app_topology->n_io_concentrators; ioc_next++) {
+ for (int i = 0; num_iocs_assigned < max_iocs; num_iocs_assigned++) {
int ioc_index = rank_stride * i++;
+ if (num_iocs_assigned >= max_iocs)
+ break;
if (ioc_index >= world_size)
break;
- io_concentrators[ioc_next] = app_layout->layout[ioc_index].rank;
+ io_concentrators[num_iocs_assigned] = app_layout->layout[ioc_index].rank;
- if (app_layout->world_rank == io_concentrators[ioc_next]) {
- app_topology->ioc_idx = ioc_next;
+ if (app_layout->world_rank == io_concentrators[num_iocs_assigned]) {
+ app_topology->ioc_idx = num_iocs_assigned;
app_topology->rank_is_ioc = TRUE;
}
-
- if (ioc_next + 1 >= max_iocs)
- break;
}
/* Set final number of I/O concentrators after adjustments */
- app_topology->n_io_concentrators = ioc_next;
+ app_topology->n_io_concentrators = num_iocs_assigned;
break;
}
diff --git a/testpar/t_subfiling_vfd.c b/testpar/t_subfiling_vfd.c
index e1f9e05..380c068 100644
--- a/testpar/t_subfiling_vfd.c
+++ b/testpar/t_subfiling_vfd.c
@@ -95,6 +95,7 @@ static hid_t create_subfiling_ioc_fapl(MPI_Comm comm, MPI_Info info, hbool_t cus
static void test_create_and_close(void);
static void test_config_file(void);
static void test_stripe_sizes(void);
+static void test_selection_strategies(void);
static void test_read_different_stripe_size(void);
static void test_subfiling_precreate_rank_0(void);
static void test_subfiling_write_many_read_one(void);
@@ -105,6 +106,7 @@ static test_func tests[] = {
test_create_and_close,
test_config_file,
test_stripe_sizes,
+ test_selection_strategies,
test_read_different_stripe_size,
test_subfiling_precreate_rank_0,
test_subfiling_write_many_read_one,
@@ -795,6 +797,199 @@ test_stripe_sizes(void)
#undef SUBF_NITER
/*
+ * Test the different I/O Concentator selection strategies
+ * for the Subfiling VFD
+ */
+#define SUBF_FILENAME "test_subfiling_selection_strategies.h5"
+#define NUM_RANKS_CHOICES 2
+#define NUM_CRITERIA_FORMATS 2
+static void
+test_selection_strategies(void)
+{
+ H5FD_subfiling_params_t cfg;
+ hid_t file_id = H5I_INVALID_HID;
+ hid_t fapl_id = H5I_INVALID_HID;
+ char *tmp_filename = NULL;
+
+ curr_nerrors = nerrors;
+
+ if (MAINPROCESS)
+ TESTING_2("I/O concentrator selection strategies");
+
+ tmp_filename = HDmalloc(PATH_MAX);
+ VRFY(tmp_filename, "HDmalloc succeeded");
+
+ for (H5FD_subfiling_ioc_select_t strategy = 0; strategy < ioc_selection_options; strategy++) {
+ /* Skip 1 IOC per node strategy since we assume it's
+ * the default strategy tested in this file. Skip
+ * "with config" strategy since it isn't supported.
+ */
+ if (strategy == SELECT_IOC_ONE_PER_NODE || strategy == SELECT_IOC_WITH_CONFIG)
+ continue;
+
+ /* Test with 1 MPI rank and then all MPI ranks */
+ for (size_t num_ranks_choice = 0; num_ranks_choice < NUM_RANKS_CHOICES; num_ranks_choice++) {
+ int num_active_ranks = mpi_size;
+
+ if (num_ranks_choice == 0)
+ num_active_ranks = 1;
+
+ /* Test with a selection strategy criteria string
+ * in the 'integer:[integer|string]' form and in
+ * the form of just a single value.
+ */
+ for (size_t criteria_format_choice = 0; criteria_format_choice < NUM_CRITERIA_FORMATS;
+ criteria_format_choice++) {
+ MPI_Comm file_comm = comm_g;
+ char criteria_buf[256];
+ char sel_criteria[128]; /* Use char buffer for criteria as we may support
+ the "with config" strategy in the future */
+ int expected_num_subfiles;
+
+ cfg.ioc_selection = strategy;
+ cfg.stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE;
+ cfg.stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT;
+
+ switch (strategy) {
+ case SELECT_IOC_EVERY_NTH_RANK: {
+ int stride;
+
+ /* Try to select a reasonable stride value */
+ if (num_active_ranks <= 2)
+ stride = 1;
+ else if (num_active_ranks <= 8)
+ stride = 2;
+ else if (num_active_ranks <= 32)
+ stride = 4;
+ else if (num_active_ranks <= 128)
+ stride = 8;
+ else
+ stride = 16;
+
+ HDsnprintf(sel_criteria, 128, "%d", stride);
+
+ expected_num_subfiles = ((num_active_ranks - 1) / stride) + 1;
+
+ break;
+ }
+
+ case SELECT_IOC_TOTAL: {
+ int n_iocs;
+
+ /* Try to select a reasonable number of IOCs */
+ if (num_active_ranks <= 2)
+ n_iocs = 1;
+ else if (num_active_ranks <= 8)
+ n_iocs = 2;
+ else if (num_active_ranks <= 32)
+ n_iocs = 4;
+ else if (num_active_ranks <= 128)
+ n_iocs = 8;
+ else
+ n_iocs = 16;
+
+ HDsnprintf(sel_criteria, 128, "%d", n_iocs);
+
+ expected_num_subfiles = n_iocs;
+
+ break;
+ }
+
+ case SELECT_IOC_ONE_PER_NODE:
+ case SELECT_IOC_WITH_CONFIG:
+ default:
+ HDprintf("invalid IOC selection strategy\n");
+ MPI_Abort(comm_g, -1);
+ }
+
+ if (criteria_format_choice == 0) {
+ HDsnprintf(criteria_buf, 256, "%d:%s", strategy, sel_criteria);
+ }
+ else if (criteria_format_choice == 1) {
+ HDsnprintf(criteria_buf, 256, "%s", sel_criteria);
+ }
+
+ VRFY(HDsetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA, criteria_buf, 1) >= 0,
+ "HDsetenv succeeded");
+
+ HDassert(num_active_ranks == mpi_size || num_active_ranks == 1);
+
+ if ((num_active_ranks == mpi_size) || (mpi_rank == 0)) {
+ h5_stat_t file_info;
+ FILE *subfile_ptr;
+ int num_digits;
+
+ if (num_active_ranks < mpi_size)
+ file_comm = MPI_COMM_SELF;
+
+ fapl_id = create_subfiling_ioc_fapl(file_comm, info_g, TRUE, &cfg,
+ H5FD_IOC_DEFAULT_THREAD_POOL_SIZE);
+ VRFY((fapl_id >= 0), "FAPL creation succeeded");
+
+ file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+ VRFY((file_id >= 0), "H5Fcreate succeeded");
+
+ /*
+ * Get the file inode value so we can construct the subfile names
+ */
+ VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded");
+
+ num_digits = (int)(HDlog10(expected_num_subfiles) + 1);
+
+ /* Ensure all the subfiles are present */
+ for (int i = 0; i < expected_num_subfiles; i++) {
+ HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME,
+ (uint64_t)file_info.st_ino, num_digits, i + 1, expected_num_subfiles);
+
+ /* Ensure file exists */
+ subfile_ptr = HDfopen(tmp_filename, "r");
+ VRFY(subfile_ptr, "HDfopen on subfile succeeded");
+ VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile succeeded");
+ }
+
+ /* Ensure no extra subfiles are present */
+ HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME,
+ (uint64_t)file_info.st_ino, num_digits, expected_num_subfiles + 1,
+ expected_num_subfiles);
+
+ /* Ensure file doesn't exist */
+ subfile_ptr = HDfopen(tmp_filename, "r");
+ VRFY(subfile_ptr == NULL, "HDfopen on subfile correctly failed");
+
+ VRFY((H5Fclose(file_id) >= 0), "File close succeeded");
+
+ mpi_code_g = MPI_Barrier(file_comm);
+ VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded");
+
+ H5E_BEGIN_TRY
+ {
+ H5Fdelete(SUBF_FILENAME, fapl_id);
+ }
+ H5E_END_TRY;
+
+ VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded");
+
+ VRFY(HDunsetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA) >= 0, "HDunsetenv succeeded");
+ }
+
+ mpi_code_g = MPI_Barrier(comm_g);
+ VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded");
+ }
+ }
+ }
+
+ mpi_code_g = MPI_Barrier(comm_g);
+ VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded");
+
+ HDfree(tmp_filename);
+
+ CHECK_PASSED();
+}
+#undef SUBF_FILENAME
+#undef NUM_RANKS_CHOICES
+#undef NUM_CRITERIA_FORMATS
+
+/*
* Test that opening a file with a different stripe
* size/count than was used when creating the file
* results in the original stripe size/count being