diff options
author | jhendersonHDF <jhenderson@hdfgroup.org> | 2023-04-04 17:35:52 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-04-04 17:35:52 (GMT) |
commit | 101210c16ee978f51a9effcc56813f00c16cb1cd (patch) | |
tree | 214d9e9a30259ac0b10a928d2e43f021497498d0 | |
parent | f13f8705b18dbd21ea08192d7167a24a4f5309fe (diff) | |
download | hdf5-101210c16ee978f51a9effcc56813f00c16cb1cd.zip hdf5-101210c16ee978f51a9effcc56813f00c16cb1cd.tar.gz hdf5-101210c16ee978f51a9effcc56813f00c16cb1cd.tar.bz2 |
Subfiling VFD - fix issues with I/O concentrator selection strategies (#2571) (#2618)
Fix multiple bugs with the SELECT_IOC_EVERY_NTH_RANK and
SELECT_IOC_TOTAL I/O concentrator selection strategies and add a
regression test for them
-rw-r--r-- | release_docs/RELEASE.txt | 20 | ||||
-rw-r--r-- | src/H5FDsubfiling/H5subfiling_common.c | 110 | ||||
-rw-r--r-- | testpar/t_subfiling_vfd.c | 195 |
3 files changed, 286 insertions, 39 deletions
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 9eaf33b..1a90c23 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -160,6 +160,26 @@ Bug Fixes since HDF5-1.14.0 release (JTH - 2023/03/23) + - Fixed issues in the Subfiling VFD when using the SELECT_IOC_EVERY_NTH_RANK + or SELECT_IOC_TOTAL I/O concentrator selection strategies + + Multiple bugs involving these I/O concentrator selection strategies + were fixed, including: + + * A bug that caused the selection strategy to be altered when + criteria for the strategy was specified in the + H5FD_SUBFILING_IOC_SELECTION_CRITERIA environment variable as + a single value, rather than in the old and undocumented + 'integer:integer' format + * Two bugs which caused a request for 'N' I/O concentrators to + result in 'N - 1' I/O concentrators being assigned, which also + lead to issues if only 1 I/O concentrator was requested + + Also added a regression test for these two I/O concentrator selection + strategies to prevent future issues. + + (JTH - 2023/03/15) + - Fixed an issue with collective metadata writes of global heap data New test failures in parallel netCDF started occurring with debug diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c index b58c4d3..e4dcf25 100644 --- a/src/H5FDsubfiling/H5subfiling_common.c +++ b/src/H5FDsubfiling/H5subfiling_common.c @@ -992,17 +992,24 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_ HDprintf("invalid IOC selection strategy string '%s' for strategy " "SELECT_IOC_EVERY_NTH_RANK; defaulting to SELECT_IOC_ONE_PER_NODE\n", ioc_sel_str); + ioc_select_val = 1; ioc_selection_type = SELECT_IOC_ONE_PER_NODE; + + H5_CHECK_OVERFLOW(iocs_per_node, long, int); + ioc_count = (int)iocs_per_node; + + break; } } + if (ioc_select_val > comm_size) + ioc_select_val = comm_size; + H5_CHECK_OVERFLOW(ioc_select_val, long, int); - ioc_count = (comm_size / (int)ioc_select_val); + ioc_count = ((comm_size - 1) / (int)ioc_select_val) + 1; - if ((comm_size % ioc_select_val) != 0) { - ioc_count++; - } + rank_multiple = (int)ioc_select_val; break; } @@ -1017,19 +1024,31 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_ if (ioc_sel_str) { errno = 0; ioc_select_val = HDstrtol(ioc_sel_str, NULL, 0); - if ((ERANGE == errno) || (ioc_select_val <= 0) || (ioc_select_val >= comm_size)) { + if ((ERANGE == errno) || (ioc_select_val <= 0)) { HDprintf("invalid IOC selection strategy string '%s' for strategy SELECT_IOC_TOTAL; " "defaulting to SELECT_IOC_ONE_PER_NODE\n", ioc_sel_str); + ioc_select_val = 1; ioc_selection_type = SELECT_IOC_ONE_PER_NODE; + + H5_CHECK_OVERFLOW(iocs_per_node, long, int); + ioc_count = (int)iocs_per_node; + + break; } } + if (ioc_select_val > comm_size) + ioc_select_val = comm_size; + H5_CHECK_OVERFLOW(ioc_select_val, long, int); ioc_count = (int)ioc_select_val; - rank_multiple = (comm_size / ioc_count); + if (ioc_select_val > 1) + rank_multiple = (comm_size - 1) / ((int)ioc_select_val - 1); + else + rank_multiple = 1; break; } @@ -1145,34 +1164,48 @@ get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_t *ioc_sel_info_str = NULL; if (env_value) { - long check_value; - /* - * For non-default options, the environment variable - * should have the following form: integer:[integer|string] - * In particular, EveryNthRank == 1:64 or every 64 ranks assign an IOC - * or WithConfig == 2:/<full_path_to_config_file> + * Parse I/O Concentrator selection strategy criteria as + * either a single value or two colon-separated values of + * the form 'integer:[integer|string]'. If two values are + * given, the first value specifies the I/O Concentrator + * selection strategy to use (given as the integer value + * corresponding to the H5FD_subfiling_ioc_select_t enum + * value for that strategy) and the second value specifies + * the criteria for that strategy. + * + * For example, to assign every 64th MPI rank as an I/O + * Concentrator, the criteria string should have the format + * '1:64' to specify the "every Nth rank" strategy with a + * criteria of '64'. */ - if ((opt_value = HDstrchr(env_value, ':'))) + opt_value = HDstrchr(env_value, ':'); + if (opt_value) { + long check_value; + *opt_value++ = '\0'; - errno = 0; - check_value = HDstrtol(env_value, NULL, 0); + errno = 0; + check_value = HDstrtol(env_value, NULL, 0); - if (errno == ERANGE) - H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, - "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA - " environment variable"); + if (errno == ERANGE) + H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, + "couldn't parse value from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA + " environment variable"); - if ((check_value < 0) || (check_value >= ioc_selection_options)) - H5_SUBFILING_GOTO_ERROR( - H5E_VFL, H5E_BADVALUE, FAIL, - "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA - " environment variable", - check_value); + if ((check_value < 0) || (check_value >= ioc_selection_options)) + H5_SUBFILING_GOTO_ERROR( + H5E_VFL, H5E_BADVALUE, FAIL, + "invalid IOC selection type value %ld from " H5FD_SUBFILING_IOC_SELECTION_CRITERIA + " environment variable", + check_value); - *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value; - *ioc_sel_info_str = opt_value; + *ioc_selection_type = (H5FD_subfiling_ioc_select_t)check_value; + *ioc_sel_info_str = opt_value; + } + else { + *ioc_sel_info_str = env_value; + } } done: @@ -1562,8 +1595,8 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride) max_iocs = app_topology->n_io_concentrators; - if (NULL == (app_topology->io_concentrators = HDmalloc((size_t)app_topology->n_io_concentrators * - sizeof(*app_topology->io_concentrators)))) + if (NULL == (app_topology->io_concentrators = + HDmalloc((size_t)max_iocs * sizeof(*app_topology->io_concentrators)))) H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate array of I/O concentrator ranks"); @@ -1622,30 +1655,29 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride) case SELECT_IOC_EVERY_NTH_RANK: case SELECT_IOC_TOTAL: { - int world_size = app_layout->world_size; - int ioc_next = 0; + int num_iocs_assigned = 0; + int world_size = app_layout->world_size; HDassert(rank_stride > 0); - for (int i = 0; ioc_next < app_topology->n_io_concentrators; ioc_next++) { + for (int i = 0; num_iocs_assigned < max_iocs; num_iocs_assigned++) { int ioc_index = rank_stride * i++; + if (num_iocs_assigned >= max_iocs) + break; if (ioc_index >= world_size) break; - io_concentrators[ioc_next] = app_layout->layout[ioc_index].rank; + io_concentrators[num_iocs_assigned] = app_layout->layout[ioc_index].rank; - if (app_layout->world_rank == io_concentrators[ioc_next]) { - app_topology->ioc_idx = ioc_next; + if (app_layout->world_rank == io_concentrators[num_iocs_assigned]) { + app_topology->ioc_idx = num_iocs_assigned; app_topology->rank_is_ioc = TRUE; } - - if (ioc_next + 1 >= max_iocs) - break; } /* Set final number of I/O concentrators after adjustments */ - app_topology->n_io_concentrators = ioc_next; + app_topology->n_io_concentrators = num_iocs_assigned; break; } diff --git a/testpar/t_subfiling_vfd.c b/testpar/t_subfiling_vfd.c index e1f9e05..380c068 100644 --- a/testpar/t_subfiling_vfd.c +++ b/testpar/t_subfiling_vfd.c @@ -95,6 +95,7 @@ static hid_t create_subfiling_ioc_fapl(MPI_Comm comm, MPI_Info info, hbool_t cus static void test_create_and_close(void); static void test_config_file(void); static void test_stripe_sizes(void); +static void test_selection_strategies(void); static void test_read_different_stripe_size(void); static void test_subfiling_precreate_rank_0(void); static void test_subfiling_write_many_read_one(void); @@ -105,6 +106,7 @@ static test_func tests[] = { test_create_and_close, test_config_file, test_stripe_sizes, + test_selection_strategies, test_read_different_stripe_size, test_subfiling_precreate_rank_0, test_subfiling_write_many_read_one, @@ -795,6 +797,199 @@ test_stripe_sizes(void) #undef SUBF_NITER /* + * Test the different I/O Concentator selection strategies + * for the Subfiling VFD + */ +#define SUBF_FILENAME "test_subfiling_selection_strategies.h5" +#define NUM_RANKS_CHOICES 2 +#define NUM_CRITERIA_FORMATS 2 +static void +test_selection_strategies(void) +{ + H5FD_subfiling_params_t cfg; + hid_t file_id = H5I_INVALID_HID; + hid_t fapl_id = H5I_INVALID_HID; + char *tmp_filename = NULL; + + curr_nerrors = nerrors; + + if (MAINPROCESS) + TESTING_2("I/O concentrator selection strategies"); + + tmp_filename = HDmalloc(PATH_MAX); + VRFY(tmp_filename, "HDmalloc succeeded"); + + for (H5FD_subfiling_ioc_select_t strategy = 0; strategy < ioc_selection_options; strategy++) { + /* Skip 1 IOC per node strategy since we assume it's + * the default strategy tested in this file. Skip + * "with config" strategy since it isn't supported. + */ + if (strategy == SELECT_IOC_ONE_PER_NODE || strategy == SELECT_IOC_WITH_CONFIG) + continue; + + /* Test with 1 MPI rank and then all MPI ranks */ + for (size_t num_ranks_choice = 0; num_ranks_choice < NUM_RANKS_CHOICES; num_ranks_choice++) { + int num_active_ranks = mpi_size; + + if (num_ranks_choice == 0) + num_active_ranks = 1; + + /* Test with a selection strategy criteria string + * in the 'integer:[integer|string]' form and in + * the form of just a single value. + */ + for (size_t criteria_format_choice = 0; criteria_format_choice < NUM_CRITERIA_FORMATS; + criteria_format_choice++) { + MPI_Comm file_comm = comm_g; + char criteria_buf[256]; + char sel_criteria[128]; /* Use char buffer for criteria as we may support + the "with config" strategy in the future */ + int expected_num_subfiles; + + cfg.ioc_selection = strategy; + cfg.stripe_size = H5FD_SUBFILING_DEFAULT_STRIPE_SIZE; + cfg.stripe_count = H5FD_SUBFILING_DEFAULT_STRIPE_COUNT; + + switch (strategy) { + case SELECT_IOC_EVERY_NTH_RANK: { + int stride; + + /* Try to select a reasonable stride value */ + if (num_active_ranks <= 2) + stride = 1; + else if (num_active_ranks <= 8) + stride = 2; + else if (num_active_ranks <= 32) + stride = 4; + else if (num_active_ranks <= 128) + stride = 8; + else + stride = 16; + + HDsnprintf(sel_criteria, 128, "%d", stride); + + expected_num_subfiles = ((num_active_ranks - 1) / stride) + 1; + + break; + } + + case SELECT_IOC_TOTAL: { + int n_iocs; + + /* Try to select a reasonable number of IOCs */ + if (num_active_ranks <= 2) + n_iocs = 1; + else if (num_active_ranks <= 8) + n_iocs = 2; + else if (num_active_ranks <= 32) + n_iocs = 4; + else if (num_active_ranks <= 128) + n_iocs = 8; + else + n_iocs = 16; + + HDsnprintf(sel_criteria, 128, "%d", n_iocs); + + expected_num_subfiles = n_iocs; + + break; + } + + case SELECT_IOC_ONE_PER_NODE: + case SELECT_IOC_WITH_CONFIG: + default: + HDprintf("invalid IOC selection strategy\n"); + MPI_Abort(comm_g, -1); + } + + if (criteria_format_choice == 0) { + HDsnprintf(criteria_buf, 256, "%d:%s", strategy, sel_criteria); + } + else if (criteria_format_choice == 1) { + HDsnprintf(criteria_buf, 256, "%s", sel_criteria); + } + + VRFY(HDsetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA, criteria_buf, 1) >= 0, + "HDsetenv succeeded"); + + HDassert(num_active_ranks == mpi_size || num_active_ranks == 1); + + if ((num_active_ranks == mpi_size) || (mpi_rank == 0)) { + h5_stat_t file_info; + FILE *subfile_ptr; + int num_digits; + + if (num_active_ranks < mpi_size) + file_comm = MPI_COMM_SELF; + + fapl_id = create_subfiling_ioc_fapl(file_comm, info_g, TRUE, &cfg, + H5FD_IOC_DEFAULT_THREAD_POOL_SIZE); + VRFY((fapl_id >= 0), "FAPL creation succeeded"); + + file_id = H5Fcreate(SUBF_FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + VRFY((file_id >= 0), "H5Fcreate succeeded"); + + /* + * Get the file inode value so we can construct the subfile names + */ + VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); + + num_digits = (int)(HDlog10(expected_num_subfiles) + 1); + + /* Ensure all the subfiles are present */ + for (int i = 0; i < expected_num_subfiles; i++) { + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, i + 1, expected_num_subfiles); + + /* Ensure file exists */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr, "HDfopen on subfile succeeded"); + VRFY((HDfclose(subfile_ptr) >= 0), "HDfclose on subfile succeeded"); + } + + /* Ensure no extra subfiles are present */ + HDsnprintf(tmp_filename, PATH_MAX, H5FD_SUBFILING_FILENAME_TEMPLATE, SUBF_FILENAME, + (uint64_t)file_info.st_ino, num_digits, expected_num_subfiles + 1, + expected_num_subfiles); + + /* Ensure file doesn't exist */ + subfile_ptr = HDfopen(tmp_filename, "r"); + VRFY(subfile_ptr == NULL, "HDfopen on subfile correctly failed"); + + VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); + + mpi_code_g = MPI_Barrier(file_comm); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + H5E_BEGIN_TRY + { + H5Fdelete(SUBF_FILENAME, fapl_id); + } + H5E_END_TRY; + + VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); + + VRFY(HDunsetenv(H5FD_SUBFILING_IOC_SELECTION_CRITERIA) >= 0, "HDunsetenv succeeded"); + } + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + } + } + } + + mpi_code_g = MPI_Barrier(comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); + + HDfree(tmp_filename); + + CHECK_PASSED(); +} +#undef SUBF_FILENAME +#undef NUM_RANKS_CHOICES +#undef NUM_CRITERIA_FORMATS + +/* * Test that opening a file with a different stripe * size/count than was used when creating the file * results in the original stripe size/count being |