From 79bdc6183ee96cc36bd04569d4cce48202a8ae68 Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Thu, 29 Sep 2022 21:06:17 -0500 Subject: Subfiling testing fix and documentation (#2132) * Fix a sporadic failure in Subfiling VFD tests * Subfiling VFD - add note to H5Pget_fapl_subfiling documentation Adds note about how H5Pget_fapl_subfiling only returns the original settings on a FAPL and those settings could have been modified by the Subfiling VFD's environment variables --- src/H5FDsubfiling/H5FDioc_threads.c | 9 +++ src/H5FDsubfiling/H5FDsubfile_int.c | 48 +++++++++++--- src/H5FDsubfiling/H5FDsubfiling.h | 10 +++ src/H5FDsubfiling/H5subfiling_common.h | 1 + testpar/t_subfiling_vfd.c | 118 +++++++++++++++++++++++++++------ 5 files changed, 157 insertions(+), 29 deletions(-) diff --git a/src/H5FDsubfiling/H5FDioc_threads.c b/src/H5FDsubfiling/H5FDioc_threads.c index b3e8ebc..5bbecab 100644 --- a/src/H5FDsubfiling/H5FDioc_threads.c +++ b/src/H5FDsubfiling/H5FDioc_threads.c @@ -1157,6 +1157,7 @@ ioc_file_truncate(sf_work_request_t *msg) int64_t subfile_idx; int fd; int ioc_idx; + int mpi_code; int ret_value = 0; HDassert(msg); @@ -1181,6 +1182,14 @@ ioc_file_truncate(sf_work_request_t *msg) if (HDftruncate(fd, (off_t)length) != 0) H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SEEKERROR, -1, "HDftruncate failed"); + /* + * Send a completion message back to the source that + * requested the truncation operation + */ + if (MPI_SUCCESS != (mpi_code = MPI_Send(msg->header, 1, H5_subfiling_rpc_msg_type, msg->source, + TRUNC_COMPLETED, sf_context->sf_eof_comm))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); + #ifdef H5FD_IOC_DEBUG HDprintf("[ioc(%d) %s]: truncated subfile to %lld bytes. ret = %d\n", ioc_idx, __func__, (long long)length, errno); diff --git a/src/H5FDsubfiling/H5FDsubfile_int.c b/src/H5FDsubfiling/H5FDsubfile_int.c index c089509..be71b3d 100644 --- a/src/H5FDsubfiling/H5FDsubfile_int.c +++ b/src/H5FDsubfiling/H5FDsubfile_int.c @@ -73,7 +73,9 @@ herr_t H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, MPI_Comm comm) { subfiling_context_t *sf_context = NULL; + MPI_Request *recv_reqs = NULL; int64_t msg[3] = {0}; + int64_t *recv_msgs = NULL; int mpi_size; int mpi_code; herr_t ret_value = SUCCEED; @@ -93,13 +95,35 @@ H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, int64_t num_full_stripes; int64_t num_leftover_stripes; int64_t partial_stripe_len; + int num_subfiles_owned; num_full_stripes = logical_file_eof / sf_context->sf_blocksize_per_stripe; partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe; num_leftover_stripes = partial_stripe_len / sf_context->sf_stripe_size; + num_subfiles_owned = sf_context->sf_num_fids; + + if (NULL == (recv_reqs = HDmalloc((size_t)num_subfiles_owned * sizeof(*recv_reqs)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, + "can't allocate receive requests array"); + if (NULL == (recv_msgs = HDmalloc((size_t)num_subfiles_owned * 3 * sizeof(*recv_msgs)))) + H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate message array"); + + /* + * Post early receives for messages from the IOC main + * thread that will signal completion of the truncate + * operation + */ + for (int i = 0; i < num_subfiles_owned; i++) { + if (MPI_SUCCESS != + (mpi_code = MPI_Irecv(&recv_msgs[3 * i], 1, H5_subfiling_rpc_msg_type, + sf_context->topology->io_concentrators[sf_context->topology->ioc_idx], + TRUNC_COMPLETED, sf_context->sf_eof_comm, &recv_reqs[i]))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Irecv failed", mpi_code); + } + /* Compute the EOF for each subfile this IOC owns */ - for (int i = 0; i < sf_context->sf_num_fids; i++) { + for (int i = 0; i < num_subfiles_owned; i++) { int64_t subfile_eof = num_full_stripes * sf_context->sf_stripe_size; int64_t global_subfile_idx; @@ -125,14 +149,18 @@ H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code); } - /* sanity check -- compute the file eof using the same mechanism used to - * compute the subfile eof. Assert that the computed value and the - * actual value match. - * - * Do this only for debug builds -- probably delete this before release. - * - * JRM -- 12/15/21 - */ + /* Wait for truncate operations to complete */ + if (MPI_SUCCESS != (mpi_code = MPI_Waitall(num_subfiles_owned, recv_reqs, MPI_STATUSES_IGNORE))) + H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Waitall", mpi_code); + + /* sanity check -- compute the file eof using the same mechanism used to + * compute the subfile eof. Assert that the computed value and the + * actual value match. + * + * Do this only for debug builds -- probably delete this before release. + * + * JRM -- 12/15/21 + */ #ifndef NDEBUG { @@ -160,6 +188,8 @@ H5FD__subfiling__truncate_sub_files(hid_t context_id, int64_t logical_file_eof, H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code); done: + HDfree(recv_msgs); + HDfree(recv_reqs); H5_SUBFILING_FUNC_LEAVE; } /* H5FD__subfiling__truncate_sub_files() */ diff --git a/src/H5FDsubfiling/H5FDsubfiling.h b/src/H5FDsubfiling/H5FDsubfiling.h index 93d0c3e..23dae62 100644 --- a/src/H5FDsubfiling/H5FDsubfiling.h +++ b/src/H5FDsubfiling/H5FDsubfiling.h @@ -359,6 +359,16 @@ H5_DLL herr_t H5Pset_fapl_subfiling(hid_t fapl_id, const H5FD_subfiling_config_t * the default values and then calling H5Pset_fapl_subfiling() with the configured * H5FD_subfiling_config_t structure. * + * \note H5Pget_fapl_subfiling() returns the #H5FD_SUBFILING driver properties as they + * were initially set for the File Access Property List using H5Pset_fapl_subfiling(). + * Alternatively, the driver properties can be modified at runtime according to values + * set for the #H5FD_SUBFILING_STRIPE_SIZE, #H5FD_SUBFILING_IOC_PER_NODE and + * #H5FD_SUBFILING_IOC_SELECTION_CRITERIA environment variables. However, driver + * properties set through environment variables will not be reflected in what is + * returned by H5Pget_fapl_subfiling(), so an application may need to check those + * environment variables to get accurate values for the #H5FD_SUBFILING driver + * properties. + * * \since 1.13.2 * */ diff --git a/src/H5FDsubfiling/H5subfiling_common.h b/src/H5FDsubfiling/H5subfiling_common.h index ba6dfdc..d4eecee 100644 --- a/src/H5FDsubfiling/H5subfiling_common.h +++ b/src/H5FDsubfiling/H5subfiling_common.h @@ -107,6 +107,7 @@ #define WRITE_COLL (COLL_FUNC | WRITE_OP) #define GET_EOF_COMPLETED (COMPLETED | GET_EOF_OP) +#define TRUNC_COMPLETED (COMPLETED | TRUNC_OP) #define SET_LOGGING (LOGGING_OP) diff --git a/testpar/t_subfiling_vfd.c b/testpar/t_subfiling_vfd.c index a214502..af263cf 100644 --- a/testpar/t_subfiling_vfd.c +++ b/testpar/t_subfiling_vfd.c @@ -242,8 +242,15 @@ test_config_file(void) * Choose a random Subfiling stripe size between * the smallest allowed value and 32MiB */ - stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + - SUBFILING_MIN_STRIPE_SIZE; + if (mpi_rank == 0) { + stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + + SUBFILING_MIN_STRIPE_SIZE; + } + + if (mpi_size > 1) { + mpi_code_g = MPI_Bcast(&stripe_size, 1, MPI_INT64_T, 0, comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Bcast succeeded"); + } cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE; cfg.stripe_size = (stripe_size_g > 0) ? stripe_size_g : stripe_size; @@ -432,8 +439,15 @@ test_stripe_sizes(void) * Choose a random Subfiling stripe size between * the smallest allowed value and the default value */ - stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + - SUBFILING_MIN_STRIPE_SIZE; + if (mpi_rank == 0) { + stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + + SUBFILING_MIN_STRIPE_SIZE; + } + + if (mpi_size > 1) { + mpi_code_g = MPI_Bcast(&stripe_size, 1, MPI_INT64_T, 0, comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Bcast succeeded"); + } cfg.ioc_selection = SELECT_IOC_ONE_PER_NODE; cfg.stripe_size = (stripe_size_g > 0) ? stripe_size_g : stripe_size; @@ -497,6 +511,8 @@ test_stripe_sizes(void) file_end_addr += nbytes; + VRFY((H5FDtruncate(file_ptr, dxpl_id, 0) >= 0), "H5FDtruncate succeeded"); + for (int j = 0; j < num_subfiles; j++) { h5_stat_size_t subfile_size; h5_stat_t subfile_info; @@ -541,6 +557,8 @@ test_stripe_sizes(void) H5FDwrite_vector(file_ptr, dxpl_id, 1, &write_type, &write_addr, &nbytes, &c_write_buf); VRFY((write_status >= 0), "H5FDwrite_vector succeeded"); + VRFY((H5FDtruncate(file_ptr, dxpl_id, 0) >= 0), "H5FDtruncate succeeded"); + for (int j = 0; j < num_subfiles; j++) { h5_stat_size_t subfile_size; h5_stat_t subfile_info; @@ -653,6 +671,8 @@ test_stripe_sizes(void) file_end_addr += ((size_t)mpi_size * nbytes); + VRFY((H5FDtruncate(file_ptr, dxpl_id, 0) >= 0), "H5FDtruncate succeeded"); + mpi_code_g = MPI_Barrier(comm_g); VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); @@ -708,6 +728,8 @@ test_stripe_sizes(void) H5FDwrite_vector(file_ptr, dxpl_id, 1, &write_type, &write_addr, &nbytes, &c_write_buf); VRFY((write_status >= 0), "H5FDwrite_vector succeeded"); + VRFY((H5FDtruncate(file_ptr, dxpl_id, 0) >= 0), "H5FDtruncate succeeded"); + mpi_code_g = MPI_Barrier(comm_g); VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); @@ -933,7 +955,7 @@ test_read_different_stripe_size(void) h5_stat_t file_info; FILE *subfile_ptr; int num_subfiles = cfg.stripe_count; - int num_digits = (int)(HDlog10(num_subfiles) + 1); + int num_digits = (int)(HDlog10(num_subfiles / 2) + 1); VRFY((HDstat(SUBF_FILENAME, &file_info) >= 0), "HDstat succeeded"); @@ -1317,6 +1339,7 @@ test_subfiling_write_many_read_few(void) hsize_t start[1]; hsize_t count[1]; hsize_t dset_dims[1]; + hbool_t reading_file = FALSE; size_t target_size; hid_t file_id = H5I_INVALID_HID; hid_t fapl_id = H5I_INVALID_HID; @@ -1396,12 +1419,40 @@ test_subfiling_write_many_read_few(void) VRFY((H5Dclose(dset_id) >= 0), "Dataset close succeeded"); VRFY((H5Fclose(file_id) >= 0), "File close succeeded"); - /* Read file back with half the number of MPI ranks */ - int color = (mpi_rank < (mpi_size / 2)); - mpi_code_g = MPI_Comm_split(comm_g, color, mpi_rank, &sub_comm); - VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Comm_split succeeded"); + /* + * If only using 1 node, read file back with a + * few ranks from that node. Otherwise, read file + * back with 1 MPI rank per node + */ + if (num_nodes_g == 1) { + int color; + + if (mpi_size < 2) { + color = 1; + } + else if (mpi_size < 4) { + color = (mpi_rank < (mpi_size / 2)); + } + else { + color = (mpi_rank < (mpi_size / 4)); + } + + if (mpi_size > 1) { + mpi_code_g = MPI_Comm_split(comm_g, color, mpi_rank, &sub_comm); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Comm_split succeeded"); + } - if (color) { + if (color) + reading_file = TRUE; + } + else { + if (node_local_rank == 0) { + sub_comm = ioc_comm; + reading_file = TRUE; + } + } + + if (reading_file) { fapl_id = create_subfiling_ioc_fapl(sub_comm, MPI_INFO_NULL, FALSE, NULL, 0); VRFY((fapl_id >= 0), "FAPL creation succeeded"); @@ -1440,8 +1491,10 @@ test_subfiling_write_many_read_few(void) VRFY((H5Pclose(fapl_id) >= 0), "FAPL close succeeded"); } - mpi_code_g = MPI_Comm_free(&sub_comm); - VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Comm_free succeeded"); + if ((sub_comm != MPI_COMM_NULL) && (num_nodes_g == 1)) { + mpi_code_g = MPI_Comm_free(&sub_comm); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Comm_free succeeded"); + } mpi_code_g = MPI_Barrier(comm_g); VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Barrier succeeded"); @@ -1703,9 +1756,9 @@ parse_subfiling_env_vars(void) int main(int argc, char **argv) { - time_t seed; - int required = MPI_THREAD_MULTIPLE; - int provided = 0; + unsigned seed; + int required = MPI_THREAD_MULTIPLE; + int provided = 0; HDcompile_assert(SUBFILING_MIN_STRIPE_SIZE <= H5FD_SUBFILING_DEFAULT_STRIPE_SIZE); @@ -1808,11 +1861,29 @@ main(int argc, char **argv) TestAlarmOn(); - seed = time(NULL); - srand((unsigned)seed); + /* + * Obtain and broadcast seed value since ranks + * aren't guaranteed to arrive here at exactly + * the same time and could end up out of sync + * with each other in regards to random number + * generation + */ + if (mpi_rank == 0) + seed = (unsigned)time(NULL); + + if (mpi_size > 1) { + if (MPI_SUCCESS != (mpi_code_g = MPI_Bcast(&seed, 1, MPI_UNSIGNED, 0, comm_g))) { + if (MAINPROCESS) + HDprintf("MPI_Bcast failed with error code %d\n", mpi_code_g); + nerrors++; + goto exit; + } + } + + srand(seed); if (MAINPROCESS) - HDprintf("Using seed: %lld\n\n", (long long)seed); + HDprintf("Using seed: %u\n\n", seed); /* Grab values from environment variables if set */ parse_subfiling_env_vars(); @@ -1853,8 +1924,15 @@ main(int argc, char **argv) * Choose a random Subfiling stripe size between * the smallest allowed value and the default value */ - stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + - SUBFILING_MIN_STRIPE_SIZE; + if (mpi_rank == 0) { + stripe_size = (rand() % (H5FD_SUBFILING_DEFAULT_STRIPE_SIZE - SUBFILING_MIN_STRIPE_SIZE + 1)) + + SUBFILING_MIN_STRIPE_SIZE; + } + + if (mpi_size > 1) { + mpi_code_g = MPI_Bcast(&stripe_size, 1, MPI_INT64_T, 0, comm_g); + VRFY((mpi_code_g == MPI_SUCCESS), "MPI_Bcast succeeded"); + } HDsnprintf(tmp, sizeof(tmp), "%" PRId64, stripe_size); -- cgit v0.12