From af49eb5b8647e8d9ffb527fd533def0910eb535c Mon Sep 17 00:00:00 2001
From: jhendersonHDF
Date: Mon, 23 Oct 2023 21:06:44 -0500
Subject: Fix hangs during collective I/O with independent metadata writes (#3693)

---
 release_docs/RELEASE.txt |  19 +++++++++
 src/H5Cmpio.c            |  38 +++++++++++++++--
 src/H5Pfapl.c            |   2 +-
 testpar/t_coll_md.c      | 103 +++++++++++++++++++++++++++++++++++++++++++++++
 testpar/testphdf5.c      |   2 +
 testpar/testphdf5.h      |   1 +
 6 files changed, 160 insertions(+), 5 deletions(-)

diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index 222c277..e5d53e4 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -392,6 +392,25 @@
 Bug Fixes since HDF5-1.14.0 release
 ===================================
     Library
     -------
+    - Fixed potential hangs in parallel library during collective I/O with
+      independent metadata writes
+
+      When performing collective parallel writes to a dataset where metadata
+      writes are requested as (or left as the default setting of) independent,
+      hangs could potentially occur during metadata cache sync points. This
+      was due to incorrect management of the internal state tracking whether
+      an I/O operation should be collective or not, causing the library to
+      attempt collective writes of metadata when they were meant to be
+      independent writes. During the metadata cache sync points, if the number
+      of cache entries being flushed was a multiple of the number of MPI ranks
+      in the MPI communicator used to access the HDF5 file, an equal number of
+      collective MPI I/O calls were made and the dataset write call would be
+      successful. However, when the number of cache entries being flushed was
+      NOT a multiple of the number of MPI ranks, the ranks with more entries
+      than others would get stuck in an MPI_File_set_view call, while other
+      ranks would get stuck in a post-write MPI_Barrier call. This issue has
+      been fixed by correctly switching to independent I/O temporarily when
+      writing metadata independently during collective dataset I/O.
 
     - Dropped support for MPI-2
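The scenario in the note above requires nothing unusual from an application: raw data I/O is requested as collective on the dataset transfer property list while metadata writes are simply left at their independent default. A minimal sketch of that configuration follows; it is not part of the patch, it assumes an MPI-enabled HDF5 build, the file and dataset names are placeholders, and error checking is omitted for brevity.

/*
 * Illustration of the configuration described in the release note above:
 * collective raw data I/O requested on the transfer property list while
 * metadata writes stay at their independent default.
 */
#include <mpi.h>
#include "hdf5.h"

int
main(int argc, char **argv)
{
    hsize_t dims[1]   = {1024};
    int     buf[1024] = {0};

    MPI_Init(&argc, &argv);

    /* MPI-IO file access; H5Pset_coll_metadata_write() is NOT called,
     * so metadata writes keep their independent default */
    hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);

    hid_t file  = H5Fcreate("example.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);
    hid_t space = H5Screate_simple(1, dims, NULL);
    hid_t dset  = H5Dcreate2(file, "dset", H5T_NATIVE_INT, space, H5P_DEFAULT,
                             H5P_DEFAULT, H5P_DEFAULT);

    /* Raw data transfers are requested as collective ... */
    hid_t dxpl = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(dxpl, H5FD_MPIO_COLLECTIVE);

    /* ... so any metadata flushed while this write is in progress must be
     * written independently, which is what the H5Cmpio.c change below makes
     * the cache do.  (A real application would select a per-rank hyperslab;
     * H5S_ALL keeps the example short.) */
    H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, dxpl, buf);

    H5Pclose(dxpl);
    H5Dclose(dset);
    H5Sclose(space);
    H5Fclose(file);
    H5Pclose(fapl);
    MPI_Finalize();
    return 0;
}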
diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c
index 643bbc8..c8db535 100644
--- a/src/H5Cmpio.c
+++ b/src/H5Cmpio.c
@@ -154,8 +154,9 @@ herr_t
 H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, haddr_t *candidates_list_ptr,
                          int mpi_rank, int mpi_size)
 {
-    unsigned first_entry_to_flush;
-    unsigned last_entry_to_flush;
+    H5FD_mpio_xfer_t orig_xfer_mode;
+    unsigned         first_entry_to_flush;
+    unsigned         last_entry_to_flush;
 #ifndef NDEBUG
     unsigned total_entries_to_clear = 0;
     unsigned total_entries_to_flush = 0;
@@ -172,8 +173,9 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
     char *tbl_buf = NULL;
 #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */
     unsigned m, n;
-    unsigned u;                   /* Local index variable */
-    herr_t   ret_value = SUCCEED; /* Return value */
+    unsigned u;                         /* Local index variable */
+    bool     restore_io_mode = false;
+    herr_t   ret_value       = SUCCEED; /* Return value */
 
     FUNC_ENTER_NOAPI(FAIL)
 
@@ -185,6 +187,10 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
     assert(0 <= mpi_rank);
     assert(mpi_rank < mpi_size);
 
+    /* Get I/O transfer mode */
+    if (H5CX_get_io_xfer_mode(&orig_xfer_mode) < 0)
+        HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
+
     /* Initialize the entries_to_flush and entries_to_clear arrays */
     memset(entries_to_flush, 0, sizeof(entries_to_flush));
     memset(entries_to_clear, 0, sizeof(entries_to_clear));
@@ -418,6 +424,19 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
             num_candidates, total_entries_to_clear, total_entries_to_flush);
 #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */
 
+    /*
+     * If collective I/O was requested, but collective metadata
+     * writes were not requested, temporarily disable collective
+     * I/O while flushing candidate entries so that we don't cause
+     * a hang in the case where the number of candidate entries
+     * to flush isn't a multiple of mpi_size.
+     */
+    if ((orig_xfer_mode == H5FD_MPIO_COLLECTIVE) && !f->shared->coll_md_write) {
+        if (H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT) < 0)
+            HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");
+        restore_io_mode = true;
+    }
+
     /* We have now marked all the entries on the candidate list for
      * either flush or clear -- now scan the LRU and the pinned list
      * for these entries and do the deed.  Do this via a call to
@@ -431,6 +450,13 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
     if (H5C__flush_candidate_entries(f, entries_to_flush, entries_to_clear) < 0)
         HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "flush candidates failed");
 
+    /* Restore collective I/O if we temporarily disabled it */
+    if (restore_io_mode) {
+        if (H5CX_set_io_xfer_mode(orig_xfer_mode) < 0)
+            HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");
+        restore_io_mode = false;
+    }
+
     /* If we've deferred writing to do it collectively, take care of that now */
     if (f->shared->coll_md_write) {
         /* Sanity check */
@@ -442,6 +468,10 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
     } /* end if */
 
 done:
+    /* Restore collective I/O if we temporarily disabled it */
+    if (restore_io_mode && (H5CX_set_io_xfer_mode(orig_xfer_mode) < 0))
+        HDONE_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");
+
     if (candidate_assignment_table != NULL)
         candidate_assignment_table = (unsigned *)H5MM_xfree((void *)candidate_assignment_table);
     if (cache_ptr->coll_write_list) {
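The new comment above explains that the hang appears when the number of candidate entries to flush is not a multiple of mpi_size. A rough model of that per-rank split is sketched below; it is an illustration of the arithmetic only, not the library's actual assignment code, and entries_for_rank is a made-up helper. Ranks that receive an extra entry would issue one more per-entry collective write than the remaining ranks can match, which is why these flushes must fall back to independent I/O unless collective metadata writes were requested.

/*
 * Simplified model of how candidate cache entries are split across ranks
 * at a sync point (illustrative only; not HDF5 library code).
 */
#include <stdio.h>

static unsigned
entries_for_rank(unsigned num_candidates, int mpi_rank, int mpi_size)
{
    unsigned n = num_candidates / (unsigned)mpi_size; /* base share per rank */
    unsigned m = num_candidates % (unsigned)mpi_size; /* leftover entries    */

    /* The first 'm' ranks each take one extra entry */
    return n + ((unsigned)mpi_rank < m ? 1 : 0);
}

int
main(void)
{
    /* 10 candidate entries over 4 ranks -> 3, 3, 2, 2: the first two ranks
     * would attempt one more write than ranks 2 and 3 participate in */
    for (int rank = 0; rank < 4; rank++)
        printf("rank %d flushes %u entries\n", rank, entries_for_rank(10, rank, 4));

    return 0;
}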
diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c
index 5f5782c..dc122af 100644
--- a/src/H5Pfapl.c
+++ b/src/H5Pfapl.c
@@ -5174,7 +5174,7 @@ done:
  * Function:    H5Pset_coll_metadata_write
  *
  * Purpose:     Tell the library whether the metadata write operations will
- *              be done collectively (1) or not (0). Default is collective.
+ *              be done collectively (1) or not (0). Default is independent.
 *
  * Return:      Non-negative on success/Negative on failure
 *
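With the comment corrected to say that metadata writes default to independent, an application that does want collective metadata writes has to opt in on the file access property list. A brief fragment, assuming an MPI-enabled HDF5 build and an already-initialized MPI environment, with error checking omitted:

/* Opting in to collective metadata I/O on a file access property list.
 * Without these calls both settings default to independent, which is the
 * configuration exercised by the bug fixed in this patch. */
hbool_t coll_md_write = false;

hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);

H5Pset_coll_metadata_write(fapl, true);   /* collective metadata writes */
H5Pset_all_coll_metadata_ops(fapl, true); /* collective metadata reads  */

/* The setting can be read back if needed */
H5Pget_coll_metadata_write(fapl, &coll_md_write);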
diff --git a/testpar/t_coll_md.c b/testpar/t_coll_md.c
index 1220111..9c6fc71 100644
--- a/testpar/t_coll_md.c
+++ b/testpar/t_coll_md.c
@@ -43,6 +43,11 @@
 #define COLL_GHEAP_WRITE_ATTR_NAME "coll_gheap_write_attr"
 #define COLL_GHEAP_WRITE_ATTR_DIMS 1
 
+#define COLL_IO_IND_MD_WRITE_NDIMS   2
+#define COLL_IO_IND_MD_WRITE_CHUNK0  4
+#define COLL_IO_IND_MD_WRITE_CHUNK1  256
+#define COLL_IO_IND_MD_WRITE_NCHUNK1 16384
+
 /*
  * A test for issue HDFFV-10501. A parallel hang was reported which occurred
  * in linked-chunk I/O when collective metadata reads are enabled and some ranks
@@ -569,3 +574,101 @@ test_collective_global_heap_write(void)
     VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
     VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
 }
+
+/*
+ * A test to ensure that hangs don't occur when collective I/O
+ * is requested at the interface level (by a call to
+ * H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE)), while
+ * collective metadata writes are NOT requested.
+ */
+void
+test_coll_io_ind_md_write(void)
+{
+    const char *filename;
+    long long  *data = NULL;
+    hsize_t     dset_dims[COLL_IO_IND_MD_WRITE_NDIMS];
+    hsize_t     chunk_dims[COLL_IO_IND_MD_WRITE_NDIMS];
+    hsize_t     sel_dims[COLL_IO_IND_MD_WRITE_NDIMS];
+    hsize_t     offset[COLL_IO_IND_MD_WRITE_NDIMS];
+    hid_t       file_id   = H5I_INVALID_HID;
+    hid_t       fapl_id   = H5I_INVALID_HID;
+    hid_t       dset_id   = H5I_INVALID_HID;
+    hid_t       dset_id2  = H5I_INVALID_HID;
+    hid_t       dcpl_id   = H5I_INVALID_HID;
+    hid_t       dxpl_id   = H5I_INVALID_HID;
+    hid_t       fspace_id = H5I_INVALID_HID;
+    int         mpi_rank, mpi_size;
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
+
+    filename = GetTestParameters();
+
+    fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type);
+    VRFY((fapl_id >= 0), "create_faccess_plist succeeded");
+
+    VRFY((H5Pset_all_coll_metadata_ops(fapl_id, false) >= 0), "Unset collective metadata reads succeeded");
+    VRFY((H5Pset_coll_metadata_write(fapl_id, false) >= 0), "Unset collective metadata writes succeeded");
+
+    file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+    VRFY((file_id >= 0), "H5Fcreate succeeded");
+
+    dset_dims[0] = (hsize_t)(mpi_size * COLL_IO_IND_MD_WRITE_CHUNK0);
+    dset_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1 * COLL_IO_IND_MD_WRITE_NCHUNK1);
+
+    fspace_id = H5Screate_simple(COLL_IO_IND_MD_WRITE_NDIMS, dset_dims, NULL);
+    VRFY((fspace_id >= 0), "H5Screate_simple succeeded");
+
+    dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
+    VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
+
+    chunk_dims[0] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK0);
+    chunk_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1);
+
+    VRFY((H5Pset_chunk(dcpl_id, COLL_IO_IND_MD_WRITE_NDIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded");
+
+    VRFY((H5Pset_shuffle(dcpl_id) >= 0), "H5Pset_shuffle succeeded");
+
+    dset_id = H5Dcreate2(file_id, "dset1", H5T_NATIVE_LLONG, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+    VRFY((dset_id >= 0), "H5Dcreate2 succeeded");
+
+    sel_dims[0] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK0);
+    sel_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1 * COLL_IO_IND_MD_WRITE_NCHUNK1);
+
+    offset[0] = (hsize_t)mpi_rank * sel_dims[0];
+    offset[1] = 0;
+
+    VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, sel_dims, NULL) >= 0),
+         "H5Sselect_hyperslab succeeded");
+
+    dxpl_id = H5Pcreate(H5P_DATASET_XFER);
+    VRFY((dxpl_id >= 0), "H5Pcreate succeeded");
+
+    VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded");
+
+    data = malloc(sel_dims[0] * sel_dims[1] * sizeof(long long));
+    for (size_t i = 0; i < sel_dims[0] * sel_dims[1]; i++)
+        data[i] = rand();
+
+    VRFY((H5Dwrite(dset_id, H5T_NATIVE_LLONG, H5S_BLOCK, fspace_id, dxpl_id, data) >= 0),
+         "H5Dwrite succeeded");
+
+    dset_id2 = H5Dcreate2(file_id, "dset2", H5T_NATIVE_LLONG, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+    VRFY((dset_id2 >= 0), "H5Dcreate2 succeeded");
+
+    for (size_t i = 0; i < sel_dims[0] * sel_dims[1]; i++)
+        data[i] = rand();
+
+    VRFY((H5Dwrite(dset_id2, H5T_NATIVE_LLONG, H5S_BLOCK, fspace_id, dxpl_id, data) >= 0),
+         "H5Dwrite succeeded");
+
+    free(data);
+
+    VRFY((H5Sclose(fspace_id) >= 0), "H5Sclose succeeded");
+    VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded");
+    VRFY((H5Dclose(dset_id2) >= 0), "H5Dclose succeeded");
+    VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded");
+    VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded");
+    VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
+    VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
+}
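Both H5Dwrite() calls in the test pass H5S_BLOCK as the memory dataspace, which tells the library to treat the write buffer as a single contiguous block containing the same number of elements as the file selection. For comparison, the fragment below sketches the equivalent call with an explicit memory dataspace; variable names follow the test code above and the checks mirror its VRFY style.

/* Equivalent of the H5S_BLOCK write above, spelled out with an explicit
 * 1-D memory dataspace holding the same number of elements as the file
 * selection (variable names follow the test code) */
hsize_t mem_elems = sel_dims[0] * sel_dims[1];
hid_t   mspace_id = H5Screate_simple(1, &mem_elems, NULL);

VRFY((mspace_id >= 0), "H5Screate_simple succeeded");

VRFY((H5Dwrite(dset_id, H5T_NATIVE_LLONG, mspace_id, fspace_id, dxpl_id, data) >= 0),
     "H5Dwrite succeeded");

VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");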
diff --git a/testpar/testphdf5.c b/testpar/testphdf5.c
index 584ca1f..2d85e1a 100644
--- a/testpar/testphdf5.c
+++ b/testpar/testphdf5.c
@@ -521,6 +521,8 @@ main(int argc, char **argv)
             "Collective MD read with link chunk I/O (H5D__sort_chunk)", PARATESTFILE);
     AddTest("GH_coll_MD_wr", test_collective_global_heap_write, NULL,
             "Collective MD write of global heap data", PARATESTFILE);
+    AddTest("COLLIO_INDMDWR", test_coll_io_ind_md_write, NULL,
+            "Collective I/O with Independent metadata writes", PARATESTFILE);
 
     /* Display testing information */
     TestInfo(argv[0]);
diff --git a/testpar/testphdf5.h b/testpar/testphdf5.h
index 6ac8080..5699760 100644
--- a/testpar/testphdf5.h
+++ b/testpar/testphdf5.h
@@ -296,6 +296,7 @@ void test_partial_no_selection_coll_md_read(void);
 void test_multi_chunk_io_addrmap_issue(void);
 void test_link_chunk_io_sort_chunk_issue(void);
 void test_collective_global_heap_write(void);
+void test_coll_io_ind_md_write(void);
 void test_oflush(void);
 
 /* commonly used prototypes */
-- 
cgit v0.12