diff options
author | jhendersonHDF <jhenderson@hdfgroup.org> | 2023-10-24 02:06:44 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-24 02:06:44 (GMT) |
commit | af49eb5b8647e8d9ffb527fd533def0910eb535c (patch) | |
tree | f4f368a972fd547b3a04b3c9682758c148c3dcff /src | |
parent | ceb03358a1d713078ae36bfff07be62b433d970a (diff) | |
download | hdf5-af49eb5b8647e8d9ffb527fd533def0910eb535c.zip hdf5-af49eb5b8647e8d9ffb527fd533def0910eb535c.tar.gz hdf5-af49eb5b8647e8d9ffb527fd533def0910eb535c.tar.bz2 |
Fix hangs during collective I/O with independent metadata writes (#3693)
Diffstat (limited to 'src')
-rw-r--r-- | src/H5Cmpio.c | 38 | ||||
-rw-r--r-- | src/H5Pfapl.c | 2 |
2 files changed, 35 insertions, 5 deletions
diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c index 643bbc8..c8db535 100644 --- a/src/H5Cmpio.c +++ b/src/H5Cmpio.c @@ -154,8 +154,9 @@ herr_t H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, haddr_t *candidates_list_ptr, int mpi_rank, int mpi_size) { - unsigned first_entry_to_flush; - unsigned last_entry_to_flush; + H5FD_mpio_xfer_t orig_xfer_mode; + unsigned first_entry_to_flush; + unsigned last_entry_to_flush; #ifndef NDEBUG unsigned total_entries_to_clear = 0; unsigned total_entries_to_flush = 0; @@ -172,8 +173,9 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha char *tbl_buf = NULL; #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ unsigned m, n; - unsigned u; /* Local index variable */ - herr_t ret_value = SUCCEED; /* Return value */ + unsigned u; /* Local index variable */ + bool restore_io_mode = false; + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_NOAPI(FAIL) @@ -185,6 +187,10 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha assert(0 <= mpi_rank); assert(mpi_rank < mpi_size); + /* Get I/O transfer mode */ + if (H5CX_get_io_xfer_mode(&orig_xfer_mode) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode"); + /* Initialize the entries_to_flush and entries_to_clear arrays */ memset(entries_to_flush, 0, sizeof(entries_to_flush)); memset(entries_to_clear, 0, sizeof(entries_to_clear)); @@ -418,6 +424,19 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha num_candidates, total_entries_to_clear, total_entries_to_flush); #endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */ + /* + * If collective I/O was requested, but collective metadata + * writes were not requested, temporarily disable collective + * I/O while flushing candidate entries so that we don't cause + * a hang in the case where the number of candidate entries + * to flush isn't a multiple of mpi_size. + */ + if ((orig_xfer_mode == H5FD_MPIO_COLLECTIVE) && !f->shared->coll_md_write) { + if (H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode"); + restore_io_mode = true; + } + /* We have now marked all the entries on the candidate list for * either flush or clear -- now scan the LRU and the pinned list * for these entries and do the deed. Do this via a call to @@ -431,6 +450,13 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha if (H5C__flush_candidate_entries(f, entries_to_flush, entries_to_clear) < 0) HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "flush candidates failed"); + /* Restore collective I/O if we temporarily disabled it */ + if (restore_io_mode) { + if (H5CX_set_io_xfer_mode(orig_xfer_mode) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode"); + restore_io_mode = false; + } + /* If we've deferred writing to do it collectively, take care of that now */ if (f->shared->coll_md_write) { /* Sanity check */ @@ -442,6 +468,10 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha } /* end if */ done: + /* Restore collective I/O if we temporarily disabled it */ + if (restore_io_mode && (H5CX_set_io_xfer_mode(orig_xfer_mode) < 0)) + HDONE_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode"); + if (candidate_assignment_table != NULL) candidate_assignment_table = (unsigned *)H5MM_xfree((void *)candidate_assignment_table); if (cache_ptr->coll_write_list) { diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c index 5f5782c..dc122af 100644 --- a/src/H5Pfapl.c +++ b/src/H5Pfapl.c @@ -5174,7 +5174,7 @@ done: * Function: H5Pset_coll_metadata_write * * Purpose: Tell the library whether the metadata write operations will - * be done collectively (1) or not (0). Default is collective. + * be done collectively (1) or not (0). Default is independent. * * Return: Non-negative on success/Negative on failure * |