author     jhendersonHDF <jhenderson@hdfgroup.org>    2023-10-24 02:06:44 (GMT)
committer  GitHub <noreply@github.com>                2023-10-24 02:06:44 (GMT)
commit     af49eb5b8647e8d9ffb527fd533def0910eb535c (patch)
tree       f4f368a972fd547b3a04b3c9682758c148c3dcff
parent     ceb03358a1d713078ae36bfff07be62b433d970a (diff)
Fix hangs during collective I/O with independent metadata writes (#3693)
-rw-r--r--  release_docs/RELEASE.txt  |  19
-rw-r--r--  src/H5Cmpio.c             |  38
-rw-r--r--  src/H5Pfapl.c             |   2
-rw-r--r--  testpar/t_coll_md.c       | 103
-rw-r--r--  testpar/testphdf5.c       |   2
-rw-r--r--  testpar/testphdf5.h       |   1
6 files changed, 160 insertions(+), 5 deletions(-)
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index 222c277..e5d53e4 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -392,6 +392,25 @@ Bug Fixes since HDF5-1.14.0 release
===================================
Library
-------
+ - Fixed potential hangs in the parallel library during collective I/O
+ with independent metadata writes
+
+ When performing collective parallel writes to a dataset while metadata
+ writes are requested as (or left at the default setting of) independent,
+ hangs could occur during metadata cache sync points. This was due to
+ incorrect management of the internal state that tracks whether an I/O
+ operation should be collective, which caused the library to attempt
+ collective writes of metadata that were meant to be written
+ independently. During the metadata cache sync points, if the number
+ of cache entries being flushed was a multiple of the number of MPI ranks
+ in the MPI communicator used to access the HDF5 file, an equal number of
+ collective MPI I/O calls was made on every rank and the dataset write
+ call would succeed. However, when the number of cache entries being
+ flushed was NOT a multiple of the number of MPI ranks, the ranks with
+ more entries than others would get stuck in an MPI_File_set_view call,
+ while the other ranks would get stuck in a post-write MPI_Barrier call.
+ This issue has been fixed by temporarily switching to independent I/O
+ when writing metadata independently during collective dataset I/O.
- Dropped support for MPI-2
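
For reference, here is a minimal sketch (not part of the patch; the file name, dataset shape, and buffer are illustrative, and error checking is omitted) of the user-facing configuration described above: collective raw-data I/O requested on the transfer property list while metadata writes stay at their independent default.

    #include "hdf5.h"
    #include <mpi.h>

    /* Sketch: collective dataset I/O with (default) independent metadata
     * writes -- the configuration that could hang before this fix. */
    static void write_collectively(const char *filename)
    {
        hsize_t dims[1] = {1024};
        hid_t   fapl    = H5Pcreate(H5P_FILE_ACCESS);

        H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
        /* Metadata writes default to independent; this call just makes
         * the setting explicit. */
        H5Pset_coll_metadata_write(fapl, 0);

        hid_t file  = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl);
        hid_t space = H5Screate_simple(1, dims, NULL);
        hid_t dset  = H5Dcreate2(file, "dset", H5T_NATIVE_INT, space,
                                 H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
        hid_t dxpl  = H5Pcreate(H5P_DATASET_XFER);

        /* Raw data I/O is collective... */
        H5Pset_dxpl_mpio(dxpl, H5FD_MPIO_COLLECTIVE);

        int buf[1024] = {0};
        /* ...while any metadata flushed during the write is independent. */
        H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, dxpl, buf);

        H5Pclose(dxpl);
        H5Dclose(dset);
        H5Sclose(space);
        H5Fclose(file);
        H5Pclose(fapl);
    }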
diff --git a/src/H5Cmpio.c b/src/H5Cmpio.c
index 643bbc8..c8db535 100644
--- a/src/H5Cmpio.c
+++ b/src/H5Cmpio.c
@@ -154,8 +154,9 @@ herr_t
H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, haddr_t *candidates_list_ptr,
int mpi_rank, int mpi_size)
{
- unsigned first_entry_to_flush;
- unsigned last_entry_to_flush;
+ H5FD_mpio_xfer_t orig_xfer_mode;
+ unsigned first_entry_to_flush;
+ unsigned last_entry_to_flush;
#ifndef NDEBUG
unsigned total_entries_to_clear = 0;
unsigned total_entries_to_flush = 0;
@@ -172,8 +173,9 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
char *tbl_buf = NULL;
#endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */
unsigned m, n;
- unsigned u; /* Local index variable */
- herr_t ret_value = SUCCEED; /* Return value */
+ unsigned u; /* Local index variable */
+ bool restore_io_mode = false;
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
@@ -185,6 +187,10 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
assert(0 <= mpi_rank);
assert(mpi_rank < mpi_size);
+ /* Get I/O transfer mode */
+ if (H5CX_get_io_xfer_mode(&orig_xfer_mode) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
+
/* Initialize the entries_to_flush and entries_to_clear arrays */
memset(entries_to_flush, 0, sizeof(entries_to_flush));
memset(entries_to_clear, 0, sizeof(entries_to_clear));
@@ -418,6 +424,19 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
num_candidates, total_entries_to_clear, total_entries_to_flush);
#endif /* H5C_APPLY_CANDIDATE_LIST__DEBUG */
+ /*
+ * If collective I/O was requested, but collective metadata
+ * writes were not requested, temporarily disable collective
+ * I/O while flushing candidate entries so that we don't cause
+ * a hang in the case where the number of candidate entries
+ * to flush isn't a multiple of mpi_size.
+ */
+ if ((orig_xfer_mode == H5FD_MPIO_COLLECTIVE) && !f->shared->coll_md_write) {
+ if (H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");
+ restore_io_mode = true;
+ }
+
/* We have now marked all the entries on the candidate list for
* either flush or clear -- now scan the LRU and the pinned list
* for these entries and do the deed. Do this via a call to
@@ -431,6 +450,13 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
if (H5C__flush_candidate_entries(f, entries_to_flush, entries_to_clear) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "flush candidates failed");
+ /* Restore collective I/O if we temporarily disabled it */
+ if (restore_io_mode) {
+ if (H5CX_set_io_xfer_mode(orig_xfer_mode) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");
+ restore_io_mode = false;
+ }
+
/* If we've deferred writing to do it collectively, take care of that now */
if (f->shared->coll_md_write) {
/* Sanity check */
@@ -442,6 +468,10 @@ H5C_apply_candidate_list(H5F_t *f, H5C_t *cache_ptr, unsigned num_candidates, ha
} /* end if */
done:
+ /* Restore collective I/O if we temporarily disabled it */
+ if (restore_io_mode && (H5CX_set_io_xfer_mode(orig_xfer_mode) < 0))
+ HDONE_ERROR(H5E_CACHE, H5E_CANTSET, FAIL, "can't set MPI-I/O transfer mode");
+
if (candidate_assignment_table != NULL)
candidate_assignment_table = (unsigned *)H5MM_xfree((void *)candidate_assignment_table);
if (cache_ptr->coll_write_list) {
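
The H5Cmpio.c changes above follow a save/switch/restore idiom: capture the current transfer mode, drop to independent while flushing candidate entries, and restore on both the normal path and the error path so the API context never leaks the temporary mode. Below is a condensed, self-contained sketch of that idiom; get_xfer_mode, set_xfer_mode, and flush_entries are hypothetical stand-ins for the internal H5CX_get_io_xfer_mode, H5CX_set_io_xfer_mode, and H5C__flush_candidate_entries.

    #include <stdbool.h>

    typedef enum { XFER_INDEPENDENT, XFER_COLLECTIVE } xfer_mode_t;

    static xfer_mode_t current_mode = XFER_COLLECTIVE;

    static int get_xfer_mode(xfer_mode_t *mode) { *mode = current_mode; return 0; }
    static int set_xfer_mode(xfer_mode_t mode)  { current_mode = mode; return 0; }
    static int flush_entries(void)              { return 0; /* placeholder */ }

    int apply_candidate_list(bool coll_md_write)
    {
        xfer_mode_t orig_mode;
        bool        restore = false;
        int         ret     = 0;

        if (get_xfer_mode(&orig_mode) < 0)
            return -1;

        /* Collective I/O requested but metadata writes are independent:
         * drop to independent mode so ranks with more candidate entries
         * than others don't block in collective MPI calls. */
        if (orig_mode == XFER_COLLECTIVE && !coll_md_write) {
            if (set_xfer_mode(XFER_INDEPENDENT) < 0)
                return -1;
            restore = true;
        }

        if (flush_entries() < 0) {
            ret = -1;
            goto done;
        }

        /* Normal path: put the original mode back before continuing */
        if (restore) {
            if (set_xfer_mode(orig_mode) < 0) {
                ret = -1;
                goto done;
            }
            restore = false;
        }

    done:
        /* Error path: restore is still true only if we bailed out early */
        if (restore && set_xfer_mode(orig_mode) < 0)
            ret = -1;

        return ret;
    }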
diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c
index 5f5782c..dc122af 100644
--- a/src/H5Pfapl.c
+++ b/src/H5Pfapl.c
@@ -5174,7 +5174,7 @@ done:
* Function: H5Pset_coll_metadata_write
*
* Purpose: Tell the library whether the metadata write operations will
- * be done collectively (1) or not (0). Default is collective.
+ * be done collectively (1) or not (0). Default is independent.
*
* Return: Non-negative on success/Negative on failure
*
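
Since the corrected comment establishes that metadata writes default to independent, applications that want collective metadata writes must opt in on the file access property list. A small sketch of that opt-in (the helper name is made up; the H5P calls are the public API):

    #include "hdf5.h"
    #include <mpi.h>

    /* Build a fapl that opts in to collective metadata writes, which the
     * corrected comment notes are NOT the default. */
    static hid_t make_coll_md_fapl(void)
    {
        hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
        if (fapl < 0)
            return fapl;
        if (H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL) < 0 ||
            H5Pset_coll_metadata_write(fapl, 1) < 0) { /* 1 = collective */
            H5Pclose(fapl);
            return H5I_INVALID_HID;
        }
        return fapl;
    }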
diff --git a/testpar/t_coll_md.c b/testpar/t_coll_md.c
index 1220111..9c6fc71 100644
--- a/testpar/t_coll_md.c
+++ b/testpar/t_coll_md.c
@@ -43,6 +43,11 @@
#define COLL_GHEAP_WRITE_ATTR_NAME "coll_gheap_write_attr"
#define COLL_GHEAP_WRITE_ATTR_DIMS 1
+#define COLL_IO_IND_MD_WRITE_NDIMS 2
+#define COLL_IO_IND_MD_WRITE_CHUNK0 4
+#define COLL_IO_IND_MD_WRITE_CHUNK1 256
+#define COLL_IO_IND_MD_WRITE_NCHUNK1 16384
+
/*
* A test for issue HDFFV-10501. A parallel hang was reported which occurred
* in linked-chunk I/O when collective metadata reads are enabled and some ranks
@@ -569,3 +574,101 @@ test_collective_global_heap_write(void)
VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
}
+
+/*
+ * A test to ensure that hangs don't occur when collective I/O
+ * is requested at the interface level (by a call to
+ * H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE)), while
+ * collective metadata writes are NOT requested.
+ */
+void
+test_coll_io_ind_md_write(void)
+{
+ const char *filename;
+ long long *data = NULL;
+ hsize_t dset_dims[COLL_IO_IND_MD_WRITE_NDIMS];
+ hsize_t chunk_dims[COLL_IO_IND_MD_WRITE_NDIMS];
+ hsize_t sel_dims[COLL_IO_IND_MD_WRITE_NDIMS];
+ hsize_t offset[COLL_IO_IND_MD_WRITE_NDIMS];
+ hid_t file_id = H5I_INVALID_HID;
+ hid_t fapl_id = H5I_INVALID_HID;
+ hid_t dset_id = H5I_INVALID_HID;
+ hid_t dset_id2 = H5I_INVALID_HID;
+ hid_t dcpl_id = H5I_INVALID_HID;
+ hid_t dxpl_id = H5I_INVALID_HID;
+ hid_t fspace_id = H5I_INVALID_HID;
+ int mpi_rank, mpi_size;
+
+ MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
+
+ filename = GetTestParameters();
+
+ fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type);
+ VRFY((fapl_id >= 0), "create_faccess_plist succeeded");
+
+ VRFY((H5Pset_all_coll_metadata_ops(fapl_id, false) >= 0), "Unset collective metadata reads succeeded");
+ VRFY((H5Pset_coll_metadata_write(fapl_id, false) >= 0), "Unset collective metadata writes succeeded");
+
+ file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+ VRFY((file_id >= 0), "H5Fcreate succeeded");
+
+ dset_dims[0] = (hsize_t)(mpi_size * COLL_IO_IND_MD_WRITE_CHUNK0);
+ dset_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1 * COLL_IO_IND_MD_WRITE_NCHUNK1);
+
+ fspace_id = H5Screate_simple(COLL_IO_IND_MD_WRITE_NDIMS, dset_dims, NULL);
+ VRFY((fspace_id >= 0), "H5Screate_simple succeeded");
+
+ dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
+ VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
+
+ chunk_dims[0] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK0);
+ chunk_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1);
+
+ VRFY((H5Pset_chunk(dcpl_id, COLL_IO_IND_MD_WRITE_NDIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded");
+
+ VRFY((H5Pset_shuffle(dcpl_id) >= 0), "H5Pset_shuffle succeeded");
+
+ dset_id = H5Dcreate2(file_id, "dset1", H5T_NATIVE_LLONG, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+ VRFY((dset_id >= 0), "H5Dcreate2 succeeded");
+
+ sel_dims[0] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK0);
+ sel_dims[1] = (hsize_t)(COLL_IO_IND_MD_WRITE_CHUNK1 * COLL_IO_IND_MD_WRITE_NCHUNK1);
+
+ offset[0] = (hsize_t)mpi_rank * sel_dims[0];
+ offset[1] = 0;
+
+ VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, offset, NULL, sel_dims, NULL) >= 0),
+ "H5Sselect_hyperslab succeeded");
+
+ dxpl_id = H5Pcreate(H5P_DATASET_XFER);
+ VRFY((dxpl_id >= 0), "H5Pcreate succeeded");
+
+ VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded");
+
+ data = malloc(sel_dims[0] * sel_dims[1] * sizeof(long long));
+ for (size_t i = 0; i < sel_dims[0] * sel_dims[1]; i++)
+ data[i] = rand();
+
+ VRFY((H5Dwrite(dset_id, H5T_NATIVE_LLONG, H5S_BLOCK, fspace_id, dxpl_id, data) >= 0),
+ "H5Dwrite succeeded");
+
+ dset_id2 = H5Dcreate2(file_id, "dset2", H5T_NATIVE_LLONG, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+ VRFY((dset_id2 >= 0), "H5Dcreate2 succeeded");
+
+ for (size_t i = 0; i < sel_dims[0] * sel_dims[1]; i++)
+ data[i] = rand();
+
+ VRFY((H5Dwrite(dset_id2, H5T_NATIVE_LLONG, H5S_BLOCK, fspace_id, dxpl_id, data) >= 0),
+ "H5Dwrite succeeded");
+
+ free(data);
+
+ VRFY((H5Sclose(fspace_id) >= 0), "H5Sclose succeeded");
+ VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded");
+ VRFY((H5Dclose(dset_id2) >= 0), "H5Dclose succeeded");
+ VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
+}
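
A side note on the test above: H5S_BLOCK tells H5Dwrite to treat the memory buffer as one contiguous block holding exactly as many elements as the file selection. For readers unfamiliar with that shorthand, here is a sketch of the equivalent explicit memory dataspace, reusing the test's own variable names:

    /* Equivalent to passing H5S_BLOCK as the memory space above: a 1-D
     * contiguous memory dataspace sized to match the file selection. */
    hsize_t mem_dims[1] = {sel_dims[0] * sel_dims[1]};
    hid_t   mspace_id   = H5Screate_simple(1, mem_dims, NULL);
    VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
    VRFY((H5Dwrite(dset_id, H5T_NATIVE_LLONG, mspace_id, fspace_id, dxpl_id, data) >= 0),
         "H5Dwrite succeeded");
    VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");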
diff --git a/testpar/testphdf5.c b/testpar/testphdf5.c
index 584ca1f..2d85e1a 100644
--- a/testpar/testphdf5.c
+++ b/testpar/testphdf5.c
@@ -521,6 +521,8 @@ main(int argc, char **argv)
"Collective MD read with link chunk I/O (H5D__sort_chunk)", PARATESTFILE);
AddTest("GH_coll_MD_wr", test_collective_global_heap_write, NULL,
"Collective MD write of global heap data", PARATESTFILE);
+ AddTest("COLLIO_INDMDWR", test_coll_io_ind_md_write, NULL,
+ "Collective I/O with Independent metadata writes", PARATESTFILE);
/* Display testing information */
TestInfo(argv[0]);
diff --git a/testpar/testphdf5.h b/testpar/testphdf5.h
index 6ac8080..5699760 100644
--- a/testpar/testphdf5.h
+++ b/testpar/testphdf5.h
@@ -296,6 +296,7 @@ void test_partial_no_selection_coll_md_read(void);
void test_multi_chunk_io_addrmap_issue(void);
void test_link_chunk_io_sort_chunk_issue(void);
void test_collective_global_heap_write(void);
+void test_coll_io_ind_md_write(void);
void test_oflush(void);
/* commonly used prototypes */