summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--release_docs/RELEASE.txt16
-rw-r--r--src/H5C.c55
-rw-r--r--src/H5CX.c4
-rw-r--r--src/H5Cimage.c4
-rw-r--r--src/H5Dchunk.c22
-rw-r--r--src/H5Dmpio.c6
-rw-r--r--src/H5Z.c2
-rw-r--r--testpar/t_cache.c16
-rw-r--r--testpar/t_coll_md_read.c336
-rw-r--r--testpar/testphdf5.c5
-rw-r--r--testpar/testphdf5.h2
11 files changed, 381 insertions, 87 deletions
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index ef3bda0..316798d 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -264,6 +264,22 @@ Bug Fixes since HDF5-1.10.3 release
Library
-------
+ - Fix hangs with collective metadata reads during chunked dataset I/O
+
+ In the parallel library, it was discovered that when a particular
+ sequence of operations following a pattern of:
+
+ "write to chunked dataset" -> "flush file" -> "read from dataset"
+
+ occurred with collective metadata reads enabled, hangs could be
+ observed due to certain MPI ranks not participating in the collective
+ metadata reads.
+
+ To fix the issue, collective metadata reads are now disabled during
+ chunked dataset raw data I/O.
+
+ (JTH - 2019/02/11, HDFFV-10563, HDFFV-10688)
+
- Performance issue when closing an object
The slow down is due to the search of the "tag_list" to find
diff --git a/src/H5C.c b/src/H5C.c
index c1d9d17..09c5fe8 100644
--- a/src/H5C.c
+++ b/src/H5C.c
@@ -1483,31 +1483,17 @@ H5C_insert_entry(H5F_t * f,
H5C__UPDATE_STATS_FOR_INSERTION(cache_ptr, entry_ptr)
#ifdef H5_HAVE_PARALLEL
- if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
- coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE);
-
- /* If not explicitly disabled, get the cmdr setting from the API context */
- if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read)
- coll_access = H5CX_get_coll_metadata_read();
- } /* end if */
+ if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
+ coll_access = H5CX_get_coll_metadata_read();
entry_ptr->coll_access = coll_access;
if(coll_access) {
H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL)
/* Make sure the size of the collective entries in the cache remain in check */
- if(H5P_USER_TRUE == f->coll_md_read) {
- if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) {
- if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
- } /* end if */
- } /* end if */
- else {
- if(cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) {
- if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
- } /* end if */
- } /* end else */
+ if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
+ if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
} /* end if */
#endif
@@ -2243,13 +2229,8 @@ H5C_protect(H5F_t * f,
ring = H5CX_get_ring();
#ifdef H5_HAVE_PARALLEL
- if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
- coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE);
-
- /* If not explicitly disabled, get the cmdr setting from the API context */
- if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read)
- coll_access = H5CX_get_coll_metadata_read();
- } /* end if */
+ if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
+ coll_access = H5CX_get_coll_metadata_read();
#endif /* H5_HAVE_PARALLEL */
/* first check to see if the target is in cache */
@@ -2286,7 +2267,7 @@ H5C_protect(H5F_t * f,
the entry in their cache still have to participate in the
bcast. */
#ifdef H5_HAVE_PARALLEL
- if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI) && coll_access) {
+ if(coll_access) {
if(!(entry_ptr->is_dirty) && !(entry_ptr->coll_access)) {
MPI_Comm comm; /* File MPI Communicator */
int mpi_code; /* MPI error code */
@@ -2601,21 +2582,11 @@ H5C_protect(H5F_t * f,
} /* end if */
#ifdef H5_HAVE_PARALLEL
- if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
- /* Make sure the size of the collective entries in the cache remain in check */
- if(coll_access) {
- if(H5P_USER_TRUE == f->coll_md_read) {
- if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
- if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
- } /* end if */
- else {
- if(cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100)
- if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
- } /* end else */
- } /* end if */
- } /* end if */
+ /* Make sure the size of the collective entries in the cache remain in check */
+ if(coll_access)
+ if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
+ if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
#endif /* H5_HAVE_PARALLEL */
done:
diff --git a/src/H5CX.c b/src/H5CX.c
index 0d20132..f945df0 100644
--- a/src/H5CX.c
+++ b/src/H5CX.c
@@ -815,8 +815,8 @@ H5CX_set_apl(hid_t *acspl_id, const H5P_libclass_t *libclass,
#ifdef H5_HAVE_PARALLEL
/* If this routine is not guaranteed to be collective (i.e. it doesn't
- * modify the structural metadata in a file), check if we should use
- * a collective metadata read.
+ * modify the structural metadata in a file), check if the application
+ * specified a collective metadata read for just this operation.
*/
if(!is_collective) {
H5P_genplist_t *plist; /* Property list pointer */
diff --git a/src/H5Cimage.c b/src/H5Cimage.c
index bdab1a3..4684630 100644
--- a/src/H5Cimage.c
+++ b/src/H5Cimage.c
@@ -1065,9 +1065,9 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr)
H5C__UPDATE_STATS_FOR_CACHE_IMAGE_READ(cache_ptr)
#ifdef H5_HAVE_PARALLEL
- if ( aux_ptr ) {
+ if ( aux_ptr ) {
- /* Broadcast cache image */
+ /* Broadcast cache image */
if ( MPI_SUCCESS !=
(mpi_result = MPI_Bcast(cache_ptr->image_buffer,
(int)cache_ptr->image_len, MPI_BYTE,
diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c
index e165569..af8ff91 100644
--- a/src/H5Dchunk.c
+++ b/src/H5Dchunk.c
@@ -2914,9 +2914,6 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled,
/* Check for cached information */
if(!H5D__chunk_cinfo_cache_found(&dset->shared->cache.chunk.last, udata)) {
H5D_chk_idx_info_t idx_info; /* Chunked index info */
-#ifdef H5_HAVE_PARALLEL
- H5P_coll_md_read_flag_t temp_cmr; /* Temp value to hold the coll metadata read setting */
-#endif /* H5_HAVE_PARALLEL */
/* Compose chunked index info struct */
idx_info.f = dset->oloc.file;
@@ -2925,25 +2922,18 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled,
idx_info.storage = &dset->shared->layout.storage.u.chunk;
#ifdef H5_HAVE_PARALLEL
- if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) {
- /* disable collective metadata read for chunk indexes
- as it is highly unlikely that users would read the
- same chunks from all processes. MSC - might turn on
- for root node? */
- temp_cmr = H5F_COLL_MD_READ(idx_info.f);
- H5F_set_coll_md_read(idx_info.f, H5P_FORCE_FALSE);
- } /* end if */
+ /* Disable collective metadata read for chunk indexes as it is
+ * highly unlikely that users would read the same chunks from all
+ * processes.
+ */
+ if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI))
+ H5CX_set_coll_metadata_read(FALSE);
#endif /* H5_HAVE_PARALLEL */
/* Go get the chunk information */
if((dset->shared->layout.storage.u.chunk.ops->get_addr)(&idx_info, udata) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't query chunk address")
-#ifdef H5_HAVE_PARALLEL
- if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI))
- H5F_set_coll_md_read(idx_info.f, temp_cmr);
-#endif /* H5_HAVE_PARALLEL */
-
/*
* Cache the information retrieved.
*
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index f5da33d..0423006 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -800,6 +800,10 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
HDassert(type_info);
HDassert(fm);
+ /* Disable collective metadata reads for chunked dataset I/O operations
+ * in order to prevent potential hangs */
+ H5CX_set_coll_metadata_read(FALSE);
+
/* Check the optional property list for the collective chunk IO optimization option */
if(H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get chunk optimization option")
@@ -2313,7 +2317,7 @@ if(H5DEBUG(D))
/* Broadcasting the MPI_IO option info. and chunk address info. */
if(MPI_SUCCESS != (mpi_code = MPI_Bcast(total_chunk_addr_array, (int)(sizeof(haddr_t) * fm->layout->u.chunk.nchunks), MPI_BYTE, (int)0, io_info->comm)))
- HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code)
+ HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code)
} /* end if */
/* Start at first node in chunk skip list */
diff --git a/src/H5Z.c b/src/H5Z.c
index f41588f..0e2a7ba 100644
--- a/src/H5Z.c
+++ b/src/H5Z.c
@@ -611,7 +611,7 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void *key)
/* Sanity check for collectively calling H5Zunregister, if requested */
/* (Sanity check assumes that a barrier on one file's comm
* is sufficient (i.e. that there aren't different comms for
- * different files). -QAK, 2018/02/14
+ * different files). -QAK, 2018/02/14)
*/
if(H5_coll_api_sanity_check_g && !object->sanity_checked) {
MPI_Comm mpi_comm; /* File's communicator */
diff --git a/testpar/t_cache.c b/testpar/t_cache.c
index 7488728..41c95e2 100644
--- a/testpar/t_cache.c
+++ b/testpar/t_cache.c
@@ -7225,7 +7225,7 @@ smoke_check_6(int metadata_write_strategy)
/* some error occured in the server -- report failure */
nerrors++;
if ( verbose ) {
- HDfprintf(stdout, "%d:%s: server_main() failed.\n",
+ HDfprintf(stdout, "%d:%s: server_main() failed.\n",
world_mpi_rank, FUNC);
}
}
@@ -7241,7 +7241,7 @@ smoke_check_6(int metadata_write_strategy)
fid = -1;
cache_ptr = NULL;
if ( verbose ) {
- HDfprintf(stdout, "%d:%s: setup_cache_for_test() failed.\n",
+ HDfprintf(stdout, "%d:%s: setup_cache_for_test() failed.\n",
world_mpi_rank, FUNC);
}
}
@@ -7250,7 +7250,7 @@ smoke_check_6(int metadata_write_strategy)
virt_num_data_entries = NUM_DATA_ENTRIES;
/* insert the first half collectively */
- file_ptr->coll_md_read = H5P_USER_TRUE;
+ H5CX_set_coll_metadata_read(TRUE);
for ( i = 0; i < virt_num_data_entries/2; i++ )
{
struct datum * entry_ptr;
@@ -7271,7 +7271,7 @@ smoke_check_6(int metadata_write_strategy)
}
/* insert the other half independently */
- file_ptr->coll_md_read = H5P_USER_FALSE;
+ H5CX_set_coll_metadata_read(FALSE);
for ( i = virt_num_data_entries/2; i < virt_num_data_entries; i++ )
{
struct datum * entry_ptr;
@@ -7291,7 +7291,7 @@ smoke_check_6(int metadata_write_strategy)
HDassert(cache_ptr->max_cache_size*0.8 > cache_ptr->coll_list_size);
}
- /* flush the file */
+ /* flush the file */
if ( H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0 ) {
nerrors++;
if ( verbose ) {
@@ -7301,7 +7301,7 @@ smoke_check_6(int metadata_write_strategy)
}
/* Protect the first half of the entries collectively */
- file_ptr->coll_md_read = H5P_USER_TRUE;
+ H5CX_set_coll_metadata_read(TRUE);
for ( i = 0; i < (virt_num_data_entries / 2); i++ )
{
struct datum * entry_ptr;
@@ -7322,13 +7322,13 @@ smoke_check_6(int metadata_write_strategy)
}
/* protect the other half independently */
- file_ptr->coll_md_read = H5P_USER_FALSE;
+ H5CX_set_coll_metadata_read(FALSE);
for ( i = virt_num_data_entries/2; i < virt_num_data_entries; i++ )
{
struct datum * entry_ptr;
entry_ptr = &(data[i]);
- lock_entry(file_ptr, i);
+ lock_entry(file_ptr, i);
if(FALSE != entry_ptr->header.coll_access) {
nerrors++;
diff --git a/testpar/t_coll_md_read.c b/testpar/t_coll_md_read.c
index f945d2b..912388c 100644
--- a/testpar/t_coll_md_read.c
+++ b/testpar/t_coll_md_read.c
@@ -32,6 +32,14 @@
#define PARTIAL_NO_SELECTION_Y_DIM_SCALE 5
#define PARTIAL_NO_SELECTION_X_DIM_SCALE 5
+#define MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS 2
+
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_NO_SEL_PROCESS (mpi_rank == mpi_size - 1)
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE 20000
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE 1
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
+
/*
* A test for issue HDFFV-10501. A parallel hang was reported which occurred
* in linked-chunk I/O when collective metadata reads are enabled and some ranks
@@ -57,13 +65,13 @@ void test_partial_no_selection_coll_md_read(void)
hsize_t stride[PARTIAL_NO_SELECTION_DATASET_NDIMS];
hsize_t count[PARTIAL_NO_SELECTION_DATASET_NDIMS];
hsize_t block[PARTIAL_NO_SELECTION_DATASET_NDIMS];
- hid_t file_id = -1;
- hid_t fapl_id = -1;
- hid_t dset_id = -1;
- hid_t dcpl_id = -1;
- hid_t dxpl_id = -1;
- hid_t fspace_id = -1;
- hid_t mspace_id = -1;
+ hid_t file_id = H5I_INVALID_HID;
+ hid_t fapl_id = H5I_INVALID_HID;
+ hid_t dset_id = H5I_INVALID_HID;
+ hid_t dcpl_id = H5I_INVALID_HID;
+ hid_t dxpl_id = H5I_INVALID_HID;
+ hid_t fspace_id = H5I_INVALID_HID;
+ hid_t mspace_id = H5I_INVALID_HID;
int mpi_rank, mpi_size;
void *data = NULL;
void *read_buf = NULL;
@@ -86,7 +94,7 @@ void test_partial_no_selection_coll_md_read(void)
file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
VRFY((file_id >= 0), "H5Fcreate succeeded");
- dataset_dims = malloc(PARTIAL_NO_SELECTION_DATASET_NDIMS * sizeof(*dataset_dims));
+ dataset_dims = HDmalloc(PARTIAL_NO_SELECTION_DATASET_NDIMS * sizeof(*dataset_dims));
VRFY((dataset_dims != NULL), "malloc succeeded");
dataset_dims[0] = PARTIAL_NO_SELECTION_Y_DIM_SCALE * mpi_size;
@@ -129,7 +137,7 @@ void test_partial_no_selection_coll_md_read(void)
mspace_id = H5Screate_simple(1, sel_dims, NULL);
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
- data = calloc(1, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int));
+ data = HDcalloc(1, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int));
VRFY((data != NULL), "calloc succeeded");
dxpl_id = H5Pcreate(H5P_DATASET_XFER);
@@ -151,7 +159,7 @@ void test_partial_no_selection_coll_md_read(void)
*/
VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded");
- read_buf = malloc(count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int));
+ read_buf = HDmalloc(count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int));
VRFY((read_buf != NULL), "malloc succeeded");
/*
@@ -171,21 +179,321 @@ void test_partial_no_selection_coll_md_read(void)
* Check data integrity just to be sure.
*/
if (!PARTIAL_NO_SELECTION_NO_SEL_PROCESS) {
- VRFY((!memcmp(data, read_buf, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int))), "memcmp succeeded");
+ VRFY((!HDmemcmp(data, read_buf, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int))), "memcmp succeeded");
+ }
+
+ if (dataset_dims) {
+ HDfree(dataset_dims);
+ dataset_dims = NULL;
}
+ if (data) {
+ HDfree(data);
+ data = NULL;
+ }
+
+ if (read_buf) {
+ HDfree(read_buf);
+ read_buf = NULL;
+ }
+
+ VRFY((H5Sclose(fspace_id) >= 0), "H5Sclose succeeded");
+ VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");
+ VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded");
+ VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
+}
+
+/*
+ * A test for HDFFV-10562 which attempts to verify that using multi-chunk
+ * I/O with collective metadata reads enabled doesn't causes issues due to
+ * collective metadata reads being made only by process 0 in H5D__chunk_addrmap().
+ *
+ * Failure in this test may either cause a hang, or, due to how the MPI calls
+ * pertaining to this issue might mistakenly match up, may cause an MPI error
+ * message similar to:
+ *
+ * #008: H5Dmpio.c line 2546 in H5D__obtain_mpio_mode(): MPI_BCast failed
+ * major: Internal error (too specific to document in detail)
+ * minor: Some MPI function failed
+ * #009: H5Dmpio.c line 2546 in H5D__obtain_mpio_mode(): Message truncated, error stack:
+ *PMPI_Bcast(1600)..................: MPI_Bcast(buf=0x1df98e0, count=18, MPI_BYTE, root=0, comm=0x84000006) failed
+ *MPIR_Bcast_impl(1452).............:
+ *MPIR_Bcast(1476)..................:
+ *MPIR_Bcast_intra(1249)............:
+ *MPIR_SMP_Bcast(1088)..............:
+ *MPIR_Bcast_binomial(239)..........:
+ *MPIDI_CH3U_Receive_data_found(131): Message from rank 0 and tag 2 truncated; 2616 bytes received but buffer size is 18
+ * major: Internal error (too specific to document in detail)
+ * minor: MPI Error String
+ *
+ */
+void test_multi_chunk_io_addrmap_issue(void)
+{
+ const char *filename;
+ hsize_t start[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS];
+ hsize_t stride[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS];
+ hsize_t count[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS];
+ hsize_t block[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS];
+ hsize_t dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {10, 5};
+ hsize_t chunk_dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {5, 5};
+ hsize_t max_dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {H5S_UNLIMITED, H5S_UNLIMITED};
+ hid_t file_id = H5I_INVALID_HID;
+ hid_t fapl_id = H5I_INVALID_HID;
+ hid_t dset_id = H5I_INVALID_HID;
+ hid_t dcpl_id = H5I_INVALID_HID;
+ hid_t dxpl_id = H5I_INVALID_HID;
+ hid_t space_id = H5I_INVALID_HID;
+ void *read_buf = NULL;
+ int mpi_rank;
+ int data[5][5] = { {0, 1, 2, 3, 4},
+ {0, 1, 2, 3, 4},
+ {0, 1, 2, 3, 4},
+ {0, 1, 2, 3, 4},
+ {0, 1, 2, 3, 4} };
+
+ MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+ filename = GetTestParameters();
+
+ fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type);
+ VRFY((fapl_id >= 0), "create_faccess_plist succeeded");
+
+ /*
+ * Even though the testphdf5 framework currently sets collective metadata reads
+ * on the FAPL, we call it here just to be sure this is futureproof, since
+ * demonstrating this issue relies upon it.
+ */
+ VRFY((H5Pset_all_coll_metadata_ops(fapl_id, true) >= 0), "Set collective metadata reads succeeded");
+
+ file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+ VRFY((file_id >= 0), "H5Fcreate succeeded");
+
+ space_id = H5Screate_simple(MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS, dims, max_dims);
+ VRFY((space_id >= 0), "H5Screate_simple succeeded");
+
+ dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
+ VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
+
+ VRFY((H5Pset_chunk(dcpl_id, MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded");
+
+ dset_id = H5Dcreate2(file_id, "dset", H5T_NATIVE_INT, space_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+ VRFY((dset_id >= 0), "H5Dcreate2 succeeded");
+
+ dxpl_id = H5Pcreate(H5P_DATASET_XFER);
+ VRFY((dxpl_id >= 0), "H5Pcreate succeeded");
+
+ VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded");
+ VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_MULTI_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded");
+
+ start[1] = 0;
+ stride[0] = stride[1] = 1;
+ count[0] = count[1] = 5;
+ block[0] = block[1] = 1;
+
+ if (mpi_rank == 0)
+ start[0] = 0;
+ else
+ start[0] = 5;
+
+ VRFY((H5Sselect_hyperslab(space_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded");
+ if (mpi_rank != 0)
+ VRFY((H5Sselect_none(space_id) >= 0), "H5Sselect_none succeeded");
+
+ VRFY((H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, space_id, dxpl_id, data) >= 0), "H5Dwrite succeeded");
+
+ VRFY((H5Fflush(file_id, H5F_SCOPE_GLOBAL) >= 0), "H5Fflush succeeded");
+
+ read_buf = HDmalloc(50 * sizeof(int));
+ VRFY((read_buf != NULL), "malloc succeeded");
+
+ VRFY((H5Dread(dset_id, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, dxpl_id, read_buf) >= 0), "H5Dread succeeded");
+
+ if (read_buf) {
+ HDfree(read_buf);
+ read_buf = NULL;
+ }
+
+ VRFY((H5Sclose(space_id) >= 0), "H5Sclose succeeded");
+ VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded");
+ VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
+ VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
+}
+
+/*
+ * A test for HDFFV-10562 which attempts to verify that using linked-chunk
+ * I/O with collective metadata reads enabled doesn't cause issues due to
+ * collective metadata reads being made only by process 0 in H5D__sort_chunk().
+ *
+ * NOTE: Due to the way that the threshold value which pertains to this test
+ * is currently calculated within HDF5, there are several conditions that this
+ * test must maintain. Refer to the function H5D__sort_chunk in H5Dmpio.c for
+ * a better idea of why.
+ *
+ * Condition 1: We need to make sure that the test always selects every single
+ * chunk in the dataset. It is fine if the selection is split up among multiple
+ * ranks, but their combined selection must cover the whole dataset.
+ *
+ * Condition 2: The number of chunks in the dataset divided by the number of MPI
+ * ranks must exceed or equal 10000. In other words, each MPI rank must be
+ * responsible for 10000 or more unique chunks.
+ *
+ * Condition 3: This test will currently only be reliably reproducable for 2 or 3
+ * MPI ranks. The threshold value calculated reduces to a constant 100 / mpi_size,
+ * and is compared against a default value of 30%.
+ *
+ * Failure in this test may either cause a hang, or, due to how the MPI calls
+ * pertaining to this issue might mistakenly match up, may cause an MPI error
+ * message similar to:
+ *
+ * #008: H5Dmpio.c line 2338 in H5D__sort_chunk(): MPI_BCast failed
+ * major: Internal error (too specific to document in detail)
+ * minor: Some MPI function failed
+ * #009: H5Dmpio.c line 2338 in H5D__sort_chunk(): Other MPI error, error stack:
+ *PMPI_Bcast(1600)........: MPI_Bcast(buf=0x7eae610, count=320000, MPI_BYTE, root=0, comm=0x84000006) failed
+ *MPIR_Bcast_impl(1452)...:
+ *MPIR_Bcast(1476)........:
+ *MPIR_Bcast_intra(1249)..:
+ *MPIR_SMP_Bcast(1088)....:
+ *MPIR_Bcast_binomial(250): message sizes do not match across processes in the collective routine: Received 2096 but expected 320000
+ * major: Internal error (too specific to document in detail)
+ * minor: MPI Error String
+ */
+void test_link_chunk_io_sort_chunk_issue(void)
+{
+ const char *filename;
+ hsize_t *dataset_dims = NULL;
+ hsize_t max_dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+ hsize_t sel_dims[1];
+ hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS] = { LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS };
+ hsize_t start[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+ hsize_t stride[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+ hsize_t count[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+ hsize_t block[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+ hid_t file_id = H5I_INVALID_HID;
+ hid_t fapl_id = H5I_INVALID_HID;
+ hid_t dset_id = H5I_INVALID_HID;
+ hid_t dcpl_id = H5I_INVALID_HID;
+ hid_t dxpl_id = H5I_INVALID_HID;
+ hid_t fspace_id = H5I_INVALID_HID;
+ hid_t mspace_id = H5I_INVALID_HID;
+ int mpi_rank, mpi_size;
+ void *data = NULL;
+ void *read_buf = NULL;
+
+ MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
+
+ filename = GetTestParameters();
+
+ fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type);
+ VRFY((fapl_id >= 0), "create_faccess_plist succeeded");
+
+ /*
+ * Even though the testphdf5 framework currently sets collective metadata reads
+ * on the FAPL, we call it here just to be sure this is futureproof, since
+ * demonstrating this issue relies upon it.
+ */
+ VRFY((H5Pset_all_coll_metadata_ops(fapl_id, true) >= 0), "Set collective metadata reads succeeded");
+
+ file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+ VRFY((file_id >= 0), "H5Fcreate succeeded");
+
+ dataset_dims = HDmalloc(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS * sizeof(*dataset_dims));
+ VRFY((dataset_dims != NULL), "malloc succeeded");
+
+ dataset_dims[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE * mpi_size * LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE;
+ max_dataset_dims[0] = H5S_UNLIMITED;
+
+ fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, max_dataset_dims);
+ VRFY((fspace_id >= 0), "H5Screate_simple succeeded");
+
+ /*
+ * Set up chunking on the dataset in order to reproduce the problem.
+ */
+ dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
+ VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
+
+ VRFY((H5Pset_chunk(dcpl_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded");
+
+ dset_id = H5Dcreate2(file_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME, H5T_NATIVE_INT, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
+ VRFY((dset_id >= 0), "H5Dcreate2 succeeded");
+
+ /*
+ * Setup hyperslab selection to split the dataset among the ranks.
+ *
+ * The ranks will write rows across the dataset.
+ */
+ stride[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
+ count[0] = (dataset_dims[0] / LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) / mpi_size;
+ start[0] = count[0] * mpi_rank;
+ block[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
+
+ VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded");
+
+ sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
+
+ mspace_id = H5Screate_simple(1, sel_dims, NULL);
+ VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
+
+ data = HDcalloc(1, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
+ VRFY((data != NULL), "calloc succeeded");
+
+ dxpl_id = H5Pcreate(H5P_DATASET_XFER);
+ VRFY((dxpl_id >= 0), "H5Pcreate succeeded");
+
+ /*
+ * Enable collective access for the data transfer.
+ */
+ VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded");
+
+ VRFY((H5Dwrite(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, data) >= 0), "H5Dwrite succeeded");
+
+ VRFY((H5Fflush(file_id, H5F_SCOPE_GLOBAL) >= 0), "H5Fflush succeeded");
+
+ /*
+ * Ensure that linked-chunk I/O is performed since this is
+ * the particular code path where the issue lies and we don't
+ * want the library doing multi-chunk I/O behind our backs.
+ */
+ VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded");
+
+ read_buf = HDmalloc(count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
+ VRFY((read_buf != NULL), "malloc succeeded");
+
+ VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded");
+
+ sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
+
+ VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");
+
+ mspace_id = H5Screate_simple(1, sel_dims, NULL);
+ VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
+
+ read_buf = HDrealloc(read_buf, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
+ VRFY((read_buf != NULL), "realloc succeeded");
+
+ /*
+ * Finally have each rank read their section of data back from the dataset.
+ */
+ VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0), "H5Dread succeeded");
+
if (dataset_dims) {
- free(dataset_dims);
+ HDfree(dataset_dims);
dataset_dims = NULL;
}
if (data) {
- free(data);
+ HDfree(data);
data = NULL;
}
if (read_buf) {
- free(read_buf);
+ HDfree(read_buf);
read_buf = NULL;
}
diff --git a/testpar/testphdf5.c b/testpar/testphdf5.c
index 69b66ae..999e17a 100644
--- a/testpar/testphdf5.c
+++ b/testpar/testphdf5.c
@@ -549,7 +549,10 @@ int main(int argc, char **argv)
AddTest("noselcollmdread", test_partial_no_selection_coll_md_read, NULL,
"Collective Metadata read with some ranks having no selection", PARATESTFILE);
-
+ AddTest("MC coll MD read", test_multi_chunk_io_addrmap_issue, NULL,
+ "Collective MD read with multi chunk I/O (H5D__chunk_addrmap)", PARATESTFILE);
+ AddTest("LC coll MD read", test_link_chunk_io_sort_chunk_issue, NULL,
+ "Collective MD read with link chunk I/O (H5D__sort_chunk)", PARATESTFILE);
/* Display testing information */
TestInfo(argv[0]);
diff --git a/testpar/testphdf5.h b/testpar/testphdf5.h
index 176574e..4409221 100644
--- a/testpar/testphdf5.h
+++ b/testpar/testphdf5.h
@@ -295,6 +295,8 @@ void compress_readAll(void);
#endif /* H5_HAVE_FILTER_DEFLATE */
void test_dense_attr(void);
void test_partial_no_selection_coll_md_read(void);
+void test_multi_chunk_io_addrmap_issue(void);
+void test_link_chunk_io_sort_chunk_issue(void);
/* commonly used prototypes */
hid_t create_faccess_plist(MPI_Comm comm, MPI_Info info, int l_facc_type);