summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjhendersonHDF <jhenderson@hdfgroup.org>2022-01-29 20:22:30 (GMT)
committerGitHub <noreply@github.com>2022-01-29 20:22:30 (GMT)
commit86ef00fd67d4f5007ff8e3390631009ad75891a5 (patch)
tree864ef4983785fd53728c15742c1ddce5b539551e
parentbcf95655354ab9c92f823656ee1722fd91694305 (diff)
downloadhdf5-86ef00fd67d4f5007ff8e3390631009ad75891a5.zip
hdf5-86ef00fd67d4f5007ff8e3390631009ad75891a5.tar.gz
hdf5-86ef00fd67d4f5007ff8e3390631009ad75891a5.tar.bz2
Unify handling of collective metadata reads status (#1206)
-rw-r--r--release_docs/RELEASE.txt21
-rw-r--r--src/H5C.c39
-rw-r--r--src/H5CX.c2
-rw-r--r--src/H5Dchunk.c17
-rw-r--r--src/H5Dmpio.c109
-rw-r--r--src/H5Fmpi.c132
-rw-r--r--src/H5Fprivate.h2
-rw-r--r--src/H5Pfapl.c24
-rw-r--r--src/H5Z.c14
-rw-r--r--testpar/t_cache.c40
-rw-r--r--testpar/t_coll_md_read.c93
11 files changed, 357 insertions, 136 deletions
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index bba27c9..ece00cc 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -1083,6 +1083,27 @@ Bug Fixes since HDF5-1.12.0 release
(DER - 2021/11/23, HDFFV-11286)
+ - Unified handling of collective metadata reads to correctly fix old bugs
+
+ Due to MPI-related issues occurring in HDF5 from mismanagement of the
+ status of collective metadata reads, they were forced to be disabled
+ during chunked dataset raw data I/O in the HDF5 1.10.5 release. This
+ wouldn't generally have affected application performance because HDF5
+ already disables collective metadata reads during chunk lookup, since
+ it is generally unlikely that the same chunks will be read by all MPI
+ ranks in the I/O operation. However, this was only a partial solution
+ that wasn't granular enough.
+
+ This change now unifies the handling of the file-global flag and the
+ API context-level flag for collective metadata reads in order to
+ simplify querying of the true status of collective metadata reads. Thus,
+ collective metadata reads are once again enabled for chunked dataset
+ raw data I/O, but manually controlled at places where some processing
+ occurs on MPI rank 0 only and would cause issues when collective
+ metadata reads are enabled.
+
+ (JTH - 2021/11/16, HDFFV-10501/HDFFV-10562)
+
- Fixed several potential MPI deadlocks in library failure conditions
In the parallel library, there were several places where MPI rank 0
diff --git a/src/H5C.c b/src/H5C.c
index 889351d..22df59c 100644
--- a/src/H5C.c
+++ b/src/H5C.c
@@ -1518,17 +1518,26 @@ H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *thing, u
#ifdef H5_HAVE_PARALLEL
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
- coll_access = H5CX_get_coll_metadata_read();
+ coll_access = H5F_get_coll_metadata_reads(f);
entry_ptr->coll_access = coll_access;
if (coll_access) {
H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL)
/* Make sure the size of the collective entries in the cache remain in check */
- if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
- if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
- } /* end if */
+ if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) {
+ if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) {
+ if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
+ } /* end if */
+ } /* end if */
+ else {
+ if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) {
+ if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
+ } /* end if */
+ } /* end else */
+ } /* end if */
#endif
done:
@@ -2248,7 +2257,7 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
#ifdef H5_HAVE_PARALLEL
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
- coll_access = H5CX_get_coll_metadata_read();
+ coll_access = H5F_get_coll_metadata_reads(f);
#endif /* H5_HAVE_PARALLEL */
/* first check to see if the target is in cache */
@@ -2600,11 +2609,19 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
#ifdef H5_HAVE_PARALLEL
/* Make sure the size of the collective entries in the cache remain in check */
- if (coll_access)
- if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
- if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
-#endif /* H5_HAVE_PARALLEL */
+ if (coll_access) {
+ if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) {
+ if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
+ if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
+ } /* end if */
+ else {
+ if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100)
+ if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
+ } /* end else */
+ } /* end if */
+#endif /* H5_HAVE_PARALLEL */
done:
#if H5C_DO_EXTREME_SANITY_CHECKS
diff --git a/src/H5CX.c b/src/H5CX.c
index 89e4c91..c5bb8e4 100644
--- a/src/H5CX.c
+++ b/src/H5CX.c
@@ -1423,7 +1423,7 @@ done:
* Purpose: Sanity checks and sets up collective operations.
*
* Note: Should be called for all API routines that modify file
- * file metadata but don't pass in an access property list.
+ * metadata but don't pass in an access property list.
*
* Return: Non-negative on success / Negative on failure
*
diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c
index 5d7c1b2..1b0e579 100644
--- a/src/H5Dchunk.c
+++ b/src/H5Dchunk.c
@@ -3178,7 +3178,9 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
unsigned idx = 0; /* Index of chunk in cache, if present */
hbool_t found = FALSE; /* In cache? */
#ifdef H5_HAVE_PARALLEL
- hbool_t reenable_coll_md_reads = FALSE;
+ H5P_coll_md_read_flag_t md_reads_file_flag;
+ hbool_t md_reads_context_flag;
+ hbool_t restore_md_reads_state = FALSE;
#endif
herr_t ret_value = SUCCEED; /* Return value */
@@ -3252,11 +3254,10 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
* processes.
*/
if (H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) {
- hbool_t do_coll_md_reads = H5CX_get_coll_metadata_read();
- if (do_coll_md_reads) {
- H5CX_set_coll_metadata_read(FALSE);
- reenable_coll_md_reads = TRUE;
- }
+ md_reads_file_flag = H5P_FORCE_FALSE;
+ md_reads_context_flag = FALSE;
+ H5F_set_coll_metadata_reads(idx_info.f, &md_reads_file_flag, &md_reads_context_flag);
+ restore_md_reads_state = TRUE;
}
#endif /* H5_HAVE_PARALLEL */
@@ -3302,8 +3303,8 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
done:
#ifdef H5_HAVE_PARALLEL
/* Re-enable collective metadata reads if we disabled them */
- if (reenable_coll_md_reads)
- H5CX_set_coll_metadata_read(TRUE);
+ if (restore_md_reads_state)
+ H5F_set_coll_metadata_reads(dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
#endif /* H5_HAVE_PARALLEL */
FUNC_LEAVE_NOAPI(ret_value)
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index cf8a6ef..ce790c4 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -808,10 +808,6 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
HDassert(type_info);
HDassert(fm);
- /* Disable collective metadata reads for chunked dataset I/O operations
- * in order to prevent potential hangs */
- H5CX_set_coll_metadata_read(FALSE);
-
/* Check the optional property list for the collective chunk IO optimization option */
if (H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get chunk optimization option")
@@ -2306,17 +2302,20 @@ static herr_t
H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk)
{
- H5SL_node_t * chunk_node; /* Current node in chunk skip list */
- H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
- haddr_t chunk_addr; /* Current chunking address of this node */
- haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
- hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
- int bsearch_coll_chunk_threshold;
- int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
- int mpi_size; /* Number of MPI processes */
- int mpi_code; /* MPI return code */
- int i; /* Local index variable */
- herr_t ret_value = SUCCEED; /* Return value */
+ H5SL_node_t * chunk_node; /* Current node in chunk skip list */
+ H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
+ haddr_t chunk_addr; /* Current chunking address of this node */
+ haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
+ H5P_coll_md_read_flag_t md_reads_file_flag;
+ hbool_t md_reads_context_flag;
+ hbool_t restore_md_reads_state = FALSE;
+ hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
+ int bsearch_coll_chunk_threshold;
+ int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
+ int mpi_size; /* Number of MPI processes */
+ int mpi_code; /* MPI return code */
+ int i; /* Local index variable */
+ herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
@@ -2360,7 +2359,32 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
if (mpi_rank == 0) {
- if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) {
+ herr_t result;
+
+ /*
+ * If enabled, disable collective metadata reads here.
+ * Since the chunk address mapping is done on rank 0
+ * only here, it will cause problems if collective
+ * metadata reads are enabled.
+ */
+ if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
+ md_reads_file_flag = H5P_FORCE_FALSE;
+ md_reads_context_flag = FALSE;
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
+ &md_reads_context_flag);
+ restore_md_reads_state = TRUE;
+ }
+
+ result = H5D__chunk_addrmap(io_info, total_chunk_addr_array);
+
+ /* Ensure that we restore the old collective metadata reads state */
+ if (restore_md_reads_state) {
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
+ &md_reads_context_flag);
+ restore_md_reads_state = FALSE;
+ }
+
+ if (result < 0) {
size_t u;
/* Clear total chunk address array */
@@ -2424,6 +2448,10 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
} /* end if */
done:
+ /* Re-enable collective metadata reads if we disabled them */
+ if (restore_md_reads_state)
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
+
if (total_chunk_addr_array)
H5MM_xfree(total_chunk_addr_array);
@@ -2471,20 +2499,23 @@ static herr_t
H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assign_io_mode[],
haddr_t chunk_addr[])
{
- size_t total_chunks;
- unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk;
- uint8_t * io_mode_info = NULL;
- uint8_t * recv_io_mode_info = NULL;
- uint8_t * mergebuf = NULL;
- uint8_t * tempbuf;
- H5SL_node_t * chunk_node;
- H5D_chunk_info_t *chunk_info;
- int mpi_size, mpi_rank;
- MPI_Comm comm;
- int root;
- size_t ic;
- int mpi_code;
- herr_t ret_value = SUCCEED;
+ size_t total_chunks;
+ unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk;
+ uint8_t * io_mode_info = NULL;
+ uint8_t * recv_io_mode_info = NULL;
+ uint8_t * mergebuf = NULL;
+ uint8_t * tempbuf;
+ H5SL_node_t * chunk_node;
+ H5D_chunk_info_t * chunk_info;
+ H5P_coll_md_read_flag_t md_reads_file_flag;
+ hbool_t md_reads_context_flag;
+ hbool_t restore_md_reads_state = FALSE;
+ int mpi_size, mpi_rank;
+ MPI_Comm comm;
+ int root;
+ size_t ic;
+ int mpi_code;
+ herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -2544,6 +2575,20 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
size_t nproc;
unsigned *nproc_per_chunk;
+ /*
+ * If enabled, disable collective metadata reads here.
+ * Since the chunk address mapping is done on rank 0
+ * only here, it will cause problems if collective
+ * metadata reads are enabled.
+ */
+ if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
+ md_reads_file_flag = H5P_FORCE_FALSE;
+ md_reads_context_flag = FALSE;
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
+ &md_reads_context_flag);
+ restore_md_reads_state = TRUE;
+ }
+
/* pre-computing: calculate number of processes and
regularity of the selection occupied in each chunk */
if (NULL == (nproc_per_chunk = (unsigned *)H5MM_calloc(total_chunks * sizeof(unsigned))))
@@ -2610,6 +2655,10 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
#endif
done:
+ /* Re-enable collective metadata reads if we disabled them */
+ if (restore_md_reads_state)
+ H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
+
if (io_mode_info)
H5MM_free(io_mode_info);
if (mergebuf)
diff --git a/src/H5Fmpi.c b/src/H5Fmpi.c
index 53d2d78..78290c6 100644
--- a/src/H5Fmpi.c
+++ b/src/H5Fmpi.c
@@ -31,11 +31,12 @@
/***********/
/* Headers */
/***********/
-#include "H5private.h" /* Generic Functions */
-#include "H5Eprivate.h" /* Error handling */
-#include "H5Fpkg.h" /* File access */
-#include "H5FDprivate.h" /* File drivers */
-#include "H5Iprivate.h" /* IDs */
+#include "H5private.h" /* Generic Functions */
+#include "H5CXprivate.h" /* API Contexts */
+#include "H5Eprivate.h" /* Error handling */
+#include "H5Fpkg.h" /* File access */
+#include "H5FDprivate.h" /* File drivers */
+#include "H5Iprivate.h" /* IDs */
#include "H5VLnative_private.h" /* Native VOL connector */
@@ -402,4 +403,125 @@ H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm)
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_mpi_retrieve_comm */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_get_coll_metadata_reads
+ *
+ * Purpose: Determines whether collective metadata reads should be
+ * performed. This routine is meant to be the single source of
+ * truth for the collective metadata reads status, as it
+ * coordinates between the file-global flag and the flag set
+ * for the current operation in the current API context.
+ *
+ * Return: TRUE/FALSE (can't fail)
+ *
+ *-------------------------------------------------------------------------
+ */
+hbool_t
+H5F_get_coll_metadata_reads(const H5F_t *file)
+{
+ H5P_coll_md_read_flag_t file_flag = H5P_USER_FALSE;
+ hbool_t ret_value = FALSE;
+
+ FUNC_ENTER_NOAPI_NOERR
+
+ HDassert(file && file->shared);
+
+ /* Retrieve the file-global flag */
+ file_flag = H5F_COLL_MD_READ(file);
+
+ /* If file flag is set to H5P_FORCE_FALSE, exit early
+ * with FALSE, since collective metadata reads have
+ * been explicitly disabled somewhere in the library.
+ */
+ if (H5P_FORCE_FALSE == file_flag)
+ ret_value = FALSE;
+ else {
+ /* If file flag is set to H5P_USER_TRUE, ignore
+ * any settings in the API context. A file-global
+ * setting of H5P_USER_TRUE for collective metadata
+ * reads should ignore any settings on an Access
+ * Property List for an individual operation.
+ */
+ if (H5P_USER_TRUE == file_flag)
+ ret_value = TRUE;
+ else {
+ /* Get the collective metadata reads flag from
+ * the current API context.
+ */
+ ret_value = H5CX_get_coll_metadata_read();
+ }
+ }
+
+ FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5F_get_coll_metadata_reads() */
+
+/*-------------------------------------------------------------------------
+ * Function: H5F_set_coll_metadata_reads
+ *
+ * Purpose: Used to temporarily modify the collective metadata reads
+ * status. This is useful for cases where either:
+ *
+ * * Collective metadata reads are enabled, but need to be
+ * disabled for an operation about to occur that may trigger
+ * an independent metadata read (such as only rank 0 doing
+ * something)
+ *
+ * * Metadata reads are currently independent, but it is
+ * guaranteed that the application has maintained
+ * collectivity at the interface level (e.g., an operation
+ * that modifies metadata is being performed). In this case,
+ * it should be safe to enable collective metadata reads,
+ * barring any internal library issues that may occur
+ *
+ * After completion, the `file_flag` parameter will be set to
+ * the previous value of the file-global collective metadata
+ * reads flag. The `context_flag` parameter will be set to the
+ * previous value of the API context's collective metadata
+ * reads flag. Another call to this routine should be made to
+ * restore these values (see below warning).
+ *
+ * !! WARNING !!
+ * It is dangerous to modify the collective metadata reads
+ * status, as this can cause crashes, hangs and corruption in
+ * the HDF5 file when improperly done. Therefore, the
+ * `file_flag` and `context_flag` parameters are both
+ * mandatory, and it is assumed that the caller will guarantee
+ * these settings are restored with another call to this
+ * routine once the bracketed operation is complete.
+ * !! WARNING !!
+ *
+ * Return: Nothing
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+H5F_set_coll_metadata_reads(H5F_t *file, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag)
+{
+ H5P_coll_md_read_flag_t prev_file_flag = H5P_USER_FALSE;
+ hbool_t prev_context_flag = FALSE;
+
+ FUNC_ENTER_NOAPI_NOERR
+
+ HDassert(file && file->shared);
+ HDassert(file_flag);
+ HDassert(context_flag);
+
+ /* Save old state */
+ prev_file_flag = H5F_COLL_MD_READ(file);
+ prev_context_flag = H5CX_get_coll_metadata_read();
+
+ /* Set new desired state */
+ if (prev_file_flag != *file_flag) {
+ file->shared->coll_md_read = *file_flag;
+ *file_flag = prev_file_flag;
+ }
+ if (prev_context_flag != *context_flag) {
+ H5CX_set_coll_metadata_read(*context_flag);
+ *context_flag = prev_context_flag;
+ }
+
+ FUNC_LEAVE_NOAPI_VOID
+} /* end H5F_set_coll_metadata_reads() */
+
#endif /* H5_HAVE_PARALLEL */
diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h
index a5ccbab..af65c9d 100644
--- a/src/H5Fprivate.h
+++ b/src/H5Fprivate.h
@@ -962,6 +962,8 @@ H5_DLL MPI_Comm H5F_mpi_get_comm(const H5F_t *f);
H5_DLL int H5F_shared_mpi_get_size(const H5F_shared_t *f_sh);
H5_DLL int H5F_mpi_get_size(const H5F_t *f);
H5_DLL herr_t H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm);
+H5_DLL hbool_t H5F_get_coll_metadata_reads(const H5F_t *f);
+H5_DLL void H5F_set_coll_metadata_reads(H5F_t *f, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag);
#endif /* H5_HAVE_PARALLEL */
/* External file cache routines */
diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c
index 47c17db..2c3caa8 100644
--- a/src/H5Pfapl.c
+++ b/src/H5Pfapl.c
@@ -5284,15 +5284,14 @@ H5P__decode_coll_md_read_flag_t(const void **_pp, void *_value)
* Function: H5Pset_all_coll_metadata_ops
*
* Purpose: Tell the library whether the metadata read operations will
- * be done collectively (1) or not (0). Default is independent.
- * With collective mode, the library will optimize access to
- * metadata operations on the file.
+ * be done collectively (1) or not (0). Default is independent.
+ * With collective mode, the library will optimize access to
+ * metadata operations on the file.
*
* Note: This routine accepts file access property lists, link
- * access property lists, attribute access property lists,
- * dataset access property lists, group access property lists,
- * named datatype access property lists,
- * and dataset transfer property lists.
+ * access property lists, attribute access property lists,
+ * dataset access property lists, group access property lists
+ * and named datatype access property lists.
*
* Return: Non-negative on success/Negative on failure
*
@@ -5312,7 +5311,7 @@ H5Pset_all_coll_metadata_ops(hid_t plist_id, hbool_t is_collective)
H5TRACE2("e", "ib", plist_id, is_collective);
/* Compare the property list's class against the other class */
- /* (Dataset, group, attribute, and named datype access property lists
+ /* (Dataset, group, attribute, and named datatype access property lists
* are sub-classes of link access property lists -QAK)
*/
if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS))
@@ -5342,10 +5341,9 @@ done:
* Purpose: Gets information about collective metadata read mode.
*
* Note: This routine accepts file access property lists, link
- * access property lists, attribute access property lists,
- * dataset access property lists, group access property lists,
- * named datatype access property lists,
- * and dataset transfer property lists.
+ * access property lists, attribute access property lists,
+ * dataset access property lists, group access property lists,
+ * and named datatype access property lists.
*
* Return: Non-negative on success/Negative on failure
*
@@ -5363,7 +5361,7 @@ H5Pget_all_coll_metadata_ops(hid_t plist_id, hbool_t *is_collective /*out*/)
H5TRACE2("e", "ix", plist_id, is_collective);
/* Compare the property list's class against the other class */
- /* (Dataset, group, attribute, and named datype access property lists
+ /* (Dataset, group, attribute, and named datatype access property lists
* are sub-classes of link access property lists -QAK)
*/
if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS))
diff --git a/src/H5Z.c b/src/H5Z.c
index bcdd837..763eac2 100644
--- a/src/H5Z.c
+++ b/src/H5Z.c
@@ -594,14 +594,9 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA
/* Do a global flush if the file is opened for write */
if (H5F_ACC_RDWR & H5F_INTENT(f)) {
-/* When parallel HDF5 is defined, check for collective metadata reads on this
- * file and set the flag for metadata I/O in the API context. -QAK, 2018/02/14
- */
#ifdef H5_HAVE_PARALLEL
/* Check if MPIO driver is used */
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
- H5P_coll_md_read_flag_t coll_md_read; /* Do all metadata reads collectively */
-
/* Sanity check for collectively calling H5Zunregister, if requested */
/* (Sanity check assumes that a barrier on one file's comm
* is sufficient (i.e. that there aren't different comms for
@@ -621,13 +616,8 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA
/* Set the "sanity checked" flag */
object->sanity_checked = TRUE;
} /* end if */
-
- /* Check whether to use the collective metadata read DXPL */
- coll_md_read = H5F_COLL_MD_READ(f);
- if (H5P_USER_TRUE == coll_md_read)
- H5CX_set_coll_metadata_read(TRUE);
- } /* end if */
-#endif /* H5_HAVE_PARALLEL */
+ } /* end if */
+#endif /* H5_HAVE_PARALLEL */
/* Call the flush routine for mounted file hierarchies */
if (H5F_flush_mounts((H5F_t *)obj_ptr) < 0)
diff --git a/testpar/t_cache.c b/testpar/t_cache.c
index df34560..70ada01 100644
--- a/testpar/t_cache.c
+++ b/testpar/t_cache.c
@@ -6623,13 +6623,15 @@ trace_file_check(int metadata_write_strategy)
static hbool_t
smoke_check_6(int metadata_write_strategy)
{
- hbool_t success = TRUE;
- int i;
- int max_nerrors;
- hid_t fid = -1;
- H5F_t * file_ptr = NULL;
- H5C_t * cache_ptr = NULL;
- struct mssg_t mssg;
+ H5P_coll_md_read_flag_t md_reads_file_flag;
+ hbool_t md_reads_context_flag;
+ hbool_t success = TRUE;
+ int i;
+ int max_nerrors;
+ hid_t fid = -1;
+ H5F_t * file_ptr = NULL;
+ H5C_t * cache_ptr = NULL;
+ struct mssg_t mssg;
switch (metadata_write_strategy) {
@@ -6685,7 +6687,9 @@ smoke_check_6(int metadata_write_strategy)
virt_num_data_entries = NUM_DATA_ENTRIES;
/* insert the first half collectively */
- H5CX_set_coll_metadata_read(TRUE);
+ md_reads_file_flag = H5P_USER_TRUE;
+ md_reads_context_flag = TRUE;
+ H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = 0; i < virt_num_data_entries / 2; i++) {
struct datum *entry_ptr;
entry_ptr = &(data[i]);
@@ -6704,9 +6708,13 @@ smoke_check_6(int metadata_write_strategy)
H5_CHECK_OVERFLOW(cache_ptr->max_cache_size, size_t, double);
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
}
+ /* Restore collective metadata reads state */
+ H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
/* insert the other half independently */
- H5CX_set_coll_metadata_read(FALSE);
+ md_reads_file_flag = H5P_USER_FALSE;
+ md_reads_context_flag = FALSE;
+ H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) {
struct datum *entry_ptr;
entry_ptr = &(data[i]);
@@ -6724,6 +6732,8 @@ smoke_check_6(int metadata_write_strategy)
/* Make sure coll entries do not cross the 80% threshold */
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
}
+ /* Restore collective metadata reads state */
+ H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
/* flush the file */
if (H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0) {
@@ -6734,7 +6744,9 @@ smoke_check_6(int metadata_write_strategy)
}
/* Protect the first half of the entries collectively */
- H5CX_set_coll_metadata_read(TRUE);
+ md_reads_file_flag = H5P_USER_TRUE;
+ md_reads_context_flag = TRUE;
+ H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = 0; i < (virt_num_data_entries / 2); i++) {
struct datum *entry_ptr;
entry_ptr = &(data[i]);
@@ -6752,9 +6764,13 @@ smoke_check_6(int metadata_write_strategy)
/* Make sure coll entries do not cross the 80% threshold */
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
}
+ /* Restore collective metadata reads state */
+ H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
/* protect the other half independently */
- H5CX_set_coll_metadata_read(FALSE);
+ md_reads_file_flag = H5P_USER_FALSE;
+ md_reads_context_flag = FALSE;
+ H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) {
struct datum *entry_ptr;
entry_ptr = &(data[i]);
@@ -6772,6 +6788,8 @@ smoke_check_6(int metadata_write_strategy)
/* Make sure coll entries do not cross the 80% threshold */
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
}
+ /* Restore collective metadata reads state */
+ H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = 0; i < (virt_num_data_entries); i++) {
unlock_entry(file_ptr, i, H5AC__NO_FLAGS_SET);
diff --git a/testpar/t_coll_md_read.c b/testpar/t_coll_md_read.c
index 66f3151..cabdea0 100644
--- a/testpar/t_coll_md_read.c
+++ b/testpar/t_coll_md_read.c
@@ -34,10 +34,9 @@
#define MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS 2
-#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
-#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE 20000
-#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE 1
-#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM 10000
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
+#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
/*
* A test for issue HDFFV-10501. A parallel hang was reported which occurred
@@ -339,21 +338,34 @@ test_multi_chunk_io_addrmap_issue(void)
* collective metadata reads being made only by process 0 in H5D__sort_chunk().
*
* NOTE: Due to the way that the threshold value which pertains to this test
- * is currently calculated within HDF5, there are several conditions that this
- * test must maintain. Refer to the function H5D__sort_chunk in H5Dmpio.c for
- * a better idea of why.
+ * is currently calculated within HDF5, the following two conditions must be
+ * true to trigger the issue:
*
- * Condition 1: We need to make sure that the test always selects every single
- * chunk in the dataset. It is fine if the selection is split up among multiple
- * ranks, but their combined selection must cover the whole dataset.
+ * Condition 1: A certain threshold ratio must be met in order to have HDF5
+ * obtain all chunk addresses collectively inside H5D__sort_chunk(). This is
+ * given by the following:
*
- * Condition 2: The number of chunks in the dataset divided by the number of MPI
- * ranks must exceed or equal 10000. In other words, each MPI rank must be
- * responsible for 10000 or more unique chunks.
+ * (sum_chunk * 100) / (dataset_nchunks * mpi_size) >= 30%
*
- * Condition 3: This test will currently only be reliably reproducible for 2 or 3
- * MPI ranks. The threshold value calculated reduces to a constant 100 / mpi_size,
- * and is compared against a default value of 30%.
+ * where:
+ * * `sum_chunk` is the combined sum of the number of chunks selected in
+ * the dataset by all ranks (chunks selected by more than one rank count
+ * individually toward the sum for each rank selecting that chunk)
+ * * `dataset_nchunks` is the number of chunks in the dataset (selected
+ * or not)
+ * * `mpi_size` is the size of the MPI Communicator
+ *
+ * Condition 2: `sum_chunk` divided by `mpi_size` must exceed or equal a certain
+ * threshold (as of this writing, 10000).
+ *
+ * To satisfy both these conditions, we #define a macro,
+ * LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM, which corresponds to the
+ * value of the H5D_ALL_CHUNK_ADDR_THRES_COL_NUM macro in H5Dmpio.c (the
+ * 10000 threshold from condition 2). We then create a dataset of that many
+ * chunks and have each MPI rank write to and read from a piece of every single
+ * chunk in the dataset. This ensures chunk utilization is the max possible
+ * and exceeds our 30% target ratio, while always exactly matching the numeric
+ * chunk threshold value of condition 2.
*
* Failure in this test may either cause a hang, or, due to how the MPI calls
* pertaining to this issue might mistakenly match up, may cause an MPI error
@@ -375,10 +387,9 @@ void
test_link_chunk_io_sort_chunk_issue(void)
{
const char *filename;
- hsize_t * dataset_dims = NULL;
- hsize_t max_dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
- hsize_t sel_dims[1];
- hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS] = {LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS};
+ hsize_t dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+ hsize_t sel_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
+ hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t start[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t stride[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t count[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
@@ -412,14 +423,13 @@ test_link_chunk_io_sort_chunk_issue(void)
file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
VRFY((file_id >= 0), "H5Fcreate succeeded");
- dataset_dims = HDmalloc(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS * sizeof(*dataset_dims));
- VRFY((dataset_dims != NULL), "malloc succeeded");
-
- dataset_dims[0] = (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE * (hsize_t)mpi_size *
- (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE;
- max_dataset_dims[0] = H5S_UNLIMITED;
+ /*
+ * Create a one-dimensional dataset of exactly LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM
+ * chunks, where every rank writes to a piece of every single chunk to keep utilization high.
+ */
+ dataset_dims[0] = (hsize_t)mpi_size * (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM;
- fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, max_dataset_dims);
+ fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, NULL);
VRFY((fspace_id >= 0), "H5Screate_simple succeeded");
/*
@@ -428,6 +438,9 @@ test_link_chunk_io_sort_chunk_issue(void)
dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
+ /* Chunk size is equal to MPI size since each rank writes to a piece of every chunk */
+ chunk_dims[0] = (hsize_t)mpi_size;
+
VRFY((H5Pset_chunk(dcpl_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, chunk_dims) >= 0),
"H5Pset_chunk succeeded");
@@ -437,23 +450,21 @@ test_link_chunk_io_sort_chunk_issue(void)
/*
* Setup hyperslab selection to split the dataset among the ranks.
- *
- * The ranks will write rows across the dataset.
*/
- stride[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
- count[0] = (dataset_dims[0] / LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) / (hsize_t)mpi_size;
- start[0] = count[0] * (hsize_t)mpi_rank;
- block[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
+ start[0] = (hsize_t)mpi_rank;
+ stride[0] = (hsize_t)mpi_size;
+ count[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM;
+ block[0] = 1;
VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0),
"H5Sselect_hyperslab succeeded");
- sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
+ sel_dims[0] = count[0];
mspace_id = H5Screate_simple(1, sel_dims, NULL);
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
- data = HDcalloc(1, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
+ data = HDcalloc(1, count[0] * sizeof(int));
VRFY((data != NULL), "calloc succeeded");
dxpl_id = H5Pcreate(H5P_DATASET_XFER);
@@ -476,33 +487,25 @@ test_link_chunk_io_sort_chunk_issue(void)
VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0),
"H5Pset_dxpl_mpio_chunk_opt succeeded");
- read_buf = HDmalloc(count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
+ read_buf = HDmalloc(count[0] * sizeof(int));
VRFY((read_buf != NULL), "malloc succeeded");
VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0),
"H5Sselect_hyperslab succeeded");
- sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
+ sel_dims[0] = count[0];
VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");
mspace_id = H5Screate_simple(1, sel_dims, NULL);
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
- read_buf = HDrealloc(read_buf, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
- VRFY((read_buf != NULL), "realloc succeeded");
-
/*
* Finally have each rank read their section of data back from the dataset.
*/
VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0),
"H5Dread succeeded");
- if (dataset_dims) {
- HDfree(dataset_dims);
- dataset_dims = NULL;
- }
-
if (data) {
HDfree(data);
data = NULL;