From 1c9f219463e236487a27cee6cb839379d670cda4 Mon Sep 17 00:00:00 2001
From: jhendersonHDF
Date: Thu, 3 Feb 2022 16:19:38 -0600
Subject: Unify handling of collective metadata reads status (#1206) (#1417)

---
 release_docs/RELEASE.txt |  21 ++++
 src/H5C.c                |  39 ++++++++++----
 src/H5CX.c               |   2 +-
 src/H5Dchunk.c           |  17 +++---
 src/H5Dmpio.c            | 119 +++++++++++++++++++++++++++++++-----------
 src/H5Fmpi.c             | 132 +++++++++++++++++++++++++++++++++++++++++++++--
 src/H5Fprivate.h         |   2 +
 src/H5Pfapl.c            |  24 ++++-----
 src/H5Z.c                |  14 +----
 testpar/t_cache.c        |  40 ++++++++++----
 testpar/t_coll_md_read.c |  93 +++++++++++++++++----------------
 11 files changed, 366 insertions(+), 137 deletions(-)

diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index 5d586fb..55b6d11 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -217,6 +217,27 @@ Bug Fixes since HDF5-1.12.1 release
 ===================================

 Library
 -------
+    - Unified handling of collective metadata reads to correctly fix old bugs
+
+      Due to MPI-related issues in HDF5 caused by mismanagement of the
+      status of collective metadata reads, collective metadata reads were
+      forced to be disabled during chunked dataset raw data I/O in the
+      HDF5 1.10.5 release. This wouldn't generally have affected application
+      performance because HDF5 already disables collective metadata reads
+      during chunk lookup, since it is unlikely that the same chunks will be
+      read by all MPI ranks in the I/O operation. However, this was only a
+      partial solution that wasn't granular enough.
+
+      This change unifies the handling of the file-global flag and the
+      API context-level flag for collective metadata reads to simplify
+      querying the true status of collective metadata reads. Thus,
+      collective metadata reads are once again enabled for chunked dataset
+      raw data I/O, but are manually disabled at places where processing
+      occurs on MPI rank 0 only, which would otherwise cause issues when
+      collective metadata reads are enabled.
+
+      (JTH - 2021/11/16, HDFFV-10501/HDFFV-10562)
+
    - Fixed an issue with collective metadata reads being
      permanently disabled after a dataset chunk lookup operation.
This would usually cause a mismatched MPI_Bcast and MPI_ERR_TRUNCATE issue in the library for diff --git a/src/H5C.c b/src/H5C.c index 185b421..ba09f10 100644 --- a/src/H5C.c +++ b/src/H5C.c @@ -1521,17 +1521,26 @@ H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *thing, u #ifdef H5_HAVE_PARALLEL if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) - coll_access = H5CX_get_coll_metadata_read(); + coll_access = H5F_get_coll_metadata_reads(f); entry_ptr->coll_access = coll_access; if (coll_access) { H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL) /* Make sure the size of the collective entries in the cache remain in check */ - if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) - if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries") - } /* end if */ + if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) { + if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) { + if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries") + } /* end if */ + } /* end if */ + else { + if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) { + if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries") + } /* end if */ + } /* end else */ + } /* end if */ #endif done: @@ -2251,7 +2260,7 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign #ifdef H5_HAVE_PARALLEL if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) - coll_access = H5CX_get_coll_metadata_read(); + coll_access = H5F_get_coll_metadata_reads(f); #endif /* H5_HAVE_PARALLEL */ /* first check to see if the target is in cache */ @@ -2598,11 +2607,19 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign #ifdef H5_HAVE_PARALLEL /* Make sure the size of the collective entries in the cache remain in check */ - if (coll_access) - if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) - if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries") -#endif /* H5_HAVE_PARALLEL */ + if (coll_access) { + if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) { + if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) + if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries") + } /* end if */ + else { + if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) + if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries") + } /* end else */ + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ done: #if H5C_DO_EXTREME_SANITY_CHECKS diff --git a/src/H5CX.c b/src/H5CX.c index ff3620e..88b476a 100644 --- a/src/H5CX.c +++ b/src/H5CX.c @@ -1420,7 +1420,7 @@ done: * Purpose: Sanity checks and sets up collective operations. * * Note: Should be called for all API routines that modify file - * file metadata but don't pass in an access property list. + * metadata but don't pass in an access property list. 
* * Return: Non-negative on success / Negative on failure * diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c index d2fd8e5..a5296d3 100644 --- a/src/H5Dchunk.c +++ b/src/H5Dchunk.c @@ -3153,7 +3153,9 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat unsigned idx = 0; /* Index of chunk in cache, if present */ hbool_t found = FALSE; /* In cache? */ #ifdef H5_HAVE_PARALLEL - hbool_t reenable_coll_md_reads = FALSE; + H5P_coll_md_read_flag_t md_reads_file_flag; + hbool_t md_reads_context_flag; + hbool_t restore_md_reads_state = FALSE; #endif herr_t ret_value = SUCCEED; /* Return value */ @@ -3227,11 +3229,10 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat * processes. */ if (H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) { - hbool_t do_coll_md_reads = H5CX_get_coll_metadata_read(); - if (do_coll_md_reads) { - H5CX_set_coll_metadata_read(FALSE); - reenable_coll_md_reads = TRUE; - } + md_reads_file_flag = H5P_FORCE_FALSE; + md_reads_context_flag = FALSE; + H5F_set_coll_metadata_reads(idx_info.f, &md_reads_file_flag, &md_reads_context_flag); + restore_md_reads_state = TRUE; } #endif /* H5_HAVE_PARALLEL */ @@ -3277,8 +3278,8 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat done: #ifdef H5_HAVE_PARALLEL /* Re-enable collective metadata reads if we disabled them */ - if (reenable_coll_md_reads) - H5CX_set_coll_metadata_read(TRUE); + if (restore_md_reads_state) + H5F_set_coll_metadata_reads(dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag); #endif /* H5_HAVE_PARALLEL */ FUNC_LEAVE_NOAPI(ret_value) diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 752c144..1c00bec 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -805,10 +805,6 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf HDassert(type_info); HDassert(fm); - /* Disable collective metadata reads for chunked dataset I/O operations - * in order to prevent potential hangs */ - H5CX_set_coll_metadata_read(FALSE); - /* Check the optional property list for the collective chunk IO optimization option */ if (H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get chunk optimization option") @@ -2303,17 +2299,20 @@ static herr_t H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk) { - H5SL_node_t * chunk_node; /* Current node in chunk skip list */ - H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */ - haddr_t chunk_addr; /* Current chunking address of this node */ - haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */ - hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */ - int bsearch_coll_chunk_threshold; - int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND; - int mpi_size; /* Number of MPI processes */ - int mpi_code; /* MPI return code */ - int i; /* Local index variable */ - herr_t ret_value = SUCCEED; /* Return value */ + H5SL_node_t * chunk_node; /* Current node in chunk skip list */ + H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. 
*/ + haddr_t chunk_addr; /* Current chunking address of this node */ + haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */ + H5P_coll_md_read_flag_t md_reads_file_flag; + hbool_t md_reads_context_flag; + hbool_t restore_md_reads_state = FALSE; + hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */ + int bsearch_coll_chunk_threshold; + int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND; + int mpi_size; /* Number of MPI processes */ + int mpi_code; /* MPI return code */ + int i; /* Local index variable */ + herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC @@ -2357,8 +2356,41 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank") if (mpi_rank == 0) { - if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) - HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address") + herr_t result; + + /* + * If enabled, disable collective metadata reads here. + * Since the chunk address mapping is done on rank 0 + * only here, it will cause problems if collective + * metadata reads are enabled. + */ + if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) { + md_reads_file_flag = H5P_FORCE_FALSE; + md_reads_context_flag = FALSE; + H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, + &md_reads_context_flag); + restore_md_reads_state = TRUE; + } + + result = H5D__chunk_addrmap(io_info, total_chunk_addr_array); + + /* Ensure that we restore the old collective metadata reads state */ + if (restore_md_reads_state) { + H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, + &md_reads_context_flag); + restore_md_reads_state = FALSE; + } + + if (result < 0) { + size_t u; + + /* Clear total chunk address array */ + for (u = 0; u < (size_t)fm->layout->u.chunk.nchunks; u++) + total_chunk_addr_array[u] = HADDR_UNDEF; + + /* Push error, but still participate in following MPI_Bcast */ + HDONE_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address") + } } /* end if */ /* Broadcasting the MPI_IO option info. and chunk address info. 
*/ @@ -2413,6 +2445,10 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm, } /* end if */ done: + /* Re-enable collective metadata reads if we disabled them */ + if (restore_md_reads_state) + H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag); + if (total_chunk_addr_array) H5MM_xfree(total_chunk_addr_array); @@ -2460,20 +2496,23 @@ static herr_t H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assign_io_mode[], haddr_t chunk_addr[]) { - size_t total_chunks; - unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk; - uint8_t * io_mode_info = NULL; - uint8_t * recv_io_mode_info = NULL; - uint8_t * mergebuf = NULL; - uint8_t * tempbuf; - H5SL_node_t * chunk_node; - H5D_chunk_info_t *chunk_info; - int mpi_size, mpi_rank; - MPI_Comm comm; - int root; - size_t ic; - int mpi_code; - herr_t ret_value = SUCCEED; + size_t total_chunks; + unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk; + uint8_t * io_mode_info = NULL; + uint8_t * recv_io_mode_info = NULL; + uint8_t * mergebuf = NULL; + uint8_t * tempbuf; + H5SL_node_t * chunk_node; + H5D_chunk_info_t * chunk_info; + H5P_coll_md_read_flag_t md_reads_file_flag; + hbool_t md_reads_context_flag; + hbool_t restore_md_reads_state = FALSE; + int mpi_size, mpi_rank; + MPI_Comm comm; + int root; + size_t ic; + int mpi_code; + herr_t ret_value = SUCCEED; FUNC_ENTER_STATIC @@ -2533,6 +2572,20 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig size_t nproc; unsigned *nproc_per_chunk; + /* + * If enabled, disable collective metadata reads here. + * Since the chunk address mapping is done on rank 0 + * only here, it will cause problems if collective + * metadata reads are enabled. + */ + if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) { + md_reads_file_flag = H5P_FORCE_FALSE; + md_reads_context_flag = FALSE; + H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, + &md_reads_context_flag); + restore_md_reads_state = TRUE; + } + /* pre-computing: calculate number of processes and regularity of the selection occupied in each chunk */ if (NULL == (nproc_per_chunk = (unsigned *)H5MM_calloc(total_chunks * sizeof(unsigned)))) @@ -2599,6 +2652,10 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig #endif done: + /* Re-enable collective metadata reads if we disabled them */ + if (restore_md_reads_state) + H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag); + if (io_mode_info) H5MM_free(io_mode_info); if (mergebuf) diff --git a/src/H5Fmpi.c b/src/H5Fmpi.c index 97bfefd..01c3811 100644 --- a/src/H5Fmpi.c +++ b/src/H5Fmpi.c @@ -31,11 +31,12 @@ /***********/ /* Headers */ /***********/ -#include "H5private.h" /* Generic Functions */ -#include "H5Eprivate.h" /* Error handling */ -#include "H5Fpkg.h" /* File access */ -#include "H5FDprivate.h" /* File drivers */ -#include "H5Iprivate.h" /* IDs */ +#include "H5private.h" /* Generic Functions */ +#include "H5CXprivate.h" /* API Contexts */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5Fpkg.h" /* File access */ +#include "H5FDprivate.h" /* File drivers */ +#include "H5Iprivate.h" /* IDs */ #include "H5VLnative_private.h" /* Native VOL connector */ @@ -391,4 +392,125 @@ H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm) done: FUNC_LEAVE_NOAPI(ret_value) } /* end H5F_mpi_retrieve_comm */ + 
+/*------------------------------------------------------------------------- + * Function: H5F_get_coll_metadata_reads + * + * Purpose: Determines whether collective metadata reads should be + * performed. This routine is meant to be the single source of + * truth for the collective metadata reads status, as it + * coordinates between the file-global flag and the flag set + * for the current operation in the current API context. + * + * Return: TRUE/FALSE (can't fail) + * + *------------------------------------------------------------------------- + */ +hbool_t +H5F_get_coll_metadata_reads(const H5F_t *file) +{ + H5P_coll_md_read_flag_t file_flag = H5P_USER_FALSE; + hbool_t ret_value = FALSE; + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(file && file->shared); + + /* Retrieve the file-global flag */ + file_flag = H5F_COLL_MD_READ(file); + + /* If file flag is set to H5P_FORCE_FALSE, exit early + * with FALSE, since collective metadata reads have + * been explicitly disabled somewhere in the library. + */ + if (H5P_FORCE_FALSE == file_flag) + ret_value = FALSE; + else { + /* If file flag is set to H5P_USER_TRUE, ignore + * any settings in the API context. A file-global + * setting of H5P_USER_TRUE for collective metadata + * reads should ignore any settings on an Access + * Property List for an individual operation. + */ + if (H5P_USER_TRUE == file_flag) + ret_value = TRUE; + else { + /* Get the collective metadata reads flag from + * the current API context. + */ + ret_value = H5CX_get_coll_metadata_read(); + } + } + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5F_get_coll_metadata_reads() */ + +/*------------------------------------------------------------------------- + * Function: H5F_set_coll_metadata_reads + * + * Purpose: Used to temporarily modify the collective metadata reads + * status. This is useful for cases where either: + * + * * Collective metadata reads are enabled, but need to be + * disabled for an operation about to occur that may trigger + * an independent metadata read (such as only rank 0 doing + * something) + * + * * Metadata reads are currently independent, but it is + * guaranteed that the application has maintained + * collectivity at the interface level (e.g., an operation + * that modifies metadata is being performed). In this case, + * it should be safe to enable collective metadata reads, + * barring any internal library issues that may occur + * + * After completion, the `file_flag` parameter will be set to + * the previous value of the file-global collective metadata + * reads flag. The `context_flag` parameter will be set to the + * previous value of the API context's collective metadata + * reads flag. Another call to this routine should be made to + * restore these values (see below warning). + * + * !! WARNING !! + * It is dangerous to modify the collective metadata reads + * status, as this can cause crashes, hangs and corruption in + * the HDF5 file when improperly done. Therefore, the + * `file_flag` and `context_flag` parameters are both + * mandatory, and it is assumed that the caller will guarantee + * these settings are restored with another call to this + * routine once the bracketed operation is complete. + * !! WARNING !! 
+ * + * Return: Nothing + * + *------------------------------------------------------------------------- + */ +void +H5F_set_coll_metadata_reads(H5F_t *file, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag) +{ + H5P_coll_md_read_flag_t prev_file_flag = H5P_USER_FALSE; + hbool_t prev_context_flag = FALSE; + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(file && file->shared); + HDassert(file_flag); + HDassert(context_flag); + + /* Save old state */ + prev_file_flag = H5F_COLL_MD_READ(file); + prev_context_flag = H5CX_get_coll_metadata_read(); + + /* Set new desired state */ + if (prev_file_flag != *file_flag) { + file->shared->coll_md_read = *file_flag; + *file_flag = prev_file_flag; + } + if (prev_context_flag != *context_flag) { + H5CX_set_coll_metadata_read(*context_flag); + *context_flag = prev_context_flag; + } + + FUNC_LEAVE_NOAPI_VOID +} /* end H5F_set_coll_metadata_reads() */ + #endif /* H5_HAVE_PARALLEL */ diff --git a/src/H5Fprivate.h b/src/H5Fprivate.h index 5c5937b..f7019e2 100644 --- a/src/H5Fprivate.h +++ b/src/H5Fprivate.h @@ -962,6 +962,8 @@ H5_DLL int H5F_mpi_get_size(const H5F_t *f); H5_DLL herr_t H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm); H5_DLL herr_t H5F_get_mpi_atomicity(H5F_t *file, hbool_t *flag); H5_DLL herr_t H5F_set_mpi_atomicity(H5F_t *file, hbool_t flag); +H5_DLL hbool_t H5F_get_coll_metadata_reads(const H5F_t *f); +H5_DLL void H5F_set_coll_metadata_reads(H5F_t *f, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag); #endif /* H5_HAVE_PARALLEL */ /* External file cache routines */ diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c index a7f84fb..cdf5dc1 100644 --- a/src/H5Pfapl.c +++ b/src/H5Pfapl.c @@ -4762,15 +4762,14 @@ H5P__decode_coll_md_read_flag_t(const void **_pp, void *_value) * Function: H5Pset_all_coll_metadata_ops * * Purpose: Tell the library whether the metadata read operations will - * be done collectively (1) or not (0). Default is independent. - * With collective mode, the library will optimize access to - * metadata operations on the file. + * be done collectively (1) or not (0). Default is independent. + * With collective mode, the library will optimize access to + * metadata operations on the file. * * Note: This routine accepts file access property lists, link - * access property lists, attribute access property lists, - * dataset access property lists, group access property lists, - * named datatype access property lists, - * and dataset transfer property lists. + * access property lists, attribute access property lists, + * dataset access property lists, group access property lists + * and named datatype access property lists. * * Return: Non-negative on success/Negative on failure * @@ -4790,7 +4789,7 @@ H5Pset_all_coll_metadata_ops(hid_t plist_id, hbool_t is_collective) H5TRACE2("e", "ib", plist_id, is_collective); /* Compare the property list's class against the other class */ - /* (Dataset, group, attribute, and named datype access property lists + /* (Dataset, group, attribute, and named datatype access property lists * are sub-classes of link access property lists -QAK) */ if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && @@ -4821,10 +4820,9 @@ done: * Purpose: Gets information about collective metadata read mode. * * Note: This routine accepts file access property lists, link - * access property lists, attribute access property lists, - * dataset access property lists, group access property lists, - * named datatype access property lists, - * and dataset transfer property lists. 
+ * access property lists, attribute access property lists, + * dataset access property lists, group access property lists, + * and named datatype access property lists. * * Return: Non-negative on success/Negative on failure * @@ -4842,7 +4840,7 @@ H5Pget_all_coll_metadata_ops(hid_t plist_id, hbool_t *is_collective) H5TRACE2("e", "i*b", plist_id, is_collective); /* Compare the property list's class against the other class */ - /* (Dataset, group, attribute, and named datype access property lists + /* (Dataset, group, attribute, and named datatype access property lists * are sub-classes of link access property lists -QAK) */ if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && diff --git a/src/H5Z.c b/src/H5Z.c index 6b44024..05c4da7 100644 --- a/src/H5Z.c +++ b/src/H5Z.c @@ -599,14 +599,9 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA /* Do a global flush if the file is opened for write */ if (H5F_ACC_RDWR & H5F_INTENT(f)) { -/* When parallel HDF5 is defined, check for collective metadata reads on this - * file and set the flag for metadata I/O in the API context. -QAK, 2018/02/14 - */ #ifdef H5_HAVE_PARALLEL /* Check if MPIO driver is used */ if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { - H5P_coll_md_read_flag_t coll_md_read; /* Do all metadata reads collectively */ - /* Sanity check for collectively calling H5Zunregister, if requested */ /* (Sanity check assumes that a barrier on one file's comm * is sufficient (i.e. that there aren't different comms for @@ -626,13 +621,8 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA /* Set the "sanity checked" flag */ object->sanity_checked = TRUE; } /* end if */ - - /* Check whether to use the collective metadata read DXPL */ - coll_md_read = H5F_COLL_MD_READ(f); - if (H5P_USER_TRUE == coll_md_read) - H5CX_set_coll_metadata_read(TRUE); - } /* end if */ -#endif /* H5_HAVE_PARALLEL */ + } /* end if */ +#endif /* H5_HAVE_PARALLEL */ /* Call the flush routine for mounted file hierarchies */ if (H5F_flush_mounts((H5F_t *)obj_ptr) < 0) diff --git a/testpar/t_cache.c b/testpar/t_cache.c index 263b467..39c5721 100644 --- a/testpar/t_cache.c +++ b/testpar/t_cache.c @@ -6617,13 +6617,15 @@ trace_file_check(int metadata_write_strategy) static hbool_t smoke_check_6(int metadata_write_strategy) { - hbool_t success = TRUE; - int i; - int max_nerrors; - hid_t fid = -1; - H5F_t * file_ptr = NULL; - H5C_t * cache_ptr = NULL; - struct mssg_t mssg; + H5P_coll_md_read_flag_t md_reads_file_flag; + hbool_t md_reads_context_flag; + hbool_t success = TRUE; + int i; + int max_nerrors; + hid_t fid = -1; + H5F_t * file_ptr = NULL; + H5C_t * cache_ptr = NULL; + struct mssg_t mssg; switch (metadata_write_strategy) { @@ -6679,7 +6681,9 @@ smoke_check_6(int metadata_write_strategy) virt_num_data_entries = NUM_DATA_ENTRIES; /* insert the first half collectively */ - H5CX_set_coll_metadata_read(TRUE); + md_reads_file_flag = H5P_USER_TRUE; + md_reads_context_flag = TRUE; + H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag); for (i = 0; i < virt_num_data_entries / 2; i++) { struct datum *entry_ptr; entry_ptr = &(data[i]); @@ -6698,9 +6702,13 @@ smoke_check_6(int metadata_write_strategy) H5_CHECK_OVERFLOW(cache_ptr->max_cache_size, size_t, double); HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size); } + /* Restore collective metadata reads state */ + H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag); /* insert the other 
half independently */ - H5CX_set_coll_metadata_read(FALSE); + md_reads_file_flag = H5P_USER_FALSE; + md_reads_context_flag = FALSE; + H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag); for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) { struct datum *entry_ptr; entry_ptr = &(data[i]); @@ -6718,6 +6726,8 @@ smoke_check_6(int metadata_write_strategy) /* Make sure coll entries do not cross the 80% threshold */ HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size); } + /* Restore collective metadata reads state */ + H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag); /* flush the file */ if (H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0) { @@ -6728,7 +6738,9 @@ smoke_check_6(int metadata_write_strategy) } /* Protect the first half of the entries collectively */ - H5CX_set_coll_metadata_read(TRUE); + md_reads_file_flag = H5P_USER_TRUE; + md_reads_context_flag = TRUE; + H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag); for (i = 0; i < (virt_num_data_entries / 2); i++) { struct datum *entry_ptr; entry_ptr = &(data[i]); @@ -6746,9 +6758,13 @@ smoke_check_6(int metadata_write_strategy) /* Make sure coll entries do not cross the 80% threshold */ HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size); } + /* Restore collective metadata reads state */ + H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag); /* protect the other half independently */ - H5CX_set_coll_metadata_read(FALSE); + md_reads_file_flag = H5P_USER_FALSE; + md_reads_context_flag = FALSE; + H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag); for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) { struct datum *entry_ptr; entry_ptr = &(data[i]); @@ -6766,6 +6782,8 @@ smoke_check_6(int metadata_write_strategy) /* Make sure coll entries do not cross the 80% threshold */ HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size); } + /* Restore collective metadata reads state */ + H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag); for (i = 0; i < (virt_num_data_entries); i++) { unlock_entry(file_ptr, i, H5AC__NO_FLAGS_SET); diff --git a/testpar/t_coll_md_read.c b/testpar/t_coll_md_read.c index fd62eb6..cabdea0 100644 --- a/testpar/t_coll_md_read.c +++ b/testpar/t_coll_md_read.c @@ -34,10 +34,9 @@ #define MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS 2 -#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue" -#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE 20000 -#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE 1 -#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1 +#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM 10000 +#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue" +#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1 /* * A test for issue HDFFV-10501. A parallel hang was reported which occurred @@ -339,21 +338,34 @@ test_multi_chunk_io_addrmap_issue(void) * collective metadata reads being made only by process 0 in H5D__sort_chunk(). * * NOTE: Due to the way that the threshold value which pertains to this test - * is currently calculated within HDF5, there are several conditions that this - * test must maintain. Refer to the function H5D__sort_chunk in H5Dmpio.c for - * a better idea of why. 
+ * is currently calculated within HDF5, the following two conditions must be + * true to trigger the issue: * - * Condition 1: We need to make sure that the test always selects every single - * chunk in the dataset. It is fine if the selection is split up among multiple - * ranks, but their combined selection must cover the whole dataset. + * Condition 1: A certain threshold ratio must be met in order to have HDF5 + * obtain all chunk addresses collectively inside H5D__sort_chunk(). This is + * given by the following: * - * Condition 2: The number of chunks in the dataset divided by the number of MPI - * ranks must exceed or equal 10000. In other words, each MPI rank must be - * responsible for 10000 or more unique chunks. + * (sum_chunk * 100) / (dataset_nchunks * mpi_size) >= 30% * - * Condition 3: This test will currently only be reliably reproducable for 2 or 3 - * MPI ranks. The threshold value calculated reduces to a constant 100 / mpi_size, - * and is compared against a default value of 30%. + * where: + * * `sum_chunk` is the combined sum of the number of chunks selected in + * the dataset by all ranks (chunks selected by more than one rank count + * individually toward the sum for each rank selecting that chunk) + * * `dataset_nchunks` is the number of chunks in the dataset (selected + * or not) + * * `mpi_size` is the size of the MPI Communicator + * + * Condition 2: `sum_chunk` divided by `mpi_size` must exceed or equal a certain + * threshold (as of this writing, 10000). + * + * To satisfy both these conditions, we #define a macro, + * LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM, which corresponds to the + * value of the H5D_ALL_CHUNK_ADDR_THRES_COL_NUM macro in H5Dmpio.c (the + * 10000 threshold from condition 2). We then create a dataset of that many + * chunks and have each MPI rank write to and read from a piece of every single + * chunk in the dataset. This ensures chunk utilization is the max possible + * and exceeds our 30% target ratio, while always exactly matching the numeric + * chunk threshold value of condition 2. 
* * Failure in this test may either cause a hang, or, due to how the MPI calls * pertaining to this issue might mistakenly match up, may cause an MPI error @@ -375,10 +387,9 @@ void test_link_chunk_io_sort_chunk_issue(void) { const char *filename; - hsize_t * dataset_dims = NULL; - hsize_t max_dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; - hsize_t sel_dims[1]; - hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS] = {LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS}; + hsize_t dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; + hsize_t sel_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; + hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; hsize_t start[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; hsize_t stride[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; hsize_t count[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS]; @@ -412,14 +423,13 @@ test_link_chunk_io_sort_chunk_issue(void) file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); VRFY((file_id >= 0), "H5Fcreate succeeded"); - dataset_dims = HDmalloc(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS * sizeof(*dataset_dims)); - VRFY((dataset_dims != NULL), "malloc succeeded"); - - dataset_dims[0] = (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE * (hsize_t)mpi_size * - (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE; - max_dataset_dims[0] = H5S_UNLIMITED; + /* + * Create a one-dimensional dataset of exactly LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM + * chunks, where every rank writes to a piece of every single chunk to keep utilization high. + */ + dataset_dims[0] = (hsize_t)mpi_size * (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM; - fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, max_dataset_dims); + fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, NULL); VRFY((fspace_id >= 0), "H5Screate_simple succeeded"); /* @@ -428,6 +438,9 @@ test_link_chunk_io_sort_chunk_issue(void) dcpl_id = H5Pcreate(H5P_DATASET_CREATE); VRFY((dcpl_id >= 0), "H5Pcreate succeeded"); + /* Chunk size is equal to MPI size since each rank writes to a piece of every chunk */ + chunk_dims[0] = (hsize_t)mpi_size; + VRFY((H5Pset_chunk(dcpl_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded"); @@ -437,23 +450,21 @@ test_link_chunk_io_sort_chunk_issue(void) /* * Setup hyperslab selection to split the dataset among the ranks. - * - * The ranks will write rows across the dataset. 
*/ - stride[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE; - count[0] = (dataset_dims[0] / LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) / (hsize_t)mpi_size; - start[0] = count[0] * (hsize_t)mpi_rank; - block[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE; + start[0] = (hsize_t)mpi_rank; + stride[0] = (hsize_t)mpi_size; + count[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM; + block[0] = 1; VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded"); - sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE); + sel_dims[0] = count[0]; mspace_id = H5Screate_simple(1, sel_dims, NULL); VRFY((mspace_id >= 0), "H5Screate_simple succeeded"); - data = HDcalloc(1, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int)); + data = HDcalloc(1, count[0] * sizeof(int)); VRFY((data != NULL), "calloc succeeded"); dxpl_id = H5Pcreate(H5P_DATASET_XFER); @@ -476,33 +487,25 @@ test_link_chunk_io_sort_chunk_issue(void) VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded"); - read_buf = HDmalloc(count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int)); + read_buf = HDmalloc(count[0] * sizeof(int)); VRFY((read_buf != NULL), "malloc succeeded"); VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded"); - sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE); + sel_dims[0] = count[0]; VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded"); mspace_id = H5Screate_simple(1, sel_dims, NULL); VRFY((mspace_id >= 0), "H5Screate_simple succeeded"); - read_buf = HDrealloc(read_buf, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int)); - VRFY((read_buf != NULL), "realloc succeeded"); - /* * Finally have each rank read their section of data back from the dataset. */ VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0), "H5Dread succeeded"); - if (dataset_dims) { - HDfree(dataset_dims); - dataset_dims = NULL; - } - if (data) { HDfree(data); data = NULL; -- cgit v0.12
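The internal H5F_get_coll_metadata_reads()/H5F_set_coll_metadata_reads() pair added
above coordinates state that applications control through the public FAPL API. The
following minimal sketch (not part of this patch; the file name is a placeholder and
error checking is omitted) shows how an application opts in to the collective metadata
reads that this change re-enables for chunked dataset raw data I/O:

    #include <mpi.h>
    #include "hdf5.h"

    int
    main(int argc, char **argv)
    {
        hid_t fapl_id, file_id;

        MPI_Init(&argc, &argv);

        /* Set up a parallel file access property list */
        fapl_id = H5Pcreate(H5P_FILE_ACCESS);
        H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);

        /* Request collective metadata reads; the opened file records this
         * as H5P_USER_TRUE, which H5F_get_coll_metadata_reads() (above)
         * gives precedence over the per-operation API context flag.
         */
        H5Pset_all_coll_metadata_ops(fapl_id, 1);

        /* Collective metadata writes are a separate, independent setting */
        H5Pset_coll_metadata_write(fapl_id, 1);

        /* "example.h5" is a placeholder; with the setting above, all ranks
         * must make metadata-reading calls (H5Dopen2, H5Gopen2, etc.)
         * collectively on this file.
         */
        file_id = H5Fopen("example.h5", H5F_ACC_RDWR, fapl_id);
        if (file_id >= 0)
            H5Fclose(file_id);

        H5Pclose(fapl_id);
        MPI_Finalize();

        return 0;
    }

With collective metadata reads requested this way, the save/restore bracketing added in
H5D__chunk_lookup(), H5D__sort_chunk() and H5D__obtain_mpio_mode() is what keeps the
rank-0-only chunk address lookups from desynchronizing the MPI ranks.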