diff options
author | Jordan Henderson <jhenderson@hdfgroup.org> | 2019-01-25 21:32:51 (GMT) |
---|---|---|
committer | Jordan Henderson <jhenderson@hdfgroup.org> | 2019-02-12 20:38:02 (GMT) |
commit | a4cfd0bfd7be9cba5d9af02825358f898885c5c4 (patch) | |
tree | 7c9933648971055ebc24f7fa1e04285a5a2e13e8 /src | |
parent | 9f9485a17a87b075f122b654c1f8426d9114a6df (diff) | |
download | hdf5-a4cfd0bfd7be9cba5d9af02825358f898885c5c4.zip hdf5-a4cfd0bfd7be9cba5d9af02825358f898885c5c4.tar.gz hdf5-a4cfd0bfd7be9cba5d9af02825358f898885c5c4.tar.bz2 |
Fix some collective metadata read issues
Diffstat (limited to 'src')
-rw-r--r-- | src/H5C.c | 55 | ||||
-rw-r--r-- | src/H5CX.c | 4 | ||||
-rw-r--r-- | src/H5Cimage.c | 4 | ||||
-rw-r--r-- | src/H5Dchunk.c | 22 | ||||
-rw-r--r-- | src/H5Dmpio.c | 6 | ||||
-rw-r--r-- | src/H5Z.c | 2 |
6 files changed, 29 insertions, 64 deletions
@@ -1483,31 +1483,17 @@ H5C_insert_entry(H5F_t * f, H5C__UPDATE_STATS_FOR_INSERTION(cache_ptr, entry_ptr) #ifdef H5_HAVE_PARALLEL - if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { - coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE); - - /* If not explicitly disabled, get the cmdr setting from the API context */ - if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read) - coll_access = H5CX_get_coll_metadata_read(); - } /* end if */ + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) + coll_access = H5CX_get_coll_metadata_read(); entry_ptr->coll_access = coll_access; if(coll_access) { H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL) /* Make sure the size of the collective entries in the cache remain in check */ - if(H5P_USER_TRUE == f->coll_md_read) { - if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) { - if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries") - } /* end if */ - } /* end if */ - else { - if(cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) { - if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries") - } /* end if */ - } /* end else */ + if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) + if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries") } /* end if */ #endif @@ -2243,13 +2229,8 @@ H5C_protect(H5F_t * f, ring = H5CX_get_ring(); #ifdef H5_HAVE_PARALLEL - if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { - coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE); - - /* If not explicitly disabled, get the cmdr setting from the API context */ - if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read) - coll_access = H5CX_get_coll_metadata_read(); - } /* end if */ + if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) + coll_access = H5CX_get_coll_metadata_read(); #endif /* H5_HAVE_PARALLEL */ /* first check to see if the target is in cache */ @@ -2286,7 +2267,7 @@ H5C_protect(H5F_t * f, the entry in their cache still have to participate in the bcast. */ #ifdef H5_HAVE_PARALLEL - if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI) && coll_access) { + if(coll_access) { if(!(entry_ptr->is_dirty) && !(entry_ptr->coll_access)) { MPI_Comm comm; /* File MPI Communicator */ int mpi_code; /* MPI error code */ @@ -2601,21 +2582,11 @@ H5C_protect(H5F_t * f, } /* end if */ #ifdef H5_HAVE_PARALLEL - if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) { - /* Make sure the size of the collective entries in the cache remain in check */ - if(coll_access) { - if(H5P_USER_TRUE == f->coll_md_read) { - if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) - if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries") - } /* end if */ - else { - if(cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) - if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) - HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries") - } /* end else */ - } /* end if */ - } /* end if */ + /* Make sure the size of the collective entries in the cache remain in check */ + if(coll_access) + if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) + if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0) + HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries") #endif /* H5_HAVE_PARALLEL */ done: @@ -807,8 +807,8 @@ H5CX_set_apl(hid_t *acspl_id, const H5P_libclass_t *libclass, #ifdef H5_HAVE_PARALLEL /* If this routine is not guaranteed to be collective (i.e. it doesn't - * modify the structural metadata in a file), check if we should use - * a collective metadata read. + * modify the structural metadata in a file), check if the application + * specified a collective metadata read for just this operation. */ if(!is_collective) { H5P_genplist_t *plist; /* Property list pointer */ diff --git a/src/H5Cimage.c b/src/H5Cimage.c index 26b6506..298a2dd 100644 --- a/src/H5Cimage.c +++ b/src/H5Cimage.c @@ -1065,9 +1065,9 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr) H5C__UPDATE_STATS_FOR_CACHE_IMAGE_READ(cache_ptr) #ifdef H5_HAVE_PARALLEL - if ( aux_ptr ) { + if ( aux_ptr ) { - /* Broadcast cache image */ + /* Broadcast cache image */ if ( MPI_SUCCESS != (mpi_result = MPI_Bcast(cache_ptr->image_buffer, (int)cache_ptr->image_len, MPI_BYTE, diff --git a/src/H5Dchunk.c b/src/H5Dchunk.c index 3fa4a23..13aced6 100644 --- a/src/H5Dchunk.c +++ b/src/H5Dchunk.c @@ -2929,9 +2929,6 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, /* Check for cached information */ if(!H5D__chunk_cinfo_cache_found(&dset->shared->cache.chunk.last, udata)) { H5D_chk_idx_info_t idx_info; /* Chunked index info */ -#ifdef H5_HAVE_PARALLEL - H5P_coll_md_read_flag_t temp_cmr; /* Temp value to hold the coll metadata read setting */ -#endif /* H5_HAVE_PARALLEL */ /* Compose chunked index info struct */ idx_info.f = dset->oloc.file; @@ -2940,25 +2937,18 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, idx_info.storage = &dset->shared->layout.storage.u.chunk; #ifdef H5_HAVE_PARALLEL - if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) { - /* disable collective metadata read for chunk indexes - as it is highly unlikely that users would read the - same chunks from all processes. MSC - might turn on - for root node? */ - temp_cmr = H5F_COLL_MD_READ(idx_info.f); - H5F_set_coll_md_read(idx_info.f, H5P_FORCE_FALSE); - } /* end if */ + /* Disable collective metadata read for chunk indexes as it is + * highly unlikely that users would read the same chunks from all + * processes. + */ + if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) + H5CX_set_coll_metadata_read(FALSE); #endif /* H5_HAVE_PARALLEL */ /* Go get the chunk information */ if((dset->shared->layout.storage.u.chunk.ops->get_addr)(&idx_info, udata) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't query chunk address") -#ifdef H5_HAVE_PARALLEL - if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) - H5F_set_coll_md_read(idx_info.f, temp_cmr); -#endif /* H5_HAVE_PARALLEL */ - /* * Cache the information retrieved. * diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c index 7352375..6803b60 100644 --- a/src/H5Dmpio.c +++ b/src/H5Dmpio.c @@ -800,6 +800,10 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf HDassert(type_info); HDassert(fm); + /* Disable collective metadata reads for chunked dataset I/O operations + * in order to prevent potential hangs */ + H5CX_set_coll_metadata_read(FALSE); + /* Check the optional property list for the collective chunk IO optimization option */ if(H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0) HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get chunk optimization option") @@ -2313,7 +2317,7 @@ if(H5DEBUG(D)) /* Broadcasting the MPI_IO option info. and chunk address info. */ if(MPI_SUCCESS != (mpi_code = MPI_Bcast(total_chunk_addr_array, (int)(sizeof(haddr_t) * fm->layout->u.chunk.nchunks), MPI_BYTE, (int)0, io_info->comm))) - HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code) + HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code) } /* end if */ /* Start at first node in chunk skip list */ @@ -603,7 +603,7 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void *key) /* Sanity check for collectively calling H5Zunregister, if requested */ /* (Sanity check assumes that a barrier on one file's comm * is sufficient (i.e. that there aren't different comms for - * different files). -QAK, 2018/02/14 + * different files). -QAK, 2018/02/14) */ if(H5_coll_api_sanity_check_g && !object->sanity_checked) { MPI_Comm mpi_comm; /* File's communicator */ |