From 99d3962a831167298ebc087f0b8e8b6209034d95 Mon Sep 17 00:00:00 2001
From: jhendersonHDF
Date: Sat, 22 Jan 2022 08:40:33 -0600
Subject: Parallel rank0 deadlock fixes (#1183)

* Fix several places where rank 0 can skip past collective MPI operations
  on failure

* Committing clang-format changes

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
---
 release_docs/RELEASE.txt | 12 ++++++
 src/H5AC.c               | 11 ++++--
 src/H5ACmpio.c           | 97 ++++++++++++++++++++++++++++++------------------
 src/H5C.c                | 42 +++++++++++++++++----
 src/H5CX.c               |  8 +---
 src/H5Cimage.c           |  3 ++
 src/H5Dcontig.c          | 13 +++++--
 src/H5Dmpio.c            | 12 +++++-
 src/H5FDmpio.c           | 53 +++++++++++++++++++-------
 9 files changed, 180 insertions(+), 71 deletions(-)

diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index d059fb3..bba27c9 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -1083,6 +1083,18 @@ Bug Fixes since HDF5-1.12.0 release
 
           (DER - 2021/11/23, HDFFV-11286)
 
+    - Fixed several potential MPI deadlocks in library failure conditions
+
+      In the parallel library, there were several places where MPI rank 0
+      could end up skipping past collective MPI operations when a failure
+      occurs in rank 0-specific processing. This would lead to deadlocks
+      where rank 0 completes an operation while other ranks wait in the
+      collective operation. These places have been rewritten to have rank 0
+      push an error and try to clean up after the failure, then continue to
+      participate in the collective operation to the best of its ability.
+
+      (JTH - 2021/11/09)
+
     - Fixed an issue with collective metadata reads being permanently
       disabled after a dataset chunk lookup operation. This would usually
       cause a mismatched MPI_Bcast and MPI_ERR_TRUNCATE issue in the library for
diff --git a/src/H5AC.c b/src/H5AC.c
index 6fbc63e..e20be3b 100644
--- a/src/H5AC.c
+++ b/src/H5AC.c
@@ -1636,9 +1636,14 @@ H5AC_unprotect(H5F_t *f, const H5AC_class_t *type, haddr_t addr, void *thing, un
             if (H5AC__log_dirtied_entry((H5AC_info_t *)thing) < 0)
                 HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "can't log dirtied entry")
 
-        if (deleted && aux_ptr->mpi_rank == 0)
-            if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0)
-                HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
+        if (deleted && aux_ptr->mpi_rank == 0) {
+            if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0) {
+                /* If we fail to log the deleted entry, push an error but still
+                 * participate in a possible sync point ahead
+                 */
+                HDONE_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
+            }
+        }
     } /* end if */
 #endif /* H5_HAVE_PARALLEL */
 
diff --git a/src/H5ACmpio.c b/src/H5ACmpio.c
index dc155f5..7eaf751 100644
--- a/src/H5ACmpio.c
+++ b/src/H5ACmpio.c
@@ -304,8 +304,10 @@ H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, unsigned *num_entries_ptr, had
      * are used to receiving from process 0, and also load it
      * into a buffer for transmission.
      */
-    if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0)
-        HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
+    if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0) {
+        /* Push an error, but still participate in following MPI_Bcast */
+        HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
+    }
 
     HDassert(chk_num_entries == num_entries);
     HDassert(haddr_buf_ptr != NULL);
@@ -428,18 +430,23 @@ H5AC__broadcast_clean_list(H5AC_t *cache_ptr)
 
     /* allocate a buffer to store the list of entry base addresses in */
     buf_size = sizeof(haddr_t) * num_entries;
-    if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
-        HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")
-
-    /* Set up user data for callback */
-    udata.aux_ptr      = aux_ptr;
-    udata.addr_buf_ptr = addr_buf_ptr;
-    udata.u            = 0;
-
-    /* Free all the clean list entries, building the address list in the callback */
-    /* (Callback also removes the matching entries from the dirtied list) */
-    if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0)
-        HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
+    if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
+        /* Push an error, but still participate in following MPI_Bcast */
+        HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")
+    }
+    else {
+        /* Set up user data for callback */
+        udata.aux_ptr      = aux_ptr;
+        udata.addr_buf_ptr = addr_buf_ptr;
+        udata.u            = 0;
+
+        /* Free all the clean list entries, building the address list in the callback */
+        /* (Callback also removes the matching entries from the dirtied list) */
+        if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0) {
+            /* Push an error, but still participate in following MPI_Bcast */
+            HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
+        }
+    }
 
     /* Now broadcast the list of cleaned entries */
     if (MPI_SUCCESS !=
@@ -1448,8 +1455,10 @@ H5AC__receive_haddr_list(MPI_Comm mpi_comm, unsigned *num_entries_ptr, haddr_t *
 
         /* allocate buffers to store the list of entry base addresses in */
         buf_size = sizeof(haddr_t) * num_entries;
-        if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
-            HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
+        if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
+            /* Push an error, but still participate in following MPI_Bcast */
+            HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
+        }
 
         /* Now receive the list of candidate entries */
         if (MPI_SUCCESS !=
@@ -1800,10 +1809,14 @@ H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f)
     if (evictions_enabled) {
         /* construct candidate list -- process 0 only */
-        if (aux_ptr->mpi_rank == 0)
+        if (aux_ptr->mpi_rank == 0) {
+            /* If constructing candidate list fails, push an error but still participate
+             * in collective operations during following candidate list propagation
+             */
             if (H5AC__construct_candidate_list(cache_ptr, aux_ptr, H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) <
                 0)
-                HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
+                HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
+        }
 
         /* propagate and apply candidate list -- all processes */
         if (H5AC__propagate_and_apply_candidate_list(f) < 0)
@@ -1899,15 +1912,21 @@ H5AC__rsp__p0_only__flush(H5F_t *f)
 
         aux_ptr->write_permitted = FALSE;
 
         /* Check for error on the write operation */
-        if (result < 0)
-            HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
-
-        /* this code exists primarily for the test bed -- it allows us to
-         * enforce POSIX semantics on the server that pretends to be a
-         * file system in our parallel tests.
-         */
-        if (aux_ptr->write_done)
-            (aux_ptr->write_done)();
+        if (result < 0) {
+            /* If write operation fails, push an error but still participate
+             * in collective operations during following cache entry
+             * propagation
+             */
+            HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
+        }
+        else {
+            /* this code exists primarily for the test bed -- it allows us to
+             * enforce POSIX semantics on the server that pretends to be a
+             * file system in our parallel tests.
+             */
+            if (aux_ptr->write_done)
+                (aux_ptr->write_done)();
+        }
     } /* end if */
 
     /* Propagate cleaned entries to other ranks. */
@@ -2019,15 +2038,21 @@ H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f)
 
         aux_ptr->write_permitted = FALSE;
 
         /* Check for error on the write operation */
-        if (result < 0)
-            HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
-
-        /* this call exists primarily for the test code -- it is used
-         * to enforce POSIX semantics on the process used to simulate
-         * reads and writes in t_cache.c.
-         */
-        if (aux_ptr->write_done)
-            (aux_ptr->write_done)();
+        if (result < 0) {
+            /* If write operation fails, push an error but still participate
+             * in collective operations during following cache entry
+             * propagation
+             */
+            HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
+        }
+        else {
+            /* this call exists primarily for the test code -- it is used
+             * to enforce POSIX semantics on the process used to simulate
+             * reads and writes in t_cache.c.
+             */
+            if (aux_ptr->write_done)
+                (aux_ptr->write_done)();
+        }
     } /* end if */
 
     if (H5AC__propagate_flushed_and_still_clean_entries_list(f) < 0)
diff --git a/src/H5C.c b/src/H5C.c
index d34c650..889351d 100644
--- a/src/H5C.c
+++ b/src/H5C.c
@@ -2307,9 +2307,14 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
             H5MM_memcpy(((uint8_t *)entry_ptr->image_ptr) + entry_ptr->size, H5C_IMAGE_SANITY_VALUE, H5C_IMAGE_EXTRA_SPACE);
 #endif /* H5C_DO_MEMORY_SANITY_CHECKS */
 
-            if (0 == mpi_rank)
-                if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0)
-                    HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
+            if (0 == mpi_rank) {
+                if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0) {
+                    /* If image generation fails, push an error but
+                     * still participate in the following MPI_Bcast
+                     */
+                    HDONE_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
+                }
+            }
         } /* end if */
 
         HDassert(entry_ptr->image_ptr);
@@ -7182,8 +7187,20 @@ H5C__load_entry(H5F_t *f,
 #ifdef H5_HAVE_PARALLEL
         if (!coll_access || 0 == mpi_rank) {
 #endif /* H5_HAVE_PARALLEL */
-            if (H5F_block_read(f, type->mem_type, addr, len, image) < 0)
-                HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+
+            if (H5F_block_read(f, type->mem_type, addr, len, image) < 0) {
+
+#ifdef H5_HAVE_PARALLEL
+                if (coll_access) {
+                    /* Push an error, but still participate in following MPI_Bcast */
+                    HDmemset(image, 0, len);
+                    HDONE_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+                }
+                else
+#endif
+                    HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+            }
+
 #ifdef H5_HAVE_PARALLEL
         } /* end if */
 
         /* if the collective metadata read optimization is turned on,
@@ -7230,8 +7247,19 @@ H5C__load_entry(H5F_t *f,
                      * loaded thing, go get the on-disk image again (the extra portion).
                      */
                     if (H5F_block_read(f, type->mem_type, addr + len, actual_len - len, image + len) <
-                        0)
-                        HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+                        0) {
+
+#ifdef H5_HAVE_PARALLEL
+                        if (coll_access) {
+                            /* Push an error, but still participate in following MPI_Bcast */
+                            HDmemset(image + len, 0, actual_len - len);
+                            HDONE_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+                        }
+                        else
+#endif
+                            HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+                    }
+
 #ifdef H5_HAVE_PARALLEL
                 }
 
                 /* If the collective metadata read optimization is turned on,
diff --git a/src/H5CX.c b/src/H5CX.c
index b9ce682..89e4c91 100644
--- a/src/H5CX.c
+++ b/src/H5CX.c
@@ -1397,9 +1397,7 @@ H5CX_set_apl(hid_t *acspl_id, const H5P_libclass_t *libclass,
 
     /* If parallel is enabled and the file driver used is the MPI-IO
      * VFD, issue an MPI barrier for easier debugging if the API function
-     * calling this is supposed to be called collectively. Note that this
-     * happens only when the environment variable H5_COLL_BARRIER is set
-     * to non 0.
+     * calling this is supposed to be called collectively.
      */
     if (H5_coll_api_sanity_check_g) {
         MPI_Comm mpi_comm; /* File communicator */
@@ -1456,9 +1454,7 @@ H5CX_set_loc(hid_t
 
     /* If parallel is enabled and the file driver used is the MPI-IO
      * VFD, issue an MPI barrier for easier debugging if the API function
-     * calling this is supposed to be called collectively. Note that this
-     * happens only when the environment variable H5_COLL_BARRIER is set
-     * to non 0.
+     * calling this is supposed to be called collectively.
      */
     if (H5_coll_api_sanity_check_g) {
         MPI_Comm mpi_comm; /* File communicator */
diff --git a/src/H5Cimage.c b/src/H5Cimage.c
index 7421c90..98c1291 100644
--- a/src/H5Cimage.c
+++ b/src/H5Cimage.c
@@ -997,6 +997,9 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr)
 #endif /* H5_HAVE_PARALLEL */
 
         /* Read the buffer (if serial access, or rank 0 of parallel access) */
+        /* NOTE: if this block read is being performed on rank 0 only, throwing
+         * an error here will cause other ranks to hang in the following MPI_Bcast.
+         */
         if (H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len,
                            cache_ptr->image_buffer) < 0)
             HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block")
diff --git a/src/H5Dcontig.c b/src/H5Dcontig.c
index 2ace14b..3828e8e 100644
--- a/src/H5Dcontig.c
+++ b/src/H5Dcontig.c
@@ -278,9 +278,16 @@ H5D__contig_fill(const H5D_io_info_t *io_info)
         if (using_mpi) {
             /* Write the chunks out from only one process */
             /* !! Use the internal "independent" DXPL!! -QAK */
-            if (H5_PAR_META_WRITE == mpi_rank)
-                if (H5D__contig_write_one(&ioinfo, offset, size) < 0)
-                    HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
+            if (H5_PAR_META_WRITE == mpi_rank) {
+                if (H5D__contig_write_one(&ioinfo, offset, size) < 0) {
+                    /* If writing fails, push an error and stop writing, but
+                     * still participate in following MPI_Barrier.
+                     */
+                    blocks_written = TRUE;
+                    HDONE_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
+                    break;
+                }
+            }
 
             /* Indicate that blocks are being written */
             blocks_written = TRUE;
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index f8bce33..cf8a6ef 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -2360,8 +2360,16 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
         HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
 
     if (mpi_rank == 0) {
-        if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0)
-            HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
+        if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) {
+            size_t u;
+
+            /* Clear total chunk address array */
+            for (u = 0; u < (size_t)fm->layout->u.chunk.nchunks; u++)
+                total_chunk_addr_array[u] = HADDR_UNDEF;
+
+            /* Push error, but still participate in following MPI_Bcast */
+            HDONE_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
+        }
     } /* end if */
 
     /* Broadcasting the MPI_IO option info. and chunk address info. */
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index c72578d..1feff43 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -864,14 +864,19 @@ H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR
     file->mpi_size = mpi_size;
 
     /* Only processor p0 will get the filesize and broadcast it. */
-    if (mpi_rank == 0)
+    if (mpi_rank == 0) {
+        /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
         if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(fh, &file_size)))
-            HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
+            file_size = (MPI_Offset)-1;
+    }
 
     /* Broadcast file size */
     if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&file_size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm)))
         HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
 
+    if (file_size < 0)
+        HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
+
     /* Determine if the file should be truncated */
     if (file_size && (flags & H5F_ACC_TRUNC)) {
         /* Truncate the file */
@@ -1262,10 +1267,14 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
             rank0_bcast = TRUE;
 
             /* Read on rank 0 Bcast to other ranks */
-            if (file->mpi_rank == 0)
+            if (file->mpi_rank == 0) {
+                /* If MPI_File_read_at fails, push an error, but continue
+                 * to participate in following MPI_Bcast */
                 if (MPI_SUCCESS !=
                     (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
-                    HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+                    HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+            }
+
             if (MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
                 HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
         } /* end if */
@@ -1309,11 +1318,21 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
     if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
         /* How many bytes were actually read? */
 #if MPI_VERSION >= 3
-        if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+        if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) {
 #else
-        if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+        if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) {
 #endif
-            HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+            if (rank0_bcast && file->mpi_rank == 0) {
+                /* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy,
+                 * push an error, but continue to participate in the following
+                 * MPI_Bcast.
+                 */
+                bytes_read = -1;
+                HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+            }
+            else
+                HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+        }
     } /* end if */
 
     /* If the rank0-bcast feature was used, broadcast the # of bytes read to
@@ -1703,17 +1722,19 @@ H5FD__mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR
             HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
 
         /* Only processor p0 will get the filesize and broadcast it. */
-        /* (Note that throwing an error here will cause non-rank 0 processes
-         * to hang in following Bcast. -QAK, 3/17/2018)
-         */
-        if (0 == file->mpi_rank)
+        if (0 == file->mpi_rank) {
+            /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
             if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(file->f, &size)))
-                HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
+                size = (MPI_Offset)-1;
+        }
 
         /* Broadcast file size */
         if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, file->comm)))
             HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
 
+        if (size < 0)
+            HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
+
         if (H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0)
             HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset")
@@ -1796,9 +1817,13 @@ H5FD__mpio_delete(const char *filename, hid_t fapl_id)
         HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
 
     /* Delete the file */
-    if (mpi_rank == 0)
+    if (mpi_rank == 0) {
+        /* If MPI_File_delete fails, push an error but
+         * still participate in the following MPI_Barrier
+         */
         if (MPI_SUCCESS != (mpi_code = MPI_File_delete(filename, info)))
-            HMPI_GOTO_ERROR(FAIL, "MPI_File_delete failed", mpi_code)
+            HMPI_DONE_ERROR(FAIL, "MPI_File_delete failed", mpi_code)
+    }
 
     /* Set up a barrier (don't want processes to run ahead of the delete) */
     if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
-- cgit v0.12
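
The pattern this patch applies can be illustrated outside the HDF5 tree with a minimal standalone MPI program (a hypothetical sketch, not code from the commit): instead of jumping to its error-handling path and skipping the collective call, a failing rank 0 records the error locally, substitutes a sentinel value, and still enters the MPI_Bcast, so the other ranks are released from the collective and can observe the failure themselves.

/* Sketch of the rank-0 deadlock-avoidance pattern (illustrative only).
 * Build: mpicc pattern.c -o pattern && mpiexec -n 4 ./pattern
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical rank 0-specific operation that may fail */
static int rank0_work(long long *value_out)
{
    *value_out = 42; /* pretend result */
    return 0;        /* change to -1 to simulate a failure */
}

int main(int argc, char **argv)
{
    long long value = 0;
    int       rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        /* WRONG: returning or aborting here would leave the other ranks
         * blocked in the MPI_Bcast below. Instead, record the error and
         * broadcast -1 as a sentinel.
         */
        if (rank0_work(&value) < 0)
            value = -1; /* signal failure to all ranks */
    }

    /* All ranks, including a failed rank 0, participate in the collective */
    MPI_Bcast(&value, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);

    if (value < 0) {
        /* Every rank sees the failure and can bail out without deadlocking */
        fprintf(stderr, "rank %d: rank 0 operation failed\n", rank);
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    MPI_Finalize();
    return EXIT_SUCCESS;
}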