author    jhendersonHDF <jhenderson@hdfgroup.org>    2022-01-22 14:40:33 (GMT)
committer GitHub <noreply@github.com>                2022-01-22 14:40:33 (GMT)
commit    99d3962a831167298ebc087f0b8e8b6209034d95 (patch)
tree      5c879275551180b76d0b14be52cdbf0a1b98ad8c
parent    d45124d7085de2771c0157f5d48d71b21a10de1f (diff)
Parallel rank0 deadlock fixes (#1183)
* Fix several places where rank 0 can skip past collective MPI operations on failure

* Committing clang-format changes

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
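To illustrate the failure mode this commit addresses: if rank 0 hits an error-handling goto before reaching a collective MPI call, the remaining ranks block forever inside that collective. A minimal sketch of the bug pattern in plain MPI C (illustrative names, not HDF5 code):

/* Minimal sketch of the deadlock pattern fixed in this commit.
 * Build with an MPI C compiler, e.g. mpicc, and run with 2+ ranks. */
#include <mpi.h>

static int rank0_only_work(void) { return -1; /* pretend it failed */ }

int main(int argc, char **argv)
{
    int rank, value = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        if (rank0_only_work() < 0)
            goto done; /* BUG: rank 0 skips the MPI_Bcast below,
                        * so all other ranks block in it forever */
        value = 42;
    }

    /* Collective: every rank must reach this call */
    MPI_Bcast(&value, 1, MPI_INT, 0, MPI_COMM_WORLD);

done:
    MPI_Finalize();
    return 0;
}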
-rw-r--r--  release_docs/RELEASE.txt |  12
-rw-r--r--  src/H5AC.c               |  11
-rw-r--r--  src/H5ACmpio.c           |  97
-rw-r--r--  src/H5C.c                |  42
-rw-r--r--  src/H5CX.c               |   8
-rw-r--r--  src/H5Cimage.c           |   3
-rw-r--r--  src/H5Dcontig.c          |  13
-rw-r--r--  src/H5Dmpio.c            |  12
-rw-r--r--  src/H5FDmpio.c           |  53
9 files changed, 180 insertions(+), 71 deletions(-)
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index d059fb3..bba27c9 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -1083,6 +1083,18 @@ Bug Fixes since HDF5-1.12.0 release
(DER - 2021/11/23, HDFFV-11286)
+ - Fixed several potential MPI deadlocks in library failure conditions
+
+ In the parallel library, there were several places where MPI rank 0
+ could end up skipping past collective MPI operations when some failure
+ occurs in rank 0-specific processing. This would lead to deadlocks
+ where rank 0 completes an operation while other ranks wait in the
+ collective operation. These places have been rewritten to have rank 0
+ push an error and try to clean up after the failure, then continue to
+ participate in the collective operation to the best of its ability.
+
+ (JTH - 2021/11/09)
+
- Fixed an issue with collective metadata reads being permanently disabled
after a dataset chunk lookup operation. This would usually cause a
mismatched MPI_Bcast and MPI_ERR_TRUNCATE issue in the library for
diff --git a/src/H5AC.c b/src/H5AC.c
index 6fbc63e..e20be3b 100644
--- a/src/H5AC.c
+++ b/src/H5AC.c
@@ -1636,9 +1636,14 @@ H5AC_unprotect(H5F_t *f, const H5AC_class_t *type, haddr_t addr, void *thing, un
if (H5AC__log_dirtied_entry((H5AC_info_t *)thing) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "can't log dirtied entry")
- if (deleted && aux_ptr->mpi_rank == 0)
- if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
+ if (deleted && aux_ptr->mpi_rank == 0) {
+ if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0) {
+ /* If we fail to log the deleted entry, push an error but still
+ * participate in a possible sync point ahead
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
+ }
+ }
} /* end if */
#endif /* H5_HAVE_PARALLEL */
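The recurring fix in these hunks swaps HGOTO_ERROR (which pushes an error and jumps to the function's done: target) for HDONE_ERROR (which pushes an error and falls through), so rank 0 still reaches the collective operation the other ranks are waiting in. A simplified sketch of the corrected control flow, using stand-in macros (the real H5E macros also push onto the HDF5 error stack):

#include <mpi.h>

/* Illustrative stand-ins for the HDF5 error macros */
#define GOTO_ERROR_SKETCH(ret) do { ret_value = (ret); goto done; } while (0)
#define DONE_ERROR_SKETCH(ret) do { ret_value = (ret); } while (0)

static int rank0_only_work(void) { return -1; /* pretend it failed */ }

static int rank0_step_then_bcast(int mpi_rank, MPI_Comm comm)
{
    int ret_value = 0; /* assume success */
    int value     = 0;
    int mpi_code;

    if (mpi_rank == 0) {
        if (rank0_only_work() < 0)
            /* Record the failure but fall through: rank 0 must still
             * participate in the MPI_Bcast below */
            DONE_ERROR_SKETCH(-1);
        else
            value = 42;
    }

    /* Collective: reached by every rank, even after a rank 0 failure */
    if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&value, 1, MPI_INT, 0, comm)))
        GOTO_ERROR_SKETCH(-1); /* jumping is safe once the collective is over */

done:
    return ret_value;
}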
diff --git a/src/H5ACmpio.c b/src/H5ACmpio.c
index dc155f5..7eaf751 100644
--- a/src/H5ACmpio.c
+++ b/src/H5ACmpio.c
@@ -304,8 +304,10 @@ H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, unsigned *num_entries_ptr, had
* are used to receiving from process 0, and also load it
* into a buffer for transmission.
*/
- if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
+ if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
+ }
HDassert(chk_num_entries == num_entries);
HDassert(haddr_buf_ptr != NULL);
@@ -428,18 +430,23 @@ H5AC__broadcast_clean_list(H5AC_t *cache_ptr)
/* allocate a buffer to store the list of entry base addresses in */
buf_size = sizeof(haddr_t) * num_entries;
- if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")
-
- /* Set up user data for callback */
- udata.aux_ptr = aux_ptr;
- udata.addr_buf_ptr = addr_buf_ptr;
- udata.u = 0;
-
- /* Free all the clean list entries, building the address list in the callback */
- /* (Callback also removes the matching entries from the dirtied list) */
- if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
+ if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")
+ }
+ else {
+ /* Set up user data for callback */
+ udata.aux_ptr = aux_ptr;
+ udata.addr_buf_ptr = addr_buf_ptr;
+ udata.u = 0;
+
+ /* Free all the clean list entries, building the address list in the callback */
+ /* (Callback also removes the matching entries from the dirtied list) */
+ if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
+ }
+ }
/* Now broadcast the list of cleaned entries */
if (MPI_SUCCESS !=
@@ -1448,8 +1455,10 @@ H5AC__receive_haddr_list(MPI_Comm mpi_comm, unsigned *num_entries_ptr, haddr_t *
/* allocate buffers to store the list of entry base addresses in */
buf_size = sizeof(haddr_t) * num_entries;
- if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
+ if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
+ }
/* Now receive the list of candidate entries */
if (MPI_SUCCESS !=
@@ -1800,10 +1809,14 @@ H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f)
if (evictions_enabled) {
/* construct candidate list -- process 0 only */
- if (aux_ptr->mpi_rank == 0)
+ if (aux_ptr->mpi_rank == 0) {
+ /* If constructing candidate list fails, push an error but still participate
+ * in collective operations during following candidate list propagation
+ */
if (H5AC__construct_candidate_list(cache_ptr, aux_ptr, H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) <
0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
+ }
/* propagate and apply candidate list -- all processes */
if (H5AC__propagate_and_apply_candidate_list(f) < 0)
@@ -1899,15 +1912,21 @@ H5AC__rsp__p0_only__flush(H5F_t *f)
aux_ptr->write_permitted = FALSE;
/* Check for error on the write operation */
- if (result < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
-
- /* this code exists primarily for the test bed -- it allows us to
- * enforce POSIX semantics on the server that pretends to be a
- * file system in our parallel tests.
- */
- if (aux_ptr->write_done)
- (aux_ptr->write_done)();
+ if (result < 0) {
+ /* If write operation fails, push an error but still participate
+ * in collective operations during following cache entry
+ * propagation
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
+ }
+ else {
+ /* this code exists primarily for the test bed -- it allows us to
+ * enforce POSIX semantics on the server that pretends to be a
+ * file system in our parallel tests.
+ */
+ if (aux_ptr->write_done)
+ (aux_ptr->write_done)();
+ }
} /* end if */
/* Propagate cleaned entries to other ranks. */
@@ -2019,15 +2038,21 @@ H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f)
aux_ptr->write_permitted = FALSE;
/* Check for error on the write operation */
- if (result < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
-
- /* this call exists primarily for the test code -- it is used
- * to enforce POSIX semantics on the process used to simulate
- * reads and writes in t_cache.c.
- */
- if (aux_ptr->write_done)
- (aux_ptr->write_done)();
+ if (result < 0) {
+ /* If write operation fails, push an error but still participate
+ * in collective operations during following cache entry
+ * propagation
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
+ }
+ else {
+ /* this call exists primarily for the test code -- it is used
+ * to enforce POSIX semantics on the process used to simulate
+ * reads and writes in t_cache.c.
+ */
+ if (aux_ptr->write_done)
+ (aux_ptr->write_done)();
+ }
} /* end if */
if (H5AC__propagate_flushed_and_still_clean_entries_list(f) < 0)
diff --git a/src/H5C.c b/src/H5C.c
index d34c650..889351d 100644
--- a/src/H5C.c
+++ b/src/H5C.c
@@ -2307,9 +2307,14 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
H5MM_memcpy(((uint8_t *)entry_ptr->image_ptr) + entry_ptr->size, H5C_IMAGE_SANITY_VALUE,
H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
- if (0 == mpi_rank)
- if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
+ if (0 == mpi_rank) {
+ if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0) {
+ /* If image generation fails, push an error but
+ * still participate in the following MPI_Bcast
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
+ }
+ }
} /* end if */
HDassert(entry_ptr->image_ptr);
@@ -7182,8 +7187,20 @@ H5C__load_entry(H5F_t *f,
#ifdef H5_HAVE_PARALLEL
if (!coll_access || 0 == mpi_rank) {
#endif /* H5_HAVE_PARALLEL */
- if (H5F_block_read(f, type->mem_type, addr, len, image) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+
+ if (H5F_block_read(f, type->mem_type, addr, len, image) < 0) {
+
+#ifdef H5_HAVE_PARALLEL
+ if (coll_access) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDmemset(image, 0, len);
+ HDONE_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+ }
+ else
+#endif
+ HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+ }
+
#ifdef H5_HAVE_PARALLEL
} /* end if */
/* if the collective metadata read optimization is turned on,
@@ -7230,8 +7247,19 @@ H5C__load_entry(H5F_t *f,
* loaded thing, go get the on-disk image again (the extra portion).
*/
if (H5F_block_read(f, type->mem_type, addr + len, actual_len - len, image + len) <
- 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ 0) {
+
+#ifdef H5_HAVE_PARALLEL
+ if (coll_access) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDmemset(image + len, 0, actual_len - len);
+ HDONE_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ }
+ else
+#endif
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ }
+
#ifdef H5_HAVE_PARALLEL
}
/* If the collective metadata read optimization is turned on,
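The H5C__load_entry changes above use another variant of the same idea: when the rank 0 read fails under collective access, the image buffer is zeroed so that the MPI_Bcast the other ranks are already waiting in still transfers defined bytes; those ranks then fail later sanity checks instead of hanging. A condensed sketch (disk_read is a hypothetical stand-in for H5F_block_read):

#include <mpi.h>
#include <string.h>

static int disk_read(unsigned char *buf, int len) /* hypothetical helper */
{
    (void)buf; (void)len;
    return -1; /* pretend the read failed */
}

static int read_image_and_bcast(int mpi_rank, MPI_Comm comm,
                                unsigned char *image, int len)
{
    int ret_value = 0;

    if (mpi_rank == 0)
        if (disk_read(image, len) < 0) {
            memset(image, 0, (size_t)len); /* defined, if useless, contents */
            ret_value = -1;                /* record the error, keep going */
        }

    /* Every rank participates; non-zero ranks receive the zeroed image
     * and fail later checks (e.g. checksum verification) */
    if (MPI_SUCCESS != MPI_Bcast(image, len, MPI_BYTE, 0, comm))
        ret_value = -1;

    return ret_value;
}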
diff --git a/src/H5CX.c b/src/H5CX.c
index b9ce682..89e4c91 100644
--- a/src/H5CX.c
+++ b/src/H5CX.c
@@ -1397,9 +1397,7 @@ H5CX_set_apl(hid_t *acspl_id, const H5P_libclass_t *libclass,
/* If parallel is enabled and the file driver used is the MPI-IO
* VFD, issue an MPI barrier for easier debugging if the API function
- * calling this is supposed to be called collectively. Note that this
- * happens only when the environment variable H5_COLL_BARRIER is set
- * to non 0.
+ * calling this is supposed to be called collectively.
*/
if (H5_coll_api_sanity_check_g) {
MPI_Comm mpi_comm; /* File communicator */
@@ -1456,9 +1454,7 @@ H5CX_set_loc(hid_t
/* If parallel is enabled and the file driver used is the MPI-IO
* VFD, issue an MPI barrier for easier debugging if the API function
- * calling this is supposed to be called collectively. Note that this
- * happens only when the environment variable H5_COLL_BARRIER is set
- * to non 0.
+ * calling this is supposed to be called collectively.
*/
if (H5_coll_api_sanity_check_g) {
MPI_Comm mpi_comm; /* File communicator */
diff --git a/src/H5Cimage.c b/src/H5Cimage.c
index 7421c90..98c1291 100644
--- a/src/H5Cimage.c
+++ b/src/H5Cimage.c
@@ -997,6 +997,9 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr)
#endif /* H5_HAVE_PARALLEL */
/* Read the buffer (if serial access, or rank 0 of parallel access) */
+ /* NOTE: if this block read is being performed on rank 0 only, throwing
+ * an error here will cause other ranks to hang in the following MPI_Bcast.
+ */
if (H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len,
cache_ptr->image_buffer) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block")
diff --git a/src/H5Dcontig.c b/src/H5Dcontig.c
index 2ace14b..3828e8e 100644
--- a/src/H5Dcontig.c
+++ b/src/H5Dcontig.c
@@ -278,9 +278,16 @@ H5D__contig_fill(const H5D_io_info_t *io_info)
if (using_mpi) {
/* Write the chunks out from only one process */
/* !! Use the internal "independent" DXPL!! -QAK */
- if (H5_PAR_META_WRITE == mpi_rank)
- if (H5D__contig_write_one(&ioinfo, offset, size) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
+ if (H5_PAR_META_WRITE == mpi_rank) {
+ if (H5D__contig_write_one(&ioinfo, offset, size) < 0) {
+ /* If writing fails, push an error and stop writing, but
+ * still participate in following MPI_Barrier.
+ */
+ blocks_written = TRUE;
+ HDONE_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
+ break;
+ }
+ }
/* Indicate that blocks are being written */
blocks_written = TRUE;
diff --git a/src/H5Dmpio.c b/src/H5Dmpio.c
index f8bce33..cf8a6ef 100644
--- a/src/H5Dmpio.c
+++ b/src/H5Dmpio.c
@@ -2360,8 +2360,16 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
if (mpi_rank == 0) {
- if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
+ if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) {
+ size_t u;
+
+ /* Clear total chunk address array */
+ for (u = 0; u < (size_t)fm->layout->u.chunk.nchunks; u++)
+ total_chunk_addr_array[u] = HADDR_UNDEF;
+
+ /* Push error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
+ }
} /* end if */
/* Broadcasting the MPI_IO option info. and chunk address info. */
diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c
index c72578d..1feff43 100644
--- a/src/H5FDmpio.c
+++ b/src/H5FDmpio.c
@@ -864,14 +864,19 @@ H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR
file->mpi_size = mpi_size;
/* Only processor p0 will get the filesize and broadcast it. */
- if (mpi_rank == 0)
+ if (mpi_rank == 0) {
+ /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(fh, &file_size)))
- HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
+ file_size = (MPI_Offset)-1;
+ }
/* Broadcast file size */
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&file_size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm)))
HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
+ if (file_size < 0)
+ HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
+
/* Determine if the file should be truncated */
if (file_size && (flags & H5F_ACC_TRUNC)) {
/* Truncate the file */
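H5FD__mpio_open (and H5FD__mpio_truncate below) take a third approach: instead of pushing an error and continuing, rank 0 encodes the failure into the broadcast payload as a -1 sentinel, and every rank checks for it after the MPI_Bcast, so all ranks fail at the same point. A condensed sketch of that pattern, outside the H5FD error-macro machinery:

#include <mpi.h>

static int get_size_collectively(int mpi_rank, MPI_Comm comm, MPI_File fh,
                                 MPI_Offset *size_out)
{
    MPI_Offset file_size = 0;

    /* Only rank 0 queries the size; failure becomes a -1 sentinel */
    if (mpi_rank == 0)
        if (MPI_SUCCESS != MPI_File_get_size(fh, &file_size))
            file_size = (MPI_Offset)-1;

    /* All ranks participate in the broadcast */
    if (MPI_SUCCESS !=
        MPI_Bcast(&file_size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm))
        return -1;

    /* Every rank sees the sentinel and fails in the same place */
    if (file_size < 0)
        return -1;

    *size_out = file_size;
    return 0;
}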
@@ -1262,10 +1267,14 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
rank0_bcast = TRUE;
/* Read on rank 0 Bcast to other ranks */
- if (file->mpi_rank == 0)
+ if (file->mpi_rank == 0) {
+ /* If MPI_File_read_at fails, push an error, but continue
+ * to participate in following MPI_Bcast */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ }
+
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
} /* end if */
@@ -1309,11 +1318,21 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
/* How many bytes were actually read? */
#if MPI_VERSION >= 3
- if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) {
#else
- if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) {
#endif
- HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ if (rank0_bcast && file->mpi_rank == 0) {
+ /* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy,
+ * push an error, but continue to participate in the following
+ * MPI_Bcast.
+ */
+ bytes_read = -1;
+ HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ }
+ else
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ }
} /* end if */
/* If the rank0-bcast feature was used, broadcast the # of bytes read to
@@ -1703,17 +1722,19 @@ H5FD__mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
/* Only processor p0 will get the filesize and broadcast it. */
- /* (Note that throwing an error here will cause non-rank 0 processes
- * to hang in following Bcast. -QAK, 3/17/2018)
- */
- if (0 == file->mpi_rank)
+ if (0 == file->mpi_rank) {
+ /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(file->f, &size)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
+ size = (MPI_Offset)-1;
+ }
/* Broadcast file size */
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
+ if (size < 0)
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
+
if (H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset")
@@ -1796,9 +1817,13 @@ H5FD__mpio_delete(const char *filename, hid_t fapl_id)
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
/* Delete the file */
- if (mpi_rank == 0)
+ if (mpi_rank == 0) {
+ /* If MPI_File_delete fails, push an error but
+ * still participate in the following MPI_Barrier
+ */
if (MPI_SUCCESS != (mpi_code = MPI_File_delete(filename, info)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_delete failed", mpi_code)
+ HMPI_DONE_ERROR(FAIL, "MPI_File_delete failed", mpi_code)
+ }
/* Set up a barrier (don't want processes to run ahead of the delete) */
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))