summaryrefslogtreecommitdiffstats
path: root/src/H5C.c
diff options
context:
space:
mode:
authorjhendersonHDF <jhenderson@hdfgroup.org>2022-01-22 14:40:33 (GMT)
committerGitHub <noreply@github.com>2022-01-22 14:40:33 (GMT)
commit99d3962a831167298ebc087f0b8e8b6209034d95 (patch)
tree5c879275551180b76d0b14be52cdbf0a1b98ad8c /src/H5C.c
parentd45124d7085de2771c0157f5d48d71b21a10de1f (diff)
downloadhdf5-99d3962a831167298ebc087f0b8e8b6209034d95.zip
hdf5-99d3962a831167298ebc087f0b8e8b6209034d95.tar.gz
hdf5-99d3962a831167298ebc087f0b8e8b6209034d95.tar.bz2
Parallel rank0 deadlock fixes (#1183)
* Fix several places where rank 0 can skip past collective MPI operations on failure * Committing clang-format changes Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Diffstat (limited to 'src/H5C.c')
-rw-r--r--src/H5C.c42
1 files changed, 35 insertions, 7 deletions
diff --git a/src/H5C.c b/src/H5C.c
index d34c650..889351d 100644
--- a/src/H5C.c
+++ b/src/H5C.c
@@ -2307,9 +2307,14 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
H5MM_memcpy(((uint8_t *)entry_ptr->image_ptr) + entry_ptr->size, H5C_IMAGE_SANITY_VALUE,
H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
- if (0 == mpi_rank)
- if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
+ if (0 == mpi_rank) {
+ if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0) {
+ /* If image generation fails, push an error but
+ * still participate in the following MPI_Bcast
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
+ }
+ }
} /* end if */
HDassert(entry_ptr->image_ptr);
@@ -7182,8 +7187,20 @@ H5C__load_entry(H5F_t *f,
#ifdef H5_HAVE_PARALLEL
if (!coll_access || 0 == mpi_rank) {
#endif /* H5_HAVE_PARALLEL */
- if (H5F_block_read(f, type->mem_type, addr, len, image) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+
+ if (H5F_block_read(f, type->mem_type, addr, len, image) < 0) {
+
+#ifdef H5_HAVE_PARALLEL
+ if (coll_access) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDmemset(image, 0, len);
+ HDONE_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+ }
+ else
+#endif
+ HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+ }
+
#ifdef H5_HAVE_PARALLEL
} /* end if */
/* if the collective metadata read optimization is turned on,
@@ -7230,8 +7247,19 @@ H5C__load_entry(H5F_t *f,
* loaded thing, go get the on-disk image again (the extra portion).
*/
if (H5F_block_read(f, type->mem_type, addr + len, actual_len - len, image + len) <
- 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ 0) {
+
+#ifdef H5_HAVE_PARALLEL
+ if (coll_access) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDmemset(image + len, 0, actual_len - len);
+ HDONE_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ }
+ else
+#endif
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ }
+
#ifdef H5_HAVE_PARALLEL
}
/* If the collective metadata read optimization is turned on,